From 50ca4fccd89d45a30b91a11a6400f371782070bb Mon Sep 17 00:00:00 2001 From: yann Date: Thu, 14 Nov 2024 10:47:58 +0100 Subject: [PATCH] just one loop to fill the list, "extend" with each page list --- phase2/main.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/phase2/main.py b/phase2/main.py index 214dcd1..bd15300 100644 --- a/phase2/main.py +++ b/phase2/main.py @@ -1,7 +1,7 @@ import requests from bs4 import BeautifulSoup -# récupère le soup depuis l'url +# get soup from url def get_html(url): r = requests.get(url) html = r.content @@ -35,16 +35,18 @@ def check_for_pages(category_url): return url_list -def get_product_list(url_category_page, url): - list = [] +# get product list url from a given url category page; +# extract and build each product url using the main url (second arg) +def get_product_url_list(url_category_page, url): + liste = [] soup = get_html(url_category_page) for i in soup.find_all("article"): relative_url = i.h3.a.get('href') product_url = url + relative_url.split('../')[-1] - list.append(product_url) + liste.append(product_url) - return(list) + return liste def main(): # init @@ -61,18 +63,18 @@ def main(): # check if multiple page and get url list url_list = check_for_pages(category_url) - print("Liste des URLs des pages: ", url_list) - # get product list for every url_list - product_list_url = [] + + # get product list for each url_list, extend the main product url list with + product_url_list = [] for i in url_list: - get_list = get_product_list(i, url) - for j in get_list: - product_list_url.append(j) + product_url_list.extend(get_product_url_list(i, url)) - print("Liste des URL des produits: ", product_list_url) - print("Longueur de la liste: ", len(product_list_url)) + + + print("Liste des URL des produits: ", product_url_list) + print("Longueur de la liste: ", len(product_url_list)) if __name__ == '__main__': main()