From 8213f0849c3a4d658800db1487a171473697b8f5 Mon Sep 17 00:00:00 2001
From: yann
Date: Wed, 13 Nov 2024 17:09:06 +0100
Subject: [PATCH] test for multiple pages, get URLs, create list of products,
 and refactor main

---
 phase2/main.py | 70 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 56 insertions(+), 14 deletions(-)

diff --git a/phase2/main.py b/phase2/main.py
index d5a90da..214dcd1 100644
--- a/phase2/main.py
+++ b/phase2/main.py
@@ -1,19 +1,16 @@
 import requests
 from bs4 import BeautifulSoup
 
-url = "https://books.toscrape.com/"
-r = requests.get(url)
-html = r.content
-soup = BeautifulSoup(html, 'html.parser')
+# get the soup from the url
+def get_html(url):
+    r = requests.get(url)
+    html = r.content
+    soup = BeautifulSoup(html, 'html.parser')
+    return soup
 
-category = "travel"
-
-catego_info = []
-catego_dict = {}
 
 # get category and URL from side div and put them as a list [catego, url] in a list
-
-def get_category_list(soup):
+def get_category_list(soup, url):
     catego_info = []
     catego_dict = {}
     for li in soup.find("div", class_="side_categories").find_all("li"):
@@ -22,15 +19,60 @@
     return catego_dict
 
-page_url = get_category_list(soup)['travel']
-soup2 = BeautifulSoup(requests.get(page_url).content, "html.parser")
+def check_for_pages(category_url):
+    soup_catego = get_html(category_url)
+    total = int(soup_catego.form.strong.text)
+    url_list = [category_url]
 
-def get_product_list(soup):
+    if total > 20:
+        new_url_base = category_url.replace('index.html','')
+        j = 2
+        for i in range(total//20):
+            page = "page-" + str(j) + ".html"
+            url_list.append(new_url_base + page)
+            j += 1
+
+    return url_list
+
+
+def get_product_list(url_category_page, url):
     list = []
+    soup = get_html(url_category_page)
+
     for i in soup.find_all("article"):
         relative_url = i.h3.a.get('href')
         product_url = url + relative_url.split('../')[-1]
         list.append(product_url)
+
     return(list)
 
-print(len(get_product_list(soup2)))
+def main():
+    # init
+    url = "https://books.toscrape.com/"
+    category = "default"
+
+    # get functional variables
+    soup = get_html(url)
+    liste_categories = get_category_list(soup, url)
+
+    # get the category URL to do some tests on it
+    category_url = liste_categories[category]
+    print(category_url)
+
+    # check if there are multiple pages and get the url list
+    url_list = check_for_pages(category_url)
+
+    print("List of page URLs: ", url_list)
+    # get the product list for every url in url_list
+    product_list_url = []
+    for i in url_list:
+        get_list = get_product_list(i, url)
+        for j in get_list:
+            product_list_url.append(j)
+
+
+    print("List of product URLs: ", product_list_url)
+    print("Length of the list: ", len(product_list_url))
+
+if __name__ == '__main__':
+    main()
 