Test for multiple pages, get the URLs, create the list of products, and refactor main

2024-11-13 17:09:06 +01:00
parent e3ac12ff9b
commit 8213f0849c
1 changed files with 56 additions and 14 deletions
--- a/phase2/main.py
+++ b/phase2/main.py
@@ -1,19 +1,16 @@
 import requests
 from bs4 import BeautifulSoup
-url = "https://books.toscrape.com/"
+# récupère le soup depuis l'url
 def get_html(url, timeout=10):
    """Download *url* and return its content parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without it a stalled connection would block
            the scraper indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page,
    # which would only surface later as a confusing AttributeError.
    r.raise_for_status()
    return BeautifulSoup(r.content, 'html.parser')
 # NOTE(review): these module-level names look like leftovers. main() assigns
 # its own local `category`, and get_category_list() assigns its own local
 # `catego_info` / `catego_dict`. Confirm nothing else reads them before
 # removing.
 category = "travel"
 catego_info = []
 catego_dict = {}
 # Get each category and its URL from the side_categories div (collected as [catego, url] pairs; the function returns catego_dict)
-
+def get_category_list(soup, url):
 def get_category_list(soup):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li"):
@@ -22,15 +19,60 @@ def get_category_list(soup):
    return catego_dict
-page_url = get_category_list(soup)['travel']
def check_for_pages(category_url, per_page=20):
    """Return the URL of every listing page of a category.

    The number of products is read from the first ``<strong>`` inside the
    first ``<form>`` of the category page. The first page is *category_url*
    itself; following pages are named ``page-2.html``, ``page-3.html``, ...
    in the same directory.

    Args:
        category_url: URL of the category's first page.
        per_page: Number of products shown on one listing page.

    Returns:
        List of page URLs, starting with *category_url*.

    Raises:
        ValueError: If the text read from the page is not an integer.
    """
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    # Ceiling division. The earlier `total // 20` extra pages was one too
    # many whenever total was an exact multiple of the page size (40 items
    # fit on 2 pages, not 3).
    page_count = -(-total // per_page)
    new_url_base = category_url.replace('index.html', '')
    url_list = [category_url]
    for page_number in range(2, page_count + 1):
        url_list.append(new_url_base + "page-" + str(page_number) + ".html")
    return url_list
 def get_product_list(url_category_page, url):
    """Return the absolute URL of every product listed on a category page.

    Each product is an ``<article>`` element whose ``<h3><a href=...>``
    holds a link relative to the listing page.

    Args:
        url_category_page: URL of the listing page to scan.
        url: Site root. Kept for backward compatibility with existing
            callers; links are now resolved against *url_category_page*.

    Returns:
        List of product page URLs, in page order.
    """
    # Local import so this block does not depend on the file's import header.
    from urllib.parse import urljoin

    soup = get_html(url_category_page)
    # Resolve each href against the page it was found on. Stripping the
    # "../" segments and prepending the site root discarded the directory
    # the link was relative to.
    return [
        urljoin(url_category_page, article.h3.a.get('href'))
        for article in soup.find_all("article")
    ]
-print(len(get_product_list(soup2)))
def main():
    """Collect and print the product URLs of one category of the site."""
    # Site root and the category used for this run.
    url = "https://books.toscrape.com/"
    category = "default"

    # The home page provides the categories, looked up below by name.
    home_soup = get_html(url)
    liste_categories = get_category_list(home_soup, url)

    # URL of the chosen category's first page.
    category_url = liste_categories[category]
    print(category_url)

    # One URL per listing page of the category.
    url_list = check_for_pages(category_url)
    print("Liste des URLs des pages: ", url_list)

    # Gather the product URLs of every listing page into one flat list.
    product_list_url = []
    for page_url in url_list:
        product_list_url.extend(get_product_list(page_url, url))
    print("Liste des URL des produits: ", product_list_url)
    print("Longueur de la liste: ", len(product_list_url))
 # Run the scraper only when this file is executed as a script, not on import.
 if __name__ == '__main__':
    main()