import math
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


# fetch a page and return its parsed soup
def get_html(url):
    r = requests.get(url)
    r.raise_for_status()
    return BeautifulSoup(r.content, 'html.parser')


# map each category name (lowercased) to its absolute URL,
# read from the side navigation div
def get_category_list(soup, url):
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li"):
        catego_dict[li.a.get_text(strip=True).lower()] = url + li.a.get('href')
    return catego_dict


# return the URLs of every page of a category (20 products per page);
# page 1 is index.html, the following ones are page-2.html, page-3.html, ...
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]
    if total > 20:
        new_url_base = category_url.replace('index.html', '')
        # ceil(total / 20) pages in all, so append pages 2..n_pages
        # (the original total // 20 added one page too many when total
        # was an exact multiple of 20)
        n_pages = math.ceil(total / 20)
        for page_number in range(2, n_pages + 1):
            url_list.append(new_url_base + "page-" + str(page_number) + ".html")
    return url_list


# collect the absolute product URLs listed on one category page
def get_product_list(url_category_page):
    product_urls = []
    soup = get_html(url_category_page)
    for article in soup.find_all("article"):
        relative_url = article.h3.a.get('href')
        # product links are relative ("../../../slug/index.html");
        # resolve them against the category page URL so they land
        # under /catalogue/ as they do on the site
        product_urls.append(urljoin(url_category_page, relative_url))
    return product_urls


def main():
    # init
    url = "https://books.toscrape.com/"
    category = "default"

    # get functional variables
    soup = get_html(url)
    liste_categories = get_category_list(soup, url)

    # get the category URL to run some tests on it
    category_url = liste_categories[category]
    print(category_url)

    # check whether the category spans several pages and get the URL list
    url_list = check_for_pages(category_url)
    print("List of page URLs: ", url_list)

    # get the product list for every page in url_list
    product_list_url = []
    for page_url in url_list:
        product_list_url.extend(get_product_list(page_url))
    print("List of product URLs: ", product_list_url)
    print("Length of the list: ", len(product_list_url))


if __name__ == '__main__':
    main()
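
# Illustrative note on the urljoin fix (the slug below is hypothetical,
# chosen only to show the URL shape; it is not taken from the script):
# category pages live under /catalogue/category/books/<cat>/, and their
# relative product links climb back up to /catalogue/, e.g.
#
#   >>> from urllib.parse import urljoin
#   >>> urljoin("https://books.toscrape.com/catalogue/category/books/travel_2/index.html",
#   ...         "../../../some-book_123/index.html")
#   'https://books.toscrape.com/catalogue/some-book_123/index.html'
#
# whereas prepending the site root to the stripped link (the original
# approach) would have produced a URL missing the /catalogue/ segment.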