import math
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch a page and return it as a BeautifulSoup object."""
    r = requests.get(url)
    r.raise_for_status()
    return BeautifulSoup(r.content, 'html.parser')


def get_category_list(soup, url):
    """Map each category name (lowercased) to its absolute URL,
    read from the side navigation div of the home page."""
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li"):
        catego_dict[li.a.get_text(strip=True).lower()] = url + li.a.get('href')
    return catego_dict


def check_for_pages(category_url):
    """Return the URLs of every page of a category.

    The first page of a category shows the total number of results
    (the first <strong> inside the first <form>). The site lists 20
    books per page; extra pages are named page-2.html, page-3.html, ...
    """
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]
    if total > 20:
        new_url_base = category_url.replace('index.html', '')
        # ceil(total / 20) pages in all: e.g. 40 results -> 2 pages,
        # 41 results -> 3 pages (the original total // 20 over-counted
        # by one page whenever total was an exact multiple of 20)
        for j in range(2, math.ceil(total / 20) + 1):
            url_list.append(new_url_base + "page-" + str(j) + ".html")
    return url_list


def get_product_url_list(url_category_page):
    """Return the absolute URL of every product listed on a category page.

    Product links are relative (e.g. '../../../some-book_123/index.html'),
    so they are resolved against the category page's own URL with urljoin;
    simply appending them to the site root would drop the 'catalogue/'
    path segment and produce broken URLs."""
    liste = []
    soup = get_html(url_category_page)
    for article in soup.find_all("article"):
        relative_url = article.h3.a.get('href')
        liste.append(urljoin(url_category_page, relative_url))
    return liste


def main():
    # init
    url = "https://books.toscrape.com/"
    category = "default"

    # get functional variables
    soup = get_html(url)
    liste_categories = get_category_list(soup, url)

    # get the category URL to run some tests on it
    category_url = liste_categories[category]
    print(category_url)

    # check whether the category spans several pages and get the page URL list
    url_list = check_for_pages(category_url)
    print("List of page URLs: ", url_list)

    # get the product list for each page URL and extend the main product URL list
    product_url_list = []
    for page_url in url_list:
        product_url_list.extend(get_product_url_list(page_url))
    print("List of product URLs: ", product_url_list)
    print("Length of the list: ", len(product_url_list))


if __name__ == '__main__':
    main()
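
# A possible extension (a minimal sketch, not part of the original script):
# instead of testing a single category, iterate over every entry of the dict
# returned by get_category_list. Note that the first side-navigation entry,
# 'books', is the parent category covering the whole catalogue, so it will
# yield every product URL on the site.
#
#     soup = get_html("https://books.toscrape.com/")
#     categories = get_category_list(soup, "https://books.toscrape.com/")
#     for name, cat_url in categories.items():
#         urls = []
#         for page_url in check_for_pages(cat_url):
#             urls.extend(get_product_url_list(page_url))
#         print(name, len(urls))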