diff --git a/phase2/main.py b/phase2/main.py index bd15300..0fa6b83 100644 --- a/phase2/main.py +++ b/phase2/main.py @@ -1,5 +1,6 @@ import requests from bs4 import BeautifulSoup +import phase1 # get soup from url def get_html(url): @@ -43,7 +44,7 @@ def get_product_url_list(url_category_page, url): for i in soup.find_all("article"): relative_url = i.h3.a.get('href') - product_url = url + relative_url.split('../')[-1] + product_url = url + "catalogue/" + relative_url.split('../')[-1] liste.append(product_url) return liste @@ -51,7 +52,7 @@ def get_product_url_list(url_category_page, url): def main(): # init url = "https://books.toscrape.com/" - category = "default" + category = "fantasy" # get functional variables soup = get_html(url) @@ -69,12 +70,20 @@ def main(): product_url_list = [] for i in url_list: product_url_list.extend(get_product_url_list(i, url)) +# print("Liste des URL des produits: ", product_url_list) +# print("Longueur de la liste: ", len(product_url_list)) + # combine with phase 1 and write in csv for each url from product_url_list named with category + data = [] + for page_url in product_url_list: + page_soup = get_html(page_url) +# print(page_soup) +# print(phase1.get_category(page_soup)) +# print(phase1.get_data(page_soup, page_url)) + data.append(phase1.get_data(page_soup, page_url)) - - print("Liste des URL des produits: ", product_url_list) - print("Longueur de la liste: ", len(product_url_list)) + phase1.data_output(data, category) if __name__ == '__main__': main()