Build main to call the functions from phase 1: build data from each product page and write the file

This commit is contained in:
yann 2024-11-14 12:37:52 +01:00
parent 27d37fb5d3
commit 2bbf684c26

View File

@@ -1,5 +1,6 @@
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import phase1
# get soup from url # get soup from url
def get_html(url): def get_html(url):
@@ -43,7 +44,7 @@ def get_product_url_list(url_category_page, url):
for i in soup.find_all("article"): for i in soup.find_all("article"):
relative_url = i.h3.a.get('href') relative_url = i.h3.a.get('href')
product_url = url + relative_url.split('../')[-1] product_url = url + "catalogue/" + relative_url.split('../')[-1]
liste.append(product_url) liste.append(product_url)
return liste return liste
@@ -51,7 +52,7 @@ def get_product_url_list(url_category_page, url):
def main(): def main():
# init # init
url = "https://books.toscrape.com/" url = "https://books.toscrape.com/"
category = "default" category = "fantasy"
# get functional variables # get functional variables
soup = get_html(url) soup = get_html(url)
@@ -69,12 +70,20 @@ def main():
product_url_list = [] product_url_list = []
for i in url_list: for i in url_list:
product_url_list.extend(get_product_url_list(i, url)) product_url_list.extend(get_product_url_list(i, url))
# print("Liste des URL des produits: ", product_url_list)
# print("Longueur de la liste: ", len(product_url_list))
# combine with phase 1 and write in csv for each url from product_url_list named with category
data = []
for page_url in product_url_list:
page_soup = get_html(page_url)
# print(page_soup)
# print(phase1.get_category(page_soup))
# print(phase1.get_data(page_soup, page_url))
data.append(phase1.get_data(page_soup, page_url))
phase1.data_output(data, category)
print("Liste des URL des produits: ", product_url_list)
print("Longueur de la liste: ", len(product_url_list))
if __name__ == '__main__': if __name__ == '__main__':
main() main()