Build main to call the functions from phase 1: build data from each page and write the file
This commit is contained in:
parent
27d37fb5d3
commit
2bbf684c26
@ -1,5 +1,6 @@
|
|||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
import phase1
|
||||||
|
|
||||||
# get soup from url
|
# get soup from url
|
||||||
def get_html(url):
|
def get_html(url):
|
||||||
@ -43,7 +44,7 @@ def get_product_url_list(url_category_page, url):
|
|||||||
|
|
||||||
for i in soup.find_all("article"):
|
for i in soup.find_all("article"):
|
||||||
relative_url = i.h3.a.get('href')
|
relative_url = i.h3.a.get('href')
|
||||||
product_url = url + relative_url.split('../')[-1]
|
product_url = url + "catalogue/" + relative_url.split('../')[-1]
|
||||||
liste.append(product_url)
|
liste.append(product_url)
|
||||||
|
|
||||||
return liste
|
return liste
|
||||||
@ -51,7 +52,7 @@ def get_product_url_list(url_category_page, url):
|
|||||||
def main():
|
def main():
|
||||||
# init
|
# init
|
||||||
url = "https://books.toscrape.com/"
|
url = "https://books.toscrape.com/"
|
||||||
category = "default"
|
category = "fantasy"
|
||||||
|
|
||||||
# get functional variables
|
# get functional variables
|
||||||
soup = get_html(url)
|
soup = get_html(url)
|
||||||
@ -69,12 +70,20 @@ def main():
|
|||||||
product_url_list = []
|
product_url_list = []
|
||||||
for i in url_list:
|
for i in url_list:
|
||||||
product_url_list.extend(get_product_url_list(i, url))
|
product_url_list.extend(get_product_url_list(i, url))
|
||||||
|
# print("Liste des URL des produits: ", product_url_list)
|
||||||
|
# print("Longueur de la liste: ", len(product_url_list))
|
||||||
|
|
||||||
|
|
||||||
|
# combine with phase 1 and write in csv for each url from product_url_list named with category
|
||||||
|
data = []
|
||||||
|
for page_url in product_url_list:
|
||||||
|
page_soup = get_html(page_url)
|
||||||
|
# print(page_soup)
|
||||||
|
# print(phase1.get_category(page_soup))
|
||||||
|
# print(phase1.get_data(page_soup, page_url))
|
||||||
|
data.append(phase1.get_data(page_soup, page_url))
|
||||||
|
|
||||||
|
phase1.data_output(data, category)
|
||||||
print("Liste des URL des produits: ", product_url_list)
|
|
||||||
print("Longueur de la liste: ", len(product_url_list))
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user