# Projet2/phase2/main.py
# (File-viewer metadata from the original listing: 81 lines, 2.2 KiB, Python.)

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def get_html(url, timeout=10):
    """Download *url* and return its content parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` may wait indefinitely on a host
            that never answers.

    Returns:
        A ``BeautifulSoup`` object built from the raw response bytes with
        the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.

    Note:
        The HTTP status code is not checked, so an error page (for example
        a 404) is parsed and returned like any other document.
    """
    response = requests.get(url, timeout=timeout)
    # Pass the undecoded bytes (``.content``), exactly as before.
    return BeautifulSoup(response.content, 'html.parser')
def get_category_list(soup, url):
    """Map every category name found in the sidebar to its URL.

    Despite its name, this function returns a dictionary, not a list.

    Args:
        soup: Parsed page that contains a ``<div class="side_categories">``.
        url: Prefix prepended verbatim to each ``href`` found in that div.

    Returns:
        dict: For every ``<li>`` inside the sidebar div, the stripped,
        lower-cased link text mapped to ``url + href``. When two entries
        share the same text, the later one overwrites the earlier one.

    Raises:
        AttributeError: If the sidebar div is not found (``find`` then
            yields ``None``) or an ``<li>`` has no ``<a>`` child.
    """
    sidebar = soup.find("div", class_="side_categories")
    categories = {}
    for item in sidebar.find_all("li"):
        # Look the anchor up once instead of once per field.
        link = item.a
        name = link.get_text(strip=True).lower()
        categories[name] = url + link.get('href')
    return categories
def check_for_pages(category_url, soup=None, per_page=20):
    """List the URL of every result page of a category.

    Args:
        category_url: URL of the first page of the category (its
            ``index.html``).
        soup: Optional already-parsed first page. When omitted, the page
            is downloaded with ``get_html(category_url)``.
        per_page: Number of products shown on one result page.

    Returns:
        list[str]: ``category_url`` followed by ``page-2.html``,
        ``page-3.html``, ... built by replacing ``index.html`` in
        ``category_url``. One URL per page needed to show all results.

    Raises:
        AttributeError: If the page has no ``<form>`` / ``<strong>``
            element to read the result count from.
        ValueError: If that element's text is not an integer.
    """
    if soup is None:
        soup = get_html(category_url)
    # The result count is the text of the first <strong> in the first <form>.
    total = int(soup.form.strong.text)

    # Ceiling division: the previous ``total // 20`` extra pages produced a
    # non-existent trailing page whenever ``total`` was an exact multiple
    # of the page size (e.g. 40 results -> page-2 AND page-3).
    page_count = -(-total // per_page)

    base_url = category_url.replace('index.html', '')
    url_list = [category_url]
    for number in range(2, page_count + 1):
        url_list.append(base_url + "page-" + str(number) + ".html")
    return url_list
def get_product_url_list(url_category_page, url, soup=None):
    """Return the absolute URL of every product listed on a page.

    Each product link is resolved against the URL of the page it was found
    on. The previous implementation stripped the leading ``../`` segments
    and glued the remainder onto ``url``, which lost the intermediate
    ``catalogue/`` directory for links such as
    ``../../../some-book_1/index.html`` and produced dead URLs.

    Args:
        url_category_page: URL of the listing page to read products from.
        url: Main site URL. No longer needed to build the result; kept so
            existing callers keep working.
        soup: Optional already-parsed listing page. When omitted, the page
            is downloaded with ``get_html(url_category_page)``.

    Returns:
        list[str]: One absolute URL per ``<article>`` element, in document
        order, taken from the ``href`` of its ``<h3><a>`` link.

    Raises:
        AttributeError: If an ``<article>`` has no ``<h3>`` / ``<a>`` child.
    """
    if soup is None:
        soup = get_html(url_category_page)
    return [
        urljoin(url_category_page, article.h3.a.get('href'))
        for article in soup.find_all("article")
    ]
def main():
    """Print the page URLs and the product URLs of one hard-coded category.

    The category name is looked up in the mapping returned by
    ``get_category_list``; a name that is not present raises ``KeyError``.
    """
    # Site root and the sidebar category to inspect.
    url = "https://books.toscrape.com/"
    category = "default"

    # Build the category mapping from the home page sidebar.
    home_page = get_html(url)
    categories = get_category_list(home_page, url)

    # URL of the chosen category's first page.
    category_url = categories[category]
    print(category_url)

    # Every result page of that category.
    url_list = check_for_pages(category_url)
    print("Liste des URLs des pages: ", url_list)

    # Collect the products of all pages into a single flat list.
    product_url_list = [
        product_url
        for page_url in url_list
        for product_url in get_product_url_list(page_url, url)
    ]
    print("Liste des URL des produits: ", product_url_list)
    print("Longueur de la liste: ", len(product_url_list))
# Run the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()