# Projet2/phase4/main.py
import requests
from bs4 import BeautifulSoup
import csv
# get soup from url
def get_html(url):
    r = requests.get(url)
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup
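
# A quick illustrative use (assumes the site is reachable; the URL is the one used in main()):
#   soup = get_html("https://books.toscrape.com/")
#   soup.title.string  # the page <title>, e.g. "All products | Books to Scrape ..."
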
# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title
# extract the product information from the table and put it in a dict,
# then extract the available quantity from its string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info
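
# For reference, the keys are taken verbatim from the product table headers, so a returned
# dict looks like (values illustrative): {'UPC': '...', 'Product Type': 'Books',
# 'Price (excl. tax)': '£51.77', 'Price (incl. tax)': '£51.77', 'Tax': '£0.00',
# 'Availability': 22, 'Number of reviews': '0'}
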
# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url
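
# Example of the rebuild (paths illustrative): a relative src such as
# '../../media/cache/fe/72/xxxx.jpg' found on a page under 'https://books.toscrape.com/...'
# becomes 'https://books.toscrape.com/media/cache/fe/72/xxxx.jpg'.
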
# get full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"
    return desc
# get category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread
# create a list with all information in the order expected by data_output
# /!\ don't know if that's the best way
def get_data(soup, url):
    product_info = product_information(soup)
    info = [
        url,
        product_info['UPC'],
        get_title(soup),
        product_info['Price (incl. tax)'],
        product_info['Price (excl. tax)'],
        product_info['Availability'],
        product_description(soup),
        get_category(soup),
        product_info['Number of reviews'],
        get_image_url(soup, url)
    ]
    return info
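
# A returned row lines up positionally with the fieldnames written by data_output below,
# e.g. (values illustrative): [product_page_url, 'a897fe39b1053632', 'A Light in the Attic',
# '£51.77', '£51.77', 22, 'description...', 'Poetry', '0', image_url]
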
# write the collected rows to a CSV file named after the category
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']
    with open(file, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)
    return file
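
# Illustrative call: data_output(rows, "poetry") writes "poetry.csv" in the working
# directory and returns that filename ("rows" being a list of lists produced by get_data).
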
# get each category name and URL from the side nav and return them as a list of [category, url] pairs
def get_category_list(soup, url):
    catego_info = []
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip=True).lower(), url + li.a.get('href')])
    return catego_info
# check whether a category spans multiple pages (20 books per page)
# and return the list of page URLs to scrape
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]
    if total > 20:
        new_url_base = category_url.replace('index.html', '')
        # pages needed = ceil(total / 20); the first page (index.html) is already in the list
        for page_number in range(2, -(-total // 20) + 1):
            url_list.append(new_url_base + "page-" + str(page_number) + ".html")
    return url_list
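
# Pagination follows the site's "page-N.html" scheme, so for a category index URL with,
# say, 152 books (numbers illustrative) the result is:
# ['.../index.html', '.../page-2.html', ..., '.../page-8.html']
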
# get the product URLs from a given category page:
# extract each relative link and build the full product URL using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)
    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)
    return liste
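
# Each <article> on a listing page yields a full product URL such as
# "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"
# (slug shown only as an example; nothing is hard-coded here).
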
# collect every category from the home page,
# then grab all the products of each category and write one CSV file per category
def main():
    # init
    url = "https://books.toscrape.com/"
    # get html from URL
    soup = get_html(url)
    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0
    print(total_books, "books to process across", total_category, "categories.\nGo.")
    # go ahead for each category
    for line in get_category_list(soup, url):
        category = line[0]
        category_url = line[1]
        total_category -= 1
        # display which category is being processed
        print("\n -> Processing category: " + category)
        # check if there are multiple pages and build the URL list
        url_list = check_for_pages(category_url)
        # get the product list of each page and extend the category's product URL list with it
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Product URL list: ", product_url_list)
        print(len(product_url_list), "books found")
        processed_books += len(product_url_list)
        # reuse the phase 1 extraction on each URL from product_url_list and write a CSV named after the category
        data = []
        for page_url in product_url_list:
            page_soup = get_html(page_url)
            # print(page_soup)
            # print(phase1.get_category(page_soup))
            # print(phase1.get_data(page_soup, page_url))
            data.append(get_data(page_soup, page_url))
        print(processed_books, "books processed")
        print(total_books - processed_books, "books remaining")
        print(total_category, "categories remaining")
        print("Done.\n File " + data_output(data, category))
    print("\n Processing complete.")
if __name__ == '__main__':
    main()