add final folder, create category dir and image name from title, full README
This commit is contained in:
parent aa0d3a7819
commit 6fa035fc1a
rendu/README.md (new file, 68 lines)
@@ -0,0 +1,68 @@
# Books Online

Price tracking for the books listed on "Books To Scrape"

## Introduction

These instructions show you how to:

- get the program,
- set up the environment it needs to run,
- run it,
- understand what it produces

### Prerequisites

```
packages: python 3.11, python3.11-venv, git
modules: python requests, BeautifulSoup, csv, os
```
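
The pinned versions of the third-party modules ship in `requirements.txt`, added in this same commit:

```
beautifulsoup4==4.12.3
requests==2.32.3
```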

### Installation

Follow these steps to get a working execution environment:

```
- create a virtual environment
$ python3.11 -m venv env
$ source env/bin/activate

- clone the repository and move into the project directory
$ git clone https://mcstn.fr/gitea/Yann/Projet2.git
$ cd Projet2/rendu

- install the modules
$ pip install -r requirements.txt
```
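
Once the environment is ready, the scraper can be launched from the `rendu` directory (a minimal usage sketch; `main.py`, added in this commit, is the entry point):

```
$ python main.py
```

For every category of the site it writes a `<category>.csv` file with the product data and downloads each book cover as a PNG into a directory named after the category.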

## Running the tests

N/A

## Deployment

N/A

## Technologies

* HTML5/CSS3
* [Bootstrap](https://getbootstrap.com/)

## Contribution

Please read the following files:

* [CONTRIBUTING.md](https://github.com/OpenClassrooms-Student-Center/7688581-Expert-Git-GitHub/blob/main/CONTRIBUTING.md)
* [CODE_OF_CONDUCT.md](https://github.com/OpenClassrooms-Student-Center/7688581-Expert-Git-GitHub/blob/main/CONTRIBUTING.md)

## Authors

* Yann ALEXANDRE <yann@needsome.coffee>

## License

N/A
rendu/main.py (new file, 222 lines)
@@ -0,0 +1,222 @@
import requests
from bs4 import BeautifulSoup
import csv
import os


### EXTRACTION ###

# get soup from url
def get_html(url):
    r = requests.get(url, headers={'User-agent': 'yann@needsome.coffee'})
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# extract the product title from the page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product information from the table and put it in a dict,
# then extract the quantity from the availability string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# extract the full description as a string
# luckily this <p> is the only one without a class,
# and handle the case where there is no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"

    return desc


# extract the category from the breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread

# get the product URL list from a given category page URL;
# extract and build each product URL using the main URL (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)

    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)

    return liste


# check if a category has multiple pages and build the list of page URLs
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]

    if total > 20:
        # 20 books per page, so (total - 1) // 20 extra pages after index.html
        new_url_base = category_url.replace('index.html', '')
        j = 2
        for i in range((total - 1) // 20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1

    return url_list

# get each category name and URL from the side div and store them as [catego, url] pairs in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip=True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip=True).lower()] = url + li.a.get('href')
    return catego_info


### TRANSFORMATION ###

# get the relative link from the page and build the full image URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# create a list with all the information in order
# /!\ don't know if that's the best way
def get_data(soup, url):
    product_info = product_information(soup)
    info = [
        url,
        product_info['UPC'],
        get_title(soup),
        product_info['Price (incl. tax)'],
        product_info['Price (excl. tax)'],
        product_info['Availability'],
        product_description(soup),
        get_category(soup),
        product_info['Number of reviews'],
        get_image_url(soup, url)
    ]
    return info

### LOAD ###

# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)

    return file

# collect the categories from the home page,
# then grab every product for each one and write a CSV file named after the category

def main():
    # init
    url = "https://books.toscrape.com/"

    ### EXTRACTION ###
    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0

    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")

    # go ahead for each category
    for line in get_category_list(soup, url):
        category = line[0]
        category_url = line[1]

        total_category -= 1
        # display which category is being processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if there are multiple pages and build the URL list
        url_list = check_for_pages(category_url)

        # get the product list for each URL in url_list and extend the main product URL list with it
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)

        # PHASE 3 : combine with phase 1 and, for each url from product_url_list, write a CSV named after the category
        data = []
        img_nb = 1

        # EXTRACT data for each product page
        for page_url in product_url_list:

            # create the category directory; if it already exists, just continue
            try:
                os.mkdir(category)
            except FileExistsError:
                pass

            # EXTRACT data : html from the product page, and product data from the page
            page_soup = get_html(page_url)

            # EXTRACTION + TRANSFORMATION
            product_data = get_data(page_soup, page_url)
            # print(page_soup)
            # print(phase1.get_category(page_soup))
            # print(phase1.get_data(page_soup, page_url))

            # LOAD data in a list
            data.append(product_data)

            # PHASE 4 : get the cover image of every book and save it in the category directory
            # EXTRACT image data -url, title, binary content- and LOAD the binary content into a file named with the title
            img_url = get_image_url(page_soup, page_url)
            print(category, "/", product_data[2], ".png")
            with open(category + "/" + product_data[2] + ".png", "wb") as img_file:
                img_file.write(requests.get(img_url).content)
            img_nb += 1

        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")

        # LOAD : write the list in the CSV file
        print("Done.\n Fichier " + data_output(data, category))

    print("\n Traitement terminé.")


if __name__ == '__main__':
    main()
rendu/requirements.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
beautifulsoup4==4.12.3
requests==2.32.3