Compare commits

...

31 Commits

SHA1 Message Date
913968b8c6 fix typo 2024-11-28 08:36:33 +01:00
b99c187bc5 add details for directories 2024-11-23 18:57:07 +01:00
ec4ad03fca add execution duration 2024-11-20 13:07:30 +01:00
9398f8fae3 adapt instructions with directory change 2024-11-20 11:56:51 +01:00
6cb7913af2 main folder to put files in it, replace category name space, fix / issue in img name 2024-11-20 11:56:10 +01:00
cece9d1874 remove the print() for test 2024-11-20 10:15:58 +01:00
b74090865e add info and structure 2024-11-20 10:12:21 +01:00
6fa035fc1a add finale folder, creates category dir and image name from title, full readme 2024-11-20 09:31:07 +01:00
aa0d3a7819 re-organize code to show ETL phases, add comments 2024-11-19 12:40:23 +01:00
73b302a2bc improve comments, indicate phases 2024-11-19 12:25:34 +01:00
90f3b22efb add a screenshot of the result 2024-11-14 15:08:05 +01:00
4785b2e6d8 add way to retrieve images : use requests and write binary in file. Name it with category and incremental number 2024-11-14 15:03:30 +01:00
22ccd97fa3 add content :/ 2024-11-14 14:49:50 +01:00
852c0e781b init phase4 2024-11-14 14:48:59 +01:00
c9aaef7222 add screenshot of the result 2024-11-14 14:27:23 +01:00
549291cd6c Upload files to "phase3": screenshot of the result (Signed-off-by: Yann <yann@needsome.coffee>) 2024-11-14 13:22:29 +00:00
b34a5d123c refactor output counters 2024-11-14 14:16:36 +01:00
ebd5f5acd4 works. Add processed book and book to go counters displayed 2024-11-14 14:07:04 +01:00
d020998add all functions in same place, and loop in main 2024-11-14 13:56:55 +01:00
12dd0c9dfc init phase3 2024-11-14 13:24:12 +01:00
dd370cca8d add title 2024-11-14 13:22:36 +01:00
c35f7454a2 add text for fancy output and remove previous print (were testing) 2024-11-14 13:20:33 +01:00
4247f1ac83 manage exception when no description 2024-11-14 13:19:21 +01:00
2bbf684c26 build main to call function from phase 1 : build data from each page and write file 2024-11-14 12:37:52 +01:00
27d37fb5d3 copy phase1/main.py as phase1.py to import in main 2024-11-14 12:35:35 +01:00
50ca4fccd8 just one loop to fill the list, "extend" with each page list 2024-11-14 10:47:58 +01:00
c92ce51aa0 refactor some comments 2024-11-13 17:24:44 +01:00
8213f0849c test if multiple page, get URL, create list of product, and refactor main 2024-11-13 17:09:06 +01:00
e3ac12ff9b get category_list from home and get product url from a category (if one page) 2024-11-13 15:46:48 +01:00
7e6875a497 create phase2 folder+readme 2024-11-13 13:48:34 +01:00
c0fcd21346 add get_category 2024-11-13 13:44:27 +01:00
14 changed files with 963 additions and 5 deletions


@@ -1,3 +1,7 @@
-One folder for each phase of the project
-Each one with a README containing the instructions
+# INTRODUCTION
+This repository contains one folder for each phase of the project, with the respective instructions in each README.
+The content to review for the **project defense** is in the **"rendu"** folder


@@ -36,7 +36,12 @@ def product_description(soup):
     desc = soup.find("p", class_='').string
     return desc
 
-#create a list with all information consecutively
+# get category from breadcrumb
+def get_category(soup):
+    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
+    return bread
+
+# create a list with all information consecutively
 # /!\ don't know if that's the best way
 def get_data(soup, url):
     info = [url, product_information(soup)['UPC'],
@@ -45,13 +50,13 @@ def get_data(soup, url):
         product_information(soup)['Price (excl. tax)'],
         product_information(soup)['Availability'],
         product_description(soup),
-        "TODO",
+        get_category(soup),
         product_information(soup)['Number of reviews'],
         get_image_url(soup, url)
         ]
     return info
 
-#write the file
+# write the file
 def data_output(info, file):
     fieldnames = ['product_page_url',
         'universal_ product_code (upc)',

phase2/README.md Normal file (+17 lines)

@@ -0,0 +1,17 @@
# Phase 2
Now that you have obtained the information for a first book, you can try to
retrieve all the data needed for a whole category of books.
Pick any category on the Books to Scrape site. Write a Python script that
visits the page of the chosen category and extracts the product page URL of
every book belonging to that category.
Combine this with the work you already did in phase 1 in order to extract the
product data of all the books in the chosen category, then write the data to a
single CSV file.
Note: some category pages list more than 20 books, which are therefore spread
over several pages ("pagination"). Your application must be able to walk
through the additional pages automatically when they are present.
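
The committed phase2/main.py below derives the extra page URLs from the book count; another valid way to handle the pagination note above is to follow the category's "next" link until it disappears. A minimal sketch along those lines (the category URL is only an example, not part of the project code):

```python
# Minimal sketch (not the committed script): list every product page URL of one
# category, following "page-2.html", "page-3.html", ... while a next page exists.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def category_product_urls(category_index_url):
    urls = []
    page_url = category_index_url
    while True:
        soup = BeautifulSoup(requests.get(page_url).content, "html.parser")
        for article in soup.find_all("article", class_="product_pod"):
            # the href is relative to the current page, so resolve it against page_url
            urls.append(urljoin(page_url, article.h3.a["href"]))
        next_link = soup.find("li", class_="next")
        if next_link is None:
            break
        page_url = urljoin(page_url, next_link.a["href"])
    return urls

if __name__ == "__main__":
    # example category URL, given for illustration only
    print(len(category_product_urls(
        "https://books.toscrape.com/catalogue/category/books/mystery_3/index.html")))
```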

phase2/main.py Normal file (+90 lines)

@@ -0,0 +1,90 @@
import requests
from bs4 import BeautifulSoup
import phase1


# get soup from url
def get_html(url):
    r = requests.get(url)
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li"):
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_dict


def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]
    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1
    return url_list


# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)
    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)
    return liste


def main():
    # init
    url = "https://books.toscrape.com/"
    category = "default"

    # get functional variables
    soup = get_html(url)
    liste_categories = get_category_list(soup, url)

    # get category URL to do some tests on it
    category_url = liste_categories[category]
    print(category_url)

    # check if multiple page and get url list
    url_list = check_for_pages(category_url)
    # print("Liste des URLs des pages: ", url_list)

    # get product list for each url_list, extend the main product url list with
    product_url_list = []
    for i in url_list:
        product_url_list.extend(get_product_url_list(i, url))
    # print("Liste des URL des produits: ", product_url_list)
    print("Nombre de livres: ", len(product_url_list))

    # combine with phase 1 and write in csv for each url from product_url_list named with category
    data = []
    for page_url in product_url_list:
        page_soup = get_html(page_url)
        # print(page_soup)
        # print(phase1.get_category(page_soup))
        # print(phase1.get_data(page_soup, page_url))
        data.append(phase1.get_data(page_soup, page_url))
    print("Done.\n Fichier " + phase1.data_output(data, category))


if __name__ == '__main__':
    main()

phase2/phase1.py Normal file (+113 lines)

@@ -0,0 +1,113 @@
import requests
from bs4 import BeautifulSoup
import csv


def extract_web(url):
    r = requests.get(url)
    page = r.content
    return page


# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from string and cast it
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# get full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"
    return desc


# get category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread


# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url, product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]
    return info


# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']
    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)
    return file


def main():
    url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"
    html = extract_web(url)
    soup = BeautifulSoup(html, "html.parser")
    test = product_information(soup)
    print(test['Availability'])
    info = get_data(soup, url)
    print(info)
    # wrap the single product in a list so data_output() writes it as one CSV row,
    # and pass the name without extension since data_output() appends ".csv" itself
    data_output([info], 'output')


if __name__ == "__main__":
    main()

phase3/README.md Normal file (+14 lines)

@@ -0,0 +1,14 @@
# Phase 3
Next, extend your work by writing a script that visits the Books to Scrape
site, extracts all the available book categories, and then extracts the
product information of all the books belonging to each of those categories.
You will have to write the data to a separate CSV file for each book
category.
# Result
![ screenshot](screenshot.png)
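
The "one CSV per category" requirement boils down to the sketch below; `FIELDNAMES`, `write_category_csv` and the hypothetical `rows` argument are illustrative names, and the committed implementation is the `data_output()` function in phase3/main.py further down.

```python
# Illustrative sketch only: write one CSV file per category, named after it.
# "rows" is expected to be a list of field lists, one per book of the category.
import csv

FIELDNAMES = ['product_page_url', 'universal_ product_code (upc)', 'title',
              'price_including_tax', 'price_excluding_tax', 'number_available',
              'product_description', 'category', 'review_rating', 'image_url']

def write_category_csv(category_name, rows):
    file_name = category_name + ".csv"   # e.g. "mystery" -> "mystery.csv"
    with open(file_name, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(FIELDNAMES)      # header line
        writer.writerows(rows)           # one line per book
    return file_name
```

Unlike the committed `data_output()`, the sketch opens the file with `newline=''` and an explicit encoding, which the `csv` module documentation recommends to avoid spurious blank lines and encoding surprises in the descriptions.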

phase3/main.py Normal file (+188 lines)

@@ -0,0 +1,188 @@
import requests
from bs4 import BeautifulSoup
import csv


# get soup from url
def get_html(url):
    r = requests.get(url)
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    #extract the amount from string and case it
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# get full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"
    return desc


# get category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread


# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url, product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]
    return info


# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']
    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)
    return file


# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_info


def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]
    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1
    return url_list


# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)
    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)
    return liste


# collect category from all
# then grab all product for each and write a file with category name
def main():
    # init
    url = "https://books.toscrape.com/"

    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0
    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")

    # go ahead for each category
    for line in get_category_list(soup, url):
        category = line[0]
        category_url = line[1]
        total_category -= 1

        # display what category is processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if multiple pages and create a URL list
        url_list = check_for_pages(category_url)

        # get product list for each url_list, extend the main product url list with
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)

        # combine with phase 1 and write in csv for each url from product_url_list named with category
        data = []
        for page_url in product_url_list:
            page_soup = get_html(page_url)
            # print(page_soup)
            # print(phase1.get_category(page_soup))
            # print(phase1.get_data(page_soup, page_url))
            data.append(get_data(page_soup, page_url))
        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")
        print("Done.\n Fichier " + data_output(data, category))
    print("\n Traitement terminé.")


if __name__ == '__main__':
    main()

phase3/screenshot.png Normal file (binary, 24 KiB, not shown)

phase4/README.md Normal file (+8 lines)

@@ -0,0 +1,8 @@
# Phase 4
Finally, extend your existing work to download and save the image file of
each product page that you visit.
# Result
![ screenshot](screenshot.jpg)
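
The download step itself is small; a minimal sketch of just that step (the URL and file name in the example call follow the site's pattern but are illustrative, not taken from the committed code):

```python
# Minimal sketch of the image download step: fetch the binary content of the
# cover image and write it to disk unchanged.
import requests

def save_image(img_url, file_path):
    response = requests.get(img_url, timeout=10)
    response.raise_for_status()   # fail loudly on 4xx/5xx instead of saving an error page
    with open(file_path, "wb") as img_file:
        img_file.write(response.content)

# example call, following the site's media URL pattern:
# save_image("https://books.toscrape.com/media/cache/.../some-cover.jpg", "travel-1.jpg")
```

The committed phase4/main.py inlines the same idea inside the product loop and names the files `<category>-<n>.png`, even though the source images are JPEGs; the bytes are written unchanged either way.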

phase4/main.py Normal file (+204 lines)

@@ -0,0 +1,204 @@
import requests
from bs4 import BeautifulSoup
import csv


### EXTRACTION ###

# get soup from url
def get_html(url):
    r = requests.get(url, headers = {'User-agent': 'yann@needsome.coffee'})
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    #extract the amount from string and case it
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# extract full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"
    return desc


# extract category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread


# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)
    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)
    return liste


# check if a category has multiple pages and extract URLs
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]
    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1
    return url_list


# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_info


### TRANSFORMATION ###

# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url, product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]
    return info


### LOAD ###

# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']
    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)
    return file


# collect category from all
# then grab all product for each and write a file with category name
def main():
    # init
    url = "https://books.toscrape.com/"

    ### EXTRACTION ###
    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0
    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")

    # go ahead for each category
    for line in get_category_list(soup, url):
        category = line[0]
        category_url = line[1]
        total_category -= 1

        # display what category is processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if multiple pages and create a URL list
        url_list = check_for_pages(category_url)

        # get product list for each url_list, extend the main product url list with
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)

        ### TRANSFORMATION ###
        # PHASE 3 : combine with phase 1 and write in csv for each url from product_url_list named with category
        data = []
        img_nb = 1
        for page_url in product_url_list:
            page_soup = get_html(page_url)
            # print(page_soup)
            # print(phase1.get_category(page_soup))
            # print(phase1.get_data(page_soup, page_url))
            data.append(get_data(page_soup, page_url))

            ### LOAD ###
            # PHASE 4 : get img for every book and name it with category and incremental number
            img_url = get_image_url(page_soup, page_url)
            with open(category + "-" + str(img_nb) + ".png", "wb") as img_file:
                img_file.write(requests.get(img_url).content)
            img_nb += 1
        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")

        ### LOAD ###
        print("Done.\n Fichier " + data_output(data, category))
    print("\n Traitement terminé.")


if __name__ == '__main__':
    main()

phase4/screenshot.jpg Normal file (binary, 31 KiB, not shown)

rendu/README.md Normal file (+87 lines)

@@ -0,0 +1,87 @@
# Books Online
Price tracking of the books at "Books To Scrape"

## Introduction
These instructions let you:
- retrieve the program,
- install the environment it needs to run,
- run it,
- know what it produces

### Prerequisites
```
packages : python 3.11, python3.11-venv, git
modules : python requests, BeautifulSoup, csv, os
```

### Installation
Steps to follow to get a working execution environment:

create the virtual environment
```
python3.11 -m venv env
source env/bin/activate
```
clone the repository and move into the right folder
```
git clone https://mcstn.fr/gitea/Yann/Projet2.git
cd Projet2/rendu
```
install the modules
```
pip install -r requirements.txt
```

## Execution
run the command:
```
python3 main.py
```

## Result
The output files are placed in a "resultat" directory.
The program retrieves the categories from the home page of the URL, then, for each category:
1. prints the category being processed, the number of categories remaining, and the number of books found, processed so far and still to go
2. creates a directory named after the category and saves the book cover images in it, named after their titles (see the example layout after this list)
3. creates a CSV file named after the category, with:
- product_page_url
- universal_ product_code (upc)
- title
- price_including_tax
- price_excluding_tax
- number_available
- product_description
- category
- review_rating
- image_url
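
For instance, with illustrative category and title names (the real names depend on what is scraped), a run produces a layout of this shape:

```
resultat/
├── travel/
│   ├── Some Book Title.png
│   └── ...
├── travel.csv
├── mystery/
│   └── ...
└── mystery.csv
```
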
```
$ time python3.11 main.py
1000 à traiter répartis en 50 catégories.
[ ... ]
Traitement terminé.
real 20m17,783s
user 4m30,695s
sys 0m3,172s
```
## Author
Yann <yann@needsome.coffee>
## License
N/A

rendu/main.py Normal file (+226 lines)

@@ -0,0 +1,226 @@
import requests
from bs4 import BeautifulSoup
import csv
import os


### EXTRACTION ###

# get soup from url
def get_html(url):
    r = requests.get(url, headers = {'User-agent': 'yann@needsome.coffee'})
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    #extract the amount from string and case it
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# extract full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"
    return desc


# extract category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread


# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)
    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)
    return liste


# check if a category has multiple pages and extract URLs
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]
    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1
    return url_list


# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_info


### TRANSFORMATION ###

# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url,
        product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]
    return info


### LOAD ###

# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']
    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)
    return file


# collect category from all
# then grab all product for each and write a file with category name
def main():
    # init
    url = "https://books.toscrape.com/"
    os.mkdir("resultat")

    ### EXTRACTION ###
    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0
    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")

    # go ahead for each category
    for line in get_category_list(soup, url):
        # remove space in category name, to prevent potential issue on directory creation
        category = line[0].replace(' ', '_')
        category_url = line[1]
        category_path = "resultat/" + category
        total_category -= 1

        # display what category is processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if multiple pages and create a URL list
        url_list = check_for_pages(category_url)

        # get product list for each url_list, extend the main product url list with
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)

        # PHASE 3 : combine with phase 1 and write in csv for each url from product_url_list named with category
        data = []
        img_nb = 1

        # go ahead for each product of category
        # EXTRACT data for each product page
        for page_url in product_url_list:
            # create the category directory. If it exists already, just continue
            try:
                os.mkdir(category_path)
            except FileExistsError:
                pass

            # EXTRACT data : html from product page, and product data from the page
            page_soup = get_html(page_url)
            # EXTRACTION + TRANSFORMATION
            product_data = get_data(page_soup, page_url)
            # LOAD data in a list
            data.append(product_data)

            # protect path creation by removing "/" in product name
            img_name = (product_data[2] + ".png").replace('/', '_')

            # PHASE 4 : get img for every book and name it with category and incremental number
            # EXTRACT images data -url, title, binary content- and LOAD binary content in a file named with the title
            img_url = get_image_url(page_soup, page_url)
            with open(category_path + "/" + img_name, "wb") as img_file:
                img_file.write(requests.get(img_url).content)
            img_nb += 1
        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")

        # LOAD : write the list in the CSV file
        print("Done.\n Fichier " + data_output(data, category_path))
    print("\n Traitement terminé.")


if __name__ == '__main__':
    main()

rendu/requirements.txt Normal file (+2 lines)

@@ -0,0 +1,2 @@
beautifulsoup4==4.12.3
requests==2.32.3