Compare commits: 5d6a9bc263 ... main (33 commits)
| SHA1 |
|---|
| 913968b8c6 |
| b99c187bc5 |
| ec4ad03fca |
| 9398f8fae3 |
| 6cb7913af2 |
| cece9d1874 |
| b74090865e |
| 6fa035fc1a |
| aa0d3a7819 |
| 73b302a2bc |
| 90f3b22efb |
| 4785b2e6d8 |
| 22ccd97fa3 |
| 852c0e781b |
| c9aaef7222 |
| 549291cd6c |
| b34a5d123c |
| ebd5f5acd4 |
| d020998add |
| 12dd0c9dfc |
| dd370cca8d |
| c35f7454a2 |
| 4247f1ac83 |
| 2bbf684c26 |
| 27d37fb5d3 |
| 50ca4fccd8 |
| c92ce51aa0 |
| 8213f0849c |
| e3ac12ff9b |
| 7e6875a497 |
| c0fcd21346 |
| 7b7f216be8 |
| 3a6cf9b87e |
@@ -1,3 +1,7 @@
A folder for each phase of the project
# INTRODUCTION

This repository contains a folder for each phase of the project, with the corresponding instructions in each README.

The content to review for the **soutenance** (project defense) is in the **"rendu"** folder


Each with a README containing the instructions
@@ -36,7 +36,12 @@ def product_description(soup):
    desc = soup.find("p", class_='').string
    return desc

#create a list with all information consecutively
# get category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread

# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [url, product_information(soup)['UPC'],
@@ -45,15 +50,25 @@ def get_data(soup, url):
            product_information(soup)['Price (excl. tax)'],
            product_information(soup)['Availability'],
            product_description(soup),
            "TODO",
            get_category(soup),
            product_information(soup)['Number of reviews'],
            get_image_url(soup, url)
            ]
    return info

#write the file
# write the file
def data_output(info, file):
    fieldnames = ['product_page_url', 'universal_ product_code (upc)', 'title', 'price_including_tax', 'price_excluding_tax', 'number_available', 'product_description', 'category', 'review_rating', 'image_url']
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
phase2/README.md (Normal file, 17 lines)
@@ -0,0 +1,17 @@
# Phase 2

Now that you have obtained the information about a first book, you can
try to retrieve all the data needed for an entire
category of books.

Choose any category on the Books to Scrape site. Write a Python
script that visits the page of the chosen category and extracts the URL of the
product page of every book belonging to that category.

Combine this with the work you have already done in phase 1 in order
to extract the product data of all the books in the chosen category, then write
the data to a single CSV file.

Note: some category pages contain more than 20 books, which are
therefore spread over several pages ("pagination"). Your application must be
able to walk through the multiple pages automatically when they are present.
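A minimal sketch of the pagination walk described above; the 20-books-per-page figure and the `page-N.html` naming follow how Books to Scrape paginates its category pages, and the function name is only illustrative (`phase2/main.py` below does the same job in `check_for_pages`):

```
import requests
from bs4 import BeautifulSoup

def category_page_urls(category_index_url):
    # read the total number of books announced on the category index page
    soup = BeautifulSoup(requests.get(category_index_url).content, "html.parser")
    total = int(soup.form.strong.text)

    # one URL per page of 20 books: index.html, then page-2.html, page-3.html, ...
    urls = [category_index_url]
    base = category_index_url.replace("index.html", "")
    for page in range(2, (total + 19) // 20 + 1):
        urls.append(base + "page-" + str(page) + ".html")
    return urls
```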
phase2/main.py (Normal file, 90 lines)
@@ -0,0 +1,90 @@
import requests
from bs4 import BeautifulSoup
import phase1


# get soup from url
def get_html(url):
    r = requests.get(url)
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li"):
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_dict


def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]

    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1

    return url_list


# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)

    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)

    return liste


def main():
    # init
    url = "https://books.toscrape.com/"
    category = "default"

    # get functional variables
    soup = get_html(url)
    liste_categories = get_category_list(soup, url)

    # get category URL to do some tests on it
    category_url = liste_categories[category]
    print(category_url)

    # check if there are multiple pages and get the url list
    url_list = check_for_pages(category_url)
    # print("Liste des URLs des pages: ", url_list)

    # get the product list for each url in url_list and extend the main product url list with it
    product_url_list = []
    for i in url_list:
        product_url_list.extend(get_product_url_list(i, url))
    # print("Liste des URL des produits: ", product_url_list)
    print("Nombre de livres: ", len(product_url_list))

    # combine with phase 1: build a row for each url from product_url_list and write a csv named after the category
    data = []
    for page_url in product_url_list:
        page_soup = get_html(page_url)
        # print(page_soup)
        # print(phase1.get_category(page_soup))
        # print(phase1.get_data(page_soup, page_url))
        data.append(phase1.get_data(page_soup, page_url))

    print("Done.\n Fichier " + phase1.data_output(data, category))


if __name__ == '__main__':
    main()
phase2/phase1.py (Normal file, 113 lines)
@@ -0,0 +1,113 @@
import requests
from bs4 import BeautifulSoup
import csv


def extract_web(url):
    r = requests.get(url)
    page = r.content
    return page


# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# get full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"

    return desc


# get category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread


# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url, product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]

    return info


# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)

    return file


def main():

    url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"

    html = extract_web(url)
    soup = BeautifulSoup(html, "html.parser")
    test = product_information(soup)
    print(test['Availability'])

    info = get_data(soup, url)
    print(info)
    # data_output expects a list of rows and appends the ".csv" extension itself
    data_output([info], 'output')


if __name__ == "__main__":
    main()
phase3/README.md (Normal file, 14 lines)
@@ -0,0 +1,14 @@
# Phase 3

Next, extend your work by writing a script that visits the Books
to Scrape site, extracts all the available book categories, and then extracts the
product information of all the books belonging to all the different
categories.

You will need to write the data to a separate CSV file for
each book category.


# Result


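A compact outline of the per-category loop this phase asks for, reusing the helper names defined in `phase3/main.py` just below (`get_html`, `get_category_list`, `check_for_pages`, `get_product_url_list`, `get_data`, `data_output`); it is a sketch of the flow, not a replacement for that script:

```
def scrape_all_categories(site_url="https://books.toscrape.com/"):
    soup = get_html(site_url)

    # one CSV per category, named after the category
    for category, category_url in get_category_list(soup, site_url):
        rows = []
        for page_url in check_for_pages(category_url):
            for product_url in get_product_url_list(page_url, site_url):
                rows.append(get_data(get_html(product_url), product_url))
        data_output(rows, category)
```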
phase3/main.py (Normal file, 188 lines)
@@ -0,0 +1,188 @@
import requests
from bs4 import BeautifulSoup
import csv

# get soup from url
def get_html(url):
    r = requests.get(url)
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup

# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title

# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info

# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url

# get full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"

    return desc

# get category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread

# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url, product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]

    return info

# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)

    return file


# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_info


def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]

    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1

    return url_list


# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)

    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)

    return liste

# collect category from all
# then grab all product for each and write a file with category name
def main():
    # init
    url = "https://books.toscrape.com/"

    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0

    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")
    # go ahead for each category
    for line in get_category_list(soup, url):
        category = line[0]
        category_url = line[1]

        total_category -= 1
        # display which category is being processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if there are multiple pages and create a URL list
        url_list = check_for_pages(category_url)

        # get the product list for each URL in url_list and extend the main product URL list with it
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)

        # combine with phase 1: build a row for each url from product_url_list and write a csv named after the category
        data = []
        for page_url in product_url_list:
            page_soup = get_html(page_url)
            # print(page_soup)
            # print(phase1.get_category(page_soup))
            # print(phase1.get_data(page_soup, page_url))
            data.append(get_data(page_soup, page_url))

        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")
        print("Done.\n Fichier " + data_output(data, category))

    print("\n Traitement terminé.")

if __name__ == '__main__':
    main()
phase3/screenshot.png (Normal file, BIN): binary file not shown. After: 24 KiB
phase4/README.md (Normal file, 8 lines)
@@ -0,0 +1,8 @@
# Phase 4

Finally, extend your existing work to download and save the image
file of every product page you visit.

# Result


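A minimal sketch of the download-and-save step, assuming the `get_html` and `get_image_url` helpers defined in `phase4/main.py` below; the destination path is only an example:

```
import requests

def save_product_image(product_page_url, dest_path):
    # fetch the product page, locate its cover image, and store the binary content
    page_soup = get_html(product_page_url)
    img_url = get_image_url(page_soup, product_page_url)
    with open(dest_path, "wb") as img_file:
        img_file.write(requests.get(img_url).content)
    return dest_path
```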
phase4/main.py (Normal file, 204 lines)
@@ -0,0 +1,204 @@
import requests
from bs4 import BeautifulSoup
import csv

### EXTRACTION ###

# get soup from url
def get_html(url):
    r = requests.get(url, headers = {'User-agent': 'yann@needsome.coffee'})
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup

# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title

# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info

# extract full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"

    return desc

# extract category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread

# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)

    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)

    return liste

# check if a category has multiple pages and extract URLs
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]

    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1

    return url_list

# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_info

### TRANSFORMATION ###

# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url

# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url, product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]
    return info

### LOAD ###

# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)

    return file


# collect category from all
# then grab all product for each and write a file with category name
def main():
    # init
    url = "https://books.toscrape.com/"

    ### EXTRACTION ###
    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0

    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")

    # go ahead for each category
    for line in get_category_list(soup, url):
        category = line[0]
        category_url = line[1]

        total_category -= 1
        # display which category is being processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if there are multiple pages and create a URL list
        url_list = check_for_pages(category_url)

        # get the product list for each URL in url_list and extend the main product URL list with it
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)

        ### TRANSFORMATION ###
        # PHASE 3 : combine with phase 1 and write in csv for each url from product_url_list named with category
        data = []
        img_nb = 1
        for page_url in product_url_list:
            page_soup = get_html(page_url)
            # print(page_soup)
            # print(phase1.get_category(page_soup))
            # print(phase1.get_data(page_soup, page_url))
            data.append(get_data(page_soup, page_url))

            ### LOAD ###
            # PHASE 4 : get img for every book and name it with category and incremental number
            img_url = get_image_url(page_soup, page_url)
            with open(category + "-" + str(img_nb) + ".png", "wb") as img_file:
                img_file.write(requests.get(img_url).content)
            img_nb += 1

        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")

        ### LOAD ###
        print("Done.\n Fichier " + data_output(data, category))

    print("\n Traitement terminé.")

if __name__ == '__main__':
    main()
phase4/screenshot.jpg (Normal file, BIN): binary file not shown. After: 31 KiB
rendu/README.md (Normal file, 87 lines)
@@ -0,0 +1,87 @@
# Books Online

Price monitoring for the books at "Books To Scrape"

## Introduction

These instructions let you:
- retrieve the program,
- install the environment needed to run it,
- run it,
- see what it produces


### Prerequisites

```
packages: python 3.11, python3.11-venv, git
modules: python requests, BeautifulSoup, csv, os
```

### Installation

Here are the steps to follow to get a working execution environment:

create the virtual environment

```
python3.11 -m venv env
source env/bin/activate
```
clone the repository and move into the right folder
```
git clone https://mcstn.fr/gitea/Yann/Projet2.git
cd Projet2/rendu
```
install the modules
```
pip install -r requirements.txt
```

## Running

run the command:
```
python3 main.py
```

## Result

The files are placed in a "resultat" directory

The program collects the categories from the home page of the URL, then, for each category:
1. prints the category being processed, the number of remaining categories, and the number of books present, processed in total, and remaining
2. creates a folder named after the category and saves the book images in it, named after their titles
3. creates a CSV file named after the category, with:
   - product_page_url
   - universal_ product_code (upc)
   - title
   - price_including_tax
   - price_excluding_tax
   - number_available
   - product_description
   - category
   - review_rating
   - image_url

```
$ time python3.11 main.py
1000 à traiter répartis en 50 catégories.

[ ... ]

Traitement terminé.

real	20m17,783s
user	4m30,695s
sys	0m3,172s
```
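To sanity-check one of the generated files, it can be read back with Python's `csv.DictReader`; the `resultat/travel.csv` path below is just an example of the per-category files described above:

```
import csv

# hypothetical example path; one CSV per category is written under resultat/
with open("resultat/travel.csv", newline="") as csv_file:
    for row in csv.DictReader(csv_file):
        print(row["title"], row["price_including_tax"])
```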
## Author

Yann <yann@needsome.coffee>



## License

N/A
rendu/main.py (Normal file, 226 lines)
@@ -0,0 +1,226 @@
import requests
from bs4 import BeautifulSoup
import csv
import os

### EXTRACTION ###

# get soup from url
def get_html(url):
    r = requests.get(url, headers = {'User-agent': 'yann@needsome.coffee'})
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup

# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title

# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info

# extract full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"

    return desc

# extract category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread

# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)

    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)

    return liste

# check if a category has multiple pages and extract URLs
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]

    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1

    return url_list

# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_info

### TRANSFORMATION ###

# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url

# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url,
        product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]
    return info

### LOAD ###

# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)

    return file


# collect category from all
# then grab all product for each and write a file with category name
def main():
    # init
    url = "https://books.toscrape.com/"
    os.mkdir("resultat")

    ### EXTRACTION ###
    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0

    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")

    # go ahead for each category
    for line in get_category_list(soup, url):
        # remove spaces in the category name to prevent potential issues when creating the directory
        category = line[0].replace(' ', '_')

        category_url = line[1]
        category_path = "resultat/" + category
        total_category -= 1

        # display which category is being processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if there are multiple pages and create a URL list
        url_list = check_for_pages(category_url)

        # get the product list for each URL in url_list and extend the main product URL list with it
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)


        # PHASE 3 : combine with phase 1 and write in csv for each url from product_url_list named with category
        data = []
        img_nb = 1

        # go ahead for each product of the category
        # EXTRACT data for each product page
        for page_url in product_url_list:

            # create the category directory. If it already exists, just continue
            try:
                os.mkdir(category_path)
            except FileExistsError:
                pass

            # EXTRACT data : html from product page, and product data from the page
            page_soup = get_html(page_url)

            # EXTRACTION + TRANSFORMATION
            product_data = get_data(page_soup, page_url)

            # LOAD data in a list
            data.append(product_data)

            # protect path creation by removing "/" in the product name
            img_name = (product_data[2] + ".png").replace('/', '_')

            # PHASE 4 : get the image for every book and name it after the title
            # EXTRACT image data -url, title, binary content- and LOAD binary content in a file named with the title
            img_url = get_image_url(page_soup, page_url)
            with open(category_path + "/" + img_name, "wb") as img_file:
                img_file.write(requests.get(img_url).content)
            img_nb += 1

        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")

        # LOAD : write the list in the CSV file
        print("Done.\n Fichier " + data_output(data, category_path))

    print("\n Traitement terminé.")

if __name__ == '__main__':
    main()
rendu/requirements.txt (Normal file, 2 lines)
@@ -0,0 +1,2 @@
beautifulsoup4==4.12.3
requests==2.32.3