re-organize code to show ETL phases, add comments
parent 73b302a2bc, commit aa0d3a7819

phase4/main.py | 109
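
At a glance, the reorganized phase4/main.py groups its helpers under three section banners. The outline below is inferred from the hunks that follow and is not itself part of the diff:

    ### EXTRACTION ###
        get_html, product_information, product_description, get_category,
        get_product_url_list, check_for_pages, get_category_list
    ### TRANSFORMATION ###
        get_image_url, get_data
    ### LOAD ###
        data_output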
@@ -2,6 +2,8 @@ import requests
 from bs4 import BeautifulSoup
 import csv
 
+### EXTRACTION ###
+
 # get soup from url
 def get_html(url):
     r = requests.get(url, headers = {'User-agent': 'yann@needsome.coffee'})
@@ -25,14 +27,7 @@ def product_information(soup):
     product_info['Availability'] = availability
     return product_info
 
-# get relative link from page and build the full URL
-def get_image_url(soup, url):
-    link = soup.img.get('src')
-    url_site = "https://" + url.split('/')[2]
-    img_url = url_site + "/" + link.replace('../', '')
-    return img_url
-
-# get full description as string
+# extract full description as string
 # luckily this <p> was the only one without class
 # and manage the case where there's no description
 def product_description(soup):
@@ -43,11 +38,58 @@ def product_description(soup):
 
     return desc
 
-# get category from breadcrumb
+# extract category from breadcrumb
 def get_category(soup):
     bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
     return bread
 
+# get product list url from a given url category page;
+# extract and build each product url using the main url (second arg)
+def get_product_url_list(url_category_page, url):
+    liste = []
+    soup = get_html(url_category_page)
+
+    for i in soup.find_all("article"):
+        relative_url = i.h3.a.get('href')
+        product_url = url + "catalogue/" + relative_url.split('../')[-1]
+        liste.append(product_url)
+
+    return liste
+
+# check if a category has multiple pages and extract URLs
+def check_for_pages(category_url):
+    soup_catego = get_html(category_url)
+    total = int(soup_catego.form.strong.text)
+    url_list = [category_url]
+
+    if total > 20:
+        new_url_base = category_url.replace('index.html','')
+        j = 2
+        for i in range(total//20):
+            page = "page-" + str(j) + ".html"
+            url_list.append(new_url_base + page)
+            j += 1
+
+    return url_list
+
+# get category and URL from side div and put them as a list [catego, url] in a list
+def get_category_list(soup, url):
+    catego_info = []
+    catego_dict = {}
+    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
+        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
+        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
+    return catego_info
+
+
+### TRANSFORMATION ###
+
+# get relative link from page and build the full URL
+def get_image_url(soup, url):
+    link = soup.img.get('src')
+    url_site = "https://" + url.split('/')[2]
+    img_url = url_site + "/" + link.replace('../', '')
+    return img_url
+
 # create a list with all information consecutively
 # /!\ don't know if that's the best way
 def get_data(soup, url):
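
To illustrate the new pagination helper: the code assumes 20 books per listing page, so for a hypothetical category index reporting 65 results, check_for_pages would return the index page plus pages 2 through 4 (the category slug below is made up for the example):

    >>> check_for_pages("https://books.toscrape.com/catalogue/category/books/some-category_1/index.html")
    ['https://books.toscrape.com/catalogue/category/books/some-category_1/index.html',
     'https://books.toscrape.com/catalogue/category/books/some-category_1/page-2.html',
     'https://books.toscrape.com/catalogue/category/books/some-category_1/page-3.html',
     'https://books.toscrape.com/catalogue/category/books/some-category_1/page-4.html']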
@@ -62,10 +104,10 @@ def get_data(soup, url):
             product_information(soup)['Number of reviews'],
             get_image_url(soup, url)
             ]
 
     return info
 
-
+### LOAD ###
 
 # write the file
 def data_output(info, file):
     file = file + ".csv"
@@ -90,45 +132,6 @@ def data_output(info, file):
 
 
 
-# get category and URL from side div and put them as a list [catego, url] in a list
-def get_category_list(soup, url):
-    catego_info = []
-    catego_dict = {}
-    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
-        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
-        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
-    return catego_info
-
-
-def check_for_pages(category_url):
-    soup_catego = get_html(category_url)
-    total = int(soup_catego.form.strong.text)
-    url_list = [category_url]
-
-    if total > 20:
-        new_url_base = category_url.replace('index.html','')
-        j = 2
-        for i in range(total//20):
-            page = "page-" + str(j) + ".html"
-            url_list.append(new_url_base + page)
-            j += 1
-
-    return url_list
-
-
-# get product list url from a given url category page;
-# extract and build each product url using the main url (second arg)
-def get_product_url_list(url_category_page, url):
-    liste = []
-    soup = get_html(url_category_page)
-
-    for i in soup.find_all("article"):
-        relative_url = i.h3.a.get('href')
-        product_url = url + "catalogue/" + relative_url.split('../')[-1]
-        liste.append(product_url)
-
-    return liste
-
 # collect category from all
 # then grab all product for each and write a file with category name
 
@@ -136,6 +139,7 @@ def main():
     # init
     url = "https://books.toscrape.com/"
 
+    ### EXTRACTION ###
     # get html from URL
     soup = get_html(url)
 
@@ -145,6 +149,7 @@ def main():
     processed_books = 0
 
     print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")
+
     # go ahead for each category
     for line in get_category_list(soup, url):
         category = line[0]
@@ -165,6 +170,7 @@ def main():
         print(len(product_url_list), " livres présents")
         processed_books += len(product_url_list)
 
+        ### TRANSFORMATION ###
         # PHASE 3 : combine with phase 1 and write in csv for each url from product_url_list named with category
         data = []
         img_nb = 1
@@ -175,6 +181,7 @@ def main():
             # print(phase1.get_data(page_soup, page_url))
             data.append(get_data(page_soup, page_url))
 
+            ### LOAD ###
             # PHASE 4 : get img for every book and name it with category and incremental number
             img_url = get_image_url(page_soup, page_url)
             with open(category + "-" + str(img_nb) + ".png", "wb") as img_file:
@@ -184,6 +191,8 @@ def main():
         print(processed_books, " livres traités")
         print(total_books - processed_books, " livres restants")
         print(total_category, " catégories restantes")
 
+        ### LOAD ###
+
         print("Done.\n Fichier " + data_output(data, category))
 
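
For context, here is a minimal sketch of how the relocated helpers chain together in the extract, transform, load order that main() follows after this commit. The function names and signatures come from the diff above; the surrounding loop structure is a simplified approximation, not the exact body of main():

    url = "https://books.toscrape.com/"
    soup = get_html(url)                                          # EXTRACTION: fetch and parse the home page

    for category, category_url in get_category_list(soup, url):
        data = []
        for page_url in check_for_pages(category_url):            # one or more listing pages per category
            for product_url in get_product_url_list(page_url, url):
                page_soup = get_html(product_url)
                data.append(get_data(page_soup, product_url))     # TRANSFORMATION: one row of fields per book
        data_output(data, category)                               # LOAD: write <category>.csv

The image download step (PHASE 4 in the diff) slots into the innermost loop in the same way, using get_image_url and the with open(...) block shown above.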