re-organize code to show ETL phases, add comments
parent 73b302a2bc, commit aa0d3a7819

phase4/main.py | 109
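
At a glance, the reorganized phase4/main.py groups its helpers under three section banners. The outline below is inferred from the hunks that follow and is not itself part of the diff:

    ### EXTRACTION ###
        get_html, product_information, product_description, get_category,
        get_product_url_list, check_for_pages, get_category_list
    ### TRANSFORMATION ###
        get_image_url, get_data
    ### LOAD ###
        data_output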
@@ -2,6 +2,8 @@ import requests
 from bs4 import BeautifulSoup
 import csv
 
+### EXTRACTION ###
+
 # get soup from url
 def get_html(url):
     r = requests.get(url, headers = {'User-agent': 'yann@needsome.coffee'})
@@ -25,14 +27,7 @@ def product_information(soup):
     product_info['Availability'] = availability
     return product_info
 
-# get relative link from page and build the full URL
-def get_image_url(soup, url):
-    link = soup.img.get('src')
-    url_site = "https://" + url.split('/')[2]
-    img_url = url_site + "/" + link.replace('../', '')
-    return img_url
-
-# get full description as string
+# extract full description as string
 # luckily this <p> was the only one without class
 # and manage the case where there's no description
 def product_description(soup):
@@ -43,11 +38,58 @@ def product_description(soup):
 
     return desc
 
-# get category from breadcrumb
+# extract category from breadcrumb
 def get_category(soup):
     bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
     return bread
 
+# get product list url from a given url category page;
+# extract and build each product url using the main url (second arg)
+def get_product_url_list(url_category_page, url):
+    liste = []
+    soup = get_html(url_category_page)
+
+    for i in soup.find_all("article"):
+        relative_url = i.h3.a.get('href')
+        product_url = url + "catalogue/" + relative_url.split('../')[-1]
+        liste.append(product_url)
+
+    return liste
+
+# check if a category has multiple pages and extract URLs
+def check_for_pages(category_url):
+    soup_catego = get_html(category_url)
+    total = int(soup_catego.form.strong.text)
+    url_list = [category_url]
+
+    if total > 20:
+        new_url_base = category_url.replace('index.html','')
+        j = 2
+        for i in range(total//20):
+            page = "page-" + str(j) + ".html"
+            url_list.append(new_url_base + page)
+            j += 1
+
+    return url_list
+
+# get category and URL from side div and put them as a list [catego, url] in a list
+def get_category_list(soup, url):
+    catego_info = []
+    catego_dict = {}
+    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
+        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
+        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
+    return catego_info
+
+
+### TRANSFORMATION ###
+
+# get relative link from page and build the full URL
+def get_image_url(soup, url):
+    link = soup.img.get('src')
+    url_site = "https://" + url.split('/')[2]
+    img_url = url_site + "/" + link.replace('../', '')
+    return img_url
+
 # create a list with all information consecutively
 # /!\ don't know if that's the best way
 def get_data(soup, url):
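
To illustrate the new pagination helper: the code assumes 20 books per listing page, so for a hypothetical category index reporting 65 results, check_for_pages would return the index page plus pages 2 through 4 (the category slug below is made up for the example):

    >>> check_for_pages("https://books.toscrape.com/catalogue/category/books/some-category_1/index.html")
    ['https://books.toscrape.com/catalogue/category/books/some-category_1/index.html',
     'https://books.toscrape.com/catalogue/category/books/some-category_1/page-2.html',
     'https://books.toscrape.com/catalogue/category/books/some-category_1/page-3.html',
     'https://books.toscrape.com/catalogue/category/books/some-category_1/page-4.html']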
@@ -62,10 +104,10 @@ def get_data(soup, url):
             product_information(soup)['Number of reviews'],
             get_image_url(soup, url)
             ]
 
     return info
 
-
+### LOAD ###
 
 # write the file
 def data_output(info, file):
     file = file + ".csv"
@@ -90,45 +132,6 @@ def data_output(info, file):
 
 
 
-# get category and URL from side div and put them as a list [catego, url] in a list
-def get_category_list(soup, url):
-    catego_info = []
-    catego_dict = {}
-    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
-        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
-        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
-    return catego_info
-
-
-def check_for_pages(category_url):
-    soup_catego = get_html(category_url)
-    total = int(soup_catego.form.strong.text)
-    url_list = [category_url]
-
-    if total > 20:
-        new_url_base = category_url.replace('index.html','')
-        j = 2
-        for i in range(total//20):
-            page = "page-" + str(j) + ".html"
-            url_list.append(new_url_base + page)
-            j += 1
-
-    return url_list
-
-
-# get product list url from a given url category page;
-# extract and build each product url using the main url (second arg)
-def get_product_url_list(url_category_page, url):
-    liste = []
-    soup = get_html(url_category_page)
-
-    for i in soup.find_all("article"):
-        relative_url = i.h3.a.get('href')
-        product_url = url + "catalogue/" + relative_url.split('../')[-1]
-        liste.append(product_url)
-
-    return liste
-
 # collect category from all
 # then grab all product for each and write a file with category name
 
@@ -136,6 +139,7 @@ def main():
     # init
     url = "https://books.toscrape.com/"
 
+    ### EXTRACTION ###
     # get html from URL
     soup = get_html(url)
 
@@ -145,6 +149,7 @@ def main():
     processed_books = 0
 
     print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")
+
     # go ahead for each category
     for line in get_category_list(soup, url):
         category = line[0]
@@ -165,6 +170,7 @@ def main():
         print(len(product_url_list), " livres présents")
         processed_books += len(product_url_list)
 
+        ### TRANSFORMATION ###
         # PHASE 3 : combine with phase 1 and write in csv for each url from product_url_list named with category
         data = []
         img_nb = 1
@@ -175,6 +181,7 @@ def main():
             # print(phase1.get_data(page_soup, page_url))
             data.append(get_data(page_soup, page_url))
 
+            ### LOAD ###
             # PHASE 4 : get img for every book and name it with category and incremental number
             img_url = get_image_url(page_soup, page_url)
             with open(category + "-" + str(img_nb) + ".png", "wb") as img_file:
@@ -184,6 +191,8 @@ def main():
         print(processed_books, " livres traités")
         print(total_books - processed_books, " livres restants")
         print(total_category, " catégories restantes")
 
+        ### LOAD ###
+
         print("Done.\n Fichier " + data_output(data, category))
 
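
For context, here is a minimal sketch of how the relocated helpers chain together in the extract, transform, load order that main() follows after this commit. The function names and signatures come from the diff above; the surrounding loop structure is a simplified approximation, not the exact body of main():

    url = "https://books.toscrape.com/"
    soup = get_html(url)                                          # EXTRACTION: fetch and parse the home page

    for category, category_url in get_category_list(soup, url):
        data = []
        for page_url in check_for_pages(category_url):            # one or more listing pages per category
            for product_url in get_product_url_list(page_url, url):
                page_soup = get_html(product_url)
                data.append(get_data(page_soup, product_url))     # TRANSFORMATION: one row of fields per book
        data_output(data, category)                               # LOAD: write <category>.csv

The image download step (PHASE 4 in the diff) slots into the innermost loop in the same way, using get_image_url and the with open(...) block shown above.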