re-organize code to show ETL phases, add comments

yann 2024-11-19 12:40:23 +01:00
parent 73b302a2bc
commit aa0d3a7819


@@ -2,6 +2,8 @@ import requests
 from bs4 import BeautifulSoup
 import csv
 
+### EXTRACTION ###
+
 # get soup from url
 def get_html(url):
     r = requests.get(url, headers = {'User-agent': 'yann@needsome.coffee'})
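
For orientation, a minimal sketch of how this extraction helper is used. The tail of get_html is truncated in this hunk, so the parsing line and the html.parser choice below are assumptions rather than the committed body:

    import requests
    from bs4 import BeautifulSoup

    def get_html(url):
        # identify the scraper via a custom User-agent, as the commit does
        r = requests.get(url, headers={'User-agent': 'yann@needsome.coffee'})
        # assumption: the truncated body parses the response into soup like this
        return BeautifulSoup(r.content, 'html.parser')

    soup = get_html("https://books.toscrape.com/")
    print(soup.title.get_text(strip=True))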
@@ -25,14 +27,7 @@ def product_information(soup):
     product_info['Availability'] = availability
     return product_info
 
-# get relative link from page and build the full URL
-def get_image_url(soup, url):
-    link = soup.img.get('src')
-    url_site = "https://" + url.split('/')[2]
-    img_url = url_site + "/" + link.replace('../', '')
-    return img_url
-
-# get full description as string
+# extract full description as string
 # luckily this <p> was the only one without class
 # and manage the case where there's no description
 def product_description(soup):
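
The comments above pin the description to the only <p> without a class attribute, plus a fallback when a book has no description. The function body is cut off in this hunk, so the following is only a sketch of that selection logic; bs4's class_=False filter matches tags lacking a class:

    def product_description(soup):
        # the description paragraph is the only <p> with no class attribute
        tag = soup.find('p', class_=False)
        # some books have no description at all; fall back to an empty string
        return tag.get_text(strip=True) if tag else ''

Here soup is the object returned by get_html for a product page.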
@@ -43,11 +38,58 @@ def product_description(soup):
     return desc
 
-# get category from breadcrumb
+# extract category from breadcrumb
 def get_category(soup):
     bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
     return bread
 
+# get product list url from a given url category page;
+# extract and build each product url using the main url (second arg)
+def get_product_url_list(url_category_page, url):
+    liste = []
+    soup = get_html(url_category_page)
+    for i in soup.find_all("article"):
+        relative_url = i.h3.a.get('href')
+        product_url = url + "catalogue/" + relative_url.split('../')[-1]
+        liste.append(product_url)
+    return liste
+
+# check if a category has multiple pages and extract URLs
+def check_for_pages(category_url):
+    soup_catego = get_html(category_url)
+    total = int(soup_catego.form.strong.text)
+    url_list = [category_url]
+    if total > 20:
+        new_url_base = category_url.replace('index.html','')
+        j = 2
+        for i in range(total//20):
+            page = "page-" + str(j) + ".html"
+            url_list.append(new_url_base + page)
+            j += 1
+    return url_list
+
+# get category and URL from side div and put them as a list [catego, url] in a list
+def get_category_list(soup, url):
+    catego_info = []
+    catego_dict = {}
+    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
+        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
+        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
+    return catego_info
+
+### TRANSFORMATION ###
+
+# get relative link from page and build the full URL
+def get_image_url(soup, url):
+    link = soup.img.get('src')
+    url_site = "https://" + url.split('/')[2]
+    img_url = url_site + "/" + link.replace('../', '')
+    return img_url
+
 # create a list with all information consecutively
 # /!\ don't know if that's the best way
 def get_data(soup, url):
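
A note on the pagination arithmetic in check_for_pages above: the site lists 20 books per category page, so total books need ceil(total / 20) pages, while the loop appends total//20 extra pages after index.html. When total is an exact multiple of 20 (say 40), that requests a page-3.html that does not exist. A sketch of the ceiling variant, with the 20-per-page figure stated as an assumption:

    import math

    BOOKS_PER_PAGE = 20  # assumption: books.toscrape.com lists 20 items per page

    def extra_page_count(total):
        # pages beyond index.html: ceil(total / BOOKS_PER_PAGE) - 1
        return max(math.ceil(total / BOOKS_PER_PAGE) - 1, 0)

    assert extra_page_count(19) == 0  # index.html only
    assert extra_page_count(32) == 1  # index.html + page-2.html
    assert extra_page_count(40) == 1  # exact multiple: still two pages, not three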
@@ -62,10 +104,10 @@ def get_data(soup, url):
         product_information(soup)['Number of reviews'],
         get_image_url(soup, url)
     ]
     return info
 
+### LOAD ###
+
 # write the file
 def data_output(info, file):
     file = file + ".csv"
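
data_output is cut off after its first line here. Since main() prints its return value as the name of the written file, a plausible sketch of the load step looks like this; the writer loop and encoding are assumptions, only the .csv suffix line and the csv import are in the commit:

    import csv

    def data_output(info, file):
        file = file + ".csv"
        # assumption: one row per product list produced by get_data
        with open(file, 'w', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(info)
        # main() prints the returned value, so hand the filename back
        return file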
@@ -90,45 +132,6 @@ def data_output(info, file):
-# get category and URL from side div and put them as a list [catego, url] in a list
-def get_category_list(soup, url):
-    catego_info = []
-    catego_dict = {}
-    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
-        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
-        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
-    return catego_info
-
-def check_for_pages(category_url):
-    soup_catego = get_html(category_url)
-    total = int(soup_catego.form.strong.text)
-    url_list = [category_url]
-    if total > 20:
-        new_url_base = category_url.replace('index.html','')
-        j = 2
-        for i in range(total//20):
-            page = "page-" + str(j) + ".html"
-            url_list.append(new_url_base + page)
-            j += 1
-    return url_list
-
-# get product list url from a given url category page;
-# extract and build each product url using the main url (second arg)
-def get_product_url_list(url_category_page, url):
-    liste = []
-    soup = get_html(url_category_page)
-    for i in soup.find_all("article"):
-        relative_url = i.h3.a.get('href')
-        product_url = url + "catalogue/" + relative_url.split('../')[-1]
-        liste.append(product_url)
-    return liste
-
 # collect category from all
 # then grab all product for each and write a file with category name
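
One small observation on get_category_list as it stands: it also builds a catego_dict that is never returned or read. If the dict is leftover scaffolding, a trimmed sketch under that assumption behaves identically:

    def get_category_list(soup, url):
        catego_info = []
        for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
            # [category name, absolute category URL] pairs, lowercased
            catego_info.append([li.a.get_text(strip=True).lower(), url + li.a.get('href')])
        return catego_info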
@@ -136,6 +139,7 @@ def main():
     # init
     url = "https://books.toscrape.com/"
 
+    ### EXTRACTION ###
     # get html from URL
     soup = get_html(url)
@@ -145,6 +149,7 @@ def main():
     processed_books = 0
     print(total_books, " books to process across ", total_category, " categories.\nGo.")
 
+    # go ahead for each category
     for line in get_category_list(soup, url):
         category = line[0]
@@ -165,6 +170,7 @@ def main():
         print(len(product_url_list), " books found")
         processed_books += len(product_url_list)
 
+        ### TRANSFORMATION ###
         # PHASE 3: combine with phase 1 and write in csv for each url from product_url_list named with category
         data = []
         img_nb = 1
@@ -175,6 +181,7 @@ def main():
             # print(phase1.get_data(page_soup, page_url))
             data.append(get_data(page_soup, page_url))
 
+            ### LOAD ###
             # PHASE 4: get img for every book and name it with category and incremental number
             img_url = get_image_url(page_soup, page_url)
             with open(category + "-" + str(img_nb) + ".png", "wb") as img_file:
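
The hunk stops before the write into img_file. Given img_url and the "wb" handle above, the load step presumably streams the image bytes to disk; a sketch with a hypothetical save_image helper, the requests.get call being an assumption consistent with get_html:

    import requests

    def save_image(img_url, category, img_nb):
        # fetch the cover and write it to a category-numbered .png,
        # mirroring the open(..., "wb") pattern in the hunk above
        with open(category + "-" + str(img_nb) + ".png", "wb") as img_file:
            img_file.write(requests.get(img_url).content)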
@@ -184,6 +191,8 @@ def main():
     print(processed_books, " books processed")
     print(total_books - processed_books, " books remaining")
     print(total_category, " categories remaining")
 
+    ### LOAD ###
+
     print("Done.\nFile " + data_output(data, category))
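
Taken together, the phases read as: extract the category and product pages, transform each product page into a flat row with get_data, and load one CSV per category plus the cover images. A condensed sketch of that flow for a single category; run_category is a hypothetical helper, the other names are the commit's own:

    def run_category(url, category_name, category_url):
        data = []
        # EXTRACTION: every page of the category, then every product page
        for page_url in check_for_pages(category_url):
            for product_url in get_product_url_list(page_url, url):
                soup = get_html(product_url)
                # TRANSFORMATION: flatten one product page into a row
                data.append(get_data(soup, product_url))
        # LOAD: one CSV named after the category
        return data_output(data, category_name)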