init phase4
parent c9aaef7222
commit 852c0e781b

2  phase4/README.md  Normal file
@@ -0,0 +1,2 @@
# Phase 4

188  phase4/main.py  Normal file
@@ -0,0 +1,188 @@
import requests
from bs4 import BeautifulSoup
import csv


# get soup from url
def get_html(url):
    r = requests.get(url)
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup
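
# e.g. get_html("https://books.toscrape.com/") returns a BeautifulSoup tree
# of the home page; the extractors below expect a product-page soup.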


# extract the product title from the page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product information from the table into a dict
# and extract the quantity from its string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info
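
# Illustrative example: on a product page the table yields something like
# {'UPC': '...', 'Price (incl. tax)': '£51.77', 'Availability': 22, ...},
# where "In stock (22 available)" has been reduced to the integer 22.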


# get the relative link from the page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url
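
# Illustrative example: with url ".../catalogue/some-book_1/index.html" and
# src "../../media/cache/ab/cd/image.jpg", this builds
# "https://books.toscrape.com/media/cache/ab/cd/image.jpg".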


# get the full description as a string
# luckily this <p> is the only one without a class
# and handle the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"

    return desc


# get the category from the breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread


# create a list with all the information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    # parse the table once instead of once per field
    product_info = product_information(soup)
    info = [
        url,
        product_info['UPC'],
        get_title(soup),
        product_info['Price (incl. tax)'],
        product_info['Price (excl. tax)'],
        product_info['Availability'],
        product_description(soup),
        get_category(soup),
        product_info['Number of reviews'],
        get_image_url(soup, url)
    ]

    return info
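
# The row order matches the fieldnames written by data_output() below:
# [url, upc, title, price_incl_tax, price_excl_tax, availability,
#  description, category, review_count, image_url].
# Note: the 'review_rating' column is filled with the table's
# 'Number of reviews' value here, not the star rating.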


# write the csv file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    # newline='' keeps csv from inserting blank rows on Windows;
    # utf-8 handles the £ sign in prices
    with open(file, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)

    return file
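
# e.g. data_output(data, "travel") writes travel.csv and returns "travel.csv"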


# get each category name and URL from the side div
# and return them as a list of [category, url] pairs
def get_category_list(soup, url):
    catego_info = []
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip=True).lower(), url + li.a.get('href')])
    return catego_info
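
# Illustrative example: returns pairs such as
# ['travel', 'https://books.toscrape.com/catalogue/category/books/travel_2/index.html']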


# the site paginates at 20 books per page; build the list of page URLs
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]

    if total > 20:
        new_url_base = category_url.replace('index.html', '')
        # (total - 1) // 20 extra pages, so an exact multiple of 20
        # doesn't produce a page-N URL that doesn't exist
        for j in range(2, (total - 1) // 20 + 2):
            url_list.append(new_url_base + "page-" + str(j) + ".html")

    return url_list
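
# Illustrative example: a category of 32 books yields
# ['.../index.html', '.../page-2.html']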


# get the product url list from a given category page URL;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)

    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)

    return liste
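
# Illustrative example: an article href like "../../../some-book_1/index.html"
# becomes "https://books.toscrape.com/catalogue/some-book_1/index.html".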


# collect the category list from the home page,
# then grab every product for each category and write a file named after the category
def main():
    # init
    url = "https://books.toscrape.com/"

    # get html from URL
    soup = get_html(url)

    # init counters
    category_list = get_category_list(soup, url)
    total_category = len(category_list)
    total_books = int(soup.form.strong.text)
    processed_books = 0

    print(total_books, "books to process across", total_category, "categories.\nGo.")
    # go ahead for each category
    for category, category_url in category_list:

        total_category -= 1
        # display which category is being processed
        print("\n -> Processing category: " + category)

        # check if there are multiple pages and build a URL list
        url_list = check_for_pages(category_url)

        # get the product list for each page and extend the main product url list
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        print(len(product_url_list), "books found")
        processed_books += len(product_url_list)

        # combine with phase 1: scrape each url from product_url_list
        # and write a csv named after the category
        data = []
        for page_url in product_url_list:
            page_soup = get_html(page_url)
            data.append(get_data(page_soup, page_url))

        print(processed_books, "books processed")
        print(total_books - processed_books, "books remaining")
        print(total_category, "categories remaining")
        print("Done.\n File " + data_output(data, category))

    print("\n Processing complete.")


if __name__ == '__main__':
    main()
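
# Running `python main.py` from the phase4 directory should, given the site
# layout assumed above, write one <category>.csv per category into the
# working directory.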