Compare commits: 5d6a9bc263 ... main (33 commits)
| SHA1 |
|---|
| 913968b8c6 |
| b99c187bc5 |
| ec4ad03fca |
| 9398f8fae3 |
| 6cb7913af2 |
| cece9d1874 |
| b74090865e |
| 6fa035fc1a |
| aa0d3a7819 |
| 73b302a2bc |
| 90f3b22efb |
| 4785b2e6d8 |
| 22ccd97fa3 |
| 852c0e781b |
| c9aaef7222 |
| 549291cd6c |
| b34a5d123c |
| ebd5f5acd4 |
| d020998add |
| 12dd0c9dfc |
| dd370cca8d |
| c35f7454a2 |
| 4247f1ac83 |
| 2bbf684c26 |
| 27d37fb5d3 |
| 50ca4fccd8 |
| c92ce51aa0 |
| 8213f0849c |
| e3ac12ff9b |
| 7e6875a497 |
| c0fcd21346 |
| 7b7f216be8 |
| 3a6cf9b87e |
@@ -1,3 +1,7 @@
A folder for each phase of the project
# INTRODUCTION

This repository contains a folder for each phase of the project, with the corresponding instructions in each README.

The content to review for the **soutenance** (project defense) is in the **"rendu"** folder


Each with a README containing the instructions
@@ -36,7 +36,12 @@ def product_description(soup):
    desc = soup.find("p", class_='').string
    return desc

#create a list with all information consecutively
# get category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread

# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [url, product_information(soup)['UPC'],
@@ -45,15 +50,25 @@ def get_data(soup, url):
            product_information(soup)['Price (excl. tax)'],
            product_information(soup)['Availability'],
            product_description(soup),
            "TODO",
            get_category(soup),
            product_information(soup)['Number of reviews'],
            get_image_url(soup, url)
            ]
    return info

#write the file
# write the file
def data_output(info, file):
    fieldnames = ['product_page_url', 'universal_ product_code (upc)', 'title', 'price_including_tax', 'price_excluding_tax', 'number_available', 'product_description', 'category', 'review_rating', 'image_url']
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
phase2/README.md (Normal file, 17 lines)
@@ -0,0 +1,17 @@
# Phase 2

Now that you have obtained the information about a first book, you can
try to retrieve all the data needed for an entire
category of books.

Choose any category on the Books to Scrape site. Write a Python
script that visits the page of the chosen category and extracts the URL of the
product page of every book belonging to that category.

Combine this with the work you have already done in phase 1 in order
to extract the product data of all the books in the chosen category, then write
the data to a single CSV file.

Note: some category pages contain more than 20 books, which are
therefore spread over several pages ("pagination"). Your application must be
able to walk through the multiple pages automatically when they are present.
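A minimal sketch of the pagination walk described above; the 20-books-per-page figure and the `page-N.html` naming follow how Books to Scrape paginates its category pages, and the function name is only illustrative (`phase2/main.py` below does the same job in `check_for_pages`):

```
import requests
from bs4 import BeautifulSoup

def category_page_urls(category_index_url):
    # read the total number of books announced on the category index page
    soup = BeautifulSoup(requests.get(category_index_url).content, "html.parser")
    total = int(soup.form.strong.text)

    # one URL per page of 20 books: index.html, then page-2.html, page-3.html, ...
    urls = [category_index_url]
    base = category_index_url.replace("index.html", "")
    for page in range(2, (total + 19) // 20 + 1):
        urls.append(base + "page-" + str(page) + ".html")
    return urls
```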
phase2/main.py (Normal file, 90 lines)
@@ -0,0 +1,90 @@
import requests
from bs4 import BeautifulSoup
import phase1


# get soup from url
def get_html(url):
    r = requests.get(url)
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li"):
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_dict


def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]

    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1

    return url_list


# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)

    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)

    return liste


def main():
    # init
    url = "https://books.toscrape.com/"
    category = "default"

    # get functional variables
    soup = get_html(url)
    liste_categories = get_category_list(soup, url)

    # get category URL to do some tests on it
    category_url = liste_categories[category]
    print(category_url)

    # check if there are multiple pages and get the url list
    url_list = check_for_pages(category_url)
    # print("Liste des URLs des pages: ", url_list)

    # get the product list for each url in url_list and extend the main product url list with it
    product_url_list = []
    for i in url_list:
        product_url_list.extend(get_product_url_list(i, url))
    # print("Liste des URL des produits: ", product_url_list)
    print("Nombre de livres: ", len(product_url_list))

    # combine with phase 1: build a row for each url from product_url_list and write a csv named after the category
    data = []
    for page_url in product_url_list:
        page_soup = get_html(page_url)
        # print(page_soup)
        # print(phase1.get_category(page_soup))
        # print(phase1.get_data(page_soup, page_url))
        data.append(phase1.get_data(page_soup, page_url))

    print("Done.\n Fichier " + phase1.data_output(data, category))


if __name__ == '__main__':
    main()
phase2/phase1.py (Normal file, 113 lines)
@@ -0,0 +1,113 @@
import requests
from bs4 import BeautifulSoup
import csv


def extract_web(url):
    r = requests.get(url)
    page = r.content
    return page


# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# get full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"

    return desc


# get category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread


# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url, product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]

    return info


# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)

    return file


def main():

    url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"

    html = extract_web(url)
    soup = BeautifulSoup(html, "html.parser")
    test = product_information(soup)
    print(test['Availability'])

    info = get_data(soup, url)
    print(info)
    # data_output expects a list of rows and appends the ".csv" extension itself
    data_output([info], 'output')


if __name__ == "__main__":
    main()
phase3/README.md (Normal file, 14 lines)
@@ -0,0 +1,14 @@
# Phase 3

Next, extend your work by writing a script that visits the Books
to Scrape site, extracts all the available book categories, and then extracts the
product information of all the books belonging to all the different
categories.

You will need to write the data to a separate CSV file for
each book category.


# Result


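A compact outline of the per-category loop this phase asks for, reusing the helper names defined in `phase3/main.py` just below (`get_html`, `get_category_list`, `check_for_pages`, `get_product_url_list`, `get_data`, `data_output`); it is a sketch of the flow, not a replacement for that script:

```
def scrape_all_categories(site_url="https://books.toscrape.com/"):
    soup = get_html(site_url)

    # one CSV per category, named after the category
    for category, category_url in get_category_list(soup, site_url):
        rows = []
        for page_url in check_for_pages(category_url):
            for product_url in get_product_url_list(page_url, site_url):
                rows.append(get_data(get_html(product_url), product_url))
        data_output(rows, category)
```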
phase3/main.py (Normal file, 188 lines)
@@ -0,0 +1,188 @@
import requests
from bs4 import BeautifulSoup
import csv

# get soup from url
def get_html(url):
    r = requests.get(url)
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup

# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title

# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info

# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url

# get full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"

    return desc

# get category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread

# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url, product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]

    return info

# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)

    return file


# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_info


def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]

    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1

    return url_list


# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)

    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)

    return liste

# collect category from all
# then grab all product for each and write a file with category name
def main():
    # init
    url = "https://books.toscrape.com/"

    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0

    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")
    # go ahead for each category
    for line in get_category_list(soup, url):
        category = line[0]
        category_url = line[1]

        total_category -= 1
        # display which category is being processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if there are multiple pages and create a URL list
        url_list = check_for_pages(category_url)

        # get the product list for each URL in url_list and extend the main product URL list with it
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)

        # combine with phase 1: build a row for each url from product_url_list and write a csv named after the category
        data = []
        for page_url in product_url_list:
            page_soup = get_html(page_url)
            # print(page_soup)
            # print(phase1.get_category(page_soup))
            # print(phase1.get_data(page_soup, page_url))
            data.append(get_data(page_soup, page_url))

        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")
        print("Done.\n Fichier " + data_output(data, category))

    print("\n Traitement terminé.")

if __name__ == '__main__':
    main()
phase3/screenshot.png (Normal file, BIN): binary file not shown. After: 24 KiB
phase4/README.md (Normal file, 8 lines)
@@ -0,0 +1,8 @@
# Phase 4

Finally, extend your existing work to download and save the image
file of every product page you visit.

# Result


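A minimal sketch of the download-and-save step, assuming the `get_html` and `get_image_url` helpers defined in `phase4/main.py` below; the destination path is only an example:

```
import requests

def save_product_image(product_page_url, dest_path):
    # fetch the product page, locate its cover image, and store the binary content
    page_soup = get_html(product_page_url)
    img_url = get_image_url(page_soup, product_page_url)
    with open(dest_path, "wb") as img_file:
        img_file.write(requests.get(img_url).content)
    return dest_path
```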
phase4/main.py (Normal file, 204 lines)
@@ -0,0 +1,204 @@
import requests
from bs4 import BeautifulSoup
import csv

### EXTRACTION ###

# get soup from url
def get_html(url):
    r = requests.get(url, headers = {'User-agent': 'yann@needsome.coffee'})
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup

# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title

# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info

# extract full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"

    return desc

# extract category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread

# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)

    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)

    return liste

# check if a category has multiple pages and extract URLs
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]

    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1

    return url_list

# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_info

### TRANSFORMATION ###

# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url

# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url, product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]
    return info

### LOAD ###

# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)

    return file


# collect category from all
# then grab all product for each and write a file with category name
def main():
    # init
    url = "https://books.toscrape.com/"

    ### EXTRACTION ###
    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0

    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")

    # go ahead for each category
    for line in get_category_list(soup, url):
        category = line[0]
        category_url = line[1]

        total_category -= 1
        # display which category is being processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if there are multiple pages and create a URL list
        url_list = check_for_pages(category_url)

        # get the product list for each URL in url_list and extend the main product URL list with it
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)

        ### TRANSFORMATION ###
        # PHASE 3 : combine with phase 1 and write in csv for each url from product_url_list named with category
        data = []
        img_nb = 1
        for page_url in product_url_list:
            page_soup = get_html(page_url)
            # print(page_soup)
            # print(phase1.get_category(page_soup))
            # print(phase1.get_data(page_soup, page_url))
            data.append(get_data(page_soup, page_url))

            ### LOAD ###
            # PHASE 4 : get img for every book and name it with category and incremental number
            img_url = get_image_url(page_soup, page_url)
            with open(category + "-" + str(img_nb) + ".png", "wb") as img_file:
                img_file.write(requests.get(img_url).content)
            img_nb += 1

        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")

        ### LOAD ###
        print("Done.\n Fichier " + data_output(data, category))

    print("\n Traitement terminé.")

if __name__ == '__main__':
    main()
phase4/screenshot.jpg (Normal file, BIN): binary file not shown. After: 31 KiB
rendu/README.md (Normal file, 87 lines)
@@ -0,0 +1,87 @@
# Books Online

Price monitoring for the books at "Books To Scrape"

## Introduction

These instructions let you:
- retrieve the program,
- install the environment needed to run it,
- run it,
- see what it produces


### Prerequisites

```
packages: python 3.11, python3.11-venv, git
modules: python requests, BeautifulSoup, csv, os
```

### Installation

Here are the steps to follow to get a working execution environment:

create the virtual environment

```
python3.11 -m venv env
source env/bin/activate
```
clone the repository and move into the right folder
```
git clone https://mcstn.fr/gitea/Yann/Projet2.git
cd Projet2/rendu
```
install the modules
```
pip install -r requirements.txt
```

## Running

run the command:
```
python3 main.py
```

## Result

The files are placed in a "resultat" directory

The program collects the categories from the home page of the URL, then, for each category:
1. prints the category being processed, the number of remaining categories, and the number of books present, processed in total, and remaining
2. creates a folder named after the category and saves the book images in it, named after their titles
3. creates a CSV file named after the category, with:
   - product_page_url
   - universal_ product_code (upc)
   - title
   - price_including_tax
   - price_excluding_tax
   - number_available
   - product_description
   - category
   - review_rating
   - image_url

```
$ time python3.11 main.py
1000 à traiter répartis en 50 catégories.

[ ... ]

Traitement terminé.

real	20m17,783s
user	4m30,695s
sys	0m3,172s
```
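To sanity-check one of the generated files, it can be read back with Python's `csv.DictReader`; the `resultat/travel.csv` path below is just an example of the per-category files described above:

```
import csv

# hypothetical example path; one CSV per category is written under resultat/
with open("resultat/travel.csv", newline="") as csv_file:
    for row in csv.DictReader(csv_file):
        print(row["title"], row["price_including_tax"])
```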
## Author

Yann <yann@needsome.coffee>



## License

N/A
rendu/main.py (Normal file, 226 lines)
@@ -0,0 +1,226 @@
import requests
from bs4 import BeautifulSoup
import csv
import os

### EXTRACTION ###

# get soup from url
def get_html(url):
    r = requests.get(url, headers = {'User-agent': 'yann@needsome.coffee'})
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup

# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title

# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info

# extract full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"

    return desc

# extract category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread

# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)

    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)

    return liste

# check if a category has multiple pages and extract URLs
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]

    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1

    return url_list

# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_info

### TRANSFORMATION ###

# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url

# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url,
        product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]
    return info

### LOAD ###

# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)

    return file


# collect category from all
# then grab all product for each and write a file with category name
def main():
    # init
    url = "https://books.toscrape.com/"
    os.mkdir("resultat")

    ### EXTRACTION ###
    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0

    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")

    # go ahead for each category
    for line in get_category_list(soup, url):
        # remove spaces in the category name to prevent potential issues when creating the directory
        category = line[0].replace(' ', '_')

        category_url = line[1]
        category_path = "resultat/" + category
        total_category -= 1

        # display which category is being processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if there are multiple pages and create a URL list
        url_list = check_for_pages(category_url)

        # get the product list for each URL in url_list and extend the main product URL list with it
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)


        # PHASE 3 : combine with phase 1 and write in csv for each url from product_url_list named with category
        data = []
        img_nb = 1

        # go ahead for each product of the category
        # EXTRACT data for each product page
        for page_url in product_url_list:

            # create the category directory. If it already exists, just continue
            try:
                os.mkdir(category_path)
            except FileExistsError:
                pass

            # EXTRACT data : html from product page, and product data from the page
            page_soup = get_html(page_url)

            # EXTRACTION + TRANSFORMATION
            product_data = get_data(page_soup, page_url)

            # LOAD data in a list
            data.append(product_data)

            # protect path creation by removing "/" in the product name
            img_name = (product_data[2] + ".png").replace('/', '_')

            # PHASE 4 : get the image for every book and name it after the title
            # EXTRACT image data -url, title, binary content- and LOAD binary content in a file named with the title
            img_url = get_image_url(page_soup, page_url)
            with open(category_path + "/" + img_name, "wb") as img_file:
                img_file.write(requests.get(img_url).content)
            img_nb += 1

        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")

        # LOAD : write the list in the CSV file
        print("Done.\n Fichier " + data_output(data, category_path))

    print("\n Traitement terminé.")

if __name__ == '__main__':
    main()
rendu/requirements.txt (Normal file, 2 lines)
@@ -0,0 +1,2 @@
beautifulsoup4==4.12.3
requests==2.32.3