Compare commits

...

31 Commits

SHA1 Message Date
913968b8c6 fix typo 2024-11-28 08:36:33 +01:00
b99c187bc5 add details for directories 2024-11-23 18:57:07 +01:00
ec4ad03fca add execution duration 2024-11-20 13:07:30 +01:00
9398f8fae3 adapt instructions with directory change 2024-11-20 11:56:51 +01:00
6cb7913af2 main folder to put files in it, replace category name space, fix / issue in img name 2024-11-20 11:56:10 +01:00
cece9d1874 remove the print() for test 2024-11-20 10:15:58 +01:00
b74090865e add info and structure 2024-11-20 10:12:21 +01:00
6fa035fc1a add finale folder, creates category dir and image name from title, full readme 2024-11-20 09:31:07 +01:00
aa0d3a7819 re-organize code to show ETL phases, add comments 2024-11-19 12:40:23 +01:00
73b302a2bc improve comments, indicate phases 2024-11-19 12:25:34 +01:00
90f3b22efb add a screenshot of the result 2024-11-14 15:08:05 +01:00
4785b2e6d8 add way to retrieve images : use requests and write binary in file. Name it with category and incremental number 2024-11-14 15:03:30 +01:00
22ccd97fa3 add content :/ 2024-11-14 14:49:50 +01:00
852c0e781b init phase4 2024-11-14 14:48:59 +01:00
c9aaef7222 add screenshot of the result 2024-11-14 14:27:23 +01:00
549291cd6c Upload files to "phase3": screenshot of the result (Signed-off-by: Yann <yann@needsome.coffee>) 2024-11-14 13:22:29 +00:00
b34a5d123c refactor output counters 2024-11-14 14:16:36 +01:00
ebd5f5acd4 works. Add processed book and book to go counters displayed 2024-11-14 14:07:04 +01:00
d020998add all functions in same place, and loop in main 2024-11-14 13:56:55 +01:00
12dd0c9dfc init phase3 2024-11-14 13:24:12 +01:00
dd370cca8d add title 2024-11-14 13:22:36 +01:00
c35f7454a2 add text for fancy output and remove previous print (were testing) 2024-11-14 13:20:33 +01:00
4247f1ac83 manage exception when no description 2024-11-14 13:19:21 +01:00
2bbf684c26 build main to call function from phase 1 : build data from each page and write file 2024-11-14 12:37:52 +01:00
27d37fb5d3 copy phase1/main.py as phase1.py to import in main 2024-11-14 12:35:35 +01:00
50ca4fccd8 just one loop to fill the list, "extend" with each page list 2024-11-14 10:47:58 +01:00
c92ce51aa0 refactor some comments 2024-11-13 17:24:44 +01:00
8213f0849c test if multiple page, get URL, create list of product, and refactor main 2024-11-13 17:09:06 +01:00
e3ac12ff9b get category_list from home and get product url from a category (if one page) 2024-11-13 15:46:48 +01:00
7e6875a497 create phase2 folder+readme 2024-11-13 13:48:34 +01:00
c0fcd21346 add get_category 2024-11-13 13:44:27 +01:00
14 changed files with 963 additions and 5 deletions


@@ -1,3 +1,7 @@
-One folder for each phase of the project
-Each one with a README containing the instructions
+# INTRODUCTION
+This repository contains one folder for each phase of the project, with the respective instructions in each README.
+The content to review for the **project defense** is in the **"rendu"** folder


@@ -36,7 +36,12 @@ def product_description(soup):
     desc = soup.find("p", class_='').string
     return desc
 
-#create a list with all information consecutively
+# get category from breadcrumb
+def get_category(soup):
+    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
+    return bread
+
+# create a list with all information consecutively
 # /!\ don't know if that's the best way
 def get_data(soup, url):
     info = [url, product_information(soup)['UPC'],
@@ -45,13 +50,13 @@ def get_data(soup, url):
         product_information(soup)['Price (excl. tax)'],
         product_information(soup)['Availability'],
         product_description(soup),
-        "TODO",
+        get_category(soup),
         product_information(soup)['Number of reviews'],
         get_image_url(soup, url)
         ]
     return info
 
-#write the file
+# write the file
 def data_output(info, file):
     fieldnames = ['product_page_url',
         'universal_ product_code (upc)',

phase2/README.md Normal file (+17 lines)

@@ -0,0 +1,17 @@
# Phase 2
Now that you have obtained the information for a first book, you can try to
retrieve all the data needed for a whole category of books.
Pick any category on the Books to Scrape site. Write a Python script that
visits the page of the chosen category and extracts the product page URL of
every book belonging to that category.
Combine this with the work you already did in phase 1 in order to extract the
product data of all the books in the chosen category, then write the data to a
single CSV file.
Note: some category pages list more than 20 books, which are therefore spread
over several pages ("pagination"). Your application must be able to walk
through the additional pages automatically when they are present.
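
The committed phase2/main.py below derives the extra page URLs from the book count; another valid way to handle the pagination note above is to follow the category's "next" link until it disappears. A minimal sketch along those lines (the category URL is only an example, not part of the project code):

```python
# Minimal sketch (not the committed script): list every product page URL of one
# category, following "page-2.html", "page-3.html", ... while a next page exists.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def category_product_urls(category_index_url):
    urls = []
    page_url = category_index_url
    while True:
        soup = BeautifulSoup(requests.get(page_url).content, "html.parser")
        for article in soup.find_all("article", class_="product_pod"):
            # the href is relative to the current page, so resolve it against page_url
            urls.append(urljoin(page_url, article.h3.a["href"]))
        next_link = soup.find("li", class_="next")
        if next_link is None:
            break
        page_url = urljoin(page_url, next_link.a["href"])
    return urls

if __name__ == "__main__":
    # example category URL, given for illustration only
    print(len(category_product_urls(
        "https://books.toscrape.com/catalogue/category/books/mystery_3/index.html")))
```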

phase2/main.py Normal file (+90 lines)

@@ -0,0 +1,90 @@
import requests
from bs4 import BeautifulSoup
import phase1


# get soup from url
def get_html(url):
    r = requests.get(url)
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li"):
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_dict


def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]
    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1
    return url_list


# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)
    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)
    return liste


def main():
    # init
    url = "https://books.toscrape.com/"
    category = "default"

    # get functional variables
    soup = get_html(url)
    liste_categories = get_category_list(soup, url)

    # get category URL to do some tests on it
    category_url = liste_categories[category]
    print(category_url)

    # check if multiple page and get url list
    url_list = check_for_pages(category_url)
    # print("Liste des URLs des pages: ", url_list)

    # get product list for each url_list, extend the main product url list with
    product_url_list = []
    for i in url_list:
        product_url_list.extend(get_product_url_list(i, url))
    # print("Liste des URL des produits: ", product_url_list)
    print("Nombre de livres: ", len(product_url_list))

    # combine with phase 1 and write in csv for each url from product_url_list named with category
    data = []
    for page_url in product_url_list:
        page_soup = get_html(page_url)
        # print(page_soup)
        # print(phase1.get_category(page_soup))
        # print(phase1.get_data(page_soup, page_url))
        data.append(phase1.get_data(page_soup, page_url))
    print("Done.\n Fichier " + phase1.data_output(data, category))


if __name__ == '__main__':
    main()

phase2/phase1.py Normal file (+113 lines)

@@ -0,0 +1,113 @@
import requests
from bs4 import BeautifulSoup
import csv


def extract_web(url):
    r = requests.get(url)
    page = r.content
    return page


# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from string and cast it
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# get full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"
    return desc


# get category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread


# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url, product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]
    return info


# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']
    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)
    return file


def main():
    url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"
    html = extract_web(url)
    soup = BeautifulSoup(html, "html.parser")
    test = product_information(soup)
    print(test['Availability'])
    info = get_data(soup, url)
    print(info)
    # wrap the single product in a list so data_output() writes it as one CSV row,
    # and pass the name without extension since data_output() appends ".csv" itself
    data_output([info], 'output')


if __name__ == "__main__":
    main()

phase3/README.md Normal file (+14 lines)

@@ -0,0 +1,14 @@
# Phase 3
Next, extend your work by writing a script that visits the Books to Scrape
site, extracts all the available book categories, and then extracts the
product information of all the books belonging to each of those categories.
You will have to write the data to a separate CSV file for each book
category.
# Result
![ screenshot](screenshot.png)
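
The "one CSV per category" requirement boils down to the sketch below; `FIELDNAMES`, `write_category_csv` and the hypothetical `rows` argument are illustrative names, and the committed implementation is the `data_output()` function in phase3/main.py further down.

```python
# Illustrative sketch only: write one CSV file per category, named after it.
# "rows" is expected to be a list of field lists, one per book of the category.
import csv

FIELDNAMES = ['product_page_url', 'universal_ product_code (upc)', 'title',
              'price_including_tax', 'price_excluding_tax', 'number_available',
              'product_description', 'category', 'review_rating', 'image_url']

def write_category_csv(category_name, rows):
    file_name = category_name + ".csv"   # e.g. "mystery" -> "mystery.csv"
    with open(file_name, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(FIELDNAMES)      # header line
        writer.writerows(rows)           # one line per book
    return file_name
```

Unlike the committed `data_output()`, the sketch opens the file with `newline=''` and an explicit encoding, which the `csv` module documentation recommends to avoid spurious blank lines and encoding surprises in the descriptions.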

phase3/main.py Normal file (+188 lines)

@@ -0,0 +1,188 @@
import requests
from bs4 import BeautifulSoup
import csv


# get soup from url
def get_html(url):
    r = requests.get(url)
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    #extract the amount from string and case it
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# get full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"
    return desc


# get category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread


# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url, product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]
    return info


# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']
    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)
    return file


# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_info


def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]
    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1
    return url_list


# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)
    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)
    return liste


# collect category from all
# then grab all product for each and write a file with category name
def main():
    # init
    url = "https://books.toscrape.com/"

    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0
    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")

    # go ahead for each category
    for line in get_category_list(soup, url):
        category = line[0]
        category_url = line[1]
        total_category -= 1

        # display what category is processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if multiple pages and create a URL list
        url_list = check_for_pages(category_url)

        # get product list for each url_list, extend the main product url list with
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)

        # combine with phase 1 and write in csv for each url from product_url_list named with category
        data = []
        for page_url in product_url_list:
            page_soup = get_html(page_url)
            # print(page_soup)
            # print(phase1.get_category(page_soup))
            # print(phase1.get_data(page_soup, page_url))
            data.append(get_data(page_soup, page_url))
        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")
        print("Done.\n Fichier " + data_output(data, category))
    print("\n Traitement terminé.")


if __name__ == '__main__':
    main()

phase3/screenshot.png Normal file (binary, 24 KiB, not shown)

phase4/README.md Normal file (+8 lines)

@@ -0,0 +1,8 @@
# Phase 4
Finally, extend your existing work to download and save the image file of
each product page that you visit.
# Result
![ screenshot](screenshot.jpg)
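
The download step itself is small; a minimal sketch of just that step (the URL and file name in the example call follow the site's pattern but are illustrative, not taken from the committed code):

```python
# Minimal sketch of the image download step: fetch the binary content of the
# cover image and write it to disk unchanged.
import requests

def save_image(img_url, file_path):
    response = requests.get(img_url, timeout=10)
    response.raise_for_status()   # fail loudly on 4xx/5xx instead of saving an error page
    with open(file_path, "wb") as img_file:
        img_file.write(response.content)

# example call, following the site's media URL pattern:
# save_image("https://books.toscrape.com/media/cache/.../some-cover.jpg", "travel-1.jpg")
```

The committed phase4/main.py inlines the same idea inside the product loop and names the files `<category>-<n>.png`, even though the source images are JPEGs; the bytes are written unchanged either way.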

phase4/main.py Normal file (+204 lines)

@@ -0,0 +1,204 @@
import requests
from bs4 import BeautifulSoup
import csv


### EXTRACTION ###

# get soup from url
def get_html(url):
    r = requests.get(url, headers = {'User-agent': 'yann@needsome.coffee'})
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    #extract the amount from string and case it
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# extract full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"
    return desc


# extract category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread


# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)
    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)
    return liste


# check if a category has multiple pages and extract URLs
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]
    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1
    return url_list


# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_info


### TRANSFORMATION ###

# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url, product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]
    return info


### LOAD ###

# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']
    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)
    return file


# collect category from all
# then grab all product for each and write a file with category name
def main():
    # init
    url = "https://books.toscrape.com/"

    ### EXTRACTION ###
    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0
    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")

    # go ahead for each category
    for line in get_category_list(soup, url):
        category = line[0]
        category_url = line[1]
        total_category -= 1

        # display what category is processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if multiple pages and create a URL list
        url_list = check_for_pages(category_url)

        # get product list for each url_list, extend the main product url list with
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)

        ### TRANSFORMATION ###
        # PHASE 3 : combine with phase 1 and write in csv for each url from product_url_list named with category
        data = []
        img_nb = 1
        for page_url in product_url_list:
            page_soup = get_html(page_url)
            # print(page_soup)
            # print(phase1.get_category(page_soup))
            # print(phase1.get_data(page_soup, page_url))
            data.append(get_data(page_soup, page_url))

            ### LOAD ###
            # PHASE 4 : get img for every book and name it with category and incremental number
            img_url = get_image_url(page_soup, page_url)
            with open(category + "-" + str(img_nb) + ".png", "wb") as img_file:
                img_file.write(requests.get(img_url).content)
            img_nb += 1
        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")

        ### LOAD ###
        print("Done.\n Fichier " + data_output(data, category))
    print("\n Traitement terminé.")


if __name__ == '__main__':
    main()

phase4/screenshot.jpg Normal file (binary, 31 KiB, not shown)

rendu/README.md Normal file (+87 lines)

@@ -0,0 +1,87 @@
# Books Online
Price tracking of the books at "Books To Scrape"

## Introduction
These instructions let you:
- retrieve the program,
- install the environment it needs to run,
- run it,
- know what it produces

### Prerequisites
```
packages : python 3.11, python3.11-venv, git
modules : python requests, BeautifulSoup, csv, os
```

### Installation
Steps to follow to get a working execution environment:

create the virtual environment
```
python3.11 -m venv env
source env/bin/activate
```
clone the repository and move into the right folder
```
git clone https://mcstn.fr/gitea/Yann/Projet2.git
cd Projet2/rendu
```
install the modules
```
pip install -r requirements.txt
```

## Execution
run the command:
```
python3 main.py
```

## Result
The output files are placed in a "resultat" directory.
The program retrieves the categories from the home page of the URL, then, for each category:
1. prints the category being processed, the number of categories remaining, and the number of books found, processed so far and still to go
2. creates a directory named after the category and saves the book cover images in it, named after their titles (see the example layout after this list)
3. creates a CSV file named after the category, with:
- product_page_url
- universal_ product_code (upc)
- title
- price_including_tax
- price_excluding_tax
- number_available
- product_description
- category
- review_rating
- image_url
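
For instance, with illustrative category and title names (the real names depend on what is scraped), a run produces a layout of this shape:

```
resultat/
├── travel/
│   ├── Some Book Title.png
│   └── ...
├── travel.csv
├── mystery/
│   └── ...
└── mystery.csv
```
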
```
$ time python3.11 main.py
1000 à traiter répartis en 50 catégories.
[ ... ]
Traitement terminé.
real 20m17,783s
user 4m30,695s
sys 0m3,172s
```
## Author
Yann <yann@needsome.coffee>
## License
N/A

rendu/main.py Normal file (+226 lines)

@@ -0,0 +1,226 @@
import requests
from bs4 import BeautifulSoup
import csv
import os


### EXTRACTION ###

# get soup from url
def get_html(url):
    r = requests.get(url, headers = {'User-agent': 'yann@needsome.coffee'})
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    #extract the amount from string and case it
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# extract full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"
    return desc


# extract category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread


# get product list url from a given url category page;
# extract and build each product url using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)
    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)
    return liste


# check if a category has multiple pages and extract URLs
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]
    if total > 20:
        new_url_base = category_url.replace('index.html','')
        j = 2
        for i in range(total//20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1
    return url_list


# get category and URL from side div and put them as a list [catego, url] in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
    return catego_info


### TRANSFORMATION ###

# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    info = [
        url,
        product_information(soup)['UPC'],
        get_title(soup),
        product_information(soup)['Price (incl. tax)'],
        product_information(soup)['Price (excl. tax)'],
        product_information(soup)['Availability'],
        product_description(soup),
        get_category(soup),
        product_information(soup)['Number of reviews'],
        get_image_url(soup, url)
        ]
    return info


### LOAD ###

# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']
    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)
    return file


# collect category from all
# then grab all product for each and write a file with category name
def main():
    # init
    url = "https://books.toscrape.com/"
    os.mkdir("resultat")

    ### EXTRACTION ###
    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0
    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")

    # go ahead for each category
    for line in get_category_list(soup, url):
        # remove space in category name, to prevent potential issue on directory creation
        category = line[0].replace(' ', '_')
        category_url = line[1]
        category_path = "resultat/" + category
        total_category -= 1

        # display what category is processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if multiple pages and create a URL list
        url_list = check_for_pages(category_url)

        # get product list for each url_list, extend the main product url list with
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)

        # PHASE 3 : combine with phase 1 and write in csv for each url from product_url_list named with category
        data = []
        img_nb = 1

        # go ahead for each product of category
        # EXTRACT data for each product page
        for page_url in product_url_list:
            # create the category directory. If it exists already, just continue
            try:
                os.mkdir(category_path)
            except FileExistsError:
                pass

            # EXTRACT data : html from product page, and product data from the page
            page_soup = get_html(page_url)
            # EXTRACTION + TRANSFORMATION
            product_data = get_data(page_soup, page_url)
            # LOAD data in a list
            data.append(product_data)

            # protect path creation by removing "/" in product name
            img_name = (product_data[2] + ".png").replace('/', '_')

            # PHASE 4 : get img for every book and name it with category and incremental number
            # EXTRACT images data -url, title, binary content- and LOAD binary content in a file named with the title
            img_url = get_image_url(page_soup, page_url)
            with open(category_path + "/" + img_name, "wb") as img_file:
                img_file.write(requests.get(img_url).content)
            img_nb += 1
        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")

        # LOAD : write the list in the CSV file
        print("Done.\n Fichier " + data_output(data, category_path))
    print("\n Traitement terminé.")


if __name__ == '__main__':
    main()

rendu/requirements.txt Normal file (+2 lines)

@@ -0,0 +1,2 @@
beautifulsoup4==4.12.3
requests==2.32.3