add final folder, create category dir and image name from title, full README
This commit is contained in:
parent aa0d3a7819
commit 6fa035fc1a
rendu/README.md (new file, 68 lines)
@@ -0,0 +1,68 @@
# Books Online

Price tracking for the books listed on "Books To Scrape"

## Introduction

These instructions show you how to:

- get the program,
- set up the environment it needs to run,
- run it,
- understand what it produces

### Prerequisites

```
packages: python 3.11, python3.11-venv, git
modules: python requests, BeautifulSoup, csv, os
```
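
The pinned versions of the third-party modules ship in `requirements.txt`, added in this same commit:

```
beautifulsoup4==4.12.3
requests==2.32.3
```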

### Installation

Follow these steps to get a working execution environment:

```
- create a virtual environment
$ python3.11 -m venv env
$ source env/bin/activate

- clone the repository and move into the project directory
$ git clone https://mcstn.fr/gitea/Yann/Projet2.git
$ cd Projet2/rendu

- install the modules
$ pip install -r requirements.txt
```
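
Once the environment is ready, the scraper can be launched from the `rendu` directory (a minimal usage sketch; `main.py`, added in this commit, is the entry point):

```
$ python main.py
```

For every category of the site it writes a `<category>.csv` file with the product data and downloads each book cover as a PNG into a directory named after the category.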

## Running the tests

N/A

## Deployment

N/A

## Technologies

* HTML5/CSS3
* [Bootstrap](https://getbootstrap.com/)

## Contribution

Please read the following files:

* [CONTRIBUTING.md](https://github.com/OpenClassrooms-Student-Center/7688581-Expert-Git-GitHub/blob/main/CONTRIBUTING.md)
* [CODE_OF_CONDUCT.md](https://github.com/OpenClassrooms-Student-Center/7688581-Expert-Git-GitHub/blob/main/CONTRIBUTING.md)

## Authors

* Yann ALEXANDRE <yann@needsome.coffee>

## License

N/A
rendu/main.py (new file, 222 lines)
@@ -0,0 +1,222 @@
import requests
from bs4 import BeautifulSoup
import csv
import os


### EXTRACTION ###

# get soup from url
def get_html(url):
    r = requests.get(url, headers={'User-agent': 'yann@needsome.coffee'})
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# extract the product title from the page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product information from the table and put it in a dict,
# then extract the quantity from the availability string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# extract the full description as a string
# luckily this <p> is the only one without a class,
# and handle the case where there is no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"

    return desc


# extract the category from the breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread

# get the product URL list from a given category page URL;
# extract and build each product URL using the main URL (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)

    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)

    return liste


# check if a category has multiple pages and build the list of page URLs
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]

    if total > 20:
        # 20 books per page, so (total - 1) // 20 extra pages after index.html
        new_url_base = category_url.replace('index.html', '')
        j = 2
        for i in range((total - 1) // 20):
            page = "page-" + str(j) + ".html"
            url_list.append(new_url_base + page)
            j += 1

    return url_list

# get each category name and URL from the side div and store them as [catego, url] pairs in a list
def get_category_list(soup, url):
    catego_info = []
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip=True).lower(), url + li.a.get('href')])
        catego_dict[li.a.get_text(strip=True).lower()] = url + li.a.get('href')
    return catego_info


### TRANSFORMATION ###

# get the relative link from the page and build the full image URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# create a list with all the information in order
# /!\ don't know if that's the best way
def get_data(soup, url):
    product_info = product_information(soup)
    info = [
        url,
        product_info['UPC'],
        get_title(soup),
        product_info['Price (incl. tax)'],
        product_info['Price (excl. tax)'],
        product_info['Availability'],
        product_description(soup),
        get_category(soup),
        product_info['Number of reviews'],
        get_image_url(soup, url)
    ]
    return info

### LOAD ###

# write the file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    with open(file, 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)

    return file

# collect the categories from the home page,
# then grab every product for each one and write a CSV file named after the category

def main():
    # init
    url = "https://books.toscrape.com/"

    ### EXTRACTION ###
    # get html from URL
    soup = get_html(url)

    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0

    print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")

    # go ahead for each category
    for line in get_category_list(soup, url):
        category = line[0]
        category_url = line[1]

        total_category -= 1
        # display which category is being processed
        print("\n -> Traitement de la catégorie : " + category)

        # check if there are multiple pages and build the URL list
        url_list = check_for_pages(category_url)

        # get the product list for each URL in url_list and extend the main product URL list with it
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Liste des URL des produits: ", product_url_list)
        print(len(product_url_list), " livres présents")
        processed_books += len(product_url_list)

        # PHASE 3 : combine with phase 1 and, for each url from product_url_list, write a CSV named after the category
        data = []
        img_nb = 1

        # EXTRACT data for each product page
        for page_url in product_url_list:

            # create the category directory; if it already exists, just continue
            try:
                os.mkdir(category)
            except FileExistsError:
                pass

            # EXTRACT data : html from the product page, and product data from the page
            page_soup = get_html(page_url)

            # EXTRACTION + TRANSFORMATION
            product_data = get_data(page_soup, page_url)
            # print(page_soup)
            # print(phase1.get_category(page_soup))
            # print(phase1.get_data(page_soup, page_url))

            # LOAD data in a list
            data.append(product_data)

            # PHASE 4 : get the cover image of every book and save it in the category directory
            # EXTRACT image data -url, title, binary content- and LOAD the binary content into a file named with the title
            img_url = get_image_url(page_soup, page_url)
            print(category, "/", product_data[2], ".png")
            with open(category + "/" + product_data[2] + ".png", "wb") as img_file:
                img_file.write(requests.get(img_url).content)
            img_nb += 1

        print(processed_books, " livres traités")
        print(total_books - processed_books, " livres restants")
        print(total_category, " catégories restantes")

        # LOAD : write the list in the CSV file
        print("Done.\n Fichier " + data_output(data, category))

    print("\n Traitement terminé.")


if __name__ == '__main__':
    main()
rendu/requirements.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
beautifulsoup4==4.12.3
requests==2.32.3