diff --git a/rendu/README.md b/rendu/README.md
new file mode 100644
index 0000000..cb746c0
--- /dev/null
+++ b/rendu/README.md
@@ -0,0 +1,68 @@
+# Books Online
+
+Price tracking for the books listed on "Books To Scrape".
+
+## Introduction
+
+These instructions let you:
+- retrieve the program,
+- install the environment it needs to run,
+- run it,
+- understand its output.
+
+### Prerequisites
+
+```
+packages: python 3.11, python3.11-venv, git
+modules: requests, beautifulsoup4 (csv and os are part of the standard library)
+```
+
+### Installation
+
+Follow these steps to get a working execution environment:
+
+```
+# create and activate a virtual environment
+$ python3.11 -m venv env
+$ source env/bin/activate
+
+# clone the repository and move into the right directory
+$ git clone https://mcstn.fr/gitea/Yann/Projet2.git
+$ cd Projet2/rendu
+
+# install the required modules
+$ pip install -r requirements.txt
+```
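+
+## Usage
+
+A minimal run, assuming the installation steps above are done and the virtual environment is active:
+
+```
+$ python main.py
+```
+
+The script prints its progress for each category while it scrapes the site.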
+
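+## Output
+
+One `<category>.csv` file is written per category, with the columns below (as produced by `data_output()` in `main.py`); every book cover is also saved inside a `<category>/` directory:
+
+```
+product_page_url, universal_product_code (upc), title, price_including_tax,
+price_excluding_tax, number_available, product_description, category,
+review_rating, image_url
+```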
+
+## Running the tests
+
+N/A
+
+## Deployment
+
+N/A
+
+## Technologies
+
+* Python 3.11
+* [requests](https://pypi.org/project/requests/)
+* [beautifulsoup4](https://pypi.org/project/beautifulsoup4/)
+
+## Contributing
+
+Please read the following files:
+* [CONTRIBUTING.md](https://github.com/OpenClassrooms-Student-Center/7688581-Expert-Git-GitHub/blob/main/CONTRIBUTING.md)
+* [CODE_OF_CONDUCT.md](https://github.com/OpenClassrooms-Student-Center/7688581-Expert-Git-GitHub/blob/main/CODE_OF_CONDUCT.md)
+
+## Authors
+
+* Yann ALEXANDRE
+
+## License
+
+N/A
\ No newline at end of file
diff --git a/rendu/main.py b/rendu/main.py
new file mode 100644
index 0000000..c244f94
--- /dev/null
+++ b/rendu/main.py
@@ -0,0 +1,222 @@
+import requests
+from bs4 import BeautifulSoup
+import csv
+import os
+
+### EXTRACTION ###
+
+# get a BeautifulSoup object from a URL
+def get_html(url):
+    r = requests.get(url, headers={'User-agent': 'yann@needsome.coffee'})
+    html = r.content
+    soup = BeautifulSoup(html, 'html.parser')
+    return soup
+
+# extract the product title from the page
+def get_title(soup):
+    title = soup.find("div", class_="product_main").h1.string
+    return title
+
+# extract the product information table into a dict
+# and turn the availability string into an integer quantity
+def product_information(soup):
+    product_info = {}
+    for tr in soup.table.find_all("tr"):
+        product_info[tr.th.string] = tr.td.string
+    # extract the amount from the string and cast it
+    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
+    product_info['Availability'] = availability
+    return product_info
+
+# extract the full description as a string
+# (luckily this <p> was the only one without a class)
+# and handle the case where there is no description
+def product_description(soup):
+    try:
+        desc = soup.find("p", class_='').string
+    except AttributeError:
+        desc = "None"
+
+    return desc
+
+# extract the category from the breadcrumb
+def get_category(soup):
+    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
+    return bread
+
+# get the product URLs listed on a given category page;
+# extract and build each product URL using the site URL (second arg)
+def get_product_url_list(url_category_page, url):
+    liste = []
+    soup = get_html(url_category_page)
+
+    for i in soup.find_all("article"):
+        relative_url = i.h3.a.get('href')
+        product_url = url + "catalogue/" + relative_url.split('../')[-1]
+        liste.append(product_url)
+
+    return liste
+
+# check whether a category spans multiple pages and build the list of page URLs
+def check_for_pages(category_url):
+    soup_catego = get_html(category_url)
+    total = int(soup_catego.form.strong.text)
+    url_list = [category_url]
+
+    # 20 books per page: add "page-2.html", "page-3.html", ... as needed
+    if total > 20:
+        new_url_base = category_url.replace('index.html', '')
+        for page_number in range(2, (total - 1) // 20 + 2):
+            url_list.append(new_url_base + "page-" + str(page_number) + ".html")
+
+    return url_list
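+
+# quick illustration of check_for_pages (hypothetical figures, not taken from the site):
+#   a category index.html reporting 32 books yields
+#   ['.../some-category/index.html', '.../some-category/page-2.html'],
+#   while 20 books or fewer leave only the original index.html in the list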
+
+# get each category name and URL from the sidebar and return them as [name, url] pairs
+def get_category_list(soup, url):
+    catego_info = []
+    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
+        catego_info.append([li.a.get_text(strip=True).lower(), url + li.a.get('href')])
+    return catego_info
+
+### TRANSFORMATION ###
+
+# get the relative image link from the page and build the full URL
+def get_image_url(soup, url):
+    link = soup.img.get('src')
+    url_site = "https://" + url.split('/')[2]
+    img_url = url_site + "/" + link.replace('../', '')
+    return img_url
+
+# gather every field of one product page, in the order expected by data_output()
+# (the review_rating column receives the review count taken from the product table)
+def get_data(soup, url):
+    product_info = product_information(soup)
+    info = [
+        url,
+        product_info['UPC'],
+        get_title(soup),
+        product_info['Price (incl. tax)'],
+        product_info['Price (excl. tax)'],
+        product_info['Availability'],
+        product_description(soup),
+        get_category(soup),
+        product_info['Number of reviews'],
+        get_image_url(soup, url)
+    ]
+    return info
+
+### LOAD ###
+
+# write the CSV file for one category
+def data_output(info, file):
+    file = file + ".csv"
+    fieldnames = ['product_page_url',
+                  'universal_product_code (upc)',
+                  'title',
+                  'price_including_tax',
+                  'price_excluding_tax',
+                  'number_available',
+                  'product_description',
+                  'category',
+                  'review_rating',
+                  'image_url']
+
+    # newline='' keeps the csv module from writing blank lines on Windows
+    with open(file, 'w', newline='') as csv_file:
+        writer = csv.writer(csv_file, delimiter=',')
+        writer.writerow(fieldnames)
+        for i in info:
+            writer.writerow(i)
+
+    return file
+
+
+# collect every category from the home page,
+# then grab all the products of each one and write a file named after the category
+def main():
+    # init
+    url = "https://books.toscrape.com/"
+
+    ### EXTRACTION ###
+    # get html from the URL
+    soup = get_html(url)
+
+    # init counters
+    total_category = len(get_category_list(soup, url))
+    total_books = int(soup.form.strong.text)
+    processed_books = 0
+
+    print(total_books, " books to process across ", total_category, " categories.\nGo.")
+
+    # process each category
+    for line in get_category_list(soup, url):
+        category = line[0]
+        category_url = line[1]
+
+        total_category -= 1
+        # display which category is being processed
+        print("\n -> Processing category: " + category)
+
+        # check for multiple pages and build the page URL list
+        url_list = check_for_pages(category_url)
+
+        # get the product list of every page and extend the main product URL list with it
+        product_url_list = []
+        for i in url_list:
+            product_url_list.extend(get_product_url_list(i, url))
+        print(len(product_url_list), " books found")
+        processed_books += len(product_url_list)
+
+        # PHASE 3: combine with phase 1 and write one CSV, named after the category,
+        # for all the URLs in product_url_list
+        data = []
+
+        # create the category directory; if it already exists, just continue
+        try:
+            os.mkdir(category)
+        except FileExistsError:
+            pass
+
+        # EXTRACT data for each product page
+        for page_url in product_url_list:
+
+            # EXTRACT data: html from the product page, then the product data from it
+            page_soup = get_html(page_url)
+
+            # EXTRACTION + TRANSFORMATION
+            product_data = get_data(page_soup, page_url)
+
+            # LOAD the data into a list
+            data.append(product_data)
+
+            # PHASE 4: fetch the cover image of every book, named with the category
+            # directory and the book title
+            # EXTRACT the image URL and LOAD its binary content into a file
+            img_url = get_image_url(page_soup, page_url)
+            # strip path separators from the title so it stays a valid file name
+            img_name = product_data[2].replace('/', '-')
+            print(category, "/", img_name, ".png")
+            with open(category + "/" + img_name + ".png", "wb") as img_file:
+                img_file.write(requests.get(img_url).content)
+
+        print(processed_books, " books processed")
+        print(total_books - processed_books, " books remaining")
+        print(total_category, " categories remaining")
+
+        # LOAD: write the list into the CSV file
+        print("Done.\n File " + data_output(data, category))
+
+    print("\n Processing complete.")
+
+if __name__ == '__main__':
+    main()
diff --git a/rendu/requirements.txt b/rendu/requirements.txt
new file mode 100644
index 0000000..bf4dcfc
--- /dev/null
+++ b/rendu/requirements.txt
@@ -0,0 +1,2 @@
+beautifulsoup4==4.12.3
+requests==2.32.3