diff --git a/phase4/README.md b/phase4/README.md
new file mode 100644
index 0000000..c9ce19a
--- /dev/null
+++ b/phase4/README.md
@@ -0,0 +1,2 @@
+# Phase 4
+
diff --git a/phase4/main.py b/phase4/main.py
new file mode 100644
index 0000000..892cb1b
--- /dev/null
+++ b/phase4/main.py
@@ -0,0 +1,188 @@
+import requests
+from bs4 import BeautifulSoup
+import csv
+
+# get soup from url
+def get_html(url):
+    r = requests.get(url)
+    html = r.content
+    soup = BeautifulSoup(html, 'html.parser')
+    return soup
+
+# extract the product title from the page
+def get_title(soup):
+    title = soup.find("div", class_="product_main").h1.string
+    return title
+
+# extract the product information from the table and put it in a dict,
+# and extract the quantity from the availability string
+def product_information(soup):
+    product_info = {}
+    for tr in soup.table.find_all("tr"):
+        product_info[tr.th.string] = tr.td.string
+    # extract the amount from the string and cast it to int
+    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
+    product_info['Availability'] = availability
+    return product_info
+
+# get the relative link from the page and build the full URL
+def get_image_url(soup, url):
+    link = soup.img.get('src')
+    url_site = "https://" + url.split('/')[2]
+    img_url = url_site + "/" + link.replace('../', '')
+    return img_url
+
+# get the full description as a string
+# luckily this <p> was the only one without a class;
+# also handle the case where there is no description
+def product_description(soup):
+    try:
+        desc = soup.find("p", class_='').string
+    except AttributeError:
+        desc = "None"
+
+    return desc
+
+# get the category from the breadcrumb
+def get_category(soup):
+    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
+    return bread
+
+# create a list with all the information consecutively
+# /!\ don't know if that's the best way
+def get_data(soup, url):
+    # parse the information table once and reuse it
+    product_info = product_information(soup)
+    info = [
+        url,
+        product_info['UPC'],
+        get_title(soup),
+        product_info['Price (incl. tax)'],
+        product_info['Price (excl. tax)'],
+        product_info['Availability'],
+        product_description(soup),
+        get_category(soup),
+        product_info['Number of reviews'],
+        get_image_url(soup, url)
+    ]
+
+    return info
+
+# write the csv file and return its name
+def data_output(info, file):
+    file = file + ".csv"
+    fieldnames = ['product_page_url',
+                  'universal_ product_code (upc)',
+                  'title',
+                  'price_including_tax',
+                  'price_excluding_tax',
+                  'number_available',
+                  'product_description',
+                  'category',
+                  'review_rating',
+                  'image_url']
+
+    # newline='' avoids blank rows on Windows; utf-8 keeps accented titles intact
+    with open(file, 'w', newline='', encoding='utf-8') as csv_file:
+        writer = csv.writer(csv_file, delimiter=',')
+        writer.writerow(fieldnames)
+        for i in info:
+            writer.writerow(i)
+
+    return file
+
+# get each category name and URL from the side div
+# and return them as a list of [category, url] pairs
+def get_category_list(soup, url):
+    catego_info = []
+    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
+        catego_info.append([li.a.get_text(strip=True).lower(), url + li.a.get('href')])
+    return catego_info
+
+# check whether a category spans several pages and return the list of page URLs
+def check_for_pages(category_url):
+    soup_catego = get_html(category_url)
+    total = int(soup_catego.form.strong.text)
+    url_list = [category_url]
+
+    if total > 20:
+        new_url_base = category_url.replace('index.html', '')
+        j = 2
+        # 20 books per page; (total - 1) // 20 extra pages avoids requesting
+        # one page too many when the count is an exact multiple of 20
+        for i in range((total - 1) // 20):
+            page = "page-" + str(j) + ".html"
+            url_list.append(new_url_base + page)
+            j += 1
+
+    return url_list
+
+# get the product URLs from a given category page;
+# extract and build each product URL using the main url (second arg)
+def get_product_url_list(url_category_page, url):
+    liste = []
+    soup = get_html(url_category_page)
+
+    for i in soup.find_all("article"):
+        relative_url = i.h3.a.get('href')
+        product_url = url + "catalogue/" + relative_url.split('../')[-1]
+        liste.append(product_url)
+
+    return liste
+
+# collect every category from the home page,
+# then grab all the products of each one and write a file named after the category
+def main():
+    # init
+    url = "https://books.toscrape.com/"
+
+    # get html from URL
+    soup = get_html(url)
+
+    # init counters
+    total_category = len(get_category_list(soup, url))
+    total_books = int(soup.form.strong.text)
+    processed_books = 0
+
+    print(total_books, "books to process, spread across", total_category, "categories.\nGo.")
+    # go ahead for each category
+    for line in get_category_list(soup, url):
+        category = line[0]
+        category_url = line[1]
+
+        total_category -= 1
+        # display which category is being processed
+        print("\n -> Processing category: " + category)
+
+        # check for multiple pages and build the list of page URLs
+        url_list = check_for_pages(category_url)
+
+        # get the product URLs of every page and extend the main product URL list
+        product_url_list = []
+        for i in url_list:
+            product_url_list.extend(get_product_url_list(i, url))
+        # print("Product URL list: ", product_url_list)
+        print(len(product_url_list), "books found")
+        processed_books += len(product_url_list)
+
+        # combine with phase 1: scrape each url from product_url_list
+        # and write a csv named after the category
+        data = []
+        for page_url in product_url_list:
+            page_soup = get_html(page_url)
+            # print(page_soup)
+            # print(phase1.get_category(page_soup))
+            # print(phase1.get_data(page_soup, page_url))
+            data.append(get_data(page_soup, page_url))
+
+        print(processed_books, "books processed")
+        print(total_books - processed_books, "books remaining")
+        print(total_category, "categories remaining")
+        print("Done.\n File " + data_output(data, category))
+
+    print("\nProcessing finished.")
+
+if __name__ == '__main__':
+    main()
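
For a quick smoke test of the helpers above, one option is the minimal sketch below. It assumes the file is saved as phase4/main.py and that the interpreter is started from that directory; the sample URL is simply the first book listed on books.toscrape.com and is only an illustration.

    from main import get_html, get_data

    # hypothetical sample product page, used only to exercise get_data()
    product_url = "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"
    print(get_data(get_html(product_url), product_url))

Running `python main.py` instead performs the full scrape and writes one CSV per category into the working directory.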