diff --git a/phase4/main.py b/phase4/main.py
index cbcc880..4a6b81e 100644
--- a/phase4/main.py
+++ b/phase4/main.py
@@ -2,6 +2,8 @@ import requests
 from bs4 import BeautifulSoup
 import csv
 
+### EXTRACTION ###
+
 # get soup from url
 def get_html(url):
     r = requests.get(url, headers = {'User-agent': 'yann@needsome.coffee'})
@@ -25,14 +27,7 @@
     product_info['Availability'] = availability
     return product_info
 
-# get relative link from page and build the full URL
-def get_image_url(soup, url):
-    link = soup.img.get('src')
-    url_site = "https://" + url.split('/')[2]
-    img_url = url_site + "/" + link.replace('../', '')
-    return img_url
-
-# get full description as string
+# extract full description as string
 # luckily this was the only one without class
 # and manage the case where there's no description
 def product_description(soup):
@@ -43,11 +38,58 @@
     return desc
 
 
-# get category from breadcrumb
+# extract category from breadcrumb
 def get_category(soup):
     bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
     return bread
 
+# get product list url from a given url category page;
+# extract and build each product url using the main url (second arg)
+def get_product_url_list(url_category_page, url):
+    liste = []
+    soup = get_html(url_category_page)
+
+    for i in soup.find_all("article"):
+        relative_url = i.h3.a.get('href')
+        product_url = url + "catalogue/" + relative_url.split('../')[-1]
+        liste.append(product_url)
+
+    return liste
+
+# check if a category has multiple pages and extract URLs
+def check_for_pages(category_url):
+    soup_catego = get_html(category_url)
+    total = int(soup_catego.form.strong.text)
+    url_list = [category_url]
+
+    if total > 20:
+        new_url_base = category_url.replace('index.html','')
+        j = 2
+        for i in range(total//20):
+            page = "page-" + str(j) + ".html"
+            url_list.append(new_url_base + page)
+            j += 1
+
+    return url_list
+
+# get category and URL from side div and put them as a list [catego, url] in a list
+def get_category_list(soup, url):
+    catego_info = []
+    catego_dict = {}
+    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
+        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
+        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
+    return catego_info
+
+### TRANSFORMATION ###
+
+# get relative link from page and build the full URL
+def get_image_url(soup, url):
+    link = soup.img.get('src')
+    url_site = "https://" + url.split('/')[2]
+    img_url = url_site + "/" + link.replace('../', '')
+    return img_url
+
 # create a list with all information consecutively
 # /!\ don't know if that's the best way
 def get_data(soup, url):
@@ -62,10 +104,10 @@ def get_data(soup, url):
         product_information(soup)['Number of reviews'],
         get_image_url(soup, url)
     ]
-
-
     return info
 
+### LOAD ###
+
 # write the file
 def data_output(info, file):
     file = file + ".csv"
@@ -90,45 +132,6 @@ def data_output(info, file):
 
 
 
 
-# get category and URL from side div and put them as a list [catego, url] in a list
-def get_category_list(soup, url):
-    catego_info = []
-    catego_dict = {}
-    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
-        catego_info.append([li.a.get_text(strip = True).lower(), url + li.a.get('href')])
-        catego_dict[li.a.get_text(strip = True).lower()] = url + li.a.get('href')
-    return catego_info
-
-
-def check_for_pages(category_url):
-    soup_catego = get_html(category_url)
-    total = int(soup_catego.form.strong.text)
-    url_list = [category_url]
-
-    if total > 20:
-        new_url_base = category_url.replace('index.html','')
-        j = 2
-        for i in range(total//20):
-            page = "page-" + str(j) + ".html"
-            url_list.append(new_url_base + page)
-            j += 1
-
-    return url_list
-
-
-# get product list url from a given url category page;
-# extract and build each product url using the main url (second arg)
-def get_product_url_list(url_category_page, url):
-    liste = []
-    soup = get_html(url_category_page)
-
-    for i in soup.find_all("article"):
-        relative_url = i.h3.a.get('href')
-        product_url = url + "catalogue/" + relative_url.split('../')[-1]
-        liste.append(product_url)
-
-    return liste
-
 # collect category from all
 # then grab all product for each and write a file with category name
@@ -136,6 +139,7 @@ def main():
     # init
     url = "https://books.toscrape.com/"
 
+    ### EXTRACTION ###
     # get html from URL
     soup = get_html(url)
 
@@ -145,6 +149,7 @@ def main():
    processed_books = 0
     print(total_books, " à traiter répartis en ", total_category, " catégories.\nGo.")
 
+    # go ahead for each category
     for line in get_category_list(soup, url):
         category = line[0]
 
@@ -165,6 +170,7 @@ def main():
         print(len(product_url_list), " livres présents")
         processed_books += len(product_url_list)
 
+        ### TRANSFORMATION ###
         # PHASE 3 : combine with phase 1 and write in csv for each url from product_url_list named with category
         data = []
         img_nb = 1
@@ -175,6 +181,7 @@ def main():
             # print(phase1.get_data(page_soup, page_url))
             data.append(get_data(page_soup, page_url))
 
+            ### LOAD ###
             # PHASE 4 : get img for every book and name it with category and incremental number
             img_url = get_image_url(page_soup, page_url)
             with open(category + "-" + str(img_nb) + ".png", "wb") as img_file:
@@ -184,6 +191,8 @@ def main():
         print(processed_books, " livres traités")
         print(total_books - processed_books, " livres restants")
         print(total_category, " catégories restantes")
 
+
+        ### LOAD ###
         print("Done.\n Fichier " + data_output(data, category))
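
Note: the ETL flow that the relocated helpers and the new ### EXTRACTION ### / ### TRANSFORMATION ### / ### LOAD ### markers describe can be exercised for a single category roughly as below. This is a minimal sketch, not part of the patch: the "from main import ..." module path is an assumption, while the function names and signatures are taken from the diff above.

# minimal usage sketch (assumed import path; helpers as defined in the diff)
from main import (get_html, get_category_list, check_for_pages,
                  get_product_url_list, get_data, data_output)

url = "https://books.toscrape.com/"
soup = get_html(url)

# EXTRACTION: take the first [category, category_url] pair from the sidebar
category, category_url = get_category_list(soup, url)[0]

data = []
for page_url in check_for_pages(category_url):        # one URL per listing page
    for product_url in get_product_url_list(page_url, url):
        # TRANSFORMATION: one row of product fields per book
        data.append(get_data(get_html(product_url), product_url))

# LOAD: write "<category>.csv" and print its name
print(data_output(data, category))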