import csv

import requests
from bs4 import BeautifulSoup


# get soup from url
def get_html(url):
    r = requests.get(url, headers={'User-agent': 'yann@needsome.coffee'})
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# extract the product title from the page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product information from the table into a dict
# and extract the quantity from the availability string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# get the relative image link from the page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url
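# For reference, a sketch of what get_image_url produces; the values below are
# assumed from the usual books.toscrape.com page layout, not taken from a real run:
#   url    -> "https://books.toscrape.com/catalogue/some-book_1/index.html"
#   src    -> "../../media/cache/ab/cd/abcd1234.jpg"
#   result -> "https://books.toscrape.com/media/cache/ab/cd/abcd1234.jpg"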

# get the full description as a string
# luckily this was the only <p> without a class
# and handle the case where there is no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"
    return desc


# get the category from the breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread


# gather all information for one product, in CSV column order
def get_data(soup, url):
    product_info = product_information(soup)  # parse the table once
    info = [
        url,
        product_info['UPC'],
        get_title(soup),
        product_info['Price (incl. tax)'],
        product_info['Price (excl. tax)'],
        product_info['Availability'],
        product_description(soup),
        get_category(soup),
        product_info['Number of reviews'],  # written to the 'review_rating' column
        get_image_url(soup, url)
    ]
    return info


# write the csv file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url', 'universal_product_code (upc)', 'title',
                  'price_including_tax', 'price_excluding_tax', 'number_available',
                  'product_description', 'category', 'review_rating', 'image_url']
    with open(file, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)
    return file


# get category names and URLs from the side div
# and return them as a list of [category, url] pairs
def get_category_list(soup, url):
    catego_info = []
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip=True).lower(), url + li.a.get('href')])
    return catego_info


# build the list of page URLs for a category (the site lists 20 books per page)
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]
    if total > 20:
        new_url_base = category_url.replace('index.html', '')
        # there are ceil(total / 20) pages, i.e. (total - 1) // 20 extra pages
        for j in range(2, (total - 1) // 20 + 2):
            url_list.append(new_url_base + "page-" + str(j) + ".html")
    return url_list


# get the product URL list from a given category page;
# extract and build each product URL using the main URL (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)
    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)
    return liste


# collect every category from the home page,
# then grab all products for each one and write a csv file named after the category
def main():
    # init
    url = "https://books.toscrape.com/"
    # get html from the main URL
    soup = get_html(url)
    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0
    print(total_books, "books to process across", total_category, "categories.\nGo.")
    # process each category
    for line in get_category_list(soup, url):
        category = line[0]
        category_url = line[1]
        total_category -= 1
        # display which category is being processed
        print("\n -> Processing category: " + category)
        # check whether the category spans multiple pages and build the URL list
        url_list = check_for_pages(category_url)
        # get the product list for each page and extend the main product URL list
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        print(len(product_url_list), "books in this category")
        processed_books += len(product_url_list)
        # PHASE 3: scrape each product URL and write one csv file per category
        data = []
        img_nb = 1
        for page_url in product_url_list:
            page_soup = get_html(page_url)
            data.append(get_data(page_soup, page_url))
            # PHASE 4: download the cover image of every book and
            # name it with the category and an incremental number
            img_url = get_image_url(page_soup, page_url)
            with open(category + "-" + str(img_nb) + ".png", "wb") as img_file:
                img_file.write(requests.get(img_url).content)
            img_nb += 1
        print(processed_books, "books processed")
        print(total_books - processed_books, "books remaining")
        print(total_category, "categories remaining")
        print("Done.\n File " + data_output(data, category))
    print("\n Processing finished.")


if __name__ == '__main__':
    main()
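# A minimal sketch of how the helpers above can be reused on a single product
# page, e.g. for quick manual testing; the URL is only an assumed example of a
# books.toscrape.com product page:
#
#   page_url = "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"
#   page_soup = get_html(page_url)
#   print(get_title(page_soup))
#   print(product_information(page_soup))
#   print(get_data(page_soup, page_url))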