import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def extract_web(url, timeout=10):
    """Download *url* and return the raw response body as bytes.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without one the call can block indefinitely on a stalled server.

    Returns:
        The undecoded response content (``bytes``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection problems and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here rather than later while parsing an error page as a product.
    response.raise_for_status()
    return response.content


# extract the product title from page
def get_title(soup):
    """Return the string of the ``<h1>`` inside the ``div.product_main`` element.

    Raises:
        AttributeError: If the page has no such ``<div>`` or it has no ``<h1>``.
    """
    return soup.find("div", class_="product_main").h1.string


# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    """Return the rows of the page's first ``<table>`` as a dict.

    Each ``<tr>`` contributes one entry: the string of its ``<th>`` is the
    key and the string of its ``<td>`` is the value.  The ``'Availability'``
    entry is then replaced by an ``int`` built from the digits found in its
    text (presumably something like "In stock (22 available)" -- confirm
    against the site); it is ``0`` when the text contains no digit.

    Raises:
        KeyError: If the table has no ``'Availability'`` row.
        AttributeError: If the page has no table, or a row lacks a
            ``<th>``/``<td>``.
    """
    product_info = {
        tr.th.string: tr.td.string for tr in soup.table.find_all("tr")
    }

    # extract the amount from string and cast it
    # ``or ''`` covers a cell whose ``.string`` is None (nested markup).
    digits = ''.join(filter(str.isdigit, product_info['Availability'] or ''))
    product_info['Availability'] = int(digits) if digits else 0
    return product_info


# get relative link from page and build the full URL
def get_image_url(soup, url):
    """Return the absolute URL of the first ``<img>`` found in *soup*.

    The image's ``src`` is resolved against *url* (the address the page was
    fetched from) with ``urllib.parse.urljoin``, so ``../`` segments,
    root-relative paths and already-absolute links are all handled, and the
    scheme of *url* is kept.

    Raises:
        AttributeError: If the page contains no ``<img>`` element.
        KeyError: If that element has no ``src`` attribute.
    """
    link = soup.img["src"]
    return urljoin(url, link)


# get full description as string
# luckily this ...

# ... was the only <p> without a class
def product_description(soup):
    """Return the string of the first ``<p>`` matched by ``class_=''``.

    Returns:
        The paragraph's ``.string``, or ``None`` when no such paragraph is
        found (or when the paragraph has no single string).
    """
    paragraph = soup.find("p", class_='')
    if paragraph is None:
        # No matching paragraph: report "no description" instead of crashing.
        return None
    return paragraph.string


# get category from breadcrumb
def get_category(soup):
    """Return the text of the last ``<a>`` inside ``ul.breadcrumb``.

    Raises:
        AttributeError: If the page has no ``ul.breadcrumb`` element.
        IndexError: If the breadcrumb contains no link.
    """
    links = soup.find("ul", class_="breadcrumb").find_all("a")
    return links[-1].text


# create a list with all information consecutively
def get_data(soup, url):
    """Collect every exported field of one product page into a list.

    The order matches the header row written by ``data_output``: page URL,
    UPC, title, price incl. tax, price excl. tax, availability, description,
    category, number of reviews, image URL.

    Args:
        soup: Parsed product page.
        url: Address the page was fetched from.

    Returns:
        A list of ten values, ready to be written as one CSV row.
    """
    # Parse the product table once and reuse it for every field.
    table = product_information(soup)
    return [
        url,
        table['UPC'],
        get_title(soup),
        table['Price (incl. tax)'],
        table['Price (excl. tax)'],
        table['Availability'],
        product_description(soup),
        get_category(soup),
        # NOTE(review): this lands in the 'review_rating' column but holds
        # the number of reviews -- confirm which one is wanted.
        table['Number of reviews'],
        get_image_url(soup, url),
    ]


# write the file
def data_output(info, file):
    """Write the header row followed by *info* to the CSV file *file*.

    Args:
        info: One row of values, in the order of the header below.
        file: Path of the file to create; an existing file is overwritten.
    """
    # NOTE(review): 'universal_ product_code (upc)' contains a space after
    # the underscore; left unchanged in case a consumer relies on the exact
    # header -- confirm and fix if it is a typo.
    fieldnames = [
        'product_page_url',
        'universal_ product_code (upc)',
        'title',
        'price_including_tax',
        'price_excluding_tax',
        'number_available',
        'product_description',
        'category',
        'review_rating',
        'image_url',
    ]
    # newline='' lets the csv module control line endings (otherwise blank
    # rows appear on Windows); utf-8 keeps non-ASCII text such as currency
    # signs writable whatever the platform's default encoding is.
    with open(file, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(fieldnames)
        writer.writerow(info)


def main():
    """Scrape one hard-coded product page and export it to ``output.csv``.

    Prints the parsed availability and the collected row along the way.
    """
    url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"
    html = extract_web(url)
    soup = BeautifulSoup(html, "html.parser")

    test = product_information(soup)
    print(test['Availability'])

    info = get_data(soup, url)
    print(info)
    data_output(info, 'output.csv')


if __name__ == "__main__":
    main()