import requests
from bs4 import BeautifulSoup
import csv


def extract_web(url):
    r = requests.get(url)
    page = r.content
    return page


# extract the product title from the page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product information from the table and put it in a dict,
# then extract the quantity from the availability string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# get the relative link from the page and build the full image URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# get the full description as a string
# luckily this was the only <p> without a class
def product_description(soup):
    desc = soup.find("p", class_='').string
    return desc


# gather all the information in a list for the writing loop later
# /!\ don't know if that's the best way
def get_data(soup, url):
    product_info = product_information(soup)
    info = [url,
            product_info['UPC'],
            get_title(soup),
            product_info['Price (incl. tax)'],
            product_info['Price (excl. tax)'],
            product_info['Availability'],
            product_description(soup),
            "TODO",
            product_info['Number of reviews'],
            get_image_url(soup, url)]
    return info


# write the CSV file: one header row, one data row
def data_output(info, file):
    fieldnames = ['product_page_url', 'universal_product_code (upc)', 'title',
                  'price_including_tax', 'price_excluding_tax', 'number_available',
                  'product_description', 'category', 'review_rating', 'image_url']
    with open(file, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(fieldnames)
        writer.writerow(info)


def main():
    url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"
    html = extract_web(url)
    soup = BeautifulSoup(html, "html.parser")
    test = product_information(soup)
    print(test['Availability'])
    info = get_data(soup, url)
    print(info)
    data_output(info, 'output.csv')


if __name__ == "__main__":
    main()
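
# --- Alternative sketch, not part of the script above ---
# The "/!\ don't know if that's the best way" note on get_data could be
# addressed by returning a dict keyed by the CSV fieldnames and writing it with
# csv.DictWriter. get_data_dict and data_output_dict are hypothetical names for
# this assumed refactor; they reuse the functions defined above unchanged.
def get_data_dict(soup, url):
    product_info = product_information(soup)
    return {
        'product_page_url': url,
        'universal_product_code (upc)': product_info['UPC'],
        'title': get_title(soup),
        'price_including_tax': product_info['Price (incl. tax)'],
        'price_excluding_tax': product_info['Price (excl. tax)'],
        'number_available': product_info['Availability'],
        'product_description': product_description(soup),
        'category': "TODO",
        'review_rating': product_info['Number of reviews'],
        'image_url': get_image_url(soup, url),
    }


def data_output_dict(rows, file):
    # DictWriter writes the header from fieldnames and keeps columns aligned
    # with the dict keys, so fields cannot be written in the wrong order
    fieldnames = ['product_page_url', 'universal_product_code (upc)', 'title',
                  'price_including_tax', 'price_excluding_tax', 'number_available',
                  'product_description', 'category', 'review_rating', 'image_url']
    with open(file, 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)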
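
# --- Sketch of the "writing loop later" mentioned in get_data's comment ---
# One possible shape for scraping several product pages into a single CSV,
# reusing the DictWriter sketch above. The URL list here is only an example;
# in practice the links would come from the catalogue pages.
def scrape_products(urls, file):
    rows = []
    for url in urls:
        soup = BeautifulSoup(extract_web(url), "html.parser")
        rows.append(get_data_dict(soup, url))
    data_output_dict(rows, file)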