import csv

import requests
from bs4 import BeautifulSoup


# download the page and return its raw HTML content
def extract_web(url):
    r = requests.get(url)
    page = r.content
    return page


# extract the product title from the page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product information from the table, put it in a dict,
# and extract the quantity from the availability string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# get the relative link from the page and build the full image URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url
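# Note: urllib.parse.urljoin(url, soup.img.get('src')) would arguably be a more
# robust way to resolve the relative image path; the manual string handling
# above is kept to match the original approach.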

# get the full description as a string
# luckily this was the only <p> without a class
def product_description(soup):
    desc = soup.find("p", class_='').string
    return desc


# create a dict with all the information for the writing loop later
# /!\ don't know if that's the best way
def get_data(soup, url):
    product_info = product_information(soup)
    info = {
        'product_page_url': url,
        'universal_product_code (upc)': product_info['UPC'],
        'title': get_title(soup),
        'price_including_tax': product_info['Price (incl. tax)'],
        'price_excluding_tax': product_info['Price (excl. tax)'],
        'number_available': product_info['Availability'],
        'product_description': product_description(soup),
        'category': "TODO",
        # note: this is the review count from the table, not the star rating
        'review_rating': product_info['Number of reviews'],
        'image_url': get_image_url(soup, url)
    }
    return info


# write the rows (one dict per product) to the CSV file
def data_output(info, file):
    fieldnames = ['product_page_url', 'universal_product_code (upc)', 'title',
                  'price_including_tax', 'price_excluding_tax', 'number_available',
                  'product_description', 'category', 'review_rating', 'image_url']
    with open(file, 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for line in info:
            writer.writerow(line)


def main():
    url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"
    html = extract_web(url)
    soup = BeautifulSoup(html, "html.parser")
    test = product_information(soup)
    print(test['Availability'])
    info = get_data(soup, url)
    # data_output expects a list of product dicts so more books can be added later
    data_output([info], 'output.csv')


if __name__ == "__main__":
    main()
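# Example usage (a sketch, assuming the dependencies are installed and the
# script file name is scraper.py, which is an assumption):
#   pip install requests beautifulsoup4
#   python scraper.py
# This prints the availability count for the sample book and writes a one-row
# output.csv with the fields listed in data_output().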