From 9d7edf3e9a39503accd25706b1dfa69b5fbb9205 Mon Sep 17 00:00:00 2001
From: yann
Date: Tue, 12 Nov 2024 19:02:32 +0100
Subject: [PATCH] improve function, added comment... writer to do

---
 phase1/main.py | 60 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/phase1/main.py b/phase1/main.py
index 53f4bf7..f13accd 100644
--- a/phase1/main.py
+++ b/phase1/main.py
@@ -2,41 +2,89 @@ import requests
 from bs4 import BeautifulSoup
 import csv
 
-url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"
-
 def extract_web(url):
     r = requests.get(url)
     page = r.content
     return page
 
 
+# extract the product title from page
 def get_title(soup):
     title = soup.find("div", class_="product_main").h1.string
     return title
-
 
 
+# extract the product_information from the table and put them in a dict
+# and extract quantity from string
 def product_information(soup):
     product_info={}
     for tr in soup.table.find_all("tr"):
         product_info[tr.th.string] = tr.td.string
+    #extract the amount from string and cast it
+    availability=int(''.join(filter(str.isdigit, product_info['Availability'])))
+    product_info['Availability']=availability
     return product_info
 
 
-def get_image_url(soup):
+# get relative link from page and build the full URL
+def get_image_url(soup, url):
     link = soup.img.get('src')
+    url_site="https://"+url.split('/')[2]
     img_url=url_site+"/"+link.replace('../', '')
     return img_url
 
 
+# get full description as string
+# luckily this was the only one without class
 def product_description(soup):
     desc = soup.find("p", class_='').string
     return desc
 
 
-if __name__ == '__main__':
+#create a dict with all information for writing loop later
+# /!\ don't know if that's the best way
+def get_data(soup, url):
+    info= {}
+    info = {
+        'product_page_url': url,
+        'universal_ product_code (upc)': product_information(soup)['UPC'],
+        'title': get_title(soup),
+        'price_including_tax': product_information(soup)['Price (incl. tax)'],
+        'price_excluding_tax': product_information(soup)['Price (excl. tax)'],
+        'number_available': product_information(soup)['Availability'],
+        'product_description': product_description(soup),
+        'category': "TODO",
+        'review_rating': product_information(soup)['Number of reviews'],
+        'image_url': get_image_url(soup, url)
+    }
+    return info
+#write the file
+def data_output(info, file):
+    fieldnames = ['product_page_url', 'universal_ product_code (upc)', 'title', 'price_including_tax', 'price_excluding_tax', 'number_available', 'product_description', 'category', 'review_rating', 'image_url']
+    with open('output.csv', 'w') as csv_file:
+        writer = csv.DictWriter(csv_file, fieldnames=fieldnames )
+
+        writer.writeheader()
+        for line in info:
+            writer.writerow(line)
+
+
+
+
+
+
+
+def main():
     url_site="https://books.toscrape.com"
     url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"
     html = extract_web(url)
     soup = BeautifulSoup(html, "html.parser")
-    print(product_information(soup))
+    test=product_information(soup)
+    print(test['Availability'])
+
+    info=get_data(soup, url)
+    data_output(info, 'output.csv')
+
+if __name__ == "__main__":
+    main()
+