"""Scrape a single product page and export its fields to a CSV file.

Provenance: introduced as ``phase2/phase1.py`` by commit
27d37fb5d35ea02502da4ae6fc4ab546ed5d7d26 (yann, Thu, 14 Nov 2024
12:35:35 +0100), subject "copy phase1/main.py as phase1.py to import in
main".
"""

import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Column headers written as the first CSV row, in the same order as the
# record built by get_data().
_FIELDNAMES = (
    'product_page_url',
    'universal_ product_code (upc)',
    'title',
    'price_including_tax',
    'price_excluding_tax',
    'number_available',
    'product_description',
    'category',
    'review_rating',
    'image_url',
)


def extract_web(url, timeout=10):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scraper forever.

    Returns:
        The response content as returned by ``requests`` (``r.content``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Failing here is clearer than parsing an error page later.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content


def get_title(soup):
    """Return the product title: the ``<h1>`` inside ``div.product_main``."""
    return soup.find("div", class_="product_main").h1.string


def product_information(soup):
    """Return the product information table as a dict.

    Each ``<tr>`` of the first ``<table>`` contributes one entry mapping
    its ``<th>`` text to its ``<td>`` text. The ``'Availability'`` entry
    is then replaced by the integer made of the digits found in its text,
    or ``0`` when the text contains no digit at all.

    Raises:
        KeyError: If the table has no ``'Availability'`` row.
    """
    product_info = {}
    for row in soup.table.find_all("tr"):
        product_info[row.th.string] = row.td.string

    # Keep only the digits of the availability text and cast them. Guard
    # against a missing/empty text so int('') can never be reached.
    availability_text = product_info['Availability'] or ''
    digits = ''.join(filter(str.isdigit, availability_text))
    product_info['Availability'] = int(digits) if digits else 0
    return product_info


def get_image_url(soup, url):
    """Return the absolute URL of the first ``<img>`` of the page.

    Args:
        soup: Parsed page.
        url: Address the page was fetched from; the image ``src`` is
            resolved relative to it.
    """
    link = soup.img.get('src')
    # urljoin resolves the '../' segments and keeps the page's own scheme
    # and host instead of rebuilding them by hand.
    return urljoin(url, link)


def product_description(soup):
    """Return the full product description as a string.

    The description is looked up as the ``<p>`` matched by ``class_=''``;
    per the original author's note it was the only paragraph without a
    class. An empty string is returned when no such paragraph exists.
    """
    paragraph = soup.find("p", class_='')
    if paragraph is None:
        return ''
    return paragraph.string


def get_category(soup):
    """Return the category: the text of the last link of ``ul.breadcrumb``."""
    return soup.find("ul", class_="breadcrumb").find_all("a")[-1].text


def get_data(soup, url):
    """Build one flat record with every exported field, in CSV column order.

    Args:
        soup: Parsed product page.
        url: Address of the product page.

    Returns:
        A list of ten values ordered like the CSV header.
    """
    # Parse the information table once and reuse it for every field.
    details = product_information(soup)
    # NOTE(review): the 'review_rating' column is fed with
    # 'Number of reviews' -- confirm this is the intended value.
    return [
        url,
        details['UPC'],
        get_title(soup),
        details['Price (incl. tax)'],
        details['Price (excl. tax)'],
        details['Availability'],
        product_description(soup),
        get_category(soup),
        details['Number of reviews'],
        get_image_url(soup, url),
    ]


def data_output(info, file):
    """Write the header row and the product record(s) to a CSV file.

    Args:
        info: Either one flat record (as returned by ``get_data``) or an
            iterable of such records.
        file: Output file name; ``.csv`` is appended unless it is already
            there.
    """
    path = file if file.endswith(".csv") else file + ".csv"

    rows = list(info)
    # A flat record holds scalars, not sequences: wrap it so it is written
    # as one row instead of one row per field.
    if rows and not isinstance(rows[0], (list, tuple)):
        rows = [rows]

    # newline='' is required by the csv module to avoid blank lines on
    # Windows; an explicit encoding keeps non-ASCII text such as the
    # pound sign in prices writable on every platform.
    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(_FIELDNAMES)
        writer.writerows(rows)


def main():
    """Scrape one sample product page and export it to ``output.csv``."""
    url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"

    html = extract_web(url)
    soup = BeautifulSoup(html, "html.parser")
    print(product_information(soup)['Availability'])

    info = get_data(soup, url)
    print(info)
    # data_output adds the '.csv' suffix itself.
    data_output(info, 'output')


if __name__ == "__main__":
    main()