copy phase1/main.py as phase1.py to import in main
This commit is contained in:
parent 50ca4fccd8
commit 27d37fb5d3
phase2/phase1.py | 104 lines (new normal file)
@@ -0,0 +1,104 @@
import requests
from bs4 import BeautifulSoup
import csv


def extract_web(url):
    r = requests.get(url)
    page = r.content
    return page


# extract the product title from the page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title


# extract the product information from the table and put it in a dict,
# and extract the quantity from the availability string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to an int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info


# get the relative link from the page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url


# get the full description as a string
# luckily this <p> is the only one without a class
def product_description(soup):
    desc = soup.find("p", class_='').string
    return desc


# get the category from the breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread

# create a list with all information consecutively
|
||||
# /!\ don't know if that's the best way
|
||||
def get_data(soup, url):
|
||||
info = [url, product_information(soup)['UPC'],
|
||||
get_title(soup),
|
||||
product_information(soup)['Price (incl. tax)'],
|
||||
product_information(soup)['Price (excl. tax)'],
|
||||
product_information(soup)['Availability'],
|
||||
product_description(soup),
|
||||
get_category(soup),
|
||||
product_information(soup)['Number of reviews'],
|
||||
get_image_url(soup, url)
|
||||
]
|
||||
return info
|
||||
|
||||
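# Illustrative sketch only (the values below are hypothetical, not taken from a real
# run): get_data() returns one flat list per product page, in the same field order as
# the CSV header written by data_output(), e.g.
#   ['https://books.toscrape.com/catalogue/some-book_123/index.html',
#    'a123456789012345', 'Some Title', '£10.00', '£8.00', 19,
#    'Some description...', 'Some Category', '3',
#    'https://books.toscrape.com/media/cache/...']
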
# write the data to a CSV file
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    # newline='' keeps the csv module from inserting blank lines on Windows
    with open(file, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(fieldnames)
        # info is one flat list of fields for a single product, so write it as a
        # single row (writing each element with its own writerow() call would split
        # the strings into characters and fail on the integer availability)
        writer.writerow(info)

def main():
    url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"

    html = extract_web(url)
    soup = BeautifulSoup(html, "html.parser")
    test = product_information(soup)
    print(test['Availability'])

    info = get_data(soup, url)
    print(info)
    # data_output() already appends ".csv", so pass the name without the extension
    data_output(info, 'output')


if __name__ == "__main__":
    main()
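Since the point of this commit is to import these helpers from phase2's main script, here is a minimal sketch of how phase2/main.py might use the copied module. It assumes main.py sits next to phase1.py in phase2/; the helper name scrape_product and the output file name are assumptions for illustration, not part of this commit (only the example URL comes from the code above):

    from bs4 import BeautifulSoup

    import phase1


    def scrape_product(url):
        # fetch and parse one product page, then collect its fields
        html = phase1.extract_web(url)
        soup = BeautifulSoup(html, "html.parser")
        return phase1.get_data(soup, url)


    if __name__ == "__main__":
        product_url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"
        row = scrape_product(product_url)
        # writes output.csv (data_output appends the .csv extension)
        phase1.data_output(row, "output")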