From 9d7edf3e9a39503accd25706b1dfa69b5fbb9205 Mon Sep 17 00:00:00 2001
From: yann <yann@needsome.coffee>
Date: Tue, 12 Nov 2024 19:02:32 +0100
Subject: [PATCH] Improve functions and add comments; CSV writer still to do

---
 phase1/main.py | 60 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 6 deletions(-)
diff --git a/phase1/main.py b/phase1/main.py
index 53f4bf7..f13accd 100644
--- a/phase1/main.py
+++ b/phase1/main.py
@@ -2,41 +2,89 @@ import requests
 from bs4 import BeautifulSoup
 import csv
 
-url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"
-
 def extract_web(url):
     r = requests.get(url)
     page = r.content
     return page
 
+# extract the product title from page
 def get_title(soup):
     title = soup.find("div", class_="product_main").h1.string
     return title
 
-
+# extract the product information table into a dict
+# and convert the availability text to an integer quantity
 def product_information(soup):
     product_info={}
     for tr in soup.table.find_all("tr"):
         product_info[tr.th.string] = tr.td.string
+    # extract the amount from the string and cast it to int
+    availability=int(''.join(filter(str.isdigit, product_info['Availability'])))
+    product_info['Availability']=availability
     return product_info
 
-def get_image_url(soup):
+# get relative link from page and build the full URL
+def get_image_url(soup, url):
     link = soup.img.get('src')
+    url_site="https://"+url.split('/')[2]
     img_url=url_site+"/"+link.replace('../', '')
     return img_url
 
+# get the full description as a string
+# relies on the description <p> being the only one without a class
 def product_description(soup):
     desc = soup.find("p", class_='').string
     return desc
 
-if __name__ == '__main__':
+# Build a dict holding every CSV field of one product page.
+# NOTE: 'category' is a placeholder; 'review_rating' currently stores the review count.
+def get_data(soup, url):
+    # parse the product table once instead of once per field
+    table = product_information(soup)
+    return {
+        'product_page_url': url,
+        'universal_ product_code (upc)': table['UPC'],
+        'title': get_title(soup),
+        'price_including_tax': table['Price (incl. tax)'],
+        'price_excluding_tax': table['Price (excl. tax)'],
+        'number_available': table['Availability'],
+        'product_description': product_description(soup),
+        'category': "TODO",
+        'review_rating':  table['Number of reviews'],
+        'image_url': get_image_url(soup, url)
+    }
 
+# Write product rows to a CSV file, header line first.
+#   info: one product dict (as built by get_data) or an iterable of such dicts
+#   file: path of the CSV file to create or overwrite
+def data_output(info, file):
+    fieldnames = ['product_page_url', 'universal_ product_code (upc)', 'title', 'price_including_tax', 'price_excluding_tax', 'number_available', 'product_description', 'category', 'review_rating', 'image_url']
+    # A bare dict would be iterated key by key (strings, which DictWriter
+    # rejects), so wrap a single product as a one-row list.
+    rows = [info] if isinstance(info, dict) else info
+    # newline='' lets the csv module control line endings, as its docs require;
+    # utf-8 keeps non-ASCII characters in the scraped text writable everywhere.
+    with open(file, 'w', newline='', encoding='utf-8') as csv_file:
+        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(rows)
+
+
+def main():
     url_site="https://books.toscrape.com"
     url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"
 
     html = extract_web(url)
     soup = BeautifulSoup(html, "html.parser")
-    print(product_information(soup))
+    test=product_information(soup)
+    print(test['Availability'])
+
+    info=get_data(soup, url)
+    data_output(info, 'output.csv')
+
+if __name__ == "__main__":
+    main()
+