diff --git a/phase1/main.py b/phase1/main.py
index 7f46836..a2968a9 100644
--- a/phase1/main.py
+++ b/phase1/main.py
@@ -15,19 +15,19 @@ def get_title(soup):
 # extract the product_information from the table and put them in a dict
 # and extract quantity from string
 def product_information(soup):
-    product_info={}
+    product_info = {}
     for tr in soup.table.find_all("tr"):
         product_info[tr.th.string] = tr.td.string
     #extract the amount from string and case it
-    availability=int(''.join(filter(str.isdigit, product_info['Availability'])))
-    product_info['Availability']=availability
+    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
+    product_info['Availability'] = availability
     return product_info
 
 # get relative link from page and build the full URL
 def get_image_url(soup, url):
     link = soup.img.get('src')
-    url_site="https://"+url.split('/')[2]
-    img_url=url_site+"/"+link.replace('../', '')
+    url_site = "https://" + url.split('/')[2]
+    img_url = url_site + "/" + link.replace('../', '')
     return img_url
 
 # get full description as string
@@ -36,17 +36,36 @@ def product_description(soup):
     desc = soup.find("p", class_='').string
     return desc
 
-#create a dict with all information for writing loop later
+#create a list with all information consecutively
 # /!\ don't know if that's the best way
 def get_data(soup, url):
-    info = [url, product_information(soup)['UPC'], get_title(soup), product_information(soup)['Price (incl. tax)'], product_information(soup)['Price (excl. tax)'], product_information(soup)['Availability'], product_description(soup), "TODO", product_information(soup)['Number of reviews'], get_image_url(soup, url)]
+    info = [url, product_information(soup)['UPC'],
+            get_title(soup),
+            product_information(soup)['Price (incl. tax)'],
+            product_information(soup)['Price (excl. tax)'],
+            product_information(soup)['Availability'],
+            product_description(soup),
+            "TODO",
+            product_information(soup)['Number of reviews'],
+            get_image_url(soup, url)
+            ]
     return info
 
 #write the file
 def data_output(info, file):
-    fieldnames = ['product_page_url', 'universal_ product_code (upc)', 'title', 'price_including_tax', 'price_excluding_tax', 'number_available', 'product_description', 'category', 'review_rating', 'image_url']
+    fieldnames = ['product_page_url',
+                  'universal_ product_code (upc)',
+                  'title',
+                  'price_including_tax',
+                  'price_excluding_tax',
+                  'number_available',
+                  'product_description',
+                  'category',
+                  'review_rating',
+                  'image_url']
+
     with open(file, 'w') as csv_file:
-        writer = csv.writer(csv_file, delimiter=',')
+        writer = csv.writer(csv_file, delimiter = ',')
         writer.writerow(fieldnames)
         writer.writerow(info)
 
@@ -57,15 +76,15 @@ def data_output(info, file):
 
 
 def main():
-    url_site="https://books.toscrape.com"
+    url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"
     html = extract_web(url)
     soup = BeautifulSoup(html, "html.parser")
 
-    test=product_information(soup)
+    test = product_information(soup)
     print(test['Availability'])
 
 
-    info=get_data(soup, url)
+    info = get_data(soup, url)
     print(info)
     data_output(info, 'output.csv')
 