remove url_base, refactor list get_data, fix comment and PEP8

This commit is contained in:
yann 2024-11-13 11:03:25 +01:00
parent 1adcf0b224
commit 5d6a9bc263

View File

@ -15,19 +15,19 @@ def get_title(soup):
# Extract the product information from the table into a dict,
# then normalise the availability field to an integer count.
def product_information(soup):
    """Return a dict of table rows (header -> value) with 'Availability' cast to int."""
    details = {row.th.string: row.td.string for row in soup.table.find_all("tr")}
    # Keep only the digits of e.g. "In stock (19 available)" and cast them.
    digits = filter(str.isdigit, details['Availability'])
    details['Availability'] = int(''.join(digits))
    return details
# get relative link from page and build the full URL
def get_image_url(soup, url):
    """Return the absolute URL of the product image.

    `url` is the product page URL; the image src in the page is relative
    (prefixed with '../'), so it is re-rooted at the site's host.
    """
    # Local import keeps this fix self-contained within the function.
    from urllib.parse import urlsplit

    link = soup.img.get('src')
    parts = urlsplit(url)
    # Reuse the page's own scheme instead of hardcoding "https://",
    # so an http:// page no longer yields a wrong https:// image URL.
    url_site = parts.scheme + "://" + parts.netloc
    img_url = url_site + "/" + link.replace('../', '')
    return img_url
# get full description as string # get full description as string
@ -36,17 +36,26 @@ def product_description(soup):
desc = soup.find("p", class_='').string desc = soup.find("p", class_='').string
return desc return desc
# create a list with all information consecutively
# /!\ don't know if that's the best way
def get_data(soup, url):
    """Collect every product field for one page into a list, in CSV column order."""
    # Parse the information table once instead of once per field.
    details = product_information(soup)
    info = [url,
            details['UPC'],
            get_title(soup),
            details['Price (incl. tax)'],
            details['Price (excl. tax)'],
            details['Availability'],
            product_description(soup),
            "TODO",  # category extraction not implemented yet
            details['Number of reviews'],
            get_image_url(soup, url)
            ]
    return info
# write the file: header row, then the data row
def data_output(info, file):
    """Write `info` to `file` as CSV, preceded by the column header row."""
    fieldnames = ['product_page_url', 'universal_ product_code (upc)', 'title',
                  'price_including_tax', 'price_excluding_tax', 'number_available',
                  'product_description', 'category', 'review_rating', 'image_url']
    # newline='' is required by the csv module to avoid blank lines on Windows.
    with open(file, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)  # ',' is already the default delimiter
        writer.writerow(fieldnames)
        writer.writerow(info)
@ -57,15 +66,15 @@ def data_output(info, file):
def main(): def main():
url_site="https://books.toscrape.com"
url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html" url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"
html = extract_web(url) html = extract_web(url)
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
test=product_information(soup) test = product_information(soup)
print(test['Availability']) print(test['Availability'])
info=get_data(soup, url) info = get_data(soup, url)
print(info) print(info)
data_output(info, 'output.csv') data_output(info, 'output.csv')