Improve functions and add comments; CSV writer still to do
This commit is contained in:
parent
fd53a2d704
commit
9d7edf3e9a
@ -2,41 +2,89 @@ import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import csv
|
||||
|
||||
# Sample product page on books.toscrape.com. NOTE(review): main() defines
# its own local copy of this same URL, so this module-level value may be
# redundant -- confirm before relying on it.
url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"
|
||||
|
||||
def extract_web(url, timeout=10):
    """Download a web page and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests can block indefinitely.

    Returns:
        The undecoded response body (bytes).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as a product page.
    response.raise_for_status()
    return response.content
|
||||
|
||||
# extract the product title from page
|
||||
def get_title(soup):
    """Return the product title found on a parsed product page.

    Args:
        soup: Parsed page (BeautifulSoup document).

    Returns:
        The ``.string`` of the ``<h1>`` inside the ``product_main`` div.
    """
    main_block = soup.find("div", class_="product_main")
    heading = main_block.h1
    return heading.string
|
||||
|
||||
|
||||
# extract the product_information from the table and put them in a dict
|
||||
# and extract quantity from string
|
||||
def product_information(soup):
    """Collect the first table of the page into a dict.

    Every table row contributes one entry: the text of its header cell is
    the key and the text of its data cell is the value. The 'Availability'
    entry is then replaced by the integer built from the digits found in
    its text.

    Args:
        soup: Parsed page (BeautifulSoup document) containing the table.

    Returns:
        dict mapping row header text to row value text, with
        'Availability' stored as an int (0 when its text has no digits).

    Raises:
        KeyError: If the table has no 'Availability' row.
    """
    product_info = {
        row.th.string: row.td.string
        for row in soup.table.find_all("tr")
    }

    # Extract the amount from the availability text and cast it to int.
    availability_text = product_info['Availability'] or ''
    digits = ''.join(filter(str.isdigit, availability_text))
    # int('') would raise ValueError; text without digits counts as zero.
    product_info['Availability'] = int(digits) if digits else 0
    return product_info
|
||||
|
||||
def get_image_url(soup, url):
    """Return the absolute URL of the first image on the page.

    The page stores the image location as a relative link (``src`` of the
    first ``<img>``); it is resolved against the URL of the page itself.

    Args:
        soup: Parsed page (BeautifulSoup document).
        url: URL the page was downloaded from; used as the base for
            resolving the relative image link.

    Returns:
        The absolute image URL as a string.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    relative_link = soup.img.get('src')
    # urljoin resolves the '../' segments against the page location and
    # keeps the page's own scheme and host instead of assuming https.
    return urljoin(url, relative_link)
|
||||
|
||||
# get full description as string
|
||||
# luckily this <p> was the only one without class
|
||||
def product_description(soup):
    """Return the product description text of a parsed product page.

    The description is looked up as the ``<p>`` element matched by an
    empty class filter (on the sample page this was the only paragraph
    without a class).

    Args:
        soup: Parsed page (BeautifulSoup document).

    Returns:
        The ``.string`` of the matched paragraph, or None when the page
        has no such paragraph.
    """
    paragraph = soup.find("p", class_='')
    if paragraph is None:
        # No matching paragraph: report "no description" instead of
        # failing with AttributeError on None.
        return None
    return paragraph.string
|
||||
|
||||
if __name__ == '__main__':
|
||||
#create a dict with all information for writing loop later
|
||||
# /!\ don't know if that's the best way
|
||||
def get_data(soup, url):
    """Gather every exported field of one product page into a dict.

    Args:
        soup: Parsed product page (BeautifulSoup document).
        url: URL the page was downloaded from.

    Returns:
        dict keyed by the CSV column names, holding the values extracted
        from the page.
    """
    # Parse the information table once instead of once per field.
    details = product_information(soup)
    return {
        'product_page_url': url,
        'universal_ product_code (upc)': details['UPC'],
        'title': get_title(soup),
        'price_including_tax': details['Price (incl. tax)'],
        'price_excluding_tax': details['Price (excl. tax)'],
        'number_available': details['Availability'],
        'product_description': product_description(soup),
        # Category extraction is not implemented yet; placeholder value.
        'category': "TODO",
        # NOTE(review): this stores the 'Number of reviews' table row;
        # confirm whether the star rating was intended instead.
        'review_rating': details['Number of reviews'],
        'image_url': get_image_url(soup, url),
    }
|
||||
|
||||
#write the file
|
||||
def data_output(info, file):
    """Write product data to a CSV file with a header row.

    Args:
        info: One product dict keyed by the column names below, or an
            iterable of such dicts (one CSV row each).
        file: Path of the CSV file to create or overwrite.
    """
    fieldnames = [
        'product_page_url',
        'universal_ product_code (upc)',
        'title',
        'price_including_tax',
        'price_excluding_tax',
        'number_available',
        'product_description',
        'category',
        'review_rating',
        'image_url',
    ]
    # A single product arrives as one dict; iterating it directly would
    # yield its keys instead of rows, so wrap it in a list.
    rows = [info] if isinstance(info, dict) else info

    # newline='' lets the csv module control line endings (no blank lines
    # on Windows); utf-8 keeps non-ASCII characters such as currency signs.
    with open(file, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def main():
    """Scrape one product page and write its data to output.csv."""
    url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"

    html = extract_web(url)
    soup = BeautifulSoup(html, "html.parser")

    # Debug output: show the parsed information table and the stock count.
    # The table is parsed once and reused for both prints.
    details = product_information(soup)
    print(details)
    print(details['Availability'])

    info = get_data(soup, url)
    data_output(info, 'output.csv')
|
||||
|
||||
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user