copy phase1/main.py as phase1.py to import in main
This commit is contained in:
parent
50ca4fccd8
commit
27d37fb5d3
104
phase2/phase1.py
Normal file
104
phase2/phase1.py
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import csv
|
||||||
|
|
||||||
|
def extract_web(url):
    """Download *url* and return the raw page content (bytes).

    Raises requests.HTTPError on a 4xx/5xx response instead of
    silently handing an error page to the parser.
    """
    r = requests.get(url)
    r.raise_for_status()  # fail fast: the original returned error pages as if valid
    return r.content
|
||||||
|
|
||||||
|
# extract the product title from page
def get_title(soup):
    """Return the product title taken from the product_main <h1>."""
    main_block = soup.find("div", class_="product_main")
    return main_block.h1.string
|
||||||
|
|
||||||
|
# extract the product_information from the table and put them in a dict
# and extract quantity from string
def product_information(soup):
    """Return a dict mapping each table header to its value.

    The 'Availability' entry is converted from a string such as
    "In stock (19 available)" to the bare integer 19.
    """
    product_info = {row.th.string: row.td.string
                    for row in soup.table.find_all("tr")}
    # extract the amount from string and cast it
    digits = [ch for ch in product_info['Availability'] if ch.isdigit()]
    product_info['Availability'] = int(''.join(digits))
    return product_info
|
||||||
|
|
||||||
|
# get relative link from page and build the full URL
def get_image_url(soup, url):
    """Return the absolute URL of the first image on the page.

    urljoin resolves '../' segments against *url* properly and keeps
    the page's original scheme, unlike the old hand-rolled version
    which hardcoded "https://" and blindly stripped every '../'.
    """
    from urllib.parse import urljoin  # stdlib; local import keeps this fix self-contained
    # NOTE(review): assumes the first <img> on the page is the product image — confirm
    link = soup.img.get('src')
    return urljoin(url, link)
|
||||||
|
|
||||||
|
# get full description as string
# luckily this <p> was the only one without class
def product_description(soup):
    """Return the product description text (the class-less <p> tag)."""
    paragraph = soup.find("p", class_='')
    return paragraph.string
|
||||||
|
|
||||||
|
# get category from breadcrumb
def get_category(soup):
    """Return the product category: the text of the last breadcrumb link."""
    breadcrumb = soup.find("ul", class_="breadcrumb")
    return breadcrumb.find_all("a")[-1].text
|
||||||
|
|
||||||
|
# create a list with all information consecutively
def get_data(soup, url):
    """Collect all product fields into one flat list (one CSV row).

    The field order matches the header written by data_output().
    """
    # parse the information table ONCE — the original re-ran
    # product_information(soup) for every single field
    info_table = product_information(soup)
    return [url,
            info_table['UPC'],
            get_title(soup),
            info_table['Price (incl. tax)'],
            info_table['Price (excl. tax)'],
            info_table['Availability'],
            product_description(soup),
            get_category(soup),
            info_table['Number of reviews'],
            get_image_url(soup, url),
            ]
|
||||||
|
|
||||||
|
# write the file
def data_output(info, file):
    """Write one product (the flat field list *info*) to a CSV file.

    *file* may be passed with or without the '.csv' suffix; it is
    appended only when missing, so 'output.csv' no longer becomes
    'output.csv.csv'.
    """
    if not file.endswith(".csv"):
        file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    # newline='' is required by the csv module (avoids blank lines on Windows)
    with open(file, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(fieldnames)
        # *info* is ONE row of fields. The original looped over it and wrote
        # each field as its own row — strings exploded into one character per
        # cell, and the int Availability field raised TypeError.
        writer.writerow(info)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Scrape one product page and write its data to output.csv."""
    url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"

    html = extract_web(url)
    soup = BeautifulSoup(html, "html.parser")

    info = get_data(soup, url)
    print(info)
    # pass the base name only: data_output appends the '.csv' suffix,
    # so passing 'output.csv' used to produce 'output.csv.csv'
    data_output(info, 'output')


if __name__ == "__main__":
    main()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user