# Projet2/phase1/main.py

import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def extract_web(url, timeout=10):
    """Download the page at *url* and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as bytes (``Response.content``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures and timeouts.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to
    # the HTML parser as if it were a product page.
    response.raise_for_status()
    return response.content

def get_title(soup):
    """Return the product title found on the page.

    Looks up the first ``<div class="product_main">`` and returns the
    ``.string`` of its ``<h1>`` element.
    """
    main_panel = soup.find("div", class_="product_main")
    heading = main_panel.h1
    return heading.string

def product_information(soup):
    """Read the page's first table into a dict and parse the stock count.

    Each ``<tr>`` contributes one entry: the ``<th>`` string is the key and
    the ``<td>`` string is the value. The ``'Availability'`` entry is then
    replaced by the integer formed from the digits in its text.

    Args:
        soup: Parsed page exposing ``.table.find_all("tr")``.

    Returns:
        dict mapping row headers to cell strings, with ``'Availability'``
        converted to ``int``. The count is 0 when the text has no digits.

    Raises:
        KeyError: If the table has no ``'Availability'`` row.
    """
    product_info = {
        row.th.string: row.td.string for row in soup.table.find_all("tr")
    }
    # A cell's .string may be None; treat that like text without digits.
    availability_text = product_info['Availability'] or ''
    digits = ''.join(filter(str.isdigit, availability_text))
    # int('') raises ValueError, so text such as "Out of stock" maps to 0.
    product_info['Availability'] = int(digits) if digits else 0
    return product_info

def get_image_url(soup, url):
    """Return the absolute URL of the first ``<img>`` on the page.

    Args:
        soup: Parsed page; its first ``<img>`` supplies the ``src`` value.
        url: URL of the page itself, used as the base for relative links.

    Returns:
        The image URL resolved against *url*.

    Raises:
        ValueError: If the first ``<img>`` has no ``src`` attribute.
    """
    src = soup.img.get('src')
    if not src:
        raise ValueError("first <img> on the page has no src attribute")
    # urljoin resolves "../" segments against the page URL and keeps the
    # page's own scheme and host; an already-absolute src passes through
    # unchanged.
    return urljoin(url, src)

def product_description(soup):
    """Return the product description as a string.

    Takes the first ``<p>`` matched by ``class_=''`` and returns its
    ``.string``. The lookup relies on the description being the only
    paragraph without a class (per the original author's note); verify
    this if the page layout changes.
    """
    paragraph = soup.find("p", class_='')
    return paragraph.string

def get_category(soup):
    """Return the text of the last link in the breadcrumb list.

    Finds the first ``<ul class="breadcrumb">``, collects its ``<a>``
    elements and returns the ``.text`` of the final one, which is taken to
    be the product's category.
    """
    breadcrumb = soup.find("ul", class_="breadcrumb")
    links = breadcrumb.find_all("a")
    last_link = links[-1]
    return last_link.text

def get_data(soup, url):
    """Collect every exported field of a product page into one list.

    Args:
        soup: Parsed product page.
        url: URL of that page; stored as the first field and used to
            resolve the image link.

    Returns:
        A list in this order: page URL, UPC, title, price including tax,
        price excluding tax, availability, description, category, number
        of reviews, image URL.
    """
    # Parse the product table once; every table-based field below reads
    # from this single dict instead of re-parsing the table per field.
    details = product_information(soup)
    return [
        url,
        details['UPC'],
        get_title(soup),
        details['Price (incl. tax)'],
        details['Price (excl. tax)'],
        details['Availability'],
        product_description(soup),
        get_category(soup),
        # NOTE(review): this is the review count taken from the table;
        # confirm that is what the corresponding output column should hold.
        details['Number of reviews'],
        get_image_url(soup, url),
    ]

def data_output(info, file):
    """Write a header row followed by one product row to a CSV file.

    Args:
        info: Sequence of field values, in the same order as the header
            columns listed below.
        file: Path of the CSV file to create; an existing file is
            overwritten.
    """
    fieldnames = ['product_page_url',
                  'universal_ product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']

    # newline='' lets the csv writer control line endings itself; without
    # it, platforms that translate "\n" end up with doubled line breaks.
    # An explicit UTF-8 encoding keeps the write independent of the locale.
    with open(file, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(fieldnames)
        writer.writerow(info)


def main():
    """Scrape one hard-coded product page, print it, and save it as CSV.

    Downloads the page, prints the parsed availability count, prints the
    full list of extracted fields, then writes them to ``output.csv``.
    """
    url = "https://books.toscrape.com/catalogue/set-me-free_988/index.html"

    soup = BeautifulSoup(extract_web(url), "html.parser")

    # Show the parsed stock count on its own before the full record.
    print(product_information(soup)['Availability'])

    record = get_data(soup, url)
    print(record)
    data_output(record, 'output.csv')


if __name__ == "__main__":
    main()