# Projet2/phase4/main.py
import requests
from bs4 import BeautifulSoup
import csv
# get soup from url
def get_html(url):
    r = requests.get(url)
    html = r.content
    soup = BeautifulSoup(html, 'html.parser')
    return soup
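
# A quick illustrative use (assumes the site is reachable; the URL is the one used in main()):
#   soup = get_html("https://books.toscrape.com/")
#   soup.title.string  # the page <title>, e.g. "All products | Books to Scrape ..."
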
# extract the product title from page
def get_title(soup):
    title = soup.find("div", class_="product_main").h1.string
    return title
# extract the product information from the table and put it in a dict,
# then extract the available quantity from its string
def product_information(soup):
    product_info = {}
    for tr in soup.table.find_all("tr"):
        product_info[tr.th.string] = tr.td.string
    # extract the amount from the string and cast it to int
    availability = int(''.join(filter(str.isdigit, product_info['Availability'])))
    product_info['Availability'] = availability
    return product_info
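
# For reference, the keys are taken verbatim from the product table headers, so a returned
# dict looks like (values illustrative): {'UPC': '...', 'Product Type': 'Books',
# 'Price (excl. tax)': '£51.77', 'Price (incl. tax)': '£51.77', 'Tax': '£0.00',
# 'Availability': 22, 'Number of reviews': '0'}
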
# get relative link from page and build the full URL
def get_image_url(soup, url):
    link = soup.img.get('src')
    url_site = "https://" + url.split('/')[2]
    img_url = url_site + "/" + link.replace('../', '')
    return img_url
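
# Example of the rebuild (paths illustrative): a relative src such as
# '../../media/cache/fe/72/xxxx.jpg' found on a page under 'https://books.toscrape.com/...'
# becomes 'https://books.toscrape.com/media/cache/fe/72/xxxx.jpg'.
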
# get full description as string
# luckily this <p> was the only one without class
# and manage the case where there's no description
def product_description(soup):
    try:
        desc = soup.find("p", class_='').string
    except AttributeError:
        desc = "None"
    return desc
# get category from breadcrumb
def get_category(soup):
    bread = soup.find("ul", class_="breadcrumb").find_all("a")[-1].text
    return bread
# create a list with all information in the order expected by data_output
# /!\ don't know if that's the best way
def get_data(soup, url):
    product_info = product_information(soup)
    info = [
        url,
        product_info['UPC'],
        get_title(soup),
        product_info['Price (incl. tax)'],
        product_info['Price (excl. tax)'],
        product_info['Availability'],
        product_description(soup),
        get_category(soup),
        product_info['Number of reviews'],
        get_image_url(soup, url)
    ]
    return info
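
# A returned row lines up positionally with the fieldnames written by data_output below,
# e.g. (values illustrative): [product_page_url, 'a897fe39b1053632', 'A Light in the Attic',
# '£51.77', '£51.77', 22, 'description...', 'Poetry', '0', image_url]
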
# write the collected rows to a CSV file named after the category
def data_output(info, file):
    file = file + ".csv"
    fieldnames = ['product_page_url',
                  'universal_product_code (upc)',
                  'title',
                  'price_including_tax',
                  'price_excluding_tax',
                  'number_available',
                  'product_description',
                  'category',
                  'review_rating',
                  'image_url']
    with open(file, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(fieldnames)
        for i in info:
            writer.writerow(i)
    return file
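
# Illustrative call: data_output(rows, "poetry") writes "poetry.csv" in the working
# directory and returns that filename ("rows" being a list of lists produced by get_data).
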
# get each category name and URL from the side nav and return them as a list of [category, url] pairs
def get_category_list(soup, url):
    catego_info = []
    for li in soup.find("div", class_="side_categories").find_all("li")[1:]:
        catego_info.append([li.a.get_text(strip=True).lower(), url + li.a.get('href')])
    return catego_info
# check whether a category spans multiple pages (20 books per page)
# and return the list of page URLs to scrape
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]
    if total > 20:
        new_url_base = category_url.replace('index.html', '')
        # pages needed = ceil(total / 20); the first page (index.html) is already in the list
        for page_number in range(2, -(-total // 20) + 1):
            url_list.append(new_url_base + "page-" + str(page_number) + ".html")
    return url_list
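
# Pagination follows the site's "page-N.html" scheme, so for a category index URL with,
# say, 152 books (numbers illustrative) the result is:
# ['.../index.html', '.../page-2.html', ..., '.../page-8.html']
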
# get the product URLs from a given category page:
# extract each relative link and build the full product URL using the main url (second arg)
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)
    for i in soup.find_all("article"):
        relative_url = i.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)
    return liste
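
# Each <article> on a listing page yields a full product URL such as
# "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"
# (slug shown only as an example; nothing is hard-coded here).
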
# collect every category from the home page,
# then grab all the products of each category and write one CSV file per category
def main():
    # init
    url = "https://books.toscrape.com/"
    # get html from URL
    soup = get_html(url)
    # init counters
    total_category = len(get_category_list(soup, url))
    total_books = int(soup.form.strong.text)
    processed_books = 0
    print(total_books, "books to process across", total_category, "categories.\nGo.")
    # go ahead for each category
    for line in get_category_list(soup, url):
        category = line[0]
        category_url = line[1]
        total_category -= 1
        # display which category is being processed
        print("\n -> Processing category: " + category)
        # check if there are multiple pages and build the URL list
        url_list = check_for_pages(category_url)
        # get the product list of each page and extend the category's product URL list with it
        product_url_list = []
        for i in url_list:
            product_url_list.extend(get_product_url_list(i, url))
        # print("Product URL list: ", product_url_list)
        print(len(product_url_list), "books found")
        processed_books += len(product_url_list)
        # reuse the phase 1 extraction on each URL from product_url_list and write a CSV named after the category
        data = []
        for page_url in product_url_list:
            page_soup = get_html(page_url)
            # print(page_soup)
            # print(phase1.get_category(page_soup))
            # print(phase1.get_data(page_soup, page_url))
            data.append(get_data(page_soup, page_url))
        print(processed_books, "books processed")
        print(total_books - processed_books, "books remaining")
        print(total_category, "categories remaining")
        print("Done.\n File " + data_output(data, category))
    print("\n Processing complete.")
if __name__ == '__main__':
    main()