import requests
from bs4 import BeautifulSoup

import phase1
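
# NOTE: phase1 is the module from the previous phase of this project; its code
# is not shown here. From the calls in main() below it is assumed to expose:
#   phase1.get_data(page_soup, page_url)  -> one book's fields (one CSV row)
#   phase1.data_output(data, category)    -> name of the CSV file it writes
# plus phase1.get_category(page_soup), used only in a commented-out debug print.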

# Fetch a page and return it as a parsed BeautifulSoup document.
def get_html(url):
    r = requests.get(url)
    r.raise_for_status()  # stop early on HTTP errors instead of parsing an error page
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

# Map each category name from the side div to its absolute URL and
# return the result as a dict {category_name: category_url}.
def get_category_list(soup, url):
    catego_dict = {}
    for li in soup.find("div", class_="side_categories").find_all("li"):
        name = li.a.get_text(strip=True).lower()
        catego_dict[name] = url + li.a.get('href')
    return catego_dict

# Return every page URL of a category: the index page, plus the
# "page-N.html" pages when the category holds more than 20 books
# (the site paginates 20 books per page).
def check_for_pages(category_url):
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)
    url_list = [category_url]

    if total > 20:
        new_url_base = category_url.replace('index.html', '')
        last_page = -(-total // 20)  # ceiling division: number of pages
        for page_number in range(2, last_page + 1):
            url_list.append(new_url_base + "page-" + str(page_number) + ".html")

    return url_list
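
# Example (hypothetical category slug and count): a category listing 32 books
# spans two pages, so check_for_pages(".../books/some-category_12/index.html")
# would return [".../index.html", ".../page-2.html"].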

# Collect the product URLs listed on one category page. Each <article>
# holds a relative href; rebuild the absolute product URL from the
# site root URL (second argument).
def get_product_url_list(url_category_page, url):
    liste = []
    soup = get_html(url_category_page)

    for article in soup.find_all("article"):
        relative_url = article.h3.a.get('href')
        product_url = url + "catalogue/" + relative_url.split('../')[-1]
        liste.append(product_url)

    return liste
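
# Example: a category page href such as "../../../some-book_123/index.html"
# (hypothetical slug) is rebuilt above as
# "https://books.toscrape.com/catalogue/some-book_123/index.html".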

def main():
    # init
    url = "https://books.toscrape.com/"
    category = "default"

    # get working variables
    soup = get_html(url)
    liste_categories = get_category_list(soup, url)

    # get the category URL to run some tests on it
    category_url = liste_categories[category]
    print(category_url)

    # check whether the category has several pages and get the page URL list
    url_list = check_for_pages(category_url)
    # print("Page URL list: ", url_list)

    # get the product list of each page and extend the main product URL list
    product_url_list = []
    for page in url_list:
        product_url_list.extend(get_product_url_list(page, url))
    # print("Product URL list: ", product_url_list)
    print("Number of books: ", len(product_url_list))

    # combine with phase 1: scrape every product URL and write one CSV
    # named after the category
    data = []
    for page_url in product_url_list:
        page_soup = get_html(page_url)
        # print(page_soup)
        # print(phase1.get_category(page_soup))
        # print(phase1.get_data(page_soup, page_url))
        data.append(phase1.get_data(page_soup, page_url))

    print("Done.\nFile " + phase1.data_output(data, category))


if __name__ == '__main__':
    main()
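
# Usage note: run this file directly, with phase1.py importable from the same
# directory (the filename below is hypothetical; use whatever this script is
# saved as):
#   $ python phase2.py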