# Recovered from a git patch whose content had been flattened onto one line:
#   commit : e3ac12ff9b047a8582b7f1a189a14a73a3ea1ed1
#   author : yann
#   date   : Wed, 13 Nov 2024 15:46:48 +0100
#   file   : phase2/main.py
#   subject: get category_list from home and get product url from a category
#            (if one page)
"""Scrape category links and one category's product links from a book site.

The script downloads the home page at ``url``, maps every category name found
in the ``side_categories`` sidebar to its absolute URL, downloads the page of
the category named by ``category`` and prints how many product links that
page contains.  Only a single page is read; pagination is not followed.
"""

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Site root; also the base against which home-page links are resolved.
url = "https://books.toscrape.com/"

# Category whose products are counted when the file is run as a script.
category = "travel"

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def fetch_soup(page_url):
    """Download *page_url* and return it parsed with ``html.parser``.

    Raises:
        requests.RequestException: on connection problems, on timeout, or
            when the server answers with an HTTP error status.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of silently parsing an error page.
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def get_category_list(soup):
    """Return a ``{category name: absolute URL}`` dict from the sidebar.

    Every ``<li>`` inside the ``<div class="side_categories">`` element of
    *soup* contributes one entry.  Names are the link text, stripped and
    lower-cased; hrefs are resolved against the module-level ``url``.  If two
    links share a name, the later one wins.

    Returns an empty dict when the sidebar is absent; list items without a
    usable link are skipped.
    """
    sidebar = soup.find("div", class_="side_categories")
    if sidebar is None:
        return {}

    categories = {}
    for item in sidebar.find_all("li"):
        link = item.a
        if link is None or not link.get("href"):
            continue
        name = link.get_text(strip=True).lower()
        categories[name] = urljoin(url, link.get("href"))
    return categories


def get_product_list(soup, page_url=None):
    """Return the absolute URL of every product linked from *soup*.

    A product link is the ``<a>`` inside the ``<h3>`` of each ``<article>``.

    Args:
        soup: parsed listing page.
        page_url: URL the page was downloaded from.  Relative hrefs (the
            listing pages use ``../`` segments) are resolved against it, as a
            browser would.  When omitted, hrefs are resolved against the site
            root ``url``, which matches the previous behaviour of this
            function but is only correct for pages that live at the root.
            NOTE(review): for nested category pages the root-relative result
            presumably points at a non-existent path; pass ``page_url``.

    Articles lacking an ``<h3><a href=...>`` are skipped.
    """
    base = url if page_url is None else page_url

    product_urls = []
    for article in soup.find_all("article"):
        heading = article.h3
        link = heading.a if heading is not None else None
        if link is None or not link.get("href"):
            continue
        product_urls.append(urljoin(base, link.get("href")))
    return product_urls


def main():
    """Print the number of products on the first page of ``category``."""
    categories = get_category_list(fetch_soup(url))
    try:
        page_url = categories[category]
    except KeyError:
        raise SystemExit(f"unknown category: {category!r}") from None

    print(len(get_product_list(fetch_soup(page_url), page_url)))


if __name__ == "__main__":
    main()