"""Count the products listed on one category page of books.toscrape.com.

Run as a script, this downloads the site's home page, reads the category
links from its sidebar, downloads the page of the category named by
``category`` and prints how many product links that page contains.
"""

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Site root; relative links found on the home page are resolved against it.
url = "https://books.toscrape.com/"

# Category (lower-cased sidebar label) looked up when run as a script.
category = "travel"

# Seconds to wait on the server; without a timeout a stalled connection
# would block the script indefinitely.
REQUEST_TIMEOUT = 10


def _fetch_soup(page_url):
    """Download *page_url* and return its HTML parsed with ``html.parser``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so
            an error page is never parsed as if it were the catalogue.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def get_category_list(soup):
    """Map each sidebar category name to its absolute URL.

    Args:
        soup: parsed page containing a ``<div class="side_categories">``
            whose ``<li>`` items each wrap a category link.

    Returns:
        A dict ``{lower-cased category name: absolute URL}``. It is empty
        when the sidebar is not present in *soup*. If two entries share a
        name, the later one wins.
    """
    sidebar = soup.find("div", class_="side_categories")
    if sidebar is None:
        return {}

    categories = {}
    for item in sidebar.find_all("li"):
        link = item.a
        if link is None or not link.get("href"):
            # An <li> without a usable link carries no category to record.
            continue
        name = link.get_text(strip=True).lower()
        categories[name] = urljoin(url, link.get("href"))
    return categories


def get_product_list(soup, page_url=None):
    """Return the absolute URL of every product linked from *soup*.

    Args:
        soup: parsed listing page; each product is an ``<article>`` whose
            ``<h3>`` wraps the link to the product page.
        page_url: URL that *soup* was downloaded from. Product hrefs are
            relative to that page (they may start with ``../``), so they
            have to be resolved against it to keep the page's directory.
            When omitted, links are resolved against the site root ``url``.

    Returns:
        A list of absolute product URLs, in document order. Only the
        articles present in *soup* are considered; no further pages are
        fetched.
    """
    base = url if page_url is None else page_url
    product_urls = []
    for article in soup.find_all("article"):
        link = article.h3.a if article.h3 is not None else None
        if link is None or not link.get("href"):
            # Not a product card in the expected shape; nothing to collect.
            continue
        product_urls.append(urljoin(base, link.get("href")))
    return product_urls


def main():
    """Print the number of products on the first page of ``category``."""
    categories = get_category_list(_fetch_soup(url))
    try:
        page_url = categories[category]
    except KeyError:
        raise SystemExit(
            f"Category {category!r} not found on {url}"
        ) from None
    print(len(get_product_list(_fetch_soup(page_url), page_url)))


if __name__ == "__main__":
    main()