Get the category list from the home page and the product URLs from a category (single-page categories only)
This commit is contained in:
parent
7e6875a497
commit
e3ac12ff9b
@ -0,0 +1,36 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Root URL of the site being scraped; relative links found on its pages
# are resolved against it.
url = "https://books.toscrape.com/"

# Download and parse the home page. The timeout stops the script from
# hanging forever on a stalled connection, and raise_for_status() aborts on
# an HTTP error instead of silently parsing an error page.
r = requests.get(url, timeout=10)
r.raise_for_status()
html = r.content
soup = BeautifulSoup(html, 'html.parser')

# Lower-case name of the category of interest.
category = "travel"

# Module-level containers kept for backward compatibility; nothing in the
# visible script reads them.
catego_info = []
catego_dict = {}
|
||||||
|
def get_category_list(soup, base_url=None):
    """Map each sidebar category name to the absolute URL of its page.

    Looks up the ``div`` with class ``side_categories`` in *soup* and reads
    the first link found in every ``li`` inside it.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.
        base_url: URL the relative ``href`` values are resolved against.
            Defaults to the module-level ``url``.

    Returns:
        dict: ``{category name (stripped, lower-cased): absolute URL}``.
        Empty when the page has no ``side_categories`` div.
    """
    from urllib.parse import urljoin

    sidebar = soup.find("div", class_="side_categories")
    if sidebar is None:
        # No category sidebar on this page: nothing to map.
        return {}

    root = url if base_url is None else base_url
    categories = {}
    for item in sidebar.find_all("li"):
        anchor = item.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # An entry without a usable link cannot be mapped to a page.
            continue
        # urljoin (rather than "+") stays correct even when the base URL
        # has no trailing slash.
        categories[anchor.get_text(strip=True).lower()] = urljoin(root, href)
    return categories
|
||||||
|
|
||||||
|
|
||||||
|
# Look up the page of the configured category (the module-level `category`)
# instead of repeating its name as a literal.
page_url = get_category_list(soup)[category]

# Download and parse the category page; fail fast on a stalled connection
# or an HTTP error status rather than parsing an error page.
page_response = requests.get(page_url, timeout=10)
page_response.raise_for_status()
soup2 = BeautifulSoup(page_response.content, "html.parser")
|
||||||
|
|
||||||
|
def get_product_list(soup, base_url=None):
    """Collect the absolute URL of every product listed on a page.

    The product link of each ``article`` element in *soup* is read from
    ``article.h3.a``; articles without such a link are skipped.

    Args:
        soup: Parsed listing page exposing the BeautifulSoup ``find_all`` API.
        base_url: URL of the page *soup* was fetched from. When given, every
            ``href`` is resolved against it, which is exact whatever the
            depth of the page. When omitted, the URL is rebuilt from the
            module-level ``url`` (see the note in the body).

    Returns:
        list: Product page URLs, in document order.
    """
    from urllib.parse import urljoin

    product_urls = []
    for article in soup.find_all("article"):
        heading = article.h3
        anchor = heading.a if heading is not None else None
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue

        if base_url is not None:
            product_urls.append(urljoin(base_url, href))
            continue

        # Without the page URL the "../" prefixes cannot be resolved, so
        # they are dropped and the path is re-rooted on the site.
        # NOTE(review): assumes every product page lives under
        # "catalogue/" on the site root, as on books.toscrape.com --
        # confirm before pointing this at another site.
        path = href.split('../')[-1]
        if not path.startswith('catalogue/'):
            path = 'catalogue/' + path
        product_urls.append(urljoin(url, path))
    return product_urls
|
||||||
|
|
||||||
|
# Report how many product URLs were collected from the category page (soup2).
print(len(get_product_list(soup2)))
|
Loading…
x
Reference in New Issue
Block a user