Test for multiple pages, get URLs, create list of products, and refactor main
This commit is contained in:
parent
e3ac12ff9b
commit
8213f0849c
@ -1,19 +1,16 @@
|
|||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
url = "https://books.toscrape.com/"
|
# récupère le soup depuis l'url
|
||||||
|
def get_html(url):
    """Download *url* and return its body parsed as a BeautifulSoup tree.

    The raw response bytes are handed to BeautifulSoup with the built-in
    'html.parser' backend. The HTTP status code is not checked.
    """
    response = requests.get(url)
    return BeautifulSoup(response.content, 'html.parser')
|
||||||
|
|
||||||
|
|
||||||
category = "travel"
|
|
||||||
|
|
||||||
catego_info = []
|
|
||||||
catego_dict = {}
|
|
||||||
# get category and URL from side div and put them as a list [catego, url] in a list
|
# get category and URL from side div and put them as a list [catego, url] in a list
|
||||||
|
def get_category_list(soup, url):
|
||||||
def get_category_list(soup):
|
|
||||||
catego_info = []
|
catego_info = []
|
||||||
catego_dict = {}
|
catego_dict = {}
|
||||||
for li in soup.find("div", class_="side_categories").find_all("li"):
|
for li in soup.find("div", class_="side_categories").find_all("li"):
|
||||||
@ -22,15 +19,60 @@ def get_category_list(soup):
|
|||||||
return catego_dict
|
return catego_dict
|
||||||
|
|
||||||
|
|
||||||
page_url = get_category_list(soup)['travel']
|
def check_for_pages(category_url, per_page=20):
    """Return the URL of every listing page of a category.

    The category's first page is downloaded, and the text of the first
    <strong> inside its first <form> is read as the total number of
    products. Additional pages are named "page-2.html", "page-3.html", ...
    and live next to the category's "index.html".

    Args:
        category_url: URL of the category's first page.
        per_page: Number of products shown on one listing page.

    Returns:
        A list starting with ``category_url`` followed by one URL per
        additional page. It contains only ``category_url`` when everything
        fits on a single page.

    Raises:
        ValueError: If the product count text is not an integer.
    """
    soup_catego = get_html(category_url)
    total = int(soup_catego.form.strong.text)

    # Ceiling division: 40 products at 20 per page is exactly 2 pages.
    # Using total // per_page as the number of *extra* pages would add a
    # non-existent page whenever total is an exact multiple of per_page.
    page_count = -(-total // per_page)

    new_url_base = category_url.replace('index.html', '')
    url_list = [category_url]
    for page_number in range(2, page_count + 1):
        url_list.append(new_url_base + "page-" + str(page_number) + ".html")

    return url_list
|
||||||
|
|
||||||
|
|
||||||
|
def get_product_list(url_category_page, url):
    """Return the absolute URL of every product listed on one page.

    Args:
        url_category_page: URL of the listing page to download and scan.
        url: Site base URL. Kept so existing callers keep working; it is
            no longer used because links are resolved against the listing
            page itself.

    Returns:
        A list with one product URL per <article> element, in page order.
    """
    from urllib.parse import urljoin

    soup = get_html(url_category_page)

    # The hrefs are relative to the listing page and may start with one or
    # more "../" segments. Dropping those segments and gluing the remainder
    # onto the site root loses any intermediate directory, so resolve each
    # link against the page it came from, the way a browser would.
    return [
        urljoin(url_category_page, article.h3.a.get('href'))
        for article in soup.find_all("article")
    ]
|
||||||
|
|
||||||
print(len(get_product_list(soup2)))
|
def main():
    """Collect and print the product URLs of one category.

    Downloads the home page, looks up the chosen category's URL, expands it
    into its listing pages, then gathers the product URLs from each page.
    """
    # Starting point and the category to inspect.
    url = "https://books.toscrape.com/"
    category = "default"

    # Map of categories built from the home page.
    home_soup = get_html(url)
    categories = get_category_list(home_soup, url)

    # URL of the selected category.
    category_url = categories[category]
    print(category_url)

    # One URL per listing page of that category.
    url_list = check_for_pages(category_url)
    print("Liste des URLs des pages: ", url_list)

    # Product URLs gathered across all listing pages, in page order.
    product_list_url = []
    for page_url in url_list:
        product_list_url.extend(get_product_list(page_url, url))

    print("Liste des URL des produits: ", product_list_url)
    print("Longueur de la liste: ", len(product_list_url))


if __name__ == '__main__':
    main()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user