Add a main output folder ("resultat") to hold all generated files, replace spaces in category names with underscores, and fix the "/" issue in image file names
This commit is contained in:
parent
cece9d1874
commit
6cb7913af2
@ -140,6 +140,7 @@ def data_output(info, file):
|
|||||||
def main():
|
def main():
|
||||||
# init
|
# init
|
||||||
url = "https://books.toscrape.com/"
|
url = "https://books.toscrape.com/"
|
||||||
|
os.mkdir("resultat")
|
||||||
|
|
||||||
### EXTRACTION ###
|
### EXTRACTION ###
|
||||||
# get html from URL
|
# get html from URL
|
||||||
@ -154,10 +155,13 @@ def main():
|
|||||||
|
|
||||||
# go ahead for each category
|
# go ahead for each category
|
||||||
for line in get_category_list(soup, url):
|
for line in get_category_list(soup, url):
|
||||||
category = line[0]
|
# remove space in category name, to prevent potential issue on directory creation
|
||||||
category_url = line[1]
|
category = line[0].replace(' ', '_')
|
||||||
|
|
||||||
|
category_url = line[1]
|
||||||
|
category_path = "resultat/" + category
|
||||||
total_category -= 1
|
total_category -= 1
|
||||||
|
|
||||||
# display what category is processed
|
# display what category is processed
|
||||||
print("\n -> Traitement de la catégorie : " + category)
|
print("\n -> Traitement de la catégorie : " + category)
|
||||||
|
|
||||||
@ -177,12 +181,13 @@ def main():
|
|||||||
data = []
|
data = []
|
||||||
img_nb = 1
|
img_nb = 1
|
||||||
|
|
||||||
|
# go ahead for each product of category
|
||||||
# EXTRACT data for each product page
|
# EXTRACT data for each product page
|
||||||
for page_url in product_url_list:
|
for page_url in product_url_list:
|
||||||
|
|
||||||
# create the category directory. If it exists already, just continue
|
# create the category directory. If it exists already, just continue
|
||||||
try:
|
try:
|
||||||
os.mkdir(category)
|
os.mkdir(category_path)
|
||||||
except FileExistsError:
|
except FileExistsError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -195,10 +200,13 @@ def main():
|
|||||||
# LOAD data in a list
|
# LOAD data in a list
|
||||||
data.append(product_data)
|
data.append(product_data)
|
||||||
|
|
||||||
|
# protect path creation by removing "/" in product name
|
||||||
|
img_name = (product_data[2] + ".png").replace('/', '_')
|
||||||
|
|
||||||
# PHASE 4 : get img for every book and name it with category and incremental number
|
# PHASE 4 : get img for every book and name it with category and incremental number
|
||||||
# EXTRACT images data -url, title, binary content- and LOAD binary content in a file named with the title
|
# EXTRACT images data -url, title, binary content- and LOAD binary content in a file named with the title
|
||||||
img_url = get_image_url(page_soup, page_url)
|
img_url = get_image_url(page_soup, page_url)
|
||||||
with open(category + "/" + product_data[2] + ".png", "wb") as img_file:
|
with open(category_path + "/" + img_name, "wb") as img_file:
|
||||||
img_file.write(requests.get(img_url).content)
|
img_file.write(requests.get(img_url).content)
|
||||||
img_nb += 1
|
img_nb += 1
|
||||||
|
|
||||||
@ -207,7 +215,7 @@ def main():
|
|||||||
print(total_category, " catégories restantes")
|
print(total_category, " catégories restantes")
|
||||||
|
|
||||||
# LOAD : write the list in the CSV file
|
# LOAD : write the list in the CSV file
|
||||||
print("Done.\n Fichier " + data_output(data, category))
|
print("Done.\n Fichier " + data_output(data, category_path))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user