Add a main "resultat" folder to hold all output files, replace spaces in category names with underscores, and fix the "/" issue in image file names

This commit is contained in:
yann 2024-11-20 11:56:10 +01:00
parent cece9d1874
commit 6cb7913af2

View File

@ -140,6 +140,7 @@ def data_output(info, file):
def main(): def main():
# init # init
url = "https://books.toscrape.com/" url = "https://books.toscrape.com/"
os.mkdir("resultat")
### EXTRACTION ### ### EXTRACTION ###
# get html from URL # get html from URL
@ -154,10 +155,13 @@ def main():
# go ahead for each category # go ahead for each category
for line in get_category_list(soup, url): for line in get_category_list(soup, url):
category = line[0] # remove space in category name, to prevent potential issue on directory creation
category_url = line[1] category = line[0].replace(' ', '_')
category_url = line[1]
category_path = "resultat/" + category
total_category -= 1 total_category -= 1
# display what category is processed # display what category is processed
print("\n -> Traitement de la catégorie : " + category) print("\n -> Traitement de la catégorie : " + category)
@ -177,12 +181,13 @@ def main():
data = [] data = []
img_nb = 1 img_nb = 1
# go ahead for each product of category
# EXTRACT data for each product page # EXTRACT data for each product page
for page_url in product_url_list: for page_url in product_url_list:
# create the category directory. If it exists already, just continue # create the category directory. If it exists already, just continue
try: try:
os.mkdir(category) os.mkdir(category_path)
except FileExistsError: except FileExistsError:
pass pass
@ -195,10 +200,13 @@ def main():
# LOAD data in a list # LOAD data in a list
data.append(product_data) data.append(product_data)
# protect path creation by removing "/" in product name
img_name = (product_data[2] + ".png").replace('/', '_')
# PHASE 4 : get img for every book and name it with category and incremental number # PHASE 4 : get img for every book and name it with category and incremental number
# EXTRACT images data -url, title, binary content- and LOAD binary content in a file named with the title # EXTRACT images data -url, title, binary content- and LOAD binary content in a file named with the title
img_url = get_image_url(page_soup, page_url) img_url = get_image_url(page_soup, page_url)
with open(category + "/" + product_data[2] + ".png", "wb") as img_file: with open(category_path + "/" + img_name, "wb") as img_file:
img_file.write(requests.get(img_url).content) img_file.write(requests.get(img_url).content)
img_nb += 1 img_nb += 1
@ -207,7 +215,7 @@ def main():
print(total_category, " catégories restantes") print(total_category, " catégories restantes")
# LOAD : write the list in the CSV file # LOAD : write the list in the CSV file
print("Done.\n Fichier " + data_output(data, category)) print("Done.\n Fichier " + data_output(data, category_path))