diff --git a/rendu/main.py b/rendu/main.py index b9bf9df..3f14dc8 100644 --- a/rendu/main.py +++ b/rendu/main.py @@ -140,6 +140,7 @@ def data_output(info, file): def main(): # init url = "https://books.toscrape.com/" + os.mkdir("resultat") ### EXTRACTION ### # get html from URL @@ -154,10 +155,13 @@ def main(): # go ahead for each category for line in get_category_list(soup, url): - category = line[0] - category_url = line[1] + # remove space in category name, to prevent potential issue on directory creation + category = line[0].replace(' ', '_') + category_url = line[1] + category_path = "resultat/" + category total_category -= 1 + # display what category is processed print("\n -> Traitement de la catégorie : " + category) @@ -177,12 +181,13 @@ def main(): data = [] img_nb = 1 + # go ahead for each product of category # EXTRACT data for each product page for page_url in product_url_list: # create the category directory. If it exists already, just continue try: - os.mkdir(category) + os.mkdir(category_path) except FileExistsError: pass @@ -195,10 +200,13 @@ def main(): # LOAD data in a list data.append(product_data) + # protect path creation by removing "/" in product name + img_name = (product_data[2] + ".png").replace('/', '_') + # PHASE 4 : get img for every book and name it with category and incremental number # EXTRACT images data -url, title, binary content- and LOAD binary content in a file named with the title img_url = get_image_url(page_soup, page_url) - with open(category + "/" + product_data[2] + ".png", "wb") as img_file: + with open(category_path + "/" + img_name, "wb") as img_file: img_file.write(requests.get(img_url).content) img_nb += 1 @@ -207,7 +215,7 @@ def main(): print(total_category, " catégories restantes") # LOAD : write the list in the CSV file - print("Done.\n Fichier " + data_output(data, category)) + print("Done.\n Fichier " + data_output(data, category_path))