Add a main output folder for generated files, replace spaces in category names, and fix the "/" issue in image names

2024-11-20 11:56:10 +01:00 · 2024-11-20 11:56:10 +01:00 · 6cb7913af2
commit 6cb7913af2
parent cece9d1874
1 changed files with 13 additions and 5 deletions
--- a/rendu/main.py
+++ b/rendu/main.py
@ -140,6 +140,7 @@ def data_output(info, file):
 def main():
    # init
    url = "https://books.toscrape.com/"
    os.mkdir("resultat")
    ### EXTRACTION ###
    # get html from URL
@ -154,10 +155,13 @@ def main():
    # go ahead for each category
    for line in get_category_list(soup, url):
-        category = line[0]
+        # remove space in category name, to prevent potential issue on directory creation
-        category_url = line[1]
+        category = line[0].replace(' ', '_')
        category_url = line[1]
        category_path = "resultat/" + category
        total_category -= 1
        # display what category is processed
        print("\n -> Traitement de la catégorie : " + category)
@ -177,12 +181,13 @@ def main():
        data = []
        img_nb = 1
        # go ahead for each product of category
        # EXTRACT data for each product page
        for page_url in product_url_list:
            # create the category directory. If it exists already, just continue
            try:
-                os.mkdir(category)
+                os.mkdir(category_path)
            except FileExistsError:
                pass
@ -195,10 +200,13 @@ def main():
            # LOAD data in a list
            data.append(product_data)
            # protect path creation by removing "/" in product name
            img_name = (product_data[2] + ".png").replace('/', '_')
            # PHASE 4 : get img for every book and name it with category and incremental number
            # EXTRACT images data -url, title, binary content- and LOAD binary content in a file named with the title
            img_url = get_image_url(page_soup, page_url)
-            with open(category + "/" + product_data[2] + ".png", "wb") as img_file:
+            with open(category_path + "/" + img_name, "wb") as img_file:
                img_file.write(requests.get(img_url).content)
            img_nb += 1
@ -207,7 +215,7 @@ def main():
        print(total_category, " catégories restantes")
        # LOAD : write the list in the CSV file
-        print("Done.\n  Fichier " + data_output(data, category))
+        print("Done.\n  Fichier " + data_output(data, category_path))