diff --git a/.idea/WalkFiles.iml b/.idea/WalkFiles.iml index 3786a27..6cb8b9a 100644 --- a/.idea/WalkFiles.iml +++ b/.idea/WalkFiles.iml @@ -4,7 +4,7 @@ - + \ No newline at end of file diff --git a/22 WalkandSave.py b/22 WalkandSave.py index 8bf2f4f..7ab97c5 100644 --- a/22 WalkandSave.py +++ b/22 WalkandSave.py @@ -169,28 +169,59 @@ def load_folder_state(cursor, device_id: int): return out -def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_path, now): - if folder_path in folder_state: - folder_id = folder_state[folder_path]["id"] - cursor.execute( - "UPDATE folders SET last_seen=%s, deleted=0 WHERE id=%s", - (now, folder_id) - ) - folder_state[folder_path]["deleted"] = 0 +import unicodedata + +import unicodedata +from datetime import datetime + +def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_id): + # Normalize Unicode to avoid Černý vs Černý issue + folder_path = unicodedata.normalize("NFC", folder_path) + + # Cache key is folder_path + key = folder_path + + # 1) If we already know this folder → return cached ID + if key in folder_state: + return folder_state[key]["id"] + + now = datetime.now() + + # 2) Try to SELECT existing record + cursor.execute(""" + SELECT id + FROM folders + WHERE device_id = %s AND path = %s + LIMIT 1 + """, (device_id, folder_path)) + row = cursor.fetchone() + + if row: + folder_id = row[0] + folder_state[key] = {"id": folder_id, "deleted": 0} return folder_id - parent_id = folder_state.get(parent_path, {}).get("id") if parent_path else None - + # 3) INSERT new folder (idempotent) cursor.execute(""" - INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen, deleted) - VALUES (%s, %s, %s, %s, %s, 0) + INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen) + VALUES (%s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + id = LAST_INSERT_ID(id), + last_seen = VALUES(last_seen) """, (folder_path, parent_id, device_id, now, now)) + conn.commit() + folder_id = cursor.lastrowid - folder_state[folder_path] = {"id": folder_id, "deleted": 0} + + # 4) Save to memory cache + folder_state[key] = {"id": folder_id, "deleted": 0} + return folder_id + + # ====================================================== # LOAD LAST FILE STATE # ====================================================== @@ -225,8 +256,11 @@ def load_last_file_state(cursor, device_id: int): def walk_and_store_bulk(): BATCH_SIZE = 10000 - target_dir = r"\\tower1\#colddata" - device_name = "TOWER1" + # target_dir = r"\\tower1\#colddata" + # target_dir = r"z:" + target_dir = r"\\tower\ebooks" + # device_name = "TW22" + device_name = "TOWER" if not os.path.isdir(target_dir): print("Invalid directory:", target_dir) @@ -256,12 +290,20 @@ def walk_and_store_bulk(): # ------------------------------------------------- for root, dirs, files in os.walk(target_dir): folder_path = os.path.normpath(root) - parent_path = os.path.normpath(os.path.dirname(root)) if root != target_dir else None + # 1️⃣ determine parent_id correctly + if root == target_dir: + parent_id = None + else: + parent_folder_path = os.path.normpath(os.path.dirname(root)) + parent_id = get_or_create_folder(cursor, conn, folder_state, + device_id, parent_folder_path, + None) + # 2️⃣ now insert current folder with correct parent_id seen_folders.add(folder_path) folder_id = get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, - parent_path, now) + parent_id) # ------------------------------------------------- # FILE LOOP diff --git a/51 MD5Calculate.py b/51 MD5Calculate.py index c1ad0d7..bd5743a 100644 --- a/51 MD5Calculate.py +++ b/51 MD5Calculate.py @@ -229,7 +229,7 @@ def run_md5_calculator(device_name=None, if __name__ == "__main__": # Example usage: run_md5_calculator( - device_name="TWW11", + device_name="TOWER", extension="ANY", max_size="ANY", path_prefix=r"ANY"