This commit is contained in:
2025-11-25 11:21:00 +01:00
parent 034edb5bb2
commit df78ec1909
3 changed files with 61 additions and 19 deletions

2
.idea/WalkFiles.iml generated
View File

@@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$"> <content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.venv" /> <excludeFolder url="file://$MODULE_DIR$/.venv" />
</content> </content>
<orderEntry type="jdk" jdkName="Python 3.13 (walkfiles)" jdkType="Python SDK" /> <orderEntry type="jdk" jdkName="Python 3.12" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
</component> </component>
</module> </module>

View File

@@ -169,28 +169,59 @@ def load_folder_state(cursor, device_id: int):
return out return out
def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_path, now): import unicodedata
if folder_path in folder_state:
folder_id = folder_state[folder_path]["id"] import unicodedata
cursor.execute( from datetime import datetime
"UPDATE folders SET last_seen=%s, deleted=0 WHERE id=%s",
(now, folder_id) def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_id):
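) # Normalize Unicode to NFC so precomposed and decomposed spellings of the same name (e.g. "Černý" as single code points vs. base letters + combining marks) are treated as one path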
folder_state[folder_path]["deleted"] = 0 folder_path = unicodedata.normalize("NFC", folder_path)
# Cache key is folder_path
key = folder_path
# 1) If we already know this folder → return cached ID
if key in folder_state:
return folder_state[key]["id"]
now = datetime.now()
# 2) Try to SELECT existing record
cursor.execute("""
SELECT id
FROM folders
WHERE device_id = %s AND path = %s
LIMIT 1
""", (device_id, folder_path))
row = cursor.fetchone()
if row:
folder_id = row[0]
folder_state[key] = {"id": folder_id, "deleted": 0}
return folder_id return folder_id
parent_id = folder_state.get(parent_path, {}).get("id") if parent_path else None # 3) INSERT new folder (idempotent)
cursor.execute(""" cursor.execute("""
INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen, deleted) INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen)
VALUES (%s, %s, %s, %s, %s, 0) VALUES (%s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
id = LAST_INSERT_ID(id),
last_seen = VALUES(last_seen)
""", (folder_path, parent_id, device_id, now, now)) """, (folder_path, parent_id, device_id, now, now))
conn.commit()
folder_id = cursor.lastrowid folder_id = cursor.lastrowid
folder_state[folder_path] = {"id": folder_id, "deleted": 0}
# 4) Save to memory cache
folder_state[key] = {"id": folder_id, "deleted": 0}
return folder_id return folder_id
# ====================================================== # ======================================================
# LOAD LAST FILE STATE # LOAD LAST FILE STATE
# ====================================================== # ======================================================
@@ -225,8 +256,11 @@ def load_last_file_state(cursor, device_id: int):
def walk_and_store_bulk(): def walk_and_store_bulk():
BATCH_SIZE = 10000 BATCH_SIZE = 10000
target_dir = r"\\tower1\#colddata" # target_dir = r"\\tower1\#colddata"
device_name = "TOWER1" # target_dir = r"z:"
target_dir = r"\\tower\ebooks"
# device_name = "TW22"
device_name = "TOWER"
if not os.path.isdir(target_dir): if not os.path.isdir(target_dir):
print("Invalid directory:", target_dir) print("Invalid directory:", target_dir)
@@ -256,12 +290,20 @@ def walk_and_store_bulk():
# ------------------------------------------------- # -------------------------------------------------
for root, dirs, files in os.walk(target_dir): for root, dirs, files in os.walk(target_dir):
folder_path = os.path.normpath(root) folder_path = os.path.normpath(root)
parent_path = os.path.normpath(os.path.dirname(root)) if root != target_dir else None # 1) determine parent_id correctly (None for the walk root)
if root == target_dir:
parent_id = None
else:
parent_folder_path = os.path.normpath(os.path.dirname(root))
parent_id = get_or_create_folder(cursor, conn, folder_state,
device_id, parent_folder_path,
None)
# 2) now insert the current folder with the correct parent_id
seen_folders.add(folder_path) seen_folders.add(folder_path)
folder_id = get_or_create_folder(cursor, conn, folder_state, folder_id = get_or_create_folder(cursor, conn, folder_state,
device_id, folder_path, device_id, folder_path,
parent_path, now) parent_id)
# ------------------------------------------------- # -------------------------------------------------
# FILE LOOP # FILE LOOP

View File

@@ -229,7 +229,7 @@ def run_md5_calculator(device_name=None,
if __name__ == "__main__": if __name__ == "__main__":
# Example usage: # Example usage:
run_md5_calculator( run_md5_calculator(
device_name="TWW11", device_name="TOWER",
extension="ANY", extension="ANY",
max_size="ANY", max_size="ANY",
path_prefix=r"ANY" path_prefix=r"ANY"