This commit is contained in:
2025-11-25 11:21:00 +01:00
parent 034edb5bb2
commit df78ec1909
3 changed files with 61 additions and 19 deletions

2
.idea/WalkFiles.iml generated
View File

@@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.13 (walkfiles)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.12" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@@ -169,28 +169,59 @@ def load_folder_state(cursor, device_id: int):
return out
import unicodedata
from datetime import datetime


def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_id, now=None):
    """Return the ``folders.id`` for *folder_path*, creating the row if needed.

    Lookup order:
      1. the in-memory ``folder_state`` cache (keyed by the NFC-normalized path),
      2. a ``SELECT`` on ``folders`` by ``(device_id, path)``,
      3. an idempotent ``INSERT ... ON DUPLICATE KEY UPDATE``.

    A folder reaching this function was just observed, so a row or cache
    entry that is still flagged ``deleted`` is revived (``deleted = 0``,
    ``last_seen`` refreshed). Cache hits that are not flagged deleted cause
    no database traffic at all.

    Args:
        cursor: DB-API cursor using the ``%s`` paramstyle and returning
            tuple rows (rows are indexed positionally).
        conn: Connection owning *cursor*; committed after every write made here.
        folder_state: Mutable cache mapping path -> dict with at least the
            keys ``"id"`` and ``"deleted"``. Updated in place.
        device_id: Value for the ``device_id`` column.
        folder_path: Folder path; normalized to Unicode NFC before use.
        parent_id: ``folders.id`` of the parent folder, or ``None``. Only
            used when a new row is inserted.
        now: Timestamp written to ``first_seen`` / ``last_seen``. Defaults
            to ``datetime.now()`` when omitted.

    Returns:
        The folder's database id.
    """
    # Composed and decomposed spellings of the same name (e.g. "Černý")
    # must map to a single cache key / DB row.
    folder_path = unicodedata.normalize("NFC", folder_path)
    if now is None:
        now = datetime.now()

    entry = folder_state.get(folder_path)
    if entry is not None:
        # 1) Known folder: reuse the cached id.
        folder_id = entry["id"]
        flagged_deleted = entry.get("deleted")
    else:
        # 2) Not cached: look for an existing row.
        cursor.execute(
            """
            SELECT id, deleted
            FROM folders
            WHERE device_id = %s AND path = %s
            LIMIT 1
            """,
            (device_id, folder_path),
        )
        row = cursor.fetchone()
        if row:
            folder_id, flagged_deleted = row[0], row[1]
        else:
            # 3) Insert a new row. LAST_INSERT_ID(id) makes lastrowid report
            #    the existing id if the insert collides with a duplicate key.
            # NOTE(review): the duplicate-key branch presumably relies on a
            # unique key covering (device_id, path) - confirm against schema.
            cursor.execute(
                """
                INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen)
                VALUES (%s, %s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE
                    id = LAST_INSERT_ID(id),
                    last_seen = VALUES(last_seen),
                    deleted = 0
                """,
                (folder_path, parent_id, device_id, now, now),
            )
            conn.commit()
            folder_id = cursor.lastrowid
            flagged_deleted = 0

    if flagged_deleted:
        # The folder exists again: clear the stale deleted flag in the DB so
        # it agrees with the cache entry written below.
        cursor.execute(
            "UPDATE folders SET last_seen=%s, deleted=0 WHERE id=%s",
            (now, folder_id),
        )
        conn.commit()

    # 4) Keep the cache in sync. Mutate an existing entry in place so any
    #    extra keys it carries are preserved.
    if entry is None:
        folder_state[folder_path] = {"id": folder_id, "deleted": 0}
    else:
        entry["deleted"] = 0
    return folder_id
# ======================================================
# LOAD LAST FILE STATE
# ======================================================
@@ -225,8 +256,11 @@ def load_last_file_state(cursor, device_id: int):
def walk_and_store_bulk():
BATCH_SIZE = 10000
target_dir = r"\\tower1\#colddata"
device_name = "TOWER1"
# target_dir = r"\\tower1\#colddata"
# target_dir = r"z:"
target_dir = r"\\tower\ebooks"
# device_name = "TW22"
device_name = "TOWER"
if not os.path.isdir(target_dir):
print("Invalid directory:", target_dir)
@@ -256,12 +290,20 @@ def walk_and_store_bulk():
# -------------------------------------------------
for root, dirs, files in os.walk(target_dir):
folder_path = os.path.normpath(root)
parent_path = os.path.normpath(os.path.dirname(root)) if root != target_dir else None
# 1⃣ determine parent_id correctly
if root == target_dir:
parent_id = None
else:
parent_folder_path = os.path.normpath(os.path.dirname(root))
parent_id = get_or_create_folder(cursor, conn, folder_state,
device_id, parent_folder_path,
None)
# 2⃣ now insert current folder with correct parent_id
seen_folders.add(folder_path)
folder_id = get_or_create_folder(cursor, conn, folder_state,
device_id, folder_path,
parent_path, now)
parent_id)
# -------------------------------------------------
# FILE LOOP

View File

@@ -229,7 +229,7 @@ def run_md5_calculator(device_name=None,
if __name__ == "__main__":
# Example usage:
run_md5_calculator(
device_name="TWW11",
device_name="TOWER",
extension="ANY",
max_size="ANY",
path_prefix=r"ANY"