# NOTE(review): the following lines are a pasted file-browser listing header,
# not Python — commented out so the file parses.
# Files
# walkfiles/WalkFilesOnBackupHDD/50 Onetimepathnormalization.py
# 2026-02-03 13:26:11 +01:00
#
# 189 lines
# 5.8 KiB
# Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
ONE-TIME MIGRATION: Normalize full_path (NFC, forward slashes) + recompute path_hash
- Targets ONLY one host_name (Tower1 by default)
- Safe with UNIQUE(host_name, path_hash)
- Handles collisions by skipping conflicting rows and logging them
- DRY_RUN supported
"""
import hashlib
import os
import posixpath
import sys
import time
import unicodedata

import pymysql
from pymysql.err import IntegrityError
# =========================
# CONFIG
# =========================
HOST_TO_FIX = "Tower"       # <-- set your Unraid host_name exactly as stored in DB
DRY_RUN = True              # <-- first run True; then switch to False to apply
BATCH_SELECT_FETCH = 5000   # server-side cursor fetch size (streaming)
COMMIT_EVERY = 2000         # commit after N successful updates (when DRY_RUN=False)
LOG_EVERY = 50000           # progress print

# SECURITY: the DB password was hard-coded here in plain text.  It can now be
# supplied via the DB_PASSWORD environment variable; the old literal remains
# the fallback so existing invocations keep working.  Rotate this credential
# and prefer the environment variable going forward.
DB_CONFIG = {
    "host": "192.168.1.50",
    "port": 3306,
    "user": "root",
    "password": os.environ.get("DB_PASSWORD", "Vlado9674+"),
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": False,  # explicit commits so COMMIT_EVERY batching works
}
# =========================
# CANONICALIZATION
# =========================
def canonical_path(path_str: str) -> str:
    """Return the canonical form of *path_str*.

    Canonicalization means: backslashes become forward slashes, the result
    is collapsed with ``posixpath.normpath`` (removing ``.``, doubled
    slashes, etc.), and finally Unicode-composed to NFC so equivalent
    paths hash identically.  Falsy input (``""``/``None``) is returned
    unchanged.
    """
    if not path_str:
        return path_str
    forward_slashed = path_str.replace("\\", "/")
    collapsed = posixpath.normpath(forward_slashed)
    return unicodedata.normalize("NFC", collapsed)
def md5_bytes(path_str: str) -> bytes:
    """Return the raw 16-byte MD5 digest of the UTF-8 encoded path.

    Raw bytes (not hex) so the value fits a MySQL ``BINARY(16)`` column.
    """
    digest = hashlib.md5()
    digest.update(path_str.encode("utf-8"))
    return digest.digest()
# =========================
# MAIN
# =========================
def main():
    """Run the one-time Tower path_hash migration.

    Streams every ``file_md5_index`` row for ``HOST_TO_FIX``, canonicalizes
    ``full_path`` (NFC, forward slashes) and recomputes ``path_hash``; unless
    ``DRY_RUN`` is set, the corrected values are written back.  Rows whose new
    hash collides with an existing (host_name, path_hash) UNIQUE pair are
    skipped and counted.

    Fixes over the previous revision:
    - Reads and writes now use TWO separate connections.  The old code issued
      UPDATEs on the same connection that still had an open server-side
      (SSCursor) result set, which PyMySQL does not support.
    - Connections are closed in ``finally`` blocks so a mid-run exception no
      longer leaks them.
    - Progress logging is unified in one helper and fires for every row, so
      DRY_RUN runs also report progress (the old ``continue`` paths skipped it).
    """
    print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] 🚀 Tower path_hash migration")
    print(f"Host: {HOST_TO_FIX}")
    print(f"DRY_RUN: {DRY_RUN}")
    sys.stdout.flush()

    total = 0
    needs_change = 0
    updated_ok = 0
    collisions = 0
    other_errors = 0
    pending_commits = 0
    start = time.time()

    def log_progress():
        # Single progress line (previously duplicated in two places).
        elapsed = time.time() - start
        print(f"Checked {total:,} | needs_change {needs_change:,} | updated {updated_ok:,} | collisions {collisions:,} | {elapsed:.1f}s")
        sys.stdout.flush()

    read_db = pymysql.connect(**DB_CONFIG)
    try:
        # Dedicated write connection: PyMySQL cannot run another statement on a
        # connection whose SSCursor result set is still being streamed.
        write_db = pymysql.connect(**DB_CONFIG)
        try:
            read_cur = read_db.cursor(pymysql.cursors.SSCursor)
            read_cur.execute(
                """
                SELECT id, full_path, path_hash
                FROM file_md5_index
                WHERE host_name = %s
                """,
                (HOST_TO_FIX,),
            )
            upd_cur = write_db.cursor()

            while True:
                rows = read_cur.fetchmany(BATCH_SELECT_FETCH)
                if not rows:
                    break
                for rec_id, full_path, stored_hash in rows:
                    total += 1
                    new_path = canonical_path(full_path)
                    new_hash = md5_bytes(new_path)

                    # Row needs fixing when either the path text or its hash
                    # differs from what is stored.
                    if new_path != full_path or new_hash != stored_hash:
                        needs_change += 1
                        if not DRY_RUN:
                            try:
                                # Savepoint so a duplicate-key error cannot
                                # disturb the batch transaction.
                                upd_cur.execute("SAVEPOINT sp_one;")
                                upd_cur.execute(
                                    """
                                    UPDATE file_md5_index
                                    SET full_path = %s,
                                        path_hash = %s
                                    WHERE id = %s
                                    """,
                                    (new_path, new_hash, rec_id),
                                )
                                upd_cur.execute("RELEASE SAVEPOINT sp_one;")
                                updated_ok += 1
                                pending_commits += 1
                                if pending_commits >= COMMIT_EVERY:
                                    write_db.commit()
                                    pending_commits = 0
                            except IntegrityError as e:
                                # Duplicate key = another row of this host
                                # already owns new_hash; skip this row.
                                upd_cur.execute("ROLLBACK TO SAVEPOINT sp_one;")
                                upd_cur.execute("RELEASE SAVEPOINT sp_one;")
                                collisions += 1
                                # Throttle output to avoid huge spam.
                                if collisions <= 50 or collisions % 1000 == 0:
                                    print(f"⚠ COLLISION id={rec_id} | {e}")
                                    sys.stdout.flush()
                            except Exception as e:
                                upd_cur.execute("ROLLBACK TO SAVEPOINT sp_one;")
                                upd_cur.execute("RELEASE SAVEPOINT sp_one;")
                                other_errors += 1
                                if other_errors <= 50 or other_errors % 1000 == 0:
                                    print(f"❌ ERROR id={rec_id} | {e}")
                                    sys.stdout.flush()

                    if total % LOG_EVERY == 0:
                        log_progress()

            # finalize
            if not DRY_RUN:
                if pending_commits:
                    write_db.commit()
                print("✅ Migration finished (committed).")
            else:
                print("⚠ DRY_RUN finished (no changes written).")

            elapsed = time.time() - start
            print("=" * 70)
            print(f"Total rows checked : {total:,}")
            print(f"Rows needing change : {needs_change:,}")
            print(f"Rows updated : {updated_ok:,}")
            print(f"Collisions (skipped) : {collisions:,}")
            print(f"Other errors : {other_errors:,}")
            print(f"Elapsed : {elapsed:.1f}s")
            print("=" * 70)
        finally:
            # Closing the connection also releases its cursors.
            write_db.close()
    finally:
        read_db.close()
# Script entry point: run the migration only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()