#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ ONE-TIME MIGRATION: Normalize full_path (NFC, forward slashes) + recompute path_hash - Targets ONLY one host_name (Tower1 by default) - Safe with UNIQUE(host_name, path_hash) - Handles collisions by skipping conflicting rows and logging them - DRY_RUN supported """ import sys import time import hashlib import posixpath import unicodedata import pymysql from pymysql.err import IntegrityError # ========================= # CONFIG # ========================= HOST_TO_FIX = "Tower" # <-- set your Unraid host_name exactly as stored in DB DRY_RUN = True # <-- first run True; then switch to False to apply BATCH_SELECT_FETCH = 5000 # server-side cursor fetch size (streaming) COMMIT_EVERY = 2000 # commit after N successful updates (when DRY_RUN=False) LOG_EVERY = 50000 # progress print DB_CONFIG = { "host": "192.168.1.50", "port": 3306, "user": "root", "password": "Vlado9674+", "database": "torrents", "charset": "utf8mb4", "autocommit": False, } # ========================= # CANONICALIZATION # ========================= def canonical_path(path_str: str) -> str: if not path_str: return path_str path_str = path_str.replace("\\", "/") path_str = posixpath.normpath(path_str) path_str = unicodedata.normalize("NFC", path_str) return path_str def md5_bytes(path_str: str) -> bytes: return hashlib.md5(path_str.encode("utf-8")).digest() # 16 raw bytes for BINARY(16) # ========================= # MAIN # ========================= def main(): print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] 🚀 Tower path_hash migration") print(f"Host: {HOST_TO_FIX}") print(f"DRY_RUN: {DRY_RUN}") sys.stdout.flush() db = pymysql.connect(**DB_CONFIG) # streaming cursor for reading read_cur = db.cursor(pymysql.cursors.SSCursor) read_cur.execute( """ SELECT id, full_path, path_hash FROM file_md5_index WHERE host_name = %s """, (HOST_TO_FIX,), ) # normal cursor for updates upd_cur = db.cursor() total = 0 needs_change = 0 updated_ok = 0 collisions = 0 other_errors = 0 start = time.time() pending_commits = 0 # Optional: make server-side cursor fetch a bit larger # (PyMySQL streams regardless; this just makes loop smoother) # Not strictly necessary. while True: rows = read_cur.fetchmany(BATCH_SELECT_FETCH) if not rows: break for rec_id, full_path, stored_hash in rows: total += 1 new_path = canonical_path(full_path) new_hash = md5_bytes(new_path) # already canonical & correct if new_path == full_path and new_hash == stored_hash: if total % LOG_EVERY == 0: elapsed = time.time() - start print(f"Checked {total:,} | needs_change {needs_change:,} | updated {updated_ok:,} | collisions {collisions:,} | {elapsed:.1f}s") sys.stdout.flush() continue needs_change += 1 if DRY_RUN: # in dry-run we just count; no DB writes continue # Update with collision handling via UNIQUE(host_name, path_hash) try: # Use a savepoint so a duplicate-key error doesn't kill the whole transaction upd_cur.execute("SAVEPOINT sp_one;") upd_cur.execute( """ UPDATE file_md5_index SET full_path = %s, path_hash = %s WHERE id = %s """, (new_path, new_hash, rec_id), ) upd_cur.execute("RELEASE SAVEPOINT sp_one;") updated_ok += 1 pending_commits += 1 if pending_commits >= COMMIT_EVERY: db.commit() pending_commits = 0 except IntegrityError as e: # Duplicate key = collision on (host_name, path_hash) # This means some OTHER row in the same host already has this new_hash. upd_cur.execute("ROLLBACK TO SAVEPOINT sp_one;") upd_cur.execute("RELEASE SAVEPOINT sp_one;") collisions += 1 # Print a short line occasionally (avoid huge spam) if collisions <= 50 or collisions % 1000 == 0: print(f"⚠ COLLISION id={rec_id} | {e}") sys.stdout.flush() except Exception as e: upd_cur.execute("ROLLBACK TO SAVEPOINT sp_one;") upd_cur.execute("RELEASE SAVEPOINT sp_one;") other_errors += 1 if other_errors <= 50 or other_errors % 1000 == 0: print(f"❌ ERROR id={rec_id} | {e}") sys.stdout.flush() if total % LOG_EVERY == 0: elapsed = time.time() - start print(f"Checked {total:,} | needs_change {needs_change:,} | updated {updated_ok:,} | collisions {collisions:,} | {elapsed:.1f}s") sys.stdout.flush() # finalize if not DRY_RUN: if pending_commits: db.commit() print("✅ Migration finished (committed).") else: print("⚠ DRY_RUN finished (no changes written).") elapsed = time.time() - start print("=" * 70) print(f"Total rows checked : {total:,}") print(f"Rows needing change : {needs_change:,}") print(f"Rows updated : {updated_ok:,}") print(f"Collisions (skipped) : {collisions:,}") print(f"Other errors : {other_errors:,}") print(f"Elapsed : {elapsed:.1f}s") print("=" * 70) read_cur.close() upd_cur.close() db.close() if __name__ == "__main__": main()