z230
This commit is contained in:
188
WalkFilesOnBackupHDD/50 Onetimepathnormalization.py
Normal file
188
WalkFilesOnBackupHDD/50 Onetimepathnormalization.py
Normal file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
ONE-TIME MIGRATION: Normalize full_path (NFC, forward slashes) + recompute path_hash
|
||||
- Targets ONLY one host_name (Tower1 by default)
|
||||
- Safe with UNIQUE(host_name, path_hash)
|
||||
- Handles collisions by skipping conflicting rows and logging them
|
||||
- DRY_RUN supported
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
import hashlib
|
||||
import posixpath
|
||||
import unicodedata
|
||||
import pymysql
|
||||
from pymysql.err import IntegrityError
|
||||
|
||||
# =========================
|
||||
# CONFIG
|
||||
# =========================
|
||||
HOST_TO_FIX = "Tower" # <-- set your Unraid host_name exactly as stored in DB
|
||||
DRY_RUN = True # <-- first run True; then switch to False to apply
|
||||
BATCH_SELECT_FETCH = 5000 # server-side cursor fetch size (streaming)
|
||||
COMMIT_EVERY = 2000 # commit after N successful updates (when DRY_RUN=False)
|
||||
LOG_EVERY = 50000 # progress print
|
||||
|
||||
DB_CONFIG = {
|
||||
"host": "192.168.1.50",
|
||||
"port": 3306,
|
||||
"user": "root",
|
||||
"password": "Vlado9674+",
|
||||
"database": "torrents",
|
||||
"charset": "utf8mb4",
|
||||
"autocommit": False,
|
||||
}
|
||||
|
||||
# =========================
|
||||
# CANONICALIZATION
|
||||
# =========================
|
||||
def canonical_path(path_str: str) -> str:
|
||||
if not path_str:
|
||||
return path_str
|
||||
path_str = path_str.replace("\\", "/")
|
||||
path_str = posixpath.normpath(path_str)
|
||||
path_str = unicodedata.normalize("NFC", path_str)
|
||||
return path_str
|
||||
|
||||
def md5_bytes(path_str: str) -> bytes:
|
||||
return hashlib.md5(path_str.encode("utf-8")).digest() # 16 raw bytes for BINARY(16)
|
||||
|
||||
# =========================
|
||||
# MAIN
|
||||
# =========================
|
||||
def main():
|
||||
print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] 🚀 Tower path_hash migration")
|
||||
print(f"Host: {HOST_TO_FIX}")
|
||||
print(f"DRY_RUN: {DRY_RUN}")
|
||||
sys.stdout.flush()
|
||||
|
||||
db = pymysql.connect(**DB_CONFIG)
|
||||
|
||||
# streaming cursor for reading
|
||||
read_cur = db.cursor(pymysql.cursors.SSCursor)
|
||||
read_cur.execute(
|
||||
"""
|
||||
SELECT id, full_path, path_hash
|
||||
FROM file_md5_index
|
||||
WHERE host_name = %s
|
||||
""",
|
||||
(HOST_TO_FIX,),
|
||||
)
|
||||
|
||||
# normal cursor for updates
|
||||
upd_cur = db.cursor()
|
||||
|
||||
total = 0
|
||||
needs_change = 0
|
||||
updated_ok = 0
|
||||
collisions = 0
|
||||
other_errors = 0
|
||||
|
||||
start = time.time()
|
||||
pending_commits = 0
|
||||
|
||||
# Optional: make server-side cursor fetch a bit larger
|
||||
# (PyMySQL streams regardless; this just makes loop smoother)
|
||||
# Not strictly necessary.
|
||||
|
||||
while True:
|
||||
rows = read_cur.fetchmany(BATCH_SELECT_FETCH)
|
||||
if not rows:
|
||||
break
|
||||
|
||||
for rec_id, full_path, stored_hash in rows:
|
||||
total += 1
|
||||
|
||||
new_path = canonical_path(full_path)
|
||||
new_hash = md5_bytes(new_path)
|
||||
|
||||
# already canonical & correct
|
||||
if new_path == full_path and new_hash == stored_hash:
|
||||
if total % LOG_EVERY == 0:
|
||||
elapsed = time.time() - start
|
||||
print(f"Checked {total:,} | needs_change {needs_change:,} | updated {updated_ok:,} | collisions {collisions:,} | {elapsed:.1f}s")
|
||||
sys.stdout.flush()
|
||||
continue
|
||||
|
||||
needs_change += 1
|
||||
|
||||
if DRY_RUN:
|
||||
# in dry-run we just count; no DB writes
|
||||
continue
|
||||
|
||||
# Update with collision handling via UNIQUE(host_name, path_hash)
|
||||
try:
|
||||
# Use a savepoint so a duplicate-key error doesn't kill the whole transaction
|
||||
upd_cur.execute("SAVEPOINT sp_one;")
|
||||
|
||||
upd_cur.execute(
|
||||
"""
|
||||
UPDATE file_md5_index
|
||||
SET full_path = %s,
|
||||
path_hash = %s
|
||||
WHERE id = %s
|
||||
""",
|
||||
(new_path, new_hash, rec_id),
|
||||
)
|
||||
|
||||
upd_cur.execute("RELEASE SAVEPOINT sp_one;")
|
||||
|
||||
updated_ok += 1
|
||||
pending_commits += 1
|
||||
|
||||
if pending_commits >= COMMIT_EVERY:
|
||||
db.commit()
|
||||
pending_commits = 0
|
||||
|
||||
except IntegrityError as e:
|
||||
# Duplicate key = collision on (host_name, path_hash)
|
||||
# This means some OTHER row in the same host already has this new_hash.
|
||||
upd_cur.execute("ROLLBACK TO SAVEPOINT sp_one;")
|
||||
upd_cur.execute("RELEASE SAVEPOINT sp_one;")
|
||||
collisions += 1
|
||||
|
||||
# Print a short line occasionally (avoid huge spam)
|
||||
if collisions <= 50 or collisions % 1000 == 0:
|
||||
print(f"⚠ COLLISION id={rec_id} | {e}")
|
||||
sys.stdout.flush()
|
||||
|
||||
except Exception as e:
|
||||
upd_cur.execute("ROLLBACK TO SAVEPOINT sp_one;")
|
||||
upd_cur.execute("RELEASE SAVEPOINT sp_one;")
|
||||
other_errors += 1
|
||||
if other_errors <= 50 or other_errors % 1000 == 0:
|
||||
print(f"❌ ERROR id={rec_id} | {e}")
|
||||
sys.stdout.flush()
|
||||
|
||||
if total % LOG_EVERY == 0:
|
||||
elapsed = time.time() - start
|
||||
print(f"Checked {total:,} | needs_change {needs_change:,} | updated {updated_ok:,} | collisions {collisions:,} | {elapsed:.1f}s")
|
||||
sys.stdout.flush()
|
||||
|
||||
# finalize
|
||||
if not DRY_RUN:
|
||||
if pending_commits:
|
||||
db.commit()
|
||||
print("✅ Migration finished (committed).")
|
||||
else:
|
||||
print("⚠ DRY_RUN finished (no changes written).")
|
||||
|
||||
elapsed = time.time() - start
|
||||
print("=" * 70)
|
||||
print(f"Total rows checked : {total:,}")
|
||||
print(f"Rows needing change : {needs_change:,}")
|
||||
print(f"Rows updated : {updated_ok:,}")
|
||||
print(f"Collisions (skipped) : {collisions:,}")
|
||||
print(f"Other errors : {other_errors:,}")
|
||||
print(f"Elapsed : {elapsed:.1f}s")
|
||||
print("=" * 70)
|
||||
|
||||
read_cur.close()
|
||||
upd_cur.close()
|
||||
db.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user