"""
reconcile.py — Cross-check MySQL content_hash values against .zip files on disk.

Reports:
  1. DB hashes with no blob on disk (missing backups)
  2. Blob files on disk with no matching DB record (orphan blobs)
  3. Summary stats

Pass ``--purge-orphans`` on the command line (or call
``main(purge_orphans=True)``) to delete orphan blobs after reporting them.
"""

import os
import string
import sys

from indexer.config import BACKUP_PATH
from indexer.db import get_connection

# A blob file is named "<hex hash>.zip"; only 64-character hex stems count.
_HASH_HEX_LEN = 64
_HEX_DIGITS = frozenset(string.hexdigits)


def _collect_disk_blobs(backup_root: str) -> dict:
    """Walk *backup_root* and map each blob hash to the paths that carry it.

    Args:
        backup_root: Directory to scan recursively for ``.zip`` files.

    Returns:
        A dict of ``lower-cased hex hash -> list of file paths``. A hash can
        map to several paths when the same file name occurs in more than one
        directory.
    """
    blobs = {}
    for dirpath, _dirnames, filenames in os.walk(backup_root):
        for fn in filenames:
            if not fn.endswith(".zip"):
                continue
            full_path = os.path.join(dirpath, fn)
            stem = fn[:-4]  # strip ".zip"
            if len(stem) == _HASH_HEX_LEN and _HEX_DIGITS.issuperset(stem):
                # DB hashes are compared in lower case, so normalise the key
                # but remember the real on-disk path for size/purge.
                blobs.setdefault(stem.lower(), []).append(full_path)
            else:
                print(f" WARN: unexpected zip name: {full_path}")
    return blobs


def collect_disk_hashes(backup_root: str) -> set:
    """Walk backup dir and collect all hex hashes from .zip filenames.

    Args:
        backup_root: Directory to scan recursively.

    Returns:
        The set of lower-cased 64-character hex hashes found. Names that are
        not 64 hex characters are reported with a warning and skipped.
    """
    return set(_collect_disk_blobs(backup_root))


def collect_db_hashes(conn) -> set:
    """Fetch all distinct non-NULL content_hash values from files table.

    Args:
        conn: An open DB connection providing ``cursor()`` as a context
            manager.

    Returns:
        The set of hashes as lower-cased hex strings.
    """
    with conn.cursor() as cur:
        cur.execute(
            "SELECT DISTINCT HEX(content_hash) FROM files WHERE content_hash IS NOT NULL"
        )
        return {row[0].lower() for row in cur.fetchall()}


def _blob_size(path: str) -> int:
    """Return the size of *path* in bytes, or 0 if it cannot be read."""
    try:
        return os.path.getsize(path)
    except OSError:
        # The file may have vanished since the scan; count it as empty.
        return 0


def _report_missing(missing_on_disk) -> None:
    """Print each hash that has no blob, with up to 5 DB paths that use it."""
    print(f"\n--- Missing on disk ({len(missing_on_disk)}) ---")
    conn = get_connection()
    try:
        with conn.cursor() as cur:
            for h in sorted(missing_on_disk):
                cur.execute(
                    "SELECT relative_path FROM files WHERE content_hash = UNHEX(%s) LIMIT 5",
                    (h,),
                )
                paths = [row[0] for row in cur.fetchall()]
                print(f" {h} -> {paths}")
    finally:
        conn.close()


def _report_orphans(disk_blobs: dict, orphans_on_disk, purge_orphans: bool) -> None:
    """Print orphan blobs with their sizes and optionally delete them.

    Args:
        disk_blobs: Mapping of hash -> on-disk paths from the scan.
        orphans_on_disk: Hashes present on disk but absent from the DB.
        purge_orphans: When true, delete every file belonging to an orphan.
    """
    print(f"\n--- Orphan blobs ({len(orphans_on_disk)}) ---")
    total_orphan_bytes = 0
    orphan_paths = []
    for h in sorted(orphans_on_disk):
        paths = disk_blobs[h]
        size = sum(_blob_size(p) for p in paths)
        total_orphan_bytes += size
        orphan_paths.extend(paths)
        print(f" {h} ({size} bytes)")
    print(f" Total orphan size: {total_orphan_bytes / 1024 / 1024:.1f} MB")

    if not purge_orphans:
        return

    print("\n PURGING orphan blobs...")
    purged = 0
    for blob in orphan_paths:
        try:
            os.remove(blob)
        except OSError as e:
            print(f" WARN: could not delete {blob}: {e}")
        else:
            purged += 1
    print(f" Purged {purged}/{len(orphan_paths)} orphan blobs.")


def main(*, purge_orphans: bool = False):
    """Compare DB hashes with on-disk blobs and print a reconciliation report.

    Args:
        purge_orphans: When true, delete blob files that have no DB record
            after listing them. Defaults to report-only.

    Exits with status 1 if BACKUP_PATH is unset or not a directory.
    """
    if not BACKUP_PATH or not os.path.isdir(BACKUP_PATH):
        print(f"ERROR: BACKUP_PATH is not a valid directory: {BACKUP_PATH}")
        sys.exit(1)

    print(f"Backup dir : {BACKUP_PATH}")
    print("Scanning disk blobs...")
    disk_blobs = _collect_disk_blobs(BACKUP_PATH)
    disk_hashes = set(disk_blobs)
    print(f" Found {len(disk_hashes)} blob files on disk.")

    print("Loading DB hashes...")
    conn = get_connection()
    try:
        db_hashes = collect_db_hashes(conn)
    finally:
        conn.close()
    print(f" Found {len(db_hashes)} distinct hashes in DB.")

    # --- Reconcile ---
    missing_on_disk = db_hashes - disk_hashes
    orphans_on_disk = disk_hashes - db_hashes
    matched = db_hashes & disk_hashes

    print()
    print("=== Reconciliation Results ===")
    print(f" Matched (DB + disk) : {len(matched)}")
    print(f" Missing on disk : {len(missing_on_disk)}")
    print(f" Orphan blobs (no DB) : {len(orphans_on_disk)}")

    if missing_on_disk:
        _report_missing(missing_on_disk)

    if orphans_on_disk:
        _report_orphans(disk_blobs, orphans_on_disk, purge_orphans)

    if not missing_on_disk and not orphans_on_disk:
        print("\nAll clean — DB and disk are in sync.")


if __name__ == "__main__":
    # Deleting is opt-in: it only happens when the flag is given explicitly.
    main(purge_orphans="--purge-orphans" in sys.argv[1:])