Files
drobboxordinacebackup/reconcile.py
2026-02-12 07:54:22 +01:00

114 lines
3.9 KiB
Python

"""
reconcile.py — Cross-check MySQL content_hash values against .zip files on disk.
Reports:
1. DB hashes with no blob on disk (missing backups)
2. Blob files on disk with no matching DB record (orphan blobs)
3. Summary stats
"""
import os
import sys
from indexer.config import BACKUP_PATH
from indexer.db import get_connection
def collect_disk_hashes(backup_root: str) -> set:
    """Walk *backup_root* and collect content hashes from ``<hex>.zip`` filenames.

    Blob files are named ``<sha256-hex>.zip``; the 64-character hex stem is
    the content hash.  Hashes are returned lowercased so they compare equal
    to the lowercased hashes produced by collect_db_hashes() — previously an
    uppercase-named blob could never match and was misreported as both
    "missing" and "orphan".  Files whose stem is not 64 hex characters are
    reported and skipped.
    """
    hashes = set()
    for dirpath, _dirnames, filenames in os.walk(backup_root):
        for fn in filenames:
            if fn.endswith(".zip"):
                hex_hash = fn[:-4]  # strip ".zip"
                # Must be exactly 64 hex chars (SHA-256); normalize case.
                if len(hex_hash) == 64 and all(c in "0123456789abcdefABCDEF" for c in hex_hash):
                    hashes.add(hex_hash.lower())
                else:
                    print(f" WARN: unexpected zip name: {os.path.join(dirpath, fn)}")
    return hashes
def collect_db_hashes(conn) -> set:
    """Return every distinct non-NULL content_hash from the files table.

    Hashes come back from MySQL as uppercase hex via HEX(); they are
    lowercased here so they can be compared against disk-side hashes.
    """
    query = "SELECT DISTINCT HEX(content_hash) FROM files WHERE content_hash IS NOT NULL"
    found = set()
    with conn.cursor() as cur:
        cur.execute(query)
        for record in cur.fetchall():
            found.add(record[0].lower())
    return found
def _blob_path(h: str) -> str:
    """Expected path of the blob for hash *h* under the two-level fan-out layout."""
    return os.path.join(BACKUP_PATH, h[:2], h[2:4], h + ".zip")


def _report_missing(missing: set) -> None:
    """Print each DB hash with no blob on disk, plus up to 5 paths referencing it."""
    print(f"\n--- Missing on disk ({len(missing)}) ---")
    conn = get_connection()
    try:
        with conn.cursor() as cur:
            for h in sorted(missing):
                cur.execute(
                    "SELECT relative_path FROM files WHERE content_hash = UNHEX(%s) LIMIT 5",
                    (h,)
                )
                paths = [row[0] for row in cur.fetchall()]
                print(f" {h} -> {paths}")
    finally:
        conn.close()


def _report_orphans(orphans: set) -> None:
    """Print each blob with no DB record and the total space they consume."""
    print(f"\n--- Orphan blobs ({len(orphans)}) ---")
    total_orphan_bytes = 0
    for h in sorted(orphans):
        blob = _blob_path(h)
        # Blob may live outside the fan-out layout; count it as 0 bytes then.
        size = os.path.getsize(blob) if os.path.exists(blob) else 0
        total_orphan_bytes += size
        print(f" {h} ({size} bytes)")
    print(f" Total orphan size: {total_orphan_bytes / 1024 / 1024:.1f} MB")


def _purge_orphans(orphans: set) -> None:
    """Delete orphan blobs from disk, reporting any that cannot be removed."""
    print("\n PURGING orphan blobs...")
    purged = 0
    for h in sorted(orphans):
        blob = _blob_path(h)
        try:
            os.remove(blob)
            purged += 1
        except OSError as e:
            print(f" WARN: could not delete {blob}: {e}")
    print(f" Purged {purged}/{len(orphans)} orphan blobs.")


def main():
    """Cross-check DB content hashes against blob files on disk and report.

    Exits with status 1 if BACKUP_PATH is unset or not a directory.
    """
    if not BACKUP_PATH or not os.path.isdir(BACKUP_PATH):
        print(f"ERROR: BACKUP_PATH is not a valid directory: {BACKUP_PATH}")
        sys.exit(1)
    print(f"Backup dir : {BACKUP_PATH}")

    print("Scanning disk blobs...")
    disk_hashes = collect_disk_hashes(BACKUP_PATH)
    print(f" Found {len(disk_hashes)} blob files on disk.")

    print("Loading DB hashes...")
    conn = get_connection()
    try:
        db_hashes = collect_db_hashes(conn)
    finally:
        conn.close()
    print(f" Found {len(db_hashes)} distinct hashes in DB.")

    # --- Options ---
    PURGE_ORPHANS = False  # set True to delete orphan blobs

    # --- Reconcile ---
    missing_on_disk = db_hashes - disk_hashes
    orphans_on_disk = disk_hashes - db_hashes
    matched = db_hashes & disk_hashes

    print()
    print("=== Reconciliation Results ===")
    print(f" Matched (DB + disk) : {len(matched)}")
    print(f" Missing on disk : {len(missing_on_disk)}")
    print(f" Orphan blobs (no DB) : {len(orphans_on_disk)}")

    if missing_on_disk:
        _report_missing(missing_on_disk)
    if orphans_on_disk:
        _report_orphans(orphans_on_disk)
        if PURGE_ORPHANS:
            _purge_orphans(orphans_on_disk)
    if not missing_on_disk and not orphans_on_disk:
        print("\nAll clean — DB and disk are in sync.")


if __name__ == "__main__":
    main()