114 lines
3.9 KiB
Python
114 lines
3.9 KiB
Python
"""
reconcile.py — Cross-check MySQL content_hash values against .zip files on disk.

Reports:
  1. DB hashes with no blob on disk (missing backups)
  2. Blob files on disk with no matching DB record (orphan blobs)
  3. Summary stats
"""
|
|
|
|
import os
|
|
import sys
|
|
|
|
from indexer.config import BACKUP_PATH
|
|
from indexer.db import get_connection
|
|
|
|
|
|
def collect_disk_hashes(backup_root: str) -> set:
    """Walk the backup directory and collect the hashes encoded in .zip filenames.

    A file counts as a blob only when its name is exactly 64 hexadecimal
    characters followed by ``.zip``. Every other ``.zip`` file is reported
    with a warning on stdout and skipped; files with other extensions are
    ignored silently.

    Args:
        backup_root: Directory to scan recursively.

    Returns:
        Set of 64-character hex strings, exactly as spelled on disk (case is
        not normalised, because ``main`` rebuilds blob paths from these values).
    """
    hex_digits = frozenset("0123456789abcdefABCDEF")
    hashes = set()
    for dirpath, _dirnames, filenames in os.walk(backup_root):
        for fn in filenames:
            if not fn.endswith(".zip"):
                continue
            hex_hash = fn[:-4]  # strip ".zip"
            # Length alone is not enough: a 64-character name that is not hex
            # is not a content hash and must not be reported as an orphan blob.
            if len(hex_hash) == 64 and hex_digits.issuperset(hex_hash):
                hashes.add(hex_hash)
            else:
                print(f" WARN: unexpected zip name: {os.path.join(dirpath, fn)}")
    return hashes
|
|
|
|
|
|
def collect_db_hashes(conn) -> set:
    """Return every distinct, non-NULL content_hash of the files table.

    Args:
        conn: Open database connection whose ``cursor()`` result works as a
            context manager.

    Returns:
        Set of hex strings, lower-cased so they can be compared directly
        with lower-case hashes taken from blob filenames.
    """
    query = "SELECT DISTINCT HEX(content_hash) FROM files WHERE content_hash IS NOT NULL"
    found = set()
    with conn.cursor() as cursor:
        cursor.execute(query)
        for record in cursor.fetchall():
            # First (only) column is the HEX() text; normalise its case.
            found.add(record[0].lower())
    return found
|
|
|
|
|
|
def _blob_path(hex_hash: str) -> str:
    """Return the sharded path where the blob for *hex_hash* is expected.

    Layout: ``<BACKUP_PATH>/<first 2 chars>/<next 2 chars>/<hash>.zip``.
    """
    return os.path.join(BACKUP_PATH, hex_hash[:2], hex_hash[2:4], hex_hash + ".zip")


def _report_missing(conn, missing_on_disk: set) -> None:
    """Print each DB hash lacking a blob, with up to five paths that use it."""
    print(f"\n--- Missing on disk ({len(missing_on_disk)}) ---")
    with conn.cursor() as cur:
        for h in sorted(missing_on_disk):
            cur.execute(
                "SELECT relative_path FROM files WHERE content_hash = UNHEX(%s) LIMIT 5",
                (h,),
            )
            paths = [row[0] for row in cur.fetchall()]
            print(f" {h} -> {paths}")


def _report_orphans(orphans_on_disk: set) -> None:
    """Print each orphan blob with its size, followed by the total in MB."""
    print(f"\n--- Orphan blobs ({len(orphans_on_disk)}) ---")
    total_orphan_bytes = 0
    for h in sorted(orphans_on_disk):
        try:
            size = os.path.getsize(_blob_path(h))
        except OSError:
            # Blob is not at the expected sharded path, vanished since the
            # scan, or is unreadable: report it as 0 bytes instead of failing.
            size = 0
        total_orphan_bytes += size
        print(f" {h} ({size} bytes)")
    print(f" Total orphan size: {total_orphan_bytes / 1024 / 1024:.1f} MB")


def _purge_orphans(orphans_on_disk: set) -> None:
    """Delete every orphan blob, warning (not aborting) on a failed delete."""
    print("\n PURGING orphan blobs...")
    purged = 0
    for h in sorted(orphans_on_disk):
        blob = _blob_path(h)
        try:
            os.remove(blob)
        except OSError as e:
            print(f" WARN: could not delete {blob}: {e}")
        else:
            purged += 1
    print(f" Purged {purged}/{len(orphans_on_disk)} orphan blobs.")


def main(*, purge_orphans: bool = False) -> None:
    """Reconcile DB content hashes with blob files on disk and print a report.

    Steps: validate ``BACKUP_PATH``, collect hashes from disk and from the
    database, print the matched / missing / orphan counts, then list the
    missing hashes (with sample paths) and the orphan blobs (with sizes).

    Args:
        purge_orphans: When True, delete the orphan blobs after listing them.
            Defaults to False, i.e. report only. This replaces the former
            in-function ``PURGE_ORPHANS`` switch.

    Raises:
        SystemExit: With status 1 when ``BACKUP_PATH`` is unset or is not a
            directory.
    """
    if not BACKUP_PATH or not os.path.isdir(BACKUP_PATH):
        # Fatal configuration error: goes to stderr so it is not mixed into
        # the report written to stdout.
        print(f"ERROR: BACKUP_PATH is not a valid directory: {BACKUP_PATH}", file=sys.stderr)
        sys.exit(1)

    print(f"Backup dir : {BACKUP_PATH}")
    print("Scanning disk blobs...")
    disk_hashes = collect_disk_hashes(BACKUP_PATH)
    print(f" Found {len(disk_hashes)} blob files on disk.")

    print("Loading DB hashes...")
    # One connection serves both the hash load and the missing-path lookup.
    conn = get_connection()
    try:
        db_hashes = collect_db_hashes(conn)
        print(f" Found {len(db_hashes)} distinct hashes in DB.")

        # --- Reconcile ---
        missing_on_disk = db_hashes - disk_hashes
        orphans_on_disk = disk_hashes - db_hashes
        matched = db_hashes & disk_hashes

        print()
        print("=== Reconciliation Results ===")
        print(f" Matched (DB + disk) : {len(matched)}")
        print(f" Missing on disk : {len(missing_on_disk)}")
        print(f" Orphan blobs (no DB) : {len(orphans_on_disk)}")

        if missing_on_disk:
            _report_missing(conn, missing_on_disk)
    finally:
        conn.close()

    if orphans_on_disk:
        _report_orphans(orphans_on_disk)
        if purge_orphans:
            _purge_orphans(orphans_on_disk)

    if not missing_on_disk and not orphans_on_disk:
        print("\nAll clean — DB and disk are in sync.")
|
|
|
|
|
|
# Run the reconciliation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|