z230
This commit is contained in:
113
reconcile.py
Normal file
113
reconcile.py
Normal file
@@ -0,0 +1,113 @@
|
||||
"""
|
||||
reconcile.py — Cross-check MySQL content_hash values against .zip files on disk.

Reports:
  1. DB hashes with no blob on disk (missing backups)
  2. Blob files on disk with no matching DB record (orphan blobs)
  3. Summary stats
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
from indexer.config import BACKUP_PATH
|
||||
from indexer.db import get_connection
|
||||
|
||||
|
||||
def collect_disk_hashes(backup_root: str) -> set:
    """Walk the backup directory and collect hex hashes from ``.zip`` filenames.

    A file is treated as a blob only when its name is exactly 64 hexadecimal
    characters followed by ``.zip``. Every other ``.zip`` file is reported
    with a warning on stdout and skipped. Files without the ``.zip`` suffix
    are ignored silently.

    Args:
        backup_root: Directory to scan recursively.

    Returns:
        Set of 64-character hex strings, in the letter case found on disk.
    """
    hex_digits = frozenset("0123456789abcdefABCDEF")
    suffix = ".zip"
    hashes = set()
    for dirpath, _dirnames, filenames in os.walk(backup_root):
        for fn in filenames:
            if not fn.endswith(suffix):
                continue
            hex_hash = fn[: -len(suffix)]
            # Length alone is not enough: a 64-character name that is not hex
            # cannot be a content hash, so it must not be reported (or later
            # purged) as an orphan blob.
            if len(hex_hash) == 64 and hex_digits.issuperset(hex_hash):
                hashes.add(hex_hash)
            else:
                print(f" WARN: unexpected zip name: {os.path.join(dirpath, fn)}")
    return hashes
|
||||
|
||||
|
||||
def collect_db_hashes(conn) -> set:
    """Return every distinct, non-NULL ``content_hash`` in ``files`` as lowercase hex.

    Args:
        conn: Open database connection whose ``cursor()`` works as a context
            manager.

    Returns:
        Set of lowercase hex strings, one per distinct hash.
    """
    query = "SELECT DISTINCT HEX(content_hash) FROM files WHERE content_hash IS NOT NULL"
    with conn.cursor() as cursor:
        cursor.execute(query)
        records = cursor.fetchall()

    # The query returns HEX() output; lower-case it before returning.
    lowered = set()
    for record in records:
        lowered.add(record[0].lower())
    return lowered
|
||||
|
||||
|
||||
def _blob_path(backup_root, hex_hash):
    """Return the path where the blob for *hex_hash* is expected under *backup_root*."""
    return os.path.join(backup_root, hex_hash[:2], hex_hash[2:4], hex_hash + ".zip")


def _report_missing(missing_on_disk):
    """Print each DB hash that has no blob, with up to five paths that reference it."""
    print(f"\n--- Missing on disk ({len(missing_on_disk)}) ---")
    conn = get_connection()
    try:
        with conn.cursor() as cur:
            for h in sorted(missing_on_disk):
                cur.execute(
                    "SELECT relative_path FROM files WHERE content_hash = UNHEX(%s) LIMIT 5",
                    (h,),
                )
                paths = [row[0] for row in cur.fetchall()]
                print(f" {h} -> {paths}")
    finally:
        conn.close()


def _report_orphans(backup_root, orphans_on_disk):
    """Print each orphan blob with its size, followed by the total size in MB."""
    print(f"\n--- Orphan blobs ({len(orphans_on_disk)}) ---")
    total_orphan_bytes = 0
    for h in sorted(orphans_on_disk):
        # Ask for the size directly instead of exists()-then-getsize(): a blob
        # that vanishes between the two calls would otherwise raise and abort
        # the whole report. An unreadable or absent blob counts as 0 bytes.
        try:
            size = os.path.getsize(_blob_path(backup_root, h))
        except OSError:
            size = 0
        total_orphan_bytes += size
        print(f" {h} ({size} bytes)")
    print(f" Total orphan size: {total_orphan_bytes / 1024 / 1024:.1f} MB")


def _purge_orphans(backup_root, orphans_on_disk):
    """Delete every orphan blob, warning about (but not stopping on) failures."""
    print("\n PURGING orphan blobs...")
    purged = 0
    for h in sorted(orphans_on_disk):
        blob = _blob_path(backup_root, h)
        try:
            os.remove(blob)
        except OSError as e:
            print(f" WARN: could not delete {blob}: {e}")
        else:
            purged += 1
    print(f" Purged {purged}/{len(orphans_on_disk)} orphan blobs.")


def main():
    """Compare DB content hashes with blobs on disk and print a report.

    Steps:
      1. Validate ``BACKUP_PATH``; print an error and exit with status 1 when
         it is empty or not a directory.
      2. Collect hashes from disk and from the database.
      3. Print matched / missing / orphan counts, then the detail sections.
      4. Optionally delete orphan blobs (``PURGE_ORPHANS``, off by default).
    """
    if not BACKUP_PATH or not os.path.isdir(BACKUP_PATH):
        print(f"ERROR: BACKUP_PATH is not a valid directory: {BACKUP_PATH}")
        sys.exit(1)

    print(f"Backup dir : {BACKUP_PATH}")
    print("Scanning disk blobs...")
    disk_hashes = collect_disk_hashes(BACKUP_PATH)
    print(f" Found {len(disk_hashes)} blob files on disk.")

    print("Loading DB hashes...")
    conn = get_connection()
    try:
        db_hashes = collect_db_hashes(conn)
    finally:
        conn.close()
    print(f" Found {len(db_hashes)} distinct hashes in DB.")

    # --- Options ---
    # Set to True to delete orphan blobs after they have been listed.
    PURGE_ORPHANS = False

    # --- Reconcile ---
    missing_on_disk = db_hashes - disk_hashes
    orphans_on_disk = disk_hashes - db_hashes
    matched = db_hashes & disk_hashes

    print()
    print("=== Reconciliation Results ===")
    print(f" Matched (DB + disk) : {len(matched)}")
    print(f" Missing on disk : {len(missing_on_disk)}")
    print(f" Orphan blobs (no DB) : {len(orphans_on_disk)}")

    if missing_on_disk:
        _report_missing(missing_on_disk)

    if orphans_on_disk:
        _report_orphans(BACKUP_PATH, orphans_on_disk)
        if PURGE_ORPHANS:
            _purge_orphans(BACKUP_PATH, orphans_on_disk)

    if not missing_on_disk and not orphans_on_disk:
        print("\nAll clean — DB and disk are in sync.")
|
||||
|
||||
|
||||
# Script entry point: run the reconciliation only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user