diff --git a/indexer/db.py b/indexer/db.py
index f2a9ea9..30a68a2 100644
--- a/indexer/db.py
+++ b/indexer/db.py
@@ -63,23 +63,33 @@ def load_all_files(cur) -> dict:
 def batch_insert_files(cur, files_list: list, run_id: int) -> dict:
     """
-    Batch INSERT nových souborů.
+    Batch INSERT (or re-activate) souborů.
+    Handles re-appearing files that were previously deleted (exists_now=0)
+    via ON DUPLICATE KEY UPDATE.
 
     files_list: [{relative_path, file_name, directory, size, mtime, content_hash}]
     Returns: {relative_path: file_id}
     """
     path_to_id = {}
     for i in range(0, len(files_list), BATCH_SIZE):
         chunk = files_list[i:i + BATCH_SIZE]
-        cur.executemany(
-            """INSERT INTO files
-               (relative_path, file_name, directory, file_size, mtime,
-                content_hash, first_seen_run, last_seen_run, exists_now)
-               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)""",
-            [(f["relative_path"], f["file_name"], f["directory"],
-              f["size"], f["mtime"], f["content_hash"], run_id, run_id)
-             for f in chunk]
-        )
-        # Fetch real IDs — lastrowid+j is unreliable with executemany
+        for f in chunk:
+            cur.execute(
+                """INSERT INTO files
+                   (relative_path, file_name, directory, file_size, mtime,
+                    content_hash, first_seen_run, last_seen_run, exists_now)
+                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)
+                   ON DUPLICATE KEY UPDATE
+                       file_name = VALUES(file_name),
+                       directory = VALUES(directory),
+                       file_size = VALUES(file_size),
+                       mtime = VALUES(mtime),
+                       content_hash = VALUES(content_hash),
+                       last_seen_run = VALUES(last_seen_run),
+                       exists_now = 1""",
+                (f["relative_path"], f["file_name"], f["directory"],
+                 f["size"], f["mtime"], f["content_hash"], run_id, run_id)
+            )
+        # Fetch real IDs
         paths = [f["relative_path"] for f in chunk]
         placeholders = ",".join(["%s"] * len(paths))
         cur.execute(
diff --git a/reconcile.py b/reconcile.py
index e58b5b2..e82ea25 100644
--- a/reconcile.py
+++ b/reconcile.py
@@ -54,10 +54,10 @@ def main():
     print(f" Found {len(db_hashes)} distinct hashes in DB.")
 
     # --- Options ---
-# PURGE_ORPHANS = True # uncomment to delete orphan blobs
-PURGE_ORPHANS = False
+    # PURGE_ORPHANS = True # uncomment to delete orphan blobs
+    PURGE_ORPHANS = False
 
-# --- Reconcile ---
+    # --- Reconcile ---
     missing_on_disk = db_hashes - disk_hashes
     orphans_on_disk = disk_hashes - db_hashes
     matched = db_hashes & disk_hashes
diff --git a/recovery.py b/recovery.py
index cbfcc84..c8a8608 100644
--- a/recovery.py
+++ b/recovery.py
@@ -1,7 +1,8 @@
 """
 Recovery script: reconstruct directory tree from a specific run.
 
-Usage: python recovery.py <run_id> <output_dir>
+Usage: python recovery.py
+    (interactive — shows last 10 runs, asks which one to recover)
 
 For a given run_id, finds all files that existed at that point
 (first_seen_run <= run_id AND last_seen_run >= run_id)
@@ -16,6 +17,39 @@ from indexer.config import BACKUP_PATH, BACKUP_PASSWORD
 from indexer.db import get_connection
 from indexer.backup import blob_path
 
+DEFAULT_OUTPUT_DIR = r"U:\recovery"
+
+
+def show_last_runs(n: int = 10):
+    """Show last N completed runs and return the list."""
+    conn = get_connection()
+    cur = conn.cursor()
+    cur.execute(
+        """SELECT id, started_at, finished_at, status,
+                  files_total, files_new, files_modified, files_deleted
+           FROM runs
+           ORDER BY id DESC
+           LIMIT %s""",
+        (n,)
+    )
+    rows = cur.fetchall()
+    conn.close()
+
+    if not rows:
+        print("No runs found in DB.")
+        return []
+
+    print(f"\n{'='*80}")
+    print(f"{'Run':>5} {'Started':>19} {'Status':>10} {'Total':>7} {'New':>5} {'Mod':>5} {'Del':>5}")
+    print(f"{'-'*80}")
+    for row in reversed(rows):
+        run_id, started, finished, status, total, new, mod, deleted = row
+        started_str = started.strftime("%Y-%m-%d %H:%M:%S") if started else "?"
+        print(f"{run_id:>5} {started_str:>19} {status:>10} {total or 0:>7} {new or 0:>5} {mod or 0:>5} {deleted or 0:>5}")
+    print(f"{'='*80}")
+
+    return [r[0] for r in rows]
+
 
 def recover(run_id: int, output_dir: str):
     conn = get_connection()
@@ -34,12 +68,12 @@ def recover(run_id: int, output_dir: str):
         print(f"No files found for run #{run_id}.")
         return
 
-    print(f"Recovering {len(rows)} files from run #{run_id} to {output_dir}")
+    print(f"\nRecovering {len(rows)} files from run #{run_id} to {output_dir}")
     recovered = 0
     missing = 0
     password = BACKUP_PASSWORD.encode("utf-8")
 
-    for relative_path, content_hash in rows:
+    for i, (relative_path, content_hash) in enumerate(rows, 1):
         source = blob_path(BACKUP_PATH, content_hash)
         target = os.path.join(output_dir, relative_path.replace("/", os.sep))
 
@@ -67,14 +101,44 @@ def recover(run_id: int, output_dir: str):
             missing += 1
             continue
 
-    print(f"\nRecovered: {recovered} Missing/errors: {missing}")
+        if i % 1000 == 0:
+            print(f" [{i}/{len(rows)}] recovered={recovered} missing={missing}")
+
+    print(f"\n{'='*60}")
+    print(f"Recovery complete.")
+    print(f" Run : #{run_id}")
+    print(f" Output dir : {output_dir}")
+    print(f" Recovered : {recovered}")
+    print(f" Missing/err : {missing}")
+    print(f"{'='*60}")
 
 
 if __name__ == "__main__":
-    if len(sys.argv) != 3:
-        print("Usage: python recovery.py <run_id> <output_dir>")
+    run_ids = show_last_runs(10)
+
+    if not run_ids:
         sys.exit(1)
-    run_id = int(sys.argv[1])
-    output_dir = sys.argv[2]
+    print()
+    choice = input("Enter run ID to recover (or 'q' to quit): ").strip()
+    if choice.lower() == "q":
+        print("Aborted.")
+        sys.exit(0)
+
+    try:
+        run_id = int(choice)
+    except ValueError:
+        print(f"Invalid run ID: {choice}")
+        sys.exit(1)
+
+    output_dir = DEFAULT_OUTPUT_DIR
+    print(f"\nOutput directory: {output_dir}")
+
+    if os.path.exists(output_dir) and os.listdir(output_dir):
+        confirm = input("Directory is not empty. Continue? (y/n): ").strip().lower()
+        if confirm != "y":
+            print("Aborted.")
+            sys.exit(0)
+
+    os.makedirs(output_dir, exist_ok=True)
 
     recover(run_id, output_dir)