This commit is contained in:
2026-02-12 07:54:22 +01:00
parent 42cd021b9c
commit bd4aede24a
3 changed files with 96 additions and 22 deletions

View File

@@ -63,23 +63,33 @@ def load_all_files(cur) -> dict:
def batch_insert_files(cur, files_list: list, run_id: int) -> dict:
"""
Batch INSERT nových souborů.
Batch INSERT (or re-activate) souborů.
Handles re-appearing files that were previously deleted (exists_now=0)
via ON DUPLICATE KEY UPDATE.
files_list: [{relative_path, file_name, directory, size, mtime, content_hash}]
Returns: {relative_path: file_id}
"""
path_to_id = {}
for i in range(0, len(files_list), BATCH_SIZE):
chunk = files_list[i:i + BATCH_SIZE]
cur.executemany(
"""INSERT INTO files
(relative_path, file_name, directory, file_size, mtime,
content_hash, first_seen_run, last_seen_run, exists_now)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)""",
[(f["relative_path"], f["file_name"], f["directory"],
f["size"], f["mtime"], f["content_hash"], run_id, run_id)
for f in chunk]
)
# Fetch real IDs — lastrowid+j is unreliable with executemany
for f in chunk:
cur.execute(
"""INSERT INTO files
(relative_path, file_name, directory, file_size, mtime,
content_hash, first_seen_run, last_seen_run, exists_now)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)
ON DUPLICATE KEY UPDATE
file_name = VALUES(file_name),
directory = VALUES(directory),
file_size = VALUES(file_size),
mtime = VALUES(mtime),
content_hash = VALUES(content_hash),
last_seen_run = VALUES(last_seen_run),
exists_now = 1""",
(f["relative_path"], f["file_name"], f["directory"],
f["size"], f["mtime"], f["content_hash"], run_id, run_id)
)
# Fetch real IDs
paths = [f["relative_path"] for f in chunk]
placeholders = ",".join(["%s"] * len(paths))
cur.execute(

View File

@@ -54,10 +54,10 @@ def main():
print(f" Found {len(db_hashes)} distinct hashes in DB.")
# --- Options ---
# PURGE_ORPHANS = True # uncomment to delete orphan blobs
PURGE_ORPHANS = False
# PURGE_ORPHANS = True # uncomment to delete orphan blobs
PURGE_ORPHANS = False
# --- Reconcile ---
# --- Reconcile ---
missing_on_disk = db_hashes - disk_hashes
orphans_on_disk = disk_hashes - db_hashes
matched = db_hashes & disk_hashes

View File

@@ -1,7 +1,8 @@
"""
Recovery script: reconstruct directory tree from a specific run.
Usage: python recovery.py <run_id> <output_dir>
Usage: python recovery.py
(interactive — shows last 10 runs, asks which one to recover)
For a given run_id, finds all files that existed at that point
(first_seen_run <= run_id AND last_seen_run >= run_id)
@@ -16,6 +17,39 @@ from indexer.config import BACKUP_PATH, BACKUP_PASSWORD
from indexer.db import get_connection
from indexer.backup import blob_path
DEFAULT_OUTPUT_DIR = r"U:\recovery"
def show_last_runs(n: int = 10):
    """Print a summary table of the last *n* runs and return their IDs.

    Queries the ``runs`` table newest-first, prints the runs oldest-first
    (so the most recent run appears at the bottom, next to the prompt),
    and returns the IDs newest-first.

    NOTE(review): no ``WHERE status = ...`` filter is applied, so runs of
    every status are listed, not only completed ones — confirm intent.

    Args:
        n: maximum number of runs to show.

    Returns:
        list[int]: run IDs, newest first; empty list when the table is empty.
    """
    conn = get_connection()
    try:
        cur = conn.cursor()
        cur.execute(
            """SELECT id, started_at, finished_at, status,
                      files_total, files_new, files_modified, files_deleted
               FROM runs
               ORDER BY id DESC
               LIMIT %s""",
            (n,)
        )
        rows = cur.fetchall()
    finally:
        # Always release the connection, even if the query raises.
        conn.close()
    if not rows:
        print("No runs found in DB.")
        return []
    print(f"\n{'='*80}")
    print(f"{'Run':>5} {'Started':>19} {'Status':>10} {'Total':>7} {'New':>5} {'Mod':>5} {'Del':>5}")
    print(f"{'-'*80}")
    # rows come back newest-first; print oldest-first for readability.
    for row in reversed(rows):
        run_id, started, finished, status, total, new, mod, deleted = row
        # started may be NULL for runs that never recorded a start time.
        started_str = started.strftime("%Y-%m-%d %H:%M:%S") if started else "?"
        print(f"{run_id:>5} {started_str:>19} {status:>10} {total or 0:>7} {new or 0:>5} {mod or 0:>5} {deleted or 0:>5}")
    print(f"{'='*80}")
    return [r[0] for r in rows]
def recover(run_id: int, output_dir: str):
conn = get_connection()
@@ -34,12 +68,12 @@ def recover(run_id: int, output_dir: str):
print(f"No files found for run #{run_id}.")
return
print(f"Recovering {len(rows)} files from run #{run_id} to {output_dir}")
print(f"\nRecovering {len(rows)} files from run #{run_id} to {output_dir}")
recovered = 0
missing = 0
password = BACKUP_PASSWORD.encode("utf-8")
for relative_path, content_hash in rows:
for i, (relative_path, content_hash) in enumerate(rows, 1):
source = blob_path(BACKUP_PATH, content_hash)
target = os.path.join(output_dir, relative_path.replace("/", os.sep))
@@ -67,14 +101,44 @@ def recover(run_id: int, output_dir: str):
missing += 1
continue
print(f"\nRecovered: {recovered} Missing/errors: {missing}")
if i % 1000 == 0:
print(f" [{i}/{len(rows)}] recovered={recovered} missing={missing}")
print(f"\n{'='*60}")
print(f"Recovery complete.")
print(f" Run : #{run_id}")
print(f" Output dir : {output_dir}")
print(f" Recovered : {recovered}")
print(f" Missing/err : {missing}")
print(f"{'='*60}")
if __name__ == "__main__":
    # Interactive entry point: list recent runs, ask which one to recover.
    # (Reconstructed: the old argv-based invocation lines were diff residue
    # left interleaved with this version — run_id was bound twice and the
    # usage message printed unconditionally.)
    run_ids = show_last_runs(10)
    if not run_ids:
        sys.exit(1)

    print()
    choice = input("Enter run ID to recover (or 'q' to quit): ").strip()
    if choice.lower() == "q":
        print("Aborted.")
        sys.exit(0)
    try:
        run_id = int(choice)
    except ValueError:
        print(f"Invalid run ID: {choice}")
        sys.exit(1)

    output_dir = DEFAULT_OUTPUT_DIR
    print(f"\nOutput directory: {output_dir}")
    # Guard against silently mixing a new recovery into leftover files.
    if os.path.exists(output_dir) and os.listdir(output_dir):
        confirm = input("Directory is not empty. Continue? (y/n): ").strip().lower()
        if confirm != "y":
            print("Aborted.")
            sys.exit(0)
    os.makedirs(output_dir, exist_ok=True)
    recover(run_id, output_dir)