z230

2026-02-09 20:16:37 +01:00
parent e7dd89962e
commit 9838164b88
9 changed files with 444 additions and 150 deletions
@@ -0,0 +1,14 @@
 {
  "permissions": {
    "allow": [
      "Bash(dir /s \"U:\\\\drobboxordinacebackup\")",
      "Bash(where:*)",
      "Bash(dir:*)",
      "Bash(python:*)",
      "Bash(pip install:*)",
      "Bash(tasklist:*)",
      "Bash(wmic process:*)",
      "Bash(taskkill:*)"
    ]
  }
 }
@@ -0,0 +1,42 @@
 import os
 import shutil
 import tempfile
 def blob_path(backup_root: str, content_hash: bytes) -> str:
    """Return the blob file path for a hash: BACKUP/ab/cd/abcdef...blob

    The first two hex pairs of the digest become two nested directory
    levels below ``backup_root``; the file name is the full hex digest
    followed by ".blob".
    """
    digest_hex = content_hash.hex()
    outer_dir, inner_dir = digest_hex[:2], digest_hex[2:4]
    blob_name = f"{digest_hex}.blob"
    return os.path.join(backup_root, outer_dir, inner_dir, blob_name)
 def ensure_backed_up(files_with_hash: list, backup_root: str) -> int:
    """
    Copy files into content-addressable storage.

    files_with_hash: [(full_path, content_hash_bytes), ...]
    Files whose blob already exists are skipped (deduplication).
    A file that cannot be copied is reported with a WARN line and skipped;
    the remaining files are still processed.

    Returns: number of newly written blobs.
    """
    backed_up = 0
    for full_path, content_hash in files_with_hash:
        target = blob_path(backup_root, content_hash)
        if os.path.exists(target):
            continue
        target_dir = os.path.dirname(target)
        os.makedirs(target_dir, exist_ok=True)
        # Reset for every file: if mkstemp itself fails, the handler below
        # must not touch an unbound name or the previous file's temp path.
        tmp_path = None
        try:
            # Atomic write: copy into a temp file in the same directory,
            # then rename it over the final blob name.
            fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
            os.close(fd)
            shutil.copy2(full_path, tmp_path)
            os.replace(tmp_path, target)
            backed_up += 1
        except OSError as e:
            # FileNotFoundError and PermissionError are OSError subclasses.
            print(f"  WARN: backup failed for {full_path}: {e}")
            if tmp_path is not None:
                # Best-effort cleanup of the temp file; a failure here must
                # not abort the backup of the remaining files.
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
    return backed_up
@@ -1,7 +1,6 @@
 import os
 from dotenv import load_dotenv
 # load .env from the project root
 load_dotenv()
 # =========================
@@ -24,9 +23,11 @@ DB_CONFIG = {
 # Directory tree that gets scanned; None when ROOT_PATH is not set.
 ROOT_PATH = os.getenv("ROOT_PATH")
 # Label of the scanned root; falls back to "ORDINACE" when unset.
 ROOT_NAME = os.getenv("ROOT_NAME", "ORDINACE")
 # Root of the blob backup storage; None when BACKUP_PATH is not set.
 BACKUP_PATH = os.getenv("BACKUP_PATH")
 # =========================
 # Behaviour
 # =========================
 # True only when DRY_RUN is exactly "1"; an unset variable defaults to "1",
 # so dry-run is on unless explicitly disabled.
 DRY_RUN = os.getenv("DRY_RUN", "1") == "1"
 # Number of rows sent per executemany() call; defaults to 1000.
 # int() raises ValueError when the variable holds a non-integer string.
 BATCH_SIZE = int(os.getenv("BATCH_SIZE", 1000))
@@ -1,91 +1,123 @@
 import pymysql
-import hashlib
+from datetime import datetime
-from indexer.config import DB_CONFIG, ROOT_NAME
+from indexer.config import DB_CONFIG, BATCH_SIZE
 def get_connection():
    """Open and return a new pymysql connection built from DB_CONFIG."""
    connection = pymysql.connect(**DB_CONFIG)
    return connection
-def preload_mark_all_missing():
+# ── Run management ──────────────────────────────────────────
    """
    Na začátku běhu:
    označí všechny soubory jako neexistující.
    Ty, které skener znovu najde, se přepnou zpět na exists_now = 1.
    """
    conn = get_connection()
    try:
        with conn.cursor() as cur:
            cur.execute("UPDATE files SET exists_now = 0")
        conn.commit()
    finally:
        conn.close()
-
+def create_run(cur) -> int:
 def path_hash(path: str) -> bytes:
    """
    Return the raw MD5 digest of the UTF-8 encoded path.

    The digest serves only as an identifier, not as a security hash.
    """
    hasher = hashlib.md5()
    hasher.update(path.encode("utf-8"))
    return hasher.digest()
 def find_file_by_path(cur, path_hash_bytes):
    cur.execute(
-        """
+        "INSERT INTO runs (started_at, status) VALUES (%s, 'RUNNING')",
-        SELECT id, file_size, mtime, content_hash
+        (datetime.now(),)
        FROM files
        WHERE path_hash = %s
        """,
        (path_hash_bytes,)
    )
    return cur.fetchone()
 def insert_file(cur, file):
    """
    Insert one row into ``files`` and return its new id.

    ``file`` is a mapping providing "full_path", "file_name", "directory",
    "size", "mtime" and "content_hash". The row is stored under ROOT_NAME
    with first_seen/last_seen set to NOW() and exists_now = 1.
    Returns ``cur.lastrowid`` after the INSERT.
    """
    sql = """
        INSERT INTO files (
            root_name, full_path, path_hash,
            file_name, directory,
            file_size, mtime, content_hash,
            first_seen, last_seen, exists_now
        )
        VALUES (
            %s, %s, %s,
            %s, %s,
            %s, %s, %s,
            NOW(), NOW(), 1
        )
        """
    params = (
        ROOT_NAME,
        file["full_path"],
        path_hash(file["full_path"]),
        file["file_name"],
        file["directory"],
        file["size"],
        file["mtime"],
        file["content_hash"],
    )
    cur.execute(sql, params)
    new_id = cur.lastrowid
    return new_id
-def update_file(cur, file_id, file):
+def finalize_run(cur, run_id: int, stats: dict):
    """
    Mark run ``run_id`` as COMPLETED and store its file counters.

    cur: DB cursor used to execute the UPDATE (nothing is committed here).
    stats: mapping with the keys "total", "new", "modified", "deleted"
        and "unchanged"; each value is written to the matching files_*
        column of the ``runs`` row.
    finished_at is set to datetime.now() (naive local time).
    """
    cur.execute(
        """UPDATE runs
           SET finished_at = %s, status = 'COMPLETED',
               files_total = %s, files_new = %s, files_modified = %s,
               files_deleted = %s, files_unchanged = %s
           WHERE id = %s""",
        (datetime.now(), stats["total"], stats["new"], stats["modified"],
         stats["deleted"], stats["unchanged"], run_id)
    )
 def fail_run(cur, run_id: int):
    """Flag run ``run_id`` as FAILED and stamp finished_at with the current local time.

    Only executes the UPDATE on ``cur``; committing is left to the caller.
    """
    finished_at = datetime.now()
    sql = "UPDATE runs SET finished_at = %s, status = 'FAILED' WHERE id = %s"
    cur.execute(sql, (finished_at, run_id))
 # ── Load DB state ──────────────────────────────────────────
 def load_all_files(cur) -> dict:
    """
-        UPDATE files
+    Načte všechny existující soubory z DB do RAM.
-        SET file_size = %s,
+    Returns: {relative_path: {id, size, mtime, content_hash}}
-            mtime = %s,
+    """
-            content_hash = %s,
+    cur.execute(
-            last_seen = NOW(),
+        """SELECT id, relative_path, file_size, mtime, content_hash
-            exists_now = 1
+           FROM files WHERE exists_now = 1"""
        WHERE id = %s
        """,
        (
            file["size"],
            file["mtime"],
            file["content_hash"],
            file_id,
    )
    result = {}
    for row in cur.fetchall():
        file_id, rel_path, size, mtime, content_hash = row
        result[rel_path] = {
            "id": file_id,
            "size": size,
            "mtime": mtime,
            "content_hash": content_hash,
        }
    return result
 # ── Batch operations ────────────────────────────────────────
 def batch_insert_files(cur, files_list: list, run_id: int) -> dict:
    """
    Batch INSERT of new files.

    files_list: [{relative_path, file_name, directory, size, mtime, content_hash}]
    run_id: stored as both first_seen_run and last_seen_run of every new row.
    Rows are sent in chunks of BATCH_SIZE, each with exists_now = 1.

    Returns: {relative_path: file_id}
    """
    path_to_id = {}
    for i in range(0, len(files_list), BATCH_SIZE):
        chunk = files_list[i:i + BATCH_SIZE]
        cur.executemany(
            """INSERT INTO files
               (relative_path, file_name, directory, file_size, mtime,
                content_hash, first_seen_run, last_seen_run, exists_now)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)""",
            [(f["relative_path"], f["file_name"], f["directory"],
              f["size"], f["mtime"], f["content_hash"], run_id, run_id)
             for f in chunk]
        )
        # pymysql executemany: lastrowid = first id in batch
        # NOTE(review): the id mapping below assumes the whole chunk was sent
        # as ONE multi-row INSERT and that the generated ids are consecutive.
        # Confirm against pymysql's statement-length splitting in executemany
        # and the server's auto-increment lock mode; if either assumption
        # breaks, the ids assigned here will point at the wrong rows.
        first_id = cur.lastrowid
        for j, f in enumerate(chunk):
            path_to_id[f["relative_path"]] = first_id + j
    return path_to_id
 def batch_update_modified(cur, files_list: list, run_id: int):
    """
    Batch UPDATE of modified files.

    files_list: [{id, size, mtime, content_hash}]
    Each row gets its new size, mtime and content hash, last_seen_run set
    to ``run_id`` and exists_now = 1. Rows are sent in chunks of BATCH_SIZE.
    """
    sql = """UPDATE files
               SET file_size = %s, mtime = %s, content_hash = %s,
                   last_seen_run = %s, exists_now = 1
               WHERE id = %s"""
    for start in range(0, len(files_list), BATCH_SIZE):
        rows = []
        for entry in files_list[start:start + BATCH_SIZE]:
            rows.append(
                (entry["size"], entry["mtime"], entry["content_hash"],
                 run_id, entry["id"])
            )
        cur.executemany(sql, rows)
 def batch_mark_deleted(cur, file_ids: list, run_id: int):
    """Batch UPDATE of deleted files: exists_now = 0, last_seen_run = run_id.

    file_ids are processed in chunks of BATCH_SIZE.
    """
    # NOTE(review): last_seen_run is set to the run that detected the
    # deletion; confirm this matches how readers of last_seen_run
    # (e.g. point-in-time recovery queries) interpret the column.
    sql = "UPDATE files SET exists_now = 0, last_seen_run = %s WHERE id = %s"
    for start in range(0, len(file_ids), BATCH_SIZE):
        batch = file_ids[start:start + BATCH_SIZE]
        params = [(run_id, file_id) for file_id in batch]
        cur.executemany(sql, params)
 def batch_update_unchanged(cur, file_ids: list, run_id: int):
    """Batch UPDATE of unchanged files: only last_seen_run (and exists_now = 1).

    file_ids are processed in chunks of BATCH_SIZE.
    """
    sql = "UPDATE files SET last_seen_run = %s, exists_now = 1 WHERE id = %s"
    for offset in range(0, len(file_ids), BATCH_SIZE):
        params = []
        for file_id in file_ids[offset:offset + BATCH_SIZE]:
            params.append((run_id, file_id))
        cur.executemany(sql, params)
@@ -1,19 +1,21 @@
-def log_event(cur, file_id, event_type, old=None, new=None):
+from indexer.config import BATCH_SIZE
-    cur.execute(
+
 def batch_log_events(cur, events: list):
    """
-        INSERT INTO file_events (
+    Batch INSERT eventů do file_events.
-            file_id, event_type, event_time,
+    events: [{run_id, file_id, event_type, old_size, new_size, old_hash, new_hash}]
-            old_size, new_size,
+    """
-            old_hash, new_hash
+    if not events:
-        )
+        return
-        VALUES (%s, %s, NOW(), %s, %s, %s, %s)
+    for i in range(0, len(events), BATCH_SIZE):
-        """,
+        chunk = events[i:i + BATCH_SIZE]
-        (
+        cur.executemany(
-            file_id,
+            """INSERT INTO file_events
-            event_type,
+               (run_id, file_id, event_type, old_size, new_size, old_hash, new_hash)
-            old["size"] if old else None,
+               VALUES (%s, %s, %s, %s, %s, %s, %s)""",
-            new["size"] if new else None,
+            [(e["run_id"], e["file_id"], e["event_type"],
-            old["content_hash"] if old else None,
+              e.get("old_size"), e.get("new_size"),
-            new["content_hash"] if new else None,
+              e.get("old_hash"), e.get("new_hash"))
-        )
+             for e in chunk]
        )
@@ -1,21 +1,30 @@
 import os
 from datetime import datetime
 from indexer.hasher import blake3_file
-def scan_files(root_path):
+
 def scan_files(root_path: str) -> dict:
    """
    Projde celý adresářový strom a vrátí dict všech souborů.
    Nehasuje obsah — to se dělá až pro změněné soubory.
    Returns:
        {relative_path: {full_path, file_name, directory, size, mtime}}
    """
    result = {}
    for root, _, files in os.walk(root_path):
        for name in files:
            full_path = os.path.join(root, name)
            try:
                stat = os.stat(full_path)
-            except FileNotFoundError:
+            except (FileNotFoundError, PermissionError):
                continue
-
+            rel_path = os.path.relpath(full_path, root_path).replace("\\", "/")
-            yield {
+            rel_dir = os.path.relpath(root, root_path).replace("\\", "/")
-                "full_path": full_path.replace("\\", "/"),
+            result[rel_path] = {
                "full_path": full_path,
                "file_name": name,
-                "directory": root.replace("\\", "/"),
+                "directory": rel_dir,
                "size": stat.st_size,
                "mtime": datetime.fromtimestamp(stat.st_mtime),
                "content_hash": blake3_file(full_path),
            }
    return result
@@ -1,73 +1,200 @@
-from indexer.config import ROOT_PATH, ROOT_NAME, DRY_RUN
+from indexer.config import ROOT_PATH, ROOT_NAME, DRY_RUN, BACKUP_PATH
 from indexer.scanner import scan_files
 from indexer.hasher import blake3_file
 from indexer.db import (
-    get_connection,
+    get_connection, create_run, finalize_run, fail_run,
-    preload_mark_all_missing,
+    load_all_files, batch_insert_files, batch_update_modified,
-    find_file_by_path,
+    batch_mark_deleted, batch_update_unchanged,
    insert_file,
    update_file,
    path_hash,
 )
-from indexer.events import log_event
+from indexer.events import batch_log_events
 from indexer.backup import ensure_backed_up
 def main():
    print("=" * 60)
    print("ORDINACE DROPBOX BACKUP – INDEXER")
    print(f"Root    : {ROOT_PATH}")
-    print(f"Name    : {ROOT_NAME}")
+    print(f"Backup  : {BACKUP_PATH}")
    print(f"DRY RUN : {DRY_RUN}")
    print("=" * 60)
    # ── 1. Scan filesystem (fast, no hashing) ──
    print("\n[1/7] Scanning filesystem...")
    fs = scan_files(ROOT_PATH)
    print(f"       Found {len(fs)} files on disk.")
    if DRY_RUN:
        # V DRY_RUN režimu jen ukážeme co by se stalo
        print("\n[DRY RUN] No DB connection, showing scan results only.")
        print(f"  Files on disk: {len(fs)}")
        return
    # ── 2. Connect & create run ──
    conn = get_connection()
    cur = conn.cursor()
    run_id = create_run(cur)
    print(f"\n[2/7] Run #{run_id} created.")
-    if not DRY_RUN:
+    try:
-        preload_mark_all_missing()
+        # ── 3. Load DB state ──
        print("[3/7] Loading DB state...")
        db = load_all_files(cur)
        print(f"       {len(db)} files in DB (exists_now=1).")
-    created = modified = seen = 0
+        # ── 4. Diff ──
        print("[4/7] Diffing...")
        fs_paths = set(fs.keys())
        db_paths = set(db.keys())
-    for file in scan_files(ROOT_PATH):
+        new_paths = fs_paths - db_paths
-        seen += 1
+        deleted_paths = db_paths - fs_paths
-        ph = path_hash(file["full_path"])
+        existing_paths = fs_paths & db_paths
        row = find_file_by_path(cur, ph)
-        if row is None:
+        modified_paths = set()
-            created += 1
+        unchanged_paths = set()
-            if not DRY_RUN:
+        for p in existing_paths:
-                file_id = insert_file(cur, file)
+            fs_file = fs[p]
-                log_event(cur, file_id, "CREATED", new=file)
+            db_file = db[p]
            if fs_file["size"] != db_file["size"] or fs_file["mtime"] != db_file["mtime"]:
                modified_paths.add(p)
            else:
-            file_id, old_size, old_mtime, old_hash = row
+                unchanged_paths.add(p)
-            if old_size != file["size"] or old_hash != file["content_hash"]:
+
-                modified += 1
+        print(f"       NEW: {len(new_paths)}  MOD: {len(modified_paths)}  "
-                if not DRY_RUN:
+              f"DEL: {len(deleted_paths)}  SAME: {len(unchanged_paths)}")
-                    update_file(cur, file_id, file)
+
-                    log_event(
+        # ── 5. Process changes ──
-                        cur,
+        print("[5/7] Processing changes...")
-                        file_id,
+        events = []
-                        "MODIFIED",
+        files_to_backup = []
-                        old={"size": old_size, "content_hash": old_hash},
+
-                        new=file,
+        # 5a) NEW files — compute BLAKE3, batch INSERT
-                    )
+        if new_paths:
            print(f"  Hashing {len(new_paths)} new files...")
            new_files = []
            for p in new_paths:
                f = fs[p]
                try:
                    content_hash = blake3_file(f["full_path"])
                except (FileNotFoundError, PermissionError, OSError) as e:
                    print(f"  WARN: skip {p}: {e}")
                    continue
                new_files.append({
                    "relative_path": p,
                    "file_name": f["file_name"],
                    "directory": f["directory"],
                    "size": f["size"],
                    "mtime": f["mtime"],
                    "content_hash": content_hash,
                })
                files_to_backup.append((f["full_path"], content_hash))
            if new_files:
                path_to_id = batch_insert_files(cur, new_files, run_id)
                for nf in new_files:
                    events.append({
                        "run_id": run_id,
                        "file_id": path_to_id[nf["relative_path"]],
                        "event_type": "CREATED",
                        "new_size": nf["size"],
                        "new_hash": nf["content_hash"],
                    })
        # 5b) MODIFIED files — compute BLAKE3, batch UPDATE
        if modified_paths:
            print(f"  Hashing {len(modified_paths)} modified files...")
            mod_files = []
            for p in modified_paths:
                f = fs[p]
                db_file = db[p]
                try:
                    content_hash = blake3_file(f["full_path"])
                except (FileNotFoundError, PermissionError, OSError) as e:
                    print(f"  WARN: skip {p}: {e}")
                    continue
                mod_files.append({
                    "id": db_file["id"],
                    "size": f["size"],
                    "mtime": f["mtime"],
                    "content_hash": content_hash,
                })
                events.append({
                    "run_id": run_id,
                    "file_id": db_file["id"],
                    "event_type": "MODIFIED",
                    "old_size": db_file["size"],
                    "new_size": f["size"],
                    "old_hash": db_file["content_hash"],
                    "new_hash": content_hash,
                })
                files_to_backup.append((f["full_path"], content_hash))
            if mod_files:
                batch_update_modified(cur, mod_files, run_id)
        # 5c) DELETED files — batch UPDATE exists_now=0
        if deleted_paths:
            del_ids = [db[p]["id"] for p in deleted_paths]
            batch_mark_deleted(cur, del_ids, run_id)
            for p in deleted_paths:
                events.append({
                    "run_id": run_id,
                    "file_id": db[p]["id"],
                    "event_type": "DELETED",
                    "old_size": db[p]["size"],
                    "old_hash": db[p]["content_hash"],
                })
        # 5d) UNCHANGED files — batch UPDATE last_seen_run
        if unchanged_paths:
            unch_ids = [db[p]["id"] for p in unchanged_paths]
            batch_update_unchanged(cur, unch_ids, run_id)
        # 5e) Log all events
        if events:
            batch_log_events(cur, events)
        # ── 6. Backup ──
        if files_to_backup and BACKUP_PATH:
            print(f"[6/7] Backing up {len(files_to_backup)} files...")
            backed = ensure_backed_up(files_to_backup, BACKUP_PATH)
            print(f"       {backed} new blobs written.")
        else:
-                if not DRY_RUN:
+            print("[6/7] Nothing to backup.")
                    cur.execute(
                        "UPDATE files SET last_seen = NOW(), exists_now = 1 WHERE id = %s",
                        (file_id,)
                    )
-        if seen % 500 == 0:
+        # ── 7. Finalize ──
-            print(f"{seen} files scanned...")
+        stats = {
-
+            "total": len(fs),
-    if not DRY_RUN:
+            "new": len(new_paths),
            "modified": len(modified_paths),
            "deleted": len(deleted_paths),
            "unchanged": len(unchanged_paths),
        }
        finalize_run(cur, run_id, stats)
        conn.commit()
        print(f"[7/7] Run #{run_id} COMPLETED.")
-    print("================================")
+    except Exception as e:
-    print(f"Scanned  : {seen}")
+        print(f"\nERROR: {e}")
-    print(f"Created  : {created}")
+        try:
-    print(f"Modified : {modified}")
+            fail_run(cur, run_id)
-
+            conn.commit()
        except Exception:
            pass
        conn.rollback()
        raise
    finally:
        conn.close()
    # ── Summary ──
    print("\n" + "=" * 60)
    print(f"Total    : {stats['total']}")
    print(f"New      : {stats['new']}")
    print(f"Modified : {stats['modified']}")
    print(f"Deleted  : {stats['deleted']}")
    print(f"Unchanged: {stats['unchanged']}")
    print("=" * 60)
 if __name__ == "__main__":
    main()
@@ -0,0 +1,64 @@
 """
 Recovery script: reconstruct directory tree from a specific run.
 Usage: python recovery.py <run_id> <output_dir>
 For a given run_id, finds all files that existed at that point
 (first_seen_run <= run_id AND last_seen_run >= run_id)
 and copies them from backup storage to output_dir preserving
 the original directory structure.
 """
 import os
 import sys
 import shutil
 from indexer.config import DB_CONFIG, BACKUP_PATH
 from indexer.db import get_connection
 from indexer.backup import blob_path
 def recover(run_id: int, output_dir: str):
    """
    Rebuild the directory tree recorded for run ``run_id`` under ``output_dir``.

    Selects every row of ``files`` with
    first_seen_run <= run_id <= last_seen_run and copies the matching blob
    from BACKUP_PATH to ``output_dir/<relative_path>``.

    Rows without a content hash and rows whose blob file is absent are
    reported and counted as missing; they do not abort the recovery.

    Raises:
        RuntimeError: when BACKUP_PATH is not configured.
    """
    # Fail early with a clear message instead of a TypeError from
    # os.path.join(None, ...) in the middle of the copy loop.
    if not BACKUP_PATH:
        raise RuntimeError("BACKUP_PATH is not configured; cannot locate blobs.")
    conn = get_connection()
    try:
        cur = conn.cursor()
        cur.execute(
            """SELECT relative_path, content_hash
               FROM files
               WHERE first_seen_run <= %s AND last_seen_run >= %s""",
            (run_id, run_id)
        )
        rows = cur.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()
    if not rows:
        print(f"No files found for run #{run_id}.")
        return
    print(f"Recovering {len(rows)} files from run #{run_id} to {output_dir}")
    recovered = 0
    missing = 0
    for relative_path, content_hash in rows:
        if content_hash is None:
            # Without a hash there is no blob to look up; blob_path() would
            # crash on None.hex().
            print(f"  MISSING blob: no content hash recorded for {relative_path}")
            missing += 1
            continue
        source = blob_path(BACKUP_PATH, content_hash)
        target = os.path.join(output_dir, relative_path.replace("/", os.sep))
        if not os.path.exists(source):
            print(f"  MISSING blob: {content_hash.hex()} for {relative_path}")
            missing += 1
            continue
        target_dir = os.path.dirname(target)
        if target_dir:
            # dirname is "" for a bare file name; os.makedirs("") would raise.
            os.makedirs(target_dir, exist_ok=True)
        shutil.copy2(source, target)
        recovered += 1
    print(f"\nRecovered: {recovered}  Missing blobs: {missing}")
 if __name__ == "__main__":
    # Expected invocation: python recovery.py <run_id> <output_dir>
    if len(sys.argv) != 3:
        print("Usage: python recovery.py <run_id> <output_dir>")
        sys.exit(1)
    try:
        run_id = int(sys.argv[1])
    except ValueError:
        # Show the usage text instead of a raw traceback for a bad run_id.
        print(f"Invalid run_id: {sys.argv[1]!r} (must be an integer)")
        print("Usage: python recovery.py <run_id> <output_dir>")
        sys.exit(1)
    output_dir = sys.argv[2]
    recover(run_id, output_dir)
@@ -0,0 +1,3 @@
 pymysql
 blake3
 python-dotenv