z230

2026-02-09 20:16:37 +01:00
parent e7dd89962e
commit 9838164b88
9 changed files with 444 additions and 150 deletions
@@ -0,0 +1,42 @@
+import os
+import shutil
+import tempfile
+
+
+def blob_path(backup_root: str, content_hash: bytes) -> str:
+    """Return the path of the blob file for *content_hash*.
+
+    Layout: BACKUP/ab/cd/abcdef...blob — the first two hex pairs of the
+    digest become two nested directory levels, and the file name is the full
+    hex digest followed by ".blob".
+    """
+    digest = content_hash.hex()
+    first_level, second_level = digest[:2], digest[2:4]
+    file_name = digest + ".blob"
+    return os.path.join(backup_root, first_level, second_level, file_name)
+
+
+def ensure_backed_up(files_with_hash: list, backup_root: str) -> int:
+    """
+    Copy files into the content-addressable backup storage.
+
+    files_with_hash: [(full_path, content_hash_bytes), ...]
+    backup_root: root directory of the blob store (layout given by blob_path).
+
+    Files whose blob already exists are skipped (deduplication).
+    A file that cannot be copied is reported with a WARN line on stdout and
+    skipped; the remaining files are still processed.
+
+    Returns: number of newly backed-up files.
+    """
+    backed_up = 0
+    for full_path, content_hash in files_with_hash:
+        target = blob_path(backup_root, content_hash)
+        if os.path.exists(target):
+            continue
+
+        target_dir = os.path.dirname(target)
+        os.makedirs(target_dir, exist_ok=True)
+
+        # Reset for every file: when mkstemp itself fails there is no temp
+        # file, and the handler must not touch an unbound or stale name.
+        tmp_path = None
+        try:
+            # Atomic write: copy into a temp file next to the target,
+            # then rename it over the final name.
+            fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
+            os.close(fd)
+            shutil.copy2(full_path, tmp_path)
+            os.replace(tmp_path, target)
+        except OSError as e:
+            # FileNotFoundError and PermissionError are OSError subclasses.
+            print(f"  WARN: backup failed for {full_path}: {e}")
+            # Remove the leftover temp file, if one was created.
+            if tmp_path is not None and os.path.exists(tmp_path):
+                os.remove(tmp_path)
+        else:
+            backed_up += 1
+
+    return backed_up
@@ -1,7 +1,6 @@
 import os
 from dotenv import load_dotenv

-# načti .env z rootu projektu
 load_dotenv()

 # =========================
@@ -24,9 +23,11 @@ DB_CONFIG = {

 ROOT_PATH = os.getenv("ROOT_PATH")
 ROOT_NAME = os.getenv("ROOT_NAME", "ORDINACE")
+# Value of the BACKUP_PATH environment variable; None when it is not set.
+BACKUP_PATH = os.getenv("BACKUP_PATH")

 # =========================
 # Behaviour
 # =========================

+# True only when DRY_RUN equals "1"; an unset variable defaults to "1".
 DRY_RUN = os.getenv("DRY_RUN", "1") == "1"
+# Integer parsed from BATCH_SIZE (default 1000). The batch helpers in this
+# commit use it as slice length and range() step, so it presumably has to be
+# >= 1 — NOTE(review): not validated here; confirm 0/negative is rejected.
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", 1000))
@@ -1,91 +1,123 @@
 import pymysql
-import hashlib
-from indexer.config import DB_CONFIG, ROOT_NAME
+from datetime import datetime
+from indexer.config import DB_CONFIG, BATCH_SIZE


 def get_connection():
+    """Open and return a new pymysql connection created from DB_CONFIG."""
    return pymysql.connect(**DB_CONFIG)


-def preload_mark_all_missing():
-    """
-    Na začátku běhu:
-    označí všechny soubory jako neexistující.
-    Ty, které skener znovu najde, se přepnou zpět na exists_now = 1.
-    """
-    conn = get_connection()
-    try:
-        with conn.cursor() as cur:
-            cur.execute("UPDATE files SET exists_now = 0")
-        conn.commit()
-    finally:
-        conn.close()
+# ── Run management ──────────────────────────────────────────

-
-def path_hash(path: str) -> bytes:
-    """
-    MD5 hash cesty – pouze identifikátor, ne bezpečnostní hash
-    """
-    return hashlib.md5(path.encode("utf-8")).digest()
-
-
-def find_file_by_path(cur, path_hash_bytes):
+def create_run(cur) -> int:
+    """
+    Insert a new row into `runs` with status 'RUNNING' and started_at set to
+    the current local time (naive datetime.now()).
+
+    cur: DB cursor; must provide execute() and lastrowid.
+    Returns: cur.lastrowid after the INSERT (presumably the id of the new run).
+    """
    cur.execute(
-        """
-        SELECT id, file_size, mtime, content_hash
-        FROM files
-        WHERE path_hash = %s
-        """,
-        (path_hash_bytes,)
-    )
-    return cur.fetchone()
-
-
-def insert_file(cur, file):
-    cur.execute(
-        """
-        INSERT INTO files (
-            root_name, full_path, path_hash,
-            file_name, directory,
-            file_size, mtime, content_hash,
-            first_seen, last_seen, exists_now
-        )
-        VALUES (
-            %s, %s, %s,
-            %s, %s,
-            %s, %s, %s,
-            NOW(), NOW(), 1
-        )
-        """,
-        (
-            ROOT_NAME,
-            file["full_path"],
-            path_hash(file["full_path"]),
-            file["file_name"],
-            file["directory"],
-            file["size"],
-            file["mtime"],
-            file["content_hash"],
-        )
+        "INSERT INTO runs (started_at, status) VALUES (%s, 'RUNNING')",
+        (datetime.now(),)
    )
    return cur.lastrowid


-def update_file(cur, file_id, file):
+def finalize_run(cur, run_id: int, stats: dict):
+    """
+    Mark run `run_id` as 'COMPLETED' and store its counters.
+
+    stats: dict with the keys "total", "new", "modified", "deleted" and
+           "unchanged"; a missing key raises KeyError.
+    finished_at is set to the current local time. No commit is issued here.
+    """
    cur.execute(
-        """
-        UPDATE files
-        SET file_size = %s,
-            mtime = %s,
-            content_hash = %s,
-            last_seen = NOW(),
-            exists_now = 1
-        WHERE id = %s
-        """,
-        (
-            file["size"],
-            file["mtime"],
-            file["content_hash"],
-            file_id,
-        )
+        """UPDATE runs
+           SET finished_at = %s, status = 'COMPLETED',
+               files_total = %s, files_new = %s, files_modified = %s,
+               files_deleted = %s, files_unchanged = %s
+           WHERE id = %s""",
+        (datetime.now(), stats["total"], stats["new"], stats["modified"],
+         stats["deleted"], stats["unchanged"], run_id)
    )
+
+
+def fail_run(cur, run_id: int):
+    """Mark run `run_id` as 'FAILED' and set finished_at to the current time."""
+    finished_at = datetime.now()
+    sql = "UPDATE runs SET finished_at = %s, status = 'FAILED' WHERE id = %s"
+    cur.execute(sql, (finished_at, run_id))
+
+
+# ── Load DB state ──────────────────────────────────────────
+
+def load_all_files(cur) -> dict:
+    """
+    Load every file row with exists_now = 1 from the DB into memory.
+
+    Returns: {relative_path: {id, size, mtime, content_hash}}
+    """
+    query = """SELECT id, relative_path, file_size, mtime, content_hash
+           FROM files WHERE exists_now = 1"""
+    cur.execute(query)
+    # Each row is unpacked positionally, in SELECT column order.
+    return {
+        rel_path: {
+            "id": file_id,
+            "size": size,
+            "mtime": mtime,
+            "content_hash": content_hash,
+        }
+        for file_id, rel_path, size, mtime, content_hash in cur.fetchall()
+    }
+
+
+# ── Batch operations ────────────────────────────────────────
+
+def batch_insert_files(cur, files_list: list, run_id: int) -> dict:
+    """
+    Batch INSERT of new files, at most BATCH_SIZE rows per executemany() call.
+
+    files_list: [{relative_path, file_name, directory, size, mtime, content_hash}]
+    first_seen_run and last_seen_run are both set to run_id, exists_now to 1.
+    Returns: {relative_path: file_id}
+
+    NOTE(review): ids are computed as cur.lastrowid + position in the batch,
+    which assumes every executemany() call produces consecutive ids starting
+    at lastrowid — confirm against the driver and server configuration.
+    """
+    insert_sql = """INSERT INTO files
+               (relative_path, file_name, directory, file_size, mtime,
+                content_hash, first_seen_run, last_seen_run, exists_now)
+               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)"""
+    ids_by_path = {}
+    for start in range(0, len(files_list), BATCH_SIZE):
+        batch = files_list[start:start + BATCH_SIZE]
+        rows = [
+            (
+                entry["relative_path"], entry["file_name"], entry["directory"],
+                entry["size"], entry["mtime"], entry["content_hash"],
+                run_id, run_id,
+            )
+            for entry in batch
+        ]
+        cur.executemany(insert_sql, rows)
+        # pymysql executemany: lastrowid = first id in batch
+        base_id = cur.lastrowid
+        for offset, entry in enumerate(batch):
+            ids_by_path[entry["relative_path"]] = base_id + offset
+    return ids_by_path
+
+
+def batch_update_modified(cur, files_list: list, run_id: int):
+    """
+    Batch UPDATE of modified files, at most BATCH_SIZE rows per executemany() call.
+
+    files_list: [{id, size, mtime, content_hash}]
+    Every row gets the new size, mtime and hash, last_seen_run = run_id
+    and exists_now = 1.
+    """
+    update_sql = """UPDATE files
+               SET file_size = %s, mtime = %s, content_hash = %s,
+                   last_seen_run = %s, exists_now = 1
+               WHERE id = %s"""
+    for start in range(0, len(files_list), BATCH_SIZE):
+        stop = start + BATCH_SIZE
+        params = [
+            (entry["size"], entry["mtime"], entry["content_hash"], run_id, entry["id"])
+            for entry in files_list[start:stop]
+        ]
+        cur.executemany(update_sql, params)
+
+
+def batch_mark_deleted(cur, file_ids: list, run_id: int):
+    """Batch UPDATE of deleted files: exists_now = 0, last_seen_run = run_id."""
+    sql = "UPDATE files SET exists_now = 0, last_seen_run = %s WHERE id = %s"
+    for start in range(0, len(file_ids), BATCH_SIZE):
+        stop = start + BATCH_SIZE
+        params = [(run_id, file_id) for file_id in file_ids[start:stop]]
+        cur.executemany(sql, params)
+
+
+def batch_update_unchanged(cur, file_ids: list, run_id: int):
+    """Batch UPDATE of unchanged files: last_seen_run = run_id, exists_now = 1."""
+    sql = "UPDATE files SET last_seen_run = %s, exists_now = 1 WHERE id = %s"
+    for start in range(0, len(file_ids), BATCH_SIZE):
+        stop = start + BATCH_SIZE
+        params = [(run_id, file_id) for file_id in file_ids[start:stop]]
+        cur.executemany(sql, params)
@@ -1,19 +1,21 @@
-def log_event(cur, file_id, event_type, old=None, new=None):
-    cur.execute(
-        """
-        INSERT INTO file_events (
-            file_id, event_type, event_time,
-            old_size, new_size,
-            old_hash, new_hash
+from indexer.config import BATCH_SIZE
+
+
+def batch_log_events(cur, events: list):
+    """
+    Batch INSERT of events into file_events.
+
+    events: [{run_id, file_id, event_type, old_size, new_size, old_hash, new_hash}]
+    run_id, file_id and event_type are required keys; the other four are read
+    with dict.get() and passed as None when absent.
+    """
+    if not events:
+        return
+    for i in range(0, len(events), BATCH_SIZE):
+        # At most BATCH_SIZE events per executemany() call.
+        chunk = events[i:i + BATCH_SIZE]
+        cur.executemany(
+            """INSERT INTO file_events
+               (run_id, file_id, event_type, old_size, new_size, old_hash, new_hash)
+               VALUES (%s, %s, %s, %s, %s, %s, %s)""",
+            [(e["run_id"], e["file_id"], e["event_type"],
+              e.get("old_size"), e.get("new_size"),
+              e.get("old_hash"), e.get("new_hash"))
+             for e in chunk]
        )
-        VALUES (%s, %s, NOW(), %s, %s, %s, %s)
-        """,
-        (
-            file_id,
-            event_type,
-            old["size"] if old else None,
-            new["size"] if new else None,
-            old["content_hash"] if old else None,
-            new["content_hash"] if new else None,
-        )
-    )
@@ -1,21 +1,30 @@
 import os
 from datetime import datetime
-from indexer.hasher import blake3_file

-def scan_files(root_path):
+
+def scan_files(root_path: str) -> dict:
+    """
+    Walk the whole directory tree and return a dict of all files.
+    Does not hash file contents — that is done later, only for changed files.
+
+    Returns:
+        {relative_path: {full_path, file_name, directory, size, mtime}}
+    """
+    result = {}
    for root, _, files in os.walk(root_path):
        for name in files:
            full_path = os.path.join(root, name)
            try:
                stat = os.stat(full_path)
-            except FileNotFoundError:
+            except (FileNotFoundError, PermissionError):
+                # File vanished or cannot be stat-ed: leave it out of the result.
                continue
-
-            yield {
-                "full_path": full_path.replace("\\", "/"),
+            # Keys and "directory" are relative to root_path with "/" separators;
+            # files directly in root_path get directory ".".
+            rel_path = os.path.relpath(full_path, root_path).replace("\\", "/")
+            rel_dir = os.path.relpath(root, root_path).replace("\\", "/")
+            result[rel_path] = {
+                "full_path": full_path,
                "file_name": name,
-                "directory": root.replace("\\", "/"),
+                "directory": rel_dir,
                "size": stat.st_size,
                "mtime": datetime.fromtimestamp(stat.st_mtime),
-                "content_hash": blake3_file(full_path),
            }
+    return result