This commit is contained in:
2026-02-08 12:28:54 +01:00
parent dbc60ee42b
commit e7dd89962e
10 changed files with 249 additions and 1 deletion

2
.gitignore vendored
View File

@@ -35,7 +35,7 @@ Thumbs.db
# Secrets / config
# ===============================
.env
.env.*
.env
config.local.py
settings.local.py

BIN
README.md

Binary file not shown.

0
indexer/__init__.py Normal file
View File

32
indexer/config.py Normal file
View File

@@ -0,0 +1,32 @@
import os
from dotenv import load_dotenv

# Load the .env file from the project root so os.getenv sees its values.
load_dotenv()

# -------------------------------------------------------------------------
# Database connection settings (passed verbatim to pymysql.connect).
# -------------------------------------------------------------------------
DB_CONFIG = dict(
    host=os.getenv("DB_HOST"),
    port=int(os.getenv("DB_PORT", 3306)),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    database=os.getenv("DB_NAME"),
    charset="utf8mb4",
    autocommit=False,
)

# -------------------------------------------------------------------------
# Filesystem settings
# -------------------------------------------------------------------------
ROOT_PATH = os.getenv("ROOT_PATH")  # directory tree to index (may be unset)
ROOT_NAME = os.getenv("ROOT_NAME", "ORDINACE")  # logical root label stored in DB

# -------------------------------------------------------------------------
# Behaviour flags
# -------------------------------------------------------------------------
# Defaults to on ("1"): scan and report without writing anything to the DB.
DRY_RUN = os.getenv("DRY_RUN", "1") == "1"

91
indexer/db.py Normal file
View File

@@ -0,0 +1,91 @@
import pymysql
import hashlib
from indexer.config import DB_CONFIG, ROOT_NAME
def get_connection():
    """Open a fresh pymysql connection using the settings from indexer.config."""
    return pymysql.connect(**DB_CONFIG)
def preload_mark_all_missing():
    """Flag every indexed file as absent before a scan starts.

    Files the scanner re-discovers get flipped back to exists_now = 1;
    whatever is still 0 after the run has disappeared from disk.
    """
    conn = get_connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute("UPDATE files SET exists_now = 0")
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()
def path_hash(path: str) -> bytes:
    """Return the raw MD5 digest of *path*.

    Used purely as a fixed-width lookup key for the path column,
    not as a security hash.
    """
    digest = hashlib.md5()
    digest.update(path.encode("utf-8"))
    return digest.digest()
def find_file_by_path(cur, path_hash_bytes):
    """Look up a file row by its path hash.

    Returns the (id, file_size, mtime, content_hash) tuple, or None
    when the path has never been indexed.
    """
    query = """
        SELECT id, file_size, mtime, content_hash
        FROM files
        WHERE path_hash = %s
    """
    cur.execute(query, (path_hash_bytes,))
    return cur.fetchone()
def insert_file(cur, file):
    """Insert a newly discovered file and return its generated row id.

    `file` is the metadata dict produced by the scanner (full_path,
    file_name, directory, size, mtime, content_hash).  The path hash is
    recomputed here from full_path; first_seen/last_seen are both set to
    NOW() and the row starts out marked as present (exists_now = 1).
    """
    cur.execute(
        """
        INSERT INTO files (
            root_name, full_path, path_hash,
            file_name, directory,
            file_size, mtime, content_hash,
            first_seen, last_seen, exists_now
        )
        VALUES (
            %s, %s, %s,
            %s, %s,
            %s, %s, %s,
            NOW(), NOW(), 1
        )
        """,
        (
            ROOT_NAME,                      # configured logical root label
            file["full_path"],
            path_hash(file["full_path"]),   # MD5 lookup key for the path
            file["file_name"],
            file["directory"],
            file["size"],
            file["mtime"],
            file["content_hash"],
        )
    )
    return cur.lastrowid
def update_file(cur, file_id, file):
    """Refresh size/mtime/hash on an existing row and mark it still present."""
    sql = """
        UPDATE files
        SET file_size = %s,
            mtime = %s,
            content_hash = %s,
            last_seen = NOW(),
            exists_now = 1
        WHERE id = %s
    """
    params = (file["size"], file["mtime"], file["content_hash"], file_id)
    cur.execute(sql, params)

19
indexer/events.py Normal file
View File

@@ -0,0 +1,19 @@
def log_event(cur, file_id, event_type, old=None, new=None):
    """Append one row to file_events describing a change to a file.

    `old` and `new` are optional dicts carrying "size" and "content_hash";
    an absent side (e.g. no `old` for a CREATED event) is stored as NULL.
    """
    def _pick(state, key):
        # Missing state dict -> NULL column.
        return state[key] if state else None

    cur.execute(
        """
        INSERT INTO file_events (
            file_id, event_type, event_time,
            old_size, new_size,
            old_hash, new_hash
        )
        VALUES (%s, %s, NOW(), %s, %s, %s, %s)
        """,
        (
            file_id,
            event_type,
            _pick(old, "size"),
            _pick(new, "size"),
            _pick(old, "content_hash"),
            _pick(new, "content_hash"),
        )
    )

12
indexer/hasher.py Normal file
View File

@@ -0,0 +1,12 @@
from blake3 import blake3
def blake3_file(path, chunk_size=1024 * 1024):
    """Stream the file at *path* through BLAKE3 and return the raw digest.

    Reads in chunk_size blocks so arbitrarily large files are never
    loaded into memory all at once.
    """
    hasher = blake3()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                break
            hasher.update(block)
    return hasher.digest()

21
indexer/scanner.py Normal file
View File

@@ -0,0 +1,21 @@
import os
from datetime import datetime
from indexer.hasher import blake3_file
def scan_files(root_path):
    """Yield a metadata dict for every regular file under *root_path*.

    Each dict contains: full_path, file_name, directory (forward-slash
    normalised), size in bytes, mtime as a naive local datetime, and
    content_hash (BLAKE3 digest of the file contents).

    Files that vanish or become unreadable mid-scan are skipped rather
    than aborting the whole walk.
    """
    for root, _, files in os.walk(root_path):
        # Normalise Windows separators once per directory, not per file.
        directory = root.replace("\\", "/")
        for name in files:
            full_path = os.path.join(root, name)
            try:
                stat = os.stat(full_path)
                # Hash inside the same try: the file can disappear or be
                # locked between stat() and open().  The original caught
                # only FileNotFoundError on stat, so a PermissionError
                # (or a file deleted before hashing) killed a long scan.
                content_hash = blake3_file(full_path)
            except OSError:
                continue
            yield {
                "full_path": full_path.replace("\\", "/"),
                "file_name": name,
                "directory": directory,
                "size": stat.st_size,
                "mtime": datetime.fromtimestamp(stat.st_mtime),
                "content_hash": content_hash,
            }

73
main.py Normal file
View File

@@ -0,0 +1,73 @@
from indexer.config import ROOT_PATH, ROOT_NAME, DRY_RUN
from indexer.scanner import scan_files
from indexer.db import (
get_connection,
preload_mark_all_missing,
find_file_by_path,
insert_file,
update_file,
path_hash,
)
from indexer.events import log_event
def main():
    """Run one full indexing pass over ROOT_PATH and print a summary.

    In DRY_RUN mode the tree is scanned and counted but nothing is
    written to the database.  Otherwise every file on disk is inserted,
    updated, or refreshed, and all writes are committed in one batch at
    the end of the run.
    """
    print("=" * 60)
    print("ORDINACE DROPBOX BACKUP INDEXER")
    print(f"Root : {ROOT_PATH}")
    print(f"Name : {ROOT_NAME}")
    print(f"DRY RUN : {DRY_RUN}")
    print("=" * 60)

    # Fail fast with a clear message instead of letting os.walk(None)
    # raise an opaque TypeError deep inside the scan.
    if not ROOT_PATH:
        raise SystemExit("ROOT_PATH is not configured (set it in .env)")

    conn = get_connection()
    try:
        cur = conn.cursor()
        if not DRY_RUN:
            preload_mark_all_missing()
        created = modified = seen = 0
        for file in scan_files(ROOT_PATH):
            seen += 1
            ph = path_hash(file["full_path"])
            row = find_file_by_path(cur, ph)
            if row is None:
                # Never indexed before.
                created += 1
                if not DRY_RUN:
                    file_id = insert_file(cur, file)
                    log_event(cur, file_id, "CREATED", new=file)
            else:
                file_id, old_size, old_mtime, old_hash = row
                if old_size != file["size"] or old_hash != file["content_hash"]:
                    # Size or content hash differs -> file was modified.
                    modified += 1
                    if not DRY_RUN:
                        update_file(cur, file_id, file)
                        log_event(
                            cur,
                            file_id,
                            "MODIFIED",
                            old={"size": old_size, "content_hash": old_hash},
                            new=file,
                        )
                else:
                    # Unchanged: just refresh the liveness columns.
                    if not DRY_RUN:
                        cur.execute(
                            "UPDATE files SET last_seen = NOW(), exists_now = 1 WHERE id = %s",
                            (file_id,)
                        )
            if seen % 500 == 0:
                print(f"{seen} files scanned...")
        if not DRY_RUN:
            conn.commit()
        print("================================")
        print(f"Scanned : {seen}")
        print(f"Created : {created}")
        print(f"Modified : {modified}")
    finally:
        # Original closed the connection only on the success path,
        # leaking it on any exception mid-scan.
        conn.close()


if __name__ == "__main__":
    main()

0
requirements.txt Normal file
View File