From e7dd89962e4ed0091987a6931905384798e52415 Mon Sep 17 00:00:00 2001
From: Vladimir Buzalka
Date: Sun, 8 Feb 2026 12:28:54 +0100
Subject: [PATCH] notebook

---

Review notes (text between the "---" separator and the first diff header
is not part of the commit):

  * This patch file had lost its line breaks and indentation. The line
    structure below was restored from the syntax and checked against the
    hunk headers (32 + 91 + 19 + 12 + 21 + 73 added lines, plus 1 in
    .gitignore = 249 insertions). The payload text itself is unchanged.
    Runs of spaces inside string literals and the exact nesting of
    conn.commit() in preload_mark_all_missing() could not be recovered
    and are a best-effort reconstruction.
  * .gitignore: the hunk replaces ".env.*" with a second ".env" entry, so
    files such as ".env.local" stop being ignored unless another rule
    outside this hunk covers them. This looks unintentional.
  * requirements.txt is created empty, while the new modules import
    dotenv, pymysql and blake3.
  * main.py: preload_mark_all_missing() commits "exists_now = 0" on its
    own connection before the scan starts, whereas the scan's writes are
    committed once at the very end. An exception during the scan leaves
    every row marked as missing, and conn.close() is skipped as well.
  * indexer/scanner.py: only FileNotFoundError raised by os.stat() is
    handled; blake3_file() opens the file afterwards and any error it
    raises aborts the whole scan. Every file is hashed on every run.
  * main.py: old_mtime is unpacked from the row but never used.

 .gitignore          |  2 +-
 README.md           | Bin 56 -> 0 bytes
 indexer/__init__.py |  0
 indexer/config.py   | 32 ++++++++++++++++
 indexer/db.py       | 91 ++++++++++++++++++++++++++++++++++++++++++++
 indexer/events.py   | 19 +++++++++
 indexer/hasher.py   | 12 ++++++
 indexer/scanner.py  | 21 ++++++++++
 main.py             | 73 +++++++++++++++++++++++++++++++++++
 requirements.txt    |  0
 10 files changed, 249 insertions(+), 1 deletion(-)
 create mode 100644 indexer/__init__.py
 create mode 100644 indexer/config.py
 create mode 100644 indexer/db.py
 create mode 100644 indexer/events.py
 create mode 100644 indexer/hasher.py
 create mode 100644 indexer/scanner.py
 create mode 100644 main.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
index e96aea0..9e91df7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,7 +35,7 @@ Thumbs.db
 # Secrets / config
 # ===============================
 .env
-.env.*
+.env
 config.local.py
 settings.local.py
 
diff --git a/README.md b/README.md
index ac453e0e227e561dd3748cf77b687137f04180e5..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
GIT binary patch
literal 0
HcmV?d00001

literal 56
zcmezWPnki1!G)oSA)lduAqhxU0QvqvehNb-LmopSLo!1uP{aw$%4R49s^VqfVgLYb
C#R~BN

diff --git a/indexer/__init__.py b/indexer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/indexer/config.py b/indexer/config.py
new file mode 100644
index 0000000..002a19e
--- /dev/null
+++ b/indexer/config.py
@@ -0,0 +1,32 @@
+import os
+from dotenv import load_dotenv
+
+# načti .env z rootu projektu
+load_dotenv()
+
+# =========================
+# Database
+# =========================
+
+DB_CONFIG = {
+    "host": os.getenv("DB_HOST"),
+    "port": int(os.getenv("DB_PORT", 3306)),
+    "user": os.getenv("DB_USER"),
+    "password": os.getenv("DB_PASSWORD"),
+    "database": os.getenv("DB_NAME"),
+    "charset": "utf8mb4",
+    "autocommit": False,
+}
+
+# =========================
+# Filesystem
+# =========================
+
+ROOT_PATH = os.getenv("ROOT_PATH")
+ROOT_NAME = os.getenv("ROOT_NAME", "ORDINACE")
+
+# =========================
+# Behaviour
+# =========================
+
+DRY_RUN = os.getenv("DRY_RUN", "1") == "1"
diff --git a/indexer/db.py b/indexer/db.py
new file mode 100644
index 0000000..9449c2a
--- /dev/null
+++ b/indexer/db.py
@@ -0,0 +1,91 @@
+import pymysql
+import hashlib
+from indexer.config import DB_CONFIG, ROOT_NAME
+
+
+def get_connection():
+    return pymysql.connect(**DB_CONFIG)
+
+
+def preload_mark_all_missing():
+    """
+    Na začátku běhu:
+    označí všechny soubory jako neexistující.
+    Ty, které skener znovu najde, se přepnou zpět na exists_now = 1.
+    """
+    conn = get_connection()
+    try:
+        with conn.cursor() as cur:
+            cur.execute("UPDATE files SET exists_now = 0")
+        conn.commit()
+    finally:
+        conn.close()
+
+
+def path_hash(path: str) -> bytes:
+    """
+    MD5 hash cesty – pouze identifikátor, ne bezpečnostní hash
+    """
+    return hashlib.md5(path.encode("utf-8")).digest()
+
+
+def find_file_by_path(cur, path_hash_bytes):
+    cur.execute(
+        """
+        SELECT id, file_size, mtime, content_hash
+        FROM files
+        WHERE path_hash = %s
+        """,
+        (path_hash_bytes,)
+    )
+    return cur.fetchone()
+
+
+def insert_file(cur, file):
+    cur.execute(
+        """
+        INSERT INTO files (
+            root_name, full_path, path_hash,
+            file_name, directory,
+            file_size, mtime, content_hash,
+            first_seen, last_seen, exists_now
+        )
+        VALUES (
+            %s, %s, %s,
+            %s, %s,
+            %s, %s, %s,
+            NOW(), NOW(), 1
+        )
+        """,
+        (
+            ROOT_NAME,
+            file["full_path"],
+            path_hash(file["full_path"]),
+            file["file_name"],
+            file["directory"],
+            file["size"],
+            file["mtime"],
+            file["content_hash"],
+        )
+    )
+    return cur.lastrowid
+
+
+def update_file(cur, file_id, file):
+    cur.execute(
+        """
+        UPDATE files
+        SET file_size = %s,
+            mtime = %s,
+            content_hash = %s,
+            last_seen = NOW(),
+            exists_now = 1
+        WHERE id = %s
+        """,
+        (
+            file["size"],
+            file["mtime"],
+            file["content_hash"],
+            file_id,
+        )
+    )
diff --git a/indexer/events.py b/indexer/events.py
new file mode 100644
index 0000000..a024d01
--- /dev/null
+++ b/indexer/events.py
@@ -0,0 +1,19 @@
+def log_event(cur, file_id, event_type, old=None, new=None):
+    cur.execute(
+        """
+        INSERT INTO file_events (
+            file_id, event_type, event_time,
+            old_size, new_size,
+            old_hash, new_hash
+        )
+        VALUES (%s, %s, NOW(), %s, %s, %s, %s)
+        """,
+        (
+            file_id,
+            event_type,
+            old["size"] if old else None,
+            new["size"] if new else None,
+            old["content_hash"] if old else None,
+            new["content_hash"] if new else None,
+        )
+    )
diff --git a/indexer/hasher.py b/indexer/hasher.py
new file mode 100644
index 0000000..29235d2
--- /dev/null
+++ b/indexer/hasher.py
@@ -0,0 +1,12 @@
+from blake3 import blake3
+
+
+def blake3_file(path, chunk_size=1024 * 1024):
+    """
+    Spočítá BLAKE3 hash souboru po blocích (bez načtení do paměti)
+    """
+    h = blake3()
+    with open(path, "rb") as f:
+        for chunk in iter(lambda: f.read(chunk_size), b""):
+            h.update(chunk)
+    return h.digest()
diff --git a/indexer/scanner.py b/indexer/scanner.py
new file mode 100644
index 0000000..b90066f
--- /dev/null
+++ b/indexer/scanner.py
@@ -0,0 +1,21 @@
+import os
+from datetime import datetime
+from indexer.hasher import blake3_file
+
+def scan_files(root_path):
+    for root, _, files in os.walk(root_path):
+        for name in files:
+            full_path = os.path.join(root, name)
+            try:
+                stat = os.stat(full_path)
+            except FileNotFoundError:
+                continue
+
+            yield {
+                "full_path": full_path.replace("\\", "/"),
+                "file_name": name,
+                "directory": root.replace("\\", "/"),
+                "size": stat.st_size,
+                "mtime": datetime.fromtimestamp(stat.st_mtime),
+                "content_hash": blake3_file(full_path),
+            }
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..c4b285c
--- /dev/null
+++ b/main.py
@@ -0,0 +1,73 @@
+from indexer.config import ROOT_PATH, ROOT_NAME, DRY_RUN
+from indexer.scanner import scan_files
+from indexer.db import (
+    get_connection,
+    preload_mark_all_missing,
+    find_file_by_path,
+    insert_file,
+    update_file,
+    path_hash,
+)
+from indexer.events import log_event
+
+def main():
+    print("=" * 60)
+    print("ORDINACE DROPBOX BACKUP – INDEXER")
+    print(f"Root : {ROOT_PATH}")
+    print(f"Name : {ROOT_NAME}")
+    print(f"DRY RUN : {DRY_RUN}")
+    print("=" * 60)
+
+    conn = get_connection()
+    cur = conn.cursor()
+
+    if not DRY_RUN:
+        preload_mark_all_missing()
+
+    created = modified = seen = 0
+
+    for file in scan_files(ROOT_PATH):
+        seen += 1
+        ph = path_hash(file["full_path"])
+        row = find_file_by_path(cur, ph)
+
+        if row is None:
+            created += 1
+            if not DRY_RUN:
+                file_id = insert_file(cur, file)
+                log_event(cur, file_id, "CREATED", new=file)
+        else:
+            file_id, old_size, old_mtime, old_hash = row
+            if old_size != file["size"] or old_hash != file["content_hash"]:
+                modified += 1
+                if not DRY_RUN:
+                    update_file(cur, file_id, file)
+                    log_event(
+                        cur,
+                        file_id,
+                        "MODIFIED",
+                        old={"size": old_size, "content_hash": old_hash},
+                        new=file,
+                    )
+            else:
+                if not DRY_RUN:
+                    cur.execute(
+                        "UPDATE files SET last_seen = NOW(), exists_now = 1 WHERE id = %s",
+                        (file_id,)
+                    )
+
+        if seen % 500 == 0:
+            print(f"{seen} files scanned...")
+
+    if not DRY_RUN:
+        conn.commit()
+
+    print("================================")
+    print(f"Scanned : {seen}")
+    print(f"Created : {created}")
+    print(f"Modified : {modified}")
+
+    conn.close()
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e69de29