This commit is contained in:
2026-02-09 20:16:37 +01:00
parent e7dd89962e
commit 9838164b88
9 changed files with 444 additions and 150 deletions

42
indexer/backup.py Normal file
View File

@@ -0,0 +1,42 @@
import os
import shutil
import tempfile
def blob_path(backup_root: str, content_hash: bytes) -> str:
    """Return the blob file path for a hash: BACKUP/ab/cd/abcdef...blob.

    The first two pairs of hex digits of the hash become two nested
    directory levels under ``backup_root``; the file name is the full hex
    digest followed by ``.blob``.
    """
    digest = content_hash.hex()
    level_one, level_two = digest[:2], digest[2:4]
    blob_name = digest + ".blob"
    return os.path.join(backup_root, level_one, level_two, blob_name)
def ensure_backed_up(files_with_hash: list, backup_root: str) -> int:
    """Copy files into content-addressable storage.

    Files whose blob already exists are skipped (deduplication). A failure
    on one file is reported with a warning and does not stop the remaining
    files from being processed.

    Args:
        files_with_hash: [(full_path, content_hash_bytes), ...]
        backup_root: root directory of the blob store.

    Returns:
        Number of newly backed-up files.
    """
    backed_up = 0
    for full_path, content_hash in files_with_hash:
        target = blob_path(backup_root, content_hash)
        if os.path.exists(target):
            continue
        target_dir = os.path.dirname(target)
        os.makedirs(target_dir, exist_ok=True)

        # Reset per file: if mkstemp itself fails there is no temp file, and
        # the handler must not touch an unbound name or a path left over
        # from a previous iteration.
        tmp_path = None
        try:
            # Atomic write: copy into a temp file in the target directory,
            # then rename it over the final name.
            fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
            os.close(fd)
            shutil.copy2(full_path, tmp_path)
            os.replace(tmp_path, target)
        except OSError as e:
            # FileNotFoundError and PermissionError are OSError subclasses.
            print(f" WARN: backup failed for {full_path}: {e}")
            # Clean up the temp file if one was created.
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except FileNotFoundError:
                    pass
                except OSError as cleanup_err:
                    # A stale temp file must not abort the whole backup run.
                    print(f" WARN: could not remove temp file {tmp_path}: {cleanup_err}")
            continue
        backed_up += 1
    return backed_up

View File

@@ -1,7 +1,6 @@
import os
from dotenv import load_dotenv
# load .env from the project root
load_dotenv()
# =========================
@@ -24,9 +23,11 @@ DB_CONFIG = {
# Read from the environment; os.getenv yields None when the variable is unset.
ROOT_PATH = os.getenv("ROOT_PATH")
# Falls back to "ORDINACE" when ROOT_NAME is not set.
ROOT_NAME = os.getenv("ROOT_NAME", "ORDINACE")
# NOTE(review): presumably the root of the backup blob store — confirm against callers.
BACKUP_PATH = os.getenv("BACKUP_PATH")
# =========================
# Behaviour
# =========================
# True when DRY_RUN is unset or exactly "1"; any other value makes it False.
DRY_RUN = os.getenv("DRY_RUN", "1") == "1"
# Defaults to 1000; a non-numeric BATCH_SIZE value raises ValueError at import time.
BATCH_SIZE = int(os.getenv("BATCH_SIZE", 1000))

View File

@@ -1,91 +1,123 @@
import pymysql
import hashlib
from indexer.config import DB_CONFIG, ROOT_NAME
from datetime import datetime
from indexer.config import DB_CONFIG, BATCH_SIZE
def get_connection():
    """Open and return a new pymysql connection configured by DB_CONFIG."""
    connection = pymysql.connect(**DB_CONFIG)
    return connection
def preload_mark_all_missing():
    """Mark every row in ``files`` as not existing (exists_now = 0).

    Intended for the start of a run: files that the scanner finds again
    are expected to be switched back to exists_now = 1 afterwards.
    Opens its own connection, commits the update, and always closes the
    connection.
    """
    reset_sql = "UPDATE files SET exists_now = 0"
    db = get_connection()
    try:
        with db.cursor() as cursor:
            cursor.execute(reset_sql)
        db.commit()
    finally:
        db.close()
# ── Run management ──────────────────────────────────────────
def path_hash(path: str) -> bytes:
    """Return the MD5 digest of the UTF-8 encoded path.

    The digest serves only as an identifier, not as a security hash.
    """
    hasher = hashlib.md5()
    hasher.update(path.encode("utf-8"))
    return hasher.digest()
def find_file_by_path(cur, path_hash_bytes):
def create_run(cur) -> int:
cur.execute(
"""
SELECT id, file_size, mtime, content_hash
FROM files
WHERE path_hash = %s
""",
(path_hash_bytes,)
)
return cur.fetchone()
def insert_file(cur, file):
cur.execute(
"""
INSERT INTO files (
root_name, full_path, path_hash,
file_name, directory,
file_size, mtime, content_hash,
first_seen, last_seen, exists_now
)
VALUES (
%s, %s, %s,
%s, %s,
%s, %s, %s,
NOW(), NOW(), 1
)
""",
(
ROOT_NAME,
file["full_path"],
path_hash(file["full_path"]),
file["file_name"],
file["directory"],
file["size"],
file["mtime"],
file["content_hash"],
)
"INSERT INTO runs (started_at, status) VALUES (%s, 'RUNNING')",
(datetime.now(),)
)
return cur.lastrowid
def update_file(cur, file_id, file):
def finalize_run(cur, run_id: int, stats: dict):
cur.execute(
"""
UPDATE files
SET file_size = %s,
mtime = %s,
content_hash = %s,
last_seen = NOW(),
exists_now = 1
WHERE id = %s
""",
(
file["size"],
file["mtime"],
file["content_hash"],
file_id,
)
"""UPDATE runs
SET finished_at = %s, status = 'COMPLETED',
files_total = %s, files_new = %s, files_modified = %s,
files_deleted = %s, files_unchanged = %s
WHERE id = %s""",
(datetime.now(), stats["total"], stats["new"], stats["modified"],
stats["deleted"], stats["unchanged"], run_id)
)
def fail_run(cur, run_id: int):
    """Set the run's status to FAILED and finished_at to the current local time.

    Only issues the UPDATE on ``cur``; committing is left to the caller.
    """
    finished_at = datetime.now()
    sql = "UPDATE runs SET finished_at = %s, status = 'FAILED' WHERE id = %s"
    cur.execute(sql, (finished_at, run_id))
# ── Load DB state ──────────────────────────────────────────
def load_all_files(cur) -> dict:
    """Load all currently existing files (exists_now = 1) from the DB into RAM.

    Rows are unpacked positionally, so ``cur`` must return tuple-like rows.

    Returns:
        {relative_path: {id, size, mtime, content_hash}}
    """
    cur.execute(
        """SELECT id, relative_path, file_size, mtime, content_hash
           FROM files WHERE exists_now = 1"""
    )
    return {
        rel_path: {
            "id": file_id,
            "size": size,
            "mtime": mtime,
            "content_hash": content_hash,
        }
        for file_id, rel_path, size, mtime, content_hash in cur.fetchall()
    }
# ── Batch operations ────────────────────────────────────────
def batch_insert_files(cur, files_list: list, run_id: int) -> dict:
    """Batch INSERT of new files.

    Ids are read back from the table rather than derived from
    ``cur.lastrowid``. After ``executemany`` that value is not a dependable
    "first id of the batch": PyMySQL falls back to one statement per row
    unless the VALUES tuple consists solely of placeholders (lastrowid is
    then the *last* row's id), and a multi-row statement may be split when
    it grows too large. Adding an offset also assumes an auto-increment
    step of 1.

    Args:
        cur: DB cursor returning tuple rows.
        files_list: [{relative_path, file_name, directory, size, mtime, content_hash}]
        run_id: id of the current run, stored as first/last seen run.

    Returns:
        {relative_path: file_id}
    """
    path_to_id = {}
    for start in range(0, len(files_list), BATCH_SIZE):
        chunk = files_list[start:start + BATCH_SIZE]
        # exists_now is passed as a parameter (always 1) so the VALUES tuple
        # contains only placeholders and the driver can send one multi-row
        # INSERT instead of one statement per row.
        cur.executemany(
            """INSERT INTO files
               (relative_path, file_name, directory, file_size, mtime,
                content_hash, first_seen_run, last_seen_run, exists_now)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            [(f["relative_path"], f["file_name"], f["directory"],
              f["size"], f["mtime"], f["content_hash"], run_id, run_id, 1)
             for f in chunk]
        )
        # Fetch the ids actually assigned to the rows of this chunk.
        # Only placeholders are interpolated into the SQL text.
        placeholders = ", ".join(["%s"] * len(chunk))
        cur.execute(
            "SELECT id, relative_path FROM files "
            "WHERE first_seen_run = %s "
            f"AND relative_path IN ({placeholders})",
            [run_id] + [f["relative_path"] for f in chunk],
        )
        for file_id, rel_path in cur.fetchall():
            path_to_id[rel_path] = file_id
    return path_to_id
def batch_update_modified(cur, files_list: list, run_id: int):
    """Batch UPDATE of modified files.

    Writes the new size, mtime and content hash, stamps ``last_seen_run``
    with ``run_id`` and sets exists_now = 1, BATCH_SIZE rows per call.

    files_list: [{id, size, mtime, content_hash}]
    """
    update_sql = """UPDATE files
                    SET file_size = %s, mtime = %s, content_hash = %s,
                        last_seen_run = %s, exists_now = 1
                    WHERE id = %s"""
    for start in range(0, len(files_list), BATCH_SIZE):
        rows = [
            (entry["size"], entry["mtime"], entry["content_hash"], run_id, entry["id"])
            for entry in files_list[start:start + BATCH_SIZE]
        ]
        cur.executemany(update_sql, rows)
def batch_mark_deleted(cur, file_ids: list, run_id: int):
    """Batch UPDATE of deleted files: exists_now = 0, last_seen_run = run_id."""
    delete_sql = "UPDATE files SET exists_now = 0, last_seen_run = %s WHERE id = %s"
    for start in range(0, len(file_ids), BATCH_SIZE):
        batch_ids = file_ids[start:start + BATCH_SIZE]
        params = [(run_id, file_id) for file_id in batch_ids]
        cur.executemany(delete_sql, params)
def batch_update_unchanged(cur, file_ids: list, run_id: int):
    """Batch UPDATE of unchanged files: refresh last_seen_run, keep exists_now = 1."""
    touch_sql = "UPDATE files SET last_seen_run = %s, exists_now = 1 WHERE id = %s"
    for start in range(0, len(file_ids), BATCH_SIZE):
        batch_ids = file_ids[start:start + BATCH_SIZE]
        params = [(run_id, file_id) for file_id in batch_ids]
        cur.executemany(touch_sql, params)

View File

@@ -1,19 +1,21 @@
def log_event(cur, file_id, event_type, old=None, new=None):
cur.execute(
"""
INSERT INTO file_events (
file_id, event_type, event_time,
old_size, new_size,
old_hash, new_hash
from indexer.config import BATCH_SIZE
def batch_log_events(cur, events: list):
    """Batch INSERT of events into file_events.

    Missing optional keys (old_size, new_size, old_hash, new_hash) are
    inserted as NULL. Does nothing for an empty list.

    events: [{run_id, file_id, event_type, old_size, new_size, old_hash, new_hash}]
    """
    if not events:
        return
    insert_sql = """INSERT INTO file_events
                    (run_id, file_id, event_type, old_size, new_size, old_hash, new_hash)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)"""
    for offset in range(0, len(events), BATCH_SIZE):
        params = []
        for event in events[offset:offset + BATCH_SIZE]:
            params.append((
                event["run_id"],
                event["file_id"],
                event["event_type"],
                event.get("old_size"),
                event.get("new_size"),
                event.get("old_hash"),
                event.get("new_hash"),
            ))
        cur.executemany(insert_sql, params)
VALUES (%s, %s, NOW(), %s, %s, %s, %s)
""",
(
file_id,
event_type,
old["size"] if old else None,
new["size"] if new else None,
old["content_hash"] if old else None,
new["content_hash"] if new else None,
)
)

View File

@@ -1,21 +1,30 @@
import os
from datetime import datetime
from indexer.hasher import blake3_file
def scan_files(root_path):
def scan_files(root_path: str) -> dict:
"""
Projde celý adresářový strom a vrátí dict všech souborů.
Nehasuje obsah — to se dělá až pro změněné soubory.
Returns:
{relative_path: {full_path, file_name, directory, size, mtime}}
"""
result = {}
for root, _, files in os.walk(root_path):
for name in files:
full_path = os.path.join(root, name)
try:
stat = os.stat(full_path)
except FileNotFoundError:
except (FileNotFoundError, PermissionError):
continue
yield {
"full_path": full_path.replace("\\", "/"),
rel_path = os.path.relpath(full_path, root_path).replace("\\", "/")
rel_dir = os.path.relpath(root, root_path).replace("\\", "/")
result[rel_path] = {
"full_path": full_path,
"file_name": name,
"directory": root.replace("\\", "/"),
"directory": rel_dir,
"size": stat.st_size,
"mtime": datetime.fromtimestamp(stat.st_mtime),
"content_hash": blake3_file(full_path),
}
return result