This commit is contained in:
2026-02-08 12:28:54 +01:00
parent dbc60ee42b
commit e7dd89962e
10 changed files with 249 additions and 1 deletion

2
.gitignore vendored
View File

@@ -35,7 +35,7 @@ Thumbs.db
# Secrets / config
# ===============================
.env
.env.*
.env
config.local.py
settings.local.py

BIN
README.md

Binary file not shown.

0
indexer/__init__.py Normal file
View File

32
indexer/config.py Normal file
View File

@@ -0,0 +1,32 @@
import os
from dotenv import load_dotenv

# Load the .env file from the project root so os.getenv sees its values.
load_dotenv()

# -------------------------------------------------------------------------
# Database connection settings (passed verbatim to pymysql.connect).
# -------------------------------------------------------------------------
DB_CONFIG = dict(
    host=os.getenv("DB_HOST"),
    port=int(os.getenv("DB_PORT", 3306)),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    database=os.getenv("DB_NAME"),
    charset="utf8mb4",
    autocommit=False,
)

# -------------------------------------------------------------------------
# Filesystem settings
# -------------------------------------------------------------------------
ROOT_PATH = os.getenv("ROOT_PATH")  # directory tree to index (may be unset)
ROOT_NAME = os.getenv("ROOT_NAME", "ORDINACE")  # logical root label stored in DB

# -------------------------------------------------------------------------
# Behaviour flags
# -------------------------------------------------------------------------
# Defaults to on ("1"): scan and report without writing anything to the DB.
DRY_RUN = os.getenv("DRY_RUN", "1") == "1"

91
indexer/db.py Normal file
View File

@@ -0,0 +1,91 @@
import pymysql
import hashlib
from indexer.config import DB_CONFIG, ROOT_NAME
def get_connection():
    """Open a fresh pymysql connection using the settings from indexer.config."""
    return pymysql.connect(**DB_CONFIG)
def preload_mark_all_missing():
    """Flag every indexed file as absent before a scan starts.

    Files the scanner re-discovers get flipped back to exists_now = 1;
    whatever is still 0 after the run has disappeared from disk.
    """
    conn = get_connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute("UPDATE files SET exists_now = 0")
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()
def path_hash(path: str) -> bytes:
    """Return the raw MD5 digest of *path*.

    Used purely as a fixed-width lookup key for the path column,
    not as a security hash.
    """
    digest = hashlib.md5()
    digest.update(path.encode("utf-8"))
    return digest.digest()
def find_file_by_path(cur, path_hash_bytes):
    """Look up a file row by its path hash.

    Returns the (id, file_size, mtime, content_hash) tuple, or None
    when the path has never been indexed.
    """
    query = """
        SELECT id, file_size, mtime, content_hash
        FROM files
        WHERE path_hash = %s
    """
    cur.execute(query, (path_hash_bytes,))
    return cur.fetchone()
def insert_file(cur, file):
    """Insert a newly discovered file and return its generated row id.

    `file` is the metadata dict produced by the scanner (full_path,
    file_name, directory, size, mtime, content_hash).  The path hash is
    recomputed here from full_path; first_seen/last_seen are both set to
    NOW() and the row starts out marked as present (exists_now = 1).
    """
    cur.execute(
        """
        INSERT INTO files (
            root_name, full_path, path_hash,
            file_name, directory,
            file_size, mtime, content_hash,
            first_seen, last_seen, exists_now
        )
        VALUES (
            %s, %s, %s,
            %s, %s,
            %s, %s, %s,
            NOW(), NOW(), 1
        )
        """,
        (
            ROOT_NAME,                      # configured logical root label
            file["full_path"],
            path_hash(file["full_path"]),   # MD5 lookup key for the path
            file["file_name"],
            file["directory"],
            file["size"],
            file["mtime"],
            file["content_hash"],
        )
    )
    return cur.lastrowid
def update_file(cur, file_id, file):
    """Refresh size/mtime/hash on an existing row and mark it still present."""
    sql = """
        UPDATE files
        SET file_size = %s,
            mtime = %s,
            content_hash = %s,
            last_seen = NOW(),
            exists_now = 1
        WHERE id = %s
    """
    params = (file["size"], file["mtime"], file["content_hash"], file_id)
    cur.execute(sql, params)

19
indexer/events.py Normal file
View File

@@ -0,0 +1,19 @@
def log_event(cur, file_id, event_type, old=None, new=None):
    """Append one row to file_events describing a change to a file.

    `old` and `new` are optional dicts carrying "size" and "content_hash";
    an absent side (e.g. no `old` for a CREATED event) is stored as NULL.
    """
    def _pick(state, key):
        # Missing state dict -> NULL column.
        return state[key] if state else None

    cur.execute(
        """
        INSERT INTO file_events (
            file_id, event_type, event_time,
            old_size, new_size,
            old_hash, new_hash
        )
        VALUES (%s, %s, NOW(), %s, %s, %s, %s)
        """,
        (
            file_id,
            event_type,
            _pick(old, "size"),
            _pick(new, "size"),
            _pick(old, "content_hash"),
            _pick(new, "content_hash"),
        )
    )

12
indexer/hasher.py Normal file
View File

@@ -0,0 +1,12 @@
from blake3 import blake3
def blake3_file(path, chunk_size=1024 * 1024):
    """Stream the file at *path* through BLAKE3 and return the raw digest.

    Reads in chunk_size blocks so arbitrarily large files are never
    loaded into memory all at once.
    """
    hasher = blake3()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                break
            hasher.update(block)
    return hasher.digest()

21
indexer/scanner.py Normal file
View File

@@ -0,0 +1,21 @@
import os
from datetime import datetime
from indexer.hasher import blake3_file
def scan_files(root_path):
    """Yield a metadata dict for every regular file under *root_path*.

    Each dict contains: full_path, file_name, directory (forward-slash
    normalised), size in bytes, mtime as a naive local datetime, and
    content_hash (BLAKE3 digest of the file contents).

    Files that vanish or become unreadable mid-scan are skipped rather
    than aborting the whole walk.
    """
    for root, _, files in os.walk(root_path):
        # Normalise Windows separators once per directory, not per file.
        directory = root.replace("\\", "/")
        for name in files:
            full_path = os.path.join(root, name)
            try:
                stat = os.stat(full_path)
                # Hash inside the same try: the file can disappear or be
                # locked between stat() and open().  The original caught
                # only FileNotFoundError on stat, so a PermissionError
                # (or a file deleted before hashing) killed a long scan.
                content_hash = blake3_file(full_path)
            except OSError:
                continue
            yield {
                "full_path": full_path.replace("\\", "/"),
                "file_name": name,
                "directory": directory,
                "size": stat.st_size,
                "mtime": datetime.fromtimestamp(stat.st_mtime),
                "content_hash": content_hash,
            }

73
main.py Normal file
View File

@@ -0,0 +1,73 @@
from indexer.config import ROOT_PATH, ROOT_NAME, DRY_RUN
from indexer.scanner import scan_files
from indexer.db import (
get_connection,
preload_mark_all_missing,
find_file_by_path,
insert_file,
update_file,
path_hash,
)
from indexer.events import log_event
def main():
    """Run one full indexing pass over ROOT_PATH and print a summary.

    In DRY_RUN mode the tree is scanned and counted but nothing is
    written to the database.  Otherwise every file on disk is inserted,
    updated, or refreshed, and all writes are committed in one batch at
    the end of the run.
    """
    print("=" * 60)
    print("ORDINACE DROPBOX BACKUP INDEXER")
    print(f"Root : {ROOT_PATH}")
    print(f"Name : {ROOT_NAME}")
    print(f"DRY RUN : {DRY_RUN}")
    print("=" * 60)

    # Fail fast with a clear message instead of letting os.walk(None)
    # raise an opaque TypeError deep inside the scan.
    if not ROOT_PATH:
        raise SystemExit("ROOT_PATH is not configured (set it in .env)")

    conn = get_connection()
    try:
        cur = conn.cursor()
        if not DRY_RUN:
            preload_mark_all_missing()
        created = modified = seen = 0
        for file in scan_files(ROOT_PATH):
            seen += 1
            ph = path_hash(file["full_path"])
            row = find_file_by_path(cur, ph)
            if row is None:
                # Never indexed before.
                created += 1
                if not DRY_RUN:
                    file_id = insert_file(cur, file)
                    log_event(cur, file_id, "CREATED", new=file)
            else:
                file_id, old_size, old_mtime, old_hash = row
                if old_size != file["size"] or old_hash != file["content_hash"]:
                    # Size or content hash differs -> file was modified.
                    modified += 1
                    if not DRY_RUN:
                        update_file(cur, file_id, file)
                        log_event(
                            cur,
                            file_id,
                            "MODIFIED",
                            old={"size": old_size, "content_hash": old_hash},
                            new=file,
                        )
                else:
                    # Unchanged: just refresh the liveness columns.
                    if not DRY_RUN:
                        cur.execute(
                            "UPDATE files SET last_seen = NOW(), exists_now = 1 WHERE id = %s",
                            (file_id,)
                        )
            if seen % 500 == 0:
                print(f"{seen} files scanned...")
        if not DRY_RUN:
            conn.commit()
        print("================================")
        print(f"Scanned : {seen}")
        print(f"Created : {created}")
        print(f"Modified : {modified}")
    finally:
        # Original closed the connection only on the success path,
        # leaking it on any exception mid-scan.
        conn.close()


if __name__ == "__main__":
    main()

0
requirements.txt Normal file
View File