notebook
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -35,7 +35,7 @@ Thumbs.db
|
||||
# Secrets / config
|
||||
# ===============================
|
||||
.env
|
||||
.env.*
|
||||
.env
|
||||
config.local.py
|
||||
settings.local.py
|
||||
|
||||
|
||||
0
indexer/__init__.py
Normal file
0
indexer/__init__.py
Normal file
32
indexer/config.py
Normal file
32
indexer/config.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import os

from dotenv import load_dotenv

# Load .env from the project root so the getenv() calls below see it.
load_dotenv()

# =========================
# Database
# =========================

# Connection settings consumed as pymysql.connect(**DB_CONFIG).
DB_CONFIG = {
    "host": os.getenv("DB_HOST"),
    "port": int(os.getenv("DB_PORT", "3306")),
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "database": os.getenv("DB_NAME"),
    "charset": "utf8mb4",
    "autocommit": False,  # callers commit explicitly
}

# =========================
# Filesystem
# =========================

# Directory tree to index, and the logical root name stored with each row.
ROOT_PATH = os.getenv("ROOT_PATH")
ROOT_NAME = os.getenv("ROOT_NAME", "ORDINACE")

# =========================
# Behaviour
# =========================

# When "1" (the default), scan and report but write nothing to the DB.
DRY_RUN = os.getenv("DRY_RUN", "1") == "1"
|
||||
91
indexer/db.py
Normal file
91
indexer/db.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import pymysql
|
||||
import hashlib
|
||||
from indexer.config import DB_CONFIG, ROOT_NAME
|
||||
|
||||
|
||||
def get_connection():
    """Open and return a fresh pymysql connection built from DB_CONFIG."""
    conn = pymysql.connect(**DB_CONFIG)
    return conn
|
||||
|
||||
|
||||
def preload_mark_all_missing():
    """Flag every indexed file as missing before a scan starts.

    Files the scanner re-discovers are switched back to exists_now = 1;
    whatever is still 0 after the run no longer exists on disk.
    Uses its own short-lived connection and commits immediately.
    """
    conn = get_connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute("UPDATE files SET exists_now = 0")
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def path_hash(path: str) -> bytes:
    """Return the 16-byte MD5 digest of *path*.

    Used purely as a fixed-width lookup key for the path, not as a
    security hash.
    """
    digest = hashlib.md5(path.encode("utf-8"))
    return digest.digest()
|
||||
|
||||
|
||||
def find_file_by_path(cur, path_hash_bytes):
    """Look up a file row by its path hash.

    Returns the (id, file_size, mtime, content_hash) tuple, or None when
    the path has never been indexed.
    """
    sql = """
        SELECT id, file_size, mtime, content_hash
        FROM files
        WHERE path_hash = %s
        """
    cur.execute(sql, (path_hash_bytes,))
    return cur.fetchone()
|
||||
|
||||
|
||||
def insert_file(cur, file):
    """Insert a newly discovered file and return the new row id.

    *file* is a metadata dict as produced by the scanner (full_path,
    file_name, directory, size, mtime, content_hash).
    """
    params = (
        ROOT_NAME,
        file["full_path"],
        path_hash(file["full_path"]),
        file["file_name"],
        file["directory"],
        file["size"],
        file["mtime"],
        file["content_hash"],
    )
    cur.execute(
        """
        INSERT INTO files (
            root_name, full_path, path_hash,
            file_name, directory,
            file_size, mtime, content_hash,
            first_seen, last_seen, exists_now
        )
        VALUES (
            %s, %s, %s,
            %s, %s,
            %s, %s, %s,
            NOW(), NOW(), 1
        )
        """,
        params,
    )
    return cur.lastrowid
|
||||
|
||||
|
||||
def update_file(cur, file_id, file):
    """Refresh size/mtime/hash of an existing row and mark it as present."""
    params = (
        file["size"],
        file["mtime"],
        file["content_hash"],
        file_id,
    )
    cur.execute(
        """
        UPDATE files
        SET file_size = %s,
            mtime = %s,
            content_hash = %s,
            last_seen = NOW(),
            exists_now = 1
        WHERE id = %s
        """,
        params,
    )
|
||||
19
indexer/events.py
Normal file
19
indexer/events.py
Normal file
@@ -0,0 +1,19 @@
|
||||
def log_event(cur, file_id, event_type, old=None, new=None):
    """Append a row to file_events for a file change.

    *old* / *new* are optional metadata dicts with "size" and
    "content_hash"; a missing side (e.g. no old state for a create)
    is stored as NULL.
    """
    def _field(state, key):
        return state[key] if state else None

    cur.execute(
        """
        INSERT INTO file_events (
            file_id, event_type, event_time,
            old_size, new_size,
            old_hash, new_hash
        )
        VALUES (%s, %s, NOW(), %s, %s, %s, %s)
        """,
        (
            file_id,
            event_type,
            _field(old, "size"),
            _field(new, "size"),
            _field(old, "content_hash"),
            _field(new, "content_hash"),
        ),
    )
|
||||
12
indexer/hasher.py
Normal file
12
indexer/hasher.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from blake3 import blake3
|
||||
|
||||
|
||||
def blake3_file(path, chunk_size=1024 * 1024):
    """Return the BLAKE3 digest of a file, read in chunk_size blocks.

    Streams the file so even very large files never sit in memory whole.
    """
    hasher = blake3()
    with open(path, "rb") as fh:
        chunk = fh.read(chunk_size)
        while chunk:
            hasher.update(chunk)
            chunk = fh.read(chunk_size)
    return hasher.digest()
|
||||
21
indexer/scanner.py
Normal file
21
indexer/scanner.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import os
|
||||
from datetime import datetime
|
||||
from indexer.hasher import blake3_file
|
||||
|
||||
def scan_files(root_path):
    """Walk *root_path* and yield one metadata dict per regular file.

    Each dict contains: full_path, file_name, directory (all with
    forward slashes), size (bytes), mtime (naive local datetime) and
    content_hash (BLAKE3 digest bytes).

    Files that vanish or become unreadable mid-scan are skipped rather
    than aborting the whole run.
    """
    for root, _, files in os.walk(root_path):
        for name in files:
            full_path = os.path.join(root, name)
            try:
                stat = os.stat(full_path)
                # Hash inside the try: the file can disappear or lose
                # read permission between stat() and open().
                content_hash = blake3_file(full_path)
            except OSError:
                # Broader than the original FileNotFoundError: also
                # skips PermissionError and transient I/O failures,
                # which previously crashed the entire scan.
                continue

            yield {
                "full_path": full_path.replace("\\", "/"),
                "file_name": name,
                "directory": root.replace("\\", "/"),
                "size": stat.st_size,
                "mtime": datetime.fromtimestamp(stat.st_mtime),
                "content_hash": content_hash,
            }
|
||||
73
main.py
Normal file
73
main.py
Normal file
@@ -0,0 +1,73 @@
|
||||
from indexer.config import ROOT_PATH, ROOT_NAME, DRY_RUN
|
||||
from indexer.scanner import scan_files
|
||||
from indexer.db import (
|
||||
get_connection,
|
||||
preload_mark_all_missing,
|
||||
find_file_by_path,
|
||||
insert_file,
|
||||
update_file,
|
||||
path_hash,
|
||||
)
|
||||
from indexer.events import log_event
|
||||
|
||||
def main():
    """Run one indexing pass: scan ROOT_PATH and sync the files table.

    For every file the scanner yields, the matching DB row is either
    inserted (CREATED event), updated (MODIFIED event, when size or
    content hash changed), or merely touched (last_seen refreshed).
    With DRY_RUN set, counts are reported but nothing is written.
    """
    print("=" * 60)
    print("ORDINACE DROPBOX BACKUP – INDEXER")
    print(f"Root : {ROOT_PATH}")
    print(f"Name : {ROOT_NAME}")
    print(f"DRY RUN : {DRY_RUN}")
    print("=" * 60)

    conn = get_connection()
    try:
        cur = conn.cursor()
        try:
            if not DRY_RUN:
                # Mark everything missing up front; files found below are
                # flipped back to exists_now = 1. (Uses its own connection
                # and commits independently.)
                preload_mark_all_missing()

            created = modified = seen = 0

            for file in scan_files(ROOT_PATH):
                seen += 1
                ph = path_hash(file["full_path"])
                row = find_file_by_path(cur, ph)

                if row is None:
                    created += 1
                    if not DRY_RUN:
                        file_id = insert_file(cur, file)
                        log_event(cur, file_id, "CREATED", new=file)
                else:
                    file_id, old_size, old_mtime, old_hash = row
                    if old_size != file["size"] or old_hash != file["content_hash"]:
                        modified += 1
                        if not DRY_RUN:
                            update_file(cur, file_id, file)
                            log_event(
                                cur,
                                file_id,
                                "MODIFIED",
                                old={"size": old_size, "content_hash": old_hash},
                                new=file,
                            )
                    else:
                        # Unchanged file: just refresh bookkeeping columns.
                        if not DRY_RUN:
                            cur.execute(
                                "UPDATE files SET last_seen = NOW(), exists_now = 1 WHERE id = %s",
                                (file_id,)
                            )

                if seen % 500 == 0:
                    print(f"{seen} files scanned...")

            if not DRY_RUN:
                # One transaction for the whole run (autocommit is off).
                conn.commit()

            print("================================")
            print(f"Scanned : {seen}")
            print(f"Created : {created}")
            print(f"Modified : {modified}")
        finally:
            cur.close()
    finally:
        # Fix: the original leaked the connection (and cursor) whenever
        # the scan raised mid-run; close them unconditionally.
        conn.close()


if __name__ == "__main__":
    main()
|
||||
0
requirements.txt
Normal file
0
requirements.txt
Normal file
Reference in New Issue
Block a user