From 07e6a9c374887f2a50d08206c37d81f8d8ecc59f Mon Sep 17 00:00:00 2001 From: vlado Date: Mon, 18 May 2026 07:04:08 +0200 Subject: [PATCH] reporter --- .env | 2 +- README.md | 108 ++++++++++++++++++++++++++++++++++++++ indexer/config.py | 7 +++ indexer/db.py | 49 ++++++++--------- indexer/hasher.py | 20 +++++++ indexer/logger.py | 34 ++++++++++++ indexer/scanner.py | 5 +- main.py | 128 +++++++++++++++++++++------------------------ 8 files changed, 254 insertions(+), 99 deletions(-) create mode 100644 indexer/logger.py diff --git a/.env b/.env index 1e22a76..c1ff53d 100644 --- a/.env +++ b/.env @@ -19,5 +19,5 @@ DB_PASSWORD=Vlado9674+ ROOT_PATH=z:\Dropbox\Ordinace\ ROOT_NAME=DropboxOrdinace BATCH_SIZE=1000 -BACKUP_PATH=w:\Onedrive\DropBoxBackupClaude\ +BACKUP_PATH=w:\OneDrive\DropBoxBackupClaude\ BACKUP_PASSWORD=Vlado7309208104++ diff --git a/README.md b/README.md index e69de29..ad81aca 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,108 @@ +# Dropbox Ordinace Backup + +Indexer a zálohovací systém pro Dropbox složku ordinace. Při každém spuštění projde filesystém, porovná stav s databází, zazálohuje nové/změněné soubory do content-addressable storage, vygeneruje Excel report a odešle emailové oznámení. 
+ +## Architektura + +``` +main.py – hlavní orchestrátor (spouštět přímo) +indexer/ + config.py – načítá .env (DB, cesty, chování) + logger.py – setup logování do souboru + konzole + scanner.py – rychlý scan filesystému (bez hashování) + hasher.py – BLAKE3 hash, detekce cloud placeholderů, hydratace + db.py – všechny DB operace (runs, files, events) + events.py – batch INSERT do file_events + backup.py – AES-256 ZIP content-addressable storage +report.py – generuje Excel report z DB +recovery.py – obnova souborů ze zálohy +reconcile.py – rekonciliace DB vs filesystem +``` + +## Konfigurace (.env) + +```env +DRY_RUN=false # true = jen scan, žádná DB ani záloha +LOG_LEVEL=INFO +LOG_DIR=C:\Reporting\DropboxBackup\logs + +DB_HOST=192.168.1.76 +DB_PORT=3306 +DB_NAME=OrdinaceDropBoxBackup +DB_USER=root +DB_PASSWORD=... + +ROOT_PATH=z:\Dropbox\Ordinace\ +ROOT_NAME=DropboxOrdinace +BATCH_SIZE=1000 +BACKUP_PATH=w:\OneDrive\DropBoxBackupClaude\ +BACKUP_PASSWORD=... +``` + +## Spuštění + +``` +cd C:\Reporting\DropboxBackup +C:\Reporting\Python\python.exe main.py +``` + +Nebo přes Task Scheduler: +- Program: `C:\Reporting\Python\python.exe` +- Argumenty: `C:\Reporting\DropboxBackup\main.py` +- Spustit v: `C:\Reporting\DropboxBackup` + +## Co dělá jeden run + +1. **Scan** – projde `ROOT_PATH`, zaznamená velikost a mtime každého souboru +2. **Diff** – porovná s DB: nové / změněné / smazané / beze změny +3. **Hashování** – BLAKE3 pro nové a změněné soubory + - Cloud placeholdery (Dropbox Smart Sync) – automaticky hydratuje (čeká max 120 s na stažení) +4. **DB update** – batch INSERT/UPDATE souborů a eventů +5. **Backup** – AES-256 ZIP do content-addressable storage (`BACKUP_PATH/ab/cd/<hash>.zip`), deduplikace hashem +6. **Report** – Excel soubor do `z:\Dropbox\!!!Days\Downloads Z230\` +7. **Email** – HTML shrnutí + přiložený `backup.log` na `vladimir.buzalka@buzalka.cz` + +## Logy + +Rotující denní log: `LOG_DIR\backup.log` (historie 90 dní). 
+ +## Záloha — storage formát + +Každý unikátní obsah je uložen jednou jako: +``` +BACKUP_PATH/ + ab/ + cd/ + abcdef...zip ← AES-256 ZIP s názvem .blob uvnitř +``` + +Deduplikace je automatická — soubory se stejným obsahem sdílí jeden ZIP. + +## Databáze + +MySQL databáze `OrdinaceDropBoxBackup` na `192.168.1.76`: + +| Tabulka | Popis | +|---|---| +| `runs` | Každé spuštění indexeru (status, statistiky) | +| `files` | Aktuální stav všech souborů (`exists_now`, hash, velikost) | +| `file_events` | Historie změn (CREATED / MODIFIED / DELETED) | + +## Závislosti + +``` +blake3 +pymysql +python-dotenv +pyzipper +openpyxl +msal +requests +``` + +Instalace: `C:\Reporting\Python\python.exe -m pip install -r requirements.txt` + +## Projekty + +- Produkce: `C:\Reporting\DropboxBackup\` +- Vývoj: `C:\Users\vlado\PycharmProjects\drobboxordinacebackup\` diff --git a/indexer/config.py b/indexer/config.py index 55814c1..d67f6b4 100644 --- a/indexer/config.py +++ b/indexer/config.py @@ -32,3 +32,10 @@ BACKUP_PASSWORD = os.getenv("BACKUP_PASSWORD") DRY_RUN = os.getenv("DRY_RUN", "true").lower() in ("1", "true", "yes") BATCH_SIZE = int(os.getenv("BATCH_SIZE", 1000)) + +# ========================= +# Logging +# ========================= + +LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO") +LOG_DIR = os.getenv("LOG_DIR", r"C:\Reporting\DropboxBackup\logs") diff --git a/indexer/db.py b/indexer/db.py index 30a68a2..cbec078 100644 --- a/indexer/db.py +++ b/indexer/db.py @@ -1,8 +1,13 @@ +import unicodedata import pymysql from datetime import datetime from indexer.config import DB_CONFIG, BATCH_SIZE +def _nfc(s: str) -> str: + return unicodedata.normalize("NFC", s) if s else s + + def get_connection(): return pymysql.connect(**DB_CONFIG) @@ -50,7 +55,7 @@ def load_all_files(cur) -> dict: result = {} for row in cur.fetchall(): file_id, rel_path, size, mtime, content_hash = row - result[rel_path] = { + result[_nfc(rel_path)] = { "id": file_id, "size": size, "mtime": mtime, @@ -70,34 +75,24 @@ def 
batch_insert_files(cur, files_list: list, run_id: int) -> dict: Returns: {relative_path: file_id} """ path_to_id = {} - for i in range(0, len(files_list), BATCH_SIZE): - chunk = files_list[i:i + BATCH_SIZE] - for f in chunk: - cur.execute( - """INSERT INTO files - (relative_path, file_name, directory, file_size, mtime, - content_hash, first_seen_run, last_seen_run, exists_now) - VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1) - ON DUPLICATE KEY UPDATE - file_name = VALUES(file_name), - directory = VALUES(directory), - file_size = VALUES(file_size), - mtime = VALUES(mtime), - content_hash = VALUES(content_hash), - last_seen_run = VALUES(last_seen_run), - exists_now = 1""", - (f["relative_path"], f["file_name"], f["directory"], - f["size"], f["mtime"], f["content_hash"], run_id, run_id) - ) - # Fetch real IDs - paths = [f["relative_path"] for f in chunk] - placeholders = ",".join(["%s"] * len(paths)) + for f in files_list: cur.execute( - f"SELECT id, relative_path FROM files WHERE relative_path IN ({placeholders})", - paths, + """INSERT INTO files + (relative_path, file_name, directory, file_size, mtime, + content_hash, first_seen_run, last_seen_run, exists_now) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1) + ON DUPLICATE KEY UPDATE + file_name = VALUES(file_name), + directory = VALUES(directory), + file_size = VALUES(file_size), + mtime = VALUES(mtime), + content_hash = VALUES(content_hash), + last_seen_run = VALUES(last_seen_run), + exists_now = 1""", + (f["relative_path"], f["file_name"], f["directory"], + f["size"], f["mtime"], f["content_hash"], run_id, run_id) ) - for row in cur.fetchall(): - path_to_id[row[1]] = row[0] + path_to_id[f["relative_path"]] = cur.lastrowid return path_to_id diff --git a/indexer/hasher.py b/indexer/hasher.py index 8c2ccc5..5441347 100644 --- a/indexer/hasher.py +++ b/indexer/hasher.py @@ -1,4 +1,5 @@ import ctypes +import time from blake3 import blake3 @@ -17,6 +18,25 @@ def is_cloud_placeholder(path: str) -> bool: return bool(attrs & 
_CLOUD_MASK) +def hydrate_file(path: str, timeout: int = 120, poll: int = 3) -> bool: + """ + Přinutí Dropbox stáhnout cloud placeholder otevřením souboru. + Čeká max timeout sekund. Vrátí True pokud se soubor stáhl. + """ + try: + with open(path, "rb") as f: + f.read(1) + except OSError: + pass + + deadline = time.time() + timeout + while time.time() < deadline: + if not is_cloud_placeholder(path): + return True + time.sleep(poll) + return False + + def blake3_file(path, chunk_size=1024 * 1024): """Spočítá BLAKE3 hash souboru po blocích (bez načtení do paměti).""" h = blake3() diff --git a/indexer/logger.py b/indexer/logger.py new file mode 100644 index 0000000..d24a3f5 --- /dev/null +++ b/indexer/logger.py @@ -0,0 +1,34 @@ +import logging +import os +import sys +from logging.handlers import TimedRotatingFileHandler + +from indexer.config import LOG_LEVEL, LOG_DIR + + +def setup_logging() -> logging.Logger: + os.makedirs(LOG_DIR, exist_ok=True) + + level = getattr(logging, LOG_LEVEL.upper(), logging.INFO) + fmt = logging.Formatter( + "%(asctime)s [%(levelname)-8s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + file_handler = TimedRotatingFileHandler( + os.path.join(LOG_DIR, "backup.log"), + when="midnight", + backupCount=90, + encoding="utf-8", + ) + file_handler.setFormatter(fmt) + + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(fmt) + + logging.root.setLevel(level) + logging.root.handlers.clear() + logging.root.addHandler(file_handler) + logging.root.addHandler(console_handler) + + return logging.getLogger("backup") diff --git a/indexer/scanner.py b/indexer/scanner.py index 2e0d468..4685b22 100644 --- a/indexer/scanner.py +++ b/indexer/scanner.py @@ -1,4 +1,5 @@ import os +import unicodedata from datetime import datetime @@ -18,8 +19,8 @@ def scan_files(root_path: str) -> dict: stat = os.stat(full_path) except (FileNotFoundError, PermissionError): continue - rel_path = os.path.relpath(full_path, root_path).replace("\\", 
"/") - rel_dir = os.path.relpath(root, root_path).replace("\\", "/") + rel_path = unicodedata.normalize("NFC", os.path.relpath(full_path, root_path).replace("\\", "/")) + rel_dir = unicodedata.normalize("NFC", os.path.relpath(root, root_path).replace("\\", "/")) # Truncate microseconds — MySQL DATETIME rounds to whole seconds, # which causes false "modified" detections on every run. mtime = datetime.fromtimestamp(stat.st_mtime).replace(microsecond=0) diff --git a/main.py b/main.py index 45b851f..de57d64 100644 --- a/main.py +++ b/main.py @@ -2,6 +2,7 @@ import os from datetime import datetime from indexer.config import ROOT_PATH, ROOT_NAME, DRY_RUN, BACKUP_PATH +from indexer.logger import setup_logging from indexer.scanner import scan_files from indexer.hasher import blake3_file from indexer.db import ( @@ -11,42 +12,43 @@ from indexer.db import ( ) from indexer.events import batch_log_events from indexer.backup import ensure_backed_up -from indexer.hasher import is_cloud_placeholder +from indexer.hasher import is_cloud_placeholder, hydrate_file def main(): - print("=" * 60) - print("ORDINACE DROPBOX BACKUP – INDEXER") - print(f"Root : {ROOT_PATH}") - print(f"Backup : {BACKUP_PATH}") - print(f"DRY RUN : {DRY_RUN}") - print("=" * 60) + log = setup_logging() + + log.info("=" * 60) + log.info("ORDINACE DROPBOX BACKUP – INDEXER") + log.info(f"Root : {ROOT_PATH}") + log.info(f"Backup : {BACKUP_PATH}") + log.info(f"DRY RUN : {DRY_RUN}") + log.info("=" * 60) # ── 1. 
Scan filesystem (fast, no hashing) ── - print("\n[1/7] Scanning filesystem...") + log.info("[1/7] Scanning filesystem...") fs = scan_files(ROOT_PATH) - print(f" Found {len(fs)} files on disk.") + log.info(f" Found {len(fs)} files on disk.") if DRY_RUN: - # V DRY_RUN režimu jen ukážeme co by se stalo - print("\n[DRY RUN] No DB connection, showing scan results only.") - print(f" Files on disk: {len(fs)}") + log.info("[DRY RUN] No DB connection, showing scan results only.") + log.info(f" Files on disk: {len(fs)}") return # ── 2. Connect & create run ── conn = get_connection() cur = conn.cursor() run_id = create_run(cur) - print(f"\n[2/7] Run #{run_id} created.") + log.info(f"[2/7] Run #{run_id} created.") try: # ── 3. Load DB state ── - print("[3/7] Loading DB state...") + log.info("[3/7] Loading DB state...") db = load_all_files(cur) - print(f" {len(db)} files in DB (exists_now=1).") + log.info(f" {len(db)} files in DB (exists_now=1).") # ── 4. Diff ── - print("[4/7] Diffing...") + log.info("[4/7] Diffing...") fs_paths = set(fs.keys()) db_paths = set(db.keys()) @@ -64,11 +66,11 @@ def main(): else: unchanged_paths.add(p) - print(f" NEW: {len(new_paths)} MOD: {len(modified_paths)} " - f"DEL: {len(deleted_paths)} SAME: {len(unchanged_paths)}") + log.info(f" NEW: {len(new_paths)} MOD: {len(modified_paths)} " + f"DEL: {len(deleted_paths)} SAME: {len(unchanged_paths)}") # ── 5. 
Process changes ── - print("[5/7] Processing changes...") + log.info("[5/7] Processing changes...") events = [] files_to_backup = [] @@ -76,19 +78,21 @@ def main(): skipped_files = [] new_files = [] if new_paths: - print(f" Hashing {len(new_paths)} new files...") - new_files = [] + log.info(f" Hashing {len(new_paths)} new files...") for p in new_paths: f = fs[p] if is_cloud_placeholder(f["full_path"]): - reason = "not synced (cloud placeholder)" - print(f" WARN: skip {p}: {reason}") - skipped_files.append((p, reason)) - continue + log.warning(f" PLACEHOLDER {p} — čekám na stažení...") + if not hydrate_file(f["full_path"]): + reason = "not synced (cloud placeholder, hydration timeout)" + log.warning(f" SKIP {p}: {reason}") + skipped_files.append((p, reason)) + continue + log.info(f" OK hydrated: {p}") try: content_hash = blake3_file(f["full_path"]) except (FileNotFoundError, PermissionError, OSError) as e: - print(f" WARN: skip {p}: {e}") + log.warning(f" SKIP {p}: {e}") skipped_files.append((p, str(e))) continue new_files.append({ @@ -114,7 +118,7 @@ def main(): # 5b) MODIFIED files — compute BLAKE3, batch UPDATE if modified_paths: - print(f" Hashing {len(modified_paths)} modified files...") + log.info(f" Hashing {len(modified_paths)} modified files...") mod_files = [] for p in modified_paths: f = fs[p] @@ -122,7 +126,7 @@ def main(): try: content_hash = blake3_file(f["full_path"]) except (FileNotFoundError, PermissionError, OSError) as e: - print(f" WARN: skip {p}: {e}") + log.warning(f" SKIP {p}: {e}") continue mod_files.append({ "id": db_file["id"], @@ -168,16 +172,16 @@ def main(): # ── 6. 
Backup ── if files_to_backup and BACKUP_PATH: - print(f"[6/7] Backing up {len(files_to_backup)} files...") + log.info(f"[6/7] Backing up {len(files_to_backup)} files...") backed = ensure_backed_up(files_to_backup, BACKUP_PATH) - print(f" {backed} new blobs written.") + log.info(f" {backed} new blobs written.") else: - print("[6/7] Nothing to backup.") + log.info("[6/7] Nothing to backup.") # ── 7. Finalize ── stats = { "total": len(fs), - "new": len(new_files) if new_paths else 0, + "new": len(new_files), "modified": len(modified_paths), "deleted": len(deleted_paths), "unchanged": len(unchanged_paths), @@ -185,10 +189,10 @@ def main(): } finalize_run(cur, run_id, stats) conn.commit() - print(f"[7/7] Run #{run_id} COMPLETED.") + log.info(f"[7/7] Run #{run_id} COMPLETED.") except Exception as e: - print(f"\nERROR: {e}") + log.exception(f"FATAL ERROR: {e}") try: fail_run(cur, run_id) conn.commit() @@ -200,19 +204,19 @@ def main(): conn.close() # ── Summary ── - print("\n" + "=" * 60) - print(f"Total : {stats['total']}") - print(f"New : {stats['new']}") - print(f"Modified : {stats['modified']}") - print(f"Deleted : {stats['deleted']}") - print(f"Unchanged: {stats['unchanged']}") + log.info("=" * 60) + log.info(f"Total : {stats['total']}") + log.info(f"New : {stats['new']}") + log.info(f"Modified : {stats['modified']}") + log.info(f"Deleted : {stats['deleted']}") + log.info(f"Unchanged: {stats['unchanged']}") if skipped_files: - print(f"Skipped : {len(skipped_files)} (hash failed)") - print("-" * 60) + log.warning(f"Skipped : {len(skipped_files)} (hash failed)") + log.info("-" * 60) for path, reason in skipped_files: - print(f" SKIP: {path}") - print(f" {reason}") - print("=" * 60) + log.warning(f" SKIP: {path}") + log.warning(f" {reason}") + log.info("=" * 60) # ── 8. 
Generate Excel report ── report_path = None @@ -225,10 +229,10 @@ def main(): os.remove(os.path.join(report_dir, f)) timestamp = datetime.now().strftime("%Y-%m-%d %H_%M") report_path = os.path.join(report_dir, f"{timestamp} DropboxBackupReport.xlsx") - print(f"\n[8] Generating report...") + log.info("[8] Generating report...") generate_report(report_path) except Exception as e: - print(f" WARN: Report generation failed: {e}") + log.warning(f"Report generation failed: {e}") # ── 9. Send email notification ── try: @@ -252,23 +256,6 @@ def main(): {rows} """ - def _file_section(title, color, paths): - if not paths: - return "" - rows = "".join(f"{p}" for p in sorted(paths)) - return f""" -

{title} ({len(paths)})

- - {rows} -
""" - - new_paths_ok = [nf["relative_path"] for nf in new_files] - files_detail = ( - _file_section("✓ Nove soubory", "#2a7a2a", new_paths_ok) - + _file_section("✎ Zmenene soubory", "#a07000", list(modified_paths)) - + _file_section("✗ Smazane soubory", "#a00000", list(deleted_paths)) - ) - body = f"""

✓ Dropbox Ordinace Backup – {ts}

@@ -283,16 +270,19 @@ def main(): {skipped_row} {report_line} -{files_detail} {skipped_detail}

REPORTER • {ts}

""" - subject = f"Dropbox Backup #{run_id} \u2013 {ts} ({changes} zmen)" - send_mail("vladimir.buzalka@buzalka.cz", subject, body, html=True) - print(f"\n[9] Email odeslan na vladimir.buzalka@buzalka.cz") + subject = f"Dropbox Backup #{run_id} – {ts} ({changes} zmen)" + from indexer.config import LOG_DIR + log_file = os.path.join(LOG_DIR, "backup.log") + attachments = [log_file] if os.path.exists(log_file) else [] + + send_mail("vladimir.buzalka@buzalka.cz", subject, body, html=True, attachments=attachments) + log.info("[9] Email odeslan na vladimir.buzalka@buzalka.cz") except Exception as e: - print(f" WARN: Email failed: {e}") + log.warning(f"Email failed: {e}") if __name__ == "__main__":