From 07e6a9c374887f2a50d08206c37d81f8d8ecc59f Mon Sep 17 00:00:00 2001
From: vlado <vladimir.buzalka@buzalka.cz>
Date: Mon, 18 May 2026 07:04:08 +0200
Subject: [PATCH] reporter

---
 .env               |   2 +-
 README.md          | 108 ++++++++++++++++++++++++++++++++++++++
 indexer/config.py  |   7 +++
 indexer/db.py      |  49 ++++++++---------
 indexer/hasher.py  |  20 +++++++
 indexer/logger.py  |  34 ++++++++++++
 indexer/scanner.py |   5 +-
 main.py            | 128 +++++++++++++++++++++------------------------
 8 files changed, 254 insertions(+), 99 deletions(-)
 create mode 100644 indexer/logger.py
diff --git a/.env b/.env
index 1e22a76..c1ff53d 100644
--- a/.env
+++ b/.env
@@ -19,5 +19,5 @@ DB_PASSWORD=Vlado9674+
 ROOT_PATH=z:\Dropbox\Ordinace\
 ROOT_NAME=DropboxOrdinace
 BATCH_SIZE=1000
-BACKUP_PATH=w:\Onedrive\DropBoxBackupClaude\
+BACKUP_PATH=w:\OneDrive\DropBoxBackupClaude\
 BACKUP_PASSWORD=Vlado7309208104++
diff --git a/README.md b/README.md
index e69de29..ad81aca 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,108 @@
+# Dropbox Ordinace Backup
+
+Indexer a zálohovací systém pro Dropbox složku ordinace. Při každém spuštění projde filesystém, porovná stav s databází, zazálohuje nové/změněné soubory do content-addressable storage, vygeneruje Excel report a odešle emailové oznámení.
+
+## Architektura
+
+```
+main.py                  – hlavní orchestrátor (spouštět přímo)
+indexer/
+  config.py              – načítá .env (DB, cesty, chování)
+  logger.py              – setup logování do souboru + konzole
+  scanner.py             – rychlý scan filesystému (bez hashování)
+  hasher.py              – BLAKE3 hash, detekce cloud placeholderů, hydratace
+  db.py                  – všechny DB operace (runs, files, events)
+  events.py              – batch INSERT do file_events
+  backup.py              – AES-256 ZIP content-addressable storage
+report.py                – generuje Excel report z DB
+recovery.py              – obnova souborů ze zálohy
+reconcile.py             – reconciliace DB vs filesystem
+```
+
+## Konfigurace (.env)
+
+```env
+DRY_RUN=false            # true = jen scan, žádná DB ani záloha
+LOG_LEVEL=INFO
+LOG_DIR=C:\Reporting\DropboxBackup\logs
+
+DB_HOST=192.168.1.76
+DB_PORT=3306
+DB_NAME=OrdinaceDropBoxBackup
+DB_USER=root
+DB_PASSWORD=...
+
+ROOT_PATH=z:\Dropbox\Ordinace\
+ROOT_NAME=DropboxOrdinace
+BATCH_SIZE=1000
+BACKUP_PATH=w:\OneDrive\DropBoxBackupClaude\
+BACKUP_PASSWORD=...
+```
+
+## Spuštění
+
+```
+cd C:\Reporting\DropboxBackup
+C:\Reporting\Python\python.exe main.py
+```
+
+Nebo přes Task Scheduler:
+- Program: `C:\Reporting\Python\python.exe`
+- Argumenty: `C:\Reporting\DropboxBackup\main.py`
+- Spustit v: `C:\Reporting\DropboxBackup`
+
+## Co dělá jeden run
+
+1. **Scan** – projde `ROOT_PATH`, zaznamená velikost a mtime každého souboru
+2. **Diff** – porovná s DB: nové / změněné / smazané / beze změny
+3. **Hashování** – BLAKE3 pro nové a změněné soubory
+   - Cloud placeholdery (Dropbox Smart Sync) – automaticky hydratuje (čeká max 120 s na stažení)
+4. **DB update** – batch INSERT/UPDATE souborů a eventů
+5. **Backup** – AES-256 ZIP do content-addressable storage (`BACKUP_PATH/ab/cd/<blake3hash>.zip`), deduplikace hashem
+6. **Report** – Excel soubor do `z:\Dropbox\!!!Days\Downloads Z230\`
+7. **Email** – HTML shrnutí + přiložený `backup.log` na `vladimir.buzalka@buzalka.cz`
+
+## Logy
+
+Rotující denní log: `LOG_DIR\backup.log` (historie 90 dní).
+
+## Záloha — storage formát
+
+Každý unikátní obsah je uložen jednou jako:
+```
+BACKUP_PATH/
+  ab/
+    cd/
+      abcdef...zip    ← AES-256 ZIP s názvem <blake3hex>.blob uvnitř
+```
+
+Deduplikace je automatická — soubory se stejným obsahem sdílí jeden ZIP.
+
+## Databáze
+
+MySQL databáze `OrdinaceDropBoxBackup` na `192.168.1.76`:
+
+| Tabulka | Popis |
+|---|---|
+| `runs` | Každé spuštění indexeru (status, statistiky) |
+| `files` | Aktuální stav všech souborů (`exists_now`, hash, velikost) |
+| `file_events` | Historie změn (CREATED / MODIFIED / DELETED) |
+
+## Závislosti
+
+```
+blake3
+pymysql
+python-dotenv
+pyzipper
+openpyxl
+msal
+requests
+```
+
+Instalace: `C:\Reporting\Python\python.exe -m pip install -r requirements.txt`
+
+## Projekty
+
+- Produkce: `C:\Reporting\DropboxBackup\`
+- Vývoj: `C:\Users\vlado\PycharmProjects\drobboxordinacebackup\`
diff --git a/indexer/config.py b/indexer/config.py
index 55814c1..d67f6b4 100644
--- a/indexer/config.py
+++ b/indexer/config.py
@@ -32,3 +32,10 @@ BACKUP_PASSWORD = os.getenv("BACKUP_PASSWORD")
 
 DRY_RUN = os.getenv("DRY_RUN", "true").lower() in ("1", "true", "yes")
 BATCH_SIZE = int(os.getenv("BATCH_SIZE", 1000))
+
+# =========================
+# Logging
+# =========================
+
+LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")  # level name; indexer/logger.py falls back to INFO if it is unknown
+LOG_DIR = os.getenv("LOG_DIR", r"C:\Reporting\DropboxBackup\logs")  # folder holding backup.log; created by setup_logging()
diff --git a/indexer/db.py b/indexer/db.py
index 30a68a2..cbec078 100644
--- a/indexer/db.py
+++ b/indexer/db.py
@@ -1,8 +1,13 @@
+import unicodedata
 import pymysql
 from datetime import datetime
 from indexer.config import DB_CONFIG, BATCH_SIZE
 
 
+def _nfc(s: str) -> str:  # NFC-normalize s; falsy values ("" or None) are returned unchanged
+    return unicodedata.normalize("NFC", s) if s else s
+
+
 def get_connection():
     return pymysql.connect(**DB_CONFIG)
 
@@ -50,7 +55,7 @@ def load_all_files(cur) -> dict:
     result = {}
     for row in cur.fetchall():
         file_id, rel_path, size, mtime, content_hash = row
-        result[rel_path] = {
+        result[_nfc(rel_path)] = {
             "id": file_id,
             "size": size,
             "mtime": mtime,
@@ -70,34 +75,24 @@ def batch_insert_files(cur, files_list: list, run_id: int) -> dict:
     Returns: {relative_path: file_id}
     """
     path_to_id = {}
-    for i in range(0, len(files_list), BATCH_SIZE):
-        chunk = files_list[i:i + BATCH_SIZE]
-        for f in chunk:
-            cur.execute(
-                """INSERT INTO files
-                   (relative_path, file_name, directory, file_size, mtime,
-                    content_hash, first_seen_run, last_seen_run, exists_now)
-                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)
-                   ON DUPLICATE KEY UPDATE
-                    file_name = VALUES(file_name),
-                    directory = VALUES(directory),
-                    file_size = VALUES(file_size),
-                    mtime = VALUES(mtime),
-                    content_hash = VALUES(content_hash),
-                    last_seen_run = VALUES(last_seen_run),
-                    exists_now = 1""",
-                (f["relative_path"], f["file_name"], f["directory"],
-                 f["size"], f["mtime"], f["content_hash"], run_id, run_id)
-            )
-        # Fetch real IDs
-        paths = [f["relative_path"] for f in chunk]
-        placeholders = ",".join(["%s"] * len(paths))
+    for f in files_list:
         cur.execute(
-            f"SELECT id, relative_path FROM files WHERE relative_path IN ({placeholders})",
-            paths,
+            """INSERT INTO files
+               (relative_path, file_name, directory, file_size, mtime,
+                content_hash, first_seen_run, last_seen_run, exists_now)
+               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)
+               ON DUPLICATE KEY UPDATE
+                file_name = VALUES(file_name),
+                directory = VALUES(directory),
+                file_size = VALUES(file_size),
+                mtime = VALUES(mtime),
+                content_hash = VALUES(content_hash),
+                last_seen_run = VALUES(last_seen_run),
+                exists_now = 1""",
+            (f["relative_path"], f["file_name"], f["directory"],
+             f["size"], f["mtime"], f["content_hash"], run_id, run_id)
         )
-        for row in cur.fetchall():
-            path_to_id[row[1]] = row[0]
+        path_to_id[f["relative_path"]] = cur.lastrowid
     return path_to_id
 
 
diff --git a/indexer/hasher.py b/indexer/hasher.py
index 8c2ccc5..5441347 100644
--- a/indexer/hasher.py
+++ b/indexer/hasher.py
@@ -1,4 +1,5 @@
 import ctypes
+import time
 
 from blake3 import blake3
 
@@ -17,6 +18,25 @@ def is_cloud_placeholder(path: str) -> bool:
     return bool(attrs & _CLOUD_MASK)
 
 
+def hydrate_file(path: str, timeout: int = 120, poll: int = 3) -> bool:
+    """
+    Force Dropbox to download a cloud placeholder by opening and reading it.
+    Polls every `poll` s up to `timeout` s; returns True once hydrated, else False.
+    """
+    try:
+        with open(path, "rb") as f:
+            f.read(1)  # reading a byte is meant to make Dropbox start the download
+    except OSError:
+        pass  # best effort: the outcome is decided by the polling below
+
+    deadline = time.monotonic() + timeout  # monotonic: unaffected by clock changes
+    while time.monotonic() < deadline:
+        if not is_cloud_placeholder(path):
+            return True
+        time.sleep(poll)
+    return not is_cloud_placeholder(path)  # final check after the last sleep
+
+
 def blake3_file(path, chunk_size=1024 * 1024):
     """Spočítá BLAKE3 hash souboru po blocích (bez načtení do paměti)."""
     h = blake3()
diff --git a/indexer/logger.py b/indexer/logger.py
new file mode 100644
index 0000000..d24a3f5
--- /dev/null
+++ b/indexer/logger.py
@@ -0,0 +1,34 @@
+import logging
+import os
+import sys
+from logging.handlers import TimedRotatingFileHandler
+
+from indexer.config import LOG_LEVEL, LOG_DIR
+
+
+def setup_logging() -> logging.Logger:
+    """Set up root logging to a daily-rotated file plus stdout.
+
+    Creates LOG_DIR if missing and returns the "backup" logger. Handlers already
+    on the root logger are closed before being replaced, so no file stays open.
+    """
+    os.makedirs(LOG_DIR, exist_ok=True)
+    level = getattr(logging, LOG_LEVEL.upper(), logging.INFO)
+    fmt = logging.Formatter(
+        "%(asctime)s [%(levelname)-8s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+    )
+    file_handler = TimedRotatingFileHandler(
+        os.path.join(LOG_DIR, "backup.log"),
+        when="midnight", backupCount=90, encoding="utf-8",
+    )
+    console_handler = logging.StreamHandler(sys.stdout)
+
+    root = logging.getLogger()
+    root.setLevel(level)
+    for old in list(root.handlers):  # clear() alone would leave them open
+        root.removeHandler(old)
+        old.close()
+    for handler in (file_handler, console_handler):
+        handler.setFormatter(fmt)
+        root.addHandler(handler)
+    return logging.getLogger("backup")
diff --git a/indexer/scanner.py b/indexer/scanner.py
index 2e0d468..4685b22 100644
--- a/indexer/scanner.py
+++ b/indexer/scanner.py
@@ -1,4 +1,5 @@
 import os
+import unicodedata
 from datetime import datetime
 
 
@@ -18,8 +19,8 @@ def scan_files(root_path: str) -> dict:
                 stat = os.stat(full_path)
             except (FileNotFoundError, PermissionError):
                 continue
-            rel_path = os.path.relpath(full_path, root_path).replace("\\", "/")
-            rel_dir = os.path.relpath(root, root_path).replace("\\", "/")
+            rel_path = unicodedata.normalize("NFC", os.path.relpath(full_path, root_path).replace("\\", "/"))
+            rel_dir = unicodedata.normalize("NFC", os.path.relpath(root, root_path).replace("\\", "/"))
             # Truncate microseconds — MySQL DATETIME rounds to whole seconds,
             # which causes false "modified" detections on every run.
             mtime = datetime.fromtimestamp(stat.st_mtime).replace(microsecond=0)
diff --git a/main.py b/main.py
index 45b851f..de57d64 100644
--- a/main.py
+++ b/main.py
@@ -2,6 +2,7 @@ import os
 from datetime import datetime
 
 from indexer.config import ROOT_PATH, ROOT_NAME, DRY_RUN, BACKUP_PATH
+from indexer.logger import setup_logging
 from indexer.scanner import scan_files
 from indexer.hasher import blake3_file
 from indexer.db import (
@@ -11,42 +12,43 @@ from indexer.db import (
 )
 from indexer.events import batch_log_events
 from indexer.backup import ensure_backed_up
-from indexer.hasher import is_cloud_placeholder
+from indexer.hasher import is_cloud_placeholder, hydrate_file
 
 
 def main():
-    print("=" * 60)
-    print("ORDINACE DROPBOX BACKUP – INDEXER")
-    print(f"Root    : {ROOT_PATH}")
-    print(f"Backup  : {BACKUP_PATH}")
-    print(f"DRY RUN : {DRY_RUN}")
-    print("=" * 60)
+    log = setup_logging()
+
+    log.info("=" * 60)
+    log.info("ORDINACE DROPBOX BACKUP – INDEXER")
+    log.info(f"Root    : {ROOT_PATH}")
+    log.info(f"Backup  : {BACKUP_PATH}")
+    log.info(f"DRY RUN : {DRY_RUN}")
+    log.info("=" * 60)
 
     # ── 1. Scan filesystem (fast, no hashing) ──
-    print("\n[1/7] Scanning filesystem...")
+    log.info("[1/7] Scanning filesystem...")
     fs = scan_files(ROOT_PATH)
-    print(f"       Found {len(fs)} files on disk.")
+    log.info(f"       Found {len(fs)} files on disk.")
 
     if DRY_RUN:
-        # V DRY_RUN režimu jen ukážeme co by se stalo
-        print("\n[DRY RUN] No DB connection, showing scan results only.")
-        print(f"  Files on disk: {len(fs)}")
+        log.info("[DRY RUN] No DB connection, showing scan results only.")
+        log.info(f"  Files on disk: {len(fs)}")
         return
 
     # ── 2. Connect & create run ──
     conn = get_connection()
     cur = conn.cursor()
     run_id = create_run(cur)
-    print(f"\n[2/7] Run #{run_id} created.")
+    log.info(f"[2/7] Run #{run_id} created.")
 
     try:
         # ── 3. Load DB state ──
-        print("[3/7] Loading DB state...")
+        log.info("[3/7] Loading DB state...")
         db = load_all_files(cur)
-        print(f"       {len(db)} files in DB (exists_now=1).")
+        log.info(f"       {len(db)} files in DB (exists_now=1).")
 
         # ── 4. Diff ──
-        print("[4/7] Diffing...")
+        log.info("[4/7] Diffing...")
         fs_paths = set(fs.keys())
         db_paths = set(db.keys())
 
@@ -64,11 +66,11 @@ def main():
             else:
                 unchanged_paths.add(p)
 
-        print(f"       NEW: {len(new_paths)}  MOD: {len(modified_paths)}  "
-              f"DEL: {len(deleted_paths)}  SAME: {len(unchanged_paths)}")
+        log.info(f"       NEW: {len(new_paths)}  MOD: {len(modified_paths)}  "
+                 f"DEL: {len(deleted_paths)}  SAME: {len(unchanged_paths)}")
 
         # ── 5. Process changes ──
-        print("[5/7] Processing changes...")
+        log.info("[5/7] Processing changes...")
         events = []
         files_to_backup = []
 
@@ -76,19 +78,21 @@ def main():
         skipped_files = []
         new_files = []
         if new_paths:
-            print(f"  Hashing {len(new_paths)} new files...")
-            new_files = []
+            log.info(f"  Hashing {len(new_paths)} new files...")
             for p in new_paths:
                 f = fs[p]
                 if is_cloud_placeholder(f["full_path"]):
-                    reason = "not synced (cloud placeholder)"
-                    print(f"  WARN: skip {p}: {reason}")
-                    skipped_files.append((p, reason))
-                    continue
+                    log.warning(f"  PLACEHOLDER {p} — čekám na stažení...")
+                    if not hydrate_file(f["full_path"]):
+                        reason = "not synced (cloud placeholder, hydration timeout)"
+                        log.warning(f"  SKIP {p}: {reason}")
+                        skipped_files.append((p, reason))
+                        continue
+                    log.info(f"  OK hydrated: {p}")
                 try:
                     content_hash = blake3_file(f["full_path"])
                 except (FileNotFoundError, PermissionError, OSError) as e:
-                    print(f"  WARN: skip {p}: {e}")
+                    log.warning(f"  SKIP {p}: {e}")
                     skipped_files.append((p, str(e)))
                     continue
                 new_files.append({
@@ -114,7 +118,7 @@ def main():
 
         # 5b) MODIFIED files — compute BLAKE3, batch UPDATE
         if modified_paths:
-            print(f"  Hashing {len(modified_paths)} modified files...")
+            log.info(f"  Hashing {len(modified_paths)} modified files...")
             mod_files = []
             for p in modified_paths:
                 f = fs[p]
@@ -122,7 +126,7 @@ def main():
                 try:
                     content_hash = blake3_file(f["full_path"])
                 except (FileNotFoundError, PermissionError, OSError) as e:
-                    print(f"  WARN: skip {p}: {e}")
+                    log.warning(f"  SKIP {p}: {e}")
                     continue
                 mod_files.append({
                     "id": db_file["id"],
@@ -168,16 +172,16 @@ def main():
 
         # ── 6. Backup ──
         if files_to_backup and BACKUP_PATH:
-            print(f"[6/7] Backing up {len(files_to_backup)} files...")
+            log.info(f"[6/7] Backing up {len(files_to_backup)} files...")
             backed = ensure_backed_up(files_to_backup, BACKUP_PATH)
-            print(f"       {backed} new blobs written.")
+            log.info(f"       {backed} new blobs written.")
         else:
-            print("[6/7] Nothing to backup.")
+            log.info("[6/7] Nothing to backup.")
 
         # ── 7. Finalize ──
         stats = {
             "total": len(fs),
-            "new": len(new_files) if new_paths else 0,
+            "new": len(new_files),
             "modified": len(modified_paths),
             "deleted": len(deleted_paths),
             "unchanged": len(unchanged_paths),
@@ -185,10 +189,10 @@ def main():
         }
         finalize_run(cur, run_id, stats)
         conn.commit()
-        print(f"[7/7] Run #{run_id} COMPLETED.")
+        log.info(f"[7/7] Run #{run_id} COMPLETED.")
 
     except Exception as e:
-        print(f"\nERROR: {e}")
+        log.exception(f"FATAL ERROR: {e}")
         try:
             fail_run(cur, run_id)
             conn.commit()
@@ -200,19 +204,19 @@ def main():
         conn.close()
 
     # ── Summary ──
-    print("\n" + "=" * 60)
-    print(f"Total    : {stats['total']}")
-    print(f"New      : {stats['new']}")
-    print(f"Modified : {stats['modified']}")
-    print(f"Deleted  : {stats['deleted']}")
-    print(f"Unchanged: {stats['unchanged']}")
+    log.info("=" * 60)
+    log.info(f"Total    : {stats['total']}")
+    log.info(f"New      : {stats['new']}")
+    log.info(f"Modified : {stats['modified']}")
+    log.info(f"Deleted  : {stats['deleted']}")
+    log.info(f"Unchanged: {stats['unchanged']}")
     if skipped_files:
-        print(f"Skipped  : {len(skipped_files)} (hash failed)")
-        print("-" * 60)
+        log.warning(f"Skipped  : {len(skipped_files)} (hash failed)")
+        log.info("-" * 60)
         for path, reason in skipped_files:
-            print(f"  SKIP: {path}")
-            print(f"        {reason}")
-    print("=" * 60)
+            log.warning(f"  SKIP: {path}")
+            log.warning(f"        {reason}")
+    log.info("=" * 60)
 
     # ── 8. Generate Excel report ──
     report_path = None
@@ -225,10 +229,10 @@ def main():
                 os.remove(os.path.join(report_dir, f))
         timestamp = datetime.now().strftime("%Y-%m-%d %H_%M")
         report_path = os.path.join(report_dir, f"{timestamp} DropboxBackupReport.xlsx")
-        print(f"\n[8] Generating report...")
+        log.info("[8] Generating report...")
         generate_report(report_path)
     except Exception as e:
-        print(f"  WARN: Report generation failed: {e}")
+        log.warning(f"Report generation failed: {e}")
 
     # ── 9. Send email notification ──
     try:
@@ -252,23 +256,6 @@ def main():
   {rows}
 </table>"""
 
-        def _file_section(title, color, paths):
-            if not paths:
-                return ""
-            rows = "".join(f"<tr><td style='padding:2px 8px;font-size:12px;'>{p}</td></tr>" for p in sorted(paths))
-            return f"""
-<h3 style="color:{color};margin-top:18px;">{title} ({len(paths)})</h3>
-<table border="0" cellpadding="2" cellspacing="0" style="border-collapse:collapse;width:100%;font-family:monospace;">
-  {rows}
-</table>"""
-
-        new_paths_ok = [nf["relative_path"] for nf in new_files]
-        files_detail = (
-            _file_section("&#10003; Nove soubory", "#2a7a2a", new_paths_ok)
-            + _file_section("&#9998; Zmenene soubory", "#a07000", list(modified_paths))
-            + _file_section("&#10007; Smazane soubory", "#a00000", list(deleted_paths))
-        )
-
         body = f"""
 <html><body style="font-family:Segoe UI,Arial,sans-serif;font-size:14px;color:#222;">
 <h2 style="color:#2e6da4;">&#10003; Dropbox Ordinace Backup &ndash; {ts}</h2>
@@ -283,16 +270,19 @@ def main():
   {skipped_row}
   {report_line}
 </table>
-{files_detail}
 {skipped_detail}
 <p style="color:#888;font-size:12px;margin-top:20px;">REPORTER &bull; {ts}</p>
 </body></html>
 """
-        subject = f"Dropbox Backup #{run_id} \u2013 {ts} ({changes} zmen)"
-        send_mail("vladimir.buzalka@buzalka.cz", subject, body, html=True)
-        print(f"\n[9] Email odeslan na vladimir.buzalka@buzalka.cz")
+        subject = f"Dropbox Backup #{run_id} – {ts} ({changes} zmen)"
+        from indexer.config import LOG_DIR
+        log_file = os.path.join(LOG_DIR, "backup.log")
+        attachments = [log_file] if os.path.exists(log_file) else []
+
+        send_mail("vladimir.buzalka@buzalka.cz", subject, body, html=True, attachments=attachments)
+        log.info("[9] Email odeslan na vladimir.buzalka@buzalka.cz")
     except Exception as e:
-        print(f"  WARN: Email failed: {e}")
+        log.warning(f"Email failed: {e}")
 
 
 if __name__ == "__main__":