This commit is contained in:
2026-05-18 07:04:08 +02:00
parent 52f04c2839
commit 07e6a9c374
8 changed files with 254 additions and 99 deletions
+1 -1
View File
@@ -19,5 +19,5 @@ DB_PASSWORD=Vlado9674+
ROOT_PATH=z:\Dropbox\Ordinace\
ROOT_NAME=DropboxOrdinace
BATCH_SIZE=1000
BACKUP_PATH=w:\Onedrive\DropBoxBackupClaude\
BACKUP_PATH=w:\OneDrive\DropBoxBackupClaude\
BACKUP_PASSWORD=Vlado7309208104++
+108
View File
@@ -0,0 +1,108 @@
# Dropbox Ordinace Backup
Indexer a zálohovací systém pro Dropbox složku ordinace. Při každém spuštění projde filesystém, porovná stav s databází, zazálohuje nové/změněné soubory do content-addressable storage, vygeneruje Excel report a odešle emailové oznámení.
## Architektura
```
main.py hlavní orchestrátor (spouštět přímo)
indexer/
config.py načítá .env (DB, cesty, chování)
logger.py setup logování do souboru + konzole
scanner.py rychlý scan filesystému (bez hashování)
hasher.py BLAKE3 hash, detekce cloud placeholderů, hydratace
db.py všechny DB operace (runs, files, events)
events.py batch INSERT do file_events
backup.py AES-256 ZIP content-addressable storage
report.py generuje Excel report z DB
recovery.py obnova souborů ze zálohy
reconcile.py reconciliace DB vs filesystem
```
## Konfigurace (.env)
```env
DRY_RUN=false # true = jen scan, žádná DB ani záloha; není-li proměnná nastavena, výchozí hodnota je true
LOG_LEVEL=INFO
LOG_DIR=C:\Reporting\DropboxBackup\logs
DB_HOST=192.168.1.76
DB_PORT=3306
DB_NAME=OrdinaceDropBoxBackup
DB_USER=root
DB_PASSWORD=...
ROOT_PATH=z:\Dropbox\Ordinace\
ROOT_NAME=DropboxOrdinace
BATCH_SIZE=1000
BACKUP_PATH=w:\OneDrive\DropBoxBackupClaude\
BACKUP_PASSWORD=...
```
## Spuštění
```
cd C:\Reporting\DropboxBackup
C:\Reporting\Python\python.exe main.py
```
Nebo přes Task Scheduler:
- Program: `C:\Reporting\Python\python.exe`
- Argumenty: `C:\Reporting\DropboxBackup\main.py`
- Spustit v: `C:\Reporting\DropboxBackup`
## Co dělá jeden run
1. **Scan** — projde `ROOT_PATH`, zaznamená velikost a mtime každého souboru
2. **Diff** — porovná s DB: nové / změněné / smazané / beze změny
3. **Hashování** — BLAKE3 pro nové a změněné soubory
   - Cloud placeholdery (Dropbox Smart Sync) automaticky hydratuje (čeká max 120 s na stažení)
4. **DB update** — batch INSERT/UPDATE souborů a eventů
5. **Backup** — AES-256 ZIP do content-addressable storage (`BACKUP_PATH/ab/cd/<blake3hash>.zip`), deduplikace hashem
6. **Report** — Excel soubor do `z:\Dropbox\!!!Days\Downloads Z230\`
7. **Email** — HTML shrnutí + přiložený `backup.log` na `vladimir.buzalka@buzalka.cz`
## Logy
Rotující denní log: `LOG_DIR\backup.log` (historie 90 dní).
## Záloha — storage formát
Každý unikátní obsah je uložen jednou jako:
```
BACKUP_PATH/
ab/
cd/
abcdef...zip ← AES-256 ZIP s názvem <blake3hex>.blob uvnitř
```
Deduplikace je automatická — soubory se stejným obsahem sdílí jeden ZIP.
## Databáze
MySQL databáze `OrdinaceDropBoxBackup` na `192.168.1.76`:
| Tabulka | Popis |
|---|---|
| `runs` | Každé spuštění indexeru (status, statistiky) |
| `files` | Aktuální stav všech souborů (`exists_now`, hash, velikost) |
| `file_events` | Historie změn (CREATED / MODIFIED / DELETED) |
## Závislosti
```
blake3
pymysql
python-dotenv
pyzipper
openpyxl
msal
requests
```
Instalace: `C:\Reporting\Python\python.exe -m pip install -r requirements.txt`
## Projekty
- Produkce: `C:\Reporting\DropboxBackup\`
- Vývoj: `C:\Users\vlado\PycharmProjects\drobboxordinacebackup\`
+7
View File
@@ -32,3 +32,10 @@ BACKUP_PASSWORD = os.getenv("BACKUP_PASSWORD")
# DRY_RUN is on when the variable reads "1", "true" or "yes" (any letter
# case); an unset variable falls back to "true", i.e. dry-run mode.
DRY_RUN = os.environ.get("DRY_RUN", "true").lower() in {"1", "true", "yes"}
# Integer batch size; 1000 when BATCH_SIZE is not set in the environment.
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "1000"))
# =========================
# Logging
# =========================
# Level name and target directory for the log file, both overridable via
# the environment.
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
LOG_DIR = os.environ.get("LOG_DIR", r"C:\Reporting\DropboxBackup\logs")
+8 -13
View File
@@ -1,8 +1,13 @@
import unicodedata
import pymysql
from datetime import datetime
from indexer.config import DB_CONFIG, BATCH_SIZE
def _nfc(s: str) -> str:
    """Return *s* normalized to Unicode NFC.

    Falsy input (an empty string or ``None``) is handed back unchanged
    instead of being passed to the normalizer.
    """
    if not s:
        return s
    return unicodedata.normalize("NFC", s)
def get_connection():
    """Open a new PyMySQL connection using the keyword settings in ``DB_CONFIG``."""
    connection = pymysql.connect(**DB_CONFIG)
    return connection
@@ -50,7 +55,7 @@ def load_all_files(cur) -> dict:
result = {}
for row in cur.fetchall():
file_id, rel_path, size, mtime, content_hash = row
result[rel_path] = {
result[_nfc(rel_path)] = {
"id": file_id,
"size": size,
"mtime": mtime,
@@ -70,9 +75,7 @@ def batch_insert_files(cur, files_list: list, run_id: int) -> dict:
Returns: {relative_path: file_id}
"""
path_to_id = {}
for i in range(0, len(files_list), BATCH_SIZE):
chunk = files_list[i:i + BATCH_SIZE]
for f in chunk:
for f in files_list:
cur.execute(
"""INSERT INTO files
(relative_path, file_name, directory, file_size, mtime,
@@ -89,15 +92,7 @@ def batch_insert_files(cur, files_list: list, run_id: int) -> dict:
(f["relative_path"], f["file_name"], f["directory"],
f["size"], f["mtime"], f["content_hash"], run_id, run_id)
)
# Fetch real IDs
paths = [f["relative_path"] for f in chunk]
placeholders = ",".join(["%s"] * len(paths))
cur.execute(
f"SELECT id, relative_path FROM files WHERE relative_path IN ({placeholders})",
paths,
)
for row in cur.fetchall():
path_to_id[row[1]] = row[0]
path_to_id[f["relative_path"]] = cur.lastrowid
return path_to_id
+20
View File
@@ -1,4 +1,5 @@
import ctypes
import time
from blake3 import blake3
@@ -17,6 +18,25 @@ def is_cloud_placeholder(path: str) -> bool:
return bool(attrs & _CLOUD_MASK)
def hydrate_file(path: str, timeout: int = 120, poll: int = 3) -> bool:
    """Try to get a cloud placeholder downloaded by opening and reading it.

    The file is opened and a single byte is read to trigger the download,
    then ``is_cloud_placeholder(path)`` is polled until it reports that the
    file is no longer a placeholder or the timeout expires.

    Args:
        path: Path of the file to hydrate.
        timeout: Maximum number of seconds to wait for the download.
        poll: Seconds to sleep between placeholder checks.

    Returns:
        True if the file stopped being a placeholder within the timeout,
        False otherwise.
    """
    try:
        with open(path, "rb") as f:
            f.read(1)
    except OSError:
        # Best effort: a failed read is not fatal here, the polling below
        # decides whether the file actually became available.
        pass

    # Monotonic clock: the wait must not be stretched or cut short when the
    # system wall clock is adjusted while we are polling.
    deadline = time.monotonic() + timeout
    while True:
        # Check before looking at the clock so the state is examined at
        # least once (even with timeout <= 0) and once more at the deadline.
        if not is_cloud_placeholder(path):
            return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        # Never sleep past the deadline.
        time.sleep(min(poll, remaining))
def blake3_file(path, chunk_size=1024 * 1024):
"""Spočítá BLAKE3 hash souboru po blocích (bez načtení do paměti)."""
h = blake3()
+34
View File
@@ -0,0 +1,34 @@
import logging
import os
import sys
from logging.handlers import TimedRotatingFileHandler
from indexer.config import LOG_LEVEL, LOG_DIR
def setup_logging() -> logging.Logger:
    """Configure root logging to a daily-rotated file plus stdout.

    Creates ``LOG_DIR`` if it does not exist, then replaces the handlers
    currently attached to the root logger with:

    * a ``TimedRotatingFileHandler`` writing ``backup.log`` in ``LOG_DIR``,
      rotated at midnight with 90 rotated files kept, UTF-8 encoded;
    * a ``StreamHandler`` on ``sys.stdout``.

    Both handlers share the same timestamped format. The root level is taken
    from ``LOG_LEVEL``; anything that is not a numeric level defined by the
    ``logging`` module falls back to ``INFO``.

    Returns:
        The logger named ``"backup"``.
    """
    os.makedirs(LOG_DIR, exist_ok=True)

    # getattr() on the logging module can also return functions or classes
    # for some strings; only a real numeric level may reach setLevel().
    level = getattr(logging, LOG_LEVEL.strip().upper(), None)
    if not isinstance(level, int):
        level = logging.INFO

    fmt = logging.Formatter(
        "%(asctime)s [%(levelname)-8s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    file_handler = TimedRotatingFileHandler(
        os.path.join(LOG_DIR, "backup.log"),
        when="midnight",
        backupCount=90,
        encoding="utf-8",
    )
    file_handler.setFormatter(fmt)

    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(fmt)

    root = logging.root
    root.setLevel(level)
    # Remove AND close the previous handlers: merely clearing the list would
    # leave an earlier backup.log handle open if this function runs twice.
    for old_handler in list(root.handlers):
        root.removeHandler(old_handler)
        old_handler.close()
    root.addHandler(file_handler)
    root.addHandler(console_handler)

    return logging.getLogger("backup")
+3 -2
View File
@@ -1,4 +1,5 @@
import os
import unicodedata
from datetime import datetime
@@ -18,8 +19,8 @@ def scan_files(root_path: str) -> dict:
stat = os.stat(full_path)
except (FileNotFoundError, PermissionError):
continue
rel_path = os.path.relpath(full_path, root_path).replace("\\", "/")
rel_dir = os.path.relpath(root, root_path).replace("\\", "/")
rel_path = unicodedata.normalize("NFC", os.path.relpath(full_path, root_path).replace("\\", "/"))
rel_dir = unicodedata.normalize("NFC", os.path.relpath(root, root_path).replace("\\", "/"))
# Truncate microseconds — MySQL DATETIME rounds to whole seconds,
# which causes false "modified" detections on every run.
mtime = datetime.fromtimestamp(stat.st_mtime).replace(microsecond=0)
+56 -66
View File
@@ -2,6 +2,7 @@ import os
from datetime import datetime
from indexer.config import ROOT_PATH, ROOT_NAME, DRY_RUN, BACKUP_PATH
from indexer.logger import setup_logging
from indexer.scanner import scan_files
from indexer.hasher import blake3_file
from indexer.db import (
@@ -11,42 +12,43 @@ from indexer.db import (
)
from indexer.events import batch_log_events
from indexer.backup import ensure_backed_up
from indexer.hasher import is_cloud_placeholder
from indexer.hasher import is_cloud_placeholder, hydrate_file
def main():
print("=" * 60)
print("ORDINACE DROPBOX BACKUP INDEXER")
print(f"Root : {ROOT_PATH}")
print(f"Backup : {BACKUP_PATH}")
print(f"DRY RUN : {DRY_RUN}")
print("=" * 60)
log = setup_logging()
log.info("=" * 60)
log.info("ORDINACE DROPBOX BACKUP INDEXER")
log.info(f"Root : {ROOT_PATH}")
log.info(f"Backup : {BACKUP_PATH}")
log.info(f"DRY RUN : {DRY_RUN}")
log.info("=" * 60)
# ── 1. Scan filesystem (fast, no hashing) ──
print("\n[1/7] Scanning filesystem...")
log.info("[1/7] Scanning filesystem...")
fs = scan_files(ROOT_PATH)
print(f" Found {len(fs)} files on disk.")
log.info(f" Found {len(fs)} files on disk.")
if DRY_RUN:
# V DRY_RUN režimu jen ukážeme co by se stalo
print("\n[DRY RUN] No DB connection, showing scan results only.")
print(f" Files on disk: {len(fs)}")
log.info("[DRY RUN] No DB connection, showing scan results only.")
log.info(f" Files on disk: {len(fs)}")
return
# ── 2. Connect & create run ──
conn = get_connection()
cur = conn.cursor()
run_id = create_run(cur)
print(f"\n[2/7] Run #{run_id} created.")
log.info(f"[2/7] Run #{run_id} created.")
try:
# ── 3. Load DB state ──
print("[3/7] Loading DB state...")
log.info("[3/7] Loading DB state...")
db = load_all_files(cur)
print(f" {len(db)} files in DB (exists_now=1).")
log.info(f" {len(db)} files in DB (exists_now=1).")
# ── 4. Diff ──
print("[4/7] Diffing...")
log.info("[4/7] Diffing...")
fs_paths = set(fs.keys())
db_paths = set(db.keys())
@@ -64,11 +66,11 @@ def main():
else:
unchanged_paths.add(p)
print(f" NEW: {len(new_paths)} MOD: {len(modified_paths)} "
log.info(f" NEW: {len(new_paths)} MOD: {len(modified_paths)} "
f"DEL: {len(deleted_paths)} SAME: {len(unchanged_paths)}")
# ── 5. Process changes ──
print("[5/7] Processing changes...")
log.info("[5/7] Processing changes...")
events = []
files_to_backup = []
@@ -76,19 +78,21 @@ def main():
skipped_files = []
new_files = []
if new_paths:
print(f" Hashing {len(new_paths)} new files...")
new_files = []
log.info(f" Hashing {len(new_paths)} new files...")
for p in new_paths:
f = fs[p]
if is_cloud_placeholder(f["full_path"]):
reason = "not synced (cloud placeholder)"
print(f" WARN: skip {p}: {reason}")
log.warning(f" PLACEHOLDER {p} — čekám na stažení...")
if not hydrate_file(f["full_path"]):
reason = "not synced (cloud placeholder, hydration timeout)"
log.warning(f" SKIP {p}: {reason}")
skipped_files.append((p, reason))
continue
log.info(f" OK hydrated: {p}")
try:
content_hash = blake3_file(f["full_path"])
except (FileNotFoundError, PermissionError, OSError) as e:
print(f" WARN: skip {p}: {e}")
log.warning(f" SKIP {p}: {e}")
skipped_files.append((p, str(e)))
continue
new_files.append({
@@ -114,7 +118,7 @@ def main():
# 5b) MODIFIED files — compute BLAKE3, batch UPDATE
if modified_paths:
print(f" Hashing {len(modified_paths)} modified files...")
log.info(f" Hashing {len(modified_paths)} modified files...")
mod_files = []
for p in modified_paths:
f = fs[p]
@@ -122,7 +126,7 @@ def main():
try:
content_hash = blake3_file(f["full_path"])
except (FileNotFoundError, PermissionError, OSError) as e:
print(f" WARN: skip {p}: {e}")
log.warning(f" SKIP {p}: {e}")
continue
mod_files.append({
"id": db_file["id"],
@@ -168,16 +172,16 @@ def main():
# ── 6. Backup ──
if files_to_backup and BACKUP_PATH:
print(f"[6/7] Backing up {len(files_to_backup)} files...")
log.info(f"[6/7] Backing up {len(files_to_backup)} files...")
backed = ensure_backed_up(files_to_backup, BACKUP_PATH)
print(f" {backed} new blobs written.")
log.info(f" {backed} new blobs written.")
else:
print("[6/7] Nothing to backup.")
log.info("[6/7] Nothing to backup.")
# ── 7. Finalize ──
stats = {
"total": len(fs),
"new": len(new_files) if new_paths else 0,
"new": len(new_files),
"modified": len(modified_paths),
"deleted": len(deleted_paths),
"unchanged": len(unchanged_paths),
@@ -185,10 +189,10 @@ def main():
}
finalize_run(cur, run_id, stats)
conn.commit()
print(f"[7/7] Run #{run_id} COMPLETED.")
log.info(f"[7/7] Run #{run_id} COMPLETED.")
except Exception as e:
print(f"\nERROR: {e}")
log.exception(f"FATAL ERROR: {e}")
try:
fail_run(cur, run_id)
conn.commit()
@@ -200,19 +204,19 @@ def main():
conn.close()
# ── Summary ──
print("\n" + "=" * 60)
print(f"Total : {stats['total']}")
print(f"New : {stats['new']}")
print(f"Modified : {stats['modified']}")
print(f"Deleted : {stats['deleted']}")
print(f"Unchanged: {stats['unchanged']}")
log.info("=" * 60)
log.info(f"Total : {stats['total']}")
log.info(f"New : {stats['new']}")
log.info(f"Modified : {stats['modified']}")
log.info(f"Deleted : {stats['deleted']}")
log.info(f"Unchanged: {stats['unchanged']}")
if skipped_files:
print(f"Skipped : {len(skipped_files)} (hash failed)")
print("-" * 60)
log.warning(f"Skipped : {len(skipped_files)} (hash failed)")
log.info("-" * 60)
for path, reason in skipped_files:
print(f" SKIP: {path}")
print(f" {reason}")
print("=" * 60)
log.warning(f" SKIP: {path}")
log.warning(f" {reason}")
log.info("=" * 60)
# ── 8. Generate Excel report ──
report_path = None
@@ -225,10 +229,10 @@ def main():
os.remove(os.path.join(report_dir, f))
timestamp = datetime.now().strftime("%Y-%m-%d %H_%M")
report_path = os.path.join(report_dir, f"{timestamp} DropboxBackupReport.xlsx")
print(f"\n[8] Generating report...")
log.info("[8] Generating report...")
generate_report(report_path)
except Exception as e:
print(f" WARN: Report generation failed: {e}")
log.warning(f"Report generation failed: {e}")
# ── 9. Send email notification ──
try:
@@ -252,23 +256,6 @@ def main():
{rows}
</table>"""
def _file_section(title, color, paths):
if not paths:
return ""
rows = "".join(f"<tr><td style='padding:2px 8px;font-size:12px;'>{p}</td></tr>" for p in sorted(paths))
return f"""
<h3 style="color:{color};margin-top:18px;">{title} ({len(paths)})</h3>
<table border="0" cellpadding="2" cellspacing="0" style="border-collapse:collapse;width:100%;font-family:monospace;">
{rows}
</table>"""
new_paths_ok = [nf["relative_path"] for nf in new_files]
files_detail = (
_file_section("&#10003; Nove soubory", "#2a7a2a", new_paths_ok)
+ _file_section("&#9998; Zmenene soubory", "#a07000", list(modified_paths))
+ _file_section("&#10007; Smazane soubory", "#a00000", list(deleted_paths))
)
body = f"""
<html><body style="font-family:Segoe UI,Arial,sans-serif;font-size:14px;color:#222;">
<h2 style="color:#2e6da4;">&#10003; Dropbox Ordinace Backup &ndash; {ts}</h2>
@@ -283,16 +270,19 @@ def main():
{skipped_row}
{report_line}
</table>
{files_detail}
{skipped_detail}
<p style="color:#888;font-size:12px;margin-top:20px;">REPORTER &bull; {ts}</p>
</body></html>
"""
subject = f"Dropbox Backup #{run_id} \u2013 {ts} ({changes} zmen)"
send_mail("vladimir.buzalka@buzalka.cz", subject, body, html=True)
print(f"\n[9] Email odeslan na vladimir.buzalka@buzalka.cz")
subject = f"Dropbox Backup #{run_id} {ts} ({changes} zmen)"
from indexer.config import LOG_DIR
log_file = os.path.join(LOG_DIR, "backup.log")
attachments = [log_file] if os.path.exists(log_file) else []
send_mail("vladimir.buzalka@buzalka.cz", subject, body, html=True, attachments=attachments)
log.info("[9] Email odeslan na vladimir.buzalka@buzalka.cz")
except Exception as e:
print(f" WARN: Email failed: {e}")
log.warning(f"Email failed: {e}")
if __name__ == "__main__":