diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 1c731c7..59ad312 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -12,7 +12,15 @@ "Bash(C:Pythonpython.exe -c \"import pymysql; conn = pymysql.connect\\(host=''192.168.1.76'', port=3306, user=''root'', password='''', db=''OrdinaceDropBoxBackup''\\); cur = conn.cursor\\(\\); cur.execute\\(''SELECT * FROM runs''\\); print\\(cur.fetchall\\(\\)\\); cur.execute\\(''SELECT COUNT\\(*\\) FROM files''\\); print\\(''files count:'', cur.fetchone\\(\\)\\); conn.close\\(\\)\")", "Bash(/c/Python/python.exe -c \"import pymysql; conn = pymysql.connect\\(host=''192.168.1.76'', port=3306, user=''root'', password='''', db=''OrdinaceDropBoxBackup''\\); cur = conn.cursor\\(\\); cur.execute\\(''SELECT * FROM runs''\\); print\\(''RUNS:'', cur.fetchall\\(\\)\\); cur.execute\\(''SELECT COUNT\\(*\\) FROM files''\\); print\\(''FILES count:'', cur.fetchone\\(\\)\\); conn.close\\(\\)\")", "Bash(/c/Python/python.exe -c \"import pymysql; conn = pymysql.connect\\(host=''192.168.1.76'', port=3306, user=''root'', password=''Vlado9674+'', db=''OrdinaceDropBoxBackup''\\); cur = conn.cursor\\(\\); cur.execute\\(''SELECT * FROM runs''\\); rows = cur.fetchall\\(\\); print\\(''RUNS:''\\); [print\\(r\\) for r in rows]; cur.execute\\(''SELECT COUNT\\(*\\) FROM files''\\); print\\(''FILES count:'', cur.fetchone\\(\\)[0]\\); cur.execute\\(''SELECT COUNT\\(*\\) FROM file_events''\\); print\\(''EVENTS count:'', cur.fetchone\\(\\)[0]\\); conn.close\\(\\)\")", - "Bash(/c/Python/python.exe:*)" + "Bash(/c/Python/python.exe:*)", + "Bash(cd \"u:\\\\OnedriveOrdinace\\\\OneDrive\\\\DropBoxBackupClaude\"\" && powershell -Command \"Get-ChildItem -Recurse -Filter '*.blob')", + "Bash(Measure-Object:*)", + "Bash(Select-Object -ExpandProperty Count \")", + "Bash(powershell:*)", + "Bash(\"C:\\\\Python\\\\python.exe\" -m pip list)", + "Bash(findstr:*)", + "Bash(ls:*)", + "Bash(C:Pythonpython.exe -m pip install pyzipper)" 
] } } diff --git a/backup_report.xlsx b/backup_report.xlsx deleted file mode 100644 index 81709f3..0000000 Binary files a/backup_report.xlsx and /dev/null differ diff --git a/indexer/backup.py b/indexer/backup.py index 579555b..b4a99c2 100644 --- a/indexer/backup.py +++ b/indexer/backup.py @@ -1,21 +1,24 @@ import os -import shutil import tempfile +import pyzipper + +from indexer.config import BACKUP_PASSWORD def blob_path(backup_root: str, content_hash: bytes) -> str: - """Vrátí cestu k blob souboru: BACKUP/ab/cd/abcdef...blob""" + """Vrátí cestu k ZIP souboru: BACKUP/ab/cd/abcdef...zip""" hex_hash = content_hash.hex() - return os.path.join(backup_root, hex_hash[:2], hex_hash[2:4], hex_hash + ".blob") + return os.path.join(backup_root, hex_hash[:2], hex_hash[2:4], hex_hash + ".zip") def ensure_backed_up(files_with_hash: list, backup_root: str) -> int: """ - Zkopíruje soubory do content-addressable storage. + Vytvoří AES-256 šifrovaný ZIP pro každý soubor v content-addressable storage. files_with_hash: [(full_path, content_hash_bytes), ...] - Přeskočí soubory, jejichž blob už existuje (deduplikace). + Přeskočí soubory, jejichž zip už existuje (deduplikace). Returns: počet nově zálohovaných souborů. 
""" + password = BACKUP_PASSWORD.encode("utf-8") backed_up = 0 for full_path, content_hash in files_with_hash: target = blob_path(backup_root, content_hash) @@ -25,17 +28,26 @@ def ensure_backed_up(files_with_hash: list, backup_root: str) -> int: target_dir = os.path.dirname(target) os.makedirs(target_dir, exist_ok=True) + tmp_path = None try: # Atomický zápis: temp soubor + přejmenování fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix=".tmp") os.close(fd) - shutil.copy2(full_path, tmp_path) + + hex_hash = content_hash.hex() + with pyzipper.AESZipFile( + tmp_path, "w", + compression=pyzipper.ZIP_DEFLATED, + encryption=pyzipper.WZ_AES, + ) as zf: + zf.setpassword(password) + zf.write(full_path, arcname=hex_hash + ".blob") + os.replace(tmp_path, target) backed_up += 1 except (FileNotFoundError, PermissionError, OSError) as e: print(f" WARN: backup failed for {full_path}: {e}") - # Uklidíme temp soubor pokud existuje - if os.path.exists(tmp_path): + if tmp_path and os.path.exists(tmp_path): os.remove(tmp_path) continue diff --git a/indexer/config.py b/indexer/config.py index bdb01d0..55814c1 100644 --- a/indexer/config.py +++ b/indexer/config.py @@ -24,6 +24,7 @@ DB_CONFIG = { ROOT_PATH = os.getenv("ROOT_PATH") ROOT_NAME = os.getenv("ROOT_NAME", "ORDINACE") BACKUP_PATH = os.getenv("BACKUP_PATH") +BACKUP_PASSWORD = os.getenv("BACKUP_PASSWORD") # ========================= # Behaviour diff --git a/migrate_to_zip.py b/migrate_to_zip.py new file mode 100644 index 0000000..697199b --- /dev/null +++ b/migrate_to_zip.py @@ -0,0 +1,134 @@ +""" +One-time migration: convert plain .bak backup blobs to AES-256 encrypted .zip files. + +Usage: python migrate_to_zip.py + +Walks BACKUP_PATH, finds all .bak files, creates encrypted .zip for each, +then deletes the original .bak. Resumable: skips files where .zip already exists. 
+""" + +import os +import sys +import time +import pyzipper +from indexer.config import BACKUP_PATH, BACKUP_PASSWORD + + +def collect_bak_files(backup_root: str) -> list: + """Walk backup dir and collect all .bak file paths.""" + bak_files = [] + for dirpath, _dirnames, filenames in os.walk(backup_root): + for fn in filenames: + if fn.endswith(".bak"): + bak_files.append(os.path.join(dirpath, fn)) + return bak_files + + +def migrate(backup_root: str, password: str): + print(f"Backup dir: {backup_root}") + print("Scanning for .bak files...") + bak_files = collect_bak_files(backup_root) + total = len(bak_files) + print(f"Found {total} .bak files to migrate.\n") + + if total == 0: + print("Nothing to migrate.") + return + + password_bytes = password.encode("utf-8") + converted = 0 + skipped = 0 + errors = 0 + start_time = time.time() + + try: + for i, bak_path in enumerate(bak_files, 1): + # Derive the .zip path from the .bak path + # e.g., ab/cd/abcdef...64hex.bak -> ab/cd/abcdef...64hex.zip + base = bak_path[:-4] # strip ".bak" + zip_path = base + ".zip" + hex_hash = os.path.basename(base) # the 64-char hex name + + # Resume support: skip if .zip already exists + if os.path.exists(zip_path): + skipped += 1 + if i % 500 == 0 or i == total: + elapsed = time.time() - start_time + print(f" [{i}/{total}] ({100*i//total}%) " + f"converted={converted} skipped={skipped} errors={errors} " + f"elapsed={elapsed:.0f}s") + continue + + try: + # Create encrypted zip in a temp file, then rename + tmp_path = zip_path + ".tmp" + with pyzipper.AESZipFile( + tmp_path, "w", + compression=pyzipper.ZIP_DEFLATED, + encryption=pyzipper.WZ_AES, + ) as zf: + zf.setpassword(password_bytes) + zf.write(bak_path, arcname=hex_hash + ".blob") + + os.replace(tmp_path, zip_path) + + # Verify the zip is valid before deleting original + with pyzipper.AESZipFile(zip_path, "r") as zf: + zf.setpassword(password_bytes) + names = zf.namelist() + if not names: + raise ValueError("ZIP is empty after 
creation") + + # Delete original .bak + os.remove(bak_path) + converted += 1 + + except Exception as e: + print(f" ERROR: {bak_path}: {e}") + errors += 1 + # Clean up temp file if it exists + if os.path.exists(zip_path + ".tmp"): + try: + os.remove(zip_path + ".tmp") + except OSError: + pass + continue + + # Progress every 500 files + if i % 500 == 0 or i == total: + elapsed = time.time() - start_time + rate = converted / elapsed if elapsed > 0 else 0 + eta = (total - i) / rate if rate > 0 else 0 + print(f" [{i}/{total}] ({100*i//total}%) " + f"converted={converted} skipped={skipped} errors={errors} " + f"elapsed={elapsed:.0f}s ETA={eta:.0f}s") + + except KeyboardInterrupt: + print(f"\n\nInterrupted by user at file {i}/{total}.") + print("Migration is resumable — run again to continue.") + + elapsed = time.time() - start_time + print(f"\n{'='*60}") + print(f"Migration complete.") + print(f" Total .bak files : {total}") + print(f" Converted : {converted}") + print(f" Skipped (exists) : {skipped}") + print(f" Errors : {errors}") + print(f" Time : {elapsed:.0f}s") + print(f"{'='*60}") + + +if __name__ == "__main__": + if not BACKUP_PATH or not os.path.isdir(BACKUP_PATH): + print(f"ERROR: BACKUP_PATH is not a valid directory: {BACKUP_PATH}") + sys.exit(1) + if not BACKUP_PASSWORD: + print("ERROR: BACKUP_PASSWORD not set in .env") + sys.exit(1) + + print("=" * 60) + print("MIGRATION: .bak -> encrypted .zip") + print(f"Backup dir: {BACKUP_PATH}") + print("=" * 60) + + migrate(BACKUP_PATH, BACKUP_PASSWORD) diff --git a/reconcile.py b/reconcile.py new file mode 100644 index 0000000..e58b5b2 --- /dev/null +++ b/reconcile.py @@ -0,0 +1,113 @@ +""" +reconcile.py — Cross-check MySQL content_hash values against .zip files on disk. + +Reports: + 1. DB hashes with no blob on disk (missing backups) + 2. Blob files on disk with no matching DB record (orphan blobs) + 3. 
Summary stats +""" + +import os +import sys + +from indexer.config import BACKUP_PATH +from indexer.db import get_connection + + +def collect_disk_hashes(backup_root: str) -> set: + """Walk backup dir and collect all hex hashes from .zip filenames.""" + hashes = set() + for dirpath, _dirnames, filenames in os.walk(backup_root): + for fn in filenames: + if fn.endswith(".zip"): + hex_hash = fn[:-4] # strip ".zip" + if len(hex_hash) == 64: + hashes.add(hex_hash) + else: + print(f" WARN: unexpected zip name: {os.path.join(dirpath, fn)}") + return hashes + + +def collect_db_hashes(conn) -> set: + """Fetch all distinct non-NULL content_hash values from files table.""" + with conn.cursor() as cur: + cur.execute("SELECT DISTINCT HEX(content_hash) FROM files WHERE content_hash IS NOT NULL") + return {row[0].lower() for row in cur.fetchall()} + + +def main(): + if not BACKUP_PATH or not os.path.isdir(BACKUP_PATH): + print(f"ERROR: BACKUP_PATH is not a valid directory: {BACKUP_PATH}") + sys.exit(1) + + print(f"Backup dir : {BACKUP_PATH}") + print("Scanning disk blobs...") + disk_hashes = collect_disk_hashes(BACKUP_PATH) + print(f" Found {len(disk_hashes)} blob files on disk.") + + print("Loading DB hashes...") + conn = get_connection() + try: + db_hashes = collect_db_hashes(conn) + finally: + conn.close() + print(f" Found {len(db_hashes)} distinct hashes in DB.") + + # --- Options --- + # PURGE_ORPHANS = True # uncomment to delete orphan blobs + PURGE_ORPHANS = False + + # --- Reconcile --- + missing_on_disk = db_hashes - disk_hashes + orphans_on_disk = disk_hashes - db_hashes + matched = db_hashes & disk_hashes + + print() + print("=== Reconciliation Results ===") + print(f" Matched (DB + disk) : {len(matched)}") + print(f" Missing on disk : {len(missing_on_disk)}") + print(f" Orphan blobs (no DB) : {len(orphans_on_disk)}") + + if missing_on_disk: + print(f"\n--- Missing on disk ({len(missing_on_disk)}) ---") + conn = get_connection() + try: + with conn.cursor() as cur: + for h
in sorted(missing_on_disk): + cur.execute( + "SELECT relative_path FROM files WHERE content_hash = UNHEX(%s) LIMIT 5", + (h,) + ) + paths = [row[0] for row in cur.fetchall()] + print(f" {h} -> {paths}") + finally: + conn.close() + + if orphans_on_disk: + print(f"\n--- Orphan blobs ({len(orphans_on_disk)}) ---") + total_orphan_bytes = 0 + for h in sorted(orphans_on_disk): + blob = os.path.join(BACKUP_PATH, h[:2], h[2:4], h + ".zip") + size = os.path.getsize(blob) if os.path.exists(blob) else 0 + total_orphan_bytes += size if isinstance(size, int) else 0 + print(f" {h} ({size} bytes)") + print(f" Total orphan size: {total_orphan_bytes / 1024 / 1024:.1f} MB") + + if PURGE_ORPHANS: + print("\n PURGING orphan blobs...") + purged = 0 + for h in sorted(orphans_on_disk): + blob = os.path.join(BACKUP_PATH, h[:2], h[2:4], h + ".zip") + try: + os.remove(blob) + purged += 1 + except OSError as e: + print(f" WARN: could not delete {blob}: {e}") + print(f" Purged {purged}/{len(orphans_on_disk)} orphan blobs.") + + if not missing_on_disk and not orphans_on_disk: + print("\nAll clean — DB and disk are in sync.") + + +if __name__ == "__main__": + main() diff --git a/recovery.py b/recovery.py index 3cdd449..cbfcc84 100644 --- a/recovery.py +++ b/recovery.py @@ -11,8 +11,8 @@ the original directory structure. 
import os import sys -import shutil -from indexer.config import DB_CONFIG, BACKUP_PATH +import pyzipper +from indexer.config import BACKUP_PATH, BACKUP_PASSWORD from indexer.db import get_connection from indexer.backup import blob_path @@ -37,21 +37,37 @@ def recover(run_id: int, output_dir: str): print(f"Recovering {len(rows)} files from run #{run_id} to {output_dir}") recovered = 0 missing = 0 + password = BACKUP_PASSWORD.encode("utf-8") for relative_path, content_hash in rows: source = blob_path(BACKUP_PATH, content_hash) target = os.path.join(output_dir, relative_path.replace("/", os.sep)) if not os.path.exists(source): - print(f" MISSING blob: {content_hash.hex()} for {relative_path}") + print(f" MISSING zip: {content_hash.hex()} for {relative_path}") missing += 1 continue os.makedirs(os.path.dirname(target), exist_ok=True) - shutil.copy2(source, target) - recovered += 1 - print(f"\nRecovered: {recovered} Missing blobs: {missing}") + try: + with pyzipper.AESZipFile(source, "r") as zf: + zf.setpassword(password) + names = zf.namelist() + if not names: + print(f" WARN: empty zip: {source}") + missing += 1 + continue + data = zf.read(names[0]) + with open(target, "wb") as f: + f.write(data) + recovered += 1 + except Exception as e: + print(f" ERROR extracting {source} for {relative_path}: {e}") + missing += 1 + continue + + print(f"\nRecovered: {recovered} Missing/errors: {missing}") if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index 62c1860..dd8f7ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ pymysql blake3 python-dotenv +pyzipper