This commit is contained in:
2026-02-12 07:38:16 +01:00
parent f9082f1e5b
commit 42cd021b9c
8 changed files with 300 additions and 15 deletions

View File

@@ -12,7 +12,15 @@
"Bash(C:Pythonpython.exe -c \"import pymysql; conn = pymysql.connect\\(host=''192.168.1.76'', port=3306, user=''root'', password='''', db=''OrdinaceDropBoxBackup''\\); cur = conn.cursor\\(\\); cur.execute\\(''SELECT * FROM runs''\\); print\\(cur.fetchall\\(\\)\\); cur.execute\\(''SELECT COUNT\\(*\\) FROM files''\\); print\\(''files count:'', cur.fetchone\\(\\)\\); conn.close\\(\\)\")",
"Bash(/c/Python/python.exe -c \"import pymysql; conn = pymysql.connect\\(host=''192.168.1.76'', port=3306, user=''root'', password='''', db=''OrdinaceDropBoxBackup''\\); cur = conn.cursor\\(\\); cur.execute\\(''SELECT * FROM runs''\\); print\\(''RUNS:'', cur.fetchall\\(\\)\\); cur.execute\\(''SELECT COUNT\\(*\\) FROM files''\\); print\\(''FILES count:'', cur.fetchone\\(\\)\\); conn.close\\(\\)\")",
"Bash(/c/Python/python.exe -c \"import pymysql; conn = pymysql.connect\\(host=''192.168.1.76'', port=3306, user=''root'', password=''Vlado9674+'', db=''OrdinaceDropBoxBackup''\\); cur = conn.cursor\\(\\); cur.execute\\(''SELECT * FROM runs''\\); rows = cur.fetchall\\(\\); print\\(''RUNS:''\\); [print\\(r\\) for r in rows]; cur.execute\\(''SELECT COUNT\\(*\\) FROM files''\\); print\\(''FILES count:'', cur.fetchone\\(\\)[0]\\); cur.execute\\(''SELECT COUNT\\(*\\) FROM file_events''\\); print\\(''EVENTS count:'', cur.fetchone\\(\\)[0]\\); conn.close\\(\\)\")",
"Bash(/c/Python/python.exe:*)"
"Bash(/c/Python/python.exe:*)",
"Bash(cd \"u:\\\\OnedriveOrdinace\\\\OneDrive\\\\DropBoxBackupClaude\"\" && powershell -Command \"Get-ChildItem -Recurse -Filter '*.blob')",
"Bash(Measure-Object:*)",
"Bash(Select-Object -ExpandProperty Count \")",
"Bash(powershell:*)",
"Bash(\"C:\\\\Python\\\\python.exe\" -m pip list)",
"Bash(findstr:*)",
"Bash(ls:*)",
"Bash(C:Pythonpython.exe -m pip install pyzipper)"
]
}
}

Binary file not shown.

View File

@@ -1,21 +1,24 @@
import os
import shutil
import tempfile
import pyzipper
from indexer.config import BACKUP_PASSWORD
def blob_path(backup_root: str, content_hash: bytes) -> str:
"""Vrátí cestu k blob souboru: BACKUP/ab/cd/abcdef...blob"""
"""Vrátí cestu k ZIP souboru: BACKUP/ab/cd/abcdef...zip"""
hex_hash = content_hash.hex()
return os.path.join(backup_root, hex_hash[:2], hex_hash[2:4], hex_hash + ".blob")
return os.path.join(backup_root, hex_hash[:2], hex_hash[2:4], hex_hash + ".zip")
def ensure_backed_up(files_with_hash: list, backup_root: str) -> int:
"""
Zkopíruje soubory do content-addressable storage.
Vytvoří AES-256 šifrovaný ZIP pro každý soubor v content-addressable storage.
files_with_hash: [(full_path, content_hash_bytes), ...]
Přeskočí soubory, jejichž blob už existuje (deduplikace).
Přeskočí soubory, jejichž zip už existuje (deduplikace).
Returns: počet nově zálohovaných souborů.
"""
password = BACKUP_PASSWORD.encode("utf-8")
backed_up = 0
for full_path, content_hash in files_with_hash:
target = blob_path(backup_root, content_hash)
@@ -25,17 +28,26 @@ def ensure_backed_up(files_with_hash: list, backup_root: str) -> int:
target_dir = os.path.dirname(target)
os.makedirs(target_dir, exist_ok=True)
tmp_path = None
try:
# Atomický zápis: temp soubor + přejmenování
fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
os.close(fd)
shutil.copy2(full_path, tmp_path)
hex_hash = content_hash.hex()
with pyzipper.AESZipFile(
tmp_path, "w",
compression=pyzipper.ZIP_DEFLATED,
encryption=pyzipper.WZ_AES,
) as zf:
zf.setpassword(password)
zf.write(full_path, arcname=hex_hash + ".blob")
os.replace(tmp_path, target)
backed_up += 1
except (FileNotFoundError, PermissionError, OSError) as e:
print(f" WARN: backup failed for {full_path}: {e}")
# Uklidíme temp soubor pokud existuje
if os.path.exists(tmp_path):
if tmp_path and os.path.exists(tmp_path):
os.remove(tmp_path)
continue

View File

@@ -24,6 +24,7 @@ DB_CONFIG = {
ROOT_PATH = os.getenv("ROOT_PATH")
ROOT_NAME = os.getenv("ROOT_NAME", "ORDINACE")
BACKUP_PATH = os.getenv("BACKUP_PATH")
BACKUP_PASSWORD = os.getenv("BACKUP_PASSWORD")
# =========================
# Behaviour

134
migrate_to_zip.py Normal file
View File

@@ -0,0 +1,134 @@
"""
One-time migration: convert plain .bak backup blobs to AES-256 encrypted .zip files.
Usage: python migrate_to_zip.py
Walks BACKUP_PATH, finds all .bak files, creates encrypted .zip for each,
then deletes the original .bak. Resumable: skips files where .zip already exists.
"""
import os
import sys
import time
import pyzipper
from indexer.config import BACKUP_PATH, BACKUP_PASSWORD
def collect_bak_files(backup_root: str) -> list:
    """Recursively gather the path of every .bak file under *backup_root*."""
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(backup_root)
        for name in names
        if name.endswith(".bak")
    ]
def migrate(backup_root: str, password: str):
    """Convert every .bak file under *backup_root* into an AES-256 encrypted .zip.

    Resumable: files whose .zip already exists are skipped, so the script can
    simply be re-run after an interruption. Each original .bak is deleted only
    after the freshly written .zip has been verified.

    Args:
        backup_root: Root directory of the content-addressable backup store.
        password: Plaintext ZIP password (encoded to UTF-8 internally).
    """
    # NOTE(review): backup.py historically wrote ".blob" files — confirm that
    # ".bak" really is the extension present on disk before running this.
    print(f"Backup dir: {backup_root}")
    print("Scanning for .bak files...")
    bak_files = collect_bak_files(backup_root)
    total = len(bak_files)
    print(f"Found {total} .bak files to migrate.\n")
    if total == 0:
        print("Nothing to migrate.")
        return

    password_bytes = password.encode("utf-8")
    converted = 0
    skipped = 0
    errors = 0
    start_time = time.time()
    # Pre-bind i so the KeyboardInterrupt handler cannot hit a NameError when
    # the interrupt arrives before the first loop iteration.
    i = 0
    try:
        for i, bak_path in enumerate(bak_files, 1):
            # Derive the .zip path from the .bak path
            # e.g., ab/cd/abcdef...64hex.bak -> ab/cd/abcdef...64hex.zip
            base = bak_path[:-4]  # strip ".bak"
            zip_path = base + ".zip"
            hex_hash = os.path.basename(base)  # the 64-char hex name
            # Resume support: skip if .zip already exists
            if os.path.exists(zip_path):
                skipped += 1
                if i % 500 == 0 or i == total:
                    elapsed = time.time() - start_time
                    print(f" [{i}/{total}] ({100*i//total}%) "
                          f"converted={converted} skipped={skipped} errors={errors} "
                          f"elapsed={elapsed:.0f}s")
                continue
            try:
                # Create encrypted zip in a temp file, then rename atomically.
                tmp_path = zip_path + ".tmp"
                with pyzipper.AESZipFile(
                    tmp_path, "w",
                    compression=pyzipper.ZIP_DEFLATED,
                    encryption=pyzipper.WZ_AES,
                ) as zf:
                    zf.setpassword(password_bytes)
                    zf.write(bak_path, arcname=hex_hash + ".blob")
                os.replace(tmp_path, zip_path)
                # Verify the zip before deleting the original. testzip() reads
                # every member and checks its CRC, catching truncated/corrupt
                # writes that a bare namelist() check would miss.
                with pyzipper.AESZipFile(zip_path, "r") as zf:
                    zf.setpassword(password_bytes)
                    if not zf.namelist():
                        raise ValueError("ZIP is empty after creation")
                    bad_member = zf.testzip()
                    if bad_member is not None:
                        raise ValueError(f"CRC check failed for {bad_member}")
                # Only now is it safe to drop the original .bak.
                os.remove(bak_path)
                converted += 1
            except Exception as e:
                print(f" ERROR: {bak_path}: {e}")
                errors += 1
                # Clean up temp file if it exists
                if os.path.exists(zip_path + ".tmp"):
                    try:
                        os.remove(zip_path + ".tmp")
                    except OSError:
                        pass
                continue
            # Progress every 500 files
            if i % 500 == 0 or i == total:
                elapsed = time.time() - start_time
                rate = converted / elapsed if elapsed > 0 else 0
                eta = (total - i) / rate if rate > 0 else 0
                print(f" [{i}/{total}] ({100*i//total}%) "
                      f"converted={converted} skipped={skipped} errors={errors} "
                      f"elapsed={elapsed:.0f}s ETA={eta:.0f}s")
    except KeyboardInterrupt:
        print(f"\n\nInterrupted by user at file {i}/{total}.")
        print("Migration is resumable — run again to continue.")
    elapsed = time.time() - start_time
    print(f"\n{'='*60}")
    print("Migration complete.")
    print(f" Total .bak files : {total}")
    print(f" Converted : {converted}")
    print(f" Skipped (exists) : {skipped}")
    print(f" Errors : {errors}")
    print(f" Time : {elapsed:.0f}s")
    print(f"{'='*60}")
# Script entry point: validate the environment configuration, then migrate.
if __name__ == "__main__":
    # Both values come from the .env file via indexer.config.
    if not BACKUP_PATH or not os.path.isdir(BACKUP_PATH):
        print(f"ERROR: BACKUP_PATH is not a valid directory: {BACKUP_PATH}")
        sys.exit(1)
    if not BACKUP_PASSWORD:
        print("ERROR: BACKUP_PASSWORD not set in .env")
        sys.exit(1)
    print("=" * 60)
    print("MIGRATION: .bak -> encrypted .zip")
    print(f"Backup dir: {BACKUP_PATH}")
    print("=" * 60)
    migrate(BACKUP_PATH, BACKUP_PASSWORD)

113
reconcile.py Normal file
View File

@@ -0,0 +1,113 @@
"""
reconcile.py — Cross-check MySQL content_hash values against .zip files on disk.
Reports:
1. DB hashes with no blob on disk (missing backups)
2. Blob files on disk with no matching DB record (orphan blobs)
3. Summary stats
"""
import os
import sys
from indexer.config import BACKUP_PATH
from indexer.db import get_connection
def collect_disk_hashes(backup_root: str) -> set:
    """Walk the backup tree and return the set of hex hashes taken from .zip names.

    Only files named <64-hex-chars>.zip contribute; any other .zip name is
    reported with a warning and ignored.
    """
    found = set()
    for parent, _subdirs, names in os.walk(backup_root):
        for name in names:
            if not name.endswith(".zip"):
                continue
            stem = name[:-4]  # strip ".zip"
            if len(stem) == 64:
                found.add(stem)
            else:
                print(f" WARN: unexpected zip name: {os.path.join(parent, name)}")
    return found
def collect_db_hashes(conn) -> set:
    """Return every distinct non-NULL content_hash from `files`, as lowercase hex."""
    query = "SELECT DISTINCT HEX(content_hash) FROM files WHERE content_hash IS NOT NULL"
    with conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()
    return {hex_value.lower() for (hex_value,) in rows}
def main(purge_orphans: bool = False):
    """Cross-check DB content hashes against .zip blobs on disk and report.

    Args:
        purge_orphans: When True, delete blobs that have no matching DB
            record. Defaults to False (report only), preserving the behavior
            of the previous hard-coded PURGE_ORPHANS flag.

    Exits with status 1 if BACKUP_PATH is unset or not a directory.
    """
    if not BACKUP_PATH or not os.path.isdir(BACKUP_PATH):
        print(f"ERROR: BACKUP_PATH is not a valid directory: {BACKUP_PATH}")
        sys.exit(1)
    print(f"Backup dir : {BACKUP_PATH}")
    print("Scanning disk blobs...")
    disk_hashes = collect_disk_hashes(BACKUP_PATH)
    print(f" Found {len(disk_hashes)} blob files on disk.")
    print("Loading DB hashes...")
    conn = get_connection()
    try:
        db_hashes = collect_db_hashes(conn)
    finally:
        conn.close()
    print(f" Found {len(db_hashes)} distinct hashes in DB.")
    # --- Reconcile ---
    missing_on_disk = db_hashes - disk_hashes
    orphans_on_disk = disk_hashes - db_hashes
    matched = db_hashes & disk_hashes
    print()
    print("=== Reconciliation Results ===")
    print(f" Matched (DB + disk) : {len(matched)}")
    print(f" Missing on disk : {len(missing_on_disk)}")
    print(f" Orphan blobs (no DB) : {len(orphans_on_disk)}")
    if missing_on_disk:
        print(f"\n--- Missing on disk ({len(missing_on_disk)}) ---")
        conn = get_connection()
        try:
            with conn.cursor() as cur:
                for h in sorted(missing_on_disk):
                    # Show up to 5 example paths per missing hash for context.
                    cur.execute(
                        "SELECT relative_path FROM files WHERE content_hash = UNHEX(%s) LIMIT 5",
                        (h,)
                    )
                    paths = [row[0] for row in cur.fetchall()]
                    print(f" {h} -> {paths}")
        finally:
            conn.close()
    if orphans_on_disk:
        print(f"\n--- Orphan blobs ({len(orphans_on_disk)}) ---")
        total_orphan_bytes = 0
        for h in sorted(orphans_on_disk):
            blob = os.path.join(BACKUP_PATH, h[:2], h[2:4], h + ".zip")
            # Blob could vanish between the walk and this stat; treat as 0.
            size = os.path.getsize(blob) if os.path.exists(blob) else 0
            total_orphan_bytes += size
            print(f" {h} ({size} bytes)")
        print(f" Total orphan size: {total_orphan_bytes / 1024 / 1024:.1f} MB")
        if purge_orphans:
            print("\n PURGING orphan blobs...")
            purged = 0
            for h in sorted(orphans_on_disk):
                blob = os.path.join(BACKUP_PATH, h[:2], h[2:4], h + ".zip")
                try:
                    os.remove(blob)
                    purged += 1
                except OSError as e:
                    print(f" WARN: could not delete {blob}: {e}")
            print(f" Purged {purged}/{len(orphans_on_disk)} orphan blobs.")
    if not missing_on_disk and not orphans_on_disk:
        print("\nAll clean — DB and disk are in sync.")
# Script entry point: run the reconciliation report.
if __name__ == "__main__":
    main()

View File

@@ -11,8 +11,8 @@ the original directory structure.
import os
import sys
import shutil
from indexer.config import DB_CONFIG, BACKUP_PATH
import pyzipper
from indexer.config import BACKUP_PATH, BACKUP_PASSWORD
from indexer.db import get_connection
from indexer.backup import blob_path
@@ -37,21 +37,37 @@ def recover(run_id: int, output_dir: str):
print(f"Recovering {len(rows)} files from run #{run_id} to {output_dir}")
recovered = 0
missing = 0
password = BACKUP_PASSWORD.encode("utf-8")
for relative_path, content_hash in rows:
source = blob_path(BACKUP_PATH, content_hash)
target = os.path.join(output_dir, relative_path.replace("/", os.sep))
if not os.path.exists(source):
print(f" MISSING blob: {content_hash.hex()} for {relative_path}")
print(f" MISSING zip: {content_hash.hex()} for {relative_path}")
missing += 1
continue
os.makedirs(os.path.dirname(target), exist_ok=True)
shutil.copy2(source, target)
recovered += 1
print(f"\nRecovered: {recovered} Missing blobs: {missing}")
try:
with pyzipper.AESZipFile(source, "r") as zf:
zf.setpassword(password)
names = zf.namelist()
if not names:
print(f" WARN: empty zip: {source}")
missing += 1
continue
data = zf.read(names[0])
with open(target, "wb") as f:
f.write(data)
recovered += 1
except Exception as e:
print(f" ERROR extracting {source} for {relative_path}: {e}")
missing += 1
continue
print(f"\nRecovered: {recovered} Missing/errors: {missing}")
if __name__ == "__main__":

View File

@@ -1,3 +1,4 @@
pymysql
blake3
python-dotenv
pyzipper