z230
This commit is contained in:
14
.claude/settings.local.json
Normal file
14
.claude/settings.local.json
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
{
|
||||||
|
"permissions": {
|
||||||
|
"allow": [
|
||||||
|
"Bash(dir /s \"U:\\\\drobboxordinacebackup\")",
|
||||||
|
"Bash(where:*)",
|
||||||
|
"Bash(dir:*)",
|
||||||
|
"Bash(python:*)",
|
||||||
|
"Bash(pip install:*)",
|
||||||
|
"Bash(tasklist:*)",
|
||||||
|
"Bash(wmic process:*)",
|
||||||
|
"Bash(taskkill:*)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
42
indexer/backup.py
Normal file
42
indexer/backup.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
|
||||||
|
def blob_path(backup_root: str, content_hash: bytes) -> str:
    """Return the blob file path for a content hash: BACKUP/ab/cd/abcdef...blob

    The first two hex byte pairs of the hash become two nested shard
    directories below ``backup_root``; the file itself is named after the
    full hex digest with a ``.blob`` extension.
    """
    digest = content_hash.hex()
    shard_outer, shard_inner = digest[:2], digest[2:4]
    blob_name = digest + ".blob"
    return os.path.join(backup_root, shard_outer, shard_inner, blob_name)
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_backed_up(files_with_hash: list, backup_root: str) -> int:
    """
    Copy files into the content-addressable backup storage.

    files_with_hash: [(full_path, content_hash_bytes), ...]
    Files whose blob already exists are skipped (deduplication).
    A file that cannot be copied is reported with a warning and skipped;
    it does not abort the remaining files.

    Returns: number of newly written blobs.
    """
    backed_up = 0
    for full_path, content_hash in files_with_hash:
        target = blob_path(backup_root, content_hash)
        if os.path.exists(target):
            continue

        target_dir = os.path.dirname(target)
        os.makedirs(target_dir, exist_ok=True)

        # Reset per file: the handler below must never see an unbound name
        # (mkstemp itself failing) or the previous iteration's temp path.
        tmp_path = None
        try:
            # Atomic write: copy into a temp file in the same directory,
            # then rename it over the final name.
            fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
            os.close(fd)
            shutil.copy2(full_path, tmp_path)
            os.replace(tmp_path, target)
        except OSError as e:
            # FileNotFoundError and PermissionError are OSError subclasses.
            print(f" WARN: backup failed for {full_path}: {e}")
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except OSError:
                    # Best-effort cleanup: the temp file is already gone or
                    # cannot be removed; neither should stop the backup run.
                    pass
            continue

        backed_up += 1

    return backed_up
|
||||||
@@ -1,7 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
# načti .env z rootu projektu
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
# =========================
|
# =========================
|
||||||
@@ -24,9 +23,11 @@ DB_CONFIG = {
|
|||||||
|
|
||||||
ROOT_PATH = os.getenv("ROOT_PATH")
|
ROOT_PATH = os.getenv("ROOT_PATH")
|
||||||
ROOT_NAME = os.getenv("ROOT_NAME", "ORDINACE")
|
ROOT_NAME = os.getenv("ROOT_NAME", "ORDINACE")
|
||||||
|
BACKUP_PATH = os.getenv("BACKUP_PATH")
|
||||||
|
|
||||||
# =========================
|
# =========================
|
||||||
# Behaviour
|
# Behaviour
|
||||||
# =========================
|
# =========================
|
||||||
|
|
||||||
DRY_RUN = os.getenv("DRY_RUN", "1") == "1"
|
DRY_RUN = os.getenv("DRY_RUN", "1") == "1"
|
||||||
|
BATCH_SIZE = int(os.getenv("BATCH_SIZE", 1000))
|
||||||
|
|||||||
180
indexer/db.py
180
indexer/db.py
@@ -1,91 +1,123 @@
|
|||||||
import pymysql
|
import pymysql
|
||||||
import hashlib
|
from datetime import datetime
|
||||||
from indexer.config import DB_CONFIG, ROOT_NAME
|
from indexer.config import DB_CONFIG, BATCH_SIZE
|
||||||
|
|
||||||
|
|
||||||
def get_connection():
|
def get_connection():
|
||||||
return pymysql.connect(**DB_CONFIG)
|
return pymysql.connect(**DB_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
def preload_mark_all_missing():
|
# ── Run management ──────────────────────────────────────────
|
||||||
"""
|
|
||||||
Na začátku běhu:
|
|
||||||
označí všechny soubory jako neexistující.
|
|
||||||
Ty, které skener znovu najde, se přepnou zpět na exists_now = 1.
|
|
||||||
"""
|
|
||||||
conn = get_connection()
|
|
||||||
try:
|
|
||||||
with conn.cursor() as cur:
|
|
||||||
cur.execute("UPDATE files SET exists_now = 0")
|
|
||||||
conn.commit()
|
|
||||||
finally:
|
|
||||||
conn.close()
|
|
||||||
|
|
||||||
|
def create_run(cur) -> int:
|
||||||
def path_hash(path: str) -> bytes:
|
|
||||||
"""
|
|
||||||
MD5 hash cesty – pouze identifikátor, ne bezpečnostní hash
|
|
||||||
"""
|
|
||||||
return hashlib.md5(path.encode("utf-8")).digest()
|
|
||||||
|
|
||||||
|
|
||||||
def find_file_by_path(cur, path_hash_bytes):
|
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"""
|
"INSERT INTO runs (started_at, status) VALUES (%s, 'RUNNING')",
|
||||||
SELECT id, file_size, mtime, content_hash
|
(datetime.now(),)
|
||||||
FROM files
|
|
||||||
WHERE path_hash = %s
|
|
||||||
""",
|
|
||||||
(path_hash_bytes,)
|
|
||||||
)
|
|
||||||
return cur.fetchone()
|
|
||||||
|
|
||||||
|
|
||||||
def insert_file(cur, file):
|
|
||||||
cur.execute(
|
|
||||||
"""
|
|
||||||
INSERT INTO files (
|
|
||||||
root_name, full_path, path_hash,
|
|
||||||
file_name, directory,
|
|
||||||
file_size, mtime, content_hash,
|
|
||||||
first_seen, last_seen, exists_now
|
|
||||||
)
|
|
||||||
VALUES (
|
|
||||||
%s, %s, %s,
|
|
||||||
%s, %s,
|
|
||||||
%s, %s, %s,
|
|
||||||
NOW(), NOW(), 1
|
|
||||||
)
|
|
||||||
""",
|
|
||||||
(
|
|
||||||
ROOT_NAME,
|
|
||||||
file["full_path"],
|
|
||||||
path_hash(file["full_path"]),
|
|
||||||
file["file_name"],
|
|
||||||
file["directory"],
|
|
||||||
file["size"],
|
|
||||||
file["mtime"],
|
|
||||||
file["content_hash"],
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
return cur.lastrowid
|
return cur.lastrowid
|
||||||
|
|
||||||
|
|
||||||
def update_file(cur, file_id, file):
|
def finalize_run(cur, run_id: int, stats: dict):
|
||||||
cur.execute(
|
cur.execute(
|
||||||
|
"""UPDATE runs
|
||||||
|
SET finished_at = %s, status = 'COMPLETED',
|
||||||
|
files_total = %s, files_new = %s, files_modified = %s,
|
||||||
|
files_deleted = %s, files_unchanged = %s
|
||||||
|
WHERE id = %s""",
|
||||||
|
(datetime.now(), stats["total"], stats["new"], stats["modified"],
|
||||||
|
stats["deleted"], stats["unchanged"], run_id)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def fail_run(cur, run_id: int):
    """Mark the run ``run_id`` as FAILED and stamp its finish time.

    Issues a single UPDATE on the ``runs`` table through ``cur``; the finish
    time is the local wall-clock time at the moment of the call. Committing
    is left to the caller.
    """
    finished_at = datetime.now()
    sql = "UPDATE runs SET finished_at = %s, status = 'FAILED' WHERE id = %s"
    cur.execute(sql, (finished_at, run_id))
|
||||||
|
|
||||||
|
|
||||||
|
# ── Load DB state ──────────────────────────────────────────
|
||||||
|
|
||||||
|
def load_all_files(cur) -> dict:
|
||||||
"""
|
"""
|
||||||
UPDATE files
|
Načte všechny existující soubory z DB do RAM.
|
||||||
SET file_size = %s,
|
Returns: {relative_path: {id, size, mtime, content_hash}}
|
||||||
mtime = %s,
|
"""
|
||||||
content_hash = %s,
|
cur.execute(
|
||||||
last_seen = NOW(),
|
"""SELECT id, relative_path, file_size, mtime, content_hash
|
||||||
exists_now = 1
|
FROM files WHERE exists_now = 1"""
|
||||||
WHERE id = %s
|
|
||||||
""",
|
|
||||||
(
|
|
||||||
file["size"],
|
|
||||||
file["mtime"],
|
|
||||||
file["content_hash"],
|
|
||||||
file_id,
|
|
||||||
)
|
)
|
||||||
|
result = {}
|
||||||
|
for row in cur.fetchall():
|
||||||
|
file_id, rel_path, size, mtime, content_hash = row
|
||||||
|
result[rel_path] = {
|
||||||
|
"id": file_id,
|
||||||
|
"size": size,
|
||||||
|
"mtime": mtime,
|
||||||
|
"content_hash": content_hash,
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
# ── Batch operations ────────────────────────────────────────
|
||||||
|
|
||||||
|
def batch_insert_files(cur, files_list: list, run_id: int) -> dict:
    """
    INSERT new files and return the id assigned to each of them.

    files_list: [{relative_path, file_name, directory, size, mtime, content_hash}]
    Returns: {relative_path: file_id}

    Rows are inserted one statement at a time and the id is read from
    ``cur.lastrowid`` after each insert. The previous implementation used
    ``executemany`` and derived ids as ``lastrowid + offset``, which is wrong
    for this statement: pymysql only rewrites ``executemany`` into a single
    multi-row INSERT when the VALUES clause consists of placeholders only.
    The literal ``1`` for ``exists_now`` disables that rewrite, so the rows
    were already executed one by one and ``lastrowid`` held the id of the
    LAST row, shifting every derived id in a multi-row chunk. Executing the
    rows explicitly costs the same number of round-trips and yields the
    correct id regardless of auto-increment step or gaps.
    """
    insert_sql = """INSERT INTO files
        (relative_path, file_name, directory, file_size, mtime,
         content_hash, first_seen_run, last_seen_run, exists_now)
       VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)"""

    path_to_id = {}
    for f in files_list:
        cur.execute(
            insert_sql,
            (f["relative_path"], f["file_name"], f["directory"],
             f["size"], f["mtime"], f["content_hash"], run_id, run_id),
        )
        # lastrowid now refers to exactly the row inserted above.
        path_to_id[f["relative_path"]] = cur.lastrowid
    return path_to_id
|
||||||
|
|
||||||
|
|
||||||
|
def batch_update_modified(cur, files_list: list, run_id: int):
    """
    UPDATE the rows of files whose content changed.

    files_list: [{id, size, mtime, content_hash}]

    Each row gets the new size, mtime and content hash, has ``last_seen_run``
    set to ``run_id`` and ``exists_now`` set to 1. The list is handed to
    ``cur.executemany`` in slices of at most BATCH_SIZE entries; an empty
    list issues no statement.
    """
    update_sql = """UPDATE files
           SET file_size = %s, mtime = %s, content_hash = %s,
               last_seen_run = %s, exists_now = 1
           WHERE id = %s"""

    for offset in range(0, len(files_list), BATCH_SIZE):
        params = []
        for entry in files_list[offset:offset + BATCH_SIZE]:
            params.append(
                (entry["size"], entry["mtime"], entry["content_hash"],
                 run_id, entry["id"])
            )
        cur.executemany(update_sql, params)
|
||||||
|
|
||||||
|
|
||||||
|
def batch_mark_deleted(cur, file_ids: list, run_id: int):
    """Flag files as deleted: sets exists_now = 0 for every id in file_ids.

    The ids are sent to ``cur.executemany`` in slices of at most BATCH_SIZE;
    an empty list issues no statement.

    NOTE(review): this also writes ``run_id`` (the run that noticed the
    deletion) into ``last_seen_run``. Any query that reads ``last_seen_run``
    as "still present in that run" (recovery.py filters on
    ``last_seen_run >= run_id``) has to take ``exists_now = 0`` into account.
    Confirm this is the intended meaning of the column.
    """
    delete_sql = "UPDATE files SET exists_now = 0, last_seen_run = %s WHERE id = %s"
    for offset in range(0, len(file_ids), BATCH_SIZE):
        batch = file_ids[offset:offset + BATCH_SIZE]
        params = [(run_id, file_id) for file_id in batch]
        cur.executemany(delete_sql, params)
|
||||||
|
|
||||||
|
|
||||||
|
def batch_update_unchanged(cur, file_ids: list, run_id: int):
|
||||||
|
"""Batch UPDATE nezměněných souborů — jen last_seen_run."""
|
||||||
|
for i in range(0, len(file_ids), BATCH_SIZE):
|
||||||
|
chunk = file_ids[i:i + BATCH_SIZE]
|
||||||
|
cur.executemany(
|
||||||
|
"UPDATE files SET last_seen_run = %s, exists_now = 1 WHERE id = %s",
|
||||||
|
[(run_id, fid) for fid in chunk]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,19 +1,21 @@
|
|||||||
def log_event(cur, file_id, event_type, old=None, new=None):
|
from indexer.config import BATCH_SIZE
|
||||||
cur.execute(
|
|
||||||
|
|
||||||
|
def batch_log_events(cur, events: list):
|
||||||
"""
|
"""
|
||||||
INSERT INTO file_events (
|
Batch INSERT eventů do file_events.
|
||||||
file_id, event_type, event_time,
|
events: [{run_id, file_id, event_type, old_size, new_size, old_hash, new_hash}]
|
||||||
old_size, new_size,
|
"""
|
||||||
old_hash, new_hash
|
if not events:
|
||||||
)
|
return
|
||||||
VALUES (%s, %s, NOW(), %s, %s, %s, %s)
|
for i in range(0, len(events), BATCH_SIZE):
|
||||||
""",
|
chunk = events[i:i + BATCH_SIZE]
|
||||||
(
|
cur.executemany(
|
||||||
file_id,
|
"""INSERT INTO file_events
|
||||||
event_type,
|
(run_id, file_id, event_type, old_size, new_size, old_hash, new_hash)
|
||||||
old["size"] if old else None,
|
VALUES (%s, %s, %s, %s, %s, %s, %s)""",
|
||||||
new["size"] if new else None,
|
[(e["run_id"], e["file_id"], e["event_type"],
|
||||||
old["content_hash"] if old else None,
|
e.get("old_size"), e.get("new_size"),
|
||||||
new["content_hash"] if new else None,
|
e.get("old_hash"), e.get("new_hash"))
|
||||||
)
|
for e in chunk]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,21 +1,30 @@
|
|||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from indexer.hasher import blake3_file
|
|
||||||
|
|
||||||
def scan_files(root_path):
|
|
||||||
|
def scan_files(root_path: str) -> dict:
|
||||||
|
"""
|
||||||
|
Projde celý adresářový strom a vrátí dict všech souborů.
|
||||||
|
Nehasuje obsah — to se dělá až pro změněné soubory.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
{relative_path: {full_path, file_name, directory, size, mtime}}
|
||||||
|
"""
|
||||||
|
result = {}
|
||||||
for root, _, files in os.walk(root_path):
|
for root, _, files in os.walk(root_path):
|
||||||
for name in files:
|
for name in files:
|
||||||
full_path = os.path.join(root, name)
|
full_path = os.path.join(root, name)
|
||||||
try:
|
try:
|
||||||
stat = os.stat(full_path)
|
stat = os.stat(full_path)
|
||||||
except FileNotFoundError:
|
except (FileNotFoundError, PermissionError):
|
||||||
continue
|
continue
|
||||||
|
rel_path = os.path.relpath(full_path, root_path).replace("\\", "/")
|
||||||
yield {
|
rel_dir = os.path.relpath(root, root_path).replace("\\", "/")
|
||||||
"full_path": full_path.replace("\\", "/"),
|
result[rel_path] = {
|
||||||
|
"full_path": full_path,
|
||||||
"file_name": name,
|
"file_name": name,
|
||||||
"directory": root.replace("\\", "/"),
|
"directory": rel_dir,
|
||||||
"size": stat.st_size,
|
"size": stat.st_size,
|
||||||
"mtime": datetime.fromtimestamp(stat.st_mtime),
|
"mtime": datetime.fromtimestamp(stat.st_mtime),
|
||||||
"content_hash": blake3_file(full_path),
|
|
||||||
}
|
}
|
||||||
|
return result
|
||||||
|
|||||||
221
main.py
221
main.py
@@ -1,73 +1,200 @@
|
|||||||
from indexer.config import ROOT_PATH, ROOT_NAME, DRY_RUN
|
from indexer.config import ROOT_PATH, ROOT_NAME, DRY_RUN, BACKUP_PATH
|
||||||
from indexer.scanner import scan_files
|
from indexer.scanner import scan_files
|
||||||
|
from indexer.hasher import blake3_file
|
||||||
from indexer.db import (
|
from indexer.db import (
|
||||||
get_connection,
|
get_connection, create_run, finalize_run, fail_run,
|
||||||
preload_mark_all_missing,
|
load_all_files, batch_insert_files, batch_update_modified,
|
||||||
find_file_by_path,
|
batch_mark_deleted, batch_update_unchanged,
|
||||||
insert_file,
|
|
||||||
update_file,
|
|
||||||
path_hash,
|
|
||||||
)
|
)
|
||||||
from indexer.events import log_event
|
from indexer.events import batch_log_events
|
||||||
|
from indexer.backup import ensure_backed_up
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
print("ORDINACE DROPBOX BACKUP – INDEXER")
|
print("ORDINACE DROPBOX BACKUP – INDEXER")
|
||||||
print(f"Root : {ROOT_PATH}")
|
print(f"Root : {ROOT_PATH}")
|
||||||
print(f"Name : {ROOT_NAME}")
|
print(f"Backup : {BACKUP_PATH}")
|
||||||
print(f"DRY RUN : {DRY_RUN}")
|
print(f"DRY RUN : {DRY_RUN}")
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
|
|
||||||
|
# ── 1. Scan filesystem (fast, no hashing) ──
|
||||||
|
print("\n[1/7] Scanning filesystem...")
|
||||||
|
fs = scan_files(ROOT_PATH)
|
||||||
|
print(f" Found {len(fs)} files on disk.")
|
||||||
|
|
||||||
|
if DRY_RUN:
|
||||||
|
# V DRY_RUN režimu jen ukážeme co by se stalo
|
||||||
|
print("\n[DRY RUN] No DB connection, showing scan results only.")
|
||||||
|
print(f" Files on disk: {len(fs)}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# ── 2. Connect & create run ──
|
||||||
conn = get_connection()
|
conn = get_connection()
|
||||||
cur = conn.cursor()
|
cur = conn.cursor()
|
||||||
|
run_id = create_run(cur)
|
||||||
|
print(f"\n[2/7] Run #{run_id} created.")
|
||||||
|
|
||||||
if not DRY_RUN:
|
try:
|
||||||
preload_mark_all_missing()
|
# ── 3. Load DB state ──
|
||||||
|
print("[3/7] Loading DB state...")
|
||||||
|
db = load_all_files(cur)
|
||||||
|
print(f" {len(db)} files in DB (exists_now=1).")
|
||||||
|
|
||||||
created = modified = seen = 0
|
# ── 4. Diff ──
|
||||||
|
print("[4/7] Diffing...")
|
||||||
|
fs_paths = set(fs.keys())
|
||||||
|
db_paths = set(db.keys())
|
||||||
|
|
||||||
for file in scan_files(ROOT_PATH):
|
new_paths = fs_paths - db_paths
|
||||||
seen += 1
|
deleted_paths = db_paths - fs_paths
|
||||||
ph = path_hash(file["full_path"])
|
existing_paths = fs_paths & db_paths
|
||||||
row = find_file_by_path(cur, ph)
|
|
||||||
|
|
||||||
if row is None:
|
modified_paths = set()
|
||||||
created += 1
|
unchanged_paths = set()
|
||||||
if not DRY_RUN:
|
for p in existing_paths:
|
||||||
file_id = insert_file(cur, file)
|
fs_file = fs[p]
|
||||||
log_event(cur, file_id, "CREATED", new=file)
|
db_file = db[p]
|
||||||
|
if fs_file["size"] != db_file["size"] or fs_file["mtime"] != db_file["mtime"]:
|
||||||
|
modified_paths.add(p)
|
||||||
else:
|
else:
|
||||||
file_id, old_size, old_mtime, old_hash = row
|
unchanged_paths.add(p)
|
||||||
if old_size != file["size"] or old_hash != file["content_hash"]:
|
|
||||||
modified += 1
|
print(f" NEW: {len(new_paths)} MOD: {len(modified_paths)} "
|
||||||
if not DRY_RUN:
|
f"DEL: {len(deleted_paths)} SAME: {len(unchanged_paths)}")
|
||||||
update_file(cur, file_id, file)
|
|
||||||
log_event(
|
# ── 5. Process changes ──
|
||||||
cur,
|
print("[5/7] Processing changes...")
|
||||||
file_id,
|
events = []
|
||||||
"MODIFIED",
|
files_to_backup = []
|
||||||
old={"size": old_size, "content_hash": old_hash},
|
|
||||||
new=file,
|
# 5a) NEW files — compute BLAKE3, batch INSERT
|
||||||
)
|
if new_paths:
|
||||||
|
print(f" Hashing {len(new_paths)} new files...")
|
||||||
|
new_files = []
|
||||||
|
for p in new_paths:
|
||||||
|
f = fs[p]
|
||||||
|
try:
|
||||||
|
content_hash = blake3_file(f["full_path"])
|
||||||
|
except (FileNotFoundError, PermissionError, OSError) as e:
|
||||||
|
print(f" WARN: skip {p}: {e}")
|
||||||
|
continue
|
||||||
|
new_files.append({
|
||||||
|
"relative_path": p,
|
||||||
|
"file_name": f["file_name"],
|
||||||
|
"directory": f["directory"],
|
||||||
|
"size": f["size"],
|
||||||
|
"mtime": f["mtime"],
|
||||||
|
"content_hash": content_hash,
|
||||||
|
})
|
||||||
|
files_to_backup.append((f["full_path"], content_hash))
|
||||||
|
|
||||||
|
if new_files:
|
||||||
|
path_to_id = batch_insert_files(cur, new_files, run_id)
|
||||||
|
for nf in new_files:
|
||||||
|
events.append({
|
||||||
|
"run_id": run_id,
|
||||||
|
"file_id": path_to_id[nf["relative_path"]],
|
||||||
|
"event_type": "CREATED",
|
||||||
|
"new_size": nf["size"],
|
||||||
|
"new_hash": nf["content_hash"],
|
||||||
|
})
|
||||||
|
|
||||||
|
# 5b) MODIFIED files — compute BLAKE3, batch UPDATE
|
||||||
|
if modified_paths:
|
||||||
|
print(f" Hashing {len(modified_paths)} modified files...")
|
||||||
|
mod_files = []
|
||||||
|
for p in modified_paths:
|
||||||
|
f = fs[p]
|
||||||
|
db_file = db[p]
|
||||||
|
try:
|
||||||
|
content_hash = blake3_file(f["full_path"])
|
||||||
|
except (FileNotFoundError, PermissionError, OSError) as e:
|
||||||
|
print(f" WARN: skip {p}: {e}")
|
||||||
|
continue
|
||||||
|
mod_files.append({
|
||||||
|
"id": db_file["id"],
|
||||||
|
"size": f["size"],
|
||||||
|
"mtime": f["mtime"],
|
||||||
|
"content_hash": content_hash,
|
||||||
|
})
|
||||||
|
events.append({
|
||||||
|
"run_id": run_id,
|
||||||
|
"file_id": db_file["id"],
|
||||||
|
"event_type": "MODIFIED",
|
||||||
|
"old_size": db_file["size"],
|
||||||
|
"new_size": f["size"],
|
||||||
|
"old_hash": db_file["content_hash"],
|
||||||
|
"new_hash": content_hash,
|
||||||
|
})
|
||||||
|
files_to_backup.append((f["full_path"], content_hash))
|
||||||
|
|
||||||
|
if mod_files:
|
||||||
|
batch_update_modified(cur, mod_files, run_id)
|
||||||
|
|
||||||
|
# 5c) DELETED files — batch UPDATE exists_now=0
|
||||||
|
if deleted_paths:
|
||||||
|
del_ids = [db[p]["id"] for p in deleted_paths]
|
||||||
|
batch_mark_deleted(cur, del_ids, run_id)
|
||||||
|
for p in deleted_paths:
|
||||||
|
events.append({
|
||||||
|
"run_id": run_id,
|
||||||
|
"file_id": db[p]["id"],
|
||||||
|
"event_type": "DELETED",
|
||||||
|
"old_size": db[p]["size"],
|
||||||
|
"old_hash": db[p]["content_hash"],
|
||||||
|
})
|
||||||
|
|
||||||
|
# 5d) UNCHANGED files — batch UPDATE last_seen_run
|
||||||
|
if unchanged_paths:
|
||||||
|
unch_ids = [db[p]["id"] for p in unchanged_paths]
|
||||||
|
batch_update_unchanged(cur, unch_ids, run_id)
|
||||||
|
|
||||||
|
# 5e) Log all events
|
||||||
|
if events:
|
||||||
|
batch_log_events(cur, events)
|
||||||
|
|
||||||
|
# ── 6. Backup ──
|
||||||
|
if files_to_backup and BACKUP_PATH:
|
||||||
|
print(f"[6/7] Backing up {len(files_to_backup)} files...")
|
||||||
|
backed = ensure_backed_up(files_to_backup, BACKUP_PATH)
|
||||||
|
print(f" {backed} new blobs written.")
|
||||||
else:
|
else:
|
||||||
if not DRY_RUN:
|
print("[6/7] Nothing to backup.")
|
||||||
cur.execute(
|
|
||||||
"UPDATE files SET last_seen = NOW(), exists_now = 1 WHERE id = %s",
|
|
||||||
(file_id,)
|
|
||||||
)
|
|
||||||
|
|
||||||
if seen % 500 == 0:
|
# ── 7. Finalize ──
|
||||||
print(f"{seen} files scanned...")
|
stats = {
|
||||||
|
"total": len(fs),
|
||||||
if not DRY_RUN:
|
"new": len(new_paths),
|
||||||
|
"modified": len(modified_paths),
|
||||||
|
"deleted": len(deleted_paths),
|
||||||
|
"unchanged": len(unchanged_paths),
|
||||||
|
}
|
||||||
|
finalize_run(cur, run_id, stats)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
print(f"[7/7] Run #{run_id} COMPLETED.")
|
||||||
|
|
||||||
print("================================")
|
except Exception as e:
|
||||||
print(f"Scanned : {seen}")
|
print(f"\nERROR: {e}")
|
||||||
print(f"Created : {created}")
|
try:
|
||||||
print(f"Modified : {modified}")
|
fail_run(cur, run_id)
|
||||||
|
conn.commit()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
conn.rollback()
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
|
# ── Summary ──
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print(f"Total : {stats['total']}")
|
||||||
|
print(f"New : {stats['new']}")
|
||||||
|
print(f"Modified : {stats['modified']}")
|
||||||
|
print(f"Deleted : {stats['deleted']}")
|
||||||
|
print(f"Unchanged: {stats['unchanged']}")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
64
recovery.py
Normal file
64
recovery.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
"""
|
||||||
|
Recovery script: reconstruct directory tree from a specific run.
|
||||||
|
|
||||||
|
Usage: python recovery.py <run_id> <output_dir>
|
||||||
|
|
||||||
|
For a given run_id, finds all files that existed at that point
|
||||||
|
(first_seen_run <= run_id AND last_seen_run >= run_id)
|
||||||
|
and copies them from backup storage to output_dir preserving
|
||||||
|
the original directory structure.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import shutil
|
||||||
|
from indexer.config import DB_CONFIG, BACKUP_PATH
|
||||||
|
from indexer.db import get_connection
|
||||||
|
from indexer.backup import blob_path
|
||||||
|
|
||||||
|
|
||||||
|
def recover(run_id: int, output_dir: str):
    """Rebuild the directory tree as recorded for ``run_id`` under ``output_dir``.

    Selects the files that were present in the given run and copies each
    file's blob from the backup storage to ``output_dir``, recreating the
    relative directory structure. Progress and a summary are printed.

    A file counts as present in the run when it was first seen at or before
    it and was either seen in a later run, or last touched in this run while
    still existing. The second condition matters because marking a file as
    deleted also writes the deleting run into ``last_seen_run``; without the
    ``exists_now`` check, files found deleted in ``run_id`` would be restored.

    NOTE(review): ``files.content_hash`` is overwritten when a file is
    modified, so a file changed after ``run_id`` is restored with its latest
    content, not the content it had in that run.

    Raises:
        RuntimeError: if BACKUP_PATH is not configured.
    """
    if not BACKUP_PATH:
        # Fail before touching the DB instead of dying inside os.path.join.
        raise RuntimeError("BACKUP_PATH is not configured; cannot locate blobs.")

    conn = get_connection()
    try:
        with conn.cursor() as cur:
            cur.execute(
                """SELECT relative_path, content_hash
                   FROM files
                   WHERE first_seen_run <= %s
                     AND (last_seen_run > %s
                          OR (last_seen_run = %s AND exists_now = 1))""",
                (run_id, run_id, run_id)
            )
            rows = cur.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    if not rows:
        print(f"No files found for run #{run_id}.")
        return

    print(f"Recovering {len(rows)} files from run #{run_id} to {output_dir}")
    recovered = 0
    missing = 0
    failed = 0

    for relative_path, content_hash in rows:
        if content_hash is None:
            # No hash stored for this row, so there is no blob to look up.
            print(f" MISSING blob: <no hash> for {relative_path}")
            missing += 1
            continue

        source = blob_path(BACKUP_PATH, content_hash)
        # Paths are stored with "/" separators; convert to the local one.
        target = os.path.join(output_dir, relative_path.replace("/", os.sep))

        if not os.path.exists(source):
            print(f" MISSING blob: {content_hash.hex()} for {relative_path}")
            missing += 1
            continue

        try:
            os.makedirs(os.path.dirname(target), exist_ok=True)
            shutil.copy2(source, target)
        except OSError as e:
            # One unwritable target must not abort the whole recovery.
            print(f" WARN: restore failed for {relative_path}: {e}")
            failed += 1
            continue
        recovered += 1

    print(f"\nRecovered: {recovered} Missing blobs: {missing}")
    if failed:
        print(f"Failed copies: {failed}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: recovery.py <run_id> <output_dir>
    if len(sys.argv) != 3:
        print("Usage: python recovery.py <run_id> <output_dir>")
        sys.exit(1)

    try:
        run_id = int(sys.argv[1])
    except ValueError:
        # Report a bad run_id like any other usage error, not as a traceback.
        print(f"Invalid run_id: {sys.argv[1]!r} (expected an integer)")
        print("Usage: python recovery.py <run_id> <output_dir>")
        sys.exit(1)

    output_dir = sys.argv[2]
    recover(run_id, output_dir)
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
pymysql
|
||||||
|
blake3
|
||||||
|
python-dotenv
|
||||||
|
|||||||
Reference in New Issue
Block a user