# ============================================================ # migrate_to_v16.py # Verze: 1.1 # Datum: 2026-06-15 # Popis: Jednorázová migrace stávajících STUDY-level dat # (nasbíraných pipeline v1.3–v1.5) na schéma v1.6. # # v1.6 ukládá dokumenty JEN do SeaweedFS (žádný Dropbox), # klíč = číslo dokumentu + verze. Dvě fáze: # # [mongo] Re-parse NEJNOVĚJŠÍHO archivovaného study reportu # (WhatToDownload/Zpracovano/*Study Level*.xlsx) # v1.6 parserem a obohacení existujících dokumentů # o nová pole (level, levels[], scopes[], studies[], # countries=[], sites=[], classification, # process_name, external_system_name, created_by, # last_modified_by, version_created_by). # NESAHÁ na download stav (downloaded, sha256, # seaweed_*, history, first_seen). # # [seaweed] Překlíčování SeaweedFS ze starých SHA cest na nové # /vtmf-documents//.. Zdroj # bajtů = stávající soubor na disku (pole file), jako # fallback GET ze staré SHA cesty. Po úspěchu: oprava # seaweed_path/url + sha256 v Mongo, smazání staré SHA # cesty a ODEBRÁNÍ pole file z Mongo (Dropbox se už # nepoužívá; fyzické soubory v Dropboxu pak můžeš # smazat ručně). # # DEFAULT je DRY-RUN. Ostře až s --apply. Idempotentní. # # Použití: # python migrate_to_v16.py # dry-run, vše # python migrate_to_v16.py --apply # ostře, vše # python migrate_to_v16.py --phase mongo --apply # python migrate_to_v16.py --phase seaweed --apply # ============================================================ import argparse import hashlib import importlib.util import mimetypes import re import urllib.error import urllib.request from datetime import datetime from pathlib import Path SCRIPT_DIR = Path(__file__).resolve().parent PIPE_FILE = SCRIPT_DIR / "vtmf_pipeline_v1.6.py" # starý SHA-256 content-addressed tvar cesty (k odstranění z SeaweedFS) OLD_SHA_PATH_RE = re.compile(r"^/vtmf-documents/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}$") def load_pipeline(): spec = importlib.util.spec_from_file_location("vtmf_pipeline_v16", PIPE_FILE) mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) return mod def log(msg): print(msg, flush=True) def http_get(url): with urllib.request.urlopen(url, timeout=120) as r: return r.read() def seaweed_delete(url): try: urllib.request.urlopen(urllib.request.Request(url, method="DELETE"), timeout=30) return True except urllib.error.HTTPError as e: return e.code in (404, 204, 200) except Exception: return False # --- Fáze MONGO -------------------------------------------------------- def phase_mongo(mod, coll, apply): zp = SCRIPT_DIR / "WhatToDownload" / "Zpracovano" reports = sorted(zp.glob("*Study Level*.xlsx")) if not reports: log("[!] Nenašel jsem žádný archivovaný study report — fáze mongo přeskočena.") return newest = reports[-1] log(f"[i] [mongo] Re-parse: {newest.name}") docs = mod.read_documents_from_excel(newest, "study") docs = [d for d in docs if mod.TARGET_STUDY in d["studies"]] log(f"[i] [mongo] {len(docs)} dokumentů study-level {mod.TARGET_STUDY}.") sk = f"study|{mod.TARGET_STUDY}|" enriched = missing = 0 for d in docs: key = mod.doc_key(d["vtmf"], d["version"]) if not coll.find_one({"_id": key}, {"_id": 1}): missing += 1 if missing <= 10: log(f" [!] V Mongo chybí {key} (přeskočeno).") continue set_fields = { "level": "study", "url": d["url"], "name": d["name"], "status": d["status"], "type": d["type"], "subtype": d["subtype"], "classification": d["classification"], "desc": d["desc"], "process_name": d["process_name"], "external_system_name": d["external_system_name"], "created_by": d["created_by"], "last_modified_by": d["last_modified_by"], "version_created_by": d["version_created_by"], "date": d["date"], "studies": d["studies"], "countries": [], "sites": [], } if apply: coll.update_one({"_id": key}, { "$set": set_fields, "$addToSet": {"scopes": sk, "levels": "study"}, }) enriched += 1 log(f"[{'APPLY' if apply else 'DRY'}] [mongo] Obohaceno {enriched} dokumentů" + (f", {missing} v Mongo chybělo." if missing else ".")) # --- Fáze SEAWEED ------------------------------------------------------ def phase_seaweed(mod, coll, apply): q = {"downloaded": True, "placeholder": {"$ne": True}, "file": {"$ne": None}} docs = list(coll.find(q)) log(f"[i] [seaweed] Kandidátů (s polem file): {len(docs)}") uploaded = old_deleted = unset = missing = err = already = 0 for doc in docs: key = doc["_id"] src = Path(doc["file"]) ext = src.suffix new_path = mod.seaweed_path(doc["vtmf"], doc["version"], ext) old_path = doc.get("seaweed_path") old_is_sha = bool(old_path and OLD_SHA_PATH_RE.match(old_path)) if old_path == new_path: already += 1 if apply: # jen dorovnat: zahodit file coll.update_one({"_id": key}, {"$unset": {"file": ""}}) unset += 1 continue if not apply: note = f" (smazat starou {old_path})" if old_is_sha else "" log(f" PUT {new_path}{note} (+ unset file)") continue # zdroj bajtů: disk, fallback GET ze staré SHA cesty try: if src.exists(): data = src.read_bytes() elif old_is_sha: data = http_get(mod.SEAWEED_FILER + old_path) else: missing += 1 if missing <= 10: log(f" [!] {key}: zdroj nedostupný (soubor i SHA chybí).") continue mime = mimetypes.guess_type("f" + ext)[0] or "application/octet-stream" sw_path, sw_url = mod.seaweed_store(doc["vtmf"], doc["version"], ext, data, mime) coll.update_one({"_id": key}, { "$set": {"seaweed_path": sw_path, "seaweed_url": sw_url, "sha256": hashlib.sha256(data).hexdigest(), "seaweed_synced_at": datetime.now()}, "$unset": {"file": ""}}) uploaded += 1 unset += 1 if old_is_sha and old_path != sw_path: if seaweed_delete(mod.SEAWEED_FILER + old_path): old_deleted += 1 except Exception as e: err += 1 log(f" [!] {key}: SeaweedFS selhal: {e}") log(f"[{'APPLY' if apply else 'DRY'}] [seaweed] Překlíčováno {uploaded}, " f"už na nové cestě {already}, starých SHA smazáno {old_deleted}, " f"pole file odebráno {unset}, chybí zdroj {missing}, chyb {err}.") # --- Main -------------------------------------------------------------- def main(): ap = argparse.ArgumentParser(description="Migrace VTMF dat na schéma v1.6") ap.add_argument("--phase", choices=["mongo", "seaweed", "all"], default="all") ap.add_argument("--apply", action="store_true", help="ostrý běh (bez něj jen DRY-RUN)") args = ap.parse_args() mode = "APPLY (ostře)" if args.apply else "DRY-RUN (nic se nemění)" log(f"=== Migrace na v1.6 — fáze: {args.phase} — režim: {mode} ===\n") mod = load_pipeline() _, coll, _ = mod.get_db() log(f"[ok] Mongo: {mod.MONGO_URI} / {mod.MONGO_DB}.{mod.MONGO_COLL}\n") if args.phase in ("mongo", "all"): phase_mongo(mod, coll, args.apply) log("") if args.phase in ("seaweed", "all"): phase_seaweed(mod, coll, args.apply) log("") log("=== DRY-RUN hotov. Pro ostrý běh přidej --apply. ===" if not args.apply else "=== Migrace dokončena. ===") if __name__ == "__main__": main()