This commit is contained in:
2026-06-15 16:10:47 +02:00
parent 36aa84aa02
commit 495cf8da21
34 changed files with 8012 additions and 8 deletions
+215
View File
@@ -0,0 +1,215 @@
# ============================================================
# migrate_to_v16.py
# Verze: 1.1
# Datum: 2026-06-15
# Popis: Jednorázová migrace stávajících STUDY-level dat
# (nasbíraných pipeline v1.3 až v1.5) na schéma v1.6.
#
# v1.6 ukládá dokumenty JEN do SeaweedFS (žádný Dropbox),
# klíč = číslo dokumentu + verze. Dvě fáze:
#
# [mongo] Re-parse NEJNOVĚJŠÍHO archivovaného study reportu
# (WhatToDownload/Zpracovano/*Study Level*.xlsx)
# v1.6 parserem a obohacení existujících dokumentů
# o nová pole (level, levels[], scopes[], studies[],
# countries=[], sites=[], classification,
# process_name, external_system_name, created_by,
# last_modified_by, version_created_by).
# NESAHÁ na download stav (downloaded, sha256,
# seaweed_*, history, first_seen).
#
# [seaweed] Překlíčování SeaweedFS ze starých SHA cest na nové
# /vtmf-documents/<vtmf>/<verze>.<přípona>. Zdroj
# bajtů = stávající soubor na disku (pole file), jako
# fallback GET ze staré SHA cesty. Po úspěchu: oprava
# seaweed_path/url + sha256 v Mongo, smazání staré SHA
# cesty a ODEBRÁNÍ pole file z Mongo (Dropbox se už
# nepoužívá; fyzické soubory v Dropboxu pak můžeš
# smazat ručně).
#
# DEFAULT je DRY-RUN. Ostře až s --apply. Idempotentní.
#
# Použití:
# python migrate_to_v16.py # dry-run, vše
# python migrate_to_v16.py --apply # ostře, vše
# python migrate_to_v16.py --phase mongo --apply
# python migrate_to_v16.py --phase seaweed --apply
# ============================================================
import argparse
import hashlib
import importlib.util
import mimetypes
import re
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
# Directory containing this script; the pipeline file and the archived
# reports are located relative to it.
SCRIPT_DIR = Path(__file__).resolve().parents[0]
PIPE_FILE = SCRIPT_DIR.joinpath("vtmf_pipeline_v1.6.py")
# Old SHA-256 content-addressed path layout (to be removed from SeaweedFS):
# /vtmf-documents/<2 hex>/<2 hex>/<64 hex>
OLD_SHA_PATH_RE = re.compile(
    r"^/vtmf-documents/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}$"
)
def load_pipeline(path=None):
    """Load the v1.6 pipeline script from its file path and return the module.

    The pipeline file name contains a dot (``vtmf_pipeline_v1.6.py``), so it
    is loaded by path via ``importlib`` rather than with an ``import``
    statement.

    Args:
        path: Optional path of the Python file to load. When omitted, the
            module-level ``PIPE_FILE`` is used.

    Returns:
        The executed module object, named ``vtmf_pipeline_v16``.

    Raises:
        ImportError: If no import spec or loader can be built for the file
            (for example because of an unsupported file suffix).

    Any exception raised while reading or executing the file propagates
    unchanged.
    """
    target = PIPE_FILE if path is None else path
    spec = importlib.util.spec_from_file_location("vtmf_pipeline_v16", target)
    # spec_from_file_location() returns None when it cannot pick a loader;
    # fail with a clear message instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for {target}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
def log(msg):
    """Print *msg* to standard output and flush right away.

    Flushing after every message keeps progress output visible immediately,
    even when stdout is buffered (e.g. redirected to a file or a pipe).
    """
    print(msg, flush=True)
def http_get(url):
    """Fetch *url* and return the complete response body as bytes.

    The request uses a 120-second timeout and the response is closed before
    returning. Errors raised by ``urllib`` (HTTP errors, connection failures,
    timeouts) propagate to the caller.
    """
    with urllib.request.urlopen(url, timeout=120) as response:
        return response.read()
def seaweed_delete(url):
    """Send an HTTP DELETE to *url* and report whether the target is gone.

    This is a best-effort helper: it never raises.

    Args:
        url: Full URL of the object to delete.

    Returns:
        True if the request succeeded, or if the server answered with an
        HTTP error whose status is 404 (already absent), 204 or 200.
        False on any other HTTP error or on any other failure (invalid URL,
        connection error, timeout, ...).
    """
    try:
        request = urllib.request.Request(url, method="DELETE")
        # Close the response explicitly so the connection is not leaked.
        with urllib.request.urlopen(request, timeout=30):
            pass
        return True
    except urllib.error.HTTPError as e:
        # urllib raises HTTPError only for non-2xx statuses, so in practice
        # 404 is the case that matters here: the object is already gone.
        return e.code in (404, 204, 200)
    except Exception:
        # Deliberate best-effort: a failed cleanup must not abort the run.
        return False
# --- Fáze MONGO --------------------------------------------------------
def phase_mongo(mod, coll, apply):
    """Enrich existing Mongo documents with the fields of the v1.6 schema.

    Re-parses one archived study-level report with the v1.6 parser and, for
    every parsed document that already exists in Mongo, sets the descriptive
    fields and adds the study scope/level markers. Documents that are not
    present in Mongo are skipped (only the first 10 are logged). Only the
    fields listed in ``set_fields`` plus ``scopes`` and ``levels`` are
    written.

    Args:
        mod: The loaded pipeline module; must provide
            ``read_documents_from_excel``, ``doc_key`` and ``TARGET_STUDY``.
        coll: Mongo collection exposing ``find_one`` and ``update_one``.
        apply: When false (dry-run) nothing is written; the summary still
            reports how many documents would be enriched.
    """
    zp = SCRIPT_DIR / "WhatToDownload" / "Zpracovano"
    reports = sorted(zp.glob("*Study Level*.xlsx"))
    if not reports:
        log("[!] Nenašel jsem žádný archivovaný study report — fáze mongo přeskočena.")
        return

    # "Newest" means last in sorted path order.
    # NOTE(review): this assumes report file names sort chronologically
    # (e.g. carry a date prefix) — confirm against the archive naming.
    newest = reports[-1]
    log(f"[i] [mongo] Re-parse: {newest.name}")

    docs = mod.read_documents_from_excel(newest, "study")
    docs = [d for d in docs if mod.TARGET_STUDY in d["studies"]]
    log(f"[i] [mongo] {len(docs)} dokumentů study-level {mod.TARGET_STUDY}.")

    sk = f"study|{mod.TARGET_STUDY}|"
    enriched = missing = 0
    for d in docs:
        key = mod.doc_key(d["vtmf"], d["version"])
        # Only enrich documents that already exist; never create new ones.
        if not coll.find_one({"_id": key}, {"_id": 1}):
            missing += 1
            if missing <= 10:
                log(f"  [!] V Mongo chybí {key} (přeskočeno).")
            continue

        set_fields = {
            "level": "study", "url": d["url"], "name": d["name"],
            "status": d["status"], "type": d["type"], "subtype": d["subtype"],
            "classification": d["classification"], "desc": d["desc"],
            "process_name": d["process_name"],
            "external_system_name": d["external_system_name"],
            "created_by": d["created_by"], "last_modified_by": d["last_modified_by"],
            "version_created_by": d["version_created_by"], "date": d["date"],
            "studies": d["studies"], "countries": [], "sites": [],
        }
        if apply:
            coll.update_one({"_id": key}, {
                "$set": set_fields,
                # $addToSet keeps scopes/levels free of duplicates on re-runs.
                "$addToSet": {"scopes": sk, "levels": "study"},
            })
        # Counted in dry-run as well, so the summary shows the would-be total.
        enriched += 1

    log(f"[{'APPLY' if apply else 'DRY'}] [mongo] Obohaceno {enriched} dokumentů"
        + (f", {missing} v Mongo chybělo." if missing else "."))
# --- Fáze SEAWEED ------------------------------------------------------
def phase_seaweed(mod, coll, apply):
    """Re-key SeaweedFS objects from old SHA paths to the v1.6 path scheme.

    Candidates are downloaded, non-placeholder documents that still carry a
    ``file`` field. For each candidate:

    * if ``seaweed_path`` already equals the new path, only the ``file``
      field is removed (apply mode);
    * in dry-run mode the planned upload is logged and nothing is changed;
    * otherwise the bytes are read from the file on disk, falling back to a
      GET from the old SHA path, stored under the new path, the Mongo
      document is updated (``seaweed_path``, ``seaweed_url``, ``sha256``,
      ``seaweed_synced_at``; ``file`` removed) and the old SHA object is
      deleted.

    Per-document failures are logged and counted; they do not stop the run.

    Args:
        mod: The loaded pipeline module; must provide ``seaweed_path``,
            ``seaweed_store`` and ``SEAWEED_FILER``.
        coll: Mongo collection exposing ``find`` and ``update_one``.
        apply: When false (dry-run) nothing is uploaded, deleted or written.
    """
    q = {"downloaded": True, "placeholder": {"$ne": True}, "file": {"$ne": None}}
    # Materialise the result first: the loop updates the same documents.
    docs = list(coll.find(q))
    log(f"[i] [seaweed] Kandidátů (s polem file): {len(docs)}")

    uploaded = old_deleted = unset = missing = err = already = 0
    for doc in docs:
        key = doc["_id"]
        src = Path(doc["file"])
        ext = src.suffix
        new_path = mod.seaweed_path(doc["vtmf"], doc["version"], ext)
        old_path = doc.get("seaweed_path")
        old_is_sha = bool(old_path and OLD_SHA_PATH_RE.match(old_path))

        if old_path == new_path:
            already += 1
            if apply:  # already re-keyed: just drop the obsolete file field
                coll.update_one({"_id": key}, {"$unset": {"file": ""}})
                unset += 1
            continue

        if not apply:
            note = f" (smazat starou {old_path})" if old_is_sha else ""
            log(f"  PUT {new_path}{note} (+ unset file)")
            continue

        # Byte source: file on disk, fallback GET from the old SHA path.
        try:
            if src.exists():
                data = src.read_bytes()
            elif old_is_sha:
                data = http_get(mod.SEAWEED_FILER + old_path)
            else:
                missing += 1
                if missing <= 10:
                    log(f"  [!] {key}: zdroj nedostupný (soubor i SHA chybí).")
                continue

            mime = mimetypes.guess_type("f" + ext)[0] or "application/octet-stream"
            sw_path, sw_url = mod.seaweed_store(doc["vtmf"], doc["version"], ext, data, mime)
            coll.update_one({"_id": key}, {
                "$set": {"seaweed_path": sw_path, "seaweed_url": sw_url,
                         "sha256": hashlib.sha256(data).hexdigest(),
                         "seaweed_synced_at": datetime.now()},
                "$unset": {"file": ""}})
            uploaded += 1
            unset += 1

            # Remove the old object only after Mongo points at the new one.
            if old_is_sha and old_path != sw_path:
                if seaweed_delete(mod.SEAWEED_FILER + old_path):
                    old_deleted += 1
                else:
                    # Mongo no longer references the old path, so a failed
                    # delete would otherwise leave an untracked orphan.
                    log(f"  [!] {key}: starou SHA cestu {old_path} se nepodařilo smazat.")
        except Exception as e:
            # Best-effort per document: record the failure and carry on.
            err += 1
            log(f"  [!] {key}: SeaweedFS selhal: {e}")

    log(f"[{'APPLY' if apply else 'DRY'}] [seaweed] Překlíčováno {uploaded}, "
        f"už na nové cestě {already}, starých SHA smazáno {old_deleted}, "
        f"pole file odebráno {unset}, chybí zdroj {missing}, chyb {err}.")
# --- Main --------------------------------------------------------------
def main():
    """Parse the command line and run the selected migration phase(s).

    Options:
        --phase {mongo,seaweed,all}: which phase to run (default ``all``).
        --apply: perform the changes; without it the run is a dry-run.

    The pipeline module is loaded first and its ``get_db()`` supplies the
    Mongo collection that is handed to the phases.
    """
    ap = argparse.ArgumentParser(description="Migrace VTMF dat na schéma v1.6")
    ap.add_argument("--phase", choices=["mongo", "seaweed", "all"], default="all")
    ap.add_argument("--apply", action="store_true",
                    help="ostrý běh (bez něj jen DRY-RUN)")
    args = ap.parse_args()

    mode = "APPLY (ostře)" if args.apply else "DRY-RUN (nic se nemění)"
    log(f"=== Migrace na v1.6 — fáze: {args.phase} — režim: {mode} ===\n")

    mod = load_pipeline()
    # get_db() returns a 3-tuple; only the collection (middle item) is used.
    _, coll, _ = mod.get_db()
    log(f"[ok] Mongo: {mod.MONGO_URI} / {mod.MONGO_DB}.{mod.MONGO_COLL}\n")

    if args.phase in ("mongo", "all"):
        phase_mongo(mod, coll, args.apply)
        log("")
    if args.phase in ("seaweed", "all"):
        phase_seaweed(mod, coll, args.apply)
        log("")

    log("=== DRY-RUN hotov. Pro ostrý běh přidej --apply. ==="
        if not args.apply else "=== Migrace dokončena. ===")
# Run the migration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()