Files
janssen/VTMFDownloadFiles/migrate_to_v16.py
T
2026-06-15 16:10:47 +02:00

216 lines
8.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ============================================================
# migrate_to_v16.py
# Verze: 1.1
# Datum: 2026-06-15
# Popis: Jednorázová migrace stávajících STUDY-level dat
# (nasbíraných pipeline v1.3-v1.5) na schéma v1.6.
#
# v1.6 ukládá dokumenty JEN do SeaweedFS (žádný Dropbox),
# klíč = číslo dokumentu + verze. Dvě fáze:
#
# [mongo] Re-parse NEJNOVĚJŠÍHO archivovaného study reportu
# (WhatToDownload/Zpracovano/*Study Level*.xlsx)
# v1.6 parserem a obohacení existujících dokumentů
# o nová pole (level, levels[], scopes[], studies[],
# countries=[], sites=[], classification,
# process_name, external_system_name, created_by,
# last_modified_by, version_created_by).
# NESAHÁ na download stav (downloaded, sha256,
# seaweed_*, history, first_seen).
#
# [seaweed] Překlíčování SeaweedFS ze starých SHA cest na nové
# /vtmf-documents/<vtmf>/<verze>.<přípona>. Zdroj
# bajtů = stávající soubor na disku (pole file), jako
# fallback GET ze staré SHA cesty. Po úspěchu: oprava
# seaweed_path/url + sha256 v Mongo, smazání staré SHA
# cesty a ODEBRÁNÍ pole file z Mongo (Dropbox se už
# nepoužívá; fyzické soubory v Dropboxu pak můžeš
# smazat ručně).
#
# DEFAULT je DRY-RUN. Ostře až s --apply. Idempotentní.
#
# Použití:
# python migrate_to_v16.py # dry-run, vše
# python migrate_to_v16.py --apply # ostře, vše
# python migrate_to_v16.py --phase mongo --apply
# python migrate_to_v16.py --phase seaweed --apply
# ============================================================
import argparse
import hashlib
import importlib.util
import mimetypes
import re
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
# Directory that contains this script; the other paths are resolved against it.
SCRIPT_DIR = Path(__file__).resolve().parents[0]
# Source file of the v1.6 pipeline, expected next to this script.
PIPE_FILE = SCRIPT_DIR.joinpath("vtmf_pipeline_v1.6.py")
# Old SHA-256 content-addressed path layout (to be removed from SeaweedFS):
# /vtmf-documents/<2 hex>/<2 hex>/<64 hex>
OLD_SHA_PATH_RE = re.compile(
    r"^/vtmf-documents/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}$"
)
def load_pipeline():
    """Load the v1.6 pipeline from PIPE_FILE and return it as a module object.

    The module is built from its file path under the name
    "vtmf_pipeline_v16" and then executed. It is not added to
    ``sys.modules`` by this function.
    """
    module_spec = importlib.util.spec_from_file_location(
        "vtmf_pipeline_v16", PIPE_FILE
    )
    pipeline = importlib.util.module_from_spec(module_spec)
    # Run the module body so that its functions and constants become available.
    module_spec.loader.exec_module(pipeline)
    return pipeline
def log(msg):
    """Print *msg* on its own line and flush stdout immediately."""
    print(str(msg), end="\n", flush=True)
def http_get(url):
    """Fetch *url* and return the complete response body as bytes.

    The request uses a 120-second timeout. Errors raised by urllib are not
    handled here and propagate to the caller.
    """
    response = urllib.request.urlopen(url, timeout=120)
    with response:
        body = response.read()
    return body
def seaweed_delete(url):
    """Send a best-effort HTTP DELETE to *url* and report whether it succeeded.

    Returns True when the request succeeds, or when the server answers with
    an HTTP error whose status is 404, 204 or 200 (the object is treated as
    already gone). Returns False for any other HTTP error and for every other
    failure (connection problem, timeout, malformed URL). Never raises for
    request failures.
    """
    try:
        request = urllib.request.Request(url, method="DELETE")
        # Close the response explicitly so that the underlying connection is
        # released right away instead of lingering until garbage collection.
        with urllib.request.urlopen(request, timeout=30):
            return True
    except urllib.error.HTTPError as e:
        # urlopen normally reports 2xx as success, so 404 is the status
        # expected to matter here; 204/200 are kept from the original check.
        gone = e.code in (404, 204, 200)
        # An HTTPError raised by urlopen also wraps the response; release it.
        e.close()
        return gone
    except Exception:
        # Deliberate best effort: the caller only counts successful deletes.
        return False
# --- Fáze MONGO --------------------------------------------------------
def phase_mongo(mod, coll, apply):
    """Enrich existing Mongo documents with fields parsed by the v1.6 parser.

    Picks the last "*Study Level*.xlsx" report (in sorted path order) from
    WhatToDownload/Zpracovano next to this script, parses it with
    ``mod.read_documents_from_excel`` and keeps the rows whose ``studies``
    contain ``mod.TARGET_STUDY``. For every row that already exists in *coll*
    the descriptive fields are overwritten and the study scope/level are
    added to the ``scopes`` / ``levels`` arrays. Download-related fields are
    not part of the update.

    Args:
        mod: the loaded pipeline module (parser, key builder, TARGET_STUDY).
        coll: Mongo collection holding the documents.
        apply: when false, nothing is written; documents are only counted.
    """
    archive_dir = SCRIPT_DIR / "WhatToDownload" / "Zpracovano"
    candidates = sorted(archive_dir.glob("*Study Level*.xlsx"))
    if not candidates:
        log("[!] Nenašel jsem žádný archivovaný study report — fáze mongo přeskočena.")
        return

    # "Newest" means last in sort order of the paths, not by modification time.
    report = candidates[-1]
    log(f"[i] [mongo] Re-parse: {report.name}")

    parsed = mod.read_documents_from_excel(report, "study")
    study_docs = [row for row in parsed if mod.TARGET_STUDY in row["studies"]]
    log(f"[i] [mongo] {len(study_docs)} dokumentů study-level {mod.TARGET_STUDY}.")

    scope_key = f"study|{mod.TARGET_STUDY}|"
    # Fields copied verbatim from the parsed row, in the order they are stored.
    copied_fields = (
        "url", "name", "status", "type", "subtype", "classification", "desc",
        "process_name", "external_system_name", "created_by",
        "last_modified_by", "version_created_by", "date", "studies",
    )

    enriched = 0
    missing = 0
    for row in study_docs:
        key = mod.doc_key(row["vtmf"], row["version"])
        existing = coll.find_one({"_id": key}, {"_id": 1})
        if not existing:
            missing += 1
            # Only the first ten missing keys are listed to keep the log short.
            if missing <= 10:
                log(f" [!] V Mongo chybí {key} (přeskočeno).")
            continue

        set_fields = {"level": "study"}
        for field in copied_fields:
            set_fields[field] = row[field]
        # Study-level documents carry no country or site assignment.
        set_fields["countries"] = []
        set_fields["sites"] = []

        if apply:
            update = {
                "$set": set_fields,
                "$addToSet": {"scopes": scope_key, "levels": "study"},
            }
            coll.update_one({"_id": key}, update)
        # Counted in dry-run as well, so the summary shows what would change.
        enriched += 1

    tag = "APPLY" if apply else "DRY"
    tail = f", {missing} v Mongo chybělo." if missing else "."
    log(f"[{tag}] [mongo] Obohaceno {enriched} dokumentů" + tail)
# --- Fáze SEAWEED ------------------------------------------------------
def phase_seaweed(mod, coll, apply):
    """Re-key SeaweedFS objects to the v1.6 path and drop the ``file`` field.

    Candidates are downloaded, non-placeholder documents that still have a
    ``file`` value. For each one the new path comes from
    ``mod.seaweed_path(vtmf, version, ext)``, where *ext* is the suffix of the
    ``file`` path. The bytes are read from that file on disk, or fetched from
    the old SHA path when the file is gone. After a successful upload the
    Mongo document gets the new path/url, a recomputed sha256 and a sync
    timestamp, ``file`` is removed, and the old SHA object is deleted.

    Documents already stored under the new path only get ``file`` removed.

    Args:
        mod: the loaded pipeline module (seaweed_path, seaweed_store,
            SEAWEED_FILER).
        coll: Mongo collection holding the documents.
        apply: when false, nothing is written or uploaded; the plan is only
            printed. Local file existence is still checked so that the
            "missing source" figure is meaningful in a dry run.
    """
    q = {"downloaded": True, "placeholder": {"$ne": True}, "file": {"$ne": None}}
    docs = list(coll.find(q))
    log(f"[i] [seaweed] Kandidátů (s polem file): {len(docs)}")
    uploaded = old_deleted = unset = missing = err = already = 0
    for doc in docs:
        key = doc["_id"]
        src = Path(doc["file"])
        ext = src.suffix
        new_path = mod.seaweed_path(doc["vtmf"], doc["version"], ext)
        old_path = doc.get("seaweed_path")
        old_is_sha = bool(old_path and OLD_SHA_PATH_RE.match(old_path))
        if old_path == new_path:
            already += 1
            if apply:  # nothing to upload, just drop the obsolete file field
                coll.update_one({"_id": key}, {"$unset": {"file": ""}})
                unset += 1
            continue
        try:
            # Byte source: the file on disk, falling back to a GET from the
            # old SHA path. Checked before the dry-run branch so that a dry
            # run does not announce a PUT that apply would have to skip.
            on_disk = src.exists()
            if not on_disk and not old_is_sha:
                missing += 1
                # Only the first ten are listed to keep the log short.
                if missing <= 10:
                    log(f" [!] {key}: zdroj nedostupný (soubor i SHA chybí).")
                continue
            if not apply:
                note = f" (smazat starou {old_path})" if old_is_sha else ""
                log(f" PUT {new_path}{note} (+ unset file)")
                continue
            if on_disk:
                data = src.read_bytes()
            else:
                data = http_get(mod.SEAWEED_FILER + old_path)
            mime = mimetypes.guess_type("f" + ext)[0] or "application/octet-stream"
            sw_path, sw_url = mod.seaweed_store(doc["vtmf"], doc["version"], ext, data, mime)
            coll.update_one({"_id": key}, {
                "$set": {"seaweed_path": sw_path, "seaweed_url": sw_url,
                         "sha256": hashlib.sha256(data).hexdigest(),
                         "seaweed_synced_at": datetime.now()},
                "$unset": {"file": ""}})
            uploaded += 1
            unset += 1
            if old_is_sha and old_path != sw_path:
                if seaweed_delete(mod.SEAWEED_FILER + old_path):
                    old_deleted += 1
                else:
                    # Mongo no longer references the old path at this point,
                    # so a later run cannot retry the delete. Record the
                    # orphaned object so it can be removed by hand.
                    log(f" [!] {key}: starou SHA cestu {old_path} se nepodařilo "
                        f"smazat (zůstává v SeaweedFS, smaž ručně).")
        except Exception as e:
            # One failing document must not abort the whole migration.
            err += 1
            log(f" [!] {key}: SeaweedFS selhal: {e}")
    log(f"[{'APPLY' if apply else 'DRY'}] [seaweed] Překlíčováno {uploaded}, "
        f"už na nové cestě {already}, starých SHA smazáno {old_deleted}, "
        f"pole file odebráno {unset}, chybí zdroj {missing}, chyb {err}.")
# --- Main --------------------------------------------------------------
def main():
    """Parse the command line and run the selected migration phase(s).

    Options:
        --phase {mongo,seaweed,all}: which phase to run (default "all").
        --apply: perform the changes; without it the run is a dry run.
    """
    parser = argparse.ArgumentParser(description="Migrace VTMF dat na schéma v1.6")
    parser.add_argument(
        "--phase",
        choices=["mongo", "seaweed", "all"],
        default="all",
    )
    parser.add_argument(
        "--apply",
        action="store_true",
        help="ostrý běh (bez něj jen DRY-RUN)",
    )
    opts = parser.parse_args()

    if opts.apply:
        mode = "APPLY (ostře)"
    else:
        mode = "DRY-RUN (nic se nemění)"
    log(f"=== Migrace na v1.6 — fáze: {opts.phase} — režim: {mode} ===\n")

    pipeline = load_pipeline()
    # get_db() yields three values; only the middle one (the collection) is used.
    _, coll, _ = pipeline.get_db()
    log(f"[ok] Mongo: {pipeline.MONGO_URI} / {pipeline.MONGO_DB}.{pipeline.MONGO_COLL}\n")

    # Phases run in a fixed order: mongo first, then seaweed.
    if opts.phase in ("mongo", "all"):
        phase_mongo(pipeline, coll, opts.apply)
        log("")
    if opts.phase in ("seaweed", "all"):
        phase_seaweed(pipeline, coll, opts.apply)
        log("")

    if opts.apply:
        log("=== Migrace dokončena. ===")
    else:
        log("=== DRY-RUN hotov. Pro ostrý běh přidej --apply. ===")
# Run the migration only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()