# ============================================================
# migrate_to_v16.py
# Version:     1.1
# Date:        2026-06-15
# Description: One-off migration of the existing STUDY-level data
#              (collected by pipeline v1.3–v1.5) to the v1.6 schema.
#
# v1.6 stores documents ONLY in SeaweedFS (no Dropbox);
# key = document number + version. Two phases:
#
#   [mongo]   Re-parse the NEWEST archived study report
#             (WhatToDownload/Zpracovano/*Study Level*.xlsx)
#             with the v1.6 parser and enrich the existing documents
#             with the new fields (level, levels[], scopes[], studies[],
#             countries=[], sites=[], classification,
#             process_name, external_system_name, created_by,
#             last_modified_by, version_created_by).
#             Does NOT touch the download state (downloaded, sha256,
#             seaweed_*, history, first_seen).
#
#   [seaweed] Re-key SeaweedFS from the old SHA paths to the new
#             /vtmf-documents/<vtmf>/<version>.<extension>. Source of
#             the bytes = the existing file on disk (the `file` field),
#             with a GET from the old SHA path as fallback. On success:
#             fix seaweed_path/url + sha256 in Mongo, delete the old SHA
#             path and REMOVE the `file` field from Mongo (Dropbox is no
#             longer used; the physical files in Dropbox can then be
#             deleted manually).
#
# DEFAULT is DRY-RUN. Live run only with --apply. Idempotent.
#
# Usage:
#   python migrate_to_v16.py                        # dry-run, everything
#   python migrate_to_v16.py --apply                # live, everything
#   python migrate_to_v16.py --phase mongo --apply
#   python migrate_to_v16.py --phase seaweed --apply
# ============================================================
|
||
|
||
import argparse
|
||
import hashlib
|
||
import importlib.util
|
||
import mimetypes
|
||
import re
|
||
import urllib.error
|
||
import urllib.request
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
|
||
# Directory that contains this script; the pipeline module is looked up next to it.
SCRIPT_DIR = Path(__file__).resolve().parent
PIPE_FILE = SCRIPT_DIR.joinpath("vtmf_pipeline_v1.6.py")

# Old SHA-256 content-addressed path shape (to be removed from SeaweedFS):
# /vtmf-documents/<2 hex>/<2 hex>/<64 hex>
OLD_SHA_PATH_RE = re.compile(
    r"^/vtmf-documents/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}$"
)
|
||
|
||
|
||
def load_pipeline():
    """Load the pipeline script at ``PIPE_FILE`` and return it as a module.

    The script is loaded by file path through ``importlib`` under the module
    name ``vtmf_pipeline_v16``. The module is executed once here; it is not
    registered in ``sys.modules``.

    Returns:
        The executed module object.
    """
    pipeline_spec = importlib.util.spec_from_file_location(
        "vtmf_pipeline_v16", PIPE_FILE
    )
    pipeline = importlib.util.module_from_spec(pipeline_spec)
    # Running the loader executes the script's top-level code.
    pipeline_spec.loader.exec_module(pipeline)
    return pipeline
|
||
|
||
|
||
def log(msg):
    """Print *msg* to standard output and flush the stream immediately."""
    print(msg, flush=True)
|
||
|
||
|
||
def http_get(url):
    """Fetch *url* and return the complete response body as bytes.

    The request uses a 120-second timeout. Any exception raised by
    ``urllib.request.urlopen`` propagates to the caller.
    """
    response = urllib.request.urlopen(url, timeout=120)
    # The context manager closes the response once the body has been read.
    with response:
        body = response.read()
    return body
|
||
|
||
|
||
def seaweed_delete(url):
    """Send an HTTP DELETE to *url* and report whether it counts as done.

    Never raises: every failure is turned into a ``False`` result.

    Args:
        url: Full URL of the resource to delete.

    Returns:
        True if the request completed without an error, or if it failed
        with an HTTP status of 404, 204 or 200. False for any other HTTP
        error and for any other exception (bad URL, connection failure,
        timeout, ...).
    """
    try:
        request = urllib.request.Request(url, method="DELETE")
        # Close the response explicitly so the connection is released right
        # away instead of lingering until garbage collection; this function
        # is called once per migrated document.
        with urllib.request.urlopen(request, timeout=30):
            return True
    except urllib.error.HTTPError as e:
        # An HTTPError also wraps an open response; release it as well.
        e.close()
        # NOTE(review): 404 is accepted, presumably because a path that is
        # already gone needs no deletion — confirm.
        return e.code in (404, 204, 200)
    except Exception:
        return False
|
||
|
||
|
||
# --- Fáze MONGO --------------------------------------------------------
|
||
|
||
def phase_mongo(mod, coll, apply):
    """Enrich existing Mongo documents with fields parsed from a study report.

    Looks for ``*Study Level*.xlsx`` files in ``WhatToDownload/Zpracovano``
    under ``SCRIPT_DIR``, takes the last one in sorted path order as the
    newest, parses it with ``mod.read_documents_from_excel`` and keeps the
    rows whose ``studies`` contain ``mod.TARGET_STUDY``. For every row whose
    key already exists in *coll*, the parsed fields are written with ``$set``
    and the study scope/level are added with ``$addToSet``. Rows without a
    matching Mongo document are skipped (only the first 10 are logged).

    Args:
        mod: The loaded pipeline module (provides ``read_documents_from_excel``,
            ``doc_key`` and ``TARGET_STUDY``).
        coll: Mongo collection holding the documents.
        apply: When false, nothing is written; the documents that would be
            enriched are only counted.
    """
    archive_dir = SCRIPT_DIR / "WhatToDownload" / "Zpracovano"
    candidates = sorted(archive_dir.glob("*Study Level*.xlsx"))
    if not candidates:
        log("[!] Nenašel jsem žádný archivovaný study report — fáze mongo přeskočena.")
        return

    newest = candidates[-1]
    log(f"[i] [mongo] Re-parse: {newest.name}")
    parsed = mod.read_documents_from_excel(newest, "study")
    docs = [row for row in parsed if mod.TARGET_STUDY in row["studies"]]
    log(f"[i] [mongo] {len(docs)} dokumentů study-level {mod.TARGET_STUDY}.")

    scope_key = f"study|{mod.TARGET_STUDY}|"
    # Fields copied verbatim from the parsed row, in the order they are written.
    copied_fields = (
        "url", "name", "status", "type", "subtype", "classification", "desc",
        "process_name", "external_system_name", "created_by",
        "last_modified_by", "version_created_by", "date", "studies",
    )

    enriched = 0
    missing = 0
    for row in docs:
        key = mod.doc_key(row["vtmf"], row["version"])
        existing = coll.find_one({"_id": key}, {"_id": 1})
        if not existing:
            missing += 1
            if missing <= 10:
                log(f"  [!] V Mongo chybí {key} (přeskočeno).")
            continue

        # Built in dry-run too, so a row with a missing field fails the same way.
        set_fields = {"level": "study"}
        for field in copied_fields:
            set_fields[field] = row[field]
        set_fields["countries"] = []
        set_fields["sites"] = []

        if apply:
            update = {
                "$set": set_fields,
                "$addToSet": {"scopes": scope_key, "levels": "study"},
            }
            coll.update_one({"_id": key}, update)
        enriched += 1

    mode = "APPLY" if apply else "DRY"
    tail = f", {missing} v Mongo chybělo." if missing else "."
    log(f"[{mode}] [mongo] Obohaceno {enriched} dokumentů" + tail)
|
||
|
||
|
||
# --- Fáze SEAWEED ------------------------------------------------------
|
||
|
||
def phase_seaweed(mod, coll, apply):
    """Re-key stored files from old SHA paths to the new per-document paths.

    Candidates are documents with ``downloaded == True``, ``placeholder`` not
    true and a non-null ``file`` field. For each one the new path is computed
    with ``mod.seaweed_path(vtmf, version, extension)``:

    * Already on the new path: only the ``file`` field is unset (when
      applying).
    * Dry-run: the planned PUT is logged and nothing is changed.
    * Otherwise the bytes are read from the file on disk, or fetched from the
      old SHA path when the file is not there, stored through
      ``mod.seaweed_store`` and the Mongo document is updated
      (``seaweed_path``, ``seaweed_url``, ``sha256``, ``seaweed_synced_at``;
      ``file`` unset). The old SHA path is then deleted, but only once no
      other document still points at it.

    Errors for a single document are logged and counted; the loop continues.

    Args:
        mod: The loaded pipeline module (provides ``seaweed_path``,
            ``seaweed_store`` and ``SEAWEED_FILER``).
        coll: Mongo collection holding the documents.
        apply: When false, run as a dry-run and change nothing.
    """
    q = {"downloaded": True, "placeholder": {"$ne": True}, "file": {"$ne": None}}
    # Materialise the result first: documents are updated while we iterate.
    docs = list(coll.find(q))
    log(f"[i] [seaweed] Kandidátů (s polem file): {len(docs)}")

    uploaded = old_deleted = unset = missing = err = already = 0
    for doc in docs:
        key = doc["_id"]
        src = Path(doc["file"])
        ext = src.suffix
        new_path = mod.seaweed_path(doc["vtmf"], doc["version"], ext)
        old_path = doc.get("seaweed_path")
        old_is_sha = bool(old_path and OLD_SHA_PATH_RE.match(old_path))

        if old_path == new_path:
            already += 1
            if apply:  # only catch up: drop the file field
                coll.update_one({"_id": key}, {"$unset": {"file": ""}})
                unset += 1
            continue

        if not apply:
            note = f" (smazat starou {old_path})" if old_is_sha else ""
            log(f"  PUT {new_path}{note} (+ unset file)")
            continue

        # Source of the bytes: disk, with a GET from the old SHA path as fallback.
        try:
            # is_file() rather than exists(): a path that resolves to a
            # directory (e.g. an empty `file` value) must use the fallback
            # instead of failing in read_bytes().
            if src.is_file():
                data = src.read_bytes()
            elif old_is_sha:
                data = http_get(mod.SEAWEED_FILER + old_path)
            else:
                missing += 1
                if missing <= 10:
                    log(f"  [!] {key}: zdroj nedostupný (soubor i SHA chybí).")
                continue

            mime = mimetypes.guess_type("f" + ext)[0] or "application/octet-stream"
            sw_path, sw_url = mod.seaweed_store(doc["vtmf"], doc["version"], ext, data, mime)
            coll.update_one({"_id": key}, {
                "$set": {"seaweed_path": sw_path, "seaweed_url": sw_url,
                         "sha256": hashlib.sha256(data).hexdigest(),
                         "seaweed_synced_at": datetime.now()},
                "$unset": {"file": ""}})
            uploaded += 1
            unset += 1

            if old_is_sha and old_path != sw_path:
                # The old path is derived from the content hash, so several
                # documents with identical bytes can share it. Deleting it
                # while another document still references it would remove
                # that document's fallback source; the last document to be
                # migrated performs the delete.
                still_referenced = coll.find_one(
                    {"seaweed_path": old_path, "_id": {"$ne": key}}, {"_id": 1})
                if not still_referenced and seaweed_delete(mod.SEAWEED_FILER + old_path):
                    old_deleted += 1
        except Exception as e:
            err += 1
            log(f"  [!] {key}: SeaweedFS selhal: {e}")

    log(f"[{'APPLY' if apply else 'DRY'}] [seaweed] Překlíčováno {uploaded}, "
        f"už na nové cestě {already}, starých SHA smazáno {old_deleted}, "
        f"pole file odebráno {unset}, chybí zdroj {missing}, chyb {err}.")
|
||
|
||
|
||
# --- Main --------------------------------------------------------------
|
||
|
||
def main():
    """Parse the command line and run the selected migration phase(s).

    ``--phase`` selects ``mongo``, ``seaweed`` or ``all`` (default). Without
    ``--apply`` the phases run as a dry-run. The pipeline module is loaded
    with ``load_pipeline()`` and the Mongo collection is taken from its
    ``get_db()``.
    """
    parser = argparse.ArgumentParser(description="Migrace VTMF dat na schéma v1.6")
    parser.add_argument("--phase", choices=["mongo", "seaweed", "all"], default="all")
    parser.add_argument(
        "--apply",
        action="store_true",
        help="ostrý běh (bez něj jen DRY-RUN)",
    )
    args = parser.parse_args()

    if args.apply:
        mode = "APPLY (ostře)"
    else:
        mode = "DRY-RUN (nic se nemění)"
    log(f"=== Migrace na v1.6 — fáze: {args.phase} — režim: {mode} ===\n")

    mod = load_pipeline()
    # get_db() yields three values; only the middle one (the collection) is used.
    _, coll, _ = mod.get_db()
    log(f"[ok] Mongo: {mod.MONGO_URI} / {mod.MONGO_DB}.{mod.MONGO_COLL}\n")

    # Phases always run in this order: mongo first, then seaweed.
    for phase_name, run_phase in (("mongo", phase_mongo), ("seaweed", phase_seaweed)):
        if args.phase in (phase_name, "all"):
            run_phase(mod, coll, args.apply)
            log("")

    if args.apply:
        log("=== Migrace dokončena. ===")
    else:
        log("=== DRY-RUN hotov. Pro ostrý běh přidej --apply. ===")
|
||
|
||
|
||
# Run the migration only when the file is executed as a script.
if __name__ == "__main__":
    main()
|