# =============================================================================
# Název:   materialize_lab_reports_v1.0.py
# Verze:   1.0
# Datum:   2026-06-16
# Popis:   Materializuje Lab Reports z MongoDB (covance.labreports) do Dropboxu.
# Cíl:     adresář = IDENTICKÁ KOPIE Monga (plné zrcadlo) — doplní chybějící
#          PDF a SMAŽE přebytečné. Žádná jiná ambice.
#
# Struktura: <base studie>/<centrum siteNum>/<fileName>.pdf
#            (podsložka per centrum, do ní všechna PDF toho centra).
#
# Klíč dokumentu = record_id (reportId). Název souboru = doc.fileName.
# Když víc různých reportů (různé reportId) má v jednom centru stejný
# fileName, přidá se ke VŠEM z té skupiny " [<reportId[:8]>]" -> vždy
# unikátní a stabilní (idempotentní zrcadlo).
#
# Efektivita: čte z Monga jen metadata; PDF binárku tahá jen pro
#             soubory, které chybí (existující se shodnou velikostí se
#             přeskočí, obsah reportu je v čase neměnný). --verify navíc
#             kontroluje sha256.
#
# Pojistka: pokud Mongo vrátí pro studii 0 dokumentů, studie se
#           PŘESKOČÍ (nesmaže celý adresář kvůli výpadku DB) — obejde
#           --allow-empty.
#
# Přepínače: --dry-run (jen náhled, nic nezapíše/nesmaže),
#            --verify (porovnat i sha256), --study <kód> (jen jedna),
#            --allow-empty (povolit zrcadlení i když Mongo vrátí 0).
# =============================================================================
import argparse
import hashlib
import os
import re

from pymongo import MongoClient
# ---------------------------------------------------------------------------
# Target directories in Dropbox, keyed by studyCode (as stored in Mongo).
# ---------------------------------------------------------------------------
STUDY_PATHS = {
    "77242113UCO3001": r"u:\Dropbox\!77242113UCO3001\#020 Reporty\LabCorp",
    "42847922MDD3003": r"u:\Dropbox\!!42847922MDD3003\#000 Reports\LabCorp",
}

# MongoDB server, database and collection that hold the lab reports.
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "covance"
COLLECTION = "labreports"
# Command-line interface of the script.
parser = argparse.ArgumentParser(
    description="Materializace Lab Reports z Mongo do Dropboxu (zrcadlo).",
)
parser.add_argument(
    "--dry-run",
    action="store_true",
    help="jen nahled, nic nezapise/nesmaze",
)
parser.add_argument(
    "--verify",
    action="store_true",
    help="u existujicich porovnat i sha256 (jinak jen velikost)",
)
parser.add_argument(
    "--study",
    default=None,
    help="zpracovat jen jednu studii (studyCode)",
)
parser.add_argument(
    "--allow-empty",
    action="store_true",
    help="povolit zrcadleni i kdyz Mongo vrati 0 dokumentu",
)

# Parsed once at import time; process_study() and main() read this global.
# Unknown options are rejected by argparse, which matters for a script that
# deletes files: a mistyped --dry-run must abort rather than be ignored.
ARGS = parser.parse_args()
def log(msg: object) -> None:
    """Print *msg* to stdout and flush at once so progress is visible live."""
    print(msg, flush=True)
def safe(s: str) -> str:
    """Return *s* reduced to text usable as one Windows path component.

    Args:
        s: Raw value (normally a string; other types are converted with
            ``str``). Any falsy value yields an empty string.

    Returns:
        The text with the characters ``\\ / : * ? " < > |`` and ASCII control
        characters removed, surrounding whitespace stripped and trailing dots
        and spaces dropped. May be empty; the caller chooses the fallback.
    """
    text = str(s) if s else ""
    text = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "", text).strip()
    # Windows silently drops trailing dots/spaces when creating an entry, so
    # keeping them would make the planned path differ from the on-disk path.
    # This also reduces "." and ".." to "", so a value cannot climb out of
    # the directory it is joined to.
    return text.rstrip(". ")
def sha256_file(path: str) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 1 MiB chunks so large PDFs are never held in memory
    as a whole.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()
def plan_study(col, study_code: str, base: str):
    """Compute the files the mirror of one study is expected to contain.

    Only metadata is fetched here (record_id, site, fileName, pdfSize,
    pdfSha256); the PDF payload itself is not part of the projection.

    Naming rules:
      * Target path is ``<base>/<centre>/<fileName>``; both the centre and
        the file name are passed through ``safe`` because they come from the
        database and must not carry path separators or illegal characters.
      * Documents are grouped by (centre, fileName) compared without regard
        to case, since the target filesystem treats such names as the same
        file. When a group has more than one member, every member gets a
        `` [<record_id[:8]>]`` tag before the ``.pdf`` extension. If the
        8-character prefixes of the distinct ids in a group are not unique,
        the full record_id is used for that whole group, so names stay
        unique and do not depend on the order Mongo returns documents in.

    Args:
        col: Collection-like object providing ``find(filter, projection)``.
        study_code: Value matched against the ``studyCode`` field.
        base: Root directory of the study's mirror.

    Returns:
        Tuple ``(docs, expected)``: the list of metadata documents and a
        dict mapping each normalised target path to its document.
    """
    docs = list(col.find(
        {"studyCode": study_code},
        {"record_id": 1, "site": 1, "fileName": 1, "pdfSize": 1, "pdfSha256": 1},
    ))

    center_names = {}  # case-folded centre -> first spelling seen
    groups = {}        # (centre, fileName) case-folded -> members
    for doc in docs:
        record_id = str(doc["record_id"])
        center = safe(str(doc.get("site") or "_")) or "_"
        # One spelling per centre, so all its files share a single folder.
        center = center_names.setdefault(center.casefold(), center)
        fname = safe(str(doc.get("fileName") or "")) or (safe(record_id) + ".pdf")
        key = (center.casefold(), fname.casefold())
        groups.setdefault(key, []).append((center, fname, record_id, doc))

    expected = {}  # abs_path -> doc
    for members in groups.values():
        collide = len(members) > 1
        ids = {rid for _center, _fname, rid, _doc in members}
        use_full_id = len({rid[:8] for rid in ids}) < len(ids)
        for center, fname, record_id, doc in members:
            name = fname
            if collide:
                tag = safe(record_id if use_full_id else record_id[:8])
                if fname.lower().endswith(".pdf"):
                    stem, ext = fname[:-4], fname[-4:]
                else:
                    stem, ext = fname, ""
                name = f"{stem} [{tag}]{ext}"
            path = os.path.join(base, center, name)
            expected[os.path.normpath(path)] = doc
    return docs, expected
def existing_pdfs(base: str):
    """Return the set of normalised paths of all PDF files below *base*.

    The tree is walked recursively; a file counts as a PDF when its name
    ends with ``.pdf`` in any letter case. A missing *base* yields an empty
    set because ``os.walk`` produces nothing for it.
    """
    return {
        os.path.normpath(os.path.join(root, fn))
        for root, _dirs, files in os.walk(base)
        for fn in files
        if fn.lower().endswith(".pdf")
    }
def needs_write(path: str, doc, verify: bool) -> bool:
    """Decide whether the file at *path* must be (re)written from *doc*.

    Args:
        path: Target file path.
        doc: Metadata mapping; ``pdfSize`` and ``pdfSha256`` are read.
        verify: When true, the file's SHA-256 is also compared.

    Returns:
        True when the file is missing or unreadable, when its size differs
        from ``pdfSize`` (a document without ``pdfSize`` never matches), or,
        with *verify*, when its digest differs from ``pdfSha256``.
    """
    try:
        # One stat call instead of exists() + getsize(): no window in which
        # the file can vanish between the two.
        size = os.path.getsize(path)
    except OSError:
        return True
    if size != doc.get("pdfSize", -1):
        return True
    if verify:
        stored = doc.get("pdfSha256")
        if isinstance(stored, str):
            # hexdigest() is lower-case; a digest stored in upper case is
            # still the same hash and must not force a rewrite.
            stored = stored.lower()
        return sha256_file(path) != stored
    return False
def process_study(col, study_code: str, base: str):
    """Make the directory *base* mirror the Mongo documents of one study.

    Steps: plan the expected files, write the missing or changed ones, delete
    PDFs that are not expected, then remove empty sub-directories. Reads the
    module-level ``ARGS`` (``allow_empty``, ``verify``, ``dry_run``).

    Nothing is touched when *base* does not exist, when Mongo returns no
    documents and ``--allow-empty`` is not given, or in ``--dry-run`` mode
    (which only logs the first entries of the plan).

    Args:
        col: Mongo collection holding the reports.
        study_code: Study to process.
        base: Root directory of that study's mirror.
    """
    log(f"\n=== {study_code} -> {base} ===")
    if not os.path.isdir(base):
        log(f" CHYBA: cilovy adresar neexistuje: {base} — PRESKAKUJI.")
        return

    docs, expected = plan_study(col, study_code, base)
    log(f" Mongo: {len(docs)} dokumentu, ocekavanych souboru: {len(expected)}.")

    if not docs and not ARGS.allow_empty:
        log(" POJISTKA: Mongo vratilo 0 dokumentu -> studie PRESKOCENA (nemazu nic). "
            "Pouzij --allow-empty kdyz je to zamer.")
        return

    have = existing_pdfs(base)
    to_write = [(p, d) for p, d in expected.items() if needs_write(p, d, ARGS.verify)]
    # Compare through normcase: on Windows a file whose on-disk letter case
    # differs from the planned name is found by needs_write() (so it is not
    # rewritten) and must therefore not be deleted as surplus either.
    # normcase is a no-op on case-sensitive platforms.
    wanted = {os.path.normcase(p) for p in expected}
    to_delete = sorted(p for p in have if os.path.normcase(p) not in wanted)

    skip = len(expected) - len(to_write)
    log(f" PLAN: +{len(to_write)} zapsat, -{len(to_delete)} smazat, ={skip} beze zmeny.")

    if ARGS.dry_run:
        _log_preview("+", [p for p, _ in to_write], base)
        _log_preview("-", to_delete, base)
        return

    # WRITE what is missing (the PDF payload is fetched only now, per file).
    written = 0
    failed = 0
    for path, doc in to_write:
        if _write_pdf(col, path, doc):
            written += 1
            if written % 50 == 0:
                log(f" ... zapsano {written}/{len(to_write)}")
        else:
            failed += 1

    # DELETE what is surplus.
    deleted = 0
    for path in to_delete:
        try:
            os.remove(path)
        except OSError as e:
            log(f" CHYBA mazani {path}: {e!r}")
        else:
            deleted += 1

    removed_dirs = _prune_empty_dirs(base)

    log(f" HOTOVO: zapsano {written}, smazano {deleted}, smazano prazdnych slozek {removed_dirs}.")
    if failed:
        log(f" CHYBA: {failed} souboru se nepodarilo zapsat.")


def _log_preview(tag, paths, base, limit=10):
    """Log up to *limit* planned paths, relative to *base*, marked with *tag*."""
    for path in paths[:limit]:
        log(f" [DRY {tag}] {os.path.relpath(path, base)}")
    if len(paths) > limit:
        log(f" [DRY {tag}] ... a dalsich {len(paths)-limit}")


def _write_pdf(col, path, doc):
    """Fetch the PDF payload of *doc* and store it at *path*; True on success."""
    # Prefer _id (returned by default and always indexed) so exactly the
    # planned document is fetched; fall back to record_id when it is absent.
    if "_id" in doc:
        selector = {"_id": doc["_id"]}
    else:
        selector = {"record_id": doc["record_id"]}
    full = col.find_one(selector, {"pdf": 1})
    payload = full.get("pdf") if full else None
    if payload is None:
        log(f" CHYBA: dokument {doc['record_id']} nema v Mongu PDF — PRESKAKUJI {path}")
        return False

    data = bytes(payload)
    if len(data) != doc.get("pdfSize"):
        # needs_write() compares the file size with pdfSize, so such a file
        # would be fetched and rewritten on every run.
        log(f" VAROVANI: {path}: velikost PDF {len(data)} != pdfSize {doc.get('pdfSize')}")

    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Write to a side file and rename, so an interrupted run never leaves a
    # truncated file under the final *.pdf name.
    tmp = path + ".part"
    try:
        with open(tmp, "wb") as fh:
            fh.write(data)
        os.replace(tmp, path)
    except OSError as e:
        log(f" CHYBA zapisu {path}: {e!r}")
        try:
            os.remove(tmp)
        except OSError:
            pass  # nothing was left behind to clean up
        return False
    return True


def _prune_empty_dirs(base):
    """Remove empty directories below *base* (never *base*); return the count."""
    removed = 0
    # Bottom-up, so a parent emptied by removing its children goes as well.
    for root, _dirs, _files in os.walk(base, topdown=False):
        if root == base:
            continue
        try:
            os.rmdir(root)  # succeeds only for an empty directory
        except OSError:
            continue  # not empty or not removable: deliberate best effort
        removed += 1
    return removed
def main():
    """Mirror every configured study, or only the one named by ``--study``.

    An unknown ``--study`` value is reported and nothing else happens. The
    Mongo connection is checked with a ping before any work starts and is
    closed on every exit path, including errors.
    """
    if ARGS.study:
        if ARGS.study not in STUDY_PATHS:
            log(f"Neznamy studyCode: {ARGS.study}. Dostupne: {list(STUDY_PATHS)}")
            return
        items = [(ARGS.study, STUDY_PATHS[ARGS.study])]
    else:
        items = list(STUDY_PATHS.items())

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        # Fail fast, before planning anything, when the server is unreachable.
        client.admin.command("ping")
        col = client[DB_NAME][COLLECTION]

        log(f"Materializace {'(DRY-RUN) ' if ARGS.dry_run else ''}-> Dropbox, {len(items)} studii"
            f"{', --verify' if ARGS.verify else ''}.")
        for study_code, base in items:
            process_study(col, study_code, base)
    finally:
        client.close()
    log("\nKONEC.")
# Run the mirror only when executed as a script, not when imported.
if __name__ == "__main__":
    main()