# =============================================================================
# Name:    materialize_lab_reports_v1.0.py
# Version: 1.0
# Date:    2026-06-16
# Purpose: Materializes Lab Reports from MongoDB (covance.labreports) into
#          Dropbox. Goal: the target directory is an IDENTICAL COPY of Mongo
#          (full mirror) - missing PDFs are added and surplus PDFs are
#          DELETED. No other ambition.
#
# Layout:  {base}/{site}/{fileName}
#          (one subfolder per site, holding all PDFs of that site).
#
# Document key = record_id (reportId). File name = doc.fileName.
# When several different reports (different record_id) share the same
# fileName within one site, " [first 8 chars of record_id]" is appended to
# ALL members of that group, so names stay unique and stable (idempotent
# mirror). If even those short tags clash, the full record_id is used.
#
# Efficiency: only metadata is read from Mongo up front; the PDF binary is
# fetched only for files that need writing (an existing file of matching
# size is skipped). --verify additionally compares sha256.
#
# Safeguard: if Mongo returns 0 documents for a study, the study is SKIPPED
# (a DB outage must not wipe the whole directory); --allow-empty overrides.
#
# Switches: --dry-run      preview only, nothing is written or deleted
#           --verify       also compare sha256
#           --study        process a single study only
#           --allow-empty  allow mirroring even when Mongo returns 0 documents
# =============================================================================

import argparse
import hashlib
import os
import re

# ---------------------------------------------------------------------------
# Target paths in Dropbox - key = studyCode (as stored in Mongo).
# ---------------------------------------------------------------------------
STUDY_PATHS = {
    "77242113UCO3001": r"u:\Dropbox\!77242113UCO3001\#020 Reporty\LabCorp",
    "42847922MDD3003": r"u:\Dropbox\!!42847922MDD3003\#000 Reports\LabCorp",
}

MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "covance"
COLLECTION = "labreports"

parser = argparse.ArgumentParser(description="Materializace Lab Reports z Mongo do Dropboxu (zrcadlo).")
parser.add_argument("--dry-run", action="store_true", help="jen nahled, nic nezapise/nesmaze")
parser.add_argument("--verify", action="store_true", help="u existujicich porovnat i sha256 (jinak jen velikost)")
parser.add_argument("--study", default=None, help="zpracovat jen jednu studii (studyCode)")
parser.add_argument("--allow-empty", action="store_true", help="povolit zrcadleni i kdyz Mongo vrati 0 dokumentu")

# Defaults only: importing this module must not read sys.argv.
# main() replaces this with the parsed command line.
ARGS = parser.parse_args([])

# Characters that are not allowed in a Windows file or directory name.
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|]')


def log(msg):
    """Print *msg* and flush immediately so progress is visible in real time."""
    print(msg, flush=True)


def safe(s: str) -> str:
    """Return *s* without path-unsafe characters and surrounding whitespace.

    A falsy *s* (``None``, ``""``) yields ``""``.
    """
    return _UNSAFE_CHARS.sub("", s or "").strip()


def sha256_file(path: str) -> str:
    """Return the hex sha256 digest of the file at *path*, read in 1 MiB chunks."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()


def _path_component(value, fallback: str) -> str:
    """Turn a DB value into a single safe path component.

    Values that sanitize to nothing, or to ``.`` / ``..`` (which would alias
    or escape the parent directory), are replaced by *fallback*.
    """
    text = safe("" if value is None else str(value))
    return fallback if text in ("", ".", "..") else text


def plan_study(col, study_code: str, base: str):
    """Compute the expected files of one study and resolve file-name collisions.

    Args:
        col: collection-like object; only ``find(filter, projection)`` is used.
        study_code: value matched against the ``studyCode`` field.
        base: target root directory of the study.

    Returns:
        ``(docs, expected)`` where *docs* is the list of metadata documents
        and *expected* maps normalized absolute path -> metadata document.
    """
    docs = list(col.find(
        {"studyCode": study_code},
        {"record_id": 1, "site": 1, "fileName": 1, "pdfSize": 1, "pdfSha256": 1},
    ))

    # Group by (site, fileName) to detect collisions. normcase makes the
    # grouping case-insensitive on Windows, where "A.pdf" and "a.pdf" are the
    # same file; on POSIX it is the identity.
    groups = {}
    for d in docs:
        center = _path_component(d.get("site"), "_")
        name = _path_component(d.get("fileName"), safe(f"{d['record_id']}.pdf"))
        key = (os.path.normcase(center), os.path.normcase(name))
        groups.setdefault(key, []).append((center, name, d))

    expected = {}  # abs_path -> doc
    for members in groups.values():
        collide = len(members) > 1
        full_ids = [safe(str(d["record_id"])) for _center, _name, d in members]
        tags = [rid[:8] for rid in full_ids]
        # Short tags are only a prefix; fall back to the full id when two
        # members would otherwise end up on the same path.
        if len({os.path.normcase(t) for t in tags}) < len(tags):
            tags = full_ids
        for (center, name, d), tag in zip(members, tags):
            if collide:
                if name.lower().endswith(".pdf"):
                    stem, ext = name[:-4], name[-4:]
                else:
                    stem, ext = name, ""
                name = f"{stem} [{tag}]{ext}"
            path = os.path.join(base, center, name)
            expected[os.path.normpath(path)] = d
    return docs, expected


def existing_pdfs(base: str):
    """Return the set of normalized paths of all ``*.pdf`` files under *base*."""
    out = set()
    for root, _dirs, files in os.walk(base):
        for fn in files:
            if fn.lower().endswith(".pdf"):
                out.add(os.path.normpath(os.path.join(root, fn)))
    return out


def needs_write(path: str, doc, verify: bool) -> bool:
    """Decide whether the file at *path* must be (re)written from *doc*.

    True when the file is missing, its size differs from ``doc["pdfSize"]``
    (a missing ``pdfSize`` never matches), or - with *verify* - its sha256
    differs from ``doc["pdfSha256"]``.
    """
    try:
        size = os.path.getsize(path)
    except OSError:
        return True
    if size != doc.get("pdfSize", -1):
        return True
    return bool(verify and sha256_file(path) != doc.get("pdfSha256"))


def process_study(col, study_code: str, base: str):
    """Mirror one study into *base*: write missing PDFs, delete surplus ones.

    Honors the module-level ``ARGS`` switches (``dry_run``, ``verify``,
    ``allow_empty``). Does nothing when *base* does not exist, or when Mongo
    returns no documents and ``--allow-empty`` is not set.

    Args:
        col: collection-like object; ``find`` and ``find_one`` are used.
        study_code: value matched against the ``studyCode`` field.
        base: target root directory of the study.
    """
    log(f"\n=== {study_code} -> {base} ===")
    if not os.path.isdir(base):
        log(f" CHYBA: cilovy adresar neexistuje: {base} — PRESKAKUJI.")
        return

    docs, expected = plan_study(col, study_code, base)
    log(f" Mongo: {len(docs)} dokumentu, ocekavanych souboru: {len(expected)}.")

    if not docs and not ARGS.allow_empty:
        log(" POJISTKA: Mongo vratilo 0 dokumentu -> studie PRESKOCENA (nemazu nic). "
            "Pouzij --allow-empty kdyz je to zamer.")
        return

    have = existing_pdfs(base)
    to_write = [(p, d) for p, d in expected.items() if needs_write(p, d, ARGS.verify)]
    # Compare case-insensitively where the filesystem is case-insensitive, so
    # an on-disk file differing only in case from its expected path is not
    # treated as surplus and deleted.
    expected_keys = {os.path.normcase(p) for p in expected}
    to_delete = sorted(p for p in have if os.path.normcase(p) not in expected_keys)
    skip = len(expected) - len(to_write)
    log(f" PLAN: +{len(to_write)} zapsat, -{len(to_delete)} smazat, ={skip} beze zmeny.")

    if ARGS.dry_run:
        for p, _ in to_write[:10]:
            log(f" [DRY +] {os.path.relpath(p, base)}")
        if len(to_write) > 10:
            log(f" [DRY +] ... a dalsich {len(to_write)-10}")
        for p in to_delete[:10]:
            log(f" [DRY -] {os.path.relpath(p, base)}")
        if len(to_delete) > 10:
            log(f" [DRY -] ... a dalsich {len(to_delete)-10}")
        return

    # WRITE missing files (the PDF binary is fetched only now, only for these).
    written = 0
    failed = 0
    for p, d in to_write:
        full = col.find_one({"record_id": d["record_id"]}, {"pdf": 1})
        blob = full.get("pdf") if full else None
        if blob is None:
            # Document vanished since planning, or it carries no binary.
            log(f" CHYBA: dokument {d['record_id']} nema v Mongo PDF — PRESKAKUJI.")
            failed += 1
            continue
        try:
            os.makedirs(os.path.dirname(p), exist_ok=True)
            with open(p, "wb") as f:
                f.write(bytes(blob))
        except OSError as e:
            log(f" CHYBA zapisu {p}: {e!r}")
            failed += 1
            continue
        written += 1
        if written % 50 == 0:
            log(f" ... zapsano {written}/{len(to_write)}")

    # DELETE surplus files.
    deleted = 0
    for p in to_delete:
        try:
            os.remove(p)
            deleted += 1
        except OSError as e:
            log(f" CHYBA mazani {p}: {e!r}")

    # Remove empty subfolders under base (bottom-up so nested ones go first).
    removed_dirs = 0
    for root, _dirs, _files in os.walk(base, topdown=False):
        if root == base:
            continue
        if not os.listdir(root):
            try:
                os.rmdir(root)
                removed_dirs += 1
            except OSError:
                # Best effort: a folder that cannot be removed is left in place.
                pass

    log(f" HOTOVO: zapsano {written}, smazano {deleted}, smazano prazdnych slozek {removed_dirs}.")
    if failed:
        log(f" POZOR: {failed} souboru se nepodarilo zapsat.")


def main():
    """Parse the command line, connect to Mongo and mirror the selected studies."""
    global ARGS
    ARGS = parser.parse_args()

    # Validate the requested study before touching the network.
    items = list(STUDY_PATHS.items())
    if ARGS.study:
        items = [(k, v) for k, v in items if k == ARGS.study]
        if not items:
            log(f"Neznamy studyCode: {ARGS.study}. Dostupne: {list(STUDY_PATHS)}")
            return

    # Imported here so the planning/diff helpers above stay importable on
    # machines without the MongoDB driver.
    from pymongo import MongoClient

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        col = client[DB_NAME][COLLECTION]
        log(f"Materializace {'(DRY-RUN) ' if ARGS.dry_run else ''}-> Dropbox, {len(items)} studii"
            f"{', --verify' if ARGS.verify else ''}.")
        for study_code, base in items:
            process_study(col, study_code, base)
    finally:
        # Always release the connection, including on errors.
        client.close()
    log("\nKONEC.")


if __name__ == "__main__":
    main()