z230
This commit is contained in:
@@ -0,0 +1,202 @@
|
||||
# =============================================================================
|
||||
# Název: materialize_lab_reports_v1.0.py
|
||||
# Verze: 1.0
|
||||
# Datum: 2026-06-16
|
||||
# Popis: Materializuje Lab Reports z MongoDB (covance.labreports) do Dropboxu.
|
||||
# Cíl: adresář = IDENTICKÁ KOPIE Monga (plné zrcadlo) — doplní chybějící
|
||||
# PDF a SMAŽE přebytečné. Žádná jiná ambice.
|
||||
#
|
||||
# Struktura: <base studie>/<centrum siteNum>/<fileName>.pdf
|
||||
# (podsložka per centrum, do ní všechna PDF toho centra).
|
||||
#
|
||||
# Klíč dokumentu = record_id (reportId). Název souboru = doc.fileName.
|
||||
# Kdyz vic ruznych reportu (ruzne reportId) ma v jednom centru stejny
|
||||
# fileName, prida se ke VSEM z te skupiny " [<reportId[:8]>]" -> vzdy
|
||||
# unikatni a stabilni (idempotentni zrcadlo).
|
||||
#
|
||||
# Efektivita: cte z Monga jen metadata; PDF binarku tahá jen pro
|
||||
# soubory, ktere chybi (existujici se shodnou velikosti se preskoci,
|
||||
# obsah reportu je v case nemenny). --verify navic kontroluje sha256.
|
||||
#
|
||||
# Pojistka: pokud Mongo vrati pro studii 0 dokumentu, studie se
|
||||
# PRESKOCI (nesmaze cely adresar kvuli vypadku DB) — obejde --allow-empty.
|
||||
#
|
||||
# Prepinace: --dry-run (jen nahled, nic nezapise/nesmaze),
|
||||
# --verify (porovnat i sha256), --study <kod> (jen jedna),
|
||||
# --allow-empty (povolit zrcadleni i kdyz Mongo vrati 0).
|
||||
# =============================================================================
|
||||
from pymongo import MongoClient
|
||||
import argparse
|
||||
import hashlib
|
||||
import os
|
||||
|
||||
# ---------------------------------------------------------------------------
# Target directories in Dropbox — key = studyCode (as stored in Mongo).
# ---------------------------------------------------------------------------
STUDY_PATHS = {
    "77242113UCO3001": r"u:\Dropbox\!77242113UCO3001\#020 Reporty\LabCorp",
    "42847922MDD3003": r"u:\Dropbox\!!42847922MDD3003\#000 Reports\LabCorp",
}

# Connection settings: server URI, database name and collection name that
# main() uses to obtain the collection the reports are read from.
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "covance"
COLLECTION = "labreports"
|
||||
|
||||
# Command-line interface. The parsed namespace is kept in the module-level
# ARGS, which process_study() and main() read directly.
# NOTE(review): arguments are parsed at import time, so importing this module
# from another program (or a test runner) parses that program's sys.argv.
parser = argparse.ArgumentParser(description="Materializace Lab Reports z Mongo do Dropboxu (zrcadlo).")
# Preview only: report the plan, write and delete nothing.
parser.add_argument("--dry-run", action="store_true", help="jen nahled, nic nezapise/nesmaze")
# Also compare sha256 of files that already exist (default compares size only).
parser.add_argument("--verify", action="store_true", help="u existujicich porovnat i sha256 (jinak jen velikost)")
# Restrict the run to a single studyCode from STUDY_PATHS.
parser.add_argument("--study", default=None, help="zpracovat jen jednu studii (studyCode)")
# Override the safety stop that skips a study for which Mongo returns 0 documents.
parser.add_argument("--allow-empty", action="store_true", help="povolit zrcadleni i kdyz Mongo vrati 0 dokumentu")
ARGS = parser.parse_args()
|
||||
|
||||
|
||||
def log(msg):
    """Print ``msg`` on standard output and flush the stream right away.

    Flushing on every call means each message is emitted immediately instead
    of waiting in the output buffer.
    """
    print(msg, flush=True)
|
||||
|
||||
|
||||
def safe(s: str) -> str:
    """Return ``s`` reduced to text usable as a single path component.

    The characters ``\\ / : * ? " < > |`` are removed and surrounding
    whitespace is trimmed. A falsy input (``None`` or ``""``) yields ``""``.
    """
    import re

    text = s if s else ""
    without_forbidden = re.sub(r'[\\/:*?"<>|]', "", text)
    return without_forbidden.strip()
|
||||
|
||||
|
||||
def sha256_file(path: str) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in pieces of at most 1 MiB, so its content is never
    held in memory as a whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            piece = stream.read(1 << 20)
            if not piece:
                # An empty read marks end of file.
                break
            digest.update(piece)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def plan_study(col, study_code: str, base: str):
    """Compute the set of files the mirror of one study should contain.

    Only metadata is requested from Mongo (record_id, site, fileName, pdfSize,
    pdfSha256); the projection does not ask for the PDF payload.

    Layout of a planned path: ``<base>/<center>/<file name>``, where the
    center folder comes from ``site`` and the file name from ``fileName``
    (falling back to ``<record_id>.pdf``). Both components are passed through
    safe(), so separators and other forbidden characters in the stored values
    cannot add path components or produce names that cannot be written.

    When several documents of one center share a file name, every member of
    that group gets `` [<tag>]`` inserted before the ``.pdf`` extension. The
    tag is the first 8 characters of the record_id; if those prefixes are not
    distinct within the group, the full record_id is used instead so that no
    planned path silently replaces another.

    Args:
        col: collection object providing ``find(filter, projection)``.
        study_code: value matched against the ``studyCode`` field.
        base: target directory of the study.

    Returns:
        Tuple ``(docs, expected)``: ``docs`` is the list of metadata
        documents, ``expected`` maps normalized path -> metadata document.
    """
    docs = list(col.find(
        {"studyCode": study_code},
        {"record_id": 1, "site": 1, "fileName": 1, "pdfSize": 1, "pdfSha256": 1},
    ))

    # Group by (center folder, file name) so that name collisions show up as
    # groups with more than one member. Grouping uses the sanitized values,
    # which also catches names that only become equal after sanitizing.
    groups = {}
    for d in docs:
        # "_" stands in for a missing site and for one that sanitizes to "".
        center = safe(d.get("site") or "_") or "_"
        fname = safe(d.get("fileName") or (d["record_id"] + ".pdf")) or "_.pdf"
        groups.setdefault((center, fname), []).append(d)

    expected = {}  # normalized path -> doc
    for (center, fname), members in groups.items():
        if len(members) == 1:
            names = [fname]
        else:
            stem, ext = (fname[:-4], fname[-4:]) if fname.lower().endswith(".pdf") else (fname, "")
            tags = [safe(d["record_id"])[:8] for d in members]
            if len(set(tags)) < len(tags):
                # Short tags clash: fall back to the complete record_id.
                tags = [safe(d["record_id"]) for d in members]
            names = [f"{stem} [{tag}]{ext}" for tag in tags]
        for d, name in zip(members, names):
            path = os.path.join(base, center, name)
            expected[os.path.normpath(path)] = d
    return docs, expected
|
||||
|
||||
|
||||
def existing_pdfs(base: str):
    """Return the set of normalized paths of all PDF files below ``base``.

    The directory tree is walked recursively; a file counts as a PDF when its
    name ends with ``.pdf`` in any letter case.
    """
    return {
        os.path.normpath(os.path.join(folder, entry))
        for folder, _subdirs, entries in os.walk(base)
        for entry in entries
        if entry.lower().endswith(".pdf")
    }
|
||||
|
||||
|
||||
def needs_write(path: str, doc, verify: bool) -> bool:
    """Decide whether the file at ``path`` has to be (re)written from ``doc``.

    A write is needed when the file does not exist, when its size differs
    from ``doc["pdfSize"]`` (a document without that field never matches), or,
    with ``verify`` set, when its SHA-256 differs from ``doc["pdfSha256"]``.
    The hash is computed only after the size check has passed.
    """
    if not os.path.exists(path):
        return True
    size_matches = os.path.getsize(path) == doc.get("pdfSize", -1)
    if not size_matches:
        return True
    return bool(verify and sha256_file(path) != doc.get("pdfSha256"))
|
||||
|
||||
|
||||
def process_study(col, study_code: str, base: str):
    """Bring the directory ``base`` in line with the Mongo data of one study.

    Steps:
      1. Skip the study when ``base`` is not an existing directory.
      2. Plan the expected files with plan_study().
      3. Safety stop: when Mongo returned no documents, skip the study unless
         ``ARGS.allow_empty`` is set, so the directory is not emptied.
      4. Work out which expected files need writing (needs_write(), honouring
         ``ARGS.verify``) and which PDFs on disk are not expected.
      5. With ``ARGS.dry_run`` only log a preview (at most 10 entries per
         list) and return.
      6. Otherwise fetch the PDF payload for each file to write, write it,
         delete the PDFs that are not expected and remove empty subfolders.

    Args:
        col: collection object providing ``find`` / ``find_one``.
        study_code: study identifier passed to plan_study().
        base: target directory of the study.
    """
    log(f"\n=== {study_code} -> {base} ===")
    if not os.path.isdir(base):
        log(f" CHYBA: cilovy adresar neexistuje: {base} — PRESKAKUJI.")
        return
    docs, expected = plan_study(col, study_code, base)
    log(f" Mongo: {len(docs)} dokumentu, ocekavanych souboru: {len(expected)}.")

    if not docs and not ARGS.allow_empty:
        log(f" POJISTKA: Mongo vratilo 0 dokumentu -> studie PRESKOCENA (nemazu nic). "
            f"Pouzij --allow-empty kdyz je to zamer.")
        return

    have = existing_pdfs(base)
    to_write = [(p, d) for p, d in expected.items() if needs_write(p, d, ARGS.verify)]
    # Compare paths through os.path.normcase: on a case-insensitive file
    # system an expected file whose on-disk name differs only in letter case
    # passes the existence check in needs_write(); a plain string comparison
    # would then list that very file for deletion.
    expected_keys = {os.path.normcase(p) for p in expected}
    to_delete = sorted(p for p in have if os.path.normcase(p) not in expected_keys)

    skip = len(expected) - len(to_write)
    log(f" PLAN: +{len(to_write)} zapsat, -{len(to_delete)} smazat, ={skip} beze zmeny.")

    if ARGS.dry_run:
        for p, _ in to_write[:10]:
            log(f" [DRY +] {os.path.relpath(p, base)}")
        if len(to_write) > 10:
            log(f" [DRY +] ... a dalsich {len(to_write)-10}")
        for p in to_delete[:10]:
            log(f" [DRY -] {os.path.relpath(p, base)}")
        if len(to_delete) > 10:
            log(f" [DRY -] ... a dalsich {len(to_delete)-10}")
        return

    # Write the missing files; the PDF payload is fetched only now and only
    # for the files that actually need writing.
    written = 0
    missing_payload = 0
    for p, d in to_write:
        full = col.find_one({"record_id": d["record_id"]}, {"pdf": 1})
        payload = full.get("pdf") if full else None
        if payload is None:
            # The document vanished since planning or carries no PDF: report
            # it and carry on with the remaining files.
            missing_payload += 1
            log(f" CHYBA: v Mongu chybi PDF pro record_id={d['record_id']} — PRESKAKUJI {p}")
            continue
        os.makedirs(os.path.dirname(p), exist_ok=True)
        with open(p, "wb") as f:
            f.write(bytes(payload))
        written += 1
        if written % 50 == 0:
            log(f" ... zapsano {written}/{len(to_write)}")

    # Delete the PDFs that are not expected.
    deleted = 0
    for p in to_delete:
        try:
            os.remove(p)
        except OSError as e:
            log(f" CHYBA mazani {p}: {e!r}")
        else:
            deleted += 1

    # Remove empty subfolders below base. The walk is bottom-up so nested
    # empty folders disappear in one pass; os.listdir() is asked for the
    # current content because the lists yielded by os.walk() are not updated
    # after the deletions above.
    removed_dirs = 0
    for root, _dirs, _files in os.walk(base, topdown=False):
        if root == base:
            continue
        if not os.listdir(root):
            try:
                os.rmdir(root)
            except OSError as e:
                # Best effort: a folder that cannot be removed is only reported.
                log(f" CHYBA mazani slozky {root}: {e!r}")
            else:
                removed_dirs += 1

    log(f" HOTOVO: zapsano {written}, smazano {deleted}, smazano prazdnych slozek {removed_dirs}.")
    if missing_payload:
        log(f" POZOR: {missing_payload} souboru nezapsano (v Mongu chybi PDF).")
|
||||
|
||||
|
||||
def main():
    """Connect to Mongo and mirror every configured study (or just ``--study``).

    The studies come from STUDY_PATHS; when ``ARGS.study`` is given, only that
    studyCode is processed, and an unknown code is reported together with the
    list of available ones. The client is closed on every exit path,
    including the early return for an unknown study and an exception raised
    while a study is being processed.
    """
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        # Issue a command up front so that an unreachable server is reported
        # here rather than in the middle of the first study.
        client.admin.command("ping")
        col = client[DB_NAME][COLLECTION]

        items = list(STUDY_PATHS.items())
        if ARGS.study:
            items = [(k, v) for k, v in items if k == ARGS.study]
            if not items:
                log(f"Neznamy studyCode: {ARGS.study}. Dostupne: {list(STUDY_PATHS)}")
                return

        log(f"Materializace {'(DRY-RUN) ' if ARGS.dry_run else ''}-> Dropbox, {len(items)} studii"
            f"{', --verify' if ARGS.verify else ''}.")
        for study_code, base in items:
            process_study(col, study_code, base)
    finally:
        client.close()
    log("\nKONEC.")
|
||||
|
||||
|
||||
# Start the mirror only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user