# =============================================================================
# Name:        download_lab_reports_v1.4.py
# Version:     1.4
# Date:        2026-06-16
# Description: Downloads Lab Report PDFs from xsp.labcorp.com into MongoDB
#              (db covance, collection labreports) — metadata from the table
#              plus the actual PDF (inline Binary). Does NOT save to disk.
#
# MULTIPLE STUDIES: the STUDIES list at the top — each item has a "link"
#   (lab-reports URL with a site filter) and "enabled". The script processes
#   all enabled items. From the link it derives the internal study number
#   (/study/{N}/) and the set of sites (site=[...]); the "human" study code
#   (e.g. 77242113UCO3001) is read from the portal header. MongoDB receives
#   study (internal) + studyCode (from the header).
#
# SYNC_MODE: "delta" (only new reports by reportId, stop-at-known)
#   or "fullsync" (walk everything). CLI overrides: --delta/--fullsync.
#
# KEY: record_id = reportId (stable 32-hex document ID, globally unique).
#   One collection for all studies; delta filters per study.
#
# SITE SAFEGUARD: only rows whose siteId is in the set of sites taken from
#   the link are processed (study/country agnostic).
#
# Changes in v1.4: STUDIES list (link+enabled) instead of fixed STUDY/SITES;
#   studyCode from the header; site safeguard via siteId from the link.
# =============================================================================
import argparse
import hashlib
import json
import os
import re
import sys
import traceback
import urllib.parse
from datetime import datetime

from bson.binary import Binary
from playwright.sync_api import sync_playwright
from pymongo import MongoClient, ASCENDING

# ============================================================================
# STUDIES TO DOWNLOAD — each has a link (lab-reports URL with a site filter)
# and a status. The script downloads EVERYTHING that has enabled=True.
# Adding another study = adding another entry.
# ============================================================================
STUDIES = [
    {
        "enabled": True,
        "note": "77242113UCO3001 (UC)",
        "link": "https://xsp.labcorp.com/sponsor/study/36940/lab-reports?site=%5B%22930539%22,%22930547%22,%22930555%22,%22930556%22,%22930553%22,%22930549%22,%22930525%22,%22930536%22,%22930557%22,%22930531%22%5D",
    },
    {
        "enabled": True,
        "note": "42847922MDD3003 (MDD)",
        "link": "https://xsp.labcorp.com/sponsor/study/35472/lab-reports?site=%5B%22898727%22,%22898733%22,%22898739%22,%22898745%22,%22898744%22%5D",
    },
]

# ============================================================================
# SYNCHRONISATION MODE — CLI --delta / --fullsync takes precedence
# ============================================================================
SYNC_MODE = "delta"   # "delta"    = only new reports (stop-at-known by reportId)
                      # "fullsync" = walk everything, fill in missing/changed
HEADLESS = True       # True  = Playwright runs without a window (headless),
                      # False = visible browser window
# ============================================================================


def _non_negative_int(value):
    """argparse type: parse *value* as an int and reject negative numbers.

    A negative --limit would otherwise be used as a slice bound and silently
    drop rows from the END of the work list instead of limiting it.
    """
    number = int(value)  # ValueError is reported by argparse as a usage error
    if number < 0:
        raise argparse.ArgumentTypeError(f"must be >= 0, got {number}")
    return number


parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) do MongoDB.")
parser.add_argument("--delta", action="store_true", help="vynutit rezim delta")
parser.add_argument("--fullsync", action="store_true", help="vynutit rezim fullsync")
parser.add_argument("--dry-run", action="store_true",
                    help="nestahovat ani nepsat do DB; jen vypsat nove")
parser.add_argument("--limit", type=_non_negative_int, default=0,
                    help="max N novych radku NA STUDII (0 = vse)")
# Parse the real command line only when executed as a script; when the module
# is merely imported, fall back to defaults instead of consuming (and possibly
# rejecting) the importing process's own argv.
ARGS = parser.parse_args() if __name__ == "__main__" else parser.parse_args([])

# Effective mode: SYNC_MODE unless overridden on the CLI. If both flags are
# given, --delta wins because it is applied last.
_mode = SYNC_MODE
if ARGS.fullsync:
    _mode = "fullsync"
if ARGS.delta:
    _mode = "delta"
FULLSYNC = (_mode == "fullsync")


def log(msg):
    """Print *msg* prefixed with the current wall-clock time, flushing immediately."""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}", flush=True)


# --- configuration ----------------------------------------------------------
# SECURITY NOTE(review): the literals below are credentials committed in plain
# text. They are kept as defaults so existing runs behave the same, but they
# can (and should) be supplied through the environment instead, after which
# the literals can be removed and the password rotated.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
LOGIN_URL = "https://xsp.labcorp.com/"
MONGO_URI = os.environ.get("XSP_MONGO_URI", "mongodb://192.168.1.76:27017")
DB_NAME = "covance"
COLLECTION = "labreports"

_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile")


def parse_link(link: str):
    """Extract the internal study number and the site list from a lab-reports link.

    Returns:
        ``(study, sites)`` where ``study`` is the digits following ``/study/``
        (or ``None`` when absent) and ``sites`` is a list of site-id strings
        taken from the ``site`` query parameter (empty when absent).
    """
    m = re.search(r"/study/(\d+)/", link)
    study = m.group(1) if m else None

    query = urllib.parse.urlparse(link).query
    raw = urllib.parse.parse_qs(query).get("site", [None])[0]
    if not raw:
        return study, []

    try:
        parsed = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError
        parsed = None

    if isinstance(parsed, list):
        sites = [str(s) for s in parsed]
    else:
        # Not a JSON array (bare number, quoted string, malformed text):
        # fall back to every run of digits. Iterating a JSON *string* here
        # would wrongly yield one "site" per character.
        sites = re.findall(r"\d+", raw)
    return study, sites


# --- formatting -------------------------------------------------------------
_MONTHS = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05", "Jun": "06",
           "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12"}

# "<Month name> <day>, <year>" at the start of the string; only the first three
# letters of the month are captured, anything after the year is ignored.
_DATE_RE = re.compile(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s+(\d{4})")


def safe(s: str) -> str:
    r"""Remove the characters ``\ / : * ? " < > |`` from *s* and strip whitespace.

    ``None`` (or any falsy value) is treated as an empty string.
    """
    return re.sub(r'[\\/:*?"<>|]', "", s or "").strip()


def fmt_date(s: str) -> str:
    """Convert a date like ``"Jan 5, 2024"`` to ``"2024-01-05"``.

    The month is matched case-insensitively by its first three letters.
    Anything that does not match is returned through :func:`safe` unchanged
    apart from the removal of unsafe characters.
    """
    m = _DATE_RE.match(s or "")
    if m:
        month = _MONTHS.get(m.group(1).capitalize())
        if month:
            return f"{m.group(3)}-{month}-{int(m.group(2)):02d}"
    return safe(s)


def build_basename(study_code: str, meta: dict) -> str:
    """Build the PDF file name for one report.

    *meta* must provide the keys ``collected``, ``site``, ``subject``,
    ``visit``, ``accession`` and ``postedDisplay``. The two dates are
    normalised with :func:`fmt_date`; the result is sanitised with
    :func:`safe` and suffixed with ``.pdf``.
    """
    return safe(
        f"{study_code} {fmt_date(meta['collected'])} {meta['site']} "
        f"{meta['subject']} {meta['visit']} {meta['accession']} "
        f"posted {fmt_date(meta['postedDisplay'])}"
    ) + ".pdf"


# --- JS helpers (AG Grid) ---------------------------------------------------
# Returns one object per displayed grid row (read from the grid API found via
# the element's __agComponent), or null when no grid API can be located.
JS_ALL_ROWS = r"""() => {
  let holder = null;
  for (const el of document.querySelectorAll('.ag-root-wrapper, .ag-root, ag-grid-angular')) {
    if (el.__agComponent) { holder = el.__agComponent; break; }
  }
  if (!holder) return null;
  const api = (holder.gridApi && holder.gridApi.forEachNode) ? holder.gridApi : holder;
  if (!api || !api.getDisplayedRowCount) return null;
  const toIso = v => {
    if (v == null) return null;
    if (typeof v === 'string') return v;
    if (v._i && typeof v._i === 'string') return v._i;
    if (typeof v.toISOString === 'function') { try { return v.toISOString(); } catch (e) {} }
    return String(v);
  };
  const cnt = api.getDisplayedRowCount();
  const out = [];
  for (let i = 0; i < cnt; i++) {
    const n = api.getDisplayedRowAtIndex(i);
    if (!n || !n.data) continue;
    const d = n.data;
    const fl = (d.fileLinks || []).find(f => f.language === 'English') || (d.fileLinks || [])[0] || {};
    out.push({
      rowIndex: i,
      reportId: d.reportId,
      fileId: fl.fileId,
      serverFileName: fl.fileName,
      postedIso: toIso(d.postedDateTime),
      site: d.siteNum,
      siteId: String(d.siteId),
      subject: d.subjectNumber,
    });
  }
  return out;
}"""

# Reads the rendered cell texts of the row with the given row-index; returns
# null when that row is not currently in the DOM. `dedup` collapses whitespace
# and undoes a text that appears twice in a row ("X X" / "XX" -> "X").
JS_CELLS = r"""(idx) => {
  const dedup = s => {
    s = (s || '').replace(/\s+/g, ' ').trim();
    const h = s.slice(0, Math.floor(s.length / 2));
    if (s === h + h) return h;
    const m = s.match(/^(.*?)\s+\1$/);
    if (m) return m[1];
    return s;
  };
  const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
  if (!row) return null;
  const get = id => {
    const c = row.querySelector('[col-id="' + id + '"]');
    return c ? dedup(c.textContent) : '';
  };
  return {
    type: get('type'),
    accession: get('accessionNumber'),
    visit: get('visit'),
    collected: get('visitCollectionDate'),
    postedDisplay: get('postedDateTime'),
  };
}"""

# Scrolls the grid viewport so that row `idx` (of height `rh`) is roughly
# vertically centred.
JS_SCROLL_TO = r"""(args) => {
  const [idx, rh] = args;
  const vp = document.querySelector('.ag-body-viewport');
  if (!vp) return;
  vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
}"""

# Height in px of the first rendered row; 25 when no row exists or the
# measured height is 0.
JS_ROW_HEIGHT = r"""() => {
  const r = document.querySelector('.ag-body-container .ag-row');
  return r ? r.getBoundingClientRect().height || 25 : 25;
}"""

# Study code from the portal header (e.g. "77242113UCO3001"): digits+letters+digits.
JS_STUDY_CODE = r"""() => {
  const txt = (document.body.innerText || '');
  const m = txt.match(/\b\d{7,}[A-Z]{2,4}\d{3,}\b/);
  return m ? m[0] : null;
}"""


# --- login ------------------------------------------------------------------
def login(page):
    """Log in to the portal through the two-step (email, then password) form.

    If the Email field does not become visible within 12 s, the function
    assumes the persisted browser profile still holds a session and returns
    without logging in.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)
    try:
        page.get_by_label("Email").wait_for(state="visible", timeout=12000)
    except Exception:
        # Deliberate best-effort: no login form is treated as "already logged in".
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return
    log("LOGIN: zadavam email...")
    page.get_by_label("Email").fill(EMAIL)
    page.get_by_role("button", name="Next").click()
    log("LOGIN: cekam na pole pro heslo...")
    page.get_by_label("Password").wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    page.get_by_label("Password").fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()
    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        page.wait_for_url(lambda url: "code=" not in url or "xsp." in url, timeout=60000)
    except Exception:
        # Best-effort as well: carry on and let the grid navigation fail if
        # the login did not actually complete.
        log("LOGIN: wait_for_url vyprsel, pokracuji.")
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")


def open_grid(page, link):
    """Open the lab-reports grid at *link* and wait until its row count settles.

    The grid is polled every 2 s (at most 25 times) until two consecutive
    polls report the same non-zero number of rows.

    Returns:
        ``(rows, row_height, study_code)`` — the row dicts produced by
        ``JS_ALL_ROWS`` (empty list when none), the rendered row height in px
        and the study code found in the page text (or ``None``).
    """
    log("GRID: navigace na Lab Reports...")
    page.goto(link)
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)

    prev = -1
    rows = None
    for i in range(25):
        rows = page.evaluate(JS_ALL_ROWS)
        cnt = len(rows) if rows else 0
        log(f" ...kontrola #{i+1}: rows={cnt}")
        if rows and cnt == prev and cnt > 0:
            break
        prev = cnt
        page.wait_for_timeout(2000)
    else:
        # Never stabilised (or the grid API was not found): continue with
        # whatever the last poll returned, but make that visible in the log.
        log("POZOR: pocet radku v gridu se neustalil, pokracuji s poslednim stavem.")

    row_height = page.evaluate(JS_ROW_HEIGHT)
    study_code = page.evaluate(JS_STUDY_CODE)
    log(f"GRID: nacteno {len(rows) if rows else 0} radku, rowHeight={row_height}px, studyCode={study_code}.")
    return rows or [], row_height, study_code


def download_pdf_bytes(page, idx):
    """Click the "English" download link of grid row *idx* and return the PDF bytes.

    The browser writes the download to a temporary file; it is read into
    memory and then deleted so nothing is left on disk.
    """
    link = page.locator(
        f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link',
        has_text="English",
    ).first
    with page.expect_download(timeout=60000) as dl:
        link.click()
    download = dl.value
    try:
        with open(download.path(), "rb") as f:
            return f.read()
    finally:
        try:
            download.delete()
        except Exception as e:
            # Cleanup only; the bytes (or the original error) matter more.
            log(f"POZOR: docasny soubor stazeni se nepodarilo smazat: {e!r}")


def _row_meta(rec, cells):
    """Combine grid-API data (*rec*) and rendered cell texts (*cells*) for file naming."""
    return {
        "site": rec["site"],
        "subject": rec["subject"],
        "accession": cells["accession"],
        "visit": cells["visit"],
        "collected": cells["collected"],
        "postedDisplay": cells["postedDisplay"],
    }


def upsert(col, study, study_code, rec, cells, data, now):
    """Insert or update one report document keyed by ``record_id`` = reportId.

    Returns:
        ``"insert"`` when the document was new; ``"update"`` when the PDF hash
        or the displayed fields changed (the previous state is appended to
        ``history``); ``"same"`` when only ``lastSeen`` was refreshed.
    """
    fields = {
        "Type": cells["type"],
        "Subject": rec["subject"],
        "Accession": cells["accession"],
        "Visit": cells["visit"],
        "Collected Date": cells["collected"],
        "Site Number": rec["site"],
        "Posted": cells["postedDisplay"],
    }
    sha = hashlib.sha256(data).hexdigest()
    derived = {
        "study": study,
        "studyCode": study_code,
        "type": cells["type"] or "Lab Result",
        "site": rec["site"],
        "subject": rec["subject"],
        "accession": cells["accession"],
        "visit": cells["visit"],
        "collected": fmt_date(cells["collected"]),
        "posted": cells["postedDisplay"],
        "postedIso": rec["postedIso"],
        "fileId": rec["fileId"],
        "serverFileName": rec["serverFileName"],
        "fields": fields,
        "fileName": build_basename(study_code, _row_meta(rec, cells)),
        "pdf": Binary(data),
        "pdfSize": len(data),
        "pdfSha256": sha,
    }
    rid = rec["reportId"]
    existing = col.find_one({"record_id": rid}, {"pdfSha256": 1, "fields": 1, "lastSeen": 1})
    if existing is None:
        col.insert_one({"record_id": rid, **derived,
                        "firstSeen": now, "lastSeen": now, "history": []})
        return "insert"
    if existing.get("pdfSha256") != sha or existing.get("fields") != fields:
        col.update_one(
            {"_id": existing["_id"]},
            {"$push": {"history": {"date": existing.get("lastSeen"),
                                   "fields": existing.get("fields"),
                                   "pdfSha256": existing.get("pdfSha256")}},
             "$set": {**derived, "lastSeen": now}},
        )
        return "update"
    col.update_one({"_id": existing["_id"]}, {"$set": {"lastSeen": now}})
    return "same"


def _load_existing_ids(col, study):
    """Return the set of ``record_id`` values already stored for *study*.

    When *col* is ``None`` (dry-run) a short-lived client is opened for the
    read and closed again, so repeated calls do not leak connections.
    """
    temp_client = None
    if col is None:
        temp_client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        col = temp_client[DB_NAME][COLLECTION]
    try:
        return {d["record_id"] for d in col.find({"study": study}, {"record_id": 1})}
    finally:
        if temp_client is not None:
            temp_client.close()


def process_study(page, col, study_item, now):
    """Download the reports of one STUDIES entry that are not yet in MongoDB.

    Args:
        page: logged-in Playwright page.
        col: target collection, or ``None`` in dry-run.
        study_item: one entry of ``STUDIES`` (needs ``"link"``; ``"note"`` optional).
        now: timestamp string stored as ``firstSeen`` / ``lastSeen``.

    Returns:
        Counters ``{"new", "upd", "same", "fail"}`` for this study.
    """
    study, sites = parse_link(study_item["link"])
    site_set = set(sites)
    log(f"\n=== STUDIE {study_item.get('note','')} (interni {study}, {len(sites)} center) ===")
    if not study or not sites:
        log(f"PRESKAKUJI: z linku nelze odvodit study/sites: {study_item['link']}")
        return {"new": 0, "upd": 0, "same": 0, "fail": 0}

    # Known reportIds of this study (also in dry-run, to tell what is really new).
    existing_ids = _load_existing_ids(col, study)
    log(f"START: v Mongo je {len(existing_ids)} reportu pro studii {study}.")

    rows, row_height, study_code = open_grid(page, study_item["link"])
    if not study_code:
        study_code = f"study-{study}"
        log(f"POZOR: studyCode z hlavicky nenalezen -> pouzivam '{study_code}'.")

    # SAFEGUARD: only sites from the link (matched by internal siteId).
    bad = [r for r in rows if r["siteId"] not in site_set]
    if bad:
        log(f"POZOR: {len(bad)} radku mimo pozadovana centra (napr. siteId {bad[0]['siteId']}) "
            f"-> filtruji. Zkontroluj link!")
        rows = [r for r in rows if r["siteId"] in site_set]
    log(f"GRID: po pojistce {len(rows)} radku v pozadovanych centrech.")

    # A row without reportId cannot be keyed; storing it would put unrelated
    # reports under the same null record_id.
    keyless = [r for r in rows if not r.get("reportId")]
    if keyless:
        log(f"POZOR: {len(keyless)} radku bez reportId -> preskakuji.")
        rows = [r for r in rows if r.get("reportId")]

    # Select work according to the mode.
    # NOTE(review): stop-at-known presumes the grid lists newest reports first
    # — confirm the portal's default sort order.
    # NOTE(review): in fullsync, known reportIds are skipped (not re-downloaded),
    # so a changed PDF of a known report is not detected here — confirm intent.
    todo = []
    for rec in rows:
        if rec["reportId"] in existing_ids:
            if FULLSYNC:
                continue
            log(f"DELTA stop-at-known: rowIndex {rec['rowIndex']} ({rec['reportId'][:12]}…) "
                f"uz v Mongo -> koncim (zbytek je starsi).")
            break
        todo.append(rec)
    if ARGS.limit:
        todo = todo[:ARGS.limit]
    log(f"PLAN [{'FULLSYNC' if FULLSYNC else 'DELTA'}]: {len(todo)} novych radku "
        f"(z {len(rows)} v gridu, {len(existing_ids)} uz v Mongo).")

    new_cnt = upd_cnt = same_cnt = 0
    failed = []
    for k, rec in enumerate(todo, 1):
        idx = rec["rowIndex"]
        try:
            # The grid only renders rows near the viewport, so scroll the row
            # into view before reading its cells or clicking its link.
            page.evaluate(JS_SCROLL_TO, [idx, row_height])
            page.wait_for_selector(f'.ag-body-container .ag-row[row-index="{idx}"]', timeout=15000)
            page.wait_for_timeout(120)
            cells = page.evaluate(JS_CELLS, idx)
            if not cells:
                raise RuntimeError("nepodarilo se precist bunky radku")
            fname = build_basename(study_code, _row_meta(rec, cells))

            if ARGS.dry_run:
                log(f">>> {k}/{len(todo)} [DRY] NOVY rowIdx {idx} ({rec['reportId'][:12]}…): {fname}")
                new_cnt += 1
                continue

            data = download_pdf_bytes(page, idx)
            action = upsert(col, study, study_code, rec, cells, data, now)
            existing_ids.add(rec["reportId"])
            if action == "insert":
                new_cnt += 1
                log(f">>> {k}/{len(todo)} INSERT rowIdx {idx} ({len(data)//1024} KB): {fname}")
            elif action == "update":
                upd_cnt += 1
                log(f">>> {k}/{len(todo)} UPDATE rowIdx {idx}: {fname}")
            else:
                same_cnt += 1
                log(f">>> {k}/{len(todo)} SAME rowIdx {idx}: {fname}")
        except Exception as e:
            # One bad row must not abort the study; record it and go on.
            failed.append(idx)
            log(f"CHYBA rowIdx {idx} ({rec['reportId'][:12]}…): {e!r} — pokracuji.")

    log(f"STUDIE {study} HOTOVO: nove={new_cnt}, update={upd_cnt}, beze zmeny={same_cnt}, chyby={len(failed)}.")
    if failed:
        log(f"STUDIE {study}: SELHALY rowIndexy: {failed}")
    return {"new": new_cnt, "upd": upd_cnt, "same": same_cnt, "fail": len(failed)}


def main():
    """Run the download for every enabled study and log the grand totals.

    The MongoDB client and the browser context are always closed, also when
    login or a study raises.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    enabled = [s for s in STUDIES if s.get("enabled")]
    log(f"START: rezim={'FULLSYNC' if FULLSYNC else 'DELTA'}"
        f"{', DRY-RUN' if ARGS.dry_run else ''}"
        f"{f', limit {ARGS.limit}/studie' if ARGS.limit else ''}; "
        f"{len(enabled)}/{len(STUDIES)} studii enabled.")

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")  # fail fast when MongoDB is unreachable
        col = None  # stays None in dry-run so nothing can be written
        if not ARGS.dry_run:
            col = client[DB_NAME][COLLECTION]
            col.create_index([("record_id", ASCENDING)], unique=True)
            for f in ("study", "site", "subject", "accession", "postedIso", "fileId"):
                col.create_index([(f, ASCENDING)])

        with sync_playwright() as p:
            context = p.chromium.launch_persistent_context(
                user_data_dir=PROFILE_DIR,
                headless=HEADLESS,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--start-maximized",
                    "--disable-restore-session-state",
                    "--disable-session-crashed-bubble",
                ],
                no_viewport=True,
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            try:
                context.add_init_script(
                    "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()
                log("START: prohlizec spusten.")

                login(page)

                totals = {"new": 0, "upd": 0, "same": 0, "fail": 0}
                failed_studies = []
                for study_item in enabled:
                    try:
                        r = process_study(page, col, study_item, now)
                    except Exception as e:
                        failed_studies.append(study_item.get("note", ""))
                        log(f"CHYBA studie {study_item.get('note','')}: {e!r} — pokracuji dalsi.")
                        continue
                    for kk in totals:
                        totals[kk] += r[kk]

                log(f"\nKONEC (vse): nove={totals['new']}, update={totals['upd']}, "
                    f"beze zmeny={totals['same']}, chyby={totals['fail']}.")
                if failed_studies:
                    # A study that raised contributes nothing to the totals
                    # above, so report it separately.
                    log(f"KONEC: SELHALY cele studie: {failed_studies}")
            finally:
                context.close()
    finally:
        client.close()


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        log(f"FATAL: beh spadl: {e!r}")
        traceback.print_exc()
        # Non-zero status so schedulers/wrappers can detect the failed run.
        sys.exit(1)