# =============================================================================
# Name:        download_lab_reports_v1.3.py
# Version:     1.3
# Date:        2026-06-16
# Description: Downloads PDF Lab Reports from xsp.labcorp.com for study
#              77242113UCO3001 (internal number 36940), filtered to 10 Czech
#              sites (CZ), and stores them DIRECTLY in MongoDB (db covance,
#              collection labreports) — metadata from the table + the actual
#              PDF (inline Binary). Nothing is saved to disk.
#
# SYNCHRONIZATION MODE: the SYNC_MODE variable at the top of the script.
#   "delta"    = only NEW reports, via the internal reportId (stop-at-known).
#                The list is sorted Posted DESC; scanning from the top, the
#                first reportId that is already stored is located -> everything
#                below it is older and already in Mongo.
#   "fullsync" = walk ALL rows and fill in missing / changed ones
#                (reconciliation). Slower, downloads everything missing.
#   The CLI overrides the variable: --delta / --fullsync.
#
# KEY: record_id = reportId (from the AG Grid data) — a stable 32-hex document
#      ID, UNIQUE across all rows and persistent over time (verified: the same
#      reportId is also returned by a different grid for the same site).
#      Handles the case of different PDFs with IDENTICAL visible metadata.
#
# Changes in v1.3: + SYNC_MODE variable (delta/fullsync); fix of postedIso
#      (previously the whole moment.js object was stored -> now a clean ISO).
# =============================================================================

import argparse
import hashlib
import json
import os
import re
import traceback
import urllib.parse
from datetime import datetime

from bson.binary import Binary
from playwright.sync_api import sync_playwright
from pymongo import ASCENDING, MongoClient

# ============================================================================
#  SYNCHRONIZATION MODE — set it here (CLI --delta / --fullsync takes priority)
# ============================================================================
SYNC_MODE = "delta"   # "delta"    = only new rows (stop-at-known via reportId)
                      # "fullsync" = walk everything, fill in missing/changed
# ============================================================================

parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) do MongoDB.")
parser.add_argument("--delta", action="store_true", help="vynutit rezim delta")
parser.add_argument("--fullsync", action="store_true", help="vynutit rezim fullsync")
parser.add_argument("--dry-run", action="store_true",
                    help="nestahovat ani nepsat do DB; jen vypsat nove")
parser.add_argument("--limit", type=int, default=0, help="max N novych radku (0 = vse)")
ARGS = parser.parse_args()

# Mode resolution: CLI flag beats the SYNC_MODE variable. When both flags are
# given, --delta wins because it is evaluated last.
_mode = SYNC_MODE
if ARGS.fullsync:
    _mode = "fullsync"
if ARGS.delta:
    _mode = "delta"
FULLSYNC = (_mode == "fullsync")


def log(msg):
    """Print *msg* prefixed with the current wall-clock time (HH:MM:SS), flushed."""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}", flush=True)


# --- configuration ----------------------------------------------------------
# NOTE(review): credentials were hard-coded in this file. They can now be
# supplied through the XSP_EMAIL / XSP_PASSWORD environment variables; the
# literals remain only as a backward-compatible fallback and should be rotated
# and removed from source control.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
LOGIN_URL = "https://xsp.labcorp.com/"
STUDY = "36940"
STUDY_CODE = "77242113UCO3001"

MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "covance"
COLLECTION = "labreports"

SITES = [
    "930539", "930547", "930555", "930556", "930553",
    "930549", "930525", "930536", "930557", "930531",
]

_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile")


def lab_reports_url():
    """Return the Lab Reports URL for STUDY with SITES passed as the `site` query value.

    The site list is serialized as compact JSON and percent-encoded.
    """
    site_param = json.dumps(SITES, separators=(",", ":"))
    return (f"https://xsp.labcorp.com/sponsor/study/{STUDY}/lab-reports"
            f"?site={urllib.parse.quote(site_param)}")


# --- formatting --------------------------------------------------------------
_MONTHS = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05", "Jun": "06",
           "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12"}


def safe(s: str) -> str:
    """Strip the characters ``\\ / : * ? " < > |`` from *s* and trim whitespace.

    ``None`` or an empty value yields an empty string.
    """
    return re.sub(r'[\\/:*?"<>|]', "", s or "").strip()


def fmt_date(s: str) -> str:
    """Convert 'Jun 10, 2026' or 'Jun 15, 2026 7:49 PM' to 'YYYY-MM-DD'.

    Input that does not match the pattern (or whose month abbreviation is not
    in ``_MONTHS``) is returned through :func:`safe` instead.
    """
    m = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s+(\d{4})", s or "")
    if m:
        month = _MONTHS.get(m.group(1))  # group 1 is exactly three letters
        if month is not None:
            return f"{m.group(3)}-{month}-{int(m.group(2)):02d}"
    return safe(s)


def build_basename(meta: dict) -> str:
    """Build the PDF file name from row metadata.

    *meta* must provide the keys ``collected``, ``site``, ``subject``,
    ``visit``, ``accession`` and ``postedDisplay``; extra keys are ignored.
    """
    return safe(
        f"{STUDY_CODE} {fmt_date(meta['collected'])} {meta['site']} "
        f"{meta['subject']} {meta['visit']} {meta['accession']} "
        f"posted {fmt_date(meta['postedDisplay'])}"
    ) + ".pdf"


# --- JS helpers (AG Grid) ---------------------------------------------------
# List of all displayed rows. postedDateTime is a moment.js object in the grid
# data, so it is converted to a plain ISO string (otherwise the whole moment
# object would be serialized).
JS_ALL_ROWS = r"""() => {
  let holder = null;
  for (const el of document.querySelectorAll('.ag-root-wrapper, .ag-root, ag-grid-angular')) {
    if (el.__agComponent) { holder = el.__agComponent; break; }
  }
  if (!holder) return null;
  const api = (holder.gridApi && holder.gridApi.forEachNode) ? holder.gridApi : holder;
  if (!api || !api.getDisplayedRowCount) return null;
  const toIso = v => {
    if (v == null) return null;
    if (typeof v === 'string') return v;
    if (v._i && typeof v._i === 'string') return v._i; // puvodni serverove ISO s offsetem
    if (typeof v.toISOString === 'function') { try { return v.toISOString(); } catch (e) {} }
    return String(v);
  };
  const cnt = api.getDisplayedRowCount();
  const out = [];
  for (let i = 0; i < cnt; i++) {
    const n = api.getDisplayedRowAtIndex(i);
    if (!n || !n.data) continue;
    const d = n.data;
    const fl = (d.fileLinks || []).find(f => f.language === 'English') || (d.fileLinks || [])[0] || {};
    out.push({
      rowIndex: i,
      reportId: d.reportId,
      fileId: fl.fileId,
      serverFileName: fl.fileName,
      postedIso: toIso(d.postedDateTime),
      site: d.siteNum,
      subject: d.subjectNumber,
    });
  }
  return out;
}"""

# Visible cell texts of one rendered row; `dedup` collapses text that the DOM
# reports twice in a row ("X X" -> "X").
JS_CELLS = r"""(idx) => {
  const dedup = s => {
    s = (s || '').replace(/\s+/g, ' ').trim();
    const h = s.slice(0, Math.floor(s.length / 2));
    if (s === h + h) return h;
    const m = s.match(/^(.*?)\s+\1$/);
    if (m) return m[1];
    return s;
  };
  const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
  if (!row) return null;
  const get = id => {
    const c = row.querySelector('[col-id="' + id + '"]');
    return c ? dedup(c.textContent) : '';
  };
  return {
    type: get('type'),
    accession: get('accessionNumber'),
    visit: get('visit'),
    collected: get('visitCollectionDate'),
    postedDisplay: get('postedDateTime'),
  };
}"""

# Scroll the grid viewport so that row `idx` sits roughly in the middle.
JS_SCROLL_TO = r"""(args) => {
  const [idx, rh] = args;
  const vp = document.querySelector('.ag-body-viewport');
  if (!vp) return;
  vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
}"""

# Height of a rendered row in pixels; falls back to 25 when unavailable.
JS_ROW_HEIGHT = r"""() => {
  const r = document.querySelector('.ag-body-container .ag-row');
  return r ? r.getBoundingClientRect().height || 25 : 25;
}"""


# --- login ------------------------------------------------------------------
def login(page):
    """Log in at LOGIN_URL with EMAIL / PASSWORD.

    If the Email field does not become visible within 12 s the persistent
    profile is assumed to hold an active session and the login is skipped.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)
    try:
        page.get_by_label("Email").wait_for(state="visible", timeout=12000)
    except Exception:
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return
    log("LOGIN: zadavam email...")
    page.get_by_label("Email").fill(EMAIL)
    page.get_by_role("button", name="Next").click()
    log("LOGIN: cekam na pole pro heslo...")
    page.get_by_label("Password").wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    page.get_by_label("Password").fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()
    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        page.wait_for_url(lambda url: "code=" not in url or "xsp." in url, timeout=60000)
    except Exception:
        # Best effort: a timeout here is not fatal, the grid load will fail
        # later if the login really did not succeed.
        log("LOGIN: wait_for_url vyprsel, pokracuji.")
    # NOTE(review): the original layout was collapsed; this settle delay is
    # assumed to run after the try/except (on both paths) — confirm.
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")


def open_grid(page):
    """Open the Lab Reports grid and wait until its row count stabilises.

    Polls JS_ALL_ROWS up to 25 times, 2 s apart, and stops once two
    consecutive polls report the same non-zero count.

    Returns:
        ``(rows, row_height)`` — the list of row dicts produced by
        JS_ALL_ROWS (empty list when none) and the row height in pixels.
    """
    log(f"GRID: navigace na Lab Reports ({len(SITES)} center)...")
    page.goto(lab_reports_url())
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)

    prev = -1
    rows = None
    for i in range(25):
        rows = page.evaluate(JS_ALL_ROWS)
        cnt = len(rows) if rows else 0
        log(f" ...kontrola #{i+1}: rows={cnt}")
        if rows and cnt == prev and cnt > 0:
            break
        prev = cnt
        page.wait_for_timeout(2000)

    row_height = page.evaluate(JS_ROW_HEIGHT)
    log(f"GRID: nacteno {len(rows) if rows else 0} radku, rowHeight={row_height}px.")
    return rows or [], row_height


def download_pdf_bytes(page, idx):
    """Click the "English" download link of grid row *idx* and return the PDF bytes.

    The row must currently be rendered in the DOM. The downloaded temporary
    file is only read, not copied anywhere.
    """
    link = page.locator(
        f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link',
        has_text="English",
    ).first
    with page.expect_download(timeout=60000) as dl:
        link.click()
    with open(dl.value.path(), "rb") as f:
        return f.read()


def upsert(col, rec, cells, data, now):
    """Insert or update one report document keyed by ``record_id`` = reportId.

    Args:
        col: target MongoDB collection.
        rec: row dict from JS_ALL_ROWS (reportId, fileId, site, subject, ...).
        cells: visible cell texts from JS_CELLS.
        data: raw PDF bytes.
        now: timestamp string stored as firstSeen / lastSeen.

    Returns:
        ``"insert"`` for a new record, ``"update"`` when the PDF hash or the
        visible fields changed (the previous state is pushed to ``history``),
        ``"same"`` when only ``lastSeen`` was refreshed.
    """
    fields = {
        "Type": cells["type"],
        "Subject": rec["subject"],
        "Accession": cells["accession"],
        "Visit": cells["visit"],
        "Collected Date": cells["collected"],
        "Site Number": rec["site"],
        "Posted": cells["postedDisplay"],
    }
    sha = hashlib.sha256(data).hexdigest()
    derived = {
        "study": STUDY,
        "studyCode": STUDY_CODE,
        "type": cells["type"] or "Lab Result",
        "site": rec["site"],
        "subject": rec["subject"],
        "accession": cells["accession"],
        "visit": cells["visit"],
        "collected": fmt_date(cells["collected"]),
        "posted": cells["postedDisplay"],
        "postedIso": rec["postedIso"],
        "fileId": rec["fileId"],
        "serverFileName": rec["serverFileName"],
        "fields": fields,
        "fileName": build_basename({**rec,
                                    "accession": cells["accession"],
                                    "visit": cells["visit"],
                                    "collected": cells["collected"],
                                    "postedDisplay": cells["postedDisplay"]}),
        "pdf": Binary(data),
        "pdfSize": len(data),
        "pdfSha256": sha,
    }
    rid = rec["reportId"]
    existing = col.find_one({"record_id": rid},
                            {"pdfSha256": 1, "fields": 1, "lastSeen": 1})
    if existing is None:
        col.insert_one({"record_id": rid, **derived,
                        "firstSeen": now, "lastSeen": now, "history": []})
        return "insert"
    if existing.get("pdfSha256") != sha or existing.get("fields") != fields:
        col.update_one(
            {"_id": existing["_id"]},
            {"$push": {"history": {"date": existing.get("lastSeen"),
                                   "fields": existing.get("fields"),
                                   "pdfSha256": existing.get("pdfSha256")}},
             "$set": {**derived, "lastSeen": now}},
        )
        return "update"
    col.update_one({"_id": existing["_id"]}, {"$set": {"lastSeen": now}})
    return "same"


def _short_id(rec):
    """Return the first 12 characters of the row's reportId for log output.

    Goes through ``str()`` so that a row without a reportId cannot raise
    inside an error handler.
    """
    return str(rec.get("reportId"))[:12]


def _launch_context(p):
    """Start a persistent, headed Chromium context with downloads enabled."""
    context = p.chromium.launch_persistent_context(
        user_data_dir=PROFILE_DIR,
        headless=False,
        args=[
            "--disable-blink-features=AutomationControlled",
            "--start-maximized",
            "--disable-restore-session-state",
            "--disable-session-crashed-bubble",
        ],
        no_viewport=True,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        accept_downloads=True,
    )
    context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    return context


def _select_rows(rows, existing_ids):
    """Pick the rows to process according to the sync mode and --limit.

    In fullsync mode already-known rows are skipped; in delta mode the scan
    stops at the first known reportId (rows below it are treated as older).
    """
    todo = []
    for rec in rows:
        if rec["reportId"] in existing_ids:
            if FULLSYNC:
                continue  # fullsync: skip the known row and keep going
            log(f"DELTA stop-at-known: rowIndex {rec['rowIndex']} "
                f"(reportId {_short_id(rec)}…) uz v Mongo -> koncim (zbytek je starsi).")
            break  # delta: first known row ends the scan
        todo.append(rec)
    # NOTE: with --limit in delta mode the rows cut off here are older than
    # the ones kept, so a later delta run will not reach them.
    if ARGS.limit:
        todo = todo[:ARGS.limit]
    return todo


def _process_rows(page, col, todo, row_height, now, existing_ids):
    """Download and store every row in *todo*; one failing row never aborts the run.

    Returns:
        ``(new_cnt, upd_cnt, same_cnt, failed)`` where *failed* is the list of
        grid row indexes that raised.
    """
    new_cnt = upd_cnt = same_cnt = 0
    failed = []
    total = len(todo)
    for k, rec in enumerate(todo, 1):
        idx = rec["rowIndex"]
        try:
            # The grid is virtualised: scroll first so the row exists in the DOM.
            page.evaluate(JS_SCROLL_TO, [idx, row_height])
            page.wait_for_selector(f'.ag-body-container .ag-row[row-index="{idx}"]', timeout=15000)
            page.wait_for_timeout(120)
            cells = page.evaluate(JS_CELLS, idx)
            if not cells:
                raise RuntimeError("nepodarilo se precist bunky radku")
            meta = {"site": rec["site"], "subject": rec["subject"],
                    "accession": cells["accession"], "visit": cells["visit"],
                    "collected": cells["collected"], "postedDisplay": cells["postedDisplay"]}
            fname = build_basename(meta)

            if ARGS.dry_run:
                log(f">>> {k}/{total} [DRY] NOVY rowIdx {idx} ({_short_id(rec)}…): {fname}")
                new_cnt += 1
                continue

            data = download_pdf_bytes(page, idx)
            action = upsert(col, rec, cells, data, now)
            existing_ids.add(rec["reportId"])
            if action == "insert":
                new_cnt += 1
                log(f">>> {k}/{total} INSERT rowIdx {idx} ({len(data)//1024} KB): {fname}")
            elif action == "update":
                upd_cnt += 1
                log(f">>> {k}/{total} UPDATE rowIdx {idx}: {fname}")
            else:
                same_cnt += 1
                log(f">>> {k}/{total} SAME rowIdx {idx}: {fname}")
        except Exception as e:
            failed.append(idx)
            log(f"CHYBA rowIdx {idx} ({_short_id(rec)}…): {e!r} — pokracuji.")
    return new_cnt, upd_cnt, same_cnt, failed


def _run_sync(context, col, existing_ids, now):
    """Log in, load the grid, select the rows and process them."""
    page = context.new_page()
    log("START: prohlizec spusten.")

    login(page)
    rows, row_height = open_grid(page)

    # SAFEGUARD: keep CZ sites only (in case the URL filter failed).
    non_cz = [r for r in rows if not str(r["site"]).startswith("CZ")]
    if non_cz:
        log(f"POZOR: {len(non_cz)} ne-CZ radku v gridu (napr. {non_cz[0]['site']}) "
            f"-> filtruji jen CZ. Zkontroluj URL filtr center!")
        rows = [r for r in rows if str(r["site"]).startswith("CZ")]
    # NOTE(review): the original layout was collapsed; this summary line is
    # assumed to be logged unconditionally — confirm.
    log(f"GRID: po CZ-pojistce {len(rows)} CZ radku.")

    todo = _select_rows(rows, existing_ids)
    log(f"PLAN [{'FULLSYNC' if FULLSYNC else 'DELTA'}]: {len(todo)} novych radku ke stazeni "
        f"(z {len(rows)} v gridu, {len(existing_ids)} uz v Mongo).")

    new_cnt, upd_cnt, same_cnt, failed = _process_rows(
        page, col, todo, row_height, now, existing_ids)

    log(f"KONEC: nove={new_cnt}, update={upd_cnt}, beze zmeny={same_cnt}, chyby={len(failed)}.")
    if failed:
        log(f"KONEC: SELHALY rowIndexy: {failed}")
        if not FULLSYNC:
            # Once a newer row has been stored, the next delta run stops
            # above the failed rows and never retries them.
            log("KONEC: POZOR - v rezimu delta mohou byt selhane radky priste "
                "preskoceny (stop-at-known) -> spust --fullsync.")


def main():
    """Run one synchronization pass: XSP Lab Reports grid -> MongoDB.

    Connects to MongoDB (pinged even for --dry-run, which still reads the
    known record ids but writes nothing), then drives the browser. The Mongo
    client and the browser context are always closed, also on errors.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log(f"START: studie {STUDY_CODE} ({STUDY}) -> {DB_NAME}.{COLLECTION}, "
        f"rezim={'FULLSYNC' if FULLSYNC else 'DELTA'}"
        f"{', DRY-RUN' if ARGS.dry_run else ''}"
        f"{f', limit {ARGS.limit}' if ARGS.limit else ''}.")

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        collection = client[DB_NAME][COLLECTION]
        col = None
        if not ARGS.dry_run:
            col = collection
            col.create_index([("record_id", ASCENDING)], unique=True)
            for f in ("study", "site", "subject", "accession", "postedIso", "fileId"):
                col.create_index([(f, ASCENDING)])
        existing_ids = {d["record_id"]
                        for d in collection.find({"study": STUDY}, {"record_id": 1})}
        log(f"START: v Mongo je {len(existing_ids)} reportu pro tuto studii.")

        with sync_playwright() as p:
            context = _launch_context(p)
            try:
                _run_sync(context, col, existing_ids, now)
            finally:
                context.close()
    finally:
        client.close()


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        log(f"FATAL: beh spadl: {e!r}")
        traceback.print_exc()