# =============================================================================
# Název: download_lab_reports_v1.2.py
# Verze: 1.2
# Datum: 2026-06-16
# Popis: Stahuje PDF Lab Reports ze xsp.labcorp.com pro studii 77242113UCO3001
#        (interni cislo 36940), filtrovane na 10 ceskych center (CZ),
#        a uklada je PRIMO do MongoDB (db covance, kolekce labreports) —
#        metadata z tabulky + skutecne PDF (inline Binary). Na disk NEUKLADA.
#
# KLIC:  record_id = reportId (z dat AG Gridu) — stabilni 32-hex ID
#        dokumentu, NAPRIC vsemi radky UNIKATNI (overeno: 997 radku ->
#        997 unikatnich reportId/fileId). Resi pripad, kdy se na portalu
#        vyskytnou ruzna PDF se SHODNYMI viditelnymi metadaty (stejny
#        site|subject|accession|visit|posted i na minutu) — to skutecne
#        nastava (korekce vysledku reissue se shodnym casem Posted).
#        Verze v1.1 klicovala podle metadat a tyto by chybne slucovala.
#
# Princip stahovani: Playwright + perzistentni profil, login, klik na
#        "English" ve sloupci Download -> expect_download; PDF bajty se ctou
#        z Playwright temp souboru (download.path()), save_as se nevola.
#
# INKREMENTALNE (stop-at-known): list je Posted DESC (nejnovejsi
#        nahore). Nejdriv se z grid API precte SEZNAM (rowIndex, reportId)
#        BEZ stahovani; od shora se hleda prvni uz ulozeny reportId -> vse
#        pod nim je starsi a uz v Mongo je. Stahuji se jen nove (nahore).
#
# Prepinace:
#        --full      projit vsechny radky (bez stop-at-known); upsert
#                    chybejicich (rekonciliace).
#        --dry-run   nestahuje ani nepise do DB; jen vypise NOVE reporty.
#        --limit N   zpracovat max N novych radku (test).
# =============================================================================
import argparse
import hashlib
import json
import os
import re
import traceback
import urllib.parse
from datetime import datetime

from bson.binary import Binary
from playwright.sync_api import sync_playwright
from pymongo import ASCENDING, MongoClient

# Command-line switches; parsed at import time because the rest of the script
# reads the module-level ARGS object directly.
parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) do MongoDB.")
parser.add_argument("--full", action="store_true", help="projit vse (bez stop-at-known)")
parser.add_argument("--dry-run", action="store_true", help="nestahovat ani nepsat do DB; jen vypsat nove")
parser.add_argument("--limit", type=int, default=0, help="max N novych radku (0 = vse)")
ARGS = parser.parse_args()


def log(msg: str) -> None:
    """Print *msg* prefixed with the current wall-clock time, flushing immediately."""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}", flush=True)


# --- configuration -----------------------------------------------------------
# SECURITY: credentials should come from the environment (XSP_EMAIL /
# XSP_PASSWORD). The literal fallbacks are kept only so existing setups keep
# working; they are committed in plain text and should be rotated and removed.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
LOGIN_URL = "https://xsp.labcorp.com/"
STUDY = "36940"
STUDY_CODE = "77242113UCO3001"

MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "covance"
COLLECTION = "labreports"

# Site identifiers passed to the portal's `site` URL filter.
SITES = [
    "930539", "930547", "930555", "930556", "930553",
    "930549", "930525", "930536", "930557", "930531",
]

_BASE = os.path.dirname(os.path.abspath(__file__))
# Persistent Chromium profile directory, stored next to this script.
PROFILE_DIR = os.path.join(_BASE, "browser_profile")


def lab_reports_url() -> str:
    """Return the Lab Reports URL for STUDY with SITES as a URL-encoded JSON filter."""
    site_param = json.dumps(SITES, separators=(",", ":"))
    return (f"https://xsp.labcorp.com/sponsor/study/{STUDY}/lab-reports"
            f"?site={urllib.parse.quote(site_param)}")


# --- formatting --------------------------------------------------------------
_MONTHS = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05", "Jun": "06",
           "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12"}


def safe(s: str) -> str:
    """Strip characters that are illegal in Windows file names; None becomes ''."""
    return re.sub(r'[\\/:*?"<>|]', "", s or "").strip()


def fmt_date(s: str) -> str:
    """Convert 'Jun 10, 2026' or 'Jun 15, 2026 7:49 PM' to '2026-06-10'.

    Only the leading date part is used; any trailing time is ignored. The month
    abbreviation is matched case-insensitively. Input that does not look like
    such a date is returned through safe() unchanged otherwise.
    """
    m = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s+(\d{4})", s or "")
    if m:
        # Normalise case so 'JUN' / 'jun' resolve like 'Jun'.
        month = _MONTHS.get(m.group(1).capitalize())
        if month:
            return f"{m.group(3)}-{month}-{int(m.group(2)):02d}"
    return safe(s)


def build_basename(meta: dict) -> str:
    """Build the human-readable PDF name stored as ``fileName``.

    *meta* must contain the keys 'collected', 'site', 'subject', 'visit',
    'accession' and 'postedDisplay'.
    """
    return safe(
        f"{STUDY_CODE} {fmt_date(meta['collected'])} {meta['site']} "
        f"{meta['subject']} {meta['visit']} {meta['accession']} "
        f"posted {fmt_date(meta['postedDisplay'])}"
    ) + ".pdf"


# --- JS helpers (AG Grid) ----------------------------------------------------
# Lists every displayed row (rowIndex + reportId + the data fields that do not
# require the row to be rendered). Returns null when no grid API is reachable.
JS_ALL_ROWS = r"""() => {
  let holder = null;
  for (const el of document.querySelectorAll('.ag-root-wrapper, .ag-root, ag-grid-angular')) {
    if (el.__agComponent) { holder = el.__agComponent; break; }
  }
  if (!holder) return null;
  const api = (holder.gridApi && holder.gridApi.forEachNode) ? holder.gridApi : holder;
  if (!api || !api.getDisplayedRowCount) return null;
  const cnt = api.getDisplayedRowCount();
  const out = [];
  for (let i = 0; i < cnt; i++) {
    const n = api.getDisplayedRowAtIndex(i);
    if (!n || !n.data) continue;
    const d = n.data;
    const fl = (d.fileLinks || []).find(f => f.language === 'English') || (d.fileLinks || [])[0] || {};
    out.push({
      rowIndex: i,
      reportId: d.reportId,
      fileId: fl.fileId,
      serverFileName: fl.fileName,
      postedIso: d.postedDateTime,
      site: d.siteNum,
      subject: d.subjectNumber,
    });
  }
  return out;
}"""

# Cells of a given row (need the row rendered -> scroll to it first).
# Accession/Visit/Collected are not in the top-level row data (they live under
# 'visits'), so they are read from the rendered cells instead.
# Reads the rendered cells of row `idx`; returns null if the row is not in the
# DOM. `dedup` collapses whitespace and removes text that appears twice in a
# cell. NOTE(review): a value that is genuinely a doubled string (e.g. '1212')
# would also be halved by this heuristic — confirm it cannot occur.
JS_CELLS = r"""(idx) => {
  const dedup = s => {
    s = (s || '').replace(/\s+/g, ' ').trim();
    const h = s.slice(0, Math.floor(s.length / 2));
    if (s === h + h) return h;
    const m = s.match(/^(.*?)\s+\1$/);
    if (m) return m[1];
    return s;
  };
  const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
  if (!row) return null;
  const get = id => {
    const c = row.querySelector('[col-id="' + id + '"]');
    return c ? dedup(c.textContent) : '';
  };
  return {
    type: get('type'),
    accession: get('accessionNumber'),
    visit: get('visit'),
    collected: get('visitCollectionDate'),
    postedDisplay: get('postedDateTime'),
  };
}"""

# Scrolls the grid viewport so that row `idx` (of height `rh`) is roughly centred.
JS_SCROLL_TO = r"""(args) => {
  const [idx, rh] = args;
  const vp = document.querySelector('.ag-body-viewport');
  if (!vp) return;
  vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
}"""

# Height in px of the first rendered row; falls back to 25 when unavailable.
JS_ROW_HEIGHT = r"""() => {
  const r = document.querySelector('.ag-body-container .ag-row');
  return r ? r.getBoundingClientRect().height || 25 : 25;
}"""


# --- login -------------------------------------------------------------------
def login(page):
    """Log in through the portal's two-step form (e-mail, then password).

    If the e-mail field does not become visible within 12 s the persisted
    session is assumed to be still active and the login is skipped.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)
    try:
        page.get_by_label("Email").wait_for(state="visible", timeout=12000)
    except Exception:
        # Best effort: any failure to see the field is treated as "already logged in".
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return
    log("LOGIN: zadavam email...")
    page.get_by_label("Email").fill(EMAIL)
    page.get_by_role("button", name="Next").click()
    log("LOGIN: cekam na pole pro heslo...")
    page.get_by_label("Password").wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    page.get_by_label("Password").fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()
    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        page.wait_for_url(lambda url: "code=" not in url or "xsp." in url, timeout=60000)
    except Exception:
        log("LOGIN: wait_for_url vyprsel, pokracuji.")
    # Short settle time after the redirect before the grid page is opened.
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")


def open_grid(page):
    """Open the Lab Reports grid and return ``(rows, row_height)``.

    Polls the grid API (up to 25 times, 2 s apart) until two consecutive polls
    report the same non-zero row count. ``rows`` is the JS_ALL_ROWS result, or
    an empty list if the grid API never became available.
    """
    log(f"GRID: navigace na Lab Reports ({len(SITES)} center)...")
    page.goto(lab_reports_url())
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)
    # Wait until the grid API reports a stable number of rows.
    prev = -1
    rows = None
    for i in range(25):
        rows = page.evaluate(JS_ALL_ROWS)
        cnt = len(rows) if rows else 0
        log(f" ...kontrola #{i+1}: rows={cnt}")
        if rows and cnt == prev and cnt > 0:
            break
        prev = cnt
        page.wait_for_timeout(2000)
    row_height = page.evaluate(JS_ROW_HEIGHT)
    log(f"GRID: nacteno {len(rows) if rows else 0} radku, rowHeight={row_height}px.")
    return rows or [], row_height


def download_pdf_bytes(page, idx):
    """Click the 'English' download link of rendered row *idx* and return the PDF bytes.

    The bytes are read from the file Playwright saved for the download; the
    file is not copied anywhere else.
    """
    link = page.locator(
        f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link',
        has_text="English",
    ).first
    with page.expect_download(timeout=60000) as dl:
        link.click()
    with open(dl.value.path(), "rb") as f:
        return f.read()


def upsert(col, rec, cells, data, now):
    """Insert or update the report document keyed by ``record_id == rec['reportId']``.

    Args:
        col: MongoDB collection to write to.
        rec: row record from JS_ALL_ROWS (reportId, fileId, site, subject, ...).
        cells: rendered cell texts from JS_CELLS.
        data: raw PDF bytes, stored inline in the document.
        now: timestamp string written to firstSeen / lastSeen.

    Returns:
        "insert" for a new document, "update" when the PDF hash or the visible
        fields changed (the previous state is pushed to ``history``), or "same"
        when only ``lastSeen`` was refreshed.
    """
    fields = {
        "Type": cells["type"],
        "Subject": rec["subject"],
        "Accession": cells["accession"],
        "Visit": cells["visit"],
        "Collected Date": cells["collected"],
        "Site Number": rec["site"],
        "Posted": cells["postedDisplay"],
    }
    sha = hashlib.sha256(data).hexdigest()
    meta = {"site": rec["site"], "subject": rec["subject"], "accession": cells["accession"],
            "visit": cells["visit"], "collected": cells["collected"],
            "postedDisplay": cells["postedDisplay"]}
    derived = {
        "study": STUDY,
        "studyCode": STUDY_CODE,
        "type": cells["type"] or "Lab Result",
        "site": rec["site"],
        "subject": rec["subject"],
        "accession": cells["accession"],
        "visit": cells["visit"],
        "collected": fmt_date(cells["collected"]),
        "posted": cells["postedDisplay"],
        "postedIso": rec["postedIso"],
        "fileId": rec["fileId"],
        "serverFileName": rec["serverFileName"],
        "fields": fields,
        "fileName": build_basename(meta),
        "pdf": Binary(data),
        "pdfSize": len(data),
        "pdfSha256": sha,
    }
    rid = rec["reportId"]
    existing = col.find_one({"record_id": rid}, {"pdfSha256": 1, "fields": 1, "lastSeen": 1})
    if existing is None:
        col.insert_one({"record_id": rid, **derived,
                        "firstSeen": now, "lastSeen": now, "history": []})
        return "insert"
    if existing.get("pdfSha256") != sha or existing.get("fields") != fields:
        col.update_one(
            {"_id": existing["_id"]},
            {"$push": {"history": {"date": existing.get("lastSeen"),
                                   "fields": existing.get("fields"),
                                   "pdfSha256": existing.get("pdfSha256")}},
             "$set": {**derived, "lastSeen": now}},
        )
        return "update"
    col.update_one({"_id": existing["_id"]}, {"$set": {"lastSeen": now}})
    return "same"


def _select_todo(rows, existing_ids, full):
    """Return the rows to process, walking the grid top-down.

    Without *full* the walk stops at the first row already stored; with *full*
    stored rows are skipped and the walk continues to the end.
    """
    todo = []
    for rec in rows:
        if rec["reportId"] in existing_ids:
            if full:
                continue
            log(f"STOP-AT-KNOWN: rowIndex {rec['rowIndex']} (reportId {rec['reportId'][:12]}…) "
                f"uz v Mongo -> koncim vyber (zbytek je starsi).")
            break
        todo.append(rec)
    return todo


def _process_rows(page, col, todo, row_height, existing_ids, now):
    """Download and store every row in *todo*; return (new, updated, same, failed_indexes).

    A failure on one row is logged and recorded, and processing continues with
    the next row. In dry-run mode rows are only listed and counted as new.
    """
    new_cnt = upd_cnt = same_cnt = 0
    failed = []
    total = len(todo)
    for k, rec in enumerate(todo, 1):
        idx = rec["rowIndex"]
        try:
            # The grid only renders visible rows, so bring the row into view first.
            page.evaluate(JS_SCROLL_TO, [idx, row_height])
            page.wait_for_selector(f'.ag-body-container .ag-row[row-index="{idx}"]', timeout=15000)
            page.wait_for_timeout(120)
            cells = page.evaluate(JS_CELLS, idx)
            if not cells:
                raise RuntimeError("nepodarilo se precist bunky radku")
            meta = {"site": rec["site"], "subject": rec["subject"],
                    "accession": cells["accession"], "visit": cells["visit"],
                    "collected": cells["collected"], "postedDisplay": cells["postedDisplay"]}
            fname = build_basename(meta)

            if ARGS.dry_run:
                log(f">>> {k}/{total} [DRY] NOVY rowIdx {idx} ({rec['reportId'][:12]}…): {fname}")
                new_cnt += 1
                continue

            data = download_pdf_bytes(page, idx)
            action = upsert(col, rec, cells, data, now)
            existing_ids.add(rec["reportId"])
            if action == "insert":
                new_cnt += 1
                log(f">>> {k}/{total} INSERT rowIdx {idx} ({len(data)//1024} KB): {fname}")
            elif action == "update":
                upd_cnt += 1
                log(f">>> {k}/{total} UPDATE rowIdx {idx}: {fname}")
            else:
                same_cnt += 1
                log(f">>> {k}/{total} SAME rowIdx {idx}: {fname}")
        except Exception as e:
            # Deliberate best effort: one bad row must not abort the whole run.
            failed.append(idx)
            log(f"CHYBA rowIdx {idx} ({rec['reportId'][:12]}…): {e!r} — pokracuji.")
    return new_cnt, upd_cnt, same_cnt, failed


def main():
    """Run one synchronisation pass: log in, list the grid, store new reports.

    The MongoDB client and the persistent browser context are always closed,
    also when the run fails part-way.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # A negative --limit would slice rows off the END of the list; treat it as "no limit".
    limit = ARGS.limit if ARGS.limit > 0 else 0
    log(f"START: studie {STUDY_CODE} ({STUDY}) -> {DB_NAME}.{COLLECTION}, "
        f"{'DRY-RUN' if ARGS.dry_run else 'ZAPIS'}"
        f"{' [FULL]' if ARGS.full else ' [stop-at-known]'}"
        f"{f', limit {limit}' if limit else ''}.")

    col = None
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        if not ARGS.dry_run:
            col = client[DB_NAME][COLLECTION]
            col.create_index([("record_id", ASCENDING)], unique=True)
            for f in ("study", "site", "subject", "accession", "postedIso", "fileId"):
                col.create_index([(f, ASCENDING)])
        # Known ids are needed in dry-run too, to tell which reports are new.
        existing_ids = {d["record_id"] for d in
                        client[DB_NAME][COLLECTION].find({"study": STUDY}, {"record_id": 1})}
        log(f"START: v Mongo je {len(existing_ids)} reportu pro tuto studii.")

        with sync_playwright() as p:
            context = p.chromium.launch_persistent_context(
                user_data_dir=PROFILE_DIR,
                headless=False,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--start-maximized",
                    "--disable-restore-session-state",
                    "--disable-session-crashed-bubble",
                ],
                no_viewport=True,
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            try:
                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()
                log("START: prohlizec spusten.")

                login(page)
                rows, row_height = open_grid(page)

                # SAFETY NET: CZ sites only. The URL filter should already return
                # only the configured sites, but if it is lost (e.g. after a
                # reload) this check prevents downloading other sites' reports.
                non_cz = [r for r in rows if not str(r["site"]).startswith("CZ")]
                if non_cz:
                    log(f"POZOR: {len(non_cz)} ne-CZ radku v gridu (napr. {non_cz[0]['site']}) "
                        f"-> filtruji jen CZ. Zkontroluj URL filtr center!")
                    rows = [r for r in rows if str(r["site"]).startswith("CZ")]
                log(f"GRID: po CZ-pojistce {len(rows)} CZ radku.")

                # Pick the rows to process: top-down, stop-at-known.
                todo = _select_todo(rows, existing_ids, ARGS.full)
                skipped_by_limit = 0
                if limit:
                    skipped_by_limit = max(0, len(todo) - limit)
                    todo = todo[:limit]
                log(f"PLAN: {len(todo)} novych radku ke stazeni "
                    f"(z {len(rows)} v gridu, {len(existing_ids)} uz v Mongo).")

                new_cnt, upd_cnt, same_cnt, failed = _process_rows(
                    page, col, todo, row_height, existing_ids, now)

                log(f"KONEC: nove={new_cnt}, update={upd_cnt}, beze zmeny={same_cnt}, chyby={len(failed)}.")
                if failed:
                    log(f"KONEC: SELHALY rowIndexy: {failed}")
                # Rows left out below newer, now-stored rows are invisible to the
                # next stop-at-known run, which stops at the first known row.
                if not ARGS.dry_run and not ARGS.full:
                    if failed:
                        log("POZOR: selhane radky muze pristi beh se stop-at-known preskocit "
                            "-> spust znovu s --full.")
                    if skipped_by_limit:
                        log(f"POZOR: --limit vynechal {skipped_by_limit} starsich novych radku; "
                            f"stop-at-known je priste preskoci -> spust s --full.")
            finally:
                # Close the persistent context cleanly even when the run fails.
                context.close()
    finally:
        client.close()


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        log(f"FATAL: beh spadl: {e!r}")
        traceback.print_exc()
    finally:
        # Keep the console window open until the operator confirms.
        try:
            input("\n[Enter] pro zavreni tohoto okna...")
        except EOFError:
            pass