# =============================================================================
# Name:        download_lab_reports_v1.1.py
# Version:     1.1
# Date:        2026-06-16
# Description: Downloads Lab Report PDFs from xsp.labcorp.com for study
#              77242113UCO3001 (internal number 36940), filtered to 10 Czech
#              (CZ) sites, and stores them DIRECTLY in MongoDB (db covance,
#              collection labreports) — the table metadata plus the actual
#              PDF (inline Binary). Nothing is saved to disk.
#
#   The download mechanism is the same as in download_test_results:
#   Playwright + a persistent profile, a one-time login, a click on "English"
#   in the Download column. The PDF bytes are read from Playwright's temporary
#   file (download.path()); save_as is never called -> nothing is kept on disk
#   permanently.
#
#   INCREMENTAL (stop-at-known): the list is sorted by Posted DESC (newest on
#   top). The script walks from top to bottom; for each row it first reads the
#   metadata and computes record_id. As soon as it reaches a report that is
#   already stored, it STOPS (everything below is older and already in Mongo).
#   A corrected result = the same report reissued with a NEW Posted value
#   => new record_id => it is downloaded as a new report; the original stays.
#
#   record_id = "{site}|{subject}|{accession}|{visit}|{posted}"
#   (Posted including the time distinguishes a reissue).
#
#   Switches:
#     --full      walk all rows (no early stop); upserts missing / changed
#                 reports (reconciliation). Note: rows whose record_id is
#                 already in Mongo are skipped without re-downloading.
#     --dry-run   downloads nothing and writes nothing to the DB; only lists
#                 NEW reports.
#     --limit N   process at most N rows (for testing).
# =============================================================================

import argparse
import hashlib
import json
import os
import re
import traceback
import urllib.parse
from datetime import datetime

from bson.binary import Binary
from playwright.sync_api import sync_playwright
from pymongo import ASCENDING, MongoClient

# --- arguments ---------------------------------------------------------------
# NOTE: arguments are parsed at import time, so ARGS is available to every
# function in this script as a module-level global.
parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) do MongoDB.")
parser.add_argument("--full", action="store_true",
                    help="projit vse (bez stop-at-known)")
parser.add_argument("--dry-run", action="store_true",
                    help="nestahovat ani nepsat do DB; jen vypsat nove")
parser.add_argument("--limit", type=int, default=0,
                    help="max N radku (0 = vse)")
ARGS = parser.parse_args()


def log(msg):
    """Print *msg* to stdout prefixed with the current HH:MM:SS time.

    Output is flushed on every call so progress is visible immediately.
    """
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}", flush=True)


# --- configuration -----------------------------------------------------------
# SECURITY NOTE(review): login credentials are embedded in this source file.
# They can be overridden through the XSP_EMAIL / XSP_PASSWORD environment
# variables; the literals below are kept only as a fallback so that existing
# runs keep working. The password should be rotated and the literals removed.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
LOGIN_URL = "https://xsp.labcorp.com/"
STUDY = "36940"
STUDY_CODE = "77242113UCO3001"
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "covance"
COLLECTION = "labreports"

# 10 sites (internal site IDs) — taken from the "GO TO LINK" URL.
SITES = [
    "930539", "930547", "930555", "930556", "930553",
    "930549", "930525", "930536", "930557", "930531",
]

_BASE = os.path.dirname(os.path.abspath(__file__))
# Persistent browser profile stored next to this script.
PROFILE_DIR = os.path.join(_BASE, "browser_profile")


def lab_reports_url():
    """Return the Lab Reports grid URL for STUDY, filtered to SITES.

    The site list is sent as a compact JSON array, URL-quoted, in the
    ``site`` query parameter.
    """
    site_param = json.dumps(SITES, separators=(",", ":"))
    return (f"https://xsp.labcorp.com/sponsor/study/{STUDY}/lab-reports"
            f"?site={urllib.parse.quote(site_param)}")


# --- formatting --------------------------------------------------------------
_MONTHS = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
           "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
           "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12"}


def safe(s: str) -> str:
    r"""Strip the characters ``\ / : * ? " < > |`` and surrounding whitespace.

    A falsy *s* (``None`` or ``""``) yields an empty string.
    """
    return re.sub(r'[\\/:*?"<>|]', "", s or "").strip()


def fmt_date(s: str) -> str:
    """Convert a grid date to ISO ``YYYY-MM-DD``; any trailing time is ignored.

    'Jun 10, 2026' -> '2026-06-10'; 'Jun 15, 2026 7:49 PM' -> '2026-06-15'.
    Text that does not look like such a date is returned through ``safe()``.
    """
    m = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s+(\d{4})", s or "")
    if m and m.group(1)[:3] in _MONTHS:
        return f"{m.group(3)}-{_MONTHS[m.group(1)[:3]]}-{int(m.group(2)):02d}"
    return safe(s)


def fmt_datetime(s: str) -> str:
    """Convert a grid timestamp to ``YYYY-MM-DD HH:MM`` (24-hour clock).

    'Jun 15, 2026 7:49 PM' -> '2026-06-15 19:49'. Seconds, when present, are
    dropped. A value without a time part falls back to ``fmt_date()``.
    """
    s = (s or "").strip()
    for fmt in ("%b %d, %Y %I:%M %p", "%b %d, %Y %I:%M:%S %p"):
        try:
            return datetime.strptime(s, fmt).strftime("%Y-%m-%d %H:%M")
        except ValueError:
            pass  # try the next layout, then the date-only fallback
    return fmt_date(s)


def make_record_id(meta: dict) -> str:
    """Build the record key ``site|subject|accession|visit|posted``.

    ``posted`` is normalised with ``fmt_datetime()``; the other parts are the
    raw strings from *meta*.
    """
    return "|".join([
        meta["site"],
        meta["subject"],
        meta["accession"],
        meta["visit"],
        fmt_datetime(meta["posted"]),
    ])


def build_basename(meta: dict) -> str:
    """Return the PDF file name for a report (stored as ``fileName``).

    Nothing is written under this name here; it is kept for a future script
    that materialises the PDFs from the database.
    """
    return safe(
        f"{STUDY_CODE} {fmt_date(meta['collected'])} {meta['site']} "
        f"{meta['subject']} {meta['visit']} {meta['accession']} "
        f"posted {fmt_date(meta['posted'])}"
    ) + ".pdf"


# --- JS helpers (AG Grid) ----------------------------------------------------
# Returns the height of a rendered row (25 when none is rendered or its height
# is 0) and the total row count, computed as container height / row height.
JS_GRID_INFO = r"""() => {
    const c = document.querySelector('.ag-body-container');
    const r = document.querySelector('.ag-body-container .ag-row');
    const rh = r ? r.getBoundingClientRect().height : 25;
    const ch = c ? parseFloat(c.style.height || '0') : 0;
    return { rowHeight: rh || 25, total: rh ? Math.round(ch / rh) : 0 };
}"""

# Reads the cells of the row with the given row-index; returns null when that
# row is not rendered. Cell text that consists of the same value twice is
# collapsed to a single copy (dedup).
JS_READ_ROW = r"""(idx) => {
    const dedup = s => {
        s = (s || '').replace(/\s+/g, ' ').trim();
        const h = s.slice(0, Math.floor(s.length / 2));
        if (s === h + h) return h;
        const m = s.match(/^(.*?)\s+\1$/);
        if (m) return m[1];
        return s;
    };
    const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
    if (!row) return null;
    const get = id => {
        const c = row.querySelector('[col-id="' + id + '"]');
        return c ? dedup(c.textContent) : '';
    };
    return {
        type: get('type'),
        subject: get('subjectId'),
        accession: get('accessionNumber'),
        visit: get('visit'),
        collected: get('visitCollectionDate'),
        site: get('siteNum'),
        posted: get('postedDateTime'),
    };
}"""

# Scrolls the grid viewport so that row `idx` sits roughly in its middle.
JS_SCROLL_TO = r"""(args) => {
    const [idx, rh] = args;
    const vp = document.querySelector('.ag-body-viewport');
    if (!vp) return;
    vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
}"""


# --- login -------------------------------------------------------------------
def login(page):
    """Log in with EMAIL / PASSWORD unless the session is already active.

    If the Email field does not become visible within 12 s the login is
    assumed to be unnecessary and is skipped.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)
    try:
        page.get_by_label("Email").wait_for(state="visible", timeout=12000)
    except Exception:
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    log("LOGIN: zadavam email...")
    page.get_by_label("Email").fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    log("LOGIN: cekam na pole pro heslo...")
    page.get_by_label("Password").wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    page.get_by_label("Password").fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        # NOTE(review): this predicate is already true for any URL that does
        # not contain "code=", so it may return immediately — confirm intent.
        page.wait_for_url(lambda url: "code=" not in url or "xsp." in url, timeout=60000)
    except Exception:
        log("LOGIN: wait_for_url vyprsel, pokracuji.")
    # Short fixed pause after the redirect wait, whether or not it timed out.
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")


def open_grid(page):
    """Open the Lab Reports grid and wait until its row count is stable.

    Polls JS_GRID_INFO every 2 s (at most 20 times) until two consecutive
    non-zero totals match.

    Returns:
        (total, row_height) as reported by JS_GRID_INFO.
    """
    log(f"GRID: navigace na Lab Reports ({len(SITES)} center)...")
    page.goto(lab_reports_url())
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)

    prev = -1
    for i in range(20):
        info = page.evaluate(JS_GRID_INFO)
        cnt = info["total"]
        log(f" ...kontrola #{i+1}: total={cnt}, rowHeight={info['rowHeight']}")
        if cnt == prev and cnt > 0:
            break
        prev = cnt
        page.wait_for_timeout(2000)

    info = page.evaluate(JS_GRID_INFO)
    log(f"GRID: nacteno, total={info['total']} radku, rowHeight={info['rowHeight']}px.")
    return info["total"], info["rowHeight"]


def read_row(page, idx, row_height):
    """Scroll row *idx* into view and return its metadata dict.

    Raises:
        RuntimeError: the row could not be read or has no subject.
    """
    page.evaluate(JS_SCROLL_TO, [idx, row_height])
    page.wait_for_selector(f'.ag-body-container .ag-row[row-index="{idx}"]', timeout=15000)
    page.wait_for_timeout(120)  # brief pause before reading the cells
    meta = page.evaluate(JS_READ_ROW, idx)
    if not meta or not meta.get("subject"):
        raise RuntimeError(f"radek {idx}: nepodarilo se precist metadata")
    return meta


def download_pdf_bytes(page, idx):
    """Click the "English" download link of row *idx* and return the PDF bytes.

    The bytes are read from Playwright's temporary download file; ``save_as``
    is not called.

    Raises:
        RuntimeError: the downloaded file is empty. Storing it would mark the
            report as known and it would never be fetched again.
    """
    link = page.locator(
        f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link',
        has_text="English",
    ).first
    with page.expect_download(timeout=60000) as dl:
        link.click()
    path = dl.value.path()  # Playwright's temporary file
    with open(path, "rb") as f:
        data = f.read()
    if not data:
        raise RuntimeError(f"radek {idx}: stazeny PDF je prazdny (0 B)")
    return data


def upsert(col, meta, rid, data, now):
    """Insert or update the report *rid* in collection *col*.

    Args:
        col: MongoDB collection.
        meta: row metadata as returned by ``read_row()``.
        rid: record id from ``make_record_id()``.
        data: PDF content as bytes.
        now: timestamp string written to ``firstSeen`` / ``lastSeen``.

    Returns:
        "insert" when the record did not exist; "update" when the stored PDF
        hash or raw fields differ (the previous state is pushed to
        ``history``); "same" when only ``lastSeen`` was refreshed.
    """
    # Raw grid values, keyed by the column captions.
    fields = {
        "Type": meta["type"],
        "Subject": meta["subject"],
        "Accession": meta["accession"],
        "Visit": meta["visit"],
        "Collected Date": meta["collected"],
        "Site Number": meta["site"],
        "Posted": meta["posted"],
    }
    sha = hashlib.sha256(data).hexdigest()
    derived = {
        "study": STUDY,
        "studyCode": STUDY_CODE,
        "type": meta["type"] or "Lab Result",
        "site": meta["site"],
        "subject": meta["subject"],
        "accession": meta["accession"],
        "visit": meta["visit"],
        "collected": fmt_date(meta["collected"]),
        "posted": fmt_datetime(meta["posted"]),
        "fields": fields,
        "fileName": build_basename(meta),
        "pdf": Binary(data),
        "pdfSize": len(data),
        "pdfSha256": sha,
    }

    existing = col.find_one({"record_id": rid},
                            {"pdfSha256": 1, "fields": 1, "lastSeen": 1})
    if existing is None:
        col.insert_one({"record_id": rid, **derived,
                        "firstSeen": now, "lastSeen": now, "history": []})
        return "insert"

    if existing.get("pdfSha256") != sha or existing.get("fields") != fields:
        col.update_one(
            {"_id": existing["_id"]},
            {"$push": {"history": {"date": existing.get("lastSeen"),
                                   "fields": existing.get("fields"),
                                   "pdfSha256": existing.get("pdfSha256")}},
             "$set": {**derived, "lastSeen": now}},
        )
        return "update"

    col.update_one({"_id": existing["_id"]}, {"$set": {"lastSeen": now}})
    return "same"


def _known_record_ids(col):
    """Return the set of record_id values stored in *col* for STUDY."""
    return {d["record_id"] for d in col.find({"study": STUDY}, {"record_id": 1})}


def _process_rows(page, col, existing_ids, now, total, row_height):
    """Walk grid rows 0..total-1, storing every report not in *existing_ids*.

    Returns:
        (new_cnt, upd_cnt, same_cnt, failed, stopped), where *failed* is the
        list of row indexes that raised and *stopped* tells whether the walk
        ended early on a known report.
    """
    new_cnt = upd_cnt = same_cnt = 0
    failed = []
    stopped = False

    for idx in range(total):
        try:
            meta = read_row(page, idx, row_height)
        except Exception as e:
            failed.append(idx)
            log(f"CHYBA cteni radku {idx}: {e!r} — pokracuji.")
            continue

        rid = make_record_id(meta)
        if rid in existing_ids:
            if not ARGS.full:
                log(f">>> Radek {idx+1}/{total}: '{rid}' uz v Mongo "
                    f"-> stop-at-known, koncim (zbytek je starsi).")
                stopped = True
                break
            # --full: keep walking, but do not download known reports again.
            log(f" #{idx}: znamy, [full] preskakuji download.")
            same_cnt += 1
            continue

        # New report.
        if ARGS.dry_run:
            log(f" [DRY] NOVY #{idx}: {build_basename(meta)}")
            new_cnt += 1
            existing_ids.add(rid)
            continue

        try:
            data = download_pdf_bytes(page, idx)
            action = upsert(col, meta, rid, data, now)
            existing_ids.add(rid)
            if action == "insert":
                new_cnt += 1
                log(f" #{idx}: INSERT ({len(data)//1024} KB) {build_basename(meta)}")
            elif action == "update":
                upd_cnt += 1
                log(f" #{idx}: UPDATE {build_basename(meta)}")
            else:
                same_cnt += 1
        except Exception as e:
            # One bad row must not abort the run; it is reported at the end.
            failed.append(idx)
            log(f"CHYBA stazeni/zapisu radku {idx}: {e!r} — pokracuji.")

    return new_cnt, upd_cnt, same_cnt, failed, stopped


def main():
    """Run one download pass according to ARGS.

    Connects to MongoDB (in dry-run mode an unreachable server is tolerated
    and every report is then treated as new), starts the persistent browser
    profile, logs in, walks the grid and logs a summary.

    The browser context and the Mongo client are closed even when the pass
    fails. A failed row is not recorded as known, but in stop-at-known mode a
    later run reaches it only if no newer known report sits above it; use
    --full to retry such rows.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log(f"START: studie {STUDY_CODE} ({STUDY}) -> {DB_NAME}.{COLLECTION}, "
        f"{'DRY-RUN' if ARGS.dry_run else 'ZAPIS'}"
        f"{' [FULL]' if ARGS.full else ' [stop-at-known]'}"
        f"{f', limit {ARGS.limit}' if ARGS.limit else ''}.")

    client = None
    col = None
    existing_ids = set()
    try:
        if not ARGS.dry_run:
            client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
            client.admin.command("ping")
            col = client[DB_NAME][COLLECTION]
            col.create_index([("record_id", ASCENDING)], unique=True)
            for field in ("study", "site", "subject", "accession", "posted", "collected"):
                col.create_index([(field, ASCENDING)])
            existing_ids = _known_record_ids(col)
            log(f"START: v Mongo uz je {len(existing_ids)} reportu pro tuto studii.")
        else:
            # Load the existing ids in dry-run too, so only truly new reports
            # are listed.
            try:
                client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
                client.admin.command("ping")
                existing_ids = _known_record_ids(client[DB_NAME][COLLECTION])
                log(f"START: [dry-run] v Mongo je {len(existing_ids)} reportu.")
            except Exception as e:
                log(f"START: [dry-run] Mongo nedostupne ({e!r}), beru vse jako nove.")

        with sync_playwright() as p:
            context = p.chromium.launch_persistent_context(
                user_data_dir=PROFILE_DIR,
                headless=False,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--start-maximized",
                    "--disable-restore-session-state",
                    "--disable-session-crashed-bubble",
                ],
                no_viewport=True,
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            try:
                context.add_init_script(
                    "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()
                log("START: prohlizec spusten.")

                login(page)
                total, row_height = open_grid(page)
                if ARGS.limit:
                    total = min(total, ARGS.limit)

                new_cnt, upd_cnt, same_cnt, failed, stopped = _process_rows(
                    page, col, existing_ids, now, total, row_height)

                log(f"KONEC: nove={new_cnt}, update={upd_cnt}, beze zmeny={same_cnt}, "
                    f"chyby={len(failed)} {'(stop-at-known)' if stopped else '(projeto vse)'}.")
                if failed:
                    log(f"KONEC: SELHALY indexy: {failed}")
            finally:
                # Close the browser on failure as well as on success.
                context.close()
    finally:
        if client is not None:
            client.close()
if __name__ == "__main__":
    # Exit status for the caller: 0 on success, 1 when main() raised.
    exit_code = 0
    try:
        main()
    except Exception as e:
        log(f"FATAL: beh spadl: {e!r}")
        traceback.print_exc()
        exit_code = 1
    finally:
        # Wait for Enter before finishing, so the output stays readable when
        # the script runs in its own console window. EOFError means stdin is
        # not interactive; there is nothing to wait for then.
        try:
            input("\n[Enter] pro zavreni tohoto okna...")
        except EOFError:
            pass
    if exit_code:
        # Report the failure to schedulers / wrappers instead of exiting 0.
        raise SystemExit(exit_code)