# =============================================================================
# Name:     download_test_results_v1.2.py
# Version:  1.2
# Date:     2026-05-29
# Purpose:  Downloads Test Results from xsp.labcorp.com for 2 studies
#           (36940, 35472), both report types (Standard + Microbiology),
#           across all sites. Waits for the AG Grid rows (.ag-row) to load;
#           an empty site ('No Data') is skipped.
#           Output: timestamped CSV files in the Source/ directory.
# Changes v1.2: + parallel run via sharding (--shard N --of M). Each shard
#           takes its share of the reports (ALL_REPORTS[shard-1::of]) and
#           uses its own profile browser_profile_{shard}, so 4 processes can
#           run concurrently (each one logs in to its own profile by itself).
#           Without arguments = the original serial run on the profile
#           browser_profile.
# Changes v1.1: + study 35472, + report type microbiology (previously only
#           36940/standard).
# =============================================================================
import argparse
import os
from datetime import datetime

from playwright.sync_api import sync_playwright

# --- arguments: sharding for parallel runs -----------------------------------
parser = argparse.ArgumentParser(description="Stahovani Test Results (XSP) s podporou shardingu.")
parser.add_argument("--shard", type=int, default=1, help="poradi tohoto shardu (1..of)")
parser.add_argument("--of", type=int, default=1, help="celkovy pocet shardu")
ARGS = parser.parse_args()

# Reject impossible shard layouts up front. Without this check:
#   --of 0            -> ValueError ("slice step cannot be zero") at the slice
#   --shard 0         -> index -1, i.e. silently only the LAST report
#   --shard > --of    -> silently repeats work that belongs to another shard
if ARGS.of < 1:
    parser.error(f"--of musi byt >= 1 (zadano {ARGS.of})")
if not 1 <= ARGS.shard <= ARGS.of:
    parser.error(f"--shard musi byt v rozsahu 1..{ARGS.of} (zadano {ARGS.shard})")

SHARD, OF = ARGS.shard, ARGS.of
# Log prefix identifying the shard; empty for a serial (single-shard) run.
TAG = f"[S{SHARD}/{OF}]" if OF > 1 else ""


def log(msg: str) -> None:
    """Print *msg* prefixed with the current HH:MM:SS time and the shard tag.

    Output is flushed immediately so that interleaved output of several
    shard processes shows up without buffering delays.
    """
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {TAG} {msg}", flush=True)


# SECURITY(review): the credentials below are hard-coded in source control.
# The environment variables XSP_EMAIL / XSP_PASSWORD now take precedence; the
# literals are kept only as a backward-compatible fallback and should be
# removed (and the password rotated) once the environment is configured.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
# NOTE(review): login goes through the covance.com host while the report URLs
# in this file use xsp.labcorp.com — presumably one redirects to the other;
# confirm.
LOGIN_URL = "https://xsp.covance.com/"
OUT_DIR = r"U:\PythonProject\Janssen\Covance_UCO3001\Source"

# In a parallel run every shard MUST have its own profile (Chrome locks the
# profile directory -> two running instances cannot share one). A serial run
# (of=1) uses the original browser_profile.
_BASE = os.path.dirname(os.path.abspath(__file__))
if OF == 1:
    # Serial run: keep using the original shared profile directory.
    PROFILE_DIR = os.path.join(_BASE, "browser_profile")
else:
    # Parallel run: one profile directory per shard.
    PROFILE_DIR = os.path.join(_BASE, f"browser_profile_{SHARD}")

# Studies and their internal site numbers.
#   36940 = 77242113UCO3001 (UC) — site list taken from
#           download_equeries_report SITES
#   35472 = second study (MDD)
STUDIES = [
    {
        "study": "36940",
        "sites": [
            "930551",
            "930556",
            "930525",
            "930549",
            "930543",
            "930547",
            "930555",
            "930557",
            "930539",
            "930536",
            "930553",
            "930531",
        ],
    },
    {
        "study": "35472",
        "sites": [
            "898745",
            "898739",
            "898733",
            "898744",
            "898727",
        ],
    },
]

# Report types: the tab segment used in the URL + the suffix used in the
# output file name.
REPORT_TYPES = [
    {"slug": "standard-test-results", "suffix": "standard"},
    {"slug": "microbiology", "suffix": "microbiology"},
]


def _build_reports(studies, report_types):
    """Return one report descriptor per (study, site, report type).

    Descriptors are emitted study by study, site by site, and for each site
    in the order given by *report_types*. Each descriptor carries the site,
    the study, the report type suffix, the report URL and the CSV file name.
    """
    reports = []
    for study_entry in studies:
        study_id = study_entry["study"]
        for site_id in study_entry["sites"]:
            for report_type in report_types:
                reports.append(
                    {
                        "site": site_id,
                        "study": study_id,
                        "type": report_type["suffix"],
                        "url": f"https://xsp.labcorp.com/sponsor/study/{study_id}/test-results/{site_id}/{report_type['slug']}",
                        "filename": f"sponsor-study-{study_id}-test-results-{site_id}-{report_type['suffix']}.csv",
                    }
                )
    return reports


ALL_REPORTS = _build_reports(STUDIES, REPORT_TYPES)

# This shard takes every of-th report starting at index (shard-1). That splits
# the work evenly and no report is handled by two shards at the same time.
REPORTS = ALL_REPORTS[SHARD - 1::OF]


def login(page) -> None:
    """Log in to XSP unless the browser profile already has a session.

    Opens LOGIN_URL and waits for the network to go idle. If no "Email"
    field is visible at that point the session is treated as already active
    and the function returns. Otherwise it submits EMAIL ("Next" button),
    then PASSWORD ("Verify" button), and waits for the post-login redirect
    to finish.

    Args:
        page: Playwright page used for the whole run.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)
    page.wait_for_load_state("networkidle")

    if not page.get_by_label("Email").is_visible():
        log(f"LOGIN: session uz aktivni, prihlaseni preskoceno ({page.url})")
        return

    log("LOGIN: zadavam email...")
    page.get_by_label("Email").fill(EMAIL)
    page.get_by_role("button", name="Next").click()
    page.wait_for_load_state("networkidle")

    log("LOGIN: zadavam heslo...")
    page.get_by_label("Password").fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")
    # Wait until the URL no longer contains "code=" (presumably the
    # authorization-code redirect of the identity provider — confirm).
    page.wait_for_url(lambda url: "code=" not in url, timeout=60000)
    page.wait_for_load_state("networkidle", timeout=60000)
    page.wait_for_timeout(2000)
    log(f"LOGIN: prihlaseni OK ({page.url})")


def download_report(page, report: dict) -> None:
    """Open one report page and export its grid to a timestamped CSV.

    Steps: navigate to the report URL, wait until the AG Grid shows rows or
    its visible "No Data" overlay, wait for the row count to settle, open
    the three-dots menu, click "Export to CSV" and save the download into
    OUT_DIR. A site without data is skipped (nothing is exported).

    Args:
        page: Playwright page with an authenticated session.
        report: Descriptor with the keys "site", "study", "type", "url"
            and "filename".

    Raises:
        Whatever Playwright raises on a timeout or a missing element; the
        caller is expected to catch it per report.
    """
    log(f"=== Centrum {report['site']} / {report['type']} (studie {report['study']}) ===")

    log("KROK 1/5: navigace na report URL...")
    page.goto(report["url"])
    log(f"KROK 1/5: stranka nactena ({page.url})")

    # The grid is an AG Grid. The data is loaded as soon as rows appear in
    # the grid (.ag-row goes from 0 -> N). Wait for the first row and then
    # for the row count to stabilise (guards against a partial render).
    log("KROK 2/5: cekam na radky gridu (.ag-row) nebo prazdny grid ('No Data')...")
    # AG Grid rows are position-absolute (virtual rendering), so Playwright
    # does not consider them "visible" -> wait for presence in the DOM, not
    # for visibility.
    # Empty site: AG Grid renders a no-rows overlay with the text "No Data"
    # inside .ag-overlay-no-rows-wrapper. NOTE: the class is NOT
    # -no-rows-center; the page also contains 2 overlays (one hidden) ->
    # check for a visible one (offsetParent != null). Detecting this avoids
    # waiting the full 120 s on a site that has no data.
    EMPTY_GRID_JS = """() => {
        if (document.querySelectorAll('div.ag-row').length > 0) return false;
        return [...document.querySelectorAll('.ag-overlay-no-rows-wrapper')]
            .some(e => e.offsetParent !== null);
    }"""
    page.wait_for_function(
        f"""() => document.querySelectorAll('div.ag-row').length > 0 || ({EMPTY_GRID_JS})()""",
        timeout=120000,
    )

    if page.evaluate(EMPTY_GRID_JS):
        log("KROK 2/5: centrum bez dat ('No Data' overlay) — preskakuji export.")
        return

    log("KROK 2/5: radky se objevily, cekam na stabilizaci poctu...")
    max_checks = 20  # 2 s apart -> at most ~40 s of stabilisation
    prev = -1
    stable = False
    for i in range(max_checks):
        cnt = page.locator("div.ag-row").count()
        log(f" ...kontrola #{i+1}: {cnt} radku")
        if cnt == prev and cnt > 0:
            # Two consecutive identical non-zero counts -> treat as settled.
            stable = True
            break
        prev = cnt
        page.wait_for_timeout(2000)
    page.wait_for_timeout(2000)  # safety buffer before touching the menu
    if stable:
        log(f"KROK 2/5: data stabilni ({prev} radku v gridu).")
    else:
        # Previously this case was reported as "stable" as well; make it
        # visible in the log but still attempt the export (best effort).
        log(f"KROK 2/5: POZOR — pocet radku se neustalil ani po {max_checks} kontrolach, "
            f"pokracuji s {prev} radky.")

    # Three-dots menu: the button exists twice on the page (one hidden), so
    # click the VISIBLE more_horiz button.
    log("KROK 3/5: klikam na viditelne tri tecky (more_horiz)...")
    page.locator("ag-export button:visible", has_text="more_horiz").first.click()
    log("KROK 3/5: menu otevreno.")

    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    dest = os.path.join(OUT_DIR, f"{timestamp} {report['filename']}")

    log("KROK 4/5: klikam na 'Export to CSV' a cekam na stahovani...")
    with page.expect_download(timeout=60000) as dl:
        # "Export to CSV" exists twice in the DOM (one hidden) -> click the
        # VISIBLE one.
        page.locator("mdl-menu-item:visible", has_text="Export to CSV").first.click()
    log("KROK 4/5: stahovani zachyceno, ukladam soubor...")
    dl.value.save_as(dest)
    log(f"KROK 5/5: HOTOVO -> {dest}")


def main() -> None:
    """Launch the browser, log in and download every report of this shard.

    A failure of a single report is logged and the run continues with the
    next one; the failed site/type pairs are listed at the end. The browser
    context is closed even when login or the loop raises, so the persistent
    profile directory is not left held by a half-finished run.
    """
    log(f"START: shard {SHARD}/{OF}, profil '{os.path.basename(PROFILE_DIR)}', "
        f"{len(REPORTS)}/{len(ALL_REPORTS)} reportu k zpracovani.")

    with sync_playwright() as p:
        context = p.chromium.launch_persistent_context(
            user_data_dir=PROFILE_DIR,
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--disable-restore-session-state",
                "--disable-session-crashed-bubble",
            ],
            no_viewport=True,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
            accept_downloads=True,
        )
        try:
            # Make navigator.webdriver read as undefined on every page.
            context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            page = context.new_page()
            log("START: prohlizec spusten.")

            login(page)

            ok, failed = 0, []
            for idx, report in enumerate(REPORTS, 1):
                log(f">>> Report {idx}/{len(REPORTS)}")
                try:
                    download_report(page, report)
                except Exception as e:
                    # Per-report boundary: record the failure and carry on.
                    failed.append(f"{report['site']}/{report['type']}")
                    log(f"CHYBA u centra {report['site']}/{report['type']}: {e!r} — pokracuji dalsim.")
                else:
                    ok += 1

            log(f"KONEC: hotovo {ok}/{len(REPORTS)} reportu (shard {SHARD}/{OF}).")
            if failed:
                log(f"KONEC: SELHALA centra: {', '.join(failed)}")
        finally:
            context.close()


if __name__ == "__main__":
    main()