# =============================================================================
# Name:        download_test_results_v1.4.py
# Version:     1.4
# Date:        2026-05-29
# Description: Downloads Test Results from xsp.labcorp.com for 2 studies
#              (36940, 35472), both report types (Standard + Microbiology),
#              across all sites. Waits for the AG Grid rows (.ag-row) to load;
#              an empty site ('No Data') is skipped.
#              Output: timestamped CSV files in the Source/ directory.
# Changes in v1.4: + RETRY at report level. During parallel runs the server
#              occasionally times out (page.goto 30 s, or wait_for_function
#              120 s on the grid). Previously the report was recorded as
#              failed right away. Now each report is attempted up to 2x
#              (MAX_ATTEMPTS=2). Most transient timeouts pass on the second
#              attempt. 5 s pause between attempts.
# Changes in v1.3: robust login (do NOT wait for networkidle, wait for the
#              Email/Password fields) + the window does not close on a crash
#              (try/except + input()).
# Changes in v1.2: + parallel runs via sharding (--shard N --of M).
#              One profile per shard.
# Changes in v1.1: + study 35472, + report type microbiology.
# =============================================================================
import argparse
import getpass
import os
import traceback
from datetime import datetime

from playwright.sync_api import sync_playwright

# --- arguments: sharding for parallel runs -----------------------------------
parser = argparse.ArgumentParser(description="Stahovani Test Results (XSP) s podporou shardingu.")
parser.add_argument("--shard", type=int, default=1, help="poradi tohoto shardu (1..of)")
parser.add_argument("--of", type=int, default=1, help="celkovy pocet shardu")
ARGS = parser.parse_args()

# Reject impossible shard coordinates up front. Without this check, --of 0
# crashes later with "slice step cannot be zero", and --shard 0 or
# --shard > --of silently selects the wrong subset of reports (possibly
# duplicating the work of another shard).
if ARGS.of < 1 or not 1 <= ARGS.shard <= ARGS.of:
    parser.error(
        f"--shard musi byt v rozsahu 1..--of (zadano --shard {ARGS.shard} --of {ARGS.of})"
    )

SHARD, OF = ARGS.shard, ARGS.of
# Log prefix identifying the shard; empty for a serial (single-shard) run.
TAG = f"[S{SHARD}/{OF}]" if OF > 1 else ""


def log(msg):
    """Print *msg* prefixed with the current time (HH:MM:SS) and the shard tag.

    Output is flushed immediately so that the logs of parallel shards show up
    in real time.
    """
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {TAG} {msg}", flush=True)


# --- credentials --------------------------------------------------------------
# The password must never be stored in source control. It is read from the
# XSP_PASSWORD environment variable; when that is not set, the user is
# prompted once at startup (input is not echoed).
# NOTE(review): a password was previously committed in this file - rotate it.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD") or getpass.getpass(f"Heslo pro XSP ({EMAIL}): ")

LOGIN_URL = "https://xsp.covance.com/"
OUT_DIR = r"U:\PythonProject\Janssen\Covance_UCO3001\Source"

# In a parallel run every shard MUST have its own profile (Chrome locks the
# profile directory -> two running instances cannot share one). A serial run
# (of=1) uses the original browser_profile.
_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile" if OF == 1 else f"browser_profile_{SHARD}")

# --- retry configuration ------------------------------------------------------
MAX_ATTEMPTS = 2     # 1st attempt + 1 retry per report
RETRY_BACKOFF_S = 5  # pause before a repeated attempt (s)

# Studies + their internal site numbers.
# 36940 = 77242113UCO3001 (UC) - source of the site list: download_equeries_report SITES
# 35472 = second study (MDD)
STUDIES = [
    {
        "study": "36940",
        "sites": [
            "930551", "930556", "930525", "930549", "930543", "930547",
            "930555", "930557", "930539", "930536", "930553", "930531",
        ],
    },
    {
        "study": "35472",
        "sites": [
            "898745", "898739", "898733", "898744", "898727",
        ],
    },
]

# Report types: tab slug in the URL + suffix used in the output file name.
REPORT_TYPES = [
    {"slug": "standard-test-results", "suffix": "standard"},
    {"slug": "microbiology", "suffix": "microbiology"},
]

# One entry per (study, site, report type) combination.
ALL_REPORTS = [
    {
        "site": sid,
        "study": st["study"],
        "type": rt["suffix"],
        "url": f"https://xsp.labcorp.com/sponsor/study/{st['study']}/test-results/{sid}/{rt['slug']}",
        "filename": f"sponsor-study-{st['study']}-test-results-{sid}-{rt['suffix']}.csv",
    }
    for st in STUDIES
    for sid in st["sites"]
    for rt in REPORT_TYPES
]

# This shard takes every OF-th report starting at index (SHARD - 1). That
# spreads the work evenly and no report is processed by two shards at once.
REPORTS = ALL_REPORTS[SHARD - 1::OF]


def login(page):
    """Log in through the XSP login page, unless a session is already active.

    Opens LOGIN_URL and waits up to 12 s for the Email field. If the field
    does not show up, the session stored in the persistent profile is assumed
    to be still valid and the login is skipped. Otherwise the email and
    password are submitted and the function waits for the post-login redirect.

    Args:
        page: Playwright page used for the whole run.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)

    # Do NOT wait for 'networkidle' - the login SPA never reaches it
    # (analytics/polling keep running), which used to end in a timeout and a
    # crashed process. Wait directly for the Email field instead. Any failure
    # here is deliberately treated as "already logged in"; real access is
    # verified by the first report download.
    try:
        page.get_by_label("Email").wait_for(state="visible", timeout=12000)
    except Exception:
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    log("LOGIN: zadavam email...")
    page.get_by_label("Email").fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    log("LOGIN: cekam na pole pro heslo...")
    page.get_by_label("Password").wait_for(state="visible", timeout=30000)

    log("LOGIN: zadavam heslo...")
    page.get_by_label("Password").fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        page.wait_for_url(
            lambda url: "code=" not in url or "xsp." in url,
            timeout=60000,
        )
    except Exception:
        # Best effort: a failed redirect wait is not fatal, the first report
        # download will show whether the login really worked.
        log("LOGIN: wait_for_url vyprsel, pokracuji (overim pristup pri 1. reportu).")

    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")


def download_report_once(page, report):
    """Make a single attempt to download one report as CSV.

    Navigates to the report URL, waits until the AG Grid shows either rows or
    the 'No Data' overlay, waits for the row count to settle, and then exports
    the grid through the "more_horiz" menu into OUT_DIR with a timestamped
    file name. A site without data is skipped (returns without a download).

    Args:
        page: Playwright page with an authenticated session.
        report: One entry of ALL_REPORTS (keys "url" and "filename" are used).

    Raises:
        Exception: Any Playwright error (timeout, navigation failure, ...) is
            propagated to the caller.
    """
    log("KROK 1/5: navigace na report URL...")
    page.goto(report["url"])
    log(f"KROK 1/5: stranka nactena ({page.url})")

    log("KROK 2/5: cekam na radky gridu (.ag-row) nebo prazdny grid ('No Data')...")
    # AG Grid rows are position-absolute (virtual rendering), so Playwright
    # does not consider them "visible" -> wait for presence in the DOM rather
    # than for visibility.
    # Empty site: AG Grid renders a no-rows overlay with the text "No Data" in
    # the wrapper .ag-overlay-no-rows-wrapper. NOTE: the class is NOT
    # -no-rows-center; besides, the page holds 2 overlays (one hidden) ->
    # check for the visible one (offsetParent != null). Detecting it avoids a
    # 120 s wait on a site without data.
    EMPTY_GRID_JS = """() => {
        if (document.querySelectorAll('div.ag-row').length > 0) return false;
        return [...document.querySelectorAll('.ag-overlay-no-rows-wrapper')]
            .some(e => e.offsetParent !== null);
    }"""
    page.wait_for_function(
        f"""() => document.querySelectorAll('div.ag-row').length > 0
            || ({EMPTY_GRID_JS})()""",
        timeout=120000,
    )
    if page.evaluate(EMPTY_GRID_JS):
        log("KROK 2/5: centrum bez dat ('No Data' overlay) — preskakuji export.")
        return

    log("KROK 2/5: radky se objevily, cekam na stabilizaci poctu...")
    prev = -1
    for i in range(20):  # at most ~40 s of stabilisation
        cnt = page.locator("div.ag-row").count()
        log(f" ...kontrola #{i+1}: {cnt} radku")
        # Stable = two consecutive checks returned the same non-zero count.
        if cnt == prev and cnt > 0:
            break
        prev = cnt
        page.wait_for_timeout(2000)
    page.wait_for_timeout(2000)  # safety buffer
    log(f"KROK 2/5: data stabilni ({prev} radku v gridu).")

    log("KROK 3/5: klikam na viditelne tri tecky (more_horiz)...")
    page.locator("ag-export button:visible", has_text="more_horiz").first.click()
    log("KROK 3/5: menu otevreno.")

    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    dest = os.path.join(OUT_DIR, f"{timestamp} {report['filename']}")

    log("KROK 4/5: klikam na 'Export to CSV' a cekam na stahovani...")
    with page.expect_download(timeout=60000) as dl:
        page.locator("mdl-menu-item:visible", has_text="Export to CSV").first.click()
    log("KROK 4/5: stahovani zachyceno, ukladam soubor...")
    dl.value.save_as(dest)
    log(f"KROK 5/5: HOTOVO -> {dest}")


def download_report(page, report):
    """Download one report, retrying up to MAX_ATTEMPTS times.

    Between attempts the function pauses for RETRY_BACKOFF_S seconds and
    navigates to about:blank so that the grid is rebuilt from scratch.

    Args:
        page: Playwright page with an authenticated session.
        report: One entry of ALL_REPORTS.

    Raises:
        Exception: The error of the last attempt, once all attempts failed.
    """
    log(f"=== Centrum {report['site']} / {report['type']} (studie {report['study']}) ===")
    for attempt in range(1, MAX_ATTEMPTS + 1):
        if attempt > 1:
            log(f"RETRY: {attempt}/{MAX_ATTEMPTS} pokus o centrum "
                f"{report['site']}/{report['type']} (po {RETRY_BACKOFF_S}s pauze)...")
            try:
                page.wait_for_timeout(RETRY_BACKOFF_S * 1000)
            except Exception as pause_err:
                # Best effort: a failed pause must not abort the retry, but it
                # should not vanish silently either.
                log(f"RETRY: pauza pred opakovanim selhala: {pause_err!r} — pokracuji.")

        try:
            download_report_once(page, report)
        except Exception as e:
            log(f"POKUS {attempt}/{MAX_ATTEMPTS} SELHAL: {e!r}")
            if attempt == MAX_ATTEMPTS:
                # Let it propagate - the per-report try in main() logs it and
                # moves on to the next report.
                raise
            # Before the next attempt leave the page so that the grid resets.
            try:
                page.goto("about:blank")
            except Exception as nav_err:
                log(f"RETRY: prechod na about:blank selhal: {nav_err!r} — pokracuji.")
        else:
            if attempt > 1:
                log(f"RETRY: pokus {attempt} uspesny.")
            return


def main():
    """Run this shard: start the browser, log in and download every report.

    A report that fails all its attempts is recorded and the run continues
    with the next one; a summary is logged at the end. The browser context is
    always closed, even when the login or the loop raises, so the persistent
    profile is shut down cleanly.
    """
    log(f"START: shard {SHARD}/{OF}, profil '{os.path.basename(PROFILE_DIR)}', "
        f"{len(REPORTS)}/{len(ALL_REPORTS)} reportu k zpracovani "
        f"(MAX_ATTEMPTS={MAX_ATTEMPTS}).")

    with sync_playwright() as p:
        context = p.chromium.launch_persistent_context(
            user_data_dir=PROFILE_DIR,
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--disable-restore-session-state",
                "--disable-session-crashed-bubble",
            ],
            no_viewport=True,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
            accept_downloads=True,
        )
        try:
            context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

            page = context.new_page()
            log("START: prohlizec spusten.")

            login(page)

            ok, failed = 0, []
            for idx, report in enumerate(REPORTS, 1):
                log(f">>> Report {idx}/{len(REPORTS)}")
                try:
                    download_report(page, report)
                    ok += 1
                except Exception as e:
                    failed.append(f"{report['site']}/{report['type']}")
                    log(f"CHYBA u centra {report['site']}/{report['type']} "
                        f"(vsechny {MAX_ATTEMPTS} pokusy selhaly): {e!r} — pokracuji dalsim.")

            log(f"KONEC: hotovo {ok}/{len(REPORTS)} reportu (shard {SHARD}/{OF}).")
            if failed:
                log(f"KONEC: SELHALA centra: {', '.join(failed)}")
        finally:
            # Close the context on every exit path. A failure while closing is
            # logged only, so it cannot mask the original error.
            try:
                context.close()
            except Exception as close_err:
                log(f"KONEC: zavreni prohlizece selhalo: {close_err!r}")


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        log(f"FATAL: beh shardu spadl: {e!r}")
        traceback.print_exc()
    finally:
        # Keep the console window open so that the log/error stays readable.
        try:
            input("\n[Enter] pro zavreni tohoto okna...")
        except EOFError:
            pass