Files
janssen/Covance/download_test_results_v1.4.py
2026-06-09 08:22:49 +02:00

261 lines
11 KiB
Python

# =============================================================================
# Název: download_test_results_v1.4.py
# Verze: 1.4
# Datum: 2026-05-29
# Popis: Stahuje Test Results ze xsp.labcorp.com pro 2 studie (36940, 35472),
# oba typy reportu (Standard + Microbiology), pres vsechna centra.
# Ceka na nacteni AG Grid radku (.ag-row); prazdne centrum ('No Data')
# preskoci. Vystup: timestampovane CSV do adresare Source/.
# Zmeny v1.4: + RETRY na urovni reportu. Pri paralelnim behu server obcas
# timeoutuje (page.goto 30 s, nebo wait_for_function 120 s na grid).
# Driv byl report rovnou zaznamenan jako selhany. Ted se kazdy
# report zkusi az 2x (MAX_ATTEMPTS=2). Vetsina prechodnych
# timeoutu projde napodruhe. Mezi pokusy 5 s pauza.
# Zmeny v1.3: robustni login (NEcekat na networkidle, cekat na pole
# Email/Password) + okno se pri padu nezavre (try/except + input()).
# Zmeny v1.2: + paralelni beh pres sharding (--shard N --of M). Profil per shard.
# Zmeny v1.1: + studie 35472, + report typ microbiology.
# =============================================================================
from playwright.sync_api import sync_playwright
from datetime import datetime
import argparse
import traceback
import os
# --- arguments: sharding for parallel runs -----------------------------------
# Several instances of this script can run side by side; each instance is told
# which slice of the report list it owns via "--shard N --of M".
parser = argparse.ArgumentParser(description="Stahovani Test Results (XSP) s podporou shardingu.")
parser.add_argument("--shard", type=int, default=1, help="poradi tohoto shardu (1..of)")
parser.add_argument("--of", type=int, default=1, help="celkovy pocet shardu")
ARGS = parser.parse_args()
SHARD, OF = ARGS.shard, ARGS.of
# Reject impossible shard coordinates up front. SHARD - 1 and OF are used later
# as the start and step of a slice over the report list: a step of 0 raises
# ValueError, and a start outside 0..OF-1 would silently select the wrong (or
# an empty) set of reports.
if OF < 1:
    parser.error(f"--of musi byt >= 1 (zadano {OF})")
if not 1 <= SHARD <= OF:
    parser.error(f"--shard musi byt v rozsahu 1..{OF} (zadano {SHARD})")
# Log prefix identifying this shard; empty for a serial (single-shard) run.
TAG = f"[S{SHARD}/{OF}]" if OF > 1 else ""
def log(msg):
    """Print *msg* to stdout prefixed with the current time and the shard tag.

    The line is flushed immediately so that output appears right away.
    """
    stamp = datetime.now().strftime('%H:%M:%S')
    line = f"[{stamp}] {TAG} {msg}"
    print(line, flush=True)
# Credentials are read from the environment instead of being stored in the
# script. Set XSP_EMAIL and XSP_PASSWORD before starting a run. Both stay
# empty when unset; login() only types them when the login form is shown.
# NOTE(review): any credentials that were previously hard-coded here should be
# treated as exposed and rotated.
EMAIL = os.environ.get("XSP_EMAIL", "")
PASSWORD = os.environ.get("XSP_PASSWORD", "")
LOGIN_URL = "https://xsp.covance.com/"
# Directory that receives the downloaded CSV files; XSP_OUT_DIR overrides the
# default location.
OUT_DIR = os.environ.get("XSP_OUT_DIR", r"U:\PythonProject\Janssen\Covance\Source")
# In a parallel run every shard MUST have its own browser profile: Chrome locks
# the profile directory, so two running instances cannot share one. A serial
# run (of=1) keeps using the original "browser_profile" directory.
_BASE = os.path.dirname(os.path.abspath(__file__))
if OF == 1:
    PROFILE_DIR = os.path.join(_BASE, "browser_profile")
else:
    PROFILE_DIR = os.path.join(_BASE, f"browser_profile_{SHARD}")
# --- retry configuration ------------------------------------------------------
MAX_ATTEMPTS = 2  # first attempt + 1 retry per report
RETRY_BACKOFF_S = 5  # pause before a repeated attempt (seconds)
# Studies and the internal site (centre) numbers that belong to each of them.
# 36940 = 77242113UCO3001 (UC) — site list taken from download_equeries_report SITES
# 35472 = second study (MDD)
STUDIES = [
    dict(
        study="36940",
        sites=[
            "930551", "930556", "930525", "930549",
            "930543", "930547", "930555", "930557",
            "930539", "930536", "930553", "930531",
        ],
    ),
    dict(
        study="35472",
        sites=["898745", "898739", "898733", "898744", "898727"],
    ),
]
# Report types: the tab segment used in the URL + the suffix used in file names.
REPORT_TYPES = [
    dict(slug="standard-test-results", suffix="standard"),
    dict(slug="microbiology", suffix="microbiology"),
]
def _expand_reports(studies, report_types):
    """Return one report descriptor per (study, site, report type) combination.

    The order is study -> site -> report type, i.e. the report types of one
    site are adjacent in the result.
    """
    reports = []
    for study_cfg in studies:
        study_no = study_cfg["study"]
        for site_no in study_cfg["sites"]:
            for kind in report_types:
                reports.append({
                    "site": site_no,
                    "study": study_no,
                    "type": kind["suffix"],
                    "url": f"https://xsp.labcorp.com/sponsor/study/{study_no}/test-results/{site_no}/{kind['slug']}",
                    "filename": f"sponsor-study-{study_no}-test-results-{site_no}-{kind['suffix']}.csv",
                })
    return reports


ALL_REPORTS = _expand_reports(STUDIES, REPORT_TYPES)
# This shard takes every OF-th report starting at index (SHARD - 1). The work is
# spread evenly and no report is handled by two shards at the same time.
REPORTS = ALL_REPORTS[SHARD - 1::OF]
def login(page):
    """Sign in to the portal unless the saved browser session is still active.

    Opens LOGIN_URL and waits up to 12 s for the Email field. If the field
    does not appear, the session is assumed to be active and the function
    returns without logging in. Otherwise it submits EMAIL, then PASSWORD,
    and waits for the post-login redirect.

    Args:
        page: Playwright page used for the whole run.

    Raises:
        RuntimeError: the login form is shown but EMAIL or PASSWORD is empty.
        Exception: errors from the form steps themselves (for example the
            Password field not appearing within 30 s) are not caught here.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)
    # Do NOT wait for 'networkidle' — the login SPA (labcorp/OKTA) never reaches
    # it (analytics/polling keep running), which used to end in a timeout and a
    # crashed process. Wait for the Email field directly instead; if it does not
    # show up within 12 s the session is treated as active (already signed in).
    email_field = page.get_by_label("Email")
    try:
        email_field.wait_for(state="visible", timeout=12000)
    except Exception:
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return
    # The form is on screen, so credentials are really needed now. Fail with a
    # clear message instead of typing empty values and timing out later on an
    # unrelated wait.
    if not EMAIL or not PASSWORD:
        raise RuntimeError(
            "LOGIN: EMAIL/PASSWORD nejsou nastaveny, prihlasovaci formular nelze vyplnit."
        )
    log("LOGIN: zadavam email...")
    email_field.fill(EMAIL)
    page.get_by_role("button", name="Next").click()
    log("LOGIN: cekam na pole pro heslo...")
    password_field = page.get_by_label("Password")
    password_field.wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    password_field.fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()
    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        # Satisfied once the URL no longer carries "code=" or contains "xsp.".
        page.wait_for_url(
            lambda url: "code=" not in url or "xsp." in url,
            timeout=60000,
        )
    except Exception:
        # Best effort: access is verified implicitly by the first report.
        log("LOGIN: wait_for_url vyprsel, pokracuji (overim pristup pri 1. reportu).")
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
def download_report_once(page, report):
    """Make a single attempt to export one report to CSV.

    Navigates to report["url"], waits for the grid to show rows or the
    'No Data' overlay, and for a non-empty grid triggers "Export to CSV" and
    saves the download under OUT_DIR with a timestamp prefix. Returns without
    exporting when the grid is empty. Any failure is raised to the caller.
    """
    log("KROK 1/5: navigace na report URL...")
    page.goto(report["url"])
    log(f"KROK 1/5: stranka nactena ({page.url})")

    log("KROK 2/5: cekam na radky gridu (.ag-row) nebo prazdny grid ('No Data')...")
    # AG Grid rows are position-absolute (virtual rendering), so Playwright does
    # not consider them "visible" -> wait for presence in the DOM instead.
    # Empty site: AG Grid renders a no-rows overlay with the text "No Data" in
    # the .ag-overlay-no-rows-wrapper element. NOTE: the class is NOT
    # -no-rows-center; the page also holds 2 overlays (one hidden) -> test the
    # visible one (offsetParent != null). Detecting it avoids waiting the full
    # 120 s on a site without data.
    empty_grid_js = (
        "() => {\n"
        "if (document.querySelectorAll('div.ag-row').length > 0) return false;\n"
        "return [...document.querySelectorAll('.ag-overlay-no-rows-wrapper')]\n"
        ".some(e => e.offsetParent !== null);\n"
        "}"
    )
    grid_ready_js = (
        "() => document.querySelectorAll('div.ag-row').length > 0\n"
        f"|| ({empty_grid_js})()"
    )
    page.wait_for_function(grid_ready_js, timeout=120000)

    grid_is_empty = page.evaluate(empty_grid_js)
    if grid_is_empty:
        log("KROK 2/5: centrum bez dat ('No Data' overlay) — preskakuji export.")
        return

    log("KROK 2/5: radky se objevily, cekam na stabilizaci poctu...")
    last_count = -1
    for check_no in range(1, 21):  # at most ~40 s of stabilisation
        row_count = page.locator("div.ag-row").count()
        log(f" ...kontrola #{check_no}: {row_count} radku")
        # Stable = the same positive row count on two consecutive checks.
        if row_count > 0 and row_count == last_count:
            break
        last_count = row_count
        page.wait_for_timeout(2000)
    page.wait_for_timeout(2000)  # extra buffer
    log(f"KROK 2/5: data stabilni ({last_count} radku v gridu).")

    log("KROK 3/5: klikam na viditelne tri tecky (more_horiz)...")
    menu_button = page.locator("ag-export button:visible", has_text="more_horiz").first
    menu_button.click()
    log("KROK 3/5: menu otevreno.")

    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    target_name = f"{timestamp} {report['filename']}"
    dest = os.path.join(OUT_DIR, target_name)

    log("KROK 4/5: klikam na 'Export to CSV' a cekam na stahovani...")
    with page.expect_download(timeout=60000) as download_info:
        export_item = page.locator("mdl-menu-item:visible", has_text="Export to CSV").first
        export_item.click()
    log("KROK 4/5: stahovani zachyceno, ukladam soubor...")
    download_info.value.save_as(dest)
    log(f"KROK 5/5: HOTOVO -> {dest}")
def download_report(page, report):
    """Download one report, retrying failed attempts.

    Calls download_report_once up to MAX_ATTEMPTS times. Before each retry it
    pauses RETRY_BACKOFF_S seconds; after a failed attempt that will be
    retried it navigates to about:blank so the grid starts from a clean page.

    Args:
        page: Playwright page shared by all reports of this run.
        report: report descriptor with "site", "type", "study", "url" and
            "filename" keys.

    Raises:
        Exception: the error of the last attempt, once all attempts failed.
    """
    site_label = f"{report['site']}/{report['type']}"
    log(f"=== Centrum {report['site']} / {report['type']} (studie {report['study']}) ===")
    for attempt in range(1, MAX_ATTEMPTS + 1):
        if attempt > 1:
            log(f"RETRY: {attempt}/{MAX_ATTEMPTS} pokus o centrum "
                f"{site_label} (po {RETRY_BACKOFF_S}s pauze)...")
            try:
                page.wait_for_timeout(RETRY_BACKOFF_S * 1000)
            except Exception as pause_err:
                # Best effort: a failed pause must not prevent the retry itself.
                log(f"RETRY: pauza pred opakovanim selhala: {pause_err!r}")
        try:
            download_report_once(page, report)
        except Exception as e:
            log(f"POKUS {attempt}/{MAX_ATTEMPTS} SELHAL: {e!r}")
            if attempt == MAX_ATTEMPTS:
                # Let it propagate — the caller's per-report handler logs the
                # failure and moves on to the next report.
                raise
            # Leave the page before the next attempt so the grid is reset.
            try:
                page.goto("about:blank")
            except Exception as nav_err:
                log(f"RETRY: navigace na about:blank selhala: {nav_err!r}")
        else:
            if attempt > 1:
                log(f"RETRY: pokus {attempt} uspesny.")
            return
def main():
    """Run this shard: start the browser, log in and download every report.

    Launches a persistent Chromium context on PROFILE_DIR, logs in, then
    processes REPORTS one by one. A report that fails all of its attempts is
    recorded and the loop continues with the next one. A summary is logged
    at the end.

    Raises:
        Exception: failures outside the per-report loop (browser launch,
            login) propagate to the caller; the browser context is still
            closed first.
    """
    log(f"START: shard {SHARD}/{OF}, profil '{os.path.basename(PROFILE_DIR)}', "
        f"{len(REPORTS)}/{len(ALL_REPORTS)} reportu k zpracovani "
        f"(MAX_ATTEMPTS={MAX_ATTEMPTS}).")
    with sync_playwright() as p:
        context = p.chromium.launch_persistent_context(
            user_data_dir=PROFILE_DIR,
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--disable-restore-session-state",
                "--disable-session-crashed-bubble",
            ],
            no_viewport=True,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
            accept_downloads=True,
        )
        # try/finally so the context is closed even when login() or anything
        # else before the end of the run raises.
        try:
            context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            page = context.new_page()
            log("START: prohlizec spusten.")
            login(page)
            ok, failed = 0, []
            for idx, report in enumerate(REPORTS, 1):
                log(f">>> Report {idx}/{len(REPORTS)}")
                try:
                    download_report(page, report)
                    ok += 1
                except Exception as e:
                    # One failed report must not stop the rest of the shard.
                    failed.append(f"{report['site']}/{report['type']}")
                    log(f"CHYBA u centra {report['site']}/{report['type']} "
                        f"(vsechny {MAX_ATTEMPTS} pokusy selhaly): {e!r} — pokracuji dalsim.")
            log(f"KONEC: hotovo {ok}/{len(REPORTS)} reportu (shard {SHARD}/{OF}).")
            if failed:
                log(f"KONEC: SELHALA centra: {', '.join(failed)}")
        finally:
            try:
                context.close()
            except Exception as close_err:
                # Do not let a close failure hide the error that got us here.
                log(f"KONEC: zavreni prohlizece selhalo: {close_err!r}")
def _wait_for_enter():
    """Block until the user presses Enter so the console window stays open."""
    try:
        input("\n[Enter] pro zavreni tohoto okna...")
    except EOFError:
        # stdin is closed or exhausted — there is nothing to wait for.
        pass


if __name__ == "__main__":
    try:
        main()
    except Exception as fatal:
        log(f"FATAL: beh shardu spadl: {fatal!r}")
        traceback.print_exc()
    finally:
        # Keep the window open so the log / error stays readable.
        _wait_for_enter()