261 lines
11 KiB
Python
261 lines
11 KiB
Python
# =============================================================================
|
|
# Název: download_test_results_v1.4.py
|
|
# Verze: 1.4
|
|
# Datum: 2026-05-29
|
|
# Popis: Stahuje Test Results ze xsp.labcorp.com pro 2 studie (36940, 35472),
|
|
# oba typy reportu (Standard + Microbiology), pres vsechna centra.
|
|
# Ceka na nacteni AG Grid radku (.ag-row); prazdne centrum ('No Data')
|
|
# preskoci. Vystup: timestampovane CSV do adresare Source/.
|
|
# Zmeny v1.4: + RETRY na urovni reportu. Pri paralelnim behu server obcas
|
|
# timeoutuje (page.goto 30 s, nebo wait_for_function 120 s na grid).
|
|
# Driv byl report rovnou zaznamenan jako selhany. Ted se kazdy
|
|
# report zkusi az 2x (MAX_ATTEMPTS=2). Vetsina prechodnych
|
|
# timeoutu projde napodruhe. Mezi pokusy 5 s pauza.
|
|
# Zmeny v1.3: robustni login (NEcekat na networkidle, cekat na pole
|
|
# Email/Password) + okno se pri padu nezavre (try/except + input()).
|
|
# Zmeny v1.2: + paralelni beh pres sharding (--shard N --of M). Profil per shard.
|
|
# Zmeny v1.1: + studie 35472, + report typ microbiology.
|
|
# =============================================================================
|
|
from playwright.sync_api import sync_playwright
|
|
from datetime import datetime
|
|
import argparse
|
|
import traceback
|
|
import os
|
|
|
|
# --- arguments: sharding for parallel runs ------------------------------------
parser = argparse.ArgumentParser(description="Stahovani Test Results (XSP) s podporou shardingu.")
parser.add_argument("--shard", type=int, default=1, help="poradi tohoto shardu (1..of)")
parser.add_argument("--of", type=int, default=1, help="celkovy pocet shardu")
ARGS = parser.parse_args()

# Validate before anything else uses the values. The report list is later
# sliced as ALL_REPORTS[SHARD - 1::OF], so:
#   * --of 0 would raise "slice step cannot be zero",
#   * a negative --of would walk the list backwards,
#   * --shard 0 would start at index -1 and process only the last report,
#   * --shard > --of would repeat work that belongs to another shard.
# parser.error() prints the usage text and exits with status 2.
if ARGS.of < 1:
    parser.error(f"--of musi byt >= 1 (zadano {ARGS.of})")
if not 1 <= ARGS.shard <= ARGS.of:
    parser.error(f"--shard musi byt v rozsahu 1..{ARGS.of} (zadano {ARGS.shard})")

SHARD, OF = ARGS.shard, ARGS.of
# Log prefix identifying the shard; empty for a serial (single-shard) run.
TAG = f"[S{SHARD}/{OF}]" if OF > 1 else ""
|
|
|
|
|
|
def log(msg):
    """Print *msg* to stdout prefixed with the current time and the shard tag.

    The line has the form ``[HH:MM:SS] <TAG> <msg>`` and is flushed
    immediately.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    line = f"[{stamp}] {TAG} {msg}"
    print(line, flush=True)
|
|
|
|
# Credentials typed into the XSP login form.
# NOTE(review): the e-mail and password literals below are stored in plain
# text in the source file. The XSP_EMAIL / XSP_PASSWORD environment variables
# now take precedence; the literals remain only as a fallback so existing
# runs keep working. Once every runner sets the variables, delete the
# literals and rotate the password.
EMAIL = os.environ.get("XSP_EMAIL") or "vbuzalka@its.jnj.com"
PASSWORD = os.environ.get("XSP_PASSWORD") or "%zT3Wqfc9)cWua5"
LOGIN_URL = "https://xsp.covance.com/"
OUT_DIR = r"U:\PythonProject\Janssen\Covance\Source"

# In a parallel run every shard MUST have its own profile (Chrome locks the
# profile directory -> two running instances cannot share one). A serial run
# (of=1) uses the original browser_profile directory.
_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile" if OF == 1 else f"browser_profile_{SHARD}")
|
|
|
|
# --- retry configuration ------------------------------------------------------
# Total number of tries per report: the first attempt plus one retry.
MAX_ATTEMPTS: int = 2
# Pause before a repeated attempt, in seconds.
RETRY_BACKOFF_S: int = 5
|
|
|
|
# Studies and their internal site numbers.
# 36940 = 77242113UCO3001 (UC) — site list taken from SITES in download_equeries_report
# 35472 = second study (MDD)
STUDIES = [
    {
        "study": "36940",
        "sites": [
            "930551",
            "930556",
            "930525",
            "930549",
            "930543",
            "930547",
            "930555",
            "930557",
            "930539",
            "930536",
            "930553",
            "930531",
        ],
    },
    {
        "study": "35472",
        "sites": [
            "898745",
            "898739",
            "898733",
            "898744",
            "898727",
        ],
    },
]

# Report types: "slug" is the tab segment of the URL, "suffix" goes into the
# output file name.
REPORT_TYPES = [
    {"slug": "standard-test-results", "suffix": "standard"},
    {"slug": "microbiology", "suffix": "microbiology"},
]


def _report_entry(study_id, site_id, report_type):
    """Build the descriptor dict for one (study, site, report type) combination."""
    slug = report_type["slug"]
    suffix = report_type["suffix"]
    return {
        "site": site_id,
        "study": study_id,
        "type": suffix,
        "url": f"https://xsp.labcorp.com/sponsor/study/{study_id}/test-results/{site_id}/{slug}",
        "filename": f"sponsor-study-{study_id}-test-results-{site_id}-{suffix}.csv",
    }


# One entry per study x site x report type, ordered study -> site -> type.
ALL_REPORTS = [
    _report_entry(study_cfg["study"], site_id, report_type)
    for study_cfg in STUDIES
    for site_id in study_cfg["sites"]
    for report_type in REPORT_TYPES
]
|
|
|
|
# This shard takes every OF-th report starting at index SHARD - 1. For
# 1 <= SHARD <= OF this splits the work evenly and no report is handled by
# two shards at once.
REPORTS = ALL_REPORTS[slice(SHARD - 1, None, OF)]
|
|
|
|
|
|
def login(page):
    """Sign in through the XSP login form unless a session already exists.

    Opens ``LOGIN_URL`` and waits up to 12 s for the "Email" field. If that
    wait raises, the session is assumed to be active and the function
    returns without logging in. Otherwise it fills in the e-mail, clicks
    "Next", waits up to 30 s for the "Password" field, fills it in, clicks
    "Verify", and waits up to 60 s for the post-login redirect. A failure of
    that last wait is only logged; access is effectively verified when the
    first report is opened.

    Errors from the e-mail/password steps themselves are not caught here and
    propagate to the caller.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)

    # Do NOT wait for 'networkidle' — the login SPA (labcorp/OKTA) never
    # reaches it (analytics/polling keeps running), which used to end in a
    # timeout and a crashed process. Wait for the Email field directly; if it
    # does not show up within 12 s, treat the session as already logged in.
    try:
        email_field = page.get_by_label("Email")
        email_field.wait_for(state="visible", timeout=12000)
    except Exception:
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    log("LOGIN: zadavam email...")
    email_field.fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    log("LOGIN: cekam na pole pro heslo...")
    password_field = page.get_by_label("Password")
    password_field.wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    password_field.fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")

    def _redirect_finished(url):
        # Keep waiting only while the URL carries "code=" and does not yet
        # contain "xsp.".
        # NOTE(review): presumably "code=" marks the auth callback URL — confirm.
        still_on_callback = "code=" in url and "xsp." not in url
        return not still_on_callback

    try:
        page.wait_for_url(_redirect_finished, timeout=60000)
    except Exception:
        log("LOGIN: wait_for_url vyprsel, pokracuji (overim pristup pri 1. reportu).")

    # Short fixed pause after the redirect before the URL is reported.
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
|
|
|
|
|
|
def download_report_once(page, report):
    """Make one attempt to export a report to CSV; any failure raises.

    Steps:
      1. Navigate to ``report["url"]``.
      2. Wait (up to 120 s) until the grid has rows or shows the visible
         'No Data' overlay. An empty grid ends the function early without an
         export. Otherwise poll the row count until two consecutive checks
         agree (at most 20 checks, 2 s apart).
      3. Open the export menu (the visible "more_horiz" button).
      4. Click "Export to CSV" and wait (up to 60 s) for the download.
      5. Save the file into ``OUT_DIR`` as ``<timestamp> <report filename>``.

    Returns ``None`` both after a successful save and for an empty grid.
    """
    log("KROK 1/5: navigace na report URL...")
    page.goto(report["url"])
    log(f"KROK 1/5: stranka nactena ({page.url})")

    log("KROK 2/5: cekam na radky gridu (.ag-row) nebo prazdny grid ('No Data')...")
    # AG Grid rows are position-absolute (virtual rendering), so Playwright
    # does not treat them as "visible" -> wait for presence in the DOM, not
    # for visibility.
    # Empty site: AG Grid renders a no-rows overlay with the text "No Data"
    # inside the .ag-overlay-no-rows-wrapper wrapper. NOTE: the class is NOT
    # -no-rows-center; also the page holds 2 overlays (one hidden) -> check
    # for a visible one (offsetParent != null). Detecting this keeps a site
    # without data from waiting the full 120 s.
    empty_grid_js = """() => {
        if (document.querySelectorAll('div.ag-row').length > 0) return false;
        return [...document.querySelectorAll('.ag-overlay-no-rows-wrapper')]
            .some(e => e.offsetParent !== null);
    }"""
    rows_or_empty_js = f"""() => document.querySelectorAll('div.ag-row').length > 0
        || ({empty_grid_js})()"""
    page.wait_for_function(rows_or_empty_js, timeout=120000)

    grid_is_empty = page.evaluate(empty_grid_js)
    if grid_is_empty:
        log("KROK 2/5: centrum bez dat ('No Data' overlay) — preskakuji export.")
        return

    log("KROK 2/5: radky se objevily, cekam na stabilizaci poctu...")
    grid_rows = page.locator("div.ag-row")
    last_count = -1
    for check_no in range(1, 21):  # at most ~40 s of stabilisation
        row_count = grid_rows.count()
        log(f" ...kontrola #{check_no}: {row_count} radku")
        if row_count > 0 and row_count == last_count:
            break
        last_count = row_count
        page.wait_for_timeout(2000)
    page.wait_for_timeout(2000)  # extra buffer after the last check
    log(f"KROK 2/5: data stabilni ({last_count} radku v gridu).")

    log("KROK 3/5: klikam na viditelne tri tecky (more_horiz)...")
    menu_button = page.locator("ag-export button:visible", has_text="more_horiz").first
    menu_button.click()
    log("KROK 3/5: menu otevreno.")

    stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    target_path = os.path.join(OUT_DIR, f"{stamp} {report['filename']}")
    log("KROK 4/5: klikam na 'Export to CSV' a cekam na stahovani...")
    export_item = page.locator("mdl-menu-item:visible", has_text="Export to CSV").first
    with page.expect_download(timeout=60000) as download_info:
        export_item.click()
    log("KROK 4/5: stahovani zachyceno, ukladam soubor...")
    download_info.value.save_as(target_path)
    log(f"KROK 5/5: HOTOVO -> {target_path}")
|
|
|
|
|
|
def download_report(page, report):
    """Download one report, retrying up to ``MAX_ATTEMPTS`` times.

    Each attempt calls ``download_report_once``. Before every attempt after
    the first, the function pauses for ``RETRY_BACKOFF_S`` seconds. After a
    failed attempt that is not the last one, it navigates to ``about:blank``
    so the grid starts from a clean page on the next try. Both the pause and
    the reset are best-effort: if they fail, the failure is logged and the
    retry goes ahead anyway.

    Args:
        page: Playwright page used for the navigation and download.
        report: report descriptor with at least the keys ``site``, ``type``
            and ``study``, plus whatever ``download_report_once`` reads.

    Raises:
        Exception: the error from the final attempt, re-raised once all
            attempts have failed.
    """
    label = f"{report['site']}/{report['type']}"
    log(f"=== Centrum {report['site']} / {report['type']} (studie {report['study']}) ===")
    for attempt in range(1, MAX_ATTEMPTS + 1):
        if attempt > 1:
            log(f"RETRY: {attempt}/{MAX_ATTEMPTS} pokus o centrum "
                f"{label} (po {RETRY_BACKOFF_S}s pauze)...")
            try:
                page.wait_for_timeout(RETRY_BACKOFF_S * 1000)
            except Exception as pause_err:
                # Best effort: a failed pause must not cancel the retry, but
                # it should be visible in the log.
                log(f"RETRY: pauza pred pokusem selhala ({pause_err!r}), pokracuji.")
        try:
            download_report_once(page, report)
        except Exception as e:
            log(f"POKUS {attempt}/{MAX_ATTEMPTS} SELHAL: {e!r}")
            if attempt == MAX_ATTEMPTS:
                # Let it propagate — the caller's per-report try logs it and
                # moves on to the next report.
                raise
            # Leave the page before the next attempt so the grid is reset.
            try:
                page.goto("about:blank")
            except Exception as nav_err:
                log(f"RETRY: prechod na about:blank selhal ({nav_err!r}), pokracuji.")
        else:
            if attempt > 1:
                log(f"RETRY: pokus {attempt} uspesny.")
            return
|
|
|
|
|
|
def main():
    """Run this shard: start the browser, log in and download every report in ``REPORTS``.

    A report that still fails after ``download_report`` has used up its
    retries is logged and recorded, and processing continues with the next
    report. A summary (count of finished reports and the list of failed
    site/type pairs) is logged at the end.

    The persistent browser context is closed in a ``finally`` block, so it is
    also closed when ``login`` or any other step raises.
    """
    log(f"START: shard {SHARD}/{OF}, profil '{os.path.basename(PROFILE_DIR)}', "
        f"{len(REPORTS)}/{len(ALL_REPORTS)} reportu k zpracovani "
        f"(MAX_ATTEMPTS={MAX_ATTEMPTS}).")
    with sync_playwright() as p:
        context = p.chromium.launch_persistent_context(
            user_data_dir=PROFILE_DIR,
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--disable-restore-session-state",
                "--disable-session-crashed-bubble",
            ],
            no_viewport=True,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
            accept_downloads=True,
        )
        try:
            # Make navigator.webdriver read as undefined on every page.
            context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            page = context.new_page()
            log("START: prohlizec spusten.")
            login(page)

            ok, failed = 0, []
            for idx, report in enumerate(REPORTS, 1):
                log(f">>> Report {idx}/{len(REPORTS)}")
                try:
                    download_report(page, report)
                    ok += 1
                except Exception as e:
                    # One bad report must not stop the rest of the shard.
                    failed.append(f"{report['site']}/{report['type']}")
                    log(f"CHYBA u centra {report['site']}/{report['type']} "
                        f"(vsechny {MAX_ATTEMPTS} pokusy selhaly): {e!r} — pokracuji dalsim.")

            log(f"KONEC: hotovo {ok}/{len(REPORTS)} reportu (shard {SHARD}/{OF}).")
            if failed:
                log(f"KONEC: SELHALA centra: {', '.join(failed)}")
        finally:
            # Close the context on the error path as well as on success.
            context.close()
|
|
|
|
|
|
def _hold_window_open():
    """Block until Enter is pressed so the log/error stays readable.

    A closed or missing stdin (``EOFError``) is ignored.
    """
    try:
        input("\n[Enter] pro zavreni tohoto okna...")
    except EOFError:
        pass


if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # Top-level boundary: report the crash with a traceback; do not re-raise.
        log(f"FATAL: beh shardu spadl: {exc!r}")
        traceback.print_exc()
    finally:
        _hold_window_open()
|