Initial commit — clean history (removed large test files, browser profiles, Medidata/Clario downloads)
This commit is contained in:
@@ -0,0 +1,260 @@
|
||||
# =============================================================================
|
||||
# Název: download_test_results_v1.4.py
|
||||
# Verze: 1.4
|
||||
# Datum: 2026-05-29
|
||||
# Popis: Stahuje Test Results ze xsp.labcorp.com pro 2 studie (36940, 35472),
|
||||
# oba typy reportu (Standard + Microbiology), pres vsechna centra.
|
||||
# Ceka na nacteni AG Grid radku (.ag-row); prazdne centrum ('No Data')
|
||||
# preskoci. Vystup: timestampovane CSV do adresare Source/.
|
||||
# Zmeny v1.4: + RETRY na urovni reportu. Pri paralelnim behu server obcas
|
||||
# timeoutuje (page.goto 30 s, nebo wait_for_function 120 s na grid).
|
||||
# Driv byl report rovnou zaznamenan jako selhany. Ted se kazdy
|
||||
# report zkusi az 2x (MAX_ATTEMPTS=2). Vetsina prechodnych
|
||||
# timeoutu projde napodruhe. Mezi pokusy 5 s pauza.
|
||||
# Zmeny v1.3: robustni login (NEcekat na networkidle, cekat na pole
|
||||
# Email/Password) + okno se pri padu nezavre (try/except + input()).
|
||||
# Zmeny v1.2: + paralelni beh pres sharding (--shard N --of M). Profil per shard.
|
||||
# Zmeny v1.1: + studie 35472, + report typ microbiology.
|
||||
# =============================================================================
|
||||
from playwright.sync_api import sync_playwright
|
||||
from datetime import datetime
|
||||
import argparse
|
||||
import traceback
|
||||
import os
|
||||
|
||||
# --- arguments: sharding for parallel runs -----------------------------------
# --shard N --of M: this process handles every M-th report, starting with the
# N-th one. The pair is validated right after parsing, because the shard slice
# further down needs 1 <= SHARD <= OF (OF == 0 would be a zero slice step and
# SHARD outside 1..OF would select the wrong reports or none at all).
parser = argparse.ArgumentParser(description="Stahovani Test Results (XSP) s podporou shardingu.")
parser.add_argument("--shard", type=int, default=1, help="poradi tohoto shardu (1..of)")
parser.add_argument("--of", type=int, default=1, help="celkovy pocet shardu")
ARGS = parser.parse_args()
SHARD, OF = ARGS.shard, ARGS.of
if OF < 1:
    # parser.error() prints the usage text and exits with status 2.
    parser.error(f"--of musi byt >= 1 (zadano {OF})")
if not 1 <= SHARD <= OF:
    parser.error(f"--shard musi byt v rozsahu 1..{OF} (zadano {SHARD})")
# Log prefix; empty for a serial run so single-shard logs stay uncluttered.
TAG = f"[S{SHARD}/{OF}]" if OF > 1 else ""
|
||||
|
||||
|
||||
def log(msg):
    """Print *msg* to stdout, prefixed with the current HH:MM:SS time and TAG.

    The line is flushed immediately (flush=True).
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] {TAG} {msg}", flush=True)
|
||||
|
||||
# Login credentials. The XSP_EMAIL / XSP_PASSWORD environment variables take
# precedence, so the secret no longer has to live in the source tree.
# NOTE(review): the literal fallbacks are kept only so existing runs keep
# working unchanged. A password committed to version control should be rotated
# and the fallback removed once the environment variables are in place.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
|
||||
LOGIN_URL = "https://xsp.covance.com/"
OUT_DIR = r"U:\PythonProject\Janssen\Covance_UCO3001\Source"

# In a parallel run every shard MUST have its own browser profile: Chrome locks
# the profile directory, so two running instances cannot share one. A serial
# run (of=1) keeps using the original "browser_profile" directory.
_BASE = os.path.dirname(os.path.abspath(__file__))
if OF == 1:
    PROFILE_DIR = os.path.join(_BASE, "browser_profile")
else:
    PROFILE_DIR = os.path.join(_BASE, f"browser_profile_{SHARD}")

# --- retry configuration -----------------------------------------------------
MAX_ATTEMPTS = 2  # first attempt + 1 retry per report
RETRY_BACKOFF_S = 5  # pause before a repeated attempt (seconds)
|
||||
|
||||
# Studies and the internal numbers of their sites.
#   36940 = 77242113UCO3001 (UC) - site list taken from download_equeries_report SITES
#   35472 = second study (MDD)
STUDIES = [
    {
        "study": "36940",
        "sites": [
            "930551", "930556", "930525", "930549", "930543", "930547",
            "930555", "930557", "930539", "930536", "930553", "930531",
        ],
    },
    {
        "study": "35472",
        "sites": [
            "898745", "898739", "898733", "898744", "898727",
        ],
    },
]

# Report types: "slug" is the tab segment of the URL, "suffix" goes into the
# output file name.
REPORT_TYPES = [
    {"slug": "standard-test-results", "suffix": "standard"},
    {"slug": "microbiology", "suffix": "microbiology"},
]


def _expand_reports(studies, report_types):
    """Return one report descriptor per (study, site, report type) combination.

    Order is study-major, then site, then report type. Each descriptor is a
    dict with the keys "site", "study", "type", "url" and "filename".
    """
    expanded = []
    for study_cfg in studies:
        study_id = study_cfg["study"]
        for site_id in study_cfg["sites"]:
            for kind in report_types:
                expanded.append(
                    {
                        "site": site_id,
                        "study": study_id,
                        "type": kind["suffix"],
                        "url": f"https://xsp.labcorp.com/sponsor/study/{study_id}/test-results/{site_id}/{kind['slug']}",
                        "filename": f"sponsor-study-{study_id}-test-results-{site_id}-{kind['suffix']}.csv",
                    }
                )
    return expanded


ALL_REPORTS = _expand_reports(STUDIES, REPORT_TYPES)
|
||||
|
||||
# This shard takes every OF-th report, starting at index SHARD-1. The reports
# are spread evenly and no report is handled by two shards at the same time.
REPORTS = ALL_REPORTS[slice(SHARD - 1, None, OF)]
|
||||
|
||||
|
||||
def login(page):
    """Sign in at LOGIN_URL with EMAIL / PASSWORD unless a session is already active.

    Opens LOGIN_URL and waits up to 12 s for the "Email" field. If it does not
    show up, the session is assumed to be active and the function returns
    without signing in. Otherwise it fills in the e-mail, clicks "Next", fills
    in the password, clicks "Verify" and waits for the post-login redirect.

    Args:
        page: Playwright page to drive.

    Raises:
        Exception: whatever Playwright raises from the navigation, the fills,
            the clicks or the wait for the password field. Only the two waits
            wrapped in try/except below are tolerated.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)

    # Do NOT wait for 'networkidle': the login SPA (labcorp/OKTA) never reaches
    # it (analytics/polling keep running), which used to end in a timeout and a
    # crashed process. Wait for the Email field itself instead; if it does not
    # appear within 12 s, treat the session as active (already signed in).
    email_field = page.get_by_label("Email")
    try:
        email_field.wait_for(state="visible", timeout=12000)
    except Exception:
        # NOTE(review): any error lands here, not only a timeout - confirm that
        # treating all of them as "already signed in" is intended.
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    log("LOGIN: zadavam email...")
    email_field.fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    log("LOGIN: cekam na pole pro heslo...")
    password_field = page.get_by_label("Password")
    password_field.wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    password_field.fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")

    def _redirect_finished(url):
        # NOTE(review): this is already true for any URL without "code=", so the
        # wait may return immediately - confirm that is the intended condition.
        return "xsp." in url or "code=" not in url

    try:
        page.wait_for_url(_redirect_finished, timeout=60000)
    except Exception:
        # Deliberately not fatal: access is verified by the first report anyway.
        log("LOGIN: wait_for_url vyprsel, pokracuji (overim pristup pri 1. reportu).")
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
|
||||
|
||||
|
||||
def download_report_once(page, report):
    """Make a single attempt at downloading one report as CSV.

    Opens report["url"], waits until the AG Grid shows either rows or its
    "No Data" overlay and, when rows exist, exports them through the grid's
    "Export to CSV" menu item into OUT_DIR under a timestamp-prefixed name.
    A site without data is not an error: the function returns without writing
    a file.

    Args:
        page: Playwright page to drive.
        report: dict providing at least the "url" and "filename" keys.

    Raises:
        Exception: whatever Playwright raises from a failed step (navigation,
            waits, clicks, download). Nothing is caught here.
    """
    log("KROK 1/5: navigace na report URL...")
    page.goto(report["url"])
    log(f"KROK 1/5: stranka nactena ({page.url})")

    log("KROK 2/5: cekam na radky gridu (.ag-row) nebo prazdny grid ('No Data')...")
    # AG Grid rows are absolutely positioned (virtual rendering), so Playwright
    # does not consider them "visible" -> wait for presence in the DOM instead.
    # Empty site: AG Grid renders a no-rows overlay with the text "No Data" in
    # the wrapper .ag-overlay-no-rows-wrapper. CAUTION: the class is NOT
    # -no-rows-center, and the page holds 2 overlays (one hidden) -> test for a
    # visible one (offsetParent != null). Detecting this keeps a site without
    # data from waiting the full 120 s.
    empty_grid_js = """() => {
        if (document.querySelectorAll('div.ag-row').length > 0) return false;
        return [...document.querySelectorAll('.ag-overlay-no-rows-wrapper')]
            .some(e => e.offsetParent !== null);
    }"""
    rows_or_empty_js = f"""() => document.querySelectorAll('div.ag-row').length > 0
        || ({empty_grid_js})()"""
    page.wait_for_function(rows_or_empty_js, timeout=120000)

    if page.evaluate(empty_grid_js):
        log("KROK 2/5: centrum bez dat ('No Data' overlay) — preskakuji export.")
        return

    log("KROK 2/5: radky se objevily, cekam na stabilizaci poctu...")
    last_count = -1
    for check_no in range(1, 21):  # at most ~40 s of settling
        row_count = page.locator("div.ag-row").count()
        log(f" ...kontrola #{check_no}: {row_count} radku")
        # Settled = same non-zero count on two consecutive checks.
        if row_count > 0 and row_count == last_count:
            break
        last_count = row_count
        page.wait_for_timeout(2000)
    page.wait_for_timeout(2000)  # extra buffer
    log(f"KROK 2/5: data stabilni ({last_count} radku v gridu).")

    log("KROK 3/5: klikam na viditelne tri tecky (more_horiz)...")
    menu_button = page.locator("ag-export button:visible", has_text="more_horiz").first
    menu_button.click()
    log("KROK 3/5: menu otevreno.")

    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    dest = os.path.join(OUT_DIR, f"{timestamp} {report['filename']}")
    log("KROK 4/5: klikam na 'Export to CSV' a cekam na stahovani...")
    # The click has to happen inside expect_download() so the download it
    # triggers is the one that gets captured.
    with page.expect_download(timeout=60000) as download_info:
        export_item = page.locator("mdl-menu-item:visible", has_text="Export to CSV").first
        export_item.click()
    log("KROK 4/5: stahovani zachyceno, ukladam soubor...")
    download_info.value.save_as(dest)
    log(f"KROK 5/5: HOTOVO -> {dest}")
|
||||
|
||||
|
||||
def download_report(page, report):
    """Download one report, retrying when an attempt fails.

    Calls download_report_once() up to MAX_ATTEMPTS times. Before every retry
    it pauses RETRY_BACKOFF_S seconds, and after a failed attempt it navigates
    to about:blank so the next attempt starts with a reset grid.

    Args:
        page: Playwright page used for the download.
        report: report descriptor; the "site", "type" and "study" keys are
            read here, the rest is used by download_report_once().

    Raises:
        Exception: the error of the final attempt, re-raised unchanged once all
            MAX_ATTEMPTS attempts have failed. The caller's loop is expected to
            log it and carry on with the next report.
    """
    log(f"=== Centrum {report['site']} / {report['type']} (studie {report['study']}) ===")
    for attempt in range(1, MAX_ATTEMPTS + 1):
        if attempt > 1:
            log(f"RETRY: {attempt}/{MAX_ATTEMPTS} pokus o centrum "
                f"{report['site']}/{report['type']} (po {RETRY_BACKOFF_S}s pauze)...")
            try:
                page.wait_for_timeout(RETRY_BACKOFF_S * 1000)
            except Exception as wait_err:
                # Best effort: a failed pause must not cancel the retry itself,
                # but the reason is logged instead of being dropped silently.
                log(f"RETRY: pauza pred opakovanim selhala ({wait_err!r}), pokracuji.")
        try:
            download_report_once(page, report)
        except Exception as e:
            log(f"POKUS {attempt}/{MAX_ATTEMPTS} SELHAL: {e!r}")
            if attempt == MAX_ATTEMPTS:
                # Out of attempts: let the error propagate to the caller.
                raise
            # Leave the page before the next attempt so the grid is reset.
            try:
                page.goto("about:blank")
            except Exception as nav_err:
                # Best effort as well; the next attempt navigates again anyway.
                log(f"RETRY: navigace na about:blank selhala ({nav_err!r}), pokracuji.")
        else:
            if attempt > 1:
                log(f"RETRY: pokus {attempt} uspesny.")
            return
|
||||
|
||||
|
||||
def main():
    """Run this shard: start the browser, sign in and download every report in REPORTS.

    Launches a headed persistent Chromium context on PROFILE_DIR, calls
    login() and then download_report() for each entry of REPORTS. A report
    whose attempts all fail is recorded and the loop moves on. A summary
    (success count plus the failed site/type pairs) is logged at the end.

    The browser context is closed in a ``finally`` block, so it is shut down
    through context.close() even when login() or another step raises;
    previously close() was skipped on such errors.

    Raises:
        Exception: anything raised outside the per-report try/except, e.g. by
            the browser launch or by login().
    """
    log(f"START: shard {SHARD}/{OF}, profil '{os.path.basename(PROFILE_DIR)}', "
        f"{len(REPORTS)}/{len(ALL_REPORTS)} reportu k zpracovani "
        f"(MAX_ATTEMPTS={MAX_ATTEMPTS}).")
    with sync_playwright() as p:
        context = p.chromium.launch_persistent_context(
            user_data_dir=PROFILE_DIR,
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--disable-restore-session-state",
                "--disable-session-crashed-bubble",
            ],
            no_viewport=True,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
            accept_downloads=True,
        )
        try:
            # Make navigator.webdriver read as undefined on every page.
            context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            page = context.new_page()
            log("START: prohlizec spusten.")
            login(page)

            ok, failed = 0, []
            for idx, report in enumerate(REPORTS, 1):
                log(f">>> Report {idx}/{len(REPORTS)}")
                try:
                    download_report(page, report)
                    ok += 1
                except Exception as e:
                    # One failed report must not stop the rest of the shard.
                    failed.append(f"{report['site']}/{report['type']}")
                    log(f"CHYBA u centra {report['site']}/{report['type']} "
                        f"(vsechny {MAX_ATTEMPTS} pokusy selhaly): {e!r} — pokracuji dalsim.")

            log(f"KONEC: hotovo {ok}/{len(REPORTS)} reportu (shard {SHARD}/{OF}).")
            if failed:
                log(f"KONEC: SELHALA centra: {', '.join(failed)}")
        finally:
            context.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # Top-level boundary: report the crash of this shard with a traceback.
        log(f"FATAL: beh shardu spadl: {exc!r}")
        traceback.print_exc()
    finally:
        # Keep the window open so the log / error can still be read.
        try:
            input("\n[Enter] pro zavreni tohoto okna...")
        except EOFError:
            # No stdin to read from - nothing to wait for.
            pass
|
||||
Reference in New Issue
Block a user