z230
This commit is contained in:
@@ -0,0 +1,399 @@
|
||||
# =============================================================================
|
||||
# Název: download_lab_reports_v1.1.py
|
||||
# Verze: 1.1
|
||||
# Datum: 2026-06-16
|
||||
# Popis: Stahuje PDF Lab Reports ze xsp.labcorp.com pro studii 77242113UCO3001
|
||||
# (interni cislo 36940), filtrovane na 10 ceskych center (CZ),
|
||||
# a uklada je PRIMO do MongoDB (db covance, kolekce labreports) —
|
||||
# metadata z tabulky + skutecne PDF (inline Binary). Na disk NEUKLADA.
|
||||
#
|
||||
# Princip stahovani stejny jako download_test_results: Playwright +
|
||||
# perzistentni profil, jednorazovy login, klik na "English" ve sloupci
|
||||
# Download. PDF bajty se ctou z Playwright temp souboru (download.path()),
|
||||
# save_as se nevola -> nic netrvale neni na disku.
|
||||
#
|
||||
# INKREMENTALNE (stop-at-known): list je Posted DESC (nejnovejsi nahore).
|
||||
# Skript jde shora dolu; u kazdeho radku nejdriv precte metadata a
|
||||
# spocita record_id. Jakmile narazi na uz ulozeny report, KONCI
|
||||
# (vse pod nim je starsi a uz v Mongo je). Korekce vysledku = stejny
|
||||
# report znovu vystaveny s NOVYM Posted => novy record_id => stahne se
|
||||
# jako novy, puvodni zustava.
|
||||
#
|
||||
# record_id = "{site}|{subject}|{accession}|{visit}|{posted}"
|
||||
# (Posted vc. casu odlisuje reissue).
|
||||
#
|
||||
# Prepinace:
|
||||
# --full projit vsechny radky (bez predcasneho konce); upsertne
|
||||
# chybejici / zmenene (rekonciliace).
|
||||
# --dry-run nestahuje ani nepise do DB; jen vypise NOVE reporty.
|
||||
# --limit N zpracovat max N radku (test).
|
||||
# =============================================================================
|
||||
from playwright.sync_api import sync_playwright
|
||||
from datetime import datetime
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
from bson.binary import Binary
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import traceback
|
||||
import urllib.parse
|
||||
|
||||
# --- command-line arguments --------------------------------------------------
# NOTE: the arguments are parsed at import time; the resulting ARGS namespace
# is read as a module-level global by main().
parser = argparse.ArgumentParser(
    description="Stahovani Lab Reports PDF (XSP) do MongoDB.",
)
# --full: walk every row instead of stopping at the first known report.
parser.add_argument(
    "--full",
    action="store_true",
    help="projit vse (bez stop-at-known)",
)
# --dry-run: neither download nor write to the DB, only list new reports.
parser.add_argument(
    "--dry-run",
    action="store_true",
    help="nestahovat ani nepsat do DB; jen vypsat nove",
)
# --limit N: process at most N rows (0 means no limit).
parser.add_argument(
    "--limit",
    type=int,
    default=0,
    help="max N radku (0 = vse)",
)
ARGS = parser.parse_args()
|
||||
|
||||
|
||||
def log(msg):
    """Print *msg* to stdout, prefixed with the current local time (HH:MM:SS).

    The output is flushed immediately so progress is visible while the
    script is still running.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}", flush=True)
|
||||
|
||||
|
||||
# --- configuration -----------------------------------------------------------
# SECURITY: the login credentials are read from the environment instead of
# being hard-coded, so that no secret lives in the source tree or in version
# control. Export XSP_EMAIL and XSP_PASSWORD before a run that has to log in
# interactively (login() only types them when the login form is shown).
# NOTE(review): a password that was previously committed in this file must be
# treated as compromised and rotated.
EMAIL = os.environ.get("XSP_EMAIL", "")
PASSWORD = os.environ.get("XSP_PASSWORD", "")
LOGIN_URL = "https://xsp.labcorp.com/"
STUDY = "36940"                    # internal study number used in the portal URL
STUDY_CODE = "77242113UCO3001"     # study code used in generated file names

MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "covance"
COLLECTION = "labreports"

# The 10 sites (internal site IDs) — taken from the "GO TO LINK" URL.
SITES = [
    "930539", "930547", "930555", "930556", "930553",
    "930549", "930525", "930536", "930557", "930531",
]

# The persistent browser profile lives next to this script.
_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile")
|
||||
|
||||
|
||||
def lab_reports_url():
    """Build the Lab Reports URL for STUDY, filtered to the configured SITES.

    The site filter is passed as a compact JSON array (no spaces) that is
    percent-encoded into the ``site`` query parameter.
    """
    compact_sites = json.dumps(SITES, separators=(",", ":"))
    encoded_sites = urllib.parse.quote(compact_sites)
    base = f"https://xsp.labcorp.com/sponsor/study/{STUDY}/lab-reports"
    return f"{base}?site={encoded_sites}"
|
||||
|
||||
|
||||
# --- formatting --------------------------------------------------------------
# English three-letter month abbreviation -> zero-padded month number.
_MONTHS = {
    abbr: f"{number:02d}"
    for number, abbr in enumerate(
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
        start=1,
    )
}
|
||||
|
||||
|
||||
def safe(s: str) -> str:
    """Return *s* without the characters ``\\ / : * ? " < > |``, stripped.

    A falsy input (``None`` or ``""``) yields an empty string.
    """
    text = s if s else ""
    cleaned = re.sub(r'[\\/:*?"<>|]', "", text)
    return cleaned.strip()
|
||||
|
||||
|
||||
def fmt_date(s: str) -> str:
    """Convert a 'Mon D, YYYY' date (optionally followed by a time) to ISO.

    'Jun 10, 2026' -> '2026-06-10'; 'Jun 15, 2026 7:49 PM' -> '2026-06-15'.
    When the text does not match, or the month abbreviation is not a key of
    _MONTHS, the input is returned through safe() instead.
    """
    match = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s+(\d{4})", s or "")
    if match is None:
        return safe(s)
    # The first group is exactly three letters, so it is the lookup key as-is.
    month_abbr, day, year = match.groups()
    month = _MONTHS.get(month_abbr)
    if month is None:
        return safe(s)
    return f"{year}-{month}-{int(day):02d}"
|
||||
|
||||
|
||||
def fmt_datetime(s: str) -> str:
    """Convert 'Jun 15, 2026 7:49 PM' to '2026-06-15 19:49'.

    Both 'H:MM AM/PM' and 'H:MM:SS AM/PM' time forms are accepted (seconds
    are dropped). Text that parses as neither is handed to fmt_date(), so a
    value without a time yields just the date.
    """
    text = (s or "").strip()
    for pattern in ("%b %d, %Y %I:%M %p", "%b %d, %Y %I:%M:%S %p"):
        try:
            parsed = datetime.strptime(text, pattern)
        except ValueError:
            continue  # try the next accepted layout
        return parsed.strftime("%Y-%m-%d %H:%M")
    return fmt_date(text)
|
||||
|
||||
|
||||
def make_record_id(meta: dict) -> str:
    """Build the record key "site|subject|accession|visit|posted".

    The posted value is normalised with fmt_datetime(), so the key includes
    the time of posting when the grid provides one.
    """
    parts = [meta[key] for key in ("site", "subject", "accession", "visit")]
    parts.append(fmt_datetime(meta["posted"]))
    return "|".join(parts)
|
||||
|
||||
|
||||
def build_basename(meta: dict) -> str:
    """Compose the PDF file name stored with the record as ``fileName``.

    Layout: "<STUDY_CODE> <collected> <site> <subject> <visit> <accession>
    posted <posted>.pdf", with both dates formatted by fmt_date() and the
    whole stem cleaned by safe().
    """
    parts = [
        STUDY_CODE,
        fmt_date(meta["collected"]),
        meta["site"],
        meta["subject"],
        meta["visit"],
        meta["accession"],
        "posted",
        fmt_date(meta["posted"]),
    ]
    stem = " ".join(map(str, parts))
    return safe(stem) + ".pdf"
|
||||
|
||||
|
||||
# --- JS helpers (AG Grid) ----------------------------------------------------
# Evaluated in the page; returns {rowHeight, total}.
#   rowHeight: rendered height of the first '.ag-row' inside
#              '.ag-body-container'; 25 when no row is found or the height is 0.
#   total:     the container's inline style height divided by the row height,
#              rounded; 0 when the measured row height is 0.
JS_GRID_INFO = r"""() => {
    const c = document.querySelector('.ag-body-container');
    const r = document.querySelector('.ag-body-container .ag-row');
    const rh = r ? r.getBoundingClientRect().height : 25;
    const ch = c ? parseFloat(c.style.height || '0') : 0;
    return { rowHeight: rh || 25, total: rh ? Math.round(ch / rh) : 0 };
}"""
|
||||
|
||||
JS_READ_ROW = r"""(idx) => {
|
||||
const dedup = s => {
|
||||
s = (s || '').replace(/\s+/g, ' ').trim();
|
||||
const h = s.slice(0, Math.floor(s.length / 2));
|
||||
if (s === h + h) return h;
|
||||
const m = s.match(/^(.*?)\s+\1$/);
|
||||
if (m) return m[1];
|
||||
return s;
|
||||
};
|
||||
const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
|
||||
if (!row) return null;
|
||||
const get = id => {
|
||||
const c = row.querySelector('[col-id="' + id + '"]');
|
||||
return c ? dedup(c.textContent) : '';
|
||||
};
|
||||
return {
|
||||
type: get('type'),
|
||||
subject: get('subjectId'),
|
||||
accession: get('accessionNumber'),
|
||||
visit: get('visit'),
|
||||
collected: get('visitCollectionDate'),
|
||||
site: get('siteNum'),
|
||||
posted: get('postedDateTime'),
|
||||
};
|
||||
}"""
|
||||
|
||||
JS_SCROLL_TO = r"""(args) => {
|
||||
const [idx, rh] = args;
|
||||
const vp = document.querySelector('.ag-body-viewport');
|
||||
if (!vp) return;
|
||||
vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
|
||||
}"""
|
||||
|
||||
|
||||
# --- login --------------------------------------------------------------------
def login(page):
    """Open LOGIN_URL and sign in with EMAIL / PASSWORD when a form is shown.

    If the "Email" field does not become visible within 12 s, the function
    logs that the session is treated as active and returns without typing
    anything. Otherwise it fills the e-mail, clicks "Next", fills the
    password, clicks "Verify" and waits for the post-login redirect.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)

    email_field = page.get_by_label("Email")
    try:
        email_field.wait_for(state="visible", timeout=12000)
    except Exception:
        # Any failure to see the field is interpreted as "already logged in".
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    log("LOGIN: zadavam email...")
    email_field.fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    log("LOGIN: cekam na pole pro heslo...")
    password_field = page.get_by_label("Password")
    password_field.wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    password_field.fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")

    def _redirect_done(url):
        # NOTE(review): presumably meant to wait out an OAuth "code=" callback;
        # as written it is also true for any URL without "code=" — confirm.
        return "code=" not in url or "xsp." in url

    try:
        page.wait_for_url(_redirect_done, timeout=60000)
    except Exception:
        # Best effort: continue even when the redirect was not observed.
        log("LOGIN: wait_for_url vyprsel, pokracuji.")
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
|
||||
|
||||
|
||||
def open_grid(page):
    """Navigate to the Lab Reports grid and wait until its row count settles.

    After the first row appears, JS_GRID_INFO is polled every 2 s (at most
    20 times) until two consecutive polls report the same non-zero total.

    Returns:
        (total, row_height) as reported by a final JS_GRID_INFO evaluation.
    """
    log(f"GRID: navigace na Lab Reports ({len(SITES)} center)...")
    page.goto(lab_reports_url())
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)

    previous_total = -1
    for attempt in range(1, 21):
        snapshot = page.evaluate(JS_GRID_INFO)
        current_total = snapshot["total"]
        log(f" ...kontrola #{attempt}: total={current_total}, rowHeight={snapshot['rowHeight']}")
        if current_total > 0 and current_total == previous_total:
            break  # the count did not change between two polls
        previous_total = current_total
        page.wait_for_timeout(2000)

    final = page.evaluate(JS_GRID_INFO)
    log(f"GRID: nacteno, total={final['total']} radku, rowHeight={final['rowHeight']}px.")
    return final["total"], final["rowHeight"]
|
||||
|
||||
|
||||
def read_row(page, idx, row_height):
    """Scroll grid row *idx* into view and return its metadata dict.

    Raises:
        RuntimeError: when JS_READ_ROW returns nothing or an empty subject.
    """
    row_selector = f'.ag-body-container .ag-row[row-index="{idx}"]'
    page.evaluate(JS_SCROLL_TO, [idx, row_height])
    page.wait_for_selector(row_selector, timeout=15000)
    # Short pause after the row element appears, before reading its cells.
    page.wait_for_timeout(120)
    meta = page.evaluate(JS_READ_ROW, idx)
    if meta and meta.get("subject"):
        return meta
    raise RuntimeError(f"radek {idx}: nepodarilo se precist metadata")
|
||||
|
||||
|
||||
def download_pdf_bytes(page, idx):
    """Click the "English" download link of grid row *idx* and return the bytes.

    The bytes are read from Playwright's temporary download file; save_as()
    is never called. The temporary file is deleted right after reading so
    that long runs do not accumulate downloads on disk until the browser
    context is closed.

    Raises:
        RuntimeError: when the downloaded file is empty (storing it would
            mark the report as known and it would never be fetched again).
    """
    link = page.locator(
        f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link',
        has_text="English",
    ).first
    with page.expect_download(timeout=60000) as dl:
        link.click()
    download = dl.value
    try:
        path = download.path()  # Playwright's temp file
        with open(path, "rb") as f:
            data = f.read()
    finally:
        try:
            download.delete()
        except Exception as e:
            # Cleanup is best effort; a leftover temp file must not fail the row.
            log(f" #{idx}: docasny soubor se nepodarilo smazat: {e!r}")
    if not data:
        raise RuntimeError(f"radek {idx}: stazeny soubor je prazdny")
    return data
|
||||
|
||||
|
||||
def upsert(col, meta, rid, data, now):
    """Insert or refresh the report *rid* in collection *col*.

    Args:
        col: MongoDB collection to write to.
        meta: row metadata dict as returned by read_row().
        rid: record id of the report.
        data: PDF content as bytes.
        now: timestamp string stored in firstSeen / lastSeen.

    Returns:
        "insert" when no document with this record_id existed,
        "update" when the stored SHA-256 or raw fields differ (the previous
        state is appended to ``history``), otherwise "same" (only lastSeen
        is refreshed).
    """
    # Raw grid values, keyed by the column captions.
    raw_fields = {
        "Type": meta["type"],
        "Subject": meta["subject"],
        "Accession": meta["accession"],
        "Visit": meta["visit"],
        "Collected Date": meta["collected"],
        "Site Number": meta["site"],
        "Posted": meta["posted"],
    }
    digest = hashlib.sha256(data).hexdigest()
    document = {
        "study": STUDY,
        "studyCode": STUDY_CODE,
        "type": meta["type"] or "Lab Result",
        "site": meta["site"],
        "subject": meta["subject"],
        "accession": meta["accession"],
        "visit": meta["visit"],
        "collected": fmt_date(meta["collected"]),
        "posted": fmt_datetime(meta["posted"]),
        "fields": raw_fields,
        "fileName": build_basename(meta),
        "pdf": Binary(data),
        "pdfSize": len(data),
        "pdfSha256": digest,
    }

    stored = col.find_one(
        {"record_id": rid},
        {"pdfSha256": 1, "fields": 1, "lastSeen": 1},
    )
    if stored is None:
        col.insert_one({
            "record_id": rid,
            **document,
            "firstSeen": now,
            "lastSeen": now,
            "history": [],
        })
        return "insert"

    by_id = {"_id": stored["_id"]}
    unchanged = (stored.get("pdfSha256") == digest
                 and stored.get("fields") == raw_fields)
    if unchanged:
        col.update_one(by_id, {"$set": {"lastSeen": now}})
        return "same"

    # Keep the previous state in the history before overwriting it.
    previous_state = {
        "date": stored.get("lastSeen"),
        "fields": stored.get("fields"),
        "pdfSha256": stored.get("pdfSha256"),
    }
    col.update_one(
        by_id,
        {"$push": {"history": previous_state},
         "$set": {**document, "lastSeen": now}},
    )
    return "update"
|
||||
|
||||
|
||||
def _load_existing_ids(collection):
    """Return the set of record_id values already stored for STUDY."""
    return {d["record_id"] for d in collection.find({"study": STUDY}, {"record_id": 1})}


def main():
    """Run one download pass: connect to MongoDB, log in, walk the grid.

    Rows are processed top to bottom. Without --full the pass stops at the
    first row whose record id is already stored; with --full known rows are
    skipped and the walk continues. With --dry-run nothing is downloaded or
    written; new reports are only listed (MongoDB is then optional).

    The MongoDB client and the browser context are always closed, also when
    login or grid loading raises.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log(f"START: studie {STUDY_CODE} ({STUDY}) -> {DB_NAME}.{COLLECTION}, "
        f"{'DRY-RUN' if ARGS.dry_run else 'ZAPIS'}"
        f"{' [FULL]' if ARGS.full else ' [stop-at-known]'}"
        f"{f', limit {ARGS.limit}' if ARGS.limit else ''}.")

    col = None
    existing_ids = set()
    client = None
    try:
        if not ARGS.dry_run:
            client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
            client.admin.command("ping")
            col = client[DB_NAME][COLLECTION]
            col.create_index([("record_id", ASCENDING)], unique=True)
            for field in ("study", "site", "subject", "accession",
                          "posted", "collected"):
                col.create_index([(field, ASCENDING)])
            existing_ids = _load_existing_ids(col)
            log(f"START: v Mongo uz je {len(existing_ids)} reportu pro tuto studii.")
        else:
            # Even in dry-run load the stored ids, so we know what is really new.
            try:
                client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
                client.admin.command("ping")
                existing_ids = _load_existing_ids(client[DB_NAME][COLLECTION])
                log(f"START: [dry-run] v Mongo je {len(existing_ids)} reportu.")
            except Exception as e:
                log(f"START: [dry-run] Mongo nedostupne ({e!r}), beru vse jako nove.")

        with sync_playwright() as p:
            context = p.chromium.launch_persistent_context(
                user_data_dir=PROFILE_DIR,
                headless=False,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--start-maximized",
                    "--disable-restore-session-state",
                    "--disable-session-crashed-bubble",
                ],
                no_viewport=True,
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            try:
                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()
                log("START: prohlizec spusten.")

                login(page)
                total, row_height = open_grid(page)
                if ARGS.limit:
                    total = min(total, ARGS.limit)

                new_cnt = upd_cnt = same_cnt = 0
                failed = []
                stopped = False
                for idx in range(total):
                    try:
                        meta = read_row(page, idx, row_height)
                    except Exception as e:
                        failed.append(idx)
                        log(f"CHYBA cteni radku {idx}: {e!r} — pokracuji.")
                        continue

                    rid = make_record_id(meta)
                    known = rid in existing_ids

                    if known and not ARGS.full:
                        log(f">>> Radek {idx+1}/{total}: '{rid}' uz v Mongo "
                            f"-> stop-at-known, koncim (zbytek je starsi).")
                        stopped = True
                        break
                    if known and ARGS.full:
                        log(f" #{idx}: znamy, [full] preskakuji download.")
                        same_cnt += 1
                        continue

                    # new report
                    if ARGS.dry_run:
                        log(f" [DRY] NOVY #{idx}: {build_basename(meta)}")
                        new_cnt += 1
                        existing_ids.add(rid)
                        continue
                    try:
                        data = download_pdf_bytes(page, idx)
                        action = upsert(col, meta, rid, data, now)
                        # Only a stored report counts as known.
                        existing_ids.add(rid)
                        if action == "insert":
                            new_cnt += 1
                            log(f" #{idx}: INSERT ({len(data)//1024} KB) {build_basename(meta)}")
                        elif action == "update":
                            upd_cnt += 1
                            log(f" #{idx}: UPDATE {build_basename(meta)}")
                        else:
                            same_cnt += 1
                    except Exception as e:
                        failed.append(idx)
                        log(f"CHYBA stazeni/zapisu radku {idx}: {e!r} — pokracuji.")

                log(f"KONEC: nove={new_cnt}, update={upd_cnt}, beze zmeny={same_cnt}, "
                    f"chyby={len(failed)} {'(stop-at-known)' if stopped else '(projeto vse)'}.")
                if failed:
                    log(f"KONEC: SELHALY indexy: {failed}")
                    if not ARGS.full:
                        # A later stop-at-known run ends at the first stored row,
                        # which lies above the failed ones, so they would never
                        # be retried without --full.
                        log("KONEC: POZOR - stop-at-known neuspesne radky priste preskoci; "
                            "pro jejich doplneni spustte znovu s --full.")
            finally:
                context.close()
    finally:
        if client is not None:
            client.close()
|
||||
|
||||
|
||||
def _wait_for_enter():
    """Keep the console window open until the user presses Enter.

    A closed or missing stdin (EOFError) is ignored.
    """
    try:
        input("\n[Enter] pro zavreni tohoto okna...")
    except EOFError:
        pass


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Top-level boundary: report the crash with its traceback.
        log(f"FATAL: beh spadl: {e!r}")
        traceback.print_exc()
    finally:
        _wait_for_enter()
|
||||
Reference in New Issue
Block a user