468 lines
18 KiB
Python
468 lines
18 KiB
Python
# =============================================================================
|
|
# Název: download_lab_reports_v1.4.py
|
|
# Verze: 1.4
|
|
# Datum: 2026-06-16
|
|
# Popis: Stahuje PDF Lab Reports ze xsp.labcorp.com do MongoDB
|
|
# (db covance, kolekce labreports) — metadata z tabulky + skutecne PDF
|
|
# (inline Binary). Na disk NEUKLADA.
|
|
#
|
|
# VICE STUDII: nahore je seznam STUDIES — kazda polozka ma "link"
|
|
# (lab-reports URL s filtrem center) a "enabled". Skript zpracuje
|
|
# vsechny enabled polozky. Z linku se odvodi interni cislo studie
|
|
# (/study/{N}/) i mnozina center (site=[...]); "lidsky" kod studie
|
|
# (napr. 77242113UCO3001) se cte z hlavicky portalu. Do Monga se pise
|
|
# study (interni) + studyCode (z hlavicky).
|
|
#
|
|
# REZIM SYNC_MODE: "delta" (jen nove pres reportId, stop-at-known)
|
|
# nebo "fullsync" (projit vse). CLI prepise: --delta/--fullsync.
|
|
#
|
|
# KLIC: record_id = reportId (stabilni 32-hex ID dokumentu, globalne
|
|
# unikatni). Jedna kolekce pro vsechny studie; delta filtruje per study.
|
|
#
|
|
# POJISTKA center: zpracuji se jen radky, jejichz siteId je v mnozine
|
|
# center z linku (study/zeme-agnosticke).
|
|
#
|
|
# Zmeny v1.4: seznam STUDIES (link+enabled) misto pevneho STUDY/SITES;
|
|
# studyCode z hlavicky; site-pojistka pres siteId z linku.
|
|
# =============================================================================
|
|
from playwright.sync_api import sync_playwright
|
|
from datetime import datetime
|
|
from pymongo import MongoClient, ASCENDING
|
|
from bson.binary import Binary
|
|
import argparse
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
import traceback
|
|
import urllib.parse
|
|
|
|
# ============================================================================
# STUDIES TO DOWNLOAD — one entry per study.
#   note    : free-text label, used only in log output
#   link    : lab-reports URL of the study including the site filter
#             (the internal study number and the site ids are parsed from it)
#   enabled : only entries with a truthy value are processed
# Adding another study = adding another entry.
# ============================================================================
STUDIES = [
    {
        "note": "77242113UCO3001 (UC)",
        "link": "https://xsp.labcorp.com/sponsor/study/36940/lab-reports?site=%5B%22930539%22,%22930547%22,%22930555%22,%22930556%22,%22930553%22,%22930549%22,%22930525%22,%22930536%22,%22930557%22,%22930531%22%5D",
        "enabled": True,
    },
    {
        "note": "42847922MDD3003 (MDD)",
        "link": "https://xsp.labcorp.com/sponsor/study/35472/lab-reports?site=%5B%22898727%22,%22898733%22,%22898739%22,%22898745%22,%22898744%22%5D",
        "enabled": True,
    },
]
|
|
|
|
# ============================================================================
# SYNC MODE — the CLI flags --delta / --fullsync take precedence over SYNC_MODE
# ============================================================================
SYNC_MODE = "delta"     # "delta"    = only new reports; stop at the first reportId already stored
                        # "fullsync" = walk every grid row and process the ones not stored yet
                        # NOTE(review): rows whose reportId is already stored are skipped in
                        # both modes, so already-stored reports are never re-downloaded.

HEADLESS = True         # True  = Playwright runs without a window (headless),
                        # False = visible browser window
# ============================================================================


def _non_negative_int(text):
    """argparse ``type=`` callable: parse *text* as an int and reject negatives.

    A negative --limit would otherwise reach ``todo[:limit]`` and silently
    drop rows from the END of the work list instead of limiting it.
    """
    value = int(text)
    if value < 0:
        raise argparse.ArgumentTypeError(f"musi byt cele cislo >= 0 (zadano {text})")
    return value


parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) do MongoDB.")
# The two mode flags contradict each other, so argparse rejects the combination
# instead of letting one of them win silently.
_mode_group = parser.add_mutually_exclusive_group()
_mode_group.add_argument("--delta", action="store_true", help="vynutit rezim delta")
_mode_group.add_argument("--fullsync", action="store_true", help="vynutit rezim fullsync")
parser.add_argument("--dry-run", action="store_true", help="nestahovat ani nepsat do DB; jen vypsat nove")
parser.add_argument("--limit", type=_non_negative_int, default=0, help="max N novych radku NA STUDII (0 = vse)")
ARGS = parser.parse_args()

# Resolve the effective mode: CLI flag first, SYNC_MODE constant otherwise.
if ARGS.fullsync:
    _mode = "fullsync"
elif ARGS.delta:
    _mode = "delta"
else:
    _mode = SYNC_MODE
if _mode not in ("delta", "fullsync"):
    # A typo in SYNC_MODE used to fall back to delta without any hint.
    parser.error(f"neznamy SYNC_MODE {_mode!r} (povoleno: 'delta', 'fullsync')")
FULLSYNC = (_mode == "fullsync")
|
|
|
|
|
|
def log(msg):
    """Print *msg* to stdout prefixed with the local time as ``[HH:MM:SS]``.

    The output is flushed immediately so progress stays visible when stdout
    is redirected or piped.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}", flush=True)
|
|
|
|
|
|
# --- configuration ------------------------------------------------------------
# EMAIL, PASSWORD and MONGO_URI can be overridden through the environment
# variables XSP_EMAIL, XSP_PASSWORD and XSP_MONGO_URI, so that deployments do
# not have to edit this file.
#
# SECURITY NOTE(review): the literal fallbacks below keep a working login in
# the source file (and in its version history). They are retained only so that
# existing runs keep working unchanged; rotate the password, provide it via
# XSP_PASSWORD, and then delete the literal.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
LOGIN_URL = "https://xsp.labcorp.com/"

MONGO_URI = os.environ.get("XSP_MONGO_URI", "mongodb://192.168.1.76:27017")
DB_NAME = "covance"
COLLECTION = "labreports"

# The persistent browser profile lives next to this script, so the portal
# session is kept between runs.
_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile")
|
|
|
|
|
|
def parse_link(link: str):
    """Extract the internal study number and the site ids from a lab-reports link.

    Args:
        link: lab-reports URL such as
            ``https://.../sponsor/study/36940/lab-reports?site=["930539","930547"]``
            (the ``site`` value is normally a URL-encoded JSON array).

    Returns:
        ``(study, sites)`` where *study* is the digits found after ``/study/``
        (``None`` when the path has no such segment) and *sites* is a list of
        site ids as strings (empty when there is no ``site`` parameter).
    """
    study_match = re.search(r"/study/(\d+)/", link)
    study = study_match.group(1) if study_match else None

    query = urllib.parse.urlparse(link).query
    raw = urllib.parse.parse_qs(query).get("site", [None])[0]
    if not raw:
        return study, []

    try:
        parsed = json.loads(raw)
    except ValueError:
        # Not JSON at all (e.g. "930539,930547"): take every run of digits.
        return study, re.findall(r"\d+", raw)

    if isinstance(parsed, list):
        return study, [str(site) for site in parsed]

    # Valid JSON but not an array (a bare string or number). Iterating a JSON
    # string would yield its individual characters, so fall back to pulling
    # the digit runs out of the raw text instead.
    return study, re.findall(r"\d+", raw)
|
|
|
|
|
|
# --- formatting ---------------------------------------------------------------
# Three-letter English month abbreviation -> zero-padded month number ("Jan" -> "01").
_MONTHS = {
    abbreviation: f"{number:02d}"
    for number, abbreviation in enumerate(
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
        start=1,
    )
}
|
|
|
|
|
|
# Characters removed by safe(): \ / : * ? " < > |
_UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def safe(s: str) -> str:
    """Return *s* with the characters ``\\ / : * ? " < > |`` removed and the ends stripped.

    A falsy *s* (``None`` or ``""``) yields an empty string.
    """
    text = s if s else ""
    cleaned = _UNSAFE_NAME_CHARS.sub("", text)
    return cleaned.strip()
|
|
|
|
|
|
def fmt_date(s: str) -> str:
    """Convert a date such as ``"Jun 5, 2026"`` to ``"2026-06-05"``.

    The text must start (after optional whitespace) with an English month name
    of at least three letters, then the day, a comma and a four-digit year.
    The month is matched case-insensitively ("JUN", "jun" and "June" all
    work). Anything that does not fit that shape is returned through
    ``safe()`` unchanged apart from the character clean-up.
    """
    m = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s+(\d{4})", s or "")
    if m:
        # The pattern accepts any letter case while _MONTHS is keyed "Jan",
        # "Feb", ... — normalise the captured prefix before the lookup.
        month = _MONTHS.get(m.group(1).capitalize())
        if month:
            return f"{m.group(3)}-{month}-{int(m.group(2)):02d}"
    return safe(s)
|
|
|
|
|
|
def build_basename(study_code: str, meta: dict) -> str:
    """Compose the PDF file name for one report.

    The name is the space-joined sequence: study code, collection date, site,
    subject, visit, accession, the word ``posted`` and the posted date (both
    dates via ``fmt_date``), cleaned with ``safe()`` and suffixed with ``.pdf``.

    *meta* must provide the keys ``collected``, ``site``, ``subject``,
    ``visit``, ``accession`` and ``postedDisplay``.
    """
    pieces = (
        study_code,
        fmt_date(meta["collected"]),
        meta["site"],
        meta["subject"],
        meta["visit"],
        meta["accession"],
        "posted",
        fmt_date(meta["postedDisplay"]),
    )
    joined = " ".join(str(piece) for piece in pieces)
    return safe(joined) + ".pdf"
|
|
|
|
|
|
# --- JS helpers (AG Grid) -----------------------------------------------------
# JS_ALL_ROWS: evaluated in the page with no arguments.
# Looks for a grid element carrying `__agComponent`, obtains the grid API from
# it and returns one object per displayed row that has data:
#   rowIndex, reportId, fileId, serverFileName, postedIso, site, siteId, subject.
# The file link is the entry with language 'English', otherwise the first one.
# Returns null when no grid component / API is found.
# (The script text must not contain anything but JavaScript: the stray "|"
# separator lines that had leaked into this string made it unparsable.)
JS_ALL_ROWS = r"""() => {
    let holder = null;
    for (const el of document.querySelectorAll('.ag-root-wrapper, .ag-root, ag-grid-angular')) {
        if (el.__agComponent) { holder = el.__agComponent; break; }
    }
    if (!holder) return null;
    const api = (holder.gridApi && holder.gridApi.forEachNode) ? holder.gridApi : holder;
    if (!api || !api.getDisplayedRowCount) return null;
    const toIso = v => {
        if (v == null) return null;
        if (typeof v === 'string') return v;
        if (v._i && typeof v._i === 'string') return v._i;
        if (typeof v.toISOString === 'function') { try { return v.toISOString(); } catch (e) {} }
        return String(v);
    };
    const cnt = api.getDisplayedRowCount();
    const out = [];
    for (let i = 0; i < cnt; i++) {
        const n = api.getDisplayedRowAtIndex(i);
        if (!n || !n.data) continue;
        const d = n.data;
        const fl = (d.fileLinks || []).find(f => f.language === 'English') || (d.fileLinks || [])[0] || {};
        out.push({
            rowIndex: i,
            reportId: d.reportId,
            fileId: fl.fileId,
            serverFileName: fl.fileName,
            postedIso: toIso(d.postedDateTime),
            site: d.siteNum,
            siteId: String(d.siteId),
            subject: d.subjectNumber,
        });
    }
    return out;
}"""
|
|
|
|
# JS_CELLS: evaluated with a row index. Reads the rendered cell texts of that
# row (type, accession, visit, collected, postedDisplay) by their col-id.
# `dedup` collapses whitespace and, when the text is the same value written
# twice ("X X" or "XX"), returns the single value. Returns null when the row
# is not in the DOM.
# (The script text must not contain anything but JavaScript: the stray "|"
# separator lines that had leaked into this string made it unparsable.)
JS_CELLS = r"""(idx) => {
    const dedup = s => {
        s = (s || '').replace(/\s+/g, ' ').trim();
        const h = s.slice(0, Math.floor(s.length / 2));
        if (s === h + h) return h;
        const m = s.match(/^(.*?)\s+\1$/);
        if (m) return m[1];
        return s;
    };
    const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
    if (!row) return null;
    const get = id => { const c = row.querySelector('[col-id="' + id + '"]'); return c ? dedup(c.textContent) : ''; };
    return {
        type: get('type'),
        accession: get('accessionNumber'),
        visit: get('visit'),
        collected: get('visitCollectionDate'),
        postedDisplay: get('postedDateTime'),
    };
}"""
|
|
|
|
# JS_SCROLL_TO: evaluated with [rowIndex, rowHeight]. Sets the scroll position
# of `.ag-body-viewport` so that the row's offset (index * height) lands about
# half a viewport below the top; does nothing when the viewport is missing.
# (The script text must not contain anything but JavaScript: the stray "|"
# separator lines that had leaked into this string made it unparsable.)
JS_SCROLL_TO = r"""(args) => {
    const [idx, rh] = args;
    const vp = document.querySelector('.ag-body-viewport');
    if (!vp) return;
    vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
}"""
|
|
|
|
# JS_ROW_HEIGHT: returns the rendered height of the first grid row, or 25 when
# no row exists or the measured height is 0.
# (The script text must not contain anything but JavaScript: the stray "|"
# separator lines that had leaked into this string made it unparsable.)
JS_ROW_HEIGHT = r"""() => {
    const r = document.querySelector('.ag-body-container .ag-row');
    return r ? r.getBoundingClientRect().height || 25 : 25;
}"""
|
|
|
|
# JS_STUDY_CODE: returns the first token in the page text shaped like a study
# code (e.g. "77242113UCO3001"): 7+ digits, 2-4 capital letters, 3+ digits;
# null when nothing matches.
# (The script text must not contain anything but JavaScript: the stray "|"
# separator lines that had leaked into this string made it unparsable.)
JS_STUDY_CODE = r"""() => {
    const txt = (document.body.innerText || '');
    const m = txt.match(/\b\d{7,}[A-Z]{2,4}\d{3,}\b/);
    return m ? m[0] : null;
}"""
|
|
|
|
|
|
# --- login --------------------------------------------------------------------
def login(page):
    """Sign in to the portal on *page* unless a session is already active.

    Opens LOGIN_URL and waits up to 12 s for the "Email" field. If it never
    becomes visible the function logs that the login is skipped and returns.
    Otherwise it submits EMAIL ("Next"), then PASSWORD ("Verify"), waits for
    the post-login redirect (a timeout there is logged and ignored) and
    finally pauses 3 s.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)

    email_field = page.get_by_label("Email")
    try:
        email_field.wait_for(state="visible", timeout=12000)
    except Exception:
        # No login form within the timeout: treated as "already signed in".
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    log("LOGIN: zadavam email...")
    email_field.fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    log("LOGIN: cekam na pole pro heslo...")
    password_field = page.get_by_label("Password")
    password_field.wait_for(state="visible", timeout=30000)

    log("LOGIN: zadavam heslo...")
    password_field.fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")

    def _redirect_done(url):
        # Same predicate as before: no "code=" in the URL, or an "xsp." URL.
        return "code=" not in url or "xsp." in url

    try:
        page.wait_for_url(_redirect_done, timeout=60000)
    except Exception:
        log("LOGIN: wait_for_url vyprsel, pokracuji.")

    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
|
|
|
|
|
|
def open_grid(page, link):
    """Open the lab-reports grid at *link* and read all of its rows.

    After the first ``.ag-row`` appears (up to 120 s) the row list is polled
    with JS_ALL_ROWS, at most 25 times with 2 s pauses, until two consecutive
    polls report the same non-zero row count.

    Returns:
        ``(rows, row_height, study_code)`` — the row dicts from the last poll
        (``[]`` when none were read), the result of JS_ROW_HEIGHT and the
        result of JS_STUDY_CODE (``None`` when no code was found).
    """
    log("GRID: navigace na Lab Reports...")
    page.goto(link)
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)

    rows = None
    last_count = -1
    for attempt in range(1, 26):
        rows = page.evaluate(JS_ALL_ROWS)
        count = len(rows) if rows else 0
        log(f" ...kontrola #{attempt}: rows={count}")
        if count > 0 and count == last_count:
            # Same non-zero count twice in a row -> the grid has settled.
            break
        last_count = count
        page.wait_for_timeout(2000)

    row_height = page.evaluate(JS_ROW_HEIGHT)
    study_code = page.evaluate(JS_STUDY_CODE)
    total = len(rows) if rows else 0
    log(f"GRID: nacteno {total} radku, rowHeight={row_height}px, studyCode={study_code}.")
    return rows or [], row_height, study_code
|
|
|
|
|
|
def download_pdf_bytes(page, idx):
    """Click the download link of grid row *idx* and return the file's bytes.

    The link containing the text "English" is preferred. When the row has
    download links but none of them is English, the first link is used — the
    same fallback JS_ALL_ROWS applies when it picks fileId / serverFileName,
    so the stored metadata and the stored PDF refer to the same file.

    The temporary file Playwright writes for the download is removed once it
    has been read; nothing is kept on disk.

    Raises:
        Whatever Playwright raises when the row has no download link within
        30 s, the download does not start within 60 s, or the download fails.
    """
    selector = f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link'

    # Wait until the row's links exist before deciding which one to click,
    # otherwise a not-yet-rendered English link would look like "missing".
    any_link = page.locator(selector)
    any_link.first.wait_for(state="attached", timeout=30000)
    english = page.locator(selector, has_text="English")
    target = english.first if english.count() else any_link.first

    with page.expect_download(timeout=60000) as dl:
        target.click()
    download = dl.value
    try:
        with open(download.path(), "rb") as f:
            return f.read()
    finally:
        # Best-effort clean-up of the temp file; a failure here must not mask
        # the result (or the original error) of the download itself.
        try:
            download.delete()
        except Exception:
            pass
|
|
|
|
|
|
def upsert(col, study, study_code, rec, cells, data, now):
    """Store one report (metadata + PDF bytes) in *col*, keyed by ``record_id``.

    Args:
        col: MongoDB collection.
        study: internal study number.
        study_code: study code used in the generated file name.
        rec: row dict from JS_ALL_ROWS (reportId, fileId, serverFileName,
            postedIso, site, subject).
        cells: cell texts from JS_CELLS (type, accession, visit, collected,
            postedDisplay).
        data: PDF content as bytes.
        now: timestamp written to firstSeen / lastSeen.

    Returns:
        ``"insert"`` when no document with this reportId existed,
        ``"update"`` when the stored PDF hash or ``fields`` differ (the
        previous lastSeen / fields / hash are appended to ``history``),
        ``"same"`` otherwise (only ``lastSeen`` is refreshed).
    """
    # Values exactly as displayed in the grid; used for change detection.
    displayed = {
        "Type": cells["type"],
        "Subject": rec["subject"],
        "Accession": cells["accession"],
        "Visit": cells["visit"],
        "Collected Date": cells["collected"],
        "Site Number": rec["site"],
        "Posted": cells["postedDisplay"],
    }
    digest = hashlib.sha256(data).hexdigest()

    # Input for the generated file name.
    name_parts = {
        "site": rec["site"],
        "subject": rec["subject"],
        "accession": cells["accession"],
        "visit": cells["visit"],
        "collected": cells["collected"],
        "postedDisplay": cells["postedDisplay"],
    }

    document = {
        "study": study,
        "studyCode": study_code,
        "type": cells["type"] or "Lab Result",
        "site": rec["site"],
        "subject": rec["subject"],
        "accession": cells["accession"],
        "visit": cells["visit"],
        "collected": fmt_date(cells["collected"]),
        "posted": cells["postedDisplay"],
        "postedIso": rec["postedIso"],
        "fileId": rec["fileId"],
        "serverFileName": rec["serverFileName"],
        "fields": displayed,
        "fileName": build_basename(study_code, name_parts),
        "pdf": Binary(data),
        "pdfSize": len(data),
        "pdfSha256": digest,
    }

    report_id = rec["reportId"]
    previous = col.find_one(
        {"record_id": report_id},
        {"pdfSha256": 1, "fields": 1, "lastSeen": 1},
    )

    if previous is None:
        col.insert_one({
            "record_id": report_id,
            **document,
            "firstSeen": now,
            "lastSeen": now,
            "history": [],
        })
        return "insert"

    by_id = {"_id": previous["_id"]}
    unchanged = (previous.get("pdfSha256") == digest
                 and previous.get("fields") == displayed)
    if unchanged:
        col.update_one(by_id, {"$set": {"lastSeen": now}})
        return "same"

    # Keep a snapshot of what was stored before overwriting it.
    snapshot = {
        "date": previous.get("lastSeen"),
        "fields": previous.get("fields"),
        "pdfSha256": previous.get("pdfSha256"),
    }
    col.update_one(
        by_id,
        {"$push": {"history": snapshot},
         "$set": {**document, "lastSeen": now}},
    )
    return "update"
|
|
|
|
|
|
def process_study(page, col, study_item, now):
    """Download and store the new lab reports of one STUDIES entry.

    Steps: parse study / sites from the link, load the reportIds already
    stored for the study, read the grid, keep only rows of the requested
    sites, select the rows to process, then for each row read its cells,
    download the PDF and upsert it (dry-run: only log what would be done).

    Args:
        page: Playwright page with an authenticated session.
        col: target collection, or ``None`` in dry-run (a temporary read-only
            connection is then opened just to load the known reportIds).
        study_item: one STUDIES entry (``link``, optional ``note``).
        now: timestamp passed on to ``upsert``.

    Returns:
        ``{"new": n, "upd": n, "same": n, "fail": n}``. Errors of a single
        row are logged and counted in ``fail``; they do not abort the study.

    NOTE(review): rows are addressed by the rowIndex captured when the grid
    was read; nothing re-checks that the row at that index still belongs to
    the same reportId. Delta mode additionally assumes the grid lists newest
    reports first (it stops at the first known reportId).
    """
    study, sites = parse_link(study_item["link"])
    site_set = set(sites)
    log(f"\n=== STUDIE {study_item.get('note','')} (interni {study}, {len(sites)} center) ===")
    if not study or not sites:
        log(f"PRESKAKUJI: z linku nelze odvodit study/sites: {study_item['link']}")
        return {"new": 0, "upd": 0, "same": 0, "fail": 0}

    # reportIds already stored for this study (needed in dry-run too, to know
    # what is really new). A connection opened only for this read is closed
    # again right away instead of being leaked once per study.
    temp_client = None
    if col is not None:
        read_col = col
    else:
        temp_client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        read_col = temp_client[DB_NAME][COLLECTION]
    try:
        existing_ids = {d["record_id"] for d in read_col.find({"study": study}, {"record_id": 1})}
    finally:
        if temp_client is not None:
            temp_client.close()
    log(f"START: v Mongo je {len(existing_ids)} reportu pro studii {study}.")

    rows, row_height, study_code = open_grid(page, study_item["link"])
    if not study_code:
        study_code = f"study-{study}"
        log(f"POZOR: studyCode z hlavicky nenalezen -> pouzivam '{study_code}'.")

    # SAFETY NET: only sites named in the link (matched by internal siteId).
    bad = [r for r in rows if r["siteId"] not in site_set]
    if bad:
        log(f"POZOR: {len(bad)} radku mimo pozadovana centra (napr. siteId {bad[0]['siteId']}) "
            f"-> filtruji. Zkontroluj link!")
    rows = [r for r in rows if r["siteId"] in site_set]
    log(f"GRID: po pojistce {len(rows)} radku v pozadovanych centrech.")

    # A row without reportId cannot be keyed in Mongo and would crash the
    # log lines below (slicing None), taking the whole study down with it.
    unidentified = [r for r in rows if not r.get("reportId")]
    if unidentified:
        log(f"POZOR: {len(unidentified)} radku bez reportId -> preskakuji.")
        rows = [r for r in rows if r.get("reportId")]

    # Select rows according to the mode.
    todo = []
    for rec in rows:
        if rec["reportId"] in existing_ids:
            if FULLSYNC:
                continue
            log(f"DELTA stop-at-known: rowIndex {rec['rowIndex']} ({rec['reportId'][:12]}…) "
                f"uz v Mongo -> koncim (zbytek je starsi).")
            break
        todo.append(rec)
    # Only a positive limit truncates; a negative value would otherwise cut
    # rows off the END of the list.
    if ARGS.limit > 0:
        todo = todo[:ARGS.limit]
    log(f"PLAN [{'FULLSYNC' if FULLSYNC else 'DELTA'}]: {len(todo)} novych radku "
        f"(z {len(rows)} v gridu, {len(existing_ids)} uz v Mongo).")

    new_cnt = upd_cnt = same_cnt = 0
    failed = []
    for k, rec in enumerate(todo, 1):
        idx = rec["rowIndex"]
        try:
            # The grid only renders rows near the viewport: scroll first,
            # then wait for the row element before reading its cells.
            page.evaluate(JS_SCROLL_TO, [idx, row_height])
            page.wait_for_selector(f'.ag-body-container .ag-row[row-index="{idx}"]', timeout=15000)
            page.wait_for_timeout(120)
            cells = page.evaluate(JS_CELLS, idx)
            if not cells:
                raise RuntimeError("nepodarilo se precist bunky radku")
            meta = {"site": rec["site"], "subject": rec["subject"], "accession": cells["accession"],
                    "visit": cells["visit"], "collected": cells["collected"],
                    "postedDisplay": cells["postedDisplay"]}
            fname = build_basename(study_code, meta)

            if ARGS.dry_run:
                log(f">>> {k}/{len(todo)} [DRY] NOVY rowIdx {idx} ({rec['reportId'][:12]}…): {fname}")
                new_cnt += 1
                continue

            data = download_pdf_bytes(page, idx)
            action = upsert(col, study, study_code, rec, cells, data, now)
            existing_ids.add(rec["reportId"])
            if action == "insert":
                new_cnt += 1
                log(f">>> {k}/{len(todo)} INSERT rowIdx {idx} ({len(data)//1024} KB): {fname}")
            elif action == "update":
                upd_cnt += 1
                log(f">>> {k}/{len(todo)} UPDATE rowIdx {idx}: {fname}")
            else:
                same_cnt += 1
                log(f">>> {k}/{len(todo)} SAME rowIdx {idx}: {fname}")
        except Exception as e:
            # One broken row must not stop the rest of the study.
            failed.append(idx)
            log(f"CHYBA rowIdx {idx} ({rec['reportId'][:12]}…): {e!r} — pokracuji.")

    log(f"STUDIE {study} HOTOVO: nove={new_cnt}, update={upd_cnt}, beze zmeny={same_cnt}, chyby={len(failed)}.")
    if failed:
        log(f"STUDIE {study}: SELHALY rowIndexy: {failed}")
    return {"new": new_cnt, "upd": upd_cnt, "same": same_cnt, "fail": len(failed)}
|
|
|
|
|
|
def main():
    """Run one synchronisation pass over every enabled STUDIES entry.

    Verifies that MongoDB is reachable (ping), creates the indexes unless in
    dry-run, starts Chromium with the persistent profile, logs in and calls
    ``process_study`` for each enabled study. A failing study is logged and
    the next one is processed. The browser context and the Mongo client are
    closed on every exit path, including errors.

    Raises:
        Anything raised while connecting to MongoDB, launching the browser
        or logging in.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    enabled = [s for s in STUDIES if s.get("enabled")]
    log(f"START: rezim={'FULLSYNC' if FULLSYNC else 'DELTA'}"
        f"{', DRY-RUN' if ARGS.dry_run else ''}"
        f"{f', limit {ARGS.limit}/studie' if ARGS.limit else ''}; "
        f"{len(enabled)}/{len(STUDIES)} studii enabled.")

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        # Fail fast, before the browser is started, when Mongo is unreachable.
        client.admin.command("ping")

        col = None
        if not ARGS.dry_run:
            col = client[DB_NAME][COLLECTION]
            col.create_index([("record_id", ASCENDING)], unique=True)
            for f in ("study", "site", "subject", "accession", "postedIso", "fileId"):
                col.create_index([(f, ASCENDING)])

        with sync_playwright() as p:
            context = p.chromium.launch_persistent_context(
                user_data_dir=PROFILE_DIR,
                headless=HEADLESS,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--start-maximized",
                    "--disable-restore-session-state",
                    "--disable-session-crashed-bubble",
                ],
                no_viewport=True,
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            try:
                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()
                log("START: prohlizec spusten.")

                login(page)

                totals = {"new": 0, "upd": 0, "same": 0, "fail": 0}
                for study_item in enabled:
                    try:
                        r = process_study(page, col, study_item, now)
                        for kk in totals:
                            totals[kk] += r[kk]
                    except Exception as e:
                        # One failing study must not prevent the others.
                        log(f"CHYBA studie {study_item.get('note','')}: {e!r} — pokracuji dalsi.")
                log(f"\nKONEC (vse): nove={totals['new']}, update={totals['upd']}, "
                    f"beze zmeny={totals['same']}, chyby={totals['fail']}.")
            finally:
                # Close the context even after an error (e.g. a failed login)
                # so the persistent profile is shut down cleanly.
                context.close()
    finally:
        client.close()
|
|
|
|
|
|
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        log(f"FATAL: beh spadl: {e!r}")
        traceback.print_exc()
        # Exit with a non-zero status so a scheduler or wrapper script can
        # tell a crashed run from a successful one (it used to exit with 0).
        raise SystemExit(1)
|