# Source listing: janssen/Covance/Trash/download_lab_reports_v1.3.py
# (viewer metadata: 2026-06-16 14:32:28 +02:00, 409 lines, 16 KiB, Python)

# =============================================================================
# Název: download_lab_reports_v1.3.py
# Verze: 1.3
# Datum: 2026-06-16
# Popis: Stahuje PDF Lab Reports ze xsp.labcorp.com pro studii 77242113UCO3001
# (interni cislo 36940), filtrovane na 10 ceskych center (CZ),
# a uklada je PRIMO do MongoDB (db covance, kolekce labreports) —
# metadata z tabulky + skutecne PDF (inline Binary). Na disk NEUKLADA.
#
# SYNC MODE: controlled by the SYNC_MODE variable near the top of the script.
#   "delta"    = only NEW reports, keyed by the internal reportId (stop-at-known).
#                The list is sorted Posted DESC; scanning from the top, the first
#                already-stored reportId means everything below it is older and
#                already in Mongo.
#   "fullsync" = walk ALL rows and download those whose reportId is missing in
#                Mongo. Rows with a known reportId are skipped, so changes to
#                already-stored reports are NOT re-checked. Slower.
# The CLI overrides the variable: --delta / --fullsync.
#
# KLIC: record_id = reportId (z dat AG Gridu) — stabilni 32-hex ID
# dokumentu, NAPRIC vsemi radky UNIKATNI a perzistentni v case
# (overeno: stejne reportId vraci i jiny grid pro totez centrum).
# Resi pripad ruznych PDF se SHODNYMI viditelnymi metadaty.
#
# Zmeny v1.3: + SYNC_MODE promenna (delta/fullsync); oprava postedIso
# (drive se ukladal cely moment.js objekt -> ted cisty ISO).
# =============================================================================
from playwright.sync_api import sync_playwright
from datetime import datetime
from pymongo import MongoClient, ASCENDING
from bson.binary import Binary
import argparse
import hashlib
import json
import os
import re
import traceback
import urllib.parse
# ============================================================================
# SYNC MODE - set it here (the CLI flags --delta / --fullsync take precedence)
# ============================================================================
SYNC_MODE = "delta"  # "delta"    = only new reports (stop at the first known reportId)
                     # "fullsync" = walk every row and fetch those missing from Mongo
# ============================================================================


def _non_negative_int(text):
    """argparse ``type=`` callable: parse *text* as an int and reject negatives.

    A negative --limit would otherwise reach a list slice and silently drop
    rows from the end of the work list instead of limiting it.
    """
    value = int(text)  # ValueError is reported by argparse as an invalid value
    if value < 0:
        raise argparse.ArgumentTypeError(f"must be >= 0, got {value}")
    return value


parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) do MongoDB.")
parser.add_argument("--delta", action="store_true", help="vynutit rezim delta")
parser.add_argument("--fullsync", action="store_true", help="vynutit rezim fullsync")
parser.add_argument("--dry-run", action="store_true", help="nestahovat ani nepsat do DB; jen vypsat nove")
parser.add_argument("--limit", type=_non_negative_int, default=0, help="max N novych radku (0 = vse)")
ARGS = parser.parse_args()

# Mode resolution: CLI beats the variable. --delta is checked last, so it wins
# when both flags are given.
_mode = SYNC_MODE
if ARGS.fullsync:
    _mode = "fullsync"
if ARGS.delta:
    _mode = "delta"
# Fail loudly on a mistyped SYNC_MODE instead of silently running as delta.
if _mode not in ("delta", "fullsync"):
    parser.error(f"SYNC_MODE must be 'delta' or 'fullsync', got {_mode!r}")
FULLSYNC = (_mode == "fullsync")
def log(msg):
    """Print *msg* to stdout prefixed with the current wall-clock time [HH:MM:SS].

    Output is flushed immediately so progress is visible while the run is
    still in flight.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    line = "[{}] {}".format(stamp, msg)
    print(line, flush=True)
# --- configuration -----------------------------------------------------------
# SECURITY: the literal credentials below are committed in plain text. They are
# kept only as backward-compatible fallbacks; prefer supplying XSP_EMAIL /
# XSP_PASSWORD / MONGO_URI through the environment, then rotate the password
# and remove the literals from the source.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
LOGIN_URL = "https://xsp.labcorp.com/"
STUDY = "36940"                 # internal study number used in the portal URL
STUDY_CODE = "77242113UCO3001"  # study code used in file names and stored documents
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://192.168.1.76:27017")
DB_NAME = "covance"
COLLECTION = "labreports"
# Site identifiers passed to the portal as the `site` URL filter.
SITES = [
    "930539", "930547", "930555", "930556", "930553",
    "930549", "930525", "930536", "930557", "930531",
]
# The persistent browser profile lives next to this script.
_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile")
def lab_reports_url():
    """Return the Lab Reports page URL for STUDY, filtered to SITES.

    The site list is serialised as a compact JSON array (no spaces) and
    percent-encoded into the ``site`` query parameter.
    """
    compact_json = json.dumps(SITES, separators=(",", ":"))
    encoded_sites = urllib.parse.quote(compact_json)
    return "https://xsp.labcorp.com/sponsor/study/{}/lab-reports?site={}".format(
        STUDY, encoded_sites
    )
# --- formatting --------------------------------------------------------------
# English three-letter month abbreviation -> zero-padded month number.
_MONTHS = {
    abbr: f"{number:02d}"
    for number, abbr in enumerate(
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
        start=1,
    )
}
def safe(s: str) -> str:
    """Strip the characters \\ / : * ? " < > | from *s* and trim outer whitespace.

    A falsy input (``None`` or ``""``) yields an empty string.
    """
    text = s if s else ""
    cleaned = re.sub(r'[\\/:*?"<>|]', "", text)
    return cleaned.strip()
def fmt_date(s: str) -> str:
    """Convert 'Jun 10, 2026' or 'Jun 15, 2026 7:49 PM' to 'YYYY-MM-DD'.

    Only the leading "<Mon...> <day>, <year>" part is used; anything after the
    year is ignored. When the text does not match, or its first three letters
    are not a key of _MONTHS, the input is returned after passing through
    ``safe``.
    """
    match = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s+(\d{4})", s or "")
    if match is None:
        return safe(s)
    month_abbr, day, year = match.groups()
    month_no = _MONTHS.get(month_abbr[:3])
    if month_no is None:
        return safe(s)
    return f"{year}-{month_no}-{int(day):02d}"
def build_basename(meta: dict) -> str:
    """Compose the PDF file name for one report row.

    Reads the keys ``collected``, ``site``, ``subject``, ``visit``,
    ``accession`` and ``postedDisplay`` from *meta*. The space-separated stem
    is sanitised with ``safe`` before ".pdf" is appended.
    """
    stem = "{} {} {} {} {} {} posted {}".format(
        STUDY_CODE,
        fmt_date(meta['collected']),
        meta['site'],
        meta['subject'],
        meta['visit'],
        meta['accession'],
        fmt_date(meta['postedDisplay']),
    )
    return safe(stem) + ".pdf"
# --- JS helpers (AG Grid) ----------------------------------------------------
# JS_ALL_ROWS: runs in the page and returns one entry per displayed grid row.
# It looks for an `__agComponent` on the AG Grid wrapper elements, takes its
# `gridApi` (or the component itself), and walks getDisplayedRowAtIndex(0..n-1).
# Each entry has: rowIndex, reportId, fileId, serverFileName, postedIso, site,
# subject. The file link is the one with language 'English', else the first.
# Returns null when no grid component / API is found.
# postedDateTime is a moment.js object in the grid data, so it is converted to
# a plain ISO string (otherwise the whole moment object would be serialised).
JS_ALL_ROWS = r"""() => {
let holder = null;
for (const el of document.querySelectorAll('.ag-root-wrapper, .ag-root, ag-grid-angular')) {
if (el.__agComponent) { holder = el.__agComponent; break; }
}
if (!holder) return null;
const api = (holder.gridApi && holder.gridApi.forEachNode) ? holder.gridApi : holder;
if (!api || !api.getDisplayedRowCount) return null;
const toIso = v => {
if (v == null) return null;
if (typeof v === 'string') return v;
if (v._i && typeof v._i === 'string') return v._i; // puvodni serverove ISO s offsetem
if (typeof v.toISOString === 'function') { try { return v.toISOString(); } catch (e) {} }
return String(v);
};
const cnt = api.getDisplayedRowCount();
const out = [];
for (let i = 0; i < cnt; i++) {
const n = api.getDisplayedRowAtIndex(i);
if (!n || !n.data) continue;
const d = n.data;
const fl = (d.fileLinks || []).find(f => f.language === 'English') || (d.fileLinks || [])[0] || {};
out.push({
rowIndex: i,
reportId: d.reportId,
fileId: fl.fileId,
serverFileName: fl.fileName,
postedIso: toIso(d.postedDateTime),
site: d.siteNum,
subject: d.subjectNumber,
});
}
return out;
}"""
# JS_CELLS(idx): reads the rendered cell text of the DOM row whose `row-index`
# attribute equals idx and returns {type, accession, visit, collected,
# postedDisplay}; returns null when that row is not currently in the DOM.
# `dedup` collapses whitespace and, when the text is the same value repeated
# twice ("XX" or "X X"), returns a single copy.
JS_CELLS = r"""(idx) => {
const dedup = s => {
s = (s || '').replace(/\s+/g, ' ').trim();
const h = s.slice(0, Math.floor(s.length / 2));
if (s === h + h) return h;
const m = s.match(/^(.*?)\s+\1$/);
if (m) return m[1];
return s;
};
const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
if (!row) return null;
const get = id => { const c = row.querySelector('[col-id="' + id + '"]'); return c ? dedup(c.textContent) : ''; };
return {
type: get('type'),
accession: get('accessionNumber'),
visit: get('visit'),
collected: get('visitCollectionDate'),
postedDisplay: get('postedDateTime'),
};
}"""
# JS_SCROLL_TO([idx, rh]): scrolls the grid body viewport so that the offset
# idx * rh (row index times row height) sits about mid-viewport; clamped at 0.
# Does nothing when the viewport element is not found.
JS_SCROLL_TO = r"""(args) => {
const [idx, rh] = args;
const vp = document.querySelector('.ag-body-viewport');
if (!vp) return;
vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
}"""
# JS_ROW_HEIGHT(): height in pixels of the first rendered grid row; falls back
# to 25 when no row is rendered or the measured height is 0.
JS_ROW_HEIGHT = r"""() => {
const r = document.querySelector('.ag-body-container .ag-row');
return r ? r.getBoundingClientRect().height || 25 : 25;
}"""
# --- login -------------------------------------------------------------------
def login(page):
    """Sign in to the portal through the two-step email / password form.

    If the Email field does not become visible within 12 s the function
    assumes an existing session and returns without logging in. After
    submitting the password it waits (up to 60 s) for the redirect, then
    pauses 3 s before returning.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)

    email_box = page.get_by_label("Email")
    try:
        email_box.wait_for(state="visible", timeout=12000)
    except Exception:
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    # Step 1: email, then "Next".
    log("LOGIN: zadavam email...")
    email_box.fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    # Step 2: password, then "Verify".
    log("LOGIN: cekam na pole pro heslo...")
    password_box = page.get_by_label("Password")
    password_box.wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    password_box.fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    def _redirect_done(url):
        # Same predicate as before, written via De Morgan.
        return not ("code=" in url and "xsp." not in url)

    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        page.wait_for_url(_redirect_done, timeout=60000)
    except Exception:
        # Best effort: a timeout here is logged and the run continues.
        log("LOGIN: wait_for_url vyprsel, pokracuji.")
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
def open_grid(page, max_checks=25, poll_ms=2000):
    """Open the Lab Reports grid and read all rows once the row count settles.

    Navigates to ``lab_reports_url()``, waits up to 120 s for the first
    rendered row, then polls JS_ALL_ROWS until two consecutive polls report
    the same non-zero row count.

    Args:
        page: Playwright page to drive.
        max_checks: maximum number of polls (default 25, as before).
        poll_ms: pause between polls in milliseconds (default 2000, as before).

    Returns:
        ``(rows, row_height)``: the list from JS_ALL_ROWS (``[]`` when it
        returned nothing) and the value of JS_ROW_HEIGHT.
    """
    log(f"GRID: navigace na Lab Reports ({len(SITES)} center)...")
    page.goto(lab_reports_url())
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)

    rows = None
    prev = -1
    for attempt in range(1, max_checks + 1):
        rows = page.evaluate(JS_ALL_ROWS)
        cnt = len(rows) if rows else 0
        log(f" ...kontrola #{attempt}: rows={cnt}")
        if cnt > 0 and cnt == prev:
            break
        prev = cnt
        page.wait_for_timeout(poll_ms)
    else:
        # The count never stabilised: keep going as before, but say so,
        # because the row list may be partial.
        log(f"POZOR: pocet radku se behem {max_checks} kontrol neustalil "
            f"-> pokracuji s {len(rows) if rows else 0} radky (seznam muze byt neuplny).")

    row_height = page.evaluate(JS_ROW_HEIGHT)
    log(f"GRID: nacteno {len(rows) if rows else 0} radku, rowHeight={row_height}px.")
    return rows or [], row_height
def download_pdf_bytes(page, idx):
    """Click the "English" download link of grid row *idx* and return the PDF bytes.

    Args:
        page: Playwright page showing the grid.
        idx: value of the row's ``row-index`` attribute.

    Returns:
        The downloaded file content as ``bytes``.

    Raises:
        Whatever Playwright raises when the click or the download fails or
        times out (60 s), and ``OSError`` if the temporary file cannot be read.
    """
    link = page.locator(
        f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link',
        has_text="English",
    ).first
    with page.expect_download(timeout=60000) as dl:
        link.click()
    download = dl.value
    try:
        with open(download.path(), "rb") as fh:
            return fh.read()
    finally:
        # The script is meant to keep nothing on disk, so remove Playwright's
        # temporary copy right away instead of letting the files pile up
        # until the browser context is closed. Best effort: a cleanup failure
        # is logged and must not discard bytes that were read successfully.
        try:
            download.delete()
        except Exception as exc:
            log(f"POZOR: docasny soubor stazeni se nepodarilo smazat: {exc!r}")
def upsert(col, rec, cells, data, now):
    """Insert or refresh the Mongo document for one report.

    Args:
        col: collection the document is stored in.
        rec: row entry from JS_ALL_ROWS (reportId, site, subject, postedIso,
            fileId, serverFileName).
        cells: rendered cell texts from JS_CELLS (type, accession, visit,
            collected, postedDisplay).
        data: PDF content as bytes.
        now: timestamp string written to firstSeen / lastSeen.

    Returns:
        ``"insert"`` when no document with this record_id existed,
        ``"update"`` when the PDF hash or the visible fields differ from the
        stored ones (the previous state is pushed to ``history``),
        ``"same"`` otherwise (only ``lastSeen`` is refreshed).
    """
    # Visible table values exactly as displayed; used for change detection.
    visible = {
        "Type": cells["type"],
        "Subject": rec["subject"],
        "Accession": cells["accession"],
        "Visit": cells["visit"],
        "Collected Date": cells["collected"],
        "Site Number": rec["site"],
        "Posted": cells["postedDisplay"],
    }
    digest = hashlib.sha256(data).hexdigest()
    name_meta = dict(
        rec,
        accession=cells["accession"],
        visit=cells["visit"],
        collected=cells["collected"],
        postedDisplay=cells["postedDisplay"],
    )
    document = {
        "study": STUDY,
        "studyCode": STUDY_CODE,
        "type": cells["type"] or "Lab Result",
        "site": rec["site"],
        "subject": rec["subject"],
        "accession": cells["accession"],
        "visit": cells["visit"],
        "collected": fmt_date(cells["collected"]),
        "posted": cells["postedDisplay"],
        "postedIso": rec["postedIso"],
        "fileId": rec["fileId"],
        "serverFileName": rec["serverFileName"],
        "fields": visible,
        "fileName": build_basename(name_meta),
        "pdf": Binary(data),
        "pdfSize": len(data),
        "pdfSha256": digest,
    }
    report_id = rec["reportId"]
    stored = col.find_one(
        {"record_id": report_id},
        {"pdfSha256": 1, "fields": 1, "lastSeen": 1},
    )

    if stored is None:
        new_doc = {"record_id": report_id}
        new_doc.update(document)
        new_doc.update({"firstSeen": now, "lastSeen": now, "history": []})
        col.insert_one(new_doc)
        return "insert"

    selector = {"_id": stored["_id"]}
    unchanged = (stored.get("pdfSha256") == digest
                 and stored.get("fields") == visible)
    if unchanged:
        col.update_one(selector, {"$set": {"lastSeen": now}})
        return "same"

    # Something changed: archive the previous state, then overwrite.
    previous = {
        "date": stored.get("lastSeen"),
        "fields": stored.get("fields"),
        "pdfSha256": stored.get("pdfSha256"),
    }
    col.update_one(
        selector,
        {"$push": {"history": previous},
         "$set": dict(document, lastSeen=now)},
    )
    return "update"
def _short_id(rec):
    """Return the first 12 characters of the row's reportId for log lines (None-safe)."""
    return str(rec.get("reportId"))[:12]


def _plan_rows(rows, existing_ids):
    """Pick the rows to download: every unknown reportId (fullsync) or only those above the first known one (delta)."""
    todo = []
    for rec in rows:
        if rec["reportId"] in existing_ids:
            if FULLSYNC:
                continue  # fullsync: skip the known row and keep scanning
            log(f"DELTA stop-at-known: rowIndex {rec['rowIndex']} "
                f"(reportId {_short_id(rec)}…) uz v Mongo -> koncim (zbytek je starsi).")
            break  # delta: the first known row ends the scan
        todo.append(rec)
    return todo


def _sync_reports(context, col, existing_ids, now):
    """Log in, read the grid, then download and store every planned row."""
    context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    page = context.new_page()
    log("START: prohlizec spusten.")
    login(page)
    rows, row_height = open_grid(page)

    # SAFETY NET: keep only rows whose site starts with "CZ" (in case the URL
    # site filter did not apply).
    non_cz = [r for r in rows if not str(r["site"]).startswith("CZ")]
    if non_cz:
        log(f"POZOR: {len(non_cz)} ne-CZ radku v gridu (napr. {non_cz[0]['site']}) "
            f"-> filtruji jen CZ. Zkontroluj URL filtr center!")
        rows = [r for r in rows if str(r["site"]).startswith("CZ")]
    log(f"GRID: po CZ-pojistce {len(rows)} CZ radku.")

    todo = _plan_rows(rows, existing_ids)
    truncated = False
    # Only a positive limit truncates; a negative value must not reach the
    # slice, where it would drop rows from the end instead.
    if ARGS.limit > 0 and len(todo) > ARGS.limit:
        todo = todo[:ARGS.limit]
        truncated = True
    log(f"PLAN [{'FULLSYNC' if FULLSYNC else 'DELTA'}]: {len(todo)} novych radku ke stazeni "
        f"(z {len(rows)} v gridu, {len(existing_ids)} uz v Mongo).")

    new_cnt = upd_cnt = same_cnt = 0
    failed = []
    for k, rec in enumerate(todo, 1):
        idx = rec["rowIndex"]
        try:
            # Bring the row into view so it is rendered before reading it.
            page.evaluate(JS_SCROLL_TO, [idx, row_height])
            page.wait_for_selector(f'.ag-body-container .ag-row[row-index="{idx}"]', timeout=15000)
            page.wait_for_timeout(120)
            cells = page.evaluate(JS_CELLS, idx)
            if not cells:
                raise RuntimeError("nepodarilo se precist bunky radku")
            meta = {"site": rec["site"], "subject": rec["subject"],
                    "accession": cells["accession"], "visit": cells["visit"],
                    "collected": cells["collected"], "postedDisplay": cells["postedDisplay"]}
            fname = build_basename(meta)
            if ARGS.dry_run:
                log(f">>> {k}/{len(todo)} [DRY] NOVY rowIdx {idx} ({_short_id(rec)}…): {fname}")
                new_cnt += 1
                continue
            data = download_pdf_bytes(page, idx)
            action = upsert(col, rec, cells, data, now)
            existing_ids.add(rec["reportId"])
            if action == "insert":
                new_cnt += 1
                log(f">>> {k}/{len(todo)} INSERT rowIdx {idx} ({len(data)//1024} KB): {fname}")
            elif action == "update":
                upd_cnt += 1
                log(f">>> {k}/{len(todo)} UPDATE rowIdx {idx}: {fname}")
            else:
                same_cnt += 1
                log(f">>> {k}/{len(todo)} SAME rowIdx {idx}: {fname}")
        except Exception as e:
            # One bad row must not stop the run; _short_id keeps this handler
            # itself from raising when reportId is missing.
            failed.append(idx)
            log(f"CHYBA rowIdx {idx} ({_short_id(rec)}…): {e!r} — pokracuji.")

    log(f"KONEC: nove={new_cnt}, update={upd_cnt}, beze zmeny={same_cnt}, chyby={len(failed)}.")
    if failed:
        log(f"KONEC: SELHALY rowIndexy: {failed}")
    if not FULLSYNC and not ARGS.dry_run and (failed or truncated):
        # Delta stops at the first known reportId, so rows left behind below a
        # stored one are not reached by later delta runs.
        log("POZOR: v rezimu DELTA zustaly nestazene radky (chyby/limit); dalsi delta beh je "
            "muze preskocit (stop-at-known) -> spust --fullsync.")


def main():
    """Run one synchronisation pass: grid rows -> PDF download -> MongoDB.

    Connects to MongoDB (ping; indexes are created unless --dry-run), loads
    the record_ids already stored for STUDY, starts a persistent Chromium
    context and hands over to ``_sync_reports``. The browser context and the
    Mongo client are closed even when the run fails part-way.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log(f"START: studie {STUDY_CODE} ({STUDY}) -> {DB_NAME}.{COLLECTION}, "
        f"rezim={'FULLSYNC' if FULLSYNC else 'DELTA'}"
        f"{', DRY-RUN' if ARGS.dry_run else ''}"
        f"{f', limit {ARGS.limit}' if ARGS.limit else ''}.")
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        col = None
        if not ARGS.dry_run:
            col = client[DB_NAME][COLLECTION]
            col.create_index([("record_id", ASCENDING)], unique=True)
            for field in ("study", "site", "subject", "accession", "postedIso", "fileId"):
                col.create_index([(field, ASCENDING)])
        existing_ids = {d["record_id"] for d in
                        client[DB_NAME][COLLECTION].find({"study": STUDY}, {"record_id": 1})}
        log(f"START: v Mongo je {len(existing_ids)} reportu pro tuto studii.")
        with sync_playwright() as p:
            context = p.chromium.launch_persistent_context(
                user_data_dir=PROFILE_DIR,
                headless=False,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--start-maximized",
                    "--disable-restore-session-state",
                    "--disable-session-crashed-bubble",
                ],
                no_viewport=True,
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            try:
                _sync_reports(context, col, existing_ids, now)
            finally:
                context.close()
    finally:
        client.close()
# Script entry point: run main(), log any fatal error with its traceback, and
# exit with a non-zero status so schedulers and wrapper scripts can detect the
# failure (previously the process exited with status 0 after a crash).
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        log(f"FATAL: beh spadl: {e!r}")
        traceback.print_exc()
        raise SystemExit(1)