# janssen/IWRS/Patients/download_subject_details.py

from playwright.sync_api import sync_playwright
import os
import glob
import datetime

import pandas as pd

# ── CONFIG ──────────────────────────────────────────────────────────────────
# Root URL of the portal; all pages visited by this script are built from it.
BASE_URL = "https://janssen.4gclinical.com"

# Login credentials. They can be overridden through the IWRS_EMAIL and
# IWRS_PASSWORD environment variables so they need not live in the source.
# NOTE(review): the literal fallbacks are kept only so existing runs keep
# working. A password committed to source should be rotated and the
# fallbacks removed once the environment variables are in place.
EMAIL    = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")

# Study identifiers processed by main(), in this order.
STUDIES = ["77242113UCO3001", "42847922MDD3003"]

# All folders are resolved relative to the directory containing this script.
BASE_DIR     = os.path.dirname(os.path.abspath(__file__))
# Folder searched for the "Subject Summary Report" workbooks.
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
# Folder under which per-study Subject Detail downloads are saved.
DETAILS_DIR  = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
# ────────────────────────────────────────────────────────────────────────────


def get_subjects(study):
    """Return the subject IDs listed in today's Subject Summary Report.

    The most recently modified ``* <study> Subject Summary Report.xlsx`` file
    in ``INCOMING_DIR`` is used; files whose name starts with ``~$`` are
    ignored. The table header is located by scanning for the first row that
    contains a cell equal to ``"Subject"`` (surrounding whitespace ignored),
    so any rows above the table are skipped.

    Args:
        study: Study identifier as it appears in the report file name.

    Returns:
        List of non-empty, whitespace-stripped values of the ``Subject``
        column, in file order.

    Raises:
        FileNotFoundError: No report exists for the study, or the newest
            report's file name does not start with today's date
            (``YYYY-MM-DD``).
        ValueError: No row containing a ``"Subject"`` cell was found.
    """
    pattern = os.path.join(INCOMING_DIR, f"* {study} Subject Summary Report.xlsx")
    candidates = [
        f for f in glob.glob(pattern)
        if not os.path.basename(f).startswith("~$")
    ]
    if not candidates:
        raise FileNotFoundError(f"Nenalezen Subject Summary Report pro {study}")

    # Newest file by modification time.
    path = max(candidates, key=os.path.getmtime)
    today = datetime.date.today().strftime("%Y-%m-%d")
    if not os.path.basename(path).startswith(today):
        raise FileNotFoundError(f"Dnešní Subject Summary Report pro {study} neexistuje — spusť nejdříve download_subject_summary.py")
    print(f"  Čtu subjekty z: {os.path.basename(path)}")

    # First pass without a header: find the row that holds the column names.
    raw = pd.read_excel(path, header=None)
    header_row = next(
        (
            idx
            for idx, row in raw.iterrows()
            if any(str(cell).strip() == "Subject" for cell in row)
        ),
        None,
    )
    if header_row is None:
        raise ValueError("Hlavičkový řádek nenalezen")

    # Second pass: let pandas parse the table using the detected header row.
    df = pd.read_excel(path, header=header_row)

    # The header was detected with whitespace stripped, so resolve the column
    # the same way; a label such as "Subject " would otherwise raise KeyError.
    # Falling back to "Subject" keeps the KeyError if the column is missing.
    subject_col = next(
        (col for col in df.columns if str(col).strip() == "Subject"),
        "Subject",
    )
    values = df[subject_col].dropna().astype(str).str.strip()

    # Whitespace-only cells strip down to "", which is not a usable subject ID.
    return [value for value in values if value]


def run(page, study):
    """Download one Subject Detail workbook per subject of *study*.

    Opens the patient detail report page and, for every subject returned by
    ``get_subjects(study)``: types the subject into the search input, clicks
    the first ``mat-option``, downloads the XLS via the "Download XLS" button
    and resets the form with the "Clear" button. Each file is saved under
    ``DETAILS_DIR/<study>/`` as
    ``<YYYY-MM-DD> <study> <subject> Subject Detail.xlsx``.

    Args:
        page: Playwright page object used for all browser interaction
            (presumably already logged in with the study selected — confirm
            against the caller).
        study: Study identifier; used for the output folder and file names.

    Any exception raised by ``get_subjects`` or by a Playwright call is not
    handled here and propagates to the caller.
    """
    target_dir = os.path.join(DETAILS_DIR, study)
    os.makedirs(target_dir, exist_ok=True)

    subject_ids = get_subjects(study)
    print(f"  Nalezeno {len(subject_ids)} subjektů")
    date_prefix = datetime.date.today().strftime("%Y-%m-%d")

    # Timeout in milliseconds shared by the page-load and download waits.
    long_wait_ms = 120000

    report_url = f"{BASE_URL}/report/patient_detail_report"
    page.goto(report_url)
    page.wait_for_load_state("networkidle", timeout=long_wait_ms)

    for subject_id in subject_ids:
        xlsx_name = f"{date_prefix} {study} {subject_id} Subject Detail.xlsx"
        xlsx_path = os.path.join(target_dir, xlsx_name)
        print(f"  [{subject_id}] Stahuji...")

        # Type the subject into the search box and pick the first suggestion.
        search_box = page.locator('input[placeholder="search"], input[type="text"]').first
        search_box.click()
        search_box.fill(subject_id)
        # Short fixed pause before picking an option; presumably gives the
        # suggestion list time to refresh — confirm against the UI.
        page.wait_for_timeout(500)
        first_option = page.locator("mat-option").first
        first_option.dispatch_event("click")
        page.wait_for_load_state("networkidle", timeout=long_wait_ms)

        # Trigger the export and store the resulting download.
        with page.expect_download(timeout=long_wait_ms) as download_info:
            download_button = page.get_by_role("button", name="Download XLS")
            download_button.click()
        download_info.value.save_as(xlsx_path)
        print(f"  [{subject_id}] OK")

        # Reset the form before handling the next subject.
        clear_button = page.get_by_role("button", name="Clear")
        clear_button.click()
        page.wait_for_load_state("networkidle", timeout=long_wait_ms)

    print(f"  [{study}] Subject details hotovo.")


def _login_and_select_study(page, study):
    """Log in with the configured credentials and select *study*."""
    page.goto(BASE_URL)
    page.wait_for_load_state("networkidle")
    page.get_by_label("Email *").fill(EMAIL)
    page.get_by_label("Password *").fill(PASSWORD)
    page.locator("#login__submit").click()
    page.wait_for_load_state("networkidle")

    page.get_by_label("Study *").click()
    page.get_by_role("option", name=study).click()
    page.get_by_role("button", name="SELECT").click()
    page.wait_for_load_state("networkidle")


def main():
    """Download subject detail reports for every study in ``STUDIES``.

    Each study gets its own freshly launched (non-headless) Chromium browser:
    the script logs in, selects the study and hands the page to ``run``.
    A failure in any step for one study — login included — is printed and the
    loop moves on to the next study; the browser is always closed.
    """
    os.makedirs(DETAILS_DIR, exist_ok=True)

    with sync_playwright() as p:
        for study in STUDIES:
            print(f"\n[{study}] Přihlášení...")
            browser = p.chromium.launch(headless=False)
            try:
                context = browser.new_context(accept_downloads=True)
                page = context.new_page()
                # Login is inside the try so a login failure is reported
                # like any other per-study error instead of aborting the
                # remaining studies.
                _login_and_select_study(page, study)
                run(page, study)
            except Exception as e:
                # Top-level boundary: report and continue with the next study.
                print(f"  [{study}] CHYBA: {e}")
            finally:
                # Close the browser even when login or the download fails.
                browser.close()

    print("\nVše hotovo.")


# Run only when executed as a script, so that importing this module does not
# launch the browser automation as a side effect.
if __name__ == "__main__":
    main()