# janssen/IWRS/Patients/run_all.py

"""
Kompletní pipeline:
  1. Stažení Subject Summary Reportů (obě studie)
  2. Stažení Subject Detail Reportů + notifikací (obě studie)
  3. Import do MySQL (summary, visits, notifikace)

Spusť tento skript místo samostatných skriptů.
"""

import os
import datetime
import glob
import re

from playwright.sync_api import sync_playwright
import numpy as np
import pandas as pd

import db_config
import mysql.connector
import download_subject_details as dsd

# ── CONFIG ───────────────────────────────────────────────────────────────────
# Root URL of the IRT web application; every report URL is built from it.
BASE_URL = "https://janssen.4gclinical.com"

# SECURITY: credentials should not be kept in source control. They can be
# supplied through the IWRS_EMAIL / IWRS_PASSWORD environment variables; the
# literals below remain only as a backward-compatible fallback and should be
# rotated and removed once the environment variables are deployed.
EMAIL    = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")

# Study identifiers processed by the pipeline, in this order.
STUDIES = ["77242113UCO3001", "42847922MDD3003"]

# Download folders live next to this script.
BASE_DIR     = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
DETAILS_DIR  = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")


# ── helpers ───────────────────────────────────────────────────────────────────

def unique_path(directory, stem):
    """Return a path for ``<stem>.xlsx`` in ``directory`` that does not exist yet.

    The plain name is preferred. If it is taken, the current time (``HHMM``)
    is appended; if that name is taken as well (e.g. a third run within the
    same minute), a numeric suffix is added until a free name is found, so an
    existing file is never overwritten.

    Args:
        directory: Folder the file will be saved into.
        stem: File name without the ``.xlsx`` extension.

    Returns:
        A path inside ``directory`` that did not exist at the time of the call.
    """
    path = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(path):
        return path

    time_tag = datetime.datetime.now().strftime("%H%M")
    candidate = os.path.join(directory, f"{stem} {time_tag}.xlsx")
    # The time tag only has minute resolution, so it can collide too.
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem} {time_tag} ({counter}).xlsx")
        counter += 1
    return candidate


def login(page, study):
    """Sign in to the web application and select ``study``.

    Opens ``BASE_URL``, submits the e-mail/password form, then picks the
    requested study from the study selector. After each navigation step the
    function waits until the page reports the "networkidle" load state.

    Args:
        page: Playwright page used for the whole session.
        study: Name of the option to choose in the "Study *" selector.
    """
    def settle():
        page.wait_for_load_state("networkidle")

    page.goto(BASE_URL)
    settle()

    # Fill the credential form in the same order as the fields appear.
    for label, value in (("Email *", EMAIL), ("Password *", PASSWORD)):
        page.get_by_label(label).fill(value)
    page.locator("#login__submit").click()
    settle()

    # Study selection: open the selector, choose the option, confirm.
    page.get_by_label("Study *").click()
    page.get_by_role("option", name=study).click()
    page.get_by_role("button", name="SELECT").click()
    settle()


# ── KROK 1: Subject Summary ───────────────────────────────────────────────────

def download_summary(page, study, today):
    """Download the Subject Summary Report of the currently selected study.

    Args:
        page: Logged-in Playwright page.
        study: Study identifier, used in log output and in the file name.
        today: Date string placed at the start of the file name.

    Returns:
        Path of the saved ``.xlsx`` file inside ``INCOMING_DIR``.
    """
    wait_ms = 120000
    prefix = f"  [{study}]"

    print(f"{prefix} Stahuji Subject Summary Report...")
    page.goto(BASE_URL + "/report/patient_summary_report")
    page.wait_for_load_state("networkidle", timeout=wait_ms)

    target = unique_path(INCOMING_DIR, f"{today} {study} Subject Summary Report")
    # The click triggers the download; expect_download captures it.
    with page.expect_download(timeout=wait_ms) as pending:
        page.get_by_role("button", name="Download XLS").click()
    pending.value.save_as(target)

    print(f"{prefix} Summary OK -> {os.path.basename(target)}")
    return target


# ── KROK 2: Subject Details ───────────────────────────────────────────────────

def get_subjects_from_summary(summary_path):
    """Return the subject identifiers listed in a summary workbook.

    The sheet is first read without a header to locate the first row that
    contains a cell equal to "Subject" (after stripping whitespace); the file
    is then re-read using that row as the header.

    Args:
        summary_path: Path of the Subject Summary Report workbook.

    Returns:
        List of stripped, non-null values of the "Subject" column as strings.

    Raises:
        ValueError: If no row contains a "Subject" cell.
    """
    probe = pd.read_excel(summary_path, header=None)
    header_idx = next(
        (
            idx
            for idx, cells in probe.iterrows()
            if any(str(cell).strip() == "Subject" for cell in cells)
        ),
        None,
    )
    if header_idx is None:
        raise ValueError("Hlavičkový řádek nenalezen")

    table = pd.read_excel(summary_path, header=header_idx)
    subjects = table["Subject"].dropna()
    return subjects.astype(str).str.strip().tolist()


def download_details(page, study, summary_path, today):
    """Download one Subject Detail report per subject found in the summary.

    Files are written to ``DETAILS_DIR/<study>/`` and named
    ``"<today> <study> <subject> Subject Detail.xlsx"``.

    Args:
        page: Logged-in Playwright page.
        study: Study identifier (sub-folder name and part of the file name).
        summary_path: Summary workbook the subject list is read from.
        today: Date string placed at the start of each file name.
    """
    wait_ms = 120000

    def wait_idle():
        page.wait_for_load_state("networkidle", timeout=wait_ms)

    target_dir = os.path.join(DETAILS_DIR, study)
    os.makedirs(target_dir, exist_ok=True)

    subject_ids = get_subjects_from_summary(summary_path)
    print(f"  [{study}] Subjektů k stažení: {len(subject_ids)}")

    page.goto(f"{BASE_URL}/report/patient_detail_report")
    wait_idle()

    for subject in subject_ids:
        destination = os.path.join(
            target_dir, f"{today} {study} {subject} Subject Detail.xlsx"
        )

        # Type the subject into the search box and pick the first suggestion.
        search_box = page.locator('input[placeholder="search"], input[type="text"]').first
        search_box.click()
        search_box.fill(subject)
        page.wait_for_timeout(500)
        page.locator("mat-option").first.dispatch_event("click")
        wait_idle()

        with page.expect_download(timeout=wait_ms) as pending:
            page.get_by_role("button", name="Download XLS").click()
        pending.value.save_as(destination)
        print(f"  [{study}] Detail {subject} OK")

        # Reset the form before the next subject.
        page.get_by_role("button", name="Clear").click()
        wait_idle()


# ── KROK 3: Import do MySQL ───────────────────────────────────────────────────

def get_conn():
    """Open a new MySQL connection using the settings in ``db_config``."""
    settings = {
        "host": db_config.DB_HOST,
        "port": db_config.DB_PORT,
        "user": db_config.DB_USER,
        "password": db_config.DB_PASSWORD,
        "database": db_config.DB_NAME,
    }
    return mysql.connector.connect(**settings)


def _py(val):
    """Convert a NumPy scalar to its native Python equivalent.

    Values that are not NumPy scalars are returned unchanged.
    """
    return val.item() if isinstance(val, np.generic) else val


def to_date(val):
    """Convert a cell value to ``datetime.date``, or ``None`` if not a date.

    Handles ``None``/NaN/NaT, ``datetime``/``date`` objects (including
    ``pandas.Timestamp``), NumPy scalars and strings in one of the formats
    ``%Y-%m-%d``, ``%d-%b-%Y``, ``%d-%m-%Y`` or ``%Y-%m-%d %H:%M:%S``.

    Args:
        val: Raw value, typically a cell read by pandas.

    Returns:
        A ``datetime.date``, or ``None`` when the value is missing or cannot
        be interpreted as a date.
    """
    # A nanosecond-precision np.datetime64 turns into a plain int under
    # ``.item()`` and would then fail every string format below. Route it
    # through pandas so it is recognised as a timestamp.
    if isinstance(val, np.datetime64):
        try:
            val = pd.Timestamp(val)
        except (ValueError, OverflowError):
            pass  # outside the Timestamp range: let NumPy convert it instead

    val = _py(val)
    if val is None:
        return None
    try:
        if pd.isna(val):  # covers float NaN and NaT
            return None
    except (TypeError, ValueError):
        # Non-scalar input: pd.isna returns an array with ambiguous truthiness.
        pass

    # pandas.Timestamp subclasses datetime.datetime, so one branch covers both.
    if isinstance(val, datetime.datetime):
        return val.date()
    if isinstance(val, datetime.date):
        return val

    text = str(val).strip()
    if text.lower() in ("", "nat", "nan", "none"):
        return None
    for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.datetime.strptime(text, fmt).date()
        except ValueError:
            continue
    return None


def to_int(val):
    """Convert a cell value to ``int``, or ``None`` if that is not possible.

    Integers (including ``bool``) are returned exactly. Everything else goes
    through ``float`` and is truncated toward zero. NaN, positive/negative
    infinity and non-numeric values yield ``None``.

    Args:
        val: Raw value, typically a cell read by pandas.

    Returns:
        The integer value, or ``None``.
    """
    val = _py(val)
    if isinstance(val, int):
        # Skip the float round-trip so large integers keep full precision.
        return int(val)
    try:
        number = float(val)
    except (TypeError, ValueError):
        return None
    # int() raises on NaN (ValueError) and infinity (OverflowError).
    if number != number or abs(number) == float("inf"):
        return None
    return int(number)


def to_float(val):
    """Convert a cell value to ``float``; NaN and non-numeric values give ``None``."""
    native = _py(val)
    try:
        number = float(native)
    except (TypeError, ValueError):
        return None
    # NaN is the only float that differs from itself.
    if number != number:
        return None
    return number


def to_str(val):
    """Convert a cell value to a stripped string, or ``None`` if it is empty.

    ``None``, float NaN, and text equal (case-insensitively) to "nan", "nat",
    "none" or the empty string are all treated as missing.
    """
    native = _py(val)
    is_nan = isinstance(native, float) and native != native
    if native is None or is_nan:
        return None
    text = str(native).strip()
    if text.lower() in ("nan", "nat", "none", ""):
        return None
    return text


def read_summary_df(path):
    """Load a summary workbook as a DataFrame with the proper header row.

    The header is the first row containing a cell equal to "Subject" (after
    stripping whitespace). Rows that are entirely empty are dropped.

    Args:
        path: Path of the Subject Summary Report workbook.

    Returns:
        DataFrame whose columns come from the detected header row.

    Raises:
        ValueError: If no row contains a "Subject" cell.
    """
    probe = pd.read_excel(path, header=None)
    header_idx = next(
        (
            idx
            for idx, cells in probe.iterrows()
            if any(str(cell).strip() == "Subject" for cell in cells)
        ),
        None,
    )
    if header_idx is None:
        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")

    table = pd.read_excel(path, header=header_idx)
    return table.dropna(how="all")


def parse_detail_visits(path):
    """Extract visit rows from a Subject Detail workbook.

    Reads the ``patient_detail_report`` sheet, finds the first row containing
    a "Visit Type" cell and converts every following row whose first column
    is "Past" or "Upcoming" into a dict. Columns are addressed by position.

    Args:
        path: Path of the Subject Detail workbook.

    Returns:
        List of dicts, one per visit row; empty if no header row is found.
    """
    sheet = pd.read_excel(path, sheet_name="patient_detail_report", header=None)
    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if any(str(cell).strip() == "Visit Type" for cell in cells)
        ),
        None,
    )
    if header_idx is None:
        return []

    body = sheet.iloc[header_idx + 1:].copy()
    body.columns = range(body.shape[1])

    # (output key, column position, converter) for the columns after visit type
    layout = (
        ("scheduled_date", 1, to_date),
        ("window_days", 2, to_str),
        ("actual_date", 3, to_date),
        ("irt_transaction_no", 4, to_int),
        ("irt_transaction_description", 5, to_str),
        ("medication_assignment", 6, to_str),
        ("quantity_assigned", 7, to_int),
        ("medication_id", 8, to_str),
    )

    visits = []
    for _, record in body.iterrows():
        kind = to_str(record.get(0))
        if kind in ("Past", "Upcoming"):
            entry = {"visit_type": kind}
            for key, position, convert in layout:
                entry[key] = convert(record.get(position))
            visits.append(entry)
    return visits


def insert_import(cursor, study, source_file):
    """Record a new import run in ``iwrs_import`` and return its row id.

    Args:
        cursor: Open database cursor.
        study: Study identifier stored with the import.
        source_file: Path of the source file; only its base name is stored.

    Returns:
        ``cursor.lastrowid`` after the insert.
    """
    statement = "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)"
    params = (study, datetime.datetime.now(), os.path.basename(source_file))
    cursor.execute(statement, params)
    return cursor.lastrowid


def insert_uco3001_summary(cursor, import_id, df):
    """Insert one ``iwrs_uco3001_subject_summary`` row per row of ``df``.

    Required source columns are read unconditionally (a missing one raises
    ``KeyError``); optional columns are stored as ``None`` when the workbook
    does not contain them.

    Args:
        cursor: Open database cursor.
        import_id: Id of the owning ``iwrs_import`` row.
        df: Summary DataFrame with the report's original column names.
    """
    sql = """INSERT INTO iwrs_uco3001_subject_summary (
        import_id, subject, prior_subject_identifier, site, investigator, location,
        cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight,
        rescreened_subject, adt_ir, three_or_more_advanced_therapies,
        only_oral_5asa_compounds, ustekinumab, isolated_proctitis,
        clinical_responder_status_i12_m0, irt_subject_status,
        i0_rand_date_local, last_irt_transaction,
        last_irt_transaction_date_local, last_irt_transaction_date_utc,
        next_irt_transaction, next_irt_transaction_date_local,
        most_recent_med_assignment_date, days_since_last_med_assignment,
        patient_forecast_status, patient_forecast_status_changed_date
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

    must, may = True, False
    # (source column, converter, required?) in the order of the SQL column
    # list, following the leading import_id.
    layout = (
        ("Subject", to_str, must),
        ("Prior Subject Identifier", to_str, may),
        ("Site", to_str, must),
        ("Investigator", to_str, must),
        ("Location", to_str, must),
        ("Cohort per IRT", to_str, must),
        ("Informed Consent Date", to_date, must),
        ("Adolescent Assent Date", to_date, may),
        ("Subject's age collection", to_int, must),
        ("Subject's weight collection", to_float, may),
        ("Rescreened Subject", to_str, may),
        ("ADT-IR", to_str, may),
        ("3 or More Advanced Therapies", to_str, may),
        ("Only Oral 5-ASA Compounds", to_str, may),
        ("Ustekinumab", to_str, may),
        ("Isolated Proctitis", to_str, may),
        ("Clinical Responder Status at I-12 / M-0", to_str, may),
        ("IRT Subject Status", to_str, must),
        ("I0_RAND_TIMESTAMP_LOCAL [Local]", to_date, may),
        ("Last Recorded IRT Transaction", to_str, must),
        ("Last Recorded IRT Transaction Date [Local]", to_date, must),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, must),
        ("Next Expected IRT Transaction", to_str, must),
        ("Next Expected IRT Transaction Date [Local]", to_date, must),
        ("Most Recent Medication Assignment Transaction [Local]", to_date, may),
        ("Days Since Last Medication Assignment Transaction", to_int, may),
        ("Patient Forecast Status", to_str, may),
        ("Patient Forecast Status Changed Date (UTC)", to_date, may),
    )
    available = df.columns.tolist()

    for _, record in df.iterrows():
        values = [import_id]
        for column, convert, required in layout:
            if required or column in available:
                values.append(convert(record[column]))
            else:
                values.append(None)
        cursor.execute(sql, tuple(values))


def insert_mdd3003_summary(cursor, import_id, df):
    """Insert one ``iwrs_mdd3003_subject_summary`` row per row of ``df``.

    Required source columns are read unconditionally (a missing one raises
    ``KeyError``); optional columns are stored as ``None`` when the workbook
    does not contain them.

    Args:
        cursor: Open database cursor.
        import_id: Id of the owning ``iwrs_import`` row.
        df: Summary DataFrame with the report's original column names.
    """
    sql = """INSERT INTO iwrs_mdd3003_subject_summary (
        import_id, subject, prior_subject_identifier, site, investigator, location,
        cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age,
        madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17,
        stratification_country, age_group, stable_remitters, irt_subject_status,
        last_irt_transaction, last_irt_transaction_date_local,
        last_irt_transaction_date_utc, next_irt_transaction,
        next_irt_transaction_date_local, date_screened, date_screen_failed,
        date_randomized_part1, date_early_withdraw_randomized_part1,
        date_open_label_induction, date_early_withdraw_open_label_induction,
        date_randomized_part2, date_early_withdraw_randomized_part2,
        date_completed, date_unblinded
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

    must, may = True, False
    # (source column, converter, required?) in the order of the SQL column
    # list, following the leading import_id.
    layout = (
        ("Subject", to_str, must),
        ("Prior Subject Identifier", to_str, may),
        ("Site", to_str, must),
        ("Investigator", to_str, must),
        ("Location", to_str, must),
        ("Cohort per IRT", to_str, must),
        ("MADRS response criteria integrated or manually entered", to_str, may),
        ("Informed Consent Date", to_date, must),
        ("Subject's age collection", to_int, must),
        ("MADRS response criteria v1.5 from RAVE", to_str, may),
        ("MADRS response criteria v1.6 from RAVE", to_str, may),
        ("MADRS response criteria v1.7 from RAVE", to_str, may),
        ("Stratification Country", to_str, may),
        ("Age Group", to_str, may),
        ("Stable Remitters vs. Non Stable Remitters", to_str, may),
        ("IRT Subject Status", to_str, must),
        ("Last Recorded IRT Transaction", to_str, must),
        ("Last Recorded IRT Transaction Date [Local]", to_date, must),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, must),
        ("Next Expected IRT Transaction", to_str, must),
        ("Next Expected IRT Transaction Date [Local]", to_date, must),
        ("Date Screened [Local]", to_date, may),
        ("Date Screen Failed [Local]", to_date, may),
        ("Date Randomized Part 1 [Local]", to_date, may),
        ("Date Early Withdraw Randomized Part 1 [Local]", to_date, may),
        ("Date Open Label Induction [Local]", to_date, may),
        ("Date Early Withdraw Open Label Induction [Local]", to_date, may),
        ("Date Randomized Part 2 [Local]", to_date, may),
        ("Date Early Withdraw Randomized Part 2 [Local]", to_date, may),
        ("Date Completed [Local]", to_date, may),
        ("Date Unblinded [Local]", to_date, may),
    )
    available = df.columns.tolist()

    def cell(record, column, convert, required):
        # Optional columns absent from the workbook are stored as NULL.
        if required or column in available:
            return convert(record[column])
        return None

    for _, record in df.iterrows():
        converted = [cell(record, *spec) for spec in layout]
        cursor.execute(sql, (import_id, *converted))


def insert_visits(cursor, import_id, study, subject, visits):
    """Insert the visit rows of one subject into ``iwrs_subject_visits``.

    All rows are sent with a single ``executemany`` call instead of one
    ``execute`` per visit, which avoids a statement round trip per row.

    Args:
        cursor: Open database cursor.
        import_id: Id of the owning ``iwrs_import`` row.
        study: Study identifier stored on every row.
        subject: Subject identifier stored on every row.
        visits: Sequence of dicts with the keys ``visit_type``,
            ``scheduled_date``, ``window_days``, ``actual_date``,
            ``irt_transaction_no``, ``irt_transaction_description``,
            ``medication_assignment``, ``quantity_assigned`` and
            ``medication_id``. Nothing is executed when it is empty.
    """
    if not visits:
        return
    sql = """INSERT INTO iwrs_subject_visits (
        import_id, study, subject, visit_type, scheduled_date, window_days,
        actual_date, irt_transaction_no, irt_transaction_description,
        medication_assignment, quantity_assigned, medication_id
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    rows = [
        (
            import_id, study, subject,
            v["visit_type"], v["scheduled_date"], v["window_days"],
            v["actual_date"], v["irt_transaction_no"],
            v["irt_transaction_description"], v["medication_assignment"],
            v["quantity_assigned"], v["medication_id"],
        )
        for v in visits
    ]
    cursor.executemany(sql, rows)


def import_to_mysql(summary_path, detail_files, study):
    """Import one study's summary and detail workbooks into MySQL.

    Everything is written in a single transaction: the ``iwrs_import`` row,
    the study-specific summary rows and the visits parsed from each detail
    file. On any error the transaction is rolled back, and the cursor and
    connection are always closed.

    Args:
        summary_path: Path of the Subject Summary Report workbook.
        detail_files: Paths of the Subject Detail workbooks; the subject id is
            taken from the file name ("UNKNOWN" if it does not match).
        study: Study identifier; "77242113UCO3001" uses the UCO3001 summary
            table, any other value uses the MDD3003 one.

    Returns:
        Id of the created ``iwrs_import`` row.
    """
    print(f"\n  [MySQL] Importuji {study}...")
    df_summary = read_summary_df(summary_path)

    # Compiled once; applied to every detail file name in the loop below.
    subject_pattern = re.compile(
        r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx"
    )

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            import_id = insert_import(cursor, study, summary_path)

            if study == "77242113UCO3001":
                insert_uco3001_summary(cursor, import_id, df_summary)
            else:
                insert_mdd3003_summary(cursor, import_id, df_summary)

            total_visits = 0
            for path in detail_files:
                m = subject_pattern.search(os.path.basename(path))
                subject = m.group(1) if m else "UNKNOWN"
                visits = parse_detail_visits(path)
                insert_visits(cursor, import_id, study, subject, visits)
                total_visits += len(visits)

            conn.commit()
        except Exception:
            # Discard the partial import explicitly before re-raising.
            conn.rollback()
            raise
        finally:
            cursor.close()
    finally:
        conn.close()

    print(f"  [MySQL] import_id={import_id} | pacientů={len(df_summary)} | transakcí={total_visits}")
    return import_id


# ── MAIN ─────────────────────────────────────────────────────────────────────

def main():
    """Run the whole pipeline: download reports, then import them to MySQL.

    For each study a fresh browser is launched, the summary report is
    downloaded and ``dsd.run`` is invoked for the detail step. If anything in
    that sequence raises, the study is marked as failed and its import is
    skipped. Import errors are reported per study and do not stop the others.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    for folder in (INCOMING_DIR, DETAILS_DIR):
        os.makedirs(folder, exist_ok=True)

    rule = "=" * 60
    downloaded = {}

    # Steps 1 + 2: downloads. Each study gets its own browser so that the
    # sessions stay separate.
    with sync_playwright() as playwright:
        for study in STUDIES:
            print(f"\n{rule}")
            print(f"[{study}] KROK 1: Subject Summary Report")
            print(f"{rule}")
            browser = playwright.chromium.launch(headless=False)
            context = browser.new_context(accept_downloads=True)
            page = context.new_page()

            try:
                login(page, study)
                downloaded[study] = download_summary(page, study, today)

                print(f"\n[{study}] KROK 2: Subject Detail Reports + notifikace")
                dsd.run(page, study)
            except Exception as e:
                print(f"  [{study}] CHYBA při stahování: {e}")
                downloaded[study] = None
            finally:
                browser.close()

    # Step 3: import into MySQL.
    print(f"\n{rule}")
    print("KROK 3: Import do MySQL")
    print(f"{rule}")

    for study in STUDIES:
        summary_path = downloaded.get(study)
        if summary_path:
            pattern = os.path.join(
                DETAILS_DIR, study, f"{today} {study} * Subject Detail.xlsx"
            )
            detail_files = sorted(glob.glob(pattern))
            try:
                import_to_mysql(summary_path, detail_files, study)
            except Exception as e:
                print(f"  [{study}] CHYBA při importu: {e}")
        else:
            print(f"  [{study}] PŘESKOČENO — stahování selhalo")

    print(f"\n{rule}")
    print("Vše hotovo.")
    print(f"{rule}")


# Script entry point: runs the whole pipeline when this file is executed.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module also starts the pipeline — confirm that this is intended.
main()