z230

2026-05-05 10:40:13 +02:00
parent eaea634b2b
commit afd9b3ef17
4 changed files with 895 additions and 0 deletions
@@ -0,0 +1,110 @@
 -- IWRS tables for the `studie` database
 -- Run once: mysql -h 192.168.1.76 -u root -p studie < create_iwrs_tables.sql
 USE studie;
 -- ── Import log ───────────────────────────────────────────────────────────────
 -- One row per import run. The other IWRS tables in this script point back to
 -- it through their import_id foreign key.
 CREATE TABLE IF NOT EXISTS iwrs_import (
    import_id   INT AUTO_INCREMENT PRIMARY KEY,
    study       VARCHAR(20)  NOT NULL,
    imported_at DATETIME     NOT NULL DEFAULT CURRENT_TIMESTAMP,
    source_file VARCHAR(500) NOT NULL,
    INDEX idx_study (study)
 );
 -- ── UCO3001 subject summary ───────────────────────────────────────────────────
 -- Subject-level rows for study UCO3001; every row belongs to one import run
 -- (import_id) and is indexed by import and by subject.
 CREATE TABLE IF NOT EXISTS iwrs_uco3001_subject_summary (
    id                                  INT AUTO_INCREMENT PRIMARY KEY,
    import_id                           INT          NOT NULL,
    subject                             VARCHAR(20)  NOT NULL,
    prior_subject_identifier            VARCHAR(20),
    site                                VARCHAR(50),
    investigator                        VARCHAR(100),
    location                            VARCHAR(50),
    cohort_per_irt                      VARCHAR(100),
    informed_consent_date               DATE,
    adolescent_assent_date              DATE,
    age                                 SMALLINT,
    weight                              DECIMAL(5,1),
    rescreened_subject                  VARCHAR(10),
    adt_ir                              VARCHAR(10),
    three_or_more_advanced_therapies    VARCHAR(10),
    only_oral_5asa_compounds            VARCHAR(10),
    ustekinumab                         VARCHAR(10),
    isolated_proctitis                  VARCHAR(10),
    clinical_responder_status_i12_m0    VARCHAR(100),
    irt_subject_status                  VARCHAR(50),
    i0_rand_date_local                  DATE,
    last_irt_transaction                VARCHAR(100),
    last_irt_transaction_date_local     DATE,
    last_irt_transaction_date_utc       DATE,
    next_irt_transaction                VARCHAR(100),
    next_irt_transaction_date_local     DATE,
    most_recent_med_assignment_date     DATE,
    days_since_last_med_assignment      SMALLINT,
    patient_forecast_status             VARCHAR(50),
    patient_forecast_status_changed_date DATE,
    FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
    INDEX idx_import (import_id),
    INDEX idx_subject (subject)
 );
 -- ── MDD3003 subject summary ───────────────────────────────────────────────────
 -- Subject-level rows for study MDD3003; every row belongs to one import run
 -- (import_id) and is indexed by import and by subject.
 CREATE TABLE IF NOT EXISTS iwrs_mdd3003_subject_summary (
    id                                      INT AUTO_INCREMENT PRIMARY KEY,
    import_id                               INT          NOT NULL,
    subject                                 VARCHAR(20)  NOT NULL,
    prior_subject_identifier                VARCHAR(20),
    site                                    VARCHAR(50),
    investigator                            VARCHAR(100),
    location                                VARCHAR(50),
    cohort_per_irt                          VARCHAR(50),
    madrs_criteria_integrated               VARCHAR(50),
    informed_consent_date                   DATE,
    age                                     SMALLINT,
    madrs_criteria_v15                      VARCHAR(10),
    madrs_criteria_v16                      VARCHAR(10),
    madrs_criteria_v17                      VARCHAR(10),
    stratification_country                  VARCHAR(10),
    age_group                               VARCHAR(20),
    stable_remitters                        VARCHAR(50),
    irt_subject_status                      VARCHAR(100),
    last_irt_transaction                    VARCHAR(100),
    last_irt_transaction_date_local         DATE,
    last_irt_transaction_date_utc           DATE,
    next_irt_transaction                    VARCHAR(100),
    next_irt_transaction_date_local         DATE,
    date_screened                           DATE,
    date_screen_failed                      DATE,
    date_randomized_part1                   DATE,
    date_early_withdraw_randomized_part1    DATE,
    date_open_label_induction               DATE,
    date_early_withdraw_open_label_induction DATE,
    date_randomized_part2                   DATE,
    date_early_withdraw_randomized_part2    DATE,
    date_completed                          DATE,
    date_unblinded                          DATE,
    FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
    INDEX idx_import (import_id),
    INDEX idx_subject (subject)
 );
 -- ── Subject visits / transactions (both studies) ─────────────────────────────
 -- One row per visit/transaction of a subject; `study` tells the studies apart
 -- and visit_type is restricted to 'Past' or 'Upcoming'.
 CREATE TABLE IF NOT EXISTS iwrs_subject_visits (
    id                          INT AUTO_INCREMENT PRIMARY KEY,
    import_id                   INT          NOT NULL,
    study                       VARCHAR(20)  NOT NULL,
    subject                     VARCHAR(20)  NOT NULL,
    visit_type                  ENUM('Past','Upcoming') NOT NULL,
    scheduled_date              DATE,
    window_days                 VARCHAR(20),
    actual_date                 DATE,
    irt_transaction_no          SMALLINT,
    irt_transaction_description VARCHAR(200),
    medication_assignment       VARCHAR(200),
    quantity_assigned           SMALLINT,
    medication_id               VARCHAR(20),
    FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
    INDEX idx_import (import_id),
    INDEX idx_study_subject (study, subject)
 );
@@ -0,0 +1,5 @@
 import os
 # MySQL connection settings. Each value can be overridden with an IWRS_DB_*
 # environment variable; the literals below are only fallbacks that keep the
 # previous behaviour when nothing is set.
 DB_HOST     = os.environ.get("IWRS_DB_HOST", "192.168.1.76")
 DB_PORT     = int(os.environ.get("IWRS_DB_PORT", "3306"))
 DB_USER     = os.environ.get("IWRS_DB_USER", "root")
 # NOTE(review): a root password is committed to version control here. Rotate it,
 # supply it through IWRS_DB_PASSWORD and then drop this fallback.
 DB_PASSWORD = os.environ.get("IWRS_DB_PASSWORD", "Vlado9674+")
 DB_NAME     = os.environ.get("IWRS_DB_NAME", "studie")
@@ -0,0 +1,358 @@
 """
 Imports data from the IWRS Excel reports into MySQL (database `studie`).
 Run order:
  1. download_subject_summary.py
  2. download_subject_details.py
  3. this script
 Every run creates a new import_id in iwrs_import.
 The reporting scripts always work with MAX(import_id) for the given study.
 """
 import os
 import glob
 import datetime
 import re
 import pandas as pd
 import mysql.connector
 import db_config
 # All folders are resolved relative to the directory that contains this script.
 BASE_DIR     = os.path.dirname(os.path.abspath(__file__))
 # Folder searched for Subject Summary Report workbooks.
 INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
 # Parent folder of the per-study subfolders searched for Subject Detail workbooks.
 DETAILS_DIR  = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
 # Study identifiers processed by main(), in this order.
 STUDIES = ["77242113UCO3001", "42847922MDD3003"]
 # ── helpers ──────────────────────────────────────────────────────────────────
 def get_conn():
    """Open and return a new MySQL connection built from the db_config settings."""
    settings = {
        "host": db_config.DB_HOST,
        "port": db_config.DB_PORT,
        "user": db_config.DB_USER,
        "password": db_config.DB_PASSWORD,
        "database": db_config.DB_NAME,
    }
    return mysql.connector.connect(**settings)
 def to_date(val):
    """Convert a pandas Timestamp / datetime / date / string to a date, or None.

    Returns None for missing values (None, NaN, NaT, empty or "nan"-like
    strings) and for strings that match none of the supported formats.
    """
    # pd.NaT must be rejected first: it is an instance of datetime.datetime and
    # NaT.date() returns NaT again, which would otherwise reach the database layer.
    if val is None or val is pd.NaT:
        return None
    if isinstance(val, float) and pd.isna(val):
        return None
    # pd.Timestamp subclasses datetime.datetime, so this branch covers both.
    if isinstance(val, datetime.datetime):
        return val.date()
    if isinstance(val, datetime.date):
        return val
    s = str(val).strip()
    if s.lower() in ("nat", "nan", "none", ""):
        return None
    for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.datetime.strptime(s, fmt).date()
        except ValueError:
            continue
    return None
 def to_int(val):
    """Return val truncated to an int, or None when it is missing or not a finite number."""
    try:
        v = float(val)
        return None if pd.isna(v) else int(v)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int() of +/-infinity, e.g. when the cell holds the text "inf".
        return None
 def to_float(val):
    """Return val as a float, or None when it is missing or cannot be parsed."""
    try:
        number = float(val)
    except (TypeError, ValueError):
        return None
    if pd.isna(number):
        return None
    return number
 def to_str(val):
    """Return val as stripped text, or None for missing / "nan"-like values."""
    if val is None:
        return None
    if isinstance(val, float) and pd.isna(val):
        return None
    text = str(val).strip()
    if text.lower() in ("nan", "nat", "none", ""):
        return None
    return text
 def find_summary_file(study):
    """Return the most recently modified Subject Summary Report for a study.

    Excel lock files (names starting with "~$") are ignored. Prints a warning
    when the chosen file name does not start with today's date.

    Raises:
        FileNotFoundError: no matching workbook exists in INCOMING_DIR.
    """
    pattern = os.path.join(INCOMING_DIR, f"* {study} Subject Summary Report.xlsx")
    candidates = [
        path for path in glob.glob(pattern)
        if not os.path.basename(path).startswith("~$")
    ]
    if not candidates:
        raise FileNotFoundError(f"Nenalezen Subject Summary Report pro {study}")
    # Newest modification time first.
    candidates.sort(key=os.path.getmtime, reverse=True)
    newest = candidates[0]
    newest_name = os.path.basename(newest)
    today = datetime.date.today().strftime("%Y-%m-%d")
    if not newest_name.startswith(today):
        print(f"  UPOZORNĚNÍ: nejnovější Summary Report pro {study} není z dnešního dne ({newest_name[:10]})")
    return newest
 def read_summary_df(path):
    """Read a Summary workbook and return a DataFrame whose header is the row containing "Subject".

    Raises:
        ValueError: no row of the sheet contains a "Subject" cell.
    """
    raw = pd.read_excel(path, header=None)
    # First row in which any cell, stripped of whitespace, equals "Subject".
    header_row = next(
        (
            idx
            for idx, cells in raw.iterrows()
            if any(str(cell).strip() == "Subject" for cell in cells)
        ),
        None,
    )
    if header_row is None:
        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")
    # Re-read the sheet with the detected row as the header.
    return pd.read_excel(path, header=header_row)
 def find_detail_files(study):
    """Return the sorted Subject Detail workbooks dated like the newest Summary Report."""
    # The first 10 characters of the summary file name are its "YYYY-MM-DD" prefix.
    report_date = os.path.basename(find_summary_file(study))[:10]
    pattern = os.path.join(
        DETAILS_DIR, study, f"{report_date} {study} * Subject Detail.xlsx"
    )
    return sorted(
        path for path in glob.glob(pattern)
        if not os.path.basename(path).startswith("~$")
    )
 def parse_detail_visits(path):
    """Return the visit rows of a Detail workbook as a list of dicts.

    Rows are taken from below the row that contains a "Visit Type" cell on the
    "patient_detail_report" sheet; only rows whose first cell is "Past" or
    "Upcoming" are kept. Returns an empty list when no such header row exists.
    """
    sheet = pd.read_excel(path, sheet_name="patient_detail_report", header=None)
    header_row = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if any(str(cell).strip() == "Visit Type" for cell in cells)
        ),
        None,
    )
    if header_row is None:
        return []
    body = sheet.iloc[header_row + 1:].copy()
    # Address the cells by position 0..n-1 regardless of the original labels.
    body.columns = range(body.shape[1])
    visits = []
    for _, record in body.iterrows():
        kind = to_str(record.get(0))
        if kind in ("Past", "Upcoming"):
            visits.append({
                "visit_type":                  kind,
                "scheduled_date":              to_date(record.get(1)),
                "window_days":                 to_str(record.get(2)),
                "actual_date":                 to_date(record.get(3)),
                "irt_transaction_no":          to_int(record.get(4)),
                "irt_transaction_description": to_str(record.get(5)),
                "medication_assignment":       to_str(record.get(6)),
                "quantity_assigned":           to_int(record.get(7)),
                "medication_id":               to_str(record.get(8)),
            })
    return visits
 # ── insert helpers ────────────────────────────────────────────────────────────
 def insert_import(cursor, study, source_file):
    """Insert one iwrs_import row and return the cursor's lastrowid.

    Only the base name of source_file is stored; imported_at is the current local time.
    """
    statement = "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)"
    params = (study, datetime.datetime.now(), os.path.basename(source_file))
    cursor.execute(statement, params)
    return cursor.lastrowid
 def insert_uco3001_summary(cursor, import_id, df):
    """Insert every row of the UCO3001 summary DataFrame into iwrs_uco3001_subject_summary.

    Args:
        cursor: DB-API cursor used for the INSERTs.
        import_id: value stored in the import_id column of every row.
        df: DataFrame whose columns are the report's header names.

    Raises:
        KeyError: a required report column is missing from df.
    """
    sql = """
        INSERT INTO iwrs_uco3001_subject_summary (
            import_id, subject, prior_subject_identifier, site, investigator, location,
            cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight,
            rescreened_subject, adt_ir, three_or_more_advanced_therapies,
            only_oral_5asa_compounds, ustekinumab, isolated_proctitis,
            clinical_responder_status_i12_m0, irt_subject_status,
            i0_rand_date_local, last_irt_transaction,
            last_irt_transaction_date_local, last_irt_transaction_date_utc,
            next_irt_transaction, next_irt_transaction_date_local,
            most_recent_med_assignment_date, days_since_last_med_assignment,
            patient_forecast_status, patient_forecast_status_changed_date
        ) VALUES (
            %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
        )
    """
    # (report column, converter, required) in the order of the SQL column list
    # after import_id. Optional columns become NULL when the report lacks them.
    fields = (
        ("Subject", to_str, True),
        ("Prior Subject Identifier", to_str, False),
        ("Site", to_str, True),
        ("Investigator", to_str, True),
        ("Location", to_str, True),
        ("Cohort per IRT", to_str, True),
        ("Informed Consent Date", to_date, True),
        ("Adolescent Assent Date", to_date, False),
        ("Subject's age collection", to_int, True),
        ("Subject's weight collection", to_float, False),
        ("Rescreened Subject", to_str, False),
        ("ADT-IR", to_str, False),
        ("3 or More Advanced Therapies", to_str, False),
        ("Only Oral 5-ASA Compounds", to_str, False),
        ("Ustekinumab", to_str, False),
        ("Isolated Proctitis", to_str, False),
        ("Clinical Responder Status at I-12 / M-0", to_str, False),
        ("IRT Subject Status", to_str, True),
        ("I0_RAND_TIMESTAMP_LOCAL [Local]", to_date, False),
        ("Last Recorded IRT Transaction", to_str, True),
        ("Last Recorded IRT Transaction Date [Local]", to_date, True),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, True),
        ("Next Expected IRT Transaction", to_str, True),
        ("Next Expected IRT Transaction Date [Local]", to_date, True),
        ("Most Recent Medication Assignment Transaction [Local]", to_date, False),
        ("Days Since Last Medication Assignment Transaction", to_int, False),
        ("Patient Forecast Status", to_str, False),
        ("Patient Forecast Status Changed Date (UTC)", to_date, False),
    )
    col = df.columns.tolist()
    for _, r in df.iterrows():
        params = [import_id]
        for header, convert, required in fields:
            if required or header in col:
                # A missing required column raises KeyError here.
                params.append(convert(r[header]))
            else:
                params.append(None)
        cursor.execute(sql, tuple(params))
 def insert_mdd3003_summary(cursor, import_id, df):
    """Insert every row of the MDD3003 summary DataFrame into iwrs_mdd3003_subject_summary.

    Required report columns raise KeyError when absent; optional ones are
    stored as NULL when the report does not contain them.
    """
    sql = """
        INSERT INTO iwrs_mdd3003_subject_summary (
            import_id, subject, prior_subject_identifier, site, investigator, location,
            cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age,
            madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17,
            stratification_country, age_group, stable_remitters, irt_subject_status,
            last_irt_transaction, last_irt_transaction_date_local,
            last_irt_transaction_date_utc, next_irt_transaction,
            next_irt_transaction_date_local, date_screened, date_screen_failed,
            date_randomized_part1, date_early_withdraw_randomized_part1,
            date_open_label_induction, date_early_withdraw_open_label_induction,
            date_randomized_part2, date_early_withdraw_randomized_part2,
            date_completed, date_unblinded
        ) VALUES (
            %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
        )
    """
    # (report column, converter, required) in the order of the SQL column list
    # after import_id.
    layout = (
        ("Subject", to_str, True),
        ("Prior Subject Identifier", to_str, False),
        ("Site", to_str, True),
        ("Investigator", to_str, True),
        ("Location", to_str, True),
        ("Cohort per IRT", to_str, True),
        ("MADRS response criteria integrated or manually entered", to_str, False),
        ("Informed Consent Date", to_date, True),
        ("Subject's age collection", to_int, True),
        ("MADRS response criteria v1.5 from RAVE", to_str, False),
        ("MADRS response criteria v1.6 from RAVE", to_str, False),
        ("MADRS response criteria v1.7 from RAVE", to_str, False),
        ("Stratification Country", to_str, False),
        ("Age Group", to_str, False),
        ("Stable Remitters vs. Non Stable Remitters", to_str, False),
        ("IRT Subject Status", to_str, True),
        ("Last Recorded IRT Transaction", to_str, True),
        ("Last Recorded IRT Transaction Date [Local]", to_date, True),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, True),
        ("Next Expected IRT Transaction", to_str, True),
        ("Next Expected IRT Transaction Date [Local]", to_date, True),
        ("Date Screened [Local]", to_date, False),
        ("Date Screen Failed [Local]", to_date, False),
        ("Date Randomized Part 1 [Local]", to_date, False),
        ("Date Early Withdraw Randomized Part 1 [Local]", to_date, False),
        ("Date Open Label Induction [Local]", to_date, False),
        ("Date Early Withdraw Open Label Induction [Local]", to_date, False),
        ("Date Randomized Part 2 [Local]", to_date, False),
        ("Date Early Withdraw Randomized Part 2 [Local]", to_date, False),
        ("Date Completed [Local]", to_date, False),
        ("Date Unblinded [Local]", to_date, False),
    )
    present = df.columns.tolist()

    def cell(record, header, convert, required):
        # Optional columns missing from the report become NULL.
        if not required and header not in present:
            return None
        return convert(record[header])

    for _, record in df.iterrows():
        values = [import_id]
        for header, convert, required in layout:
            values.append(cell(record, header, convert, required))
        cursor.execute(sql, tuple(values))
 def insert_visits(cursor, import_id, study, subject, visits):
    """Insert one iwrs_subject_visits row per visit dict; does nothing for an empty list."""
    if not visits:
        return
    sql = """
        INSERT INTO iwrs_subject_visits (
            import_id, study, subject, visit_type, scheduled_date, window_days,
            actual_date, irt_transaction_no, irt_transaction_description,
            medication_assignment, quantity_assigned, medication_id
        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    # Visit dict keys in the order of the SQL column list.
    keys = (
        "visit_type", "scheduled_date", "window_days",
        "actual_date", "irt_transaction_no",
        "irt_transaction_description", "medication_assignment",
        "quantity_assigned", "medication_id",
    )
    prefix = (import_id, study, subject)
    for visit in visits:
        cursor.execute(sql, prefix + tuple(visit[key] for key in keys))
 # ── main ──────────────────────────────────────────────────────────────────────
 def import_study(conn, study):
    """Import the newest Summary Report and its Detail workbooks for one study.

    All inserts of a study run in one transaction: it is committed once at the
    end and rolled back on any error, so a failed import cannot be committed
    later by another study sharing the same connection.

    Args:
        conn: open database connection (provides cursor/commit/rollback).
        study: study identifier; "77242113UCO3001" selects the UCO3001 summary
            table, anything else the MDD3003 one.

    Returns:
        The import_id of the new iwrs_import row.
    """
    summary_path = find_summary_file(study)
    print(f"  Summary: {os.path.basename(summary_path)}")
    df_summary = read_summary_df(summary_path)
    df_summary = df_summary.dropna(how="all")
    detail_files = find_detail_files(study)
    print(f"  Detail souborů: {len(detail_files)}")
    # File name: "2026-05-04 77242113UCO3001 CZ100012001 Subject Detail.xlsx"
    name_re = re.compile(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx")
    cursor = conn.cursor()
    try:
        import_id = insert_import(cursor, study, summary_path)
        print(f"  import_id = {import_id}")
        if study == "77242113UCO3001":
            insert_uco3001_summary(cursor, import_id, df_summary)
        else:
            insert_mdd3003_summary(cursor, import_id, df_summary)
        print(f"  Summary řádků: {len(df_summary)}")
        visited = 0
        for path in detail_files:
            m = name_re.search(os.path.basename(path))
            subject = m.group(1) if m else "UNKNOWN"
            visits = parse_detail_visits(path)
            insert_visits(cursor, import_id, study, subject, visits)
            visited += len(visits)
        conn.commit()
    except Exception:
        # Discard the partial import before the caller moves on to the next study.
        conn.rollback()
        raise
    finally:
        cursor.close()
    print(f"  Transakce uloženo: {visited}")
    return import_id
 def main():
    """Import every study in STUDIES over one connection, reporting errors per study."""
    conn = get_conn()
    print("Připojeno k MySQL.\n")
    try:
        for study in STUDIES:
            print(f"[{study}]")
            try:
                import_id = import_study(conn, study)
                print(f"  OK — import_id {import_id}\n")
            except Exception as e:
                # Best effort: one failing study must not stop the others.
                print(f"  CHYBA: {e}\n")
    finally:
        # Also release the connection on KeyboardInterrupt and similar exits.
        conn.close()
    print("Hotovo.")
 # Run only when executed as a script, so importing the module has no side effects.
 if __name__ == "__main__":
    main()
@@ -0,0 +1,422 @@
 """
 Complete pipeline:
  1. Download the Subject Summary Reports (both studies)
  2. Download the Subject Detail Reports (both studies)
  3. Import into MySQL
 Run this script instead of the three separate scripts.
 """
 import os
 import datetime
 import glob
 import re
 from playwright.sync_api import sync_playwright
 import pandas as pd
 import db_config
 import mysql.connector
 # ── CONFIG ───────────────────────────────────────────────────────────────────
 BASE_URL = "https://janssen.4gclinical.com"
 # Portal credentials. They can be supplied through IWRS_EMAIL / IWRS_PASSWORD;
 # the literals are only fallbacks that keep the previous behaviour.
 # NOTE(review): a real password is committed to version control here. Rotate it,
 # supply it through the environment and then drop the fallback.
 EMAIL    = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
 PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
 STUDIES = ["77242113UCO3001", "42847922MDD3003"]
 # Download folders are resolved relative to this script's own directory.
 BASE_DIR     = os.path.dirname(os.path.abspath(__file__))
 INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
 DETAILS_DIR  = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
 # ── helpers ───────────────────────────────────────────────────────────────────
 def unique_path(directory, stem):
    """Return a path "<directory>/<stem>.xlsx" that does not exist yet.

    When the plain name is taken, the current time ("HHMM") is appended; when
    that name is taken too (two runs within the same minute), a numeric suffix
    " (2)", " (3)", ... is added so an earlier download is never overwritten.
    """
    path = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(path):
        return path
    time_tag = datetime.datetime.now().strftime("%H%M")
    tagged = f"{stem} {time_tag}"
    path = os.path.join(directory, f"{tagged}.xlsx")
    counter = 2
    while os.path.exists(path):
        path = os.path.join(directory, f"{tagged} ({counter}).xlsx")
        counter += 1
    return path
 def login(page, study):
    """Sign in at BASE_URL with EMAIL / PASSWORD and select the given study."""
    def settle():
        # Wait until the page has no pending network activity.
        page.wait_for_load_state("networkidle")

    page.goto(BASE_URL)
    settle()
    email_box = page.get_by_label("Email *")
    email_box.fill(EMAIL)
    password_box = page.get_by_label("Password *")
    password_box.fill(PASSWORD)
    page.locator("#login__submit").click()
    settle()
    # Open the study picker, choose the study and confirm.
    page.get_by_label("Study *").click()
    page.get_by_role("option", name=study).click()
    page.get_by_role("button", name="SELECT").click()
    settle()
 # ── STEP 1: Subject Summary ───────────────────────────────────────────────────
 def download_summary(page, study, today):
    """Download the Subject Summary Report into INCOMING_DIR and return the saved path."""
    long_wait = 120000  # timeout in milliseconds for page load and download
    print(f"  [{study}] Stahuji Subject Summary Report...")
    page.goto(f"{BASE_URL}/report/patient_summary_report")
    page.wait_for_load_state("networkidle", timeout=long_wait)
    target = unique_path(INCOMING_DIR, f"{today} {study} Subject Summary Report")
    with page.expect_download(timeout=long_wait) as pending:
        page.get_by_role("button", name="Download XLS").click()
    pending.value.save_as(target)
    print(f"  [{study}] Summary OK -> {os.path.basename(target)}")
    return target
 # ── STEP 2: Subject Details ───────────────────────────────────────────────────
 def get_subjects_from_summary(summary_path):
    """Return the non-empty "Subject" values of a Summary workbook as stripped strings.

    Header detection is delegated to read_summary_df so both readers of the
    Summary workbook locate the header row the same way.

    Raises:
        ValueError: the workbook has no row containing a "Subject" cell.
    """
    df = read_summary_df(summary_path)
    return df["Subject"].dropna().astype(str).str.strip().tolist()
 def download_details(page, study, summary_path, today):
    """Download one Subject Detail workbook per subject listed in the Summary Report.

    Files are saved as "<today> <study> <subject> Subject Detail.xlsx" in the
    study's subfolder of DETAILS_DIR, which is created when missing.
    """
    long_wait = 120000  # timeout in milliseconds for page loads and downloads
    out_dir = os.path.join(DETAILS_DIR, study)
    os.makedirs(out_dir, exist_ok=True)
    subjects = get_subjects_from_summary(summary_path)
    print(f"  [{study}] Subjektů k stažení: {len(subjects)}")
    page.goto(f"{BASE_URL}/report/patient_detail_report")
    page.wait_for_load_state("networkidle", timeout=long_wait)
    for subject in subjects:
        target = os.path.join(out_dir, f"{today} {study} {subject} Subject Detail.xlsx")
        # Type the subject into the search box and pick the first offered option.
        search_box = page.locator('input[placeholder="search"], input[type="text"]').first
        search_box.click()
        search_box.fill(subject)
        page.wait_for_timeout(500)
        page.locator("mat-option").first.dispatch_event("click")
        page.wait_for_load_state("networkidle", timeout=long_wait)
        with page.expect_download(timeout=long_wait) as pending:
            page.get_by_role("button", name="Download XLS").click()
        pending.value.save_as(target)
        print(f"  [{study}] Detail {subject} OK")
        # Reset the form before the next subject.
        page.get_by_role("button", name="Clear").click()
        page.wait_for_load_state("networkidle", timeout=long_wait)
 # ── STEP 3: Import into MySQL ─────────────────────────────────────────────────
 def get_conn():
    """Open and return a new MySQL connection built from the db_config settings."""
    params = dict(
        host=db_config.DB_HOST,
        port=db_config.DB_PORT,
        user=db_config.DB_USER,
        password=db_config.DB_PASSWORD,
        database=db_config.DB_NAME,
    )
    connection = mysql.connector.connect(**params)
    return connection
 def to_date(val):
    """Convert a pandas Timestamp / datetime / date / string to a date, or None.

    Returns None for missing values (None, NaN, NaT, empty or "nan"-like
    strings) and for strings that match none of the supported formats.
    """
    # pd.NaT must be rejected first: it is an instance of datetime.datetime and
    # NaT.date() returns NaT again, which would otherwise reach the database layer.
    if val is None or val is pd.NaT:
        return None
    if isinstance(val, float) and pd.isna(val):
        return None
    # pd.Timestamp subclasses datetime.datetime, so this branch covers both.
    if isinstance(val, datetime.datetime):
        return val.date()
    if isinstance(val, datetime.date):
        return val
    s = str(val).strip()
    if s.lower() in ("nat", "nan", "none", ""):
        return None
    for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.datetime.strptime(s, fmt).date()
        except ValueError:
            continue
    return None
 def to_int(val):
    """Return val truncated to an int, or None when it is missing or not a finite number."""
    try:
        v = float(val)
        return None if pd.isna(v) else int(v)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int() of +/-infinity, e.g. when the cell holds the text "inf".
        return None
 def to_float(val):
    """Return val as a float, or None when it is missing or cannot be parsed."""
    try:
        parsed = float(val)
    except (TypeError, ValueError):
        return None
    return parsed if not pd.isna(parsed) else None
 def to_str(val):
    """Return val as stripped text, or None for missing / "nan"-like values."""
    missing_float = isinstance(val, float) and pd.isna(val)
    if val is None or missing_float:
        return None
    cleaned = str(val).strip()
    if cleaned.lower() in ("nan", "nat", "none", ""):
        return None
    return cleaned
 def read_summary_df(path):
    """Read a Summary workbook using the row containing "Subject" as header; drop all-empty rows.

    Raises:
        ValueError: no row of the sheet contains a "Subject" cell.
    """
    raw = pd.read_excel(path, header=None)
    header_row = next(
        (
            idx
            for idx, cells in raw.iterrows()
            if any(str(cell).strip() == "Subject" for cell in cells)
        ),
        None,
    )
    if header_row is None:
        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")
    table = pd.read_excel(path, header=header_row)
    return table.dropna(how="all")
 def parse_detail_visits(path):
    """Return the visit rows of a Detail workbook as a list of dicts.

    Rows are taken from below the row that contains a "Visit Type" cell on the
    "patient_detail_report" sheet; only rows whose first cell is "Past" or
    "Upcoming" are kept. Returns an empty list when no such header row exists.
    """
    sheet = pd.read_excel(path, sheet_name="patient_detail_report", header=None)
    header_row = None
    for idx, cells in sheet.iterrows():
        if any(str(cell).strip() == "Visit Type" for cell in cells):
            header_row = idx
            break
    if header_row is None:
        return []
    body = sheet.iloc[header_row + 1:].copy()
    # Address the cells by position 0..n-1 regardless of the original labels.
    body.columns = range(body.shape[1])
    collected = []
    for _, record in body.iterrows():
        kind = to_str(record.get(0))
        if kind in ("Past", "Upcoming"):
            collected.append({
                "visit_type":                  kind,
                "scheduled_date":              to_date(record.get(1)),
                "window_days":                 to_str(record.get(2)),
                "actual_date":                 to_date(record.get(3)),
                "irt_transaction_no":          to_int(record.get(4)),
                "irt_transaction_description": to_str(record.get(5)),
                "medication_assignment":       to_str(record.get(6)),
                "quantity_assigned":           to_int(record.get(7)),
                "medication_id":               to_str(record.get(8)),
            })
    return collected
 def insert_import(cursor, study, source_file):
    """Insert one iwrs_import row and return the cursor's lastrowid.

    Only the base name of source_file is stored; imported_at is the current local time.
    """
    file_name = os.path.basename(source_file)
    imported_at = datetime.datetime.now()
    cursor.execute(
        "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)",
        (study, imported_at, file_name),
    )
    return cursor.lastrowid
 def insert_uco3001_summary(cursor, import_id, df):
    """Insert every row of the UCO3001 summary DataFrame into iwrs_uco3001_subject_summary.

    Required report columns raise KeyError when absent; optional ones are
    stored as NULL when the report does not contain them.
    """
    sql = """INSERT INTO iwrs_uco3001_subject_summary (
        import_id, subject, prior_subject_identifier, site, investigator, location,
        cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight,
        rescreened_subject, adt_ir, three_or_more_advanced_therapies,
        only_oral_5asa_compounds, ustekinumab, isolated_proctitis,
        clinical_responder_status_i12_m0, irt_subject_status,
        i0_rand_date_local, last_irt_transaction,
        last_irt_transaction_date_local, last_irt_transaction_date_utc,
        next_irt_transaction, next_irt_transaction_date_local,
        most_recent_med_assignment_date, days_since_last_med_assignment,
        patient_forecast_status, patient_forecast_status_changed_date
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # (report column, converter, required) in the order of the SQL column list
    # after import_id.
    layout = (
        ("Subject", to_str, True),
        ("Prior Subject Identifier", to_str, False),
        ("Site", to_str, True),
        ("Investigator", to_str, True),
        ("Location", to_str, True),
        ("Cohort per IRT", to_str, True),
        ("Informed Consent Date", to_date, True),
        ("Adolescent Assent Date", to_date, False),
        ("Subject's age collection", to_int, True),
        ("Subject's weight collection", to_float, False),
        ("Rescreened Subject", to_str, False),
        ("ADT-IR", to_str, False),
        ("3 or More Advanced Therapies", to_str, False),
        ("Only Oral 5-ASA Compounds", to_str, False),
        ("Ustekinumab", to_str, False),
        ("Isolated Proctitis", to_str, False),
        ("Clinical Responder Status at I-12 / M-0", to_str, False),
        ("IRT Subject Status", to_str, True),
        ("I0_RAND_TIMESTAMP_LOCAL [Local]", to_date, False),
        ("Last Recorded IRT Transaction", to_str, True),
        ("Last Recorded IRT Transaction Date [Local]", to_date, True),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, True),
        ("Next Expected IRT Transaction", to_str, True),
        ("Next Expected IRT Transaction Date [Local]", to_date, True),
        ("Most Recent Medication Assignment Transaction [Local]", to_date, False),
        ("Days Since Last Medication Assignment Transaction", to_int, False),
        ("Patient Forecast Status", to_str, False),
        ("Patient Forecast Status Changed Date (UTC)", to_date, False),
    )
    present = df.columns.tolist()
    for _, record in df.iterrows():
        values = [import_id]
        for header, convert, required in layout:
            if required or header in present:
                values.append(convert(record[header]))
            else:
                values.append(None)
        cursor.execute(sql, tuple(values))
 def insert_mdd3003_summary(cursor, import_id, df):
    """Insert each row of the MDD3003 Subject Summary frame into MySQL.

    One ``INSERT`` into ``iwrs_mdd3003_subject_summary`` is executed per row
    of ``df``. Every value is passed through one of the file's converters
    (``to_str`` / ``to_date`` / ``to_int``) before being bound.

    Args:
        cursor: DB cursor exposing ``execute(sql, params)``.
        import_id: Identifier of the import run, stored with every row.
        df: Frame holding the summary report; must expose ``columns`` and
            ``iterrows()``.

    Columns flagged optional below are bound as ``None`` when the report
    does not contain them. Columns that are not optional are indexed
    unconditionally, so a report lacking one of them makes the row lookup
    raise.
    """
    sql = """INSERT INTO iwrs_mdd3003_subject_summary (
        import_id, subject, prior_subject_identifier, site, investigator, location,
        cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age,
        madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17,
        stratification_country, age_group, stable_remitters, irt_subject_status,
        last_irt_transaction, last_irt_transaction_date_local,
        last_irt_transaction_date_utc, next_irt_transaction,
        next_irt_transaction_date_local, date_screened, date_screen_failed,
        date_randomized_part1, date_early_withdraw_randomized_part1,
        date_open_label_induction, date_early_withdraw_open_label_induction,
        date_randomized_part2, date_early_withdraw_randomized_part2,
        date_completed, date_unblinded
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

    # (report column header, converter, optional?) -- listed in exactly the
    # order of the SQL column list above, after the leading import_id.
    field_specs = (
        ("Subject", to_str, False),
        ("Prior Subject Identifier", to_str, True),
        ("Site", to_str, False),
        ("Investigator", to_str, False),
        ("Location", to_str, False),
        ("Cohort per IRT", to_str, False),
        ("MADRS response criteria integrated or manually entered", to_str, True),
        ("Informed Consent Date", to_date, False),
        ("Subject's age collection", to_int, False),
        ("MADRS response criteria v1.5 from RAVE", to_str, True),
        ("MADRS response criteria v1.6 from RAVE", to_str, True),
        ("MADRS response criteria v1.7 from RAVE", to_str, True),
        ("Stratification Country", to_str, True),
        ("Age Group", to_str, True),
        ("Stable Remitters vs. Non Stable Remitters", to_str, True),
        ("IRT Subject Status", to_str, False),
        ("Last Recorded IRT Transaction", to_str, False),
        ("Last Recorded IRT Transaction Date [Local]", to_date, False),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, False),
        ("Next Expected IRT Transaction", to_str, False),
        ("Next Expected IRT Transaction Date [Local]", to_date, False),
        ("Date Screened [Local]", to_date, True),
        ("Date Screen Failed [Local]", to_date, True),
        ("Date Randomized Part 1 [Local]", to_date, True),
        ("Date Early Withdraw Randomized Part 1 [Local]", to_date, True),
        ("Date Open Label Induction [Local]", to_date, True),
        ("Date Early Withdraw Open Label Induction [Local]", to_date, True),
        ("Date Randomized Part 2 [Local]", to_date, True),
        ("Date Early Withdraw Randomized Part 2 [Local]", to_date, True),
        ("Date Completed [Local]", to_date, True),
        ("Date Unblinded [Local]", to_date, True),
    )

    available = df.columns.tolist()
    for _, row in df.iterrows():
        params = [import_id]
        for header, convert, optional in field_specs:
            if optional and header not in available:
                # Column missing from this report export -> store NULL.
                params.append(None)
            else:
                params.append(convert(row[header]))
        cursor.execute(sql, tuple(params))
 def insert_visits(cursor, import_id, study, subject, visits):
    """Insert the parsed visit/transaction rows of one subject.

    Executes one ``INSERT`` into ``iwrs_subject_visits`` per element of
    ``visits``. Does nothing when ``visits`` is empty or ``None``.

    Args:
        cursor: DB cursor exposing ``execute(sql, params)``.
        import_id: Identifier of the import run, stored with every row.
        study: Study code stored with every row.
        subject: Subject identifier stored with every row.
        visits: Iterable of mappings; each must provide every key listed in
            ``visit_keys`` below (a missing key raises on lookup).
    """
    if not visits:
        return
    sql = """INSERT INTO iwrs_subject_visits (
        import_id, study, subject, visit_type, scheduled_date, window_days,
        actual_date, irt_transaction_no, irt_transaction_description,
        medication_assignment, quantity_assigned, medication_id
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

    # Per-visit keys, in the order of the SQL column list after the three
    # leading columns that are shared by every row of this call.
    visit_keys = (
        "visit_type", "scheduled_date", "window_days",
        "actual_date", "irt_transaction_no",
        "irt_transaction_description", "medication_assignment",
        "quantity_assigned", "medication_id",
    )
    shared = (import_id, study, subject)
    for visit in visits:
        cursor.execute(sql, shared + tuple(visit[key] for key in visit_keys))
 def import_to_mysql(summary_path, detail_files, study):
    """Import one study's summary report and detail reports into MySQL.

    Reads the summary file, registers a new import via ``insert_import``,
    inserts the summary rows with the study-specific inserter and then the
    visits parsed from every detail file. Everything is committed once at
    the end.

    Args:
        summary_path: Path of the downloaded Subject Summary report.
        detail_files: Paths of the Subject Detail reports for this study.
        study: Study code; ``"77242113UCO3001"`` selects the UCO3001
            inserter, any other value the MDD3003 inserter.

    Returns:
        The ``import_id`` returned by ``insert_import``.

    Raises:
        Whatever the reader, parser or database calls raise. The cursor and
        connection are closed before the exception propagates, and
        ``commit()`` is not called in that case.
    """
    print(f"\n  [MySQL] Importuji {study}...")
    df_summary = read_summary_df(summary_path)

    # The subject id is the token between the study code and "Subject Detail"
    # in the file name; compiled once instead of on every loop iteration.
    detail_name_re = re.compile(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx")

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            import_id = insert_import(cursor, study, summary_path)

            if study == "77242113UCO3001":
                insert_uco3001_summary(cursor, import_id, df_summary)
            else:
                insert_mdd3003_summary(cursor, import_id, df_summary)

            total_visits = 0
            for path in detail_files:
                fname = os.path.basename(path)
                m = detail_name_re.search(fname)
                # File names that do not match are still imported, under a
                # placeholder subject.
                subject = m.group(1) if m else "UNKNOWN"
                visits = parse_detail_visits(path)
                insert_visits(cursor, import_id, study, subject, visits)
                total_visits += len(visits)

            conn.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, including on failure; previously an
        # exception left it open while the caller moved on to the next study.
        # NOTE(review): DB-API connections roll back uncommitted work on
        # close -- this assumes get_conn() does not enable autocommit.
        conn.close()

    print(f"  [MySQL] import_id={import_id} | pacientů={len(df_summary)} | transakcí={total_visits}")
    return import_id
 # ── MAIN ─────────────────────────────────────────────────────────────────────
 def main():
    """Run the whole pipeline: download reports per study, then import them.

    Steps 1 + 2 open a fresh Chromium browser for every study in ``STUDIES``,
    log in, download the Subject Summary report and then the Subject Detail
    reports. Any exception during these steps is printed and the study's
    summary path is reset to ``None`` -- this also happens when only the
    detail download fails after the summary was already saved.

    Step 3 imports every study that still has a summary path, together with
    today's detail files found under ``DETAILS_DIR/<study>``. Import errors
    are printed and do not stop the remaining studies.
    """
    def banner(title):
        # Three-line section header framed by 60 '=' characters.
        print(f"\n{'='*60}")
        print(title)
        print(f"{'='*60}")

    today = datetime.date.today().strftime("%Y-%m-%d")
    for directory in (INCOMING_DIR, DETAILS_DIR):
        os.makedirs(directory, exist_ok=True)

    summary_paths = {}

    # Steps 1 + 2: downloads (Playwright; one browser per study so that each
    # study gets its own session).
    with sync_playwright() as pw:
        for study in STUDIES:
            banner(f"[{study}] KROK 1: Subject Summary Report")

            browser = pw.chromium.launch(headless=False)
            context = browser.new_context(accept_downloads=True)
            page = context.new_page()
            try:
                login(page, study)
                downloaded_summary = download_summary(page, study, today)
                summary_paths[study] = downloaded_summary

                print(f"\n[{study}] KROK 2: Subject Detail Reports")
                download_details(page, study, downloaded_summary, today)
            except Exception as e:
                print(f"  [{study}] CHYBA při stahování: {e}")
                # Mark the study as failed so step 3 skips it.
                summary_paths[study] = None
            finally:
                browser.close()

    # Step 3: import into MySQL.
    banner("KROK 3: Import do MySQL")

    for study in STUDIES:
        summary_path = summary_paths.get(study)
        if summary_path:
            detail_pattern = os.path.join(
                DETAILS_DIR, study, f"{today} {study} * Subject Detail.xlsx"
            )
            detail_files = sorted(glob.glob(detail_pattern))
            try:
                import_to_mysql(summary_path, detail_files, study)
            except Exception as e:
                print(f"  [{study}] CHYBA při importu: {e}")
        else:
            print(f"  [{study}] PŘESKOČENO — stahování selhalo")

    banner("Vše hotovo.")
 # Script entry point: the full download + import pipeline starts as soon as
 # this statement executes.
 # NOTE(review): there is no `if __name__ == "__main__":` guard, so merely
 # importing this module also triggers the run -- confirm that is intended.
 main()