From afd9b3ef1715a7685989c2f73e26ca2abc1c8daa Mon Sep 17 00:00:00 2001
From: "vladimir.buzalka"
Date: Tue, 5 May 2026 10:40:13 +0200
Subject: [PATCH] z230

---
Review note: file contents were revised after this patch was generated, so
the abbreviated blob ids on the "index" lines are no longer authoritative.

 .../create_iwrs_tables.sql                    | 110 +++++
 IWRS/Přehledpacientůstručný/db_config.py      |  15 +
 .../Přehledpacientůstručný/import_to_mysql.py | 401 ++++++++++++++++++
 IWRS/Přehledpacientůstručný/run_all.py        | 188 ++++++++
 4 files changed, 714 insertions(+)
 create mode 100644 IWRS/Přehledpacientůstručný/create_iwrs_tables.sql
 create mode 100644 IWRS/Přehledpacientůstručný/db_config.py
 create mode 100644 IWRS/Přehledpacientůstručný/import_to_mysql.py
 create mode 100644 IWRS/Přehledpacientůstručný/run_all.py

diff --git a/IWRS/Přehledpacientůstručný/create_iwrs_tables.sql b/IWRS/Přehledpacientůstručný/create_iwrs_tables.sql
new file mode 100644
index 0000000..b20e6c0
--- /dev/null
+++ b/IWRS/Přehledpacientůstručný/create_iwrs_tables.sql
@@ -0,0 +1,110 @@
+-- IWRS tables for the studie database
+-- Run once: mysql -h 192.168.1.76 -u root -p studie < create_iwrs_tables.sql
+
+USE studie;
+
+-- ── Import log ───────────────────────────────────────────────────────────────
+CREATE TABLE IF NOT EXISTS iwrs_import (
+    import_id   INT AUTO_INCREMENT PRIMARY KEY,
+    study       VARCHAR(20)  NOT NULL,
+    imported_at DATETIME     NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    source_file VARCHAR(500) NOT NULL,
+    INDEX idx_study (study)
+);
+
+-- ── UCO3001 subject summary ───────────────────────────────────────────────────
+CREATE TABLE IF NOT EXISTS iwrs_uco3001_subject_summary (
+    id                                   INT AUTO_INCREMENT PRIMARY KEY,
+    import_id                            INT NOT NULL,
+    subject                              VARCHAR(20) NOT NULL,
+    prior_subject_identifier             VARCHAR(20),
+    site                                 VARCHAR(50),
+    investigator                         VARCHAR(100),
+    location                             VARCHAR(50),
+    cohort_per_irt                       VARCHAR(100),
+    informed_consent_date                DATE,
+    adolescent_assent_date               DATE,
+    age                                  SMALLINT,
+    weight                               DECIMAL(5,1),
+    rescreened_subject                   VARCHAR(10),
+    adt_ir                               VARCHAR(10),
+    three_or_more_advanced_therapies     VARCHAR(10),
+    only_oral_5asa_compounds             VARCHAR(10),
+    ustekinumab                          VARCHAR(10),
+    isolated_proctitis                   VARCHAR(10),
+    clinical_responder_status_i12_m0     VARCHAR(100),
+    irt_subject_status                   VARCHAR(50),
+    i0_rand_date_local                   DATE,
+    last_irt_transaction                 VARCHAR(100),
+    last_irt_transaction_date_local      DATE,
+    last_irt_transaction_date_utc        DATE,
+    next_irt_transaction                 VARCHAR(100),
+    next_irt_transaction_date_local      DATE,
+    most_recent_med_assignment_date      DATE,
+    days_since_last_med_assignment       SMALLINT,
+    patient_forecast_status              VARCHAR(50),
+    patient_forecast_status_changed_date DATE,
+    FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
+    INDEX idx_import (import_id),
+    INDEX idx_subject (subject)
+);
+
+-- ── MDD3003 subject summary ───────────────────────────────────────────────────
+CREATE TABLE IF NOT EXISTS iwrs_mdd3003_subject_summary (
+    id                                       INT AUTO_INCREMENT PRIMARY KEY,
+    import_id                                INT NOT NULL,
+    subject                                  VARCHAR(20) NOT NULL,
+    prior_subject_identifier                 VARCHAR(20),
+    site                                     VARCHAR(50),
+    investigator                             VARCHAR(100),
+    location                                 VARCHAR(50),
+    cohort_per_irt                           VARCHAR(50),
+    madrs_criteria_integrated                VARCHAR(50),
+    informed_consent_date                    DATE,
+    age                                      SMALLINT,
+    madrs_criteria_v15                       VARCHAR(10),
+    madrs_criteria_v16                       VARCHAR(10),
+    madrs_criteria_v17                       VARCHAR(10),
+    stratification_country                   VARCHAR(10),
+    age_group                                VARCHAR(20),
+    stable_remitters                         VARCHAR(50),
+    irt_subject_status                       VARCHAR(100),
+    last_irt_transaction                     VARCHAR(100),
+    last_irt_transaction_date_local          DATE,
+    last_irt_transaction_date_utc            DATE,
+    next_irt_transaction                     VARCHAR(100),
+    next_irt_transaction_date_local          DATE,
+    date_screened                            DATE,
+    date_screen_failed                       DATE,
+    date_randomized_part1                    DATE,
+    date_early_withdraw_randomized_part1     DATE,
+    date_open_label_induction                DATE,
+    date_early_withdraw_open_label_induction DATE,
+    date_randomized_part2                    DATE,
+    date_early_withdraw_randomized_part2     DATE,
+    date_completed                           DATE,
+    date_unblinded                           DATE,
+    FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
+    INDEX idx_import (import_id),
+    INDEX idx_subject (subject)
+);
+
+-- ── Subject visits / transactions (both studies) ─────────────────────────────
+CREATE TABLE IF NOT EXISTS iwrs_subject_visits (
+    id                          INT AUTO_INCREMENT PRIMARY KEY,
+    import_id                   INT NOT NULL,
+    study                       VARCHAR(20) NOT NULL,
+    subject                     VARCHAR(20) NOT NULL,
+    visit_type                  ENUM('Past','Upcoming') NOT NULL,
+    scheduled_date              DATE,
+    window_days                 VARCHAR(20),
+    actual_date                 DATE,
+    irt_transaction_no          SMALLINT,
+    irt_transaction_description VARCHAR(200),
+    medication_assignment       VARCHAR(200),
+    quantity_assigned           SMALLINT,
+    medication_id               VARCHAR(20),
+    FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
+    INDEX idx_import (import_id),
+    INDEX idx_study_subject (study, subject)
+);
diff --git a/IWRS/Přehledpacientůstručný/db_config.py b/IWRS/Přehledpacientůstručný/db_config.py
new file mode 100644
index 0000000..bfa5959
--- /dev/null
+++ b/IWRS/Přehledpacientůstručný/db_config.py
@@ -0,0 +1,15 @@
+"""MySQL connection settings for the IWRS import scripts.
+
+Every value can be overridden through an IWRS_DB_* environment variable.
+"""
+
+import os
+
+# SECURITY: the password fallback below is a plaintext credential committed
+# to version control. Rotate it, provide it via IWRS_DB_PASSWORD and delete
+# the literal; also prefer a dedicated least-privilege account over root.
+DB_HOST = os.environ.get("IWRS_DB_HOST", "192.168.1.76")
+DB_PORT = int(os.environ.get("IWRS_DB_PORT", "3306"))
+DB_USER = os.environ.get("IWRS_DB_USER", "root")
+DB_PASSWORD = os.environ.get("IWRS_DB_PASSWORD", "Vlado9674+")
+DB_NAME = os.environ.get("IWRS_DB_NAME", "studie")
diff --git a/IWRS/Přehledpacientůstručný/import_to_mysql.py b/IWRS/Přehledpacientůstručný/import_to_mysql.py
new file mode 100644
index 0000000..6a16cbe
--- /dev/null
+++ b/IWRS/Přehledpacientůstručný/import_to_mysql.py
@@ -0,0 +1,401 @@
+"""
+Import data from the IWRS Excel reports into MySQL (database "studie").
+
+Execution order:
+  1. download_subject_summary.py
+  2. download_subject_details.py
+  3. this script
+
+Every run creates a new import_id in iwrs_import.
+Reporting scripts always work with MAX(import_id) for the given study.
+
+Importing this module has no side effects, so run_all.py reuses the helpers
+defined here instead of carrying its own copy.
+"""
+
+import os
+import glob
+import datetime
+import re
+
+import pandas as pd
+import mysql.connector
+
+import db_config
+
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
+DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
+
+STUDIES = ["77242113UCO3001", "42847922MDD3003"]
+
+DATE_FORMATS = ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S")
+
+# File name layout: "2026-05-04 77242113UCO3001 CZ100012001 Subject Detail.xlsx"
+DETAIL_NAME_RE = re.compile(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx")
+
+
+# ── helpers ──────────────────────────────────────────────────────────────────
+
+def get_conn():
+    """Open a MySQL connection with the settings from db_config."""
+    return mysql.connector.connect(
+        host=db_config.DB_HOST,
+        port=db_config.DB_PORT,
+        user=db_config.DB_USER,
+        password=db_config.DB_PASSWORD,
+        database=db_config.DB_NAME,
+    )
+
+
+def _is_missing(val):
+    """Return True for None, float NaN and pandas NaT."""
+    return val is None or val is pd.NaT or (isinstance(val, float) and pd.isna(val))
+
+
+def to_date(val):
+    """Convert a Timestamp / datetime / date / string to a date, else None.
+
+    None, NaN, NaT, blank strings and strings matching none of DATE_FORMATS
+    all yield None.
+    """
+    # NaT has to be rejected before the datetime check: it passes
+    # isinstance(..., datetime.datetime) and its .date() is not a real date.
+    if _is_missing(val):
+        return None
+    if isinstance(val, datetime.datetime):  # includes pd.Timestamp
+        return val.date()
+    if isinstance(val, datetime.date):
+        return val
+    s = str(val).strip()
+    if s.lower() in ("nat", "nan", "none", ""):
+        return None
+    for fmt in DATE_FORMATS:
+        try:
+            return datetime.datetime.strptime(s, fmt).date()
+        except ValueError:
+            pass
+    return None
+
+
+def to_int(val):
+    """Convert to int via float (so "3.0" works); None when not numeric."""
+    try:
+        v = float(val)
+        return None if pd.isna(v) else int(v)
+    except (TypeError, ValueError, OverflowError):
+        return None
+
+
+def to_float(val):
+    """Convert to float; None for NaN or non-numeric input."""
+    try:
+        v = float(val)
+        return None if pd.isna(v) else v
+    except (TypeError, ValueError):
+        return None
+
+
+def to_str(val):
+    """Return the stripped string form of val; None for missing/blank values."""
+    if _is_missing(val):
+        return None
+    s = str(val).strip()
+    return None if s.lower() in ("nan", "nat", "none", "") else s
+
+
+def find_summary_file(study):
+    """Return the most recently modified Subject Summary Report for study.
+
+    Raises FileNotFoundError when none exists; prints a warning when the
+    newest file name does not start with today's date.
+    """
+    today = datetime.date.today().strftime("%Y-%m-%d")
+    # The trailing "*" also matches the " HHMM" suffix run_all.py appends when
+    # a report is downloaded more than once on the same day.
+    pattern = os.path.join(INCOMING_DIR, f"* {study} Subject Summary Report*.xlsx")
+    files = sorted(
+        [f for f in glob.glob(pattern) if not os.path.basename(f).startswith("~$")],
+        key=os.path.getmtime,
+        reverse=True,
+    )
+    if not files:
+        raise FileNotFoundError(f"Nenalezen Subject Summary Report pro {study}")
+    if not os.path.basename(files[0]).startswith(today):
+        print(f" UPOZORNĚNÍ: nejnovější Summary Report pro {study} není z dnešního dne ({os.path.basename(files[0])[:10]})")
+    return files[0]
+
+
+def read_summary_df(path):
+    """Read a Summary xlsx and return a DataFrame starting at the header row.
+
+    The header row is the first row containing a cell equal to "Subject".
+    """
+    raw = pd.read_excel(path, header=None)
+    header_row = None
+    for i, row in raw.iterrows():
+        if "Subject" in [str(v).strip() for v in row]:
+            header_row = i
+            break
+    if header_row is None:
+        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")
+    return pd.read_excel(path, header=header_row)
+
+
+def find_detail_files(study, summary_path=None):
+    """Return the sorted Detail reports dated like the Summary report.
+
+    summary_path defaults to find_summary_file(study); pass it when it is
+    already known to avoid a second lookup (and a repeated warning).
+    """
+    out_dir = os.path.join(DETAILS_DIR, study)
+    if summary_path is None:
+        summary_path = find_summary_file(study)
+    file_date = os.path.basename(summary_path)[:10]  # "YYYY-MM-DD"
+    pattern = os.path.join(out_dir, f"{file_date} {study} * Subject Detail.xlsx")
+    files = [f for f in glob.glob(pattern) if not os.path.basename(f).startswith("~$")]
+    return sorted(files)
+
+
+def parse_detail_visits(path):
+    """
+    Return a list of dicts with the visit data from a Detail xlsx.
+
+    Every table row below the "Visit Type" header whose first cell is
+    "Past" or "Upcoming" is one transaction; columns are read by position.
+    """
+    df = pd.read_excel(path, sheet_name="patient_detail_report", header=None)
+
+    header_row = None
+    for i, row in df.iterrows():
+        if "Visit Type" in [str(v).strip() for v in row]:
+            header_row = i
+            break
+    if header_row is None:
+        return []
+
+    visits_df = df.iloc[header_row + 1:].copy()
+    visits_df.columns = range(visits_df.shape[1])
+
+    rows = []
+    for _, r in visits_df.iterrows():
+        visit_type = to_str(r.get(0))
+        if visit_type not in ("Past", "Upcoming"):
+            continue
+        rows.append({
+            "visit_type": visit_type,
+            "scheduled_date": to_date(r.get(1)),
+            "window_days": to_str(r.get(2)),
+            "actual_date": to_date(r.get(3)),
+            "irt_transaction_no": to_int(r.get(4)),
+            "irt_transaction_description": to_str(r.get(5)),
+            "medication_assignment": to_str(r.get(6)),
+            "quantity_assigned": to_int(r.get(7)),
+            "medication_id": to_str(r.get(8)),
+        })
+    return rows
+
+
+# ── column maps ──────────────────────────────────────────────────────────────
+# Each entry: (table column, Excel header, converter, required).
+# A missing required header raises KeyError; a missing optional one is
+# stored as NULL.
+
+UCO3001_COLUMNS = [
+    ("subject", "Subject", to_str, True),
+    ("prior_subject_identifier", "Prior Subject Identifier", to_str, False),
+    ("site", "Site", to_str, True),
+    ("investigator", "Investigator", to_str, True),
+    ("location", "Location", to_str, True),
+    ("cohort_per_irt", "Cohort per IRT", to_str, True),
+    ("informed_consent_date", "Informed Consent Date", to_date, True),
+    ("adolescent_assent_date", "Adolescent Assent Date", to_date, False),
+    ("age", "Subject's age collection", to_int, True),
+    ("weight", "Subject's weight collection", to_float, False),
+    ("rescreened_subject", "Rescreened Subject", to_str, False),
+    ("adt_ir", "ADT-IR", to_str, False),
+    ("three_or_more_advanced_therapies", "3 or More Advanced Therapies", to_str, False),
+    ("only_oral_5asa_compounds", "Only Oral 5-ASA Compounds", to_str, False),
+    ("ustekinumab", "Ustekinumab", to_str, False),
+    ("isolated_proctitis", "Isolated Proctitis", to_str, False),
+    ("clinical_responder_status_i12_m0", "Clinical Responder Status at I-12 / M-0", to_str, False),
+    ("irt_subject_status", "IRT Subject Status", to_str, True),
+    ("i0_rand_date_local", "I0_RAND_TIMESTAMP_LOCAL [Local]", to_date, False),
+    ("last_irt_transaction", "Last Recorded IRT Transaction", to_str, True),
+    ("last_irt_transaction_date_local", "Last Recorded IRT Transaction Date [Local]", to_date, True),
+    ("last_irt_transaction_date_utc", "Last Recorded IRT Transaction Date (UTC)", to_date, True),
+    ("next_irt_transaction", "Next Expected IRT Transaction", to_str, True),
+    ("next_irt_transaction_date_local", "Next Expected IRT Transaction Date [Local]", to_date, True),
+    ("most_recent_med_assignment_date", "Most Recent Medication Assignment Transaction [Local]", to_date, False),
+    ("days_since_last_med_assignment", "Days Since Last Medication Assignment Transaction", to_int, False),
+    ("patient_forecast_status", "Patient Forecast Status", to_str, False),
+    ("patient_forecast_status_changed_date", "Patient Forecast Status Changed Date (UTC)", to_date, False),
+]
+
+MDD3003_COLUMNS = [
+    ("subject", "Subject", to_str, True),
+    ("prior_subject_identifier", "Prior Subject Identifier", to_str, False),
+    ("site", "Site", to_str, True),
+    ("investigator", "Investigator", to_str, True),
+    ("location", "Location", to_str, True),
+    ("cohort_per_irt", "Cohort per IRT", to_str, True),
+    ("madrs_criteria_integrated", "MADRS response criteria integrated or manually entered", to_str, False),
+    ("informed_consent_date", "Informed Consent Date", to_date, True),
+    ("age", "Subject's age collection", to_int, True),
+    ("madrs_criteria_v15", "MADRS response criteria v1.5 from RAVE", to_str, False),
+    ("madrs_criteria_v16", "MADRS response criteria v1.6 from RAVE", to_str, False),
+    ("madrs_criteria_v17", "MADRS response criteria v1.7 from RAVE", to_str, False),
+    ("stratification_country", "Stratification Country", to_str, False),
+    ("age_group", "Age Group", to_str, False),
+    ("stable_remitters", "Stable Remitters vs. Non Stable Remitters", to_str, False),
+    ("irt_subject_status", "IRT Subject Status", to_str, True),
+    ("last_irt_transaction", "Last Recorded IRT Transaction", to_str, True),
+    ("last_irt_transaction_date_local", "Last Recorded IRT Transaction Date [Local]", to_date, True),
+    ("last_irt_transaction_date_utc", "Last Recorded IRT Transaction Date (UTC)", to_date, True),
+    ("next_irt_transaction", "Next Expected IRT Transaction", to_str, True),
+    ("next_irt_transaction_date_local", "Next Expected IRT Transaction Date [Local]", to_date, True),
+    ("date_screened", "Date Screened [Local]", to_date, False),
+    ("date_screen_failed", "Date Screen Failed [Local]", to_date, False),
+    ("date_randomized_part1", "Date Randomized Part 1 [Local]", to_date, False),
+    ("date_early_withdraw_randomized_part1", "Date Early Withdraw Randomized Part 1 [Local]", to_date, False),
+    ("date_open_label_induction", "Date Open Label Induction [Local]", to_date, False),
+    ("date_early_withdraw_open_label_induction", "Date Early Withdraw Open Label Induction [Local]", to_date, False),
+    ("date_randomized_part2", "Date Randomized Part 2 [Local]", to_date, False),
+    ("date_early_withdraw_randomized_part2", "Date Early Withdraw Randomized Part 2 [Local]", to_date, False),
+    ("date_completed", "Date Completed [Local]", to_date, False),
+    ("date_unblinded", "Date Unblinded [Local]", to_date, False),
+]
+
+
+# ── insert helpers ───────────────────────────────────────────────────────────
+
+def insert_import(cursor, study, source_file):
+    """Log the import in iwrs_import and return the new import_id."""
+    cursor.execute(
+        "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)",
+        (study, datetime.datetime.now(), os.path.basename(source_file)),
+    )
+    return cursor.lastrowid
+
+
+def _insert_summary(cursor, table, columns, import_id, df):
+    """Insert every row of df into table according to a column map."""
+    # Table and column names come from the constants above, never from the
+    # reports, so formatting them into the statement is safe; values are bound.
+    names = ["import_id"] + [column for column, _, _, _ in columns]
+    sql = "INSERT INTO {} ({}) VALUES ({})".format(
+        table, ", ".join(names), ", ".join(["%s"] * len(names))
+    )
+    present = set(df.columns)
+    for _, r in df.iterrows():
+        values = [import_id]
+        for _, header, convert, required in columns:
+            if required or header in present:
+                values.append(convert(r[header]))
+            else:
+                values.append(None)
+        cursor.execute(sql, tuple(values))
+
+
+def insert_uco3001_summary(cursor, import_id, df):
+    """Insert the UCO3001 summary rows of df under import_id."""
+    _insert_summary(cursor, "iwrs_uco3001_subject_summary", UCO3001_COLUMNS, import_id, df)
+
+
+def insert_mdd3003_summary(cursor, import_id, df):
+    """Insert the MDD3003 summary rows of df under import_id."""
+    _insert_summary(cursor, "iwrs_mdd3003_subject_summary", MDD3003_COLUMNS, import_id, df)
+
+
+def insert_visits(cursor, import_id, study, subject, visits):
+    """Insert the parsed visits of one subject; no-op for an empty list."""
+    if not visits:
+        return
+    sql = """
+        INSERT INTO iwrs_subject_visits (
+            import_id, study, subject, visit_type, scheduled_date, window_days,
+            actual_date, irt_transaction_no, irt_transaction_description,
+            medication_assignment, quantity_assigned, medication_id
+        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
+    """
+    for v in visits:
+        cursor.execute(sql, (
+            import_id, study, subject,
+            v["visit_type"], v["scheduled_date"], v["window_days"],
+            v["actual_date"], v["irt_transaction_no"],
+            v["irt_transaction_description"], v["medication_assignment"],
+            v["quantity_assigned"], v["medication_id"],
+        ))
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def import_files(conn, study, summary_path, detail_files):
+    """Import one Summary report and its Detail reports as one transaction.
+
+    Returns (import_id, summary_rows, visit_rows). On any error the
+    transaction is rolled back and the exception is re-raised, so a failed
+    study cannot leave partial rows to be committed by a later one.
+    """
+    df_summary = read_summary_df(summary_path).dropna(how="all")
+    cursor = conn.cursor()
+    try:
+        import_id = insert_import(cursor, study, summary_path)
+        if study == "77242113UCO3001":
+            insert_uco3001_summary(cursor, import_id, df_summary)
+        else:
+            insert_mdd3003_summary(cursor, import_id, df_summary)
+
+        visit_rows = 0
+        for path in detail_files:
+            m = DETAIL_NAME_RE.search(os.path.basename(path))
+            subject = m.group(1) if m else "UNKNOWN"
+            visits = parse_detail_visits(path)
+            insert_visits(cursor, import_id, study, subject, visits)
+            visit_rows += len(visits)
+        conn.commit()
+    except Exception:
+        conn.rollback()
+        raise
+    finally:
+        cursor.close()
+    return import_id, len(df_summary), visit_rows
+
+
+def import_study(conn, study):
+    """Locate the newest reports for study, import them and return import_id."""
+    summary_path = find_summary_file(study)
+    print(f" Summary: {os.path.basename(summary_path)}")
+
+    detail_files = find_detail_files(study, summary_path)
+    print(f" Detail souborů: {len(detail_files)}")
+
+    import_id, summary_rows, visit_rows = import_files(
+        conn, study, summary_path, detail_files
+    )
+    print(f" import_id = {import_id}")
+    print(f" Summary řádků: {summary_rows}")
+    print(f" Transakce uloženo: {visit_rows}")
+    return import_id
+
+
+def main():
+    """Import every study in STUDIES, reporting failures per study."""
+    conn = get_conn()
+    print("Připojeno k MySQL.\n")
+    try:
+        for study in STUDIES:
+            print(f"[{study}]")
+            try:
+                import_id = import_study(conn, study)
+                print(f" OK — import_id {import_id}\n")
+            except Exception as e:
+                # Top-level boundary: report and continue with the next study.
+                print(f" CHYBA: {e}\n")
+    finally:
+        conn.close()
+    print("Hotovo.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/IWRS/Přehledpacientůstručný/run_all.py b/IWRS/Přehledpacientůstručný/run_all.py
new file mode 100644
index 0000000..9453955
--- /dev/null
+++ b/IWRS/Přehledpacientůstručný/run_all.py
@@ -0,0 +1,188 @@
+"""
+Complete pipeline:
+  1. Download the Subject Summary Reports (both studies)
+  2. Download the Subject Detail Reports (both studies)
+  3. Import into MySQL
+
+Run this script instead of the three separate scripts.
+"""
+
+import os
+import datetime
+import glob
+
+from playwright.sync_api import sync_playwright
+
+from import_to_mysql import (
+    DETAILS_DIR,
+    INCOMING_DIR,
+    STUDIES,
+    get_conn,
+    import_files,
+    read_summary_df,
+)
+
+# ── CONFIG ───────────────────────────────────────────────────────────────────
+BASE_URL = "https://janssen.4gclinical.com"
+# SECURITY: these fallbacks are plaintext credentials committed to version
+# control. Rotate the password, supply both through IWRS_EMAIL /
+# IWRS_PASSWORD and delete the literals.
+EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
+PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
+
+
+# ── helpers ──────────────────────────────────────────────────────────────────
+
+def unique_path(directory, stem):
+    """Return directory/stem.xlsx, or with an " HHMM" suffix if that exists."""
+    path = os.path.join(directory, f"{stem}.xlsx")
+    if not os.path.exists(path):
+        return path
+    time_tag = datetime.datetime.now().strftime("%H%M")
+    return os.path.join(directory, f"{stem} {time_tag}.xlsx")
+
+
+def login(page, study):
+    """Log in to the portal with EMAIL / PASSWORD and select study."""
+    page.goto(BASE_URL)
+    page.wait_for_load_state("networkidle")
+    page.get_by_label("Email *").fill(EMAIL)
+    page.get_by_label("Password *").fill(PASSWORD)
+    page.locator("#login__submit").click()
+    page.wait_for_load_state("networkidle")
+    page.get_by_label("Study *").click()
+    page.get_by_role("option", name=study).click()
+    page.get_by_role("button", name="SELECT").click()
+    page.wait_for_load_state("networkidle")
+
+
+# ── STEP 1: Subject Summary ──────────────────────────────────────────────────
+
+def download_summary(page, study, today):
+    """Download the Subject Summary Report and return the saved path."""
+    print(f" [{study}] Stahuji Subject Summary Report...")
+    page.goto(f"{BASE_URL}/report/patient_summary_report")
+    page.wait_for_load_state("networkidle", timeout=120000)
+    filename = unique_path(INCOMING_DIR, f"{today} {study} Subject Summary Report")
+    with page.expect_download(timeout=120000) as dl:
+        page.get_by_role("button", name="Download XLS").click()
+    dl.value.save_as(filename)
+    print(f" [{study}] Summary OK -> {os.path.basename(filename)}")
+    return filename
+
+
+# ── STEP 2: Subject Details ──────────────────────────────────────────────────
+
+def get_subjects_from_summary(summary_path):
+    """Return the stripped, non-empty Subject values of a Summary report."""
+    df = read_summary_df(summary_path)
+    return df["Subject"].dropna().astype(str).str.strip().tolist()
+
+
+def download_details(page, study, summary_path, today):
+    """Download one Subject Detail report per subject listed in the Summary."""
+    out_dir = os.path.join(DETAILS_DIR, study)
+    os.makedirs(out_dir, exist_ok=True)
+
+    subjects = get_subjects_from_summary(summary_path)
+    print(f" [{study}] Subjektů k stažení: {len(subjects)}")
+
+    page.goto(f"{BASE_URL}/report/patient_detail_report")
+    page.wait_for_load_state("networkidle", timeout=120000)
+
+    for subject in subjects:
+        filename = os.path.join(out_dir, f"{today} {study} {subject} Subject Detail.xlsx")
+        input_field = page.locator('input[placeholder="search"], input[type="text"]').first
+        input_field.click()
+        input_field.fill(subject)
+        page.wait_for_timeout(500)
+        page.locator("mat-option").first.dispatch_event("click")
+        page.wait_for_load_state("networkidle", timeout=120000)
+
+        with page.expect_download(timeout=120000) as dl:
+            page.get_by_role("button", name="Download XLS").click()
+        dl.value.save_as(filename)
+        print(f" [{study}] Detail {subject} OK")
+
+        page.get_by_role("button", name="Clear").click()
+        page.wait_for_load_state("networkidle", timeout=120000)
+
+
+# ── STEP 3: Import into MySQL ────────────────────────────────────────────────
+
+def import_to_mysql(summary_path, detail_files, study):
+    """Import one study's reports on a dedicated connection; return import_id."""
+    print(f"\n [MySQL] Importuji {study}...")
+    conn = get_conn()
+    try:
+        import_id, subjects, visits = import_files(
+            conn, study, summary_path, detail_files
+        )
+    finally:
+        # Close even when the import fails so the connection is not leaked.
+        conn.close()
+    print(f" [MySQL] import_id={import_id} | pacientů={subjects} | transakcí={visits}")
+    return import_id
+
+
+# ── MAIN ─────────────────────────────────────────────────────────────────────
+
+def main():
+    """Run download and import for every study in STUDIES."""
+    today = datetime.date.today().strftime("%Y-%m-%d")
+    os.makedirs(INCOMING_DIR, exist_ok=True)
+    os.makedirs(DETAILS_DIR, exist_ok=True)
+
+    summary_paths = {}
+
+    # ── Steps 1 + 2: downloads (Playwright, each study separately because of the session) ──
+    with sync_playwright() as p:
+        for study in STUDIES:
+            print(f"\n{'='*60}")
+            print(f"[{study}] KROK 1: Subject Summary Report")
+            print(f"{'='*60}")
+            browser = p.chromium.launch(headless=False)
+
+            try:
+                context = browser.new_context(accept_downloads=True)
+                page = context.new_page()
+                login(page, study)
+                summary_path = download_summary(page, study, today)
+                summary_paths[study] = summary_path
+
+                print(f"\n[{study}] KROK 2: Subject Detail Reports")
+                download_details(page, study, summary_path, today)
+            except Exception as e:
+                print(f" [{study}] CHYBA při stahování: {e}")
+                # An incomplete download must not be imported.
+                summary_paths[study] = None
+            finally:
+                browser.close()
+
+    # ── Step 3: import into MySQL ────────────────────────────────────────────
+    print(f"\n{'='*60}")
+    print("KROK 3: Import do MySQL")
+    print(f"{'='*60}")
+
+    for study in STUDIES:
+        summary_path = summary_paths.get(study)
+        if not summary_path:
+            print(f" [{study}] PŘESKOČENO — stahování selhalo")
+            continue
+
+        detail_files = sorted(glob.glob(
+            os.path.join(DETAILS_DIR, study, f"{today} {study} * Subject Detail.xlsx")
+        ))
+
+        try:
+            import_to_mysql(summary_path, detail_files, study)
+        except Exception as e:
+            print(f" [{study}] CHYBA při importu: {e}")
+
+    print(f"\n{'='*60}")
+    print("Vše hotovo.")
+    print(f"{'='*60}")
+
+
+if __name__ == "__main__":
+    main()