"""Complete IWRS pipeline.

1. Download the Subject Summary Reports (both studies).
2. Download the Subject Detail Reports + notifications (both studies).
3. Import into MySQL (summary, visits, notifications).

Run this script instead of the individual scripts.
"""
import datetime
import glob
import math
import os
import re

import mysql.connector
import numpy as np
import pandas as pd
from playwright.sync_api import sync_playwright

import db_config
import download_subject_details as dsd

# ── CONFIG ───────────────────────────────────────────────────────────────────
BASE_URL = "https://janssen.4gclinical.com"
# SECURITY: credentials must not live in source control. Set IWRS_EMAIL /
# IWRS_PASSWORD in the environment; the literals below are kept only as a
# backward-compatible fallback and should be removed (and the password
# rotated) once the environment variables are deployed.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")

# Playwright timeout (ms) used for slow report pages and downloads.
_REPORT_TIMEOUT_MS = 120000

# Textual spellings of "no value" that the converters map to None.
_NULL_TEXTS = frozenset({"", "nan", "nat", "none"})

# Date layouts accepted by to_date() for string cells, tried in order.
_DATE_FORMATS = ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S")

# Extracts the subject id from "<date> <study> <subject> Subject Detail.xlsx".
_DETAIL_NAME_RE = re.compile(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx")


# ── helpers ──────────────────────────────────────────────────────────────────
def unique_path(directory, stem):
    """Return a not-yet-existing ``.xlsx`` path for *stem* inside *directory*.

    The plain ``<stem>.xlsx`` is preferred. If it exists, the current time
    (``HHMM``) is appended; if that exists as well (second run within the
    same minute), a numeric counter is appended so that an earlier download
    is never overwritten.
    """
    path = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(path):
        return path
    time_tag = datetime.datetime.now().strftime("%H%M")
    tagged_stem = f"{stem} {time_tag}"
    candidate = os.path.join(directory, f"{tagged_stem}.xlsx")
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{tagged_stem} ({counter}).xlsx")
        counter += 1
    return candidate


def login(page, study):
    """Log in at BASE_URL with EMAIL/PASSWORD and select *study* on *page*."""
    page.goto(BASE_URL)
    page.wait_for_load_state("networkidle")
    page.get_by_label("Email *").fill(EMAIL)
    page.get_by_label("Password *").fill(PASSWORD)
    page.locator("#login__submit").click()
    page.wait_for_load_state("networkidle")
    page.get_by_label("Study *").click()
    page.get_by_role("option", name=study).click()
    page.get_by_role("button", name="SELECT").click()
    page.wait_for_load_state("networkidle")


def _find_header_row(raw, marker):
    """Return the index of the first row of *raw* containing the cell *marker*.

    Cells are compared after ``str()`` + ``strip()``. Returns ``None`` when no
    row contains the marker.
    """
    for index, row in raw.iterrows():
        if marker in (str(value).strip() for value in row):
            return index
    return None


# ── STEP 1: Subject Summary ──────────────────────────────────────────────────
def download_summary(page, study, today):
    """Download the Subject Summary Report XLS and return the saved path.

    The file is stored in INCOMING_DIR as
    ``<today> <study> Subject Summary Report.xlsx`` (made unique by
    unique_path when a file of that name already exists).
    """
    print(f" [{study}] Stahuji Subject Summary Report...")
    page.goto(f"{BASE_URL}/report/patient_summary_report")
    page.wait_for_load_state("networkidle", timeout=_REPORT_TIMEOUT_MS)
    filename = unique_path(INCOMING_DIR, f"{today} {study} Subject Summary Report")
    with page.expect_download(timeout=_REPORT_TIMEOUT_MS) as dl:
        page.get_by_role("button", name="Download XLS").click()
    dl.value.save_as(filename)
    print(f" [{study}] Summary OK -> {os.path.basename(filename)}")
    return filename


# ── STEP 2: Subject Details ──────────────────────────────────────────────────
def get_subjects_from_summary(summary_path):
    """Return the non-empty, stripped ``Subject`` values of a summary report.

    Raises:
        ValueError: if no row of the sheet contains a ``Subject`` cell.
    """
    raw = pd.read_excel(summary_path, header=None)
    header_row = _find_header_row(raw, "Subject")
    if header_row is None:
        raise ValueError("Hlavičkový řádek nenalezen")
    # Re-read with the real header row so pandas names the columns.
    df = pd.read_excel(summary_path, header=header_row)
    return df["Subject"].dropna().astype(str).str.strip().tolist()


def download_details(page, study, summary_path, today):
    """Download one Subject Detail XLS per subject listed in *summary_path*.

    Files are written to ``DETAILS_DIR/<study>/`` as
    ``<today> <study> <subject> Subject Detail.xlsx``.

    NOTE(review): main() in this file calls ``dsd.run`` for step 2 and does
    not call this function.
    """
    out_dir = os.path.join(DETAILS_DIR, study)
    os.makedirs(out_dir, exist_ok=True)
    subjects = get_subjects_from_summary(summary_path)
    print(f" [{study}] Subjektů k stažení: {len(subjects)}")
    page.goto(f"{BASE_URL}/report/patient_detail_report")
    page.wait_for_load_state("networkidle", timeout=_REPORT_TIMEOUT_MS)
    for subject in subjects:
        filename = os.path.join(out_dir, f"{today} {study} {subject} Subject Detail.xlsx")
        input_field = page.locator('input[placeholder="search"], input[type="text"]').first
        input_field.click()
        input_field.fill(subject)
        # Give the autocomplete a moment to render, then pick the first hit.
        page.wait_for_timeout(500)
        page.locator("mat-option").first.dispatch_event("click")
        page.wait_for_load_state("networkidle", timeout=_REPORT_TIMEOUT_MS)
        with page.expect_download(timeout=_REPORT_TIMEOUT_MS) as dl:
            page.get_by_role("button", name="Download XLS").click()
        dl.value.save_as(filename)
        print(f" [{study}] Detail {subject} OK")
        page.get_by_role("button", name="Clear").click()
        page.wait_for_load_state("networkidle", timeout=_REPORT_TIMEOUT_MS)


# ── STEP 3: Import into MySQL ────────────────────────────────────────────────
def get_conn():
    """Open a MySQL connection using the settings in ``db_config``."""
    return mysql.connector.connect(
        host=db_config.DB_HOST,
        port=db_config.DB_PORT,
        user=db_config.DB_USER,
        password=db_config.DB_PASSWORD,
        database=db_config.DB_NAME,
    )


def _py(val):
    """Convert a numpy scalar to the native Python type; pass others through."""
    if isinstance(val, np.generic):
        return val.item()
    return val


def to_date(val):
    """Convert a cell value to ``datetime.date``, or ``None`` if not a date.

    Accepts date/datetime/``pd.Timestamp``/``np.datetime64`` objects and
    strings in one of ``_DATE_FORMATS``. Missing values (None, NaN, NaT,
    empty or "nan"/"nat"/"none" text) and unparseable text give ``None``.
    """
    if isinstance(val, np.datetime64):
        # datetime64[ns].item() yields a plain int, so go through Timestamp.
        val = pd.Timestamp(val)
    else:
        val = _py(val)
    if val is None:
        return None
    try:
        if pd.isna(val):
            return None
    except (TypeError, ValueError):
        # pd.isna on list-likes returns an array whose truth value is
        # ambiguous; such values fall through to the string handling.
        pass
    if isinstance(val, datetime.datetime):  # also covers pd.Timestamp
        return val.date()
    if isinstance(val, datetime.date):
        return val
    text = str(val).strip()
    if text.lower() in _NULL_TEXTS:
        return None
    for fmt in _DATE_FORMATS:
        try:
            return datetime.datetime.strptime(text, fmt).date()
        except ValueError:
            continue
    return None


def to_int(val):
    """Convert a cell value to ``int`` (truncating), or ``None``.

    Returns ``None`` for missing, non-numeric, NaN and infinite values.
    """
    val = _py(val)
    if isinstance(val, int):
        # Avoid the float round-trip (precision loss / overflow on big ints).
        return int(val)
    try:
        number = float(val)
    except (TypeError, ValueError, OverflowError):
        return None
    if not math.isfinite(number):
        return None
    return int(number)


def to_float(val):
    """Convert a cell value to ``float``, or ``None`` for missing/NaN/invalid."""
    val = _py(val)
    try:
        number = float(val)
    except (TypeError, ValueError, OverflowError):
        return None
    return None if math.isnan(number) else number


def to_str(val):
    """Convert a cell value to a stripped string, or ``None`` when empty.

    None, NaN and the texts ""/"nan"/"nat"/"none" (case-insensitive) all
    map to ``None``.
    """
    val = _py(val)
    if val is None:
        return None
    if isinstance(val, float) and math.isnan(val):
        return None
    text = str(val).strip()
    return None if text.lower() in _NULL_TEXTS else text


def read_summary_df(path):
    """Read a Subject Summary Report into a DataFrame.

    The header row is the first row containing a ``Subject`` cell; rows that
    are entirely empty are dropped.

    Raises:
        ValueError: if no header row is found.
    """
    raw = pd.read_excel(path, header=None)
    header_row = _find_header_row(raw, "Subject")
    if header_row is None:
        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")
    return pd.read_excel(path, header=header_row).dropna(how="all")


def parse_detail_visits(path):
    """Parse the visit table of a Subject Detail Report.

    Reads sheet ``patient_detail_report``, locates the row containing a
    ``Visit Type`` cell and converts every following row whose first column
    is ``Past`` or ``Upcoming`` into a dict keyed by the
    ``iwrs_subject_visits`` column names. Returns ``[]`` when the header row
    is not found.
    """
    df = pd.read_excel(path, sheet_name="patient_detail_report", header=None)
    header_row = _find_header_row(df, "Visit Type")
    if header_row is None:
        return []
    visits_df = df.iloc[header_row + 1:].copy()
    # Address cells by position: the sheet's own header is not used as names.
    visits_df.columns = range(visits_df.shape[1])
    rows = []
    for _, r in visits_df.iterrows():
        visit_type = to_str(r.get(0))
        if visit_type not in ("Past", "Upcoming"):
            continue
        rows.append({
            "visit_type": visit_type,
            "scheduled_date": to_date(r.get(1)),
            "window_days": to_str(r.get(2)),
            "actual_date": to_date(r.get(3)),
            "irt_transaction_no": to_int(r.get(4)),
            "irt_transaction_description": to_str(r.get(5)),
            "medication_assignment": to_str(r.get(6)),
            "quantity_assigned": to_int(r.get(7)),
            "medication_id": to_str(r.get(8)),
        })
    return rows


def insert_import(cursor, study, source_file):
    """Insert an ``iwrs_import`` row and return its auto-generated id."""
    cursor.execute(
        "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)",
        (study, datetime.datetime.now(), os.path.basename(source_file)),
    )
    return cursor.lastrowid


# Column specs for the summary tables, in INSERT order (after import_id):
# (db_column, report_header, converter, required).
# A required header missing from the report raises KeyError on the first
# row; a missing optional header is stored as NULL.
_UCO3001_SUMMARY_COLUMNS = (
    ("subject", "Subject", to_str, True),
    ("prior_subject_identifier", "Prior Subject Identifier", to_str, False),
    ("site", "Site", to_str, True),
    ("investigator", "Investigator", to_str, True),
    ("location", "Location", to_str, True),
    ("cohort_per_irt", "Cohort per IRT", to_str, True),
    ("informed_consent_date", "Informed Consent Date", to_date, True),
    ("adolescent_assent_date", "Adolescent Assent Date", to_date, False),
    ("age", "Subject's age collection", to_int, True),
    ("weight", "Subject's weight collection", to_float, False),
    ("rescreened_subject", "Rescreened Subject", to_str, False),
    ("adt_ir", "ADT-IR", to_str, False),
    ("three_or_more_advanced_therapies", "3 or More Advanced Therapies", to_str, False),
    ("only_oral_5asa_compounds", "Only Oral 5-ASA Compounds", to_str, False),
    ("ustekinumab", "Ustekinumab", to_str, False),
    ("isolated_proctitis", "Isolated Proctitis", to_str, False),
    ("clinical_responder_status_i12_m0",
     "Clinical Responder Status at I-12 / M-0", to_str, False),
    ("irt_subject_status", "IRT Subject Status", to_str, True),
    ("i0_rand_date_local", "I0_RAND_TIMESTAMP_LOCAL [Local]", to_date, False),
    ("last_irt_transaction", "Last Recorded IRT Transaction", to_str, True),
    ("last_irt_transaction_date_local",
     "Last Recorded IRT Transaction Date [Local]", to_date, True),
    ("last_irt_transaction_date_utc",
     "Last Recorded IRT Transaction Date (UTC)", to_date, True),
    ("next_irt_transaction", "Next Expected IRT Transaction", to_str, True),
    ("next_irt_transaction_date_local",
     "Next Expected IRT Transaction Date [Local]", to_date, True),
    ("most_recent_med_assignment_date",
     "Most Recent Medication Assignment Transaction [Local]", to_date, False),
    ("days_since_last_med_assignment",
     "Days Since Last Medication Assignment Transaction", to_int, False),
    ("patient_forecast_status", "Patient Forecast Status", to_str, False),
    ("patient_forecast_status_changed_date",
     "Patient Forecast Status Changed Date (UTC)", to_date, False),
)

_MDD3003_SUMMARY_COLUMNS = (
    ("subject", "Subject", to_str, True),
    ("prior_subject_identifier", "Prior Subject Identifier", to_str, False),
    ("site", "Site", to_str, True),
    ("investigator", "Investigator", to_str, True),
    ("location", "Location", to_str, True),
    ("cohort_per_irt", "Cohort per IRT", to_str, True),
    ("madrs_criteria_integrated",
     "MADRS response criteria integrated or manually entered", to_str, False),
    ("informed_consent_date", "Informed Consent Date", to_date, True),
    ("age", "Subject's age collection", to_int, True),
    ("madrs_criteria_v15", "MADRS response criteria v1.5 from RAVE", to_str, False),
    ("madrs_criteria_v16", "MADRS response criteria v1.6 from RAVE", to_str, False),
    ("madrs_criteria_v17", "MADRS response criteria v1.7 from RAVE", to_str, False),
    ("stratification_country", "Stratification Country", to_str, False),
    ("age_group", "Age Group", to_str, False),
    ("stable_remitters", "Stable Remitters vs. Non Stable Remitters", to_str, False),
    ("irt_subject_status", "IRT Subject Status", to_str, True),
    ("last_irt_transaction", "Last Recorded IRT Transaction", to_str, True),
    ("last_irt_transaction_date_local",
     "Last Recorded IRT Transaction Date [Local]", to_date, True),
    ("last_irt_transaction_date_utc",
     "Last Recorded IRT Transaction Date (UTC)", to_date, True),
    ("next_irt_transaction", "Next Expected IRT Transaction", to_str, True),
    ("next_irt_transaction_date_local",
     "Next Expected IRT Transaction Date [Local]", to_date, True),
    ("date_screened", "Date Screened [Local]", to_date, False),
    ("date_screen_failed", "Date Screen Failed [Local]", to_date, False),
    ("date_randomized_part1", "Date Randomized Part 1 [Local]", to_date, False),
    ("date_early_withdraw_randomized_part1",
     "Date Early Withdraw Randomized Part 1 [Local]", to_date, False),
    ("date_open_label_induction", "Date Open Label Induction [Local]", to_date, False),
    ("date_early_withdraw_open_label_induction",
     "Date Early Withdraw Open Label Induction [Local]", to_date, False),
    ("date_randomized_part2", "Date Randomized Part 2 [Local]", to_date, False),
    ("date_early_withdraw_randomized_part2",
     "Date Early Withdraw Randomized Part 2 [Local]", to_date, False),
    ("date_completed", "Date Completed [Local]", to_date, False),
    ("date_unblinded", "Date Unblinded [Local]", to_date, False),
)


def _insert_summary_rows(cursor, table, columns, import_id, df):
    """Insert every row of *df* into *table* according to *columns*.

    The INSERT statement is generated from the spec, so the column list and
    the placeholder count cannot drift apart. *table* and the column names
    come from module constants, never from report data.
    """
    db_columns = ["import_id"] + [spec[0] for spec in columns]
    placeholders = ",".join(["%s"] * len(db_columns))
    sql = f"INSERT INTO {table} ({', '.join(db_columns)}) VALUES ({placeholders})"
    present = set(df.columns)
    for _, row in df.iterrows():
        params = [import_id]
        for _, header, convert, required in columns:
            if required or header in present:
                # A missing required header raises KeyError here.
                params.append(convert(row[header]))
            else:
                params.append(None)
        cursor.execute(sql, tuple(params))


def insert_uco3001_summary(cursor, import_id, df):
    """Insert the summary rows of study UCO3001 for *import_id*.

    Raises:
        KeyError: if a required report column is missing from *df*.
    """
    _insert_summary_rows(
        cursor, "iwrs_uco3001_subject_summary", _UCO3001_SUMMARY_COLUMNS, import_id, df
    )


def insert_mdd3003_summary(cursor, import_id, df):
    """Insert the summary rows of study MDD3003 for *import_id*.

    Raises:
        KeyError: if a required report column is missing from *df*.
    """
    _insert_summary_rows(
        cursor, "iwrs_mdd3003_subject_summary", _MDD3003_SUMMARY_COLUMNS, import_id, df
    )


def insert_visits(cursor, import_id, study, subject, visits):
    """Insert the parsed *visits* (dicts from parse_detail_visits) of a subject."""
    if not visits:
        return
    sql = """INSERT INTO iwrs_subject_visits (
        import_id, study, subject, visit_type, scheduled_date, window_days,
        actual_date, irt_transaction_no, irt_transaction_description,
        medication_assignment, quantity_assigned, medication_id
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    for v in visits:
        cursor.execute(sql, (
            import_id,
            study,
            subject,
            v["visit_type"],
            v["scheduled_date"],
            v["window_days"],
            v["actual_date"],
            v["irt_transaction_no"],
            v["irt_transaction_description"],
            v["medication_assignment"],
            v["quantity_assigned"],
            v["medication_id"],
        ))


# Which inserter handles which study's summary layout.
_SUMMARY_INSERTERS = {
    "77242113UCO3001": insert_uco3001_summary,
    "42847922MDD3003": insert_mdd3003_summary,
}


def import_to_mysql(summary_path, detail_files, study):
    """Import one study's summary and detail files in a single transaction.

    Everything is committed together; on any error the transaction is rolled
    back and the cursor/connection are closed before the error propagates.

    Returns:
        The id of the created ``iwrs_import`` row.

    Raises:
        ValueError: if *study* has no known summary table, or the summary
            file has no header row.
    """
    print(f"\n [MySQL] Importuji {study}...")
    insert_summary = _SUMMARY_INSERTERS.get(study)
    if insert_summary is None:
        # Previously any unknown study was written to the MDD3003 table.
        raise ValueError(f"Unknown study: {study}")
    df_summary = read_summary_df(summary_path)

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            import_id = insert_import(cursor, study, summary_path)
            insert_summary(cursor, import_id, df_summary)
            total_visits = 0
            for path in detail_files:
                m = _DETAIL_NAME_RE.search(os.path.basename(path))
                subject = m.group(1) if m else "UNKNOWN"
                visits = parse_detail_visits(path)
                insert_visits(cursor, import_id, study, subject, visits)
                total_visits += len(visits)
            conn.commit()
        except Exception:
            conn.rollback()
            raise
        finally:
            cursor.close()
    finally:
        conn.close()

    print(f" [MySQL] import_id={import_id} | pacientů={len(df_summary)} | transakcí={total_visits}")
    return import_id


# ── MAIN ─────────────────────────────────────────────────────────────────────
def main():
    """Run the download steps for every study, then import into MySQL.

    A study whose download step raised is skipped during the import step.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    os.makedirs(INCOMING_DIR, exist_ok=True)
    os.makedirs(DETAILS_DIR, exist_ok=True)
    summary_paths = {}

    # ── Steps 1 + 2: downloads (Playwright; each study gets its own browser
    # because of the session) ──
    with sync_playwright() as p:
        for study in STUDIES:
            print(f"\n{'='*60}")
            print(f"[{study}] KROK 1: Subject Summary Report")
            print(f"{'='*60}")
            browser = p.chromium.launch(headless=False)
            context = browser.new_context(accept_downloads=True)
            page = context.new_page()
            try:
                login(page, study)
                summary_path = download_summary(page, study, today)
                summary_paths[study] = summary_path
                print(f"\n[{study}] KROK 2: Subject Detail Reports + notifikace")
                dsd.run(page, study)
            except Exception as e:
                print(f" [{study}] CHYBA při stahování: {e}")
                # Marks the study as failed so step 3 skips it.
                summary_paths[study] = None
            finally:
                browser.close()

    # ── Step 3: import into MySQL ────────────────────────────────────────────
    print(f"\n{'='*60}")
    print("KROK 3: Import do MySQL")
    print(f"{'='*60}")
    for study in STUDIES:
        summary_path = summary_paths.get(study)
        if not summary_path:
            print(f" [{study}] PŘESKOČENO — stahování selhalo")
            continue
        detail_files = sorted(glob.glob(
            os.path.join(DETAILS_DIR, study, f"{today} {study} * Subject Detail.xlsx")
        ))
        try:
            import_to_mysql(summary_path, detail_files, study)
        except Exception as e:
            print(f" [{study}] CHYBA při importu: {e}")

    print(f"\n{'='*60}")
    print("Vše hotovo.")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()