442 lines
19 KiB
Python
442 lines
19 KiB
Python
"""
|
|
Complete pipeline:
  1. Download the Subject Summary Reports (both studies)
  2. Download the Subject Detail Reports (both studies)
  3. Import into MySQL

Run this script instead of the three separate scripts.
|
|
"""
|
|
|
|
import os
|
|
import datetime
|
|
import glob
|
|
import re
|
|
|
|
from playwright.sync_api import sync_playwright
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
import db_config
|
|
import mysql.connector
|
|
|
|
# ── CONFIG ───────────────────────────────────────────────────────────────────
|
|
# Base URL of the IWRS portal; all report URLs are built from it.
BASE_URL = "https://janssen.4gclinical.com"

# SECURITY: portal credentials should not live in source control. They are
# read from the environment (IWRS_EMAIL / IWRS_PASSWORD). The literal
# fallbacks only keep existing setups working; remove them once the password
# has been rotated and the environment variables are in place.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")

# Study identifiers processed by the pipeline, in this order.
STUDIES = ["77242113UCO3001", "42847922MDD3003"]

# Download folders live next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
|
|
|
|
|
|
# ── helpers ───────────────────────────────────────────────────────────────────
|
|
|
|
def unique_path(directory, stem):
    """Return a path for ``<stem>.xlsx`` in *directory* that does not exist yet.

    The plain name is preferred. If it is taken, the current time (``HHMM``)
    is appended to the stem. If that name is taken as well (e.g. a second
    re-run within the same minute), a numeric suffix ``(2)``, ``(3)``, ... is
    added so that an existing report is never silently overwritten.

    Args:
        directory: Folder the file will be saved into.
        stem: File name without the ``.xlsx`` extension.

    Returns:
        A path inside *directory* for which ``os.path.exists`` is false at
        the time of the call.
    """
    path = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(path):
        return path

    time_tag = datetime.datetime.now().strftime("%H%M")
    tagged_stem = f"{stem} {time_tag}"
    candidate = os.path.join(directory, f"{tagged_stem}.xlsx")

    # The time tag has minute resolution only, so it can collide too.
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{tagged_stem} ({counter}).xlsx")
        counter += 1
    return candidate
|
|
|
|
|
|
def login(page, study):
    """Sign in to the portal and select *study*.

    Opens ``BASE_URL``, fills in the e-mail and password fields, submits the
    login form, picks *study* from the "Study" drop-down and confirms with
    the SELECT button. Waits for the network to go idle after each
    navigation step.

    Args:
        page: Playwright page to drive.
        study: Name of the option to choose in the study selector.
    """
    idle = "networkidle"

    # Login form
    page.goto(BASE_URL)
    page.wait_for_load_state(idle)
    email_box = page.get_by_label("Email *")
    email_box.fill(EMAIL)
    password_box = page.get_by_label("Password *")
    password_box.fill(PASSWORD)
    submit_button = page.locator("#login__submit")
    submit_button.click()
    page.wait_for_load_state(idle)

    # Study selection
    study_selector = page.get_by_label("Study *")
    study_selector.click()
    study_option = page.get_by_role("option", name=study)
    study_option.click()
    confirm_button = page.get_by_role("button", name="SELECT")
    confirm_button.click()
    page.wait_for_load_state(idle)
|
|
|
|
|
|
# ── KROK 1: Subject Summary ───────────────────────────────────────────────────
|
|
|
|
def download_summary(page, study, today):
    """Download the Subject Summary Report for the currently selected study.

    Args:
        page: Logged-in Playwright page.
        study: Study identifier; used in log output and the file name.
        today: Date string used as the file-name prefix.

    Returns:
        Path of the saved ``.xlsx`` file inside ``INCOMING_DIR``.
    """
    wait_ms = 120000

    print(f" [{study}] Stahuji Subject Summary Report...")
    report_url = f"{BASE_URL}/report/patient_summary_report"
    page.goto(report_url)
    page.wait_for_load_state("networkidle", timeout=wait_ms)

    target = unique_path(INCOMING_DIR, f"{today} {study} Subject Summary Report")

    # The click triggers the download; the download object is available once
    # the expect_download context has exited.
    with page.expect_download(timeout=wait_ms) as pending:
        download_button = page.get_by_role("button", name="Download XLS")
        download_button.click()
    pending.value.save_as(target)

    saved_name = os.path.basename(target)
    print(f" [{study}] Summary OK -> {saved_name}")
    return target
|
|
|
|
|
|
# ── KROK 2: Subject Details ───────────────────────────────────────────────────
|
|
|
|
def get_subjects_from_summary(summary_path):
    """Return the subject identifiers listed in a summary report.

    The workbook is scanned for the first row that contains a cell equal to
    ``"Subject"`` (after stripping whitespace). That row is used as the
    header and the non-empty values of the ``Subject`` column are returned.

    Args:
        summary_path: Path to the Subject Summary Report workbook.

    Returns:
        List of subject identifiers as stripped strings.

    Raises:
        ValueError: If no row contains a ``"Subject"`` cell.
    """
    sheet = pd.read_excel(summary_path, header=None)

    # Index of the first row that holds the column titles, or None.
    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if any(str(cell).strip() == "Subject" for cell in cells)
        ),
        None,
    )
    if header_idx is None:
        raise ValueError("Hlavičkový řádek nenalezen")

    table = pd.read_excel(summary_path, header=header_idx)
    subject_column = table["Subject"].dropna()
    as_text = subject_column.astype(str)
    return as_text.str.strip().tolist()
|
|
|
|
|
|
def download_details(page, study, summary_path, today):
    """Download one Subject Detail report per subject found in the summary.

    Files are written to ``DETAILS_DIR/<study>/`` and named
    ``"<today> <study> <subject> Subject Detail.xlsx"``.

    Args:
        page: Logged-in Playwright page.
        study: Study identifier; used for the output folder, file names and logs.
        summary_path: Path of the summary workbook the subject list is read from.
        today: Date string used as the file-name prefix.
    """
    long_wait = 120000

    target_dir = os.path.join(DETAILS_DIR, study)
    os.makedirs(target_dir, exist_ok=True)

    subject_ids = get_subjects_from_summary(summary_path)
    print(f" [{study}] Subjektů k stažení: {len(subject_ids)}")

    page.goto(f"{BASE_URL}/report/patient_detail_report")
    page.wait_for_load_state("networkidle", timeout=long_wait)

    for subject in subject_ids:
        target = os.path.join(
            target_dir, f"{today} {study} {subject} Subject Detail.xlsx"
        )

        # Type the subject into the search box and pick the first suggestion.
        search_box = page.locator(
            'input[placeholder="search"], input[type="text"]'
        ).first
        search_box.click()
        search_box.fill(subject)
        page.wait_for_timeout(500)
        first_suggestion = page.locator("mat-option").first
        first_suggestion.dispatch_event("click")
        page.wait_for_load_state("networkidle", timeout=long_wait)

        with page.expect_download(timeout=long_wait) as pending:
            page.get_by_role("button", name="Download XLS").click()
        pending.value.save_as(target)
        print(f" [{study}] Detail {subject} OK")

        # Reset the form before the next subject.
        page.get_by_role("button", name="Clear").click()
        page.wait_for_load_state("networkidle", timeout=long_wait)
|
|
|
|
|
|
# ── KROK 3: Import do MySQL ───────────────────────────────────────────────────
|
|
|
|
def get_conn():
    """Open a new MySQL connection using the settings from ``db_config``.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
    """
    settings = {
        "host": db_config.DB_HOST,
        "port": db_config.DB_PORT,
        "user": db_config.DB_USER,
        "password": db_config.DB_PASSWORD,
        "database": db_config.DB_NAME,
    }
    return mysql.connector.connect(**settings)
|
|
|
|
|
|
def _py(val):
|
|
"""Převede numpy skalár na Python nativní typ."""
|
|
if isinstance(val, np.generic):
|
|
return val.item()
|
|
return val
|
|
|
|
|
|
def to_date(val):
    """Convert a cell value to ``datetime.date``, or ``None`` if not possible.

    Missing values (``None``, NaN, anything ``pd.isna`` reports as missing,
    and the strings "nat", "nan", "none" or empty) give ``None``. Datetime
    objects are reduced to their date, dates are returned unchanged, and any
    other value is parsed from its string form using a fixed list of formats.

    Args:
        val: Raw cell value.

    Returns:
        A ``datetime.date`` or ``None``.
    """
    native = _py(val)

    if native is None:
        return None
    if isinstance(native, float) and native != native:  # NaN never equals itself
        return None

    try:
        if pd.isna(native):
            return None
    except (TypeError, ValueError):
        # pd.isna on containers yields an array whose truth value is ambiguous.
        pass

    # pd.Timestamp subclasses datetime.datetime; missing timestamps were
    # already rejected by the pd.isna check above.
    if isinstance(native, datetime.datetime):
        return native.date()
    if isinstance(native, datetime.date):
        return native

    text = str(native).strip()
    if text.lower() in {"", "nat", "nan", "none"}:
        return None

    known_formats = ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S")
    for pattern in known_formats:
        try:
            parsed = datetime.datetime.strptime(text, pattern)
        except ValueError:
            continue
        return parsed.date()
    return None
|
|
|
|
|
|
def to_int(val):
    """Convert a cell value to ``int``, or ``None`` if not possible.

    Integers are returned exactly, without a float round trip that would
    lose precision above 2**53. Other values are converted through ``float``
    and truncated toward zero. NaN, infinities and unparseable values give
    ``None`` instead of raising.

    Args:
        val: Raw cell value (numpy scalars are unwrapped first).

    Returns:
        An ``int`` or ``None``.
    """
    val = _py(val)
    if isinstance(val, int):
        # Includes bool, which the float-based path also mapped to 0/1.
        return int(val)
    try:
        # int(nan) raises ValueError, int(+/-inf) raises OverflowError.
        return int(float(val))
    except (TypeError, ValueError, OverflowError):
        return None
|
|
|
|
|
|
def to_float(val):
    """Convert a cell value to ``float``, or ``None`` for NaN/unparseable input.

    Args:
        val: Raw cell value (numpy scalars are unwrapped first).

    Returns:
        A ``float`` or ``None``.
    """
    native = _py(val)
    try:
        number = float(native)
    except (TypeError, ValueError):
        return None
    if number != number:  # NaN
        return None
    return number
|
|
|
|
|
|
def to_str(val):
    """Convert a cell value to a stripped string, or ``None`` when empty.

    ``None``, NaN and the strings "nan", "nat", "none" (any case) or an empty
    string all give ``None``.

    Args:
        val: Raw cell value (numpy scalars are unwrapped first).

    Returns:
        The stripped string or ``None``.
    """
    native = _py(val)
    if native is None or (isinstance(native, float) and native != native):
        return None
    text = str(native).strip()
    if text.lower() in ("nan", "nat", "none", ""):
        return None
    return text
|
|
|
|
|
|
def read_summary_df(path):
    """Load a summary report as a DataFrame with the correct header row.

    The header is the first row containing a cell equal to ``"Subject"``
    (after stripping whitespace). Rows that are entirely empty are dropped.

    Args:
        path: Path to the Subject Summary Report workbook.

    Returns:
        DataFrame whose columns are taken from the detected header row.

    Raises:
        ValueError: If no row contains a ``"Subject"`` cell.
    """
    sheet = pd.read_excel(path, header=None)

    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if any(str(cell).strip() == "Subject" for cell in cells)
        ),
        None,
    )
    if header_idx is None:
        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")

    table = pd.read_excel(path, header=header_idx)
    return table.dropna(how="all")
|
|
|
|
|
|
def parse_detail_visits(path):
    """Extract the visit rows from a Subject Detail report.

    Reads the ``patient_detail_report`` sheet, finds the first row containing
    a ``"Visit Type"`` cell and treats every following row as a candidate
    visit. Values are taken by column position (0-8). Only rows whose first
    column is ``"Past"`` or ``"Upcoming"`` are kept.

    Args:
        path: Path to the Subject Detail workbook.

    Returns:
        List of dicts, one per visit; empty if no header row is found.
    """
    sheet = pd.read_excel(path, sheet_name="patient_detail_report", header=None)

    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if any(str(cell).strip() == "Visit Type" for cell in cells)
        ),
        None,
    )
    if header_idx is None:
        return []

    body = sheet.iloc[header_idx + 1:].copy()
    body.columns = range(body.shape[1])

    # (output key, column position, converter) for everything after visit_type.
    field_spec = (
        ("scheduled_date", 1, to_date),
        ("window_days", 2, to_str),
        ("actual_date", 3, to_date),
        ("irt_transaction_no", 4, to_int),
        ("irt_transaction_description", 5, to_str),
        ("medication_assignment", 6, to_str),
        ("quantity_assigned", 7, to_int),
        ("medication_id", 8, to_str),
    )

    visits = []
    for _, record in body.iterrows():
        kind = to_str(record.get(0))
        if kind not in ("Past", "Upcoming"):
            continue
        visit = {"visit_type": kind}
        for key, position, convert in field_spec:
            visit[key] = convert(record.get(position))
        visits.append(visit)
    return visits
|
|
|
|
|
|
def insert_import(cursor, study, source_file):
    """Insert one row into ``iwrs_import`` and return its generated id.

    Args:
        cursor: Open database cursor.
        study: Study identifier stored with the import.
        source_file: Path of the imported file; only its base name is stored.

    Returns:
        ``cursor.lastrowid`` after the insert.
    """
    statement = (
        "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)"
    )
    params = (study, datetime.datetime.now(), os.path.basename(source_file))
    cursor.execute(statement, params)
    return cursor.lastrowid
|
|
|
|
|
|
def insert_uco3001_summary(cursor, import_id, df):
    """Insert every row of the UCO3001 summary into its summary table.

    Columns marked optional below are stored as NULL when the report does not
    contain them; a missing required column raises ``KeyError``.

    Args:
        cursor: Open database cursor.
        import_id: Id of the parent ``iwrs_import`` row.
        df: Summary DataFrame with the report's column titles.
    """
    sql = """INSERT INTO iwrs_uco3001_subject_summary (
        import_id, subject, prior_subject_identifier, site, investigator, location,
        cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight,
        rescreened_subject, adt_ir, three_or_more_advanced_therapies,
        only_oral_5asa_compounds, ustekinumab, isolated_proctitis,
        clinical_responder_status_i12_m0, irt_subject_status,
        i0_rand_date_local, last_irt_transaction,
        last_irt_transaction_date_local, last_irt_transaction_date_utc,
        next_irt_transaction, next_irt_transaction_date_local,
        most_recent_med_assignment_date, days_since_last_med_assignment,
        patient_forecast_status, patient_forecast_status_changed_date
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

    # (report column title, converter, optional) in the order of the SQL
    # column list above, import_id excluded.
    fields = (
        ("Subject", to_str, False),
        ("Prior Subject Identifier", to_str, True),
        ("Site", to_str, False),
        ("Investigator", to_str, False),
        ("Location", to_str, False),
        ("Cohort per IRT", to_str, False),
        ("Informed Consent Date", to_date, False),
        ("Adolescent Assent Date", to_date, True),
        ("Subject's age collection", to_int, False),
        ("Subject's weight collection", to_float, True),
        ("Rescreened Subject", to_str, True),
        ("ADT-IR", to_str, True),
        ("3 or More Advanced Therapies", to_str, True),
        ("Only Oral 5-ASA Compounds", to_str, True),
        ("Ustekinumab", to_str, True),
        ("Isolated Proctitis", to_str, True),
        ("Clinical Responder Status at I-12 / M-0", to_str, True),
        ("IRT Subject Status", to_str, False),
        ("I0_RAND_TIMESTAMP_LOCAL [Local]", to_date, True),
        ("Last Recorded IRT Transaction", to_str, False),
        ("Last Recorded IRT Transaction Date [Local]", to_date, False),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, False),
        ("Next Expected IRT Transaction", to_str, False),
        ("Next Expected IRT Transaction Date [Local]", to_date, False),
        ("Most Recent Medication Assignment Transaction [Local]", to_date, True),
        ("Days Since Last Medication Assignment Transaction", to_int, True),
        ("Patient Forecast Status", to_str, True),
        ("Patient Forecast Status Changed Date (UTC)", to_date, True),
    )
    present = df.columns.tolist()

    def cell(record, title, convert, optional):
        """Converted value of one cell, or None for an absent optional column."""
        if optional and title not in present:
            return None
        return convert(record[title])

    for _, record in df.iterrows():
        values = [import_id]
        values.extend(cell(record, *spec) for spec in fields)
        cursor.execute(sql, tuple(values))
|
|
|
|
|
|
def insert_mdd3003_summary(cursor, import_id, df):
    """Insert every row of the MDD3003 summary into its summary table.

    Columns marked optional below are stored as NULL when the report does not
    contain them; a missing required column raises ``KeyError``.

    Args:
        cursor: Open database cursor.
        import_id: Id of the parent ``iwrs_import`` row.
        df: Summary DataFrame with the report's column titles.
    """
    sql = """INSERT INTO iwrs_mdd3003_subject_summary (
        import_id, subject, prior_subject_identifier, site, investigator, location,
        cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age,
        madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17,
        stratification_country, age_group, stable_remitters, irt_subject_status,
        last_irt_transaction, last_irt_transaction_date_local,
        last_irt_transaction_date_utc, next_irt_transaction,
        next_irt_transaction_date_local, date_screened, date_screen_failed,
        date_randomized_part1, date_early_withdraw_randomized_part1,
        date_open_label_induction, date_early_withdraw_open_label_induction,
        date_randomized_part2, date_early_withdraw_randomized_part2,
        date_completed, date_unblinded
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

    # (report column title, converter, optional) in the order of the SQL
    # column list above, import_id excluded.
    fields = (
        ("Subject", to_str, False),
        ("Prior Subject Identifier", to_str, True),
        ("Site", to_str, False),
        ("Investigator", to_str, False),
        ("Location", to_str, False),
        ("Cohort per IRT", to_str, False),
        ("MADRS response criteria integrated or manually entered", to_str, True),
        ("Informed Consent Date", to_date, False),
        ("Subject's age collection", to_int, False),
        ("MADRS response criteria v1.5 from RAVE", to_str, True),
        ("MADRS response criteria v1.6 from RAVE", to_str, True),
        ("MADRS response criteria v1.7 from RAVE", to_str, True),
        ("Stratification Country", to_str, True),
        ("Age Group", to_str, True),
        ("Stable Remitters vs. Non Stable Remitters", to_str, True),
        ("IRT Subject Status", to_str, False),
        ("Last Recorded IRT Transaction", to_str, False),
        ("Last Recorded IRT Transaction Date [Local]", to_date, False),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, False),
        ("Next Expected IRT Transaction", to_str, False),
        ("Next Expected IRT Transaction Date [Local]", to_date, False),
        ("Date Screened [Local]", to_date, True),
        ("Date Screen Failed [Local]", to_date, True),
        ("Date Randomized Part 1 [Local]", to_date, True),
        ("Date Early Withdraw Randomized Part 1 [Local]", to_date, True),
        ("Date Open Label Induction [Local]", to_date, True),
        ("Date Early Withdraw Open Label Induction [Local]", to_date, True),
        ("Date Randomized Part 2 [Local]", to_date, True),
        ("Date Early Withdraw Randomized Part 2 [Local]", to_date, True),
        ("Date Completed [Local]", to_date, True),
        ("Date Unblinded [Local]", to_date, True),
    )
    present = df.columns.tolist()

    def cell(record, title, convert, optional):
        """Converted value of one cell, or None for an absent optional column."""
        if optional and title not in present:
            return None
        return convert(record[title])

    for _, record in df.iterrows():
        values = [import_id]
        values.extend(cell(record, *spec) for spec in fields)
        cursor.execute(sql, tuple(values))
|
|
|
|
|
|
def insert_visits(cursor, import_id, study, subject, visits):
    """Insert the visit rows of one subject into ``iwrs_subject_visits``.

    Does nothing when *visits* is empty.

    Args:
        cursor: Open database cursor.
        import_id: Id of the parent ``iwrs_import`` row.
        study: Study identifier stored with each visit.
        subject: Subject identifier stored with each visit.
        visits: Visit dicts with the keys listed in ``visit_keys`` below.
    """
    if not visits:
        return

    sql = """INSERT INTO iwrs_subject_visits (
        import_id, study, subject, visit_type, scheduled_date, window_days,
        actual_date, irt_transaction_no, irt_transaction_description,
        medication_assignment, quantity_assigned, medication_id
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

    # Dict keys in the order of the SQL column list after the three ids.
    visit_keys = (
        "visit_type",
        "scheduled_date",
        "window_days",
        "actual_date",
        "irt_transaction_no",
        "irt_transaction_description",
        "medication_assignment",
        "quantity_assigned",
        "medication_id",
    )
    row_prefix = (import_id, study, subject)
    for visit in visits:
        params = row_prefix + tuple(visit[key] for key in visit_keys)
        cursor.execute(sql, params)
|
|
|
|
|
|
def import_to_mysql(summary_path, detail_files, study):
    """Import one study's summary and detail reports in a single transaction.

    Creates an ``iwrs_import`` row, inserts the summary rows into the
    study-specific table and the visits parsed from every detail file. The
    subject of a detail file is taken from its file name; files that do not
    match the expected pattern are stored under subject ``"UNKNOWN"``.

    On any error the transaction is rolled back and the exception is
    re-raised. The cursor and connection are closed in all cases.

    Args:
        summary_path: Path to the Subject Summary Report workbook.
        detail_files: Paths of the Subject Detail workbooks to import.
        study: Study identifier; ``"77242113UCO3001"`` selects the UCO3001
            summary table, anything else the MDD3003 table.

    Returns:
        Id of the created ``iwrs_import`` row.
    """
    print(f"\n [MySQL] Importuji {study}...")
    df_summary = read_summary_df(summary_path)

    # Compiled once; captures the subject part of a detail file name.
    subject_pattern = re.compile(
        r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx"
    )

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            import_id = insert_import(cursor, study, summary_path)

            if study == "77242113UCO3001":
                insert_uco3001_summary(cursor, import_id, df_summary)
            else:
                insert_mdd3003_summary(cursor, import_id, df_summary)

            total_visits = 0
            for path in detail_files:
                fname = os.path.basename(path)
                m = subject_pattern.search(fname)
                subject = m.group(1) if m else "UNKNOWN"
                visits = parse_detail_visits(path)
                insert_visits(cursor, import_id, study, subject, visits)
                total_visits += len(visits)

            conn.commit()
        except Exception:
            # Leave no partially imported data behind.
            conn.rollback()
            raise
        finally:
            cursor.close()
    finally:
        conn.close()

    print(f" [MySQL] import_id={import_id} | pacientů={len(df_summary)} | transakcí={total_visits}")
    return import_id
|
|
|
|
|
|
# ── MAIN ─────────────────────────────────────────────────────────────────────
|
|
|
|
def main():
    """Run the whole pipeline: download both report types, then import.

    Each study gets its own browser instance. If anything fails while
    downloading for a study, that study is skipped in the import step.
    Import errors are printed and do not stop the remaining studies.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    for folder in (INCOMING_DIR, DETAILS_DIR):
        os.makedirs(folder, exist_ok=True)

    rule = "=" * 60
    summary_paths = {}

    # Steps 1 + 2: downloads (Playwright, one browser per study because of
    # the session).
    with sync_playwright() as p:
        for study in STUDIES:
            print(f"\n{rule}")
            print(f"[{study}] KROK 1: Subject Summary Report")
            print(f"{rule}")

            browser = p.chromium.launch(headless=False)
            context = browser.new_context(accept_downloads=True)
            page = context.new_page()

            try:
                login(page, study)
                downloaded = download_summary(page, study, today)
                summary_paths[study] = downloaded

                print(f"\n[{study}] KROK 2: Subject Detail Reports")
                download_details(page, study, downloaded, today)
            except Exception as e:
                print(f" [{study}] CHYBA při stahování: {e}")
                # Marks the study as failed so step 3 skips it.
                summary_paths[study] = None
            finally:
                browser.close()

    # Step 3: import into MySQL
    print(f"\n{rule}")
    print("KROK 3: Import do MySQL")
    print(f"{rule}")

    for study in STUDIES:
        summary_path = summary_paths.get(study)
        if not summary_path:
            print(f" [{study}] PŘESKOČENO — stahování selhalo")
            continue

        detail_pattern = os.path.join(
            DETAILS_DIR, study, f"{today} {study} * Subject Detail.xlsx"
        )
        detail_files = sorted(glob.glob(detail_pattern))

        try:
            import_to_mysql(summary_path, detail_files, study)
        except Exception as e:
            print(f" [{study}] CHYBA při importu: {e}")

    print(f"\n{rule}")
    print("Vše hotovo.")
    print(f"{rule}")
|
|
|
|
|
|
# Run the pipeline only when executed as a script, so importing this module
# (e.g. to reuse the converters) does not start downloads or a DB import.
if __name__ == "__main__":
    main()
|