This commit is contained in:
2026-05-05 10:40:13 +02:00
parent eaea634b2b
commit afd9b3ef17
4 changed files with 895 additions and 0 deletions
@@ -0,0 +1,110 @@
-- IWRS tabulky pro databázi studie
-- Spustit jednou: mysql -h 192.168.1.76 -u root -p studie < create_iwrs_tables.sql
USE studie;
-- ── Import log ───────────────────────────────────────────────────────────────
-- Import log. Each row identifies one imported report of one study;
-- import_id is the key that the other iwrs_* tables in this script
-- reference through their FOREIGN KEY (import_id) constraints.
CREATE TABLE IF NOT EXISTS iwrs_import (
import_id INT AUTO_INCREMENT PRIMARY KEY,
study VARCHAR(20) NOT NULL,
-- Defaults to the server time at insert when the client supplies no value.
imported_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
source_file VARCHAR(500) NOT NULL,
-- Supports lookups of the imports belonging to one study.
INDEX idx_study (study)
);
-- ── UCO3001 subject summary ───────────────────────────────────────────────────
-- Subject summary rows for study UCO3001, tied to an import through import_id.
-- NOTE(review): the column set presumably mirrors the IWRS "Subject Summary
-- Report" export for this study; the VARCHAR(10) columns look like short
-- Yes/No style flags - confirm against a real report.
CREATE TABLE IF NOT EXISTS iwrs_uco3001_subject_summary (
id INT AUTO_INCREMENT PRIMARY KEY,
import_id INT NOT NULL,
subject VARCHAR(20) NOT NULL,
prior_subject_identifier VARCHAR(20),
site VARCHAR(50),
investigator VARCHAR(100),
location VARCHAR(50),
cohort_per_irt VARCHAR(100),
informed_consent_date DATE,
adolescent_assent_date DATE,
age SMALLINT,
-- One decimal place, at most 9999.9.
weight DECIMAL(5,1),
rescreened_subject VARCHAR(10),
adt_ir VARCHAR(10),
three_or_more_advanced_therapies VARCHAR(10),
only_oral_5asa_compounds VARCHAR(10),
ustekinumab VARCHAR(10),
isolated_proctitis VARCHAR(10),
clinical_responder_status_i12_m0 VARCHAR(100),
irt_subject_status VARCHAR(50),
-- All date columns are DATE, so any time-of-day component is not stored.
i0_rand_date_local DATE,
last_irt_transaction VARCHAR(100),
last_irt_transaction_date_local DATE,
last_irt_transaction_date_utc DATE,
next_irt_transaction VARCHAR(100),
next_irt_transaction_date_local DATE,
most_recent_med_assignment_date DATE,
days_since_last_med_assignment SMALLINT,
patient_forecast_status VARCHAR(50),
patient_forecast_status_changed_date DATE,
-- Every row must belong to an existing import.
FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
INDEX idx_import (import_id),
INDEX idx_subject (subject)
);
-- ── MDD3003 subject summary ───────────────────────────────────────────────────
-- Subject summary rows for study MDD3003, tied to an import through import_id.
-- NOTE(review): the column set presumably mirrors the IWRS "Subject Summary
-- Report" export for this study - confirm against a real report.
CREATE TABLE IF NOT EXISTS iwrs_mdd3003_subject_summary (
id INT AUTO_INCREMENT PRIMARY KEY,
import_id INT NOT NULL,
subject VARCHAR(20) NOT NULL,
prior_subject_identifier VARCHAR(20),
site VARCHAR(50),
investigator VARCHAR(100),
location VARCHAR(50),
cohort_per_irt VARCHAR(50),
madrs_criteria_integrated VARCHAR(50),
informed_consent_date DATE,
age SMALLINT,
madrs_criteria_v15 VARCHAR(10),
madrs_criteria_v16 VARCHAR(10),
madrs_criteria_v17 VARCHAR(10),
stratification_country VARCHAR(10),
age_group VARCHAR(20),
stable_remitters VARCHAR(50),
irt_subject_status VARCHAR(100),
last_irt_transaction VARCHAR(100),
-- All date columns are DATE, so any time-of-day component is not stored.
last_irt_transaction_date_local DATE,
last_irt_transaction_date_utc DATE,
next_irt_transaction VARCHAR(100),
next_irt_transaction_date_local DATE,
-- Milestone dates; all nullable.
date_screened DATE,
date_screen_failed DATE,
date_randomized_part1 DATE,
date_early_withdraw_randomized_part1 DATE,
date_open_label_induction DATE,
date_early_withdraw_open_label_induction DATE,
date_randomized_part2 DATE,
date_early_withdraw_randomized_part2 DATE,
date_completed DATE,
date_unblinded DATE,
-- Every row must belong to an existing import.
FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
INDEX idx_import (import_id),
INDEX idx_subject (subject)
);
-- ── Subject visits / transactions (obě studie) ───────────────────────────────
-- Visit / IRT transaction rows shared by both studies; the study column
-- tells them apart and import_id ties each row to one import.
CREATE TABLE IF NOT EXISTS iwrs_subject_visits (
id INT AUTO_INCREMENT PRIMARY KEY,
import_id INT NOT NULL,
study VARCHAR(20) NOT NULL,
subject VARCHAR(20) NOT NULL,
-- Only these two literal values are accepted.
visit_type ENUM('Past','Upcoming') NOT NULL,
scheduled_date DATE,
-- Stored as text, not as a number.
window_days VARCHAR(20),
actual_date DATE,
irt_transaction_no SMALLINT,
irt_transaction_description VARCHAR(200),
medication_assignment VARCHAR(200),
quantity_assigned SMALLINT,
medication_id VARCHAR(20),
-- Every row must belong to an existing import.
FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
INDEX idx_import (import_id),
-- Composite index for per-subject lookups within one study.
INDEX idx_study_subject (study, subject)
);
@@ -0,0 +1,5 @@
import os

# Connection settings for the MySQL database used by the IWRS import scripts.
#
# Every value can be overridden with an IWRS_DB_* environment variable so that
# credentials do not have to live in source control. The literals are kept
# only as backward-compatible fallbacks.
# NOTE(review): the fallback password is committed in plain text and the
# account is root - rotate it and switch to a least-privilege user.
DB_HOST = os.environ.get("IWRS_DB_HOST", "192.168.1.76")
DB_PORT = int(os.environ.get("IWRS_DB_PORT", "3306"))
DB_USER = os.environ.get("IWRS_DB_USER", "root")
DB_PASSWORD = os.environ.get("IWRS_DB_PASSWORD", "Vlado9674+")
DB_NAME = os.environ.get("IWRS_DB_NAME", "studie")
@@ -0,0 +1,358 @@
"""
Importuje data z IWRS Excel reportů do MySQL (databáze studie).
Pořadí spuštění:
1. download_subject_summary.py
2. download_subject_details.py
3. tento skript
Každé spuštění vytvoří nový import_id v iwrs_import.
Reportovací skripty pracují vždy s MAX(import_id) pro danou studii.
"""
import os
import glob
import datetime
import re
import pandas as pd
import mysql.connector
import db_config
# Directory that contains this script; the report folders are resolved relative to it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Folder searched for Subject Summary Report workbooks (see find_summary_file).
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
# Parent folder of the per-study Subject Detail workbooks (see find_detail_files).
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
# Study identifiers processed by main(), in this order.
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
# ── helpers ──────────────────────────────────────────────────────────────────
def get_conn():
    """Open and return a new MySQL connection built from the ``db_config`` settings."""
    settings = {
        "host": db_config.DB_HOST,
        "port": db_config.DB_PORT,
        "user": db_config.DB_USER,
        "password": db_config.DB_PASSWORD,
        "database": db_config.DB_NAME,
    }
    return mysql.connector.connect(**settings)
def to_date(val):
    """Convert a spreadsheet cell value to a ``datetime.date`` or ``None``.

    Accepts pandas Timestamps, ``datetime``/``date`` objects and strings in
    one of the formats ``YYYY-MM-DD``, ``DD-Mon-YYYY``, ``DD-MM-YYYY`` or
    ``YYYY-MM-DD HH:MM:SS``.

    Returns:
        The calendar date, or ``None`` for missing values (``None``, NaN,
        ``pd.NaT``, empty / "nan" / "nat" / "none" strings) and for text
        that matches none of the supported formats.
    """
    # pd.NaT is a datetime subclass but not a Timestamp, so without this guard
    # it would reach the datetime branch below instead of yielding None.
    if val is None or val is pd.NaT:
        return None
    if isinstance(val, float) and pd.isna(val):
        return None
    # pd.Timestamp is a datetime subclass, so this branch covers it as well.
    if isinstance(val, datetime.datetime):
        return val.date()
    if isinstance(val, datetime.date):
        return val
    s = str(val).strip()
    if not s or s.lower() in ("nat", "nan", "none"):
        return None
    for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.datetime.strptime(s, fmt).date()
        except ValueError:
            continue
    return None
def to_int(val):
    """Convert a cell value to ``int``, or ``None`` when it is not a usable number.

    The value goes through ``float`` first, so numeric strings and floats such
    as ``"12.0"`` or ``3.9`` are accepted and truncated towards zero.
    Missing values (``None``, NaN), non-numeric text and infinities give ``None``.
    """
    try:
        v = float(val)
        return None if pd.isna(v) else int(v)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int() of +/-infinity, which is not a storable integer.
        return None
def to_float(val):
    """Convert a cell value to ``float``; ``None`` for NaN or non-numeric input."""
    try:
        number = float(val)
    except (TypeError, ValueError):
        return None
    if pd.isna(number):
        return None
    return number
def to_str(val):
    """Return the stripped text of a cell value, or ``None`` when it is blank.

    ``None``, float NaN and the texts "nan", "nat", "none" (any letter case)
    as well as the empty string all count as blank.
    """
    if val is None:
        return None
    if isinstance(val, float) and pd.isna(val):
        return None
    text = str(val).strip()
    if text.lower() in ("nan", "nat", "none", ""):
        return None
    return text
def find_summary_file(study):
    """Return the most recently modified Subject Summary Report of *study*.

    Looks in ``INCOMING_DIR`` for ``* <study> Subject Summary Report.xlsx``,
    ignoring files whose name starts with ``~$``. Prints a warning when the
    newest file name does not start with today's date.

    Raises:
        FileNotFoundError: when no matching workbook exists.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    pattern = os.path.join(INCOMING_DIR, f"* {study} Subject Summary Report.xlsx")
    candidates = []
    for path in glob.glob(pattern):
        if os.path.basename(path).startswith("~$"):
            continue
        candidates.append(path)
    if not candidates:
        raise FileNotFoundError(f"Nenalezen Subject Summary Report pro {study}")
    # Newest by modification time; on ties the first glob hit wins.
    newest = max(candidates, key=os.path.getmtime)
    newest_name = os.path.basename(newest)
    if not newest_name.startswith(today):
        print(f" UPOZORNĚNÍ: nejnovější Summary Report pro {study} není z dnešního dne ({newest_name[:10]})")
    return newest
def read_summary_df(path):
    """Load a Summary workbook as a DataFrame whose header is the "Subject" row.

    The sheet is first read without a header to find the first row that has a
    cell equal to "Subject" (after stripping); the workbook is then re-read
    with that row as the header.

    Raises:
        ValueError: when no row contains a "Subject" cell.
    """
    sheet = pd.read_excel(path, header=None)
    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if any(str(cell).strip() == "Subject" for cell in cells)
        ),
        None,
    )
    if header_idx is None:
        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")
    return pd.read_excel(path, header=header_idx)
def find_detail_files(study):
    """Return the sorted Subject Detail workbooks that match the newest summary.

    The date is taken from the first 10 characters ("YYYY-MM-DD") of the
    newest Summary Report's file name; only detail files of *study* carrying
    that date prefix are returned. Files starting with ``~$`` are skipped.
    """
    study_dir = os.path.join(DETAILS_DIR, study)
    summary_name = os.path.basename(find_summary_file(study))
    day = summary_name[:10]  # "YYYY-MM-DD"
    pattern = os.path.join(study_dir, f"{day} {study} * Subject Detail.xlsx")
    matches = sorted(glob.glob(pattern))
    return [m for m in matches if not os.path.basename(m).startswith("~$")]
def parse_detail_visits(path):
    """Extract the visit rows of a Subject Detail workbook as a list of dicts.

    Reads sheet "patient_detail_report", finds the first row that contains a
    "Visit Type" cell and treats every following row whose first cell is
    "Past" or "Upcoming" as one transaction. Values are taken by column
    position (0-8), not by header name. Returns ``[]`` when the header row
    is missing.
    """
    sheet = pd.read_excel(path, sheet_name="patient_detail_report", header=None)
    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if any(str(cell).strip() == "Visit Type" for cell in cells)
        ),
        None,
    )
    if header_idx is None:
        return []

    body = sheet.iloc[header_idx + 1:].copy()
    body.columns = range(body.shape[1])

    # (result key, column position, converter) for every column after visit type
    layout = (
        ("scheduled_date", 1, to_date),
        ("window_days", 2, to_str),
        ("actual_date", 3, to_date),
        ("irt_transaction_no", 4, to_int),
        ("irt_transaction_description", 5, to_str),
        ("medication_assignment", 6, to_str),
        ("quantity_assigned", 7, to_int),
        ("medication_id", 8, to_str),
    )

    visits = []
    for _, record in body.iterrows():
        kind = to_str(record.get(0))
        if kind not in ("Past", "Upcoming"):
            continue
        visit = {"visit_type": kind}
        for key, position, convert in layout:
            visit[key] = convert(record.get(position))
        visits.append(visit)
    return visits
# ── insert helpers ────────────────────────────────────────────────────────────
def insert_import(cursor, study, source_file):
    """Insert one ``iwrs_import`` row and return its generated id.

    Stores the study, the current local time and the base name of
    *source_file*; the return value is ``cursor.lastrowid``.
    """
    statement = "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)"
    file_name = os.path.basename(source_file)
    params = (study, datetime.datetime.now(), file_name)
    cursor.execute(statement, params)
    return cursor.lastrowid
def insert_uco3001_summary(cursor, import_id, df):
    """Insert every row of a UCO3001 summary DataFrame into the database.

    Args:
        cursor: DB-API cursor used to execute the INSERT statements.
        import_id: id of the ``iwrs_import`` row these subjects belong to.
        df: summary DataFrame whose column names are the report headers.

    Raises:
        KeyError: when a required report column is missing from *df*.
            Optional columns that are missing are stored as NULL.
    """
    sql = """
INSERT INTO iwrs_uco3001_subject_summary (
import_id, subject, prior_subject_identifier, site, investigator, location,
cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight,
rescreened_subject, adt_ir, three_or_more_advanced_therapies,
only_oral_5asa_compounds, ustekinumab, isolated_proctitis,
clinical_responder_status_i12_m0, irt_subject_status,
i0_rand_date_local, last_irt_transaction,
last_irt_transaction_date_local, last_irt_transaction_date_utc,
next_irt_transaction, next_irt_transaction_date_local,
most_recent_med_assignment_date, days_since_last_med_assignment,
patient_forecast_status, patient_forecast_status_changed_date
) VALUES (
%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
)
"""
    # (report header, converter, required) in the order of the SQL column
    # list after import_id. Required headers raise KeyError when absent.
    fields = (
        ("Subject", to_str, True),
        ("Prior Subject Identifier", to_str, False),
        ("Site", to_str, True),
        ("Investigator", to_str, True),
        ("Location", to_str, True),
        ("Cohort per IRT", to_str, True),
        ("Informed Consent Date", to_date, True),
        ("Adolescent Assent Date", to_date, False),
        ("Subject's age collection", to_int, True),
        ("Subject's weight collection", to_float, False),
        ("Rescreened Subject", to_str, False),
        ("ADT-IR", to_str, False),
        ("3 or More Advanced Therapies", to_str, False),
        ("Only Oral 5-ASA Compounds", to_str, False),
        ("Ustekinumab", to_str, False),
        ("Isolated Proctitis", to_str, False),
        ("Clinical Responder Status at I-12 / M-0", to_str, False),
        ("IRT Subject Status", to_str, True),
        ("I0_RAND_TIMESTAMP_LOCAL [Local]", to_date, False),
        ("Last Recorded IRT Transaction", to_str, True),
        ("Last Recorded IRT Transaction Date [Local]", to_date, True),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, True),
        ("Next Expected IRT Transaction", to_str, True),
        ("Next Expected IRT Transaction Date [Local]", to_date, True),
        ("Most Recent Medication Assignment Transaction [Local]", to_date, False),
        ("Days Since Last Medication Assignment Transaction", to_int, False),
        ("Patient Forecast Status", to_str, False),
        ("Patient Forecast Status Changed Date (UTC)", to_date, False),
    )
    col = df.columns.tolist()
    for _, r in df.iterrows():
        values = [import_id]
        for header, convert, required in fields:
            if required or header in col:
                values.append(convert(r[header]))
            else:
                values.append(None)
        cursor.execute(sql, tuple(values))
def insert_mdd3003_summary(cursor, import_id, df):
    """Insert every row of an MDD3003 summary DataFrame into the database.

    Required report columns raise ``KeyError`` when missing from *df*;
    optional columns that are missing are stored as NULL.
    """
    sql = """
INSERT INTO iwrs_mdd3003_subject_summary (
import_id, subject, prior_subject_identifier, site, investigator, location,
cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age,
madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17,
stratification_country, age_group, stable_remitters, irt_subject_status,
last_irt_transaction, last_irt_transaction_date_local,
last_irt_transaction_date_utc, next_irt_transaction,
next_irt_transaction_date_local, date_screened, date_screen_failed,
date_randomized_part1, date_early_withdraw_randomized_part1,
date_open_label_induction, date_early_withdraw_open_label_induction,
date_randomized_part2, date_early_withdraw_randomized_part2,
date_completed, date_unblinded
) VALUES (
%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
)
"""
    # (report header, converter, required) in the order of the SQL column
    # list after import_id.
    fields = (
        ("Subject", to_str, True),
        ("Prior Subject Identifier", to_str, False),
        ("Site", to_str, True),
        ("Investigator", to_str, True),
        ("Location", to_str, True),
        ("Cohort per IRT", to_str, True),
        ("MADRS response criteria integrated or manually entered", to_str, False),
        ("Informed Consent Date", to_date, True),
        ("Subject's age collection", to_int, True),
        ("MADRS response criteria v1.5 from RAVE", to_str, False),
        ("MADRS response criteria v1.6 from RAVE", to_str, False),
        ("MADRS response criteria v1.7 from RAVE", to_str, False),
        ("Stratification Country", to_str, False),
        ("Age Group", to_str, False),
        ("Stable Remitters vs. Non Stable Remitters", to_str, False),
        ("IRT Subject Status", to_str, True),
        ("Last Recorded IRT Transaction", to_str, True),
        ("Last Recorded IRT Transaction Date [Local]", to_date, True),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, True),
        ("Next Expected IRT Transaction", to_str, True),
        ("Next Expected IRT Transaction Date [Local]", to_date, True),
        ("Date Screened [Local]", to_date, False),
        ("Date Screen Failed [Local]", to_date, False),
        ("Date Randomized Part 1 [Local]", to_date, False),
        ("Date Early Withdraw Randomized Part 1 [Local]", to_date, False),
        ("Date Open Label Induction [Local]", to_date, False),
        ("Date Early Withdraw Open Label Induction [Local]", to_date, False),
        ("Date Randomized Part 2 [Local]", to_date, False),
        ("Date Early Withdraw Randomized Part 2 [Local]", to_date, False),
        ("Date Completed [Local]", to_date, False),
        ("Date Unblinded [Local]", to_date, False),
    )
    headers = df.columns.tolist()
    for _, record in df.iterrows():
        values = [import_id]
        for header, convert, required in fields:
            if required or header in headers:
                values.append(convert(record[header]))
            else:
                values.append(None)
        cursor.execute(sql, tuple(values))
def insert_visits(cursor, import_id, study, subject, visits):
    """Insert the visit dicts of one subject into ``iwrs_subject_visits``.

    Each dict must carry the nine visit keys; one INSERT is executed per
    visit. Nothing is executed when *visits* is empty.
    """
    if not visits:
        return
    sql = """
INSERT INTO iwrs_subject_visits (
import_id, study, subject, visit_type, scheduled_date, window_days,
actual_date, irt_transaction_no, irt_transaction_description,
medication_assignment, quantity_assigned, medication_id
) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""
    # Dict keys in the order of the SQL column list.
    keys = (
        "visit_type",
        "scheduled_date",
        "window_days",
        "actual_date",
        "irt_transaction_no",
        "irt_transaction_description",
        "medication_assignment",
        "quantity_assigned",
        "medication_id",
    )
    prefix = (import_id, study, subject)
    for visit in visits:
        cursor.execute(sql, prefix + tuple(visit[key] for key in keys))
# ── main ──────────────────────────────────────────────────────────────────────
def import_study(conn, study):
    """Import the newest summary and matching detail reports of one study.

    Creates a new ``iwrs_import`` row, inserts the summary rows and all visit
    rows, then commits. On any failure the transaction is rolled back before
    the exception is re-raised, so a later commit on the same connection
    cannot persist a half-finished import.

    Args:
        conn: open MySQL connection; it is not closed here.
        study: study identifier, one of ``STUDIES``.

    Returns:
        The id of the created ``iwrs_import`` row.
    """
    summary_path = find_summary_file(study)
    print(f" Summary: {os.path.basename(summary_path)}")
    df_summary = read_summary_df(summary_path)
    df_summary = df_summary.dropna(how="all")
    detail_files = find_detail_files(study)
    print(f" Detail souborů: {len(detail_files)}")
    cursor = conn.cursor()
    try:
        import_id = insert_import(cursor, study, summary_path)
        print(f" import_id = {import_id}")
        if study == "77242113UCO3001":
            insert_uco3001_summary(cursor, import_id, df_summary)
        else:
            insert_mdd3003_summary(cursor, import_id, df_summary)
        print(f" Summary řádků: {len(df_summary)}")
        visited = 0
        for path in detail_files:
            fname = os.path.basename(path)
            # file name: "2026-05-04 77242113UCO3001 CZ100012001 Subject Detail.xlsx"
            m = re.search(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx", fname)
            subject = m.group(1) if m else "UNKNOWN"
            visits = parse_detail_visits(path)
            insert_visits(cursor, import_id, study, subject, visits)
            visited += len(visits)
        conn.commit()
    except Exception:
        # Discard the partial import; otherwise the rows would stay pending
        # in the caller's connection and ride along with its next commit.
        conn.rollback()
        raise
    finally:
        cursor.close()
    print(f" Transakce uloženo: {visited}")
    return import_id
def main():
    """Import every study in ``STUDIES`` over one shared MySQL connection.

    A failure in one study is reported and does not stop the remaining
    studies. Its pending rows are rolled back so the next study's commit
    cannot persist them, and the connection is always closed.
    """
    conn = get_conn()
    print("Připojeno k MySQL.\n")
    try:
        for study in STUDIES:
            print(f"[{study}]")
            try:
                import_id = import_study(conn, study)
                print(f" OK — import_id {import_id}\n")
            except Exception as e:
                print(f" CHYBA: {e}\n")
                try:
                    conn.rollback()
                except Exception as rollback_error:
                    # Keep the run best-effort even if the connection is gone.
                    print(f" CHYBA: rollback selhal: {rollback_error}\n")
    finally:
        conn.close()
    print("Hotovo.")


# Run only when executed as a script, so importing the module has no side effects.
if __name__ == "__main__":
    main()
+422
View File
@@ -0,0 +1,422 @@
"""
Kompletní pipeline:
1. Stažení Subject Summary Reportů (obě studie)
2. Stažení Subject Detail Reportů (obě studie)
3. Import do MySQL
Spusť tento skript místo tří samostatných skriptů.
"""
import os
import datetime
import glob
import re
from playwright.sync_api import sync_playwright
import pandas as pd
import db_config
import mysql.connector
# ── CONFIG ───────────────────────────────────────────────────────────────────
# Base address of the IWRS portal; report pages are addressed relative to it.
BASE_URL = "https://janssen.4gclinical.com"
# Portal credentials. IWRS_EMAIL / IWRS_PASSWORD take precedence when set so
# that secrets do not have to live in source control; the literals are kept
# only as backward-compatible fallbacks.
# NOTE(review): the fallback password is committed in plain text - rotate it.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
# Study identifiers processed by main(), in this order.
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
# Download folders, resolved relative to this script's own location.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
# ── helpers ───────────────────────────────────────────────────────────────────
def unique_path(directory, stem):
    """Return a path for ``<stem>.xlsx`` in *directory* that does not exist yet.

    The plain name is used when free. Otherwise the current time (``HHMM``)
    is appended, and if that name is taken too (e.g. two runs within the same
    minute) a numeric suffix ``-2``, ``-3``, ... is added, so an earlier
    download is never overwritten.
    """
    path = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(path):
        return path
    time_tag = datetime.datetime.now().strftime("%H%M")
    candidate = os.path.join(directory, f"{stem} {time_tag}.xlsx")
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem} {time_tag}-{counter}.xlsx")
        counter += 1
    return candidate
def login(page, study):
    """Sign in to the portal at ``BASE_URL`` and select *study*.

    Fills the e-mail and password fields, submits the form, then picks the
    study in the study selector and confirms with the SELECT button, waiting
    for network idle after each navigation step.
    """
    page.goto(BASE_URL)
    page.wait_for_load_state("networkidle")

    for label, value in (("Email *", EMAIL), ("Password *", PASSWORD)):
        page.get_by_label(label).fill(value)
    submit_button = page.locator("#login__submit")
    submit_button.click()
    page.wait_for_load_state("networkidle")

    study_picker = page.get_by_label("Study *")
    study_picker.click()
    study_option = page.get_by_role("option", name=study)
    study_option.click()
    confirm_button = page.get_by_role("button", name="SELECT")
    confirm_button.click()
    page.wait_for_load_state("networkidle")
# ── KROK 1: Subject Summary ───────────────────────────────────────────────────
def download_summary(page, study, today):
    """Download the Subject Summary Report of *study* and return the saved path.

    The file is stored in ``INCOMING_DIR`` as
    ``<today> <study> Subject Summary Report.xlsx``; ``unique_path`` picks an
    alternative name when that file already exists.
    """
    print(f" [{study}] Stahuji Subject Summary Report...")
    report_url = f"{BASE_URL}/report/patient_summary_report"
    page.goto(report_url)
    page.wait_for_load_state("networkidle", timeout=120000)

    stem = f"{today} {study} Subject Summary Report"
    target = unique_path(INCOMING_DIR, stem)
    with page.expect_download(timeout=120000) as pending:
        page.get_by_role("button", name="Download XLS").click()
    download = pending.value
    download.save_as(target)

    print(f" [{study}] Summary OK -> {os.path.basename(target)}")
    return target
# ── KROK 2: Subject Details ───────────────────────────────────────────────────
def get_subjects_from_summary(summary_path):
    """Return the subject identifiers listed in a Summary workbook.

    The header row is the first row with a cell equal to "Subject"; the
    non-empty values of that column are returned as stripped strings.

    Raises:
        ValueError: when no row contains a "Subject" cell.
    """
    sheet = pd.read_excel(summary_path, header=None)
    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if any(str(cell).strip() == "Subject" for cell in cells)
        ),
        None,
    )
    if header_idx is None:
        raise ValueError("Hlavičkový řádek nenalezen")
    table = pd.read_excel(summary_path, header=header_idx)
    subjects = table["Subject"].dropna()
    return subjects.astype(str).str.strip().tolist()
def download_details(page, study, summary_path, today):
    """Download one Subject Detail workbook per subject of the summary report.

    Args:
        page: Playwright page that is already logged in to the study.
        study: study identifier; also the name of the output subfolder.
        summary_path: Summary workbook from which the subject list is read.
        today: date prefix ("YYYY-MM-DD") used in the saved file names.

    Files are written to ``DETAILS_DIR/<study>/`` as
    ``<today> <study> <subject> Subject Detail.xlsx``; an existing file with
    the same name is written to again (no unique-name handling here).
    Any exception aborts the remaining subjects and propagates to the caller.
    """
    out_dir = os.path.join(DETAILS_DIR, study)
    os.makedirs(out_dir, exist_ok=True)
    subjects = get_subjects_from_summary(summary_path)
    print(f" [{study}] Subjektů k stažení: {len(subjects)}")
    page.goto(f"{BASE_URL}/report/patient_detail_report")
    page.wait_for_load_state("networkidle", timeout=120000)
    for subject in subjects:
        filename = os.path.join(out_dir, f"{today} {study} {subject} Subject Detail.xlsx")
        # First element matching either selector is taken as the subject search box.
        input_field = page.locator('input[placeholder="search"], input[type="text"]').first
        input_field.click()
        input_field.fill(subject)
        # Fixed 500 ms pause before picking an option.
        # NOTE(review): presumably gives the autocomplete list time to render - confirm.
        page.wait_for_timeout(500)
        # Picks the first option without checking that it equals `subject`.
        page.locator("mat-option").first.dispatch_event("click")
        page.wait_for_load_state("networkidle", timeout=120000)
        with page.expect_download(timeout=120000) as dl:
            page.get_by_role("button", name="Download XLS").click()
        dl.value.save_as(filename)
        print(f" [{study}] Detail {subject} OK")
        # Reset the form before the next subject.
        page.get_by_role("button", name="Clear").click()
        page.wait_for_load_state("networkidle", timeout=120000)
# ── KROK 3: Import do MySQL ───────────────────────────────────────────────────
def get_conn():
    """Open and return a new MySQL connection built from the ``db_config`` settings."""
    settings = {
        "host": db_config.DB_HOST,
        "port": db_config.DB_PORT,
        "user": db_config.DB_USER,
        "password": db_config.DB_PASSWORD,
        "database": db_config.DB_NAME,
    }
    return mysql.connector.connect(**settings)
def to_date(val):
    """Convert a spreadsheet cell value to a ``datetime.date`` or ``None``.

    Accepts pandas Timestamps, ``datetime``/``date`` objects and strings in
    one of the formats ``YYYY-MM-DD``, ``DD-Mon-YYYY``, ``DD-MM-YYYY`` or
    ``YYYY-MM-DD HH:MM:SS``.

    Returns:
        The calendar date, or ``None`` for missing values (``None``, NaN,
        ``pd.NaT``, empty / "nan" / "nat" / "none" strings) and for text
        that matches none of the supported formats.
    """
    # pd.NaT is a datetime subclass but not a Timestamp, so without this guard
    # it would reach the datetime branch below instead of yielding None.
    if val is None or val is pd.NaT:
        return None
    if isinstance(val, float) and pd.isna(val):
        return None
    # pd.Timestamp is a datetime subclass, so this branch covers it as well.
    if isinstance(val, datetime.datetime):
        return val.date()
    if isinstance(val, datetime.date):
        return val
    s = str(val).strip()
    if not s or s.lower() in ("nat", "nan", "none"):
        return None
    for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.datetime.strptime(s, fmt).date()
        except ValueError:
            continue
    return None
def to_int(val):
    """Convert a cell value to ``int``, or ``None`` when it is not a usable number.

    The value goes through ``float`` first, so numeric strings and floats such
    as ``"12.0"`` or ``3.9`` are accepted and truncated towards zero.
    Missing values (``None``, NaN), non-numeric text and infinities give ``None``.
    """
    try:
        v = float(val)
        return None if pd.isna(v) else int(v)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int() of +/-infinity, which is not a storable integer.
        return None
def to_float(val):
    """Convert a cell value to ``float``; ``None`` for NaN or non-numeric input."""
    try:
        number = float(val)
    except (TypeError, ValueError):
        return None
    if pd.isna(number):
        return None
    return number
def to_str(val):
    """Return the stripped text of a cell value, or ``None`` when it is blank.

    ``None``, float NaN and the texts "nan", "nat", "none" (any letter case)
    as well as the empty string all count as blank.
    """
    if val is None:
        return None
    if isinstance(val, float) and pd.isna(val):
        return None
    text = str(val).strip()
    if text.lower() in ("nan", "nat", "none", ""):
        return None
    return text
def read_summary_df(path):
    """Load a Summary workbook as a DataFrame, dropping completely empty rows.

    The sheet is first read without a header to find the first row that has a
    cell equal to "Subject" (after stripping); the workbook is then re-read
    with that row as the header.

    Raises:
        ValueError: when no row contains a "Subject" cell.
    """
    sheet = pd.read_excel(path, header=None)
    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if any(str(cell).strip() == "Subject" for cell in cells)
        ),
        None,
    )
    if header_idx is None:
        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")
    table = pd.read_excel(path, header=header_idx)
    return table.dropna(how="all")
def parse_detail_visits(path):
    """Extract the visit rows of a Subject Detail workbook as a list of dicts.

    Reads sheet "patient_detail_report", finds the first row that contains a
    "Visit Type" cell and treats every following row whose first cell is
    "Past" or "Upcoming" as one transaction. Values are taken by column
    position (0-8), not by header name. Returns ``[]`` when the header row
    is missing.
    """
    sheet = pd.read_excel(path, sheet_name="patient_detail_report", header=None)
    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if any(str(cell).strip() == "Visit Type" for cell in cells)
        ),
        None,
    )
    if header_idx is None:
        return []

    body = sheet.iloc[header_idx + 1:].copy()
    body.columns = range(body.shape[1])

    # (result key, column position, converter) for every column after visit type
    layout = (
        ("scheduled_date", 1, to_date),
        ("window_days", 2, to_str),
        ("actual_date", 3, to_date),
        ("irt_transaction_no", 4, to_int),
        ("irt_transaction_description", 5, to_str),
        ("medication_assignment", 6, to_str),
        ("quantity_assigned", 7, to_int),
        ("medication_id", 8, to_str),
    )

    visits = []
    for _, record in body.iterrows():
        kind = to_str(record.get(0))
        if kind not in ("Past", "Upcoming"):
            continue
        visit = {"visit_type": kind}
        for key, position, convert in layout:
            visit[key] = convert(record.get(position))
        visits.append(visit)
    return visits
def insert_import(cursor, study, source_file):
    """Insert one ``iwrs_import`` row and return its generated id.

    Stores the study, the current local time and the base name of
    *source_file*; the return value is ``cursor.lastrowid``.
    """
    statement = "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)"
    file_name = os.path.basename(source_file)
    params = (study, datetime.datetime.now(), file_name)
    cursor.execute(statement, params)
    return cursor.lastrowid
def insert_uco3001_summary(cursor, import_id, df):
    """Insert every row of a UCO3001 summary DataFrame into the database.

    Required report columns raise ``KeyError`` when missing from *df*;
    optional columns that are missing are stored as NULL.
    """
    sql = """INSERT INTO iwrs_uco3001_subject_summary (
import_id, subject, prior_subject_identifier, site, investigator, location,
cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight,
rescreened_subject, adt_ir, three_or_more_advanced_therapies,
only_oral_5asa_compounds, ustekinumab, isolated_proctitis,
clinical_responder_status_i12_m0, irt_subject_status,
i0_rand_date_local, last_irt_transaction,
last_irt_transaction_date_local, last_irt_transaction_date_utc,
next_irt_transaction, next_irt_transaction_date_local,
most_recent_med_assignment_date, days_since_last_med_assignment,
patient_forecast_status, patient_forecast_status_changed_date
) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # (report header, converter, required) in the order of the SQL column
    # list after import_id.
    fields = (
        ("Subject", to_str, True),
        ("Prior Subject Identifier", to_str, False),
        ("Site", to_str, True),
        ("Investigator", to_str, True),
        ("Location", to_str, True),
        ("Cohort per IRT", to_str, True),
        ("Informed Consent Date", to_date, True),
        ("Adolescent Assent Date", to_date, False),
        ("Subject's age collection", to_int, True),
        ("Subject's weight collection", to_float, False),
        ("Rescreened Subject", to_str, False),
        ("ADT-IR", to_str, False),
        ("3 or More Advanced Therapies", to_str, False),
        ("Only Oral 5-ASA Compounds", to_str, False),
        ("Ustekinumab", to_str, False),
        ("Isolated Proctitis", to_str, False),
        ("Clinical Responder Status at I-12 / M-0", to_str, False),
        ("IRT Subject Status", to_str, True),
        ("I0_RAND_TIMESTAMP_LOCAL [Local]", to_date, False),
        ("Last Recorded IRT Transaction", to_str, True),
        ("Last Recorded IRT Transaction Date [Local]", to_date, True),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, True),
        ("Next Expected IRT Transaction", to_str, True),
        ("Next Expected IRT Transaction Date [Local]", to_date, True),
        ("Most Recent Medication Assignment Transaction [Local]", to_date, False),
        ("Days Since Last Medication Assignment Transaction", to_int, False),
        ("Patient Forecast Status", to_str, False),
        ("Patient Forecast Status Changed Date (UTC)", to_date, False),
    )
    headers = df.columns.tolist()
    for _, record in df.iterrows():
        values = [import_id]
        for header, convert, required in fields:
            if required or header in headers:
                values.append(convert(record[header]))
            else:
                values.append(None)
        cursor.execute(sql, tuple(values))
def insert_mdd3003_summary(cursor, import_id, df):
    """Insert every row of an MDD3003 summary DataFrame into the database.

    Required report columns raise ``KeyError`` when missing from *df*;
    optional columns that are missing are stored as NULL.
    """
    sql = """INSERT INTO iwrs_mdd3003_subject_summary (
import_id, subject, prior_subject_identifier, site, investigator, location,
cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age,
madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17,
stratification_country, age_group, stable_remitters, irt_subject_status,
last_irt_transaction, last_irt_transaction_date_local,
last_irt_transaction_date_utc, next_irt_transaction,
next_irt_transaction_date_local, date_screened, date_screen_failed,
date_randomized_part1, date_early_withdraw_randomized_part1,
date_open_label_induction, date_early_withdraw_open_label_induction,
date_randomized_part2, date_early_withdraw_randomized_part2,
date_completed, date_unblinded
) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # (report header, converter, required) in the order of the SQL column
    # list after import_id.
    fields = (
        ("Subject", to_str, True),
        ("Prior Subject Identifier", to_str, False),
        ("Site", to_str, True),
        ("Investigator", to_str, True),
        ("Location", to_str, True),
        ("Cohort per IRT", to_str, True),
        ("MADRS response criteria integrated or manually entered", to_str, False),
        ("Informed Consent Date", to_date, True),
        ("Subject's age collection", to_int, True),
        ("MADRS response criteria v1.5 from RAVE", to_str, False),
        ("MADRS response criteria v1.6 from RAVE", to_str, False),
        ("MADRS response criteria v1.7 from RAVE", to_str, False),
        ("Stratification Country", to_str, False),
        ("Age Group", to_str, False),
        ("Stable Remitters vs. Non Stable Remitters", to_str, False),
        ("IRT Subject Status", to_str, True),
        ("Last Recorded IRT Transaction", to_str, True),
        ("Last Recorded IRT Transaction Date [Local]", to_date, True),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, True),
        ("Next Expected IRT Transaction", to_str, True),
        ("Next Expected IRT Transaction Date [Local]", to_date, True),
        ("Date Screened [Local]", to_date, False),
        ("Date Screen Failed [Local]", to_date, False),
        ("Date Randomized Part 1 [Local]", to_date, False),
        ("Date Early Withdraw Randomized Part 1 [Local]", to_date, False),
        ("Date Open Label Induction [Local]", to_date, False),
        ("Date Early Withdraw Open Label Induction [Local]", to_date, False),
        ("Date Randomized Part 2 [Local]", to_date, False),
        ("Date Early Withdraw Randomized Part 2 [Local]", to_date, False),
        ("Date Completed [Local]", to_date, False),
        ("Date Unblinded [Local]", to_date, False),
    )
    headers = df.columns.tolist()
    for _, record in df.iterrows():
        values = [import_id]
        for header, convert, required in fields:
            if required or header in headers:
                values.append(convert(record[header]))
            else:
                values.append(None)
        cursor.execute(sql, tuple(values))
def insert_visits(cursor, import_id, study, subject, visits):
    """Insert the visit dicts of one subject into ``iwrs_subject_visits``.

    Each dict must carry the nine visit keys; one INSERT is executed per
    visit. Nothing is executed when *visits* is empty.
    """
    if not visits:
        return
    sql = """INSERT INTO iwrs_subject_visits (
import_id, study, subject, visit_type, scheduled_date, window_days,
actual_date, irt_transaction_no, irt_transaction_description,
medication_assignment, quantity_assigned, medication_id
) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # Dict keys in the order of the SQL column list.
    keys = (
        "visit_type",
        "scheduled_date",
        "window_days",
        "actual_date",
        "irt_transaction_no",
        "irt_transaction_description",
        "medication_assignment",
        "quantity_assigned",
        "medication_id",
    )
    prefix = (import_id, study, subject)
    for visit in visits:
        cursor.execute(sql, prefix + tuple(visit[key] for key in keys))
def import_to_mysql(summary_path, detail_files, study):
    """Import one study's summary workbook and detail workbooks into MySQL.

    Opens its own connection, writes the ``iwrs_import`` row, the summary rows
    and the visit rows in one transaction and commits. On failure the
    transaction is rolled back and the exception re-raised; cursor and
    connection are closed in every case.

    Args:
        summary_path: path of the Subject Summary workbook.
        detail_files: paths of the Subject Detail workbooks; the subject id is
            parsed from each file name ("UNKNOWN" when it does not match).
        study: study identifier; selects the summary table to fill.

    Returns:
        The id of the created ``iwrs_import`` row.
    """
    print(f"\n [MySQL] Importuji {study}...")
    df_summary = read_summary_df(summary_path)
    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            import_id = insert_import(cursor, study, summary_path)
            if study == "77242113UCO3001":
                insert_uco3001_summary(cursor, import_id, df_summary)
            else:
                insert_mdd3003_summary(cursor, import_id, df_summary)
            total_visits = 0
            for path in detail_files:
                fname = os.path.basename(path)
                m = re.search(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx", fname)
                subject = m.group(1) if m else "UNKNOWN"
                visits = parse_detail_visits(path)
                insert_visits(cursor, import_id, study, subject, visits)
                total_visits += len(visits)
            conn.commit()
        except Exception:
            # Make the discard of the partial import explicit.
            conn.rollback()
            raise
        finally:
            cursor.close()
    finally:
        conn.close()
    print(f" [MySQL] import_id={import_id} | pacientů={len(df_summary)} | transakcí={total_visits}")
    return import_id
# ── MAIN ─────────────────────────────────────────────────────────────────────
def main():
    """Run the whole pipeline: download both report types, then import them.

    Steps 1 and 2 run per study in a fresh browser (separate session per
    study); a download failure marks the study as skipped for step 3.
    Step 3 imports every study whose download succeeded, using today's
    detail files. Errors are printed and do not stop the other studies.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    os.makedirs(INCOMING_DIR, exist_ok=True)
    os.makedirs(DETAILS_DIR, exist_ok=True)
    summary_paths = {}

    # Steps 1 + 2: downloads (Playwright, one browser per study because of the session)
    with sync_playwright() as p:
        for study in STUDIES:
            print(f"\n{'='*60}")
            print(f"[{study}] KROK 1: Subject Summary Report")
            print(f"{'='*60}")
            browser = p.chromium.launch(headless=False)
            context = browser.new_context(accept_downloads=True)
            page = context.new_page()
            try:
                login(page, study)
                summary_path = download_summary(page, study, today)
                summary_paths[study] = summary_path
                print(f"\n[{study}] KROK 2: Subject Detail Reports")
                download_details(page, study, summary_path, today)
            except Exception as e:
                print(f" [{study}] CHYBA při stahování: {e}")
                # Also discards a summary that was saved before the details
                # failed, so an incomplete download is never imported.
                summary_paths[study] = None
            finally:
                browser.close()

    # Step 3: import into MySQL
    print(f"\n{'='*60}")
    print("KROK 3: Import do MySQL")
    print(f"{'='*60}")
    for study in STUDIES:
        summary_path = summary_paths.get(study)
        if not summary_path:
            print(f" [{study}] PŘESKOČENO — stahování selhalo")
            continue
        pattern = os.path.join(DETAILS_DIR, study, f"{today} {study} * Subject Detail.xlsx")
        # Skip "~$" lock files that Excel creates next to an open workbook;
        # they are not readable workbooks and would abort the whole import.
        detail_files = sorted(
            f for f in glob.glob(pattern)
            if not os.path.basename(f).startswith("~$")
        )
        try:
            import_to_mysql(summary_path, detail_files, study)
        except Exception as e:
            print(f" [{study}] CHYBA při importu: {e}")
    print(f"\n{'='*60}")
    print("Vše hotovo.")
    print(f"{'='*60}")


# Run only when executed as a script, so importing the module has no side effects.
if __name__ == "__main__":
    main()