Add Outlook/Soubory/Clario/Feasibility scripts and reports; ignore Incoming, Outlook downloads & profile

This commit is contained in:
2026-06-03 16:15:19 +02:00
parent 61c6aeea23
commit 6c57ab3ae6
36 changed files with 4949 additions and 0 deletions
+39
View File
@@ -0,0 +1,39 @@
"""
Jednorázový skript — vytvoří/aktualizuje tabulky v MySQL.
Spusť jednou: python create_iwrs_tables.py
"""
import os
import mysql.connector
import db_config
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SQL_FILE = os.path.join(BASE_DIR, "create_iwrs_tables.sql")

# Read the DDL script before connecting; the context manager guarantees the
# file handle is closed.
with open(SQL_FILE, encoding="utf-8") as sql_file:
    sql = sql_file.read()

# Drop full-line "--" comments BEFORE splitting on ";" so that a semicolon
# inside a comment can never cut a statement in half.
sql_without_comments = "\n".join(
    line for line in sql.splitlines() if not line.strip().startswith("--")
)
stmts = [s.strip() for s in sql_without_comments.split(";")]

conn = mysql.connector.connect(
    host=db_config.DB_HOST,
    port=db_config.DB_PORT,
    user=db_config.DB_USER,
    password=db_config.DB_PASSWORD,
    database=db_config.DB_NAME,
)
try:
    cursor = conn.cursor()
    try:
        for stmt in stmts:
            # USE statements are skipped: the database is already selected
            # through the `database=` connection argument.
            if not stmt or stmt.upper().startswith("USE"):
                continue
            try:
                cursor.execute(stmt)
                print(f"OK: {stmt[:80]}")
            except mysql.connector.Error as e:
                # Best effort: report the failing statement and keep going.
                print(f"SKIP: {e}")
        conn.commit()
    finally:
        cursor.close()
finally:
    # Close the connection even when a statement or the commit fails.
    conn.close()
print("\nHotovo.")
+128
View File
@@ -0,0 +1,128 @@
-- IWRS tables for the studie database
-- Run once: mysql -h 192.168.1.76 -u root -p studie < create_iwrs_tables.sql
-- Selects the target schema for every CREATE TABLE statement below.
USE studie;
-- ── Import log ───────────────────────────────────────────────────────────────
-- One row per import of one study. The subject summary and subject visits
-- tables reference import_id through a foreign key.
-- imported_at defaults to the insertion time when no value is supplied.
CREATE TABLE IF NOT EXISTS iwrs_import (
import_id INT AUTO_INCREMENT PRIMARY KEY,
study VARCHAR(20) NOT NULL,
imported_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
source_file VARCHAR(500) NOT NULL,
INDEX idx_study (study)
);
-- ── UCO3001 subject summary ───────────────────────────────────────────────────
-- One row per subject per import for study UCO3001.
-- Only import_id and subject are NOT NULL, every other column is nullable.
-- Rows are tied to an import through the foreign key on import_id.
-- Indexed by import_id and by subject.
CREATE TABLE IF NOT EXISTS iwrs_uco3001_subject_summary (
id INT AUTO_INCREMENT PRIMARY KEY,
import_id INT NOT NULL,
subject VARCHAR(20) NOT NULL,
prior_subject_identifier VARCHAR(20),
site VARCHAR(50),
investigator VARCHAR(100),
location VARCHAR(50),
cohort_per_irt VARCHAR(100),
informed_consent_date DATE,
adolescent_assent_date DATE,
age SMALLINT,
weight DECIMAL(5,1),
rescreened_subject VARCHAR(10),
adt_ir VARCHAR(10),
three_or_more_advanced_therapies VARCHAR(10),
only_oral_5asa_compounds VARCHAR(10),
ustekinumab VARCHAR(10),
isolated_proctitis VARCHAR(10),
clinical_responder_status_i12_m0 VARCHAR(100),
irt_subject_status VARCHAR(50),
i0_rand_date_local DATE,
last_irt_transaction VARCHAR(100),
last_irt_transaction_date_local DATE,
last_irt_transaction_date_utc DATE,
next_irt_transaction VARCHAR(100),
next_irt_transaction_date_local DATE,
most_recent_med_assignment_date DATE,
days_since_last_med_assignment SMALLINT,
patient_forecast_status VARCHAR(50),
patient_forecast_status_changed_date DATE,
FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
INDEX idx_import (import_id),
INDEX idx_subject (subject)
);
-- ── MDD3003 subject summary ───────────────────────────────────────────────────
-- One row per subject per import for study MDD3003.
-- Only import_id and subject are NOT NULL, every other column is nullable.
-- Rows are tied to an import through the foreign key on import_id.
-- Indexed by import_id and by subject.
CREATE TABLE IF NOT EXISTS iwrs_mdd3003_subject_summary (
id INT AUTO_INCREMENT PRIMARY KEY,
import_id INT NOT NULL,
subject VARCHAR(20) NOT NULL,
prior_subject_identifier VARCHAR(20),
site VARCHAR(50),
investigator VARCHAR(100),
location VARCHAR(50),
cohort_per_irt VARCHAR(50),
madrs_criteria_integrated VARCHAR(50),
informed_consent_date DATE,
age SMALLINT,
madrs_criteria_v15 VARCHAR(10),
madrs_criteria_v16 VARCHAR(10),
madrs_criteria_v17 VARCHAR(10),
stratification_country VARCHAR(10),
age_group VARCHAR(20),
stable_remitters VARCHAR(50),
irt_subject_status VARCHAR(100),
last_irt_transaction VARCHAR(100),
last_irt_transaction_date_local DATE,
last_irt_transaction_date_utc DATE,
next_irt_transaction VARCHAR(100),
next_irt_transaction_date_local DATE,
date_screened DATE,
date_screen_failed DATE,
date_randomized_part1 DATE,
date_early_withdraw_randomized_part1 DATE,
date_open_label_induction DATE,
date_early_withdraw_open_label_induction DATE,
date_randomized_part2 DATE,
date_early_withdraw_randomized_part2 DATE,
date_completed DATE,
date_unblinded DATE,
FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
INDEX idx_import (import_id),
INDEX idx_subject (subject)
);
-- ── Notifications ────────────────────────────────────────────────────────────
-- One row per notification. The PDF is stored inline in the pdf column.
-- This table has no import_id column and no foreign key to iwrs_import.
-- uq_pk makes pk unique across the whole table, which is what lets an
-- INSERT ... ON DUPLICATE KEY UPDATE upsert a notification by pk.
-- NOTE(review): uniqueness is not scoped by study. Confirm that pk values
-- cannot collide between studies, otherwise the key should be (study, pk).
CREATE TABLE IF NOT EXISTS iwrs_notifications (
id INT AUTO_INCREMENT PRIMARY KEY,
study VARCHAR(20) NOT NULL,
subject VARCHAR(20) NOT NULL,
pk INT NOT NULL,
title VARCHAR(100),
label VARCHAR(500),
event VARCHAR(50),
actual_date DATE,
text TEXT,
pdf MEDIUMBLOB,
source_file VARCHAR(500),
imported_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE KEY uq_pk (pk),
INDEX idx_study_subject (study, subject)
);
-- ── Subject visits / transactions (both studies) ─────────────────────────────
-- One row per visit or IRT transaction of a subject, shared by both studies
-- and distinguished by the study column.
-- visit_type accepts only the values Past and Upcoming.
-- Rows are tied to an import through the foreign key on import_id.
CREATE TABLE IF NOT EXISTS iwrs_subject_visits (
id INT AUTO_INCREMENT PRIMARY KEY,
import_id INT NOT NULL,
study VARCHAR(20) NOT NULL,
subject VARCHAR(20) NOT NULL,
visit_type ENUM('Past','Upcoming') NOT NULL,
scheduled_date DATE,
window_days VARCHAR(20),
actual_date DATE,
irt_transaction_no SMALLINT,
irt_transaction_description VARCHAR(200),
medication_assignment VARCHAR(200),
quantity_assigned SMALLINT,
medication_id VARCHAR(20),
FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
INDEX idx_import (import_id),
INDEX idx_study_subject (study, subject)
);
@@ -0,0 +1,201 @@
from playwright.sync_api import sync_playwright
import os
import glob
import datetime
import requests
import pandas as pd
# ── CONFIG ──────────────────────────────────────────────────────────────────
BASE_URL = "https://janssen.4gclinical.com"
# SECURITY NOTE(review): the login credentials are committed to the
# repository. They can now be overridden through the IWRS_EMAIL and
# IWRS_PASSWORD environment variables; the literals remain only as a
# backward-compatible fallback and should be rotated and removed.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
# Study identifiers as they appear in the portal's study selector and in
# the downloaded report file names.
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
# All data folders live next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
# ────────────────────────────────────────────────────────────────────────────
def get_subjects(study):
    """Return the subject IDs listed in today's Subject Summary Report.

    The newest (by modification time) summary workbook for ``study`` in
    INCOMING_DIR is used; Excel lock files ("~$...") are ignored.

    Raises:
        FileNotFoundError: no summary report exists for the study, or the
            newest one does not carry today's date prefix.
        ValueError: the workbook contains no row with a "Subject" cell.
    """
    # The trailing "*" also matches the time-tagged names
    # ("... Subject Summary Report HHMM.xlsx") written when the report is
    # downloaded a second time on the same day.
    pattern = os.path.join(INCOMING_DIR, f"* {study} Subject Summary Report*.xlsx")
    candidates = [
        f for f in glob.glob(pattern)
        if not os.path.basename(f).startswith("~$")
    ]
    if not candidates:
        raise FileNotFoundError(f"Nenalezen Subject Summary Report pro {study}")
    path = max(candidates, key=os.path.getmtime)
    today = datetime.date.today().strftime("%Y-%m-%d")
    if not os.path.basename(path).startswith(today):
        raise FileNotFoundError(
            f"Dnešní Subject Summary Report pro {study} neexistuje — spusť nejdříve download_subject_summary.py"
        )
    print(f" Čtu subjekty z: {os.path.basename(path)}")
    # First pass without a header: locate the row that holds the column
    # titles (the report may start with preamble rows).
    raw = pd.read_excel(path, header=None)
    header_row = next(
        (
            i for i, row in raw.iterrows()
            if any(str(v).strip() == "Subject" for v in row)
        ),
        None,
    )
    if header_row is None:
        raise ValueError("Hlavičkový řádek nenalezen")
    # Second pass with the detected header row so columns are named.
    df = pd.read_excel(path, header=header_row)
    return df["Subject"].dropna().astype(str).str.strip().tolist()
def get_jwt_and_api_base(page, study):
    """Return ``(jwt, api_base_url)`` for the given study.

    The JWT is read from the page's localStorage; the list of app instances
    is fetched inside the browser with that token, and the first instance
    whose label contains ``study`` supplies the API base URL.

    Raises:
        ValueError: no JWT is stored, or no instance label contains ``study``.
    """
    jwt = page.evaluate("localStorage.getItem('JWT.access')")
    if not jwt:
        raise ValueError("JWT token nenalezen v localStorage")
    instances = page.evaluate("""async (jwt) => {
const res = await fetch('/_/api/dispatch/app_instances/', {
headers: { 'Authorization': `Bearer ${jwt}` }
});
return res.json();
}""", jwt)
    for instance in instances:
        # "label" may be missing or null; treat both as an empty label
        # instead of failing on `study in None`.
        if study in (instance.get("label") or ""):
            return jwt, instance["api_base_url"]
    raise ValueError(f"app_instance pro studii {study} nenalezena")
def get_notifications(jwt, api_base, study, subject):
    """Load the notification list of one subject via the report_data API.

    Returns a list of dicts with the keys ``pk``, ``title`` and ``event``.
    Entries lacking a pk or a title are dropped.

    Raises:
        requests.HTTPError: the API answered with an error status.
        requests.Timeout: the API did not answer within the timeout.
    """
    url = f"{BASE_URL}{api_base}/api/v1/reports_api/report_data"
    params = {
        "path": "patient_detail_report",
        "id": subject,
        "key": "table_1",
        "unblinded": "false",
    }
    payload = {
        "path": "patient_detail_report",
        "study": study,
        "id": subject,
        "key": "table_1",
        "fields": {},
        "filters": [{"tableId": "table_1", "tableFilters": {}}],
        "pagination_details": {"order": "type", "reverseOrder": False, "page": 1, "limit": 500},
        # A fresh cache key per call, built from the subject and the current time.
        "cache_key": f"py_{subject}_{datetime.datetime.now().timestamp()}",
    }
    headers = {
        "Authorization": f"Bearer {jwt}",
        "Content-Type": "application/json",
        "lang": "en",
    }
    # Without a timeout requests waits forever on a stalled connection.
    resp = requests.post(url, params=params, json=payload, headers=headers, timeout=120)
    resp.raise_for_status()
    data = resp.json()
    notifications = []
    # `or` fallbacks: the keys may be present but null in the JSON payload.
    for row in data.get("data") or []:
        for notif in row.get("notification") or []:
            item = notif.get("item") or {}
            pk = item.get("pk")
            title = item.get("et_title")
            if pk and title:
                notifications.append({"pk": pk, "title": title, "event": row.get("event_event_id", "")})
    return notifications
def download_pdf(jwt, api_base, pk, title, out_path):
    """Download one notification PDF and write it to ``out_path``.

    Raises:
        requests.HTTPError: the API answered with an error status.
        requests.Timeout: the API did not answer within the timeout.
    """
    url = f"{BASE_URL}{api_base}/api/v1/meta_api/pdfnotification"
    params = {"pk": pk, "title": title, "html": "true"}
    headers = {
        "Authorization": f"Bearer {jwt}",
        "lang": "en",
        "Accept": "*/*",
    }
    # Without a timeout requests waits forever on a stalled connection.
    resp = requests.get(url, params=params, headers=headers, timeout=120)
    resp.raise_for_status()
    # The file is only created after the status check, so an HTTP error
    # never leaves an empty or partial PDF behind.
    with open(out_path, "wb") as f:
        f.write(resp.content)
def run(page, study):
    """Download all notification PDFs for every subject of ``study``.

    ``page`` must be a logged-in Playwright page with the study selected.
    PDFs go to DETAILS_DIR/<study>; a PDF whose target file already exists
    is skipped. A failure for one subject is printed and does not stop the
    remaining subjects.
    """
    import re  # local import: this module has no file-level `import re`

    out_dir = os.path.join(DETAILS_DIR, study)
    os.makedirs(out_dir, exist_ok=True)
    subjects = get_subjects(study)
    print(f" Nalezeno {len(subjects)} subjektů")
    today = datetime.date.today().strftime("%Y-%m-%d")
    # Load the report page so the session context (JWT in localStorage) is valid.
    page.goto(f"{BASE_URL}/report/patient_detail_report")
    page.wait_for_load_state("networkidle", timeout=120000)
    jwt, api_base = get_jwt_and_api_base(page, study)
    print(f" API base: {api_base}")
    for subject in subjects:
        print(f" [{subject}] Stahuji notifikace...")
        try:
            notifications = get_notifications(jwt, api_base, study, subject)
            if not notifications:
                print(f" [{subject}] Žádné notifikace")
                continue
            for notif in notifications:
                pk = notif["pk"]
                title = notif["title"]
                # The title comes from the server: replace path separators
                # and characters that are illegal in file names so it cannot
                # escape out_dir or make open() fail.
                safe_title = re.sub(r'[\\/:*?"<>|\r\n]+', "_", str(title)).strip()
                filename = os.path.join(out_dir, f"{today} {study} {subject} Notification {safe_title} pk{pk}.pdf")
                if os.path.exists(filename):
                    print(f" [{subject}] {title} (pk={pk}) — již existuje, přeskakuji")
                    continue
                # The API still receives the original, unsanitised title.
                download_pdf(jwt, api_base, pk, title, filename)
                print(f" [{subject}] {title} (pk={pk}) OK")
        except Exception as e:
            # Best effort per subject: report and continue with the next one.
            print(f" [{subject}] CHYBA při notifikacích: {e}")
    print(f" [{study}] Notifikace hotovo.")
def main():
    """Log in once per study and download that study's notification PDFs.

    Each study gets its own browser instance. An error inside run() is
    printed and the next study is processed; a login failure propagates.
    """
    os.makedirs(DETAILS_DIR, exist_ok=True)
    with sync_playwright() as p:
        for study in STUDIES:
            print(f"\n[{study}] Přihlášení...")
            browser = p.chromium.launch(headless=False)
            try:
                context = browser.new_context(accept_downloads=True)
                page = context.new_page()
                page.goto(BASE_URL)
                page.wait_for_load_state("networkidle")
                page.get_by_label("Email *").fill(EMAIL)
                page.get_by_label("Password *").fill(PASSWORD)
                page.locator("#login__submit").click()
                page.wait_for_load_state("networkidle")
                page.get_by_label("Study *").click()
                page.get_by_role("option", name=study).click()
                page.get_by_role("button", name="SELECT").click()
                page.wait_for_load_state("networkidle")
                try:
                    run(page, study)
                except Exception as e:
                    print(f" [{study}] CHYBA: {e}")
            finally:
                # Close the browser even when the login steps fail.
                browser.close()
    print("\nVše hotovo.")


# Guarded entry point: this module is also imported by the pipeline script
# for its run() function, and importing must not start a download.
if __name__ == "__main__":
    main()
@@ -0,0 +1,76 @@
from playwright.sync_api import sync_playwright
import os
import datetime
# ── CONFIG ──────────────────────────────────────────────────────────────────
BASE_URL = "https://janssen.4gclinical.com"
# SECURITY NOTE(review): the login credentials are committed to the
# repository. They can now be overridden through the IWRS_EMAIL and
# IWRS_PASSWORD environment variables; the literals remain only as a
# backward-compatible fallback and should be rotated and removed.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
# Study identifiers as they appear in the portal's study selector and in
# the downloaded report file names.
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
# All data folders live next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
CREATED_DIR = os.path.join(BASE_DIR, "CreatedReports")
# ────────────────────────────────────────────────────────────────────────────
def unique_path(directory, stem):
    """Return a path for ``<stem>.xlsx`` in ``directory`` that does not exist yet.

    Preference order: ``<stem>.xlsx``, then ``<stem> HHMM.xlsx`` (current
    time), then ``<stem> HHMM-2.xlsx``, ``<stem> HHMM-3.xlsx``, ... so that a
    second re-download within the same minute no longer overwrites the
    previous file.
    """
    candidate = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(candidate):
        return candidate
    time_tag = datetime.datetime.now().strftime("%H%M")
    candidate = os.path.join(directory, f"{stem} {time_tag}.xlsx")
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem} {time_tag}-{counter}.xlsx")
        counter += 1
    return candidate
def download_study(page, study, today):
    """Log in, select ``study`` and save its Subject Summary Report.

    The workbook is stored in INCOMING_DIR under a name built from
    ``today`` and ``study``. Returns the path of the saved file.
    """
    idle = "networkidle"
    long_wait = 120000  # milliseconds, for the report page and the download

    print(f"\n[{study}] Prihlaseni...")
    page.goto(BASE_URL)
    page.wait_for_load_state(idle)
    email_box = page.get_by_label("Email *")
    email_box.fill(EMAIL)
    password_box = page.get_by_label("Password *")
    password_box.fill(PASSWORD)
    page.locator("#login__submit").click()
    page.wait_for_load_state(idle)

    print(f"[{study}] Vyber studie...")
    page.get_by_label("Study *").click()
    study_option = page.get_by_role("option", name=study)
    study_option.click()
    page.get_by_role("button", name="SELECT").click()
    page.wait_for_load_state(idle)

    print(f"[{study}] Stahuji Subject Summary Report...")
    page.goto(f"{BASE_URL}/report/patient_summary_report")
    page.wait_for_load_state(idle, timeout=long_wait)
    filename = unique_path(INCOMING_DIR, f"{today} {study} Subject Summary Report")
    # The click triggers the download that expect_download() is waiting for.
    with page.expect_download(timeout=long_wait) as dl:
        page.get_by_role("button", name="Download XLS").click()
    download = dl.value
    download.save_as(filename)
    print(f"[{study}] OK -> {filename}")
    return filename
def main():
    """Download today's Subject Summary Report for every study in STUDIES.

    Each study is handled in its own browser instance; the saved paths are
    printed at the end.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    os.makedirs(INCOMING_DIR, exist_ok=True)
    os.makedirs(CREATED_DIR, exist_ok=True)
    downloaded = []
    with sync_playwright() as p:
        for study in STUDIES:
            browser = p.chromium.launch(headless=False)
            try:
                context = browser.new_context(accept_downloads=True)
                page = context.new_page()
                filename = download_study(page, study, today)
                downloaded.append((study, filename))
            finally:
                # Close the browser even when the download fails.
                browser.close()
    print("\nVse stazeno:")
    for study, path in downloaded:
        print(f" {study}: {path}")


# Guarded entry point so importing this module has no side effects.
if __name__ == "__main__":
    main()
+453
View File
@@ -0,0 +1,453 @@
"""
Importuje data z IWRS Excel reportů do MySQL (databáze studie).
Pořadí spuštění:
1. download_subject_summary.py
2. download_subject_details.py
3. tento skript
Každé spuštění vytvoří nový import_id v iwrs_import.
Reportovací skripty pracují vždy s MAX(import_id) pro danou studii.
"""
import os
import glob
import datetime
import re
import numpy as np
import pandas as pd
import mysql.connector
import db_config
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
# ── helpers ──────────────────────────────────────────────────────────────────
def get_conn():
    """Open a new MySQL connection using the settings from db_config."""
    settings = {
        "host": db_config.DB_HOST,
        "port": db_config.DB_PORT,
        "user": db_config.DB_USER,
        "password": db_config.DB_PASSWORD,
        "database": db_config.DB_NAME,
    }
    return mysql.connector.connect(**settings)
def _py(val):
    """Return a numpy scalar as its native Python equivalent; pass anything else through."""
    return val.item() if isinstance(val, np.generic) else val
def to_date(val):
    """Convert a pandas Timestamp / datetime / date / string to a date.

    Returns None for None, NaN, NaT, empty or placeholder strings, and for
    strings that match none of the supported formats.
    """
    val = _py(val)
    if val is None:
        return None
    try:
        # Covers float NaN, pandas NaT and other missing-value markers.
        if pd.isna(val):
            return None
    except (TypeError, ValueError):
        # pd.isna() on a non-scalar has no single truth value; fall through.
        pass
    # pd.Timestamp is a datetime subclass, so this branch handles it too.
    if isinstance(val, datetime.datetime):
        return val.date()
    if isinstance(val, datetime.date):
        return val
    text = str(val).strip()
    if text.lower() in ("", "nat", "nan", "none"):
        return None
    for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
        try:
            parsed = datetime.datetime.strptime(text, fmt)
        except ValueError:
            continue
        return parsed.date()
    return None
def to_int(val):
    """Convert ``val`` to an int, truncating any fractional part.

    Returns None when the value is missing (None, NaN), not numeric, or
    infinite.
    """
    val = _py(val)
    try:
        v = float(val)
        if v != v:  # v != v is True only for NaN
            return None
        # int(inf) raises OverflowError, which is now treated as "no value"
        # instead of aborting the import.
        return int(v)
    except (TypeError, ValueError, OverflowError):
        return None
def to_float(val):
    """Convert ``val`` to a float; return None for missing or non-numeric input."""
    val = _py(val)
    try:
        number = float(val)
    except (TypeError, ValueError):
        return None
    if number != number:  # NaN is the only value unequal to itself
        return None
    return number
def to_str(val):
    """Convert ``val`` to a stripped string.

    Returns None for None, NaN, and for text that is empty or reads
    "nan" / "nat" / "none" (case-insensitive).
    """
    val = _py(val)
    if val is None or (isinstance(val, float) and val != val):
        return None
    text = str(val).strip()
    if text.lower() in ("nan", "nat", "none", ""):
        return None
    return text
def find_summary_file(study):
    """Return the newest Subject Summary Report workbook for ``study``.

    "Newest" means latest modification time; Excel lock files ("~$...")
    are ignored. A warning is printed when that file's name does not start
    with today's date.

    Raises:
        FileNotFoundError: no summary report exists for the study.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    # The trailing "*" also matches the time-tagged names
    # ("... Subject Summary Report HHMM.xlsx") written when the report is
    # downloaded a second time on the same day.
    pattern = os.path.join(INCOMING_DIR, f"* {study} Subject Summary Report*.xlsx")
    candidates = [
        f for f in glob.glob(pattern)
        if not os.path.basename(f).startswith("~$")
    ]
    if not candidates:
        raise FileNotFoundError(f"Nenalezen Subject Summary Report pro {study}")
    newest = max(candidates, key=os.path.getmtime)
    if not os.path.basename(newest).startswith(today):
        print(f" UPOZORNĚNÍ: nejnovější Summary Report pro {study} není z dnešního dne ({os.path.basename(newest)[:10]})")
    return newest
def read_summary_df(path):
    """Read a Summary workbook and return a DataFrame starting at its header row.

    The header row is the first row containing a cell equal to "Subject".

    Raises:
        ValueError: no such row exists.
    """
    # First pass without a header, only to locate the title row.
    preview = pd.read_excel(path, header=None)
    header_row = next(
        (
            idx for idx, cells in preview.iterrows()
            if any(str(cell).strip() == "Subject" for cell in cells)
        ),
        None,
    )
    if header_row is None:
        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")
    return pd.read_excel(path, header=header_row)
def find_detail_files(study):
    """Return the sorted Subject Detail workbooks matching the newest summary's date.

    The date is the first ten characters ("YYYY-MM-DD") of the newest
    Summary Report's file name; Excel lock files ("~$...") are ignored.
    """
    summary_name = os.path.basename(find_summary_file(study))
    file_date = summary_name[:10]  # "YYYY-MM-DD"
    pattern = os.path.join(DETAILS_DIR, study, f"{file_date} {study} * Subject Detail.xlsx")
    return sorted(
        candidate for candidate in glob.glob(pattern)
        if not os.path.basename(candidate).startswith("~$")
    )
def parse_detail_visits(path):
    """Return the visit rows of a Subject Detail workbook as a list of dicts.

    The table starts below the first row containing a "Visit Type" cell on
    the "patient_detail_report" sheet. Columns are read by position; only
    rows whose first cell is "Past" or "Upcoming" are kept. Returns an
    empty list when no header row is found.
    """
    sheet = pd.read_excel(path, sheet_name="patient_detail_report", header=None)
    header_idx = next(
        (
            idx for idx, cells in sheet.iterrows()
            if any(str(cell).strip() == "Visit Type" for cell in cells)
        ),
        None,
    )
    if header_idx is None:
        return []

    body = sheet.iloc[header_idx + 1:].copy()
    body.columns = range(body.shape[1])

    # (result key, column position, converter) for every column after the
    # visit type in position 0.
    layout = (
        ("scheduled_date", 1, to_date),
        ("window_days", 2, to_str),
        ("actual_date", 3, to_date),
        ("irt_transaction_no", 4, to_int),
        ("irt_transaction_description", 5, to_str),
        ("medication_assignment", 6, to_str),
        ("quantity_assigned", 7, to_int),
        ("medication_id", 8, to_str),
    )
    rows = []
    for _, record in body.iterrows():
        visit_type = to_str(record.get(0))
        if visit_type not in ("Past", "Upcoming"):
            continue
        visit = {"visit_type": visit_type}
        for key, position, convert in layout:
            visit[key] = convert(record.get(position))
        rows.append(visit)
    return rows
# ── insert helpers ────────────────────────────────────────────────────────────
def insert_import(cursor, study, source_file):
    """Insert one iwrs_import row and return its generated import_id.

    Only the base name of ``source_file`` is stored; the timestamp is the
    current local time.
    """
    statement = "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)"
    imported_at = datetime.datetime.now()
    file_name = os.path.basename(source_file)
    cursor.execute(statement, (study, imported_at, file_name))
    return cursor.lastrowid
def insert_uco3001_summary(cursor, import_id, df):
    """Insert every row of the UCO3001 summary DataFrame for ``import_id``.

    Columns flagged as required must exist in ``df`` (a missing one raises
    KeyError); optional columns that are absent are stored as NULL. Each
    cell is passed through its converter (to_str / to_date / to_int /
    to_float) before insertion.
    """
    sql = """
INSERT INTO iwrs_uco3001_subject_summary (
import_id, subject, prior_subject_identifier, site, investigator, location,
cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight,
rescreened_subject, adt_ir, three_or_more_advanced_therapies,
only_oral_5asa_compounds, ustekinumab, isolated_proctitis,
clinical_responder_status_i12_m0, irt_subject_status,
i0_rand_date_local, last_irt_transaction,
last_irt_transaction_date_local, last_irt_transaction_date_utc,
next_irt_transaction, next_irt_transaction_date_local,
most_recent_med_assignment_date, days_since_last_med_assignment,
patient_forecast_status, patient_forecast_status_changed_date
) VALUES (
%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
)
"""
    # (Excel column, converter, required) in the exact order of the SQL
    # column list above, after import_id.
    fields = (
        ("Subject", to_str, True),
        ("Prior Subject Identifier", to_str, False),
        ("Site", to_str, True),
        ("Investigator", to_str, True),
        ("Location", to_str, True),
        ("Cohort per IRT", to_str, True),
        ("Informed Consent Date", to_date, True),
        ("Adolescent Assent Date", to_date, False),
        ("Subject's age collection", to_int, True),
        ("Subject's weight collection", to_float, False),
        ("Rescreened Subject", to_str, False),
        ("ADT-IR", to_str, False),
        ("3 or More Advanced Therapies", to_str, False),
        ("Only Oral 5-ASA Compounds", to_str, False),
        ("Ustekinumab", to_str, False),
        ("Isolated Proctitis", to_str, False),
        ("Clinical Responder Status at I-12 / M-0", to_str, False),
        ("IRT Subject Status", to_str, True),
        ("I0_RAND_TIMESTAMP_LOCAL [Local]", to_date, False),
        ("Last Recorded IRT Transaction", to_str, True),
        ("Last Recorded IRT Transaction Date [Local]", to_date, True),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, True),
        ("Next Expected IRT Transaction", to_str, True),
        ("Next Expected IRT Transaction Date [Local]", to_date, True),
        ("Most Recent Medication Assignment Transaction [Local]", to_date, False),
        ("Days Since Last Medication Assignment Transaction", to_int, False),
        ("Patient Forecast Status", to_str, False),
        ("Patient Forecast Status Changed Date (UTC)", to_date, False),
    )
    present = set(df.columns)
    for _, r in df.iterrows():
        values = [import_id]
        for name, convert, required in fields:
            if required or name in present:
                # For a required column this lookup raises KeyError when
                # the report lacks it.
                values.append(convert(r[name]))
            else:
                values.append(None)
        cursor.execute(sql, tuple(values))
def insert_mdd3003_summary(cursor, import_id, df):
    """Insert every row of the MDD3003 summary DataFrame for ``import_id``.

    Columns flagged as required must exist in ``df`` (a missing one raises
    KeyError); optional columns that are absent are stored as NULL. Each
    cell is passed through its converter before insertion.
    """
    sql = """
INSERT INTO iwrs_mdd3003_subject_summary (
import_id, subject, prior_subject_identifier, site, investigator, location,
cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age,
madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17,
stratification_country, age_group, stable_remitters, irt_subject_status,
last_irt_transaction, last_irt_transaction_date_local,
last_irt_transaction_date_utc, next_irt_transaction,
next_irt_transaction_date_local, date_screened, date_screen_failed,
date_randomized_part1, date_early_withdraw_randomized_part1,
date_open_label_induction, date_early_withdraw_open_label_induction,
date_randomized_part2, date_early_withdraw_randomized_part2,
date_completed, date_unblinded
) VALUES (
%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
)
"""
    # (Excel column, converter, required) in the exact order of the SQL
    # column list above, after import_id.
    fields = (
        ("Subject", to_str, True),
        ("Prior Subject Identifier", to_str, False),
        ("Site", to_str, True),
        ("Investigator", to_str, True),
        ("Location", to_str, True),
        ("Cohort per IRT", to_str, True),
        ("MADRS response criteria integrated or manually entered", to_str, False),
        ("Informed Consent Date", to_date, True),
        ("Subject's age collection", to_int, True),
        ("MADRS response criteria v1.5 from RAVE", to_str, False),
        ("MADRS response criteria v1.6 from RAVE", to_str, False),
        ("MADRS response criteria v1.7 from RAVE", to_str, False),
        ("Stratification Country", to_str, False),
        ("Age Group", to_str, False),
        ("Stable Remitters vs. Non Stable Remitters", to_str, False),
        ("IRT Subject Status", to_str, True),
        ("Last Recorded IRT Transaction", to_str, True),
        ("Last Recorded IRT Transaction Date [Local]", to_date, True),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, True),
        ("Next Expected IRT Transaction", to_str, True),
        ("Next Expected IRT Transaction Date [Local]", to_date, True),
        ("Date Screened [Local]", to_date, False),
        ("Date Screen Failed [Local]", to_date, False),
        ("Date Randomized Part 1 [Local]", to_date, False),
        ("Date Early Withdraw Randomized Part 1 [Local]", to_date, False),
        ("Date Open Label Induction [Local]", to_date, False),
        ("Date Early Withdraw Open Label Induction [Local]", to_date, False),
        ("Date Randomized Part 2 [Local]", to_date, False),
        ("Date Early Withdraw Randomized Part 2 [Local]", to_date, False),
        ("Date Completed [Local]", to_date, False),
        ("Date Unblinded [Local]", to_date, False),
    )
    col = df.columns.tolist()
    for _, r in df.iterrows():
        values = [import_id]
        for name, convert, required in fields:
            if required or name in col:
                values.append(convert(r[name]))
            else:
                values.append(None)
        cursor.execute(sql, tuple(values))
def insert_visits(cursor, import_id, study, subject, visits):
    """Insert one iwrs_subject_visits row per dict in ``visits``.

    Does nothing when ``visits`` is empty. Each dict must carry the nine
    visit keys listed in ``visit_keys``.
    """
    if not visits:
        return
    sql = """
INSERT INTO iwrs_subject_visits (
import_id, study, subject, visit_type, scheduled_date, window_days,
actual_date, irt_transaction_no, irt_transaction_description,
medication_assignment, quantity_assigned, medication_id
) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""
    # Same order as the column list in the statement, after the three
    # identifying columns.
    visit_keys = (
        "visit_type",
        "scheduled_date",
        "window_days",
        "actual_date",
        "irt_transaction_no",
        "irt_transaction_description",
        "medication_assignment",
        "quantity_assigned",
        "medication_id",
    )
    for visit in visits:
        params = (import_id, study, subject) + tuple(visit[key] for key in visit_keys)
        cursor.execute(sql, params)
# ── notifications ─────────────────────────────────────────────────────────────
def find_notification_json_files(study):
    """Return the sorted paths of all .json files in the study's details folder."""
    pattern = os.path.join(DETAILS_DIR, study, "*.json")
    matches = glob.glob(pattern)
    matches.sort()
    return matches
def import_notifications(conn, study):
    """Upsert the study's notification JSON/PDF pairs into iwrs_notifications.

    Every .json file in the study's details folder is read together with
    the same-named .pdf (if present) and written with INSERT ... ON
    DUPLICATE KEY UPDATE. After the commit the imported files are moved to
    the "Zpracováno" subfolder. A failure for one file is printed and does
    not stop the others.

    Returns the number of notifications written.
    """
    import json as json_lib
    import shutil

    json_files = find_notification_json_files(study)
    if not json_files:
        print(f" Žádné notifikace k importu pro {study}")
        return 0
    sql = """
INSERT INTO iwrs_notifications
(study, subject, pk, title, label, event, actual_date, text, pdf, source_file)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
label = VALUES(label),
text = VALUES(text),
pdf = VALUES(pdf),
source_file = VALUES(source_file)
"""
    done_dir = os.path.join(DETAILS_DIR, study, "Zpracováno")
    os.makedirs(done_dir, exist_ok=True)
    cursor = conn.cursor()
    imported = []  # (json_path, pdf_path) pairs whose row was written
    try:
        for json_path in json_files:
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    meta = json_lib.load(f)
                # splitext swaps only the extension; str.replace would also
                # rewrite a ".json" occurring elsewhere in the path.
                pdf_path = os.path.splitext(json_path)[0] + ".pdf"
                pdf_data = None
                if os.path.exists(pdf_path):
                    with open(pdf_path, "rb") as f:
                        pdf_data = f.read()
                cursor.execute(sql, (
                    meta.get("study", study),
                    meta.get("subject"),
                    meta.get("pk"),
                    meta.get("title"),
                    meta.get("label"),
                    meta.get("event"),
                    to_date(meta.get("actual_date")),
                    meta.get("text"),
                    pdf_data,
                    os.path.basename(json_path),
                ))
                imported.append((json_path, pdf_path))
            except Exception as e:
                print(f" CHYBA při importu {os.path.basename(json_path)}: {e}")
        conn.commit()
    finally:
        cursor.close()
    # Move files only AFTER the commit succeeded, so a failed commit never
    # leaves files in "Zpracováno" whose rows were not stored.
    for json_path, pdf_path in imported:
        try:
            shutil.move(json_path, os.path.join(done_dir, os.path.basename(json_path)))
            if os.path.exists(pdf_path):
                shutil.move(pdf_path, os.path.join(done_dir, os.path.basename(pdf_path)))
        except OSError as e:
            print(f" CHYBA při importu {os.path.basename(json_path)}: {e}")
    count = len(imported)
    print(f" Notifikací uloženo/přesunuto: {count}")
    return count
# ── main ──────────────────────────────────────────────────────────────────────
def import_study(conn, study):
    """Import the newest summary and its detail workbooks for ``study``.

    Creates a new iwrs_import row, inserts the summary rows into the
    study-specific table and the visits of every detail workbook, then
    commits. On any failure the transaction is rolled back and the
    exception is re-raised, so a later commit on the same connection cannot
    persist a half-written import.

    Returns the new import_id.
    """
    summary_path = find_summary_file(study)
    print(f" Summary: {os.path.basename(summary_path)}")
    df_summary = read_summary_df(summary_path)
    df_summary = df_summary.dropna(how="all")
    detail_files = find_detail_files(study)
    print(f" Detail souborů: {len(detail_files)}")
    # File name: "2026-05-04 77242113UCO3001 CZ100012001 Subject Detail.xlsx"
    name_pattern = re.compile(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx")
    cursor = conn.cursor()
    try:
        import_id = insert_import(cursor, study, summary_path)
        print(f" import_id = {import_id}")
        if study == "77242113UCO3001":
            insert_uco3001_summary(cursor, import_id, df_summary)
        else:
            insert_mdd3003_summary(cursor, import_id, df_summary)
        print(f" Summary řádků: {len(df_summary)}")
        visited = 0
        for path in detail_files:
            m = name_pattern.search(os.path.basename(path))
            subject = m.group(1) if m else "UNKNOWN"
            visits = parse_detail_visits(path)
            insert_visits(cursor, import_id, study, subject, visits)
            visited += len(visits)
        conn.commit()
    except Exception:
        # Discard the partial import before handing the error to the caller.
        conn.rollback()
        raise
    finally:
        cursor.close()
    print(f" Transakce uloženo: {visited}")
    return import_id
def main():
    """Import summaries, visits and notifications for every study in STUDIES.

    Errors of one step are printed and do not stop the remaining steps or
    studies. The connection is closed in all cases.
    """
    conn = get_conn()
    try:
        print("Připojeno k MySQL.\n")
        for study in STUDIES:
            print(f"[{study}]")
            try:
                import_id = import_study(conn, study)
                print(f" OK — import_id {import_id}")
            except Exception as e:
                print(f" CHYBA: {e}")
            try:
                import_notifications(conn, study)
            except Exception as e:
                print(f" CHYBA notifikace: {e}")
            print()
    finally:
        conn.close()
    print("Hotovo.")


# Guarded entry point so importing this module has no side effects.
if __name__ == "__main__":
    main()
+175
View File
@@ -0,0 +1,175 @@
"""
Kompletní pipeline:
1. Stažení Subject Summary Reportů (obě studie)
2. Stažení Subject Detail Reportů + notifikací (obě studie)
3. Import do MongoDB (subject_summary + visits + notifications)
Spusť tento skript místo samostatných skriptů.
"""
import os
import sys
import datetime
import glob
from playwright.sync_api import sync_playwright
import download_subject_details as dsd
import import_to_mongo
import import_notifications_to_mongo
# ── CONFIG ───────────────────────────────────────────────────────────────────
BASE_URL = "https://janssen.4gclinical.com"
# SECURITY NOTE(review): the login credentials are committed to the
# repository. They can now be overridden through the IWRS_EMAIL and
# IWRS_PASSWORD environment variables; the literals remain only as a
# backward-compatible fallback and should be rotated and removed.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
# Study identifiers as they appear in the portal's study selector and in
# the downloaded report file names.
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
# All data folders live next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
# ── helpers ───────────────────────────────────────────────────────────────────
def unique_path(directory, stem):
    """Return a path for ``<stem>.xlsx`` in ``directory`` that does not exist yet.

    Preference order: ``<stem>.xlsx``, then ``<stem> HHMM.xlsx`` (current
    time), then ``<stem> HHMM-2.xlsx``, ``<stem> HHMM-3.xlsx``, ... so that a
    second re-download within the same minute no longer overwrites the
    previous file.
    """
    candidate = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(candidate):
        return candidate
    time_tag = datetime.datetime.now().strftime("%H%M")
    candidate = os.path.join(directory, f"{stem} {time_tag}.xlsx")
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem} {time_tag}-{counter}.xlsx")
        counter += 1
    return candidate
def login(page, study):
    """Sign in on the portal with EMAIL / PASSWORD and select ``study``.

    Waits for the network to go idle after each navigation step.
    """
    idle = "networkidle"
    page.goto(BASE_URL)
    page.wait_for_load_state(idle)
    # Fill the two credential fields in form order.
    for field_label, value in (("Email *", EMAIL), ("Password *", PASSWORD)):
        page.get_by_label(field_label).fill(value)
    page.locator("#login__submit").click()
    page.wait_for_load_state(idle)
    # Open the study selector, choose the study, confirm.
    page.get_by_label("Study *").click()
    study_option = page.get_by_role("option", name=study)
    study_option.click()
    confirm_button = page.get_by_role("button", name="SELECT")
    confirm_button.click()
    page.wait_for_load_state(idle)
# ── KROK 1: Subject Summary ───────────────────────────────────────────────────
def download_summary(page, study, today):
    """Download the Subject Summary Report of the selected study.

    The workbook is saved in INCOMING_DIR under a name built from ``today``
    and ``study``. Returns the path of the saved file.
    """
    long_wait = 120000  # milliseconds
    print(f" [{study}] Stahuji Subject Summary Report...")
    page.goto(f"{BASE_URL}/report/patient_summary_report")
    page.wait_for_load_state("networkidle", timeout=long_wait)
    filename = unique_path(INCOMING_DIR, f"{today} {study} Subject Summary Report")
    # The click triggers the download that expect_download() is waiting for.
    with page.expect_download(timeout=long_wait) as dl:
        page.get_by_role("button", name="Download XLS").click()
    download = dl.value
    download.save_as(filename)
    print(f" [{study}] Summary OK -> {os.path.basename(filename)}")
    return filename
# ── KROK 2: Subject Details ───────────────────────────────────────────────────
def get_subjects_from_summary(summary_path):
    """Return the subject IDs listed in a Subject Summary workbook.

    The header row is the first row containing a cell equal to "Subject".

    Raises:
        ValueError: no such row exists.
    """
    import pandas as pd

    # First pass without a header, only to locate the title row.
    preview = pd.read_excel(summary_path, header=None)
    header_row = next(
        (
            idx for idx, cells in preview.iterrows()
            if any(str(cell).strip() == "Subject" for cell in cells)
        ),
        None,
    )
    if header_row is None:
        raise ValueError("Hlavičkový řádek nenalezen")
    table = pd.read_excel(summary_path, header=header_row)
    subject_column = table["Subject"].dropna().astype(str).str.strip()
    return subject_column.tolist()
def download_details(page, study, summary_path, today):
    """Download one Subject Detail workbook per subject of the summary.

    For each subject the ID is typed into the search field, the first
    suggestion is chosen, the XLS is downloaded into DETAILS_DIR/<study>
    and the search is cleared again.
    """
    long_wait = 120000  # milliseconds
    idle = "networkidle"

    out_dir = os.path.join(DETAILS_DIR, study)
    os.makedirs(out_dir, exist_ok=True)
    subjects = get_subjects_from_summary(summary_path)
    print(f" [{study}] Subjektů k stažení: {len(subjects)}")
    page.goto(f"{BASE_URL}/report/patient_detail_report")
    page.wait_for_load_state(idle, timeout=long_wait)

    for subject in subjects:
        target = os.path.join(out_dir, f"{today} {study} {subject} Subject Detail.xlsx")
        search_box = page.locator('input[placeholder="search"], input[type="text"]').first
        search_box.click()
        search_box.fill(subject)
        # Short pause before picking the first suggestion.
        page.wait_for_timeout(500)
        first_option = page.locator("mat-option").first
        first_option.dispatch_event("click")
        page.wait_for_load_state(idle, timeout=long_wait)
        with page.expect_download(timeout=long_wait) as dl:
            page.get_by_role("button", name="Download XLS").click()
        download = dl.value
        download.save_as(target)
        print(f" [{study}] Detail {subject} OK")
        page.get_by_role("button", name="Clear").click()
        page.wait_for_load_state(idle, timeout=long_wait)
# ── KROK 3: Import do MongoDB ────────────────────────────────────────────────
def main():
    """Download the IWRS reports for every study, then import them into MongoDB.

    Steps 1 and 2 run per study in a freshly launched browser: log in,
    download the Subject Summary Report, then call ``dsd.run``. Any exception
    in that phase is printed and the study is recorded with a ``None``
    summary path, which makes step 3 skip it. Step 3 calls
    ``import_to_mongo.run`` for each successfully downloaded study and
    finally ``import_notifications_to_mongo.main`` once for all studies.
    """

    def banner(title):
        # Three-line section header used between the pipeline steps.
        print("\n" + "=" * 60)
        print(title)
        print("=" * 60)

    today = datetime.date.today().strftime("%Y-%m-%d")
    for folder in (INCOMING_DIR, DETAILS_DIR):
        os.makedirs(folder, exist_ok=True)

    summary_paths = {}

    # Steps 1 + 2: downloads (Playwright, each study separately because of
    # the login session).
    with sync_playwright() as p:
        for study in STUDIES:
            banner(f"[{study}] KROK 1: Subject Summary Report")

            browser = p.chromium.launch(headless=False)
            page = browser.new_context(accept_downloads=True).new_page()
            try:
                login(page, study)
                summary_paths[study] = download_summary(page, study, today)

                print(f"\n[{study}] KROK 2: Subject Detail Reports + notifikace")
                dsd.run(page, study)
            except Exception as err:
                print(f" [{study}] CHYBA při stahování: {err}")
                # Mark the study as failed so that step 3 skips it.
                summary_paths[study] = None
            finally:
                browser.close()

    # Step 3: import into MongoDB.
    banner("KROK 3: Import do MongoDB")
    for study in STUDIES:
        downloaded_summary = summary_paths.get(study)
        if downloaded_summary:
            try:
                import_to_mongo.run(study, downloaded_summary, DETAILS_DIR, today)
            except Exception as err:
                print(f" [{study}] CHYBA při importu summary/visits: {err}")
        else:
            print(f" [{study}] PŘESKOČENO — stahování selhalo")

    # Notifications: PDF/JSON files from disk go straight into the Mongo
    # iwrs_notifications collection.
    print("\n [notifikace] import PDF/JSON do Mongo...")
    try:
        import_notifications_to_mongo.main(STUDIES)
    except Exception as err:
        print(f" CHYBA při importu notifikací: {err}")

    banner("Vše hotovo.")
if __name__ == "__main__":
    # Run the pipeline only when the file is executed as a script, so that
    # importing this module does not launch browsers or start downloads.
    main()
+172
View File
@@ -0,0 +1,172 @@
from playwright.sync_api import sync_playwright
import re
import os
import datetime
import mysql.connector
import db_config
def get_existing_pks(study):
    """Return the set of notification pks already stored in the DB for a study.

    This is a best-effort lookup: if anything fails (connecting, querying or
    closing), a warning is printed and an empty set is returned, so the
    caller simply downloads every notification.

    Args:
        study: Study identifier matched against ``iwrs_notifications.study``.

    Returns:
        A set with the first column (``pk``) of every matching row, or an
        empty set on failure.
    """
    try:
        conn = mysql.connector.connect(
            host=db_config.DB_HOST, port=db_config.DB_PORT,
            user=db_config.DB_USER, password=db_config.DB_PASSWORD,
            database=db_config.DB_NAME,
        )
        # try/finally guarantees the cursor and connection are released even
        # when the query itself fails; previously they leaked on that path.
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(
                    "SELECT pk FROM iwrs_notifications WHERE study = %s", (study,)
                )
                return {row[0] for row in cursor.fetchall()}
            finally:
                cursor.close()
        finally:
            conn.close()
    except Exception as e:
        # Deliberate catch-all: the DB is only an optimisation for skipping
        # already-imported notifications.
        print(f" UPOZORNĚNÍ: nelze načíst existující pk z DB ({e}), stahuji vše")
        return set()
# Base URL of the IWRS portal; report pages and API paths are appended to it.
BASE_URL = "https://janssen.4gclinical.com"

# SECURITY(review): the portal credentials below are hard-coded in a file that
# is committed to version control. The environment variables IWRS_EMAIL and
# IWRS_PASSWORD now take precedence; the literals remain only as a
# backward-compatible fallback and should be removed (and the password
# rotated) once the environment is configured.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")

# Study and subject this script downloads notifications for.
STUDY = "77242113UCO3001"
SUBJECT = "CZ100222003"

# Output root: <directory of this file>/IncomingSourceReportsDetails
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
def strip_html(html):
    """Convert an HTML fragment to plain text.

    ``<br>`` tags (any case, optionally self-closing) become newlines, every
    other tag is removed, runs of three or more newlines are collapsed to a
    single blank line and the result is stripped of surrounding whitespace.
    HTML entities are left untouched.

    Args:
        html: HTML source. ``None`` or an empty string yields ``""`` so that
            a JSON ``null`` body does not abort processing.

    Returns:
        The plain-text rendering as a string.
    """
    if not html:
        return ""
    text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def _login(page):
    """Log in to the portal with EMAIL/PASSWORD and select STUDY."""
    print("Přihlašuji se...")
    page.goto(BASE_URL)
    page.wait_for_load_state("networkidle")
    page.get_by_label("Email *").fill(EMAIL)
    page.get_by_label("Password *").fill(PASSWORD)
    page.locator("#login__submit").click()
    page.wait_for_load_state("networkidle")
    page.get_by_label("Study *").click()
    page.get_by_role("option", name=STUDY).click()
    page.get_by_role("button", name="SELECT").click()
    page.wait_for_load_state("networkidle")


def _resolve_api_base(page, jwt):
    """Return ``api_base_url`` of the app instance whose label contains STUDY."""
    instances = page.evaluate("""async (jwt) => {
        const res = await fetch('/_/api/dispatch/app_instances/', {
            headers: { 'Authorization': `Bearer ${jwt}` }
        });
        return res.json();
    }""", jwt)
    instance = next((i for i in instances if STUDY in i.get("label", "")), None)
    if not instance:
        raise ValueError(f"Instance pro {STUDY} nenalezena")
    return instance["api_base_url"]


def _fetch_subject_report(page):
    """Select SUBJECT in the report search box and return the table_1 JSON."""
    print(f"Vybírám subjekt {SUBJECT}...")
    input_field = page.locator('input[placeholder="search"], input[type="text"]').first
    input_field.click()
    input_field.fill(SUBJECT)
    page.wait_for_timeout(1000)
    # Capture the report_data/table_1 response triggered by picking the
    # first autocomplete option.
    with page.expect_response(
        lambda r: "report_data" in r.url and "table_1" in r.url,
        timeout=60000
    ) as resp_info:
        page.locator("mat-option").first.dispatch_event("click")
    return resp_info.value.json()


def _save_notification(page, jwt, api_base, out_dir, safe_label, record):
    """Download one notification PDF and write it plus a JSON side-car file.

    ``record`` is the dict that is dumped to JSON; its ``pk``, ``title`` and
    ``actual_date`` entries also drive the request and the file name.
    """
    import json
    from urllib.parse import quote

    pk = record["pk"]
    actual_date = record["actual_date"]

    pdf_filename = os.path.join(out_dir, f"{actual_date}_{safe_label}.pdf")
    if os.path.exists(pdf_filename):
        # Same date and label already on disk: disambiguate with the pk.
        pdf_filename = os.path.join(out_dir, f"{actual_date}_{safe_label}_pk{pk}.pdf")

    # The title is free text, so it must be percent-encoded; an unescaped
    # "&" or "#" would otherwise truncate or corrupt the query string.
    encoded_title = quote(str(record["title"]), safe="")
    pdf_url = (
        f"{BASE_URL}{api_base}/api/v1/meta_api/pdfnotification"
        f"?pk={pk}&title={encoded_title}&html=true"
    )
    pdf_resp = page.request.get(pdf_url, headers={
        "Authorization": f"Bearer {jwt}",
        "lang": "en",
        "prancer_study": STUDY,
        "Accept": "application/json, text/plain, */*",
    })
    if not pdf_resp.ok:
        print(f" → PDF chyba: {pdf_resp.status}")
        return

    with open(pdf_filename, "wb") as f:
        f.write(pdf_resp.body())
    print(f" → PDF uloženo: {os.path.basename(pdf_filename)}")

    # splitext swaps only the extension; str.replace would also rewrite any
    # ".pdf" occurring earlier in the path or label.
    json_filename = os.path.splitext(pdf_filename)[0] + ".json"
    with open(json_filename, "w", encoding="utf-8") as f:
        json.dump(record, f, ensure_ascii=False, indent=2)
    print(f" → JSON uloženo: {os.path.basename(json_filename)}")


def main():
    """Download the notifications of SUBJECT in STUDY as PDF + JSON files.

    Notifications whose pk is already present in the database (see
    ``get_existing_pks``) are printed but not downloaded again. Files are
    written to ``DETAILS_DIR/STUDY``.

    Raises:
        ValueError: If no JWT is found after login or no app instance
            matches STUDY.
    """
    existing_pks = get_existing_pks(STUDY)
    print(f"V DB již existuje {len(existing_pks)} notifikací pro {STUDY}")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False, args=["--start-maximized"])
        # Close the browser on every exit path, including errors.
        try:
            context = browser.new_context(no_viewport=True)
            page = context.new_page()

            _login(page)

            page.goto(f"{BASE_URL}/report/patient_detail_report")
            page.wait_for_load_state("networkidle", timeout=60000)

            # JWT + api_base
            jwt = page.evaluate("localStorage.getItem('JWT.access')")
            if not jwt:
                # Fail with a clear message instead of a TypeError on slicing.
                raise ValueError("JWT nenalezen v localStorage — přihlášení pravděpodobně selhalo")
            print(f"JWT: {jwt[:30]}...")

            api_base = _resolve_api_base(page, jwt)
            print(f"API base: {api_base}")

            # Select the subject and capture the table_1 response directly.
            data = _fetch_subject_report(page)

            out_dir = os.path.join(DETAILS_DIR, STUDY)
            os.makedirs(out_dir, exist_ok=True)

            print(f"\n{'='*60}")
            print(f"Subjekt: {SUBJECT} | Studie: {STUDY}")
            print(f"{'='*60}")

            count = 0
            for row in data.get("data", []):
                for notif in (row.get("notification") or []):
                    # "item" and "body" may be JSON null; treat both as empty.
                    item = notif.get("item") or {}
                    pk = item.get("pk")
                    title = item.get("et_title")
                    label = (notif.get("label") or title or "").strip()
                    # Whole label, spaces -> underscores, characters that are
                    # not allowed in file names removed.
                    safe_label = re.sub(r'[\\/*?:"<>|]', "", label).replace(" ", "_")
                    text = strip_html(item.get("body") or "")

                    count += 1
                    print(f"\n--- Notifikace #{count}: {safe_label} (pk={pk}) | event: {row.get('event_event_id')} ---")
                    print(text)

                    if pk in existing_pks:
                        print(f" → pk={pk} již v DB, přeskakuji")
                        continue

                    actual_date = row.get("actual_date_raw", "0000-00-00")
                    record = {
                        "pk": pk,
                        "title": title,
                        "label": label,
                        "event": row.get("event_event_id"),
                        "actual_date": actual_date,
                        "subject": SUBJECT,
                        "study": STUDY,
                        "text": text,
                    }
                    _save_notification(page, jwt, api_base, out_dir, safe_label, record)

                    # Short pause between consecutive PDF requests.
                    page.wait_for_timeout(300)

            if count == 0:
                print("Žádné notifikace nalezeny.")
            else:
                print(f"\n{'='*60}")
                print(f"Celkem notifikací: {count}")
        finally:
            browser.close()
if __name__ == "__main__":
    # Run only when executed as a script, so that importing this module does
    # not log in to the portal or open a browser window.
    main()