z230
This commit is contained in:
@@ -0,0 +1,56 @@
|
||||
# Kontext práce — IWRS Notifications Pipeline
|
||||
## Datum: 2026-06-01
|
||||
|
||||
## Co bylo uděláno
|
||||
|
||||
### Nové soubory
|
||||
- `download_subject_notifications.py` — standalone skript pro stažení notifikací (referenční, nepoužívaný v pipeline)
|
||||
- `test_notifications.py` — testovací skript pro jednoho pacienta (CZ100222003 / UCO3001)
|
||||
- `create_iwrs_tables.py` — jednorázový skript pro vytvoření MySQL tabulek
|
||||
|
||||
### Upravené soubory
|
||||
- `download_subject_details.py` — přidáno stahování notifikací (PDF + JSON) pro každý subjekt přímo v loopě
|
||||
- `import_to_mysql.py` — přidána funkce `import_notifications()`, která importuje JSON+PDF do DB a přesouvá je do `Zpracováno/`
|
||||
- `create_iwrs_tables.sql` — přidána tabulka `iwrs_notifications`
|
||||
- `run_all.py` — krok 2 nyní volá `dsd.run()` z `download_subject_details.py`
|
||||
|
||||
## Jak to funguje
|
||||
|
||||
### Stahování notifikací (v `download_subject_details.py`)
|
||||
1. Při výběru subjektu se zachytí `table_1` API response (obsahuje notifikace s `pk`, `et_title`, `label`, `body`, `actual_date_raw`)
|
||||
2. Porovná `pk` s DB (`iwrs_notifications`) — stahuje jen nové
|
||||
3. Stáhne PDF přes `page.request.get()` s Bearer tokenem (JWT se načítá čerstvě před každým requestem)
|
||||
4. Uloží PDF + JSON do `IncomingSourceReportsDetails/{study}/`
|
||||
5. Název souboru: `{actual_date_raw}_{label_s_podtržítky}.pdf` (při kolizi přidá `_pk{pk}`)
|
||||
|
||||
### API endpointy
|
||||
- **Notifikace data**: `POST /_/p/{instance_id}/api/v1/reports_api/report_data?path=patient_detail_report&id={subject}&key=table_1&unblinded=false`
|
||||
- **PDF download**: `GET /_/p/{instance_id}/api/v1/meta_api/pdfnotification?pk={pk}&title={et_title}&html=true`
|
||||
- **app_instances** (pro zjištění instance_id): `GET /_/api/dispatch/app_instances/`
|
||||
- Headers: `Authorization: Bearer {JWT}`, `lang: en`, `prancer_study: {study_code}`
|
||||
|
||||
### Instance ID mapping
|
||||
- `77242113UCO3001` → `/_/p/106`
|
||||
- `42847922MDD3003` → `/_/p/70`
|
||||
- `77242113CRD3001` → `/_/p/103`
|
||||
|
||||
### Import (`import_to_mysql.py`)
|
||||
- Čte všechny `.json` soubory z `IncomingSourceReportsDetails/{study}/`
|
||||
- Načte příslušné `.pdf` jako binární data
|
||||
- Uloží do tabulky `iwrs_notifications` (UNIQUE KEY na `pk` — bez duplikátů)
|
||||
- Přesune soubory do `IncomingSourceReportsDetails/{study}/Zpracováno/`
|
||||
|
||||
## MySQL tabulka `iwrs_notifications`
|
||||
```sql
|
||||
id, study, subject, pk (UNIQUE), title, label, event, actual_date, text (TEXT), pdf (MEDIUMBLOB), source_file, imported_at
|
||||
```
|
||||
|
||||
## Aktuální stav
|
||||
- UCO3001: ~76 notifikací importováno
|
||||
- MDD3003: ~119 notifikací importováno (část 403 chyb — JWT expiroval, opraveno načítáním JWT čerstvě)
|
||||
- MDD3003 notifikace s 403 čekají na příští `run_all.py` (soubory nejsou v `Zpracováno` a jejich `pk` není v `iwrs_notifications`, takže se znovu stáhnou)
|
||||
|
||||
## Co zbývá / možná vylepšení
|
||||
- Ověřit, že MDD3003 403 chyby jsou opraveny (JWT refresh)
|
||||
- `CZ100132003` UCO3001 — timeout při stahování XLS (subjekt přeskočen, zkusit znovu)
|
||||
- Případně přidat retry logiku pro timeout
|
||||
BIN
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,39 @@
|
||||
"""
|
||||
Jednorázový skript — vytvoří/aktualizuje tabulky v MySQL.
|
||||
Spusť jednou: python create_iwrs_tables.py
|
||||
"""
|
||||
import os
|
||||
import mysql.connector
|
||||
import db_config
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
SQL_FILE = os.path.join(BASE_DIR, "create_iwrs_tables.sql")
|
||||
|
||||
conn = mysql.connector.connect(
|
||||
host=db_config.DB_HOST,
|
||||
port=db_config.DB_PORT,
|
||||
user=db_config.DB_USER,
|
||||
password=db_config.DB_PASSWORD,
|
||||
database=db_config.DB_NAME,
|
||||
)
|
||||
cursor = conn.cursor()
|
||||
|
||||
sql = open(SQL_FILE, encoding="utf-8").read()
|
||||
# Odstraň komentáře a rozdělíme na příkazy
|
||||
stmts = [s.strip() for s in sql.split(";")]
|
||||
for stmt in stmts:
|
||||
# Odstraň řádkové komentáře
|
||||
lines = [l for l in stmt.splitlines() if not l.strip().startswith("--")]
|
||||
stmt = "\n".join(lines).strip()
|
||||
if not stmt or stmt.upper().startswith("USE"):
|
||||
continue
|
||||
try:
|
||||
cursor.execute(stmt)
|
||||
print(f"OK: {stmt[:80]}")
|
||||
except Exception as e:
|
||||
print(f"SKIP: {e}")
|
||||
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
conn.close()
|
||||
print("\nHotovo.")
|
||||
@@ -0,0 +1,128 @@
|
||||
-- IWRS tabulky pro databázi studie
|
||||
-- Spustit jednou: mysql -h 192.168.1.76 -u root -p studie < create_iwrs_tables.sql
|
||||
|
||||
USE studie;

-- Import log. The per-study tables below reference it through import_id.
CREATE TABLE IF NOT EXISTS iwrs_import (
    import_id   INT          AUTO_INCREMENT PRIMARY KEY,
    study       VARCHAR(20)  NOT NULL,
    imported_at DATETIME     NOT NULL DEFAULT CURRENT_TIMESTAMP,
    source_file VARCHAR(500) NOT NULL,
    INDEX idx_study (study)
);
|
||||
|
||||
-- UCO3001 subject summary. Each row belongs to one iwrs_import entry.
CREATE TABLE IF NOT EXISTS iwrs_uco3001_subject_summary (
    id                                   INT AUTO_INCREMENT PRIMARY KEY,
    import_id                            INT NOT NULL,
    subject                              VARCHAR(20) NOT NULL,
    prior_subject_identifier             VARCHAR(20),
    site                                 VARCHAR(50),
    investigator                         VARCHAR(100),
    location                             VARCHAR(50),
    cohort_per_irt                       VARCHAR(100),
    informed_consent_date                DATE,
    adolescent_assent_date               DATE,
    age                                  SMALLINT,
    weight                               DECIMAL(5,1),
    rescreened_subject                   VARCHAR(10),
    adt_ir                               VARCHAR(10),
    three_or_more_advanced_therapies     VARCHAR(10),
    only_oral_5asa_compounds             VARCHAR(10),
    ustekinumab                          VARCHAR(10),
    isolated_proctitis                   VARCHAR(10),
    clinical_responder_status_i12_m0     VARCHAR(100),
    irt_subject_status                   VARCHAR(50),
    i0_rand_date_local                   DATE,
    last_irt_transaction                 VARCHAR(100),
    last_irt_transaction_date_local      DATE,
    last_irt_transaction_date_utc        DATE,
    next_irt_transaction                 VARCHAR(100),
    next_irt_transaction_date_local      DATE,
    most_recent_med_assignment_date      DATE,
    days_since_last_med_assignment       SMALLINT,
    patient_forecast_status              VARCHAR(50),
    patient_forecast_status_changed_date DATE,
    FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
    INDEX idx_import (import_id),
    INDEX idx_subject (subject)
);
|
||||
|
||||
-- MDD3003 subject summary. Each row belongs to one iwrs_import entry.
CREATE TABLE IF NOT EXISTS iwrs_mdd3003_subject_summary (
    id                                       INT AUTO_INCREMENT PRIMARY KEY,
    import_id                                INT NOT NULL,
    subject                                  VARCHAR(20) NOT NULL,
    prior_subject_identifier                 VARCHAR(20),
    site                                     VARCHAR(50),
    investigator                             VARCHAR(100),
    location                                 VARCHAR(50),
    cohort_per_irt                           VARCHAR(50),
    madrs_criteria_integrated                VARCHAR(50),
    informed_consent_date                    DATE,
    age                                      SMALLINT,
    madrs_criteria_v15                       VARCHAR(10),
    madrs_criteria_v16                       VARCHAR(10),
    madrs_criteria_v17                       VARCHAR(10),
    stratification_country                   VARCHAR(10),
    age_group                                VARCHAR(20),
    stable_remitters                         VARCHAR(50),
    irt_subject_status                       VARCHAR(100),
    last_irt_transaction                     VARCHAR(100),
    last_irt_transaction_date_local          DATE,
    last_irt_transaction_date_utc            DATE,
    next_irt_transaction                     VARCHAR(100),
    next_irt_transaction_date_local          DATE,
    date_screened                            DATE,
    date_screen_failed                       DATE,
    date_randomized_part1                    DATE,
    date_early_withdraw_randomized_part1     DATE,
    date_open_label_induction                DATE,
    date_early_withdraw_open_label_induction DATE,
    date_randomized_part2                    DATE,
    date_early_withdraw_randomized_part2     DATE,
    date_completed                           DATE,
    date_unblinded                           DATE,
    FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
    INDEX idx_import (import_id),
    INDEX idx_subject (subject)
);
|
||||
|
||||
-- Notifications: metadata, text body and the PDF itself, one row per pk.
-- NOTE(review): pk is UNIQUE across all studies here. Confirm that pk values
-- cannot collide between the separate study instances, otherwise the key
-- should probably be (study, pk).
CREATE TABLE IF NOT EXISTS iwrs_notifications (
    id          INT          AUTO_INCREMENT PRIMARY KEY,
    study       VARCHAR(20)  NOT NULL,
    subject     VARCHAR(20)  NOT NULL,
    pk          INT          NOT NULL,
    title       VARCHAR(100),
    label       VARCHAR(500),
    event       VARCHAR(50),
    actual_date DATE,
    text        TEXT,
    pdf         MEDIUMBLOB,
    source_file VARCHAR(500),
    imported_at DATETIME     NOT NULL DEFAULT CURRENT_TIMESTAMP,
    UNIQUE KEY uq_pk (pk),
    INDEX idx_study_subject (study, subject)
);
|
||||
|
||||
-- Subject visits / transactions, shared by both studies (see the study column).
CREATE TABLE IF NOT EXISTS iwrs_subject_visits (
    id                          INT AUTO_INCREMENT PRIMARY KEY,
    import_id                   INT NOT NULL,
    study                       VARCHAR(20) NOT NULL,
    subject                     VARCHAR(20) NOT NULL,
    visit_type                  ENUM('Past','Upcoming') NOT NULL,
    scheduled_date              DATE,
    window_days                 VARCHAR(20),
    actual_date                 DATE,
    irt_transaction_no          SMALLINT,
    irt_transaction_description VARCHAR(200),
    medication_assignment       VARCHAR(200),
    quantity_assigned           SMALLINT,
    medication_id               VARCHAR(20),
    FOREIGN KEY (import_id) REFERENCES iwrs_import(import_id),
    INDEX idx_import (import_id),
    INDEX idx_study_subject (study, subject)
);
|
||||
@@ -0,0 +1,310 @@
|
||||
import os
|
||||
import glob
|
||||
import datetime
|
||||
import pandas as pd
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.styles import (
|
||||
Font, PatternFill, Alignment, Border, Side, GradientFill
|
||||
)
|
||||
from openpyxl.utils import get_column_letter
|
||||
|
||||
# Directories are resolved relative to this script's own location.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
CREATED_DIR = os.path.join(BASE_DIR, "CreatedReports")

# Study codes a report is produced for.
STUDIES = ["77242113UCO3001", "42847922MDD3003"]

# One entry per overview column:
# (column name in the source export, header shown in the report, column width)
_COLUMN_SPECS = [
    ("Subject", "Subject", 14),
    ("Investigator", "Investigator", 22),
    ("Subject's age collection", "Věk", 6),
    ("Cohort per IRT", "Cohort", 12),
    ("IRT Subject Status", "Status", 14),
    ("Last Recorded IRT Transaction", "Last IRT", 12),
    ("Next Expected IRT Transaction", "Next Visit", 12),
    ("Next Expected IRT Transaction Date [Local]", "Next Date", 13),
]

SOURCE_COLS = [source for source, _, _ in _COLUMN_SPECS]
DISPLAY_HEADERS = [shown for _, shown, _ in _COLUMN_SPECS]
COL_WIDTHS = [width for _, _, width in _COLUMN_SPECS]
|
||||
|
||||
# ── Styles ───────────────────────────────────────────────────────────────────
def _arial(**options):
    """Return an Arial ``Font`` with the given extra options."""
    return Font(name="Arial", **options)


def _solid(hex_color):
    """Return a solid ``PatternFill`` in the given RGB hex colour."""
    return PatternFill("solid", fgColor=hex_color)


HEADER_FILL = _solid("1F4E79")
HEADER_FONT = _arial(bold=True, color="FFFFFF", size=10)
NORMAL_FONT = _arial(size=10)
BOLD_FONT = _arial(bold=True, size=10)
STRIKE_FONT = _arial(size=10, strike=True, color="999999")
ADOLESC_FONT = _arial(bold=True, size=10)

# Thin light-grey frame drawn around every table cell.
THIN = Side(style="thin", color="CCCCCC")
BORDER = Border(left=THIN, right=THIN, top=THIN, bottom=THIN)

# Alternating row backgrounds.
EVEN_FILL = _solid("EBF3FB")
ODD_FILL = _solid("FFFFFF")

CENTER = Alignment(horizontal="center", vertical="center", wrap_text=False)
LEFT = Alignment(horizontal="left", vertical="center", wrap_text=False)
|
||||
|
||||
|
||||
def unique_path(directory, stem):
    """Return a path for ``<stem>.xlsx`` in *directory* that does not exist yet.

    The plain name is used when it is free. Otherwise the current time
    (``HHMM``) is appended, and if that name is taken as well (a second run
    within the same minute) a running number is added, so an existing file is
    never returned and therefore never overwritten.

    Args:
        directory: Target directory.
        stem: File name without the ``.xlsx`` extension.

    Returns:
        A path inside *directory* that did not exist at the time of the call.
    """
    candidate = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(candidate):
        return candidate

    time_tag = datetime.datetime.now().strftime("%H%M")
    candidate = os.path.join(directory, f"{stem} {time_tag}.xlsx")
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem} {time_tag} ({counter}).xlsx")
        counter += 1
    return candidate
|
||||
|
||||
|
||||
def find_latest_source(study):
    """Return the most recently modified Subject Summary Report for *study*.

    The pattern also accepts a suffix after "Report": repeated downloads on
    the same day are stored as "... Subject Summary Report HHMM.xlsx" and were
    previously never matched. Office lock files ("~$...") are ignored.

    Args:
        study: Study code that appears in the file name.

    Returns:
        Path of the newest matching file in ``INCOMING_DIR``.

    Raises:
        FileNotFoundError: If no matching file exists.
    """
    pattern = os.path.join(INCOMING_DIR, f"* {study} Subject Summary Report*.xlsx")
    candidates = [
        path for path in glob.glob(pattern)
        if not os.path.basename(path).startswith("~$")
    ]
    if not candidates:
        raise FileNotFoundError(f"Nenalezen zdrojový soubor pro {study} v {INCOMING_DIR}")
    return max(candidates, key=os.path.getmtime)
|
||||
|
||||
|
||||
def load_source(path):
    """Read the report at *path* into a DataFrame with proper column names.

    The workbook is first read without a header to locate the first row in
    which any cell equals "Subject" (after stripping whitespace). It is then
    read again using that row as the header.

    Raises:
        ValueError: If no row contains a "Subject" cell.
    """
    preview = pd.read_excel(path, header=None)
    header_row = next(
        (
            idx
            for idx, cells in preview.iterrows()
            if any(str(cell).strip() == "Subject" for cell in cells)
        ),
        None,
    )
    if header_row is None:
        raise ValueError("Hlavičkový řádek nenalezen")
    return pd.read_excel(path, header=header_row)
|
||||
|
||||
|
||||
def simplify_cohort(val):
    """Reduce a cohort label to a short display form.

    Missing values become "". Anything containing "dolescent" becomes
    "Adolescent", anything starting with "Adult" becomes "Adult". Every other
    value (e.g. "Part 1", "Part 2") is returned unchanged as a string.
    """
    if pd.isna(val):
        return ""
    label = str(val)
    if "dolescent" in label:
        return "Adolescent"
    return "Adult" if label.startswith("Adult") else label
|
||||
|
||||
|
||||
def format_date(val):
    """Format *val* as ``YYYY-MM-DD``.

    Missing values give "". Objects with a ``strftime`` method are formatted
    with it. Anything else is converted to text and cut to its first ten
    characters.
    """
    if pd.isna(val):
        return ""
    if not hasattr(val, "strftime"):
        return str(val)[:10]
    return val.strftime("%Y-%m-%d")
|
||||
|
||||
|
||||
def write_zdroj(wb, df_raw, source_path):
    """Add a "ZDROJ (<date>)" sheet with the raw source table to *wb*.

    The date in the sheet name is the modification time of *source_path*
    (``%d%b%Y``, upper-cased). Row 1 holds the column names of *df_raw*, the
    data follows from row 2. Missing values are written as "" and values that
    have ``strftime`` as ``YYYY-MM-DD`` text.
    """
    modified = datetime.datetime.fromtimestamp(os.path.getmtime(source_path))
    stamp = modified.strftime('%d%b%Y').upper()
    ws = wb.create_sheet(f"ZDROJ ({stamp})")
    ws.sheet_view.showGridLines = True

    columns = list(df_raw.columns)

    # Header row: white bold text on dark grey, fixed column width.
    head_font = Font(name="Arial", bold=True, size=9, color="FFFFFF")
    head_fill = PatternFill("solid", fgColor="404040")
    for col_no, caption in enumerate(columns, start=1):
        head = ws.cell(row=1, column=col_no, value=caption)
        head.font = head_font
        head.fill = head_fill
        head.alignment = LEFT
        head.border = BORDER
        ws.column_dimensions[get_column_letter(col_no)].width = 20

    # Data rows with alternating background, starting in sheet row 2.
    body_font = Font(name="Arial", size=9)
    for row_no, (_, record) in enumerate(df_raw.iterrows(), start=2):
        stripe = ODD_FILL if row_no % 2 else EVEN_FILL
        for col_no, key in enumerate(columns, start=1):
            value = record[key]
            if pd.isna(value):
                value = ""
            elif hasattr(value, "strftime"):
                value = value.strftime("%Y-%m-%d")
            body = ws.cell(row=row_no, column=col_no, value=value)
            body.font = body_font
            body.fill = stripe
            body.border = BORDER
            body.alignment = LEFT

    ws.freeze_panes = "A2"
    last_column = get_column_letter(len(columns))
    ws.auto_filter.ref = f"A1:{last_column}1"
|
||||
|
||||
|
||||
def write_prehled(wb, df_raw, study):
    """Add the "Přehled" (overview) sheet to *wb*.

    Layout: row 1 is a merged title (A1:H1) with the study and today's date,
    row 2 the column headers, data starts in row 3 sorted by "Subject".

    Fonts: rows whose status contains "Screen Failed" or "Discontinued" are
    struck through. Otherwise the status cell is bold for "Randomized"
    subjects and the cohort cell is bold for "Adolescent".
    """
    ws = wb.create_sheet("Přehled")
    view = ws.sheet_view
    view.showGridLines = False
    view.showRowColHeaders = True

    # ── title row ────────────────────────────────────────────────────────────
    ws.merge_cells("A1:H1")
    banner = ws["A1"]
    banner.value = f"Subject Summary — {study} ({datetime.date.today().strftime('%d-%b-%Y')})"
    banner.font = Font(name="Arial", bold=True, size=12, color="1F4E79")
    banner.alignment = Alignment(horizontal="left", vertical="center")
    ws.row_dimensions[1].height = 22

    # ── header row ───────────────────────────────────────────────────────────
    for col_no, (caption, width) in enumerate(zip(DISPLAY_HEADERS, COL_WIDTHS), start=1):
        head = ws.cell(row=2, column=col_no, value=caption)
        head.font = HEADER_FONT
        head.fill = HEADER_FILL
        head.alignment = CENTER
        head.border = BORDER
        ws.column_dimensions[get_column_letter(col_no)].width = width
    ws.row_dimensions[2].height = 18

    # ── build display dataframe ──────────────────────────────────────────────
    display = pd.DataFrame({
        "Subject": df_raw["Subject"].fillna(""),
        "Investigator": df_raw["Investigator"].fillna(""),
        "Věk": df_raw["Subject's age collection"].apply(
            lambda v: "" if pd.isna(v) else int(v)),
        "Cohort": df_raw["Cohort per IRT"].apply(simplify_cohort),
        "Status": df_raw["IRT Subject Status"].fillna(""),
        "Last IRT": df_raw["Last Recorded IRT Transaction"].fillna("—"),
        "Next Visit": df_raw["Next Expected IRT Transaction"].fillna("—"),
        "Next Date": df_raw["Next Expected IRT Transaction Date [Local]"].apply(format_date),
    })
    display = display.sort_values("Subject").reset_index(drop=True)

    # ── data rows ────────────────────────────────────────────────────────────
    shown_columns = (
        "Subject", "Investigator", "Věk", "Cohort",
        "Status", "Last IRT", "Next Visit", "Next Date",
    )
    for position, record in display.iterrows():
        sheet_row = position + 3  # row 1 = title, row 2 = header
        status_text = str(record["Status"])
        struck_out = "Screen Failed" in status_text or "Discontinued" in status_text
        randomized = "Randomized" in status_text
        adolescent = record["Cohort"] == "Adolescent"
        stripe = ODD_FILL if position % 2 else EVEN_FILL

        for col_no, column in enumerate(shown_columns, start=1):
            val = record[column]
            cell = ws.cell(row=sheet_row, column=col_no, value=val if val != "" else None)
            cell.fill = stripe
            cell.border = BORDER
            # only the age column (3) is centred
            cell.alignment = CENTER if col_no == 3 else LEFT

            # the strike-through takes precedence over both bold highlights
            if struck_out:
                chosen = STRIKE_FONT
            elif randomized and col_no == 5:
                chosen = BOLD_FONT
            elif adolescent and col_no == 4:
                chosen = ADOLESC_FONT
            else:
                chosen = NORMAL_FONT
            cell.font = chosen

        ws.row_dimensions[sheet_row].height = 16

    ws.freeze_panes = "A3"
    last_data_row = len(display) + 2
    ws.auto_filter.ref = f"A2:H{last_data_row}"
|
||||
|
||||
|
||||
def write_next_visits(wb, df_raw, study):
    """Add the "Next Visits" sheet to *wb*.

    Lists subject, investigator, next expected transaction and its date for
    every row of *df_raw* that has a next date and whose status does not
    contain "Screen Failed" or "Discontinued". Rows are sorted by that date.
    Row 1 is a merged title (A1:D1), row 2 the headers, data starts in row 3.
    """
    ws = wb.create_sheet("Next Visits")
    ws.sheet_view.showGridLines = False

    # title
    ws.merge_cells("A1:D1")
    banner = ws["A1"]
    banner.value = f"Next Expected Visits — {study} ({datetime.date.today().strftime('%d-%b-%Y')})"
    banner.font = Font(name="Arial", bold=True, size=12, color="1F4E79")
    banner.alignment = Alignment(horizontal="left", vertical="center")
    ws.row_dimensions[1].height = 22

    # headers: (caption, column width)
    layout = (("Subject", 14), ("Investigator", 22), ("Next Visit", 26), ("Datum", 13))
    for col_no, (caption, width) in enumerate(layout, start=1):
        head = ws.cell(row=2, column=col_no, value=caption)
        head.font = HEADER_FONT
        head.fill = HEADER_FILL
        head.alignment = CENTER
        head.border = BORDER
        ws.column_dimensions[get_column_letter(col_no)].width = width
    ws.row_dimensions[2].height = 18

    # data: only rows with a next date, excluding Screen Failed / Discontinued
    upcoming = pd.DataFrame({
        "Subject": df_raw["Subject"].fillna(""),
        "Investigator": df_raw["Investigator"].fillna(""),
        "Next Visit": df_raw["Next Expected IRT Transaction"].fillna(""),
        "Datum": df_raw["Next Expected IRT Transaction Date [Local]"],
        "Status": df_raw["IRT Subject Status"].fillna(""),
    })
    upcoming = upcoming[upcoming["Datum"].notna()]
    inactive = upcoming["Status"].str.contains("Screen Failed|Discontinued", na=False)
    upcoming = upcoming[~inactive].sort_values("Datum").reset_index(drop=True)

    for position, record in upcoming.iterrows():
        sheet_row = position + 3
        stripe = ODD_FILL if position % 2 else EVEN_FILL

        when = record["Datum"]
        if hasattr(when, "strftime"):
            when_text = when.strftime("%Y-%m-%d")
        else:
            when_text = str(when)[:10]

        cells = (record["Subject"], record["Investigator"], record["Next Visit"], when_text)
        for col_no, val in enumerate(cells, start=1):
            cell = ws.cell(row=sheet_row, column=col_no, value=val if val != "" else None)
            cell.fill = stripe
            cell.border = BORDER
            cell.font = NORMAL_FONT
            cell.alignment = LEFT
        ws.row_dimensions[sheet_row].height = 16

    ws.freeze_panes = "A3"
    last_data_row = len(upcoming) + 2
    ws.auto_filter.ref = f"A2:D{last_data_row}"
|
||||
|
||||
|
||||
def create_report(study):
    """Build and save the summary workbook for *study*.

    Loads the newest source report and writes the sheets "Přehled",
    "Next Visits" and "ZDROJ (...)" in that order. The workbook is saved in
    ``CREATED_DIR`` as "<today> <study> Subject Summary.xlsx" (or the variant
    chosen by ``unique_path``).

    Returns:
        Path of the saved workbook.
    """
    source_path = find_latest_source(study)
    print(f"[{study}] Čtu: {os.path.basename(source_path)}")
    df_raw = load_source(source_path)

    wb = Workbook()
    wb.remove(wb.active)  # drop the default empty sheet

    for add_sheet in (write_prehled, write_next_visits):
        add_sheet(wb, df_raw, study)
    write_zdroj(wb, df_raw, source_path)

    stamp = datetime.date.today().strftime("%Y-%m-%d")
    out_path = unique_path(CREATED_DIR, f"{stamp} {study} Subject Summary")
    wb.save(out_path)
    print(f"[{study}] Uloženo: {out_path}")
    return out_path
|
||||
|
||||
|
||||
def main():
    """Create the report for every study in ``STUDIES``.

    A study without a source file is reported and skipped. Any other error
    propagates to the caller.
    """
    os.makedirs(CREATED_DIR, exist_ok=True)
    for code in STUDIES:
        try:
            create_report(code)
        except FileNotFoundError as missing:
            print(f"[{code}] PŘESKOČENO: {missing}")
    print("\nHotovo.")


# NOTE(review): runs on import as well as on direct execution. Left unguarded
# on purpose because a caller may rely on the import-time side effect. Confirm
# before adding an `if __name__ == "__main__":` guard.
main()
|
||||
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
Stažení reportů z IWRS portálu — vše do jednoho adresáře `Incoming/`.
|
||||
|
||||
1. Subject Summary Report (per studie)
|
||||
2. Subject Detail Reports + notifikace (per subjekt)
|
||||
|
||||
Import se spouští samostatně skriptem `import_all.py`.
|
||||
"""
|
||||
|
||||
import os
|
||||
import datetime
|
||||
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
import download_subject_details as dsd
|
||||
|
||||
# ── CONFIG ───────────────────────────────────────────────────────────────────
BASE_URL = "https://janssen.4gclinical.com"

# SECURITY: the portal credentials are committed in plain text. They can now be
# supplied through the IWRS_EMAIL / IWRS_PASSWORD environment variables. The
# literals remain only as a backward-compatible fallback.
# TODO: rotate the password and remove the fallback values.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")

# Study codes that are downloaded.
STUDIES = ["77242113UCO3001", "42847922MDD3003"]

# All downloads end up in "Incoming" next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "Incoming")
|
||||
|
||||
|
||||
def unique_path(directory, stem, ext=".xlsx"):
    """Return a path for ``<stem><ext>`` in *directory* that does not exist yet.

    The plain name is used when it is free. Otherwise the current time
    (``HHMM``) is appended, and if that name is taken as well (a second run
    within the same minute) a running number is added, so an existing file is
    never returned and therefore never overwritten.

    Args:
        directory: Target directory.
        stem: File name without extension.
        ext: Extension including the leading dot.

    Returns:
        A path inside *directory* that did not exist at the time of the call.
    """
    candidate = os.path.join(directory, f"{stem}{ext}")
    if not os.path.exists(candidate):
        return candidate

    time_tag = datetime.datetime.now().strftime("%H%M")
    candidate = os.path.join(directory, f"{stem} {time_tag}{ext}")
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem} {time_tag} ({counter}){ext}")
        counter += 1
    return candidate
|
||||
|
||||
|
||||
def login(page, study):
    """Sign in to the portal on *page* and select *study*.

    Fills the e-mail/password form, submits it, then picks the study in the
    "Study *" selector and confirms with "SELECT". After each navigation step
    the function waits until the network is idle.
    """
    def settle():
        page.wait_for_load_state("networkidle")

    page.goto(BASE_URL)
    settle()

    # credentials form
    for field_label, text in (("Email *", EMAIL), ("Password *", PASSWORD)):
        page.get_by_label(field_label).fill(text)
    page.locator("#login__submit").click()
    settle()

    # study selection
    page.get_by_label("Study *").click()
    page.get_by_role("option", name=study).click()
    page.get_by_role("button", name="SELECT").click()
    settle()
|
||||
|
||||
|
||||
def download_summary(page, study, today):
    """Download the Subject Summary Report for *study* into ``INCOMING_DIR``.

    Opens the summary report page, clicks "Download XLS" and stores the file
    as "<today> <study> Subject Summary Report.xlsx" (or the variant chosen by
    ``unique_path``). Page load and download each have a 120 s timeout.

    Returns:
        Path of the saved file.
    """
    print(f" [{study}] Stahuji Subject Summary Report...")
    page.goto(f"{BASE_URL}/report/patient_summary_report")
    page.wait_for_load_state("networkidle", timeout=120000)

    target = unique_path(INCOMING_DIR, f"{today} {study} Subject Summary Report")
    export_button = page.get_by_role("button", name="Download XLS")
    with page.expect_download(timeout=120000) as pending:
        export_button.click()
    pending.value.save_as(target)

    print(f" [{study}] Summary OK -> {os.path.basename(target)}")
    return target
|
||||
|
||||
|
||||
def main():
    """Download the summary report, detail reports and notifications per study.

    Each study gets its own browser instance, which is closed even when a step
    fails. Errors are printed and the next study is processed.
    """
    rule = "=" * 60
    today = datetime.date.today().strftime("%Y-%m-%d")
    os.makedirs(INCOMING_DIR, exist_ok=True)

    with sync_playwright() as p:
        for study in STUDIES:
            print("\n" + rule)
            print(f"[{study}] Stažení reportů")
            print(rule)

            browser = p.chromium.launch(headless=False)
            page = browser.new_context(accept_downloads=True).new_page()
            try:
                login(page, study)
                download_summary(page, study, today)
                # detail XLSX files + notifications go straight into Incoming/
                dsd.run(page, study, out_dir=INCOMING_DIR, subjects_source_dir=INCOMING_DIR)
            except Exception as err:
                print(f" [{study}] CHYBA: {err}")
            finally:
                browser.close()

    print("\n" + rule)
    print(f"Stahování hotovo. Soubory v: {INCOMING_DIR}")
    print("Pro import spusť: python import_all.py")
    print(rule)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,201 @@
|
||||
from playwright.sync_api import sync_playwright
|
||||
import os
|
||||
import glob
|
||||
import datetime
|
||||
import requests
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# ── CONFIG ──────────────────────────────────────────────────────────────────
BASE_URL = "https://janssen.4gclinical.com"

# SECURITY: the portal credentials are committed in plain text. They can now be
# supplied through the IWRS_EMAIL / IWRS_PASSWORD environment variables. The
# literals remain only as a backward-compatible fallback.
# TODO: rotate the password and remove the fallback values.
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")

# Study codes that are processed.
STUDIES = ["77242113UCO3001", "42847922MDD3003"]

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Summary reports are read from here.
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
# Notification PDFs are written below this directory, one folder per study.
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
# ────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def get_subjects(study):
    """Return the subject ids listed in today's Subject Summary Report.

    The newest matching file in ``INCOMING_DIR`` is used. The pattern also
    accepts a suffix after "Report": repeated downloads on the same day are
    stored as "... Subject Summary Report HHMM.xlsx" and were previously never
    matched. Office lock files ("~$...") are ignored.

    Args:
        study: Study code that appears in the file name.

    Returns:
        List of non-empty subject ids as stripped strings.

    Raises:
        FileNotFoundError: No report exists, or the newest one is not from
            today (its file name does not start with today's date).
        ValueError: No row of the sheet contains a "Subject" cell.
    """
    pattern = os.path.join(INCOMING_DIR, f"* {study} Subject Summary Report*.xlsx")
    files = sorted(
        [f for f in glob.glob(pattern) if not os.path.basename(f).startswith("~$")],
        key=os.path.getmtime,
        reverse=True,
    )
    if not files:
        raise FileNotFoundError(f"Nenalezen Subject Summary Report pro {study}")

    path = files[0]
    today = datetime.date.today().strftime("%Y-%m-%d")
    if not os.path.basename(path).startswith(today):
        raise FileNotFoundError(
            f"Dnešní Subject Summary Report pro {study} neexistuje — spusť nejdříve download_subject_summary.py"
        )
    print(f" Čtu subjekty z: {os.path.basename(path)}")

    # Locate the header: the first row in which any cell equals "Subject".
    raw = pd.read_excel(path, header=None)
    header_row = None
    for i, row in raw.iterrows():
        if "Subject" in [str(v).strip() for v in row]:
            header_row = i
            break
    if header_row is None:
        raise ValueError("Hlavičkový řádek nenalezen")

    df = pd.read_excel(path, header=header_row)
    subjects = df["Subject"].dropna().astype(str).str.strip().tolist()
    # A whitespace-only cell would otherwise become an empty subject id.
    return [subject for subject in subjects if subject]
|
||||
|
||||
|
||||
def get_jwt_and_api_base(page, study):
    """Return ``(jwt, api_base_url)`` for *study*.

    The token is read from the page's localStorage key "JWT.access". The API
    base comes from the first entry of ``/_/api/dispatch/app_instances/``
    whose "label" contains *study*.

    Raises:
        ValueError: If the token is missing or no instance matches the study.
    """
    token = page.evaluate("localStorage.getItem('JWT.access')")
    if not token:
        raise ValueError("JWT token nenalezen v localStorage")

    # The request is issued from inside the page via fetch().
    app_instances = page.evaluate("""async (jwt) => {
        const res = await fetch('/_/api/dispatch/app_instances/', {
            headers: { 'Authorization': `Bearer ${jwt}` }
        });
        return res.json();
    }""", token)

    matching = None
    for entry in app_instances:
        if study in entry.get("label", ""):
            matching = entry
            break
    if not matching:
        raise ValueError(f"app_instance pro studii {study} nenalezena")

    return token, matching["api_base_url"]
|
||||
|
||||
|
||||
def get_notifications(jwt, api_base, study, subject, timeout=60):
    """Fetch the notification list of one subject from the report_data API.

    Args:
        jwt: Bearer token.
        api_base: Instance path prefix appended to ``BASE_URL``.
        study: Study code sent in the request body.
        subject: Subject id.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the whole run.

    Returns:
        List of dicts with the keys "pk", "title" and "event". Entries
        without a pk or a title are skipped.

    Raises:
        requests.HTTPError: On a non-2xx response.
        requests.Timeout: When the server does not answer in time.
    """
    url = f"{BASE_URL}{api_base}/api/v1/reports_api/report_data"
    params = {
        "path": "patient_detail_report",
        "id": subject,
        "key": "table_1",
        "unblinded": "false",
    }
    payload = {
        "path": "patient_detail_report",
        "study": study,
        "id": subject,
        "key": "table_1",
        "fields": {},
        "filters": [{"tableId": "table_1", "tableFilters": {}}],
        "pagination_details": {"order": "type", "reverseOrder": False, "page": 1, "limit": 500},
        # unique value per call
        "cache_key": f"py_{subject}_{datetime.datetime.now().timestamp()}",
    }
    headers = {
        "Authorization": f"Bearer {jwt}",
        "Content-Type": "application/json",
        "lang": "en",
    }
    resp = requests.post(url, params=params, json=payload, headers=headers, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    notifications = []
    # "or" guards: a key that is present but null must not break the loops.
    for row in data.get("data") or []:
        for notif in row.get("notification") or []:
            item = notif.get("item") or {}
            pk = item.get("pk")
            title = item.get("et_title")
            if pk and title:
                notifications.append({"pk": pk, "title": title, "event": row.get("event_event_id", "")})
    return notifications
|
||||
|
||||
|
||||
def download_pdf(jwt, api_base, pk, title, out_path, timeout=60):
    """Download one notification PDF and store it at *out_path*.

    The response is written to "<out_path>.part" first and renamed afterwards,
    so an interrupted write never leaves a truncated file under the final name
    (callers skip files that already exist).

    Args:
        jwt: Bearer token.
        api_base: Instance path prefix appended to ``BASE_URL``.
        pk: Notification primary key.
        title: Notification title (``et_title``).
        out_path: Destination file path.
        timeout: Seconds to wait for the server.

    Raises:
        requests.HTTPError: On a non-2xx response.
        requests.Timeout: When the server does not answer in time.
    """
    url = f"{BASE_URL}{api_base}/api/v1/meta_api/pdfnotification"
    params = {"pk": pk, "title": title, "html": "true"}
    headers = {
        "Authorization": f"Bearer {jwt}",
        "lang": "en",
        "Accept": "*/*",
    }
    resp = requests.get(url, params=params, headers=headers, timeout=timeout)
    resp.raise_for_status()

    partial_path = f"{out_path}.part"
    with open(partial_path, "wb") as f:
        f.write(resp.content)
    os.replace(partial_path, out_path)
|
||||
|
||||
|
||||
def run(page, study):
    """Download all notification PDFs of *study* into ``DETAILS_DIR/<study>``.

    For every subject of today's summary report the notification list is
    fetched and each PDF is saved as
    "<today> <study> <subject> Notification <title> pk<pk>.pdf". Files that
    already exist are skipped. An error for one subject is printed and the
    next subject is processed.

    Args:
        page: Logged-in Playwright page with the study already selected.
        study: Study code.
    """
    out_dir = os.path.join(DETAILS_DIR, study)
    os.makedirs(out_dir, exist_ok=True)

    subjects = get_subjects(study)
    print(f" Nalezeno {len(subjects)} subjektů")
    today = datetime.date.today().strftime("%Y-%m-%d")

    # Load a report page so that the session context is valid.
    page.goto(f"{BASE_URL}/report/patient_detail_report")
    page.wait_for_load_state("networkidle", timeout=120000)

    jwt, api_base = get_jwt_and_api_base(page, study)
    print(f" API base: {api_base}")

    def current_jwt():
        # Re-read the token before every request: a token fetched once expires
        # during long runs and the API then answers 403.
        # NOTE(review): assumes the page keeps 'JWT.access' refreshed in
        # localStorage. Falls back to the initial token when it is missing.
        return page.evaluate("localStorage.getItem('JWT.access')") or jwt

    for subject in subjects:
        print(f" [{subject}] Stahuji notifikace...")
        try:
            notifications = get_notifications(current_jwt(), api_base, study, subject)
            if not notifications:
                print(f" [{subject}] Žádné notifikace")
                continue

            for notif in notifications:
                pk = notif["pk"]
                title = notif["title"]
                filename = os.path.join(out_dir, f"{today} {study} {subject} Notification {title} pk{pk}.pdf")
                if os.path.exists(filename):
                    print(f" [{subject}] {title} (pk={pk}) — již existuje, přeskakuji")
                    continue
                download_pdf(current_jwt(), api_base, pk, title, filename)
                print(f" [{subject}] {title} (pk={pk}) OK")

        except Exception as e:
            print(f" [{subject}] CHYBA při notifikacích: {e}")

    print(f" [{study}] Notifikace hotovo.")
|
||||
|
||||
|
||||
def main():
    """Log in once per study and download its notifications.

    Each study gets its own browser instance. Login now runs inside the
    ``try`` block, so a failed login no longer leaves the browser open or
    aborts the remaining studies. The error is printed and the browser is
    always closed.
    """
    os.makedirs(DETAILS_DIR, exist_ok=True)

    with sync_playwright() as p:
        for study in STUDIES:
            print(f"\n[{study}] Přihlášení...")
            browser = p.chromium.launch(headless=False)
            try:
                context = browser.new_context(accept_downloads=True)
                page = context.new_page()

                # credentials form
                page.goto(BASE_URL)
                page.wait_for_load_state("networkidle")
                page.get_by_label("Email *").fill(EMAIL)
                page.get_by_label("Password *").fill(PASSWORD)
                page.locator("#login__submit").click()
                page.wait_for_load_state("networkidle")

                # study selection
                page.get_by_label("Study *").click()
                page.get_by_role("option", name=study).click()
                page.get_by_role("button", name="SELECT").click()
                page.wait_for_load_state("networkidle")

                run(page, study)
            except Exception as e:
                print(f" [{study}] CHYBA: {e}")
            finally:
                browser.close()

    print("\nVše hotovo.")


# NOTE(review): runs on import as well as on direct execution. Left unguarded
# on purpose because a caller may rely on the import-time side effect. Confirm
# before adding an `if __name__ == "__main__":` guard.
main()
|
||||
@@ -0,0 +1,76 @@
|
||||
from playwright.sync_api import sync_playwright
|
||||
import os
|
||||
import datetime
|
||||
|
||||
# ── CONFIG ──────────────────────────────────────────────────────────────────
|
||||
# NOTE(security): the portal credentials are hard-coded in source control.
# They can be overridden through IWRS_EMAIL / IWRS_PASSWORD; rotate the
# password and remove the literal fallbacks once the environment is set up.
BASE_URL = "https://janssen.4gclinical.com"
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
|
||||
|
||||
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
|
||||
CREATED_DIR = os.path.join(BASE_DIR, "CreatedReports")
|
||||
# ────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def unique_path(directory, stem):
    """Return an ``.xlsx`` path inside *directory* that does not exist yet.

    Candidates are tried in this order: ``<stem>.xlsx``, ``<stem> HHMM.xlsx``
    (current local time), then ``<stem> HHMM_2.xlsx``, ``<stem> HHMM_3.xlsx``
    and so on, so two downloads within the same minute never overwrite each
    other.
    """
    path = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(path):
        return path

    time_tag = datetime.datetime.now().strftime("%H%M")
    candidate = os.path.join(directory, f"{stem} {time_tag}.xlsx")
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem} {time_tag}_{counter}.xlsx")
        counter += 1
    return candidate
|
||||
|
||||
|
||||
def download_study(page, study, today):
    """Log in, select *study* and save its Subject Summary Report as XLSX.

    *page* is a Playwright page; *today* is the date prefix used in the file
    name. Returns the path the report was saved to (inside ``INCOMING_DIR``).
    """
    long_wait_ms = 120000

    print(f"\n[{study}] Prihlaseni...")
    page.goto(BASE_URL)
    page.wait_for_load_state("networkidle")
    for label, value in (("Email *", EMAIL), ("Password *", PASSWORD)):
        page.get_by_label(label).fill(value)
    page.locator("#login__submit").click()
    page.wait_for_load_state("networkidle")

    print(f"[{study}] Vyber studie...")
    page.get_by_label("Study *").click()
    page.get_by_role("option", name=study).click()
    page.get_by_role("button", name="SELECT").click()
    page.wait_for_load_state("networkidle")

    print(f"[{study}] Stahuji Subject Summary Report...")
    report_url = f"{BASE_URL}/report/patient_summary_report"
    page.goto(report_url)
    page.wait_for_load_state("networkidle", timeout=long_wait_ms)

    target = unique_path(INCOMING_DIR, f"{today} {study} Subject Summary Report")
    with page.expect_download(timeout=long_wait_ms) as download_info:
        page.get_by_role("button", name="Download XLS").click()
    download_info.value.save_as(target)

    print(f"[{study}] OK -> {target}")
    return target
|
||||
|
||||
|
||||
def main():
    """Download the Subject Summary Report for every study in ``STUDIES``.

    Each study gets its own browser, which is closed even when the download
    raises. A summary of the saved files is printed at the end.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    os.makedirs(INCOMING_DIR, exist_ok=True)
    os.makedirs(CREATED_DIR, exist_ok=True)

    downloaded = []

    with sync_playwright() as p:
        for study in STUDIES:
            browser = p.chromium.launch(headless=False)
            try:
                context = browser.new_context(accept_downloads=True)
                page = context.new_page()

                filename = download_study(page, study, today)
                downloaded.append((study, filename))
            finally:
                # Errors still propagate; the browser just never leaks.
                browser.close()

    print("\nVse stazeno:")
    for study, path in downloaded:
        print(f" {study}: {path}")


# Guarded so importing this module does not trigger a download.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,107 @@
|
||||
"""
|
||||
Import všech čekajících reportů z `Incoming/` do MongoDB.
|
||||
|
||||
Pořadí zpracování per typ + studie: nejstarší soubor podle mtime první
|
||||
(důležité pro chronologickou správnost snapshotů).
|
||||
|
||||
Po úspěšném importu se soubor přesune do `Incoming/Zpracováno/`.
|
||||
Při chybě zůstane soubor v `Incoming/`.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import shutil
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from common.mongo_writer import ensure_indexes
|
||||
|
||||
import import_to_mongo
|
||||
import import_notifications_to_mongo
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
INCOMING_DIR = os.path.join(BASE_DIR, "Incoming")
|
||||
DONE_DIR = os.path.join(INCOMING_DIR, "Zpracováno")
|
||||
|
||||
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
|
||||
|
||||
|
||||
def _move_done(path):
    """Archive *path* into ``DONE_DIR``, replacing a file of the same name."""
    os.makedirs(DONE_DIR, exist_ok=True)
    archived = os.path.join(DONE_DIR, os.path.basename(path))
    # On a name collision the archived copy is overwritten: Mongo already
    # holds the current data, the file is only kept as an archive.
    already_archived = os.path.exists(archived)
    if already_archived:
        os.remove(archived)
    shutil.move(path, archived)
|
||||
|
||||
|
||||
def _sorted_by_mtime(paths):
|
||||
"""Nejstarší první."""
|
||||
return sorted(
|
||||
(p for p in paths if not os.path.basename(p).startswith("~$")),
|
||||
key=os.path.getmtime,
|
||||
)
|
||||
|
||||
|
||||
def import_summaries(study):
    """Import every pending Subject Summary Report of *study*, oldest first.

    A file is moved to the done directory only after its import succeeded;
    on error it is reported and left in place.
    """
    glob_pattern = os.path.join(INCOMING_DIR, f"* {study} Subject Summary Report*.xlsx")
    pending = _sorted_by_mtime(glob.glob(glob_pattern))

    if len(pending) == 0:
        print(f" [{study}] summary: nic ke zpracování")
        return

    print(f" [{study}] summary: {len(pending)} soubor(ů) (oldest first)")
    for report_path in pending:
        try:
            import_to_mongo.import_subject_summary(study, report_path)
            _move_done(report_path)
        except Exception as e:
            print(f" [{study}] CHYBA summary {os.path.basename(report_path)}: {e}")
|
||||
|
||||
|
||||
def import_details(study):
    """Import every pending Subject Detail file of *study*, oldest first.

    Files whose name cannot be parsed are reported and skipped; files whose
    parsed study differs from *study* are skipped silently. Successfully
    imported files are moved to the done directory.
    """
    glob_pattern = os.path.join(INCOMING_DIR, f"* {study} * Subject Detail.xlsx")
    pending = _sorted_by_mtime(glob.glob(glob_pattern))

    if len(pending) == 0:
        print(f" [{study}] detail: nic ke zpracování")
        return

    print(f" [{study}] detail: {len(pending)} soubor(ů) (oldest first)")
    for detail_path in pending:
        base_name = os.path.basename(detail_path)
        parsed = import_to_mongo.parse_detail_filename(detail_path)
        if not parsed:
            print(f" [{study}] PŘESKAKUJI (nelze parsovat název): {base_name}")
            continue

        _, parsed_study, subject = parsed
        if parsed_study != study:
            # The file belongs to another study.
            continue

        try:
            import_to_mongo.import_visits_single_file(study, subject, detail_path)
            _move_done(detail_path)
        except Exception as e:
            print(f" [{study}] CHYBA detail {base_name}: {e}")
|
||||
|
||||
|
||||
def main():
    """Import all pending summaries, details and notifications into Mongo."""
    if not os.path.isdir(INCOMING_DIR):
        print(f"Adresář neexistuje: {INCOMING_DIR}")
        return

    ensure_indexes()
    rule = "=" * 60

    print(rule)
    print("Import Subject Summary + Visits")
    print(rule)
    for study in STUDIES:
        import_summaries(study)
        import_details(study)

    print("\n" + rule)
    print("Import notifikací")
    print(rule)
    import_notifications_to_mongo.import_from_dir(INCOMING_DIR, DONE_DIR, STUDIES)

    print("\n" + rule)
    print(f"Hotovo. Zpracované soubory: {DONE_DIR}")
    print(rule)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,453 @@
|
||||
"""
|
||||
Importuje data z IWRS Excel reportů do MySQL (databáze studie).
|
||||
|
||||
Pořadí spuštění:
|
||||
1. download_subject_summary.py
|
||||
2. download_subject_details.py
|
||||
3. tento skript
|
||||
|
||||
Každé spuštění vytvoří nový import_id v iwrs_import.
|
||||
Reportovací skripty pracují vždy s MAX(import_id) pro danou studii.
|
||||
"""
|
||||
|
||||
import os
|
||||
import glob
|
||||
import datetime
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import mysql.connector
|
||||
|
||||
import db_config
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
|
||||
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
|
||||
|
||||
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
|
||||
|
||||
|
||||
# ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_conn():
    """Open a new MySQL connection using the settings from ``db_config``."""
    settings = {
        "host": db_config.DB_HOST,
        "port": db_config.DB_PORT,
        "user": db_config.DB_USER,
        "password": db_config.DB_PASSWORD,
        "database": db_config.DB_NAME,
    }
    return mysql.connector.connect(**settings)
|
||||
|
||||
|
||||
def _py(val):
|
||||
"""Převede numpy skalár na Python nativní typ."""
|
||||
if isinstance(val, np.generic):
|
||||
return val.item()
|
||||
return val
|
||||
|
||||
|
||||
def to_date(val):
    """Convert a Timestamp / datetime / date / string to ``date``, else ``None``.

    ``None``, NaN and NaT yield ``None``. Strings are tried against a fixed
    list of formats; unparseable text also yields ``None``.
    """
    val = _py(val)
    if val is None:
        return None
    try:
        # Covers float NaN and pandas NaT in one check.
        if pd.isna(val):
            return None
    except (TypeError, ValueError):
        # pd.isna on array-likes has no single truth value; fall through.
        pass

    # pd.Timestamp subclasses datetime.datetime, so this handles both.
    if isinstance(val, datetime.datetime):
        return val.date()
    if isinstance(val, datetime.date):
        return val

    text = str(val).strip()
    if text.lower() in ("", "nat", "nan", "none"):
        return None

    known_formats = ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S")
    for fmt in known_formats:
        try:
            parsed = datetime.datetime.strptime(text, fmt)
        except ValueError:
            continue
        return parsed.date()
    return None
|
||||
|
||||
|
||||
def to_int(val):
    """Convert *val* to ``int`` (via ``float``), or return ``None``.

    ``None`` is returned for NaN, for values that are not numeric, and for
    infinities / values too large for a float (``OverflowError``), which
    previously escaped as an exception.
    """
    val = _py(val)
    try:
        number = float(val)
        if number != number:  # only NaN is unequal to itself
            return None
        return int(number)
    except (TypeError, ValueError, OverflowError):
        return None
|
||||
|
||||
|
||||
def to_float(val):
    """Convert *val* to ``float``; NaN and non-numeric input give ``None``."""
    try:
        number = float(_py(val))
    except (TypeError, ValueError):
        return None
    if number != number:  # NaN
        return None
    return number
|
||||
|
||||
|
||||
def to_str(val):
    """Return *val* as a stripped string, or ``None`` for missing values.

    ``None``, float NaN and the texts "nan", "nat", "none" and "" (compared
    case-insensitively) all count as missing.
    """
    val = _py(val)
    is_nan = isinstance(val, float) and val != val
    if val is None or is_nan:
        return None
    text = str(val).strip()
    if text.lower() in ("nan", "nat", "none", ""):
        return None
    return text
|
||||
|
||||
|
||||
def find_summary_file(study):
    """Return the most recently modified Subject Summary Report of *study*.

    The glob also accepts a suffix after "Report" so that time-tagged files
    written on a same-day re-download (``... Report HHMM.xlsx``) are found;
    Excel lock files (``~$...``) are ignored. A warning is printed when the
    newest file is not dated today.

    Raises:
        FileNotFoundError: no matching report exists in ``INCOMING_DIR``.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    pattern = os.path.join(INCOMING_DIR, f"* {study} Subject Summary Report*.xlsx")
    files = sorted(
        (f for f in glob.glob(pattern) if not os.path.basename(f).startswith("~$")),
        key=os.path.getmtime,
        reverse=True,
    )
    if not files:
        raise FileNotFoundError(f"Nenalezen Subject Summary Report pro {study}")

    newest = files[0]
    newest_name = os.path.basename(newest)
    if not newest_name.startswith(today):
        print(f" UPOZORNĚNÍ: nejnovější Summary Report pro {study} není z dnešního dne ({newest_name[:10]})")
    return newest
|
||||
|
||||
|
||||
def read_summary_df(path):
    """Read a Summary xlsx and return a DataFrame starting at its header row.

    The header row is the first row containing a cell equal to "Subject"
    (after stripping). Raises ``ValueError`` when no such row exists.
    """
    raw = pd.read_excel(path, header=None)
    header_row = next(
        (idx for idx, cells in raw.iterrows()
         if "Subject" in [str(cell).strip() for cell in cells]),
        None,
    )
    if header_row is None:
        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")
    # Re-read so pandas uses that row as the column names.
    return pd.read_excel(path, header=header_row)
|
||||
|
||||
|
||||
def find_detail_files(study):
    """Return the sorted Subject Detail files of *study* for the summary's day.

    The date is taken from the first ten characters ("YYYY-MM-DD") of the
    newest Summary Report's file name; Excel lock files are excluded.
    """
    summary_name = os.path.basename(find_summary_file(study))
    file_date = summary_name[:10]
    pattern = os.path.join(
        DETAILS_DIR, study, f"{file_date} {study} * Subject Detail.xlsx"
    )
    return sorted(
        f for f in glob.glob(pattern) if not os.path.basename(f).startswith("~$")
    )
|
||||
|
||||
|
||||
def parse_detail_visits(path):
    """Return the visit rows of a Detail xlsx as a list of dicts.

    Reads sheet "patient_detail_report", locates the row containing
    "Visit Type" and converts every following row whose first cell is
    "Past" or "Upcoming". Columns are addressed by position. Returns an
    empty list when the header row is missing.
    """
    sheet = pd.read_excel(path, sheet_name="patient_detail_report", header=None)

    header_row = next(
        (idx for idx, cells in sheet.iterrows()
         if "Visit Type" in [str(cell).strip() for cell in cells]),
        None,
    )
    if header_row is None:
        return []

    body = sheet.iloc[header_row + 1:].copy()
    body.columns = range(body.shape[1])

    # (dict key, column position, converter) for columns 1..8.
    field_spec = (
        ("scheduled_date", 1, to_date),
        ("window_days", 2, to_str),
        ("actual_date", 3, to_date),
        ("irt_transaction_no", 4, to_int),
        ("irt_transaction_description", 5, to_str),
        ("medication_assignment", 6, to_str),
        ("quantity_assigned", 7, to_int),
        ("medication_id", 8, to_str),
    )

    visits = []
    for _, record in body.iterrows():
        visit_type = to_str(record.get(0))
        if visit_type not in ("Past", "Upcoming"):
            continue
        visit = {"visit_type": visit_type}
        for key, position, convert in field_spec:
            visit[key] = convert(record.get(position))
        visits.append(visit)
    return visits
|
||||
|
||||
|
||||
# ── insert helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
def insert_import(cursor, study, source_file):
    """Insert one ``iwrs_import`` row and return its generated id.

    The row stores *study*, the current local time and the base name of
    *source_file*.
    """
    params = (study, datetime.datetime.now(), os.path.basename(source_file))
    cursor.execute(
        "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)",
        params,
    )
    return cursor.lastrowid
|
||||
|
||||
|
||||
def insert_uco3001_summary(cursor, import_id, df):
    """Insert every row of the UCO3001 summary DataFrame under *import_id*.

    Required report columns raise ``KeyError`` when absent; optional ones
    are stored as NULL when the report does not contain them.
    """
    sql = """
        INSERT INTO iwrs_uco3001_subject_summary (
            import_id, subject, prior_subject_identifier, site, investigator, location,
            cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight,
            rescreened_subject, adt_ir, three_or_more_advanced_therapies,
            only_oral_5asa_compounds, ustekinumab, isolated_proctitis,
            clinical_responder_status_i12_m0, irt_subject_status,
            i0_rand_date_local, last_irt_transaction,
            last_irt_transaction_date_local, last_irt_transaction_date_utc,
            next_irt_transaction, next_irt_transaction_date_local,
            most_recent_med_assignment_date, days_since_last_med_assignment,
            patient_forecast_status, patient_forecast_status_changed_date
        ) VALUES (
            %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
        )
    """
    # (report column, converter, required) in the order of the SQL columns
    # that follow import_id.
    spec = (
        ("Subject", to_str, True),
        ("Prior Subject Identifier", to_str, False),
        ("Site", to_str, True),
        ("Investigator", to_str, True),
        ("Location", to_str, True),
        ("Cohort per IRT", to_str, True),
        ("Informed Consent Date", to_date, True),
        ("Adolescent Assent Date", to_date, False),
        ("Subject's age collection", to_int, True),
        ("Subject's weight collection", to_float, False),
        ("Rescreened Subject", to_str, False),
        ("ADT-IR", to_str, False),
        ("3 or More Advanced Therapies", to_str, False),
        ("Only Oral 5-ASA Compounds", to_str, False),
        ("Ustekinumab", to_str, False),
        ("Isolated Proctitis", to_str, False),
        ("Clinical Responder Status at I-12 / M-0", to_str, False),
        ("IRT Subject Status", to_str, True),
        ("I0_RAND_TIMESTAMP_LOCAL [Local]", to_date, False),
        ("Last Recorded IRT Transaction", to_str, True),
        ("Last Recorded IRT Transaction Date [Local]", to_date, True),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, True),
        ("Next Expected IRT Transaction", to_str, True),
        ("Next Expected IRT Transaction Date [Local]", to_date, True),
        ("Most Recent Medication Assignment Transaction [Local]", to_date, False),
        ("Days Since Last Medication Assignment Transaction", to_int, False),
        ("Patient Forecast Status", to_str, False),
        ("Patient Forecast Status Changed Date (UTC)", to_date, False),
    )
    present = df.columns.tolist()

    for _, record in df.iterrows():
        values = [import_id]
        for column, convert, required in spec:
            if required or column in present:
                values.append(convert(record[column]))
            else:
                values.append(None)
        cursor.execute(sql, tuple(values))
|
||||
|
||||
|
||||
def insert_mdd3003_summary(cursor, import_id, df):
    """Insert every row of the MDD3003 summary DataFrame under *import_id*.

    Required report columns raise ``KeyError`` when absent; optional ones
    are stored as NULL when the report does not contain them.
    """
    sql = """
        INSERT INTO iwrs_mdd3003_subject_summary (
            import_id, subject, prior_subject_identifier, site, investigator, location,
            cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age,
            madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17,
            stratification_country, age_group, stable_remitters, irt_subject_status,
            last_irt_transaction, last_irt_transaction_date_local,
            last_irt_transaction_date_utc, next_irt_transaction,
            next_irt_transaction_date_local, date_screened, date_screen_failed,
            date_randomized_part1, date_early_withdraw_randomized_part1,
            date_open_label_induction, date_early_withdraw_open_label_induction,
            date_randomized_part2, date_early_withdraw_randomized_part2,
            date_completed, date_unblinded
        ) VALUES (
            %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
        )
    """
    # (report column, converter, required) in the order of the SQL columns
    # that follow import_id.
    spec = (
        ("Subject", to_str, True),
        ("Prior Subject Identifier", to_str, False),
        ("Site", to_str, True),
        ("Investigator", to_str, True),
        ("Location", to_str, True),
        ("Cohort per IRT", to_str, True),
        ("MADRS response criteria integrated or manually entered", to_str, False),
        ("Informed Consent Date", to_date, True),
        ("Subject's age collection", to_int, True),
        ("MADRS response criteria v1.5 from RAVE", to_str, False),
        ("MADRS response criteria v1.6 from RAVE", to_str, False),
        ("MADRS response criteria v1.7 from RAVE", to_str, False),
        ("Stratification Country", to_str, False),
        ("Age Group", to_str, False),
        ("Stable Remitters vs. Non Stable Remitters", to_str, False),
        ("IRT Subject Status", to_str, True),
        ("Last Recorded IRT Transaction", to_str, True),
        ("Last Recorded IRT Transaction Date [Local]", to_date, True),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, True),
        ("Next Expected IRT Transaction", to_str, True),
        ("Next Expected IRT Transaction Date [Local]", to_date, True),
        ("Date Screened [Local]", to_date, False),
        ("Date Screen Failed [Local]", to_date, False),
        ("Date Randomized Part 1 [Local]", to_date, False),
        ("Date Early Withdraw Randomized Part 1 [Local]", to_date, False),
        ("Date Open Label Induction [Local]", to_date, False),
        ("Date Early Withdraw Open Label Induction [Local]", to_date, False),
        ("Date Randomized Part 2 [Local]", to_date, False),
        ("Date Early Withdraw Randomized Part 2 [Local]", to_date, False),
        ("Date Completed [Local]", to_date, False),
        ("Date Unblinded [Local]", to_date, False),
    )
    present = df.columns.tolist()

    for _, record in df.iterrows():
        values = [import_id]
        for column, convert, required in spec:
            if required or column in present:
                values.append(convert(record[column]))
            else:
                values.append(None)
        cursor.execute(sql, tuple(values))
|
||||
|
||||
|
||||
def insert_visits(cursor, import_id, study, subject, visits):
    """Insert one ``iwrs_subject_visits`` row per entry of *visits*.

    Each entry is a dict with the visit fields listed below; nothing is
    executed when *visits* is empty.
    """
    if not visits:
        return

    sql = """
        INSERT INTO iwrs_subject_visits (
            import_id, study, subject, visit_type, scheduled_date, window_days,
            actual_date, irt_transaction_no, irt_transaction_description,
            medication_assignment, quantity_assigned, medication_id
        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    visit_fields = (
        "visit_type", "scheduled_date", "window_days",
        "actual_date", "irt_transaction_no",
        "irt_transaction_description", "medication_assignment",
        "quantity_assigned", "medication_id",
    )
    for visit in visits:
        row = (import_id, study, subject) + tuple(visit[name] for name in visit_fields)
        cursor.execute(sql, row)
|
||||
|
||||
|
||||
# ── notifications ─────────────────────────────────────────────────────────────
|
||||
|
||||
def find_notification_json_files(study):
    """Return the sorted ``.json`` files in the details directory of *study*."""
    json_pattern = os.path.join(DETAILS_DIR, study, "*.json")
    matches = glob.glob(json_pattern)
    matches.sort()
    return matches
|
||||
|
||||
|
||||
def import_notifications(conn, study):
    """Import notification JSON (+ optional PDF) files of *study* into MySQL.

    For each ``*.json`` file the metadata and the PDF of the same base name
    (when present) are upserted into ``iwrs_notifications``. The row is
    committed BEFORE the files are moved to the "Zpracováno" sub-directory,
    so a failed commit can never leave archived files without a DB row.
    A failure on one file is reported and does not stop the others.

    Returns the number of notifications stored.
    """
    import json as json_lib
    import shutil

    json_files = find_notification_json_files(study)
    if not json_files:
        print(f" Žádné notifikace k importu pro {study}")
        return 0

    sql = """
        INSERT INTO iwrs_notifications
            (study, subject, pk, title, label, event, actual_date, text, pdf, source_file)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            label = VALUES(label),
            text = VALUES(text),
            pdf = VALUES(pdf),
            source_file = VALUES(source_file)
    """

    done_dir = os.path.join(DETAILS_DIR, study, "Zpracováno")
    os.makedirs(done_dir, exist_ok=True)

    def _archive(src):
        """Move *src* into done_dir, replacing an older file of the same name."""
        dst = os.path.join(done_dir, os.path.basename(src))
        if os.path.exists(dst):
            os.remove(dst)
        shutil.move(src, dst)

    count = 0
    cursor = conn.cursor()
    try:
        for json_path in json_files:
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    meta = json_lib.load(f)

                # Swap only the extension; str.replace would also rewrite
                # ".json" occurring elsewhere in the path.
                pdf_path = os.path.splitext(json_path)[0] + ".pdf"
                pdf_data = None
                if os.path.exists(pdf_path):
                    with open(pdf_path, "rb") as f:
                        pdf_data = f.read()

                cursor.execute(sql, (
                    meta.get("study", study),
                    meta.get("subject"),
                    meta.get("pk"),
                    meta.get("title"),
                    meta.get("label"),
                    meta.get("event"),
                    to_date(meta.get("actual_date")),
                    meta.get("text"),
                    pdf_data,
                    os.path.basename(json_path),
                ))
                conn.commit()
                count += 1

                # Archive only after the row is durable.
                _archive(json_path)
                if os.path.exists(pdf_path):
                    _archive(pdf_path)
            except Exception as e:
                print(f" CHYBA při importu {os.path.basename(json_path)}: {e}")
    finally:
        cursor.close()

    print(f" Notifikací uloženo/přesunuto: {count}")
    return count
|
||||
|
||||
|
||||
# ── main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def import_study(conn, study):
    """Import the newest summary and its detail files of *study* as one import.

    Creates a new ``iwrs_import`` row, inserts the summary rows (UCO3001 or
    MDD3003 layout) and the visits of every detail file, then commits.
    On any error the transaction is rolled back before the exception is
    re-raised, so a later commit on the same connection cannot persist a
    partial import. (Rollback assumes transactional tables — TODO confirm
    the tables use InnoDB.)

    Returns the new ``import_id``.
    """
    summary_path = find_summary_file(study)
    print(f" Summary: {os.path.basename(summary_path)}")

    df_summary = read_summary_df(summary_path).dropna(how="all")

    detail_files = find_detail_files(study)
    print(f" Detail souborů: {len(detail_files)}")

    # File name: "2026-05-04 77242113UCO3001 CZ100012001 Subject Detail.xlsx"
    subject_re = re.compile(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx")

    cursor = conn.cursor()
    try:
        import_id = insert_import(cursor, study, summary_path)
        print(f" import_id = {import_id}")

        if study == "77242113UCO3001":
            insert_uco3001_summary(cursor, import_id, df_summary)
        else:
            insert_mdd3003_summary(cursor, import_id, df_summary)
        print(f" Summary řádků: {len(df_summary)}")

        visited = 0
        for path in detail_files:
            m = subject_re.search(os.path.basename(path))
            subject = m.group(1) if m else "UNKNOWN"
            visits = parse_detail_visits(path)
            insert_visits(cursor, import_id, study, subject, visits)
            visited += len(visits)

        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        cursor.close()

    print(f" Transakce uloženo: {visited}")
    return import_id
|
||||
|
||||
|
||||
def main():
    """Import summaries, visits and notifications of every study into MySQL.

    Errors are reported per study and per step; the connection is closed
    even when something unexpected escapes.
    """
    conn = get_conn()
    print("Připojeno k MySQL.\n")

    try:
        for study in STUDIES:
            print(f"[{study}]")
            try:
                import_id = import_study(conn, study)
                print(f" OK — import_id {import_id}")
            except Exception as e:
                print(f" CHYBA: {e}")
            try:
                import_notifications(conn, study)
            except Exception as e:
                print(f" CHYBA notifikace: {e}")
            print()
    finally:
        conn.close()

    print("Hotovo.")


# Guarded so importing this module does not start a database import.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,175 @@
|
||||
"""
|
||||
Kompletní pipeline:
|
||||
1. Stažení Subject Summary Reportů (obě studie)
|
||||
2. Stažení Subject Detail Reportů + notifikací (obě studie)
|
||||
3. Import do MongoDB (subject_summary + visits + notifications)
|
||||
|
||||
Spusť tento skript místo samostatných skriptů.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import datetime
|
||||
import glob
|
||||
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
import download_subject_details as dsd
|
||||
import import_to_mongo
|
||||
import import_notifications_to_mongo
|
||||
|
||||
# ── CONFIG ───────────────────────────────────────────────────────────────────
|
||||
# NOTE(security): the portal credentials are hard-coded in source control.
# They can be overridden through IWRS_EMAIL / IWRS_PASSWORD; rotate the
# password and remove the literal fallbacks once the environment is set up.
BASE_URL = "https://janssen.4gclinical.com"
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
|
||||
|
||||
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
|
||||
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
|
||||
|
||||
|
||||
# ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def unique_path(directory, stem):
    """Return an ``.xlsx`` path inside *directory* that does not exist yet.

    Candidates are tried in this order: ``<stem>.xlsx``, ``<stem> HHMM.xlsx``
    (current local time), then ``<stem> HHMM_2.xlsx``, ``<stem> HHMM_3.xlsx``
    and so on, so two downloads within the same minute never overwrite each
    other.
    """
    path = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(path):
        return path

    time_tag = datetime.datetime.now().strftime("%H%M")
    candidate = os.path.join(directory, f"{stem} {time_tag}.xlsx")
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem} {time_tag}_{counter}.xlsx")
        counter += 1
    return candidate
|
||||
|
||||
|
||||
def login(page, study):
    """Sign in on the portal with the configured account and select *study*."""
    idle = "networkidle"

    # Credentials form.
    page.goto(BASE_URL)
    page.wait_for_load_state(idle)
    for label, value in (("Email *", EMAIL), ("Password *", PASSWORD)):
        page.get_by_label(label).fill(value)
    page.locator("#login__submit").click()
    page.wait_for_load_state(idle)

    # Study picker.
    page.get_by_label("Study *").click()
    page.get_by_role("option", name=study).click()
    page.get_by_role("button", name="SELECT").click()
    page.wait_for_load_state(idle)
|
||||
|
||||
|
||||
# ── KROK 1: Subject Summary ───────────────────────────────────────────────────
|
||||
|
||||
def download_summary(page, study, today):
    """Download the Subject Summary Report of *study* and return its path.

    Expects *page* to be logged in with the study already selected. The file
    is saved in ``INCOMING_DIR`` with *today* as the date prefix.
    """
    long_wait_ms = 120000

    print(f" [{study}] Stahuji Subject Summary Report...")
    report_url = f"{BASE_URL}/report/patient_summary_report"
    page.goto(report_url)
    page.wait_for_load_state("networkidle", timeout=long_wait_ms)

    target = unique_path(INCOMING_DIR, f"{today} {study} Subject Summary Report")
    with page.expect_download(timeout=long_wait_ms) as download_info:
        page.get_by_role("button", name="Download XLS").click()
    download_info.value.save_as(target)

    print(f" [{study}] Summary OK -> {os.path.basename(target)}")
    return target
|
||||
|
||||
|
||||
# ── KROK 2: Subject Details ───────────────────────────────────────────────────
|
||||
|
||||
def get_subjects_from_summary(summary_path):
    """Return the stripped, non-empty "Subject" values of a Summary xlsx.

    The header row is the first row containing a cell equal to "Subject";
    ``ValueError`` is raised when there is none.
    """
    import pandas as pd

    raw = pd.read_excel(summary_path, header=None)
    header_row = next(
        (idx for idx, cells in raw.iterrows()
         if "Subject" in [str(cell).strip() for cell in cells]),
        None,
    )
    if header_row is None:
        raise ValueError("Hlavičkový řádek nenalezen")

    table = pd.read_excel(summary_path, header=header_row)
    subjects = table["Subject"].dropna().astype(str).str.strip()
    return subjects.tolist()
|
||||
|
||||
|
||||
def download_details(page, study, summary_path, today):
    """Download the Subject Detail xlsx of every subject listed in the summary.

    Subjects are read from *summary_path*; each report is saved as
    ``<today> <study> <subject> Subject Detail.xlsx`` in the study's details
    directory. The search field is cleared after every download.
    """
    long_wait_ms = 120000
    search_selector = 'input[placeholder="search"], input[type="text"]'

    out_dir = os.path.join(DETAILS_DIR, study)
    os.makedirs(out_dir, exist_ok=True)

    subjects = get_subjects_from_summary(summary_path)
    print(f" [{study}] Subjektů k stažení: {len(subjects)}")

    page.goto(f"{BASE_URL}/report/patient_detail_report")
    page.wait_for_load_state("networkidle", timeout=long_wait_ms)

    for subject in subjects:
        target = os.path.join(out_dir, f"{today} {study} {subject} Subject Detail.xlsx")

        # Type the subject id and pick the first suggestion.
        search_box = page.locator(search_selector).first
        search_box.click()
        search_box.fill(subject)
        page.wait_for_timeout(500)
        page.locator("mat-option").first.dispatch_event("click")
        page.wait_for_load_state("networkidle", timeout=long_wait_ms)

        with page.expect_download(timeout=long_wait_ms) as download_info:
            page.get_by_role("button", name="Download XLS").click()
        download_info.value.save_as(target)
        print(f" [{study}] Detail {subject} OK")

        page.get_by_role("button", name="Clear").click()
        page.wait_for_load_state("networkidle", timeout=long_wait_ms)
|
||||
|
||||
|
||||
# ── KROK 3: Import do MongoDB ────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Run the whole pipeline: download reports, then import them into Mongo.

    Steps 1+2 (summary, details + notifications) run in a separate browser
    per study; the browser is always closed. A study whose download failed
    is skipped in step 3. Notifications are imported once for all studies.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    os.makedirs(INCOMING_DIR, exist_ok=True)
    os.makedirs(DETAILS_DIR, exist_ok=True)

    summary_paths = {}

    # Steps 1 + 2: downloads (one browser per study because of the session).
    with sync_playwright() as p:
        for study in STUDIES:
            print("\n" + "=" * 60)
            print(f"[{study}] KROK 1: Subject Summary Report")
            print("=" * 60)
            browser = p.chromium.launch(headless=False)

            try:
                # Created inside the try so the browser is closed even if
                # context/page creation fails.
                context = browser.new_context(accept_downloads=True)
                page = context.new_page()

                login(page, study)
                summary_path = download_summary(page, study, today)
                summary_paths[study] = summary_path

                print(f"\n[{study}] KROK 2: Subject Detail Reports + notifikace")
                dsd.run(page, study)

            except Exception as e:
                print(f" [{study}] CHYBA při stahování: {e}")
                summary_paths[study] = None
            finally:
                browser.close()

    # Step 3: import into MongoDB.
    print("\n" + "=" * 60)
    print("KROK 3: Import do MongoDB")
    print("=" * 60)

    for study in STUDIES:
        summary_path = summary_paths.get(study)
        if not summary_path:
            print(f" [{study}] PŘESKOČENO — stahování selhalo")
            continue

        try:
            import_to_mongo.run(study, summary_path, DETAILS_DIR, today)
        except Exception as e:
            print(f" [{study}] CHYBA při importu summary/visits: {e}")

    # Notifications: PDF/JSON from disk straight into Mongo iwrs_notifications.
    print("\n [notifikace] import PDF/JSON do Mongo...")
    try:
        import_notifications_to_mongo.main(STUDIES)
    except Exception as e:
        print(f" CHYBA při importu notifikací: {e}")

    print("\n" + "=" * 60)
    print("Vše hotovo.")
    print("=" * 60)


# Guarded so importing this module does not start the pipeline.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,172 @@
|
||||
from playwright.sync_api import sync_playwright
|
||||
import re
|
||||
import os
|
||||
import datetime
|
||||
import mysql.connector
|
||||
import db_config
|
||||
|
||||
|
||||
def get_existing_pks(study):
    """Return the set of notification ``pk`` values stored in the DB for *study*.

    Best effort: on any database error a warning is printed and an empty set
    is returned, which makes the caller download everything. The connection
    and cursor are closed even when the query fails.
    """
    try:
        conn = mysql.connector.connect(
            host=db_config.DB_HOST, port=db_config.DB_PORT,
            user=db_config.DB_USER, password=db_config.DB_PASSWORD,
            database=db_config.DB_NAME,
        )
        try:
            cursor = conn.cursor()
            try:
                cursor.execute("SELECT pk FROM iwrs_notifications WHERE study = %s", (study,))
                return {row[0] for row in cursor.fetchall()}
            finally:
                cursor.close()
        finally:
            conn.close()
    except Exception as e:
        print(f" UPOZORNĚNÍ: nelze načíst existující pk z DB ({e}), stahuji vše")
        return set()
|
||||
|
||||
# NOTE(security): the portal credentials are hard-coded in source control.
# They can be overridden through IWRS_EMAIL / IWRS_PASSWORD; rotate the
# password and remove the literal fallbacks once the environment is set up.
BASE_URL = "https://janssen.4gclinical.com"
EMAIL = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")
|
||||
|
||||
STUDY = "77242113UCO3001"
|
||||
SUBJECT = "CZ100222003"
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")
|
||||
|
||||
|
||||
def strip_html(html):
    """Convert an HTML fragment to plain text.

    ``<br>`` tags (any case, optional whitespace and slash) become newlines,
    all other tags are removed, runs of three or more newlines are collapsed
    to two, and surrounding whitespace is trimmed.

    Args:
        html: HTML string; ``None`` or an empty string is accepted.

    Returns:
        The plain-text version, or ``""`` when there is no input.
    """
    # The API may deliver a null body; treat it like an empty one instead of
    # letting re.sub raise TypeError.
    if not html:
        return ""
    text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
|
||||
|
||||
|
||||
def main():
    """Download new notifications (PDF + JSON) for the configured subject.

    Steps:
      1. Load the pks already stored in the DB for STUDY.
      2. Log in through the browser UI, select STUDY and open the patient
         detail report.
      3. Read the JWT from localStorage and resolve the study's API base path
         from the ``app_instances`` endpoint.
      4. Select SUBJECT in the report and capture the ``table_1`` response.
      5. Print every notification; for each pk not yet in the DB download the
         PDF and write a JSON sidecar next to it in ``DETAILS_DIR/STUDY``.

    Raises:
        ValueError: if no JWT is available after login, or if no app instance
            matches STUDY.
    """
    # Local imports: these names are not provided by the file's import header.
    import json
    from urllib.parse import quote, urlencode

    existing_pks = get_existing_pks(STUDY)
    print(f"V DB již existuje {len(existing_pks)} notifikací pro {STUDY}")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False, args=["--start-maximized"])
        try:
            context = browser.new_context(no_viewport=True)
            page = context.new_page()

            # --- Login ---
            print("Přihlašuji se...")
            page.goto(BASE_URL)
            page.wait_for_load_state("networkidle")
            page.get_by_label("Email *").fill(EMAIL)
            page.get_by_label("Password *").fill(PASSWORD)
            page.locator("#login__submit").click()
            page.wait_for_load_state("networkidle")

            # --- Study selection ---
            page.get_by_label("Study *").click()
            page.get_by_role("option", name=STUDY).click()
            page.get_by_role("button", name="SELECT").click()
            page.wait_for_load_state("networkidle")

            page.goto(f"{BASE_URL}/report/patient_detail_report")
            page.wait_for_load_state("networkidle", timeout=60000)

            # --- JWT + api_base ---
            jwt = page.evaluate("localStorage.getItem('JWT.access')")
            if not jwt:
                # Fail with a clear message instead of a TypeError on slicing None.
                raise ValueError("JWT nenalezen v localStorage (přihlášení selhalo?)")
            print(f"JWT: {jwt[:30]}...")
            instances = page.evaluate("""async (jwt) => {
                const res = await fetch('/_/api/dispatch/app_instances/', {
                    headers: { 'Authorization': `Bearer ${jwt}` }
                });
                return res.json();
            }""", jwt)
            instance = next((i for i in instances if STUDY in i.get("label", "")), None)
            if not instance:
                raise ValueError(f"Instance pro {STUDY} nenalezena")
            api_base = instance["api_base_url"]
            print(f"API base: {api_base}")

            # --- Select the subject and capture the table_1 response ---
            print(f"Vybírám subjekt {SUBJECT}...")
            input_field = page.locator('input[placeholder="search"], input[type="text"]').first
            input_field.click()
            input_field.fill(SUBJECT)
            page.wait_for_timeout(1000)

            with page.expect_response(
                lambda r: "report_data" in r.url and "table_1" in r.url,
                timeout=60000
            ) as resp_info:
                page.locator("mat-option").first.dispatch_event("click")

            data = resp_info.value.json()

            out_dir = os.path.join(DETAILS_DIR, STUDY)
            os.makedirs(out_dir, exist_ok=True)

            print(f"\n{'='*60}")
            print(f"Subjekt: {SUBJECT} | Studie: {STUDY}")
            print(f"{'='*60}")

            count = 0
            for row in data.get("data", []):
                event_id = row.get("event_event_id")
                # Fall back to the placeholder also when the key is present but null.
                actual_date = row.get("actual_date_raw") or "0000-00-00"

                for notif in (row.get("notification") or []):
                    item = notif.get("item") or {}
                    pk = item.get("pk")
                    title = item.get("et_title")
                    label = (notif.get("label") or title or "").strip()
                    # Full label, spaces -> underscores, characters illegal in
                    # file names removed.
                    safe_label = re.sub(r'[\\/*?:"<>|]', "", label).replace(" ", "_")
                    text = strip_html(item.get("body") or "")
                    count += 1
                    print(f"\n--- Notifikace #{count}: {safe_label} (pk={pk}) | event: {event_id} ---")
                    print(text)

                    if pk in existing_pks:
                        print(f" → pk={pk} již v DB, přeskakuji")
                        continue

                    stem = f"{actual_date}_{safe_label}"
                    pdf_filename = os.path.join(out_dir, f"{stem}.pdf")
                    if os.path.exists(pdf_filename):
                        # Name collision: disambiguate with the pk.
                        pdf_filename = os.path.join(out_dir, f"{stem}_pk{pk}.pdf")

                    # Percent-encode the query so titles containing spaces,
                    # '&', '#', ... cannot corrupt the URL.
                    query = urlencode(
                        {"pk": pk, "title": title, "html": "true"},
                        quote_via=quote,
                    )
                    pdf_url = f"{BASE_URL}{api_base}/api/v1/meta_api/pdfnotification?{query}"
                    pdf_resp = page.request.get(pdf_url, headers={
                        "Authorization": f"Bearer {jwt}",
                        "lang": "en",
                        "prancer_study": STUDY,
                        "Accept": "application/json, text/plain, */*",
                    })
                    if pdf_resp.ok:
                        with open(pdf_filename, "wb") as f:
                            f.write(pdf_resp.body())
                        print(f" → PDF uloženo: {os.path.basename(pdf_filename)}")

                        # Swap only the extension; str.replace would also hit
                        # any other ".pdf" occurring in the path.
                        json_filename = os.path.splitext(pdf_filename)[0] + ".json"
                        with open(json_filename, "w", encoding="utf-8") as f:
                            json.dump({
                                "pk": pk,
                                "title": title,
                                "label": label,
                                "event": event_id,
                                "actual_date": actual_date,
                                "subject": SUBJECT,
                                "study": STUDY,
                                "text": text,
                            }, f, ensure_ascii=False, indent=2)
                        print(f" → JSON uloženo: {os.path.basename(json_filename)}")
                    else:
                        print(f" → PDF chyba: {pdf_resp.status}")
                    # Short pause between download attempts.
                    page.wait_for_timeout(300)

            if count == 0:
                print("Žádné notifikace nalezeny.")
            else:
                print(f"\n{'='*60}")
                print(f"Celkem notifikací: {count}")
        finally:
            # Close the browser even when a step above raised.
            browser.close()
|
||||
|
||||
|
||||
# Script entry point: the download runs as soon as this module is executed.
# There is no `if __name__ == "__main__"` guard, so importing the module
# triggers a run as well.
main()
|
||||
@@ -0,0 +1,5 @@
|
||||
import os

# MySQL connection settings read by the scripts via `db_config.DB_*`.
# NOTE(review): host, root user and password were committed in plain text.
# Each value can now be overridden through an IWRS_DB_* environment variable;
# the literals remain only as a backward-compatible fallback and should be
# removed (and the password rotated, ideally with a non-root account) once the
# environment is provisioned.
DB_HOST = os.environ.get("IWRS_DB_HOST", "192.168.1.76")
DB_PORT = int(os.environ.get("IWRS_DB_PORT", "3306"))  # must stay an int
DB_USER = os.environ.get("IWRS_DB_USER", "root")
DB_PASSWORD = os.environ.get("IWRS_DB_PASSWORD", "Vlado9674+")
DB_NAME = os.environ.get("IWRS_DB_NAME", "studie")
|
||||
Reference in New Issue
Block a user