"""Complete pipeline.

1. Download the Subject Summary Reports (both studies).
2. Download the Subject Detail Reports + notifications (both studies).
3. Import into MongoDB (subject_summary + visits + notifications).

Run this script instead of the individual scripts.
"""

import os
import sys
import datetime
import glob

from playwright.sync_api import sync_playwright

import download_subject_details as dsd
import import_to_mongo
import import_notifications_to_mongo

# ── CONFIG ───────────────────────────────────────────────────────────────────

BASE_URL = "https://janssen.4gclinical.com"

# SECURITY: credentials must not live in source control. They can be supplied
# through the IWRS_EMAIL / IWRS_PASSWORD environment variables; the literals
# below are kept only as a backward-compatible fallback so existing runs keep
# working. TODO: rotate this password and remove the fallback values.
EMAIL = os.environ.get("IWRS_EMAIL") or "vbuzalka@its.jnj.com"
PASSWORD = os.environ.get("IWRS_PASSWORD") or "Vlado123++-+"

STUDIES = ["77242113UCO3001", "42847922MDD3003"]

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INCOMING_DIR = os.path.join(BASE_DIR, "IncomingSourceReports")
DETAILS_DIR = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")


# ── helpers ──────────────────────────────────────────────────────────────────

def unique_path(directory: str, stem: str) -> str:
    """Return an ``.xlsx`` path in *directory* that avoids an existing file.

    If ``<stem>.xlsx`` does not exist yet it is returned unchanged. Otherwise
    the current local time (``HHMM``) is appended to the stem.

    NOTE: the time-tagged path is not checked for existence, so a second
    collision within the same minute returns a path that already exists.
    """
    path = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(path):
        return path
    time_tag = datetime.datetime.now().strftime("%H%M")
    return os.path.join(directory, f"{stem} {time_tag}.xlsx")


def login(page, study: str) -> None:
    """Log in at BASE_URL with EMAIL/PASSWORD and select *study*.

    Args:
        page: Playwright page used for the whole session.
        study: Name of the option to pick in the "Study *" selector.
    """
    page.goto(BASE_URL)
    page.wait_for_load_state("networkidle")

    page.get_by_label("Email *").fill(EMAIL)
    page.get_by_label("Password *").fill(PASSWORD)
    page.locator("#login__submit").click()
    page.wait_for_load_state("networkidle")

    page.get_by_label("Study *").click()
    page.get_by_role("option", name=study).click()
    page.get_by_role("button", name="SELECT").click()
    page.wait_for_load_state("networkidle")


# ── STEP 1: Subject Summary ──────────────────────────────────────────────────

def download_summary(page, study: str, today: str) -> str:
    """Download the Subject Summary Report for *study* into INCOMING_DIR.

    Args:
        page: Logged-in Playwright page.
        study: Study identifier, used in the file name and log output.
        today: Date string used as the file-name prefix.

    Returns:
        Path of the saved ``.xlsx`` file.
    """
    print(f" [{study}] Stahuji Subject Summary Report...")
    page.goto(f"{BASE_URL}/report/patient_summary_report")
    page.wait_for_load_state("networkidle", timeout=120000)

    filename = unique_path(
        INCOMING_DIR, f"{today} {study} Subject Summary Report"
    )
    # The click must happen inside expect_download so the download event
    # triggered by it is captured.
    with page.expect_download(timeout=120000) as dl:
        page.get_by_role("button", name="Download XLS").click()
    dl.value.save_as(filename)

    print(f" [{study}] Summary OK -> {os.path.basename(filename)}")
    return filename


# ── STEP 2: Subject Details ──────────────────────────────────────────────────

def get_subjects_from_summary(summary_path: str) -> list:
    """Return the non-empty "Subject" column values of a summary report.

    The sheet is first read without a header to locate the first row that
    contains a cell equal to "Subject" (after stripping whitespace); the file
    is then re-read with that row as the header.

    Args:
        summary_path: Path to the Subject Summary Report workbook.

    Returns:
        List of subject values as stripped strings.

    Raises:
        ValueError: If no row contains a "Subject" cell.
    """
    # Imported lazily so pandas is only required when this helper is used.
    import pandas as pd

    raw = pd.read_excel(summary_path, header=None)
    header_row = None
    for i, row in raw.iterrows():
        if any(str(v).strip() == "Subject" for v in row):
            header_row = i
            break
    if header_row is None:
        raise ValueError("Hlavičkový řádek nenalezen")

    df = pd.read_excel(summary_path, header=header_row)
    return df["Subject"].dropna().astype(str).str.strip().tolist()


def download_details(page, study: str, summary_path: str, today: str) -> None:
    """Download one Subject Detail report per subject listed in the summary.

    Files are written to ``DETAILS_DIR/<study>/``. This helper is not called
    by ``main()`` in this file, which delegates step 2 to ``dsd.run``.

    Args:
        page: Logged-in Playwright page.
        study: Study identifier, used for the output folder and file names.
        summary_path: Summary workbook from which the subjects are read.
        today: Date string used as the file-name prefix.
    """
    out_dir = os.path.join(DETAILS_DIR, study)
    os.makedirs(out_dir, exist_ok=True)

    subjects = get_subjects_from_summary(summary_path)
    print(f" [{study}] Subjektů k stažení: {len(subjects)}")

    page.goto(f"{BASE_URL}/report/patient_detail_report")
    page.wait_for_load_state("networkidle", timeout=120000)

    for subject in subjects:
        filename = os.path.join(
            out_dir, f"{today} {study} {subject} Subject Detail.xlsx"
        )

        input_field = page.locator(
            'input[placeholder="search"], input[type="text"]'
        ).first
        input_field.click()
        input_field.fill(subject)
        # Short pause before picking the first autocomplete option.
        page.wait_for_timeout(500)
        page.locator("mat-option").first.dispatch_event("click")
        page.wait_for_load_state("networkidle", timeout=120000)

        with page.expect_download(timeout=120000) as dl:
            page.get_by_role("button", name="Download XLS").click()
        dl.value.save_as(filename)
        print(f" [{study}] Detail {subject} OK")

        # Reset the form before searching for the next subject.
        page.get_by_role("button", name="Clear").click()
        page.wait_for_load_state("networkidle", timeout=120000)


# ── STEP 3: Import into MongoDB / pipeline entry point ───────────────────────

def main() -> None:
    """Run the whole pipeline for every study in STUDIES.

    Steps 1 + 2 download the reports with Playwright, using a fresh browser
    per study. Step 3 imports the results into MongoDB; a study whose
    download step raised is skipped there. Errors are reported on stdout and
    do not abort the remaining studies.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    os.makedirs(INCOMING_DIR, exist_ok=True)
    os.makedirs(DETAILS_DIR, exist_ok=True)

    summary_paths = {}

    # Steps 1 + 2: downloads (Playwright, each study separately because of
    # the session).
    with sync_playwright() as p:
        for study in STUDIES:
            print("\n" + "=" * 60)
            print(f"[{study}] KROK 1: Subject Summary Report")
            print("=" * 60)

            browser = p.chromium.launch(headless=False)
            try:
                # Context/page creation lives inside the try so a failure
                # here still closes the browser and only skips this study.
                context = browser.new_context(accept_downloads=True)
                page = context.new_page()

                login(page, study)
                summary_path = download_summary(page, study, today)
                summary_paths[study] = summary_path

                print(
                    f"\n[{study}] KROK 2: Subject Detail Reports + notifikace"
                )
                dsd.run(page, study)
            except Exception as e:
                print(f" [{study}] CHYBA při stahování: {e}")
                # Any failure in steps 1-2 marks the study as not importable.
                summary_paths[study] = None
            finally:
                browser.close()

    # Step 3: import into MongoDB
    print("\n" + "=" * 60)
    print("KROK 3: Import do MongoDB")
    print("=" * 60)

    for study in STUDIES:
        summary_path = summary_paths.get(study)
        if not summary_path:
            print(f" [{study}] PŘESKOČENO — stahování selhalo")
            continue
        try:
            import_to_mongo.run(study, summary_path, DETAILS_DIR, today)
        except Exception as e:
            print(f" [{study}] CHYBA při importu summary/visits: {e}")

    # Notifications: PDF/JSON from disk straight into Mongo iwrs_notifications
    print("\n [notifikace] import PDF/JSON do Mongo...")
    try:
        import_notifications_to_mongo.main(STUDIES)
    except Exception as e:
        print(f" CHYBA při importu notifikací: {e}")

    print("\n" + "=" * 60)
    print("Vše hotovo.")
    print("=" * 60)


# Guarded so that importing this module (e.g. to reuse a helper) does not
# launch browsers or write to MongoDB.
if __name__ == "__main__":
    main()