notebook

2026-06-03 07:10:15 +02:00
parent 9ed9f97140
commit 681095d557
60 changed files with 215 additions and 2161 deletions
@@ -4,7 +4,7 @@ Kompletní pipeline pro Drugs:
  2. IP destruction             (per košík, přeskočí již existující soubory)
  3. Shipments report           (jeden soubor na studii, přepisuje)
  4. Shipment details           (per zásilka CZ, vždy přepisuje)
-  5. Import do MySQL
+  5. Import do MongoDB (studie.iwrs_shipments / iwrs_shipment_items / iwrs_inventory / iwrs_destruction)

 Spusť tento skript — zpracuje obě studie automaticky.
 """
@@ -14,12 +14,11 @@ import glob
 import re
 import datetime

-import numpy as np
+import sys
 import pandas as pd
 from playwright.sync_api import sync_playwright
-import mysql.connector

-import db_config
+import import_to_mongo as drugs_mongo

 BASE_URL = "https://janssen.4gclinical.com"
 EMAIL    = "vbuzalka@its.jnj.com"
@@ -42,357 +41,6 @@ SITES = {
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))


-# ── type converters ──────────────────────────────────────────────────────────
-
-def _py(val):
-    if isinstance(val, np.generic):
-        return val.item()
-    return val
-
-def to_date(val):
-    val = _py(val)
-    if val is None:
-        return None
-    if isinstance(val, float) and (val != val):
-        return None
-    try:
-        if pd.isna(val):
-            return None
-    except (TypeError, ValueError):
-        pass
-    if isinstance(val, pd.Timestamp):
-        return None if pd.isna(val) else val.date()
-    if isinstance(val, datetime.datetime):
-        return val.date()
-    if isinstance(val, datetime.date):
-        return val
-    s = str(val).strip()
-    if not s or s.lower() in ("nat", "nan", "none", ""):
-        return None
-    for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
-        try:
-            return datetime.datetime.strptime(s, fmt).date()
-        except ValueError:
-            pass
-    return None
-
-def to_int(val):
-    val = _py(val)
-    try:
-        v = float(val)
-        return None if (v != v) else int(v)
-    except (TypeError, ValueError):
-        return None
-
-def to_str(val):
-    val = _py(val)
-    if val is None:
-        return None
-    if isinstance(val, float) and (val != val):
-        return None
-    s = str(val).strip()
-    return None if s.lower() in ("nan", "nat", "none", "") else s
-
-
-# ── DB helpers ───────────────────────────────────────────────────────────────
-
-def get_conn():
-    return mysql.connector.connect(
-        host=db_config.DB_HOST, port=db_config.DB_PORT,
-        user=db_config.DB_USER, password=db_config.DB_PASSWORD,
-        database=db_config.DB_NAME,
-    )
-
-def insert_import(cursor, study, source_label):
-    cursor.execute(
-        "INSERT INTO iwrs_import (study, imported_at, source_file, report_type) VALUES (%s, %s, %s, %s)",
-        (study, datetime.datetime.now(), source_label, "drugs"),
-    )
-    return cursor.lastrowid
-
-def basket_already_imported(cursor, study, basket_id):
-    cursor.execute(
-        "SELECT 1 FROM iwrs_destruction WHERE study=%s AND basket_id=%s LIMIT 1",
-        (study, str(basket_id)),
-    )
-    return cursor.fetchone() is not None
-
-
-# ── parsery ──────────────────────────────────────────────────────────────────
-
-def parse_shipments_report(study):
-    path = os.path.join(BASE_DIR, f"xls_shipments_{study}", f"shipments_report_{study}.xlsx")
-    if not os.path.exists(path):
-        print(f"  CHYBÍ: {path}")
-        return []
-    raw = pd.read_excel(path, header=None)
-    header_row = None
-    for i, row in raw.iterrows():
-        if "Shipment ID" in [str(v).strip() for v in row]:
-            header_row = i
-            break
-    if header_row is None:
-        return []
-    df = pd.read_excel(path, header=header_row).dropna(how="all")
-    df = df[df["Location"].astype(str).str.contains("Czech", na=False, case=False)]
-    col = df.columns.tolist()
-    rows = []
-    for _, r in df.iterrows():
-        rows.append({
-            "shipment_id":          to_str(r["Shipment ID"]),
-            "status":               to_str(r["IRT Shipment Status"]),
-            "type":                 to_str(r["Type"]),
-            "ship_from":            to_str(r["Shipment From"]),
-            "ship_to_site":         to_str(r["Ship To:"]),
-            "location":             to_str(r["Location"]),
-            "request_date":         to_date(r["Request Date"]),
-            "shipped_date":         to_date(r["Shipped Date"]),
-            "received_date":        to_date(r["Received Date"]) if "Received Date" in col else None,
-            "received_by":          to_str(r["Received by"]) if "Received by" in col else None,
-            "delivered_date_utc":   to_date(r["Delivered Date [UTC]"]) if "Delivered Date [UTC]" in col else None,
-            "delivery_recipient":   to_str(r["Delivery Recipient"]) if "Delivery Recipient" in col else None,
-            "delivery_details":     to_str(r["Delivery Details"]) if "Delivery Details" in col else None,
-            "cancelled_date":       to_date(r["Cancelled Date"]) if "Cancelled Date" in col else None,
-            "total_medication_ids": to_int(r["Total Medication IDs"]) if "Total Medication IDs" in col else None,
-            "tracking_no":          to_str(r["Tracking #"]) if "Tracking #" in col else None,
-            "shipping_category":    to_str(r["Shipping Category"]) if "Shipping Category" in col else None,
-            "expected_arrival":     to_date(r["Expected Arrival"]) if "Expected Arrival" in col else None,
-        })
-    return rows
-
-
-def parse_shipment_details(study):
-    detail_dir = os.path.join(BASE_DIR, f"xls_shipment_details_{study}")
-    files = sorted(glob.glob(os.path.join(detail_dir, "shipment_details_*.xlsx")))
-    rows = []
-    for path in files:
-        m = re.search(r"shipment_details_(.+)\.xlsx", os.path.basename(path))
-        shipment_id = m.group(1) if m else "UNKNOWN"
-        raw = pd.read_excel(path, header=None)
-        header_row = None
-        for i, row in raw.iterrows():
-            if "Medication ID" in [str(v).strip() for v in row]:
-                header_row = i
-                break
-        if header_row is None:
-            continue
-        df = pd.read_excel(path, header=header_row).dropna(how="all")
-        for _, r in df.iterrows():
-            med_desc = (to_str(r.get("Medication Description"))
-                        or to_str(r.get("Medication ID Description")))
-            med_type = (to_str(r.get("Medication type"))
-                        or to_str(r.get("Medication ID type")))
-            rows.append({
-                "shipment_id":             shipment_id,
-                "destination_location":    to_str(r.get("Destination Location")),
-                "shipment_status":         to_str(r.get("IRT Shipment Status")),
-                "shipment_type":           to_str(r.get("Type")),
-                "destination_site":        to_str(r.get("Destination Site")),
-                "investigator":            to_str(r.get("Investigator")),
-                "medication_description":  med_desc,
-                "medication_type":         med_type,
-                "medication_id":           to_str(r.get("Medication ID")),
-                "packaged_lot_no":         to_str(r.get("Packaged Lot number")),
-                "packaged_lot_description": to_str(r.get("Packaged Lot description")),
-                "container_id":            to_str(r.get("Container ID")),
-                "quantity":                to_int(r.get("Quantity of Medication IDs")),
-                "expiration_date":         to_date(r.get("Expiration Date")),
-                "item_status":             to_str(r.get("Status")),
-            })
-    return rows
-
-
-def parse_inventory(study):
-    inv_dir = os.path.join(BASE_DIR, f"xls_reports_{study}")
-    files = sorted(glob.glob(os.path.join(inv_dir, "onsite_inventory_detail_*.xlsx")))
-    rows = []
-    for path in files:
-        raw = pd.read_excel(path, header=None)
-        site = investigator = location = None
-        header_row = None
-        for i, row in raw.iterrows():
-            first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
-            if first.startswith("Site:"):
-                site = first.replace("Site:", "").strip()
-            elif first.startswith("Investigator:"):
-                investigator = first.replace("Investigator:", "").strip()
-            elif first.startswith("Location:"):
-                location = first.replace("Location:", "").strip()
-            if first in ("Medication", "Medication ID") and header_row is None:
-                header_row = i
-        if header_row is None:
-            continue
-        df = pd.read_excel(path, header=header_row).dropna(how="all")
-        df = df.rename(columns={df.columns[0]: "medication_id"})
-        for _, r in df.iterrows():
-            rows.append({
-                "site":                    site,
-                "investigator":            investigator,
-                "location":                location,
-                "medication_id":           to_str(r["medication_id"]),
-                "packaged_lot_no":         to_str(r.get("Packaged Lot number")),
-                "original_expiration_date": to_date(r.get("Original Expiration Date when Packaged Lot was Added")),
-                "expiration_date":         to_date(r.get("Expiration date")),
-                "received_date":           to_date(r.get("Received Date")),
-                "receipt_user":            to_str(r.get("Shipment Receipt User")),
-                "subject_identifier":      to_str(r.get("Subject Identifier")),
-                "quantity_assigned":       to_int(r.get("Quantity Assigned")),
-                "irt_transaction":         to_str(r.get("IRT Transaction")),
-                "date_assigned":           to_date(r.get("Date Assigned")),
-                "assignment_user":         to_str(r.get("Assignment User")),
-                "dispensation_status":     to_str(r.get("Dispensation Status")),
-                "dispensing_date":         to_date(r.get("Dispensing date") or r.get("Dispensing Date")),
-                "quantity_dispensed":      to_int(r.get("Quantity Dispensed")),
-                "dispensing_user":         to_str(r.get("Dispensing User")),
-                "quantity_returned":       to_int(r.get("Quantity Returned")),
-                "date_returned":           to_date(r.get("Date Returned")),
-                "return_user":             to_str(r.get("Return User")),
-            })
-    return rows
-
-
-def parse_destruction_files(study):
-    dest_dir = os.path.join(BASE_DIR, f"xls_ip_destruction_{study}")
-    files = sorted(glob.glob(os.path.join(dest_dir, "ip_destruction_basket_*.xlsx")))
-    baskets = []
-    for path in files:
-        raw = pd.read_excel(path, header=None)
-        meta = {}
-        header_row = None
-        for i, row in raw.iterrows():
-            first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
-            for key, attr in [
-                ("Investigator Name:", "investigator"),
-                ("Site ID:", "site_id"),
-                ("Location:", "location"),
-                ("Basket ID:", "basket_id"),
-                ("Drug Destruction Created Date:", "destruction_date"),
-            ]:
-                if first.startswith(key):
-                    meta[attr] = first.replace(key, "").strip()
-            if first == "Medication ID Description" and header_row is None:
-                header_row = i
-        if header_row is None:
-            continue
-        df = pd.read_excel(path, header=header_row).dropna(how="all")
-        items = []
-        for _, r in df.iterrows():
-            items.append({
-                "medication_description":   to_str(r.get("Medication ID Description")),
-                "medication_id":            to_str(r.get("Medication ID")),
-                "packaged_lot_description": to_str(r.get("Packaged Lot description")),
-                "comments":                 to_str(r.get("Comments")),
-            })
-        baskets.append({
-            "site_id":          meta.get("site_id"),
-            "investigator":     meta.get("investigator"),
-            "location":         meta.get("location"),
-            "basket_id":        meta.get("basket_id"),
-            "destruction_date": to_date(meta.get("destruction_date")),
-            "items":            items,
-        })
-    return baskets
-
-
-# ── insertery ────────────────────────────────────────────────────────────────
-
-def insert_shipments(cursor, import_id, study, rows):
-    sql = """INSERT INTO iwrs_shipments
-        (import_id, study, shipment_id, status, type, ship_from, ship_to_site,
-         location, request_date, shipped_date, received_date, received_by,
-         delivered_date_utc, delivery_recipient, delivery_details, cancelled_date,
-         total_medication_ids, tracking_no, shipping_category, expected_arrival)
-        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    for r in rows:
-        cursor.execute(sql, (
-            import_id, study, r["shipment_id"], r["status"], r["type"],
-            r["ship_from"], r["ship_to_site"], r["location"],
-            r["request_date"], r["shipped_date"], r["received_date"],
-            r["received_by"], r["delivered_date_utc"], r["delivery_recipient"],
-            r["delivery_details"], r["cancelled_date"], r["total_medication_ids"],
-            r["tracking_no"], r["shipping_category"], r["expected_arrival"],
-        ))
-
-
-def insert_shipment_items(cursor, import_id, study, rows):
-    sql = """INSERT INTO iwrs_shipment_items
-        (import_id, study, shipment_id, destination_location, shipment_status,
-         shipment_type, destination_site, investigator, medication_description,
-         medication_type, medication_id, packaged_lot_no, packaged_lot_description,
-         container_id, quantity, expiration_date, item_status)
-        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    for r in rows:
-        cursor.execute(sql, (
-            import_id, study, r["shipment_id"], r["destination_location"],
-            r["shipment_status"], r["shipment_type"], r["destination_site"],
-            r["investigator"], r["medication_description"], r["medication_type"],
-            r["medication_id"], r["packaged_lot_no"], r["packaged_lot_description"],
-            r["container_id"], r["quantity"], r["expiration_date"], r["item_status"],
-        ))
-
-
-def insert_inventory(cursor, import_id, study, rows):
-    sql = """INSERT INTO iwrs_inventory
-        (import_id, study, site, investigator, location, medication_id,
-         packaged_lot_no, original_expiration_date, expiration_date, received_date,
-         receipt_user, subject_identifier, quantity_assigned, irt_transaction,
-         date_assigned, assignment_user, dispensation_status, dispensing_date,
-         quantity_dispensed, dispensing_user, quantity_returned, date_returned, return_user)
-        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    for r in rows:
-        cursor.execute(sql, (
-            import_id, study, r["site"], r["investigator"], r["location"],
-            r["medication_id"], r["packaged_lot_no"], r["original_expiration_date"],
-            r["expiration_date"], r["received_date"], r["receipt_user"],
-            r["subject_identifier"], r["quantity_assigned"], r["irt_transaction"],
-            r["date_assigned"], r["assignment_user"], r["dispensation_status"],
-            r["dispensing_date"], r["quantity_dispensed"], r["dispensing_user"],
-            r["quantity_returned"], r["date_returned"], r["return_user"],
-        ))
-
-
-def insert_destruction(cursor, study, baskets):
-    sql = """INSERT IGNORE INTO iwrs_destruction
-        (study, site_id, investigator, location, basket_id, destruction_date,
-         medication_description, medication_id, packaged_lot_description, comments)
-        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    skipped = imported = 0
-    for b in baskets:
-        if basket_already_imported(cursor, study, b["basket_id"]):
-            skipped += 1
-            continue
-        for item in b["items"]:
-            cursor.execute(sql, (
-                study, b["site_id"], b["investigator"], b["location"],
-                b["basket_id"], b["destruction_date"],
-                item["medication_description"], item["medication_id"],
-                item["packaged_lot_description"], item["comments"],
-            ))
-            imported += 1
-    return imported, skipped
-
-
-def import_study(study):
-    print(f"\n  Parsování dat pro {study}...")
-    shipments = parse_shipments_report(study)
-    items     = parse_shipment_details(study)
-    inventory = parse_inventory(study)
-    baskets   = parse_destruction_files(study)
-    print(f"  Zásilky: {len(shipments)}  |  Položky: {len(items)}  |  Sklad: {len(inventory)}  |  Destrukce: {len(baskets)} košíků")
-
-    conn = get_conn()
-    cursor = conn.cursor()
-    import_id = insert_import(cursor, study, f"drugs_{study}")
-    print(f"  import_id = {import_id}")
-    insert_shipments(cursor, import_id, study, shipments)
-    insert_shipment_items(cursor, import_id, study, items)
-    insert_inventory(cursor, import_id, study, inventory)
-    dest_imported, dest_skipped = insert_destruction(cursor, study, baskets)
-    conn.commit()
-    cursor.close()
-    conn.close()
-    print(f"  Destrukce: {dest_imported} nových | {dest_skipped} košíků přeskočeno")
-

 # ── login ────────────────────────────────────────────────────────────────────

@@ -577,19 +225,17 @@ def main():
            finally:
                browser.close()

-    # ── Import do MySQL ───────────────────────────────────────────────────────
+    # ── Import do MongoDB ─────────────────────────────────────────────────────
    print(f"\n{'='*60}")
-    print("IMPORT DO MySQL")
+    print("IMPORT DO MongoDB")
    print(f"{'='*60}")

-    for study in STUDIES:
-        print(f"\n[{study}]")
-        try:
-            import_study(study)
-        except Exception as e:
-            import traceback
-            print(f"  CHYBA při importu: {e}")
-            traceback.print_exc()
+    try:
+        drugs_mongo.run(STUDIES)
+    except Exception as e:
+        import traceback
+        print(f"  CHYBA při importu: {e}")
+        traceback.print_exc()

    print(f"\n{'='*60}")
    print("Vše hotovo.")
@@ -156,38 +156,62 @@ def run(page, study):
    total_notif = 0
    for subject in subjects:
        filename = os.path.join(out_dir, f"{today} {study} {subject} Subject Detail.xlsx")
-        print(f"  [{subject}] Stahuji...")
-        input_field = page.locator('input[placeholder="search"], input[type="text"]').first
-        input_field.click()
-        input_field.fill(subject)
-        page.wait_for_timeout(500)

-        # Zachytíme table_1 response při výběru subjektu
-        if api_base:
+        success = False
+        table1_data = None
+        for attempt in range(1, 4):
            try:
-                with page.expect_response(
-                    lambda r: "report_data" in r.url and "table_1" in r.url,
-                    timeout=60000
-                ) as resp_info:
+                print(f"  [{subject}] Stahuji... (pokus {attempt}/3)")
+                input_field = page.locator('input[placeholder="search"], input[type="text"]').first
+                input_field.click()
+                input_field.fill(subject)
+                page.wait_for_timeout(500)
+
+                # Zachytíme table_1 response při výběru subjektu
+                if api_base:
+                    try:
+                        with page.expect_response(
+                            lambda r: "report_data" in r.url and "table_1" in r.url,
+                            timeout=60000
+                        ) as resp_info:
+                            page.locator("mat-option").first.dispatch_event("click")
+                        table1_data = resp_info.value.json()
+                    except Exception as e:
+                        print(f"  [{subject}] CHYBA zachycení table_1: {e}")
+                        page.locator("mat-option").first.dispatch_event("click")
+                        page.wait_for_load_state("networkidle", timeout=120000)
+                        table1_data = None
+                else:
                    page.locator("mat-option").first.dispatch_event("click")
-                table1_data = resp_info.value.json()
-            except Exception as e:
-                print(f"  [{subject}] CHYBA zachycení table_1: {e}")
-                page.locator("mat-option").first.dispatch_event("click")
+                    page.wait_for_load_state("networkidle", timeout=120000)
+                    table1_data = None
+
                page.wait_for_load_state("networkidle", timeout=120000)
-                table1_data = None
-        else:
-            page.locator("mat-option").first.dispatch_event("click")
-            page.wait_for_load_state("networkidle", timeout=120000)
-            table1_data = None
+                page.wait_for_timeout(2000)

-        page.wait_for_load_state("networkidle", timeout=120000)
-        page.wait_for_timeout(1000)
+                with page.expect_download(timeout=60000) as dl:
+                    page.get_by_role("button", name="Download XLS").click()
+                dl.value.save_as(filename)
+                print(f"  [{subject}] XLS OK")
+                success = True
+                break
+            except Exception as e:
+                print(f"  [{subject}] pokus {attempt} selhal: {e}")
+                if attempt < 3:
+                    try:
+                        page.goto(f"{BASE_URL}/report/patient_detail_report")
+                        page.wait_for_load_state("networkidle", timeout=120000)
+                    except Exception as ge:
+                        print(f"  [{subject}] refresh selhal: {ge}")

-        with page.expect_download(timeout=120000) as dl:
-            page.get_by_role("button", name="Download XLS").click()
-        dl.value.save_as(filename)
-        print(f"  [{subject}] XLS OK")
+        if not success:
+            print(f"  [{subject}] PŘESKAKUJI po 3 neúspěšných pokusech")
+            try:
+                page.goto(f"{BASE_URL}/report/patient_detail_report")
+                page.wait_for_load_state("networkidle", timeout=120000)
+            except Exception:
+                pass
+            continue

        # Stáhnout notifikace pro tohoto subjekta
        if api_base and table1_data:
@@ -196,8 +220,13 @@ def run(page, study):
            )
            total_notif += n

-        page.get_by_role("button", name="Clear").click()
-        page.wait_for_load_state("networkidle", timeout=120000)
+        try:
+            page.get_by_role("button", name="Clear").click()
+            page.wait_for_load_state("networkidle", timeout=120000)
+        except Exception as e:
+            print(f"  [{subject}] Clear selhal: {e} — refresh")
+            page.goto(f"{BASE_URL}/report/patient_detail_report")
+            page.wait_for_load_state("networkidle", timeout=120000)

    print(f"  [{study}] Subject details hotovo. Nových notifikací: {total_notif}")

@@ -2,23 +2,21 @@
 Kompletní pipeline:
  1. Stažení Subject Summary Reportů (obě studie)
  2. Stažení Subject Detail Reportů + notifikací (obě studie)
-  3. Import do MySQL (summary, visits, notifikace)
+  3. Import do MongoDB (subject_summary + visits + notifications)

 Spusť tento skript místo samostatných skriptů.
 """

 import os
+import sys
 import datetime
 import glob
-import re

 from playwright.sync_api import sync_playwright
-import numpy as np
-import pandas as pd

-import db_config
-import mysql.connector
 import download_subject_details as dsd
+import import_to_mongo
+import import_notifications_to_mongo

 # ── CONFIG ───────────────────────────────────────────────────────────────────
 BASE_URL = "https://janssen.4gclinical.com"
@@ -72,6 +70,7 @@ def download_summary(page, study, today):
 # ── KROK 2: Subject Details ───────────────────────────────────────────────────

 def get_subjects_from_summary(summary_path):
+    import pandas as pd
    raw = pd.read_excel(summary_path, header=None)
    header_row = None
    for i, row in raw.iterrows():
@@ -112,277 +111,7 @@ def download_details(page, study, summary_path, today):
        page.wait_for_load_state("networkidle", timeout=120000)


-# ── KROK 3: Import do MySQL ───────────────────────────────────────────────────
-
-def get_conn():
-    return mysql.connector.connect(
-        host=db_config.DB_HOST,
-        port=db_config.DB_PORT,
-        user=db_config.DB_USER,
-        password=db_config.DB_PASSWORD,
-        database=db_config.DB_NAME,
-    )
-
-
-def _py(val):
-    """Převede numpy skalár na Python nativní typ."""
-    if isinstance(val, np.generic):
-        return val.item()
-    return val
-
-
-def to_date(val):
-    val = _py(val)
-    if val is None or (isinstance(val, float) and (val != val)):
-        return None
-    try:
-        if pd.isna(val):
-            return None
-    except (TypeError, ValueError):
-        pass
-    if isinstance(val, pd.Timestamp):
-        return None if pd.isna(val) else val.date()
-    if isinstance(val, datetime.datetime):
-        return val.date()
-    if isinstance(val, datetime.date):
-        return val
-    s = str(val).strip()
-    if not s or s.lower() in ("nat", "nan", "none", ""):
-        return None
-    for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
-        try:
-            return datetime.datetime.strptime(s, fmt).date()
-        except ValueError:
-            pass
-    return None
-
-
-def to_int(val):
-    val = _py(val)
-    try:
-        v = float(val)
-        return None if (v != v) else int(v)
-    except (TypeError, ValueError):
-        return None
-
-
-def to_float(val):
-    val = _py(val)
-    try:
-        v = float(val)
-        return None if (v != v) else float(v)
-    except (TypeError, ValueError):
-        return None
-
-
-def to_str(val):
-    val = _py(val)
-    if val is None:
-        return None
-    if isinstance(val, float) and (val != val):
-        return None
-    s = str(val).strip()
-    return None if s.lower() in ("nan", "nat", "none", "") else s
-
-
-def read_summary_df(path):
-    raw = pd.read_excel(path, header=None)
-    header_row = None
-    for i, row in raw.iterrows():
-        if "Subject" in [str(v).strip() for v in row]:
-            header_row = i
-            break
-    if header_row is None:
-        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")
-    return pd.read_excel(path, header=header_row).dropna(how="all")
-
-
-def parse_detail_visits(path):
-    df = pd.read_excel(path, sheet_name="patient_detail_report", header=None)
-    header_row = None
-    for i, row in df.iterrows():
-        if "Visit Type" in [str(v).strip() for v in row]:
-            header_row = i
-            break
-    if header_row is None:
-        return []
-    visits_df = df.iloc[header_row + 1:].copy()
-    visits_df.columns = range(visits_df.shape[1])
-    rows = []
-    for _, r in visits_df.iterrows():
-        visit_type = to_str(r.get(0))
-        if visit_type not in ("Past", "Upcoming"):
-            continue
-        rows.append({
-            "visit_type":                  visit_type,
-            "scheduled_date":              to_date(r.get(1)),
-            "window_days":                 to_str(r.get(2)),
-            "actual_date":                 to_date(r.get(3)),
-            "irt_transaction_no":          to_int(r.get(4)),
-            "irt_transaction_description": to_str(r.get(5)),
-            "medication_assignment":       to_str(r.get(6)),
-            "quantity_assigned":           to_int(r.get(7)),
-            "medication_id":               to_str(r.get(8)),
-        })
-    return rows
-
-
-def insert_import(cursor, study, source_file):
-    cursor.execute(
-        "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)",
-        (study, datetime.datetime.now(), os.path.basename(source_file)),
-    )
-    return cursor.lastrowid
-
-
-def insert_uco3001_summary(cursor, import_id, df):
-    sql = """INSERT INTO iwrs_uco3001_subject_summary (
-        import_id, subject, prior_subject_identifier, site, investigator, location,
-        cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight,
-        rescreened_subject, adt_ir, three_or_more_advanced_therapies,
-        only_oral_5asa_compounds, ustekinumab, isolated_proctitis,
-        clinical_responder_status_i12_m0, irt_subject_status,
-        i0_rand_date_local, last_irt_transaction,
-        last_irt_transaction_date_local, last_irt_transaction_date_utc,
-        next_irt_transaction, next_irt_transaction_date_local,
-        most_recent_med_assignment_date, days_since_last_med_assignment,
-        patient_forecast_status, patient_forecast_status_changed_date
-    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    col = df.columns.tolist()
-    for _, r in df.iterrows():
-        cursor.execute(sql, (
-            import_id,
-            to_str(r["Subject"]),
-            to_str(r["Prior Subject Identifier"]) if "Prior Subject Identifier" in col else None,
-            to_str(r["Site"]),
-            to_str(r["Investigator"]),
-            to_str(r["Location"]),
-            to_str(r["Cohort per IRT"]),
-            to_date(r["Informed Consent Date"]),
-            to_date(r["Adolescent Assent Date"]) if "Adolescent Assent Date" in col else None,
-            to_int(r["Subject's age collection"]),
-            to_float(r["Subject's weight collection"]) if "Subject's weight collection" in col else None,
-            to_str(r["Rescreened Subject"]) if "Rescreened Subject" in col else None,
-            to_str(r["ADT-IR"]) if "ADT-IR" in col else None,
-            to_str(r["3 or More Advanced Therapies"]) if "3 or More Advanced Therapies" in col else None,
-            to_str(r["Only Oral 5-ASA Compounds"]) if "Only Oral 5-ASA Compounds" in col else None,
-            to_str(r["Ustekinumab"]) if "Ustekinumab" in col else None,
-            to_str(r["Isolated Proctitis"]) if "Isolated Proctitis" in col else None,
-            to_str(r["Clinical Responder Status at I-12 / M-0"]) if "Clinical Responder Status at I-12 / M-0" in col else None,
-            to_str(r["IRT Subject Status"]),
-            to_date(r["I0_RAND_TIMESTAMP_LOCAL [Local]"]) if "I0_RAND_TIMESTAMP_LOCAL [Local]" in col else None,
-            to_str(r["Last Recorded IRT Transaction"]),
-            to_date(r["Last Recorded IRT Transaction Date [Local]"]),
-            to_date(r["Last Recorded IRT Transaction Date (UTC)"]),
-            to_str(r["Next Expected IRT Transaction"]),
-            to_date(r["Next Expected IRT Transaction Date [Local]"]),
-            to_date(r["Most Recent Medication Assignment Transaction [Local]"]) if "Most Recent Medication Assignment Transaction [Local]" in col else None,
-            to_int(r["Days Since Last Medication Assignment Transaction"]) if "Days Since Last Medication Assignment Transaction" in col else None,
-            to_str(r["Patient Forecast Status"]) if "Patient Forecast Status" in col else None,
-            to_date(r["Patient Forecast Status Changed Date (UTC)"]) if "Patient Forecast Status Changed Date (UTC)" in col else None,
-        ))
-
-
-def insert_mdd3003_summary(cursor, import_id, df):
-    sql = """INSERT INTO iwrs_mdd3003_subject_summary (
-        import_id, subject, prior_subject_identifier, site, investigator, location,
-        cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age,
-        madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17,
-        stratification_country, age_group, stable_remitters, irt_subject_status,
-        last_irt_transaction, last_irt_transaction_date_local,
-        last_irt_transaction_date_utc, next_irt_transaction,
-        next_irt_transaction_date_local, date_screened, date_screen_failed,
-        date_randomized_part1, date_early_withdraw_randomized_part1,
-        date_open_label_induction, date_early_withdraw_open_label_induction,
-        date_randomized_part2, date_early_withdraw_randomized_part2,
-        date_completed, date_unblinded
-    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    col = df.columns.tolist()
-    for _, r in df.iterrows():
-        cursor.execute(sql, (
-            import_id,
-            to_str(r["Subject"]),
-            to_str(r["Prior Subject Identifier"]) if "Prior Subject Identifier" in col else None,
-            to_str(r["Site"]),
-            to_str(r["Investigator"]),
-            to_str(r["Location"]),
-            to_str(r["Cohort per IRT"]),
-            to_str(r["MADRS response criteria integrated or manually entered"]) if "MADRS response criteria integrated or manually entered" in col else None,
-            to_date(r["Informed Consent Date"]),
-            to_int(r["Subject's age collection"]),
-            to_str(r["MADRS response criteria v1.5 from RAVE"]) if "MADRS response criteria v1.5 from RAVE" in col else None,
-            to_str(r["MADRS response criteria v1.6 from RAVE"]) if "MADRS response criteria v1.6 from RAVE" in col else None,
-            to_str(r["MADRS response criteria v1.7 from RAVE"]) if "MADRS response criteria v1.7 from RAVE" in col else None,
-            to_str(r["Stratification Country"]) if "Stratification Country" in col else None,
-            to_str(r["Age Group"]) if "Age Group" in col else None,
-            to_str(r["Stable Remitters vs. Non Stable Remitters"]) if "Stable Remitters vs. Non Stable Remitters" in col else None,
-            to_str(r["IRT Subject Status"]),
-            to_str(r["Last Recorded IRT Transaction"]),
-            to_date(r["Last Recorded IRT Transaction Date [Local]"]),
-            to_date(r["Last Recorded IRT Transaction Date (UTC)"]),
-            to_str(r["Next Expected IRT Transaction"]),
-            to_date(r["Next Expected IRT Transaction Date [Local]"]),
-            to_date(r["Date Screened [Local]"]) if "Date Screened [Local]" in col else None,
-            to_date(r["Date Screen Failed [Local]"]) if "Date Screen Failed [Local]" in col else None,
-            to_date(r["Date Randomized Part 1 [Local]"]) if "Date Randomized Part 1 [Local]" in col else None,
-            to_date(r["Date Early Withdraw Randomized Part 1 [Local]"]) if "Date Early Withdraw Randomized Part 1 [Local]" in col else None,
-            to_date(r["Date Open Label Induction [Local]"]) if "Date Open Label Induction [Local]" in col else None,
-            to_date(r["Date Early Withdraw Open Label Induction [Local]"]) if "Date Early Withdraw Open Label Induction [Local]" in col else None,
-            to_date(r["Date Randomized Part 2 [Local]"]) if "Date Randomized Part 2 [Local]" in col else None,
-            to_date(r["Date Early Withdraw Randomized Part 2 [Local]"]) if "Date Early Withdraw Randomized Part 2 [Local]" in col else None,
-            to_date(r["Date Completed [Local]"]) if "Date Completed [Local]" in col else None,
-            to_date(r["Date Unblinded [Local]"]) if "Date Unblinded [Local]" in col else None,
-        ))
-
-
-def insert_visits(cursor, import_id, study, subject, visits):
-    if not visits:
-        return
-    sql = """INSERT INTO iwrs_subject_visits (
-        import_id, study, subject, visit_type, scheduled_date, window_days,
-        actual_date, irt_transaction_no, irt_transaction_description,
-        medication_assignment, quantity_assigned, medication_id
-    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    for v in visits:
-        cursor.execute(sql, (
-            import_id, study, subject,
-            v["visit_type"], v["scheduled_date"], v["window_days"],
-            v["actual_date"], v["irt_transaction_no"],
-            v["irt_transaction_description"], v["medication_assignment"],
-            v["quantity_assigned"], v["medication_id"],
-        ))
-
-
-def import_to_mysql(summary_path, detail_files, study):
-    print(f"\n  [MySQL] Importuji {study}...")
-    df_summary = read_summary_df(summary_path)
-    conn = get_conn()
-    cursor = conn.cursor()
-
-    import_id = insert_import(cursor, study, summary_path)
-
-    if study == "77242113UCO3001":
-        insert_uco3001_summary(cursor, import_id, df_summary)
-    else:
-        insert_mdd3003_summary(cursor, import_id, df_summary)
-
-    total_visits = 0
-    for path in detail_files:
-        fname = os.path.basename(path)
-        m = re.search(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx", fname)
-        subject = m.group(1) if m else "UNKNOWN"
-        visits = parse_detail_visits(path)
-        insert_visits(cursor, import_id, study, subject, visits)
-        total_visits += len(visits)
-
-    conn.commit()
-    cursor.close()
-    conn.close()
-    print(f"  [MySQL] import_id={import_id} | pacientů={len(df_summary)} | transakcí={total_visits}")
-    return import_id
-
-
-# ── MAIN ─────────────────────────────────────────────────────────────────────
+# ── MAIN: steps 1–3 (download + MongoDB import) ──────────────────────────────

 def main():
    today = datetime.date.today().strftime("%Y-%m-%d")
@@ -391,12 +120,12 @@ def main():

    summary_paths = {}

-    # ── Krok 1 + 2: stahování (Playwright, každá studie zvlášť kvůli session) ──
+    # Krok 1 + 2: stahování (Playwright, každá studie zvlášť kvůli session)
    with sync_playwright() as p:
        for study in STUDIES:
-            print(f"\n{'='*60}")
+            print("\n" + "=" * 60)
            print(f"[{study}] KROK 1: Subject Summary Report")
-            print(f"{'='*60}")
+            print("=" * 60)
            browser = p.chromium.launch(headless=False)
            context = browser.new_context(accept_downloads=True)
            page = context.new_page()
@@ -415,10 +144,10 @@ def main():
            finally:
                browser.close()

-    # ── Krok 3: import do MySQL ──────────────────────────────────────────────
-    print(f"\n{'='*60}")
-    print("KROK 3: Import do MySQL")
-    print(f"{'='*60}")
+    # Krok 3: import do MongoDB
+    print("\n" + "=" * 60)
+    print("KROK 3: Import do MongoDB")
+    print("=" * 60)

    for study in STUDIES:
        summary_path = summary_paths.get(study)
@@ -426,18 +155,21 @@ def main():
            print(f"  [{study}] PŘESKOČENO — stahování selhalo")
            continue

-        detail_files = sorted(glob.glob(
-            os.path.join(DETAILS_DIR, study, f"{today} {study} * Subject Detail.xlsx")
-        ))
-
        try:
-            import_to_mysql(summary_path, detail_files, study)
+            import_to_mongo.run(study, summary_path, DETAILS_DIR, today)
        except Exception as e:
-            print(f"  [{study}] CHYBA při importu: {e}")
+            print(f"  [{study}] CHYBA při importu summary/visits: {e}")

-    print(f"\n{'='*60}")
+    # Notifikace: PDF/JSON z disku rovnou do Mongo iwrs_notifications
+    print("\n  [notifikace] import PDF/JSON do Mongo...")
+    try:
+        import_notifications_to_mongo.main(STUDIES)
+    except Exception as e:
+        print(f"  CHYBA při importu notifikací: {e}")
+
+    print("\n" + "=" * 60)
    print("Vše hotovo.")
-    print(f"{'='*60}")
+    print("=" * 60)


 main()