Migrate IWRS from MySQL to MongoDB

- Add IWRS/common/mongo_writer.py with shared connection, indexes, and upsert+snapshot helpers
- Add IWRS/Patients/import_to_mongo.py (subject_summary + visits)
- Add IWRS/Patients/import_notifications_to_mongo.py: parses PDF/JSON directly into Mongo (including the PDF as BinData); replaces the two-step MySQL flow
- Add IWRS/Drugs/import_to_mongo.py (shipments, items, inventory, destruction)
- Add IWRS/backfill_mysql_to_mongo.py: one-shot history backfill
- Switch IWRS/Patients/run_all.py and IWRS/Drugs/run_all.py to Mongo
- Rewrite IWRS/Drugs/create_report.py data loaders to read from Mongo
- 8 main collections (upsert = latest state) + 5 snapshot collections (append-only with import_id) under the studie database; notifications and destruction are immutable and need no snapshots

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-03 07:24:36 +02:00
parent 681095d557
commit ea9d611719
2080 changed files with 9465 additions and 172 deletions
@@ -1,5 +1,5 @@
 import os
-import mysql.connector
+import sys
 import pandas as pd
 from datetime import date
 from pathlib import Path
@@ -7,7 +7,8 @@ from openpyxl import load_workbook
 from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
 from openpyxl.utils import get_column_letter

-import db_config
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from common.mongo_writer import get_db

 STUDIES = ["77242113UCO3001", "42847922MDD3003"]

@@ -23,70 +24,56 @@ DATE_COLUMNS = {
 N_SHIP_COLS = 9  # počet shipment sloupců před detail sloupci


-# ── DB ────────────────────────────────────────────────────────────────────────
+# ── Načítání dat z MongoDB ────────────────────────────────────────────────────

-def get_conn():
-    return mysql.connector.connect(
-        host=db_config.DB_HOST, port=db_config.DB_PORT,
-        user=db_config.DB_USER, password=db_config.DB_PASSWORD,
-        database=db_config.DB_NAME,
-    )
+INVENTORY_COLS = [
+    ("site",                     "Site"),
+    ("medication_id",            "Med ID"),
+    ("packaged_lot_no",          "Lot No."),
+    ("original_expiration_date", "Orig Exp Date"),
+    ("expiration_date",          "Exp Date"),
+    ("received_date",            "Rcv Date"),
+    ("receipt_user",             "Rcpt User"),
+    ("subject_identifier",       "Subject ID"),
+    ("quantity_assigned",        "Qty Asgn"),
+    ("irt_transaction",          "IRT Tx"),
+    ("date_assigned",            "Date Asgn"),
+    ("assignment_user",          "Asgn User"),
+    ("dispensation_status",      "Disp Status"),
+    ("dispensing_date",          "Disp Date"),
+    ("quantity_dispensed",       "Qty Disp"),
+    ("dispensing_user",          "Disp User"),
+    ("quantity_returned",        "Qty Ret"),
+    ("date_returned",            "Date Ret"),
+    ("return_user",              "Ret User"),
+]


-def get_latest_import_id(cursor, study):
-    cursor.execute(
-        "SELECT MAX(import_id) AS mid FROM iwrs_import WHERE study=%s AND report_type='drugs'",
-        (study,),
-    )
-    row = cursor.fetchone()
-    mid = row["mid"]
-    if mid is None:
-        raise RuntimeError(f"Žádná data v MySQL pro studii {study}")
-    return mid
+def load_inventory(study):
+    db = get_db()
+    inv = list(db.iwrs_inventory.find({"study": study}))
+    destr = list(db.iwrs_destruction.find({"study": study}))
+    # map medication_id -> first basket+date
+    destr_map = {}
+    for d in destr:
+        mid = d.get("medication_id")
+        if mid and mid not in destr_map:
+            destr_map[mid] = (d.get("basket_id"), d.get("destruction_date"))

+    records = []
+    for doc in inv:
+        row = {label: doc.get(key) for key, label in INVENTORY_COLS}
+        b, dt = destr_map.get(doc.get("medication_id"), (None, None))
+        row["Destroyed"]  = dt
+        row["Basket No."] = b
+        records.append(row)

-# ── Načítání dat ──────────────────────────────────────────────────────────────
+    df = pd.DataFrame(records)
+    if df.empty:
+        print("  Inventory: 0 kitu")
+        return df

-def load_inventory(cursor, study, import_id):
-    sql = """
-        SELECT
-            i.site                      AS Site,
-            i.medication_id             AS `Med ID`,
-            i.packaged_lot_no           AS `Lot No.`,
-            i.original_expiration_date  AS `Orig Exp Date`,
-            i.expiration_date           AS `Exp Date`,
-            i.received_date             AS `Rcv Date`,
-            i.receipt_user              AS `Rcpt User`,
-            i.subject_identifier        AS `Subject ID`,
-            i.quantity_assigned         AS `Qty Asgn`,
-            i.irt_transaction           AS `IRT Tx`,
-            i.date_assigned             AS `Date Asgn`,
-            i.assignment_user           AS `Asgn User`,
-            i.dispensation_status       AS `Disp Status`,
-            i.dispensing_date           AS `Disp Date`,
-            i.quantity_dispensed        AS `Qty Disp`,
-            i.dispensing_user           AS `Disp User`,
-            i.quantity_returned         AS `Qty Ret`,
-            i.date_returned             AS `Date Ret`,
-            i.return_user               AS `Ret User`,
-            d.destruction_date          AS Destroyed,
-            d.basket_id                 AS `Basket No.`
-        FROM iwrs_inventory i
-        LEFT JOIN (
-            SELECT medication_id,
-                   ANY_VALUE(basket_id)        AS basket_id,
-                   ANY_VALUE(destruction_date) AS destruction_date
-            FROM iwrs_destruction
-            WHERE study = %s
-            GROUP BY medication_id
-        ) d ON d.medication_id = i.medication_id
-        WHERE i.import_id = %s
-          AND i.study     = %s
-        ORDER BY i.site, i.received_date, i.medication_id
-    """
-    cursor.execute(sql, (study, import_id, study))
-    rows = cursor.fetchall()
-    df = pd.DataFrame(rows)
+    df = df.sort_values(["Site", "Rcv Date", "Med ID"], na_position="last").reset_index(drop=True)
    for col in DATE_COLUMNS:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")
@@ -94,78 +81,102 @@ def load_inventory(cursor, study, import_id):
    return df


-def load_shipments(cursor, study, import_id):
-    sql = """
-        SELECT
-            s.shipment_id               AS `Shipment ID`,
-            s.status                    AS `IRT Shipment Status`,
-            s.type                      AS Type,
-            s.ship_from                 AS `Shipment From`,
-            s.ship_to_site              AS `Ship To:`,
-            s.request_date              AS `Request Date`,
-            s.received_date             AS `Received Date`,
-            s.received_by               AS `Received by`,
-            s.expected_arrival          AS `Expected Arrival`,
-            i.investigator              AS Investigator,
-            i.medication_description    AS `Medication Description`,
-            i.medication_id             AS `Medication ID`,
-            i.packaged_lot_no           AS `Packaged Lot number`,
-            i.expiration_date           AS `Expiration Date`,
-            i.item_status               AS Status
-        FROM iwrs_shipments s
-        JOIN iwrs_shipment_items i
-            ON  i.study       = s.study
-            AND i.shipment_id = s.shipment_id
-            AND i.import_id   = %s
-        WHERE s.import_id = %s
-          AND s.study     = %s
-        ORDER BY s.ship_to_site, s.shipment_id, i.medication_id
-    """
-    cursor.execute(sql, (import_id, import_id, study))
-    rows = cursor.fetchall()
-    df = pd.DataFrame(rows)
+SHIP_COLS = [
+    ("shipment_id",       "Shipment ID"),
+    ("status",             "IRT Shipment Status"),
+    ("type",               "Type"),
+    ("ship_from",          "Shipment From"),
+    ("ship_to_site",       "Ship To:"),
+    ("request_date",       "Request Date"),
+    ("received_date",      "Received Date"),
+    ("received_by",        "Received by"),
+    ("expected_arrival",   "Expected Arrival"),
+]
+
+ITEM_COLS = [
+    ("investigator",            "Investigator"),
+    ("medication_description",  "Medication Description"),
+    ("medication_id",           "Medication ID"),
+    ("packaged_lot_no",         "Packaged Lot number"),
+    ("expiration_date",         "Expiration Date"),
+    ("item_status",             "Status"),
+]
+
+
+def load_shipments(study):
+    db = get_db()
+    ships  = list(db.iwrs_shipments.find({"study": study}))
+    items  = list(db.iwrs_shipment_items.find({"study": study}))
+
+    # index items by shipment_id
+    items_by_ship = {}
+    for it in items:
+        items_by_ship.setdefault(it.get("shipment_id"), []).append(it)
+
+    records = []
+    for s in ships:
+        base = {label: s.get(key) for key, label in SHIP_COLS}
+        for it in items_by_ship.get(s.get("shipment_id"), []):
+            row = dict(base)
+            for key, label in ITEM_COLS:
+                row[label] = it.get(key)
+            records.append(row)
+
+    df = pd.DataFrame(records)
+    if df.empty:
+        print("  Shipments: 0 zásilek, 0 kitu")
+        return df
+
+    df = df.sort_values(["Ship To:", "Shipment ID", "Medication ID"], na_position="last").reset_index(drop=True)
    for col in ("Request Date", "Received Date", "Expiration Date", "Expected Arrival"):
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")
-    n_ship = df["Shipment ID"].nunique() if len(df) else 0
+    n_ship = df["Shipment ID"].nunique()
    print(f"  Shipments: {n_ship} zásilek, {len(df)} kitu")
    return df


-def load_visits(cursor, study, import_id):
-    cursor.execute(
-        "SELECT MAX(import_id) AS mid FROM iwrs_import WHERE study=%s AND report_type='patients'",
-        (study,),
-    )
-    patients_import_id = cursor.fetchone()["mid"] or import_id
-    import_id = patients_import_id
-    sql = """
-        SELECT
-            v.subject                                                          AS Subject,
-            COALESCE(v.actual_date, v.scheduled_date)                         AS `Visit Date`,
-            v.scheduled_date                                                   AS `Scheduled Date`,
-            v.irt_transaction_no                                               AS `IRT Tx No`,
-            v.irt_transaction_description                                      AS `Visit`,
-            v.medication_assignment                                            AS `Medication`,
-            GROUP_CONCAT(v.medication_id ORDER BY v.medication_id SEPARATOR ', ') AS `Med IDs`,
-            SUM(v.quantity_assigned)                                           AS `Qty`
-        FROM iwrs_subject_visits v
-        WHERE v.import_id = %s AND v.study = %s AND v.visit_type = 'Past'
-          AND v.irt_transaction_no IS NOT NULL
-        GROUP BY v.subject, v.actual_date, v.scheduled_date,
-                 v.irt_transaction_no, v.irt_transaction_description, v.medication_assignment
-        ORDER BY v.subject, COALESCE(v.actual_date, v.scheduled_date)
-    """
-    cursor.execute(sql, (import_id, study))
-    rows = cursor.fetchall()
+def load_visits(study):
+    db = get_db()
+    cur = db.iwrs_visits.find({
+        "study": study,
+        "visit_type": "Past",
+        "irt_transaction_no": {"$ne": None},
+    })
+    rows = []
+    for v in cur:
+        rows.append({
+            "Subject":      v.get("subject"),
+            "Visit Date":   v.get("actual_date") or v.get("scheduled_date"),
+            "Scheduled Date": v.get("scheduled_date"),
+            "IRT Tx No":    v.get("irt_transaction_no"),
+            "Visit":        v.get("irt_transaction_description"),
+            "Medication":   v.get("medication_assignment"),
+            "medication_id": v.get("medication_id"),
+            "quantity_assigned": v.get("quantity_assigned"),
+        })
    df = pd.DataFrame(rows)
+    if df.empty:
+        print("  Visits: 0 radku")
+        return df
+
+    # GROUP BY subject/actual/scheduled/irt_no/desc/medication
+    grouped = (
+        df.groupby(["Subject", "Visit Date", "Scheduled Date", "IRT Tx No", "Visit", "Medication"],
+                   dropna=False, as_index=False)
+        .agg(**{
+            "Med IDs": ("medication_id", lambda s: ", ".join(sorted([str(x) for x in s if pd.notna(x)]))),
+            "Qty":     ("quantity_assigned", "sum"),
+        })
+    )
+    grouped = grouped.sort_values(["Subject", "Visit Date"]).reset_index(drop=True)
    for col in ("Visit Date", "Scheduled Date"):
-        if col in df.columns:
-            df[col] = pd.to_datetime(df[col], errors="coerce")
-    if study == "77242113UCO3001" and "Visit" in df.columns:
-        df["Visit"] = df["Visit"].replace("Subject Number Creation", "Screening")
-    print(f"  Visits: {len(df)} řádků")
-    return df
+        if col in grouped.columns:
+            grouped[col] = pd.to_datetime(grouped[col], errors="coerce")
+    if study == "77242113UCO3001":
+        grouped["Visit"] = grouped["Visit"].replace("Subject Number Creation", "Screening")
+    print(f"  Visits: {len(grouped)} řádků")
+    return grouped


 # ── Odvozené sheety ───────────────────────────────────────────────────────────
@@ -343,49 +354,42 @@ def format_shipment_sheet(ws, header_color_ship, header_color_detail, n_ship_col

 # ── Pacienti ─────────────────────────────────────────────────────────────────

-PATIENT_TABLE = {
-    "77242113UCO3001": "iwrs_uco3001_subject_summary",
-    "42847922MDD3003": "iwrs_mdd3003_subject_summary",
-}
+def load_patients(study):
+    db = get_db()
+    docs = list(db.iwrs_subject_summary.find({"study": study}))
+    if not docs:
+        raise RuntimeError(f"Žádná data v Mongo pro pacienty {study}")

-
-def load_patients(cursor, study):
-    table = PATIENT_TABLE[study]
-    cursor.execute(f"SELECT MAX(import_id) AS mid FROM {table}")
-    mid = cursor.fetchone()["mid"]
-    if mid is None:
-        raise RuntimeError(f"Žádná data v MySQL pro pacienty {study}")
-    extra_cols = ""
+    base_cols = [
+        ("subject",                          "Subject"),
+        ("investigator",                     "Investigator"),
+        ("age",                              "Subject's age collection"),
+        ("cohort_per_irt",                   "Cohort per IRT"),
+        ("irt_subject_status",               "IRT Subject Status"),
+        ("last_irt_transaction",             "Last Recorded IRT Transaction"),
+        ("next_irt_transaction",             "Next Expected IRT Transaction"),
+        ("next_irt_transaction_date_local",  "Next Expected IRT Transaction Date [Local]"),
+    ]
+    uco_extra = [
+        ("rescreened_subject",               "Rescreened Subject"),
+        ("adt_ir",                           "ADT-IR"),
+        ("three_or_more_advanced_therapies", "3+ Adv. Therapies"),
+        ("only_oral_5asa_compounds",         "Only 5-ASA"),
+        ("ustekinumab",                      "Ustekinumab"),
+        ("isolated_proctitis",               "Isolated Proctitis"),
+    ]
+    cols = list(base_cols)
    if study == "77242113UCO3001":
-        extra_cols = """
-            rescreened_subject                   AS `Rescreened Subject`,
-            adt_ir                               AS `ADT-IR`,
-            three_or_more_advanced_therapies     AS `3+ Adv. Therapies`,
-            only_oral_5asa_compounds             AS `Only 5-ASA`,
-            ustekinumab                          AS `Ustekinumab`,
-            isolated_proctitis                   AS `Isolated Proctitis`,"""
-    sql = f"""
-        SELECT
-            subject                              AS `Subject`,
-            investigator                         AS `Investigator`,
-            age                                  AS `Subject's age collection`,
-            cohort_per_irt                       AS `Cohort per IRT`,{extra_cols}
-            irt_subject_status                   AS `IRT Subject Status`,
-            last_irt_transaction                 AS `Last Recorded IRT Transaction`,
-            next_irt_transaction                 AS `Next Expected IRT Transaction`,
-            next_irt_transaction_date_local      AS `Next Expected IRT Transaction Date [Local]`
-        FROM {table}
-        WHERE import_id = %s
-        ORDER BY subject
-    """
-    cursor.execute(sql, (mid,))
-    rows = cursor.fetchall()
-    df = pd.DataFrame(rows)
+        cols += uco_extra
+
+    rows = [{label: d.get(key) for key, label in cols} for d in docs]
+    df = pd.DataFrame(rows).sort_values("Subject").reset_index(drop=True)
+
    if "Next Expected IRT Transaction Date [Local]" in df.columns:
        df["Next Expected IRT Transaction Date [Local]"] = pd.to_datetime(
            df["Next Expected IRT Transaction Date [Local]"], errors="coerce"
        )
-    print(f"  Pacienti: {len(df)} subjektů (import_id={mid})")
+    print(f"  Pacienti: {len(df)} subjektů")
    return df


@@ -574,18 +578,11 @@ def create_study_report(study):

    output_file = OUTPUT_DIR / f"{today} {study} CZ IWRS overview v{version}.xlsx"

-    print(f"\n[{study}] Načítám z MySQL...")
-    conn   = get_conn()
-    cursor = conn.cursor(dictionary=True)
-    import_id = get_latest_import_id(cursor, study)
-    print(f"  import_id = {import_id}")
-
-    df           = load_inventory(cursor, study, import_id)
-    shipments_df = load_shipments(cursor, study, import_id)
-    df_patients  = load_patients(cursor, study)
-    visits_df    = load_visits(cursor, study, import_id)
-    cursor.close()
-    conn.close()
+    print(f"\n[{study}] Nacitam z MongoDB...")
+    df           = load_inventory(study)
+    shipments_df = load_shipments(study)
+    df_patients  = load_patients(study)
+    visits_df    = load_visits(study)

    expired_df, expired_sheet = build_expired(df)
    assigned_df               = build_assigned_not_dispensed(df)
@@ -0,0 +1,253 @@
+"""
+Import Drugs dat (shipments, shipment_items, inventory, destruction) z XLSX do MongoDB.
+
+Volá se z IWRS/Drugs/run_all.py po stažení reportů.
+"""
+
+import os
+import sys
+import re
+import glob
+
+import pandas as pd
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from common.mongo_writer import (
+    to_str, to_int, to_date,
+    ensure_indexes, log_import,
+    bulk_upsert_with_snapshot, bulk_upsert_only,
+)
+
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+# ── XLSX parsery (převzaté z run_all.py + úprava na Mongo dokumenty) ─────────
+
+def parse_shipments_report(study):
+    path = os.path.join(BASE_DIR, f"xls_shipments_{study}", f"shipments_report_{study}.xlsx")
+    if not os.path.exists(path):
+        print(f"  CHYBI: {path}")
+        return []
+    raw = pd.read_excel(path, header=None)
+    header_row = None
+    for i, row in raw.iterrows():
+        if "Shipment ID" in [str(v).strip() for v in row]:
+            header_row = i
+            break
+    if header_row is None:
+        return []
+    df = pd.read_excel(path, header=header_row).dropna(how="all")
+    df = df[df["Location"].astype(str).str.contains("Czech", na=False, case=False)]
+    col = df.columns.tolist()
+    rows = []
+    for _, r in df.iterrows():
+        sid = to_str(r["Shipment ID"])
+        if not sid:
+            continue
+        rows.append({
+            "_id":                  sid,
+            "shipment_id":          sid,
+            "study":                study,
+            "status":               to_str(r["IRT Shipment Status"]),
+            "type":                 to_str(r["Type"]),
+            "ship_from":            to_str(r["Shipment From"]),
+            "ship_to_site":         to_str(r["Ship To:"]),
+            "location":             to_str(r["Location"]),
+            "request_date":         to_date(r["Request Date"]),
+            "shipped_date":         to_date(r["Shipped Date"]),
+            "received_date":        to_date(r["Received Date"]) if "Received Date" in col else None,
+            "received_by":          to_str(r["Received by"]) if "Received by" in col else None,
+            "delivered_date_utc":   to_date(r["Delivered Date [UTC]"]) if "Delivered Date [UTC]" in col else None,
+            "delivery_recipient":   to_str(r["Delivery Recipient"]) if "Delivery Recipient" in col else None,
+            "delivery_details":     to_str(r["Delivery Details"]) if "Delivery Details" in col else None,
+            "cancelled_date":       to_date(r["Cancelled Date"]) if "Cancelled Date" in col else None,
+            "total_medication_ids": to_int(r["Total Medication IDs"]) if "Total Medication IDs" in col else None,
+            "tracking_no":          to_str(r["Tracking #"]) if "Tracking #" in col else None,
+            "shipping_category":    to_str(r["Shipping Category"]) if "Shipping Category" in col else None,
+            "expected_arrival":     to_date(r["Expected Arrival"]) if "Expected Arrival" in col else None,
+        })
+    return rows
+
+
+def parse_shipment_details(study):
+    detail_dir = os.path.join(BASE_DIR, f"xls_shipment_details_{study}")
+    files = sorted(glob.glob(os.path.join(detail_dir, "shipment_details_*.xlsx")))
+    rows = []
+    for path in files:
+        m = re.search(r"shipment_details_(.+)\.xlsx", os.path.basename(path))
+        shipment_id = m.group(1) if m else "UNKNOWN"
+        raw = pd.read_excel(path, header=None)
+        header_row = None
+        for i, row in raw.iterrows():
+            if "Medication ID" in [str(v).strip() for v in row]:
+                header_row = i
+                break
+        if header_row is None:
+            continue
+        df = pd.read_excel(path, header=header_row).dropna(how="all")
+        for _, r in df.iterrows():
+            med_desc = (to_str(r.get("Medication Description"))
+                        or to_str(r.get("Medication ID Description")))
+            med_type = (to_str(r.get("Medication type"))
+                        or to_str(r.get("Medication ID type")))
+            med_id = to_str(r.get("Medication ID"))
+            if not med_id:
+                continue
+            rows.append({
+                "_id":                       f"{shipment_id}:{med_id}",
+                "study":                     study,
+                "shipment_id":               shipment_id,
+                "destination_location":      to_str(r.get("Destination Location")),
+                "shipment_status":           to_str(r.get("IRT Shipment Status")),
+                "shipment_type":             to_str(r.get("Type")),
+                "destination_site":          to_str(r.get("Destination Site")),
+                "investigator":              to_str(r.get("Investigator")),
+                "medication_description":    med_desc,
+                "medication_type":           med_type,
+                "medication_id":             med_id,
+                "packaged_lot_no":           to_str(r.get("Packaged Lot number")),
+                "packaged_lot_description":  to_str(r.get("Packaged Lot description")),
+                "container_id":              to_str(r.get("Container ID")),
+                "quantity":                  to_int(r.get("Quantity of Medication IDs")),
+                "expiration_date":           to_date(r.get("Expiration Date")),
+                "item_status":               to_str(r.get("Status")),
+            })
+    # dedupe (poslední vyhrává)
+    by_id = {r["_id"]: r for r in rows}
+    return list(by_id.values())
+
+
+def parse_inventory(study):
+    inv_dir = os.path.join(BASE_DIR, f"xls_reports_{study}")
+    files = sorted(glob.glob(os.path.join(inv_dir, "onsite_inventory_detail_*.xlsx")))
+    rows = []
+    for path in files:
+        raw = pd.read_excel(path, header=None)
+        site = investigator = location = None
+        header_row = None
+        for i, row in raw.iterrows():
+            first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
+            if first.startswith("Site:"):
+                site = first.replace("Site:", "").strip()
+            elif first.startswith("Investigator:"):
+                investigator = first.replace("Investigator:", "").strip()
+            elif first.startswith("Location:"):
+                location = first.replace("Location:", "").strip()
+            if first in ("Medication", "Medication ID") and header_row is None:
+                header_row = i
+        if header_row is None:
+            continue
+        df = pd.read_excel(path, header=header_row).dropna(how="all")
+        df = df.rename(columns={df.columns[0]: "medication_id"})
+        for _, r in df.iterrows():
+            med_id = to_str(r["medication_id"])
+            if not med_id or not site:
+                continue
+            rows.append({
+                "_id":                      f"{site}:{med_id}",
+                "study":                    study,
+                "site":                     site,
+                "investigator":             investigator,
+                "location":                 location,
+                "medication_id":            med_id,
+                "packaged_lot_no":          to_str(r.get("Packaged Lot number")),
+                "original_expiration_date": to_date(r.get("Original Expiration Date when Packaged Lot was Added")),
+                "expiration_date":          to_date(r.get("Expiration date")),
+                "received_date":            to_date(r.get("Received Date")),
+                "receipt_user":             to_str(r.get("Shipment Receipt User")),
+                "subject_identifier":       to_str(r.get("Subject Identifier")),
+                "quantity_assigned":        to_int(r.get("Quantity Assigned")),
+                "irt_transaction":          to_str(r.get("IRT Transaction")),
+                "date_assigned":            to_date(r.get("Date Assigned")),
+                "assignment_user":          to_str(r.get("Assignment User")),
+                "dispensation_status":      to_str(r.get("Dispensation Status")),
+                "dispensing_date":          to_date(r.get("Dispensing date") or r.get("Dispensing Date")),
+                "quantity_dispensed":       to_int(r.get("Quantity Dispensed")),
+                "dispensing_user":          to_str(r.get("Dispensing User")),
+                "quantity_returned":        to_int(r.get("Quantity Returned")),
+                "date_returned":            to_date(r.get("Date Returned")),
+                "return_user":              to_str(r.get("Return User")),
+            })
+    by_id = {r["_id"]: r for r in rows}
+    return list(by_id.values())
+
+
+def parse_destruction_files(study):
+    dest_dir = os.path.join(BASE_DIR, f"xls_ip_destruction_{study}")
+    files = sorted(glob.glob(os.path.join(dest_dir, "ip_destruction_basket_*.xlsx")))
+    rows = []
+    for path in files:
+        raw = pd.read_excel(path, header=None)
+        meta = {}
+        header_row = None
+        for i, row in raw.iterrows():
+            first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
+            for key, attr in [
+                ("Investigator Name:", "investigator"),
+                ("Site ID:", "site_id"),
+                ("Location:", "location"),
+                ("Basket ID:", "basket_id"),
+                ("Drug Destruction Created Date:", "destruction_date"),
+            ]:
+                if first.startswith(key):
+                    meta[attr] = first.replace(key, "").strip()
+            if first == "Medication ID Description" and header_row is None:
+                header_row = i
+        if header_row is None:
+            continue
+        df = pd.read_excel(path, header=header_row).dropna(how="all")
+        basket_id = meta.get("basket_id")
+        for _, r in df.iterrows():
+            med_id = to_str(r.get("Medication ID"))
+            if not med_id or not basket_id:
+                continue
+            rows.append({
+                "_id":                       f"{basket_id}:{med_id}",
+                "study":                     study,
+                "site_id":                   meta.get("site_id"),
+                "investigator":              meta.get("investigator"),
+                "location":                  meta.get("location"),
+                "basket_id":                 basket_id,
+                "destruction_date":          to_date(meta.get("destruction_date")),
+                "medication_description":    to_str(r.get("Medication ID Description")),
+                "medication_id":             med_id,
+                "packaged_lot_description":  to_str(r.get("Packaged Lot description")),
+                "comments":                  to_str(r.get("Comments")),
+            })
+    by_id = {r["_id"]: r for r in rows}
+    return list(by_id.values())
+
+
+# ── hlavní import ────────────────────────────────────────────────────────────
+
+def import_study(study):
+    print(f"\n  [{study}] parsovani XLSX...")
+    shipments = parse_shipments_report(study)
+    items     = parse_shipment_details(study)
+    inventory = parse_inventory(study)
+    destruct  = parse_destruction_files(study)
+    print(f"  Zasilky: {len(shipments)} | Polozky: {len(items)} | Sklad: {len(inventory)} | Destrukce: {len(destruct)}")
+
+    import_id = log_import(study, f"drugs_{study}", "drugs", {
+        "shipments": len(shipments),
+        "shipment_items": len(items),
+        "inventory": len(inventory),
+        "destruction": len(destruct),
+    })
+    print(f"  import_id = {import_id}")
+
+    bulk_upsert_with_snapshot("iwrs_shipments", "iwrs_shipments_snapshots", shipments, import_id)
+    bulk_upsert_with_snapshot("iwrs_shipment_items", "iwrs_shipment_items_snapshots", items, import_id)
+    bulk_upsert_with_snapshot("iwrs_inventory", "iwrs_inventory_snapshots", inventory, import_id)
+    bulk_upsert_only("iwrs_destruction", destruct, import_id)
+
+
+def run(studies):
+    ensure_indexes()
+    for s in studies:
+        import_study(s)
+
+
+if __name__ == "__main__":
+    studies = sys.argv[1:] if len(sys.argv) > 1 else ["77242113UCO3001", "42847922MDD3003"]
+    run(studies)