Migrate IWRS from MySQL to MongoDB

- Add IWRS/common/mongo_writer.py with shared connection, indexes, upsert+snapshot helpers
- Add IWRS/Patients/import_to_mongo.py (subject_summary + visits)
- Add IWRS/Patients/import_notifications_to_mongo.py: parse PDF/JSON directly to Mongo (incl. PDF as BinData), replaces 2-step MySQL flow
- Add IWRS/Drugs/import_to_mongo.py (shipments, items, inventory, destruction)
- Add IWRS/backfill_mysql_to_mongo.py: one-shot history backfill
- Switch IWRS/Patients/run_all.py and IWRS/Drugs/run_all.py to Mongo
- Rewrite IWRS/Drugs/create_report.py data loaders to read from Mongo
- 8 main collections (upsert = latest state) + 5 snapshot collections (append-only with import_id) under studie database; notifications and destruction are immutable and need no snapshots

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-03 07:24:36 +02:00
parent 681095d557
commit ea9d611719
2080 changed files with 9465 additions and 172 deletions
@@ -0,0 +1,253 @@
+"""
+Import Drugs dat (shipments, shipment_items, inventory, destruction) z XLSX do MongoDB.
+
+Volá se z IWRS/Drugs/run_all.py po stažení reportů.
+"""
+
+import os
+import sys
+import re
+import glob
+
+import pandas as pd
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from common.mongo_writer import (
+    to_str, to_int, to_date,
+    ensure_indexes, log_import,
+    bulk_upsert_with_snapshot, bulk_upsert_only,
+)
+
+# Directory containing this module; the per-study xls_* folders read by the
+# parsers below are resolved relative to it.
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+# ── XLSX parsery (převzaté z run_all.py + úprava na Mongo dokumenty) ─────────
+
+def parse_shipments_report(study):
+    path = os.path.join(BASE_DIR, f"xls_shipments_{study}", f"shipments_report_{study}.xlsx")
+    if not os.path.exists(path):
+        print(f"  CHYBI: {path}")
+        return []
+    raw = pd.read_excel(path, header=None)
+    header_row = None
+    for i, row in raw.iterrows():
+        if "Shipment ID" in [str(v).strip() for v in row]:
+            header_row = i
+            break
+    if header_row is None:
+        return []
+    df = pd.read_excel(path, header=header_row).dropna(how="all")
+    df = df[df["Location"].astype(str).str.contains("Czech", na=False, case=False)]
+    col = df.columns.tolist()
+    rows = []
+    for _, r in df.iterrows():
+        sid = to_str(r["Shipment ID"])
+        if not sid:
+            continue
+        rows.append({
+            "_id":                  sid,
+            "shipment_id":          sid,
+            "study":                study,
+            "status":               to_str(r["IRT Shipment Status"]),
+            "type":                 to_str(r["Type"]),
+            "ship_from":            to_str(r["Shipment From"]),
+            "ship_to_site":         to_str(r["Ship To:"]),
+            "location":             to_str(r["Location"]),
+            "request_date":         to_date(r["Request Date"]),
+            "shipped_date":         to_date(r["Shipped Date"]),
+            "received_date":        to_date(r["Received Date"]) if "Received Date" in col else None,
+            "received_by":          to_str(r["Received by"]) if "Received by" in col else None,
+            "delivered_date_utc":   to_date(r["Delivered Date [UTC]"]) if "Delivered Date [UTC]" in col else None,
+            "delivery_recipient":   to_str(r["Delivery Recipient"]) if "Delivery Recipient" in col else None,
+            "delivery_details":     to_str(r["Delivery Details"]) if "Delivery Details" in col else None,
+            "cancelled_date":       to_date(r["Cancelled Date"]) if "Cancelled Date" in col else None,
+            "total_medication_ids": to_int(r["Total Medication IDs"]) if "Total Medication IDs" in col else None,
+            "tracking_no":          to_str(r["Tracking #"]) if "Tracking #" in col else None,
+            "shipping_category":    to_str(r["Shipping Category"]) if "Shipping Category" in col else None,
+            "expected_arrival":     to_date(r["Expected Arrival"]) if "Expected Arrival" in col else None,
+        })
+    return rows
+
+
+def parse_shipment_details(study):
+    detail_dir = os.path.join(BASE_DIR, f"xls_shipment_details_{study}")
+    files = sorted(glob.glob(os.path.join(detail_dir, "shipment_details_*.xlsx")))
+    rows = []
+    for path in files:
+        m = re.search(r"shipment_details_(.+)\.xlsx", os.path.basename(path))
+        shipment_id = m.group(1) if m else "UNKNOWN"
+        raw = pd.read_excel(path, header=None)
+        header_row = None
+        for i, row in raw.iterrows():
+            if "Medication ID" in [str(v).strip() for v in row]:
+                header_row = i
+                break
+        if header_row is None:
+            continue
+        df = pd.read_excel(path, header=header_row).dropna(how="all")
+        for _, r in df.iterrows():
+            med_desc = (to_str(r.get("Medication Description"))
+                        or to_str(r.get("Medication ID Description")))
+            med_type = (to_str(r.get("Medication type"))
+                        or to_str(r.get("Medication ID type")))
+            med_id = to_str(r.get("Medication ID"))
+            if not med_id:
+                continue
+            rows.append({
+                "_id":                       f"{shipment_id}:{med_id}",
+                "study":                     study,
+                "shipment_id":               shipment_id,
+                "destination_location":      to_str(r.get("Destination Location")),
+                "shipment_status":           to_str(r.get("IRT Shipment Status")),
+                "shipment_type":             to_str(r.get("Type")),
+                "destination_site":          to_str(r.get("Destination Site")),
+                "investigator":              to_str(r.get("Investigator")),
+                "medication_description":    med_desc,
+                "medication_type":           med_type,
+                "medication_id":             med_id,
+                "packaged_lot_no":           to_str(r.get("Packaged Lot number")),
+                "packaged_lot_description":  to_str(r.get("Packaged Lot description")),
+                "container_id":              to_str(r.get("Container ID")),
+                "quantity":                  to_int(r.get("Quantity of Medication IDs")),
+                "expiration_date":           to_date(r.get("Expiration Date")),
+                "item_status":               to_str(r.get("Status")),
+            })
+    # dedupe (poslední vyhrává)
+    by_id = {r["_id"]: r for r in rows}
+    return list(by_id.values())
+
+
+def parse_inventory(study):
+    inv_dir = os.path.join(BASE_DIR, f"xls_reports_{study}")
+    files = sorted(glob.glob(os.path.join(inv_dir, "onsite_inventory_detail_*.xlsx")))
+    rows = []
+    for path in files:
+        raw = pd.read_excel(path, header=None)
+        site = investigator = location = None
+        header_row = None
+        for i, row in raw.iterrows():
+            first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
+            if first.startswith("Site:"):
+                site = first.replace("Site:", "").strip()
+            elif first.startswith("Investigator:"):
+                investigator = first.replace("Investigator:", "").strip()
+            elif first.startswith("Location:"):
+                location = first.replace("Location:", "").strip()
+            if first in ("Medication", "Medication ID") and header_row is None:
+                header_row = i
+        if header_row is None:
+            continue
+        df = pd.read_excel(path, header=header_row).dropna(how="all")
+        df = df.rename(columns={df.columns[0]: "medication_id"})
+        for _, r in df.iterrows():
+            med_id = to_str(r["medication_id"])
+            if not med_id or not site:
+                continue
+            rows.append({
+                "_id":                      f"{site}:{med_id}",
+                "study":                    study,
+                "site":                     site,
+                "investigator":             investigator,
+                "location":                 location,
+                "medication_id":            med_id,
+                "packaged_lot_no":          to_str(r.get("Packaged Lot number")),
+                "original_expiration_date": to_date(r.get("Original Expiration Date when Packaged Lot was Added")),
+                "expiration_date":          to_date(r.get("Expiration date")),
+                "received_date":            to_date(r.get("Received Date")),
+                "receipt_user":             to_str(r.get("Shipment Receipt User")),
+                "subject_identifier":       to_str(r.get("Subject Identifier")),
+                "quantity_assigned":        to_int(r.get("Quantity Assigned")),
+                "irt_transaction":          to_str(r.get("IRT Transaction")),
+                "date_assigned":            to_date(r.get("Date Assigned")),
+                "assignment_user":          to_str(r.get("Assignment User")),
+                "dispensation_status":      to_str(r.get("Dispensation Status")),
+                "dispensing_date":          to_date(r.get("Dispensing date") or r.get("Dispensing Date")),
+                "quantity_dispensed":       to_int(r.get("Quantity Dispensed")),
+                "dispensing_user":          to_str(r.get("Dispensing User")),
+                "quantity_returned":        to_int(r.get("Quantity Returned")),
+                "date_returned":            to_date(r.get("Date Returned")),
+                "return_user":              to_str(r.get("Return User")),
+            })
+    by_id = {r["_id"]: r for r in rows}
+    return list(by_id.values())
+
+
+def parse_destruction_files(study):
+    dest_dir = os.path.join(BASE_DIR, f"xls_ip_destruction_{study}")
+    files = sorted(glob.glob(os.path.join(dest_dir, "ip_destruction_basket_*.xlsx")))
+    rows = []
+    for path in files:
+        raw = pd.read_excel(path, header=None)
+        meta = {}
+        header_row = None
+        for i, row in raw.iterrows():
+            first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
+            for key, attr in [
+                ("Investigator Name:", "investigator"),
+                ("Site ID:", "site_id"),
+                ("Location:", "location"),
+                ("Basket ID:", "basket_id"),
+                ("Drug Destruction Created Date:", "destruction_date"),
+            ]:
+                if first.startswith(key):
+                    meta[attr] = first.replace(key, "").strip()
+            if first == "Medication ID Description" and header_row is None:
+                header_row = i
+        if header_row is None:
+            continue
+        df = pd.read_excel(path, header=header_row).dropna(how="all")
+        basket_id = meta.get("basket_id")
+        for _, r in df.iterrows():
+            med_id = to_str(r.get("Medication ID"))
+            if not med_id or not basket_id:
+                continue
+            rows.append({
+                "_id":                       f"{basket_id}:{med_id}",
+                "study":                     study,
+                "site_id":                   meta.get("site_id"),
+                "investigator":              meta.get("investigator"),
+                "location":                  meta.get("location"),
+                "basket_id":                 basket_id,
+                "destruction_date":          to_date(meta.get("destruction_date")),
+                "medication_description":    to_str(r.get("Medication ID Description")),
+                "medication_id":             med_id,
+                "packaged_lot_description":  to_str(r.get("Packaged Lot description")),
+                "comments":                  to_str(r.get("Comments")),
+            })
+    by_id = {r["_id"]: r for r in rows}
+    return list(by_id.values())
+
+
+# ── hlavní import ────────────────────────────────────────────────────────────
+
+def import_study(study):
+    print(f"\n  [{study}] parsovani XLSX...")
+    shipments = parse_shipments_report(study)
+    items     = parse_shipment_details(study)
+    inventory = parse_inventory(study)
+    destruct  = parse_destruction_files(study)
+    print(f"  Zasilky: {len(shipments)} | Polozky: {len(items)} | Sklad: {len(inventory)} | Destrukce: {len(destruct)}")
+
+    import_id = log_import(study, f"drugs_{study}", "drugs", {
+        "shipments": len(shipments),
+        "shipment_items": len(items),
+        "inventory": len(inventory),
+        "destruction": len(destruct),
+    })
+    print(f"  import_id = {import_id}")
+
+    bulk_upsert_with_snapshot("iwrs_shipments", "iwrs_shipments_snapshots", shipments, import_id)
+    bulk_upsert_with_snapshot("iwrs_shipment_items", "iwrs_shipment_items_snapshots", items, import_id)
+    bulk_upsert_with_snapshot("iwrs_inventory", "iwrs_inventory_snapshots", inventory, import_id)
+    bulk_upsert_only("iwrs_destruction", destruct, import_id)
+
+
+def run(studies):
+    ensure_indexes()
+    for s in studies:
+        import_study(s)
+
+
+if __name__ == "__main__":
+    studies = sys.argv[1:] if len(sys.argv) > 1 else ["77242113UCO3001", "42847922MDD3003"]
+    run(studies)