# janssen/IWRS/Drugs/import_to_mongo.py

"""
Import Drugs dat (shipments, shipment_items, inventory, destruction) z XLSX do MongoDB.

Volá se z IWRS/Drugs/run_all.py po stažení reportů.
"""

import os
import sys
import re
import glob

import pandas as pd

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from common.mongo_writer import (
    to_str, to_int, to_date,
    ensure_indexes, log_import,
    bulk_upsert_with_snapshot, bulk_upsert_only,
)

BASE_DIR = os.path.dirname(os.path.abspath(__file__))


# ── XLSX parsers (taken over from run_all.py + adapted to Mongo documents) ───

def parse_shipments_report(study):
    """Build shipment documents from the shipments report workbook of *study*.

    The workbook is looked up at
    ``BASE_DIR/xls_shipments_<study>/shipments_report_<study>.xlsx``. The
    first row containing a "Shipment ID" cell is used as the header, and only
    rows whose "Location" contains "Czech" (case-insensitive) are kept.

    Returns a list of dicts whose ``_id`` is the shipment ID. The list is
    empty when the file does not exist (a message is printed) or when no
    header row is found. Fields backed by an optional column are ``None``
    when that column is absent from the sheet.
    """
    report_path = os.path.join(
        BASE_DIR, f"xls_shipments_{study}", f"shipments_report_{study}.xlsx"
    )
    if not os.path.exists(report_path):
        print(f"  CHYBI: {report_path}")
        return []

    # Scan the sheet without a header to find the row holding the column names.
    preview = pd.read_excel(report_path, header=None)
    header_idx = next(
        (
            idx
            for idx, cells in preview.iterrows()
            if "Shipment ID" in [str(cell).strip() for cell in cells]
        ),
        None,
    )
    if header_idx is None:
        return []

    table = pd.read_excel(report_path, header=header_idx).dropna(how="all")
    czech_only = table["Location"].astype(str).str.contains("Czech", na=False, case=False)
    table = table[czech_only]
    available = table.columns.tolist()

    # (document key, sheet column, converter) for columns that may be missing.
    optional_fields = (
        ("received_date", "Received Date", to_date),
        ("received_by", "Received by", to_str),
        ("delivered_date_utc", "Delivered Date [UTC]", to_date),
        ("delivery_recipient", "Delivery Recipient", to_str),
        ("delivery_details", "Delivery Details", to_str),
        ("cancelled_date", "Cancelled Date", to_date),
        ("total_medication_ids", "Total Medication IDs", to_int),
        ("tracking_no", "Tracking #", to_str),
        ("shipping_category", "Shipping Category", to_str),
        ("expected_arrival", "Expected Arrival", to_date),
    )

    documents = []
    for _, record in table.iterrows():
        shipment_id = to_str(record["Shipment ID"])
        if not shipment_id:
            # Rows without a shipment ID cannot be keyed; skip them.
            continue
        document = {
            "_id": shipment_id,
            "shipment_id": shipment_id,
            "study": study,
            "status": to_str(record["IRT Shipment Status"]),
            "type": to_str(record["Type"]),
            "ship_from": to_str(record["Shipment From"]),
            "ship_to_site": to_str(record["Ship To:"]),
            "location": to_str(record["Location"]),
            "request_date": to_date(record["Request Date"]),
            "shipped_date": to_date(record["Shipped Date"]),
        }
        for key, column, convert in optional_fields:
            document[key] = convert(record[column]) if column in available else None
        documents.append(document)
    return documents


def parse_shipment_details(study):
    """Build shipment-item documents from the per-shipment detail workbooks.

    Every ``shipment_details_*.xlsx`` file in
    ``BASE_DIR/xls_shipment_details_<study>`` is read in sorted order. The
    shipment ID is taken from the file name ("UNKNOWN" if the name does not
    match), and the first row containing a "Medication ID" cell is used as the
    header. Files without such a row and rows without a medication ID are
    skipped.

    Returns a list of dicts keyed by ``"<shipment_id>:<medication_id>"``;
    when the same key occurs more than once, the last occurrence wins.
    """
    pattern = os.path.join(
        BASE_DIR, f"xls_shipment_details_{study}", "shipment_details_*.xlsx"
    )
    # Keyed by _id so that a later row replaces an earlier one (last wins).
    documents = {}
    for workbook in sorted(glob.glob(pattern)):
        name_match = re.search(r"shipment_details_(.+)\.xlsx", os.path.basename(workbook))
        shipment_id = "UNKNOWN" if name_match is None else name_match.group(1)

        # Scan the sheet without a header to find the row holding the column names.
        preview = pd.read_excel(workbook, header=None)
        header_idx = next(
            (
                idx
                for idx, cells in preview.iterrows()
                if "Medication ID" in [str(cell).strip() for cell in cells]
            ),
            None,
        )
        if header_idx is None:
            continue

        table = pd.read_excel(workbook, header=header_idx).dropna(how="all")
        for _, record in table.iterrows():
            # Two column spellings are accepted for description and type.
            description = (
                to_str(record.get("Medication Description"))
                or to_str(record.get("Medication ID Description"))
            )
            kind = (
                to_str(record.get("Medication type"))
                or to_str(record.get("Medication ID type"))
            )
            medication_id = to_str(record.get("Medication ID"))
            if not medication_id:
                continue
            doc_id = f"{shipment_id}:{medication_id}"
            documents[doc_id] = {
                "_id": doc_id,
                "study": study,
                "shipment_id": shipment_id,
                "destination_location": to_str(record.get("Destination Location")),
                "shipment_status": to_str(record.get("IRT Shipment Status")),
                "shipment_type": to_str(record.get("Type")),
                "destination_site": to_str(record.get("Destination Site")),
                "investigator": to_str(record.get("Investigator")),
                "medication_description": description,
                "medication_type": kind,
                "medication_id": medication_id,
                "packaged_lot_no": to_str(record.get("Packaged Lot number")),
                "packaged_lot_description": to_str(record.get("Packaged Lot description")),
                "container_id": to_str(record.get("Container ID")),
                "quantity": to_int(record.get("Quantity of Medication IDs")),
                "expiration_date": to_date(record.get("Expiration Date")),
                "item_status": to_str(record.get("Status")),
            }
    return list(documents.values())


def parse_inventory(study):
    """Build on-site inventory documents from the inventory detail workbooks.

    Every ``onsite_inventory_detail_*.xlsx`` file in
    ``BASE_DIR/xls_reports_<study>`` is read in sorted order. First-column
    cells starting with "Site:", "Investigator:" or "Location:" supply the
    file-level metadata (the last such cell in the file wins), and the first
    row whose first cell is "Medication" or "Medication ID" is the header.
    Files without a header row, and rows without a medication ID or without a
    site, are skipped.

    Returns a list of dicts keyed by ``"<site>:<medication_id>"``; when the
    same key occurs more than once, the last occurrence wins.
    """
    pattern = os.path.join(
        BASE_DIR, f"xls_reports_{study}", "onsite_inventory_detail_*.xlsx"
    )
    # Keyed by _id so that a later row replaces an earlier one (last wins).
    documents = {}
    for workbook in sorted(glob.glob(pattern)):
        preview = pd.read_excel(workbook, header=None)
        labelled = dict.fromkeys(("Site:", "Investigator:", "Location:"))
        header_idx = None
        # The whole sheet is scanned: metadata cells are honoured wherever
        # they appear, while only the first header candidate is kept.
        for idx, cells in preview.iterrows():
            lead = cells.iloc[0]
            label = "" if pd.isna(lead) else str(lead).strip()
            for prefix in labelled:
                if label.startswith(prefix):
                    labelled[prefix] = label.replace(prefix, "").strip()
                    break
            if header_idx is None and label in ("Medication", "Medication ID"):
                header_idx = idx
        if header_idx is None:
            continue

        site = labelled["Site:"]
        investigator = labelled["Investigator:"]
        location = labelled["Location:"]

        table = pd.read_excel(workbook, header=header_idx).dropna(how="all")
        # The first column carries the medication ID regardless of its title.
        table = table.rename(columns={table.columns[0]: "medication_id"})
        for _, record in table.iterrows():
            medication_id = to_str(record["medication_id"])
            if not (medication_id and site):
                continue
            doc_id = f"{site}:{medication_id}"
            documents[doc_id] = {
                "_id": doc_id,
                "study": study,
                "site": site,
                "investigator": investigator,
                "location": location,
                "medication_id": medication_id,
                "packaged_lot_no": to_str(record.get("Packaged Lot number")),
                "original_expiration_date": to_date(
                    record.get("Original Expiration Date when Packaged Lot was Added")
                ),
                "expiration_date": to_date(record.get("Expiration date")),
                "received_date": to_date(record.get("Received Date")),
                "receipt_user": to_str(record.get("Shipment Receipt User")),
                "subject_identifier": to_str(record.get("Subject Identifier")),
                "quantity_assigned": to_int(record.get("Quantity Assigned")),
                "irt_transaction": to_str(record.get("IRT Transaction")),
                "date_assigned": to_date(record.get("Date Assigned")),
                "assignment_user": to_str(record.get("Assignment User")),
                "dispensation_status": to_str(record.get("Dispensation Status")),
                # Two capitalisations of the column title are accepted.
                "dispensing_date": to_date(
                    record.get("Dispensing date") or record.get("Dispensing Date")
                ),
                "quantity_dispensed": to_int(record.get("Quantity Dispensed")),
                "dispensing_user": to_str(record.get("Dispensing User")),
                "quantity_returned": to_int(record.get("Quantity Returned")),
                "date_returned": to_date(record.get("Date Returned")),
                "return_user": to_str(record.get("Return User")),
            }
    return list(documents.values())


def parse_destruction_files(study):
    """Build destruction documents from the IP destruction basket workbooks.

    Every ``ip_destruction_basket_*.xlsx`` file in
    ``BASE_DIR/xls_ip_destruction_<study>`` is read in sorted order.
    First-column cells starting with one of the known labels (investigator
    name, site ID, location, basket ID, destruction created date) supply the
    file-level metadata, and the first row whose first cell is
    "Medication ID Description" is the header. Files without a header row,
    and rows without a medication ID or without a basket ID, are skipped.

    Returns a list of dicts keyed by ``"<basket_id>:<medication_id>"``; when
    the same key occurs more than once, the last occurrence wins.
    """
    # Label prefix in the sheet -> metadata key in the resulting document.
    meta_labels = {
        "Investigator Name:": "investigator",
        "Site ID:": "site_id",
        "Location:": "location",
        "Basket ID:": "basket_id",
        "Drug Destruction Created Date:": "destruction_date",
    }
    pattern = os.path.join(
        BASE_DIR, f"xls_ip_destruction_{study}", "ip_destruction_basket_*.xlsx"
    )
    # Keyed by _id so that a later row replaces an earlier one (last wins).
    documents = {}
    for workbook in sorted(glob.glob(pattern)):
        preview = pd.read_excel(workbook, header=None)
        meta = {}
        header_idx = None
        for idx, cells in preview.iterrows():
            lead = cells.iloc[0]
            label = "" if pd.isna(lead) else str(lead).strip()
            for prefix, attr in meta_labels.items():
                if label.startswith(prefix):
                    meta[attr] = label.replace(prefix, "").strip()
            if header_idx is None and label == "Medication ID Description":
                header_idx = idx
        if header_idx is None:
            continue

        table = pd.read_excel(workbook, header=header_idx).dropna(how="all")
        basket_id = meta.get("basket_id")
        for _, record in table.iterrows():
            medication_id = to_str(record.get("Medication ID"))
            if not (medication_id and basket_id):
                continue
            doc_id = f"{basket_id}:{medication_id}"
            documents[doc_id] = {
                "_id": doc_id,
                "study": study,
                "site_id": meta.get("site_id"),
                "investigator": meta.get("investigator"),
                "location": meta.get("location"),
                "basket_id": basket_id,
                "destruction_date": to_date(meta.get("destruction_date")),
                "medication_description": to_str(record.get("Medication ID Description")),
                "medication_id": medication_id,
                "packaged_lot_description": to_str(record.get("Packaged Lot description")),
                "comments": to_str(record.get("Comments")),
            }
    return list(documents.values())


# ── main import ──────────────────────────────────────────────────────────────

def import_study(study):
    """Parse all Drugs workbooks of *study* and write them to MongoDB.

    Runs the four parsers, prints the document counts, registers the import
    through ``log_import`` (the returned import ID is printed), and then hands
    each document list to the bulk upsert helpers: shipments, shipment items
    and inventory go through ``bulk_upsert_with_snapshot``, destruction
    records through ``bulk_upsert_only``.
    """
    print(f"\n  [{study}] parsovani XLSX...")
    shipments = parse_shipments_report(study)
    items = parse_shipment_details(study)
    inventory = parse_inventory(study)
    destruct = parse_destruction_files(study)
    print(f"  Zasilky: {len(shipments)} | Polozky: {len(items)} | Sklad: {len(inventory)} | Destrukce: {len(destruct)}")

    counts = {
        "shipments": len(shipments),
        "shipment_items": len(items),
        "inventory": len(inventory),
        "destruction": len(destruct),
    }
    import_id = log_import(study, f"drugs_{study}", "drugs", counts)
    print(f"  import_id = {import_id}")

    # (target collection, snapshot collection, documents)
    snapshot_targets = (
        ("iwrs_shipments", "iwrs_shipments_snapshots", shipments),
        ("iwrs_shipment_items", "iwrs_shipment_items_snapshots", items),
        ("iwrs_inventory", "iwrs_inventory_snapshots", inventory),
    )
    for collection, snapshot_collection, documents in snapshot_targets:
        bulk_upsert_with_snapshot(collection, snapshot_collection, documents, import_id)
    bulk_upsert_only("iwrs_destruction", destruct, import_id)


def run(studies):
    """Call ``ensure_indexes`` once, then import each study in *studies* in order."""
    ensure_indexes()
    for study in studies:
        import_study(study)


if __name__ == "__main__":
    # Study codes come from the command line; with no arguments the two
    # default studies below are imported.
    studies = sys.argv[1:] or ["77242113UCO3001", "42847922MDD3003"]
    run(studies)