Files
janssen/IWRS/Drugs/import_to_mongo.py
T
administrator ea9d611719 Migrate IWRS from MySQL to MongoDB
- Add IWRS/common/mongo_writer.py with shared connection, indexes,
  upsert+snapshot helpers
- Add IWRS/Patients/import_to_mongo.py (subject_summary + visits)
- Add IWRS/Patients/import_notifications_to_mongo.py: parse PDF/JSON
  directly to Mongo (incl. PDF as BinData), replaces 2-step MySQL flow
- Add IWRS/Drugs/import_to_mongo.py (shipments, items, inventory,
  destruction)
- Add IWRS/backfill_mysql_to_mongo.py: one-shot history backfill
- Switch IWRS/Patients/run_all.py and IWRS/Drugs/run_all.py to Mongo
- Rewrite IWRS/Drugs/create_report.py data loaders to read from Mongo
- 8 main collections (upsert = latest state) + 5 snapshot collections
  (append-only with import_id) under studie database; notifications and
  destruction are immutable and need no snapshots

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-03 07:24:36 +02:00

254 lines
12 KiB
Python

"""
Import Drugs dat (shipments, shipment_items, inventory, destruction) z XLSX do MongoDB.
Volá se z IWRS/Drugs/run_all.py po stažení reportů.
"""
import os
import sys
import re
import glob
import pandas as pd
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from common.mongo_writer import (
to_str, to_int, to_date,
ensure_indexes, log_import,
bulk_upsert_with_snapshot, bulk_upsert_only,
)
# Directory containing this script; the per-study XLSX folders are resolved beneath it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# ── XLSX parsers (taken over from run_all.py + adapted to produce Mongo documents) ─────────
def parse_shipments_report(study):
    """Parse the shipments overview report of one study into Mongo documents.

    Reads ``xls_shipments_<study>/shipments_report_<study>.xlsx`` below
    ``BASE_DIR``. The header row is the first row containing a cell whose
    stripped text is ``Shipment ID``; only rows whose ``Location`` contains
    "Czech" (case-insensitive) are kept.

    Args:
        study: Study identifier used in the folder and file names.

    Returns:
        A list with one dict per shipment, using the shipment ID as ``_id``.
        The list is empty when the file is missing (a message is printed) or
        when no header row is found.

    Raises:
        KeyError: If the header row was found but a mandatory column
            (``Location``, ``IRT Shipment Status``, ...) is absent.
    """
    path = os.path.join(BASE_DIR, f"xls_shipments_{study}", f"shipments_report_{study}.xlsx")
    if not os.path.exists(path):
        print(f" CHYBI: {path}")
        return []

    # Load the workbook once and parse it twice instead of calling
    # pd.read_excel twice on the same path.
    with pd.ExcelFile(path) as workbook:
        raw = workbook.parse(header=None)
        header_row = None
        for i, row in raw.iterrows():
            if "Shipment ID" in [str(v).strip() for v in row]:
                header_row = i
                break
        if header_row is None:
            return []
        df = workbook.parse(header=header_row).dropna(how="all")

    # The header row is detected by *stripped* cell text, so strip the column
    # labels as well; otherwise a padded header cell passes detection and then
    # fails (or is silently ignored) on lookup.
    df = df.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)
    df = df[df["Location"].astype(str).str.contains("Czech", na=False, case=False)]
    present = set(df.columns)

    def optional(row, column, convert):
        """Convert the cell of an optional column, or return None if the column is absent."""
        return convert(row[column]) if column in present else None

    rows = []
    for _, r in df.iterrows():
        sid = to_str(r["Shipment ID"])
        if not sid:
            # Rows without a shipment ID cannot be keyed; skip them.
            continue
        rows.append({
            "_id": sid,
            "shipment_id": sid,
            "study": study,
            "status": to_str(r["IRT Shipment Status"]),
            "type": to_str(r["Type"]),
            "ship_from": to_str(r["Shipment From"]),
            "ship_to_site": to_str(r["Ship To:"]),
            "location": to_str(r["Location"]),
            "request_date": to_date(r["Request Date"]),
            "shipped_date": to_date(r["Shipped Date"]),
            "received_date": optional(r, "Received Date", to_date),
            "received_by": optional(r, "Received by", to_str),
            "delivered_date_utc": optional(r, "Delivered Date [UTC]", to_date),
            "delivery_recipient": optional(r, "Delivery Recipient", to_str),
            "delivery_details": optional(r, "Delivery Details", to_str),
            "cancelled_date": optional(r, "Cancelled Date", to_date),
            "total_medication_ids": optional(r, "Total Medication IDs", to_int),
            "tracking_no": optional(r, "Tracking #", to_str),
            "shipping_category": optional(r, "Shipping Category", to_str),
            "expected_arrival": optional(r, "Expected Arrival", to_date),
        })
    return rows
def parse_shipment_details(study):
    """Parse all per-shipment detail workbooks of one study into Mongo documents.

    Processes every ``shipment_details_*.xlsx`` file in
    ``xls_shipment_details_<study>`` below ``BASE_DIR`` in sorted name order.
    The shipment ID is taken from the file name; the header row is the first
    row containing a cell whose stripped text is ``Medication ID``. Files
    without such a row are skipped.

    Args:
        study: Study identifier used in the folder name.

    Returns:
        A list of dicts, one per ``<shipment_id>:<medication_id>`` key. When
        the same key occurs more than once, the last occurrence wins.
    """
    detail_dir = os.path.join(BASE_DIR, f"xls_shipment_details_{study}")
    files = sorted(glob.glob(os.path.join(detail_dir, "shipment_details_*.xlsx")))

    # Keyed by _id so that a later row replaces an earlier one (last wins)
    # while the key keeps its first-seen position.
    by_id = {}
    for path in files:
        m = re.search(r"shipment_details_(.+)\.xlsx", os.path.basename(path))
        shipment_id = m.group(1) if m else "UNKNOWN"

        # Load the workbook once and parse it twice instead of calling
        # pd.read_excel twice on the same path.
        with pd.ExcelFile(path) as workbook:
            raw = workbook.parse(header=None)
            header_row = None
            for i, row in raw.iterrows():
                if "Medication ID" in [str(v).strip() for v in row]:
                    header_row = i
                    break
            if header_row is None:
                continue
            df = workbook.parse(header=header_row).dropna(how="all")

        # The header row is detected by *stripped* cell text, so strip the
        # column labels too; otherwise every r.get(...) below would silently
        # return None for a padded header cell.
        df = df.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)

        for _, r in df.iterrows():
            med_id = to_str(r.get("Medication ID"))
            if not med_id:
                continue
            # Two column namings are accepted for description and type.
            med_desc = (to_str(r.get("Medication Description"))
                        or to_str(r.get("Medication ID Description")))
            med_type = (to_str(r.get("Medication type"))
                        or to_str(r.get("Medication ID type")))
            doc = {
                "_id": f"{shipment_id}:{med_id}",
                "study": study,
                "shipment_id": shipment_id,
                "destination_location": to_str(r.get("Destination Location")),
                "shipment_status": to_str(r.get("IRT Shipment Status")),
                "shipment_type": to_str(r.get("Type")),
                "destination_site": to_str(r.get("Destination Site")),
                "investigator": to_str(r.get("Investigator")),
                "medication_description": med_desc,
                "medication_type": med_type,
                "medication_id": med_id,
                "packaged_lot_no": to_str(r.get("Packaged Lot number")),
                "packaged_lot_description": to_str(r.get("Packaged Lot description")),
                "container_id": to_str(r.get("Container ID")),
                "quantity": to_int(r.get("Quantity of Medication IDs")),
                "expiration_date": to_date(r.get("Expiration Date")),
                "item_status": to_str(r.get("Status")),
            }
            by_id[doc["_id"]] = doc
    return list(by_id.values())
def parse_inventory(study):
    """Parse the on-site inventory detail workbooks of one study into Mongo documents.

    Processes every ``onsite_inventory_detail_*.xlsx`` file in
    ``xls_reports_<study>`` below ``BASE_DIR`` in sorted name order. Rows whose
    first cell starts with ``Site:``, ``Investigator:`` or ``Location:`` supply
    file-level metadata; the header row is the first row whose first cell is
    ``Medication`` or ``Medication ID``. Files without a header row are
    skipped, as are rows without a medication ID or without a site.

    Args:
        study: Study identifier used in the folder name.

    Returns:
        A list of dicts, one per ``<site>:<medication_id>`` key. When the same
        key occurs more than once, the last occurrence wins.
    """
    inv_dir = os.path.join(BASE_DIR, f"xls_reports_{study}")
    files = sorted(glob.glob(os.path.join(inv_dir, "onsite_inventory_detail_*.xlsx")))

    # Keyed by _id so that a later row replaces an earlier one (last wins).
    by_id = {}
    for path in files:
        # Load the workbook once and parse it twice instead of calling
        # pd.read_excel twice on the same path.
        with pd.ExcelFile(path) as workbook:
            raw = workbook.parse(header=None)
            site = investigator = location = None
            header_row = None
            for i, row in raw.iterrows():
                first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
                # Cut off only the leading label; str.replace would also remove
                # the label text if it reappeared inside the value.
                if first.startswith("Site:"):
                    site = first[len("Site:"):].strip()
                elif first.startswith("Investigator:"):
                    investigator = first[len("Investigator:"):].strip()
                elif first.startswith("Location:"):
                    location = first[len("Location:"):].strip()
                if first in ("Medication", "Medication ID") and header_row is None:
                    header_row = i
            if header_row is None:
                continue
            df = workbook.parse(header=header_row).dropna(how="all")

        # Metadata and header detection use stripped text, so strip the column
        # labels too; otherwise r.get(...) silently returns None for a padded
        # header cell.
        df = df.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)
        df = df.rename(columns={df.columns[0]: "medication_id"})

        # Two spellings of the dispensing-date column are accepted. Choose by
        # column presence rather than by the truthiness of a cell value.
        dispensing_col = "Dispensing date" if "Dispensing date" in df.columns else "Dispensing Date"

        for _, r in df.iterrows():
            med_id = to_str(r["medication_id"])
            if not med_id or not site:
                continue
            doc = {
                "_id": f"{site}:{med_id}",
                "study": study,
                "site": site,
                "investigator": investigator,
                "location": location,
                "medication_id": med_id,
                "packaged_lot_no": to_str(r.get("Packaged Lot number")),
                "original_expiration_date": to_date(r.get("Original Expiration Date when Packaged Lot was Added")),
                "expiration_date": to_date(r.get("Expiration date")),
                "received_date": to_date(r.get("Received Date")),
                "receipt_user": to_str(r.get("Shipment Receipt User")),
                "subject_identifier": to_str(r.get("Subject Identifier")),
                "quantity_assigned": to_int(r.get("Quantity Assigned")),
                "irt_transaction": to_str(r.get("IRT Transaction")),
                "date_assigned": to_date(r.get("Date Assigned")),
                "assignment_user": to_str(r.get("Assignment User")),
                "dispensation_status": to_str(r.get("Dispensation Status")),
                "dispensing_date": to_date(r.get(dispensing_col)),
                "quantity_dispensed": to_int(r.get("Quantity Dispensed")),
                "dispensing_user": to_str(r.get("Dispensing User")),
                "quantity_returned": to_int(r.get("Quantity Returned")),
                "date_returned": to_date(r.get("Date Returned")),
                "return_user": to_str(r.get("Return User")),
            }
            by_id[doc["_id"]] = doc
    return list(by_id.values())
def parse_destruction_files(study):
    """Parse the IP destruction basket workbooks of one study into Mongo documents.

    Processes every ``ip_destruction_basket_*.xlsx`` file in
    ``xls_ip_destruction_<study>`` below ``BASE_DIR`` in sorted name order.
    Rows whose first cell starts with one of the known labels (investigator,
    site ID, location, basket ID, destruction date) supply file-level
    metadata; the header row is the first row whose first cell equals
    ``Medication ID Description``. Files without a header row or without a
    basket ID contribute nothing.

    Args:
        study: Study identifier used in the folder name.

    Returns:
        A list of dicts, one per ``<basket_id>:<medication_id>`` key. When the
        same key occurs more than once, the last occurrence wins.
    """
    labels = (
        ("Investigator Name:", "investigator"),
        ("Site ID:", "site_id"),
        ("Location:", "location"),
        ("Basket ID:", "basket_id"),
        ("Drug Destruction Created Date:", "destruction_date"),
    )
    dest_dir = os.path.join(BASE_DIR, f"xls_ip_destruction_{study}")
    files = sorted(glob.glob(os.path.join(dest_dir, "ip_destruction_basket_*.xlsx")))

    # Keyed by _id so that a later row replaces an earlier one (last wins).
    by_id = {}
    for path in files:
        # Load the workbook once and parse it twice instead of calling
        # pd.read_excel twice on the same path.
        with pd.ExcelFile(path) as workbook:
            raw = workbook.parse(header=None)
            meta = {}
            header_row = None
            for i, row in raw.iterrows():
                first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
                for key, attr in labels:
                    if first.startswith(key):
                        # Cut off only the leading label; str.replace would
                        # also remove the label text inside the value.
                        meta[attr] = first[len(key):].strip()
                if first == "Medication ID Description" and header_row is None:
                    header_row = i
            if header_row is None:
                continue
            df = workbook.parse(header=header_row).dropna(how="all")

        basket_id = meta.get("basket_id")
        if not basket_id:
            # Without a basket ID no row of this file can be keyed.
            continue

        # Header detection uses stripped text, so strip the column labels too;
        # otherwise r.get(...) silently returns None for a padded header cell.
        df = df.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)

        # File-level value: convert once instead of once per row.
        destruction_date = to_date(meta.get("destruction_date"))

        for _, r in df.iterrows():
            med_id = to_str(r.get("Medication ID"))
            if not med_id:
                continue
            doc = {
                "_id": f"{basket_id}:{med_id}",
                "study": study,
                "site_id": meta.get("site_id"),
                "investigator": meta.get("investigator"),
                "location": meta.get("location"),
                "basket_id": basket_id,
                "destruction_date": destruction_date,
                "medication_description": to_str(r.get("Medication ID Description")),
                "medication_id": med_id,
                "packaged_lot_description": to_str(r.get("Packaged Lot description")),
                "comments": to_str(r.get("Comments")),
            }
            by_id[doc["_id"]] = doc
    return list(by_id.values())
# ── main import ──────────────────────────────────────────────────────────────
def import_study(study):
    """Parse all Drugs XLSX reports of one study and write them to MongoDB.

    Runs the four parsers, prints the document counts, registers the run with
    ``log_import`` and hands the returned ``import_id`` to the bulk writers.
    Shipments, shipment items and inventory go through
    ``bulk_upsert_with_snapshot``; destruction records go through
    ``bulk_upsert_only``.

    Args:
        study: Study identifier passed to the parsers and to ``log_import``.
    """
    print(f"\n [{study}] parsovani XLSX...")

    # Dict literals evaluate in order, so the parsers run in the listed order.
    parsed = {
        "shipments": parse_shipments_report(study),
        "shipment_items": parse_shipment_details(study),
        "inventory": parse_inventory(study),
        "destruction": parse_destruction_files(study),
    }
    counts = {name: len(docs) for name, docs in parsed.items()}
    print(
        f" Zasilky: {counts['shipments']} | Polozky: {counts['shipment_items']}"
        f" | Sklad: {counts['inventory']} | Destrukce: {counts['destruction']}"
    )

    import_id = log_import(study, f"drugs_{study}", "drugs", counts)
    print(f" import_id = {import_id}")

    # (main collection, snapshot collection, key into `parsed`)
    snapshot_targets = (
        ("iwrs_shipments", "iwrs_shipments_snapshots", "shipments"),
        ("iwrs_shipment_items", "iwrs_shipment_items_snapshots", "shipment_items"),
        ("iwrs_inventory", "iwrs_inventory_snapshots", "inventory"),
    )
    for collection, snapshots, name in snapshot_targets:
        bulk_upsert_with_snapshot(collection, snapshots, parsed[name], import_id)
    bulk_upsert_only("iwrs_destruction", parsed["destruction"], import_id)
def run(studies):
    """Call ``ensure_indexes`` once, then import each study in the given order.

    Args:
        studies: Iterable of study identifiers.
    """
    ensure_indexes()
    for study in studies:
        import_study(study)
if __name__ == "__main__":
    # Studies come from the command line; with no arguments the two built-in
    # study identifiers are used.
    studies = sys.argv[1:] or ["77242113UCO3001", "42847922MDD3003"]
    run(studies)