"""Import Drugs data (shipments, shipment_items, inventory, destruction) from XLSX into MongoDB.

Called from IWRS/Drugs/run_all.py after the reports have been downloaded.
"""
import os
import sys
import re
import glob

import pandas as pd

# Make the sibling ``common`` package importable when this file is run as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from common.mongo_writer import (
    to_str, to_int, to_date,
    ensure_indexes, log_import,
    bulk_upsert_with_snapshot, bulk_upsert_only,
)

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Studies imported when the script is started without command-line arguments.
_DEFAULT_STUDIES = ["77242113UCO3001", "42847922MDD3003"]

# (label prefix, metadata key) pairs looked up in the first column of a report.
_INVENTORY_LABELS = (
    ("Site:", "site"),
    ("Investigator:", "investigator"),
    ("Location:", "location"),
)
_DESTRUCTION_LABELS = (
    ("Investigator Name:", "investigator"),
    ("Site ID:", "site_id"),
    ("Location:", "location"),
    ("Basket ID:", "basket_id"),
    ("Drug Destruction Created Date:", "destruction_date"),
)


# ── shared helpers ───────────────────────────────────────────────────────────

def _find_header_row(raw, marker):
    """Return the index of the first row of *raw* that has a cell equal to *marker*.

    Cells are compared after ``str(...).strip()``. Returns ``None`` when no
    row contains the marker.
    """
    for idx, row in raw.iterrows():
        if any(str(cell).strip() == marker for cell in row):
            return idx
    return None


def _scan_first_column(raw, labels, header_names):
    """Collect ``label: value`` metadata and the header row from the first column.

    Args:
        raw: DataFrame read with ``header=None``.
        labels: iterable of ``(prefix, key)``; a first-column cell starting
            with ``prefix`` stores the rest of the cell (stripped) under ``key``.
            A later matching row overwrites an earlier one.
        header_names: first-column texts that mark the table header row; only
            the first matching row is remembered.

    Returns:
        ``(meta, header_row)`` where ``header_row`` is ``None`` if not found.
    """
    meta = {}
    header_row = None
    for idx, row in raw.iterrows():
        cell = row.iloc[0]
        first = str(cell).strip() if pd.notna(cell) else ""
        for prefix, key in labels:
            if first.startswith(prefix):
                # Slice off only the leading label; str.replace would also
                # delete the label text if it re-appeared inside the value.
                meta[key] = first[len(prefix):].strip()
        if header_row is None and first in header_names:
            header_row = idx
    return meta, header_row


def _read_table(path, header_row):
    """Read the sheet at *path* using *header_row* as header and drop all-empty rows.

    String column names are stripped: header detection compares stripped cell
    text, so a padded header cell would otherwise be found but never matched
    by the column lookups that follow.
    """
    df = pd.read_excel(path, header=header_row).dropna(how="all")
    return df.rename(columns=lambda name: name.strip() if isinstance(name, str) else name)


def _dedupe_by_id(rows):
    """Return *rows* with duplicate ``_id`` values collapsed (the last one wins)."""
    return list({doc["_id"]: doc for doc in rows}.values())


# ── XLSX parsers (taken from run_all.py, adapted to build Mongo documents) ───

def parse_shipments_report(study):
    """Parse ``shipments_report_<study>.xlsx`` into a list of shipment documents.

    Only rows whose "Location" contains "Czech" (case-insensitive) are kept,
    and rows without a shipment id are skipped. Columns from "Received Date"
    onwards are optional and yield ``None`` when absent from the sheet.

    Returns an empty list when the file is missing (a message is printed) or
    when no row containing "Shipment ID" is found.
    """
    path = os.path.join(BASE_DIR, f"xls_shipments_{study}", f"shipments_report_{study}.xlsx")
    if not os.path.exists(path):
        print(f" CHYBI: {path}")
        return []

    header_row = _find_header_row(pd.read_excel(path, header=None), "Shipment ID")
    if header_row is None:
        return []

    df = _read_table(path, header_row)
    df = df[df["Location"].astype(str).str.contains("Czech", na=False, case=False)]
    columns = set(df.columns)

    def optional(row, name, convert):
        # Optional report columns: convert when present, None otherwise.
        return convert(row[name]) if name in columns else None

    rows = []
    for _, r in df.iterrows():
        sid = to_str(r["Shipment ID"])
        if not sid:
            continue
        rows.append({
            "_id": sid,
            "shipment_id": sid,
            "study": study,
            "status": to_str(r["IRT Shipment Status"]),
            "type": to_str(r["Type"]),
            "ship_from": to_str(r["Shipment From"]),
            "ship_to_site": to_str(r["Ship To:"]),
            "location": to_str(r["Location"]),
            "request_date": to_date(r["Request Date"]),
            "shipped_date": to_date(r["Shipped Date"]),
            "received_date": optional(r, "Received Date", to_date),
            "received_by": optional(r, "Received by", to_str),
            "delivered_date_utc": optional(r, "Delivered Date [UTC]", to_date),
            "delivery_recipient": optional(r, "Delivery Recipient", to_str),
            "delivery_details": optional(r, "Delivery Details", to_str),
            "cancelled_date": optional(r, "Cancelled Date", to_date),
            "total_medication_ids": optional(r, "Total Medication IDs", to_int),
            "tracking_no": optional(r, "Tracking #", to_str),
            "shipping_category": optional(r, "Shipping Category", to_str),
            "expected_arrival": optional(r, "Expected Arrival", to_date),
        })
    return rows


def parse_shipment_details(study):
    """Parse every ``shipment_details_*.xlsx`` of *study* into shipment-item documents.

    The shipment id is taken from the file name. Files without a row
    containing "Medication ID" and rows without a medication id are skipped.
    Documents are keyed ``<shipment_id>:<medication_id>``; for duplicate keys
    the last one wins.
    """
    detail_dir = os.path.join(BASE_DIR, f"xls_shipment_details_{study}")
    rows = []
    for path in sorted(glob.glob(os.path.join(detail_dir, "shipment_details_*.xlsx"))):
        m = re.search(r"shipment_details_(.+)\.xlsx", os.path.basename(path))
        shipment_id = m.group(1) if m else "UNKNOWN"

        header_row = _find_header_row(pd.read_excel(path, header=None), "Medication ID")
        if header_row is None:
            continue

        for _, r in _read_table(path, header_row).iterrows():
            med_id = to_str(r.get("Medication ID"))
            if not med_id:
                continue
            # Two header spellings are accepted for description and type.
            med_desc = (to_str(r.get("Medication Description"))
                        or to_str(r.get("Medication ID Description")))
            med_type = (to_str(r.get("Medication type"))
                        or to_str(r.get("Medication ID type")))
            rows.append({
                "_id": f"{shipment_id}:{med_id}",
                "study": study,
                "shipment_id": shipment_id,
                "destination_location": to_str(r.get("Destination Location")),
                "shipment_status": to_str(r.get("IRT Shipment Status")),
                "shipment_type": to_str(r.get("Type")),
                "destination_site": to_str(r.get("Destination Site")),
                "investigator": to_str(r.get("Investigator")),
                "medication_description": med_desc,
                "medication_type": med_type,
                "medication_id": med_id,
                "packaged_lot_no": to_str(r.get("Packaged Lot number")),
                "packaged_lot_description": to_str(r.get("Packaged Lot description")),
                "container_id": to_str(r.get("Container ID")),
                "quantity": to_int(r.get("Quantity of Medication IDs")),
                "expiration_date": to_date(r.get("Expiration Date")),
                "item_status": to_str(r.get("Status")),
            })
    return _dedupe_by_id(rows)


def parse_inventory(study):
    """Parse every ``onsite_inventory_detail_*.xlsx`` of *study* into inventory documents.

    Site, investigator and location come from ``Site:`` / ``Investigator:`` /
    ``Location:`` cells in the first column. The table header is the first row
    whose first cell is "Medication" or "Medication ID", and the first table
    column is used as the medication id. Files without a header row or
    without a site, and rows without a medication id, are skipped. Documents
    are keyed ``<site>:<medication_id>``; for duplicate keys the last one wins.
    """
    inv_dir = os.path.join(BASE_DIR, f"xls_reports_{study}")
    rows = []
    for path in sorted(glob.glob(os.path.join(inv_dir, "onsite_inventory_detail_*.xlsx"))):
        raw = pd.read_excel(path, header=None)
        meta, header_row = _scan_first_column(raw, _INVENTORY_LABELS, ("Medication", "Medication ID"))
        site = meta.get("site")
        # Without a site no row of this file can be keyed, so skip it whole.
        if header_row is None or not site:
            continue
        investigator = meta.get("investigator")
        location = meta.get("location")

        df = _read_table(path, header_row)
        df = df.rename(columns={df.columns[0]: "medication_id"})
        for _, r in df.iterrows():
            med_id = to_str(r["medication_id"])
            if not med_id:
                continue
            rows.append({
                "_id": f"{site}:{med_id}",
                "study": study,
                "site": site,
                "investigator": investigator,
                "location": location,
                "medication_id": med_id,
                "packaged_lot_no": to_str(r.get("Packaged Lot number")),
                "original_expiration_date": to_date(r.get("Original Expiration Date when Packaged Lot was Added")),
                "expiration_date": to_date(r.get("Expiration date")),
                "received_date": to_date(r.get("Received Date")),
                "receipt_user": to_str(r.get("Shipment Receipt User")),
                "subject_identifier": to_str(r.get("Subject Identifier")),
                "quantity_assigned": to_int(r.get("Quantity Assigned")),
                "irt_transaction": to_str(r.get("IRT Transaction")),
                "date_assigned": to_date(r.get("Date Assigned")),
                "assignment_user": to_str(r.get("Assignment User")),
                "dispensation_status": to_str(r.get("Dispensation Status")),
                "dispensing_date": to_date(r.get("Dispensing date") or r.get("Dispensing Date")),
                "quantity_dispensed": to_int(r.get("Quantity Dispensed")),
                "dispensing_user": to_str(r.get("Dispensing User")),
                "quantity_returned": to_int(r.get("Quantity Returned")),
                "date_returned": to_date(r.get("Date Returned")),
                "return_user": to_str(r.get("Return User")),
            })
    return _dedupe_by_id(rows)


def parse_destruction_files(study):
    """Parse every ``ip_destruction_basket_*.xlsx`` of *study* into destruction documents.

    Basket metadata comes from labelled cells in the first column. The table
    header is the first row whose first cell is "Medication ID Description".
    Files without a header row or without a basket id, and rows without a
    medication id, are skipped. Documents are keyed
    ``<basket_id>:<medication_id>``; for duplicate keys the last one wins.
    """
    dest_dir = os.path.join(BASE_DIR, f"xls_ip_destruction_{study}")
    rows = []
    for path in sorted(glob.glob(os.path.join(dest_dir, "ip_destruction_basket_*.xlsx"))):
        raw = pd.read_excel(path, header=None)
        meta, header_row = _scan_first_column(raw, _DESTRUCTION_LABELS, ("Medication ID Description",))
        basket_id = meta.get("basket_id")
        # Without a basket id no row of this file can be keyed, so skip it whole.
        if header_row is None or not basket_id:
            continue

        for _, r in _read_table(path, header_row).iterrows():
            med_id = to_str(r.get("Medication ID"))
            if not med_id:
                continue
            rows.append({
                "_id": f"{basket_id}:{med_id}",
                "study": study,
                "site_id": meta.get("site_id"),
                "investigator": meta.get("investigator"),
                "location": meta.get("location"),
                "basket_id": basket_id,
                "destruction_date": to_date(meta.get("destruction_date")),
                "medication_description": to_str(r.get("Medication ID Description")),
                "medication_id": med_id,
                "packaged_lot_description": to_str(r.get("Packaged Lot description")),
                "comments": to_str(r.get("Comments")),
            })
    return _dedupe_by_id(rows)


# ── main import ──────────────────────────────────────────────────────────────

def import_study(study):
    """Parse all four report kinds for *study* and write them to MongoDB.

    The import is registered through ``log_import`` with the per-collection
    document counts. Shipments, shipment items and inventory are then written
    with ``bulk_upsert_with_snapshot`` and destruction records with
    ``bulk_upsert_only``.
    """
    print(f"\n [{study}] parsovani XLSX...")
    shipments = parse_shipments_report(study)
    items = parse_shipment_details(study)
    inventory = parse_inventory(study)
    destruct = parse_destruction_files(study)
    print(f" Zasilky: {len(shipments)} | Polozky: {len(items)} | Sklad: {len(inventory)} | Destrukce: {len(destruct)}")

    counts = {
        "shipments": len(shipments),
        "shipment_items": len(items),
        "inventory": len(inventory),
        "destruction": len(destruct),
    }
    import_id = log_import(study, f"drugs_{study}", "drugs", counts)
    print(f" import_id = {import_id}")

    bulk_upsert_with_snapshot("iwrs_shipments", "iwrs_shipments_snapshots", shipments, import_id)
    bulk_upsert_with_snapshot("iwrs_shipment_items", "iwrs_shipment_items_snapshots", items, import_id)
    bulk_upsert_with_snapshot("iwrs_inventory", "iwrs_inventory_snapshots", inventory, import_id)
    bulk_upsert_only("iwrs_destruction", destruct, import_id)


def run(studies):
    """Call ``ensure_indexes`` once, then import each study in *studies* in order."""
    ensure_indexes()
    for s in studies:
        import_study(s)


if __name__ == "__main__":
    # Study ids may be passed as command-line arguments; otherwise use the defaults.
    studies = sys.argv[1:] if len(sys.argv) > 1 else list(_DEFAULT_STUDIES)
    run(studies)