Migrate IWRS from MySQL to MongoDB
- Add IWRS/common/mongo_writer.py with shared connection, indexes, upsert+snapshot helpers
- Add IWRS/Patients/import_to_mongo.py (subject_summary + visits)
- Add IWRS/Patients/import_notifications_to_mongo.py: parse PDF/JSON directly to Mongo (incl. PDF as BinData), replaces 2-step MySQL flow
- Add IWRS/Drugs/import_to_mongo.py (shipments, items, inventory, destruction)
- Add IWRS/backfill_mysql_to_mongo.py: one-shot history backfill
- Switch IWRS/Patients/run_all.py and IWRS/Drugs/run_all.py to Mongo
- Rewrite IWRS/Drugs/create_report.py data loaders to read from Mongo
- 8 main collections (upsert = latest state) + 5 snapshot collections (append-only with import_id) under studie database; notifications and destruction are immutable and need no snapshots

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+169
-172
@@ -1,5 +1,5 @@
|
||||
import os
|
||||
import mysql.connector
|
||||
import sys
|
||||
import pandas as pd
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
@@ -7,7 +7,8 @@ from openpyxl import load_workbook
|
||||
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
||||
from openpyxl.utils import get_column_letter
|
||||
|
||||
import db_config
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from common.mongo_writer import get_db
|
||||
|
||||
STUDIES = ["77242113UCO3001", "42847922MDD3003"]
|
||||
|
||||
@@ -23,70 +24,56 @@ DATE_COLUMNS = {
|
||||
N_SHIP_COLS = 9 # počet shipment sloupců před detail sloupci
|
||||
|
||||
|
||||
# ── DB ────────────────────────────────────────────────────────────────────────
|
||||
# ── Načítání dat z MongoDB ────────────────────────────────────────────────────
|
||||
|
||||
def get_conn():
|
||||
return mysql.connector.connect(
|
||||
host=db_config.DB_HOST, port=db_config.DB_PORT,
|
||||
user=db_config.DB_USER, password=db_config.DB_PASSWORD,
|
||||
database=db_config.DB_NAME,
|
||||
)
|
||||
INVENTORY_COLS = [
|
||||
("site", "Site"),
|
||||
("medication_id", "Med ID"),
|
||||
("packaged_lot_no", "Lot No."),
|
||||
("original_expiration_date", "Orig Exp Date"),
|
||||
("expiration_date", "Exp Date"),
|
||||
("received_date", "Rcv Date"),
|
||||
("receipt_user", "Rcpt User"),
|
||||
("subject_identifier", "Subject ID"),
|
||||
("quantity_assigned", "Qty Asgn"),
|
||||
("irt_transaction", "IRT Tx"),
|
||||
("date_assigned", "Date Asgn"),
|
||||
("assignment_user", "Asgn User"),
|
||||
("dispensation_status", "Disp Status"),
|
||||
("dispensing_date", "Disp Date"),
|
||||
("quantity_dispensed", "Qty Disp"),
|
||||
("dispensing_user", "Disp User"),
|
||||
("quantity_returned", "Qty Ret"),
|
||||
("date_returned", "Date Ret"),
|
||||
("return_user", "Ret User"),
|
||||
]
|
||||
|
||||
|
||||
def get_latest_import_id(cursor, study):
|
||||
cursor.execute(
|
||||
"SELECT MAX(import_id) AS mid FROM iwrs_import WHERE study=%s AND report_type='drugs'",
|
||||
(study,),
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
mid = row["mid"]
|
||||
if mid is None:
|
||||
raise RuntimeError(f"Žádná data v MySQL pro studii {study}")
|
||||
return mid
|
||||
def load_inventory(study):
|
||||
db = get_db()
|
||||
inv = list(db.iwrs_inventory.find({"study": study}))
|
||||
destr = list(db.iwrs_destruction.find({"study": study}))
|
||||
# map medication_id -> first basket+date
|
||||
destr_map = {}
|
||||
for d in destr:
|
||||
mid = d.get("medication_id")
|
||||
if mid and mid not in destr_map:
|
||||
destr_map[mid] = (d.get("basket_id"), d.get("destruction_date"))
|
||||
|
||||
records = []
|
||||
for doc in inv:
|
||||
row = {label: doc.get(key) for key, label in INVENTORY_COLS}
|
||||
b, dt = destr_map.get(doc.get("medication_id"), (None, None))
|
||||
row["Destroyed"] = dt
|
||||
row["Basket No."] = b
|
||||
records.append(row)
|
||||
|
||||
# ── Načítání dat ──────────────────────────────────────────────────────────────
|
||||
df = pd.DataFrame(records)
|
||||
if df.empty:
|
||||
print(" Inventory: 0 kitu")
|
||||
return df
|
||||
|
||||
def load_inventory(cursor, study, import_id):
|
||||
sql = """
|
||||
SELECT
|
||||
i.site AS Site,
|
||||
i.medication_id AS `Med ID`,
|
||||
i.packaged_lot_no AS `Lot No.`,
|
||||
i.original_expiration_date AS `Orig Exp Date`,
|
||||
i.expiration_date AS `Exp Date`,
|
||||
i.received_date AS `Rcv Date`,
|
||||
i.receipt_user AS `Rcpt User`,
|
||||
i.subject_identifier AS `Subject ID`,
|
||||
i.quantity_assigned AS `Qty Asgn`,
|
||||
i.irt_transaction AS `IRT Tx`,
|
||||
i.date_assigned AS `Date Asgn`,
|
||||
i.assignment_user AS `Asgn User`,
|
||||
i.dispensation_status AS `Disp Status`,
|
||||
i.dispensing_date AS `Disp Date`,
|
||||
i.quantity_dispensed AS `Qty Disp`,
|
||||
i.dispensing_user AS `Disp User`,
|
||||
i.quantity_returned AS `Qty Ret`,
|
||||
i.date_returned AS `Date Ret`,
|
||||
i.return_user AS `Ret User`,
|
||||
d.destruction_date AS Destroyed,
|
||||
d.basket_id AS `Basket No.`
|
||||
FROM iwrs_inventory i
|
||||
LEFT JOIN (
|
||||
SELECT medication_id,
|
||||
ANY_VALUE(basket_id) AS basket_id,
|
||||
ANY_VALUE(destruction_date) AS destruction_date
|
||||
FROM iwrs_destruction
|
||||
WHERE study = %s
|
||||
GROUP BY medication_id
|
||||
) d ON d.medication_id = i.medication_id
|
||||
WHERE i.import_id = %s
|
||||
AND i.study = %s
|
||||
ORDER BY i.site, i.received_date, i.medication_id
|
||||
"""
|
||||
cursor.execute(sql, (study, import_id, study))
|
||||
rows = cursor.fetchall()
|
||||
df = pd.DataFrame(rows)
|
||||
df = df.sort_values(["Site", "Rcv Date", "Med ID"], na_position="last").reset_index(drop=True)
|
||||
for col in DATE_COLUMNS:
|
||||
if col in df.columns:
|
||||
df[col] = pd.to_datetime(df[col], errors="coerce")
|
||||
@@ -94,78 +81,102 @@ def load_inventory(cursor, study, import_id):
|
||||
return df
|
||||
|
||||
|
||||
def load_shipments(cursor, study, import_id):
|
||||
sql = """
|
||||
SELECT
|
||||
s.shipment_id AS `Shipment ID`,
|
||||
s.status AS `IRT Shipment Status`,
|
||||
s.type AS Type,
|
||||
s.ship_from AS `Shipment From`,
|
||||
s.ship_to_site AS `Ship To:`,
|
||||
s.request_date AS `Request Date`,
|
||||
s.received_date AS `Received Date`,
|
||||
s.received_by AS `Received by`,
|
||||
s.expected_arrival AS `Expected Arrival`,
|
||||
i.investigator AS Investigator,
|
||||
i.medication_description AS `Medication Description`,
|
||||
i.medication_id AS `Medication ID`,
|
||||
i.packaged_lot_no AS `Packaged Lot number`,
|
||||
i.expiration_date AS `Expiration Date`,
|
||||
i.item_status AS Status
|
||||
FROM iwrs_shipments s
|
||||
JOIN iwrs_shipment_items i
|
||||
ON i.study = s.study
|
||||
AND i.shipment_id = s.shipment_id
|
||||
AND i.import_id = %s
|
||||
WHERE s.import_id = %s
|
||||
AND s.study = %s
|
||||
ORDER BY s.ship_to_site, s.shipment_id, i.medication_id
|
||||
"""
|
||||
cursor.execute(sql, (import_id, import_id, study))
|
||||
rows = cursor.fetchall()
|
||||
df = pd.DataFrame(rows)
|
||||
SHIP_COLS = [
|
||||
("shipment_id", "Shipment ID"),
|
||||
("status", "IRT Shipment Status"),
|
||||
("type", "Type"),
|
||||
("ship_from", "Shipment From"),
|
||||
("ship_to_site", "Ship To:"),
|
||||
("request_date", "Request Date"),
|
||||
("received_date", "Received Date"),
|
||||
("received_by", "Received by"),
|
||||
("expected_arrival", "Expected Arrival"),
|
||||
]
|
||||
|
||||
ITEM_COLS = [
|
||||
("investigator", "Investigator"),
|
||||
("medication_description", "Medication Description"),
|
||||
("medication_id", "Medication ID"),
|
||||
("packaged_lot_no", "Packaged Lot number"),
|
||||
("expiration_date", "Expiration Date"),
|
||||
("item_status", "Status"),
|
||||
]
|
||||
|
||||
|
||||
def load_shipments(study):
|
||||
db = get_db()
|
||||
ships = list(db.iwrs_shipments.find({"study": study}))
|
||||
items = list(db.iwrs_shipment_items.find({"study": study}))
|
||||
|
||||
# index items by shipment_id
|
||||
items_by_ship = {}
|
||||
for it in items:
|
||||
items_by_ship.setdefault(it.get("shipment_id"), []).append(it)
|
||||
|
||||
records = []
|
||||
for s in ships:
|
||||
base = {label: s.get(key) for key, label in SHIP_COLS}
|
||||
for it in items_by_ship.get(s.get("shipment_id"), []):
|
||||
row = dict(base)
|
||||
for key, label in ITEM_COLS:
|
||||
row[label] = it.get(key)
|
||||
records.append(row)
|
||||
|
||||
df = pd.DataFrame(records)
|
||||
if df.empty:
|
||||
print(" Shipments: 0 zásilek, 0 kitu")
|
||||
return df
|
||||
|
||||
df = df.sort_values(["Ship To:", "Shipment ID", "Medication ID"], na_position="last").reset_index(drop=True)
|
||||
for col in ("Request Date", "Received Date", "Expiration Date", "Expected Arrival"):
|
||||
if col in df.columns:
|
||||
df[col] = pd.to_datetime(df[col], errors="coerce")
|
||||
n_ship = df["Shipment ID"].nunique() if len(df) else 0
|
||||
n_ship = df["Shipment ID"].nunique()
|
||||
print(f" Shipments: {n_ship} zásilek, {len(df)} kitu")
|
||||
return df
|
||||
|
||||
|
||||
def load_visits(cursor, study, import_id):
|
||||
cursor.execute(
|
||||
"SELECT MAX(import_id) AS mid FROM iwrs_import WHERE study=%s AND report_type='patients'",
|
||||
(study,),
|
||||
)
|
||||
patients_import_id = cursor.fetchone()["mid"] or import_id
|
||||
import_id = patients_import_id
|
||||
sql = """
|
||||
SELECT
|
||||
v.subject AS Subject,
|
||||
COALESCE(v.actual_date, v.scheduled_date) AS `Visit Date`,
|
||||
v.scheduled_date AS `Scheduled Date`,
|
||||
v.irt_transaction_no AS `IRT Tx No`,
|
||||
v.irt_transaction_description AS `Visit`,
|
||||
v.medication_assignment AS `Medication`,
|
||||
GROUP_CONCAT(v.medication_id ORDER BY v.medication_id SEPARATOR ', ') AS `Med IDs`,
|
||||
SUM(v.quantity_assigned) AS `Qty`
|
||||
FROM iwrs_subject_visits v
|
||||
WHERE v.import_id = %s AND v.study = %s AND v.visit_type = 'Past'
|
||||
AND v.irt_transaction_no IS NOT NULL
|
||||
GROUP BY v.subject, v.actual_date, v.scheduled_date,
|
||||
v.irt_transaction_no, v.irt_transaction_description, v.medication_assignment
|
||||
ORDER BY v.subject, COALESCE(v.actual_date, v.scheduled_date)
|
||||
"""
|
||||
cursor.execute(sql, (import_id, study))
|
||||
rows = cursor.fetchall()
|
||||
def load_visits(study):
|
||||
db = get_db()
|
||||
cur = db.iwrs_visits.find({
|
||||
"study": study,
|
||||
"visit_type": "Past",
|
||||
"irt_transaction_no": {"$ne": None},
|
||||
})
|
||||
rows = []
|
||||
for v in cur:
|
||||
rows.append({
|
||||
"Subject": v.get("subject"),
|
||||
"Visit Date": v.get("actual_date") or v.get("scheduled_date"),
|
||||
"Scheduled Date": v.get("scheduled_date"),
|
||||
"IRT Tx No": v.get("irt_transaction_no"),
|
||||
"Visit": v.get("irt_transaction_description"),
|
||||
"Medication": v.get("medication_assignment"),
|
||||
"medication_id": v.get("medication_id"),
|
||||
"quantity_assigned": v.get("quantity_assigned"),
|
||||
})
|
||||
df = pd.DataFrame(rows)
|
||||
if df.empty:
|
||||
print(" Visits: 0 radku")
|
||||
return df
|
||||
|
||||
# GROUP BY subject/actual/scheduled/irt_no/desc/medication
|
||||
grouped = (
|
||||
df.groupby(["Subject", "Visit Date", "Scheduled Date", "IRT Tx No", "Visit", "Medication"],
|
||||
dropna=False, as_index=False)
|
||||
.agg(**{
|
||||
"Med IDs": ("medication_id", lambda s: ", ".join(sorted([str(x) for x in s if pd.notna(x)]))),
|
||||
"Qty": ("quantity_assigned", "sum"),
|
||||
})
|
||||
)
|
||||
grouped = grouped.sort_values(["Subject", "Visit Date"]).reset_index(drop=True)
|
||||
for col in ("Visit Date", "Scheduled Date"):
|
||||
if col in df.columns:
|
||||
df[col] = pd.to_datetime(df[col], errors="coerce")
|
||||
if study == "77242113UCO3001" and "Visit" in df.columns:
|
||||
df["Visit"] = df["Visit"].replace("Subject Number Creation", "Screening")
|
||||
print(f" Visits: {len(df)} řádků")
|
||||
return df
|
||||
if col in grouped.columns:
|
||||
grouped[col] = pd.to_datetime(grouped[col], errors="coerce")
|
||||
if study == "77242113UCO3001":
|
||||
grouped["Visit"] = grouped["Visit"].replace("Subject Number Creation", "Screening")
|
||||
print(f" Visits: {len(grouped)} řádků")
|
||||
return grouped
|
||||
|
||||
|
||||
# ── Odvozené sheety ───────────────────────────────────────────────────────────
|
||||
@@ -343,49 +354,42 @@ def format_shipment_sheet(ws, header_color_ship, header_color_detail, n_ship_col
|
||||
|
||||
# ── Pacienti ─────────────────────────────────────────────────────────────────
|
||||
|
||||
PATIENT_TABLE = {
|
||||
"77242113UCO3001": "iwrs_uco3001_subject_summary",
|
||||
"42847922MDD3003": "iwrs_mdd3003_subject_summary",
|
||||
}
|
||||
def load_patients(study):
|
||||
db = get_db()
|
||||
docs = list(db.iwrs_subject_summary.find({"study": study}))
|
||||
if not docs:
|
||||
raise RuntimeError(f"Žádná data v Mongo pro pacienty {study}")
|
||||
|
||||
|
||||
def load_patients(cursor, study):
|
||||
table = PATIENT_TABLE[study]
|
||||
cursor.execute(f"SELECT MAX(import_id) AS mid FROM {table}")
|
||||
mid = cursor.fetchone()["mid"]
|
||||
if mid is None:
|
||||
raise RuntimeError(f"Žádná data v MySQL pro pacienty {study}")
|
||||
extra_cols = ""
|
||||
base_cols = [
|
||||
("subject", "Subject"),
|
||||
("investigator", "Investigator"),
|
||||
("age", "Subject's age collection"),
|
||||
("cohort_per_irt", "Cohort per IRT"),
|
||||
("irt_subject_status", "IRT Subject Status"),
|
||||
("last_irt_transaction", "Last Recorded IRT Transaction"),
|
||||
("next_irt_transaction", "Next Expected IRT Transaction"),
|
||||
("next_irt_transaction_date_local", "Next Expected IRT Transaction Date [Local]"),
|
||||
]
|
||||
uco_extra = [
|
||||
("rescreened_subject", "Rescreened Subject"),
|
||||
("adt_ir", "ADT-IR"),
|
||||
("three_or_more_advanced_therapies", "3+ Adv. Therapies"),
|
||||
("only_oral_5asa_compounds", "Only 5-ASA"),
|
||||
("ustekinumab", "Ustekinumab"),
|
||||
("isolated_proctitis", "Isolated Proctitis"),
|
||||
]
|
||||
cols = list(base_cols)
|
||||
if study == "77242113UCO3001":
|
||||
extra_cols = """
|
||||
rescreened_subject AS `Rescreened Subject`,
|
||||
adt_ir AS `ADT-IR`,
|
||||
three_or_more_advanced_therapies AS `3+ Adv. Therapies`,
|
||||
only_oral_5asa_compounds AS `Only 5-ASA`,
|
||||
ustekinumab AS `Ustekinumab`,
|
||||
isolated_proctitis AS `Isolated Proctitis`,"""
|
||||
sql = f"""
|
||||
SELECT
|
||||
subject AS `Subject`,
|
||||
investigator AS `Investigator`,
|
||||
age AS `Subject's age collection`,
|
||||
cohort_per_irt AS `Cohort per IRT`,{extra_cols}
|
||||
irt_subject_status AS `IRT Subject Status`,
|
||||
last_irt_transaction AS `Last Recorded IRT Transaction`,
|
||||
next_irt_transaction AS `Next Expected IRT Transaction`,
|
||||
next_irt_transaction_date_local AS `Next Expected IRT Transaction Date [Local]`
|
||||
FROM {table}
|
||||
WHERE import_id = %s
|
||||
ORDER BY subject
|
||||
"""
|
||||
cursor.execute(sql, (mid,))
|
||||
rows = cursor.fetchall()
|
||||
df = pd.DataFrame(rows)
|
||||
cols += uco_extra
|
||||
|
||||
rows = [{label: d.get(key) for key, label in cols} for d in docs]
|
||||
df = pd.DataFrame(rows).sort_values("Subject").reset_index(drop=True)
|
||||
|
||||
if "Next Expected IRT Transaction Date [Local]" in df.columns:
|
||||
df["Next Expected IRT Transaction Date [Local]"] = pd.to_datetime(
|
||||
df["Next Expected IRT Transaction Date [Local]"], errors="coerce"
|
||||
)
|
||||
print(f" Pacienti: {len(df)} subjektů (import_id={mid})")
|
||||
print(f" Pacienti: {len(df)} subjektů")
|
||||
return df
|
||||
|
||||
|
||||
@@ -574,18 +578,11 @@ def create_study_report(study):
|
||||
|
||||
output_file = OUTPUT_DIR / f"{today} {study} CZ IWRS overview v{version}.xlsx"
|
||||
|
||||
print(f"\n[{study}] Načítám z MySQL...")
|
||||
conn = get_conn()
|
||||
cursor = conn.cursor(dictionary=True)
|
||||
import_id = get_latest_import_id(cursor, study)
|
||||
print(f" import_id = {import_id}")
|
||||
|
||||
df = load_inventory(cursor, study, import_id)
|
||||
shipments_df = load_shipments(cursor, study, import_id)
|
||||
df_patients = load_patients(cursor, study)
|
||||
visits_df = load_visits(cursor, study, import_id)
|
||||
cursor.close()
|
||||
conn.close()
|
||||
print(f"\n[{study}] Nacitam z MongoDB...")
|
||||
df = load_inventory(study)
|
||||
shipments_df = load_shipments(study)
|
||||
df_patients = load_patients(study)
|
||||
visits_df = load_visits(study)
|
||||
|
||||
expired_df, expired_sheet = build_expired(df)
|
||||
assigned_df = build_assigned_not_dispensed(df)
|
||||
|
||||
@@ -0,0 +1,253 @@
|
||||
"""
|
||||
Import Drugs dat (shipments, shipment_items, inventory, destruction) z XLSX do MongoDB.
|
||||
|
||||
Volá se z IWRS/Drugs/run_all.py po stažení reportů.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import glob
|
||||
|
||||
import pandas as pd
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from common.mongo_writer import (
|
||||
to_str, to_int, to_date,
|
||||
ensure_indexes, log_import,
|
||||
bulk_upsert_with_snapshot, bulk_upsert_only,
|
||||
)
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
|
||||
# ── XLSX parsery (převzaté z run_all.py + úprava na Mongo dokumenty) ─────────
|
||||
|
||||
def parse_shipments_report(study):
    """Parse the shipments overview workbook of *study* into MongoDB documents.

    Reads ``xls_shipments_<study>/shipments_report_<study>.xlsx`` below
    ``BASE_DIR``. The header row is the first row that contains a
    ``Shipment ID`` cell. Only rows whose ``Location`` contains "Czech"
    (case-insensitive) are kept, and rows without a shipment ID are skipped.

    Returns:
        A list with one dict per shipment; ``_id`` and ``shipment_id`` both
        hold the shipment ID. The list is empty when the workbook is missing
        or has no header row (a message is printed in both cases).

    Raises:
        KeyError: if one of the unguarded columns (``Location``,
            ``IRT Shipment Status``, ``Type``, ``Shipment From``,
            ``Ship To:``, ``Request Date``, ``Shipped Date``) is absent.
    """
    path = os.path.join(BASE_DIR, f"xls_shipments_{study}", f"shipments_report_{study}.xlsx")
    if not os.path.exists(path):
        print(f" CHYBI: {path}")
        return []

    # The header is not necessarily the first row, so locate it first.
    raw = pd.read_excel(path, header=None)
    header_row = next(
        (i for i, row in raw.iterrows()
         if "Shipment ID" in [str(v).strip() for v in row]),
        None,
    )
    if header_row is None:
        # Previously silent: an unreadable layout looked like "no shipments".
        print(f" CHYBI hlavicka 'Shipment ID': {path}")
        return []

    df = pd.read_excel(path, header=header_row).dropna(how="all")
    df = df[df["Location"].astype(str).str.contains("Czech", na=False, case=False)]
    columns = set(df.columns)

    def optional(row, name, convert):
        # The column may be absent from the workbook; store None in that case.
        return convert(row[name]) if name in columns else None

    rows = []
    for _, r in df.iterrows():
        sid = to_str(r["Shipment ID"])
        if not sid:
            continue
        rows.append({
            "_id": sid,
            "shipment_id": sid,
            "study": study,
            "status": to_str(r["IRT Shipment Status"]),
            "type": to_str(r["Type"]),
            "ship_from": to_str(r["Shipment From"]),
            "ship_to_site": to_str(r["Ship To:"]),
            "location": to_str(r["Location"]),
            "request_date": to_date(r["Request Date"]),
            "shipped_date": to_date(r["Shipped Date"]),
            "received_date": optional(r, "Received Date", to_date),
            "received_by": optional(r, "Received by", to_str),
            "delivered_date_utc": optional(r, "Delivered Date [UTC]", to_date),
            "delivery_recipient": optional(r, "Delivery Recipient", to_str),
            "delivery_details": optional(r, "Delivery Details", to_str),
            "cancelled_date": optional(r, "Cancelled Date", to_date),
            "total_medication_ids": optional(r, "Total Medication IDs", to_int),
            "tracking_no": optional(r, "Tracking #", to_str),
            "shipping_category": optional(r, "Shipping Category", to_str),
            "expected_arrival": optional(r, "Expected Arrival", to_date),
        })
    return rows
|
||||
|
||||
|
||||
def parse_shipment_details(study):
    """Parse every per-shipment detail workbook of *study* into item documents.

    Scans ``xls_shipment_details_<study>/shipment_details_*.xlsx`` below
    ``BASE_DIR`` in sorted order. The shipment ID is taken from the file
    name, and the header row is the first row containing a ``Medication ID``
    cell. Rows without a medication ID are skipped.

    Returns:
        A list of dicts keyed by ``"<shipment_id>:<medication_id>"`` in
        ``_id``. When the same key occurs more than once the last occurrence
        wins. Files without a header row are skipped and reported on stdout.
    """
    detail_dir = os.path.join(BASE_DIR, f"xls_shipment_details_{study}")
    name_pattern = re.compile(r"shipment_details_(.+)\.xlsx")

    by_id = {}
    for path in sorted(glob.glob(os.path.join(detail_dir, "shipment_details_*.xlsx"))):
        match = name_pattern.search(os.path.basename(path))
        shipment_id = match.group(1) if match else "UNKNOWN"

        raw = pd.read_excel(path, header=None)
        header_row = next(
            (i for i, row in raw.iterrows()
             if "Medication ID" in [str(v).strip() for v in row]),
            None,
        )
        if header_row is None:
            # Previously silent: the whole shipment vanished without a trace.
            print(f" PRESKOCENO (chybi hlavicka 'Medication ID'): {path}")
            continue

        df = pd.read_excel(path, header=header_row).dropna(how="all")
        for _, r in df.iterrows():
            med_id = to_str(r.get("Medication ID"))
            if not med_id:
                continue
            # Two header spellings are accepted for description and type.
            med_desc = (to_str(r.get("Medication Description"))
                        or to_str(r.get("Medication ID Description")))
            med_type = (to_str(r.get("Medication type"))
                        or to_str(r.get("Medication ID type")))
            doc = {
                "_id": f"{shipment_id}:{med_id}",
                "study": study,
                "shipment_id": shipment_id,
                "destination_location": to_str(r.get("Destination Location")),
                "shipment_status": to_str(r.get("IRT Shipment Status")),
                "shipment_type": to_str(r.get("Type")),
                "destination_site": to_str(r.get("Destination Site")),
                "investigator": to_str(r.get("Investigator")),
                "medication_description": med_desc,
                "medication_type": med_type,
                "medication_id": med_id,
                "packaged_lot_no": to_str(r.get("Packaged Lot number")),
                "packaged_lot_description": to_str(r.get("Packaged Lot description")),
                "container_id": to_str(r.get("Container ID")),
                "quantity": to_int(r.get("Quantity of Medication IDs")),
                "expiration_date": to_date(r.get("Expiration Date")),
                "item_status": to_str(r.get("Status")),
            }
            # De-duplicate on _id: the last occurrence wins.
            by_id[doc["_id"]] = doc
    return list(by_id.values())
|
||||
|
||||
|
||||
def _first_non_null(row, *names):
    """Return the first non-null cell of *row* among columns *names*, else None."""
    for name in names:
        value = row.get(name)
        if value is not None and not pd.isna(value):
            return value
    return None


def parse_inventory(study):
    """Parse the on-site inventory detail workbooks of *study* into documents.

    Scans ``xls_reports_<study>/onsite_inventory_detail_*.xlsx`` below
    ``BASE_DIR`` in sorted order. ``Site:``, ``Investigator:`` and
    ``Location:`` are read from first-column cells starting with those
    prefixes. The header row is the first row whose first cell is
    ``Medication`` or ``Medication ID``, and the first column of the table
    is used as the medication ID.

    Returns:
        A list of dicts keyed by ``"<site>:<medication_id>"`` in ``_id``.
        When the same key occurs more than once the last occurrence wins.
        Files without a header row or without a site are skipped and
        reported on stdout.
    """
    inv_dir = os.path.join(BASE_DIR, f"xls_reports_{study}")

    by_id = {}
    for path in sorted(glob.glob(os.path.join(inv_dir, "onsite_inventory_detail_*.xlsx"))):
        raw = pd.read_excel(path, header=None)
        meta = {"Site:": None, "Investigator:": None, "Location:": None}
        header_row = None
        for i, row in raw.iterrows():
            first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
            for prefix in meta:
                if first.startswith(prefix):
                    meta[prefix] = first.replace(prefix, "").strip()
                    break
            if header_row is None and first in ("Medication", "Medication ID"):
                header_row = i

        if header_row is None:
            # Previously silent: the file was dropped without any message.
            print(f" PRESKOCENO (chybi hlavicka 'Medication'): {path}")
            continue
        site = meta["Site:"]
        if not site:
            # Without a site no row can be keyed, so every row would be dropped.
            print(f" PRESKOCENO (chybi 'Site:'): {path}")
            continue
        investigator = meta["Investigator:"]
        location = meta["Location:"]

        df = pd.read_excel(path, header=header_row).dropna(how="all")
        df = df.rename(columns={df.columns[0]: "medication_id"})
        for _, r in df.iterrows():
            med_id = to_str(r["medication_id"])
            if not med_id:
                continue
            doc = {
                "_id": f"{site}:{med_id}",
                "study": study,
                "site": site,
                "investigator": investigator,
                "location": location,
                "medication_id": med_id,
                "packaged_lot_no": to_str(r.get("Packaged Lot number")),
                "original_expiration_date": to_date(r.get("Original Expiration Date when Packaged Lot was Added")),
                "expiration_date": to_date(r.get("Expiration date")),
                "received_date": to_date(r.get("Received Date")),
                "receipt_user": to_str(r.get("Shipment Receipt User")),
                "subject_identifier": to_str(r.get("Subject Identifier")),
                "quantity_assigned": to_int(r.get("Quantity Assigned")),
                "irt_transaction": to_str(r.get("IRT Transaction")),
                "date_assigned": to_date(r.get("Date Assigned")),
                "assignment_user": to_str(r.get("Assignment User")),
                "dispensation_status": to_str(r.get("Dispensation Status")),
                # Two header spellings are accepted. `or` on raw cells is
                # unreliable because a float NaN is truthy, so pick the
                # first cell that is actually non-null.
                "dispensing_date": to_date(_first_non_null(r, "Dispensing date", "Dispensing Date")),
                "quantity_dispensed": to_int(r.get("Quantity Dispensed")),
                "dispensing_user": to_str(r.get("Dispensing User")),
                "quantity_returned": to_int(r.get("Quantity Returned")),
                "date_returned": to_date(r.get("Date Returned")),
                "return_user": to_str(r.get("Return User")),
            }
            # De-duplicate on _id: the last occurrence wins.
            by_id[doc["_id"]] = doc
    return list(by_id.values())
|
||||
|
||||
|
||||
def parse_destruction_files(study):
    """Parse the IP destruction basket workbooks of *study* into documents.

    Scans ``xls_ip_destruction_<study>/ip_destruction_basket_*.xlsx`` below
    ``BASE_DIR`` in sorted order. Basket metadata (investigator, site ID,
    location, basket ID, destruction date) is read from first-column cells
    starting with the matching label. The header row is the first row whose
    first cell is exactly ``Medication ID Description``.

    Returns:
        A list of dicts keyed by ``"<basket_id>:<medication_id>"`` in
        ``_id``. When the same key occurs more than once the last occurrence
        wins. Files without a header row or without a basket ID are skipped
        and reported on stdout.
    """
    dest_dir = os.path.join(BASE_DIR, f"xls_ip_destruction_{study}")
    meta_labels = (
        ("Investigator Name:", "investigator"),
        ("Site ID:", "site_id"),
        ("Location:", "location"),
        ("Basket ID:", "basket_id"),
        ("Drug Destruction Created Date:", "destruction_date"),
    )

    by_id = {}
    for path in sorted(glob.glob(os.path.join(dest_dir, "ip_destruction_basket_*.xlsx"))):
        raw = pd.read_excel(path, header=None)
        meta = {}
        header_row = None
        for i, row in raw.iterrows():
            first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
            for label, attr in meta_labels:
                if first.startswith(label):
                    meta[attr] = first.replace(label, "").strip()
            if header_row is None and first == "Medication ID Description":
                header_row = i

        if header_row is None:
            # Previously silent: the basket was dropped without any message.
            print(f" PRESKOCENO (chybi hlavicka 'Medication ID Description'): {path}")
            continue
        basket_id = meta.get("basket_id")
        if not basket_id:
            # Without a basket ID no row can be keyed, so every row would be dropped.
            print(f" PRESKOCENO (chybi 'Basket ID:'): {path}")
            continue

        df = pd.read_excel(path, header=header_row).dropna(how="all")
        # The destruction date belongs to the basket, so convert it once per file.
        destruction_date = to_date(meta.get("destruction_date"))
        for _, r in df.iterrows():
            med_id = to_str(r.get("Medication ID"))
            if not med_id:
                continue
            doc = {
                "_id": f"{basket_id}:{med_id}",
                "study": study,
                "site_id": meta.get("site_id"),
                "investigator": meta.get("investigator"),
                "location": meta.get("location"),
                "basket_id": basket_id,
                "destruction_date": destruction_date,
                "medication_description": to_str(r.get("Medication ID Description")),
                "medication_id": med_id,
                "packaged_lot_description": to_str(r.get("Packaged Lot description")),
                "comments": to_str(r.get("Comments")),
            }
            # De-duplicate on _id: the last occurrence wins.
            by_id[doc["_id"]] = doc
    return list(by_id.values())
|
||||
|
||||
|
||||
# ── hlavní import ────────────────────────────────────────────────────────────
|
||||
|
||||
def import_study(study):
    """Parse all drug workbooks of *study* and write them to MongoDB.

    Runs the four parsers, registers the run through ``log_import`` with the
    per-collection document counts, then writes shipments, shipment items
    and inventory through ``bulk_upsert_with_snapshot`` and destruction
    records through ``bulk_upsert_only``.
    """
    print(f"\n [{study}] parsovani XLSX...")
    shipments = parse_shipments_report(study)
    items = parse_shipment_details(study)
    inventory = parse_inventory(study)
    destruct = parse_destruction_files(study)

    counts = {
        "shipments": len(shipments),
        "shipment_items": len(items),
        "inventory": len(inventory),
        "destruction": len(destruct),
    }
    print(
        f" Zasilky: {counts['shipments']} | Polozky: {counts['shipment_items']}"
        f" | Sklad: {counts['inventory']} | Destrukce: {counts['destruction']}"
    )

    import_id = log_import(study, f"drugs_{study}", "drugs", counts)
    print(f" import_id = {import_id}")

    # (main collection, snapshot collection, documents), written in this order.
    snapshotted = (
        ("iwrs_shipments", "iwrs_shipments_snapshots", shipments),
        ("iwrs_shipment_items", "iwrs_shipment_items_snapshots", items),
        ("iwrs_inventory", "iwrs_inventory_snapshots", inventory),
    )
    for collection, snapshot_collection, docs in snapshotted:
        bulk_upsert_with_snapshot(collection, snapshot_collection, docs, import_id)
    # Destruction records are written without a snapshot collection.
    bulk_upsert_only("iwrs_destruction", destruct, import_id)
|
||||
|
||||
|
||||
def run(studies):
    """Call ``ensure_indexes()`` once, then ``import_study`` for each study in order."""
    ensure_indexes()
    for study in studies:
        import_study(study)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Studies come from the command line; with no arguments both known
    # studies are imported.
    studies = sys.argv[1:] or ["77242113UCO3001", "42847922MDD3003"]
    run(studies)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user