Migrate IWRS from MySQL to MongoDB

- Add IWRS/common/mongo_writer.py with shared connection, indexes, upsert+snapshot helpers
- Add IWRS/Patients/import_to_mongo.py (subject_summary + visits)
- Add IWRS/Patients/import_notifications_to_mongo.py: parse PDF/JSON directly to Mongo (incl. PDF as BinData), replaces 2-step MySQL flow
- Add IWRS/Drugs/import_to_mongo.py (shipments, items, inventory, destruction)
- Add IWRS/backfill_mysql_to_mongo.py: one-shot history backfill
- Switch IWRS/Patients/run_all.py and IWRS/Drugs/run_all.py to Mongo
- Rewrite IWRS/Drugs/create_report.py data loaders to read from Mongo
- 8 main collections (upsert = latest state) + 5 snapshot collections (append-only with import_id) under studie database; notifications and destruction are immutable and need no snapshots

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-03 07:24:36 +02:00
parent 681095d557
commit ea9d611719
2080 changed files with 9465 additions and 172 deletions
@@ -0,0 +1,163 @@
+"""
+Import IWRS notifications (PDF + JSON metadata) directly into MongoDB.
+
+Replaces the original two-step flow:
+  1) import_to_mysql.py: PDF/JSON → MySQL iwrs_notifications
+  2) parse_notifications_to_mongo.py: MySQL → Mongo iwrs_notifications
+
+Now everything happens in one step: PDF/JSON → Mongo iwrs_notifications (text
+parsed per type, PDF stored as BinData). After a successful import the files
+are moved to the Zpracovano/ sub-directory.
+
+Idempotent: _id = pk (IWRS unique identifier).
+"""
+
+import os
+import sys
+import re
+import json
+import shutil
+import datetime
+
+from bson.binary import Binary
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from common.mongo_writer import get_db, to_date
+
+# parsery z původního skriptu
+from parse_notifications_to_mongo import (
+    parse_kv_lines, parse_medication_table, to_date as parse_to_date,
+    to_datetime as parse_to_datetime,
+)
+
+BASE_DIR     = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
+DETAILS_DIR  = os.path.join(BASE_DIR, "IncomingSourceReportsDetails")  # root of the per-study input folders
+STUDIES      = ["77242113UCO3001", "42847922MDD3003"]  # studies processed when main() gets no explicit list
+
+
+def find_pairs(study):
+    """Vrátí seznam (json_path, pdf_path) — i když pdf neexistuje."""
+    out_dir = os.path.join(DETAILS_DIR, study)
+    if not os.path.isdir(out_dir):
+        return []
+    pairs = []
+    import glob
+    for jp in sorted(glob.glob(os.path.join(out_dir, "*.json"))):
+        pp = jp.replace(".json", ".pdf")
+        pairs.append((jp, pp if os.path.exists(pp) else None))
+    return pairs
+
+
+def build_document(meta, pdf_bytes):
+    text = meta.get("text") or ""
+    kv = parse_kv_lines(text)
+    meds = parse_medication_table(text)
+
+    dt_site = parse_to_datetime(kv.get("Transaction Date/Time (site local)"))
+    dt_sys = parse_to_datetime(kv.get("Transaction Date/Time (system local)"))
+
+    date_fields = [
+        "Informed Consent Date",
+        "Informed Consent Date at Screening",
+        "Informed Consent Date at Subject Creation",
+        "Date of Subject Creation in IRT",
+        "Date of Screening in IRT",
+        "Screenfail Date",
+        "Discontinuation date",
+        "Dispensation date",
+        "Returned Date",
+    ]
+    parsed_dates = {}
+    for f in date_fields:
+        if f in kv:
+            d = parse_to_date(kv[f])
+            if d:
+                parsed_dates[f] = d
+
+    pk = meta.get("pk")
+    actual_date = to_date(meta.get("actual_date"))
+
+    doc = {
+        "_id": pk,
+        "pk": pk,
+        "study": meta.get("study"),
+        "subject": meta.get("subject"),
+        "title": meta.get("title"),
+        "label": meta.get("label"),
+        "event": meta.get("event"),
+        "actual_date": actual_date,
+        "site": kv.get("Site"),
+        "investigator": kv.get("Investigator"),
+        "location": kv.get("Location"),
+        "cohort": kv.get("Cohort"),
+        "irt_subject_status": kv.get("IRT Subject Status"),
+        "transaction_site_local": dt_site,
+        "transaction_system_local": dt_sys,
+        "transaction_by": kv.get("Transaction performed by"),
+        "medications": meds,
+        "fields": {k: v for k, v in kv.items() if k not in {
+            "Site", "Investigator", "Location", "Cohort", "IRT Subject Status",
+            "Subject",
+            "Transaction Date/Time (site local)",
+            "Transaction Date/Time (system local)",
+            "Transaction performed by",
+        }},
+        "parsed_dates": parsed_dates,
+        "raw_text": text,
+    }
+    if pdf_bytes is not None:
+        doc["pdf"] = Binary(pdf_bytes)
+    return doc
+
+
+def import_study(study):
+    pairs = find_pairs(study)
+    if not pairs:
+        print(f"  [{study}] zadne nove notifikace")
+        return 0
+
+    db = get_db()
+    coll = db.iwrs_notifications
+    done_dir = os.path.join(DETAILS_DIR, study, "Zpracovano")
+    os.makedirs(done_dir, exist_ok=True)
+
+    imported = 0
+    failed = 0
+    for json_path, pdf_path in pairs:
+        try:
+            with open(json_path, "r", encoding="utf-8") as f:
+                meta = json.load(f)
+            pdf_bytes = None
+            if pdf_path:
+                with open(pdf_path, "rb") as f:
+                    pdf_bytes = f.read()
+
+            if not meta.get("pk"):
+                print(f"  CHYBI pk: {os.path.basename(json_path)}")
+                failed += 1
+                continue
+
+            doc = build_document(meta, pdf_bytes)
+            doc["last_imported_at"] = datetime.datetime.now()
+            coll.replace_one({"_id": doc["_id"]}, doc, upsert=True)
+            imported += 1
+
+            # presun zpracovany pair
+            shutil.move(json_path, os.path.join(done_dir, os.path.basename(json_path)))
+            if pdf_path:
+                shutil.move(pdf_path, os.path.join(done_dir, os.path.basename(pdf_path)))
+        except Exception as e:
+            print(f"  CHYBA {os.path.basename(json_path)}: {e}")
+            failed += 1
+
+    print(f"  [{study}] importovano={imported}  selhalo={failed}")
+    return imported
+
+
+def main(studies=None):
+    studies = studies or STUDIES
+    for s in studies:
+        import_study(s)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:] or None)