# janssen/Feasibility/sipiq_import_v1.2.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
sipiq_import_v1.2.py
====================
Verze:  1.2
Datum:  2026-06-17
Autor:  Claude Code (pro MUDr. Vladimíra Buzalku)

Změny proti v1.1
----------------
- PROVENANCE: ke každé odpovědi se ukládá `source_exported_at` = datum/čas reportu
  podle FILESYSTÉMU (mtime CSV souboru). Mimo content-hash → nezpůsobuje zbytečné
  UPDATE; backfilluje se i na "beze změny" cestě. Stará v1.1 ponechána v TRASH.

Změny proti v1.0
----------------
- FOLDER WORKFLOW (v1.1): režim --folder sebere *.csv ve složce, naimportuje (delta)
  a přesune do podsložky `Zpracováno`. Default složka =
  U:\\PythonProject\\Janssen\\Feasibility\\77242113UCO2001\\ImportSIPIQcompled.

Popis
-----
Import SIPIQ odpovědí (Qualtrics CSV export, studie 77242113UCO3002 / ICONIC DAWN)
do MongoDB db `feasibility`. Dvě kolekce:
  * sipiq_questions  – slovník dotazníku (1 dok = 1 logická otázka).
  * sipiq_responses  – 1 dok = 1 odpověď (_id = Qualtrics ResponseId), ploché answers{},
                       soft-link investigator_oid, delta bookkeeping + history[].

DELTA import: new response -> insert; unchanged response -> only last_seen_at,
source_file and source_exported_at are refreshed; changed response -> $set of all
document fields (except _id) + push of the detected changes to history[].

Použití
-------
  python sipiq_import_v1.2.py --dry-run            # folder režim, default složka
  python sipiq_import_v1.2.py --apply
  python sipiq_import_v1.2.py --folder "<cesta>" --apply
  python sipiq_import_v1.2.py --csv "<cesta.csv>" --apply   # jediný soubor (NEpřesouvá)

Závislosti: pymongo (.venv). Mongo 192.168.1.76:27017, bez auth.
"""
import argparse
import csv
import glob
import hashlib
import json
import os
import re
import shutil
import sys
import unicodedata
from datetime import datetime, timezone

try:
    from pymongo import MongoClient
except ImportError:
    print("CHYBA: pymongo není nainstalován v aktuálním pythonu.", file=sys.stderr)
    raise

# MongoDB endpoint and the database / collection names used by the importer.
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "feasibility"
# Questionnaire dictionary collection (written by process_file via replace_one).
COL_Q = "sipiq_questions"
# Responses collection (one document per ResponseId).
COL_R = "sipiq_responses"
# Default input folder for the folder workflow (used when --csv is not given).
# NOTE(review): the path contains study code 77242113UCO2001 while
# build_response() stamps documents with "77242113UCO3002" — confirm this is intended.
DEFAULT_FOLDER = r"U:\PythonProject\Janssen\Feasibility\77242113UCO2001\ImportSIPIQcompled"
# Sub-folder (inside the input folder) that imported CSV files are moved into.
PROCESSED_SUBDIR = "Zpracováno"

# Column codes of the export's metadata columns.
# Not referenced by any code visible in this file — presumably kept as a
# reference list; TODO confirm before removing.
META_COLS = {
    "StartDate", "EndDate", "Status", "IPAddress", "Progress", "Duration (in seconds)",
    "Finished", "RecordedDate", "ResponseId", "RecipientLastName", "RecipientFirstName",
    "RecipientEmail", "ExternalReference", "LocationLatitude", "LocationLongitude",
    "DistributionChannel", "UserLanguage",
}

# Column codes of site / investigator fields; build_response() reads these
# columns by name. The list itself is not referenced by any code visible in
# this file — presumably documentation of the promoted fields; TODO confirm.
PROMOTE = [
    "site_name", "site_address", "site_city", "site_state", "site_postcode", "site_country",
    "pi_first_name", "pi_last_name", "pi_phone", "pi_email",
    "sdl_site_id", "fire_site_id", "fire_investigator_id", "mailinglist_id",
    "survey_generated_by", "Date", "Time",
]

# Lookup table: question number -> name of the questionnaire section.
# Question numbers missing from the table are labelled "Other" by build_questions().
SECTION_BY_QNUM = {}


def _sec(rng, name):
    """Register every question number in *rng* under the section *name*."""
    SECTION_BY_QNUM.update(dict.fromkeys(rng, name))


_sec((2,), "J&J Internal Assessment")
_sec(range(6, 14), "Contact Information")
_sec(range(14, 22), "Confidentiality Statement")
_sec(range(25, 28), "Interest")
_sec(range(29, 35), "Protocol Requirements")
_sec(range(36, 39), "Enrollment")
_sec(range(40, 44), "Patient Demographics Overview")
_sec(range(45, 50), "Site Overview")
_sec((51,), "Operational Considerations")
_sec((53, 54), "Comments")
_sec(range(57, 62), "Patient Population")
_sec(range(63, 68), "Site Experience and Staffing")
_sec((69,), "Equipment and Facility Requirements")
_sec(range(71, 76), "Institutional Review Board, Ethics Committee, and Contracts")

# Question stems keyed by base question code. build_questions() uses these
# texts instead of the stem it parses from the CSV header row.
STEM_OVERRIDE = {
    "Q31": "At your site, at what line(s) of treatment do you most commonly prescribe "
           "vedolizumab for patients with moderately to severely active ulcerative colitis?",
    "Q63": "Do you or your site staff have experience in performing the following types of "
           "study assessments/procedures?",
    "Q64": "The following personnel are required to run the study. "
           "Will your site have the following available?",
    "Q69": "The following equipment and facilities are required to run the studies. "
           "Are these available at your site?",
}


def now_iso():
    """Return the current local time as an ISO-8601 string (with UTC offset), to the second."""
    utc_now = datetime.now(timezone.utc)
    local_now = utc_now.astimezone()
    return local_now.isoformat(timespec="seconds")


def file_mtime_iso(path):
    """Return the modification time of *path* as a local ISO-8601 string, to the second."""
    modified = os.path.getmtime(path)
    local_dt = datetime.fromtimestamp(modified).astimezone()
    return local_dt.isoformat(timespec="seconds")


def strip_accents(s):
    """Return *s* with combining marks removed (NFKD-decomposed); "" for empty/None input."""
    if not s:
        return ""
    decomposed = unicodedata.normalize("NFKD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)


def norm_name(s):
    """Normalize a name for matching: no accents, lower case, single spaces, trimmed."""
    folded = strip_accents(s or "").lower()
    collapsed = re.sub(r"\s+", " ", folded)
    return collapsed.strip()


def sanitize_key(qcode):
    """Return *qcode* with every "#" and "." replaced by "_"."""
    return qcode.translate(str.maketrans("#.", "__"))


def qnum(qcode):
    """Return the integer after the leading "Q" of *qcode*, or None if there is none."""
    found = re.match(r"Q(\d+)", qcode)
    if found is None:
        return None
    return int(found.group(1))


def qbase(qcode):
    """Return the leading "Q<digits>" part of *qcode*; the code itself when it has none."""
    found = re.match(r"(Q\d+)", qcode)
    if found is None:
        return qcode
    return found.group(1)


def import_id(h3_cell):
    """Extract the "ImportId" value from a third-header-row cell.

    The cell is expected to hold a JSON object. Returns the object's
    "ImportId" value ("" when the key is missing). When the cell is not
    valid JSON, or decodes to something other than an object, the cell is
    returned unchanged.
    """
    try:
        parsed = json.loads(h3_cell)
    except (TypeError, ValueError):
        # Only parsing failures fall back to the raw cell (JSONDecodeError is
        # a ValueError); unrelated errors are no longer swallowed.
        return h3_cell
    if not isinstance(parsed, dict):
        return h3_cell
    return parsed.get("ImportId", "")


def split_text(text):
    """Split a header text on " - " into (stem, label).

    The stem is the first piece. The label joins the remaining pieces with
    " - ", leaving out "Selected Choice" markers (case-insensitive) and pieces
    that look like "Q<n>#<n>". The label is None when there is no separator
    or when no piece survives the filtering.
    """
    stem, *rest = (piece.strip() for piece in re.split(r"\s+-\s+", text))
    if not rest:
        return stem, None
    kept = []
    for piece in rest:
        if piece.lower() == "selected choice":
            continue
        if re.fullmatch(r"Q\d+#\d+", piece):
            continue
        kept.append(piece)
    if not kept:
        return stem, None
    return stem, " - ".join(kept)


def detect_type(qcode, observed):
    """Classify a question column from its code and its observed values.

    Codes containing "#" are matrix columns: "matrix_yesno" when every
    non-empty value is Yes/No, "matrix_percent" when every non-empty value is
    a plain decimal number, otherwise "matrix". Other columns are "numeric",
    "yesno" or "single_or_text" by the same value checks. With no non-empty
    values neither check holds.
    """
    values = [v for v in observed if v]
    all_yesno = bool(values) and all(v in ("Yes", "No") for v in values)
    all_numeric = bool(values) and all(
        re.fullmatch(r"-?\d+(\.\d+)?", v) for v in values)

    if "#" in qcode:
        if all_yesno:
            return "matrix_yesno"
        if all_numeric:
            return "matrix_percent"
        return "matrix"
    if all_numeric:
        return "numeric"
    if all_yesno:
        return "yesno"
    return "single_or_text"


def load_csv(path):
    """Read a CSV export that has three header rows followed by data rows.

    Args:
        path: CSV file path; read as UTF-8 with an optional BOM.

    Returns:
        (cols, data): ``cols`` is a list of dicts with keys "i" (column
        index), "code" (row 1 cell), "text" (row 2 cell) and "qid"
        (import_id() of the row 3 cell); ``data`` is the list of remaining
        rows, each a list of strings. Columns are paired with zip(), so only
        as many columns as the shortest header row are described.

    Raises:
        ValueError: the file has fewer than three rows, so the header rows
            cannot be read.
    """
    with open(path, encoding="utf-8-sig", newline="") as fh:
        rows = list(csv.reader(fh))
    if len(rows) < 3:
        # An empty or truncated file previously surfaced as a bare IndexError.
        raise ValueError(
            f"{path}: soubor nemá 3 hlavičkové řádky Qualtrics exportu "
            f"(nalezeno řádků: {len(rows)})")
    h1, h2, h3 = rows[:3]
    data = rows[3:]
    cols = [{"i": i, "code": c, "text": t, "qid": import_id(j)}
            for i, (c, t, j) in enumerate(zip(h1, h2, h3))]
    return cols, data


def col_getter(cols, data):
    """Build a cell accessor that looks columns up by their code.

    Args:
        cols: column descriptors with "code" and "i" keys.
        data: not used by this function.

    Returns:
        (get, idx): ``idx`` maps column code -> column index (the last
        column wins for a repeated code); ``get(row, code)`` returns the
        stripped cell, or "" when the code is unknown or the row is too short.
    """
    idx = {}
    for col in cols:
        idx[col["code"]] = col["i"]

    def get(row, code):
        pos = idx.get(code)
        if pos is None or pos >= len(row):
            return ""
        return row[pos].strip()

    return get, idx


def is_question_col(code):
    """Return True when *code* starts with "Q" immediately followed by a digit."""
    return re.match(r"Q\d", code) is not None


def build_questions(cols, data):
    """Build the questionnaire dictionary from the question columns.

    Columns whose code passes is_question_col() are grouped by their base
    code (qbase()); each group becomes one question document.

    Args:
        cols: column descriptors with "i", "code", "text" and "qid" keys.
        data: data rows (lists of strings) used to collect observed values.

    Returns:
        List of question dicts in first-seen column order, with keys
        "_id", "order", "qnum", "section", "qids", "text", "items", "type"
        and "options".
    """
    qcols = [c for c in cols if is_question_col(c["code"])]

    # Distinct non-empty values seen in each question column.
    observed = {c["code"]: set() for c in qcols}
    for row in data:
        for c in qcols:
            v = (row[c["i"]].strip() if c["i"] < len(row) else "")
            if v:
                observed[c["code"]].add(v)

    groups, order_seen = {}, []
    for c in qcols:
        base = qbase(c["code"])
        if base not in groups:
            number = qnum(c["code"])
            # "order" is a placeholder here (keeps the field position in the
            # document); the final 0-based rank is assigned below.
            groups[base] = {"_id": base, "order": c["i"], "qnum": number,
                            "section": SECTION_BY_QNUM.get(number, "Other"),
                            "qids": [], "text": split_text(c["text"])[0],
                            "items": [], "_obs": set(), "_types": []}
            order_seen.append(base)
        g = groups[base]
        bq = re.match(r"(QID\d+)", c["qid"] or "")
        if bq and bq.group(1) not in g["qids"]:
            g["qids"].append(bq.group(1))
        _, label = split_text(c["text"])
        item = {"key": sanitize_key(c["code"]), "qcode": c["code"], "qid": c["qid"]}
        if label:
            item["label"] = label
        g["items"].append(item)
        g["_obs"] |= observed[c["code"]]
        g["_types"].append(detect_type(c["code"], observed[c["code"]]))

    out = []
    for n, base in enumerate(order_seen):
        g = groups[base]
        obs = sorted(g.pop("_obs"))
        types = g.pop("_types")
        # Most frequent column type wins. Iterating the list (not a set) makes
        # ties deterministic: the tied type that appears first in column order
        # is chosen, independent of string hash randomization.
        gtype = max(types, key=types.count) if types else "single_or_text"
        g["type"] = gtype
        if gtype in ("yesno", "matrix_yesno"):
            g["options"] = ["Yes", "No"]
        elif gtype == "single_or_text" and obs and len(obs) <= 12:
            # Few distinct values: expose them as the option list.
            g["options"] = obs
        else:
            g["options"] = []
        if base in STEM_OVERRIDE:
            g["text"] = STEM_OVERRIDE[base]
        g["order"] = n
        # A single unlabeled column is a plain question, not an item list.
        if len(g["items"]) == 1 and "label" not in g["items"][0]:
            g["items"] = []
        out.append(g)
    return out


def build_response(cols, get, row, source_file):
    """Turn one CSV data row into a response document.

    Args:
        cols: column descriptors with "i" and "code" keys.
        get: accessor ``get(row, code)`` returning the stripped cell or "".
        row: one data row (list of strings).
        source_file: file name stored in the document's "source_file" field.

    Returns:
        dict keyed by "_id" (the ResponseId cell) with site / investigator
        fields, a "meta" sub-document, flat "answers" (non-empty question
        cells keyed by sanitized column code) and empty soft-link fields.
        Empty cells are stored as None.
    """
    def text(code):
        # Empty cell -> None.
        return get(row, code) or None

    def email(code):
        # Lower-cased address, None when the cell is empty.
        return (get(row, code) or "").lower() or None

    answers = {}
    for col in cols:
        code = col["code"]
        if not is_question_col(code):
            continue
        pos = col["i"]
        value = row[pos].strip() if pos < len(row) else ""
        if value:
            answers[sanitize_key(code)] = value

    progress_raw = get(row, "Progress")
    duration_raw = get(row, "Duration (in seconds)")
    full_markers = ("Q57", "Q58", "Q59", "Q63", "Q66", "Q71")

    meta = {
        "start_date": text("StartDate"),
        "end_date": text("EndDate"),
        "recorded_date": text("RecordedDate"),
        "status": text("Status"),
        # Digits become an int; any other non-empty text is kept verbatim.
        "progress": int(progress_raw) if progress_raw.isdigit() else (progress_raw or None),
        "finished": get(row, "Finished") in ("True", "1", "TRUE"),
        "duration_sec": int(duration_raw) if duration_raw.isdigit() else None,
        "user_language": text("UserLanguage"),
        "distribution_channel": text("DistributionChannel"),
        "ip_address": text("IPAddress"),
        "location_lat": text("LocationLatitude"),
        "location_lng": text("LocationLongitude"),
        "survey_date": text("Date"),
        "survey_time": text("Time"),
    }
    return {
        "_id": get(row, "ResponseId"),
        "study": "77242113UCO3002",
        "site_country": text("site_country"),
        "site_name": text("site_name"),
        "site_city": text("site_city"),
        "site_state": text("site_state"),
        "site_postcode": text("site_postcode"),
        "site_address": text("site_address"),
        "pi_first_name": text("pi_first_name"),
        "pi_last_name": text("pi_last_name"),
        "pi_email": email("pi_email"),
        "pi_phone": text("pi_phone"),
        "sdl_site_id": text("sdl_site_id"),
        "fire_site_id": text("fire_site_id"),
        "fire_investigator_id": text("fire_investigator_id"),
        "mailinglist_id": text("mailinglist_id"),
        "survey_generated_by": text("survey_generated_by"),
        "recipient_email": email("RecipientEmail"),
        "recipient_last_name": text("RecipientLastName"),
        "recipient_first_name": text("RecipientFirstName"),
        "meta": meta,
        # True when any answer key starts with one of the listed question codes.
        "is_full_sipiq": any(key.startswith(full_markers) for key in answers),
        "interested": answers.get("Q25"),
        "answers": answers,
        "investigator_oid": None,
        "investigator_match": None,
        "source_file": source_file,
    }


def content_hash(doc):
    """Return the SHA-256 hex digest of the document's content fields.

    Bookkeeping, soft-link and provenance fields are left out, so they never
    change the hash. The remaining fields are serialized as key-sorted JSON.
    """
    ignored = {
        "content_sha256", "first_imported_at", "last_seen_at", "last_updated_at",
        "history", "investigator_oid", "investigator_match", "source_file",
        "source_exported_at",
    }
    payload = {key: value for key, value in doc.items() if key not in ignored}
    blob = json.dumps(payload, sort_keys=True, ensure_ascii=False, default=str)
    return hashlib.sha256(blob.encode("utf-8")).hexdigest()


def load_investigators(db):
    """Load Czech and Slovak investigators and index them for soft-linking.

    Returns:
        (inv, by_email, by_name): ``inv`` is the list of fetched documents;
        ``by_email`` maps a lower-cased address from "email"/"email2" to the
        first document carrying it; ``by_name`` maps
        (normalized surname, country) to the list of matching documents.
    """
    query = {"zeme": {"$in": ["Czech Republic", "Slovakia"]}}
    projection = {"prijmeni": 1, "jmeno": 1, "email": 1, "email2": 1, "zeme": 1, "KROK": 1}
    inv = list(db.investigators.find(query, projection))

    by_email, by_name = {}, {}
    for person in inv:
        for field in ("email", "email2"):
            address = (person.get(field) or "").lower().strip()
            # The first investigator seen with an address keeps it.
            if address and address not in by_email:
                by_email[address] = person
        surname = norm_name(person.get("prijmeni"))
        if not surname:
            continue
        by_name.setdefault((surname, person.get("zeme")), []).append(person)
    return inv, by_email, by_name


def soft_link(doc, by_email, by_name):
    """Find the investigator that a response document belongs to.

    Tries the PI e-mail, then the recipient e-mail, then a unique
    (normalized surname, site country) match.

    Returns:
        (investigator _id or None, description of how the match was made or
        why it failed, matched investigator document or None).
    """
    pi_mail = (doc.get("pi_email") or "").lower().strip()
    if pi_mail and pi_mail in by_email:
        hit = by_email[pi_mail]
        return hit["_id"], f"email:{pi_mail}", hit

    recipient_mail = (doc.get("recipient_email") or "").lower().strip()
    if recipient_mail and recipient_mail in by_email:
        hit = by_email[recipient_mail]
        return hit["_id"], f"recipient_email:{recipient_mail}", hit

    nm = norm_name(doc.get("pi_last_name"))
    candidates = by_name.get((nm, doc.get("site_country")), [])
    count = len(candidates)
    if count == 1:
        only = candidates[0]
        return only["_id"], f"prijmeni:{nm}", only
    if count > 1:
        # Several investigators share the surname: report it, do not guess.
        return None, f"prijmeni_ambiguous:{nm}({count})", None
    return None, "NENALEZENO", None


def diff_docs(old, new):
    """List the field-level differences between a stored and a new response.

    "answers" and "meta" are compared key by key (nested dicts recursively,
    keys reported with dotted paths). Top-level fields are compared next:
    the five fields that were always tracked, followed by every other
    content field present in *new*. Previously only those five were
    compared, so a change to e.g. "site_city" produced an update with an
    empty change list. Bookkeeping, soft-link and provenance fields are
    not reported.

    Args:
        old: document currently stored.
        new: freshly built document.

    Returns:
        List of {"key", "old", "new"} dicts; empty when nothing differs.
    """
    changes = []

    def walk(prefix, o, n):
        o, n = o or {}, n or {}
        for k in sorted(set(o.keys()) | set(n.keys())):
            ov, nv = o.get(k), n.get(k)
            if isinstance(ov, dict) or isinstance(nv, dict):
                walk(f"{prefix}{k}.", ov, nv)
            elif ov != nv:
                changes.append({"key": f"{prefix}{k}", "old": ov, "new": nv})

    for field in ("answers", "meta"):
        walk(f"{field}.", old.get(field, {}), new.get(field, {}))

    # Always-tracked fields first (compared even when absent from `new`).
    tracked = ["site_name", "pi_email", "pi_last_name", "interested", "is_full_sipiq"]
    # Fields that are handled above or are not response content.
    not_content = {
        "_id", "answers", "meta",
        "content_sha256", "first_imported_at", "last_seen_at", "last_updated_at",
        "history", "investigator_oid", "investigator_match", "source_file",
        "source_exported_at",
    }
    # Only keys of `new` are added: fields that exist solely in the stored
    # document are not touched by the update and must not be reported.
    tracked += [k for k in new if k not in not_content and k not in tracked]
    for k in tracked:
        if old.get(k) != new.get(k):
            changes.append({"key": k, "old": old.get(k), "new": new.get(k)})
    return changes


# ---------------------------------------------------------------------------
def process_file(db, csv_path, scope, dry, by_email, by_name):
    """Import one SIPIQ CSV export into MongoDB, or only report the delta.

    Args:
        db: pymongo database handle; collections COL_Q and COL_R are used.
        csv_path: path of the CSV file to import.
        scope: "czsk" keeps only rows whose site_country is "Czech Republic"
            or "Slovakia"; any other value keeps all rows.
        dry: when true, print the planned delta and write nothing.
        by_email: e-mail lookup as returned by load_investigators().
        by_name: surname lookup as returned by load_investigators().

    Returns:
        dict with "insert", "update" and "unchanged" counts of written
        responses (all 0 on a dry run) and "wrote" (False on a dry run).
    """
    source_file = os.path.basename(csv_path)
    exported_at = file_mtime_iso(csv_path)   # report date/time taken from the file's mtime
    # The file is parsed once; the question dictionary below uses all rows,
    # the responses only the rows selected by `scope`.
    cols, all_rows = load_csv(csv_path)
    get, _ = col_getter(cols, all_rows)

    data = all_rows
    if scope == "czsk":
        data = [r for r in data if get(r, "site_country") in ("Czech Republic", "Slovakia")]
    # A row without ResponseId (e.g. a blank line) cannot be tracked by the
    # delta logic and would be stored under _id "" — skip it.
    identified = [r for r in data if get(r, "ResponseId")]
    skipped = len(data) - len(identified)
    data = identified
    print(f"\n########## {source_file}  (rozsah={scope}, odpovědí={len(data)}, export={exported_at}) ##########")
    if skipped:
        print(f"  ! přeskočeno řádků bez ResponseId: {skipped}")

    questions = build_questions(cols, all_rows)

    docs, link_rows = [], []
    for r in data:
        doc = build_response(cols, get, r, source_file)
        oid, how, matched = soft_link(doc, by_email, by_name)
        doc["investigator_oid"] = oid
        doc["investigator_match"] = how
        doc["source_exported_at"] = exported_at
        doc["content_sha256"] = content_hash(doc)
        docs.append(doc)
        link_rows.append((doc, how, matched))

    # Fetch stored hashes only for the ids present in this file.
    ids = [d["_id"] for d in docs]
    existing = {d["_id"]: d
                for d in db[COL_R].find({"_id": {"$in": ids}}, {"content_sha256": 1})}
    to_insert, to_update, unchanged = [], [], []
    for d in docs:
        known = existing.get(d["_id"])
        if known is None:
            to_insert.append(d)
        elif known.get("content_sha256") != d["content_sha256"]:
            to_update.append(d)
        else:
            unchanged.append(d)

    # Soft-link statistics: matched with KROK starting with "7", matched
    # with any other KROK, and unmatched.
    mk7 = mko = un = 0
    for _doc, _how, m in link_rows:
        if not m:
            un += 1
        elif str(m.get("KROK", "")).startswith("7"):
            mk7 += 1
        else:
            mko += 1
    print(f"  slovník: {len(questions)} otázek | soft-link: KROK7={mk7}, jiný={mko}, nenapárováno={un}")
    print(f"  delta: INSERT={len(to_insert)}, UPDATE={len(to_update)}, beze změny={len(unchanged)}")
    if un:
        for doc, how, m in link_rows:
            if not m:
                print(f"    ✗ NENAPÁROVÁNO: {doc.get('pi_last_name')} / {doc.get('pi_email')} ({how})")

    if dry:
        print("  [DRY-RUN] nezapsáno")
        return {"insert": 0, "update": 0, "unchanged": 0, "wrote": False}

    for q in questions:
        db[COL_Q].replace_one({"_id": q["_id"]}, q, upsert=True)
    ts = now_iso()
    ni = nu = ns = 0
    for d in docs:
        # Re-read the stored document: the full document is needed for the
        # diff, and this also covers a ResponseId repeated within the file.
        cur = db[COL_R].find_one({"_id": d["_id"]})
        if cur is None:
            d.update({"first_imported_at": ts, "last_seen_at": ts, "last_updated_at": ts, "history": []})
            db[COL_R].insert_one(d)
            ni += 1
        elif cur.get("content_sha256") != d["content_sha256"]:
            changes = diff_docs(cur, d)
            db[COL_R].update_one({"_id": d["_id"]}, {
                "$set": {**{k: d[k] for k in d if k != "_id"}, "last_seen_at": ts, "last_updated_at": ts},
                "$push": {"history": {"changed_at": ts, "source_file": source_file, "changes": changes}}})
            nu += 1
        else:
            # Content unchanged: refresh only the bookkeeping / provenance fields.
            db[COL_R].update_one({"_id": d["_id"]}, {"$set": {
                "last_seen_at": ts, "source_file": source_file, "source_exported_at": d["source_exported_at"]}})
            ns += 1
    print(f"  [APPLY] questions upsert={len(questions)} | responses insert={ni}, update={nu}, beze změny={ns}")
    return {"insert": ni, "update": nu, "unchanged": ns, "wrote": True}


def move_to_processed(csv_path, folder):
    """Move *csv_path* into the processed sub-folder of *folder*.

    The sub-folder is created when missing. If a file of the same name is
    already there, "_1", "_2", ... is appended to the stem until the name
    is free.
    """
    dest_dir = os.path.join(folder, PROCESSED_SUBDIR)
    os.makedirs(dest_dir, exist_ok=True)

    base = os.path.basename(csv_path)
    stem, ext = os.path.splitext(base)
    dest = os.path.join(dest_dir, base)
    counter = 0
    while os.path.exists(dest):
        counter += 1
        dest = os.path.join(dest_dir, f"{stem}_{counter}{ext}")

    shutil.move(csv_path, dest)
    print(f"  -> přesunuto do {PROCESSED_SUBDIR}\\{os.path.basename(dest)}")


def main():
    """Command-line entry point: import one CSV file or every CSV in a folder.

    Nothing is written unless --apply is given. --dry-run always forces a
    dry run, even together with --apply (previously the flag was parsed
    but ignored). In folder mode each file is moved to the processed
    sub-folder after it has been written; with --csv the file is not moved.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", help="jediný soubor (NEpřesouvá)")
    ap.add_argument("--folder", default=DEFAULT_FOLDER, help="složka se SIPIQ CSV (přesune do Zpracováno)")
    ap.add_argument("--scope", choices=["czsk", "all"], default="czsk")
    ap.add_argument("--apply", action="store_true")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()
    # Safe by default: write only when --apply is given and --dry-run is not.
    dry = args.dry_run or not args.apply

    if args.csv:
        files, move_mode, folder = [args.csv], False, None
    else:
        folder = args.folder
        files = sorted(glob.glob(os.path.join(folder, "*.csv")))
        move_mode = True
        print(f"Složka: {folder}\nNalezeno CSV ke zpracování: {len(files)}")
        if not files:
            print("Nic ke zpracování (žádné *.csv).")
            return

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=8000)
    try:
        db = client[DB_NAME]
        # Fail fast when the server is unreachable.
        client.admin.command("ping")
        inv, by_email, by_name = load_investigators(db)
        print(f"Investigatorů CZ+SK v DB: {len(inv)}")

        total = {"insert": 0, "update": 0, "unchanged": 0}
        for f in files:
            res = process_file(db, f, args.scope, dry, by_email, by_name)
            for k in total:
                total[k] += res[k]
            # Move only files that were actually written to the database.
            if move_mode and res["wrote"]:
                move_to_processed(f, folder)

        print(f"\n=== CELKEM: insert={total['insert']}, update={total['update']}, beze změny={total['unchanged']} ===")
        if dry:
            print("[DRY-RUN] Nic se nezapsalo ani nepřesunulo. Ostrý běh: --apply")
    finally:
        # Close the connection even when a file fails to import.
        client.close()


# Run the importer only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()