Files
janssen/Feasibility/TRASH/sipiq_import_v1.0.py
T
2026-06-17 15:05:10 +02:00

535 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
sipiq_import_v1.0.py
====================
Verze: 1.0
Datum: 2026-06-17
Autor: Claude Code (pro MUDr. Vladimíra Buzalku)
Popis
-----
Import SIPIQ odpovědí (Qualtrics CSV export, studie 77242113UCO3002 / ICONIC DAWN)
do MongoDB db `feasibility`. Cílem je:
(a) umožnit křížovou analýzu „otázka × otázka" (ploché odpovědi keyed by Qcode),
(b) umožnit zrekonstruovat KOMPLETNÍ SIPIQ tak, jak ho zkoušející vidí v PDF,
jen s vyplněnými odpověďmi (slovník otázek se sekcí/pořadím/popisky).
Dvě kolekce v db `feasibility`:
* sipiq_questions slovník dotazníku (1 dok = 1 logická otázka; section, order,
text, items[], type, options). Idempotentní (upsert dle _id).
* sipiq_responses 1 dok = 1 odpověď (_id = Qualtrics ResponseId). Identita centra/PI
nahoře, ploché answers{}, meta{}, soft-link investigator_oid,
delta bookkeeping (content_sha256, history[], timestamps).
DELTA import (přepíše JEN změněná data):
- nová odpověď -> insert
- existuje, beze změn -> aktualizuje pouze last_seen_at (+ source_file)
- existuje, něco se změnilo -> $set všech polí dokumentu (kromě _id) + push do history[] {key,old,new}
Soft-link na feasibility.investigators:
- primárně pi_email == email / email2 (lowercase)
- fallback příjmení (bez diakritiky, lower) + země (CZ/SK)
- nedestruktivní: kolekci investigators NEMĚNÍ, jen ukládá investigator_oid do response.
Rozsah: default CZ + SK (--scope czsk). --scope all = všech 276.
Použití:
python sipiq_import_v1.0.py --csv "<cesta.csv>" --dry-run
python sipiq_import_v1.0.py --csv "<cesta.csv>" --apply
Závislosti: pymongo (.venv). Mongo 192.168.1.76:27017, bez auth.
"""
import argparse
import csv
import hashlib
import json
import re
import sys
import unicodedata
from datetime import datetime, timezone
try:
from pymongo import MongoClient
except ImportError:
print("CHYBA: pymongo není nainstalován v aktuálním pythonu.", file=sys.stderr)
raise
# Connection target and collection names used by the importer.
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "feasibility"
COL_Q = "sipiq_questions"
COL_R = "sipiq_responses"
# Qualtrics system metadata columns (these do NOT go into answers).
# NOTE(review): not referenced by the code in this file as reviewed — confirm before relying on it.
META_COLS = {
    "StartDate", "EndDate", "Status", "IPAddress", "Progress", "Duration (in seconds)",
    "Finished", "RecordedDate", "ResponseId", "RecipientLastName", "RecipientFirstName",
    "RecipientEmail", "ExternalReference", "LocationLatitude", "LocationLongitude",
    "DistributionChannel", "UserLanguage",
}
# Embedded SDL fields promoted to the top level of the document (queryable identity).
# NOTE(review): build_response() lists its promoted fields explicitly rather than
# reading this list — keep the two in sync by hand.
PROMOTE = [
    "site_name", "site_address", "site_city", "site_state", "site_postcode", "site_country",
    "pi_first_name", "pi_last_name", "pi_phone", "pi_email",
    "sdl_site_id", "fire_site_id", "fire_investigator_id", "mailinglist_id",
    "survey_generated_by", "Date", "Time",
]
# Sections according to the verified catalogue
# (maps the base question number -> section heading in the PDF).
SECTION_BY_QNUM = {}


def _sec(rng, name):
    """Register every question number in *rng* under the section *name*."""
    SECTION_BY_QNUM.update(dict.fromkeys(rng, name))


_sec([2], "J&J Internal Assessment")
_sec(range(6, 14), "Contact Information")
_sec(range(14, 22), "Confidentiality Statement")
_sec(range(25, 28), "Interest")
_sec(range(29, 35), "Protocol Requirements")
_sec(range(36, 39), "Enrollment")
_sec(range(40, 44), "Patient Demographics Overview")
_sec(range(45, 50), "Site Overview")
_sec([51], "Operational Considerations")
_sec(range(53, 55), "Comments")
_sec(range(57, 62), "Patient Population")
_sec(range(63, 68), "Site Experience and Staffing")
_sec([69], "Equipment and Facility Requirements")
_sec(range(71, 76), "Institutional Review Board, Ethics Committee, and Contracts")
# Full wording of questions that Qualtrics truncates with "..." in the CSV header
# (matrix questions). Source: the blank SIPIQ PDF (ICONIC ... _SipIQ_V1_13MAY2026.pdf).
# Keyed by the base question code; build_questions() uses these to replace the stem text.
STEM_OVERRIDE = {
    "Q31": "At your site, at what line(s) of treatment do you most commonly prescribe "
           "vedolizumab for patients with moderately to severely active ulcerative colitis?",
    "Q63": "Do you or your site staff have experience in performing the following types of "
           "study assessments/procedures?",
    "Q64": "The following personnel are required to run the study. "
           "Will your site have the following available?",
    "Q69": "The following equipment and facilities are required to run the studies. "
           "Are these available at your site?",
}
def now_iso():
    """Return the current local time as an ISO-8601 string with UTC offset, to the second."""
    utc_now = datetime.now(timezone.utc)
    local_now = utc_now.astimezone()
    return local_now.isoformat(timespec="seconds")
def strip_accents(s):
    """Return *s* with combining marks removed after NFKD decomposition.

    Falsy input (None or an empty string) yields an empty string.
    """
    if not s:
        return ""
    kept = []
    for ch in unicodedata.normalize("NFKD", s):
        if unicodedata.combining(ch):
            # Combining mark (e.g. an acute accent split off by NFKD) - drop it.
            continue
        kept.append(ch)
    return "".join(kept)
def norm_name(s):
    """Normalise a name for matching: no accents, lower case, single spaces, trimmed."""
    plain = strip_accents(s or "")
    collapsed = re.sub(r"\s+", " ", plain.lower())
    return collapsed.strip()
def sanitize_key(qcode):
    """Turn a Qcode into a MongoDB-safe answers{} key by mapping '#' and '.' to '_'."""
    unsafe_to_underscore = str.maketrans("#.", "__")
    return qcode.translate(unsafe_to_underscore)
def qnum(qcode):
    """Return the question number of a Qcode (Q63#1_2 -> 63, Q40_6_TEXT -> 40).

    Returns None when the code does not start with 'Q' followed by digits.
    """
    found = re.match(r"Q(\d+)", qcode)
    if found is None:
        return None
    return int(found[1])
def qbase(qcode):
    """Return the logical base of a Qcode (Q63#1_2 -> Q63, Q40_6 -> Q40, Q25 -> Q25).

    A code that does not start with 'Q' followed by digits is returned unchanged.
    """
    found = re.match(r"(Q\d+)", qcode)
    if found:
        return found[1]
    return qcode
def import_id(h3_cell):
    """Extract the Qualtrics ImportId from a cell of the third header row.

    The cell is expected to hold a JSON object such as ``{"ImportId": "QID25"}``.

    Returns:
        The value stored under "ImportId" ("" when the object has no such key),
        or the cell itself, unchanged, when it is not a JSON object.
    """
    try:
        parsed = json.loads(h3_cell)
    except (TypeError, ValueError):
        # Not JSON at all (json.JSONDecodeError is a ValueError; None raises
        # TypeError) - fall back to the raw cell.
        return h3_cell
    if not isinstance(parsed, dict):
        # Valid JSON but not an object (e.g. "42" or "[1, 2]") - nothing to look up.
        return h3_cell
    return parsed.get("ImportId", "")
def split_text(text):
    """Split a header text into (stem, item_label).

    The text is cut on " - " separators. The first piece is the question stem;
    the remaining pieces form the label of the sub-item, after dropping Qualtrics
    artefacts. The label is None when the text has no separator or when nothing
    is left after the clean-up.
    """
    stem, *rest = (piece.strip() for piece in re.split(r"\s+-\s+", text))
    if not rest:
        return stem, None

    kept = []
    for piece in rest:
        # "Selected Choice" is an artefact of single-choice questions with "Other".
        if piece.lower() == "selected choice":
            continue
        # Internal statement codes such as "Q63#1" carry no label information.
        if re.fullmatch(r"Q\d+#\d+", piece):
            continue
        kept.append(piece)

    if not kept:
        return stem, None
    return stem, " - ".join(kept)
def detect_type(qcode, observed):
    """Guess the question type from the Qcode and the values observed in its column.

    A '#' in the Qcode marks a matrix column. Empty values are ignored; with no
    values left the column is neither yes/no nor numeric.
    """
    values = [v for v in observed if v]
    is_yesno = bool(values) and all(v in ("Yes", "No") for v in values)
    is_numeric = bool(values) and all(
        re.fullmatch(r"-?\d+(\.\d+)?", v) for v in values
    )

    if "#" in qcode:
        if is_yesno:
            return "matrix_yesno"
        if is_numeric:
            return "matrix_percent"
        return "matrix"

    if is_numeric:
        return "numeric"
    if is_yesno:
        return "yesno"
    return "single_or_text"
# ---------------------------------------------------------------------------
def load_csv(path):
    """Read a Qualtrics CSV export and split it into column descriptors and data rows.

    The export starts with three header rows: column codes, question texts and
    JSON cells carrying the ImportId. Everything after them is response data.

    Args:
        path: Path of the CSV file (read as UTF-8, a leading BOM is tolerated).

    Returns:
        ``(cols, data)`` where ``cols`` is a list of dicts with keys ``i`` (column
        index), ``code``, ``text`` and ``qid``, and ``data`` is the list of raw rows.

    Raises:
        ValueError: If the file has fewer than the three header rows.
    """
    with open(path, encoding="utf-8-sig", newline="") as fh:
        rows = list(csv.reader(fh))
    if len(rows) < 3:
        # Fail with a readable message instead of an IndexError on rows[2].
        raise ValueError(
            f"{path}: expected 3 Qualtrics header rows, found only {len(rows)} row(s)"
        )
    codes, texts, import_cells = rows[:3]
    cols = [
        {"i": i, "code": code, "text": text, "qid": import_id(imp)}
        for i, (code, text, imp) in enumerate(zip(codes, texts, import_cells))
    ]
    return cols, rows[3:]
def col_getter(cols, data):
    """Build a cell accessor keyed by column code.

    Returns ``(get, idx)``: ``idx`` maps column code -> column index, and
    ``get(row, code)`` returns the stripped cell, or "" when the code is unknown
    or the row is too short. *data* is accepted but not used.
    """
    idx = {}
    for col in cols:
        idx[col["code"]] = col["i"]

    def get(row, code):
        pos = idx.get(code)
        if pos is None or pos >= len(row):
            return ""
        return row[pos].strip()

    return get, idx
def is_question_col(code):
    """Return True when the column code starts with 'Q' followed by a digit."""
    return re.match(r"Q\d", code) is not None
# ---------------------------------------------------------------------------
def build_questions(cols, data):
    """Build the question dictionary: one document per logical question.

    Columns whose code looks like a question (see is_question_col) are grouped by
    their base code (Q63#1_2 -> Q63). Each group becomes one document.

    Args:
        cols: Column descriptors as produced by load_csv().
        data: Raw data rows; used only to collect the values observed per column.

    Returns:
        A list of dicts in order of first appearance in the CSV, each with the keys
        ``_id``, ``order`` (0..N-1), ``qnum``, ``section``, ``qids``, ``text``,
        ``items``, ``type`` and ``options``.
    """
    qcols = [c for c in cols if is_question_col(c["code"])]

    # Distinct non-empty values per Qcode (drive type detection and options).
    observed = {c["code"]: set() for c in qcols}
    for row in data:
        for c in qcols:
            v = row[c["i"]].strip() if c["i"] < len(row) else ""
            if v:
                observed[c["code"]].add(v)

    # base code -> document; dict insertion order is the order of first appearance.
    groups = {}
    for c in qcols:
        code = c["code"]
        base = qbase(code)
        stem, label = split_text(c["text"])
        g = groups.get(base)
        if g is None:
            number = qnum(code)
            g = groups[base] = {
                "_id": base,
                "order": c["i"],
                "qnum": number,
                "section": SECTION_BY_QNUM.get(number, "Other"),
                "qids": [],
                "text": stem,
                "items": [],
                "_obs": set(),    # scratch: union of observed values, removed below
                "_types": [],     # scratch: detected type per column, removed below
            }
        base_qid = re.match(r"(QID\d+)", c["qid"] or "")
        if base_qid and base_qid.group(1) not in g["qids"]:
            g["qids"].append(base_qid.group(1))
        item = {"key": sanitize_key(code), "qcode": code, "qid": c["qid"]}
        if label:
            item["label"] = label
        g["items"].append(item)
        g["_obs"] |= observed[code]
        g["_types"].append(detect_type(code, observed[code]))

    out = []
    for n, g in enumerate(groups.values()):
        obs = sorted(g.pop("_obs"))
        types = g.pop("_types")
        # Group type = most frequent column type. Ties go to the type that occurs
        # first in column order: dict.fromkeys() keeps insertion order and max()
        # returns the first maximal element. Iterating a set of strings here would
        # make the tie-break depend on hash randomisation, i.e. differ between runs.
        gtype = max(dict.fromkeys(types), key=types.count) if types else "single_or_text"
        g["type"] = gtype
        # Options only for categorical questions (yes/no, small single-choice sets).
        if gtype in ("yesno", "matrix_yesno"):
            g["options"] = ["Yes", "No"]
        elif gtype == "single_or_text" and obs and len(obs) <= 12:
            g["options"] = obs
        else:
            g["options"] = []
        if g["_id"] in STEM_OVERRIDE:
            g["text"] = STEM_OVERRIDE[g["_id"]]
        g["order"] = n  # renumber 0..N by order of appearance in the CSV
        # A single item without a label is a plain question: leave items empty.
        if len(g["items"]) == 1 and "label" not in g["items"][0]:
            g["items"] = []
        out.append(g)
    return out
# ---------------------------------------------------------------------------
def build_response(cols, get, row, source_file):
    """Build the response document for one CSV data row.

    Args:
        cols: Column descriptors as produced by load_csv().
        get: Accessor ``get(row, code) -> str`` as returned by col_getter().
        row: One raw data row.
        source_file: Name of the imported file, stored on the document.

    Returns:
        A dict with ``_id`` (the ResponseId), site/PI identity fields at the top
        level, ``meta`` (Qualtrics bookkeeping), ``answers`` (non-empty question
        cells keyed by the sanitised Qcode) and empty soft-link placeholders
        (``investigator_oid`` / ``investigator_match``).
    """
    def opt(name):
        # Stripped cell value, or None when empty / column missing.
        return get(row, name) or None

    def opt_lower(name):
        # Same as opt(), lower-cased (used for e-mail addresses).
        return (get(row, name) or "").lower() or None

    def opt_int(name, fallback_to_text):
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits that int() rejects with ValueError.
        raw = get(row, name)
        if raw.isdecimal():
            return int(raw)
        return (raw or None) if fallback_to_text else None

    answers = {}
    for c in cols:
        if not is_question_col(c["code"]):
            continue
        v = row[c["i"]].strip() if c["i"] < len(row) else ""
        if v:
            answers[sanitize_key(c["code"])] = v

    meta = {
        "start_date": opt("StartDate"),
        "end_date": opt("EndDate"),
        "recorded_date": opt("RecordedDate"),
        "status": opt("Status"),
        # A non-numeric progress value is kept as text; a non-numeric duration is dropped.
        "progress": opt_int("Progress", True),
        "finished": get(row, "Finished") in ("True", "1", "TRUE"),
        "duration_sec": opt_int("Duration (in seconds)", False),
        "user_language": opt("UserLanguage"),
        "distribution_channel": opt("DistributionChannel"),
        "ip_address": opt("IPAddress"),
        "location_lat": opt("LocationLatitude"),
        "location_lng": opt("LocationLongitude"),
        "survey_date": opt("Date"),
        "survey_time": opt("Time"),
    }
    doc = {
        "_id": get(row, "ResponseId"),
        "study": "77242113UCO3002",
        "site_country": opt("site_country"),
        "site_name": opt("site_name"),
        "site_city": opt("site_city"),
        "site_state": opt("site_state"),
        "site_postcode": opt("site_postcode"),
        "site_address": opt("site_address"),
        "pi_first_name": opt("pi_first_name"),
        "pi_last_name": opt("pi_last_name"),
        "pi_email": opt_lower("pi_email"),
        "pi_phone": opt("pi_phone"),
        "sdl_site_id": opt("sdl_site_id"),
        "fire_site_id": opt("fire_site_id"),
        "fire_investigator_id": opt("fire_investigator_id"),
        "mailinglist_id": opt("mailinglist_id"),
        "survey_generated_by": opt("survey_generated_by"),
        "recipient_email": opt_lower("RecipientEmail"),
        "recipient_last_name": opt("RecipientLastName"),
        "recipient_first_name": opt("RecipientFirstName"),
        "meta": meta,
        # Flagged as a full SIPIQ when any answer key starts with one of these codes.
        "is_full_sipiq": any(
            k.startswith(("Q57", "Q58", "Q59", "Q63", "Q66", "Q71")) for k in answers
        ),
        "interested": answers.get("Q25"),
        "answers": answers,
        "investigator_oid": None,
        "investigator_match": None,
        "source_file": source_file,
    }
    return doc
def content_hash(doc):
    """Return the SHA-256 hex digest of a document's content.

    Bookkeeping and soft-link fields are left out, so they do not influence the
    digest. The rest is serialised as JSON with sorted keys; values JSON cannot
    encode are converted with str().
    """
    excluded = {
        "content_sha256", "first_imported_at", "last_seen_at", "last_updated_at",
        "history", "investigator_oid", "investigator_match", "source_file",
    }
    payload = {key: value for key, value in doc.items() if key not in excluded}
    serialized = json.dumps(payload, sort_keys=True, ensure_ascii=False, default=str)
    digest = hashlib.sha256()
    digest.update(serialized.encode("utf-8"))
    return digest.hexdigest()
# ---------------------------------------------------------------------------
def load_investigators(db):
    """Load Czech and Slovak investigators and index them for soft-linking.

    Returns ``(inv, by_email, by_name)``:
      * ``inv``      - list of the fetched investigator documents,
      * ``by_email`` - lower-cased ``email`` / ``email2`` -> document (first one wins),
      * ``by_name``  - (normalised surname, country) -> list of documents.
    """
    query = {"zeme": {"$in": ["Czech Republic", "Slovakia"]}}
    projection = {
        "prijmeni": 1, "jmeno": 1, "email": 1, "email2": 1,
        "zeme": 1, "KROK": 1, "pracoviste": 1,
    }
    inv = list(db.investigators.find(query, projection))

    by_email = {}
    by_name = {}
    for person in inv:
        for field in ("email", "email2"):
            address = (person.get(field) or "").lower().strip()
            if address and address not in by_email:
                by_email[address] = person
        surname = norm_name(person.get("prijmeni"))
        if not surname:
            continue
        by_name.setdefault((surname, person.get("zeme")), []).append(person)
    return inv, by_email, by_name
def soft_link(doc, by_email, by_name):
    """Find the investigator matching a response document.

    Tried in order: the PI e-mail, the recipient e-mail, then the normalised
    surname combined with the site country (accepted only when unambiguous).

    Returns ``(investigator_id, how, investigator_doc)``; the id and the document
    are None when nothing (or more than one surname candidate) matched, and
    ``how`` describes the outcome.
    """
    for field, tag in (("pi_email", "email"), ("recipient_email", "recipient_email")):
        address = (doc.get(field) or "").lower().strip()
        if address and address in by_email:
            hit = by_email[address]
            return hit["_id"], f"{tag}:{address}", hit

    surname = norm_name(doc.get("pi_last_name"))
    candidates = by_name.get((surname, doc.get("site_country")), [])
    if not candidates:
        return None, "NENALEZENO", None
    if len(candidates) > 1:
        return None, f"prijmeni_ambiguous:{surname}({len(candidates)})", None
    only = candidates[0]
    return only["_id"], f"prijmeni:{surname}", only
# ---------------------------------------------------------------------------
def main():
    """Command-line entry point: parse the export, report the delta, optionally write it.

    Flow:
      1. Parse the CSV once; the question dictionary is built from the full file,
         the responses from the rows selected by ``--scope``.
      2. Soft-link every response to an investigator and compute its content hash.
      3. Print the soft-link report, the insert/update/unchanged counts and a
         shortened sample document.
      4. Without ``--apply`` stop here (dry run). With ``--apply`` upsert the
         question dictionary and write the response delta.

    The MongoDB client is closed on every exit path, including errors.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", required=True)
    ap.add_argument("--scope", choices=["czsk", "all"], default="czsk")
    # --apply and --dry-run contradict each other; reject the combination instead
    # of silently letting --apply win.
    mode = ap.add_mutually_exclusive_group()
    mode.add_argument("--apply", action="store_true", help="ostrý zápis (jinak dry-run)")
    mode.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()
    dry = not args.apply
    source_file = args.csv.replace("\\", "/").split("/")[-1]

    # Parse the file once and keep the unfiltered rows for the question dictionary.
    cols, data_all = load_csv(args.csv)
    get, _ = col_getter(cols, data_all)
    # Scope filter.
    if args.scope == "czsk":
        data = [r for r in data_all if get(r, "site_country") in ("Czech Republic", "Slovakia")]
    else:
        data = data_all
    print(f"Zdroj: {source_file} | rozsah={args.scope} | odpovědí k importu: {len(data)}")

    # --- question dictionary (built from the FULL CSV, not just the scope) ---
    questions = build_questions(cols, data_all)
    print(f"Slovník otázek: {len(questions)} logických otázek "
          f"(z toho {sum(1 for q in questions if q['items'])} vícedílných).")

    # --- Mongo ---
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=8000)
    try:
        db = client[DB_NAME]
        client.admin.command("ping")  # fail fast when the server is unreachable
        inv, by_email, by_name = load_investigators(db)
        print(f"Investigatorů CZ+SK v DB: {len(inv)}")

        # --- response documents + soft-link ---
        docs = []
        link_rows = []
        skipped = 0
        for r in data:
            doc = build_response(cols, get, r, source_file)
            if not doc["_id"]:
                # Rows without a ResponseId (e.g. blank lines) would all be
                # stored under the same empty _id; leave them out.
                skipped += 1
                continue
            oid, how, matched = soft_link(doc, by_email, by_name)
            doc["investigator_oid"] = oid
            doc["investigator_match"] = how
            doc["content_sha256"] = content_hash(doc)
            docs.append(doc)
            link_rows.append((doc, how, matched))
        if skipped:
            print(f"VAROVÁNÍ: přeskočeno řádků bez ResponseId: {skipped}", file=sys.stderr)

        # --- delta against the DB ---
        existing = {d["_id"]: d for d in db[COL_R].find({}, {"content_sha256": 1})}
        to_insert, to_update, unchanged = [], [], []
        for d in docs:
            known = existing.get(d["_id"])
            if known is None:
                to_insert.append(d)
            elif known.get("content_sha256") != d["content_sha256"]:
                to_update.append(d)
            else:
                unchanged.append(d)

        # ===================== REPORT =====================
        print("\n=== SOFT-LINK na investigators ===")
        matched_k7 = matched_other = unmatched = 0
        for doc, how, m in link_rows:
            krok = (m or {}).get("KROK", "")
            # Visible match marker; both branches were empty strings before.
            tag = "OK" if m else "--"
            if m and str(krok).startswith("7"):
                matched_k7 += 1
            elif m:
                matched_other += 1
            else:
                unmatched += 1
            # site_country is stored as None when empty, so a .get() default is
            # not enough to keep the slice from failing.
            country = (doc.get("site_country") or "?")[:2]
            last_name = str(doc.get("pi_last_name"))[:18]
            email = str(doc.get("pi_email"))[:32]
            krok_note = ("KROK " + str(krok)) if m else ""
            print(f" {tag} {country} {last_name:18} "
                  f"{email:32} -> {how[:40]:40} {krok_note}")
        print(f" Souhrn: napárováno KROK7={matched_k7}, jiný KROK={matched_other}, nenapárováno={unmatched}")
        print("\n=== DELTA ===")
        print(f" INSERT (nové): {len(to_insert)}")
        print(f" UPDATE (změněné): {len(to_update)}")
        print(f" beze změny: {len(unchanged)}")

        # Sample of one document (answers shortened to the first six). The copy
        # is shallow and "answers" is replaced, so docs[0] itself is untouched.
        if docs:
            all_answers = docs[0]["answers"]
            sample = dict(docs[0])
            sample["answers"] = dict(list(all_answers.items())[:6])
            hidden = len(all_answers) - 6
            if hidden > 0:
                # Only mention the remainder when there is one (no negative counts).
                sample["answers"]["…"] = f"(+{hidden} dalších)"
            print("\n=== UKÁZKA response dokumentu (zkráceno) ===")
            print(json.dumps(sample, ensure_ascii=False, indent=2, default=str)[:1800])

        if dry:
            print("\n[DRY-RUN] Nic se nezapsalo. Ostrý běh: přidej --apply")
            return

        # ===================== WRITE =====================
        # 1) question dictionary (idempotent upsert by _id)
        nq = 0
        for q in questions:
            db[COL_Q].replace_one({"_id": q["_id"]}, q, upsert=True)
            nq += 1
        print(f"\n[APPLY] sipiq_questions: upsertnuto {nq}")

        # 2) responses (delta); each document is re-read right before writing
        ts = now_iso()
        ni = nu = ns = 0
        for d in docs:
            cur = db[COL_R].find_one({"_id": d["_id"]})
            if cur is None:
                d["first_imported_at"] = ts
                d["last_seen_at"] = ts
                d["last_updated_at"] = ts
                d["history"] = []
                db[COL_R].insert_one(d)
                ni += 1
            elif cur.get("content_sha256") != d["content_sha256"]:
                changes = diff_docs(cur, d)
                fields = {k: v for k, v in d.items() if k != "_id"}
                db[COL_R].update_one({"_id": d["_id"]}, {
                    "$set": {**fields, "last_seen_at": ts, "last_updated_at": ts},
                    "$push": {"history": {"changed_at": ts, "source_file": source_file,
                                          "changes": changes}},
                })
                nu += 1
            else:
                db[COL_R].update_one({"_id": d["_id"]},
                                     {"$set": {"last_seen_at": ts, "source_file": source_file}})
                ns += 1
        print(f"[APPLY] sipiq_responses: insert={ni}, update={nu}, beze změny={ns}")
    finally:
        client.close()
def diff_docs(old, new):
    """Return a field-level diff between two response documents, for history[].

    Compared are the nested ``answers`` and ``meta`` dicts (recursively, with
    dotted keys such as "answers.Q25") and the top-level fields ``site_name``,
    ``pi_email``, ``pi_last_name``, ``interested`` and ``is_full_sipiq``. Other
    top-level fields are not compared.

    Args:
        old: The document currently stored.
        new: The freshly built document.

    Returns:
        A list of ``{"key", "old", "new"}`` dicts; a key missing on one side is
        reported with None for that side.
    """
    changes = []

    def walk(prefix, o, n):
        # Anything that is not a dict (None, corrupt value) has no keys to compare.
        o = o if isinstance(o, dict) else {}
        n = n if isinstance(n, dict) else {}
        for k in sorted(set(o) | set(n)):
            ov, nv = o.get(k), n.get(k)
            if isinstance(ov, dict) or isinstance(nv, dict):
                ov_ok = isinstance(ov, dict) or not ov
                nv_ok = isinstance(nv, dict) or not nv
                if ov_ok and nv_ok:
                    # dict vs dict, or dict vs empty/missing: descend key by key.
                    walk(f"{prefix}{k}.", ov or {}, nv or {})
                else:
                    # dict vs a real scalar: there are no keys to descend into on
                    # the scalar side, so record the whole value as changed.
                    changes.append({"key": f"{prefix}{k}", "old": ov, "new": nv})
            elif ov != nv:
                changes.append({"key": f"{prefix}{k}", "old": ov, "new": nv})

    for field in ("answers", "meta"):
        walk(f"{field}.", old.get(field, {}), new.get(field, {}))
    for k in ("site_name", "pi_email", "pi_last_name", "interested", "is_full_sipiq"):
        if old.get(k) != new.get(k):
            changes.append({"key": k, "old": old.get(k), "new": new.get(k)})
    return changes
# Run the importer only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()