Files
janssen/Feasibility/sipiq_import_v1.2.py
T
2026-06-17 15:05:10 +02:00

490 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
sipiq_import_v1.2.py
====================
Verze: 1.2
Datum: 2026-06-17
Autor: Claude Code (pro MUDr. Vladimíra Buzalku)
Změny proti v1.1
----------------
- PROVENANCE: ke každé odpovědi se ukládá `source_exported_at` = datum/čas reportu
podle FILESYSTÉMU (mtime CSV souboru). Mimo content-hash → nezpůsobuje zbytečné
UPDATE; backfilluje se i na "beze změny" cestě. Stará v1.1 ponechána v TRASH.
Změny proti v1.0
----------------
- FOLDER WORKFLOW (v1.1): režim --folder sebere *.csv ve složce, naimportuje (delta)
a přesune do podsložky `Zpracováno`. Default složka =
U:\\PythonProject\\Janssen\\Feasibility\\77242113UCO2001\\ImportSIPIQcompled.
Popis
-----
Import SIPIQ odpovědí (Qualtrics CSV export, studie 77242113UCO3002 / ICONIC DAWN)
do MongoDB db `feasibility`. Dvě kolekce:
* sipiq_questions slovník dotazníku (1 dok = 1 logická otázka).
* sipiq_responses 1 dok = 1 odpověď (_id = Qualtrics ResponseId), ploché answers{},
soft-link investigator_oid, delta bookkeeping + history[].
DELTA import (přepíše JEN změněná data): nová->insert; beze změn->jen last_seen_at;
změna->$set jen změněných polí + push do history[].
Použití
-------
python sipiq_import_v1.2.py --dry-run # folder režim, default složka
python sipiq_import_v1.2.py --apply
python sipiq_import_v1.2.py --folder "<cesta>" --apply
python sipiq_import_v1.2.py --csv "<cesta.csv>" --apply # jediný soubor (NEpřesouvá)
Závislosti: pymongo (.venv). Mongo 192.168.1.76:27017, bez auth.
"""
import argparse
import csv
import glob
import hashlib
import json
import os
import re
import shutil
import sys
import unicodedata
from datetime import datetime, timezone
try:
from pymongo import MongoClient
except ImportError:
print("CHYBA: pymongo není nainstalován v aktuálním pythonu.", file=sys.stderr)
raise
# Target MongoDB instance, database and the two collections this script writes to.
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "feasibility"
COL_Q = "sipiq_questions"  # question dictionary, one document per base question
COL_R = "sipiq_responses"  # one document per survey response
# Folder scanned for *.csv when --folder is not given on the command line.
DEFAULT_FOLDER = r"U:\PythonProject\Janssen\Feasibility\77242113UCO2001\ImportSIPIQcompled"
# Subfolder (inside the scanned folder) that imported files are moved into.
PROCESSED_SUBDIR = "Zpracováno"
# Column codes of the export's per-response metadata fields.
# NOTE(review): not referenced in the visible part of this file - confirm it is still needed.
META_COLS = {
    "StartDate", "EndDate", "Status", "IPAddress", "Progress", "Duration (in seconds)",
    "Finished", "RecordedDate", "ResponseId", "RecipientLastName", "RecipientFirstName",
    "RecipientEmail", "ExternalReference", "LocationLatitude", "LocationLongitude",
    "DistributionChannel", "UserLanguage",
}
# Column codes of site / investigator fields.
# NOTE(review): presumably the columns promoted to top-level response fields; the list
# itself is not referenced in the visible part of this file - confirm.
PROMOTE = [
    "site_name", "site_address", "site_city", "site_state", "site_postcode", "site_country",
    "pi_first_name", "pi_last_name", "pi_phone", "pi_email",
    "sdl_site_id", "fire_site_id", "fire_investigator_id", "mailinglist_id",
    "survey_generated_by", "Date", "Time",
]
# Maps a question number to the name of the questionnaire section it belongs to.
# Numbers that are never registered fall back to "Other" in build_questions.
SECTION_BY_QNUM = {}


def _sec(rng, name):
    """Register every question number in *rng* under the section called *name*."""
    SECTION_BY_QNUM.update(dict.fromkeys(rng, name))


_sec((2,), "J&J Internal Assessment")
_sec(range(6, 14), "Contact Information")
_sec(range(14, 22), "Confidentiality Statement")
_sec(range(25, 28), "Interest")
_sec(range(29, 35), "Protocol Requirements")
_sec(range(36, 39), "Enrollment")
_sec(range(40, 44), "Patient Demographics Overview")
_sec(range(45, 50), "Site Overview")
_sec((51,), "Operational Considerations")
_sec(range(53, 55), "Comments")
_sec(range(57, 62), "Patient Population")
_sec(range(63, 68), "Site Experience and Staffing")
_sec((69,), "Equipment and Facility Requirements")
_sec(range(71, 76), "Institutional Review Board, Ethics Committee, and Contracts")
# Hand-written question stems keyed by base question code. build_questions uses these
# texts in place of the stem it parses from the CSV header row.
STEM_OVERRIDE = {
    "Q31": "At your site, at what line(s) of treatment do you most commonly prescribe "
           "vedolizumab for patients with moderately to severely active ulcerative colitis?",
    "Q63": "Do you or your site staff have experience in performing the following types of "
           "study assessments/procedures?",
    "Q64": "The following personnel are required to run the study. "
           "Will your site have the following available?",
    "Q69": "The following equipment and facilities are required to run the studies. "
           "Are these available at your site?",
}
def now_iso():
    """Return the current time as a timezone-aware ISO-8601 string with second precision.

    The timestamp is expressed in the local timezone of the machine running the import.
    """
    utc_now = datetime.now(timezone.utc)
    local_now = utc_now.astimezone()
    return local_now.isoformat(timespec="seconds")
def file_mtime_iso(path):
    """Return the modification time of *path* as a timezone-aware ISO-8601 string.

    The value is expressed in the local timezone with second precision.
    """
    modified = os.path.getmtime(path)
    local_naive = datetime.fromtimestamp(modified)
    # astimezone() on a naive value interprets it as local time and attaches the offset.
    return local_naive.astimezone().isoformat(timespec="seconds")
def strip_accents(s):
    """Return *s* with combining marks (diacritics) removed.

    The text is NFKD-decomposed first, so accented letters lose their accent.
    ``None`` and the empty string both yield ``""``.
    """
    if not s:
        return ""
    decomposed = unicodedata.normalize("NFKD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
def norm_name(s):
    """Normalise a name for matching.

    Accents are stripped, the text is lower-cased, runs of whitespace collapse to a
    single space and the ends are trimmed. ``None`` yields ``""``.
    """
    folded = strip_accents(s or "").lower()
    single_spaced = re.sub(r"\s+", " ", folded)
    return single_spaced.strip()
def sanitize_key(qcode):
    """Turn a question column code into an answer key by mapping ``#`` and ``.`` to ``_``.

    NOTE(review): presumably needed because ``.`` acts as a path separator in MongoDB
    field names - confirm.
    """
    return qcode.translate(str.maketrans("#.", "__"))
def qnum(qcode):
    """Return the number following the leading ``Q`` of *qcode*, or ``None`` without one."""
    found = re.match(r"Q(\d+)", qcode)
    if found is None:
        return None
    return int(found.group(1))
def qbase(qcode):
    """Return the base question code (``Q`` plus digits) that *qcode* starts with.

    Codes that do not start with that pattern are returned unchanged.
    """
    found = re.match(r"(Q\d+)", qcode)
    if found:
        return found.group(1)
    return qcode
def import_id(h3_cell):
    """Extract the ``ImportId`` from a cell of the third CSV header row.

    The cell is expected to hold a JSON object. Returns its ``"ImportId"`` value, or
    ``""`` when the object has no such key. When the cell is not valid JSON, or the JSON
    is not an object, the cell is returned unchanged.
    """
    try:
        parsed = json.loads(h3_cell)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-string cells.
        return h3_cell
    if not isinstance(parsed, dict):
        return h3_cell
    return parsed.get("ImportId", "")
def split_text(text):
    """Split a header text of the form ``stem - label - ...`` into ``(stem, label)``.

    Pieces are separated by a dash surrounded by whitespace. Pieces equal to
    "selected choice" (case-insensitive) or shaped like ``Q<n>#<n>`` are dropped from
    the label; the remaining pieces are re-joined with " - ". ``label`` is ``None``
    when nothing is left.
    """
    stem, *rest = [piece.strip() for piece in re.split(r"\s+-\s+", text)]
    if not rest:
        return stem, None
    kept = [
        piece for piece in rest
        if piece.lower() != "selected choice" and not re.fullmatch(r"Q\d+#\d+", piece)
    ]
    label = " - ".join(kept) if kept else None
    return stem, label
def detect_type(qcode, observed):
    """Classify a question column from its code and the values observed in it.

    Codes containing ``#`` are matrix items ("matrix_yesno", "matrix_percent" or
    "matrix"); other codes are "numeric", "yesno" or "single_or_text". Empty values are
    ignored, and a column with no values is never classed as yes/no or numeric.
    """
    values = [v for v in observed if v]
    is_yesno = bool(values) and all(v in ("Yes", "No") for v in values)
    is_numeric = bool(values) and all(re.fullmatch(r"-?\d+(\.\d+)?", v) for v in values)

    if "#" in qcode:
        if is_yesno:
            return "matrix_yesno"
        if is_numeric:
            return "matrix_percent"
        return "matrix"

    if is_numeric:
        return "numeric"
    if is_yesno:
        return "yesno"
    return "single_or_text"
def load_csv(path):
    """Read a CSV export that has three header rows followed by data rows.

    Returns ``(cols, data)``. ``cols`` holds one dict per column with its index ``i``,
    the ``code`` from header row 1, the ``text`` from header row 2 and the ``qid``
    derived from header row 3 via ``import_id``. ``data`` is the list of remaining rows.
    The file is read as UTF-8; a leading BOM is skipped.

    Raises:
        ValueError: if the file contains fewer than three rows.
    """
    with open(path, encoding="utf-8-sig", newline="") as fh:
        rows = list(csv.reader(fh))
    if len(rows) < 3:
        # Fail with the file name instead of an anonymous IndexError further down.
        raise ValueError(f"{path}: CSV nemá 3 hlavičkové řádky (nalezeno {len(rows)})")
    h1, h2, h3 = rows[:3]
    cols = [
        {"i": i, "code": code, "text": text, "qid": import_id(raw_id)}
        for i, (code, text, raw_id) in enumerate(zip(h1, h2, h3))
    ]
    return cols, rows[3:]
def col_getter(cols, data):
    """Build a by-code cell accessor for rows described by *cols*.

    Returns ``(get, idx)``: ``idx`` maps a column code to its index, and
    ``get(row, code)`` returns the stripped cell, or ``""`` when the code is unknown or
    the row is too short. *data* is accepted but not used.
    """
    idx = {}
    for col in cols:
        idx[col["code"]] = col["i"]

    def get(row, code):
        pos = idx.get(code)
        if pos is None or pos >= len(row):
            return ""
        return row[pos].strip()

    return get, idx
def is_question_col(code):
    """Return True when *code* starts with ``Q`` followed by a digit."""
    return re.match(r"Q\d", code) is not None
def build_questions(cols, data):
    """Build the question dictionary from the column descriptors and the data rows.

    Columns whose code looks like a question are grouped by base code (``Q<n>``). Each
    group becomes one dict with ``_id``, ``order`` (position among the groups),
    ``qnum``, ``section``, ``qids``, ``text``, ``items`` (one per column), ``type`` and
    ``options``.

    Args:
        cols: column descriptors as returned by ``load_csv``.
        data: data rows as lists of cell strings.

    Returns:
        The list of question dicts in order of first appearance in the CSV.
    """
    qcols = [c for c in cols if is_question_col(c["code"])]

    # Distinct non-empty values seen in each question column.
    observed = {c["code"]: set() for c in qcols}
    for row in data:
        for c in qcols:
            v = row[c["i"]].strip() if c["i"] < len(row) else ""
            if v:
                observed[c["code"]].add(v)

    groups, order_seen = {}, []
    for c in qcols:
        code = c["code"]
        base = qbase(code)
        if base not in groups:
            number = qnum(code)
            groups[base] = {"_id": base, "order": c["i"], "qnum": number,
                            "section": SECTION_BY_QNUM.get(number, "Other"),
                            "qids": [], "text": split_text(c["text"])[0],
                            "items": [], "_obs": set(), "_types": []}
            order_seen.append(base)
        g = groups[base]
        # Collect each distinct leading "QID<n>" of the column ids once.
        bq = re.match(r"(QID\d+)", c["qid"] or "")
        if bq and bq.group(1) not in g["qids"]:
            g["qids"].append(bq.group(1))
        _, label = split_text(c["text"])
        item = {"key": sanitize_key(code), "qcode": code, "qid": c["qid"]}
        if label:
            item["label"] = label
        g["items"].append(item)
        g["_obs"] |= observed[code]
        g["_types"].append(detect_type(code, observed[code]))

    out = []
    for n, base in enumerate(order_seen):
        g = groups[base]
        obs = sorted(g.pop("_obs"))
        types = g.pop("_types")
        # Most frequent column type wins. Candidates are visited in column order, so a
        # tie goes to the type seen first; iterating a set here made ties depend on the
        # per-process string hash order.
        gtype = max(dict.fromkeys(types), key=types.count) if types else "single_or_text"
        g["type"] = gtype
        if gtype in ("yesno", "matrix_yesno"):
            g["options"] = ["Yes", "No"]
        elif gtype == "single_or_text" and obs and len(obs) <= 12:
            # A small set of distinct answers is listed as the available options.
            g["options"] = obs
        else:
            g["options"] = []
        if base in STEM_OVERRIDE:
            g["text"] = STEM_OVERRIDE[base]
        g["order"] = n
        # A question backed by a single unlabeled column has no sub-items.
        if len(g["items"]) == 1 and "label" not in g["items"][0]:
            g["items"] = []
        out.append(g)
    return out
def build_response(cols, get, row, source_file):
    """Build the response document for one CSV data row.

    Args:
        cols: column descriptors as returned by ``load_csv``.
        get: accessor ``get(row, code)`` returning the stripped cell or ``""``.
        row: the data row as a list of cell strings.
        source_file: name of the CSV file, stored in the document.

    Returns:
        A dict with ``_id`` set to the row's ``ResponseId``, the site / investigator /
        recipient fields, a ``meta`` sub-document, the flat ``answers`` mapping of
        non-empty question cells, and empty investigator-link fields.
    """
    def opt(code):
        # Empty cells are stored as None rather than "".
        return get(row, code) or None

    def as_int(code):
        # isdecimal() only accepts characters int() can parse; isdigit() also accepts
        # e.g. superscript digits, which made int() raise ValueError.
        raw = get(row, code)
        return int(raw) if raw.isdecimal() else None

    answers = {}
    for c in cols:
        if is_question_col(c["code"]):
            v = row[c["i"]].strip() if c["i"] < len(row) else ""
            if v:
                answers[sanitize_key(c["code"])] = v

    # A non-numeric progress value is kept as text instead of being dropped.
    progress = as_int("Progress")
    if progress is None:
        progress = opt("Progress")

    meta = {
        "start_date": opt("StartDate"),
        "end_date": opt("EndDate"),
        "recorded_date": opt("RecordedDate"),
        "status": opt("Status"),
        "progress": progress,
        "finished": get(row, "Finished") in ("True", "1", "TRUE"),
        "duration_sec": as_int("Duration (in seconds)"),
        "user_language": opt("UserLanguage"),
        "distribution_channel": opt("DistributionChannel"),
        "ip_address": opt("IPAddress"),
        "location_lat": opt("LocationLatitude"),
        "location_lng": opt("LocationLongitude"),
        "survey_date": opt("Date"),
        "survey_time": opt("Time"),
    }
    doc = {
        "_id": get(row, "ResponseId"), "study": "77242113UCO3002",
        "site_country": opt("site_country"),
        "site_name": opt("site_name"),
        "site_city": opt("site_city"),
        "site_state": opt("site_state"),
        "site_postcode": opt("site_postcode"),
        "site_address": opt("site_address"),
        "pi_first_name": opt("pi_first_name"),
        "pi_last_name": opt("pi_last_name"),
        "pi_email": (get(row, "pi_email") or "").lower() or None,
        "pi_phone": opt("pi_phone"),
        "sdl_site_id": opt("sdl_site_id"),
        "fire_site_id": opt("fire_site_id"),
        "fire_investigator_id": opt("fire_investigator_id"),
        "mailinglist_id": opt("mailinglist_id"),
        "survey_generated_by": opt("survey_generated_by"),
        "recipient_email": (get(row, "RecipientEmail") or "").lower() or None,
        "recipient_last_name": opt("RecipientLastName"),
        "recipient_first_name": opt("RecipientFirstName"),
        "meta": meta,
        # Flagged as a full SIPIQ when any of these question groups has an answer.
        "is_full_sipiq": any(k.startswith(("Q57", "Q58", "Q59", "Q63", "Q66", "Q71")) for k in answers),
        "interested": answers.get("Q25"),
        "answers": answers,
        "investigator_oid": None, "investigator_match": None,
        "source_file": source_file,
    }
    return doc
def content_hash(doc):
    """Return the SHA-256 hex digest of the content-bearing fields of *doc*.

    Bookkeeping, provenance and investigator-link fields are left out, so changing them
    does not change the hash. The remaining fields are serialised as JSON with sorted
    keys.
    """
    ignored = {
        "content_sha256", "first_imported_at", "last_seen_at", "last_updated_at",
        "history", "investigator_oid", "investigator_match", "source_file",
        "source_exported_at",
    }
    payload = {key: value for key, value in doc.items() if key not in ignored}
    serialized = json.dumps(payload, sort_keys=True, ensure_ascii=False, default=str)
    return hashlib.sha256(serialized.encode("utf-8")).hexdigest()
def load_investigators(db):
    """Load Czech and Slovak investigators and index them for soft-linking.

    Returns ``(inv, by_email, by_name)``. ``inv`` is the list of fetched documents.
    ``by_email`` maps a lower-cased address (from ``email`` or ``email2``) to the first
    investigator using it. ``by_name`` maps ``(normalised surname, country)`` to the
    list of investigators sharing that pair.
    """
    query = {"zeme": {"$in": ["Czech Republic", "Slovakia"]}}
    projection = {"prijmeni": 1, "jmeno": 1, "email": 1, "email2": 1, "zeme": 1, "KROK": 1}
    inv = list(db.investigators.find(query, projection))

    by_email, by_name = {}, {}
    for person in inv:
        for field in ("email", "email2"):
            address = (person.get(field) or "").lower().strip()
            # First investigator seen with an address keeps it.
            if address and address not in by_email:
                by_email[address] = person
        surname = norm_name(person.get("prijmeni"))
        if surname:
            by_name.setdefault((surname, person.get("zeme")), []).append(person)
    return inv, by_email, by_name
def soft_link(doc, by_email, by_name):
    """Find the investigator that a response document belongs to.

    Matching is tried in order: PI e-mail, recipient e-mail, then normalised PI surname
    together with the site country (accepted only when exactly one investigator
    matches).

    Returns:
        ``(investigator_id, how, investigator)``. ``how`` describes the match; on an
        ambiguous surname or no match the id and the investigator are ``None``.
    """
    pi_addr = (doc.get("pi_email") or "").lower().strip()
    if pi_addr and pi_addr in by_email:
        person = by_email[pi_addr]
        return person["_id"], f"email:{pi_addr}", person

    rcpt_addr = (doc.get("recipient_email") or "").lower().strip()
    if rcpt_addr and rcpt_addr in by_email:
        person = by_email[rcpt_addr]
        return person["_id"], f"recipient_email:{rcpt_addr}", person

    surname = norm_name(doc.get("pi_last_name"))
    candidates = by_name.get((surname, doc.get("site_country")), [])
    if not candidates:
        return None, "NENALEZENO", None
    if len(candidates) > 1:
        return None, f"prijmeni_ambiguous:{surname}({len(candidates)})", None
    only = candidates[0]
    return only["_id"], f"prijmeni:{surname}", only
def diff_docs(old, new):
    """List the differences between a stored response document and a new one.

    ``answers`` and ``meta`` are compared key by key (recursing into nested dicts, keys
    in sorted order) and reported under dotted paths. After those, a fixed set of
    top-level fields is compared.

    Returns:
        A list of ``{"key", "old", "new"}`` dicts, one per differing value.
    """
    def walk(prefix, before, after):
        before, after = before or {}, after or {}
        for key in sorted(before.keys() | after.keys()):
            b_val, a_val = before.get(key), after.get(key)
            if isinstance(b_val, dict) or isinstance(a_val, dict):
                yield from walk(f"{prefix}{key}.", b_val or {}, a_val or {})
            elif b_val != a_val:
                yield {"key": f"{prefix}{key}", "old": b_val, "new": a_val}

    changes = []
    for field in ("answers", "meta"):
        changes.extend(walk(f"{field}.", old.get(field, {}), new.get(field, {})))
    changes.extend(
        {"key": name, "old": old.get(name), "new": new.get(name)}
        for name in ("site_name", "pi_email", "pi_last_name", "interested", "is_full_sipiq")
        if old.get(name) != new.get(name)
    )
    return changes
# ---------------------------------------------------------------------------
def process_file(db, csv_path, scope, dry, by_email, by_name):
    """Import one CSV file into the questions and responses collections.

    Builds the question dictionary and one document per response, soft-links each
    response to an investigator, prints a summary of the planned inserts / updates, and
    (unless *dry* is true) writes the changes.

    Args:
        db: database handle providing the two collections.
        csv_path: path of the CSV export to import.
        scope: "czsk" keeps only rows whose site country is Czech Republic or Slovakia;
            any other value keeps all rows.
        dry: when true, nothing is written.
        by_email, by_name: investigator indexes from ``load_investigators``.

    Returns:
        A dict with the ``insert`` / ``update`` / ``unchanged`` counts and ``wrote``.
        On a dry run the counts are 0 and ``wrote`` is False.
    """
    source_file = os.path.basename(csv_path)
    exported_at = file_mtime_iso(csv_path)  # report timestamp taken from the file's mtime
    # Parse once; the unfiltered rows are kept for the question dictionary below.
    cols, data_all = load_csv(csv_path)
    get, _ = col_getter(cols, data_all)
    if scope == "czsk":
        data = [r for r in data_all if get(r, "site_country") in ("Czech Republic", "Slovakia")]
    else:
        data = data_all
    print(f"\n########## {source_file} (rozsah={scope}, odpovědí={len(data)}, export={exported_at}) ##########")
    # The dictionary is derived from every row of the file, regardless of scope.
    questions = build_questions(cols, data_all)

    docs, link_rows = [], []
    for r in data:
        doc = build_response(cols, get, r, source_file)
        oid, how, matched = soft_link(doc, by_email, by_name)
        doc["investigator_oid"] = oid
        doc["investigator_match"] = how
        doc["source_exported_at"] = exported_at
        doc["content_sha256"] = content_hash(doc)
        docs.append(doc)
        link_rows.append((doc, how, matched))

    # Classify each response against the hashes already stored (used for the summary).
    existing = {d["_id"]: d for d in db[COL_R].find({}, {"content_sha256": 1})}
    to_insert = [d for d in docs if d["_id"] not in existing]
    to_update = [d for d in docs if d["_id"] in existing and existing[d["_id"]].get("content_sha256") != d["content_sha256"]]
    unchanged = [d for d in docs if d["_id"] in existing and existing[d["_id"]].get("content_sha256") == d["content_sha256"]]

    # Soft-link statistics: matched with KROK starting with "7", matched otherwise, unmatched.
    mk7 = mko = un = 0
    for doc, how, m in link_rows:
        krok = (m or {}).get("KROK", "")
        if m and str(krok).startswith("7"):
            mk7 += 1
        elif m:
            mko += 1
        else:
            un += 1
    print(f" slovník: {len(questions)} otázek | soft-link: KROK7={mk7}, jiný={mko}, nenapárováno={un}")
    print(f" delta: INSERT={len(to_insert)}, UPDATE={len(to_update)}, beze změny={len(unchanged)}")
    if un:
        for doc, how, m in link_rows:
            if not m:
                print(f" ✗ NENAPÁROVÁNO: {doc.get('pi_last_name')} / {doc.get('pi_email')} ({how})")
    if dry:
        print(" [DRY-RUN] nezapsáno")
        return {"insert": 0, "update": 0, "unchanged": 0, "wrote": False}

    for q in questions:
        db[COL_Q].replace_one({"_id": q["_id"]}, q, upsert=True)

    ts = now_iso()
    ni = nu = ns = 0
    for d in docs:
        # Re-read the stored document so the decision reflects the current state.
        cur = db[COL_R].find_one({"_id": d["_id"]})
        if cur is None:
            d.update({"first_imported_at": ts, "last_seen_at": ts, "last_updated_at": ts, "history": []})
            db[COL_R].insert_one(d)
            ni += 1
        elif cur.get("content_sha256") != d["content_sha256"]:
            changes = diff_docs(cur, d)
            db[COL_R].update_one({"_id": d["_id"]}, {
                "$set": {**{k: d[k] for k in d if k != "_id"}, "last_seen_at": ts, "last_updated_at": ts},
                "$push": {"history": {"changed_at": ts, "source_file": source_file, "changes": changes}}})
            nu += 1
        else:
            # Content unchanged: refresh only the bookkeeping and provenance fields.
            db[COL_R].update_one({"_id": d["_id"]}, {"$set": {
                "last_seen_at": ts, "source_file": source_file, "source_exported_at": d["source_exported_at"]}})
            ns += 1
    print(f" [APPLY] questions upsert={len(questions)} | responses insert={ni}, update={nu}, beze změny={ns}")
    return {"insert": ni, "update": nu, "unchanged": ns, "wrote": True}
def move_to_processed(csv_path, folder):
    """Move an imported CSV file into the processed subfolder of *folder*.

    The subfolder is created when missing. If a file of the same name is already there,
    ``_1``, ``_2``, ... is appended to the stem until a free name is found.
    """
    dest_dir = os.path.join(folder, PROCESSED_SUBDIR)
    os.makedirs(dest_dir, exist_ok=True)

    base = os.path.basename(csv_path)
    stem, ext = os.path.splitext(base)
    dest = os.path.join(dest_dir, base)
    n = 0
    while os.path.exists(dest):
        n += 1
        dest = os.path.join(dest_dir, f"{stem}_{n}{ext}")

    shutil.move(csv_path, dest)
    print(f" -> přesunuto do {PROCESSED_SUBDIR}\\{os.path.basename(dest)}")
def main():
    """Command-line entry point: import one CSV file or every ``*.csv`` in a folder.

    With ``--csv`` a single file is imported and left in place. Otherwise all CSV files
    of ``--folder`` are imported and each file that was written is moved to the
    processed subfolder. Nothing is written unless ``--apply`` is given, and an explicit
    ``--dry-run`` always forces a dry run.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", help="jediný soubor (NEpřesouvá)")
    ap.add_argument("--folder", default=DEFAULT_FOLDER, help="složka se SIPIQ CSV (přesune do Zpracováno)")
    ap.add_argument("--scope", choices=["czsk", "all"], default="czsk")
    ap.add_argument("--apply", action="store_true")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()
    # --dry-run used to be ignored, so "--apply --dry-run" wrote to the database.
    dry = args.dry_run or not args.apply

    if args.csv:
        files, move_mode, folder = [args.csv], False, None
    else:
        folder = args.folder
        files = sorted(glob.glob(os.path.join(folder, "*.csv")))
        move_mode = True
        print(f"Složka: {folder}\nNalezeno CSV ke zpracování: {len(files)}")
    if not files:
        print("Nic ke zpracování (žádné *.csv).")
        return

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=8000)
    try:
        db = client[DB_NAME]
        # Fail fast when the server is unreachable, before any file is touched.
        client.admin.command("ping")
        inv, by_email, by_name = load_investigators(db)
        print(f"Investigatorů CZ+SK v DB: {len(inv)}")
        total = {"insert": 0, "update": 0, "unchanged": 0}
        for f in files:
            res = process_file(db, f, args.scope, dry, by_email, by_name)
            for k in total:
                total[k] += res[k]
            # Only files that were actually written are moved away.
            if move_mode and res["wrote"]:
                move_to_processed(f, folder)
        print(f"\n=== CELKEM: insert={total['insert']}, update={total['update']}, beze změny={total['unchanged']} ===")
        if dry:
            print("[DRY-RUN] Nic se nezapsalo ani nepřesunulo. Ostrý běh: --apply")
    finally:
        # Release the connection even when an import step raises.
        client.close()


if __name__ == "__main__":
    main()