z230
This commit is contained in:
+577
File diff suppressed because one or more lines are too long
+580
File diff suppressed because one or more lines are too long
@@ -0,0 +1,26 @@
|
||||
# report_77242113UCO2001_v1.1.py
|
||||
|
||||
**Verze:** 1.1 · **Datum:** 2026-06-09
|
||||
|
||||
Generátor Excel reportu z Mongo `feasibility.investigators`.
|
||||
|
||||
## Změny v1.1
|
||||
- Přidán sloupec **`KROK`** (krok feasibility workflow) **hned PŘED `STATUS`**.
|
||||
- Barevné odlišení řádků dle KROK (místo dle STATUS):
|
||||
- 5/6/7 zelená, 4 tmavší zelená, 3.1 světle modrá, 3.2 červená,
|
||||
2 světle žlutá, 1 modrá, 0 šedá. KROK tučně.
|
||||
- KROK plní skript `classify_krok_v1.0.py` (odvození ze STATUS).
|
||||
|
||||
## Pořadí sloupců
|
||||
`Příjmení | Jméno | Email | KROK | STATUS | Kritická poznámka | Země | Pracoviště | Internet summary | …`
|
||||
|
||||
## Spuštění / výstup
|
||||
```
|
||||
python report_77242113UCO2001_v1.1.py
|
||||
```
|
||||
- venv: `U:\PythonProject\Janssen\.venv\Scripts\python.exe`
|
||||
- výstup: `u:\Dropbox\!!!Days\Downloads Z230\77242113UCO2001_investigators_<YYYYMMDD_HHMM>.xlsx`
|
||||
|
||||
## Historie
|
||||
- v1.0 (bez verze v názvu) — původní, přesunuto do `TRASH/`.
|
||||
- v1.1 — sloupec KROK před STATUS + obarvení dle KROK.
|
||||
@@ -0,0 +1,157 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# =============================================================================
|
||||
# Nazev: report_77242113UCO2001_v1.1.py
|
||||
# Verze: 1.1
|
||||
# Datum: 2026-06-09
|
||||
# Popis: Generator Excel reportu z Mongo feasibility.investigators.
|
||||
# v1.1 - pridan sloupec KROK (krok feasibility workflow) HNED PRED
|
||||
# STATUS + barevne odliseni dle KROK. KROK plni skript
|
||||
# classify_krok_v1.0.py (odvozeni ze STATUS).
|
||||
# Projekt: 77242113UCO2001 (DAWN / spravny kod 77242113UCO3002)
|
||||
# Vystup: u:\Dropbox\!!!Days\Downloads Z230\ (verzovany nazev s timestampem)
|
||||
# =============================================================================
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pymongo import MongoClient
|
||||
import openpyxl
|
||||
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
||||
from openpyxl.utils import get_column_letter
|
||||
|
||||
# --- Připojení k MongoDB ---
|
||||
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://192.168.1.76:27017")
|
||||
client = MongoClient(MONGO_URI)
|
||||
db = client["feasibility"]
|
||||
col = db["investigators"]
|
||||
|
||||
# --- Načtení dat ---
|
||||
docs = list(col.find({}))
|
||||
print(f"Načteno {len(docs)} záznamů.")
|
||||
|
||||
# --- Cílová složka ---
|
||||
OUTPUT_DIR = r"u:\Dropbox\!!!Days\Downloads Z230"
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
datum = datetime.now().strftime("%Y%m%d_%H%M")
|
||||
filename = f"77242113UCO2001_investigators_{datum}.xlsx"
|
||||
filepath = os.path.join(OUTPUT_DIR, filename)
|
||||
|
||||
# --- Definice sloupců ---
|
||||
# Pořadí: jméno, email, KROK, STATUS, kriticka_poznamka, pak ostatní
|
||||
# KROK je ZÁMĚRNĚ hned PŘED STATUS.
|
||||
FIXED_COLS = [
|
||||
("prijmeni", "Příjmení"),
|
||||
("jmeno", "Jméno"),
|
||||
("email", "Email"),
|
||||
("KROK", "KROK"),
|
||||
("STATUS", "STATUS"),
|
||||
("kriticka_poznamka", "Kritická poznámka"),
|
||||
("zeme", "Země"),
|
||||
("pracoviste", "Pracoviště"),
|
||||
("internet_summary","Internet summary"),
|
||||
]
|
||||
|
||||
# Klíče, které přeskočíme (složité nested objekty)
|
||||
SKIP_KEYS = {"_id", "excel", "sites_illuminator", "maf", "zdroje", "studie", "Viper_Performance", "Viper_Contacts"}
|
||||
|
||||
# Ostatní skalární pole
|
||||
fixed_keys = {c[0] for c in FIXED_COLS}
|
||||
extra_keys = set()
|
||||
for doc in docs:
|
||||
for k in doc.keys():
|
||||
if k not in fixed_keys and k not in SKIP_KEYS:
|
||||
extra_keys.add(k)
|
||||
extra_keys = sorted(extra_keys)
|
||||
|
||||
ALL_COLS = FIXED_COLS + [(k, k) for k in extra_keys]
|
||||
|
||||
# --- Barvy podle KROK ---
|
||||
def krok_color(krok):
    """Return the ARGB fill colour for a KROK (feasibility workflow step) label.

    The label is matched by its leading step number after surrounding
    whitespace is stripped.

    Args:
        krok: Step label, e.g. "3.1 ..."; non-string values are converted
            with str() before matching.

    Returns:
        An 8-digit ARGB hex string, or None when the label is empty or does
        not start with a known step prefix (a bare "3" stays unmapped).
    """
    if not krok:
        return None
    # str() lets a numeric KROK be coloured instead of raising
    # AttributeError on .strip().
    label = str(krok).strip()
    # Checked in order; str.startswith accepts a tuple of alternatives.
    palette = (
        (("5", "6", "7"), "FFC6EFCE"),  # green - CDA signed / SIPIQ
        (("4",), "FFB7E1CD"),           # darker green - CDA requested
        (("3.1",), "FFDDEBF7"),         # light blue - interested
        (("3.2",), "FFFFC7CE"),         # red - not interested
        (("2",), "FFFFF2CC"),           # light yellow - reminded
        (("1",), "FFDCE6F1"),           # blue - offer sent
        (("0",), "FFD9D9D9"),           # grey - out
    )
    for prefixes, color in palette:
        if label.startswith(prefixes):
            return color
    return None
|
||||
|
||||
# --- Vytvoření workbooku ---
|
||||
wb = openpyxl.Workbook()
|
||||
ws = wb.active
|
||||
ws.title = "Investigators"
|
||||
|
||||
# Styly
|
||||
header_font = Font(bold=True, color="FFFFFFFF")
|
||||
header_fill = PatternFill("solid", fgColor="FF1F4E79")
|
||||
header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
|
||||
cell_align = Alignment(vertical="top", wrap_text=True)
|
||||
thin = Side(style="thin", color="FFB0B0B0")
|
||||
border = Border(left=thin, right=thin, top=thin, bottom=thin)
|
||||
|
||||
# Záhlaví
|
||||
for col_idx, (key, label) in enumerate(ALL_COLS, 1):
|
||||
cell = ws.cell(row=1, column=col_idx, value=label)
|
||||
cell.font = header_font
|
||||
cell.fill = header_fill
|
||||
cell.alignment = header_align
|
||||
cell.border = border
|
||||
|
||||
ws.row_dimensions[1].height = 30
|
||||
|
||||
# Data
|
||||
# Data rows: one worksheet row per Mongo document, starting below the header.
bold_font = Font(bold=True)  # shared by every KROK cell
for row_idx, doc in enumerate(docs, 2):
    krok_val = str(doc.get("KROK", "") or "")
    bg = krok_color(krok_val)
    # Build the row fill once per row instead of once per cell.
    row_fill = PatternFill("solid", fgColor=bg) if bg else None

    for col_idx, (key, label) in enumerate(ALL_COLS, 1):
        val = doc.get(key, "")
        # Every value is written as text: lists are comma-joined, None
        # becomes an empty string, anything else (dicts included) goes
        # through str().
        if isinstance(val, list):
            val = ", ".join(str(v) for v in val)
        elif val is None:
            val = ""
        else:
            val = str(val)

        cell = ws.cell(row=row_idx, column=col_idx, value=val)
        cell.alignment = cell_align
        cell.border = border

        if row_fill is not None:
            cell.fill = row_fill
        if key == "KROK":
            cell.font = bold_font
|
||||
|
||||
# Šířky sloupců
|
||||
col_widths = {
|
||||
"prijmeni": 18, "jmeno": 15, "email": 35,
|
||||
"KROK": 26, "STATUS": 45, "kriticka_poznamka": 60,
|
||||
"zeme": 12, "pracoviste": 35, "internet_summary": 60,
|
||||
}
|
||||
for col_idx, (key, label) in enumerate(ALL_COLS, 1):
|
||||
w = col_widths.get(key, 20)
|
||||
ws.column_dimensions[get_column_letter(col_idx)].width = w
|
||||
|
||||
# Zmrazení záhlaví
|
||||
ws.freeze_panes = "A2"
|
||||
|
||||
# Autofilter
|
||||
ws.auto_filter.ref = ws.dimensions
|
||||
|
||||
# Uložení
|
||||
wb.save(filepath)
|
||||
print(f"Ulozeno: {filepath}")
|
||||
print("Poradi sloupcu:", [label for _, label in ALL_COLS][:6], "...")
|
||||
@@ -0,0 +1,227 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# =============================================================================
|
||||
# Nazev: report_77242113UCO2001_v1.2.py
|
||||
# Verze: 1.2
|
||||
# Datum: 2026-06-19
|
||||
# Popis: Generator Excel reportu z Mongo feasibility.investigators.
|
||||
# v1.2 - pridan PRVNI list "Prehled KROK": A=KROK, B=pocet,
|
||||
# C=seznam jmen zkousejicich (oddeleno strednikem) v jedne bunce.
|
||||
# List Investigators nasleduje jako druhy.
|
||||
# v1.1 - sloupec KROK hned pred STATUS + barevne odliseni dle KROK.
|
||||
# Projekt: 77242113UCO2001 (DAWN / spravny kod 77242113UCO3002)
|
||||
# Vystup: u:\Dropbox\!!!Days\Downloads Z230\ (verzovany nazev s timestampem)
|
||||
# =============================================================================
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pymongo import MongoClient
|
||||
import openpyxl
|
||||
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
||||
from openpyxl.utils import get_column_letter
|
||||
|
||||
# --- Připojení k MongoDB ---
|
||||
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://192.168.1.76:27017")
|
||||
client = MongoClient(MONGO_URI)
|
||||
db = client["feasibility"]
|
||||
col = db["investigators"]
|
||||
|
||||
# --- Načtení dat ---
|
||||
docs = list(col.find({}))
|
||||
print(f"Načteno {len(docs)} záznamů.")
|
||||
|
||||
# --- Cílová složka ---
|
||||
OUTPUT_DIR = r"u:\Dropbox\!!!Days\Downloads Z230"
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
datum = datetime.now().strftime("%Y%m%d_%H%M")
|
||||
filename = f"77242113UCO2001_investigators_{datum}.xlsx"
|
||||
filepath = os.path.join(OUTPUT_DIR, filename)
|
||||
|
||||
# --- Definice sloupců ---
|
||||
# Pořadí: jméno, email, KROK, STATUS, kriticka_poznamka, pak ostatní
|
||||
# KROK je ZÁMĚRNĚ hned PŘED STATUS.
|
||||
FIXED_COLS = [
|
||||
("prijmeni", "Příjmení"),
|
||||
("jmeno", "Jméno"),
|
||||
("email", "Email"),
|
||||
("KROK", "KROK"),
|
||||
("STATUS", "STATUS"),
|
||||
("kriticka_poznamka", "Kritická poznámka"),
|
||||
("zeme", "Země"),
|
||||
("pracoviste", "Pracoviště"),
|
||||
("internet_summary","Internet summary"),
|
||||
]
|
||||
|
||||
# Klíče, které přeskočíme (složité nested objekty)
|
||||
SKIP_KEYS = {"_id", "excel", "sites_illuminator", "maf", "zdroje", "studie",
|
||||
"Viper_Performance", "Viper_Contacts", "cda", "history",
|
||||
"zpracovane_emaily", "sipiq"}
|
||||
|
||||
# Ostatní skalární pole
|
||||
fixed_keys = {c[0] for c in FIXED_COLS}
|
||||
extra_keys = set()
|
||||
for doc in docs:
|
||||
for k in doc.keys():
|
||||
if k not in fixed_keys and k not in SKIP_KEYS:
|
||||
extra_keys.add(k)
|
||||
extra_keys = sorted(extra_keys)
|
||||
|
||||
ALL_COLS = FIXED_COLS + [(k, k) for k in extra_keys]
|
||||
|
||||
# --- Barvy podle KROK ---
|
||||
def krok_color(krok):
    """Return the ARGB fill colour for a KROK (feasibility workflow step) label.

    The label is matched by its leading step number after surrounding
    whitespace is stripped.

    Args:
        krok: Step label, e.g. "3.1 ..."; non-string values are converted
            with str() before matching.

    Returns:
        An 8-digit ARGB hex string, or None when the label is empty or does
        not start with a known step prefix (a bare "3" stays unmapped).
    """
    if not krok:
        return None
    # str() lets a numeric KROK be coloured instead of raising
    # AttributeError on .strip().
    label = str(krok).strip()
    # Checked in order; str.startswith accepts a tuple of alternatives.
    palette = (
        (("5", "6", "7"), "FFC6EFCE"),  # green - CDA signed / SIPIQ
        (("4",), "FFB7E1CD"),           # darker green - CDA requested
        (("3.1",), "FFDDEBF7"),         # light blue - interested
        (("3.2",), "FFFFC7CE"),         # red - not interested
        (("2",), "FFFFF2CC"),           # light yellow - reminded
        (("1",), "FFDCE6F1"),           # blue - offer sent
        (("0",), "FFD9D9D9"),           # grey - out
    )
    for prefixes, color in palette:
        if label.startswith(prefixes):
            return color
    return None
|
||||
|
||||
# --- Pomocná funkce: setřídění KROK dle čísla prefixu ---
|
||||
def krok_sort_key(krok):
    """Return a sort key that orders KROK labels by their numeric prefix.

    The leading run of digits and dots is parsed as a float.

    Args:
        krok: Step label such as "3.1 ..."; None and non-string values are
            tolerated.

    Returns:
        (0, number) when the label starts with a parsable number, otherwise
        (1, 0.0) so that unnumbered labels sort after all numbered ones.
    """
    from itertools import takewhile  # local import keeps the block self-contained

    text = str(krok or "").strip()
    prefix = "".join(takewhile(lambda ch: ch.isdigit() or ch == ".", text))
    if not prefix:
        return (1, 0.0)
    try:
        return (0, float(prefix))
    except ValueError:
        # e.g. "1.2.3" or a lone "." - not a valid float
        return (1, 0.0)
|
||||
|
||||
# --- Agregace pro přehledový list ---
|
||||
def cele_jmeno(doc):
    """Return "<prijmeni> <jmeno>" for an investigator document.

    Missing, None or empty parts are skipped; non-string values are
    converted with str() instead of failing on .strip().

    Args:
        doc: Mapping that may contain the "prijmeni" and "jmeno" keys.

    Returns:
        The trimmed full name, or "(bez jména)" when both parts are empty.
    """
    parts = (str(doc.get(key) or "").strip() for key in ("prijmeni", "jmeno"))
    return " ".join(parts).strip() or "(bez jména)"
|
||||
|
||||
skupiny = {} # krok -> list jmen
|
||||
for doc in docs:
|
||||
krok = str(doc.get("KROK", "") or "").strip() or "(nezařazeno)"
|
||||
skupiny.setdefault(krok, []).append(cele_jmeno(doc))
|
||||
|
||||
prehled_rows = []
|
||||
for krok in sorted(skupiny, key=krok_sort_key):
|
||||
jmena = sorted(skupiny[krok])
|
||||
prehled_rows.append((krok, len(jmena), "; ".join(jmena)))
|
||||
|
||||
# --- Styly ---
|
||||
header_font = Font(bold=True, color="FFFFFFFF")
|
||||
header_fill = PatternFill("solid", fgColor="FF1F4E79")
|
||||
header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
|
||||
cell_align = Alignment(vertical="top", wrap_text=True)
|
||||
thin = Side(style="thin", color="FFB0B0B0")
|
||||
border = Border(left=thin, right=thin, top=thin, bottom=thin)
|
||||
|
||||
# --- Vytvoření workbooku ---
|
||||
wb = openpyxl.Workbook()
|
||||
|
||||
# === LIST 1: Přehled KROK ===
|
||||
ws_p = wb.active
|
||||
ws_p.title = "Přehled KROK"
|
||||
|
||||
PREHLED_COLS = [("KROK", 30), ("Počet", 10), ("Zkoušející (jména)", 120)]
|
||||
for col_idx, (label, w) in enumerate(PREHLED_COLS, 1):
|
||||
c = ws_p.cell(row=1, column=col_idx, value=label)
|
||||
c.font = header_font
|
||||
c.fill = header_fill
|
||||
c.alignment = header_align
|
||||
c.border = border
|
||||
ws_p.column_dimensions[get_column_letter(col_idx)].width = w
|
||||
ws_p.row_dimensions[1].height = 26
|
||||
|
||||
for r, (krok, pocet, jmena) in enumerate(prehled_rows, 2):
|
||||
bg = krok_color(krok)
|
||||
vals = [krok, pocet, jmena]
|
||||
for col_idx, val in enumerate(vals, 1):
|
||||
c = ws_p.cell(row=r, column=col_idx, value=val)
|
||||
c.alignment = cell_align
|
||||
c.border = border
|
||||
if bg:
|
||||
c.fill = PatternFill("solid", fgColor=bg)
|
||||
if col_idx == 1:
|
||||
c.font = Font(bold=True)
|
||||
if col_idx == 2:
|
||||
c.alignment = Alignment(horizontal="center", vertical="top")
|
||||
|
||||
# Řádek CELKEM
|
||||
tot_r = len(prehled_rows) + 2
|
||||
ws_p.cell(row=tot_r, column=1, value="CELKEM").font = Font(bold=True)
|
||||
tc = ws_p.cell(row=tot_r, column=2, value=sum(p for _, p, _ in prehled_rows))
|
||||
tc.font = Font(bold=True)
|
||||
tc.alignment = Alignment(horizontal="center")
|
||||
for col_idx in range(1, 4):
|
||||
ws_p.cell(row=tot_r, column=col_idx).border = border
|
||||
|
||||
ws_p.freeze_panes = "A2"
|
||||
ws_p.auto_filter.ref = f"A1:C{len(prehled_rows) + 1}"
|
||||
|
||||
# === LIST 2: Investigators ===
|
||||
ws = wb.create_sheet("Investigators")
|
||||
|
||||
# Záhlaví
|
||||
for col_idx, (key, label) in enumerate(ALL_COLS, 1):
|
||||
cell = ws.cell(row=1, column=col_idx, value=label)
|
||||
cell.font = header_font
|
||||
cell.fill = header_fill
|
||||
cell.alignment = header_align
|
||||
cell.border = border
|
||||
ws.row_dimensions[1].height = 30
|
||||
|
||||
# Data
|
||||
# Data rows: one worksheet row per Mongo document, starting below the header.
bold_font = Font(bold=True)  # shared by every KROK cell
for row_idx, doc in enumerate(docs, 2):
    krok_val = str(doc.get("KROK", "") or "")
    bg = krok_color(krok_val)
    # Build the row fill once per row instead of once per cell.
    row_fill = PatternFill("solid", fgColor=bg) if bg else None

    for col_idx, (key, label) in enumerate(ALL_COLS, 1):
        val = doc.get(key, "")
        # Every value is written as text: lists are comma-joined, None
        # becomes an empty string, anything else (dicts included) goes
        # through str().
        if isinstance(val, list):
            val = ", ".join(str(v) for v in val)
        elif val is None:
            val = ""
        else:
            val = str(val)

        cell = ws.cell(row=row_idx, column=col_idx, value=val)
        cell.alignment = cell_align
        cell.border = border

        if row_fill is not None:
            cell.fill = row_fill
        if key == "KROK":
            cell.font = bold_font
|
||||
|
||||
# Šířky sloupců
|
||||
col_widths = {
|
||||
"prijmeni": 18, "jmeno": 15, "email": 35,
|
||||
"KROK": 26, "STATUS": 45, "kriticka_poznamka": 60,
|
||||
"zeme": 12, "pracoviste": 35, "internet_summary": 60,
|
||||
}
|
||||
for col_idx, (key, label) in enumerate(ALL_COLS, 1):
|
||||
w = col_widths.get(key, 20)
|
||||
ws.column_dimensions[get_column_letter(col_idx)].width = w
|
||||
|
||||
ws.freeze_panes = "A2"
|
||||
ws.auto_filter.ref = ws.dimensions
|
||||
|
||||
# --- Uložení ---
|
||||
wb.save(filepath)
|
||||
print(f"Ulozeno: {filepath}")
|
||||
print(f"List 1 'Přehled KROK': {len(prehled_rows)} kroků, celkem {sum(p for _,p,_ in prehled_rows)} zkoušejících.")
|
||||
print("List 2 'Investigators':", len(docs), "radku.")
|
||||
@@ -0,0 +1,55 @@
|
||||
# sipiq_download_v1.0
|
||||
|
||||
**Verze:** 1.0 · **Datum:** 2026-06-19
|
||||
|
||||
## Co dělá
|
||||
Automaticky stáhne SIPIQ survey report z Qualtrics přes oficiální **Export
|
||||
Responses API** (start → poll → download ZIP → rozbalit CSV) a uloží CSV
|
||||
s timestampovaným názvem do importní složky
|
||||
`Feasibility\77242113UCO2001\ImportSIPIQcompled`, odkud ho beze změny sebere
|
||||
`sipiq_import_v1.2.py`.
|
||||
|
||||
Nahrazuje ruční proklikávání *Results → Data & Analysis → Export & Import →
|
||||
Export Data → CSV → Download*.
|
||||
|
||||
## Konfigurace (root `.env`, neverzovat)
|
||||
```
|
||||
QUALTRICS_API_TOKEN=<token> # Account Settings → Qualtrics IDs → API → Generate Token
|
||||
QUALTRICS_DATACENTER=janssenfeasibility.co1
|
||||
QUALTRICS_SURVEY_ID=SV_9AdeNaNyohp5fNQ
|
||||
```
|
||||
Token je citlivý údaj — vlož ho ručně, nikdy ne do kódu/gitu.
|
||||
|
||||
## Použití
|
||||
```
|
||||
python sipiq_download_v1.0.py # CSV s labely (jako UI "Export labels")
|
||||
python sipiq_download_v1.0.py --values # CSV s hodnotami (useLabels=False)
|
||||
python sipiq_download_v1.0.py --format tsv
|
||||
python sipiq_download_v1.0.py --out "<jiná složka>"
|
||||
```
|
||||
|
||||
## Celá pipeline (download → import)
|
||||
```
|
||||
python sipiq_download_v1.0.py
|
||||
python sipiq_import_v1.2.py --apply
|
||||
```
|
||||
|
||||
## Mapování UI → API
|
||||
| UI dialog | API |
|
||||
|---|---|
|
||||
| CSV | `format=csv` |
|
||||
| Export labels | `useLabels=True` (default skriptu) |
|
||||
| Export values | `useLabels=False` (`--values`) |
|
||||
| Download all fields | default API chování |
|
||||
| komprese ZIP | API automaticky |
|
||||
|
||||
## Výstupní název
|
||||
`YYYY-MM-DD_HHMMSS sipiq-<původní_název>.csv` — timestamp = čas stažení;
|
||||
`sipiq_import` čte `source_exported_at` z mtime souboru. Re-download celé
|
||||
survey je bezpečný (import je delta podle ResponseId).
|
||||
|
||||
## Poznámky
|
||||
- Před nasazením na scheduler spusť jednou ručně a porovnej CSV s tím, co
|
||||
stahuješ přes UI.
|
||||
- Alternativa bez vlastního kódu: nativní **Response Export Automation**
|
||||
v Qualtrics (Export & Import) — plánovaný export na SFTP/cloud.
|
||||
@@ -0,0 +1,170 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
sipiq_download_v1.0.py
|
||||
======================
|
||||
Verze: 1.0
|
||||
Datum: 2026-06-19
|
||||
Autor: Claude Code (pro MUDr. Vladimíra Buzalku)
|
||||
|
||||
Popis
|
||||
-----
|
||||
Automatické stažení SIPIQ survey reportu z Qualtrics přes oficiální
|
||||
Export Responses API (3 kroky: start → poll → download ZIP → rozbalit CSV).
|
||||
Nahrazuje ruční proklikávání Homepage → Results → Data & Analysis →
|
||||
Export & Import → Export Data → CSV → Download.
|
||||
|
||||
Stažené CSV se uloží s TIMESTAMPOVANÝM názvem rovnou do importní složky
|
||||
U:\\PythonProject\\Janssen\\Feasibility\\77242113UCO2001\\ImportSIPIQcompled
|
||||
odkud ho bez úprav sebere `sipiq_import_v1.2.py` (delta import → Mongo).
|
||||
|
||||
Mapování UI → API:
|
||||
"CSV" -> format=csv
|
||||
"Export labels" -> useLabels=True (default; --values přepne na hodnoty)
|
||||
"Download all" -> default API chování (všechna pole)
|
||||
komprese (ZIP) -> API zapnuto automaticky
|
||||
|
||||
Konfigurace (root .env, NEVERZOVAT):
|
||||
QUALTRICS_API_TOKEN – API token (Account Settings → Qualtrics IDs → API)
|
||||
QUALTRICS_DATACENTER – default janssenfeasibility.co1
|
||||
QUALTRICS_SURVEY_ID – default SV_9AdeNaNyohp5fNQ
|
||||
|
||||
Použití
|
||||
-------
|
||||
python sipiq_download_v1.0.py # CSV s labely do ImportSIPIQcompled
|
||||
python sipiq_download_v1.0.py --values # CSV s hodnotami místo labelů
|
||||
python sipiq_download_v1.0.py --out "<složka>" # jiná cílová složka
|
||||
python sipiq_download_v1.0.py --format tsv # jiný formát
|
||||
|
||||
Navazující import (samostatně):
|
||||
python sipiq_import_v1.2.py --apply
|
||||
|
||||
Závislosti: requests, python-dotenv (.venv).
|
||||
"""
|
||||
import argparse
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import zipfile
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
except ImportError:
|
||||
print("CHYBA: chybí requests nebo python-dotenv v aktuálním pythonu.", file=sys.stderr)
|
||||
raise
|
||||
|
||||
# --- konfigurace z root .env -------------------------------------------------
|
||||
_HERE = os.path.dirname(os.path.abspath(__file__))
|
||||
_ROOT = os.path.dirname(_HERE) # U:\PythonProject\Janssen
|
||||
load_dotenv(os.path.join(_ROOT, ".env"))
|
||||
|
||||
API_TOKEN = os.environ.get("QUALTRICS_API_TOKEN", "").strip()
|
||||
DATACENTER = os.environ.get("QUALTRICS_DATACENTER", "janssenfeasibility.co1").strip()
|
||||
SURVEY_ID = os.environ.get("QUALTRICS_SURVEY_ID", "SV_9AdeNaNyohp5fNQ").strip()
|
||||
|
||||
BASE_URL = f"https://{DATACENTER}.qualtrics.com/API/v3"
|
||||
HEADERS = {"X-API-TOKEN": API_TOKEN, "Content-Type": "application/json"}
|
||||
|
||||
DEFAULT_OUT = os.path.join(_HERE, "77242113UCO2001", "ImportSIPIQcompled")
|
||||
|
||||
log = logging.getLogger("sipiq_download")
|
||||
|
||||
|
||||
def start_export(fmt: str = "csv", use_labels: bool = True) -> str:
    """Start a Qualtrics response-export job and return its progressId.

    POSTs the requested format and the useLabels flag to the survey's
    export-responses endpoint. Raises requests.HTTPError on an error status.
    """
    response = requests.post(
        f"{BASE_URL}/surveys/{SURVEY_ID}/export-responses",
        json={"format": fmt, "useLabels": use_labels},
        headers=HEADERS,
        timeout=30,
    )
    response.raise_for_status()
    body = response.json()
    return body["result"]["progressId"]
|
||||
|
||||
|
||||
def wait_for_export(progress_id: str, timeout_s: int = 300) -> str:
    """Poll the export job every 2 s and return its fileId once complete.

    Args:
        progress_id: The progressId returned when the export was started.
        timeout_s: Maximum time to keep polling, in seconds.

    Returns:
        The fileId reported by the API when the status is "complete".

    Raises:
        RuntimeError: The API reported status "failed".
        TimeoutError: The export did not complete within timeout_s.
        requests.HTTPError: A poll request returned an error status.
    """
    url = f"{BASE_URL}/surveys/{SURVEY_ID}/export-responses/{progress_id}"
    # Monotonic clock: the deadline is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
        result = r.json()["result"]
        status = result.get("status")
        if status == "complete":
            return result["fileId"]
        if status == "failed":
            raise RuntimeError("Qualtrics export selhal (status=failed).")
        log.info("Export běží… %s%%", result.get("percentComplete", "?"))
        time.sleep(2)
    raise TimeoutError("Export nedoběhl v časovém limitu.")
|
||||
|
||||
|
||||
def download_file(file_id: str, out_dir: str, stamp: str) -> list[str]:
    """Download the export ZIP and extract its files into out_dir.

    Each member is written as "<stamp> sipiq-<basename>"; only the basename
    of the archive entry is used, so entries cannot escape out_dir.

    Args:
        file_id: The fileId of the finished export.
        out_dir: Target directory; created when missing.
        stamp: Timestamp prefix for the output file names.

    Returns:
        Paths of the files written, in archive order.

    Raises:
        requests.HTTPError: The download request returned an error status.
    """
    import shutil  # local import: the module's import header is left untouched

    os.makedirs(out_dir, exist_ok=True)
    url = f"{BASE_URL}/surveys/{SURVEY_ID}/export-responses/{file_id}/file"
    r = requests.get(url, headers=HEADERS, timeout=180)
    r.raise_for_status()
    written = []
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        for name in z.namelist():
            if name.endswith("/"):
                continue  # directory entry
            safe = os.path.basename(name)
            if not safe:
                continue  # nothing usable as a file name
            target = os.path.join(out_dir, f"{stamp} sipiq-{safe}")
            # Stream the member to disk instead of reading it fully into memory.
            with z.open(name) as src, open(target, "wb") as dst:
                shutil.copyfileobj(src, dst)
            written.append(target)
    return written
|
||||
|
||||
|
||||
def run_export(fmt: str, use_labels: bool, out_dir: str) -> list[str]:
    """Run the whole export: start the job, wait for it, download the files.

    Exits with an explanatory message when no API token is configured.
    Returns the list of paths produced by download_file().
    """
    if not API_TOKEN:
        message = (
            "CHYBA: QUALTRICS_API_TOKEN není nastaven v .env.\n"
            "Vygeneruj token: Qualtrics → Account Settings → Qualtrics IDs → "
            "API → Generate Token a vlož do U:\\PythonProject\\Janssen\\.env"
        )
        raise SystemExit(message)

    # The timestamp is taken before the export starts and prefixes every file.
    stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    log.info("Survey %s @ %s → %s", SURVEY_ID, DATACENTER, out_dir)

    progress_id = start_export(fmt, use_labels)
    log.info("Export spuštěn, progressId=%s", progress_id)

    file_id = wait_for_export(progress_id)
    log.info("Export hotov, fileId=%s", file_id)

    downloaded = download_file(file_id, out_dir, stamp)
    log.info("Staženo %d souborů.", len(downloaded))
    return downloaded
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse arguments, run the export, list the files.

    Exits with status 1 after logging the error when the export fails: an
    HTTP error, any other requests failure, a failed or timed-out export
    job, or an invalid ZIP archive.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    p = argparse.ArgumentParser(description="SIPIQ Qualtrics CSV export → ImportSIPIQcompled")
    p.add_argument("--format", default="csv", choices=["csv", "tsv", "json", "spss"])
    p.add_argument("--values", action="store_true",
                   help="Export hodnot místo labelů (useLabels=False)")
    p.add_argument("--out", default=DEFAULT_OUT, help="Cílová složka (default ImportSIPIQcompled)")
    args = p.parse_args()

    try:
        files = run_export(fmt=args.format, use_labels=not args.values, out_dir=args.out)
    except requests.HTTPError as e:
        resp = e.response
        log.error("HTTP %s: %s", resp.status_code if resp is not None else "?",
                  resp.text[:500] if resp is not None else e)
        raise SystemExit(1) from e
    except (requests.RequestException, RuntimeError, TimeoutError, zipfile.BadZipFile) as e:
        # Connection problems, a failed/timed-out export job or a corrupt
        # archive previously surfaced as a raw traceback.
        log.error("Export selhal: %s", e)
        raise SystemExit(1) from e

    print("Hotovo. Stažené soubory:")
    for f in files:
        print("  -", f)
    if not files:
        print("  (žádné CSV v exportu)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,47 @@
|
||||
# sipiq_download_v2.0
|
||||
|
||||
**Verze:** 2.0 · **Datum:** 2026-06-19
|
||||
|
||||
## Co dělá
|
||||
Automaticky stáhne SIPIQ survey report (CSV) z Qualtrics přes **Playwright**.
|
||||
Přihlásí se (username/password), otevře Data & Analysis studie 77242113UCO3002
|
||||
(`SV_9AdeNaNyohp5fNQ`), spustí **Export & Import → Export Data → CSV → Download**
|
||||
a stažený soubor uloží s timestampem do `U:\Dropbox\!!!Days\Downloads Z230\`.
|
||||
|
||||
## Proč Playwright (a ne API)
|
||||
Účet `77242113uco3002_sipiq` **nemá povolené API** na úrovni účtu. Interní UI
|
||||
export běží jen na **session cookies** a vrací **interní `fileUrl`**
|
||||
(`riptooth.service.consul:9000`), který je zvenčí nedostupný — čistě `requests`
|
||||
proto soubor nestáhne (start+poll fungují, samotný download ne). Playwright se
|
||||
přihlásí sám (řeší expiraci session) a stažení nechá na prohlížeči
|
||||
(`expect_download`). Verze 1.0 (requests/API) je v `TRASH`.
|
||||
|
||||
## Konfigurace (root `.env`)
|
||||
```
|
||||
QUALTRICS_USER=77242113uco3002_sipiq
|
||||
QUALTRICS_PASS=77242113uco3002_sipiq
|
||||
```
|
||||
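Fallback na zabudované hodnoty, pokud `.env` chybí (pozor: přihlašovací údaje jsou tím pádem uložené přímo v kódu i v gitu — bezpečnější je držet je jen v `.env`).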
|
||||
|
||||
## Použití
|
||||
```
|
||||
"U:\PythonProject\Janssen\.venv\Scripts\python.exe" sipiq_download_v2.0.py
|
||||
"U:\PythonProject\Janssen\.venv\Scripts\python.exe" sipiq_download_v2.0.py --headless
|
||||
```
|
||||
- Profil prohlížeče: `Feasibility\qualtrics_profile\` (persistent) → po prvním
|
||||
přihlášení se login přeskakuje.
|
||||
- Default běh je **headed** (vidíš okno); `--headless` pro scheduler.
|
||||
- Při chybě se uloží screenshot `_dl_99_error.png`.
|
||||
|
||||
## Ověřeno 19JUN2026
|
||||
CSV: 247 sloupců, 3 hlavičkové řádky (Qcode / text / `ImportId`), 323 odpovědí,
|
||||
498 110 B — shodné s ručním exportem. Modal defaulty sedí: **CSV**,
|
||||
☑ Download all fields, **Export labels** (= co `sipiq_import` potřebuje).
|
||||
|
||||
## Návazný import
|
||||
Výstup jde do `Downloads Z230`. Import do Mongo se spouští samostatně — buď
|
||||
soubor přesuň do `77242113UCO2001\ImportSIPIQcompled\` a spusť folder workflow,
|
||||
nebo přímo na soubor:
|
||||
```
|
||||
"U:\PythonProject\Janssen\.venv\Scripts\python.exe" sipiq_import_v1.3.py --csv "<cesta k CSV>"
|
||||
```
|
||||
@@ -0,0 +1,133 @@
|
||||
# =============================================================================
|
||||
# Název: sipiq_download_v2.0.py
|
||||
# Verze: 2.0
|
||||
# Datum: 2026-06-19
|
||||
# Autor: Claude Code (pro MUDr. Vladimíra Buzalku)
|
||||
# Popis: Automatické stažení SIPIQ survey reportu (CSV) z Qualtrics přes
|
||||
# Playwright. Přihlásí se (username/password), otevře Data & Analysis
|
||||
# studie 77242113UCO3002 (SV_9AdeNaNyohp5fNQ), spustí Export & Import →
|
||||
# Export Data → CSV → Download a stažený soubor uloží s timestampem do
|
||||
# U:\Dropbox\!!!Days\Downloads Z230\.
|
||||
#
|
||||
# Proč Playwright a ne API: účet NEMÁ povolené API na úrovni účtu;
|
||||
# interní export běží jen na session cookies + interní fileUrl
|
||||
# (riptooth.service.consul) nedostupné zvenčí. Playwright se přihlásí
|
||||
# sám (řeší expiraci session) a stažení nechá na prohlížeči.
|
||||
#
|
||||
# Credentials: ROOT .env (QUALTRICS_USER / QUALTRICS_PASS), fallback
|
||||
# na zabudované hodnoty účtu sipiq.
|
||||
# Verze 1.0 (čistě requests/API) přesunuta do TRASH — na tomto účtu nefunguje.
|
||||
# =============================================================================
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
load_dotenv(os.path.join(_ROOT, ".env"))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
DC = "janssenfeasibility.co1"
|
||||
SURVEY = "SV_9AdeNaNyohp5fNQ"
|
||||
USER = os.environ.get("QUALTRICS_USER", "77242113uco3002_sipiq")
|
||||
PWD = os.environ.get("QUALTRICS_PASS", "77242113uco3002_sipiq")
|
||||
LOGIN_URL = "https://login.qualtrics.com/login"
|
||||
RESP_URL = f"https://{DC}.qualtrics.com/responses/#/surveys/{SURVEY}"
|
||||
|
||||
OUT_DIR = r"U:\Dropbox\!!!Days\Downloads Z230"
|
||||
PROFILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "qualtrics_profile")
|
||||
HEADLESS = "--headless" in sys.argv
|
||||
DEBUG_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
|
||||
def dbg(page, name):
    """Save a best-effort debug screenshot "_dl_<name>.png" into DEBUG_DIR.

    Any failure while taking the screenshot is ignored on purpose, so that
    debugging output never interrupts the download flow.
    """
    shot_path = os.path.join(DEBUG_DIR, f"_dl_{name}.png")
    try:
        page.screenshot(path=shot_path)
    except Exception:
        return
|
||||
|
||||
|
||||
def ensure_login(page):
    """Open the Qualtrics login page and sign in when a login form appears.

    When no username field becomes visible within 5 s, the session is
    treated as still active and the login step is skipped.

    Args:
        page: Playwright page to drive.
    """
    page.goto(LOGIN_URL, timeout=120000)
    page.wait_for_load_state("domcontentloaded")
    page.wait_for_timeout(3000)
    field = page.locator("#UserName, input[name='username'], #username").first
    # Locator.is_visible() returns immediately and ignores its timeout
    # argument, so explicitly wait (up to 5 s) for the form to render.
    try:
        field.wait_for(state="visible", timeout=5000)
    except PWTimeout:
        print("Session aktivní, přihlášení přeskočeno.")
        return

    print("Přihlašuji se…")
    field.fill(USER)
    page.locator("#UserPassword, input[name='password'], #password").first.fill(PWD)
    page.locator("#loginButton, button[type='submit'], #login-button").first.click()
    page.wait_for_timeout(8000)
    page.wait_for_load_state("domcontentloaded")
|
||||
|
||||
|
||||
def open_data_table(page):
    """Navigate to the survey's responses page and wait until it is usable.

    The page counts as loaded once the "Export & Import" button exists;
    a short extra pause follows before reporting success.
    """
    page.goto(RESP_URL, timeout=120000)
    page.wait_for_load_state("domcontentloaded")

    # The "Export & Import" button is the readiness marker.
    export_button = page.get_by_role("button", name="Export & Import")
    export_button.wait_for(timeout=120000)

    page.wait_for_timeout(2000)
    print("Data & Analysis načteno.")
|
||||
|
||||
|
||||
def do_export(page):
    """Drive the Export & Import → Export Data → CSV → Download dialog.

    Takes a debug screenshot after each step and returns the Playwright
    download object captured when the Download button is clicked.
    """
    # Open the Export & Import menu.
    export_menu = page.get_by_role("button", name="Export & Import")
    export_menu.click()
    page.wait_for_timeout(1500)
    dbg(page, "01_menu")

    # Pick the "Export Data" menu item.
    export_item = page.get_by_text("Export Data", exact=False).first
    export_item.click()
    page.wait_for_timeout(3000)
    dbg(page, "02_modal")

    # Select the CSV tab (usually the default; clicked to be sure). Falls
    # back to a text match and finally to whatever the dialog preselects.
    try:
        page.get_by_role("tab", name="CSV").click(timeout=5000)
    except PWTimeout:
        try:
            page.get_by_text("CSV", exact=True).first.click(timeout=5000)
        except Exception:
            print("CSV tab nenalezen, spoléhám na default.")
    page.wait_for_timeout(1500)
    dbg(page, "03_csv")

    # Click Download and capture the resulting browser download.
    print("Spouštím Download…")
    with page.expect_download(timeout=180000) as dl_info:
        page.get_by_role("button", name="Download").last.click()
    return dl_info.value
|
||||
|
||||
|
||||
def main():
    """Log in, export the survey CSV and save it under OUT_DIR with a timestamp prefix.

    The browser runs in a persistent context (PROFILE). On any failure a debug
    snapshot is attempted, the error is printed to stderr and re-raised. The
    context is always closed, even when the error handling itself fails.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    with sync_playwright() as pw:
        ctx = pw.chromium.launch_persistent_context(
            PROFILE, headless=HEADLESS, accept_downloads=True,
            args=["--start-maximized"], no_viewport=not HEADLESS)
        try:
            page = ctx.pages[0] if ctx.pages else ctx.new_page()
            try:
                ensure_login(page)
                open_data_table(page)
                download = do_export(page)
                stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
                suggested = download.suggested_filename or "sipiq_export.csv"
                target = os.path.join(OUT_DIR, f"{stamp} {suggested}")
                download.save_as(target)
                print("HOTOVO. Uloženo:", target)
            except Exception as e:
                # The snapshot is best-effort: a dead page must not hide the real error.
                try:
                    dbg(page, "99_error")
                except Exception as dbg_err:
                    print("CHYBA (dbg):", repr(dbg_err), file=sys.stderr)
                print("CHYBA:", repr(e), file=sys.stderr)
                raise
            finally:
                # Final pause is cosmetic; if the page is already gone, skip it.
                try:
                    page.wait_for_timeout(1500)
                except Exception as wait_err:
                    print("CHYBA (wait):", repr(wait_err), file=sys.stderr)
        finally:
            # Always release the browser context, whatever happened above.
            ctx.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,40 @@
|
||||
# sipiq_import_v1.2 — import SIPIQ odpovědí (folder workflow + provenance)
|
||||
|
||||
**Verze:** 1.2 · **Datum:** 2026-06-17 · **Studie:** 77242113UCO3002 (ICONIC / DAWN)
|
||||
|
||||
## Změny
|
||||
- **v1.2:** ke každé odpovědi `source_exported_at` = **datum/čas reportu podle filesystému**
|
||||
(mtime CSV souboru). Mimo content-hash → nezpůsobuje zbytečné UPDATE; backfilluje se i na
|
||||
"beze změny" cestě. v1.1 → `Feasibility\TRASH`.
|
||||
- **v1.1:** FOLDER workflow (`--folder`) — sebere *.csv, delta import, přesun do `Zpracováno`.
|
||||
|
||||
## Kolekce
|
||||
- `sipiq_questions` — slovník dotazníku (rekonstrukce SIPIQ jako v PDF).
|
||||
- `sipiq_responses` — 1 dok = 1 odpověď (`_id`=ResponseId), ploché `answers{}`,
|
||||
soft-link `investigator_oid`, `source_file` + `source_exported_at`, delta + `history[]`.
|
||||
|
||||
Zdroj = Qualtrics **CSV** (ř.1 Qcode, ř.2 text otázky, ř.3 ImportId=QID). Export labels,
|
||||
desetinná tečka, recode unanswered vypnuté.
|
||||
|
||||
## Delta (přepíše JEN změněná data)
|
||||
nová→INSERT; beze změn (shodný `content_sha256`)→jen `last_seen_at` + `source_file` + `source_exported_at`;
|
||||
změna→`$set` jen změněných polí + `$push` do `history[]`.
|
||||
|
||||
## Soft-link na investigators (nedestruktivní)
|
||||
pi_email → email/email2 (lower), pak recipient_email, fallback příjmení (bez diakritiky)+země.
|
||||
|
||||
## Použití
|
||||
```
|
||||
.venv\Scripts\python.exe Feasibility\sipiq_import_v1.2.py --dry-run # folder režim, default složka
|
||||
.venv\Scripts\python.exe Feasibility\sipiq_import_v1.2.py --apply
|
||||
.venv\Scripts\python.exe Feasibility\sipiq_import_v1.2.py --folder "<cesta>" --apply
|
||||
.venv\Scripts\python.exe Feasibility\sipiq_import_v1.2.py --csv "<cesta.csv>" --apply # jediný soubor, NEpřesouvá
|
||||
```
|
||||
Default složka `…\77242113UCO2001\ImportSIPIQcompled`; přesun do `Zpracováno` jen v `--apply` + folder režimu.
|
||||
`--scope czsk` (default) / `all`. Default = dry-run.
|
||||
|
||||
## Workflow
|
||||
Uživatel pokládá kompletní SIPIQ reporty (Qualtrics CSV, název
|
||||
`ICONIC+Phase+3b+UC+Study+(77242113UCO3002)_SipIQ_V1_13MAY2026_<datum>_<čas>.csv`) do
|
||||
`ImportSIPIQcompled\`. Po `--apply` se naimportují (delta) a přesunou do `Zpracováno\`.
|
||||
`source_exported_at` se bere z mtime souboru (datum/čas reportu dle filesystému).
|
||||
@@ -0,0 +1,489 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
sipiq_import_v1.2.py
|
||||
====================
|
||||
Verze: 1.2
|
||||
Datum: 2026-06-17
|
||||
Autor: Claude Code (pro MUDr. Vladimíra Buzalku)
|
||||
|
||||
Změny proti v1.1
|
||||
----------------
|
||||
- PROVENANCE: ke každé odpovědi se ukládá `source_exported_at` = datum/čas reportu
|
||||
podle FILESYSTÉMU (mtime CSV souboru). Mimo content-hash → nezpůsobuje zbytečné
|
||||
UPDATE; backfilluje se i na "beze změny" cestě. Stará v1.1 ponechána v TRASH.
|
||||
|
||||
Změny proti v1.0
|
||||
----------------
|
||||
- FOLDER WORKFLOW (v1.1): režim --folder sebere *.csv ve složce, naimportuje (delta)
|
||||
a přesune do podsložky `Zpracováno`. Default složka =
|
||||
U:\\PythonProject\\Janssen\\Feasibility\\77242113UCO2001\\ImportSIPIQcompled.
|
||||
|
||||
Popis
|
||||
-----
|
||||
Import SIPIQ odpovědí (Qualtrics CSV export, studie 77242113UCO3002 / ICONIC DAWN)
|
||||
do MongoDB db `feasibility`. Dvě kolekce:
|
||||
* sipiq_questions – slovník dotazníku (1 dok = 1 logická otázka).
|
||||
* sipiq_responses – 1 dok = 1 odpověď (_id = Qualtrics ResponseId), ploché answers{},
|
||||
soft-link investigator_oid, delta bookkeeping + history[].
|
||||
|
||||
DELTA import (přepíše JEN změněná data): nová->insert; beze změn->jen last_seen_at + source_file + source_exported_at;
|
||||
změna->$set jen změněných polí + push do history[].
|
||||
|
||||
Použití
|
||||
-------
|
||||
python sipiq_import_v1.2.py --dry-run # folder režim, default složka
|
||||
python sipiq_import_v1.2.py --apply
|
||||
python sipiq_import_v1.2.py --folder "<cesta>" --apply
|
||||
python sipiq_import_v1.2.py --csv "<cesta.csv>" --apply # jediný soubor (NEpřesouvá)
|
||||
|
||||
Závislosti: pymongo (.venv). Mongo 192.168.1.76:27017, bez auth.
|
||||
"""
|
||||
import argparse
|
||||
import csv
|
||||
import glob
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import unicodedata
|
||||
from datetime import datetime, timezone
|
||||
|
||||
try:
|
||||
from pymongo import MongoClient
|
||||
except ImportError:
|
||||
print("CHYBA: pymongo není nainstalován v aktuálním pythonu.", file=sys.stderr)
|
||||
raise
|
||||
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
DB_NAME = "feasibility"
|
||||
COL_Q = "sipiq_questions"
|
||||
COL_R = "sipiq_responses"
|
||||
DEFAULT_FOLDER = r"U:\PythonProject\Janssen\Feasibility\77242113UCO2001\ImportSIPIQcompled"
|
||||
PROCESSED_SUBDIR = "Zpracováno"
|
||||
|
||||
META_COLS = {
|
||||
"StartDate", "EndDate", "Status", "IPAddress", "Progress", "Duration (in seconds)",
|
||||
"Finished", "RecordedDate", "ResponseId", "RecipientLastName", "RecipientFirstName",
|
||||
"RecipientEmail", "ExternalReference", "LocationLatitude", "LocationLongitude",
|
||||
"DistributionChannel", "UserLanguage",
|
||||
}
|
||||
|
||||
PROMOTE = [
|
||||
"site_name", "site_address", "site_city", "site_state", "site_postcode", "site_country",
|
||||
"pi_first_name", "pi_last_name", "pi_phone", "pi_email",
|
||||
"sdl_site_id", "fire_site_id", "fire_investigator_id", "mailinglist_id",
|
||||
"survey_generated_by", "Date", "Time",
|
||||
]
|
||||
|
||||
SECTION_BY_QNUM = {}


def _sec(rng, name):
    """Register *name* as the section title of every question number in *rng*."""
    SECTION_BY_QNUM.update(dict.fromkeys(rng, name))


# (question numbers, section title), registered in this order at import time.
for _numbers, _title in (
    ([2], "J&J Internal Assessment"),
    (range(6, 14), "Contact Information"),
    (range(14, 22), "Confidentiality Statement"),
    (range(25, 28), "Interest"),
    (range(29, 35), "Protocol Requirements"),
    (range(36, 39), "Enrollment"),
    (range(40, 44), "Patient Demographics Overview"),
    (range(45, 50), "Site Overview"),
    ([51], "Operational Considerations"),
    (range(53, 55), "Comments"),
    (range(57, 62), "Patient Population"),
    (range(63, 68), "Site Experience and Staffing"),
    ([69], "Equipment and Facility Requirements"),
    (range(71, 76), "Institutional Review Board, Ethics Committee, and Contracts"),
):
    _sec(_numbers, _title)
|
||||
|
||||
STEM_OVERRIDE = {
|
||||
"Q31": "At your site, at what line(s) of treatment do you most commonly prescribe "
|
||||
"vedolizumab for patients with moderately to severely active ulcerative colitis?",
|
||||
"Q63": "Do you or your site staff have experience in performing the following types of "
|
||||
"study assessments/procedures?",
|
||||
"Q64": "The following personnel are required to run the study. "
|
||||
"Will your site have the following available?",
|
||||
"Q69": "The following equipment and facilities are required to run the studies. "
|
||||
"Are these available at your site?",
|
||||
}
|
||||
|
||||
|
||||
def now_iso():
    """Return the current time as a local, offset-aware ISO-8601 string (second precision)."""
    utc_now = datetime.now(timezone.utc)
    local_now = utc_now.astimezone()
    return local_now.isoformat(timespec="seconds")
|
||||
|
||||
|
||||
def file_mtime_iso(path):
    """Return the modification time of *path* as a local, offset-aware ISO-8601 string."""
    modified = datetime.fromtimestamp(os.stat(path).st_mtime)
    # astimezone() on the naive local value attaches the local UTC offset.
    return modified.astimezone().isoformat(timespec="seconds")
|
||||
|
||||
|
||||
def strip_accents(s):
    """Return *s* without combining marks (NFKD-decomposed); falsy input yields ""."""
    if not s:
        return ""
    decomposed = unicodedata.normalize("NFKD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
|
||||
|
||||
|
||||
def norm_name(s):
    """Normalize a name for matching: accents removed, lower-cased, whitespace collapsed."""
    folded = strip_accents(s or "").lower()
    collapsed = re.sub(r"\s+", " ", folded)
    return collapsed.strip()
|
||||
|
||||
|
||||
def sanitize_key(qcode):
    """Return *qcode* with every "#" and "." replaced by "_"."""
    return qcode.translate(str.maketrans("#.", "__"))
|
||||
|
||||
|
||||
def qnum(qcode):
    """Return the integer following the leading "Q" of *qcode*, or None if absent."""
    found = re.match(r"Q(\d+)", qcode)
    if found is None:
        return None
    return int(found.group(1))
|
||||
|
||||
|
||||
def qbase(qcode):
    """Return the leading "Q<digits>" part of *qcode*; codes without it are returned unchanged."""
    found = re.match(r"(Q\d+)", qcode)
    if found is None:
        return qcode
    return found.group(1)
|
||||
|
||||
|
||||
def import_id(h3_cell):
    """Extract "ImportId" from a JSON header cell.

    Args:
        h3_cell: Raw cell text from the third header row.

    Returns:
        The "ImportId" value ("" when the JSON object has no such key). The
        cell itself is returned unchanged when it is not valid JSON or does not
        decode to a JSON object.
    """
    try:
        parsed = json.loads(h3_cell)
    except (TypeError, ValueError):
        # Not JSON at all (json.JSONDecodeError is a ValueError subclass).
        return h3_cell
    if not isinstance(parsed, dict):
        # Valid JSON but not an object (number, list, ...): keep the raw cell.
        return h3_cell
    return parsed.get("ImportId", "")
|
||||
|
||||
|
||||
def split_text(text):
    """Split a column caption on " - " into (stem, label).

    The stem is the first segment. The label joins the remaining segments with
    " - ", skipping "Selected Choice" (case-insensitive) and segments shaped
    like "Q<digits>#<digits>". The label is None when there is no second
    segment or nothing survives the filtering.
    """
    stem, *tail = (segment.strip() for segment in re.split(r"\s+-\s+", text))
    if not tail:
        return stem, None
    kept = [
        segment for segment in tail
        if segment.lower() != "selected choice"
        and not re.fullmatch(r"Q\d+#\d+", segment)
    ]
    if not kept:
        return stem, None
    return stem, " - ".join(kept)
|
||||
|
||||
|
||||
def detect_type(qcode, observed):
    """Classify a column from its code and the values observed in it.

    Args:
        qcode: Column code; a "#" in it marks a matrix column.
        observed: Iterable of answer strings; empty ones are ignored.

    Returns:
        One of "matrix_yesno", "matrix_percent", "matrix", "numeric", "yesno"
        or "single_or_text".
    """
    values = [v for v in observed if v]
    is_yesno = bool(values) and all(v in ("Yes", "No") for v in values)
    is_numeric = bool(values) and all(
        re.fullmatch(r"-?\d+(\.\d+)?", v) for v in values)

    if "#" in qcode:
        if is_yesno:
            return "matrix_yesno"
        if is_numeric:
            return "matrix_percent"
        return "matrix"
    if is_numeric:
        return "numeric"
    if is_yesno:
        return "yesno"
    return "single_or_text"
|
||||
|
||||
|
||||
def load_csv(path):
    """Read a CSV export with three header rows.

    Args:
        path: CSV file path; read as UTF-8 with an optional BOM.

    Returns:
        (cols, data): ``cols`` is one dict per column with keys "i" (index),
        "code" (header row 1), "text" (header row 2) and "qid" (header row 3
        passed through ``import_id``); ``data`` is the list of remaining rows.

    Raises:
        ValueError: The file has fewer than three rows, so the header block
            is incomplete.
    """
    with open(path, encoding="utf-8-sig", newline="") as fh:
        rows = list(csv.reader(fh))
    if len(rows) < 3:
        # Fail with a readable message instead of a bare IndexError.
        raise ValueError(
            f"{path}: očekávány 3 hlavičkové řádky exportu, nalezeno řádků: {len(rows)}")
    h1, h2, h3 = rows[:3]
    data = rows[3:]
    cols = [{"i": i, "code": c, "text": t, "qid": import_id(j)}
            for i, (c, t, j) in enumerate(zip(h1, h2, h3))]
    return cols, data
|
||||
|
||||
|
||||
def col_getter(cols, data):
    """Build a cell accessor keyed by column code.

    Args:
        cols: Column dicts carrying "code" and "i".
        data: Unused; kept for the existing call signature.

    Returns:
        (get, idx): ``get(row, code)`` gives the stripped cell text, or "" when
        the code is unknown or the row is too short; ``idx`` maps code -> index.
    """
    idx = {}
    for col in cols:
        idx[col["code"]] = col["i"]

    def get(row, code):
        position = idx.get(code)
        if position is None or position >= len(row):
            return ""
        return row[position].strip()

    return get, idx
|
||||
|
||||
|
||||
def is_question_col(code):
    """Return True when *code* starts with "Q" immediately followed by a digit."""
    return re.match(r"Q\d", code) is not None
|
||||
|
||||
|
||||
def build_questions(cols, data):
    """Build the question dictionary (one document per base question code).

    Columns whose code looks like a question are grouped by their base code
    (``qbase``). Each group collects its columns as ``items``, the distinct
    "QID<n>" prefixes as ``qids``, and a ``type`` chosen as the most frequent
    per-column type from ``detect_type``.

    Args:
        cols: Column dicts from ``load_csv``.
        data: Data rows, used to collect the values observed per column.

    Returns:
        List of question documents in first-seen column order; ``order`` is the
        position in that list.
    """
    qcols = [c for c in cols if is_question_col(c["code"])]

    # Distinct non-empty values seen in each question column.
    observed = {c["code"]: set() for c in qcols}
    for row in data:
        for c in qcols:
            v = row[c["i"]].strip() if c["i"] < len(row) else ""
            if v:
                observed[c["code"]].add(v)

    groups = {}  # insertion order == first-seen order of base codes
    for c in qcols:
        code = c["code"]
        base = qbase(code)
        stem, label = split_text(c["text"])
        g = groups.get(base)
        if g is None:
            number = qnum(code)
            g = groups[base] = {
                "_id": base, "order": c["i"], "qnum": number,
                "section": SECTION_BY_QNUM.get(number, "Other"),
                "qids": [], "text": stem,
                "items": [], "_obs": set(), "_types": []}
        bq = re.match(r"(QID\d+)", c["qid"] or "")
        if bq and bq.group(1) not in g["qids"]:
            g["qids"].append(bq.group(1))
        item = {"key": sanitize_key(code), "qcode": code, "qid": c["qid"]}
        if label:
            item["label"] = label
        g["items"].append(item)
        g["_obs"] |= observed[code]
        g["_types"].append(detect_type(code, observed[code]))

    out = []
    for n, (base, g) in enumerate(groups.items()):
        obs = sorted(g.pop("_obs"))
        types = g.pop("_types")
        # Most frequent type; ties go to the type seen first. Iterating a set
        # here would make the winner depend on string hash randomization.
        gtype = (max(dict.fromkeys(types), key=types.count)
                 if types else "single_or_text")
        g["type"] = gtype
        if gtype in ("yesno", "matrix_yesno"):
            g["options"] = ["Yes", "No"]
        elif gtype == "single_or_text" and obs and len(obs) <= 12:
            # Few distinct answers: list them as the option set.
            g["options"] = obs
        else:
            g["options"] = []
        if base in STEM_OVERRIDE:
            g["text"] = STEM_OVERRIDE[base]
        g["order"] = n
        # A lone column without a label carries no item-level information.
        if len(g["items"]) == 1 and "label" not in g["items"][0]:
            g["items"] = []
        out.append(g)
    return out
|
||||
|
||||
|
||||
def build_response(cols, get, row, source_file):
    """Turn one data row into a response document.

    Args:
        cols: Column dicts from ``load_csv``.
        get: Accessor ``get(row, code)`` returning stripped cell text or "".
        row: The raw data row.
        source_file: File name stored in the document as ``source_file``.

    Returns:
        Dict keyed by ResponseId (``_id``) with promoted site/PI fields, a
        ``meta`` sub-document, flat non-empty ``answers`` and unset
        investigator link fields. Empty cells become None.
    """
    def opt(code):
        return get(row, code) or None

    def lowered(code):
        return (get(row, code) or "").lower() or None

    rid = get(row, "ResponseId")

    # Non-empty question cells, keyed by the sanitized column code.
    answers = {}
    for c in cols:
        if not is_question_col(c["code"]):
            continue
        cell = row[c["i"]].strip() if c["i"] < len(row) else ""
        if cell:
            answers[sanitize_key(c["code"])] = cell

    progress = get(row, "Progress")
    duration = get(row, "Duration (in seconds)")
    meta = {
        "start_date": opt("StartDate"),
        "end_date": opt("EndDate"),
        "recorded_date": opt("RecordedDate"),
        "status": opt("Status"),
        # Digits become int; any other non-empty text is kept verbatim.
        "progress": int(progress) if progress.isdigit() else (progress or None),
        "finished": get(row, "Finished") in ("True", "1", "TRUE"),
        "duration_sec": int(duration) if duration.isdigit() else None,
        "user_language": opt("UserLanguage"),
        "distribution_channel": opt("DistributionChannel"),
        "ip_address": opt("IPAddress"),
        "location_lat": opt("LocationLatitude"),
        "location_lng": opt("LocationLongitude"),
        "survey_date": opt("Date"),
        "survey_time": opt("Time"),
    }

    doc = {"_id": rid, "study": "77242113UCO3002"}
    for field in ("site_country", "site_name", "site_city", "site_state",
                  "site_postcode", "site_address", "pi_first_name", "pi_last_name"):
        doc[field] = opt(field)
    doc["pi_email"] = lowered("pi_email")
    for field in ("pi_phone", "sdl_site_id", "fire_site_id", "fire_investigator_id",
                  "mailinglist_id", "survey_generated_by"):
        doc[field] = opt(field)
    doc["recipient_email"] = lowered("RecipientEmail")
    doc["recipient_last_name"] = opt("RecipientLastName")
    doc["recipient_first_name"] = opt("RecipientFirstName")
    doc["meta"] = meta
    # Any answer under one of these question prefixes marks the response as full.
    full_markers = ("Q57", "Q58", "Q59", "Q63", "Q66", "Q71")
    doc["is_full_sipiq"] = any(key.startswith(full_markers) for key in answers)
    doc["interested"] = answers.get("Q25")
    doc["answers"] = answers
    doc["investigator_oid"] = None
    doc["investigator_match"] = None
    doc["source_file"] = source_file
    return doc
|
||||
|
||||
|
||||
def content_hash(doc):
    """Return the SHA-256 hex digest of *doc*'s content fields.

    Bookkeeping, link and provenance fields are left out, so changing them
    alone does not change the digest. The payload is serialized as
    key-sorted JSON.
    """
    ignored = frozenset((
        "content_sha256", "first_imported_at", "last_seen_at", "last_updated_at",
        "history", "investigator_oid", "investigator_match", "source_file",
        "source_exported_at",
    ))
    payload = {key: value for key, value in doc.items() if key not in ignored}
    serialized = json.dumps(payload, sort_keys=True, ensure_ascii=False, default=str)
    return hashlib.sha256(serialized.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def load_investigators(db):
    """Load Czech and Slovak investigators and index them for soft-linking.

    Args:
        db: Database handle exposing an ``investigators`` collection.

    Returns:
        (inv, by_email, by_name): the fetched documents; a map from
        lower-cased ``email``/``email2`` to the first document carrying it;
        and a map from (normalized surname, ``zeme``) to the list of matching
        documents.
    """
    query = {"zeme": {"$in": ["Czech Republic", "Slovakia"]}}
    projection = {"prijmeni": 1, "jmeno": 1, "email": 1, "email2": 1, "zeme": 1, "KROK": 1}
    inv = list(db.investigators.find(query, projection))

    by_email, by_name = {}, {}
    for person in inv:
        for raw in (person.get("email"), person.get("email2")):
            address = (raw or "").lower().strip()
            # First investigator seen with an address keeps it.
            if address and address not in by_email:
                by_email[address] = person
        surname = norm_name(person.get("prijmeni"))
        if surname:
            by_name.setdefault((surname, person.get("zeme")), []).append(person)
    return inv, by_email, by_name
|
||||
|
||||
|
||||
def soft_link(doc, by_email, by_name):
    """Find the investigator a response belongs to.

    Tries ``pi_email``, then ``recipient_email``, then a unique match on
    (normalized ``pi_last_name``, ``site_country``).

    Returns:
        (investigator _id or None, description of how the match was made,
        matched investigator document or None).
    """
    for field, tag in (("pi_email", "email"), ("recipient_email", "recipient_email")):
        address = (doc.get(field) or "").lower().strip()
        if address and address in by_email:
            hit = by_email[address]
            return hit["_id"], f"{tag}:{address}", hit

    surname = norm_name(doc.get("pi_last_name"))
    candidates = by_name.get((surname, doc.get("site_country")), [])
    if not candidates:
        return None, "NENALEZENO", None
    if len(candidates) > 1:
        # Several investigators share the surname in that country: do not guess.
        return None, f"prijmeni_ambiguous:{surname}({len(candidates)})", None
    only = candidates[0]
    return only["_id"], f"prijmeni:{surname}", only
|
||||
|
||||
|
||||
def diff_docs(old, new):
    """List the field-level differences between a stored and a new response.

    ``answers`` and ``meta`` are compared key by key (recursively for nested
    dicts). Every other content field of *new* is compared as a whole value, so
    a change in any hashed field ends up in the history entry. Bookkeeping,
    link and provenance fields are not compared.

    Args:
        old: Document currently stored.
        new: Freshly built document.

    Returns:
        List of ``{"key": dotted_path, "old": value, "new": value}`` dicts:
        ``answers``/``meta`` changes first, then the five headline fields in
        their historical order, then the remaining fields sorted by name.
    """
    changes = []

    def record(key, ov, nv):
        changes.append({"key": key, "old": ov, "new": nv})

    def dict_or_missing(value):
        return value is None or isinstance(value, dict)

    def walk(prefix, o, n):
        o, n = o or {}, n or {}
        for k in sorted(set(o) | set(n)):
            ov, nv = o.get(k), n.get(k)
            nested = isinstance(ov, dict) or isinstance(nv, dict)
            if nested and dict_or_missing(ov) and dict_or_missing(nv):
                walk(f"{prefix}{k}.", ov, nv)
            elif ov != nv:
                # Also covers scalar <-> dict swaps, reported as one change.
                record(f"{prefix}{k}", ov, nv)

    nested_fields = ("answers", "meta")
    for field in nested_fields:
        walk(f"{field}.", old.get(field), new.get(field))

    headline = ("site_name", "pi_email", "pi_last_name", "interested", "is_full_sipiq")
    # Fields that are not response content and must not show up as changes.
    not_content = {
        "_id", "content_sha256", "first_imported_at", "last_seen_at",
        "last_updated_at", "history", "investigator_oid", "investigator_match",
        "source_file", "source_exported_at", *nested_fields, *headline}
    # Only fields present in *new* are considered: extra fields on the stored
    # document are not touched by the import and are therefore not changes.
    remaining = sorted(k for k in new if k not in not_content)
    for k in (*headline, *remaining):
        if old.get(k) != new.get(k):
            record(k, old.get(k), new.get(k))
    return changes
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
def process_file(db, csv_path, scope, dry, by_email, by_name):
    """Import one CSV export: build documents, report the delta, optionally write.

    Args:
        db: Database handle holding the COL_Q and COL_R collections.
        csv_path: CSV file to import.
        scope: "czsk" keeps only rows whose ``site_country`` is Czech Republic
            or Slovakia; any other value keeps all rows.
        dry: When true, only the preview is printed and nothing is written.
        by_email, by_name: Investigator indexes from ``load_investigators``.

    Returns:
        Dict with "insert", "update", "unchanged" counts (all 0 in dry-run)
        and "wrote" (whether the database was modified).
    """
    source_file = os.path.basename(csv_path)
    exported_at = file_mtime_iso(csv_path)  # report date/time taken from the file mtime
    # Parse once; the unfiltered rows also feed the question dictionary.
    cols, data_all = load_csv(csv_path)
    get, _ = col_getter(cols, data_all)

    # A row without ResponseId (e.g. a blank CSV line) has no usable _id.
    data = [r for r in data_all if get(r, "ResponseId")]
    skipped = len(data_all) - len(data)
    if scope == "czsk":
        data = [r for r in data if get(r, "site_country") in ("Czech Republic", "Slovakia")]
    print(f"\n########## {source_file} (rozsah={scope}, odpovědí={len(data)}, export={exported_at}) ##########")
    if skipped:
        print(f" ! přeskočeno řádků bez ResponseId: {skipped}")

    questions = build_questions(cols, data_all)

    docs, link_rows = [], []
    for r in data:
        doc = build_response(cols, get, r, source_file)
        oid, how, matched = soft_link(doc, by_email, by_name)
        doc["investigator_oid"] = oid
        doc["investigator_match"] = how
        doc["source_exported_at"] = exported_at
        doc["content_sha256"] = content_hash(doc)
        docs.append(doc)
        link_rows.append((doc, how, matched))

    # Stored hashes are needed only for the responses present in this file.
    ids = [d["_id"] for d in docs]
    existing = {d["_id"]: d
                for d in db[COL_R].find({"_id": {"$in": ids}}, {"content_sha256": 1})}
    to_insert, to_update, unchanged = [], [], []
    for d in docs:
        stored = existing.get(d["_id"])
        if stored is None:
            to_insert.append(d)
        elif stored.get("content_sha256") != d["content_sha256"]:
            to_update.append(d)
        else:
            unchanged.append(d)

    # Soft-link summary: matched at KROK 7, matched at another step, unmatched.
    mk7 = mko = un = 0
    for _doc, _how, m in link_rows:
        if not m:
            un += 1
        elif str(m.get("KROK", "")).startswith("7"):
            mk7 += 1
        else:
            mko += 1
    print(f" slovník: {len(questions)} otázek | soft-link: KROK7={mk7}, jiný={mko}, nenapárováno={un}")
    print(f" delta: INSERT={len(to_insert)}, UPDATE={len(to_update)}, beze změny={len(unchanged)}")
    if un:
        for doc, how, m in link_rows:
            if not m:
                print(f" ✗ NENAPÁROVÁNO: {doc.get('pi_last_name')} / {doc.get('pi_email')} ({how})")

    if dry:
        print(" [DRY-RUN] nezapsáno")
        return {"insert": 0, "update": 0, "unchanged": 0, "wrote": False}

    for q in questions:
        db[COL_Q].replace_one({"_id": q["_id"]}, q, upsert=True)
    ts = now_iso()
    ni = nu = ns = 0
    for d in docs:
        # Full stored document is needed to compute the change list.
        cur = db[COL_R].find_one({"_id": d["_id"]})
        if cur is None:
            d.update({"first_imported_at": ts, "last_seen_at": ts, "last_updated_at": ts, "history": []})
            db[COL_R].insert_one(d)
            ni += 1
        elif cur.get("content_sha256") != d["content_sha256"]:
            changes = diff_docs(cur, d)
            db[COL_R].update_one({"_id": d["_id"]}, {
                "$set": {**{k: d[k] for k in d if k != "_id"}, "last_seen_at": ts, "last_updated_at": ts},
                "$push": {"history": {"changed_at": ts, "source_file": source_file, "changes": changes}}})
            nu += 1
        else:
            # Content unchanged: refresh only the bookkeeping/provenance fields.
            db[COL_R].update_one({"_id": d["_id"]}, {"$set": {
                "last_seen_at": ts, "source_file": source_file, "source_exported_at": d["source_exported_at"]}})
            ns += 1
    print(f" [APPLY] questions upsert={len(questions)} | responses insert={ni}, update={nu}, beze změny={ns}")
    return {"insert": ni, "update": nu, "unchanged": ns, "wrote": True}
|
||||
|
||||
|
||||
def move_to_processed(csv_path, folder):
    """Move *csv_path* into the PROCESSED_SUBDIR sub-folder of *folder*.

    The sub-folder is created when missing. If the target name is taken, the
    first free "<stem>_<n><ext>" (n = 1, 2, ...) is used instead.
    """
    dest_dir = os.path.join(folder, PROCESSED_SUBDIR)
    os.makedirs(dest_dir, exist_ok=True)

    original_name = os.path.basename(csv_path)
    stem, ext = os.path.splitext(original_name)
    dest = os.path.join(dest_dir, original_name)
    suffix = 0
    while os.path.exists(dest):
        suffix += 1
        dest = os.path.join(dest_dir, f"{stem}_{suffix}{ext}")

    shutil.move(csv_path, dest)
    print(f" -> přesunuto do {PROCESSED_SUBDIR}\\{os.path.basename(dest)}")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point of the importer.

    Without ``--csv`` every ``*.csv`` in ``--folder`` is processed and, after a
    real write, moved to the processed sub-folder. With ``--csv`` a single file
    is processed and left in place. Nothing is written unless ``--apply`` is
    given; ``--apply`` and ``--dry-run`` cannot be combined.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", help="jediný soubor (NEpřesouvá)")
    ap.add_argument("--folder", default=DEFAULT_FOLDER, help="složka se SIPIQ CSV (přesune do Zpracováno)")
    ap.add_argument("--scope", choices=["czsk", "all"], default="czsk")
    # Contradictory flags are rejected instead of letting --apply silently win.
    mode = ap.add_mutually_exclusive_group()
    mode.add_argument("--apply", action="store_true")
    mode.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()
    dry = not args.apply

    if args.csv:
        files, move_mode, folder = [args.csv], False, None
    else:
        folder = args.folder
        files = sorted(glob.glob(os.path.join(folder, "*.csv")))
        move_mode = True
        print(f"Složka: {folder}\nNalezeno CSV ke zpracování: {len(files)}")
    if not files:
        print("Nic ke zpracování (žádné *.csv).")
        return

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=8000)
    try:
        db = client[DB_NAME]
        client.admin.command("ping")  # fail fast when the server is unreachable
        inv, by_email, by_name = load_investigators(db)
        print(f"Investigatorů CZ+SK v DB: {len(inv)}")

        total = {"insert": 0, "update": 0, "unchanged": 0}
        for f in files:
            res = process_file(db, f, args.scope, dry, by_email, by_name)
            for k in total:
                total[k] += res[k]
            # Files are moved only in folder mode and only after a real write.
            if move_mode and res["wrote"]:
                move_to_processed(f, folder)

        print(f"\n=== CELKEM: insert={total['insert']}, update={total['update']}, beze změny={total['unchanged']} ===")
        if dry:
            print("[DRY-RUN] Nic se nezapsalo ani nepřesunulo. Ostrý běh: --apply")
    finally:
        # Release the connection even when a file fails to import.
        client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,38 @@
|
||||
# store_cda_seaweed_v1.0.py
|
||||
|
||||
**Verze:** 1.0 · **Datum:** 2026-06-17
|
||||
|
||||
## Účel
|
||||
Uloží podepsané CDA (PDF) z e-mailů asistentek (CTA) do Mongo
|
||||
`feasibility.investigators` do pole `cda.*` a posune lékaře na
|
||||
`KROK "5 - CDA podepsano"`.
|
||||
|
||||
Na rozdíl od `store_cda_batch` (stahuje `.msg` přes SFTP z Toweru a tahá přílohu
|
||||
přes `extract_msg`) tahle verze stahuje PDF **přímo ze SeaweedFS** přes
|
||||
`seaweed_url`, který parser ukládá k příloze v `emaily."vbuzalka@its.jnj.com"`
|
||||
(`attachments[].seaweed_url` + `sha256`). Jednodušší, bez SFTP.
|
||||
|
||||
## Jak to funguje
|
||||
- `MAPPING` = explicitní párování `investigator _id → (seaweed_url, filename, sha256, size, source_msg_id)`.
|
||||
- Pro každý záznam: stáhne PDF (urllib), ověří **SHA256 + velikost + PDF hlavičku**,
|
||||
base64-zakóduje a uloží do `cda`:
|
||||
`data_base64, data_sha256, data_filename, data_mime, data_size, data_stored_at,
|
||||
data_source_msg` + metadata `stav="podepsano", soubor, zdroj`.
|
||||
- Nastaví `KROK = "5 - CDA podepsano"` a předřadí řádek do `STATUS`.
|
||||
- `_id` se konvertuje na `ObjectId` (čisté pymongo nekonvertuje string→ObjectId samo).
|
||||
|
||||
## Použití
|
||||
```
|
||||
.venv\Scripts\python.exe Feasibility\store_cda_seaweed_v1.0.py # dry-run (ověří stažení+SHA, nezapisuje)
|
||||
.venv\Scripts\python.exe Feasibility\store_cda_seaweed_v1.0.py --apply # zapíše do Mongo
|
||||
```
|
||||
|
||||
## Běh 17JUN2026 (--apply)
|
||||
Uloženo 5/5 (všechny SHA256 OK), KROK 4 → 5:
|
||||
Závada Filip, Bruncák Michal (FNsP B. Bystrica), Machytka Evžen (Asclepiades),
|
||||
Pumprla Jiří (PreventaMed), Zapotocká Júlia (PAV-MED).
|
||||
GASTROMART/Molnár přeskočen (už KROK 6, CDA dříve uloženo).
|
||||
|
||||
## Závislosti
|
||||
`pymongo`, `bson` (+ stdlib). SeaweedFS volume server `192.168.1.50:8888`.
|
||||
Mongo `192.168.1.76:27017`.
|
||||
@@ -0,0 +1,126 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# =============================================================================
|
||||
# Nazev: store_cda_seaweed_v1.0.py
|
||||
# Verze: 1.0
|
||||
# Datum: 2026-06-17
|
||||
# Popis: Ulozi podepsane CDA (PDF) z e-mailu asistentek do Mongo
|
||||
# feasibility.investigators do pole cda.* a posune lekare na
|
||||
# KROK "5 - CDA podepsano". PDF se stahuji primo ze SeaweedFS
|
||||
# (seaweed_url z attachments v emaily."vbuzalka@its.jnj.com"),
|
||||
# overuje se SHA256 proti metadatum z Mongo.
|
||||
# Pouziti: python store_cda_seaweed_v1.0.py (dry-run / nahled)
|
||||
# python store_cda_seaweed_v1.0.py --apply (zapise do Mongo)
|
||||
# Pozn.: MAPPING nize = explicitni parovani investigator -> CDA priloha.
|
||||
# Jen stdlib + pymongo. SeaweedFS host 192.168.1.50:8888.
|
||||
# =============================================================================
|
||||
|
||||
import sys
|
||||
import base64
|
||||
import hashlib
|
||||
import urllib.request
|
||||
from datetime import datetime, timezone
|
||||
from pymongo import MongoClient
|
||||
from bson import ObjectId
|
||||
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
DBN, COL = "feasibility", "investigators"
|
||||
|
||||
# (investigator _id, seaweed_url, filename, sha256, size, source_msg_id, label)
|
||||
MAPPING = [
|
||||
("6a198b661218c31ab0f5ba57",
|
||||
"http://192.168.1.50:8888/mail-attachments/1a/86/1a86e987b9d3da57c1d863b47734133f2e2d7eae3f5cfe91112c475eb86d86e9",
|
||||
"CZ_CDA PI_MUDr. Filip Zavada_fully signed_16Jun2026.pdf",
|
||||
"1a86e987b9d3da57c1d863b47734133f2e2d7eae3f5cfe91112c475eb86d86e9",
|
||||
479026, "<CH2PR07MB7190A5538ACDC1D49F8B430780E52@CH2PR07MB7190.namprd07.prod.outlook.com>",
|
||||
"Zavada Filip"),
|
||||
("6a19832b5fc2213518257957",
|
||||
"http://192.168.1.50:8888/mail-attachments/64/b0/64b06d48bfe3c49095e326988f14c04fd5849728b227647f6653b2e3c3095538",
|
||||
"SK_CDA PI_Bruncak_FNsP BBystrica_fully signed 16Jun2026.pdf",
|
||||
"64b06d48bfe3c49095e326988f14c04fd5849728b227647f6653b2e3c3095538",
|
||||
498069, "<SA1PR07MB952874B8654156369CDE44448CE52@SA1PR07MB9528.namprd07.prod.outlook.com>",
|
||||
"Bruncak Michal"),
|
||||
("6a19832b5fc2213518257961",
|
||||
"http://192.168.1.50:8888/mail-attachments/c2/72/c272ca62bd27ca10aed35cb54054d880f4f0e2f59940ed3b067b17d51a9ac041",
|
||||
"CZ_CDA Institution_Asclepiades s.r.o._MUDr. Machytka_16Jun2026.pdf",
|
||||
"c272ca62bd27ca10aed35cb54054d880f4f0e2f59940ed3b067b17d51a9ac041",
|
||||
460977, "<PH0PR07MB97879A9C9BF9C00D38D4798A9FE52@PH0PR07MB9787.namprd07.prod.outlook.com>",
|
||||
"Machytka Evzen (Asclepiades)"),
|
||||
("6a19832b5fc2213518257967",
|
||||
"http://192.168.1.50:8888/mail-attachments/99/37/99372c399be3b001428ef4b36d43e250dedced5955de5d1f3a2d63a9f0c1728b",
|
||||
"CZ_CDA institution_PreventaMed sro_fully signed_16Jun2026.pdf",
|
||||
"99372c399be3b001428ef4b36d43e250dedced5955de5d1f3a2d63a9f0c1728b",
|
||||
457745, "<CH2PR07MB719008DB0B3CAFD764AE2E8280E52@CH2PR07MB7190.namprd07.prod.outlook.com>",
|
||||
"Pumprla Jiri (PreventaMed)"),
|
||||
("6a1c4275aa46d8b608065ce9",
|
||||
"http://192.168.1.50:8888/mail-attachments/94/95/9495c742407873efd8dd9713e1dc962cb08e55e0d3690e4a79a90132ee358dee",
|
||||
"SK_CDA Institution_PAV-MED s r.o_fully signed_15Jun2026.pdf",
|
||||
"9495c742407873efd8dd9713e1dc962cb08e55e0d3690e4a79a90132ee358dee",
|
||||
460246, "<CH2PR07MB719008DB0B3CAFD764AE2E8280E52@CH2PR07MB7190.namprd07.prod.outlook.com>",
|
||||
"Zapotocka Julia (PAV-MED)"),
|
||||
]
|
||||
|
||||
|
||||
def fetch(url):
    """Download *url* (30 s timeout) and return the whole body as bytes."""
    response = urllib.request.urlopen(url, timeout=30)
    with response:
        body = response.read()
    return body
|
||||
|
||||
|
||||
def main():
    """Store the signed CDA PDFs listed in MAPPING on their investigators.

    For each entry the PDF is downloaded and checked (SHA-256, size, "%PDF-"
    header). With ``--apply`` on the command line the base64 payload and its
    metadata are written under ``cda.*``, ``KROK`` is set to
    "5 - CDA podepsano" and a line is prepended to ``STATUS``. Without
    ``--apply`` only the checks run.

    An investigator that already holds a CDA with the same SHA-256 is skipped,
    so a repeated run neither duplicates the STATUS line nor resets KROK.
    """
    apply = "--apply" in sys.argv
    cli = MongoClient(MONGO_URI)
    try:
        col = cli[DBN][COL]
        now = datetime.now(timezone.utc).isoformat()

        ok = 0
        for _id, url, fname, sha, size, src, label in MAPPING:
            # Plain pymongo does not convert a hex string to ObjectId by itself.
            oid = ObjectId(_id)
            doc = col.find_one({"_id": oid},
                               {"STATUS": 1, "KROK": 1, "cda.stav": 1, "cda.data_sha256": 1})
            if not doc:
                print(f" !! {label}: investigator _id={_id} NENALEZEN")
                continue
            if (doc.get("cda") or {}).get("data_sha256") == sha:
                # Same file already stored: writing again would repeat the
                # STATUS line and could move KROK backwards.
                print(f" [{label}] >> PRESKAKUJI (CDA se stejnym SHA256 uz ulozeno)")
                continue
            try:
                raw = fetch(url)
            except Exception as e:
                print(f" !! {label}: stazeni selhalo: {e}")
                continue
            got = hashlib.sha256(raw).hexdigest()
            sha_ok = (got == sha)
            size_ok = (len(raw) == size)
            head_ok = raw[:5] == b"%PDF-"
            print(f" [{label}]")
            print(f" soubor : {fname}")
            print(f" stazeno : {len(raw)} B (ocek. {size}) {'OK' if size_ok else 'MISMATCH'}")
            print(f" sha256 : {'OK' if sha_ok else 'MISMATCH! ' + got}")
            print(f" PDF hdr : {'OK' if head_ok else 'NENI PDF'}")
            print(f" KROK : {doc.get('KROK')} -> 5 - CDA podepsano")
            if not (sha_ok and size_ok and head_ok):
                print(" >> PRESKAKUJI (kontrola selhala)")
                continue
            if not apply:
                ok += 1
                continue

            b64 = base64.b64encode(raw).decode("ascii")
            old_status = doc.get("STATUS", "") or ""
            new_line = (f"17JUN2026: podepsane CDA ULOZENO do Mongo (cda.data) — {fname} "
                        f"(z e-mailu asistentky). KROK 5, pripraveno na SIPIQ.")
            col.update_one({"_id": oid}, {"$set": {
                "KROK": "5 - CDA podepsano",
                "STATUS": new_line + "\n" + old_status,
                "cda.stav": "podepsano",
                "cda.soubor": fname,
                "cda.zdroj": "e-mail asistentky (SeaweedFS)",
                "cda.data_base64": b64,
                "cda.data_sha256": sha,
                "cda.data_filename": fname,
                "cda.data_mime": "application/pdf",
                "cda.data_size": len(raw),
                "cda.data_stored_at": now,
                "cda.data_source_msg": src,
            }})
            ok += 1
            print(" >> ULOZENO + KROK 5")

        print(f"\n{'ZAPSANO' if apply else 'DRY-RUN OK'}: {ok}/{len(MAPPING)}")
        if not apply:
            print(">>> Pro zapis spust s --apply")
    finally:
        # Release the connection on every exit path.
        cli.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,53 @@
|
||||
# sipiq_download_v2.1
|
||||
|
||||
**Verze:** 2.1 · **Datum:** 2026-06-19
|
||||
|
||||
## Co dělá
|
||||
Automaticky stáhne SIPIQ survey report (CSV) z Qualtrics přes **Playwright** a
|
||||
volitelně rovnou naimportuje do Mongo.
|
||||
|
||||
1. Přihlásí se (username/password) na login.qualtrics.com.
|
||||
2. Otevře Data & Analysis studie 77242113UCO3002 (`SV_9AdeNaNyohp5fNQ`).
|
||||
3. Export & Import → Export Data → CSV → Download (zachyceno `expect_download`).
|
||||
4. Uloží s timestampem do `U:\Dropbox\!!!Days\Downloads Z230\`.
|
||||
5. **`--import`** → spustí `sipiq_import_v1.3.py --csv "<soubor>"` (delta import
|
||||
do Mongo, soubor NEpřesouvá).
|
||||
|
||||
## Proč Playwright (a ne API)
|
||||
Účet `77242113uco3002_sipiq` **nemá povolené API**. Interní UI export běží jen na
|
||||
session cookies a vrací interní `fileUrl` (`riptooth.service.consul:9000`)
|
||||
nedostupný zvenčí → čistě `requests` soubor nestáhne. Playwright se přihlásí sám
|
||||
(řeší expiraci session). Verze 1.0 (requests/API) i 2.0 (bez `--import`) v `TRASH`.
|
||||
|
||||
## Konfigurace (root `.env`)
|
||||
```
|
||||
QUALTRICS_USER=77242113uco3002_sipiq
|
||||
QUALTRICS_PASS=77242113uco3002_sipiq
|
||||
```
|
||||
Fallback na zabudované hodnoty, pokud `.env` chybí.
|
||||
|
||||
## Použití
|
||||
```
|
||||
PY=U:\PythonProject\Janssen\.venv\Scripts\python.exe
|
||||
|
||||
# jen stáhnout (headed okno)
|
||||
"%PY%" sipiq_download_v2.1.py
|
||||
|
||||
# stáhnout + import do Mongo, bez okna (scheduler)
|
||||
"%PY%" sipiq_download_v2.1.py --headless --import
|
||||
|
||||
# stáhnout + náhled importu (nic nezapíše)
|
||||
"%PY%" sipiq_download_v2.1.py --headless --import --dry-run
|
||||
|
||||
# scope all (default czsk)
|
||||
"%PY%" sipiq_download_v2.1.py --headless --import --scope all
|
||||
```
|
||||
- Profil prohlížeče: `Feasibility\qualtrics_profile\` (persistent, gitignored) →
|
||||
po prvním přihlášení se login přeskakuje.
|
||||
- Při chybě stahování se uloží screenshot `_dl_99_error.png`.
|
||||
|
||||
## Ověřeno 19JUN2026
|
||||
- Stažení: 247 sloupců, 3 hlavičky, 323 odpovědí, 498 110 B = shodné s ručním
|
||||
exportem. Modal defaulty sedí: CSV + Download all fields + Export labels.
|
||||
- Řetězení `--import`: download → `sipiq_import_v1.3 --csv` proběhlo (18 CZ+SK,
|
||||
delta beze změny=18 = idempotentní; `--dry-run` nic nezapsal).
|
||||
@@ -0,0 +1,166 @@
|
||||
# =============================================================================
|
||||
# Název: sipiq_download_v2.1.py
|
||||
# Verze: 2.1
|
||||
# Datum: 2026-06-19
|
||||
# Autor: Claude Code (pro MUDr. Vladimíra Buzalku)
|
||||
# Popis: Automatické stažení SIPIQ survey reportu (CSV) z Qualtrics přes
|
||||
# Playwright. Přihlásí se (username/password), otevře Data & Analysis
|
||||
# studie 77242113UCO3002 (SV_9AdeNaNyohp5fNQ), spustí Export & Import →
|
||||
# Export Data → CSV → Download a stažený soubor uloží s timestampem do
|
||||
# U:\Dropbox\!!!Days\Downloads Z230\.
|
||||
#
|
||||
# v2.1: přepínač --import → po úspěšném stažení rovnou spustí
|
||||
# sipiq_import_v1.3.py --csv "<stažený soubor>" (delta import do Mongo,
|
||||
# NEpřesouvá soubor). --scope se předá importu (default czsk).
|
||||
#
|
||||
# Proč Playwright a ne API: účet NEMÁ povolené API na úrovni účtu;
|
||||
# interní export běží jen na session cookies + interní fileUrl
|
||||
# (riptooth.service.consul) nedostupné zvenčí. Playwright se přihlásí
|
||||
# sám (řeší expiraci session) a stažení nechá na prohlížeči.
|
||||
#
|
||||
# Credentials: ROOT .env (QUALTRICS_USER / QUALTRICS_PASS), fallback
|
||||
# na zabudované hodnoty účtu sipiq.
|
||||
# Verze 1.0 (requests/API) a 2.0 (bez --import) přesunuty do TRASH.
|
||||
# =============================================================================
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
load_dotenv(os.path.join(_ROOT, ".env"))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
DC = "janssenfeasibility.co1"
|
||||
SURVEY = "SV_9AdeNaNyohp5fNQ"
|
||||
USER = os.environ.get("QUALTRICS_USER", "77242113uco3002_sipiq")
|
||||
PWD = os.environ.get("QUALTRICS_PASS", "77242113uco3002_sipiq")
|
||||
LOGIN_URL = "https://login.qualtrics.com/login"
|
||||
RESP_URL = f"https://{DC}.qualtrics.com/responses/#/surveys/{SURVEY}"
|
||||
|
||||
OUT_DIR = r"U:\Dropbox\!!!Days\Downloads Z230"
|
||||
HERE = os.path.dirname(os.path.abspath(__file__))
|
||||
PROFILE = os.path.join(HERE, "qualtrics_profile")
|
||||
IMPORT_SCRIPT = os.path.join(HERE, "sipiq_import_v1.3.py")
|
||||
DEBUG_DIR = HERE
|
||||
|
||||
|
||||
def dbg(page, name):
    """Save a best-effort debug screenshot named ``_dl_<name>.png``.

    The file is written into DEBUG_DIR. Any failure is deliberately ignored
    so that a diagnostic aid can never abort the download itself.
    """
    try:
        shot_path = os.path.join(DEBUG_DIR, f"_dl_{name}.png")
        page.screenshot(path=shot_path)
    except Exception:
        # Screenshots are diagnostics only; never let them break the run.
        pass
|
||||
|
||||
|
||||
def ensure_login(page):
    """Open the Qualtrics login page and sign in when the form is shown.

    If the user-name field does not become visible within 5 s, the persistent
    browser profile is assumed to hold a live session and the login is skipped.

    ``Locator.is_visible()`` ignores its ``timeout`` argument and answers
    immediately, so a slowly rendering form used to be mistaken for an active
    session. ``Locator.wait_for()`` really waits for the element.
    """
    page.goto(LOGIN_URL, timeout=120000)
    page.wait_for_load_state("domcontentloaded")
    page.wait_for_timeout(3000)
    field = page.locator("#UserName, input[name='username'], #username").first
    try:
        field.wait_for(state="visible", timeout=5000)
    except PWTimeout:
        print("Session aktivní, přihlášení přeskočeno.")
        return

    print("Přihlašuji se…")
    field.fill(USER)
    page.locator("#UserPassword, input[name='password'], #password").first.fill(PWD)
    page.locator("#loginButton, button[type='submit'], #login-button").first.click()
    # Give the post-login redirect chain time to settle before navigating on.
    page.wait_for_timeout(8000)
    page.wait_for_load_state("domcontentloaded")
|
||||
|
||||
|
||||
def open_data_table(page):
    """Navigate to the survey's responses page and wait until it is usable.

    The page counts as loaded once the "Export & Import" button exists
    (waiting up to 120 s), followed by a short fixed settle delay.
    """
    page.goto(RESP_URL, timeout=120000)
    page.wait_for_load_state("domcontentloaded")
    export_button = page.get_by_role("button", name="Export & Import")
    export_button.wait_for(timeout=120000)
    page.wait_for_timeout(2000)
    print("Data & Analysis načteno.")
|
||||
|
||||
|
||||
def do_export(page):
    """Drive the export dialog and return the resulting download object.

    Steps: open "Export & Import", choose "Export Data", try to select the
    CSV tab (by role, then by exact text; if both fail the dialog default is
    used), then click the last "Download" button while waiting up to 180 s
    for the browser download to start. Debug screenshots are taken on the way.
    """
    page.get_by_role("button", name="Export & Import").click()
    page.wait_for_timeout(1500)
    dbg(page, "01_menu")

    page.get_by_text("Export Data", exact=False).first.click()
    page.wait_for_timeout(3000)
    dbg(page, "02_modal")

    try:
        page.get_by_role("tab", name="CSV").click(timeout=5000)
    except PWTimeout:
        # Fallback selector; any failure here means we rely on the default.
        try:
            page.get_by_text("CSV", exact=True).first.click(timeout=5000)
        except Exception:
            print("CSV tab nenalezen, spoléhám na default.")
    page.wait_for_timeout(1500)
    dbg(page, "03_csv")

    print("Spouštím Download…")
    with page.expect_download(timeout=180000) as pending:
        page.get_by_role("button", name="Download").last.click()
    return pending.value
|
||||
|
||||
|
||||
def download(headless):
    """Download the SIPIQ CSV export and return the path of the saved file.

    Launches Chromium with the persistent profile in PROFILE, logs in if
    needed, runs the export and stores the file in OUT_DIR under a
    ``<timestamp> <suggested name>`` file name.

    Args:
        headless: run the browser without a window when true.

    Returns:
        Full path of the saved CSV file.

    Raises:
        Any exception from the browser automation is re-raised after an
        error screenshot has been attempted and the error printed to stderr.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    with sync_playwright() as pw:
        ctx = pw.chromium.launch_persistent_context(
            PROFILE, headless=headless, accept_downloads=True,
            args=["--start-maximized"], no_viewport=not headless)
        page = ctx.pages[0] if ctx.pages else ctx.new_page()
        try:
            ensure_login(page)
            open_data_table(page)
            dl = do_export(page)
            stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
            suggested = dl.suggested_filename or "sipiq_export.csv"
            target = os.path.join(OUT_DIR, f"{stamp} {suggested}")
            dl.save_as(target)
            print("Staženo:", target)
            return target
        except Exception as e:
            dbg(page, "99_error")
            print("CHYBA při stahování:", repr(e), file=sys.stderr)
            raise
        finally:
            # The settle delay can itself fail (e.g. the page was already
            # closed); the context must be closed regardless.
            try:
                page.wait_for_timeout(1500)
            finally:
                ctx.close()
|
||||
|
||||
|
||||
def run_import(csv_path, scope, dry_run):
    """Run the import script on *csv_path* and return its exit code.

    The child process uses the current interpreter and receives
    ``--csv <csv_path> --scope <scope>`` plus ``--dry-run`` when requested.
    A non-zero exit code is reported on stderr and returned to the caller.
    """
    command = [sys.executable, IMPORT_SCRIPT, "--csv", csv_path, "--scope", scope]
    if dry_run:
        command.append("--dry-run")

    print("\n=== IMPORT do Mongo ===")
    # Quote parts containing spaces so the echoed command is copy-pasteable.
    shown = []
    for part in command:
        shown.append(f'"{part}"' if " " in part else part)
    print("Spouštím:", " ".join(shown))

    exit_code = subprocess.call(command)
    if exit_code != 0:
        print(f"CHYBA: import skončil s kódem {exit_code}.", file=sys.stderr)
    return exit_code
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: download the CSV and optionally import it.

    With ``--import`` the process exits with the import script's exit code;
    otherwise a completion message is printed.
    """
    parser = argparse.ArgumentParser(description="Stažení SIPIQ CSV z Qualtrics (+ volitelný import)")
    parser.add_argument("--headless", action="store_true", help="bez okna (pro scheduler)")
    parser.add_argument("--import", dest="do_import", action="store_true",
                        help="po stažení rovnou naimportovat do Mongo (sipiq_import_v1.3 --csv)")
    parser.add_argument("--scope", choices=["czsk", "all"], default="czsk",
                        help="scope pro import (default czsk)")
    parser.add_argument("--dry-run", action="store_true",
                        help="import jen jako náhled (předá se sipiq_import --dry-run)")
    options = parser.parse_args()

    downloaded = download(options.headless)

    if not options.do_import:
        print("\nHOTOVO. (Import nespuštěn — přidej --import pro napojení na Mongo.)")
        return
    sys.exit(run_import(downloaded, options.scope, options.dry_run))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,30 @@
|
||||
# sipiq_import_v1.3 — import SIPIQ odpovědí do MongoDB (default = OSTŘE)
|
||||
|
||||
**Verze:** 1.3 · **Datum:** 2026-06-19 · **Studie:** 77242113UCO3002 (ICONIC / DAWN)
|
||||
|
||||
## Změna proti v1.2
|
||||
- **DEFAULT = OSTŘE.** Spuštění **bez argumentu** zapíše do Mongo a přesune CSV do `Zpracováno`
|
||||
(uživatel zapomínal na `--apply`). **Náhled jen s `--dry-run`.** `--apply` ponecháno jako no-op
|
||||
(zpětná kompatibilita — staré příkazy fungují dál). v1.2 → `Feasibility\TRASH`.
|
||||
|
||||
## Co dělá
|
||||
Import Qualtrics CSV exportu SIPIQ do db `feasibility` (kolekce `sipiq_questions` + `sipiq_responses`),
|
||||
delta (jen nové/změněné + `history[]`), soft-link na investigators, `source_exported_at` = mtime souboru.
|
||||
FOLDER workflow: sebere *.csv z `…\77242113UCO2001\ImportSIPIQcompled`, naimportuje, přesune do `Zpracováno\`.
|
||||
|
||||
## Použití
|
||||
```
|
||||
.venv\Scripts\python.exe Feasibility\sipiq_import_v1.3.py # OSTŘE (default) — zapíše + přesune
|
||||
.venv\Scripts\python.exe Feasibility\sipiq_import_v1.3.py --dry-run # jen náhled
|
||||
.venv\Scripts\python.exe Feasibility\sipiq_import_v1.3.py --csv "<cesta>" # jediný soubor (NEpřesouvá)
|
||||
.venv\Scripts\python.exe Feasibility\sipiq_import_v1.3.py --scope all # vč. ostatních zemí
|
||||
```
|
||||
`--apply` stále funguje (no-op). Mongo 192.168.1.76:27017, pymongo.
|
||||
|
||||
## Pozor
|
||||
Protože je default ostře, pouhé spuštění skriptu nad neprázdnou složkou **zapíše a přesune**.
|
||||
Pro bezpečný náhled používej `--dry-run`.
|
||||
|
||||
## Stav 19JUN2026
|
||||
Import exportu June+19 (18 odpovědí) proveden: **+2 nové** odpovědi (INSERT=2), zbytek beze změny,
|
||||
soubor přesunut do `Zpracováno`.
|
||||
@@ -0,0 +1,495 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
sipiq_import_v1.3.py
|
||||
====================
|
||||
Verze: 1.3
|
||||
Datum: 2026-06-19
|
||||
Autor: Claude Code (pro MUDr. Vladimíra Buzalku)
|
||||
|
||||
Změna proti v1.2
|
||||
----------------
|
||||
- DEFAULT = OSTŘE. Spuštění bez argumentu zapisuje do Mongo a přesune CSV do `Zpracováno`
|
||||
(uživatel zapomínal na `--apply`). Náhled JEN s `--dry-run`. `--apply` ponecháno jako
|
||||
no-op (zpětná kompatibilita). v1.2 v TRASH.
|
||||
|
||||
Změny proti v1.1
|
||||
----------------
|
||||
- PROVENANCE: ke každé odpovědi se ukládá `source_exported_at` = datum/čas reportu
|
||||
podle FILESYSTÉMU (mtime CSV souboru). Mimo content-hash → nezpůsobuje zbytečné
|
||||
UPDATE; backfilluje se i na "beze změny" cestě. Stará v1.1 ponechána v TRASH.
|
||||
|
||||
Změny proti v1.0
|
||||
----------------
|
||||
- FOLDER WORKFLOW (v1.1): režim --folder sebere *.csv ve složce, naimportuje (delta)
|
||||
a přesune do podsložky `Zpracováno`. Default složka =
|
||||
U:\\PythonProject\\Janssen\\Feasibility\\77242113UCO2001\\ImportSIPIQcompled.
|
||||
|
||||
Popis
|
||||
-----
|
||||
Import SIPIQ odpovědí (Qualtrics CSV export, studie 77242113UCO3002 / ICONIC DAWN)
|
||||
do MongoDB db `feasibility`. Dvě kolekce:
|
||||
* sipiq_questions – slovník dotazníku (1 dok = 1 logická otázka).
|
||||
* sipiq_responses – 1 dok = 1 odpověď (_id = Qualtrics ResponseId), ploché answers{},
|
||||
soft-link investigator_oid, delta bookkeeping + history[].
|
||||
|
||||
DELTA import (přepíše JEN změněná data): nová->insert; beze změn->jen last_seen_at;
|
||||
změna->$set jen změněných polí + push do history[].
|
||||
|
||||
Použití
|
||||
-------
|
||||
python sipiq_import_v1.3.py --dry-run # folder režim, default složka
|
||||
python sipiq_import_v1.3.py                       # OSTŘE (default); --apply je od v1.3 jen no-op
|
||||
python sipiq_import_v1.3.py --folder "<cesta>" --apply
|
||||
python sipiq_import_v1.3.py --csv "<cesta.csv>" --apply # jediný soubor (NEpřesouvá)
|
||||
|
||||
Závislosti: pymongo (.venv). Mongo 192.168.1.76:27017, bez auth.
|
||||
"""
|
||||
import argparse
|
||||
import csv
|
||||
import glob
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import unicodedata
|
||||
from datetime import datetime, timezone
|
||||
|
||||
try:
|
||||
from pymongo import MongoClient
|
||||
except ImportError:
|
||||
print("CHYBA: pymongo není nainstalován v aktuálním pythonu.", file=sys.stderr)
|
||||
raise
|
||||
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
DB_NAME = "feasibility"
|
||||
COL_Q = "sipiq_questions"
|
||||
COL_R = "sipiq_responses"
|
||||
DEFAULT_FOLDER = r"U:\PythonProject\Janssen\Feasibility\77242113UCO2001\ImportSIPIQcompled"
|
||||
PROCESSED_SUBDIR = "Zpracováno"
|
||||
|
||||
META_COLS = {
|
||||
"StartDate", "EndDate", "Status", "IPAddress", "Progress", "Duration (in seconds)",
|
||||
"Finished", "RecordedDate", "ResponseId", "RecipientLastName", "RecipientFirstName",
|
||||
"RecipientEmail", "ExternalReference", "LocationLatitude", "LocationLongitude",
|
||||
"DistributionChannel", "UserLanguage",
|
||||
}
|
||||
|
||||
PROMOTE = [
|
||||
"site_name", "site_address", "site_city", "site_state", "site_postcode", "site_country",
|
||||
"pi_first_name", "pi_last_name", "pi_phone", "pi_email",
|
||||
"sdl_site_id", "fire_site_id", "fire_investigator_id", "mailinglist_id",
|
||||
"survey_generated_by", "Date", "Time",
|
||||
]
|
||||
|
||||
# Maps a question number (the N in "QN...") to its questionnaire section.
SECTION_BY_QNUM = {}


def _sec(rng, name):
    """Register section *name* for every question number in *rng*."""
    SECTION_BY_QNUM.update(dict.fromkeys(rng, name))


_sec(range(2, 3), "J&J Internal Assessment")
_sec(range(6, 14), "Contact Information")
_sec(range(14, 22), "Confidentiality Statement")
_sec(range(25, 28), "Interest")
_sec(range(29, 35), "Protocol Requirements")
_sec(range(36, 39), "Enrollment")
_sec(range(40, 44), "Patient Demographics Overview")
_sec(range(45, 50), "Site Overview")
_sec(range(51, 52), "Operational Considerations")
_sec(range(53, 55), "Comments")
_sec(range(57, 62), "Patient Population")
_sec(range(63, 68), "Site Experience and Staffing")
_sec(range(69, 70), "Equipment and Facility Requirements")
_sec(range(71, 76), "Institutional Review Board, Ethics Committee, and Contracts")
|
||||
|
||||
STEM_OVERRIDE = {
|
||||
"Q31": "At your site, at what line(s) of treatment do you most commonly prescribe "
|
||||
"vedolizumab for patients with moderately to severely active ulcerative colitis?",
|
||||
"Q63": "Do you or your site staff have experience in performing the following types of "
|
||||
"study assessments/procedures?",
|
||||
"Q64": "The following personnel are required to run the study. "
|
||||
"Will your site have the following available?",
|
||||
"Q69": "The following equipment and facilities are required to run the studies. "
|
||||
"Are these available at your site?",
|
||||
}
|
||||
|
||||
|
||||
def now_iso():
    """Return the current local time as an ISO-8601 string with UTC offset.

    Precision is truncated to whole seconds.
    """
    local_now = datetime.now(timezone.utc).astimezone()
    return local_now.isoformat(timespec="seconds")
|
||||
|
||||
|
||||
def file_mtime_iso(path):
    """Return the modification time of *path* as a local ISO-8601 string.

    The value carries the local UTC offset and is truncated to seconds.
    """
    modified = datetime.fromtimestamp(os.path.getmtime(path))
    localized = modified.astimezone()
    return localized.isoformat(timespec="seconds")
|
||||
|
||||
|
||||
def strip_accents(s):
    """Return *s* with diacritics removed; falsy input yields ``""``.

    The text is NFKD-decomposed and every combining character is dropped.
    """
    if not s:
        return ""
    decomposed = unicodedata.normalize("NFKD", s)
    kept = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return "".join(kept)
|
||||
|
||||
|
||||
def norm_name(s):
    """Normalize a name for matching.

    Accents are stripped, the text is lower-cased, every whitespace run is
    collapsed to one space and the ends are trimmed. ``None`` gives ``""``.
    """
    lowered = strip_accents(s or "").lower()
    collapsed = re.sub(r"\s+", " ", lowered)
    return collapsed.strip()
|
||||
|
||||
|
||||
def sanitize_key(qcode):
    """Return *qcode* with every ``#`` and ``.`` replaced by ``_``.

    Used to derive the keys stored under ``answers`` from column codes.
    """
    return qcode.translate(str.maketrans("#.", "__"))
|
||||
|
||||
|
||||
def qnum(qcode):
    """Return the integer after a leading ``Q`` in *qcode*, else ``None``."""
    found = re.match(r"Q(\d+)", qcode)
    if found is None:
        return None
    return int(found.group(1))
|
||||
|
||||
|
||||
def qbase(qcode):
    """Return the leading ``Q<digits>`` part of *qcode*.

    Codes that do not start that way are returned unchanged.
    """
    found = re.match(r"(Q\d+)", qcode)
    if found is None:
        return qcode
    return found.group(1)
|
||||
|
||||
|
||||
def import_id(h3_cell):
    """Extract ``ImportId`` from a third-header-row cell.

    The cell is parsed as JSON; a missing ``ImportId`` key yields ``""``.
    If the cell cannot be parsed or is not a JSON object, the raw cell is
    returned unchanged.
    """
    try:
        parsed = json.loads(h3_cell)
        return parsed.get("ImportId", "")
    except Exception:
        # Best effort: keep the raw header text when it is not usable JSON.
        return h3_cell
|
||||
|
||||
|
||||
def split_text(text):
    """Split a column description into ``(stem, label)``.

    The text is cut on `` - `` separators. The first piece is the stem; the
    remaining pieces form the label, joined by `` - ``, after dropping
    "Selected Choice" (any case) and bare ``Q<n>#<m>`` codes. The label is
    ``None`` when nothing remains.
    """
    pieces = [piece.strip() for piece in re.split(r"\s+-\s+", text)]
    stem, tail = pieces[0], pieces[1:]
    if not tail:
        return stem, None
    kept = [
        piece for piece in tail
        if piece.lower() != "selected choice" and not re.fullmatch(r"Q\d+#\d+", piece)
    ]
    if kept:
        return stem, " - ".join(kept)
    return stem, None
|
||||
|
||||
|
||||
def detect_type(qcode, observed):
    """Classify a column from its code and the values observed in it.

    Codes containing ``#`` are matrix columns: ``matrix_yesno`` when every
    non-empty value is Yes/No, ``matrix_percent`` when every value is a
    number, otherwise ``matrix``. Other columns are ``numeric``, ``yesno``
    or ``single_or_text``. With no non-empty values neither the yes/no nor
    the numeric test applies.
    """
    values = [v for v in observed if v]
    is_yesno = bool(values) and all(v in ("Yes", "No") for v in values)
    is_numeric = bool(values) and all(
        re.fullmatch(r"-?\d+(\.\d+)?", v) for v in values
    )

    if "#" in qcode:
        if is_yesno:
            return "matrix_yesno"
        if is_numeric:
            return "matrix_percent"
        return "matrix"
    if is_numeric:
        return "numeric"
    if is_yesno:
        return "yesno"
    return "single_or_text"
|
||||
|
||||
|
||||
def load_csv(path):
    """Read an export CSV that starts with three header rows.

    Row 1 holds the column codes, row 2 the descriptions and row 3 the
    cells from which ``import_id`` extracts the question id. All following
    rows are data.

    Args:
        path: path of the CSV file (UTF-8, an optional BOM is accepted).

    Returns:
        ``(cols, data)`` where ``cols`` is a list of dicts with the keys
        ``i`` (column index), ``code``, ``text`` and ``qid``, and ``data``
        is the list of remaining rows.

    Raises:
        ValueError: the file has fewer than three rows, so it cannot be
            such an export (previously an opaque IndexError).
    """
    with open(path, encoding="utf-8-sig", newline="") as fh:
        rows = list(csv.reader(fh))
    if len(rows) < 3:
        raise ValueError(
            f"{path}: expected 3 header rows, found only {len(rows)} row(s)")
    h1, h2, h3 = rows[:3]
    data = rows[3:]
    cols = [{"i": i, "code": c, "text": t, "qid": import_id(j)}
            for i, (c, t, j) in enumerate(zip(h1, h2, h3))]
    return cols, data
|
||||
|
||||
|
||||
def col_getter(cols, data):
    """Build a cell accessor keyed by column code.

    Returns ``(get, idx)``. ``idx`` maps each column code to its index and
    ``get(row, code)`` returns the stripped cell, or ``""`` when the code is
    unknown or the row is too short. *data* is accepted but not used.
    """
    positions = {}
    for col in cols:
        positions[col["code"]] = col["i"]

    def get(row, code):
        pos = positions.get(code)
        if pos is None or pos >= len(row):
            return ""
        return row[pos].strip()

    return get, positions
|
||||
|
||||
|
||||
def is_question_col(code):
    """Return True when *code* starts with ``Q`` followed by a digit."""
    return re.match(r"Q\d", code) is not None
|
||||
|
||||
|
||||
def build_questions(cols, data):
    """Build the question dictionary (one document per logical question).

    Question columns are grouped by their ``Q<n>`` base code. Each group
    collects its items (one per column), QID prefixes, the values observed
    in *data* and a detected type per column.

    Args:
        cols: column descriptors as produced by ``load_csv``.
        data: data rows used to observe answer values.

    Returns:
        List of question documents in first-seen column order. Each has
        ``_id``, ``order``, ``qnum``, ``section``, ``qids``, ``text``,
        ``items``, ``type`` and ``options``.
    """
    qcols = [c for c in cols if is_question_col(c["code"])]

    # Collect the distinct non-empty values seen in every question column.
    observed = {c["code"]: set() for c in qcols}
    for row in data:
        for c in qcols:
            v = (row[c["i"]].strip() if c["i"] < len(row) else "")
            if v:
                observed[c["code"]].add(v)

    groups, order_seen = {}, []
    for c in qcols:
        base = qbase(c["code"])
        if base not in groups:
            groups[base] = {"_id": base, "order": c["i"], "qnum": qnum(c["code"]),
                            "section": SECTION_BY_QNUM.get(qnum(c["code"]), "Other"),
                            "qids": [], "text": split_text(c["text"])[0],
                            "items": [], "_obs": set(), "_types": []}
            order_seen.append(base)
        g = groups[base]
        bq = re.match(r"(QID\d+)", c["qid"] or "")
        if bq and bq.group(1) not in g["qids"]:
            g["qids"].append(bq.group(1))
        _, label = split_text(c["text"])
        item = {"key": sanitize_key(c["code"]), "qcode": c["code"], "qid": c["qid"]}
        if label:
            item["label"] = label
        g["items"].append(item)
        g["_obs"] |= observed[c["code"]]
        g["_types"].append(detect_type(c["code"], observed[c["code"]]))

    out = []
    for n, base in enumerate(order_seen):
        g = groups[base]
        obs = sorted(g.pop("_obs"))
        types = g.pop("_types")
        # Most frequent column type wins. Candidates are taken in first-seen
        # order (dict.fromkeys) so that ties resolve deterministically; a
        # set would make the winner depend on string hash randomization.
        gtype = (max(dict.fromkeys(types), key=types.count)
                 if types else "single_or_text")
        g["type"] = gtype
        if gtype in ("yesno", "matrix_yesno"):
            g["options"] = ["Yes", "No"]
        elif gtype == "single_or_text" and obs and len(obs) <= 12:
            # A small set of distinct values is recorded as the option list.
            g["options"] = obs
        else:
            g["options"] = []
        if base in STEM_OVERRIDE:
            g["text"] = STEM_OVERRIDE[base]
        g["order"] = n
        # A single unlabeled column needs no item list.
        if len(g["items"]) == 1 and "label" not in g["items"][0]:
            g["items"] = []
        out.append(g)
    return out
|
||||
|
||||
|
||||
def build_response(cols, get, row, source_file):
    """Build the response document for one CSV data row.

    Args:
        cols: column descriptors (dicts with ``i`` and ``code``).
        get: accessor ``get(row, code)`` returning a stripped cell or ``""``.
        row: the data row.
        source_file: name recorded in ``source_file``.

    Returns:
        Dict with ``_id`` (the ResponseId cell), site / PI / recipient
        fields, ``meta``, the flat ``answers`` mapping and unset link fields.
        Empty cells become ``None``; e-mail addresses are lower-cased.
    """
    def text(code):
        return get(row, code) or None

    def lowered(code):
        return (get(row, code) or "").lower() or None

    answers = {}
    for col in cols:
        if not is_question_col(col["code"]):
            continue
        pos = col["i"]
        value = row[pos].strip() if pos < len(row) else ""
        if value:
            answers[sanitize_key(col["code"])] = value

    # Progress stays a string when it is not purely digits.
    progress_raw = get(row, "Progress")
    if progress_raw.isdigit():
        progress = int(progress_raw)
    else:
        progress = progress_raw or None

    duration_raw = get(row, "Duration (in seconds)")
    duration = int(duration_raw) if duration_raw.isdigit() else None

    meta = {
        "start_date": text("StartDate"),
        "end_date": text("EndDate"),
        "recorded_date": text("RecordedDate"),
        "status": text("Status"),
        "progress": progress,
        "finished": get(row, "Finished") in ("True", "1", "TRUE"),
        "duration_sec": duration,
        "user_language": text("UserLanguage"),
        "distribution_channel": text("DistributionChannel"),
        "ip_address": text("IPAddress"),
        "location_lat": text("LocationLatitude"),
        "location_lng": text("LocationLongitude"),
        "survey_date": text("Date"),
        "survey_time": text("Time"),
    }

    full_prefixes = ("Q57", "Q58", "Q59", "Q63", "Q66", "Q71")
    return {
        "_id": get(row, "ResponseId"),
        "study": "77242113UCO3002",
        "site_country": text("site_country"),
        "site_name": text("site_name"),
        "site_city": text("site_city"),
        "site_state": text("site_state"),
        "site_postcode": text("site_postcode"),
        "site_address": text("site_address"),
        "pi_first_name": text("pi_first_name"),
        "pi_last_name": text("pi_last_name"),
        "pi_email": lowered("pi_email"),
        "pi_phone": text("pi_phone"),
        "sdl_site_id": text("sdl_site_id"),
        "fire_site_id": text("fire_site_id"),
        "fire_investigator_id": text("fire_investigator_id"),
        "mailinglist_id": text("mailinglist_id"),
        "survey_generated_by": text("survey_generated_by"),
        "recipient_email": lowered("RecipientEmail"),
        "recipient_last_name": text("RecipientLastName"),
        "recipient_first_name": text("RecipientFirstName"),
        "meta": meta,
        "is_full_sipiq": any(key.startswith(full_prefixes) for key in answers),
        "interested": answers.get("Q25"),
        "answers": answers,
        "investigator_oid": None,
        "investigator_match": None,
        "source_file": source_file,
    }
|
||||
|
||||
|
||||
def content_hash(doc):
    """Return the SHA-256 hex digest of the content-bearing part of *doc*.

    Bookkeeping, linking and provenance fields are left out, so changing
    only those does not change the hash. The remaining fields are serialized
    as key-sorted JSON (non-JSON values via ``str``) and hashed as UTF-8.
    """
    volatile = {
        "content_sha256", "first_imported_at", "last_seen_at", "last_updated_at",
        "history", "investigator_oid", "investigator_match", "source_file",
        "source_exported_at",
    }
    stable = {}
    for key, value in doc.items():
        if key not in volatile:
            stable[key] = value
    serialized = json.dumps(stable, sort_keys=True, ensure_ascii=False, default=str)
    return hashlib.sha256(serialized.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def load_investigators(db):
    """Load Czech and Slovak investigators and index them for matching.

    Returns:
        ``(inv, by_email, by_name)``. ``inv`` is the list of loaded
        documents. ``by_email`` maps each lower-cased ``email`` / ``email2``
        to the first document using it. ``by_name`` maps
        ``(normalized surname, country)`` to the list of matching documents.
    """
    country_filter = {"zeme": {"$in": ["Czech Republic", "Slovakia"]}}
    wanted_fields = {"prijmeni": 1, "jmeno": 1, "email": 1, "email2": 1, "zeme": 1, "KROK": 1}
    inv = list(db.investigators.find(country_filter, wanted_fields))

    by_email, by_name = {}, {}
    for person in inv:
        for field in ("email", "email2"):
            address = (person.get(field) or "").lower().strip()
            if address:
                # setdefault: the first investigator with an address keeps it.
                by_email.setdefault(address, person)
        surname = norm_name(person.get("prijmeni"))
        if surname:
            by_name.setdefault((surname, person.get("zeme")), []).append(person)
    return inv, by_email, by_name
|
||||
|
||||
|
||||
def soft_link(doc, by_email, by_name):
    """Find the investigator a response belongs to.

    Matching order: ``pi_email``, then ``recipient_email``, then a unique
    ``(normalized pi_last_name, site_country)`` entry in *by_name*.

    Returns:
        ``(investigator _id, how, investigator doc)``. When nothing matches,
        or the surname is ambiguous, the id and document are ``None`` and
        ``how`` explains why.
    """
    for field, tag in (("pi_email", "email"), ("recipient_email", "recipient_email")):
        address = (doc.get(field) or "").lower().strip()
        if address and address in by_email:
            hit = by_email[address]
            return hit["_id"], f"{tag}:{address}", hit

    surname = norm_name(doc.get("pi_last_name"))
    candidates = by_name.get((surname, doc.get("site_country")), [])
    if len(candidates) == 1:
        only = candidates[0]
        return only["_id"], f"prijmeni:{surname}", only
    if len(candidates) > 1:
        return None, f"prijmeni_ambiguous:{surname}({len(candidates)})", None
    return None, "NENALEZENO", None
|
||||
|
||||
|
||||
def diff_docs(old, new):
    """List the field-level differences between two response documents.

    ``answers`` and ``meta`` are compared key by key (recursing into nested
    dicts, keys in sorted order), followed by a fixed set of top-level
    fields. Each difference is ``{"key": dotted path, "old": ..., "new": ...}``;
    a key missing on one side is reported with ``None`` for that side.
    """
    found = []

    def compare(path, before, after):
        before = before or {}
        after = after or {}
        for name in sorted(set(before.keys()) | set(after.keys())):
            left = before.get(name)
            right = after.get(name)
            if isinstance(left, dict) or isinstance(right, dict):
                compare(f"{path}{name}.", left or {}, right or {})
                continue
            if left != right:
                found.append({"key": f"{path}{name}", "old": left, "new": right})

    for section in ("answers", "meta"):
        compare(f"{section}.", old.get(section, {}), new.get(section, {}))

    for name in ("site_name", "pi_email", "pi_last_name", "interested", "is_full_sipiq"):
        left, right = old.get(name), new.get(name)
        if left != right:
            found.append({"key": name, "old": left, "new": right})
    return found
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
def process_file(db, csv_path, scope, dry, by_email, by_name):
    """Import one CSV export as a delta against the responses collection.

    The question dictionary is always built from all rows of the file; the
    responses are limited to Czech Republic / Slovakia when *scope* is
    ``"czsk"``. Every response is soft-linked to an investigator, stamped
    with the file's modification time and content-hashed.

    Args:
        db: database handle providing the COL_Q and COL_R collections.
        csv_path: path of the CSV file.
        scope: ``"czsk"`` or ``"all"``.
        dry: when true, only print the preview and write nothing.
        by_email, by_name: investigator indexes from ``load_investigators``.

    Returns:
        Dict with the counts ``insert``, ``update``, ``unchanged`` and the
        flag ``wrote``. A dry run returns zero counts and ``wrote=False``.
    """
    source_file = os.path.basename(csv_path)
    exported_at = file_mtime_iso(csv_path)  # report date/time taken from the file's mtime
    # Parse the file once: the unfiltered rows feed the question dictionary,
    # the scope-filtered rows feed the responses.
    cols, data_all = load_csv(csv_path)
    get, _ = col_getter(cols, data_all)
    if scope == "czsk":
        data = [r for r in data_all if get(r, "site_country") in ("Czech Republic", "Slovakia")]
    else:
        data = data_all
    print(f"\n########## {source_file} (rozsah={scope}, odpovědí={len(data)}, export={exported_at}) ##########")

    questions = build_questions(cols, data_all)

    docs, link_rows = [], []
    for r in data:
        doc = build_response(cols, get, r, source_file)
        oid, how, matched = soft_link(doc, by_email, by_name)
        doc["investigator_oid"] = oid
        doc["investigator_match"] = how
        doc["source_exported_at"] = exported_at
        doc["content_sha256"] = content_hash(doc)
        docs.append(doc)
        link_rows.append((doc, how, matched))

    # Preview of the delta, based on the hashes currently stored.
    existing = {d["_id"]: d for d in db[COL_R].find({}, {"content_sha256": 1})}
    to_insert = [d for d in docs if d["_id"] not in existing]
    to_update = [d for d in docs if d["_id"] in existing and existing[d["_id"]].get("content_sha256") != d["content_sha256"]]
    unchanged = [d for d in docs if d["_id"] in existing and existing[d["_id"]].get("content_sha256") == d["content_sha256"]]

    mk7 = mko = un = 0
    for doc, how, m in link_rows:
        krok = (m or {}).get("KROK", "")
        if m and str(krok).startswith("7"):
            mk7 += 1
        elif m:
            mko += 1
        else:
            un += 1
    print(f" slovník: {len(questions)} otázek | soft-link: KROK7={mk7}, jiný={mko}, nenapárováno={un}")
    print(f" delta: INSERT={len(to_insert)}, UPDATE={len(to_update)}, beze změny={len(unchanged)}")
    if un:
        for doc, how, m in link_rows:
            if not m:
                print(f" ✗ NENAPÁROVÁNO: {doc.get('pi_last_name')} / {doc.get('pi_email')} ({how})")

    if dry:
        print(" [DRY-RUN] nezapsáno")
        return {"insert": 0, "update": 0, "unchanged": 0, "wrote": False}

    for q in questions:
        db[COL_Q].replace_one({"_id": q["_id"]}, q, upsert=True)
    ts = now_iso()
    ni = nu = ns = 0
    for d in docs:
        cur = db[COL_R].find_one({"_id": d["_id"]})
        if cur is None:
            d.update({"first_imported_at": ts, "last_seen_at": ts, "last_updated_at": ts, "history": []})
            db[COL_R].insert_one(d)
            ni += 1
        elif cur.get("content_sha256") != d["content_sha256"]:
            changes = diff_docs(cur, d)
            db[COL_R].update_one({"_id": d["_id"]}, {
                "$set": {**{k: d[k] for k in d if k != "_id"}, "last_seen_at": ts, "last_updated_at": ts},
                "$push": {"history": {"changed_at": ts, "source_file": source_file, "changes": changes}}})
            nu += 1
        else:
            # Content unchanged: refresh only the bookkeeping / provenance fields.
            db[COL_R].update_one({"_id": d["_id"]}, {"$set": {
                "last_seen_at": ts, "source_file": source_file, "source_exported_at": d["source_exported_at"]}})
            ns += 1
    print(f" [APPLY] questions upsert={len(questions)} | responses insert={ni}, update={nu}, beze změny={ns}")
    return {"insert": ni, "update": nu, "unchanged": ns, "wrote": True}
|
||||
|
||||
|
||||
def move_to_processed(csv_path, folder):
    """Move *csv_path* into the processed sub-folder of *folder*.

    The sub-folder is created when missing. If a file of the same name is
    already there, ``_1``, ``_2``, ... is appended to the stem until a free
    name is found.
    """
    target_dir = os.path.join(folder, PROCESSED_SUBDIR)
    os.makedirs(target_dir, exist_ok=True)

    filename = os.path.basename(csv_path)
    target = os.path.join(target_dir, filename)
    if os.path.exists(target):
        stem, ext = os.path.splitext(filename)
        suffix = 1
        target = os.path.join(target_dir, f"{stem}_{suffix}{ext}")
        while os.path.exists(target):
            suffix += 1
            target = os.path.join(target_dir, f"{stem}_{suffix}{ext}")

    shutil.move(csv_path, target)
    print(f" -> přesunuto do {PROCESSED_SUBDIR}\\{os.path.basename(target)}")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point of the importer.

    Without ``--dry-run`` the run is live: it writes to the database and, in
    folder mode, moves each written file to the processed sub-folder.
    ``--csv`` processes a single file and never moves it. ``--apply`` is
    accepted for backward compatibility and has no effect.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", help="jediný soubor (NEpřesouvá)")
    ap.add_argument("--folder", default=DEFAULT_FOLDER, help="složka se SIPIQ CSV (přesune do Zpracováno)")
    ap.add_argument("--scope", choices=["czsk", "all"], default="czsk")
    ap.add_argument("--dry-run", action="store_true", help="jen náhled, NEzapisuje (DEFAULT je ostře)")
    ap.add_argument("--apply", action="store_true", help="(zpětná kompatibilita; ostře je už default)")
    args = ap.parse_args()
    dry = args.dry_run  # default is a live run; preview only with --dry-run

    if args.csv:
        files, move_mode, folder = [args.csv], False, None
    else:
        folder = args.folder
        files = sorted(glob.glob(os.path.join(folder, "*.csv")))
        move_mode = True
        print(f"Složka: {folder}\nNalezeno CSV ke zpracování: {len(files)}")
        if not files:
            print("Nic ke zpracování (žádné *.csv).")
            return

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=8000)
    try:
        db = client[DB_NAME]
        client.admin.command("ping")  # fail fast when the server is unreachable
        inv, by_email, by_name = load_investigators(db)
        print(f"Investigatorů CZ+SK v DB: {len(inv)}")

        total = {"insert": 0, "update": 0, "unchanged": 0}
        for f in files:
            res = process_file(db, f, args.scope, dry, by_email, by_name)
            for k in total:
                total[k] += res[k]
            if move_mode and res["wrote"]:
                move_to_processed(f, folder)

        print(f"\n=== CELKEM: insert={total['insert']}, update={total['update']}, beze změny={total['unchanged']} ===")
        if dry:
            print("[DRY-RUN] Nic se nezapsalo ani nepřesunulo. Ostrý běh = spusť BEZ --dry-run.")
    finally:
        # Release the connection even when a file fails to import.
        client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,28 @@
|
||||
# sipiq_send_v1.0 — první rozeslání SIPIQ (.eml draft + KROK 6)
|
||||
|
||||
**Verze:** 1.0 · **Datum:** 2026-06-19 · **Studie:** 77242113UCO3002 (ICONIC / DAWN)
|
||||
|
||||
## Co dělá
|
||||
Vygeneruje `.eml` draft **prvního rozeslání** feasibility dotazníku SIPIQ lékaři po
|
||||
podpisu CDA (KROK 5). Tělo: poděkování za CDA → odkaz na dotazník specifický pro centrum
|
||||
(link z Trilia, poznámka „SIPIQ" `hAMNUnUQdCRn`) → prosba o vyplnění. Skloňování dle pohlaví.
|
||||
**Tělo base64, BEZ hlavičky From** (JNJ Outlook doplní odesílatele z GAL → odešle přes JNJ
|
||||
Exchange). To = email (+ email2, dd-proxy vynechány), **Cc** = koordinátorky (AKocourk+EBartoso).
|
||||
|
||||
S `--apply` zároveň: uloží `sipiq.link` + `sipiq.odeslano` do investigators a posune na
|
||||
**KROK „6 - SIPIQ odeslán"** + předřadí řádek do STATUS.
|
||||
|
||||
## Použití
|
||||
```
|
||||
.venv\Scripts\python.exe Feasibility\sipiq_send_v1.0.py --name Leksa --link "https://..." # dry-run
|
||||
.venv\Scripts\python.exe Feasibility\sipiq_send_v1.0.py --name Leksa --link "https://..." --apply # .eml + Mongo
|
||||
```
|
||||
Výstup `…\UploadToJNJ\sipiq_odeslani_<prijmeni>_<DDMMMYYYY>.eml`. Mongo 192.168.1.76:27017.
|
||||
|
||||
## Pozor
|
||||
Draft ≠ odesláno — uživatel link odešle ručně z JNJ Outlooku. KROK 6 se ale na přání zapisuje
|
||||
rovnou při generování.
|
||||
|
||||
## Běh 19JUN2026 (--apply)
|
||||
**Leksa Václav** — KROK 5 → 6, link z Trilia uložen do `sipiq.link`,
|
||||
draft `sipiq_odeslani_leksa_19JUN2026.eml`.
|
||||
@@ -0,0 +1,133 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
sipiq_send_v1.0.py
|
||||
==================
|
||||
Verze: 1.0
|
||||
Datum: 2026-06-19
|
||||
Autor: Claude Code (pro MUDr. Vladimíra Buzalku)
|
||||
|
||||
Popis
|
||||
-----
|
||||
Vygeneruje .eml draft PRVNÍHO ROZESLÁNÍ feasibility dotazníku SIPIQ ke studii
|
||||
77242113UCO3002 (DAWN, icotrokinra, ulcerózní kolitida) lékaři, který podepsal CDA
|
||||
(KROK 5). Tělo: poděkování za podpis CDA + zaslání odkazu na dotazník specifický
|
||||
pro centrum (link z Trilia / argumentu) + nabídka pomoci.
|
||||
|
||||
Zároveň (s --apply):
|
||||
- uloží odkaz do investigators.sipiq.link (aby fungovaly pozdější připomínky),
|
||||
- posune lékaře na KROK "6 - SIPIQ odeslán" a předřadí řádek do STATUS.
|
||||
|
||||
To = email + email2 (dd-proxy vynechány), Cc = koordinátorky, BEZ hlavičky From
|
||||
(JNJ Outlook doplní odesílatele z GAL a odešle přes JNJ Exchange). Tělo BASE64.
|
||||
Draft ≠ odesláno — STATUS / KROK 6 se ale na přání píše rovnou (uživatel link odešle ručně).
|
||||
|
||||
Použití:
|
||||
python sipiq_send_v1.0.py --name Leksa --link "https://...." # dry-run
|
||||
python sipiq_send_v1.0.py --name Leksa --link "https://...." --apply # .eml + Mongo
|
||||
Mongo 192.168.1.76:27017, bez auth, pymongo.
|
||||
"""
|
||||
import argparse
|
||||
import os
|
||||
import unicodedata
|
||||
from datetime import date
|
||||
from email.message import EmailMessage
|
||||
|
||||
from pymongo import MongoClient
|
||||
|
||||
# Directory the generated .eml draft is written to (created by main() if missing).
OUT_DIR = r"u:\Dropbox\!!!Days\Downloads Z230\UploadToJNJ"
# Value of the Cc header set on the draft.
CC = "AKocourk@ITS.JNJ.com, EBartoso@its.jnj.com"
# HTML signature placed after the closing salutation by build_body().
SIG = ("MUDr. Vladimír BUZALKA<br>"
       "ICON plc / Performing Local Trial Management Services for Janssen – Cilag s.r.o. "
       "/ Global Clinical Operations<br>"
       "Mobile: +420 775 735 276 / Fax: +420 227 012 284<br>"
       "E-mail: vbuzalka@its.jnj.com, vladimir.buzalka@iconplc.com")

# Czech month names keyed by month number; main() uses them to format the
# date string stored in sipiq.odeslano.
CZ_MON = {1: "ledna", 2: "února", 3: "března", 4: "dubna", 5: "května", 6: "června",
          7: "července", 8: "srpna", 9: "září", 10: "října", 11: "listopadu", 12: "prosince"}
|
||||
|
||||
|
||||
def ascii_slug(s):
    """Return *s* lower-cased, with combining marks and spaces removed.

    The value is converted with ``str()`` first; any falsy input (``None``,
    ``""``, ``0``) yields an empty string. Characters other than combining
    marks and spaces (hyphens, digits, ...) are kept as they are.
    """
    decomposed = unicodedata.normalize("NFKD", str(s or ""))
    # NFKD splits an accented letter into base letter + combining mark;
    # dropping the marks leaves the plain base letters.
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    lowered = "".join(base_chars).lower()
    return lowered.replace(" ", "")
|
||||
|
||||
|
||||
def is_female(prijmeni):
    """Guess from the surname ending whether the person is a woman.

    Returns True when the lower-cased surname ends with "ová", "ova" or "á";
    a missing or empty surname gives False.
    """
    surname = prijmeni.lower() if prijmeni else ""
    return surname.endswith(("ová", "ova", "á"))
|
||||
|
||||
|
||||
def build_body(prijmeni, link):
    """Build the HTML fragment (a run of <p> elements) for the SIPIQ e-mail.

    Args:
        prijmeni: surname of the addressee; is_female() decides which
            greeting is used.
        link: URL of the questionnaire. It is HTML-escaped before being
            placed in the ``href`` attribute and in the visible link text,
            so characters such as ``&`` or ``"`` cannot break the markup.

    Returns:
        The HTML fragment as a string, ending with the SIG signature.
    """
    import html  # local import so the module's import header stays untouched

    greet = "Vážená paní doktorko," if is_female(prijmeni) else "Vážený pane doktore,"
    # quote=True also escapes " and ', which matters inside the href attribute.
    safe_link = html.escape(str(link), quote=True)
    return (
        f'<p>{greet}</p>'
        f'<p>velmi děkuji za podpis dohody o mlčenlivosti (CDA) ke studii '
        f'<b>77242113UCO3002 (DAWN)</b> (přípravek icotrokinra, ulcerózní kolitida).</p>'
        f'<p>Dovoluji si Vám nyní zaslat feasibility dotazník <b>SIPIQ</b>, který je '
        f'klíčový pro posouzení vhodnosti Vašeho centra a další postup. Níže najdete '
        f'odkaz na dotazník specifický pro Vaše centrum:</p>'
        f'<p><a href="{safe_link}">{safe_link}</a></p>'
        f'<p><b>Budu velmi rád za jeho vyplnění v co nejkratším termínu.</b> V případě '
        f'jakýchkoli dotazů (např. potíže s odkazem) jsem Vám plně k dispozici.</p>'
        f'<p>Předem děkuji za Váš čas a vstřícnost.</p>'
        f'<p>S pozdravem,<br>{SIG}</p>'
    )
|
||||
|
||||
|
||||
def main():
    """Generate the SIPIQ .eml draft for one investigator and record it in Mongo.

    Looks up the investigator by surname (``--name``), prints a preview of
    the recipients, the link and the KROK change, and stops there unless
    ``--apply`` is given. With ``--apply`` it writes the draft into OUT_DIR
    and updates the investigator document: KROK, a new line prepended to
    STATUS, ``sipiq.link`` and ``sipiq.odeslano``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--name", required=True, help="příjmení (pi v investigators)")
    ap.add_argument("--link", required=True, help="SIPIQ odkaz specifický pro centrum")
    ap.add_argument("--apply", action="store_true")
    args = ap.parse_args()

    client = MongoClient("mongodb://192.168.1.76:27017", serverSelectionTimeoutMS=8000)
    try:
        db = client.feasibility
        d = db.investigators.find_one({"prijmeni": args.name})
        if not d:
            print(f" ! nenalezeno: {args.name}")
            return

        tos = [e for e in [d.get("email"), d.get("email2")] if e]
        # dict.fromkeys de-duplicates while keeping order; dd-proxy addresses are left out.
        tos = [e for e in dict.fromkeys(tos) if "dd-proxy" not in e.lower()]
        to = ", ".join(tos)
        body = build_body(d["prijmeni"], args.link)
        today = date.today()
        today_cz = f"{today.day}. {CZ_MON[today.month]} {today.year}"
        print(f" {d.get('jmeno')} {d['prijmeni']} | To: {to} | Cc: {CC}")
        print(f" Link: {args.link}")
        print(f" KROK: {d.get('KROK')} -> 6 - SIPIQ odeslán")

        if not args.apply:
            print("\n[DRY-RUN] nic nezapsáno. Ostrý: --apply")
            return

        subject = "77242113UCO3002 (DAWN) — feasibility dotazník SIPIQ k vyplnění"
        msg = EmailMessage()
        # Deliberately no From header (see the module docstring).
        msg["To"] = to
        msg["Cc"] = CC
        msg["Subject"] = subject
        msg["X-Unsent"] = "1"
        html = (f'<html><body style="font-family:Calibri,Arial,sans-serif;font-size:11pt">'
                f'{body}</body></html>')
        msg.set_content(html, subtype="html", charset="utf-8", cte="base64")
        os.makedirs(OUT_DIR, exist_ok=True)
        # Date tag in DDMMMYYYY form with a fixed English month abbreviation,
        # built by hand so it does not depend on the process locale.
        date_tag = today.strftime("%d") + ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG",
                                           "SEP", "OCT", "NOV", "DEC"][today.month - 1] + today.strftime("%Y")
        fn = f"sipiq_odeslani_{ascii_slug(d['prijmeni'])}_{date_tag}.eml"
        with open(os.path.join(OUT_DIR, fn), "wb") as fh:
            fh.write(bytes(msg))

        old_status = d.get("STATUS", "") or ""
        new_line = (f"{date_tag}: SIPIQ ROZESLÁN — odkaz specifický pro centrum zaslán lékaři "
                    f"(.eml draft {fn}, Cc koordinátorky). KROK 6.")
        db.investigators.update_one({"_id": d["_id"]}, {"$set": {
            "KROK": "6 - SIPIQ odeslán",
            # Newest entry goes first; the previous STATUS text follows it.
            "STATUS": new_line + "\n" + old_status,
            "sipiq.link": args.link,
            "sipiq.odeslano": today_cz,
        }})
        print(f"\n[APPLY] Draft: {fn} | sipiq.link uložen | KROK 6")
    finally:
        # Close the connection on every exit path (not found, dry-run, error, success).
        client.close()
|
||||
|
||||
|
||||
# Run main() only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,33 @@
|
||||
# store_cda_seaweed_v1.1.py
|
||||
|
||||
**Verze:** 1.1 · **Datum:** 2026-06-19
|
||||
|
||||
## Účel
|
||||
Uloží podepsané CDA (PDF) z e-mailů asistentek (CTA) do Mongo
|
||||
`feasibility.investigators` do pole `cda.*` a posune lékaře na
|
||||
`KROK "5 - CDA podepsano"`. PDF se stahuje přímo ze SeaweedFS přes
|
||||
`seaweed_url`, který parser ukládá k příloze v `emaily."vbuzalka@its.jnj.com"`
|
||||
(`attachments[].seaweed_url` + `sha256`).
|
||||
|
||||
## Změna proti v1.0
|
||||
- STATUS řádek a datum z konstanty `DATE`.
|
||||
- `MAPPING` = explicitní párování investigator → CDA příloha (edituje se před každým během).
|
||||
- v1.0 → `Feasibility\TRASH`.
|
||||
|
||||
## Jak to funguje
|
||||
Pro každý záznam: stáhne PDF (urllib), ověří **SHA256 + velikost + PDF hlavičku**,
|
||||
base64-zakóduje a uloží do `cda` (`data_base64, data_sha256, data_filename, …`).
|
||||
Nastaví `KROK = "5 - CDA podepsano"` a předřadí řádek do `STATUS`.
|
||||
|
||||
## Použití
|
||||
```
|
||||
.venv\Scripts\python.exe Feasibility\store_cda_seaweed_v1.1.py # dry-run
|
||||
.venv\Scripts\python.exe Feasibility\store_cda_seaweed_v1.1.py --apply # zápis
|
||||
```
|
||||
|
||||
## Běh 19JUN2026 (--apply)
|
||||
Uloženo 1/1 (SHA256 OK): **Leksa Václav** (CZ_CDA PI fully signed 19Jun2026),
|
||||
KROK 4 → 5. Zdroj e-mail CTA Hrabalové.
|
||||
|
||||
## Závislosti
|
||||
`pymongo`, `bson` (+ stdlib). SeaweedFS `192.168.1.50:8888`. Mongo `192.168.1.76:27017`.
|
||||
@@ -0,0 +1,105 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# =============================================================================
|
||||
# Nazev: store_cda_seaweed_v1.1.py
|
||||
# Verze: 1.1
|
||||
# Datum: 2026-06-19
|
||||
# Popis: Ulozi podepsane CDA (PDF) z e-mailu asistentek do Mongo
|
||||
# feasibility.investigators do pole cda.* a posune lekare na
|
||||
# KROK "5 - CDA podepsano". PDF se stahuji primo ze SeaweedFS
|
||||
# (seaweed_url z attachments v emaily."vbuzalka@its.jnj.com"),
|
||||
# overuje se SHA256 proti metadatum z Mongo.
|
||||
# Pouziti: python store_cda_seaweed_v1.1.py (dry-run / nahled)
|
||||
# python store_cda_seaweed_v1.1.py --apply (zapise do Mongo)
|
||||
# Zmena: v1.1 - STATUS radek + datum bere z DATE; MAPPING = Leksa Vaclav.
|
||||
# v1.0 -> TRASH.
|
||||
# Pozn.: MAPPING nize = explicitni parovani investigator -> CDA priloha.
|
||||
# Jen stdlib + pymongo. SeaweedFS host 192.168.1.50:8888.
|
||||
# =============================================================================
|
||||
|
||||
import sys
|
||||
import base64
|
||||
import hashlib
|
||||
import urllib.request
|
||||
from datetime import datetime, timezone
|
||||
from pymongo import MongoClient
|
||||
from bson import ObjectId
|
||||
|
||||
# Connection string passed to MongoClient in main().
MONGO_URI = "mongodb://192.168.1.76:27017"
# Database and collection that main() reads and updates.
DBN, COL = "feasibility", "investigators"
# Date tag that main() puts at the start of the new STATUS line.
DATE = "19JUN2026"

# One tuple per CDA to store:
# (investigator _id, seaweed_url, filename, sha256, size, source_msg_id, label)
# NOTE(review): the file header says "MAPPING = Leksa Vaclav", but the only
# entry below is Reif Stanislav with a file dated 18Jun2026 while DATE is
# 19JUN2026 — confirm the pairing and the date before running with --apply.
MAPPING = [
    ("6a268cdeb84bf5597759b478",
     "http://192.168.1.50:8888/mail-attachments/b5/c8/b5c8677c335f77e2b3184aca71628393bf30bd843334edfdecd32b544e91882d",
     "CZ_CDA PI_MUDr. Stanislav Reif_ICO_fully signed_18Jun2026.pdf",
     "b5c8677c335f77e2b3184aca71628393bf30bd843334edfdecd32b544e91882d",
     476306, "<CH2PR07MB7190C02CBCFF82E500D792B980E32@CH2PR07MB7190.namprd07.prod.outlook.com>",
     "Reif Stanislav"),
]
|
||||
|
||||
|
||||
def fetch(url, timeout=30):
    """Download *url* and return the complete response body as bytes.

    Args:
        url: address to open with ``urllib.request.urlopen``.
        timeout: timeout in seconds passed to ``urlopen``; defaults to the
            previously hard-coded 30.

    Raises:
        Whatever ``urlopen`` / ``read`` raise (e.g. ``urllib.error.URLError``);
        nothing is caught here, the caller decides how to report it.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read()
|
||||
|
||||
|
||||
def main():
    """Store the signed CDA PDFs listed in MAPPING in the investigators collection.

    Each MAPPING entry is downloaded with fetch() and checked against the
    expected SHA-256, the expected byte size and the ``%PDF-`` file header;
    an entry failing any check is skipped. Without ``--apply`` in sys.argv
    the run is a preview that only counts the entries that passed. With
    ``--apply`` the investigator document is updated: KROK is set to
    "5 - CDA podepsano", a new line is prepended to STATUS and the
    base64-encoded PDF plus its metadata are written under ``cda.*``.
    """
    apply = "--apply" in sys.argv
    # Same server-selection timeout as the other Mongo clients in this source,
    # so an unreachable server is reported quickly.
    cli = MongoClient(MONGO_URI, serverSelectionTimeoutMS=8000)
    try:
        col = cli[DBN][COL]
        # One timestamp for the whole run, stored as cda.data_stored_at.
        now = datetime.now(timezone.utc).isoformat()

        ok = 0
        for _id, url, fname, sha, size, src, label in MAPPING:
            oid = ObjectId(_id)
            doc = col.find_one({"_id": oid}, {"STATUS": 1, "KROK": 1, "cda.stav": 1})
            if not doc:
                print(f" !! {label}: investigator _id={_id} NENALEZEN")
                continue
            try:
                raw = fetch(url)
            except Exception as e:
                # Best effort per entry: report the failure and go on with the next one.
                print(f" !! {label}: stazeni selhalo: {e}")
                continue
            got = hashlib.sha256(raw).hexdigest()
            sha_ok = (got == sha)
            size_ok = (len(raw) == size)
            head_ok = raw[:5] == b"%PDF-"
            print(f" [{label}]")
            print(f" soubor : {fname}")
            print(f" stazeno : {len(raw)} B (ocek. {size}) {'OK' if size_ok else 'MISMATCH'}")
            print(f" sha256 : {'OK' if sha_ok else 'MISMATCH! ' + got}")
            print(f" PDF hdr : {'OK' if head_ok else 'NENI PDF'}")
            print(f" KROK : {doc.get('KROK')} -> 5 - CDA podepsano")
            if not (sha_ok and size_ok and head_ok):
                print(" >> PRESKAKUJI (kontrola selhala)")
                continue
            if not apply:
                # Preview: the entry passed all checks, nothing is written.
                ok += 1
                continue

            b64 = base64.b64encode(raw).decode("ascii")
            old_status = doc.get("STATUS", "") or ""
            new_line = (f"{DATE}: podepsane CDA ULOZENO do Mongo (cda.data) — {fname} "
                        f"(z e-mailu asistentky CTA). KROK 5, pripraveno na SIPIQ.")
            col.update_one({"_id": oid}, {"$set": {
                "KROK": "5 - CDA podepsano",
                # Newest entry goes first; the previous STATUS text follows it.
                "STATUS": new_line + "\n" + old_status,
                "cda.stav": "podepsano",
                "cda.soubor": fname,
                "cda.zdroj": "e-mail asistentky (SeaweedFS)",
                "cda.data_base64": b64,
                "cda.data_sha256": sha,
                "cda.data_filename": fname,
                "cda.data_mime": "application/pdf",
                "cda.data_size": len(raw),
                "cda.data_stored_at": now,
                "cda.data_source_msg": src,
            }})
            ok += 1
            print(" >> ULOZENO + KROK 5")

        print(f"\n{'ZAPSANO' if apply else 'DRY-RUN OK'}: {ok}/{len(MAPPING)}")
        if not apply:
            print(">>> Pro zapis spust s --apply")
    finally:
        # Release the connection on every exit path, including errors.
        cli.close()
|
||||
|
||||
|
||||
# Run main() only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user