Files
janssen/Feasibility/77242113UCO2001/TRASH/store_cda_batch_v1.1.py
T

148 lines
5.7 KiB
Python

# -*- coding: utf-8 -*-
# =============================================================================
# Nazev: store_cda_batch_v1.1.py
# Verze: 1.1
# Datum: 2026-06-11
# Popis: Davkove ulozi binarky CDA (PDF) do Mongo k investigatorum
# (feasibility.investigators -> cda.data_*). Zdroj = .msg soubory na
# Toweru (/mnt/user/JNJEMAILS), stazene pres SFTP, priloha vytazena
# extract_msg. Mapovani investigator -> (.msg, attachment) je
# explicitni (zadne hadani za behu). Drzi se domluvy: fyzicky
# dokument z e-mailu -> do Mongo (CDA fyzicky ulozeno k lekarum).
# Zapise: cda.data_base64, cda.data_sha256, cda.data_filename,
# cda.data_mime, cda.data_size, cda.data_stored_at,
# cda.data_source_msg; doplni cda.soubor pokud chybi.
# Existujici cda.* (stav, datum_*, zdroj, poznamka) NEMENI.
# Pouziti: python store_cda_batch_v1.1.py (dry-run / nahled)
# python store_cda_batch_v1.1.py --apply (zapise do Mongo)
# Zmeny v1.1: DAVKA 4 (11JUN2026) - Konecny Michal + Balaz Jozef (krok 4 -> 5).
# =============================================================================
import os
import sys
import base64
import hashlib
import unicodedata
import paramiko
import extract_msg
from pymongo import MongoClient
from bson import ObjectId
# --- Connection / path configuration -----------------------------------------
# Every connection setting can be overridden by an environment variable of the
# same name; the literal is only the fallback, so existing invocations keep
# working unchanged.
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://192.168.1.76:27017")
TOWER_HOST = os.environ.get("TOWER_HOST", "192.168.1.76")
TOWER_USER = os.environ.get("TOWER_USER", "root")
# SECURITY: a root password committed to source control should be treated as
# compromised. Prefer supplying TOWER_PASS via the environment (or switch to
# key-based auth), rotate this credential and then drop the fallback literal.
TOWER_PASS = os.environ.get("TOWER_PASS", "7309208104")
# Remote directory on the Tower that holds the exported .msg files.
REMOTE_DIR = "/mnt/user/JNJEMAILS"
# Local cache directory for downloaded .msg files.
TMPDIR = r"u:\Dropbox\!!!Days\Downloads Z230\_cda_tmp"
# Value written to cda.data_stored_at for every document of this batch.
STORED_AT = "2026-06-11"
# Explicit batch mapping, one 4-tuple per investigator:
#   (investigator _id, .msg file name, attachment file name, display label)
# BATCH 4 (11JUN2026): new CDAs from 10 June (step 4 -> 5).
MAPPING = [
    (
        "6a19832b5fc221351825796f",  # investigator _id
        "FC130007DE92C2310000.msg",  # source e-mail
        "CZ_CDA Institution_MUDr. Michal Konecný, Ph.D. s.r.o._fully signed 10Jun2026.pdf",
        "Konecny Michal (MUDr. Michal Konecny, Ph.D. s.r.o.)",
    ),
    (
        "6a19832b5fc2213518257953",  # investigator _id
        "FC130007DE92C20F0000.msg",  # source e-mail
        "SK_CDA PI_MUDr. Jozef Balaz_FD Roosevelta_BB_10Jun2026.pdf",
        "Balaz Jozef (FNsP F. D. Roosevelta Banska Bystrica)",
    ),
]
# HISTORY of earlier batches (already stored):
# BATCH 3 (10JUN2026): Gregar/MUDr.GREGAR FC130007DE92C204, Durina/FN Nove Zamky
# FC130007DE92C203, Horvath/Accout Center FC130007DE92C1FE.
# BATCH 1+2 (09JUN2026): Hlavaty/Cliniq FC1300053049739C, Fedurco/ENDOMED
# FC1300053049739B, Tichy FC13000530495B95, Falc FC130007D8A1F0E6, Pesta
# FC130007D8A1F0E1, Jungwirthova FC130007D8A1F0E2, Lukac FC130007C9E971FF
# (store_cda_to_mongo_v1.0), Matous/Axon FC130007D8A1F0E3, Mihalkanin/GastroLM
# FC130007D8A1F0E6, Krizova/Motol FC130007C1643CA1.
# NOTE(review): FC130007D8A1F0E6 is listed for both Falc and Mihalkanin/GastroLM
# above - confirm whether one of the two message IDs is a typo.
def norm(s):
    """Return *s* lowercased, without diacritics and with whitespace collapsed.

    The text is NFKD-decomposed so that accents become separate combining
    characters, which are then dropped. Leading/trailing whitespace is removed
    and every internal whitespace run becomes a single space. A falsy input
    (``None`` or ``""``) yields ``""``.
    """
    decomposed = unicodedata.normalize("NFKD", s if s else "")
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    words = "".join(base_chars).lower().split()
    return " ".join(words)
def _download_msg(sftp, msg_name):
    """Return the local cache path of *msg_name*, fetching it over SFTP if absent.

    The file is downloaded under a temporary ``.part`` name and renamed only
    after the transfer finished, so an interrupted download can never be
    mistaken for a valid cached .msg on the next run.
    """
    local_msg = os.path.join(TMPDIR, msg_name)
    if os.path.exists(local_msg):
        return local_msg
    partial = local_msg + ".part"
    try:
        sftp.get(f"{REMOTE_DIR}/{msg_name}", partial)
        os.replace(partial, local_msg)
    finally:
        # Only still present when the transfer or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)
    return local_msg


def _pick_attachment(attachments, att_name):
    """Return ``(filename, data)`` of the attachment matching *att_name*, or None.

    An exact match of the normalized names always wins. Otherwise the first
    attachment accepted by the looser containment rules is used: the expected
    name is contained in the attachment name, or the attachment name is
    contained in the expected one and ends with ``.pdf``. Attachments whose
    payload is not raw bytes cannot be hashed or base64-encoded and are skipped.
    """
    target = norm(att_name)
    fallback = None
    for att in attachments:
        name = att.longFilename or att.shortFilename or ""
        candidate = norm(name)
        exact = candidate == target
        loose = target in candidate or (
            candidate in target and name.lower().endswith(".pdf")
        )
        if not (exact or loose):
            continue
        data = att.data
        if not isinstance(data, (bytes, bytearray)):
            continue
        if exact:
            return name, bytes(data)
        if fallback is None:
            fallback = (name, bytes(data))
    return fallback


def _build_plan(sftp):
    """Resolve every MAPPING entry to its attachment bytes.

    Returns a list of tuples
    ``(inv_id, label, msg_name, att_name, info, status)`` where ``info`` is
    ``(size, sha256_hex, raw_bytes)`` when the attachment was found and
    ``None`` otherwise.
    """
    plan = []
    for inv_id, msg_name, att_name, label in MAPPING:
        local_msg = _download_msg(sftp, msg_name)
        m = extract_msg.Message(local_msg)
        try:
            chosen = _pick_attachment(m.attachments, att_name)
        finally:
            m.close()
        if not chosen:
            plan.append((inv_id, label, msg_name, att_name, None, "!!! PRILOHA NENALEZENA"))
            continue
        found_name, raw = chosen
        sha = hashlib.sha256(raw).hexdigest()
        plan.append((inv_id, label, msg_name, found_name, (len(raw), sha, raw), "OK"))
    return plan


def main():
    """Store the CDA PDFs listed in MAPPING on their investigator documents.

    Steps:
      1. Download the .msg files from the Tower over SFTP (cached in TMPDIR)
         and extract the mapped attachment from each.
      2. Print a preview of the batch, including whether the investigator
         already has ``cda.data_base64``.
      3. Only when ``--apply`` is on the command line, write the ``cda.data_*``
         fields to ``feasibility.investigators``.

    ``cda.soubor`` is written only when the document has no value for it yet,
    as the file header specifies; other existing ``cda.*`` fields are not
    touched. All connections are closed even when a step raises.
    """
    apply = "--apply" in sys.argv
    os.makedirs(TMPDIR, exist_ok=True)

    # SSH/SFTP
    ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts any host key without verification.
    # Kept for compatibility; consider loading known host keys instead.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(TOWER_HOST, username=TOWER_USER, password=TOWER_PASS, timeout=30)
        sftp = ssh.open_sftp()
        try:
            plan = _build_plan(sftp)
        finally:
            sftp.close()
    finally:
        ssh.close()

    client = MongoClient(MONGO_URI)
    try:
        col = client["feasibility"]["investigators"]

        # Preview
        print("=== NAHLED DAVKY (CDA -> Mongo cda.data) ===\n")
        docs = {}
        for inv_id, label, msg_name, att_name, info, status in plan:
            doc = col.find_one(
                {"_id": ObjectId(inv_id)},
                {"prijmeni": 1, "jmeno": 1, "cda.data_base64": 1, "cda.soubor": 1},
            )
            docs[inv_id] = doc
            # "cda" may be absent or null; treat both as an empty sub-document.
            has = bool(doc and (doc.get("cda") or {}).get("data_base64"))
            print(f"[{status}] {label} (_id {inv_id})")
            print(f" .msg: {msg_name}")
            print(f" priloha: {att_name}")
            if info:
                print(f" velikost: {info[0]} B sha256: {info[1]}")
            if doc is None:
                print(" !!! INVESTIGATOR V MONGO NENALEZEN")
            print(f" data_base64 jiz existuje: {has}")
            print()

        if not apply:
            print(">>> DRY-RUN. Pro zapis spust s --apply")
            return

        n = 0
        for inv_id, label, msg_name, att_name, info, status in plan:
            if status != "OK" or not info:
                print(f"PRESKAKUJI {label}: {status}")
                continue
            doc = docs.get(inv_id)
            if doc is None:
                print(f"PRESKAKUJI {label}: !!! INVESTIGATOR V MONGO NENALEZEN")
                continue
            size, sha, raw = info
            fields = {
                "cda.data_base64": base64.b64encode(raw).decode("ascii"),
                "cda.data_sha256": sha,
                "cda.data_filename": att_name,
                "cda.data_mime": "application/pdf",
                "cda.data_size": size,
                "cda.data_stored_at": STORED_AT,
                "cda.data_source_msg": msg_name,
            }
            # Fill in cda.soubor only when it is missing; never overwrite it.
            if not (doc.get("cda") or {}).get("soubor"):
                fields["cda.soubor"] = att_name
            res = col.update_one({"_id": ObjectId(inv_id)}, {"$set": fields})
            if res.matched_count == 0:
                # The document disappeared between the preview and the write.
                print(f"PRESKAKUJI {label}: !!! INVESTIGATOR V MONGO NENALEZEN")
                continue
            n += res.modified_count
            print(f"ZAPSANO: {label} (modified={res.modified_count})")
        print(f"\n>>> CELKEM ZAPSANO: {n}")
    finally:
        client.close()


# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()