# janssen/Feasibility/analyze_sent_suspects_v1.0.py

# -*- coding: utf-8 -*-
# =============================================================================
# Nazev:   analyze_sent_suspects_v1.0.py
# Verze:   1.0
# Datum:   2026-06-16
# Popis:   LOKALNI (Z230) analyzator .msg souboru prenesenych z JNJ (vystup
#          jnj_scan_failed_sent). Pres olefile precte u kazdeho .msg klicove
#          MAPI vlastnosti a klasifikuje, zda jde o NEODESLANY e-mail:
#            FAIL_BODY  = telo/report obsahuje "could not be sent"/"SendAsDenied"
#            SENDAS_BUZ = send-account / sentrep / sender obsahuje "buzalka.cz"
#            NO_MSGID   = chybi Internet Message-ID (0x1035)
#          Vytahne prijemce (externi = lekar), subjekt, send-account, Message-ID.
#          Vystup: prehled do konzole + timestampovany .xlsx.
# Pouziti: python analyze_sent_suspects_v1.0.py [SLOZKA_S_MSG]
#          (default INPUT_DIR nize). Vyzaduje olefile + openpyxl.
# =============================================================================

import os
import re
import sys
import glob
import datetime
import olefile
import openpyxl

# Folder scanned for *.msg files when no folder is given on the command line.
INPUT_DIR = r"U:\Dropbox\!!!Days\Downloads Z230\sent_suspects"
# Folder that receives the timestamped .xlsx report.
OUT_DIR = r"U:\Dropbox\!!!Days\Downloads Z230"

# Substrings searched for in the lower-cased message text; any hit marks the
# message as a failed send (flag FAIL_BODY). Keep the entries lower-case.
FAIL_SIGNS = [
    "could not be sent",
    "sendasdenied",
    "permission to send the message on behalf",
    "transportsend operation has failed",
    "mapiexceptionsendasdenied",
]

# Address fragments treated as internal, i.e. not a physician (this also
# covers the cc'd colleagues Kocourkova/Bartosova).
INTERNAL = ("its.jnj.com",)


def rd(o, tag):
    """Read the string stream ``__substg1.0_<tag>`` and return it as text.

    The tag is tried exactly as given first, then with its last hex digit
    replaced by ``F`` (type 001F, decoded as UTF-16-LE) and by ``E``
    (type 001E, 8-bit text).

    Args:
        o: Open OLE container; only ``exists(name)`` and ``openstream(name)``
            are used.
        tag: Stream suffix such as ``"0037001F"``.

    Returns:
        The decoded text with trailing NUL characters removed, or ``""`` when
        none of the candidate streams exists.
    """
    # dict.fromkeys removes the duplicate produced when `tag` already ends in
    # F or E, while preserving the lookup order.
    candidates = dict.fromkeys((tag, tag[:-1] + "F", tag[:-1] + "E"))
    for variant in candidates:
        name = "__substg1.0_" + variant
        if not o.exists(name):
            continue
        raw = o.openstream(name).read()

        # latin-1 accepts every byte sequence, so it has to be the last
        # resort. Strict UTF-8 goes first because it rejects almost all
        # non-UTF-8 8-bit text, which then falls through to cp1250.
        encodings = ("utf-8", "cp1250", "latin-1")
        if variant.endswith("001F"):
            encodings = ("utf-16-le",) + encodings

        for enc in encodings:
            try:
                text = raw.decode(enc)
            except UnicodeDecodeError:
                continue
            # NUL padding would survive the callers' .strip() and make an
            # empty value look non-empty.
            return text.rstrip("\x00")
    return ""


def read_body(o):
    """Return the message text that is later searched for failure markers.

    Sources are consulted in order and the first non-empty one wins:
    stream 1000001F (PR_BODY), stream 1001001F (report text), and finally the
    binary stream 10130102 (PR_HTML) decoded as latin-1 with undecodable
    bytes ignored. Returns ``""`` when no source yields any text.
    """
    html_stream = "__substg1.0_10130102"

    text = rd(o, "1000001F") or rd(o, "1001001F")
    if text:
        return text

    # Binary HTML is only a fallback; a read failure is deliberately treated
    # the same as "no body available".
    if o.exists(html_stream):
        try:
            return o.openstream(html_stream).read().decode("latin-1", "ignore") or ""
        except Exception:
            pass
    return ""


def recipients_smtp(o):
    """Collect one SMTP address per ``__recip_version1.0_#xxxx`` storage.

    Within every recipient storage the candidate streams are probed in a
    fixed order and the first value containing ``@`` is taken. Storages with
    no such value contribute nothing.

    Args:
        o: Open OLE container providing ``listdir()``, ``exists(name)`` and
            ``openstream(name)``.

    Returns:
        List of address strings, in the order the storages are first listed.
    """
    prefix = "__recip_version1.0_#"
    candidate_tags = ("39FE001F", "39FE001E", "3003001F", "3003001E", "0C1F001F")

    # Each listed entry is a list of path segments; only two-segment paths
    # (storage/stream) count. dict.fromkeys keeps one key per storage in
    # first-seen order.
    storages = dict.fromkeys(
        path[0]
        for path in o.listdir()
        if path and path[0].startswith(prefix) and len(path) == 2
    )

    addresses = []
    for storage in storages:
        for tag in candidate_tags:
            stream_name = storage + "/__substg1.0_" + tag
            if not o.exists(stream_name):
                continue
            raw = o.openstream(stream_name).read()
            try:
                if tag.endswith("1F"):
                    value = raw.decode("utf-16-le")
                else:
                    value = raw.decode("cp1250")
            except Exception:
                value = raw.decode("latin-1", "ignore")
            value = value.strip()
            # Values without "@" are not SMTP addresses; try the next tag.
            if "@" in value:
                addresses.append(value)
                break
    return addresses


def analyze_file(path):
    """Classify a single .msg file and build its report row.

    Flags set on the message:
        FAIL_BODY  - the lower-cased body/report text contains one of FAIL_SIGNS
        SENDAS_BUZ - send account, sent-representing or sender value contains
                     "buzalka.cz"
        NO_MSGID   - the Internet Message-ID stream (0x1035) is missing or
                     contains only whitespace

    Args:
        path: Path of the .msg file.

    Returns:
        Dict with the keys ``file``, ``date``, ``recipient``, ``subject``,
        ``msgid``, ``send_account``, ``sentrep``, ``flags`` (flag names joined
        by ``+``) and ``failed`` (``"ANO"`` when FAIL_BODY or SENDAS_BUZ is
        set, otherwise ``"?"``).

    Raises:
        Whatever ``olefile.OleFileIO`` or the stream reads raise for an
        unreadable file; the caller is expected to handle it.
    """
    o = olefile.OleFileIO(path)
    try:
        subject = rd(o, "0037001F")
        # Strip before the emptiness check so that a whitespace-only value is
        # flagged NO_MSGID instead of showing up as an unflagged empty cell.
        msgid = rd(o, "1035001F").strip()
        sendacct = rd(o, "0E28001F")
        sentrep = rd(o, "0065001F")
        sender = rd(o, "0C1F001F")
        body = read_body(o)
        recs = recipients_smtp(o)
    finally:
        o.close()

    low = body.lower()
    flags = []
    if any(s in low for s in FAIL_SIGNS):
        flags.append("FAIL_BODY")
    joined = " ".join([sendacct, sentrep, sender]).lower()
    if "buzalka.cz" in joined:
        flags.append("SENDAS_BUZ")
    if not msgid:
        flags.append("NO_MSGID")

    # Physician recipient = first external address (not matching INTERNAL);
    # fall back to the first recipient of any kind.
    ext = [r for r in recs if not any(d in r.lower() for d in INTERNAL)]
    recipient = ext[0] if ext else (recs[0] if recs else "")

    # Date is taken from the file name (STRONG_YYYY-MM-DD_... / weak_YYYY-MM-DD_...).
    m = re.search(r"(\d{4}-\d{2}-\d{2})", os.path.basename(path))
    date = m.group(1) if m else ""

    return {
        "file": os.path.basename(path),
        "date": date,
        "recipient": recipient,
        "subject": subject.strip(),
        "msgid": msgid,
        "send_account": sendacct.strip(),
        "sentrep": sentrep.strip(),
        "flags": "+".join(flags),
        "failed": "ANO" if ("FAIL_BODY" in flags or "SENDAS_BUZ" in flags) else "?",
    }


def main():
    """Analyze every .msg in the input folder, print a summary, write an .xlsx.

    The input folder is ``sys.argv[1]`` when given, otherwise INPUT_DIR.
    Files that cannot be analyzed produce a placeholder row whose ``flags``
    column carries the error text, so one bad file does not abort the run.
    The workbook is saved into OUT_DIR under a timestamped name.
    """
    indir = sys.argv[1] if len(sys.argv) > 1 else INPUT_DIR
    files = sorted(glob.glob(os.path.join(indir, "*.msg")))
    if not files:
        print("Zadne .msg v:", indir)
        return

    rows = []
    for f in files:
        try:
            rows.append(analyze_file(f))
        except Exception as e:
            # Keep the batch going; the error is surfaced in the report row.
            rows.append({"file": os.path.basename(f), "date": "", "recipient": "",
                         "subject": "<chyba cteni>", "msgid": "", "send_account": "",
                         "sentrep": "", "flags": "ERR:" + str(e), "failed": "?"})

    # Order: certain failures first, then by date.
    rows.sort(key=lambda r: (r["failed"] != "ANO", r["date"]))

    n_fail = sum(1 for r in rows if r["failed"] == "ANO")
    n_sendas = sum(1 for r in rows if "SENDAS_BUZ" in r["flags"])
    n_failbody = sum(1 for r in rows if "FAIL_BODY" in r["flags"])
    # "Only NO_MSGID" has to be counted per row: subtracting n_fail from the
    # NO_MSGID total is wrong because failed rows need not carry NO_MSGID.
    n_only_nomid = sum(
        1 for r in rows if "NO_MSGID" in r["flags"] and r["failed"] != "ANO"
    )

    print(f"Souboru: {len(rows)}")
    print(f"  jiste selhane (FAIL_BODY/SENDAS_BUZ): {n_fail}")
    print(f"  z toho SENDAS_BUZ (buzalka.cz): {n_sendas} | FAIL_BODY: {n_failbody}")
    print(f"  jen NO_MSGID (slabe): {n_only_nomid}")
    print("=" * 110)
    print(f"{'datum':10} {'prijemce':32} {'fail':4} {'flags':22} subjekt")
    print("-" * 110)
    for r in rows:
        print(f"{r['date']:10} {r['recipient'][:32]:32} {r['failed']:4} {r['flags']:22} {r['subject'][:40]}")

    # xlsx report
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "suspects"
    cols = ["file", "date", "recipient", "subject", "msgid", "send_account", "sentrep", "flags", "failed"]
    from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

    def clean(v):
        # Remove everything matched by openpyxl's ILLEGAL_CHARACTERS_RE
        # before the value is written to a cell.
        return ILLEGAL_CHARACTERS_RE.sub("", str(v)) if v is not None else ""

    ws.append(cols)
    for r in rows:
        ws.append([clean(r[c]) for c in cols])
    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    out = os.path.join(OUT_DIR, f"sent_suspects_analyza_{stamp}.xlsx")
    wb.save(out)
    print("\nXLSX:", out)


if __name__ == "__main__":
    main()