janssen/EmailsImport/mcp_emaily.py

#!/usr/bin/env python3
"""
==============================================================================
MCP server: EMAILY  (vsechny schranky importovane z Microsoft Graph)

Hybridni dotaz nad:
  - PostgreSQL  192.168.1.76  db=MongoEmaily  tabulka=emails
                (fulltext tsvector - subject + sender + recipients +
                 attachments + body, GIN index, ts_headline, ts_rank)
  - MongoDB     192.168.1.76  db=emaily       kolekce=<mailbox>
                (puvodni dokumenty z parse_emails_graph_v1.3.py:
                 headers, body_html, recipients[], attachments[], ...)

Source: U:\\janssen\\EmailsImport\\enrich_fulltext_emails_v1.0.py

Spusteni:
    python mcp_emaily.py        (stdio MCP)
==============================================================================
"""

from __future__ import annotations

import os
import re
import sys
import traceback
from datetime import datetime, timezone, timedelta
from typing import Optional, Union

import psycopg
from mcp.server.fastmcp import FastMCP
from pymongo import MongoClient

# Connection settings. Each endpoint may be overridden through an environment
# variable (EMAILY_MONGO_URI / EMAILY_PG_DSN); the literals are fallbacks that
# keep existing deployments working unchanged. An empty variable is ignored.
MONGO_URI = os.environ.get("EMAILY_MONGO_URI") or "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"

# SECURITY NOTE(review): the fallback DSN embeds a database password in the
# source. Prefer supplying EMAILY_PG_DSN from the environment / a secret store
# and rotating this credential; the literal is kept only for compatibility.
PG_DSN = os.environ.get("EMAILY_PG_DSN") or (
    "host=192.168.1.76 port=5432 dbname=MongoEmaily "
    "user=vladimir.buzalka password=Vlado7309208104++")

# Default and maximum number of body characters returned by read_email.
DEFAULT_BODY_CHARS = 8000
MAX_BODY_CHARS = 200_000

# Collections in MONGO_DB that are not treated as mailboxes.
SKIP_COLLECTIONS = {"attachments_index", "sync_state"}


def log(msg: str) -> None:
    """Print *msg* followed by a newline to stderr and flush right away.

    Diagnostics never go to stdout (presumably because stdout carries the
    stdio MCP stream mentioned in the module docstring).
    """
    print(msg, end="\n", flush=True, file=sys.stderr)


# Fail fast at startup: both backends must be reachable, otherwise the process
# exits with status 1. `mongo` stays bound as the shared module-level client.
try:
    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    mongo.admin.command("ping")
except Exception as exc:
    log(f"Mongo connection failed: {exc}")
    sys.exit(1)
else:
    log(f"Mongo OK ({MONGO_URI})")

# Postgres is only probed here with a throw-away connection; the tools open
# their own connections through pg_conn().
try:
    _t = psycopg.connect(PG_DSN, connect_timeout=10)
    _t.close()
except Exception as exc:
    log(f"Postgres connection failed: {exc}")
    sys.exit(1)
else:
    log("Postgres OK")


def pg_conn():
    """Open and return a new PostgreSQL connection to ``PG_DSN``.

    Every call creates a fresh connection (10 second connect timeout); callers
    in this file use it as a context manager.
    """
    connection = psycopg.connect(PG_DSN, connect_timeout=10)
    return connection


def serialize(obj):
    """Recursively convert *obj* into a more JSON-friendly structure.

    dicts and lists are rebuilt with their members converted, ``datetime``
    becomes an ISO-8601 string, ``bytes`` are decoded as UTF-8 (undecodable
    bytes are replaced). Any other value is returned unchanged.
    """
    if isinstance(obj, dict):
        return {key: serialize(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return list(map(serialize, obj))
    if isinstance(obj, datetime):
        return obj.isoformat()
    if isinstance(obj, bytes):
        return obj.decode("utf-8", errors="replace")
    return obj


def normalize_mailbox(mailbox: Optional[Union[str, list]]) -> Optional[list[str]]:
    """Normalise the `mailbox` tool argument to a list of names, or None.

    ``None``, ``""`` and ``[]`` mean "no mailbox filter" and yield None.
    A single string becomes a one-element list; any other iterable is copied
    into a new list.
    """
    if mailbox is None or mailbox in ("", []):
        return None
    return [mailbox] if isinstance(mailbox, str) else list(mailbox)


def parse_since(s: Optional[str]) -> Optional[datetime]:
    """Parse a date/datetime filter string into a timezone-aware datetime.

    Accepted forms:
      - ``YYYY-MM-DD``                 -> midnight UTC of that day
      - ISO datetime containing ``T``  -> parsed with ``datetime.fromisoformat``
        (a trailing ``Z`` is accepted as UTC)

    A datetime given without an offset is interpreted as UTC, the same way
    date-only input is, so the result is always timezone-aware. Leading and
    trailing whitespace is ignored.

    Returns None for ``None`` / empty input.
    Raises ValueError (chained to the underlying error) for unparseable input.
    """
    if not s:
        return None
    try:
        text = s.strip()
        if "T" in text:
            parsed = datetime.fromisoformat(text.replace("Z", "+00:00"))
        else:
            parsed = datetime.strptime(text, "%Y-%m-%d")
    except Exception as e:
        raise ValueError(f"Bad date {s!r}: {e}") from e
    if parsed.tzinfo is None:
        # Never hand a naive datetime to the timestamptz comparisons.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


# --- MCP --------------------------------------------------------------------
# FastMCP server instance named "emaily"; the functions below are registered
# on it through the @mcp.tool() decorator.
mcp = FastMCP("emaily")


@mcp.tool()
def ping() -> dict:
    """Quick health check. Reports Mongo + Postgres connectivity, total mailboxes,
    PG indexed emails count, ok/error breakdown.

    Returns a dict with ``status`` "ok" plus ``mongo_version``, ``mailboxes``,
    ``mongo_email_count`` and ``pg_indexed_per_mailbox``. On any failure the
    traceback is logged and ``{"status": "error", "error": <message>}`` is
    returned instead of raising.
    """
    try:
        info = mongo.admin.command("buildInfo")
        db = mongo[MONGO_DB]
        # Every collection except the bookkeeping ones is treated as a mailbox.
        mailboxes = [name for name in db.list_collection_names()
                     if name not in SKIP_COLLECTIONS]
        mongo_counts = {mb: db[mb].estimated_document_count() for mb in mailboxes}
        with pg_conn() as pg, pg.cursor() as cur:
            cur.execute("SELECT mailbox, ok, count(*) FROM emails "
                        "GROUP BY mailbox, ok ORDER BY mailbox, ok")
            rows = cur.fetchall()
        pg_summary: dict = {}
        for mb, ok, c in rows:
            # A FALSE group and a NULL group both map to "error"; add them up
            # instead of letting the later group overwrite the earlier one.
            bucket = pg_summary.setdefault(mb, {})
            key = "ok" if ok else "error"
            bucket[key] = bucket.get(key, 0) + c
        return {
            "status": "ok",
            "mongo_version": info.get("version"),
            "mailboxes": mailboxes,
            "mongo_email_count": mongo_counts,
            "pg_indexed_per_mailbox": pg_summary,
        }
    except Exception as e:
        log(traceback.format_exc())
        return {"status": "error", "error": str(e)}


@mcp.tool()
def list_mailboxes() -> dict:
    """Overview of all mailboxes — totals, indexed coverage, earliest/latest received_at,
    top senders by volume. Use to understand the corpus before searching.

    Returns ``{"mailboxes": {<mailbox>: {...}}}`` with, per mailbox:
    indexed_ok, indexed_total, with_attachments, first_received, last_received
    and top_senders (the 5 most frequent sender_email values). On failure the
    traceback is logged and ``{"error": <message>}`` is returned.
    """
    stats_sql = """
        SELECT count(*) FILTER (WHERE ok) AS ok,
               count(*) AS total,
               min(received_at) AS first_at,
               max(received_at) AS last_at,
               count(*) FILTER (WHERE has_attachments) AS with_att
        FROM emails WHERE mailbox = %s
    """
    senders_sql = """
        SELECT sender_email, count(*) c FROM emails
        WHERE mailbox = %s AND sender_email IS NOT NULL
        GROUP BY sender_email ORDER BY c DESC LIMIT 5
    """
    try:
        mailboxes = [c for c in mongo[MONGO_DB].list_collection_names()
                     if c not in SKIP_COLLECTIONS]
        out = {}
        # One PostgreSQL connection serves every mailbox; opening a new
        # connection per mailbox inside the loop is unnecessary overhead.
        with pg_conn() as pg, pg.cursor() as cur:
            for mb in mailboxes:
                cur.execute(stats_sql, (mb,))
                ok, total, first_at, last_at, with_att = cur.fetchone()
                cur.execute(senders_sql, (mb,))
                top_senders = [{"email": s, "count": c} for s, c in cur.fetchall()]
                out[mb] = {
                    "indexed_ok": ok,
                    "indexed_total": total,
                    "with_attachments": with_att,
                    "first_received": serialize(first_at),
                    "last_received": serialize(last_at),
                    "top_senders": top_senders,
                }
        return {"mailboxes": out}
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e)}


@mcp.tool()
def search(
    query: str,
    mailbox: Optional[Union[str, list]] = None,
    since: Optional[str] = None,
    until: Optional[str] = None,
    days: Optional[int] = None,
    inflect: bool = False,
    folder_contains: Optional[str] = None,
    sender_contains: Optional[str] = None,
    has_attachments: Optional[bool] = None,
    limit: int = 20,
) -> dict:
    """PRIMARY TOOL — fulltext search across all indexed emails.

    Index includes: subject, sender (email + name), recipients (to/cc),
    attachment filenames, AND full body text. Diacritics are stripped at index
    time, so search is already accent-insensitive (recept == řecept == RECEPT).

    query: websearch_to_tsquery syntax (only when inflect=False):
        invoice payment           -> AND
        "lot expiration"          -> phrase
        SAE OR "serious adverse"  -> OR
        urgent -newsletter        -> exclude
    mailbox: one mailbox string or list (e.g. "vbuzalka@its.jnj.com"). None = all.
    since/until: ISO date "YYYY-MM-DD" on received_at (since inclusive, until exclusive)
    days: convenience window — only the last N days (overrides `since`).
        Czech phrasing: "za posledních X dní" ("in the last X days").
    inflect: Czech declension. The index uses no stemmer, so a plain search for
        "recept" misses "recepty/receptu/receptem/...". With inflect=True each word
        in `query` is prefix-matched (recept -> recept:*) and AND-ed, catching the
        other grammatical cases. Set this for Czech-word searches. Trade-off: a
        prefix also matches unrelated longer words (recept:* also hits "receptor").
        In this mode the query is treated as plain words (operators/quotes ignored).
        If the query contains no word characters, the normal websearch mode is used.
    folder_contains: substring match against folder_path (case-insensitive)
    sender_contains: substring match against sender_email OR sender_name (case-insensitive)
    has_attachments: True / False / None (any)
    limit: max 100

    Returns ranked results with `snippet` showing matches highlighted as <<...>>.
    Use `read_email` to fetch full body of any hit.
    On failure the traceback is logged and {"error": ..., "query": ...} is returned.
    """
    try:
        mboxes = normalize_mailbox(mailbox)
        since_dt = parse_since(since)
        until_dt = parse_since(until)
        if days and days > 0:
            since_dt = datetime.now(timezone.utc) - timedelta(days=days)
        limit = min(max(1, limit), 100)

        # Build the tsquery. inflect=True → prefix-match each word (Czech cases)
        # via to_tsquery; otherwise use websearch_to_tsquery for full operator support.
        tsq_func = "websearch_to_tsquery"
        tsq_text = query
        if inflect:
            # \w+ tokens cannot contain tsquery operators, so the joined text
            # is always syntactically valid input for to_tsquery.
            tokens = re.findall(r"\w+", query, flags=re.UNICODE)
            if tokens:
                tsq_func = "to_tsquery"
                tsq_text = " & ".join(f"{t}:*" for t in tokens)

        # Only the function name (one of two fixed literals) is interpolated;
        # every user-supplied value is passed as a bound parameter.
        sql = f"""
        WITH q AS (
            SELECT {tsq_func}('soubory'::regconfig, %(query)s) AS tsq
        )
        SELECT
            e.id, e.mailbox, e.message_id, e.conversation_id, e.folder_path,
            e.subject, e.sender_email, e.sender_name,
            e.to_addrs, e.cc_addrs,
            e.received_at, e.sent_at, e.is_read,
            e.has_attachments, e.attachment_count, e.attachments_summary,
            e.body_length, e.body_source,
            ts_rank(e.tsv, q.tsq) AS rank,
            ts_headline('soubory'::regconfig,
                left(coalesce(e.body, e.subject), 200000),
                q.tsq,
                'MaxFragments=3, MinWords=4, MaxWords=18, '
                'StartSel=<<, StopSel=>>, FragmentDelimiter= ... ') AS snippet
        FROM emails e, q
        WHERE e.tsv @@ q.tsq
          AND e.ok = TRUE
          AND (%(mboxes)s::text[] IS NULL OR e.mailbox = ANY(%(mboxes)s::text[]))
          AND (%(since)s::timestamptz IS NULL OR e.received_at >= %(since)s::timestamptz)
          AND (%(until)s::timestamptz IS NULL OR e.received_at <  %(until)s::timestamptz)
          AND (%(folder)s::text IS NULL OR e.folder_path ILIKE %(folder_like)s)
          AND (%(sender)s::text IS NULL
               OR e.sender_email ILIKE %(sender_like)s
               OR e.sender_name  ILIKE %(sender_like)s)
          AND (%(has_att)s::boolean IS NULL OR e.has_attachments = %(has_att)s::boolean)
        ORDER BY rank DESC, e.received_at DESC NULLS LAST
        LIMIT %(limit)s
        """
        params = {
            # Bind the text that matches tsq_func: the prefix expression when
            # inflect is active, the raw query otherwise.
            "query": tsq_text, "mboxes": mboxes,
            "since": since_dt, "until": until_dt,
            "folder": folder_contains,
            "folder_like": f"%{folder_contains}%" if folder_contains else None,
            "sender": sender_contains,
            "sender_like": f"%{sender_contains}%" if sender_contains else None,
            "has_att": has_attachments,
            "limit": limit,
        }
        with pg_conn() as pg, pg.cursor() as cur:
            cur.execute(sql, params)
            cols = [c.name for c in cur.description]
            rows = [dict(zip(cols, r)) for r in cur.fetchall()]

        results = []
        for r in rows:
            results.append({
                "mailbox": r["mailbox"],
                "message_id": r["message_id"],
                "conversation_id": r["conversation_id"],
                "folder": r["folder_path"],
                "subject": r["subject"],
                "from": (f"{r['sender_name']} <{r['sender_email']}>"
                         if r["sender_name"] else r["sender_email"]),
                "to": r["to_addrs"],
                "cc": r["cc_addrs"],
                "received_at": serialize(r["received_at"]),
                "is_read": r["is_read"],
                "has_attachments": r["has_attachments"],
                "attachment_count": r["attachment_count"],
                "attachments": r["attachments_summary"],
                "body_length": r["body_length"],
                "body_source": r["body_source"],
                "rank": round(float(r["rank"]), 5),
                "snippet": (r["snippet"] or "").strip(),
            })

        return {
            "query": query,
            "filters": {"mailbox": mboxes, "since": since, "until": until,
                        "days": days, "inflect": inflect,
                        "folder_contains": folder_contains,
                        "sender_contains": sender_contains,
                        "has_attachments": has_attachments,
                        "limit": limit},
            "count": len(results),
            "results": results,
            "tip": "Use read_email(mailbox=..., message_id=...) for full body or thread.",
        }
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e), "query": query}


@mcp.tool()
def read_email(
    message_id: Optional[str] = None,
    mailbox: Optional[str] = None,
    offset: int = 0,
    length: int = DEFAULT_BODY_CHARS,
    around_match: Optional[str] = None,
    include_html: bool = False,
) -> dict:
    """Read one email — full plain text body + metadata.

    Identify by `message_id` (Internet Message-ID, the _id in Mongo).
    `mailbox` narrows the lookup if the same Message-ID appears in multiple mailboxes
    (e.g. you got copies in both work and personal accounts).

    offset, length: slice the body. length max 200000; a negative offset is
                  treated as 0.
    around_match: case-insensitive substring; returns up to 3 windows of ~1000 chars
                  around matches (400 before, 600 after the match start),
                  instead of a flat slice.
    include_html: also return raw body_html from Mongo (typically large — only if you
                  really need the original markup).

    Returns the metadata dict plus `slice` (and `body` in slice mode). Errors are
    reported as {"error": ...} rather than raised.
    """
    if not message_id:
        return {"error": "Provide message_id."}
    try:
        # A negative offset would make the slice count from the end of the
        # body and corrupt has_more / next_offset, so clamp it.
        offset = max(0, offset)
        length = min(max(1, length), MAX_BODY_CHARS)

        sql = """
        SELECT id, mailbox, message_id, graph_id, conversation_id, folder_path,
               subject, sender_email, sender_name,
               to_addrs, cc_addrs, bcc_addrs,
               sent_at, received_at, modified_at, is_read, is_draft,
               has_attachments, attachment_count, attachments_summary,
               body, body_length, body_source,
               extractor_version, extracted_at, ok, error
        FROM emails WHERE message_id = %s
        """
        params = [message_id]
        if mailbox:
            sql += " AND mailbox = %s"
            params.append(mailbox)
        sql += " LIMIT 1"

        with pg_conn() as pg, pg.cursor() as cur:
            cur.execute(sql, params)
            row = cur.fetchone()
            cols = [c.name for c in cur.description]
        if not row:
            return {"error": "Email not found.",
                    "message_id": message_id, "mailbox": mailbox}
        rec = dict(zip(cols, row))

        body = rec.get("body") or ""
        if around_match and body:
            # Match directly against `body`: lower-casing can change the length
            # of some Unicode text, which would shift offsets found in a
            # lowered copy away from the right place in the original.
            pattern = re.compile(re.escape(around_match), re.IGNORECASE)
            windows = []
            for hit in pattern.finditer(body):
                pos = hit.start()
                lo = max(0, pos - 400)
                hi = min(len(body), pos + 600)
                windows.append({"offset": lo, "text": body[lo:hi]})
                if len(windows) >= 3:
                    break
            body_out = None
            slice_info = {"mode": "around_match", "match": around_match,
                          "windows_found": len(windows), "windows": windows}
        else:
            end = offset + length
            body_out = body[offset:end]
            has_more = end < len(body)
            slice_info = {
                "mode": "slice", "offset": offset,
                "length_returned": len(body_out),
                "has_more": has_more,
                "next_offset": end if has_more else None,
            }

        out = {
            "mailbox": rec["mailbox"],
            "message_id": rec["message_id"],
            "conversation_id": rec["conversation_id"],
            "folder": rec["folder_path"],
            "subject": rec["subject"],
            "from": (f"{rec['sender_name']} <{rec['sender_email']}>"
                     if rec["sender_name"] else rec["sender_email"]),
            "to": rec["to_addrs"],
            "cc": rec["cc_addrs"],
            "bcc": rec["bcc_addrs"],
            "received_at": serialize(rec["received_at"]),
            "sent_at": serialize(rec["sent_at"]),
            "is_read": rec["is_read"],
            "is_draft": rec["is_draft"],
            "has_attachments": rec["has_attachments"],
            "attachment_count": rec["attachment_count"],
            "attachments": rec["attachments_summary"],
            "body_length": rec["body_length"],
            "body_source": rec["body_source"],
            "extractor_version": rec["extractor_version"],
            "ok": rec["ok"],
            "error": rec["error"],
        }
        if body_out is not None:
            out["body"] = body_out
        out["slice"] = slice_info

        if include_html:
            mdoc = mongo[MONGO_DB][rec["mailbox"]].find_one(
                {"_id": rec["message_id"]}, {"body_html": 1, "attachments": 1})
            if mdoc:
                out["body_html"] = mdoc.get("body_html")
                # Raw Mongo sub-documents may hold datetime/bytes values; run
                # them through the same converter as the rest of the output.
                out["attachments_detail"] = serialize(mdoc.get("attachments"))
        return out
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e)}


@mcp.tool()
def by_sender(
    sender: str,
    mailbox: Optional[Union[str, list]] = None,
    since: Optional[str] = None,
    has_attachments: Optional[bool] = None,
    limit: int = 30,
) -> dict:
    """List emails from a specific sender (substring match on sender_email or sender_name,
    case-insensitive). Use for "what did X send me" or "all newsletters from Y".

    mailbox / since / has_attachments are optional filters; limit is clamped
    to 1..200. Returned sorted by received_at DESC.
    """
    try:
        bind = {
            "s": f"%{sender}%",
            "mboxes": normalize_mailbox(mailbox),
            "since": parse_since(since),
            "has_att": has_attachments,
            "limit": min(max(1, limit), 200),
        }
        sql = """
        SELECT mailbox, message_id, subject, sender_email, sender_name,
               to_addrs, folder_path, received_at, has_attachments, attachment_count,
               attachments_summary, body_length
        FROM emails
        WHERE ok = TRUE
          AND (sender_email ILIKE %(s)s OR sender_name ILIKE %(s)s)
          AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[]))
          AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz)
          AND (%(has_att)s::boolean IS NULL OR has_attachments = %(has_att)s::boolean)
        ORDER BY received_at DESC NULLS LAST
        LIMIT %(limit)s
        """
        with pg_conn() as pg, pg.cursor() as cur:
            cur.execute(sql, bind)
            names = [col.name for col in cur.description]
            fetched = cur.fetchall()
        records = []
        for raw in fetched:
            rec = dict(zip(names, raw))
            # Timestamps are returned as ISO strings.
            rec["received_at"] = serialize(rec["received_at"])
            records.append(rec)
        return {"sender_match": sender, "count": len(records), "results": records}
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e)}


@mcp.tool()
def recent_emails(
    mailbox: Optional[Union[str, list]] = None,
    days: int = 7,
    folder_contains: Optional[str] = None,
    has_attachments: Optional[bool] = None,
    limit: int = 30,
) -> dict:
    """List recent emails (by received_at). Use for "what came in today/this week".
    days=0 to ignore time window (just top-N newest).

    folder_contains is a case-insensitive substring filter on folder_path;
    limit is clamped to 1..200.
    """
    try:
        mboxes = normalize_mailbox(mailbox)
        limit = min(max(1, limit), 200)
        # Only a positive `days` value produces a lower bound on received_at.
        since_dt = (datetime.now(timezone.utc) - timedelta(days=days)
                    if days and days > 0 else None)
        folder_like = f"%{folder_contains}%" if folder_contains else None
        bind = {
            "mboxes": mboxes,
            "since": since_dt,
            "folder": folder_contains,
            "folder_like": folder_like,
            "has_att": has_attachments,
            "limit": limit,
        }
        sql = """
        SELECT mailbox, message_id, subject, sender_email, sender_name,
               folder_path, received_at, has_attachments, attachment_count,
               attachments_summary, body_length, is_read
        FROM emails
        WHERE ok = TRUE
          AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[]))
          AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz)
          AND (%(folder)s::text IS NULL OR folder_path ILIKE %(folder_like)s)
          AND (%(has_att)s::boolean IS NULL OR has_attachments = %(has_att)s::boolean)
        ORDER BY received_at DESC NULLS LAST
        LIMIT %(limit)s
        """
        with pg_conn() as pg, pg.cursor() as cur:
            cur.execute(sql, bind)
            names = [col.name for col in cur.description]
            fetched = cur.fetchall()
        records = []
        for raw in fetched:
            rec = dict(zip(names, raw))
            rec["received_at"] = serialize(rec["received_at"])
            records.append(rec)
        return {"days": days, "count": len(records), "results": records}
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e)}


@mcp.tool()
def conversation_thread(conversation_id: str, limit: int = 50) -> dict:
    """Return all emails in one Outlook conversation thread (conversation_id from Graph).
    Ordered chronologically. Use to see the full back-and-forth on a topic.

    limit is clamped to 1..200. The messages are returned under `thread`.
    """
    try:
        capped = min(max(1, limit), 200)
        with pg_conn() as pg, pg.cursor() as cur:
            cur.execute("""
                SELECT mailbox, message_id, subject, sender_email, sender_name,
                       to_addrs, received_at, folder_path, body_length, has_attachments,
                       attachments_summary
                FROM emails
                WHERE conversation_id = %s AND ok = TRUE
                ORDER BY received_at ASC NULLS LAST
                LIMIT %s
            """, (conversation_id, capped))
            names = [col.name for col in cur.description]
            fetched = cur.fetchall()
        thread = []
        for raw in fetched:
            message = dict(zip(names, raw))
            message["received_at"] = serialize(message["received_at"])
            thread.append(message)
        return {"conversation_id": conversation_id, "count": len(thread), "thread": thread}
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e)}


@mcp.tool()
def find_attachment(
    name_contains: str,
    mailbox: Optional[Union[str, list]] = None,
    since: Optional[str] = None,
    limit: int = 30,
) -> dict:
    """Find emails whose attachment filename contains the substring (case-insensitive).
    Use for "find emails with that protocol PDF" or "any invoice attachments".
    Returns emails ordered by received_at DESC.

    The match runs against the attachments_summary column; limit is clamped
    to 1..200.
    """
    try:
        bind = {
            "s": f"%{name_contains}%",
            "mboxes": normalize_mailbox(mailbox),
            "since": parse_since(since),
            "limit": min(max(1, limit), 200),
        }
        sql = """
        SELECT mailbox, message_id, subject, sender_email, sender_name,
               received_at, attachment_count, attachments_summary, folder_path
        FROM emails
        WHERE ok = TRUE
          AND has_attachments = TRUE
          AND attachments_summary ILIKE %(s)s
          AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[]))
          AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz)
        ORDER BY received_at DESC NULLS LAST
        LIMIT %(limit)s
        """
        with pg_conn() as pg, pg.cursor() as cur:
            cur.execute(sql, bind)
            names = [col.name for col in cur.description]
            fetched = cur.fetchall()
        records = []
        for raw in fetched:
            rec = dict(zip(names, raw))
            rec["received_at"] = serialize(rec["received_at"])
            records.append(rec)
        return {"name_match": name_contains, "count": len(records), "results": records}
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e)}


@mcp.tool()
def top_senders(
    mailbox: Optional[Union[str, list]] = None,
    since: Optional[str] = None,
    days: Optional[int] = None,
    folder_contains: Optional[str] = None,
    limit: int = 20,
) -> dict:
    """Unique senders grouped by sender_email, counted, sorted by count DESC.
    Use for "who emails me most" / "top senders this month".

    mailbox: one mailbox string or list. None = all.
    since: ISO date "YYYY-MM-DD" lower bound on received_at.
    days: convenience window — count only the last N days (overrides `since`
          when both given). Czech phrasing: "za posledních X dní"
          ("in the last X days").
    folder_contains: substring match against folder_path (case-insensitive).
          Pass "Inbox" to count ONLY received/incoming mail and exclude the
          mailbox owner's own Sent Items, Drafts, etc. Default None = all folders.
    limit: clamped to 1..100.
    """
    try:
        mboxes = normalize_mailbox(mailbox)
        # `since` is always parsed (and validated) first; a positive `days`
        # window then replaces it.
        lower_bound = parse_since(since)
        if days and days > 0:
            lower_bound = datetime.now(timezone.utc) - timedelta(days=days)
        bind = {
            "mboxes": mboxes,
            "since": lower_bound,
            "folder": folder_contains,
            "folder_like": f"%{folder_contains}%" if folder_contains else None,
            "limit": min(max(1, limit), 100),
        }
        sql = """
        SELECT sender_email, count(*) AS c, max(received_at) AS last_at
        FROM emails
        WHERE ok = TRUE AND sender_email IS NOT NULL
          AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[]))
          AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz)
          AND (%(folder)s::text IS NULL OR folder_path ILIKE %(folder_like)s)
        GROUP BY sender_email
        ORDER BY c DESC
        LIMIT %(limit)s
        """
        with pg_conn() as pg, pg.cursor() as cur:
            cur.execute(sql, bind)
            fetched = cur.fetchall()
        ranking = []
        for address, total, latest in fetched:
            ranking.append({"sender_email": address, "count": total,
                            "last_at": serialize(latest)})
        return {"count": len(ranking), "results": ranking}
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e)}


@mcp.tool()
def pipeline_status(mailbox: Optional[Union[str, list]] = None) -> dict:
    """End-to-end status of the email-ingest pipeline per mailbox.

    Reports, for each mailbox, where it stands in the 5-step pipeline:
      1. parse_emails_graph         -> mongo_total
      2. (refetch text bodies)      -> body_text_missing (legacy v1.3 emails)
      3. download_attachments       -> attach_done / attach_pending
                                       attach_missing  (404 — marked, won't retry)
                                       attach_reference (OneDrive/SharePoint link, no content)
      4. unwrap_smime               -> smime_p7m_total / smime_unwrapped / smime_pending
                                       smime_p7s_count (informational; not unwrapped by design)
      5. enrich_fulltext            -> pg_indexed

    Plus:
      - permanently_deleted (marked by delta sync)

    All attachment figures count EMAILS, not individual attachments. One email
    can appear in several of attach_pending / attach_missing / attach_reference;
    attach_done counts emails with attachments that appear in none of them.

    Use this instead of running multiple Mongo count queries by hand. Returns
    one row per mailbox; if `mailbox` is given, returns just those rows.
    """
    # Filters shared by every mailbox; built once, outside the loop.
    pending_filter = {"attachments": {"$elemMatch": {
        "is_inline": False,
        "file_hash": {"$exists": False},
        "attachment_missing": {"$ne": True},
        "attachment_reference": {"$ne": True},
    }}}
    missing_filter = {"attachments.attachment_missing": True}
    reference_filter = {"attachments.attachment_reference": True}
    p7m_filter = {"attachments.filename": {"$regex": r"^smime\.p7m$", "$options": "i"}}
    p7s_filter = {"attachments.filename": {"$regex": r"^smime\.p7s$", "$options": "i"}}
    try:
        mbs = normalize_mailbox(mailbox)
        all_mb = [c for c in mongo[MONGO_DB].list_collection_names()
                  if c not in SKIP_COLLECTIONS]
        targets = [m for m in all_mb if (mbs is None or m in mbs)]

        # PG counts in one pass
        pg_counts: dict[str, int] = {}
        with pg_conn() as pg, pg.cursor() as cur:
            cur.execute("SELECT mailbox, count(*) FROM emails "
                        "WHERE ok = true GROUP BY mailbox")
            for mb, c in cur.fetchall():
                pg_counts[mb] = c

        out = {}
        for mb in targets:
            col = mongo[MONGO_DB][mb]
            mongo_total = col.estimated_document_count()
            with_att = col.count_documents({"has_attachments": True})
            attach_pending = col.count_documents(
                {"has_attachments": True, **pending_filter})
            attach_missing = col.count_documents(missing_filter)
            attach_reference = col.count_documents(reference_filter)
            # Subtracting the three counts above from with_att would subtract
            # an email once per state it is in (and could go negative), so
            # count the emails in ANY unfinished state with a single query.
            attach_unfinished = col.count_documents({
                "has_attachments": True,
                "$or": [pending_filter, missing_filter, reference_filter],
            })
            attach_done = with_att - attach_unfinished

            smime_p7m_total = col.count_documents(p7m_filter)
            smime_unwrapped = col.count_documents(
                {**p7m_filter, "smime_unwrapped": True})
            smime_p7s_count = col.count_documents(p7s_filter)

            body_text_missing = col.count_documents({
                "body_html": {"$in": [None, ""]},
                "body_text": {"$exists": False},
                "graph_id": {"$exists": True},
            })

            permanently_deleted = col.count_documents({"permanently_deleted": True})

            out[mb] = {
                "mongo_total":         mongo_total,
                "with_attachments":    with_att,
                "attach_done":         attach_done,
                "attach_pending":      attach_pending,
                "attach_missing":      attach_missing,
                "attach_reference":    attach_reference,
                "smime_p7m_total":     smime_p7m_total,
                "smime_unwrapped":     smime_unwrapped,
                "smime_pending":       smime_p7m_total - smime_unwrapped,
                "smime_p7s_count":     smime_p7s_count,
                "body_text_missing":   body_text_missing,
                "pg_indexed":          pg_counts.get(mb, 0),
                "permanently_deleted": permanently_deleted,
            }
        return {"mailboxes": out}
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e)}


@mcp.tool()
def sync_state_overview(mailbox: Optional[Union[str, list]] = None) -> dict:
    """Delta-sync state across mailboxes (collection `emaily.sync_state`).

    For each (mailbox, folder) pair shows: deltaLink present?, last_run_at,
    cumulative new/sync/removed/run_count. Use to confirm a mailbox is
    incrementally synced and to spot folders that haven't run in a while.

    `never_delta_synced` lists mailbox collections that have no sync_state
    entry at all (restricted to `mailbox` when that filter is given).
    """
    try:
        mbs = normalize_mailbox(mailbox)
        db = mongo[MONGO_DB]
        selector: dict = {"mailbox": {"$in": mbs}} if mbs else {}
        projection = {
            "mailbox": 1, "folder_path": 1, "folder_id": 1,
            "delta_link": 1, "last_run_at": 1,
            "cumulative_new": 1, "cumulative_sync": 1,
            "cumulative_removed": 1, "run_count": 1,
        }
        docs = db["sync_state"].find(selector, projection).sort(
            [("mailbox", 1), ("folder_path", 1)])

        by_mailbox: dict[str, list] = {}
        for doc in docs:
            folders = by_mailbox.setdefault(doc["mailbox"], [])
            folders.append({
                "folder_path":        doc.get("folder_path"),
                "folder_id":          doc.get("folder_id"),
                # Only the presence of a delta link is reported, not its value.
                "has_delta_link":     bool(doc.get("delta_link")),
                "last_run_at":        serialize(doc.get("last_run_at")),
                "cumulative_new":     doc.get("cumulative_new", 0),
                "cumulative_sync":    doc.get("cumulative_sync", 0),
                "cumulative_removed": doc.get("cumulative_removed", 0),
                "run_count":          doc.get("run_count", 0),
            })

        # mailboxes that have collections but ZERO sync_state entries
        known = {name for name in db.list_collection_names()
                 if name not in SKIP_COLLECTIONS}
        not_synced = sorted(
            name for name in known - set(by_mailbox)
            if not mbs or name in mbs
        )
        return {
            "mailboxes":          by_mailbox,
            "never_delta_synced": not_synced,
        }
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e)}


# Script entry point: write a startup line to stderr, then run the FastMCP
# server (the module docstring describes this as the stdio MCP mode).
if __name__ == "__main__":
    log("MCP emaily server started (FastMCP)")
    mcp.run()