From a3470511452f55fbf50a48263013b363d34d280d Mon Sep 17 00:00:00 2001
From: Vladimir Buzalka <vladimir.buzalka@buzalka.cz>
Date: Fri, 5 Jun 2026 21:21:30 +0200
Subject: [PATCH] notebook

---
 EmailsImport/mcp_emaily.py                    | 753 ++++++++++++++++++
 Python-runner/0_run_pipeline_v1.0.md          |  77 ++
 Python-runner/0_run_pipeline_v1.0.py          | 176 ++++
 Python-runner/1_parse_emails_graph_v1.4.md    |  41 +
 Python-runner/1_parse_emails_graph_v1.4.py    | 624 +++++++++++++++
 .../1b_parse_emails_graph_delta_v1.0.md       | 139 ++++
 .../1b_parse_emails_graph_delta_v1.0.py       | 514 ++++++++++++
 Python-runner/2_refetch_text_bodies_v1.0.md   |  34 +
 Python-runner/2_refetch_text_bodies_v1.0.py   | 270 +++++++
 Python-runner/3_download_attachments_v1.3.md  |  47 ++
 Python-runner/3_download_attachments_v1.3.py  | 546 +++++++++++++
 Python-runner/3_download_attachments_v1.4.md  |  74 ++
 Python-runner/3_download_attachments_v1.4.py  | 713 +++++++++++++++++
 Python-runner/4_unwrap_smime_v1.0.md          |  63 ++
 Python-runner/4_unwrap_smime_v1.0.py          | 445 +++++++++++
 .../5_enrich_fulltext_emails_v1.2.md          |  47 ++
 .../5_enrich_fulltext_emails_v1.2.py          | 489 ++++++++++++
 .../5_enrich_fulltext_emails_v1.3.md          |  79 ++
 .../5_enrich_fulltext_emails_v1.3.py          | 567 +++++++++++++
 .../Trash/enrich_fulltext_emails_v1.1.py      | 455 +++++++++++
 Python-runner/run_pipeline.sh                 |  41 +
 Soubory/mcp_soubory.py                        | 672 ++++++++++++++++
 Soubory/priklady_dotazu.md                    | 210 +++++
 Soubory/query_v0.1.py                         | 203 +++++
 .../feedback_proactive_suggestions.md         |  22 +
 claude-memory/feedback_use_mcp_emaily.md      |  31 +
 claude-memory/project_mcp_emaily.md           |  39 +
 claude-memory/project_mcp_soubory.md          |  31 +
 28 files changed, 7402 insertions(+)
 create mode 100644 EmailsImport/mcp_emaily.py
 create mode 100644 Python-runner/0_run_pipeline_v1.0.md
 create mode 100644 Python-runner/0_run_pipeline_v1.0.py
 create mode 100644 Python-runner/1_parse_emails_graph_v1.4.md
 create mode 100644 Python-runner/1_parse_emails_graph_v1.4.py
 create mode 100644 Python-runner/1b_parse_emails_graph_delta_v1.0.md
 create mode 100644 Python-runner/1b_parse_emails_graph_delta_v1.0.py
 create mode 100644 Python-runner/2_refetch_text_bodies_v1.0.md
 create mode 100644 Python-runner/2_refetch_text_bodies_v1.0.py
 create mode 100644 Python-runner/3_download_attachments_v1.3.md
 create mode 100644 Python-runner/3_download_attachments_v1.3.py
 create mode 100644 Python-runner/3_download_attachments_v1.4.md
 create mode 100644 Python-runner/3_download_attachments_v1.4.py
 create mode 100644 Python-runner/4_unwrap_smime_v1.0.md
 create mode 100644 Python-runner/4_unwrap_smime_v1.0.py
 create mode 100644 Python-runner/5_enrich_fulltext_emails_v1.2.md
 create mode 100644 Python-runner/5_enrich_fulltext_emails_v1.2.py
 create mode 100644 Python-runner/5_enrich_fulltext_emails_v1.3.md
 create mode 100644 Python-runner/5_enrich_fulltext_emails_v1.3.py
 create mode 100644 Python-runner/Trash/enrich_fulltext_emails_v1.1.py
 create mode 100644 Python-runner/run_pipeline.sh
 create mode 100644 Soubory/mcp_soubory.py
 create mode 100644 Soubory/priklady_dotazu.md
 create mode 100644 Soubory/query_v0.1.py
 create mode 100644 claude-memory/feedback_proactive_suggestions.md
 create mode 100644 claude-memory/feedback_use_mcp_emaily.md
 create mode 100644 claude-memory/project_mcp_emaily.md
 create mode 100644 claude-memory/project_mcp_soubory.md

diff --git a/EmailsImport/mcp_emaily.py b/EmailsImport/mcp_emaily.py
new file mode 100644
index 0000000..3f193b7
--- /dev/null
+++ b/EmailsImport/mcp_emaily.py
@@ -0,0 +1,753 @@
+#!/usr/bin/env python3
+"""
+==============================================================================
+MCP server: EMAILY  (all mailboxes imported from Microsoft Graph)
+
+Hybrid query over:
+  - PostgreSQL  192.168.1.76  db=MongoEmaily  table=emails
+                (fulltext tsvector - subject + sender + recipients +
+                 attachments + body, GIN index, ts_headline, ts_rank)
+  - MongoDB     192.168.1.76  db=emaily       collection=<mailbox>
+                (original documents from parse_emails_graph_v1.3.py:
+                 headers, body_html, recipients[], attachments[], ...)
+
+Source: U:\\janssen\\EmailsImport\\enrich_fulltext_emails_v1.0.py
+
+Usage:
+    python mcp_emaily.py        (stdio MCP)
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import sys
+import traceback
+from datetime import datetime, timezone, timedelta
+from typing import Optional, Union
+
+import psycopg
+from mcp.server.fastmcp import FastMCP
+from pymongo import MongoClient
+
+MONGO_URI = "mongodb://192.168.1.76:27017"  # Mongo server holding the raw mailbox collections
+MONGO_DB = "emaily"  # collections here are treated as mailboxes, except SKIP_COLLECTIONS
+
+PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoEmaily "
+          "user=vladimir.buzalka password=Vlado7309208104++")  # NOTE(review): plaintext credentials in source — move to env/secret store and rotate
+
+DEFAULT_BODY_CHARS = 8000  # default `length` of read_email's body slice
+MAX_BODY_CHARS = 200_000  # upper bound read_email applies to `length`
+
+SKIP_COLLECTIONS = {"attachments_index", "sync_state"}  # collections in MONGO_DB that are not mailboxes
+
+
+def log(msg: str) -> None:  # diagnostics helper used by every tool and by the startup checks
+    print(msg, file=sys.stderr, flush=True)  # stderr, flushed per call; presumably keeps stdout free for the stdio MCP transport
+
+
+try:
+    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)  # module-level client shared by all tools
+    mongo.admin.command("ping")  # round trip to the server so an unreachable Mongo fails here
+    log(f"Mongo OK ({MONGO_URI})")
+except Exception as e:
+    log(f"Mongo connection failed: {e}")
+    sys.exit(1)  # runs at import time: the process ends before any tool is registered
+
+try:
+    _t = psycopg.connect(PG_DSN, connect_timeout=10)  # throwaway connection, only proves the DSN works
+    _t.close()
+    log("Postgres OK")
+except Exception as e:
+    log(f"Postgres connection failed: {e}")
+    sys.exit(1)  # same fail-fast policy as for Mongo
+
+
+def pg_conn():  # opens a new Postgres connection on every call (no pooling in this file)
+    return psycopg.connect(PG_DSN, connect_timeout=10)  # callers use it as `with pg_conn() as pg, pg.cursor() as cur:`
+
+
+def serialize(obj):
+    """Recursively make a value JSON-friendly: datetime -> ISO-8601 string,
+    bytes -> UTF-8 text (undecodable bytes replaced); dicts/lists are walked."""
+    if isinstance(obj, dict):
+        return {key: serialize(val) for key, val in obj.items()}
+    if isinstance(obj, list):
+        return [serialize(item) for item in obj]
+    if isinstance(obj, bytes):
+        return obj.decode("utf-8", errors="replace")
+    return obj.isoformat() if isinstance(obj, datetime) else obj
+
+
+def normalize_mailbox(mailbox: Optional[Union[str, list]]) -> Optional[list[str]]:
+    """Coerce a mailbox filter to a list of names; None, '' and [] mean 'no filter' (None)."""
+    if isinstance(mailbox, str):
+        return [mailbox] if mailbox != "" else None
+    no_filter = mailbox is None or mailbox == []
+    return None if no_filter else list(mailbox)
+
+
+def parse_since(s: Optional[str]) -> Optional[datetime]:  # None/"" -> None; unparsable -> ValueError
+    if not s:
+        return None
+    try:  # "T" marks a full ISO datetime (a trailing "Z" is accepted); anything else must be YYYY-MM-DD
+        dt = (datetime.fromisoformat(s.replace("Z", "+00:00")) if "T" in s
+              else datetime.strptime(s, "%Y-%m-%d"))
+    except Exception as e:
+        raise ValueError(f"Bad date {s!r}: {e}") from e  # keep the original cause attached
+    return dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)  # no offset given -> UTC, for both input forms
+
+
+# --- MCP --------------------------------------------------------------------
+mcp = FastMCP("emaily")  # server instance; every @mcp.tool() function below registers on it
+
+
+@mcp.tool()
+def ping() -> dict:
+    """Health check: confirm Mongo and Postgres respond and summarise the corpus.
+
+    Returns the Mongo version, the mailbox collections with their estimated
+    document counts, and per-mailbox ok/error row counts from the PG index.
+    """
+    try:
+        build = mongo.admin.command("buildInfo")
+        db = mongo[MONGO_DB]
+        names = [n for n in db.list_collection_names() if n not in SKIP_COLLECTIONS]
+        per_mailbox = {n: db[n].estimated_document_count() for n in names}
+        with pg_conn() as pg, pg.cursor() as cur:
+            cur.execute("SELECT mailbox, ok, count(*) FROM emails "
+                        "GROUP BY mailbox, ok ORDER BY mailbox, ok")
+            breakdown: dict = {}
+            for box, is_ok, n_rows in cur.fetchall():
+                # truthy `ok` is reported under "ok", anything else under "error"
+                breakdown.setdefault(box, {})["ok" if is_ok else "error"] = n_rows
+        return {
+            "status": "ok",
+            "mongo_version": build.get("version"),
+            "mailboxes": names,
+            "mongo_email_count": per_mailbox,
+            "pg_indexed_per_mailbox": breakdown,
+        }
+    except Exception as e:
+        log(traceback.format_exc())
+        return {"status": "error", "error": str(e)}
+
+
+@mcp.tool()
+def list_mailboxes() -> dict:
+    """Overview of all mailboxes — totals, indexed coverage, earliest/latest received_at,
+    top 5 senders by volume. Use to understand the corpus before searching.
+    """
+    out = {}
+    try:
+        mailboxes = [c for c in mongo[MONGO_DB].list_collection_names()
+                     if c not in SKIP_COLLECTIONS]
+        with pg_conn() as pg, pg.cursor() as cur:  # one connection reused for every mailbox
+            for mb in mailboxes:
+                cur.execute("""
+                    SELECT count(*) FILTER (WHERE ok) AS ok,
+                           count(*) AS total,
+                           min(received_at) AS first_at,
+                           max(received_at) AS last_at,
+                           count(*) FILTER (WHERE has_attachments) AS with_att
+                    FROM emails WHERE mailbox = %s
+                """, (mb,))
+                ok, total, first_at, last_at, with_att = cur.fetchone()
+                cur.execute("""
+                    SELECT sender_email, count(*) c FROM emails
+                    WHERE mailbox = %s AND sender_email IS NOT NULL
+                    GROUP BY sender_email ORDER BY c DESC LIMIT 5
+                """, (mb,))
+                top_senders = [{"email": s, "count": c} for s, c in cur.fetchall()]
+                out[mb] = {
+                    "indexed_ok": ok,
+                    "indexed_total": total,
+                    "with_attachments": with_att,
+                    "first_received": serialize(first_at),
+                    "last_received": serialize(last_at),
+                    "top_senders": top_senders,
+                }
+        return {"mailboxes": out}
+    except Exception as e:
+        log(traceback.format_exc())  # full traceback to stderr; the caller only gets the message
+        return {"error": str(e)}
+
+
+@mcp.tool()
+def search(
+    query: str,
+    mailbox: Optional[Union[str, list]] = None,
+    since: Optional[str] = None,
+    until: Optional[str] = None,
+    folder_contains: Optional[str] = None,
+    sender_contains: Optional[str] = None,
+    has_attachments: Optional[bool] = None,
+    limit: int = 20,
+) -> dict:
+    """PRIMARY TOOL — fulltext search across all indexed emails.
+
+    Index includes: subject, sender (email + name), recipients (to/cc),
+    attachment filenames, AND full body text.
+
+    query: websearch_to_tsquery syntax:
+        invoice payment           -> AND
+        "lot expiration"          -> phrase
+        SAE OR "serious adverse"  -> OR
+        urgent -newsletter        -> exclude
+    mailbox: one mailbox string or list (e.g. "vbuzalka@its.jnj.com"). None = all.
+    since/until: ISO date "YYYY-MM-DD" on received_at (since inclusive, until exclusive)
+    folder_contains: substring match against folder_path (case-insensitive)
+    sender_contains: substring match against sender_email OR sender_name (case-insensitive)
+    has_attachments: True / False / None (any)
+    limit: clamped to 1..100
+
+    Returns ranked results with `snippet` showing matches highlighted as <<...>>.
+    Use `read_email` to fetch full body of any hit.
+    """
+    try:
+        mboxes = normalize_mailbox(mailbox)
+        since_dt = parse_since(since)
+        until_dt = parse_since(until)
+        limit = min(max(1, limit), 100)  # clamp to 1..100
+
+        sql = """
+        WITH q AS (
+            SELECT websearch_to_tsquery('soubory'::regconfig, %(query)s) AS tsq
+        )
+        SELECT
+            e.id, e.mailbox, e.message_id, e.conversation_id, e.folder_path,
+            e.subject, e.sender_email, e.sender_name,
+            e.to_addrs, e.cc_addrs,
+            e.received_at, e.sent_at, e.is_read,
+            e.has_attachments, e.attachment_count, e.attachments_summary,
+            e.body_length, e.body_source,
+            ts_rank(e.tsv, q.tsq) AS rank,
+            ts_headline('soubory'::regconfig,
+                left(coalesce(e.body, e.subject), 200000),
+                q.tsq,
+                'MaxFragments=3, MinWords=4, MaxWords=18, '
+                'StartSel=<<, StopSel=>>, FragmentDelimiter= ... ') AS snippet
+        FROM emails e, q
+        WHERE e.tsv @@ q.tsq
+          AND e.ok = TRUE
+          AND (%(mboxes)s::text[] IS NULL OR e.mailbox = ANY(%(mboxes)s::text[]))
+          AND (%(since)s::timestamptz IS NULL OR e.received_at >= %(since)s::timestamptz)
+          AND (%(until)s::timestamptz IS NULL OR e.received_at <  %(until)s::timestamptz)
+          AND (%(folder)s::text IS NULL OR e.folder_path ILIKE %(folder_like)s)
+          AND (%(sender)s::text IS NULL
+               OR e.sender_email ILIKE %(sender_like)s
+               OR e.sender_name  ILIKE %(sender_like)s)
+          AND (%(has_att)s::boolean IS NULL OR e.has_attachments = %(has_att)s::boolean)
+        ORDER BY rank DESC, e.received_at DESC NULLS LAST
+        LIMIT %(limit)s
+        """
+        params = {  # a filter left as None reaches SQL as NULL, which switches its clause off
+            "query": query, "mboxes": mboxes,
+            "since": since_dt, "until": until_dt,
+            "folder": folder_contains,
+            "folder_like": f"%{folder_contains}%" if folder_contains else None,  # NOTE: % and _ in the input are not escaped, so they act as LIKE wildcards
+            "sender": sender_contains,
+            "sender_like": f"%{sender_contains}%" if sender_contains else None,  # same wildcard caveat
+            "has_att": has_attachments,
+            "limit": limit,
+        }
+        with pg_conn() as pg, pg.cursor() as cur:
+            cur.execute(sql, params)
+            cols = [c.name for c in cur.description]
+            rows = [dict(zip(cols, r)) for r in cur.fetchall()]  # one dict per row, keyed by column name
+
+        results = []
+        for r in rows:
+            results.append({
+                "mailbox": r["mailbox"],
+                "message_id": r["message_id"],
+                "conversation_id": r["conversation_id"],
+                "folder": r["folder_path"],
+                "subject": r["subject"],
+                "from": (f"{r['sender_name']} <{r['sender_email']}>"
+                         if r["sender_name"] else r["sender_email"]),  # bare address when no display name
+                "to": r["to_addrs"],
+                "cc": r["cc_addrs"],
+                "received_at": serialize(r["received_at"]),
+                "is_read": r["is_read"],
+                "has_attachments": r["has_attachments"],
+                "attachment_count": r["attachment_count"],
+                "attachments": r["attachments_summary"],
+                "body_length": r["body_length"],
+                "body_source": r["body_source"],
+                "rank": round(float(r["rank"]), 5),
+                "snippet": (r["snippet"] or "").strip(),
+            })
+
+        return {
+            "query": query,
+            "filters": {"mailbox": mboxes, "since": since, "until": until,  # echoes the filters as applied (limit after clamping)
+                        "folder_contains": folder_contains,
+                        "sender_contains": sender_contains,
+                        "has_attachments": has_attachments,
+                        "limit": limit},
+            "count": len(results),
+            "results": results,
+            "tip": "Use read_email(mailbox=..., message_id=...) for full body or thread.",
+        }
+    except Exception as e:
+        log(traceback.format_exc())  # full traceback to stderr; the caller only gets the message
+        return {"error": str(e), "query": query}
+
+
+@mcp.tool()
+def read_email(
+    message_id: Optional[str] = None,
+    mailbox: Optional[str] = None,
+    offset: int = 0,
+    length: int = DEFAULT_BODY_CHARS,
+    around_match: Optional[str] = None,
+    include_html: bool = False,
+) -> dict:
+    """Read one email — full plain text body + metadata.
+
+    Identify by `message_id` (Internet Message-ID, the _id in Mongo).
+    `mailbox` narrows the lookup if the same Message-ID appears in multiple mailboxes
+    (without it, the copy from the first mailbox in sort order is returned).
+
+    offset, length: slice the body; negative offset counts as 0, length max 200000.
+    around_match: case-insensitive substring; returns up to 3 windows of ~1000 chars
+                  centered on matches, instead of a flat slice.
+    include_html: also return raw body_html from Mongo (typically large — only if you
+                  really need the original markup).
+    """
+    if not message_id:
+        return {"error": "Provide message_id."}
+    try:
+        length = min(max(1, length), MAX_BODY_CHARS)
+        offset = max(0, offset)  # a negative offset would slice from the end of the body
+
+        sql = """
+        SELECT id, mailbox, message_id, graph_id, conversation_id, folder_path,
+               subject, sender_email, sender_name,
+               to_addrs, cc_addrs, bcc_addrs,
+               sent_at, received_at, modified_at, is_read, is_draft,
+               has_attachments, attachment_count, attachments_summary,
+               body, body_length, body_source,
+               extractor_version, extracted_at, ok, error
+        FROM emails WHERE message_id = %s
+        """
+        params = [message_id]
+        if mailbox:
+            sql += " AND mailbox = %s"
+            params.append(mailbox)
+        sql += " ORDER BY mailbox LIMIT 1"  # deterministic pick when several mailboxes hold a copy
+
+        with pg_conn() as pg, pg.cursor() as cur:
+            cur.execute(sql, params)
+            row = cur.fetchone()
+            cols = [c.name for c in cur.description]
+        if not row:
+            return {"error": "Email not found.", "message_id": message_id, "mailbox": mailbox}
+        rec = dict(zip(cols, row))
+
+        body = rec.get("body") or ""
+        if around_match and body:
+            needle = around_match.lower()
+            hay = body.lower()
+            windows = []
+            start = 0
+            while len(windows) < 3:
+                pos = hay.find(needle, start)
+                if pos < 0:
+                    break
+                lo = max(0, pos - 400)  # window = 400 chars before the hit ...
+                hi = min(len(body), pos + 600)  # ... and 600 from its start
+                windows.append({"offset": lo, "text": body[lo:hi]})
+                start = pos + len(needle)  # continue after this hit, so hits never overlap
+            body_out = None
+            slice_info = {"mode": "around_match", "match": around_match,
+                          "windows_found": len(windows), "windows": windows}
+        else:
+            body_out = body[offset:offset + length]
+            slice_info = {
+                "mode": "slice", "offset": offset,
+                "length_returned": len(body_out),
+                "has_more": offset + length < len(body),
+                "next_offset": offset + length if offset + length < len(body) else None,
+            }
+
+        out = {
+            "mailbox": rec["mailbox"],
+            "message_id": rec["message_id"],
+            "conversation_id": rec["conversation_id"],
+            "folder": rec["folder_path"],
+            "subject": rec["subject"],
+            "from": (f"{rec['sender_name']} <{rec['sender_email']}>"
+                     if rec["sender_name"] else rec["sender_email"]),
+            "to": rec["to_addrs"],
+            "cc": rec["cc_addrs"],
+            "bcc": rec["bcc_addrs"],
+            "received_at": serialize(rec["received_at"]),
+            "sent_at": serialize(rec["sent_at"]),
+            "is_read": rec["is_read"],
+            "is_draft": rec["is_draft"],
+            "has_attachments": rec["has_attachments"],
+            "attachment_count": rec["attachment_count"],
+            "attachments": rec["attachments_summary"],
+            "body_length": rec["body_length"],
+            "body_source": rec["body_source"],
+            "extractor_version": rec["extractor_version"],
+            "ok": rec["ok"],
+            "error": rec["error"],
+        }
+        if body_out is not None:  # around_match mode returns windows instead of a body
+            out["body"] = body_out
+        out["slice"] = slice_info
+
+        if include_html:
+            mdoc = mongo[MONGO_DB][rec["mailbox"]].find_one(
+                {"_id": rec["message_id"]}, {"body_html": 1, "attachments": 1})
+            if mdoc:
+                out["body_html"] = mdoc.get("body_html")
+                out["attachments_detail"] = serialize(mdoc.get("attachments"))  # raw Mongo sub-documents may hold datetime/bytes
+        return out
+    except Exception as e:
+        log(traceback.format_exc())  # full traceback to stderr; the caller only gets the message
+        return {"error": str(e)}
+
+
+@mcp.tool()
+def by_sender(
+    sender: str,
+    mailbox: Optional[Union[str, list]] = None,
+    since: Optional[str] = None,
+    has_attachments: Optional[bool] = None,
+    limit: int = 30,
+) -> dict:
+    """List emails from a specific sender (substring match on sender_email or sender_name,
+    case-insensitive). Use for "what did X send me" or "all newsletters from Y".
+
+    Only rows with ok = TRUE; sorted by received_at DESC; limit is clamped to 1..200.
+    """
+    try:
+        mboxes = normalize_mailbox(mailbox)
+        since_dt = parse_since(since)
+        limit = min(max(1, limit), 200)  # clamp to 1..200
+        sql = """
+        SELECT mailbox, message_id, subject, sender_email, sender_name,
+               to_addrs, folder_path, received_at, has_attachments, attachment_count,
+               attachments_summary, body_length
+        FROM emails
+        WHERE ok = TRUE
+          AND (sender_email ILIKE %(s)s OR sender_name ILIKE %(s)s)
+          AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[]))
+          AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz)
+          AND (%(has_att)s::boolean IS NULL OR has_attachments = %(has_att)s::boolean)
+        ORDER BY received_at DESC NULLS LAST
+        LIMIT %(limit)s
+        """
+        with pg_conn() as pg, pg.cursor() as cur:
+            cur.execute(sql, {"s": f"%{sender}%", "mboxes": mboxes,  # NOTE: an empty sender yields "%%" (matches every non-NULL sender); % and _ are not escaped
+                              "since": since_dt, "has_att": has_attachments,
+                              "limit": limit})
+            cols = [c.name for c in cur.description]
+            rows = [dict(zip(cols, r)) for r in cur.fetchall()]
+        for r in rows:
+            r["received_at"] = serialize(r["received_at"])  # datetime -> ISO string for the response
+        return {"sender_match": sender, "count": len(rows), "results": rows}
+    except Exception as e:
+        log(traceback.format_exc())  # full traceback to stderr; the caller only gets the message
+        return {"error": str(e)}
+
+
+@mcp.tool()
+def recent_emails(
+    mailbox: Optional[Union[str, list]] = None,
+    days: int = 7,
+    folder_contains: Optional[str] = None,
+    has_attachments: Optional[bool] = None,
+    limit: int = 30,
+) -> dict:
+    """List recent emails (by received_at). Use for "what came in today/this week".
+    days=0 (or negative) ignores the time window (just top-N newest); limit is clamped to 1..200.
+    """
+    try:
+        mboxes = normalize_mailbox(mailbox)
+        limit = min(max(1, limit), 200)  # clamp to 1..200
+        since_dt = None
+        if days and days > 0:
+            since_dt = datetime.now(timezone.utc) - timedelta(days=days)  # rolling window ending now (UTC), not calendar days
+        sql = """
+        SELECT mailbox, message_id, subject, sender_email, sender_name,
+               folder_path, received_at, has_attachments, attachment_count,
+               attachments_summary, body_length, is_read
+        FROM emails
+        WHERE ok = TRUE
+          AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[]))
+          AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz)
+          AND (%(folder)s::text IS NULL OR folder_path ILIKE %(folder_like)s)
+          AND (%(has_att)s::boolean IS NULL OR has_attachments = %(has_att)s::boolean)
+        ORDER BY received_at DESC NULLS LAST
+        LIMIT %(limit)s
+        """
+        with pg_conn() as pg, pg.cursor() as cur:
+            cur.execute(sql, {
+                "mboxes": mboxes, "since": since_dt,
+                "folder": folder_contains,
+                "folder_like": f"%{folder_contains}%" if folder_contains else None,  # % and _ in the input are not escaped
+                "has_att": has_attachments, "limit": limit,
+            })
+            cols = [c.name for c in cur.description]
+            rows = [dict(zip(cols, r)) for r in cur.fetchall()]
+        for r in rows:
+            r["received_at"] = serialize(r["received_at"])  # datetime -> ISO string for the response
+        return {"days": days, "count": len(rows), "results": rows}
+    except Exception as e:
+        log(traceback.format_exc())  # full traceback to stderr; the caller only gets the message
+        return {"error": str(e)}
+
+
+@mcp.tool()
+def conversation_thread(conversation_id: str, limit: int = 50) -> dict:
+    """Return all emails in one Outlook conversation thread (conversation_id from Graph).
+    Ordered oldest first, not filtered by mailbox; limit is clamped to 1..200. Use to see the full back-and-forth.
+    """
+    try:
+        limit = min(max(1, limit), 200)  # clamp to 1..200
+        with pg_conn() as pg, pg.cursor() as cur:
+            cur.execute("""
+                SELECT mailbox, message_id, subject, sender_email, sender_name,
+                       to_addrs, received_at, folder_path, body_length, has_attachments,
+                       attachments_summary
+                FROM emails
+                WHERE conversation_id = %s AND ok = TRUE
+                ORDER BY received_at ASC NULLS LAST
+                LIMIT %s
+            """, (conversation_id, limit))
+            cols = [c.name for c in cur.description]
+            rows = [dict(zip(cols, r)) for r in cur.fetchall()]
+        for r in rows:
+            r["received_at"] = serialize(r["received_at"])  # datetime -> ISO string for the response
+        return {"conversation_id": conversation_id, "count": len(rows), "thread": rows}
+    except Exception as e:
+        log(traceback.format_exc())  # full traceback to stderr; the caller only gets the message
+        return {"error": str(e)}
+
+
+@mcp.tool()
+def find_attachment(
+    name_contains: str,
+    mailbox: Optional[Union[str, list]] = None,
+    since: Optional[str] = None,
+    limit: int = 30,
+) -> dict:
+    """Find emails whose attachment filename contains the substring (case-insensitive).
+    Use for "find emails with that protocol PDF" or "any invoice attachments".
+    Matches against attachments_summary; ordered by received_at DESC; limit is clamped to 1..200.
+    """
+    try:
+        mboxes = normalize_mailbox(mailbox)
+        since_dt = parse_since(since)
+        limit = min(max(1, limit), 200)  # clamp to 1..200
+        sql = """
+        SELECT mailbox, message_id, subject, sender_email, sender_name,
+               received_at, attachment_count, attachments_summary, folder_path
+        FROM emails
+        WHERE ok = TRUE
+          AND has_attachments = TRUE
+          AND attachments_summary ILIKE %(s)s
+          AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[]))
+          AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz)
+        ORDER BY received_at DESC NULLS LAST
+        LIMIT %(limit)s
+        """
+        with pg_conn() as pg, pg.cursor() as cur:
+            cur.execute(sql, {"s": f"%{name_contains}%",  # NOTE: % and _ in the input are not escaped, so they act as LIKE wildcards
+                              "mboxes": mboxes, "since": since_dt, "limit": limit})
+            cols = [c.name for c in cur.description]
+            rows = [dict(zip(cols, r)) for r in cur.fetchall()]
+        for r in rows:
+            r["received_at"] = serialize(r["received_at"])  # datetime -> ISO string for the response
+        return {"name_match": name_contains, "count": len(rows), "results": rows}
+    except Exception as e:
+        log(traceback.format_exc())  # full traceback to stderr; the caller only gets the message
+        return {"error": str(e)}
+
+
+@mcp.tool()
+def top_senders(
+    mailbox: Optional[Union[str, list]] = None,
+    since: Optional[str] = None,
+    limit: int = 20,
+) -> dict:
+    """Top senders by volume (count of received emails). Optionally limit by mailbox or date window.
+    Use for "who emails me most" or "top senders this month". limit is clamped to 1..100.
+    """
+    try:
+        mboxes = normalize_mailbox(mailbox)
+        since_dt = parse_since(since)
+        limit = min(max(1, limit), 100)  # clamp to 1..100
+        sql = """
+        SELECT sender_email, count(*) AS c, max(received_at) AS last_at
+        FROM emails
+        WHERE ok = TRUE AND sender_email IS NOT NULL
+          AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[]))
+          AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz)
+        GROUP BY sender_email
+        ORDER BY c DESC
+        LIMIT %(limit)s
+        """
+        with pg_conn() as pg, pg.cursor() as cur:
+            cur.execute(sql, {"mboxes": mboxes, "since": since_dt, "limit": limit})
+            rows = [{"sender_email": s, "count": c, "last_at": serialize(t)}  # last_at = newest received_at for that sender
+                    for s, c, t in cur.fetchall()]
+        return {"count": len(rows), "results": rows}
+    except Exception as e:
+        log(traceback.format_exc())  # full traceback to stderr; the caller only gets the message
+        return {"error": str(e)}
+
+
+@mcp.tool()
+def pipeline_status(mailbox: Optional[Union[str, list]] = None) -> dict:
+    """End-to-end status of the email-ingest pipeline per mailbox.
+
+    Reports, for each mailbox, where it stands in the 5-step pipeline:
+      1. parse_emails_graph         -> mongo_total
+      2. (refetch text bodies)      -> body_text_missing (legacy v1.3 emails)
+      3. download_attachments       -> attach_done / attach_pending
+                                       attach_missing  (404 — marked, won't retry)
+                                       attach_reference (OneDrive/SharePoint link, no content)
+      4. unwrap_smime               -> smime_p7m_total / smime_unwrapped / smime_pending
+                                       smime_p7s_count (informational; not unwrapped by design)
+      5. enrich_fulltext            -> pg_indexed
+
+    Plus:
+      - permanently_deleted (marked by delta sync)
+
+    Use this instead of running multiple Mongo count queries by hand. Returns
+    one row per mailbox; if `mailbox` is given, returns just those rows.
+    """
+    try:
+        mbs = normalize_mailbox(mailbox)
+        all_mb = [c for c in mongo[MONGO_DB].list_collection_names()
+                  if c not in SKIP_COLLECTIONS]
+        targets = [m for m in all_mb if (mbs is None or m in mbs)]  # requested names with no collection are silently dropped
+
+        # PG counts in one pass
+        pg_counts: dict[str, int] = {}
+        with pg_conn() as pg, pg.cursor() as cur:
+            cur.execute("SELECT mailbox, count(*) FROM emails "
+                        "WHERE ok = true GROUP BY mailbox")
+            for mb, c in cur.fetchall():
+                pg_counts[mb] = c
+
+        out = {}
+        for mb in targets:
+            col = mongo[MONGO_DB][mb]
+            mongo_total = col.estimated_document_count()  # estimate, unlike the count_documents calls below
+            with_att = col.count_documents({"has_attachments": True})
+            attach_pending = col.count_documents({  # emails with >= 1 non-inline attachment lacking file_hash and not flagged missing/reference
+                "has_attachments": True,
+                "attachments": {"$elemMatch": {
+                    "is_inline": False,
+                    "file_hash": {"$exists": False},
+                    "attachment_missing": {"$ne": True},
+                    "attachment_reference": {"$ne": True},
+                }},
+            })
+            attach_missing = col.count_documents({
+                "attachments.attachment_missing": True,
+            })
+            attach_reference = col.count_documents({
+                "attachments.attachment_reference": True,
+            })
+            attach_done = with_att - attach_pending - attach_missing - attach_reference  # NOTE(review): these are per-email counts that can overlap, so this may undercount or go negative
+
+            smime_p7m_total = col.count_documents(
+                {"attachments.filename": {"$regex": r"^smime\.p7m$", "$options": "i"}}
+            )
+            smime_unwrapped = col.count_documents({
+                "attachments.filename": {"$regex": r"^smime\.p7m$", "$options": "i"},
+                "smime_unwrapped": True,
+            })
+            smime_p7s_count = col.count_documents(
+                {"attachments.filename": {"$regex": r"^smime\.p7s$", "$options": "i"}}
+            )
+
+            body_text_missing = col.count_documents({  # no HTML body, no body_text field, but a graph_id is present
+                "body_html": {"$in": [None, ""]},
+                "body_text": {"$exists": False},
+                "graph_id": {"$exists": True},
+            })
+
+            permanently_deleted = col.count_documents({"permanently_deleted": True})
+
+            out[mb] = {
+                "mongo_total":         mongo_total,
+                "with_attachments":    with_att,
+                "attach_done":         attach_done,
+                "attach_pending":      attach_pending,
+                "attach_missing":      attach_missing,
+                "attach_reference":    attach_reference,
+                "smime_p7m_total":     smime_p7m_total,
+                "smime_unwrapped":     smime_unwrapped,
+                "smime_pending":       smime_p7m_total - smime_unwrapped,
+                "smime_p7s_count":     smime_p7s_count,
+                "body_text_missing":   body_text_missing,
+                "pg_indexed":          pg_counts.get(mb, 0),  # 0 when the mailbox has no ok rows in Postgres
+                "permanently_deleted": permanently_deleted,
+            }
+        return {"mailboxes": out}
+    except Exception as e:
+        log(traceback.format_exc())  # full traceback to stderr; the caller only gets the message
+        return {"error": str(e)}
+
+
+@mcp.tool()
+def sync_state_overview(mailbox: Optional[Union[str, list]] = None) -> dict:
+    """Delta-sync state across mailboxes (collection `emaily.sync_state`).
+
+    For each (mailbox, folder) pair shows: deltaLink present?, last_run_at,
+    cumulative new/sync/removed/run_count. Use to confirm a mailbox is
+    incrementally synced and to spot folders that haven't run in a while.
+    """
+    try:
+        sync_col = mongo[MONGO_DB]["sync_state"]
+        q: dict = {}  # empty filter = every mailbox
+        mbs = normalize_mailbox(mailbox)
+        if mbs:
+            q["mailbox"] = {"$in": mbs}
+        cursor = sync_col.find(q, {
+            "mailbox": 1, "folder_path": 1, "folder_id": 1,
+            "delta_link": 1, "last_run_at": 1,
+            "cumulative_new": 1, "cumulative_sync": 1,
+            "cumulative_removed": 1, "run_count": 1,
+        }).sort([("mailbox", 1), ("folder_path", 1)])
+
+        by_mailbox: dict[str, list] = {}
+        for d in cursor:
+            row = {
+                "folder_path":        d.get("folder_path"),
+                "folder_id":          d.get("folder_id"),
+                "has_delta_link":     bool(d.get("delta_link")),  # only presence is reported, never the link itself
+                "last_run_at":        serialize(d.get("last_run_at")),
+                "cumulative_new":     d.get("cumulative_new", 0),
+                "cumulative_sync":    d.get("cumulative_sync", 0),
+                "cumulative_removed": d.get("cumulative_removed", 0),
+                "run_count":          d.get("run_count", 0),
+            }
+            by_mailbox.setdefault(d["mailbox"], []).append(row)  # NOTE(review): a sync_state document without "mailbox" raises KeyError and fails the whole call
+
+        # mailboxes that have collections but ZERO sync_state entries
+        all_mb = {c for c in mongo[MONGO_DB].list_collection_names()
+                  if c not in SKIP_COLLECTIONS}
+        not_synced = sorted(all_mb - set(by_mailbox.keys()))
+        if mbs:
+            not_synced = [m for m in not_synced if m in mbs]  # restrict to the mailboxes that were asked about
+        return {
+            "mailboxes":          by_mailbox,
+            "never_delta_synced": not_synced,
+        }
+    except Exception as e:
+        log(traceback.format_exc())  # full traceback to stderr; the caller only gets the message
+        return {"error": str(e)}
+
+
+if __name__ == "__main__":  # the Mongo/Postgres checks above already ran at import time
+    log("MCP emaily server started (FastMCP)")
+    mcp.run()  # FastMCP default transport; the module docstring describes it as stdio
diff --git a/Python-runner/0_run_pipeline_v1.0.md b/Python-runner/0_run_pipeline_v1.0.md
new file mode 100644
index 0000000..5f45d12
--- /dev/null
+++ b/Python-runner/0_run_pipeline_v1.0.md
@@ -0,0 +1,77 @@
+# 0_run_pipeline_v1.0.py
+
+**Wrapper kolem celé emailové pipeline.** Spustí postupně všechny 4 kroky daily syncu, vždy přes všechny dostupné schránky:
+
+| # | Krok | Skript |
+|---|---|---|
+| 1b | Graph delta sync (inkrementální Mongo update) | `1b_parse_emails_graph_delta_v1.0.py` |
+| 3  | Download attachments | `3_download_attachments_v1.4.py` |
+| 4  | Unwrap S/MIME | `4_unwrap_smime_v1.0.py` |
+| 5  | Enrich fulltext (PG) | `5_enrich_fulltext_emails_v1.3.py` |
+
+## Politika chyb
+
+Default je **continue-on-error** — když některý krok selže, pipeline pokračuje dalším (downstream se nezasekne kvůli minor problému). Po vyběhnutí dostaneš souhrnnou tabulku s `OK / FAIL(N)` per krok.
+
+Použij `--stop-on-error`, pokud chceš tvrdý abort při první chybě.
+
+## Logování
+
+Každý krok jde do vlastního logu v `/scripts/pipeline_<id>.log`:
+- `pipeline_1b.log`
+- `pipeline_3.log`
+- `pipeline_4.log`
+- `pipeline_5.log`
+
+Live výstup se zároveň tee-uje na konzoli (vypneš přes `--quiet`).
+
+## Argumenty
+
+| Argument | Hodnoty | Popis |
+|---|---|---|
+| `--only` | `1b 3 4 5` | Spustit jen tyto kroky |
+| `--skip` | `1b 3 4 5` | Přeskočit tyto kroky |
+| `--stop-on-error` | flag | Zastavit při první chybě (default: pokračovat) |
+| `--quiet` | flag | Necpat stdout na konzoli (zůstane v logu) |
+
+## Varianty volání
+
+```bash
+# Daily run — vše, všechny schránky:
+docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py
+
+# Jen enrich (např. po manuálním zásahu do Mongo):
+docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py --only 5
+
+# Vše mimo S/MIME (krok 4 občas vyžaduje pip install asn1crypto):
+docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py --skip 4
+
+# Test daily sync bez fulltextu:
+docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py --only 1b 3 4
+
+# Na pozadí, master log:
+docker exec -d python-runner bash -c "python /scripts/0_run_pipeline_v1.0.py --quiet > /scripts/pipeline_master.log 2>&1"
+docker exec -it python-runner tail -f /scripts/pipeline_master.log
+```
+
+## Cron / nightly automation
+
+Pro nightly se hodí jednoduchý cron na Unraidu (`/etc/cron.daily/` nebo User Scripts plugin):
+
+```bash
+#!/bin/bash
+docker exec python-runner python /scripts/0_run_pipeline_v1.0.py --quiet \
+    > /mnt/user/Scripts/pipeline_$(date +%Y%m%d).log 2>&1
+```
+
+Stačí denně, delta sync z minulého stavu trvá ~30s s prázdným backlogem.
+
+## Exit kódy wrapperu
+
+| Kód | Význam |
+|---|---|
+| 0 | Všechny kroky OK |
+| 1 | Alespoň jeden krok selhal |
+| 2 | Žádný krok k běhu (--only + --skip vyloučily vše) |
+| 127 | Některý skript neexistuje v `/scripts/` |
+| 130 | Přerušeno Ctrl+C |
diff --git a/Python-runner/0_run_pipeline_v1.0.py b/Python-runner/0_run_pipeline_v1.0.py
new file mode 100644
index 0000000..eec6988
--- /dev/null
+++ b/Python-runner/0_run_pipeline_v1.0.py
@@ -0,0 +1,176 @@
+"""
+==============================================================================
+Skript:   0_run_pipeline_v1.0.py
+Verze:    1.0
+Datum:    2026-06-04
+Autor:    vladimir.buzalka
+
+Popis:
+  Wrapper kolem cele emailove pipeline. Spousti postupne:
+    1b. parse_emails_graph_delta  -> delta sync z Graph API do Mongo
+     3. download_attachments      -> stahne pripojene soubory
+     4. unwrap_smime              -> rozbali S/MIME wrapper zpravy
+     5. enrich_fulltext_emails    -> doindexuje do PG fulltext
+
+  Vzdy projizdi VSECHNY schranky (mimo SKIP_MAILBOXES v jednotlivych skriptech).
+  Pro kazdy krok se meri cas a exit code. Pokud krok selze, default pokracuje dal
+  (aby se downstream nezasekl) — viz --stop-on-error.
+
+  Vsechny vystupy a chyby kazdeho kroku jsou ulozeny do /scripts/pipeline_<step>.log
+
+Spousteni:
+  python 0_run_pipeline_v1.0.py                          # vse, vsechny schranky
+  python 0_run_pipeline_v1.0.py --only 5                 # jen krok 5 (enrich)
+  python 0_run_pipeline_v1.0.py --skip 4                 # vse mimo smime unwrap
+  python 0_run_pipeline_v1.0.py --stop-on-error          # zastavit pri prvni chybe
+  python 0_run_pipeline_v1.0.py --quiet                  # bez tee na konzoli, jen logy
+
+Docker:
+  docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import subprocess
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+
+SCRIPTS_DIR = Path("/scripts")  # directory the step scripts are looked up in
+LOGS_DIR    = SCRIPTS_DIR  # per-step logs are written into the same directory
+
+# Pipeline definition in execution order: (step_id, label, script filename)
+STEPS = [
+    ("1b", "Graph delta sync",      "1b_parse_emails_graph_delta_v1.0.py"),
+    ("3",  "Download attachments",  "3_download_attachments_v1.4.py"),
+    ("4",  "Unwrap S/MIME",         "4_unwrap_smime_v1.0.py"),
+    ("5",  "Enrich fulltext (PG)",  "5_enrich_fulltext_emails_v1.3.py"),
+]
+
+
+def fmt_dur(s: float) -> str:
+    if s < 60:
+        return f"{s:.1f}s"
+    m, s = divmod(int(s), 60)
+    if m < 60:
+        return f"{m}m{s:02d}s"
+    h, m = divmod(m, 60)
+    return f"{h}h{m:02d}m{s:02d}s"
+
+
+def run_step(step_id: str, label: str, script: str, *,
+             quiet: bool = False) -> tuple[int, float]:
+    script_path = SCRIPTS_DIR / script
+    log_path    = LOGS_DIR / f"pipeline_{step_id}.log"
+
+    if not script_path.exists():
+        print(f"  CHYBA: {script_path} neexistuje!")
+        return 127, 0.0
+
+    print(f"\n{'='*70}")
+    print(f"  KROK {step_id}: {label}")
+    print(f"  script: {script_path}")
+    print(f"  log:    {log_path}")
+    print(f"  start:  {datetime.now().strftime('%H:%M:%S')}")
+    print(f"{'='*70}")
+
+    t0 = time.time()
+
+    # Tee: zaroven do konzole i do logu (pokud ne --quiet)
+    with open(log_path, "w", encoding="utf-8") as logf:
+        proc = subprocess.Popen(
+            [sys.executable, str(script_path)],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
+            encoding="utf-8",
+            errors="replace",
+        )
+        for line in proc.stdout:
+            logf.write(line)
+            if not quiet:
+                print(line, end="", flush=True)
+        ret = proc.wait()
+
+    dur = time.time() - t0
+    print(f"\n  KROK {step_id} {'OK' if ret == 0 else f'FAILED ({ret})'} za {fmt_dur(dur)}")
+    return ret, dur
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Email pipeline wrapper v1.0")
+    ap.add_argument("--only", nargs="+", default=None,
+                    help="Spustit jen tyto kroky (napr. --only 3 4 5)")
+    ap.add_argument("--skip", nargs="+", default=None,
+                    help="Preskocit tyto kroky")
+    ap.add_argument("--stop-on-error", action="store_true",
+                    help="Zastavit pipeline pri prvni nenulovem exit kodu")
+    ap.add_argument("--quiet", action="store_true",
+                    help="Necpat stdout kroku na konzoli, jen do logu")
+    args = ap.parse_args()
+
+    # Filter step set
+    only_set = set(args.only) if args.only else None
+    skip_set = set(args.skip) if args.skip else set()
+
+    to_run = []
+    for sid, label, script in STEPS:
+        if only_set and sid not in only_set:
+            continue
+        if sid in skip_set:
+            continue
+        to_run.append((sid, label, script))
+
+    if not to_run:
+        print("Zadny krok k spusteni.")
+        return 2
+
+    print(f"=== Email Pipeline Wrapper v1.0 ===")
+    print(f"Start: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"Kroku k spusteni: {len(to_run)}")
+    for sid, label, _ in to_run:
+        print(f"  {sid}: {label}")
+    if args.stop_on_error:
+        print("Politika: stop-on-error")
+    else:
+        print("Politika: continue-on-error (default)")
+
+    t_all = time.time()
+    results = []
+
+    for sid, label, script in to_run:
+        ret, dur = run_step(sid, label, script, quiet=args.quiet)
+        results.append((sid, label, ret, dur))
+        if ret != 0 and args.stop_on_error:
+            print(f"\n!!! Pipeline zastavena na kroku {sid} (--stop-on-error)")
+            break
+
+    total_dur = time.time() - t_all
+
+    print(f"\n{'='*70}")
+    print("=== SHRNUTI PIPELINE ===")
+    print(f"{'='*70}")
+    failed = 0
+    for sid, label, ret, dur in results:
+        status = "OK" if ret == 0 else f"FAIL({ret})"
+        if ret != 0:
+            failed += 1
+        print(f"  [{sid:>2}] {label:30} {status:>8}  {fmt_dur(dur):>10}")
+    print(f"{'='*70}")
+    print(f"  Celkem: {len(results)} kroku, {failed} chyb, {fmt_dur(total_dur)}")
+    print(f"  Konec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"  Per-krok logy: {LOGS_DIR}/pipeline_<id>.log")
+
+    return 1 if failed else 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())
+    except KeyboardInterrupt:
+        print("\nPreruseno uzivatelem")
+        sys.exit(130)
diff --git a/Python-runner/1_parse_emails_graph_v1.4.md b/Python-runner/1_parse_emails_graph_v1.4.md
new file mode 100644
index 0000000..9dc59c4
--- /dev/null
+++ b/Python-runner/1_parse_emails_graph_v1.4.md
@@ -0,0 +1,41 @@
+# 1_parse_emails_graph_v1.4.py
+
+**Krok 1 pipeline** — import emailů z libovolné schránky přes Microsoft Graph API do MongoDB (`emaily.<mailbox>`). Čte všechny složky rekurzivně. Upsert podle Message-ID → bezpečné přerušit a opakovat.
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | ano | e-mail | — | Schránka = název kolekce v Mongo |
+| `--mode` | ne | `full` / `new-only` / `sync` | `full` | `full` = plný upsert; `new-only` = jen nové; `sync` = aktualizuje `is_read`/`flag_status`/`categories`/`folder_path` u existujících + importuje nové |
+| `--folder` | ne | název složky | (všechny) | Jen jedna složka (např. `Inbox`) |
+| `--limit N` | ne | int | 0 (bez limitu) | Zpracuje jen prvních N zpráv (test) |
+| `--no-indexes` | ne | flag | false | Nevytváří indexy na konci |
+
+## Varianty volání
+
+```bash
+# První plný import schránky (vše):
+docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz
+
+# Test na 50 zprávách bez vytváření indexů:
+docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
+
+# Jen nové emaily (po prvním importu):
+docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode new-only
+
+# Pravidelný sync (nové + aktualizace flagů u existujících) na pozadí, log do souboru:
+docker exec -d python-runner bash -c "python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode sync > /scripts/parse_emails.log 2>&1"
+
+# Import jen složky Inbox:
+docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --folder Inbox
+
+# Test 10 emailů z konkrétní složky:
+docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --folder "Sent Items" --limit 10
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/parse_emails.log
+```
diff --git a/Python-runner/1_parse_emails_graph_v1.4.py b/Python-runner/1_parse_emails_graph_v1.4.py
new file mode 100644
index 0000000..6836f9a
--- /dev/null
+++ b/Python-runner/1_parse_emails_graph_v1.4.py
@@ -0,0 +1,624 @@
+"""
+parse_emails_graph_v1.4.py
+Nazev:  parse_emails_graph_v1.4.py
+Verze:  1.4
+Datum:  2026-06-03
+Autor:  vladimir.buzalka
+
+Popis:
+    Cte vsechny emaily z libovolne schranky primo pres Microsoft Graph API
+    a importuje je jako dokumenty do MongoDB.
+    Z kazde zpravy extrahuje vsechny dostupne vlastnosti:
+
+        - predmet, odesilatel, prijemci (To/CC/BCC s typy)
+        - cas doruceni, odeslani, vytvoreni, modifikace (UTC)
+        - telo HTML (max 2 MB) + textovy preview
+        - prilohy (metadata: jmeno, velikost, MIME typ, inline flag, graph_att_id)
+        - internet headers (SPF, DKIM, Received, X-*, ...)
+        - MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno,
+          kategorie, In-Reply-To, References, ...
+        - navic: isRead, isDraft, folder_path, inferenceClassification
+
+    Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted,
+    archivni slozky, ...).
+
+    DB:       emaily
+    Kolekce:  <mailbox> (napr. ordinace@buzalkova.cz)
+    _id:      Internet Message-ID (nebo "graphid:<id>" jako fallback)
+
+    POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
+
+Spousteni:
+    # Prvni import (vsechno):
+    python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz
+
+    # Test na prvnich 50:
+    python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
+
+    # Jen jedna slozka:
+    python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --folder Inbox
+
+    # Pokracovani po preruseni (pouze nove):
+    python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode new-only
+
+    # Pravidelny sync (aktualizuje is_read, flag, slozku; importuje nove):
+    python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode sync
+
+    # Jina schranka:
+    python 1_parse_emails_graph_v1.4.py --mailbox vladimir.buzalka@buzalka.cz
+
+Rezimy (--mode):
+    full      Plny upsert vsech poli pro kazdou zpravu (vychozi)
+    new-only  Preskoci zpravy ktere uz jsou v MongoDB, importuje jen nove
+    sync      Existujici: aktualizuje jen is_read/flag_status/categories/
+              modified_at/folder_path. Nove zpravy importuje cely.
+              Idealni pro pravidelne spousteni.
+
+Zavislosti:
+    msal, requests, pymongo, python-dateutil
+    Python 3.10+
+
+Struktura dokumentu v MongoDB:
+    _id                     Internet Message-ID (nebo graphid: fallback)
+    graph_id                Graph API message ID
+    subject                 predmet zpravy
+    normalized_subject      predmet bez RE:/FW:/AW: prefixu
+    importance              0=nizka 1=normalni 2=vysoka
+    flag_status             0=bez priznaku 1=oznaceno 2=dokonceno
+    is_read                 bool — aktualni stav precteni ve schrance
+    is_draft                bool
+    has_attachments         bool
+    attachment_count        int
+    inference_classification focused / other
+    categories              [str]
+    conversation_id         Graph conversationId
+    conversation_index      base64 conversationIndex
+    conversation_topic      tema vlakna (z internet headers Thread-Topic)
+    in_reply_to             Message-ID predchozi zpravy
+    internet_references     [Message-ID]
+    received_at             datetime UTC
+    sent_at                 datetime UTC
+    created_at              datetime UTC
+    modified_at             datetime UTC
+    folder_id               Graph parentFolderId
+    folder_path             cela cesta slozky (napr. Inbox/Subfolder)
+    sender.email            emailova adresa odesilatele
+    sender.name             zobrazovane jmeno
+    to                      retezec To (joined)
+    cc                      retezec CC
+    bcc                     retezec BCC
+    recipients              [{type, email, name}]
+    body_html               HTML telo (pokud contentType=='html', max 2 MB)
+    body_text               plain-text telo (pokud contentType=='text', max 2 MB)
+    body_preview            textovy nahled z Graph bodyPreview (max 255 znaku)
+    attachments             [{filename, size_bytes, mime_type, is_inline, graph_att_id}]
+    headers                 dict internet headers
+    parsed_at               datetime UTC
+
+Indexy:
+    received_at, sent_at, sender.email, graph_id (unique),
+    conversation_id, folder_path, has_attachments, categories,
+    importance, flag_status, is_read,
+    text_search (subject + body_preview + to + cc)
+
+Historie verzi:
+    1.0  2026-06-02  Inicialni verze
+    1.1  2026-06-02  Pridany rezimy --mode full/new-only/sync;
+                     odstranen --skip-existing (nahrazen --mode new-only)
+    1.2  2026-06-02  $expand attachments s $select (bez contentBytes — rychlejsi);
+                     prilohy ukladaji graph_att_id pro prime stazeni bez name-matchingu
+    1.3  2026-06-02  --mailbox jako povinny parametr — univerzalni pouziti pro
+                     libovolnou schranku; kolekce v MongoDB = nazev schranky
+    1.4  2026-06-03  Plain-text emaily (contentType=='text') se ukladaji do
+                     noveho pole body_text (max 2 MB), drive se truncovalo na
+                     2000 znaku do body_preview a zbytek se zahazoval.
+                     body_preview ted obsahuje vzdy puvodni Graph bodyPreview.
+                     Pro existujici emaily z v1.3 lze pouzit
+                     2_refetch_text_bodies_v1.0.py.
+"""
+
+import sys
+import re
+import logging
+import argparse
+import base64
+from pathlib import Path
+from datetime import datetime, timezone
+from typing import Optional
+
+import msal
+import requests
+from dateutil import parser as dtparser
+from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT
+
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
+# ─── CONFIGURATION — SECURITY: credentials below are hard-coded; rotate the secret and load it from the environment ───
+GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
+GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
+GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
+GRAPH_URL           = "https://graph.microsoft.com/v1.0"
+
+MONGO_URI      = "mongodb://192.168.1.76:27017"
+MONGO_DB       = "emaily"
+BATCH_SIZE     = 100
+PAGE_SIZE      = 50
+LOG_FILE       = Path(__file__).parent / "parse_emails_errors.log"
+SCRIPT_VERSION = "1.4"
+
+# Schránka se nastavuje za behu z --mailbox parametru
+GRAPH_MAILBOX: str = ""
+# ──────────────────────────────────────────────────────────────────────────────
+
+logging.basicConfig(  # only ERROR-level records are written to LOG_FILE
+    filename=str(LOG_FILE),
+    level=logging.ERROR,
+    format="%(asctime)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    encoding="utf-8",
+)
+
+IMPORTANCE_MAP  = {"low": 0, "normal": 1, "high": 2}  # Graph importance string -> stored int
+FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2}  # Graph flagStatus string -> stored int
+RE_SUBJECT      = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE)  # one leading reply/forward prefix followed by ':' or whitespace
+
+# $expand attachments without contentBytes — only the metadata we need
+ATT_EXPAND = "attachments($select=id,name,contentType,size,isInline)"
+
+MSG_SELECT = (  # field list requested when not in sync mode
+    "id,internetMessageId,subject,bodyPreview,body,"
+    "importance,isRead,isDraft,hasAttachments,"
+    "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
+    "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
+    "conversationId,conversationIndex,parentFolderId,"
+    "categories,flag,inferenceClassification,internetMessageHeaders"
+)
+
+MSG_SELECT_SYNC = (  # reduced field list requested in sync mode
+    "id,internetMessageId,isRead,isDraft,flag,categories,"
+    "lastModifiedDateTime,parentFolderId,importance"
+)
+
+
+# ─── Graph API helpers ────────────────────────────────────────────────────────
+
+_graph_token: Optional[str] = None
+
+
+def get_token() -> str:
+    global _graph_token
+    app = msal.ConfidentialClientApplication(
+        GRAPH_CLIENT_ID,
+        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
+        client_credential=GRAPH_CLIENT_SECRET,
+    )
+    result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+    if "access_token" not in result:
+        raise RuntimeError(f"Graph auth failed: {result}")
+    _graph_token = result["access_token"]
+    return _graph_token
+
+
+def graph_get(url: str, params: dict = None) -> dict:
+    global _graph_token
+    if not _graph_token:
+        get_token()
+    for attempt in range(2):
+        r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30)
+        if r.status_code == 401:
+            get_token()
+            continue
+        r.raise_for_status()
+        return r.json()
+    raise RuntimeError(f"Graph GET failed after retry: {url}")
+
+
+def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]:
+    """Rekurzivne nacte vsechny slozky schranky. Vraci [{id, path}]."""
+    if parent_id is None:
+        url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders"
+    else:
+        url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{parent_id}/childFolders"
+
+    folders = []
+    params = {"$top": 100, "$select": "id,displayName,childFolderCount"}
+    while url:
+        data = graph_get(url, params)
+        for f in data.get("value", []):
+            path = f"{parent_path}/{f['displayName']}".lstrip("/")
+            folders.append({"id": f["id"], "path": path})
+            if f.get("childFolderCount", 0) > 0:
+                folders.extend(get_all_folders(f["id"], path))
+        url = data.get("@odata.nextLink")
+        params = None
+    return folders
+
+
+def iter_folder_messages(folder_id: str, select: str = MSG_SELECT, expand_attachments: bool = True):
+    """Generator: vraci zpravy ze slozky po strankach."""
+    url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages"
+    params = {"$top": PAGE_SIZE, "$select": select}
+    if expand_attachments:
+        params["$expand"] = ATT_EXPAND
+    while url:
+        data = graph_get(url, params)
+        for msg in data.get("value", []):
+            yield msg
+        url = data.get("@odata.nextLink")
+        params = None
+
+
+# ─── Pomocné funkce ───────────────────────────────────────────────────────────
+
+def parse_date(raw) -> Optional[datetime]:
+    if raw is None:
+        return None
+    if isinstance(raw, datetime):
+        if raw.tzinfo:
+            return raw.astimezone(timezone.utc).replace(tzinfo=None)
+        return raw
+    try:
+        dt = dtparser.parse(str(raw))
+        if dt.tzinfo:
+            return dt.astimezone(timezone.utc).replace(tzinfo=None)
+        return dt
+    except Exception:
+        return None
+
+
+def normalize_subject(subject: str) -> str:
+    s = subject.strip()
+    while True:
+        m = RE_SUBJECT.match(s)
+        if not m:
+            break
+        s = s[m.end():].strip()
+    return s
+
+
+def parse_headers(raw_headers: list) -> dict:
+    result = {}
+    for h in raw_headers:
+        k = h["name"].lower().replace("-", "_")
+        v = h["value"]
+        if k in result:
+            existing = result[k]
+            result[k] = existing + [v] if isinstance(existing, list) else [existing, v]
+        else:
+            result[k] = v
+    return result
+
+
+def format_recipients(lst: list) -> str:
+    return "; ".join(
+        f'{r["emailAddress"].get("name", "")} <{r["emailAddress"].get("address", "")}>'.strip()
+        for r in lst
+    )
+
+
+# ─── Extrakce zprávy ─────────────────────────────────────────────────────────
+
+def extract_message(msg: dict, folder_path: str) -> Optional[dict]:
+    """Plna extrakce — pouziva se pro mode full a nove zpravy v sync/new-only."""
+    try:
+        mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
+        subject = msg.get("subject") or ""
+
+        body_html = None
+        body_text = None
+        body_preview = msg.get("bodyPreview") or ""
+        body = msg.get("body", {})
+        _MAX_BODY = 2 * 1024 * 1024  # 2 MB
+        if body.get("contentType") == "html":
+            content = body.get("content") or ""
+            body_html = content if len(content) <= _MAX_BODY else content[:_MAX_BODY]
+        elif body.get("contentType") == "text":
+            content = body.get("content") or ""
+            # v1.4: ulozime PLNY plain text do body_text (drive se truncovalo na 2000 znaku
+            # do body_preview a zbytek se zahodil)
+            body_text = content if len(content) <= _MAX_BODY else content[:_MAX_BODY]
+
+        sender_ea    = (msg.get("from") or msg.get("sender") or {}).get("emailAddress", {})
+        to_list      = msg.get("toRecipients", [])
+        cc_list      = msg.get("ccRecipients", [])
+        bcc_list     = msg.get("bccRecipients", [])
+
+        recipients = (
+            [{"type": "to",  "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in to_list] +
+            [{"type": "cc",  "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in cc_list] +
+            [{"type": "bcc", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in bcc_list]
+        )
+
+        importance  = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1)
+        flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0)
+
+        raw_headers   = msg.get("internetMessageHeaders") or []
+        headers       = parse_headers(raw_headers)
+
+        in_reply_to = headers.get("in_reply_to", "")
+        if isinstance(in_reply_to, list):
+            in_reply_to = in_reply_to[0]
+
+        refs_raw = headers.get("references", "")
+        if isinstance(refs_raw, list):
+            refs_raw = " ".join(refs_raw)
+        internet_refs = [r.strip() for r in refs_raw.split() if r.strip()] if refs_raw else []
+
+        conv_topic = headers.get("thread_topic", "")
+        if isinstance(conv_topic, list):
+            conv_topic = conv_topic[0]
+
+        conv_index = ""
+        ci_raw = msg.get("conversationIndex")
+        if ci_raw:
+            try:
+                conv_index = base64.b64encode(base64.b64decode(ci_raw)).decode()
+            except Exception:
+                conv_index = ci_raw
+
+        attachments = []
+        for att in msg.get("attachments") or []:
+            fname = att.get("name") or ""
+            if not fname:
+                continue
+            attachments.append({
+                "filename":     fname,
+                "size_bytes":   att.get("size", 0),
+                "mime_type":    att.get("contentType", "application/octet-stream"),
+                "is_inline":    att.get("isInline", False),
+                "graph_att_id": att.get("id"),
+            })
+
+        return {
+            "_id":      mid,
+            "graph_id": msg["id"],
+
+            "subject":            subject,
+            "normalized_subject": normalize_subject(subject),
+            "importance":         importance,
+            "flag_status":        flag_status,
+            "is_read":            msg.get("isRead", False),
+            "is_draft":           msg.get("isDraft", False),
+            "has_attachments":    msg.get("hasAttachments", False),
+            "attachment_count":   len(attachments),
+            "inference_classification": msg.get("inferenceClassification", ""),
+            "categories":         msg.get("categories") or [],
+
+            "conversation_id":     msg.get("conversationId", ""),
+            "conversation_index":  conv_index,
+            "conversation_topic":  conv_topic,
+            "in_reply_to":         in_reply_to,
+            "internet_references": internet_refs,
+
+            "received_at": parse_date(msg.get("receivedDateTime")),
+            "sent_at":     parse_date(msg.get("sentDateTime")),
+            "created_at":  parse_date(msg.get("createdDateTime")),
+            "modified_at": parse_date(msg.get("lastModifiedDateTime")),
+
+            "folder_id":   msg.get("parentFolderId", ""),
+            "folder_path": folder_path,
+
+            "sender": {
+                "email": sender_ea.get("address", ""),
+                "name":  sender_ea.get("name", ""),
+            },
+            "to":         format_recipients(to_list),
+            "cc":         format_recipients(cc_list),
+            "bcc":        format_recipients(bcc_list),
+            "recipients": recipients,
+
+            "body_html":    body_html,
+            "body_text":    body_text,
+            "body_preview": body_preview,
+
+            "attachments": attachments,
+            "headers":     headers,
+
+            "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
+        }
+
+    except Exception as e:
+        logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e)
+        return None
+
+
+def extract_sync_fields(msg: dict, folder_path: str) -> dict:
+    """Jen menitelna pole — pouziva se v sync mode pro existujici zpravy."""
+    return {
+        "is_read":    msg.get("isRead", False),
+        "is_draft":   msg.get("isDraft", False),
+        "flag_status": FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0),
+        "importance":  IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1),
+        "categories":  msg.get("categories") or [],
+        "modified_at": parse_date(msg.get("lastModifiedDateTime")),
+        "folder_id":   msg.get("parentFolderId", ""),
+        "folder_path": folder_path,
+        "parsed_at":   datetime.now(timezone.utc).replace(tzinfo=None),
+    }
+
+
+# ─── MongoDB indexy ───────────────────────────────────────────────────────────
+
+def create_indexes(col):
+    print("  Vytvarim indexy...")
+    col.create_index([("received_at",     ASCENDING)])
+    col.create_index([("sent_at",         ASCENDING)])
+    col.create_index([("sender.email",    ASCENDING)])
+    col.create_index([("graph_id",        ASCENDING)], unique=True, sparse=True)
+    col.create_index([("conversation_id", ASCENDING)])
+    col.create_index([("folder_path",     ASCENDING)])
+    col.create_index([("has_attachments", ASCENDING)])
+    col.create_index([("categories",      ASCENDING)])
+    col.create_index([("importance",      ASCENDING)])
+    col.create_index([("flag_status",     ASCENDING)])
+    col.create_index([("is_read",         ASCENDING)])
+    col.create_index([
+        ("subject",      TEXT),
+        ("body_preview", TEXT),
+        ("to",           TEXT),
+        ("cc",           TEXT),
+    ], name="text_search", default_language="none")
+    print("  Indexy hotovy.")
+
+
+# ─── MAIN ─────────────────────────────────────────────────────────────────────
+
+def main():
+    """Import one mailbox from Microsoft Graph into MongoDB (CLI entry point).
+
+    Exits with status 1 when the Graph token or the MongoDB ping fails.
+    """
+    global GRAPH_MAILBOX
+
+    ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}")
+    ap.add_argument("--mailbox",    required=True,
+                    help="Emailova schranka (napr. ordinace@buzalkova.cz)")
+    ap.add_argument("--mode", default="full", choices=["full", "new-only", "sync"],
+                    help="full=plny upsert (vychozi) | new-only=jen nove zpravy | "
+                         "sync=existujici aktualizuje jen menitelna pole, nove importuje cely")
+    ap.add_argument("--limit",      type=int, default=0, help="Zpracovat max N zprav (0 = vse)")
+    ap.add_argument("--folder",     default="",
+                    help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)")
+    ap.add_argument("--no-indexes", action="store_true", help="Nevytvorit indexy na konci")
+    args = ap.parse_args()
+
+    GRAPH_MAILBOX = args.mailbox
+    mongo_col     = args.mailbox
+
+    start = datetime.now()
+    print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===")
+    print(f"Start:    {start.strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"Schránka: {GRAPH_MAILBOX}")
+    print(f"MongoDB:  {MONGO_URI} -> {MONGO_DB}.{mongo_col}")
+    print(f"Režim:    {args.mode}")
+
+    print("\nPřipojuji se k Graph API...")
+    try:
+        get_token()
+        print("  Graph API OK")
+    except Exception as e:
+        print(f"  CHYBA: {e}")
+        sys.exit(1)
+
+    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    try:
+        client.admin.command("ping")
+        print("  MongoDB OK")
+    except Exception as e:
+        print(f"  CHYBA: MongoDB neni dostupna -- {e}")
+        sys.exit(1)
+    col = client[MONGO_DB][mongo_col]
+
+    existing: set = set()
+    if args.mode in ("new-only", "sync"):
+        print("  Nacitam existujici zaznamy z MongoDB...")
+        existing = {d["_id"] for d in col.find({}, {"_id": 1})}  # distinct() is capped at 16 MB
+        print(f"  {len(existing)} jiz importovano")
+
+    print("\nNacitam seznam slozek...")
+    all_folders = get_all_folders()
+    if args.folder:
+        all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()]
+    print(f"  Slozek ke zpracovani: {len(all_folders)}")
+    for f in all_folders:
+        print(f"    {f['path']}")
+
+    is_sync    = args.mode == "sync"
+    msg_select = MSG_SELECT_SYNC if is_sync else MSG_SELECT
+    expand_att = not is_sync
+
+    batch = []
+    ok_count = sync_count = err_count = skip_count = total_i = 0
+
+    def flush():
+        nonlocal err_count
+        if not batch:
+            return
+        try:
+            col.bulk_write(batch, ordered=False)
+        except Exception as e:
+            err_count += 1  # one error per failed batch, so the summary and log hint show it
+            logging.error("bulk_write: %s", e)
+            print(f"  CHYBA bulk_write: {e}")
+        batch.clear()
+
+    print()
+    for folder in all_folders:
+        print(f"--- Složka: {folder['path']} ---")
+        folder_count = 0
+
+        for msg in iter_folder_messages(folder["id"], select=msg_select, expand_attachments=expand_att):
+            if args.limit and total_i >= args.limit:
+                break
+
+            mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
+            total_i += 1
+            folder_count += 1
+
+            if args.mode == "new-only" and mid in existing:
+                skip_count += 1
+                continue
+
+            if is_sync and mid in existing:
+                fields = extract_sync_fields(msg, folder["path"])
+                batch.append(UpdateOne({"_id": mid}, {"$set": fields}))
+                sync_count += 1
+                print(f"  {total_i:>6}  SYN   {mid[:80]}")
+            else:
+                if is_sync:
+                    full_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{msg['id']}"
+                    full_params = {"$select": MSG_SELECT, "$expand": ATT_EXPAND}
+                    try:
+                        msg = graph_get(full_url, full_params)
+                    except Exception as e:
+                        logging.error("full fetch failed [%s]: %s", msg.get("id","?"), e)
+                        err_count += 1
+                        continue
+
+                doc = extract_message(msg, folder["path"])
+                if doc is None:
+                    err_count += 1
+                    print(f"  {total_i:>6}  ERR   {mid[:80]}")
+                else:
+                    batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
+                    ok_count += 1
+                    subject_str = (doc.get("subject") or "")[:60]
+                    sender_str  = (doc.get("sender", {}).get("email") or "")[:40]
+                    print(f"  {total_i:>6}  OK    {subject_str:<60}  {sender_str}")
+
+            if len(batch) >= BATCH_SIZE:
+                flush()
+
+            if total_i % 500 == 0:
+                elapsed = (datetime.now() - start).total_seconds()
+                rate    = total_i / elapsed if elapsed > 0 else 0
+                print(f"  {'─'*80}")
+                print(f"  Průběh: ok={ok_count}  sync={sync_count}  skip={skip_count}  err={err_count}  {rate:.1f} msg/s")
+                print(f"  {'─'*80}")
+
+        flush()
+        print(f"  → {folder_count} zprav ze slozky {folder['path']}")
+
+        if args.limit and total_i >= args.limit:
+            break
+
+    elapsed_total = (datetime.now() - start).total_seconds()
+    print(f"\n{'='*52}")
+    print(f"Vysledek:  ok={ok_count}  |  sync={sync_count}  |  skip={skip_count}  |  err={err_count}")
+    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
+    print(f"Dokumentu v kolekci: {col.count_documents({})}")
+
+    if not args.no_indexes:
+        print()
+        create_indexes(col)
+
+    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    if err_count:
+        print(f"Chyby logovany do: {LOG_FILE}")
+
+    client.close()
+
+
+if __name__ == "__main__":  # run the import only when executed as a script
+    main()
diff --git a/Python-runner/1b_parse_emails_graph_delta_v1.0.md b/Python-runner/1b_parse_emails_graph_delta_v1.0.md
new file mode 100644
index 0000000..83e02d9
--- /dev/null
+++ b/Python-runner/1b_parse_emails_graph_delta_v1.0.md
@@ -0,0 +1,139 @@
+# 1b_parse_emails_graph_delta_v1.0.py
+
+**Inkrementální sync přes Microsoft Graph delta query.** Sourozenec [`1_parse_emails_graph_v1.4.py`](1_parse_emails_graph_v1.4.md) — každý řeší jiný use case:
+
+| Skript | Použití |
+|---|---|
+| `1_parse_emails_graph_v1.4.py` | **První plný import** schránky (vše od začátku) |
+| `1b_parse_emails_graph_delta_v1.0.py` | **Pravidelný sync** — jen co se od minula změnilo |
+
+## Jak funguje
+
+Graph API vystavuje `messages/delta` endpoint, který si pamatuje **záložku** (`deltaLink` s tokenem). Při dalším volání s touto záložkou vrátí jen:
+
+- **nové zprávy**
+- **změny** existujících (`isRead`, vlajka, přesun do jiné složky, kategorie)
+- **smazané** zprávy (`@removed`)
+
+Delta běží **per složka**. Skript drží stav v Mongo kolekci `emaily.sync_state`:
+
+```json
+{
+  "_id": "ordinace@buzalkova.cz|<folder_id>",
+  "mailbox": "ordinace@buzalkova.cz",
+  "folder_id": "AAA...",
+  "folder_path": "Inbox",
+  "delta_link": "https://graph.microsoft.com/.../delta?$deltatoken=...",
+  "last_run_at": "2026-06-04T10:00:00Z",
+  "cumulative_new": 1234, "cumulative_sync": 5678, "cumulative_removed": 12, "run_count": 42
+}
+```
+
+První běh = fresh delta (Graph vrátí všechno + dá `deltaLink`). Každý další = jen změny od poslední záložky.
+
+## Co se stane se smazanými zprávami
+
+Když delta vrátí `@removed` pro zprávu, skript ji **nemaže** z Mongo. Pouze nastaví:
+
+```json
+{ "permanently_deleted": true, "permanently_deleted_at": "2026-06-04T10:00:00Z" }
+```
+
+Dohledatelné: `col.find({"permanently_deleted": True})`.
+
+**`@removed` přijde jen pro definitivně smazané** zprávy (uživatel vysypal koš / Shift+Del). Mail v `Deleted Items` je pořád normální zpráva, jen má `folder_path = "Deleted Items"`.
+
+## Extrakce zprávy
+
+Funkce `extract_message` a `extract_sync_fields` se načítají přímo z modulu `1_parse_emails_graph_v1.4.py` (přes `importlib`) — extrakční logika je jediná na celý projekt, nemůže se rozejít.
+
+## Nové vs změněné — jak skript pozná
+
+Pro každou položku z delta odpovědi:
+
+1. **Má `@removed`?** → označit `permanently_deleted` v Mongo, hotovo.
+2. **`graph_id` už je v Mongo?** → existující změna — pošle se jen `extract_sync_fields` (is_read, flag, folder, …) přes `$set`.
+3. **`graph_id` v Mongo není?** → nová zpráva — udělá se druhý GET `/messages/{id}?$expand=attachments` (delta nepodporuje `$expand`), aby přišla těla, hlavičky i přílohy, a uloží se přes `extract_message` jako klasický nový dokument.
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | **ne** | e-mail | (všechny) | Schránka = kolekce v Mongo. **Bez argumentu projede všechny** kolekce v `emaily` mimo `SKIP_MAILBOXES` a systémové (`attachments_index`, `sync_state`) |
+| `--folder` | ne | substring | (všechny) | Filtr složek (např. `Inbox` zahrne i `Inbox/Archive`) |
+| `--limit N` | ne | int | 0 (bez limitu) | Max položek na složku (test) |
+| `--reset` | ne | flag | false | Smaže všechny `deltaLink`y pro vybrané schránky → další běh začne od fresh delta |
+| `--dry-run` | ne | flag | false | Nic neuloží do Mongo, jen vypíše co by se stalo |
+
+## SKIP_MAILBOXES (hardcoded ve skriptu)
+
+| Schránka | Důvod |
+|---|---|
+| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Pro tuto schránku je nutný samostatný skript (lokální `.msg` parser nebo jiný zdroj). |
+
+Při `--mailbox vbuzalka@its.jnj.com` skript skončí s exit kódem 2. Při běhu bez `--mailbox` se schránka tiše přeskočí s hlášením `[skip]`.
+
+## Varianty volání
+
+```bash
+# VŠECHNY schránky najednou (mimo SKIP_MAILBOXES) — pro cron / pravidelný sync:
+docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py
+
+# Jedna schránka — první běh (fresh delta — projde všechno, uloží deltaLinky):
+docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz
+
+# Pravidelný sync jedné schránky (jen změny od minulého běhu):
+docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz
+
+# Dry-run — uvidíš co by se stalo, nic se neuloží:
+docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --dry-run
+
+# Test jen na složce Inbox, max 20 položek:
+docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --folder Inbox --limit 20
+
+# Reset — zahodí deltaLinky a najede znova od plné delta:
+docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --reset
+
+# Na pozadí (jednorázový běh; opakování např. každých 5 min zajistí až cron na hostiteli):
+docker exec -d python-runner bash -c "python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz > /scripts/delta_sync.log 2>&1"
+```
+
+## Co dělat na začátek
+
+1. **První import** schránky pořád přes `1_parse_emails_graph_v1.4.py` (existující data zůstanou).
+2. **První běh** `1b_…delta_v1.0.py` — fresh delta projde znovu všechny zprávy a hlavně uloží `deltaLink`y do `sync_state`. To může chvíli trvat (podobně jako `--mode new-only` na v1.4).
+3. **Další běhy** = už jen rychlé, vrací 0-X změn za interval.
+
+## Otevřené body k otestování
+
+- Jak rychle běží první (fresh) delta na velké schránce (`vladimir.buzalka@buzalka.cz` ~80k mailů)
+- Co Graph vrátí pro nově vytvořené složky (mělo by fungovat — appendnou se do `folders` při dalším `get_all_folders`)
+- Chování při `--limit` (nový deltaLink se neuloží → příští běh zopakuje deltu od staré záložky, včetně již zpracovaných položek)
+
+## HTTP 410 — expirovaný deltaLink
+
+DeltaLinky drží Graph cca 30 dní. Pokud nebudeš schránku syncovat měsíc, skript dostane 410, **smaže starý state** a sám zopakuje běh jako fresh delta. Žádný manuální zásah není potřeba.
+
+## Závislosti
+
+Stejné jako `1_parse_emails_graph_v1.4.py` (msal, requests, pymongo, dateutil) — žádné nové.
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/delta_sync.log
+docker exec -it python-runner tail -f /scripts/delta_errors.log
+```
+
+## Stav sync_state v Mongo
+
+```python
+# Přehled posledních synců:
+db.sync_state.find().sort("last_run_at", -1)
+
+# Zahodit deltaLinky pro jednu schránku (= efekt --reset):
+db.sync_state.delete_many({"mailbox": "ordinace@buzalkova.cz"})
+
+# Najít všechny permanentně smazané v jedné schránce:
+db["ordinace@buzalkova.cz"].find({"permanently_deleted": True}, {"subject": 1, "permanently_deleted_at": 1})
+```
diff --git a/Python-runner/1b_parse_emails_graph_delta_v1.0.py b/Python-runner/1b_parse_emails_graph_delta_v1.0.py
new file mode 100644
index 0000000..b9a8ae7
--- /dev/null
+++ b/Python-runner/1b_parse_emails_graph_delta_v1.0.py
@@ -0,0 +1,514 @@
+"""
+==============================================================================
+Skript:   1b_parse_emails_graph_delta_v1.0.py
+Verze:    1.0
+Datum:    2026-06-04
+Autor:    vladimir.buzalka
+
+Popis:
+  Inkrementalni sync emailu pres Microsoft Graph DELTA QUERY.
+  Sourozenec `1_parse_emails_graph_v1.4.py` — kazdy resi jiny use case:
+
+    1_parse_emails_graph_v1.4.py   = prvni plny import schranky
+    1b_parse_emails_graph_delta_v1.0.py = pravidelny sync (zmeny od minula)
+
+  Delta query je server-side change tracking — Graph si pamatuje "zalozku"
+  (deltaLink) a vraci jen to, co se od ni zmenilo:
+    - nove zpravy
+    - zmeny existujicich (isRead, flag, presun do jine slozky, kategorie)
+    - SMAZANE zpravy (@removed) — definitivne smazane, nikoli v kosi
+
+  Pro mail v "Deleted Items" delta nic specialniho nedela — je to porad
+  normalni zprava, jen s folder_path="Deleted Items". @removed prijde az
+  kdyz uzivatel vysype kos / Shift+Del.
+
+State:
+  Kolekce `emaily.sync_state`, _id = "<mailbox>|<folder_id>".
+  {
+    mailbox, folder_id, folder_path,
+    delta_link,           # plny URL s $deltatoken na pristi beh
+    last_run_at,
+    cumulative_new, cumulative_sync, cumulative_removed, run_count
+  }
+
+Permanentne smazane zpravy:
+  Skript je NEMAZE z Mongo. Pouze nastavi:
+    permanently_deleted: True
+    permanently_deleted_at: <UTC datetime detekce>
+  Dohledani: col.find({"permanently_deleted": True})
+
+Reuse:
+  Funkce extract_message / extract_sync_fields se nactou primo z modulu
+  1_parse_emails_graph_v1.4.py (importlib, file-based), aby se logika
+  extrakce nikdy nerozesla.
+
+Spousteni:
+  python 1b_parse_emails_graph_delta_v1.0.py                                   # VSECHNY schranky (mimo SKIP_MAILBOXES)
+  python 1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz   # jedna schranka
+  python 1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --folder Inbox
+  python 1b_parse_emails_graph_delta_v1.0.py --reset                           # zahodit deltaLinky a najet znova
+  python 1b_parse_emails_graph_delta_v1.0.py --dry-run                         # nic neulozit
+
+SKIP_MAILBOXES (hardcoded):
+  vbuzalka@its.jnj.com   — JNJ tenant, nemame Graph API pristup. Pro tuto
+                            schranku je nutny samostatny skript (lokalni .msg).
+
+Zavislosti:
+  msal, requests, pymongo, python-dateutil
+  Python 3.10+
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import importlib.util
+import logging
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+import msal
+import requests
+from pymongo import MongoClient, ASCENDING
+
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
+# ─── CONFIGURATION ────────────────────────────────────────────────────────────
+import os  # used only for the credential override right below
+GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
+GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
+# SECURITY: this client secret is committed to version control — rotate it and pass the
+# new value through the GRAPH_CLIENT_SECRET environment variable; the literal is a fallback.
+GRAPH_CLIENT_SECRET = os.environ.get("GRAPH_CLIENT_SECRET", "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk")
+GRAPH_URL           = "https://graph.microsoft.com/v1.0"
+
+MONGO_URI       = "mongodb://192.168.1.76:27017"
+MONGO_DB        = "emaily"
+SYNC_STATE_COL  = "sync_state"
+PAGE_SIZE       = 100  # the delta endpoint typically returns at most 100 items per page
+LOG_FILE        = Path(__file__).parent / "delta_errors.log"
+SCRIPT_VERSION  = "1.0"
+
+# Collections in `emaily` that are NOT mailboxes:
+NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
+
+# Mailboxes WITHOUT Graph API access — skipped here; they need a separate script (e.g. a local .msg parser).
+SKIP_MAILBOXES = {"vbuzalka@its.jnj.com"}  # JNJ tenant — no Graph credentials
+
+logging.basicConfig(
+    filename=str(LOG_FILE),
+    level=logging.ERROR,
+    format="%(asctime)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    encoding="utf-8",
+)
+
+# Fields requested from the delta endpoint — the same as MSG_SELECT in v1.4 except for
+# internetMessageHeaders, which delta cannot return for every item; new messages get
+# them through a separate full fetch.
+_MESSAGE_FIELDS = (
+    "id", "internetMessageId",
+    "subject", "bodyPreview", "body",
+    "importance", "isRead", "isDraft", "hasAttachments",
+    "receivedDateTime", "sentDateTime",
+    "createdDateTime", "lastModifiedDateTime",
+    "sender", "from",
+    "toRecipients", "ccRecipients", "bccRecipients", "replyTo",
+    "conversationId", "conversationIndex", "parentFolderId",
+    "categories", "flag", "inferenceClassification",
+)
+DELTA_SELECT = ",".join(_MESSAGE_FIELDS)
+
+# The full load of a new message (headers + attachments included) uses the same
+# select + expand as v1.4.
+FULL_FETCH_SELECT = ",".join(_MESSAGE_FIELDS + ("internetMessageHeaders",))
+
+# Attachments are expanded with these metadata fields only.
+_ATTACHMENT_FIELDS = ("id", "name", "contentType", "size", "isInline")
+FULL_FETCH_EXPAND = "attachments($select=" + ",".join(_ATTACHMENT_FIELDS) + ")"
+
+# ─── Reuse of the extract logic from v1.4 ─────────────────────────────────────
+
+_HERE = Path(__file__).parent
+_V14_PATH = _HERE / "1_parse_emails_graph_v1.4.py"
+if not _V14_PATH.exists():
+    print(f"CHYBA: chybi sourozenec {_V14_PATH.name} — extract logiku nelze nacist", file=sys.stderr)
+    sys.exit(1)
+
+_spec = importlib.util.spec_from_file_location("v14_parse", _V14_PATH)  # file-based: the file name is no valid module name
+_v14 = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_v14)  # executes the v1.4 module-level code
+extract_message     = _v14.extract_message
+extract_sync_fields = _v14.extract_sync_fields
+
+# GRAPH_MAILBOX is a module-level global of v1.4 — per the original note it is not needed
+# for extraction, but it is set in sync_mailbox() for consistency.
+
+# ─── Graph API ────────────────────────────────────────────────────────────────
+
+_graph_token: Optional[str] = None  # bearer token shared by get_token() and graph_get()
+
+
+def get_token() -> str:
+    """Acquire an app-only Graph token via client credentials and cache it in _graph_token."""
+    global _graph_token
+    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
+    msal_app = msal.ConfidentialClientApplication(
+        GRAPH_CLIENT_ID, authority=authority_url, client_credential=GRAPH_CLIENT_SECRET,
+    )
+    auth = msal_app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+    if "access_token" in auth:
+        _graph_token = auth["access_token"]
+        return _graph_token
+    raise RuntimeError(f"Graph auth failed: {auth}")
+
+
+class DeltaExpired(Exception):
+    """The deltaLink expired (HTTP 410) — the sync has to start again from a full delta."""
+
+
+def graph_get(url: str, params: dict = None, allow_410: bool = False) -> dict:
+    """GET a Graph URL and return the decoded JSON body (at most 3 attempts).
+
+    Refreshes the token on 401 and sleeps per Retry-After on 429/503/504. Raises DeltaExpired on
+    410 when allow_410 is set, requests.HTTPError on other error statuses, RuntimeError after retries.
+    """
+    global _graph_token
+    if not _graph_token:
+        get_token()
+    for attempt in range(3):
+        r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"},
+                         params=params, timeout=60)
+        if r.status_code == 401:
+            get_token()  # token expired mid-run — fetch a new one and retry
+            continue
+        if r.status_code == 410 and allow_410:
+            raise DeltaExpired(url)
+        if r.status_code in (429, 503, 504):
+            retry_after = r.headers.get("Retry-After", "")  # may be missing or not a plain number
+            wait = int(retry_after) if retry_after.isdigit() else 5 * (attempt + 1)
+            print(f"  [{r.status_code}] cekam {wait}s ...")
+            time.sleep(wait)
+            continue
+        r.raise_for_status()
+        return r.json()
+    raise RuntimeError(f"Graph GET failed after retries: {url}")
+
+
+def get_all_folders(mailbox: str, parent_id: str = None, parent_path: str = "") -> list[dict]:
+    """Recursively list the mailbox's folders as {"id", "path"} dicts ("path" joined with "/")."""
+    base = f"{GRAPH_URL}/users/{mailbox}/mailFolders"
+    page_url = base if parent_id is None else f"{base}/{parent_id}/childFolders"
+    query = {"$top": 100, "$select": "id,displayName,childFolderCount"}
+    found = []
+    # page through the listing; only the first request is sent with explicit params
+    while page_url:
+        page = graph_get(page_url, query)
+        query = None
+        for entry in page.get("value", []):
+            full_path = f"{parent_path}/{entry['displayName']}".lstrip("/")
+            found.append({"id": entry["id"], "path": full_path})
+            # depth-first: descend right away so children directly follow their parent
+            if entry.get("childFolderCount", 0) > 0:
+                found.extend(get_all_folders(mailbox, entry["id"], full_path))
+        page_url = page.get("@odata.nextLink")
+    return found
+
+
+def fetch_full_message(mailbox: str, msg_id: str) -> Optional[dict]:
+    """Fetch one complete message (headers + attachment metadata); None on an HTTP error."""
+    query = {"$select": FULL_FETCH_SELECT, "$expand": FULL_FETCH_EXPAND}
+    try:
+        message = graph_get(f"{GRAPH_URL}/users/{mailbox}/messages/{msg_id}", query)
+    except requests.HTTPError as e:
+        logging.error("fetch_full_message %s: %s", msg_id, e)
+        return None
+    return message
+
+
+# ─── Delta iterace ────────────────────────────────────────────────────────────
+
+def iter_folder_delta(mailbox: str, folder_id: str, delta_link: Optional[str], limit: int = 0):
+    """
+    Generator over one folder's delta feed; yields (item, final_delta_link) tuples.
+
+    A truthy delta_link resumes from that stored URL; otherwise a fresh delta starts.
+    Each change comes as (item, None), where item is the raw Graph dict (a message
+    or an '@removed' entry). Once the feed is exhausted, one closing tuple
+    (None, deltaLink) follows. With limit > 0 the generator stops after that many
+    items and yields NO closing tuple, so the caller gets no new deltaLink.
+
+    Raises DeltaExpired on HTTP 410 (expired deltaLink) — the caller should run
+    again with delta_link=None (= fresh full delta).
+    """
+    fresh_url = f"{GRAPH_URL}/users/{mailbox}/mailFolders/{folder_id}/messages/delta"
+    # a stored deltaLink is used as-is; only a fresh start sends query params
+    url    = delta_link or fresh_url
+    params = None if delta_link else {"$select": DELTA_SELECT, "$top": PAGE_SIZE}
+
+    yielded = 0
+    while url:
+        page   = graph_get(url, params, allow_410=True)
+        params = None
+        for yielded, item in enumerate(page.get("value", []), start=yielded + 1):
+            yield item, None
+            if limit and yielded >= limit:
+                # test mode: stop without a deltaLink — nothing is handed back to store
+                return
+
+        # a page that carries a deltaLink closes the feed
+        closing_link = page.get("@odata.deltaLink")
+        if closing_link:
+            yield None, closing_link
+            return
+        url = page.get("@odata.nextLink")
+
+
+# ─── Per-folder sync ──────────────────────────────────────────────────────────
+
+def sync_folder(col, sync_col, mailbox: str, folder: dict, dry_run: bool, limit: int) -> dict:
+    """Sync one folder through its delta feed and return the run statistics.
+
+    Every delta item goes through process_item; when the feed was read to its end
+    (and this is no dry run) the new deltaLink and cumulative counters are stored
+    under "<mailbox>|<folder_id>". An expired deltaLink (HTTP 410) restarts the
+    folder once as a fresh delta; the stale state is deleted only outside dry-run,
+    and a 410 on a fresh delta is re-raised instead of retrying forever.
+
+    Returns a dict with the keys "new", "sync", "removed" and "errors".
+    """
+    fid   = folder["id"]
+    fpath = folder["path"]
+    state_id = f"{mailbox}|{fid}"
+    state = sync_col.find_one({"_id": state_id})
+    delta_link = state.get("delta_link") if state else None
+
+    while True:
+        is_first_run = delta_link is None
+        label = "FRESH" if is_first_run else "DELTA"
+        print(f"\n[{label}] {fpath}")
+
+        stats = {"new": 0, "sync": 0, "removed": 0, "errors": 0}
+        final_delta = None
+        try:
+            for item, fin in iter_folder_delta(mailbox, fid, delta_link, limit=limit):
+                if fin:
+                    final_delta = fin
+                    break
+                try:
+                    process_item(col, mailbox, fpath, item, stats, dry_run)
+                except Exception as e:
+                    stats["errors"] += 1
+                    logging.error("process_item %s: %s", item.get("id", "?"), e)
+            break
+        except DeltaExpired:
+            if is_first_run:
+                raise  # a fresh delta carries no stored token — do not loop on it
+            print(f"  [410] deltaLink expiroval — restart od fresh delta")
+            # dry-run must leave sync_state untouched
+            if not dry_run:
+                sync_col.delete_one({"_id": state_id})
+            delta_link = None
+
+    print(f"  new={stats['new']}  sync={stats['sync']}  removed={stats['removed']}  err={stats['errors']}")
+
+    if final_delta and not dry_run:
+        # feed fully consumed — remember the bookmark for the next run
+        sync_col.update_one(
+            {"_id": state_id},
+            {"$set": {"mailbox": mailbox, "folder_id": fid, "folder_path": fpath,
+                      "delta_link": final_delta,
+                      "last_run_at": datetime.now(timezone.utc).replace(tzinfo=None)},
+             "$inc": {"cumulative_new": stats["new"], "cumulative_sync": stats["sync"],
+                      "cumulative_removed": stats["removed"], "run_count": 1}},
+            upsert=True,
+        )
+    elif not final_delta and not is_first_run:
+        # no deltaLink arrived (e.g. --limit) — state stays, the next run resumes from it
+        print(f"  [pozn] delta neukoncena — pristi beh pojede od ulozeneho deltaLinku")
+
+    return stats
+
+
+def _live_update(fields: dict) -> dict:
+    """Return the update for a live message: $set `fields` and $unset stale deletion flags."""
+    stale = {k: "" for k in ("permanently_deleted", "permanently_deleted_at") if k not in fields}
+    return {"$set": fields, "$unset": stale} if stale else {"$set": fields}
+
+
+def process_item(col, mailbox: str, folder_path: str, item: dict, stats: dict, dry_run: bool):
+    """Apply one delta item to the mailbox collection and count it in `stats`.
+
+    '@removed' items only flag the stored document as permanently_deleted; a known
+    graph_id gets its mutable fields refreshed; an unknown one is fetched in full and
+    upserted. Writing a live message clears a stale deletion flag — NOTE(review): Graph
+    seems to report a message that merely left the folder as '@removed' too; confirm.
+    With dry_run the action is only printed.
+    """
+    graph_id = item.get("id")
+    if not graph_id:
+        return
+
+    if "@removed" in item or item.get("@removed.reason"):
+        if dry_run:
+            print(f"  REMOVED  graph_id={graph_id[:30]}...")
+        else:
+            deleted_at = datetime.now(timezone.utc).replace(tzinfo=None)
+            col.update_one({"graph_id": graph_id},
+                           {"$set": {"permanently_deleted": True, "permanently_deleted_at": deleted_at}})
+        stats["removed"] += 1
+        return
+
+    existing = col.find_one({"graph_id": graph_id}, {"_id": 1})
+    if existing:
+        fields = extract_sync_fields(item, folder_path)  # delta payload carries the mutable fields
+        if dry_run:
+            print(f"  SYNC     {(item.get('subject') or '')[:60]}")
+        else:
+            col.update_one({"_id": existing["_id"]}, _live_update(fields))
+        stats["sync"] += 1
+        return
+
+    # new message — body, headers and attachments need the full fetch
+    full = fetch_full_message(mailbox, graph_id)
+    doc = extract_message(full, folder_path) if full is not None else None
+    if doc is None:
+        stats["errors"] += 1
+        return
+    if dry_run:
+        print(f"  NEW      {(doc.get('subject') or '')[:60]}")
+    else:
+        col.update_one({"_id": doc["_id"]}, _live_update(doc), upsert=True)
+    stats["new"] += 1
+
+
+# ─── Indexy pro sync_state ────────────────────────────────────────────────────
+
+def ensure_sync_state_indexes(sync_col):  # creates the two indexes used on sync_state
+    sync_col.create_index([("mailbox", ASCENDING), ("folder_id", ASCENDING)])  # compound: mailbox + folder_id
+    sync_col.create_index([("last_run_at", ASCENDING)])  # single-field index on last_run_at
+
+
+def ensure_perm_deleted_index(col):  # index on the permanently_deleted flag, created as sparse
+    col.create_index([("permanently_deleted", ASCENDING)], sparse=True)
+
+
+# ─── Main ─────────────────────────────────────────────────────────────────────
+
+def discover_mailboxes(db) -> list[str]:
+    """Return the mailbox collection names of `db`, sorted alphabetically.
+
+    Names in NON_MAILBOX_COLLECTIONS are dropped silently; names in SKIP_MAILBOXES
+    are dropped with a printed "[skip]" notice.
+    """
+    candidates = [n for n in sorted(db.list_collection_names()) if n not in NON_MAILBOX_COLLECTIONS]
+    # announce every skipped mailbox, in the same sorted order
+    for name in candidates:
+        if name in SKIP_MAILBOXES:
+            print(f"  [skip] {name} — v SKIP_MAILBOXES (neni Graph pristup)")
+    return [name for name in candidates if name not in SKIP_MAILBOXES]
+
+
+def sync_mailbox(client, mailbox: str, args) -> dict:
+    """Sync all selected folders of one mailbox and return the summed stats; dry-run writes nothing."""
+    _v14.GRAPH_MAILBOX = mailbox
+    print(f"\n========== {mailbox} ==========")
+
+    col      = client[MONGO_DB][mailbox]
+    sync_col = client[MONGO_DB][SYNC_STATE_COL]
+
+    if not args.dry_run:
+        ensure_sync_state_indexes(sync_col)
+        ensure_perm_deleted_index(col)
+
+    if args.reset and args.dry_run:
+        print(f"  --reset (dry-run): deltaLinky pro {mailbox} zustavaji beze zmeny")
+    elif args.reset:
+        n = sync_col.delete_many({"mailbox": mailbox}).deleted_count
+        print(f"  --reset: smazano {n} deltaLinku pro {mailbox}")
+
+    print("Nacitam seznam slozek...")
+    try:
+        folders = get_all_folders(mailbox)
+    except requests.HTTPError as e:
+        print(f"  CHYBA: nelze nacist slozky pro {mailbox}: {e}")
+        logging.error("get_all_folders %s: %s", mailbox, e)
+        return {"new": 0, "sync": 0, "removed": 0, "errors": 1}
+
+    if args.folder:
+        folders = [f for f in folders if args.folder.lower() in f["path"].lower()]
+    print(f"  Slozek ke zpracovani: {len(folders)}")
+
+    totals = {"new": 0, "sync": 0, "removed": 0, "errors": 0}
+    for folder in folders:
+        s = sync_folder(col, sync_col, mailbox, folder, args.dry_run, args.limit)
+        totals = {k: totals[k] + s[k] for k in totals}
+    print(f"  -> mailbox total: new={totals['new']}  sync={totals['sync']}  removed={totals['removed']}  err={totals['errors']}")
+    return totals
+
+
+def main():
+    """CLI entry point of the delta sync; returns the exit code (0 = no errors, 1 = errors or MongoDB down)."""
+    ap = argparse.ArgumentParser(description=f"parse_emails_graph delta sync v{SCRIPT_VERSION}")
+    ap.add_argument("--mailbox", default="", help="E-mail schranky (= kolekce v Mongo). "
+                    "Bez argumentu projede vsechny schranky z `emaily` (mimo SKIP_MAILBOXES).")
+    ap.add_argument("--folder",  default="",   help="Filtruje slozky obsahujici tento retezec (default: vsechny)")
+    ap.add_argument("--limit",   type=int, default=0, help="Max polozek na slozku (test)")
+    ap.add_argument("--reset",   action="store_true",
+                    help="Smaze deltaLinky pro vybrane schranky — pristi beh zacne od fresh delta")
+    ap.add_argument("--dry-run", action="store_true", help="Nic neulozi do Mongo, jen vypise co by se stalo")
+    args = ap.parse_args()
+
+    print(f"=== Delta sync v{SCRIPT_VERSION} ===")
+    if args.dry_run:
+        print("  DRY-RUN — zadne zmeny v Mongo")
+
+    print("Pripojuji se k MongoDB...")
+    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    try:
+        client.admin.command("ping")
+    except Exception as e:
+        print(f"  CHYBA: MongoDB neni dostupna -- {e}")
+        return 1
+
+    if args.mailbox:
+        if args.mailbox in SKIP_MAILBOXES:
+            print(f"  CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.")
+            sys.exit(2)
+        mailboxes = [args.mailbox]
+    else:
+        mailboxes = discover_mailboxes(client[MONGO_DB])
+        print(f"  Schranky ke zpracovani: {len(mailboxes)}")
+        for m in mailboxes:
+            print(f"    {m}")
+
+    print("Token Graph API...")
+    get_token()
+    print("  OK")
+    t0 = time.time()
+    grand = {"new": 0, "sync": 0, "removed": 0, "errors": 0}
+    per_mailbox = []
+    for mb in mailboxes:
+        try:
+            s = sync_mailbox(client, mb, args)
+        except Exception as e:
+            print(f"  FATAL pri sync {mb}: {e}")
+            logging.error("sync_mailbox %s: %s", mb, e)
+            s = {"new": 0, "sync": 0, "removed": 0, "errors": 1}
+        per_mailbox.append((mb, s))
+        grand = {k: grand[k] + s[k] for k in grand}
+
+    print(f"\n=== SHRNUTI ===")
+    for mb, s in per_mailbox + [("TOTAL", grand)]:  # the TOTAL row shares the per-mailbox format
+        print(f"  {mb:40} new={s['new']:>5} sync={s['sync']:>5} removed={s['removed']:>4} err={s['errors']:>3}")
+    print(f"  trvalo: {time.time() - t0:.1f} s")
+    client.close()
+    return 1 if grand["errors"] > 0 else 0
+
+
+if __name__ == "__main__":  # script entry point
+    sys.exit(main() or 0)  # main() returns the process exit code
diff --git a/Python-runner/2_refetch_text_bodies_v1.0.md b/Python-runner/2_refetch_text_bodies_v1.0.md
new file mode 100644
index 0000000..6203ae0
--- /dev/null
+++ b/Python-runner/2_refetch_text_bodies_v1.0.md
@@ -0,0 +1,34 @@
+# 2_refetch_text_bodies_v1.0.py
+
+**Krok 2 pipeline** — ONETIME oprava starých plain-text emailů. Starý `parse_emails_graph_v1.3` ukládal plain-text emaily jen jako prvních 2000 znaků do `body_preview`; plné tělo se zahazovalo. Tento skript najde takové emaily a re-fetchne plný obsah do nového pole `body_text` (max 2 MB).
+
+> Pro schránky importované rovnou v1.4 nemá co dělat (kandidátů 0). Drží se kvůli archivním schránkám importovaným ve v1.3.
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | ne | e-mail | (všechny) | Jedna konkrétní schránka |
+| `--limit N` | ne | int | (bez limitu) | Limit emailů na schránku (test) |
+
+## Varianty volání
+
+```bash
+# Všechny schránky:
+docker exec -it python-runner python /scripts/2_refetch_text_bodies_v1.0.py
+
+# Jedna schránka:
+docker exec -it python-runner python /scripts/2_refetch_text_bodies_v1.0.py --mailbox ordinace@buzalkova.cz
+
+# Test 20 emailů:
+docker exec -it python-runner python /scripts/2_refetch_text_bodies_v1.0.py --mailbox ordinace@buzalkova.cz --limit 20
+
+# Plný běh na pozadí, log do souboru:
+docker exec -d python-runner bash -c "python /scripts/2_refetch_text_bodies_v1.0.py > /scripts/refetch.log 2>&1"
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/refetch.log
+```
diff --git a/Python-runner/2_refetch_text_bodies_v1.0.py b/Python-runner/2_refetch_text_bodies_v1.0.py
new file mode 100644
index 0000000..64ba811
--- /dev/null
+++ b/Python-runner/2_refetch_text_bodies_v1.0.py
@@ -0,0 +1,270 @@
+"""
+==============================================================================
+Skript:   refetch_text_bodies_v1.0.py
+Verze:    1.0
+Datum:    2026-06-03
+Autor:    vladimir.buzalka
+
+Popis:
+  ONETIME oprava — parse_emails_graph_v1.3 ukladal plain-text emaily jen jako
+  prvnich 2000 znaku do `body_preview`. Plne telo se zahazovalo.
+
+  Tento skript:
+    1) Najde v Mongo emaily kde body_html IS NULL/missing/empty
+       a soucasne maji graph_id (lze refetch)
+    2) Pro kazdy GET /users/{mailbox}/messages/{graph_id}?$select=body,bodyPreview
+    3) Pokud body.contentType == 'text' -> ulozi PLNY obsah do noveho pole
+       body_text (max 2 MB - stejny limit jako body_html)
+    4) Pokud body.contentType == 'html' (Graph mezitim prepnul) -> ulozi do body_html
+    5) Aktualizuje body_preview na realny 255-znakovy bodyPreview z Graphu
+
+  Bezpecne prerusitelne a opakovatelne - skript znovu refetchne jen ty kde
+  stale chybi body_html i body_text.
+
+Spusteni:
+  python refetch_text_bodies_v1.0.py                      # vsechny schranky
+  python refetch_text_bodies_v1.0.py --mailbox vladimir.buzalka@buzalka.cz
+  python refetch_text_bodies_v1.0.py --limit 100          # test
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+import msal
+import requests
+from pymongo import MongoClient, UpdateOne
+
+import os  # stdlib; needed for the environment override of the client secret below
+
+# Switch stdout to UTF-8 (unencodable characters are replaced) when the stream
+# object supports reconfiguring.
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
+# --- configuration ----------------------------------------------------------
+GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
+GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
+# SECURITY: a client secret must not live in version control. It is now read
+# from the GRAPH_CLIENT_SECRET environment variable; the literal is kept only as
+# a backward-compatible fallback so existing deployments keep running.
+# TODO: rotate this secret and delete the fallback.
+GRAPH_CLIENT_SECRET = os.environ.get(
+    "GRAPH_CLIENT_SECRET", "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
+)
+GRAPH_URL           = "https://graph.microsoft.com/v1.0"
+
+MONGO_URI = "mongodb://192.168.1.76:27017"
+MONGO_DB  = "emaily"
+# Collections of the database that are not mailboxes and must not be scanned.
+# "sync_state" is included for consistency with the attachment downloader,
+# which lists it among its non-mailbox collections.
+SKIP_COLLECTIONS = {"attachments_index", "sync_state"}
+
+MAX_BODY_BYTES = 2 * 1024 * 1024   # 2 MB - same limit the parser applies to body_html
+BATCH_SIZE = 50                    # number of queued updates per bulk_write
+LOG_FILE = Path(__file__).parent / "refetch_text_bodies_errors.log"
+
+# Only ERROR and above are written to the log file; progress goes to stdout.
+logging.basicConfig(
+    filename=str(LOG_FILE),
+    level=logging.ERROR,
+    format="%(asctime)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    encoding="utf-8",
+)
+
+
+# --- Graph auth -------------------------------------------------------------
+# Most recently acquired bearer token; None until get_token() has run once.
+_token: Optional[str] = None
+
+
+def get_token() -> str:
+    """Acquire a new Graph access token and cache it in the module-level ``_token``.
+
+    Builds an MSAL confidential client from the module-level tenant id,
+    client id and client secret and requests a token for the Graph
+    ``.default`` scope.
+
+    Returns:
+        The access token string.
+
+    Raises:
+        RuntimeError: if the MSAL response contains no ``access_token``.
+    """
+    global _token
+    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
+    msal_app = msal.ConfidentialClientApplication(
+        GRAPH_CLIENT_ID,
+        authority=authority_url,
+        client_credential=GRAPH_CLIENT_SECRET,
+    )
+    res = msal_app.acquire_token_for_client(
+        scopes=["https://graph.microsoft.com/.default"],
+    )
+    if "access_token" in res:
+        _token = res["access_token"]
+        return _token
+    raise RuntimeError(f"Graph auth failed: {res}")
+
+
+def graph_get(url: str, params: Optional[dict] = None) -> Optional[dict]:
+    """GET a Graph URL and return the decoded JSON body.
+
+    Up to three attempts are made. A 401 refreshes the token, a 429 sleeps for
+    the ``Retry-After`` value (5 s when absent or not an integer), and any
+    ``requests`` error (transport failure or HTTP error status) sleeps 2 s;
+    each of these consumes one attempt.
+
+    Args:
+        url: Absolute Graph URL.
+        params: Optional query parameters passed to ``requests.get``.
+
+    Returns:
+        The parsed JSON dict, or ``None`` only when Graph answers 404.
+
+    Raises:
+        requests.RequestException: when the last attempt fails with a
+            ``requests`` error.
+        RuntimeError: when every attempt ended in 401/429. This used to
+            return ``None`` and was indistinguishable from a real 404, so the
+            caller counted throttled messages as missing.
+    """
+    global _token
+    if not _token:
+        get_token()
+    max_attempts = 3
+    for attempt in range(max_attempts):
+        try:
+            r = requests.get(
+                url,
+                headers={"Authorization": f"Bearer {_token}"},
+                params=params,
+                timeout=30,
+            )
+            if r.status_code == 401:
+                get_token()
+                continue
+            if r.status_code == 404:
+                return None  # the message no longer exists on the Outlook side
+            if r.status_code == 429:
+                # Retry-After may also be an HTTP date; fall back to the default then.
+                try:
+                    wait = int(r.headers.get("Retry-After", "5"))
+                except ValueError:
+                    wait = 5
+                print(f"  [429] throttled, cekam {wait}s", flush=True)
+                time.sleep(wait)
+                continue
+            r.raise_for_status()
+            return r.json()
+        except requests.RequestException:
+            if attempt == max_attempts - 1:
+                raise
+            time.sleep(2)
+    # All attempts were consumed by 401/429 responses: report it as an error
+    # instead of pretending the message does not exist.
+    raise RuntimeError(
+        f"Graph GET failed after {max_attempts} attempts (401/429): {url}"
+    )
+
+
+# --- hlavni smycka ----------------------------------------------------------
+
+# Emails that are still unprocessed: both bodies (body_html and body_text) are
+# null / missing / empty, and a usable graph_id exists so a re-fetch is possible.
+EMPTY_BODY_FILTER = {
+    "$and": [
+        {"$or": [
+            {"body_html": None},
+            {"body_html": {"$exists": False}},
+            {"body_html": ""},
+        ]},
+        {"$or": [
+            {"body_text": None},
+            {"body_text": {"$exists": False}},
+            {"body_text": ""},
+        ]},
+        # A dict literal keeps only the last of two identical keys, so the
+        # former {"$ne": None, "$ne": ""} silently lost the None check.
+        # $nin expresses both exclusions in a single operator.
+        {"graph_id": {"$exists": True, "$nin": [None, ""]}},
+    ]
+}
+
+
+def process_mailbox(col, mailbox: str, limit: Optional[int]) -> dict:
+    """Re-fetch the bodies of all candidate emails of one mailbox collection.
+
+    Candidates are the documents matching ``EMPTY_BODY_FILTER``. For each one
+    the message is requested from Graph (``body`` and ``bodyPreview`` only) and
+    a ``$set`` update is queued; updates are written in batches of
+    ``BATCH_SIZE`` and any remainder is flushed even when the loop aborts.
+
+    Args:
+        col: Mongo collection holding the mailbox's emails.
+        mailbox: Mailbox address, used in the Graph URL and in messages.
+        limit: Optional maximum number of documents to process (falsy = all).
+
+    Returns:
+        Counters for this mailbox: ``mailbox``, ``candidates``, ``refetched``,
+        ``text``, ``html``, ``still_empty``, ``errors``, ``missing``.
+    """
+    total = col.count_documents(EMPTY_BODY_FILTER)
+    limit_note = f" (limit {limit})" if limit else ""
+    print(f"[{mailbox}] kandidatu k refetchi: {total}" + limit_note)
+
+    stats = {"mailbox": mailbox, "candidates": total, "refetched": 0,
+             "text": 0, "html": 0, "still_empty": 0, "errors": 0, "missing": 0}
+    if total == 0:
+        return stats
+
+    cursor = col.find(EMPTY_BODY_FILTER, {"_id": 1, "graph_id": 1},
+                      no_cursor_timeout=True)
+    if limit:
+        cursor = cursor.limit(limit)
+
+    seen = 0
+    pending: list[UpdateOne] = []
+
+    try:
+        for seen, doc in enumerate(cursor, start=1):
+            gid = doc["graph_id"]
+            try:
+                data = graph_get(f"{GRAPH_URL}/users/{mailbox}/messages/{gid}",
+                                 {"$select": "body,bodyPreview"})
+            except Exception as e:
+                stats["errors"] += 1
+                logging.error("[%s] graph_get %s: %s", mailbox, gid, e)
+                continue
+
+            if data is None:
+                # graph_get signals "message not found" with None
+                stats["missing"] += 1
+                continue
+
+            body = data.get("body") or {}
+            ctype = body.get("contentType")
+            content = body.get("content") or ""
+
+            update: dict = {"refetched_at": datetime.now(timezone.utc).replace(tzinfo=None)}
+
+            if content and ctype in ("html", "text"):
+                # Slicing is a no-op for content shorter than the limit.
+                target_field = "body_html" if ctype == "html" else "body_text"
+                update[target_field] = content[:MAX_BODY_BYTES]
+                update["body_refetch_status"] = ctype
+                stats[ctype] += 1
+                stats["refetched"] += 1
+            else:
+                update["body_refetch_status"] = (
+                    "graph_empty" if not content else f"unknown_ctype:{ctype}"
+                )
+                stats["still_empty"] += 1
+
+            preview = data.get("bodyPreview") or ""
+            if preview:
+                update["body_preview"] = preview[:300]
+
+            pending.append(UpdateOne({"_id": doc["_id"]}, {"$set": update}))
+            if len(pending) >= BATCH_SIZE:
+                col.bulk_write(pending, ordered=False)
+                pending.clear()
+
+            if seen == 1 or seen % 100 == 0:
+                print(f"  [{seen:>5}/{total}] refetched={stats['refetched']}  "
+                      f"text={stats['text']} html={stats['html']} "
+                      f"still_empty={stats['still_empty']} "
+                      f"missing={stats['missing']} err={stats['errors']}",
+                      flush=True)
+    finally:
+        cursor.close()
+        if pending:
+            col.bulk_write(pending, ordered=False)
+
+    print(f"  [{seen}/{total}] DONE  refetched={stats['refetched']}  "
+          f"text={stats['text']} html={stats['html']} "
+          f"still_empty={stats['still_empty']} missing={stats['missing']} "
+          f"err={stats['errors']}")
+    return stats
+
+
+def main() -> int:
+    """Run the body re-fetch for one mailbox or for every mailbox collection.
+
+    Parses ``--mailbox`` / ``--limit``, pings MongoDB, acquires a Graph token,
+    processes each selected mailbox and prints a per-mailbox summary.
+
+    Returns:
+        Always 0.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)")
+    parser.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
+    args = parser.parse_args()
+
+    started = time.time()
+    print("Pripojuji se k MongoDB...")
+    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    mongo.admin.command("ping")
+    db = mongo[MONGO_DB]
+
+    print("Token Graph API...")
+    get_token()
+    print("OK\n")
+
+    # Without --mailbox every collection except the skipped ones is a mailbox.
+    mailboxes = (
+        [args.mailbox]
+        if args.mailbox
+        else [name for name in db.list_collection_names()
+              if name not in SKIP_COLLECTIONS]
+    )
+    print(f"Schranky ({len(mailboxes)}): {mailboxes}\n")
+
+    results = []
+    for mb in mailboxes:
+        mailbox_stats = process_mailbox(db[mb], mb, limit=args.limit)
+        results.append(mailbox_stats)
+        print()
+
+    print("=== SHRNUTI ===")
+    for r in results:
+        print(f"  {r['mailbox']}: candidates={r['candidates']}  "
+              f"refetched={r['refetched']}  text={r['text']}  html={r['html']}  "
+              f"still_empty={r['still_empty']}  missing={r['missing']}  errors={r['errors']}")
+    duration = time.time() - started
+    print(f"\nCelkem trvalo: {duration:.1f} s")
+    return 0
+
+
+# Script entry point. Ctrl+C prints a notice and ends without a traceback;
+# any other exception prints its traceback and exits with status 1.
+if __name__ == "__main__":
+    try:
+        status = main()
+    except KeyboardInterrupt:
+        print("\nPreruseno uzivatelem")
+    except Exception:
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+    else:
+        sys.exit(status)
diff --git a/Python-runner/3_download_attachments_v1.3.md b/Python-runner/3_download_attachments_v1.3.md
new file mode 100644
index 0000000..dc4701c
--- /dev/null
+++ b/Python-runner/3_download_attachments_v1.3.md
@@ -0,0 +1,47 @@
+# 3_download_attachments_v1.3.py
+
+**Krok 3 pipeline** — stahuje skutečné přílohy (`is_inline=False`) z Mongo emailů přes Graph API do `/mnt/Emails/<schránka>/Attachments/`. Deduplikace podle **SHA256** obsahu:
+- stejný hash → soubor už existuje → přeskočí
+- kolize názvu (stejný název, jiný hash) → `faktura_2.pdf`, `faktura_3.pdf` …
+
+Po uložení doplní do Mongo `file_hash` + `local_path` a aktualizuje kolekci `emaily.attachments_index` (`_id`=hash, filename, path, size, mime, mailbox, ref_count). Emaily kde mají všechny přílohy `file_hash` → skip → **bezpečné opakovat**.
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | **ne** | e-mail | (všechny) | Schránka = kolekce v Mongo. **Bez argumentu projede všechny** kolekce v `emaily` mimo `SKIP_MAILBOXES` a systémové (`attachments_index`, `sync_state`) |
+| `--limit N` | ne | int | 0 (bez limitu) | Zpracuje jen prvních N emailů **per schránka** (test) |
+| `--force-recheck` | ne | flag | false | Znovu ověří i už stažené přílohy |
+| `--no-indexes` | ne | flag | false | Nevytváří indexy na konci |
+
+## SKIP_MAILBOXES (hardcoded)
+
+| Schránka | Důvod |
+|---|---|
+| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Při běhu bez `--mailbox` se tiše přeskočí. S explicitním `--mailbox vbuzalka@its.jnj.com` skript skončí exit kódem 2. |
+
+## Varianty volání
+
+```bash
+# VŠECHNY schránky (mimo SKIP_MAILBOXES):
+docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py
+
+# Jedna schránka interaktivně:
+docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz
+
+# Test 50 emailů:
+docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
+
+# Force-recheck (znovu ověří všechny):
+docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --force-recheck
+
+# Na pozadí, log do souboru:
+docker exec -d python-runner bash -c "python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz > /scripts/download_attachments.log 2>&1"
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/download_attachments.log
+```
diff --git a/Python-runner/3_download_attachments_v1.3.py b/Python-runner/3_download_attachments_v1.3.py
new file mode 100644
index 0000000..93e544e
--- /dev/null
+++ b/Python-runner/3_download_attachments_v1.3.py
@@ -0,0 +1,546 @@
+"""
+download_attachments_v1.3.py
+Nazev:  download_attachments_v1.3.py
+Verze:  1.3
+Datum:  2026-06-02
+Autor:  vladimir.buzalka
+
+Popis:
+    Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
+    pres Microsoft Graph API a uklada je do adresare
+    /mnt/Emails/<schránka>/Attachments/.
+
+    Schránka se predava jako volitelny parametr --mailbox; bez nej se zpracuji
+    vsechny schranky mimo SKIP_MAILBOXES.
+
+    Deduplikace podle SHA256 hashe obsahu:
+        - stejny hash = soubor uz existuje -> preskoci
+        - prvni vyskyt souboru: ulozi pod puvodnim nazvem
+        - kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
+
+    Po ulozeni aktualizuje MongoDB:
+        - v email dokumentu: kazda priloha dostane file_hash + local_path
+        - kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes,
+          mime_type, mailbox, first_seen_at, ref_count
+
+    Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash
+    se preskoci. --force-recheck znovu overi i uz stazene.
+
+    POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
+
+Spousteni:
+    python download_attachments_v1.3.py                                       # VSECHNY schranky (mimo SKIP_MAILBOXES)
+    python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz       # jedna schranka
+    python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50
+    python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --force-recheck
+
+SKIP_MAILBOXES (hardcoded):
+    vbuzalka@its.jnj.com   — JNJ tenant, nemame Graph API pristup.
+
+Docker:
+    docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py
+
+Zavislosti:
+    msal, requests, pymongo
+    Python 3.10+
+
+Historie verzi:
+    1.0  2026-06-02  Inicialni verze
+    1.1  2026-06-02  Schránka jako parametr --mailbox
+    1.2  2026-06-02  Oprava: Graph attachment mapa vcetne inline; normalizace nazvu;
+                     preskoceni S/MIME; inline z Graphu -> SKIP ne ERR
+    1.3  2026-06-02  Primarni stazeni pres graph_att_id (prime ID bez name-matchingu);
+                     oprava $select na attachment listu (odstranen contentId ktery
+                     zpusoboval BadRequest a vracel prazdny seznam); name-matching
+                     zustava jako fallback pro stare emaily bez graph_att_id
+"""
+
+import sys
+import re
+import hashlib
+import logging
+import argparse
+import unicodedata
+from pathlib import Path
+from datetime import datetime, timezone
+from typing import Optional
+
+import msal
+import requests
+from pymongo import MongoClient, UpdateOne
+
+import os  # stdlib; needed for the environment override of the client secret below
+
+# Switch stdout to UTF-8 (unencodable characters are replaced) when the stream
+# object supports reconfiguring.
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
+# ─── CONFIGURATION ────────────────────────────────────────────────────────────
+GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
+GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
+# SECURITY: a client secret must not live in version control. It is now read
+# from the GRAPH_CLIENT_SECRET environment variable; the literal is kept only as
+# a backward-compatible fallback so existing deployments keep running.
+# TODO: rotate this secret and delete the fallback.
+GRAPH_CLIENT_SECRET = os.environ.get(
+    "GRAPH_CLIENT_SECRET", "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
+)
+GRAPH_URL           = "https://graph.microsoft.com/v1.0"
+
+MONGO_URI           = "mongodb://192.168.1.76:27017"
+MONGO_DB            = "emaily"
+MONGO_COL_INDEX     = "attachments_index"
+
+EMAILS_BASE_DIR     = Path("/mnt/Emails")
+# NOTE(review): the file name refers to "parse_emails" although this is the
+# attachment downloader - presumably a log shared with the parser; confirm.
+LOG_FILE            = Path(__file__).parent / "parse_emails_errors.log"
+SCRIPT_VERSION      = "1.3"
+BATCH_SIZE          = 50
+
+# Attachment types that are skipped (S/MIME signatures, certificates)
+SKIP_EXTENSIONS = {".p7m", ".p7s", ".p7c", ".p7b"}
+
+# Collections in `emaily` that are NOT mailboxes
+NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
+
+# Mailboxes without Graph API access - skipped when running without --mailbox
+SKIP_MAILBOXES = {
+    "vbuzalka@its.jnj.com",   # JNJ tenant - no Graph credentials
+}
+# ──────────────────────────────────────────────────────────────────────────────
+
+# Only ERROR and above are written to the log file; progress goes to stdout.
+logging.basicConfig(
+    filename=str(LOG_FILE),
+    level=logging.ERROR,
+    format="%(asctime)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    encoding="utf-8",
+)
+
+# Most recently acquired bearer token; None until get_token() has run once.
+_graph_token: Optional[str] = None
+
+
+# ─── Graph API ────────────────────────────────────────────────────────────────
+
+def get_token() -> str:
+    """Acquire a new Graph access token and cache it in ``_graph_token``.
+
+    Builds an MSAL confidential client from the module-level tenant id,
+    client id and client secret and requests a token for the Graph
+    ``.default`` scope.
+
+    Returns:
+        The access token string.
+
+    Raises:
+        RuntimeError: if the MSAL response contains no ``access_token``.
+    """
+    global _graph_token
+    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
+    msal_app = msal.ConfidentialClientApplication(
+        GRAPH_CLIENT_ID,
+        authority=authority_url,
+        client_credential=GRAPH_CLIENT_SECRET,
+    )
+    result = msal_app.acquire_token_for_client(
+        scopes=["https://graph.microsoft.com/.default"],
+    )
+    if "access_token" in result:
+        _graph_token = result["access_token"]
+        return _graph_token
+    raise RuntimeError(f"Graph auth failed: {result}")
+
+
+def graph_get_bytes(url: str) -> bytes:
+    """Download a Graph URL and return the raw response body.
+
+    Makes at most two attempts; a 401 refreshes the token and retries once.
+
+    Args:
+        url: Absolute Graph URL (e.g. an attachment ``/$value`` endpoint).
+
+    Returns:
+        The complete response body.
+
+    Raises:
+        requests.HTTPError: for any error status other than 401.
+        RuntimeError: when both attempts were answered with 401.
+    """
+    global _graph_token
+    if not _graph_token:
+        get_token()
+    for _ in range(2):
+        # No stream=True here: the body is read in full right below, and a
+        # streamed 401 response that is skipped unread is never closed, so its
+        # connection would not be returned to the pool.
+        r = requests.get(
+            url,
+            headers={"Authorization": f"Bearer {_graph_token}"},
+            timeout=120,
+        )
+        if r.status_code == 401:
+            get_token()
+            continue
+        r.raise_for_status()
+        return r.content
+    raise RuntimeError(f"Graph GET bytes failed: {url}")
+
+
+def graph_get_json(url: str, params: dict = None) -> dict:
+    """GET a Graph URL and return the decoded JSON body.
+
+    Makes at most two attempts; a 401 refreshes the token and retries once.
+
+    Raises:
+        requests.HTTPError: for any error status other than 401.
+        RuntimeError: when both attempts were answered with 401.
+    """
+    global _graph_token
+    if not _graph_token:
+        get_token()
+    tries_left = 2
+    while tries_left > 0:
+        tries_left -= 1
+        # The header is rebuilt on every pass because a 401 replaces the token.
+        auth_header = {"Authorization": f"Bearer {_graph_token}"}
+        response = requests.get(url, headers=auth_header, params=params, timeout=30)
+        if response.status_code != 401:
+            response.raise_for_status()
+            return response.json()
+        get_token()
+    raise RuntimeError(f"Graph GET json failed: {url}")
+
+
+def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
+    """Return the attachment metadata of one message (no contentBytes requested).
+
+    Any failure is logged and reported as an empty list.
+    """
+    listing_url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
+    # Careful: contentId is NOT part of the base attachment type, so it must
+    # not appear in $select.
+    selected_fields = {"$select": "id,name,contentType,size,isInline"}
+    try:
+        return graph_get_json(listing_url, selected_fields).get("value", [])
+    except Exception as e:
+        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e)
+        return []
+
+
+def fetch_attachment_content(mailbox: str, graph_message_id: str, attachment_id: str) -> Optional[bytes]:
+    """Download the raw bytes of one attachment via its ``/$value`` endpoint.
+
+    Returns:
+        The attachment content, or ``None`` when the download failed (the
+        failure is logged).
+    """
+    url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments/{attachment_id}/$value"
+    try:
+        payload = graph_get_bytes(url)
+    except Exception as e:
+        logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s",
+                      graph_message_id, attachment_id, e)
+        payload = None
+    return payload
+
+
+# ─── Pomocné funkce ───────────────────────────────────────────────────────────
+
+def normalize_name(name: str) -> str:
+    """Normalise a file name for comparison.
+
+    The name is lower-cased and stripped, decomposed with NFKD, combining
+    marks are removed, and every character that is not a word character,
+    ``.`` or ``-`` is replaced by ``_``.
+    """
+    decomposed = unicodedata.normalize("NFKD", name.lower().strip())
+    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
+    return re.sub(r"[^\w.\-]", "_", "".join(base_chars))
+
+
+def find_graph_att(att_name: str, att_size: int, graph_atts: list[dict]) -> Optional[dict]:
+    """Fallback lookup of an attachment in a Graph attachment list by name.
+
+    Used for emails whose stored attachments carry no ``graph_att_id``.
+    Matching order:
+
+    1. exact name;
+    2. same normalised name AND both sizes known and within 10 %;
+    3. same normalised name (first candidate);
+    4. normalised Graph name ends with the last 20 characters of the wanted
+       normalised name.
+
+    The size-qualified match used to come after the plain normalised match and
+    was therefore unreachable; it now runs first, so that among several
+    attachments with the same normalised name the one with the matching size
+    is chosen. A missing (``None``) size or name is treated as unknown/empty
+    instead of raising.
+
+    Args:
+        att_name: File name stored with the email.
+        att_size: Stored size in bytes; 0 or ``None`` means unknown.
+        graph_atts: Attachment metadata dicts as returned by Graph.
+
+    Returns:
+        The matching Graph attachment dict, or ``None``.
+    """
+    # 1. Exact match
+    for ga in graph_atts:
+        if ga.get("name") == att_name:
+            return ga
+
+    norm_want = normalize_name(att_name)
+    want_size = att_size or 0
+    same_name = [ga for ga in graph_atts
+                 if normalize_name(ga.get("name") or "") == norm_want]
+
+    # 2. Normalised match confirmed by size (±10 %)
+    if want_size:
+        for ga in same_name:
+            ga_size = ga.get("size") or 0
+            if ga_size and abs(ga_size - want_size) / max(ga_size, want_size) < 0.1:
+                return ga
+
+    # 3. Normalised match, size unknown or not confirming
+    if same_name:
+        return same_name[0]
+
+    # 4. Partial suffix match (last 20 characters of the normalised name)
+    tail = norm_want[-20:]
+    if tail:
+        for ga in graph_atts:
+            if normalize_name(ga.get("name") or "").endswith(tail):
+                return ga
+
+    return None
+
+
+def sha256(data: bytes) -> str:
+    """Return the SHA-256 digest of *data* as a hexadecimal string."""
+    digest = hashlib.sha256()
+    digest.update(data)
+    return digest.hexdigest()
+
+
+def safe_filename(name: str) -> str:
+    """Turn an attachment name into a name that is safe to use as a file name.
+
+    Every character that is neither alphanumeric nor one of ``._- ()`` is
+    replaced by ``_`` and surrounding whitespace is stripped.
+
+    Returns:
+        The sanitised name, or ``"attachment"`` when nothing usable is left.
+        This covers the empty name and names made only of dots and spaces
+        (``"."``, ``".."``), which would otherwise resolve to a directory
+        and make the subsequent file write fail.
+    """
+    safe = "".join(c if c.isalnum() or c in "._- ()" else "_" for c in name).strip()
+    if not safe.strip(". "):
+        return "attachment"
+    return safe
+
+
+def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
+    """Pick the file name under which content with hash *hash_val* is stored.
+
+    ``desired_name`` is returned when the index has no entry with that file
+    name, or when the entry belongs to the same hash. Otherwise ``stem_2``,
+    ``stem_3``, ... are tried and the first candidate is returned that either
+    is indexed for the same hash, or is neither indexed nor present in
+    *att_dir*.
+    """
+    owner = col_index.find_one({"filename": desired_name})
+    if not owner or owner["_id"] == hash_val:
+        return desired_name
+
+    wanted = Path(desired_name)
+    stem, suffix = wanted.stem, wanted.suffix
+    counter = 1
+    while True:
+        counter += 1
+        candidate = f"{stem}_{counter}{suffix}"
+        indexed = col_index.find_one({"filename": candidate})
+        if indexed:
+            # An indexed name is reusable only for the very same content.
+            if indexed["_id"] == hash_val:
+                return candidate
+        elif not (att_dir / candidate).exists():
+            return candidate
+
+
+def save_attachment(
+    content: bytes,
+    original_name: str,
+    mime_type: str,
+    mailbox: str,
+    att_dir: Path,
+    col_index,
+) -> tuple[str, str, bool]:
+    """Store attachment content once per distinct SHA-256 hash.
+
+    When the hash is already in the index, only its ``ref_count`` is
+    incremented and nothing is written to disk. Otherwise the content is
+    written to *att_dir* under a collision-free name and a new index entry is
+    inserted.
+
+    Returns:
+        ``(hash, local_path, was_new)`` where ``local_path`` is the value
+        stored in the index (the file name) and ``was_new`` tells whether a
+        file was written by this call.
+    """
+    hash_val = sha256(content)
+
+    known = col_index.find_one({"_id": hash_val})
+    if known:
+        col_index.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}})
+        return hash_val, known["local_path"], False
+
+    filename = resolve_filename(safe_filename(original_name), att_dir, hash_val, col_index)
+    (att_dir / filename).write_bytes(content)
+
+    index_entry = {
+        "_id":           hash_val,
+        "filename":      filename,
+        "local_path":    filename,
+        "size_bytes":    len(content),
+        "mime_type":     mime_type,
+        "mailbox":       mailbox,
+        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
+        "ref_count":     1,
+    }
+    col_index.insert_one(index_entry)
+
+    return hash_val, filename, True
+
+
+# ─── MAIN ─────────────────────────────────────────────────────────────────────
+
+def process_mailbox(client, mailbox: str, args) -> dict:
+    """Download the attachments of one mailbox and return its statistics.
+
+    Args:
+        client: Mongo client; the mailbox collection and the attachment index
+            are both taken from the ``MONGO_DB`` database.
+        mailbox: Mailbox address; also the name of the Mongo collection and of
+            the directory below ``EMAILS_BASE_DIR``.
+        args: Parsed command-line arguments; ``force_recheck`` and ``limit``
+            are read here.
+
+    Returns:
+        Dict with ``mailbox``, ``ok`` (emails without any attachment error),
+        ``new``, ``dup``, ``skip``, ``err`` and ``elapsed`` (seconds).
+    """
+    att_dir   = EMAILS_BASE_DIR / mailbox / "Attachments"
+    mongo_col = mailbox
+
+    start = datetime.now()
+    print(f"\n========== {mailbox} ==========")
+    print(f"Cilovy adresar: {att_dir}")
+
+    att_dir.mkdir(parents=True, exist_ok=True)
+
+    col_emails = client[MONGO_DB][mongo_col]
+    col_index  = client[MONGO_DB][MONGO_COL_INDEX]
+
+    # Default: only emails with at least one non-inline attachment that has no
+    # file_hash yet. --force-recheck: every email flagged has_attachments.
+    if args.force_recheck:
+        query = {"has_attachments": True}
+    else:
+        query = {
+            "has_attachments": True,
+            "attachments": {
+                "$elemMatch": {
+                    "is_inline": False,
+                    "file_hash": {"$exists": False},
+                }
+            }
+        }
+
+    # NOTE: total is counted before --limit is applied, so the progress output
+    # can show a denominator larger than the number of emails processed.
+    total = col_emails.count_documents(query)
+    print(f"Emailu ke zpracovani: {total}")
+    if total == 0:
+        print("  Neni co stahnout.")
+        return {"mailbox": mailbox, "ok": 0, "new": 0, "dup": 0, "skip": 0, "err": 0,
+                "elapsed": 0.0}
+
+    cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
+    if args.limit:
+        cursor = cursor.limit(args.limit)
+
+    ok_count   = 0
+    new_count  = 0
+    dup_count  = 0
+    skip_count = 0
+    err_count  = 0
+    email_i    = 0
+    batch      = []
+
+    def flush():
+        """Write the queued email updates; a failure is logged and printed, then the batch is dropped."""
+        if not batch:
+            return
+        try:
+            col_emails.bulk_write(batch, ordered=False)
+        except Exception as e:
+            logging.error("bulk_write: %s", e)
+            print(f"  CHYBA bulk_write: {e}")
+        batch.clear()
+
+    for email_doc in cursor:
+        email_i  += 1
+        email_id  = email_doc["_id"]
+        graph_id  = email_doc.get("graph_id", "")
+        subject   = (email_doc.get("subject") or "")[:60]
+        att_list  = email_doc.get("attachments") or []
+
+        # Emails that carry only inline attachments have nothing to download.
+        real_atts = [a for a in att_list if not a.get("is_inline", False)]
+        if not real_atts:
+            continue
+
+        print(f"\n  {email_i:>5}/{total}  {subject}")
+
+        # Fetch the attachment list from Graph only if some attachment that
+        # still has to be processed has no graph_att_id.
+        need_listing = any(
+            not a.get("is_inline", False)
+            and not (not args.force_recheck and a.get("file_hash"))
+            and not a.get("graph_att_id")
+            for a in att_list
+        )
+        graph_atts = fetch_message_attachments(mailbox, graph_id) if need_listing else []
+
+        # Shallow copy: entries are replaced by index below, att_list itself stays untouched.
+        updated_atts = list(att_list)
+        email_ok     = True
+
+        for i, att in enumerate(updated_atts):
+            if att.get("is_inline", False):
+                continue
+            # Already handled in an earlier run (unless a recheck is forced).
+            if not args.force_recheck and att.get("file_hash"):
+                continue
+
+            att_name     = att.get("filename", "")
+            att_size     = att.get("size_bytes", 0)
+            graph_att_id = att.get("graph_att_id")
+
+            # Skip S/MIME signatures; the "skip" marker is stored as file_hash
+            # so the attachment no longer matches the default query.
+            if Path(att_name).suffix.lower() in SKIP_EXTENSIONS:
+                updated_atts[i] = {**att, "file_hash": "skip", "local_path": ""}
+                skip_count += 1
+                print(f"         SKIP  {att_name} (S/MIME)")
+                continue
+
+            # Direct access via graph_att_id (emails parsed by v1.2+)
+            if graph_att_id:
+                content = fetch_attachment_content(mailbox, graph_id, graph_att_id)
+                if content is None:
+                    err_count += 1
+                    email_ok = False
+                    print(f"         ERR   {att_name} (stazeni selhalo)")
+                    continue
+                # NOTE: no inline check happens on this path; only the stored
+                # mime type is taken over.
+                mime_type = att.get("mime_type", "")
+            else:
+                # Fallback: name matching for old emails (parsed before v1.2)
+                graph_att = find_graph_att(att_name, att_size, graph_atts)
+
+                if not graph_att:
+                    logging.error("attachment not found [email=%s att=%s]", email_id, att_name)
+                    print(f"         ERR   {att_name} (nenalezeno)")
+                    err_count += 1
+                    email_ok = False
+                    continue
+
+                # If Graph reports the attachment as inline, mark it and skip it
+                if graph_att.get("isInline", False):
+                    updated_atts[i] = {**att, "is_inline": True, "file_hash": "skip", "local_path": ""}
+                    skip_count += 1
+                    print(f"         SKIP  {att_name} (inline obrazek)")
+                    continue
+
+                content = fetch_attachment_content(mailbox, graph_id, graph_att["id"])
+                if content is None:
+                    err_count += 1
+                    email_ok = False
+                    print(f"         ERR   {att_name} (stazeni selhalo)")
+                    continue
+
+                mime_type = att.get("mime_type") or graph_att.get("contentType", "")
+
+            hash_val, local_path, was_new = save_attachment(
+                content, att_name, mime_type, mailbox, att_dir, col_index
+            )
+
+            updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}
+
+            if was_new:
+                new_count += 1
+                print(f"         NEW   {local_path}  ({len(content):,} B)")
+            else:
+                dup_count += 1
+                print(f"         DUP   {att_name} -> {local_path}")
+
+        if email_ok:
+            ok_count += 1
+
+        # The update is queued even when some attachments failed, so the
+        # attachments that did succeed are persisted.
+        batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))
+
+        if len(batch) >= BATCH_SIZE:
+            flush()
+
+        if email_i % 100 == 0:
+            # NOTE: elapsed is computed here but not used in the progress output.
+            elapsed = (datetime.now() - start).total_seconds()
+            print(f"  {'─'*60}")
+            print(f"  Průběh: emaily={email_i}/{total}  nove={new_count}  dup={dup_count}  skip={skip_count}  err={err_count}")
+            print(f"  {'─'*60}")
+
+    # Write whatever is left in the last, partially filled batch.
+    flush()
+
+    elapsed = (datetime.now() - start).total_seconds()
+    print(f"  -> mailbox total: emaily={ok_count} nove={new_count} dup={dup_count} "
+          f"skip={skip_count} err={err_count} ({elapsed:.1f} s)")
+    return {"mailbox": mailbox, "ok": ok_count, "new": new_count, "dup": dup_count,
+            "skip": skip_count, "err": err_count, "elapsed": elapsed}
+
+
+def discover_mailboxes(db) -> list[str]:
+    """Return the mailbox collections of *db* in sorted order.
+
+    A mailbox is every collection that is neither listed in
+    ``NON_MAILBOX_COLLECTIONS`` nor in ``SKIP_MAILBOXES``; each mailbox left
+    out because of ``SKIP_MAILBOXES`` is reported on stdout.
+    """
+    candidates = [name for name in sorted(db.list_collection_names())
+                  if name not in NON_MAILBOX_COLLECTIONS]
+    selected = []
+    for name in candidates:
+        if name not in SKIP_MAILBOXES:
+            selected.append(name)
+        else:
+            print(f"  [skip] {name} — v SKIP_MAILBOXES (neni Graph pristup)")
+    return selected
+
+
+def main():
+    """Command-line entry point of the attachment downloader.
+
+    Parses the arguments, checks Graph and MongoDB connectivity (exit status 1
+    on failure), optionally creates the indexes of the attachment index,
+    selects the mailboxes (exit status 2 when the explicitly requested mailbox
+    is in ``SKIP_MAILBOXES``), processes them one by one and prints a summary.
+    An exception inside one mailbox is logged and counted as one error; the
+    remaining mailboxes are still processed.
+    """
+    parser = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
+    parser.add_argument("--mailbox",       default="",
+                        help="Emailova schranka. Bez argumentu projede vsechny schranky "
+                             "v `emaily` mimo SKIP_MAILBOXES.")
+    parser.add_argument("--limit",         type=int, default=0,
+                        help="Zpracovat max N emailu (0 = vse) — per schranka")
+    parser.add_argument("--force-recheck", action="store_true",
+                        help="Znovu overi i emaily kde prilohy uz maji file_hash")
+    parser.add_argument("--no-indexes",    action="store_true",
+                        help="Nevytvorit indexy na attachments_index kolekci")
+    args = parser.parse_args()
+
+    start_all = datetime.now()
+    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
+    print(f"Start: {start_all.strftime('%Y-%m-%d %H:%M:%S')}")
+
+    print("\nPřipojuji se k Graph API...")
+    try:
+        get_token()
+    except Exception as e:
+        print(f"  CHYBA: {e}")
+        sys.exit(1)
+    else:
+        print("  Graph API OK")
+
+    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    try:
+        client.admin.command("ping")
+    except Exception as e:
+        print(f"  CHYBA: MongoDB neni dostupna -- {e}")
+        sys.exit(1)
+    else:
+        print("  MongoDB OK")
+
+    db = client[MONGO_DB]
+    col_index = db[MONGO_COL_INDEX]
+    if not args.no_indexes:
+        for indexed_field in ("filename", "mime_type", "mailbox"):
+            col_index.create_index(indexed_field)
+
+    if not args.mailbox:
+        mailboxes = discover_mailboxes(db)
+        print(f"  Schranky ke zpracovani: {len(mailboxes)}")
+        for m in mailboxes:
+            print(f"    {m}")
+    elif args.mailbox in SKIP_MAILBOXES:
+        print(f"  CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.")
+        sys.exit(2)
+    else:
+        mailboxes = [args.mailbox]
+
+    results = []
+    for mb in mailboxes:
+        try:
+            mailbox_stats = process_mailbox(client, mb, args)
+        except Exception as e:
+            logging.error("process_mailbox %s: %s", mb, e)
+            print(f"  FATAL pri zpracovani {mb}: {e}")
+            mailbox_stats = {"mailbox": mb, "ok": 0, "new": 0, "dup": 0,
+                             "skip": 0, "err": 1, "elapsed": 0.0}
+        results.append(mailbox_stats)
+
+    elapsed_total = (datetime.now() - start_all).total_seconds()
+    files_total   = col_index.count_documents({})
+    size_total    = 0
+    for entry in col_index.find({}, {"size_bytes": 1}):
+        size_total += entry.get("size_bytes", 0)
+
+    # Column-wise sums over all mailboxes.
+    grand = dict.fromkeys(("ok", "new", "dup", "skip", "err"), 0)
+    for r in results:
+        for key in grand:
+            grand[key] += r[key]
+
+    print(f"\n{'='*60}")
+    print("=== SHRNUTI ===")
+    for r in results:
+        print(f"  {r['mailbox']:40} ok={r['ok']:>5} nove={r['new']:>4} "
+              f"dup={r['dup']:>4} skip={r['skip']:>3} err={r['err']:>3}")
+    print(f"  {'TOTAL':40} ok={grand['ok']:>5} nove={grand['new']:>4} "
+          f"dup={grand['dup']:>4} skip={grand['skip']:>3} err={grand['err']:>3}")
+    print(f"Souboru v indexu: {files_total}  ({size_total / 1024 / 1024:.1f} MB)")
+    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
+    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    if grand['err']:
+        print(f"Chyby logovany do: {LOG_FILE}")
+
+    client.close()
+
+
+# Script entry point. main() returns nothing, so the exit status is 0 unless
+# main() itself calls sys.exit() or an exception escapes.
+if __name__ == "__main__":
+    main()
diff --git a/Python-runner/3_download_attachments_v1.4.md b/Python-runner/3_download_attachments_v1.4.md
new file mode 100644
index 0000000..f721798
--- /dev/null
+++ b/Python-runner/3_download_attachments_v1.4.md
@@ -0,0 +1,74 @@
+# 3_download_attachments_v1.4.py
+
+**Krok 3 pipeline** — stahuje skutečné přílohy (`is_inline=False`) z Mongo emailů přes Graph API do `/mnt/Emails/<schránka>/Attachments/`. Deduplikace podle **SHA256** obsahu.
+
+## Nové ve verzi 1.4
+
+| Typ přílohy | `@odata.type` | Co skript dělá |
+|---|---|---|
+| **File** | `#microsoft.graph.fileAttachment` | Stáhne přes `/$value`, uloží binárku |
+| **Item** (vnořený email) | `#microsoft.graph.itemAttachment` | `$expand=...itemAttachment/item`, sestaví **`.eml`** z hlaviček a body vnitřní zprávy |
+| **Reference** (OneDrive/SharePoint link) | `#microsoft.graph.referenceAttachment` | Žádný file — uloží jen `reference_url` do Mongo |
+
+Plus:
+- **Retry** s exponenciálním backoffem na 429/500/502/503/504 (1s, 2s, 4s; respektuje `Retry-After`).
+- **Permanentní označení chyb v Mongo** per-attachment:
+  - `attachment_missing: True` + `attachment_missing_at: <UTC>` při 404 (email/příloha už neexistuje v mailboxu)
+  - `attachment_reference: True` + `reference_url: <URL>` u referenceAttachment
+- Tagované přílohy se při dalším běhu **automaticky přeskočí** (bez `--force-recheck`).
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | ne | e-mail | (všechny) | Schránka = kolekce v Mongo. Bez argumentu projede všechny kolekce mimo `NON_MAILBOX_COLLECTIONS` a `SKIP_MAILBOXES` |
+| `--limit N` | ne | int | 0 | Per schránka, jen prvních N emailů (test) |
+| `--force-recheck` | ne | flag | false | Znovu ověří i emaily kde přílohy mají `file_hash` **nebo** `attachment_missing` **nebo** `attachment_reference` |
+| `--no-indexes` | ne | flag | false | Nevytváří indexy na `attachments_index` |
+
+## SKIP_MAILBOXES (hardcoded)
+
+| Schránka | Důvod |
+|---|---|
+| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Bez `--mailbox` se přeskočí (vypíše se jen řádek `[skip]`). S explicitním `--mailbox vbuzalka@its.jnj.com` skript skončí exit kódem 2. |
+
+## Statistiky per schránka
+
+```
+ok=N nove=N dup=N skip=N miss=N ref=N err=N
+```
+
+| Kategorie | Význam |
+|---|---|
+| `ok` | emaily zpracované bez chyby (všechny přílohy hotové) |
+| `nove` | nové soubory uložené (NEW + NEW(eml)) |
+| `dup` | hash už existuje (jen ref_count++) |
+| `skip` | S/MIME (.p7m/.p7s/...) nebo inline obrázek |
+| `miss` | 404 — označeno `attachment_missing` (nepokračuje se) |
+| `ref` | referenceAttachment — uložen jen URL |
+| `err` | chyba stažení po vyčerpání retry (429/5xx, timeout) nebo příloha nenalezená v Graph listingu — zkusí se znovu při dalším běhu |
+
+## Varianty volání
+
+```bash
+# Všechny schránky (mimo SKIP_MAILBOXES):
+docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py
+
+# Jedna schránka:
+docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz
+
+# Test 50 emailů:
+docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
+
+# Force-recheck (i missing/reference přepíše):
+docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --force-recheck
+
+# Na pozadí:
+docker exec -d python-runner bash -c "python /scripts/3_download_attachments_v1.4.py > /scripts/download_attachments.log 2>&1"
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/download_attachments.log
+```
diff --git a/Python-runner/3_download_attachments_v1.4.py b/Python-runner/3_download_attachments_v1.4.py
new file mode 100644
index 0000000..487852f
--- /dev/null
+++ b/Python-runner/3_download_attachments_v1.4.py
@@ -0,0 +1,713 @@
+"""
+download_attachments_v1.4.py
+Nazev:  download_attachments_v1.4.py
+Verze:  1.4
+Datum:  2026-06-04
+Autor:  vladimir.buzalka
+
+Popis:
+    Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
+    pres Microsoft Graph API a uklada je do adresare
+    /mnt/Emails/<schranka>/Attachments/.
+
+    Bez argumentu --mailbox projede vsechny kolekce v `emaily` mimo
+    NON_MAILBOX_COLLECTIONS a SKIP_MAILBOXES.
+
+    Deduplikace podle SHA256 hashe obsahu:
+        - stejny hash = soubor uz existuje -> preskoci
+        - prvni vyskyt: ulozi pod puvodnim nazvem
+        - kolize nazvu: faktura_2.pdf, faktura_3.pdf ...
+
+    Po ulozeni aktualizuje MongoDB:
+        - v email dokumentu: kazda priloha dostane file_hash + local_path
+        - kolekce emaily.attachments_index: _id=hash, filename, ...
+
+    NOVE v 1.4:
+        - Spravne zpracovani vsech typu priloh:
+          * fileAttachment   -> /$value (jako predtim)
+          * itemAttachment   -> /$expand=microsoft.graph.itemAttachment/item
+                                -> sestavi .eml z vnitrni zpravy
+          * referenceAttachment -> ulozi jen URL, neexistuje content
+        - Retry s exponencialnim backoffem (1s, 2s, 4s) na 429/5xx
+        - Permanentni tagging chyb v Mongo per-attachment:
+          * attachment_missing: True       (404, email/att uz neexistuje)
+          * attachment_reference: True     (referenceAttachment, jen URL)
+          * reference_url, attachment_type — diagnosticke metadata
+        - Tagovane prilohy se pri dalsim behu preskocia (bez --force-recheck)
+
+    POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
+
+Spousteni:
+    python download_attachments_v1.4.py
+    python download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz
+    python download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50
+    python download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --force-recheck
+
+SKIP_MAILBOXES (hardcoded):
+    vbuzalka@its.jnj.com   — JNJ tenant, nemame Graph API pristup.
+
+Docker:
+    docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py
+
+Zavislosti:
+    msal, requests, pymongo
+    Python 3.10+
+
+Historie verzi:
+    1.0  2026-06-02  Inicialni verze
+    1.1  2026-06-02  Schranka jako parametr --mailbox
+    1.2  2026-06-02  Oprava: Graph attachment mapa vcetne inline; normalizace nazvu
+    1.3  2026-06-02  Primarni stazeni pres graph_att_id; --mailbox volitelny
+    1.4  2026-06-04  itemAttachment/referenceAttachment handling; retry s backoffem;
+                     permanentni tagging chyb (attachment_missing / attachment_reference)
+"""
+
+import sys
+import re
+import time
+import json
+import hashlib
+import logging
+import argparse
+import unicodedata
+from pathlib import Path
+from datetime import datetime, timezone
+from typing import Optional
+
+import msal
+import requests
+from pymongo import MongoClient, UpdateOne
+
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
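+# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
+# NOTE(review): GRAPH_CLIENT_SECRET below looks like a real credential committed
+# in plain text — rotate it and load it from the environment / a secrets store.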
+GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
+GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
+GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
+GRAPH_URL           = "https://graph.microsoft.com/v1.0"
+
+MONGO_URI           = "mongodb://192.168.1.76:27017"
+MONGO_DB            = "emaily"
+MONGO_COL_INDEX     = "attachments_index"
+
+EMAILS_BASE_DIR     = Path("/mnt/Emails")
+LOG_FILE            = Path(__file__).parent / "parse_emails_errors.log"
+SCRIPT_VERSION      = "1.4"
+BATCH_SIZE          = 50
+
+# Attachment extensions that are skipped (S/MIME signatures, certificates)
+SKIP_EXTENSIONS = {".p7m", ".p7s", ".p7c", ".p7b"}
+
+# Collections in `emaily` that are NOT mailboxes
+NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
+
+# Mailboxes for which we have NO Graph API access
+SKIP_MAILBOXES = {
+    "vbuzalka@its.jnj.com",
+}
+
+# Retry configuration for transient errors
+RETRY_STATUSES   = {429, 500, 502, 503, 504}
+RETRY_BACKOFF_S  = [1, 2, 4]  # sleep before each retry: 3 retries, 4 attempts in total
+
+# Sentinel values returned by fetch_attachment_smart
+FETCH_MISSING    = "__MISSING__"     # 404
+FETCH_REFERENCE  = "__REFERENCE__"   # referenceAttachment
+# ──────────────────────────────────────────────────────────────────────────────
+
+# Only ERROR-level records are written, to LOG_FILE.
+logging.basicConfig(
+    filename=str(LOG_FILE),
+    level=logging.ERROR,
+    format="%(asctime)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    encoding="utf-8",
+)
+
+# Cached Graph access token; filled in by get_token().
+_graph_token: Optional[str] = None
+
+
+# ─── Graph API ────────────────────────────────────────────────────────────────
+
+def get_token() -> str:
+    global _graph_token
+    app = msal.ConfidentialClientApplication(
+        GRAPH_CLIENT_ID,
+        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
+        client_credential=GRAPH_CLIENT_SECRET,
+    )
+    result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+    if "access_token" not in result:
+        raise RuntimeError(f"Graph auth failed: {result}")
+    _graph_token = result["access_token"]
+    return _graph_token
+
+
+def _graph_request(method: str, url: str, *, params: dict = None,
+                   stream: bool = False, timeout: int = 60):
+    """Nizko-urovnova HTTP volani s retry na 429/5xx a auto-reauth na 401.
+    Vraci requests.Response (pro stream=True pred .content); pro 404 vraci Response."""
+    global _graph_token
+    if not _graph_token:
+        get_token()
+
+    last_exc = None
+    for attempt in range(len(RETRY_BACKOFF_S) + 1):
+        try:
+            r = requests.request(
+                method, url,
+                headers={"Authorization": f"Bearer {_graph_token}"},
+                params=params, timeout=timeout, stream=stream,
+            )
+            if r.status_code == 401:
+                get_token()
+                continue
+            if r.status_code in RETRY_STATUSES and attempt < len(RETRY_BACKOFF_S):
+                # Retry-After hlavicka ma prednost
+                ra = r.headers.get("Retry-After")
+                sleep_s = float(ra) if ra and ra.replace(".", "").isdigit() else RETRY_BACKOFF_S[attempt]
+                time.sleep(sleep_s)
+                continue
+            return r
+        except (requests.ConnectionError, requests.Timeout) as e:
+            last_exc = e
+            if attempt < len(RETRY_BACKOFF_S):
+                time.sleep(RETRY_BACKOFF_S[attempt])
+                continue
+            raise
+    raise RuntimeError(f"Graph request exhausted retries: {url} (last_exc={last_exc})")
+
+
+def graph_get_json(url: str, params: dict = None) -> dict:
+    r = _graph_request("GET", url, params=params, timeout=30)
+    r.raise_for_status()
+    return r.json()
+
+
+def graph_get_bytes(url: str) -> bytes:
+    r = _graph_request("GET", url, stream=True, timeout=120)
+    r.raise_for_status()
+    return r.content
+
+
+def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
+    """Nacte metadata vsech priloh zpravy. Vraci i @odata.type."""
+    url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
+    try:
+        # @odata.type se vraci automaticky (neni v base $select)
+        data = graph_get_json(url, {"$select": "id,name,contentType,size,isInline"})
+        return data.get("value", [])
+    except Exception as e:
+        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e)
+        return []
+
+
+def _build_eml_from_item(item: dict) -> bytes:
+    """Sestavi minimalni RFC822 .eml z itemAttachment.item (message)."""
+    def hdr(name, val):
+        return f"{name}: {val}\r\n" if val else ""
+
+    def addrs(field):
+        rec = item.get(field) or []
+        out = []
+        for r in rec:
+            ea = r.get("emailAddress") or {}
+            name = ea.get("name", "")
+            addr = ea.get("address", "")
+            if name and addr:
+                out.append(f'"{name}" <{addr}>')
+            elif addr:
+                out.append(addr)
+        return ", ".join(out)
+
+    subj = item.get("subject", "")
+    sender = item.get("from") or item.get("sender") or {}
+    sender_ea = sender.get("emailAddress") or {}
+    from_str = (f'"{sender_ea.get("name","")}" <{sender_ea.get("address","")}>'
+                if sender_ea.get("address") else "")
+    sent = item.get("sentDateTime") or item.get("receivedDateTime") or ""
+
+    body = item.get("body") or {}
+    content_type = body.get("contentType", "text")  # 'text' | 'html'
+    body_content = body.get("content", "") or ""
+
+    mime_type = "text/html" if content_type.lower() == "html" else "text/plain"
+
+    headers = (
+        hdr("From", from_str)
+        + hdr("To", addrs("toRecipients"))
+        + hdr("Cc", addrs("ccRecipients"))
+        + hdr("Subject", subj)
+        + hdr("Date", sent)
+        + f"Content-Type: {mime_type}; charset=utf-8\r\n"
+        + "MIME-Version: 1.0\r\n"
+        + "\r\n"
+    )
+    return (headers + body_content).encode("utf-8", errors="replace")
+
+
+def fetch_attachment_smart(mailbox: str, graph_message_id: str,
+                           attachment_id: str, odata_type: str = "") -> tuple:
+    """Fetch one attachment, choosing the request by its Graph attachment type.
+
+    Returns a 3-tuple ``(content_bytes, kind, extra)``:
+        - ``(bytes, "file", None)``: fileAttachment content.
+        - ``(bytes, "item", subject)``: itemAttachment rendered to .eml bytes;
+          ``extra`` is the inner message subject (informational).
+        - ``(None, FETCH_REFERENCE, url)``: referenceAttachment; ``extra`` is
+          its ``sourceUrl``, falling back to its ``name``.
+        - ``(None, FETCH_MISSING, None)``: Graph answered 404.
+
+    When ``odata_type`` is empty/unknown, ``/$value`` is tried first; a 405
+    answer triggers a metadata request to detect the real type.
+
+    Raises:
+        requests.HTTPError: on any other error status (via raise_for_status).
+        RuntimeError: when the detected type cannot be handled.
+        Transport errors from _graph_request propagate unchanged.
+    """
+    base = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments/{attachment_id}"
+
+    # Known type → take the direct path for it
+    if odata_type == "#microsoft.graph.fileAttachment":
+        r = _graph_request("GET", base + "/$value", stream=True, timeout=120)
+        if r.status_code == 404:
+            return (None, FETCH_MISSING, None)
+        r.raise_for_status()
+        return (r.content, "file", None)
+
+    if odata_type == "#microsoft.graph.itemAttachment":
+        r = _graph_request("GET", base,
+                           params={"$expand": "microsoft.graph.itemAttachment/item"},
+                           timeout=60)
+        if r.status_code == 404:
+            return (None, FETCH_MISSING, None)
+        r.raise_for_status()
+        obj = r.json()
+        item = obj.get("item") or {}
+        return (_build_eml_from_item(item), "item", item.get("subject"))
+
+    if odata_type == "#microsoft.graph.referenceAttachment":
+        r = _graph_request("GET", base, timeout=30)
+        if r.status_code == 404:
+            return (None, FETCH_MISSING, None)
+        r.raise_for_status()
+        obj = r.json()
+        return (None, FETCH_REFERENCE, obj.get("sourceUrl") or obj.get("name"))
+
+    # Unknown type — try $value; on 405 detect the type and process it recursively
+    r = _graph_request("GET", base + "/$value", stream=True, timeout=120)
+    if r.status_code == 404:
+        return (None, FETCH_MISSING, None)
+    if r.status_code == 405:
+        # Method Not Allowed -> not a fileAttachment; find out the real type
+        r2 = _graph_request("GET", base, timeout=30)
+        if r2.status_code == 404:
+            return (None, FETCH_MISSING, None)
+        r2.raise_for_status()
+        obj = r2.json()
+        ot = obj.get("@odata.type", "")
+        if ot == "#microsoft.graph.itemAttachment":
+            # the object carries no item without $expand → second request
+            return fetch_attachment_smart(mailbox, graph_message_id, attachment_id, ot)
+        if ot == "#microsoft.graph.referenceAttachment":
+            return (None, FETCH_REFERENCE, obj.get("sourceUrl") or obj.get("name"))
+        # fallback: a fileAttachment whose contentBytes are inlined in the JSON
+        if ot == "#microsoft.graph.fileAttachment":
+            import base64
+            cb = obj.get("contentBytes")
+            if cb:
+                return (base64.b64decode(cb), "file", None)
+        # NOTE(review): also reached for a fileAttachment without contentBytes,
+        # in which case the message below names a type that is actually known.
+        raise RuntimeError(f"unknown attachment odata.type={ot}")
+    r.raise_for_status()
+    return (r.content, "file", None)
+
+
+# ─── Pomocne funkce ───────────────────────────────────────────────────────────
+
+def normalize_name(name: str) -> str:
+    nfkd = unicodedata.normalize("NFKD", name.lower().strip())
+    ascii_str = "".join(c for c in nfkd if not unicodedata.combining(c))
+    return re.sub(r"[^\w.\-]", "_", ascii_str)
+
+
+def find_graph_att(att_name: str, att_size: int, graph_atts: list[dict]) -> Optional[dict]:
+    for ga in graph_atts:
+        if ga["name"] == att_name:
+            return ga
+    norm_want = normalize_name(att_name)
+    for ga in graph_atts:
+        if normalize_name(ga["name"]) == norm_want:
+            return ga
+    for ga in graph_atts:
+        if normalize_name(ga["name"]) == norm_want:
+            ga_size = ga.get("size", 0)
+            if att_size == 0 or ga_size == 0 or abs(ga_size - att_size) / max(ga_size, att_size) < 0.1:
+                return ga
+    for ga in graph_atts:
+        if norm_want[-20:] and normalize_name(ga["name"]).endswith(norm_want[-20:]):
+            return ga
+    return None
+
+
+def sha256(data: bytes) -> str:
+    return hashlib.sha256(data).hexdigest()
+
+
+def safe_filename(name: str) -> str:
+    safe = "".join(c if c.isalnum() or c in "._- ()" else "_" for c in name).strip()
+    return safe or "attachment"
+
+
+def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
+    existing = col_index.find_one({"filename": desired_name})
+    if existing:
+        if existing["_id"] == hash_val:
+            return desired_name
+        stem   = Path(desired_name).stem
+        suffix = Path(desired_name).suffix
+        n = 2
+        while True:
+            candidate = f"{stem}_{n}{suffix}"
+            ex2 = col_index.find_one({"filename": candidate})
+            if not ex2 or ex2["_id"] == hash_val:
+                if not (att_dir / candidate).exists() or (ex2 and ex2["_id"] == hash_val):
+                    return candidate
+            n += 1
+    return desired_name
+
+
+def save_attachment(content: bytes, original_name: str, mime_type: str,
+                    mailbox: str, att_dir: Path, col_index) -> tuple[str, str, bool]:
+    hash_val = sha256(content)
+    existing = col_index.find_one({"_id": hash_val})
+    if existing:
+        col_index.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}})
+        return hash_val, existing["local_path"], False
+
+    filename  = resolve_filename(safe_filename(original_name), att_dir, hash_val, col_index)
+    file_path = att_dir / filename
+    file_path.write_bytes(content)
+
+    col_index.insert_one({
+        "_id":           hash_val,
+        "filename":      filename,
+        "local_path":    filename,
+        "size_bytes":    len(content),
+        "mime_type":     mime_type,
+        "mailbox":       mailbox,
+        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
+        "ref_count":     1,
+    })
+    return hash_val, filename, True
+
+
+# ─── MAIN ─────────────────────────────────────────────────────────────────────
+
+def process_mailbox(client, mailbox: str, args) -> dict:
+    att_dir   = EMAILS_BASE_DIR / mailbox / "Attachments"
+    mongo_col = mailbox
+
+    start = datetime.now()
+    print(f"\n========== {mailbox} ==========")
+    print(f"Cilovy adresar: {att_dir}")
+
+    att_dir.mkdir(parents=True, exist_ok=True)
+
+    col_emails = client[MONGO_DB][mongo_col]
+    col_index  = client[MONGO_DB][MONGO_COL_INDEX]
+
+    if args.force_recheck:
+        query = {"has_attachments": True}
+    else:
+        # priloha "ke zpracovani" = neni inline, nema file_hash, neni oznacena
+        # jako missing/reference
+        query = {
+            "has_attachments": True,
+            "attachments": {
+                "$elemMatch": {
+                    "is_inline": False,
+                    "file_hash": {"$exists": False},
+                    "attachment_missing": {"$ne": True},
+                    "attachment_reference": {"$ne": True},
+                }
+            }
+        }
+
+    total = col_emails.count_documents(query)
+    print(f"Emailu ke zpracovani: {total}")
+    if total == 0:
+        print("  Neni co stahnout.")
+        return {"mailbox": mailbox, "ok": 0, "new": 0, "dup": 0, "skip": 0,
+                "miss": 0, "ref": 0, "err": 0, "elapsed": 0.0}
+
+    cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
+    if args.limit:
+        cursor = cursor.limit(args.limit)
+
+    ok_count   = 0
+    new_count  = 0
+    dup_count  = 0
+    skip_count = 0
+    miss_count = 0
+    ref_count  = 0
+    err_count  = 0
+    email_i    = 0
+    batch      = []
+
+    def flush():
+        if not batch:
+            return
+        try:
+            col_emails.bulk_write(batch, ordered=False)
+        except Exception as e:
+            logging.error("bulk_write: %s", e)
+            print(f"  CHYBA bulk_write: {e}")
+        batch.clear()
+
+    for email_doc in cursor:
+        email_i  += 1
+        email_id  = email_doc["_id"]
+        graph_id  = email_doc.get("graph_id", "")
+        subject   = (email_doc.get("subject") or "")[:60]
+        att_list  = email_doc.get("attachments") or []
+
+        real_atts = [a for a in att_list if not a.get("is_inline", False)
+                     and not a.get("attachment_missing")
+                     and not a.get("attachment_reference")]
+        if not real_atts:
+            continue
+
+        print(f"\n  {email_i:>5}/{total}  {subject}")
+
+        need_listing = any(
+            not a.get("is_inline", False)
+            and not (not args.force_recheck and a.get("file_hash"))
+            and not a.get("graph_att_id")
+            for a in att_list
+        )
+        graph_atts = fetch_message_attachments(mailbox, graph_id) if need_listing else []
+
+        # mapa graph_att_id -> @odata.type (z listingu pokud byl)
+        type_map = {ga["id"]: ga.get("@odata.type", "") for ga in graph_atts}
+
+        updated_atts = list(att_list)
+        email_ok     = True
+
+        for i, att in enumerate(updated_atts):
+            if att.get("is_inline", False):
+                continue
+            if att.get("attachment_missing") or att.get("attachment_reference"):
+                continue
+            if not args.force_recheck and att.get("file_hash"):
+                continue
+
+            att_name     = att.get("filename", "")
+            att_size     = att.get("size_bytes", 0)
+            graph_att_id = att.get("graph_att_id")
+
+            if Path(att_name).suffix.lower() in SKIP_EXTENSIONS:
+                updated_atts[i] = {**att, "file_hash": "skip", "local_path": ""}
+                skip_count += 1
+                print(f"         SKIP  {att_name} (S/MIME)")
+                continue
+
+            # Resolve graph_att_id + odata_type
+            resolved_id = graph_att_id
+            odata_type  = type_map.get(graph_att_id, "") if graph_att_id else ""
+
+            if not resolved_id:
+                # Fallback: name matching (legacy)
+                graph_att = find_graph_att(att_name, att_size, graph_atts)
+                if not graph_att:
+                    logging.error("attachment not found [email=%s att=%s]", email_id, att_name)
+                    print(f"         ERR   {att_name} (nenalezeno)")
+                    err_count += 1
+                    email_ok = False
+                    continue
+                if graph_att.get("isInline", False):
+                    updated_atts[i] = {**att, "is_inline": True, "file_hash": "skip", "local_path": ""}
+                    skip_count += 1
+                    print(f"         SKIP  {att_name} (inline obrazek)")
+                    continue
+                resolved_id = graph_att["id"]
+                odata_type  = graph_att.get("@odata.type", "")
+
+            # Smart fetch
+            try:
+                content, kind, extra = fetch_attachment_smart(
+                    mailbox, graph_id, resolved_id, odata_type
+                )
+            except Exception as e:
+                logging.error("fetch_attachment_smart failed [msg=%s att=%s type=%s]: %s",
+                              graph_id, resolved_id, odata_type, e)
+                err_count += 1
+                email_ok = False
+                print(f"         ERR   {att_name} (stazeni selhalo)")
+                continue
+
+            now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
+
+            if kind == FETCH_MISSING:
+                updated_atts[i] = {
+                    **att,
+                    "attachment_missing": True,
+                    "attachment_missing_at": now_utc,
+                }
+                miss_count += 1
+                print(f"         MISS  {att_name} (404 — oznaceno jako missing)")
+                continue
+
+            if kind == FETCH_REFERENCE:
+                updated_atts[i] = {
+                    **att,
+                    "attachment_reference": True,
+                    "attachment_type": "reference",
+                    "reference_url": extra,
+                }
+                ref_count += 1
+                print(f"         REF   {att_name} -> {extra}")
+                continue
+
+            # kind in ('file', 'item') — mame bytes
+            mime_type = att.get("mime_type") or (
+                "message/rfc822" if kind == "item" else "application/octet-stream"
+            )
+
+            # Pro itemAttachment vyrobime .eml priponu pokud chybi
+            save_name = att_name
+            if kind == "item" and not save_name.lower().endswith(".eml"):
+                save_name = (save_name or "embedded_email") + ".eml"
+
+            hash_val, local_path, was_new = save_attachment(
+                content, save_name, mime_type, mailbox, att_dir, col_index
+            )
+
+            updated_atts[i] = {
+                **att,
+                "file_hash":       hash_val,
+                "local_path":      local_path,
+                "attachment_type": kind,
+            }
+
+            if was_new:
+                new_count += 1
+                tag = "NEW(eml)" if kind == "item" else "NEW"
+                print(f"         {tag}   {local_path}  ({len(content):,} B)")
+            else:
+                dup_count += 1
+                print(f"         DUP   {att_name} -> {local_path}")
+
+        if email_ok:
+            ok_count += 1
+
+        batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))
+
+        if len(batch) >= BATCH_SIZE:
+            flush()
+
+        if email_i % 100 == 0:
+            elapsed = (datetime.now() - start).total_seconds()
+            print(f"  {'─'*60}")
+            print(f"  Průběh: emaily={email_i}/{total}  nove={new_count}  dup={dup_count}  "
+                  f"skip={skip_count} miss={miss_count} ref={ref_count} err={err_count}")
+            print(f"  {'─'*60}")
+
+    flush()
+
+    elapsed = (datetime.now() - start).total_seconds()
+    print(f"  -> mailbox total: emaily={ok_count} nove={new_count} dup={dup_count} "
+          f"skip={skip_count} miss={miss_count} ref={ref_count} err={err_count} ({elapsed:.1f} s)")
+    return {"mailbox": mailbox, "ok": ok_count, "new": new_count, "dup": dup_count,
+            "skip": skip_count, "miss": miss_count, "ref": ref_count, "err": err_count,
+            "elapsed": elapsed}
+
+
+def discover_mailboxes(db) -> list[str]:
+    out = []
+    for name in sorted(db.list_collection_names()):
+        if name in NON_MAILBOX_COLLECTIONS:
+            continue
+        if name in SKIP_MAILBOXES:
+            print(f"  [skip] {name} — v SKIP_MAILBOXES (neni Graph pristup)")
+            continue
+        out.append(name)
+    return out
+
+
+def main():
+    """Command-line entry point: download attachments for one or all mailboxes.
+
+    Parses the CLI arguments, checks Graph and MongoDB connectivity, processes
+    each selected mailbox and prints a summary.
+
+    Returns:
+        1 when any error was counted, otherwise 0.
+
+    Exits directly with code 1 when Graph authentication or the MongoDB ping
+    fails, and with code 2 when ``--mailbox`` names a mailbox listed in
+    SKIP_MAILBOXES.
+    """
+    ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
+    ap.add_argument("--mailbox",       default="",
+                    help="Emailova schranka. Bez argumentu projede vsechny schranky.")
+    ap.add_argument("--limit",         type=int, default=0,
+                    help="Zpracovat max N emailu (0 = vse) — per schranka")
+    ap.add_argument("--force-recheck", action="store_true",
+                    help="Znovu overi i emaily kde prilohy uz maji file_hash / missing / reference")
+    ap.add_argument("--no-indexes",    action="store_true",
+                    help="Nevytvorit indexy na attachments_index kolekci")
+    args = ap.parse_args()
+
+    start_all = datetime.now()
+    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
+    print(f"Start: {start_all.strftime('%Y-%m-%d %H:%M:%S')}")
+
+    # Fail fast when no Graph token can be obtained.
+    print("\nPřipojuji se k Graph API...")
+    try:
+        get_token()
+        print("  Graph API OK")
+    except Exception as e:
+        print(f"  CHYBA: {e}")
+        sys.exit(1)
+
+    # Fail fast when MongoDB does not answer a ping within 5 s.
+    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    try:
+        client.admin.command("ping")
+        print("  MongoDB OK")
+    except Exception as e:
+        print(f"  CHYBA: MongoDB neni dostupna -- {e}")
+        sys.exit(1)
+
+    col_index = client[MONGO_DB][MONGO_COL_INDEX]
+    if not args.no_indexes:
+        col_index.create_index("filename")
+        col_index.create_index("mime_type")
+        col_index.create_index("mailbox")
+
+    # Either the single requested mailbox, or every discovered mailbox collection.
+    db = client[MONGO_DB]
+    if args.mailbox:
+        if args.mailbox in SKIP_MAILBOXES:
+            print(f"  CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.")
+            sys.exit(2)
+        mailboxes = [args.mailbox]
+    else:
+        mailboxes = discover_mailboxes(db)
+        print(f"  Schranky ke zpracovani: {len(mailboxes)}")
+        for m in mailboxes:
+            print(f"    {m}")
+
+    # A failure inside one mailbox is recorded as err=1 and does not stop the others.
+    results = []
+    for mb in mailboxes:
+        try:
+            results.append(process_mailbox(client, mb, args))
+        except Exception as e:
+            logging.error("process_mailbox %s: %s", mb, e)
+            print(f"  FATAL pri zpracovani {mb}: {e}")
+            results.append({"mailbox": mb, "ok": 0, "new": 0, "dup": 0,
+                            "skip": 0, "miss": 0, "ref": 0, "err": 1, "elapsed": 0.0})
+
+    elapsed_total = (datetime.now() - start_all).total_seconds()
+    # Index-wide totals (all documents in the index collection, not only this run).
+    files_total   = col_index.count_documents({})
+    size_total    = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))
+
+    # Sum every counter across the per-mailbox results.
+    grand = {k: sum(r.get(k, 0) for r in results)
+             for k in ("ok", "new", "dup", "skip", "miss", "ref", "err")}
+
+    print(f"\n{'='*60}")
+    print("=== SHRNUTI ===")
+    for r in results:
+        print(f"  {r['mailbox']:40} ok={r['ok']:>5} nove={r['new']:>4} "
+              f"dup={r['dup']:>4} skip={r['skip']:>3} miss={r.get('miss',0):>3} "
+              f"ref={r.get('ref',0):>3} err={r['err']:>3}")
+    print(f"  {'TOTAL':40} ok={grand['ok']:>5} nove={grand['new']:>4} "
+          f"dup={grand['dup']:>4} skip={grand['skip']:>3} miss={grand['miss']:>3} "
+          f"ref={grand['ref']:>3} err={grand['err']:>3}")
+    print(f"Souboru v indexu: {files_total}  ({size_total / 1024 / 1024:.1f} MB)")
+    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
+    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    if grand['err']:
+        print(f"Chyby logovany do: {LOG_FILE}")
+
+    client.close()
+    return 1 if grand['err'] > 0 else 0
+
+
+# Script entry point: main() returns 1 when any error was counted, else 0,
+# and that value becomes the process exit code.
+if __name__ == "__main__":
+    sys.exit(main() or 0)
diff --git a/Python-runner/4_unwrap_smime_v1.0.md b/Python-runner/4_unwrap_smime_v1.0.md
new file mode 100644
index 0000000..db8e420
--- /dev/null
+++ b/Python-runner/4_unwrap_smime_v1.0.md
@@ -0,0 +1,63 @@
+# 4_unwrap_smime_v1.0.py
+
+**Krok 4 pipeline** — rozbalení S/MIME wrapper zpráv. Některé emaily (Datová schránka, mBank, ComGate, PayU, PostSignum …) mají viditelné tělo jen *"This is an S/MIME signed message"* — skutečný obsah je zabalený uvnitř přílohy `smime.p7m`.
+
+Skript najde tyto emaily, stáhne binárku `smime.p7m` z Graphu, rozbalí PKCS7 SignedData (`asn1crypto.cms`), extrahuje vnitřní MIME zprávu a doplní do Mongo:
+
+| Pole | Obsah |
+|---|---|
+| `smime_unwrapped: True` | flag — už rozbaleno |
+| `smime_subject` | Subject z vnitřní MIME hlavičky |
+| `smime_body_text` | plain text vnitřního těla |
+| `smime_body_html` | HTML vnitřního těla (pokud je) |
+| `smime_inner_attachments[]` | `{filename, content_type, size_bytes}` vnitřních příloh |
+
+## POZOR: `smime.p7m` vs `smime.p7s`
+
+| Příloha | Co to je | Skript dělá |
+|---|---|---|
+| `smime.p7m` | **Opaque signed wrapper** (PKCS7 SignedData) kolem celé MIME zprávy | **Rozbalí** |
+| `smime.p7s` | **Detached signature** vedle čistého emailu (tělo je normálně dostupné) | **Ignoruje** — není co rozbalovat |
+
+Filtr ve skriptu (`SMIME_FILTER`) je proto explicitně `^smime\.p7m$`. Email s přílohou `smime.p7s` a `smime_unwrapped != True` je **správný stav**.
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | ne | e-mail | (všechny) | Jedna konkrétní schránka. Bez argumentu projede všechny kolekce v `emaily` mimo `SKIP_COLLECTIONS` (`attachments_index`, `sync_state`) a `SKIP_MAILBOXES`. |
+| `--limit N` | ne | int | (bez limitu) | Limit emailů na schránku (test) |
+
+## SKIP_MAILBOXES (hardcoded)
+
+| Schránka | Důvod |
+|---|---|
+| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Při běhu bez `--mailbox` se tiše přeskočí. S explicitním `--mailbox vbuzalka@its.jnj.com` skript skončí exit kódem 2. |
+
+## Varianty volání
+
+```bash
+# Všechny schránky (mimo SKIP_MAILBOXES):
+docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py
+
+# Jedna schránka:
+docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py --mailbox ordinace@buzalkova.cz
+
+# Test 10 emailů:
+docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py --mailbox ordinace@buzalkova.cz --limit 10
+
+# Plný běh na pozadí, log do souboru:
+docker exec -d python-runner bash -c "python /scripts/4_unwrap_smime_v1.0.py > /scripts/unwrap_smime.log 2>&1"
+```
+
+## Závislosti
+
+```bash
+docker exec python-runner pip install asn1crypto
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/unwrap_smime.log
+```
diff --git a/Python-runner/4_unwrap_smime_v1.0.py b/Python-runner/4_unwrap_smime_v1.0.py
new file mode 100644
index 0000000..6d79340
--- /dev/null
+++ b/Python-runner/4_unwrap_smime_v1.0.py
@@ -0,0 +1,445 @@
+"""
+==============================================================================
+Skript:   unwrap_smime_v1.0.py
+Verze:    1.0
+Datum:    2026-06-03
+Autor:    vladimir.buzalka
+
+Popis:
+  Najde v Mongo emaily s prilohou smime.p7m (S/MIME signed-data),
+  stahne binarni obsah prilohy z Microsoft Graph API, rozbali PKCS7
+  SignedData (CMS), extrahuje vnitrni MIME message, a ulozi do Mongo:
+    - smime_unwrapped: True
+    - smime_body_text   : plain text vnitrniho tela
+    - smime_body_html   : HTML vnitrniho tela (kdyz je)
+    - smime_subject     : Subject vnitrni MIME hlavicky
+    - smime_inner_attachments : [{filename, content_type, size_bytes}]
+
+  Tato pole pak pouzije enrich_fulltext_emails_v1.2 a doplni jejich
+  obsah do PG fulltext indexu.
+
+  Typicke S/MIME odesilatele:
+    notifikace@mojedatovaschranka.cz  (844 emailu)
+    kontakt@mbank.cz                  (226)
+    payments@comgate.cz, service@payu.com  (~250)
+    info.postsignum@cpost.cz
+
+Architekturalni poznamka:
+  S/MIME priloha smime.p7m ma Content-Type application/pkcs7-mime
+  s parametrem smime-type=signed-data. Vnitrni obsah je v PKCS7
+  ContentInfo -> SignedData -> encapContentInfo.eContent. To uz je
+  primo MIME zprava (multipart nebo single body).
+
+Zavislosti (instalovat v kontejneru):
+  pip install asn1crypto
+
+Spusteni:
+  python unwrap_smime_v1.0.py                              # vsechny schranky (mimo SKIP_MAILBOXES)
+  python unwrap_smime_v1.0.py --mailbox vladimir.buzalka@buzalka.cz
+  python unwrap_smime_v1.0.py --limit 10                   # test
+
+SKIP_MAILBOXES (hardcoded):
+  vbuzalka@its.jnj.com   — JNJ tenant, nemame Graph API pristup. Pri behu
+                            bez --mailbox se tise preskoci, s --mailbox skript
+                            skonci s exit kodem 2.
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import email
+import email.policy
+import logging
+import sys
+import time
+import traceback
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+import msal
+import requests
+from asn1crypto import cms
+from pymongo import MongoClient, UpdateOne
+
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
+# --- konfigurace ------------------------------------------------------------
+GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
+GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
+GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
+GRAPH_URL           = "https://graph.microsoft.com/v1.0"
+
+MONGO_URI = "mongodb://192.168.1.76:27017"
+MONGO_DB  = "emaily"
+SKIP_COLLECTIONS = {"attachments_index", "sync_state"}
+
+# Mailboxes for which we have NO Graph API access — skipped in a normal (all-mailboxes) run.
+SKIP_MAILBOXES = {
+    "vbuzalka@its.jnj.com",   # JNJ tenant — nemame Graph credentials
+}
+
+MAX_BODY_BYTES   = 2 * 1024 * 1024   # 2 MB strop pro extrahovany text
+BATCH_SIZE       = 25
+LOG_FILE         = Path(__file__).parent / "unwrap_smime_errors.log"
+
+logging.basicConfig(
+    filename=str(LOG_FILE),
+    level=logging.ERROR,
+    format="%(asctime)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    encoding="utf-8",
+)
+
+
+# --- Graph auth -------------------------------------------------------------
+_token: Optional[str] = None
+
+
+def get_token() -> str:
+    global _token
+    app = msal.ConfidentialClientApplication(
+        GRAPH_CLIENT_ID,
+        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
+        client_credential=GRAPH_CLIENT_SECRET,
+    )
+    res = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+    if "access_token" not in res:
+        raise RuntimeError(f"Graph auth failed: {res}")
+    _token = res["access_token"]
+    return _token
+
+
+def graph_get_raw(url: str) -> Optional[bytes]:
+    """GET na Graph endpoint, vraci raw bytes (pro $value attachment endpoint)."""
+    global _token
+    if not _token:
+        get_token()
+    for attempt in range(3):
+        try:
+            r = requests.get(url, headers={"Authorization": f"Bearer {_token}"}, timeout=60)
+            if r.status_code == 401:
+                get_token(); continue
+            if r.status_code == 404:
+                return None
+            if r.status_code == 429:
+                wait = int(r.headers.get("Retry-After", "5"))
+                time.sleep(wait); continue
+            r.raise_for_status()
+            return r.content
+        except requests.RequestException:
+            if attempt == 2:
+                raise
+            time.sleep(2)
+    return None
+
+
+# --- PKCS7 / MIME unwrap ----------------------------------------------------
+
+def extract_inner_mime(content_bytes: bytes) -> bytes:
+    """Z S/MIME prilohy vytahne vnitrni MIME (signed content) jako bytes.
+
+    Dva formaty se v Graph API vyskytuji:
+      A) multipart/signed (detached signature) - bytes zacinaji 'Content-Type: multipart/signed'.
+         Obsah je rovnou citelny v prvni MIME casti (druha cast je oddeleny PKCS7 podpis).
+      B) application/pkcs7-mime (opaque, smime-type=signed-data) - vnitrni MIME je
+         schovany uvnitr PKCS7 SignedData -> encapContentInfo.eContent.
+
+    Vraci raw MIME bytes pro pripravu pro email.message_from_bytes.
+    """
+    head = content_bytes[:300].lower()
+
+    # A) multipart/signed (detached) - nejcastejsi pro maily z Graphu
+    if b"content-type:" in head and b"multipart/signed" in head:
+        try:
+            outer = email.message_from_bytes(content_bytes, policy=email.policy.default)
+        except Exception as e:
+            raise RuntimeError(f"MIME parse failed: {e}")
+        # iteruj parts - prvni non-signature je signed payload
+        signed_payload = None
+        if outer.is_multipart():
+            for part in outer.iter_parts():
+                ct = (part.get_content_type() or "").lower()
+                if "pkcs7-signature" in ct or "x-pkcs7-signature" in ct:
+                    continue
+                signed_payload = part
+                break
+        if signed_payload is None:
+            raise RuntimeError("multipart/signed: no signed payload found")
+        return signed_payload.as_bytes()
+
+    # B) opaque PKCS7 SignedData - DER nebo base64
+    data = content_bytes
+    try:
+        ci = cms.ContentInfo.load(data)
+    except Exception:
+        try:
+            import base64
+            stripped = b"".join(line for line in data.splitlines()
+                                if not line.startswith(b"-----"))
+            data = base64.b64decode(stripped, validate=False)
+            ci = cms.ContentInfo.load(data)
+        except Exception as e:
+            raise RuntimeError(f"PKCS7/MIME parse failed: {e}")
+
+    if ci["content_type"].native != "signed_data":
+        raise RuntimeError(f"Not signed-data, got {ci['content_type'].native}")
+    sd = ci["content"]
+    inner = sd["encap_content_info"]["content"]
+    if inner is None:
+        raise RuntimeError("encapContentInfo.content is null (detached without MIME wrapper)")
+    return bytes(inner.native) if hasattr(inner, "native") else bytes(inner)
+
+
+def parse_inner_mime(mime_bytes: bytes) -> dict:
+    """Z MIME bytes vytahne text, html a prilohy."""
+    msg = email.message_from_bytes(mime_bytes, policy=email.policy.default)
+
+    text_parts: list[str] = []
+    html_parts: list[str] = []
+    inner_attachments: list[dict] = []
+
+    def walk(part):
+        ctype = part.get_content_type()
+        disp = (part.get_content_disposition() or "").lower()
+        filename = part.get_filename()
+
+        if part.is_multipart():
+            for sub in part.iter_parts():
+                walk(sub)
+            return
+
+        if disp == "attachment" or filename:
+            try:
+                payload = part.get_content()
+                if isinstance(payload, str):
+                    payload_bytes = payload.encode("utf-8", errors="replace")
+                elif isinstance(payload, bytes):
+                    payload_bytes = payload
+                else:
+                    payload_bytes = b""
+                size = len(payload_bytes)
+            except Exception:
+                size = 0
+            inner_attachments.append({
+                "filename": filename or "(unnamed)",
+                "content_type": ctype,
+                "size_bytes": size,
+            })
+            return
+
+        if ctype == "text/plain":
+            try:
+                text_parts.append(part.get_content())
+            except Exception:
+                try:
+                    text_parts.append(part.get_payload(decode=True).decode(
+                        part.get_content_charset() or "utf-8", errors="replace"))
+                except Exception:
+                    pass
+        elif ctype == "text/html":
+            try:
+                html_parts.append(part.get_content())
+            except Exception:
+                try:
+                    html_parts.append(part.get_payload(decode=True).decode(
+                        part.get_content_charset() or "utf-8", errors="replace"))
+                except Exception:
+                    pass
+
+    walk(msg)
+
+    body_text = "\n\n".join(t.strip() for t in text_parts if t and t.strip())
+    body_html = "\n".join(h for h in html_parts if h and h.strip())
+    if len(body_text) > MAX_BODY_BYTES:
+        body_text = body_text[:MAX_BODY_BYTES]
+    if len(body_html) > MAX_BODY_BYTES:
+        body_html = body_html[:MAX_BODY_BYTES]
+
+    return {
+        "subject": str(msg.get("Subject") or "").strip(),
+        "from":    str(msg.get("From") or "").strip(),
+        "to":      str(msg.get("To") or "").strip(),
+        "date":    str(msg.get("Date") or "").strip(),
+        "body_text": body_text or None,
+        "body_html": body_html or None,
+        "inner_attachments": inner_attachments,
+    }
+
+
+# --- hlavni smycka ----------------------------------------------------------
+
+SMIME_FILTER = {  # Mongo query: messages that still need S/MIME unwrapping
+    "$and": [
+        {"attachments.filename": {"$regex": "^smime\\.p7m$", "$options": "i"}},  # exactly smime.p7m, any case; smime.p7s does not match
+        {"smime_unwrapped": {"$ne": True}},  # not unwrapped yet; failed attempts (False) are selected again
+    ]
+}
+
+
+def find_p7m_graph_att_id(doc: dict) -> Optional[str]:
+    for att in doc.get("attachments") or []:
+        if (att.get("filename") or "").lower() == "smime.p7m":
+            return att.get("graph_att_id")
+    return None
+
+
+def process_mailbox(col, mailbox: str, limit: Optional[int]) -> dict:
+    total = col.count_documents(SMIME_FILTER)
+    print(f"[{mailbox}] S/MIME k rozbaleni: {total}"
+          + (f" (limit {limit})" if limit else ""))
+    if total == 0:
+        return {"mailbox": mailbox, "candidates": 0, "unwrapped": 0,
+                "errors": 0, "no_att_id": 0, "missing": 0,
+                "with_inner_att": 0, "inner_att_total": 0}
+
+    cursor = col.find(SMIME_FILTER, {"_id": 1, "graph_id": 1, "attachments": 1},
+                      no_cursor_timeout=True)
+    if limit:
+        cursor = cursor.limit(limit)
+
+    n = unwrapped = err = no_att_id = missing = with_inner = inner_total = 0
+    bulk: list[UpdateOne] = []
+
+    try:
+        for doc in cursor:
+            n += 1
+            mid = doc["_id"]
+            gid = doc.get("graph_id")
+            att_id = find_p7m_graph_att_id(doc)
+            if not gid or not att_id:
+                no_att_id += 1
+                continue
+
+            url = f"{GRAPH_URL}/users/{mailbox}/messages/{gid}/attachments/{att_id}/$value"
+            try:
+                p7m_bytes = graph_get_raw(url)
+            except Exception as e:
+                err += 1
+                logging.error("[%s] graph fetch %s: %s", mailbox, gid, e)
+                bulk.append(UpdateOne({"_id": mid}, {"$set": {
+                    "smime_unwrapped": False,
+                    "smime_error": f"fetch: {type(e).__name__}: {e}"[:300],
+                    "smime_processed_at": datetime.now(timezone.utc).replace(tzinfo=None),
+                }}))
+                continue
+            if p7m_bytes is None:
+                missing += 1
+                bulk.append(UpdateOne({"_id": mid}, {"$set": {
+                    "smime_unwrapped": False,
+                    "smime_error": "attachment_404",
+                    "smime_processed_at": datetime.now(timezone.utc).replace(tzinfo=None),
+                }}))
+                continue
+
+            try:
+                inner_bytes = extract_inner_mime(p7m_bytes)
+                parsed = parse_inner_mime(inner_bytes)
+            except Exception as e:
+                err += 1
+                logging.error("[%s] unwrap %s: %s", mailbox, mid, e)
+                bulk.append(UpdateOne({"_id": mid}, {"$set": {
+                    "smime_unwrapped": False,
+                    "smime_error": f"unwrap: {type(e).__name__}: {e}"[:300],
+                    "smime_processed_at": datetime.now(timezone.utc).replace(tzinfo=None),
+                }}))
+                continue
+
+            inner_atts = parsed["inner_attachments"]
+            inner_total += len(inner_atts)
+            if inner_atts:
+                with_inner += 1
+
+            update = {
+                "smime_unwrapped": True,
+                "smime_processed_at": datetime.now(timezone.utc).replace(tzinfo=None),
+                "smime_body_text": parsed["body_text"],
+                "smime_body_html": parsed["body_html"],
+                "smime_subject": parsed["subject"],
+                "smime_from": parsed["from"],
+                "smime_to": parsed["to"],
+                "smime_date": parsed["date"],
+                "smime_inner_attachments": inner_atts,
+                "smime_error": None,
+            }
+            bulk.append(UpdateOne({"_id": mid}, {"$set": update}))
+            unwrapped += 1
+
+            if len(bulk) >= BATCH_SIZE:
+                col.bulk_write(bulk, ordered=False)
+                bulk.clear()
+
+            if n % 50 == 0 or n == 1:
+                print(f"  [{n:>5}/{total}] unwrapped={unwrapped} err={err} "
+                      f"no_att_id={no_att_id} missing={missing} "
+                      f"inner_atts_total={inner_total}", flush=True)
+    finally:
+        cursor.close()
+        if bulk:
+            col.bulk_write(bulk, ordered=False)
+
+    print(f"  [{n}/{total}] DONE  unwrapped={unwrapped} err={err} "
+          f"no_att_id={no_att_id} missing={missing} "
+          f"with_inner_atts={with_inner} inner_atts_total={inner_total}")
+    return {"mailbox": mailbox, "candidates": total, "unwrapped": unwrapped,
+            "errors": err, "no_att_id": no_att_id, "missing": missing,
+            "with_inner_att": with_inner, "inner_att_total": inner_total}
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)")
+    ap.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
+    args = ap.parse_args()
+
+    t0 = time.time()
+    print("Pripojuji se k MongoDB...")
+    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    mongo.admin.command("ping")
+    db = mongo[MONGO_DB]
+
+    print("Token Graph API...")
+    get_token()
+    print("OK\n")
+
+    if args.mailbox:
+        if args.mailbox in SKIP_MAILBOXES:
+            print(f"CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.")
+            return 2
+        mailboxes = [args.mailbox]
+    else:
+        mailboxes = []
+        for c in db.list_collection_names():
+            if c in SKIP_COLLECTIONS:
+                continue
+            if c in SKIP_MAILBOXES:
+                print(f"  [skip] {c} — v SKIP_MAILBOXES (neni Graph pristup)")
+                continue
+            mailboxes.append(c)
+    print(f"Schranky ({len(mailboxes)}): {mailboxes}\n")
+
+    results = []
+    for mb in mailboxes:
+        results.append(process_mailbox(db[mb], mb, limit=args.limit))
+        print()
+
+    print("=== SHRNUTI ===")
+    for r in results:
+        print(f"  {r['mailbox']}: candidates={r['candidates']}  unwrapped={r['unwrapped']}  "
+              f"errors={r['errors']}  no_att_id={r['no_att_id']}  missing={r['missing']}  "
+              f"with_inner_atts={r['with_inner_att']}  inner_atts_total={r['inner_att_total']}")
+    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
+    total_errors = sum(r.get("errors", 0) for r in results)
+    return 1 if total_errors > 0 else 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())
+    except KeyboardInterrupt:
+        print("\nPreruseno uzivatelem")
+    except Exception:
+        traceback.print_exc()
+        sys.exit(1)
diff --git a/Python-runner/5_enrich_fulltext_emails_v1.2.md b/Python-runner/5_enrich_fulltext_emails_v1.2.md
new file mode 100644
index 0000000..d6392fb
--- /dev/null
+++ b/Python-runner/5_enrich_fulltext_emails_v1.2.md
@@ -0,0 +1,47 @@
+# 5_enrich_fulltext_emails_v1.2.py
+
+**Krok 5 pipeline** — vytáhne plný text z emailů v MongoDB (db: `emaily`) a uloží do PostgreSQL (db: `MongoEmaily`, tabulka: `emails`) s GIN `tsvector` indexem (config `soubory` — simple + unaccent).
+
+Emaily se **nestahují znovu** — tělo už je v Mongo z kroků 1/2/4. Tento skript jen vybere první dostupné tělo podle priority a pošle text do PG na fulltext.
+
+## Priorita zdroje těla (`body_source`)
+
+1. **`smime`** — `smime_body_text` / `smime_body_html` (pokud unwrap proběhl)
+2. **`html`** — `body_html`
+3. **`text`** — `body_text` (z parse v1.4 nebo refetch v1.0)
+4. **`preview`** — `body_preview` (fallback)
+
+Názvy vnitřních S/MIME příloh (`smime_inner_attachments`) jdou do `attachments_summary` — dohledatelné přes MCP `emaily.find_attachment`.
+
+## Inkrementalita
+
+Pokud `(mailbox, message_id)` v PG existuje, `extractor_version` je aktuální (1.2) a `modified_at` v Mongo není novější → **skip**. Při bumpu `EXTRACTOR_VERSION` se vše přeparsuje.
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | ne | e-mail | (všechny) | Jedna konkrétní schránka |
+| `--limit N` | ne | int | (bez limitu) | Limit emailů na schránku (test) |
+
+## Varianty volání
+
+```bash
+# Všechny schránky:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py
+
+# Jedna schránka:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py --mailbox ordinace@buzalkova.cz
+
+# Test 500 emailů:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py --mailbox ordinace@buzalkova.cz --limit 500
+
+# Plný běh na pozadí, log do souboru:
+docker exec -d python-runner bash -c "python /scripts/5_enrich_fulltext_emails_v1.2.py > /scripts/enrich.log 2>&1"
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/enrich.log
+```
diff --git a/Python-runner/5_enrich_fulltext_emails_v1.2.py b/Python-runner/5_enrich_fulltext_emails_v1.2.py
new file mode 100644
index 0000000..530d19c
--- /dev/null
+++ b/Python-runner/5_enrich_fulltext_emails_v1.2.py
@@ -0,0 +1,489 @@
+"""
+==============================================================================
+Skript:   enrich_fulltext_emails_v1.2.py
+Verze:    1.2
+Datum:    2026-06-03
+Autor:    vladimir.buzalka
+
+Popis:
+  Vytahne plny text z emailu ulozenych v MongoDB (db: emaily) a ulozi ho do
+  PostgreSQL (db: MongoEmaily, tabulka: emails) s GIN tsvector indexem.
+
+  Emaily se NESTAHUJI znovu - tela uz jsou v Mongo z parse_emails_graph_v1.4
+  (a refetch_text_bodies_v1.0 pro stare plain-text emaily).
+  Tento skript jen vybere prvni dostupne telo a posle text do PG na fulltext.
+
+Zmeny proti v1.1:
+  - S/MIME emaily (signed-data od Datove schranky, mBank, ComGate, PayU, ...):
+    pokud unwrap_smime_v1.0 ulozil smime_body_text/smime_body_html, pouzije se
+    PREFEROVANE pred bezvyznamnym vnejsim wrapper telem ("This is an S/MIME
+    signed message"). Nazvy vnitrnich priloh (smime_inner_attachments) se
+    pridavaji do attachments_summary, tj. dohledatelne pres find_attachment.
+  - body_source: nova hodnota "smime" (rozbalene vnitrni telo).
+  - EXTRACTOR_VERSION=1.2 -> vsechny existujici emaily v PG se preparsuji.
+
+Zmeny v1.1 vs v1.0:
+  - Fallback poradi rozsireno o body_text (novy v parse_emails_graph_v1.4).
+  - body_source umi novou hodnotu "text" (plne plain-text telo, max 2 MB).
+
+Zdroj:
+  MongoDB    192.168.1.76  db=emaily  kolekce=<mailbox>
+             (krome attachments_index)
+
+Cil:
+  PostgreSQL 192.168.1.76  db=MongoEmaily  tabulka=emails
+             tsvector config 'soubory' (sdileny - simple + unaccent)
+
+Inkrementalita:
+  Pokud (mailbox, message_id) jiz existuje a extractor_version je aktualni
+  a modified_at v Mongo neni novejsi -> skip. Pri zmene verze extractoru
+  se vse preparsuje.
+
+Spusteni:
+  python 5_enrich_fulltext_emails_v1.2.py                       # vsechny schranky
+  python 5_enrich_fulltext_emails_v1.2.py --mailbox vbuzalka@its.jnj.com
+  python 5_enrich_fulltext_emails_v1.2.py --limit 500           # test
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import sys
+import time
+import traceback
+from datetime import datetime, timezone
+from typing import Optional
+
+import psycopg
+from bs4 import BeautifulSoup
+from pymongo import MongoClient
+
+# --- konfigurace ------------------------------------------------------------
+MONGO_URI = "mongodb://192.168.1.76:27017"
+MONGO_DB = "emaily"
+
+PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoEmaily "
+          "user=vladimir.buzalka password=Vlado7309208104++")
+
+EXTRACTOR_VERSION = "1.2"
+
+MAX_TEXT_BYTES = 5 * 1024 * 1024   # plain text max 5 MB
+SKIP_COLLECTIONS = {"attachments_index"}
+
+BATCH_SIZE = 100
+
+
+# --- SCHEMA -----------------------------------------------------------------
+
+SCHEMA_SQL = """
+CREATE EXTENSION IF NOT EXISTS unaccent;
+CREATE EXTENSION IF NOT EXISTS pg_trgm;
+
+DO $$
+BEGIN
+    IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
+        CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
+        ALTER TEXT SEARCH CONFIGURATION soubory
+            ALTER MAPPING FOR hword, hword_part, word
+            WITH unaccent, simple;
+    END IF;
+END$$;
+
+CREATE TABLE IF NOT EXISTS emails (
+    id              BIGSERIAL PRIMARY KEY,
+    mailbox         TEXT NOT NULL,
+    message_id      TEXT NOT NULL,
+    graph_id        TEXT,
+    conversation_id TEXT,
+    folder_path     TEXT,
+    subject         TEXT,
+    sender_email    TEXT,
+    sender_name     TEXT,
+    to_addrs        TEXT,
+    cc_addrs        TEXT,
+    bcc_addrs       TEXT,
+    sent_at         TIMESTAMPTZ,
+    received_at     TIMESTAMPTZ,
+    modified_at     TIMESTAMPTZ,
+    is_read         BOOLEAN,
+    is_draft        BOOLEAN,
+    has_attachments BOOLEAN,
+    attachment_count INT,
+    attachments_summary TEXT,
+    body            TEXT,
+    body_length     INT,
+    body_source     TEXT,         -- 'html' | 'preview' | 'empty'
+    tsv             tsvector GENERATED ALWAYS AS (
+        to_tsvector('soubory'::regconfig,
+            left(
+                coalesce(subject, '') || ' ' ||
+                coalesce(sender_email, '') || ' ' ||
+                coalesce(sender_name, '') || ' ' ||
+                coalesce(to_addrs, '') || ' ' ||
+                coalesce(cc_addrs, '') || ' ' ||
+                coalesce(attachments_summary, '') || ' ' ||
+                coalesce(body, ''),
+            800000)
+        )
+    ) STORED,
+    extracted_at      TIMESTAMPTZ DEFAULT now(),
+    extractor_version TEXT,
+    ok                BOOLEAN,
+    error             TEXT,
+    UNIQUE (mailbox, message_id)
+);
+
+CREATE INDEX IF NOT EXISTS emails_tsv_gin            ON emails USING gin(tsv);
+CREATE INDEX IF NOT EXISTS emails_subject_trgm       ON emails USING gin(subject gin_trgm_ops);
+CREATE INDEX IF NOT EXISTS emails_sender_email_idx   ON emails(sender_email);
+CREATE INDEX IF NOT EXISTS emails_mailbox_idx        ON emails(mailbox);
+CREATE INDEX IF NOT EXISTS emails_received_idx       ON emails(received_at DESC);
+CREATE INDEX IF NOT EXISTS emails_conv_idx           ON emails(conversation_id);
+"""
+
+
+# --- HELPERY ----------------------------------------------------------------
+
+_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")  # C0 control chars except \t, \n and \r
+_WS_RX = re.compile(r"[ \t]+")  # runs of spaces and tabs
+_NL_RX = re.compile(r"\n{3,}")  # three or more consecutive newlines
+
+
+def _clean_for_pg(s: str) -> str:
+    if not s:
+        return ""
+    return _CTRL_RX.sub("", s)
+
+
+def _truncate(s: str) -> str:
+    s = _clean_for_pg(s or "")
+    if not s:
+        return ""
+    b = s.encode("utf-8", errors="replace")
+    if len(b) <= MAX_TEXT_BYTES:
+        return s
+    return b[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
+
+
+def html_to_text(html: str) -> str:
+    """Extrahuje plain text z HTML emailu. Odstrani <script>, <style>, normalizuje whitespace."""
+    if not html:
+        return ""
+    try:
+        soup = BeautifulSoup(html, "lxml")
+    except Exception:
+        soup = BeautifulSoup(html, "html.parser")
+    for tag in soup(["script", "style", "head"]):
+        tag.decompose()
+    text = soup.get_text(separator="\n")
+    # normalizace whitespace
+    lines = [_WS_RX.sub(" ", ln).strip() for ln in text.split("\n")]
+    text = "\n".join(ln for ln in lines if ln)
+    text = _NL_RX.sub("\n\n", text)
+    return text
+
+
+def fmt_recipients(recipients: list, kind: str) -> str:
+    """Sloupec to_addrs/cc_addrs/bcc_addrs - 'Jmeno <email>; Jmeno2 <email2>'."""
+    if not recipients:
+        return ""
+    out = []
+    for r in recipients:
+        if not isinstance(r, dict):
+            continue
+        if r.get("type") != kind:
+            continue
+        name = (r.get("name") or "").strip()
+        email = (r.get("email") or "").strip()
+        if name and email:
+            out.append(f"{name} <{email}>")
+        elif email:
+            out.append(email)
+        elif name:
+            out.append(name)
+    return "; ".join(out)
+
+
+def fmt_attachments(attachments: list) -> str:
+    if not attachments:
+        return ""
+    out = []
+    for a in attachments[:20]:
+        if not isinstance(a, dict):
+            continue
+        name = a.get("name") or a.get("filename") or ""
+        if name:
+            out.append(name)
+    return " | ".join(out)
+
+
+def _short(s, n=60):
+    if not s:
+        return ""
+    s = str(s).replace("\n", " ").strip()
+    return s if len(s) <= n else s[:n] + "..."
+
+
+def _now() -> datetime:
+    return datetime.now(tz=timezone.utc)
+
+
+# --- HLAVNI SMYCKA ----------------------------------------------------------
+
+def process_mailbox(pg: psycopg.Connection, mongo_coll, mailbox: str,
+                    limit: Optional[int] = None) -> dict:
+    # existujici zaznamy v PG (rychly inkrementalni lookup)
+    with pg.cursor() as cur:
+        cur.execute(
+            "SELECT message_id, extractor_version, modified_at, ok "
+            "FROM emails WHERE mailbox = %s",
+            (mailbox,),
+        )
+        existing = {row[0]: (row[1], row[2], row[3]) for row in cur.fetchall()}
+
+    proj = {
+        "_id": 1, "graph_id": 1, "conversation_id": 1, "folder_path": 1,
+        "subject": 1, "sender": 1, "recipients": 1,
+        "sent_at": 1, "received_at": 1, "modified_at": 1,
+        "is_read": 1, "is_draft": 1,
+        "has_attachments": 1, "attachment_count": 1, "attachments": 1,
+        "body_html": 1, "body_text": 1, "body_preview": 1,
+        # S/MIME unwrapped fields (z unwrap_smime_v1.0)
+        "smime_unwrapped": 1, "smime_body_text": 1, "smime_body_html": 1,
+        "smime_subject": 1, "smime_inner_attachments": 1,
+    }
+    cursor = mongo_coll.find({}, proj, no_cursor_timeout=True)
+    if limit:
+        cursor = cursor.limit(limit)
+
+    total_pending = limit or mongo_coll.estimated_document_count()
+    print(f"[{mailbox}] kandidatu: ~{total_pending}")
+
+    processed = ok = errors = skipped = empty_body = 0
+    queue: list[dict] = []
+    n = 0
+
+    try:
+        for doc in cursor:
+            n += 1
+            msg_id = doc.get("_id") or ""
+            prev = existing.get(msg_id)
+            mongo_mtime = doc.get("modified_at")
+            if (prev and prev[0] == EXTRACTOR_VERSION and prev[2]
+                    and (mongo_mtime is None
+                         or (prev[1] and prev[1] >= mongo_mtime))):
+                skipped += 1
+                continue
+
+            sender = doc.get("sender") or {}
+            recipients = doc.get("recipients") or []
+            attachments = doc.get("attachments") or []
+            # u S/MIME prilepime nazvy SKUTECNYCH vnitrnich priloh (PDF faktura, ...)
+            # za vnejsi smime.p7m, aby je find_attachment nasel
+            inner = doc.get("smime_inner_attachments") or []
+            if inner:
+                attachments = list(attachments) + [
+                    {"filename": (a.get("filename") or "") + " [smime]"}
+                    for a in inner if a.get("filename")
+                ]
+
+            row = {
+                "mailbox": mailbox,
+                "message_id": msg_id,
+                "graph_id": doc.get("graph_id"),
+                "conversation_id": doc.get("conversation_id"),
+                "folder_path": doc.get("folder_path"),
+                "subject": doc.get("subject") or "",
+                "sender_email": sender.get("email"),
+                "sender_name": sender.get("name"),
+                "to_addrs": fmt_recipients(recipients, "to"),
+                "cc_addrs": fmt_recipients(recipients, "cc"),
+                "bcc_addrs": fmt_recipients(recipients, "bcc"),
+                "sent_at": doc.get("sent_at"),
+                "received_at": doc.get("received_at"),
+                "modified_at": mongo_mtime,
+                "is_read": doc.get("is_read"),
+                "is_draft": doc.get("is_draft"),
+                "has_attachments": doc.get("has_attachments"),
+                "attachment_count": doc.get("attachment_count"),
+                "attachments_summary": fmt_attachments(attachments),
+                "body": None,
+                "body_length": 0,
+                "body_source": "empty",
+                "extracted_at": _now(),
+                "extractor_version": EXTRACTOR_VERSION,
+                "ok": False,
+                "error": None,
+            }
+
+            status = "OK "; detail = ""
+            try:
+                # fallback poradi (v1.2):
+                #   smime_body_text/html (rozbaleny S/MIME) -> body_html -> body_text -> body_preview
+                text = ""
+                if doc.get("smime_unwrapped"):
+                    s_text = doc.get("smime_body_text") or ""
+                    s_html = doc.get("smime_body_html") or ""
+                    s_html_text = html_to_text(s_html) if s_html else ""
+                    # preferuj plain text, fallback html
+                    combined = "\n\n".join(p for p in (s_text, s_html_text) if p)
+                    s_subject = doc.get("smime_subject") or ""
+                    if s_subject:
+                        combined = f"Subject: {s_subject}\n\n{combined}"
+                    if combined:
+                        text = combined
+                        row["body_source"] = "smime"
+                if not text:
+                    html = doc.get("body_html") or ""
+                    h_text = html_to_text(html) if html else ""
+                    if h_text:
+                        text = h_text
+                        row["body_source"] = "html"
+                if not text:
+                    plain = doc.get("body_text") or ""
+                    if plain:
+                        text = plain
+                        row["body_source"] = "text"
+                if not text:
+                    preview = doc.get("body_preview") or ""
+                    if preview:
+                        text = preview
+                        row["body_source"] = "preview"
+                if not text:
+                    row["body_source"] = "empty"
+                    empty_body += 1
+                body = _truncate(text)
+                row["body"] = body if body else None
+                row["body_length"] = len(body)
+                row["ok"] = True
+                ok += 1
+                detail = f"{len(body)} znaku  {_short(body, 60)!r}"
+            except Exception as e:
+                row["error"] = f"{type(e).__name__}: {e}"[:500]
+                status = "ERR"; detail = row["error"][:80]; errors += 1
+
+            queue.append(row)
+            processed += 1
+
+            if n % 200 == 0 or n == 1:
+                subj = _short(row["subject"], 50)
+                print(f"  [{n:>5}] {status} {row['body_source']:<7} "
+                      f"{row['body_length']:>7}ch  | {subj}", flush=True)
+
+            if len(queue) >= BATCH_SIZE:
+                _flush(pg, queue); queue.clear()
+    finally:
+        cursor.close()
+
+    if queue:
+        _flush(pg, queue)
+
+    return {"mailbox": mailbox, "processed": processed, "ok": ok,
+            "errors": errors, "skipped": skipped, "empty_body": empty_body}
+
+
+UPSERT_SQL = """
+INSERT INTO emails
+    (mailbox, message_id, graph_id, conversation_id, folder_path,
+     subject, sender_email, sender_name, to_addrs, cc_addrs, bcc_addrs,
+     sent_at, received_at, modified_at, is_read, is_draft,
+     has_attachments, attachment_count, attachments_summary,
+     body, body_length, body_source,
+     extracted_at, extractor_version, ok, error)
+VALUES
+    (%(mailbox)s, %(message_id)s, %(graph_id)s, %(conversation_id)s, %(folder_path)s,
+     %(subject)s, %(sender_email)s, %(sender_name)s, %(to_addrs)s, %(cc_addrs)s, %(bcc_addrs)s,
+     %(sent_at)s, %(received_at)s, %(modified_at)s, %(is_read)s, %(is_draft)s,
+     %(has_attachments)s, %(attachment_count)s, %(attachments_summary)s,
+     %(body)s, %(body_length)s, %(body_source)s,
+     %(extracted_at)s, %(extractor_version)s, %(ok)s, %(error)s)
+ON CONFLICT (mailbox, message_id) DO UPDATE SET
+    graph_id            = EXCLUDED.graph_id,
+    conversation_id     = EXCLUDED.conversation_id,
+    folder_path         = EXCLUDED.folder_path,
+    subject             = EXCLUDED.subject,
+    sender_email        = EXCLUDED.sender_email,
+    sender_name         = EXCLUDED.sender_name,
+    to_addrs            = EXCLUDED.to_addrs,
+    cc_addrs            = EXCLUDED.cc_addrs,
+    bcc_addrs           = EXCLUDED.bcc_addrs,
+    sent_at             = EXCLUDED.sent_at,
+    received_at         = EXCLUDED.received_at,
+    modified_at         = EXCLUDED.modified_at,
+    is_read             = EXCLUDED.is_read,
+    is_draft            = EXCLUDED.is_draft,
+    has_attachments     = EXCLUDED.has_attachments,
+    attachment_count    = EXCLUDED.attachment_count,
+    attachments_summary = EXCLUDED.attachments_summary,
+    body                = EXCLUDED.body,
+    body_length         = EXCLUDED.body_length,
+    body_source         = EXCLUDED.body_source,
+    extracted_at        = EXCLUDED.extracted_at,
+    extractor_version   = EXCLUDED.extractor_version,
+    ok                  = EXCLUDED.ok,
+    error               = EXCLUDED.error
+"""  # upsert keyed on (mailbox, message_id); on conflict every other listed column is overwritten
+
+
+def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
+    """Clean the text columns of *rows* in place, upsert the whole batch and commit."""
+    text_cols = ("subject", "sender_email", "sender_name", "to_addrs", "cc_addrs",
+                 "bcc_addrs", "attachments_summary", "body", "error", "folder_path")
+    for record in rows:
+        record.update({col: _clean_for_pg(record[col]) for col in text_cols if record.get(col)})
+    with pg.cursor() as db_cur:
+        db_cur.executemany(UPSERT_SQL, rows)
+    pg.commit()
+
+
+def main() -> int:
+    """CLI entry point: enrich one mailbox (--mailbox) or every one found; returns 0."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)")
+    parser.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
+    opts = parser.parse_args()
+    started = time.time()
+
+    print("Pripojuji se k PostgreSQL...")
+    # The MongoEmaily database must already exist (create it externally via psql
+    # or DBeaver), because CREATE DATABASE must not run inside a transaction.
+    pg = psycopg.connect(PG_DSN, connect_timeout=10)
+    with pg.cursor() as schema_cur:
+        schema_cur.execute(SCHEMA_SQL)
+    pg.commit()
+    print("Schema OK.")
+
+    print("Pripojuji se k MongoDB...")
+    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    mongo.admin.command("ping")
+    db = mongo[MONGO_DB]
+
+    # an explicit --mailbox wins; otherwise every collection outside SKIP_COLLECTIONS
+    mailboxes = [opts.mailbox] if opts.mailbox else [
+        name for name in db.list_collection_names() if name not in SKIP_COLLECTIONS
+    ]
+    print(f"Schranky ({len(mailboxes)}): {mailboxes}")
+
+    results = [process_mailbox(pg, db[box], box, limit=opts.limit) for box in mailboxes]
+
+    pg.close()
+
+    print("\n=== SHRNUTI ===")
+    for stats in results:
+        print(f"  {stats['mailbox']}: processed={stats['processed']}  ok={stats['ok']}  "
+              f"errors={stats['errors']}  skipped={stats['skipped']}  empty={stats['empty_body']}")
+    elapsed = time.time() - started
+    print(f"\nCelkem trvalo: {elapsed:.1f} s")
+    return 0
+
+
+if __name__ == "__main__":
+    # An uncaught exception needs no handler here: the interpreter already prints
+    # its traceback and exits with status 1.
+    try:
+        raise SystemExit(main())
+    except KeyboardInterrupt:
+        print("\nPreruseno uzivatelem")
+        sys.exit(130)  # 128 + SIGINT: an aborted run must not report success
diff --git a/Python-runner/5_enrich_fulltext_emails_v1.3.md b/Python-runner/5_enrich_fulltext_emails_v1.3.md
new file mode 100644
index 0000000..c7410d5
--- /dev/null
+++ b/Python-runner/5_enrich_fulltext_emails_v1.3.md
@@ -0,0 +1,79 @@
+# 5_enrich_fulltext_emails_v1.3.py
+
+**Krok 5 pipeline** — vytáhne plain-text z emailů v Mongu (`emaily.<schránka>`) a uloží do PostgreSQL (`MongoEmaily.emails`) s tsvector GIN indexem nad konfigurací `soubory` (simple + unaccent).
+
+## Co dělá
+
+1. Vybere první dostupné tělo v tomto pořadí:
+   - `smime_body_text/html` (rozbaleno krokem 4)
+   - `body_html` → strip HTML přes BeautifulSoup
+   - `body_text` (legacy plain)
+   - `body_preview` (jako fallback)
+2. Naplní řádek v PG `emails` (mailbox, subject, sender, recipients, body, attachments_summary, ...) + tsvector se vygeneruje sám.
+3. Upsert (`ON CONFLICT (mailbox, message_id) DO UPDATE`).
+
+## Inkrementální logika
+
+Pokud `(mailbox, message_id)` už je v PG a:
+- `extractor_version == EXTRACTOR_VERSION` (aktuálně `1.2`)
+- `ok = true`
+- nejde o email s `smime_unwrapped` v Mongu, který má v PG `body_source != 'smime'` (pozn.: `modified_at` se ve v1.3 už neporovnává)
+
+→ **skip**. Nemusíš se bát opakovaného spuštění — vladimirovo přepsání 73k záznamů co teď probíhá je proto, že `EXTRACTOR_VERSION` byl povýšen z 1.1 → 1.2, takže všechny řádky v PG jsou „zastaralé". Po doběhnutí bude další běh skipovat všechno až na nově přibyvší.
+
+## Změny v1.3 vs v1.2
+
+- **Bugfix** `NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}` — předtím `sync_state` (přibyla s delta syncem) projížděla jako mailbox.
+- **`--index-reset`** — před zpracováním schránky `DELETE FROM emails WHERE mailbox=%s`. Force re-extract bez nutnosti povyšovat verzi.
+- **Vylepšený per-mailbox header** — ukáže `v Mongu N, v PG M (uptodate K), k zpracovani ~(N − K)`.
+- Když `to_process_estimate <= 0` (a není zadán `--index-reset` ani `--limit`) → schránku přeskočí úplně (bez iterace cursorem).
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | ne | e-mail | (všechny) | Bez argumentu projede všechny kolekce mimo `NON_MAILBOX_COLLECTIONS` |
+| `--limit N` | ne | int | (bez limitu) | Per schránka, jen prvních N emailů (test) |
+| `--index-reset` | ne | flag | false | Před zpracováním **smaže** všechny emaily dané schránky v PG. **Bez `--mailbox` smaže CELÝ index!** |
+
+## Varianty volání
+
+```bash
+# Všechny schránky, inkrementální:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py
+
+# Jedna schránka:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py --mailbox ordinace@buzalkova.cz
+
+# Test 500 emailů:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py --mailbox ordinace@buzalkova.cz --limit 500
+
+# Force reindex jedné schránky:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py --mailbox ordinace@buzalkova.cz --index-reset
+
+# DANGEROUS: smaže celý index a postaví znovu (POMALÉ — typicky 30+ minut):
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py --index-reset
+
+# Na pozadí, log do souboru:
+docker exec -d python-runner bash -c "python /scripts/5_enrich_fulltext_emails_v1.3.py > /scripts/enrich_fulltext.log 2>&1"
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/enrich_fulltext.log
+```
+
+V průběhu skript vypisuje každých 200 zpracovaných emailů:
+```
+[ 38800|p=  5800] OK  html       2831ch  | CLEAR/RA payment information for invoice #22FV049
+```
+- první číslo = pozice v cursoru (počet všech emailů co prošlo)
+- `p=N` = počet skutečně zprocesovaných (zbytek byl skipnut jako už-aktuální)
+- `OK / ERR`, `body_source`, délka, subject
+
+## Závislosti
+
+```bash
+docker exec python-runner pip install psycopg[binary] beautifulsoup4 lxml pymongo
+```
diff --git a/Python-runner/5_enrich_fulltext_emails_v1.3.py b/Python-runner/5_enrich_fulltext_emails_v1.3.py
new file mode 100644
index 0000000..9487e7a
--- /dev/null
+++ b/Python-runner/5_enrich_fulltext_emails_v1.3.py
@@ -0,0 +1,567 @@
+"""
+==============================================================================
+Skript:   5_enrich_fulltext_emails_v1.3.py
+Verze:    1.3
+Datum:    2026-06-04
+Autor:    vladimir.buzalka
+
+Popis:
+  Vytahne plny text z emailu ulozenych v MongoDB (db: emaily) a ulozi ho do
+  PostgreSQL (db: MongoEmaily, tabulka: emails) s GIN tsvector indexem.
+
+  Emaily se NESTAHUJI znovu - tela uz jsou v Mongo z parse_emails_graph_v1.4
+  (a refetch_text_bodies_v1.0 pro stare plain-text emaily).
+  Tento skript jen vybere prvni dostupne telo a posle text do PG na fulltext.
+
+Zmeny v1.3 vs v1.2:
+  - Bugfix: NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
+    (sync_state pribyla v delta syncu, predtim ji v1.2 brala jako mailbox).
+  - --index-reset: pred zpracovanim schranky vymaze vsechny jeji emaily z PG
+    (force re-extract; pouzij kdyz povysis EXTRACTOR_VERSION nebo chces ciste).
+  - Vylepseny header per-mailbox: ukaze pocet v Mongu, v PG a k zpracovani.
+
+Zmeny v1.2 vs v1.1:
+  - S/MIME emaily: pokud unwrap_smime_v1.0 ulozil smime_body_text/smime_body_html,
+    pouzije se PREFEROVANE pred bezvyznamnym wrapper telem.
+  - body_source: nova hodnota "smime".
+  - EXTRACTOR_VERSION=1.2 -> vsechny existujici emaily v PG se preparsuji.
+
+Zmeny v1.1 vs v1.0:
+  - Fallback poradi rozsireno o body_text.
+  - body_source umi novou hodnotu "text" (plne plain-text telo, max 2 MB).
+
+Zdroj:
+  MongoDB    192.168.1.76  db=emaily  kolekce=<mailbox>
+             (krome NON_MAILBOX_COLLECTIONS)
+
+Cil:
+  PostgreSQL 192.168.1.76  db=MongoEmaily  tabulka=emails
+             tsvector config 'soubory' (sdileny - simple + unaccent)
+
+Incrementality:
+  A message is skipped when (mailbox, message_id) already exists in PG with the
+  current extractor_version and ok=true, unless Mongo has smime_unwrapped set while
+  PG body_source != 'smime'. modified_at is stored, not compared. --index-reset wipes PG first.
+
+Spusteni:
+  python 5_enrich_fulltext_emails_v1.3.py                           # all mailboxes
+  python 5_enrich_fulltext_emails_v1.3.py --mailbox ordinace@buzalkova.cz
+  python 5_enrich_fulltext_emails_v1.3.py --limit 500               # test run
+  python 5_enrich_fulltext_emails_v1.3.py --mailbox X --index-reset # wipes the mailbox's PG rows and re-extracts everything
+  python 5_enrich_fulltext_emails_v1.3.py --index-reset             # wipes the WHOLE index and rebuilds it (SLOW!)
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import sys
+import time
+import traceback
+from datetime import datetime, timezone
+from typing import Optional
+
+import psycopg
+from bs4 import BeautifulSoup
+from pymongo import MongoClient
+
+# --- configuration (NOTE(review): PG_DSN embeds a plaintext password; load it from the environment instead) ---
+MONGO_URI = "mongodb://192.168.1.76:27017"
+MONGO_DB = "emaily"
+
+PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoEmaily "
+          "user=vladimir.buzalka password=Vlado7309208104++")
+
+EXTRACTOR_VERSION = "1.2"   # DO NOT change unless you change the fallback logic!
+
+MAX_TEXT_BYTES = 5 * 1024 * 1024   # plain text max 5 MB
+
+# Collections in `emaily` that are NOT mailboxes (never processed)
+NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
+
+BATCH_SIZE = 100   # rows queued per upsert batch before it is flushed to PG
+
+
+# --- SCHEMA -----------------------------------------------------------------
+
+SCHEMA_SQL = """
+CREATE EXTENSION IF NOT EXISTS unaccent;
+CREATE EXTENSION IF NOT EXISTS pg_trgm;
+
+DO $$
+BEGIN
+    IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
+        CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
+        ALTER TEXT SEARCH CONFIGURATION soubory
+            ALTER MAPPING FOR hword, hword_part, word
+            WITH unaccent, simple;
+    END IF;
+END$$;
+
+CREATE TABLE IF NOT EXISTS emails (
+    id              BIGSERIAL PRIMARY KEY,
+    mailbox         TEXT NOT NULL,
+    message_id      TEXT NOT NULL,
+    graph_id        TEXT,
+    conversation_id TEXT,
+    folder_path     TEXT,
+    subject         TEXT,
+    sender_email    TEXT,
+    sender_name     TEXT,
+    to_addrs        TEXT,
+    cc_addrs        TEXT,
+    bcc_addrs       TEXT,
+    sent_at         TIMESTAMPTZ,
+    received_at     TIMESTAMPTZ,
+    modified_at     TIMESTAMPTZ,
+    is_read         BOOLEAN,
+    is_draft        BOOLEAN,
+    has_attachments BOOLEAN,
+    attachment_count INT,
+    attachments_summary TEXT,
+    body            TEXT,
+    body_length     INT,
+    body_source     TEXT,         -- 'smime' | 'html' | 'text' | 'preview' | 'empty'
+    tsv             tsvector GENERATED ALWAYS AS (
+        to_tsvector('soubory'::regconfig,
+            left(
+                coalesce(subject, '') || ' ' ||
+                coalesce(sender_email, '') || ' ' ||
+                coalesce(sender_name, '') || ' ' ||
+                coalesce(to_addrs, '') || ' ' ||
+                coalesce(cc_addrs, '') || ' ' ||
+                coalesce(attachments_summary, '') || ' ' ||
+                coalesce(body, ''),
+            800000)
+        )
+    ) STORED,
+    extracted_at      TIMESTAMPTZ DEFAULT now(),
+    extractor_version TEXT,
+    ok                BOOLEAN,
+    error             TEXT,
+    UNIQUE (mailbox, message_id)
+);
+
+CREATE INDEX IF NOT EXISTS emails_tsv_gin            ON emails USING gin(tsv);
+CREATE INDEX IF NOT EXISTS emails_subject_trgm       ON emails USING gin(subject gin_trgm_ops);
+CREATE INDEX IF NOT EXISTS emails_sender_email_idx   ON emails(sender_email);
+CREATE INDEX IF NOT EXISTS emails_mailbox_idx        ON emails(mailbox);
+CREATE INDEX IF NOT EXISTS emails_received_idx       ON emails(received_at DESC);
+CREATE INDEX IF NOT EXISTS emails_conv_idx           ON emails(conversation_id);
+"""  # every statement is IF NOT EXISTS / guarded, so the DDL can be re-run on each start
+
+
+# --- HELPERY ----------------------------------------------------------------
+
+_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")  # C0 control chars except \t, \n and \r
+_WS_RX = re.compile(r"[ \t]+")  # runs of spaces and tabs
+_NL_RX = re.compile(r"\n{3,}")  # three or more consecutive newlines
+
+
+def _clean_for_pg(s: str) -> str:
+    """Return *s* with every character matched by _CTRL_RX removed."""
+    # falsy input (None, "") short-circuits to "" without touching the regex
+    return _CTRL_RX.sub("", s) if s else ""
+
+
+def _truncate(s: str) -> str:
+    """Clean *s* and cap it at MAX_TEXT_BYTES of UTF-8 without splitting a character."""
+    cleaned = _clean_for_pg(s or "")
+    encoded = cleaned.encode("utf-8", errors="replace")
+    if len(encoded) > MAX_TEXT_BYTES:
+        # errors="ignore" drops a multi-byte sequence that the byte slice cut in half
+        return encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
+    return cleaned
+
+
+def html_to_text(html: str) -> str:
+    """Convert *html* to plain text: non-empty lines with space/tab runs collapsed."""
+    if not html:
+        return ""
+    try:
+        tree = BeautifulSoup(html, "lxml")
+    except Exception:
+        # any failure with the lxml parser falls back to the built-in html.parser
+        tree = BeautifulSoup(html, "html.parser")
+    for node in tree(["script", "style", "head"]):
+        node.decompose()
+    raw_lines = tree.get_text(separator="\n").split("\n")
+    kept = [ln for ln in (_WS_RX.sub(" ", raw).strip() for raw in raw_lines) if ln]
+    return _NL_RX.sub("\n\n", "\n".join(kept))
+
+
+def fmt_recipients(recipients: list, kind: str) -> str:
+    """Join the recipients whose "type" equals *kind* into one "; "-separated string.
+
+    An entry renders as "Name <email>", or as the bare email or name when only one
+    of them is present; non-dict items and entries with neither are ignored.
+    """
+    parts = []
+    for entry in recipients or []:
+        if not isinstance(entry, dict) or entry.get("type") != kind:
+            continue
+        display = (entry.get("name") or "").strip()
+        address = (entry.get("email") or "").strip()
+        if display and address:
+            parts.append(f"{display} <{address}>")
+        elif display or address:
+            # exactly one of the two is non-empty here
+            parts.append(address or display)
+    return "; ".join(parts)
+
+
+def fmt_attachments(attachments: list) -> str:
+    """Join the names of at most the first 20 attachments with " | ".
+
+    A name is taken from "name", else "filename"; non-dict and unnamed items are skipped.
+    """
+    if not attachments:
+        return ""
+    # the 20-item cap applies before non-dict / unnamed entries are filtered out
+    labels = (a.get("name") or a.get("filename") or ""
+              for a in attachments[:20] if isinstance(a, dict))
+    return " | ".join(label for label in labels if label)
+
+
+def _short(s, n=60):
+    """One-line preview of *s*: newlines become spaces, text over *n* chars is cut and gets "..."."""
+    flat = str(s).replace("\n", " ").strip() if s else ""
+    cut = bool(s) and len(flat) > n
+    return flat[:n] + "..." if cut else flat
+
+
+def _now() -> datetime:  # current moment as a tz-aware UTC datetime
+    return datetime.now(timezone.utc)
+
+
+def _aware_utc(dt: Optional[datetime]) -> Optional[datetime]:
+    """Return *dt* as a tz-aware UTC datetime; None passes through unchanged.
+
+    A naive value is tagged as UTC without shifting; an aware one is converted to UTC.
+    """
+    if dt is None:
+        return None
+    return dt.astimezone(timezone.utc) if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
+
+
+# --- HLAVNI SMYCKA ----------------------------------------------------------
+
+def process_mailbox(pg: psycopg.Connection, mongo_coll, mailbox: str,
+                    limit: Optional[int] = None,
+                    index_reset: bool = False) -> dict:
+    """Upsert the emails of one Mongo mailbox collection into the PG ``emails`` table.
+
+    A message is skipped when PG already holds it with the current
+    EXTRACTOR_VERSION and ok=true, unless Mongo carries an unwrapped S/MIME body
+    that PG has not indexed yet (body_source != 'smime').
+
+    ``limit`` caps how many cursor documents are examined; ``index_reset`` deletes the
+    mailbox's PG rows first. Returns counters: processed, ok, errors, skipped, empty_body.
+    """
+    # --index-reset: delete everything PG holds for this mailbox
+    if index_reset:
+        with pg.cursor() as cur:
+            cur.execute("DELETE FROM emails WHERE mailbox = %s", (mailbox,))
+            deleted = cur.rowcount
+        pg.commit()
+        print(f"[{mailbox}] --index-reset: smazano {deleted} radku v PG")
+
+    # existing PG rows for a fast incremental lookup:
+    # message_id -> (extractor_version, ok, body_source)
+    with pg.cursor() as cur:
+        cur.execute(
+            "SELECT message_id, extractor_version, ok, body_source "
+            "FROM emails WHERE mailbox = %s",
+            (mailbox,),
+        )
+        existing = {row[0]: (row[1], row[2], row[3]) for row in cur.fetchall()}
+
+    mongo_total = mongo_coll.estimated_document_count()
+    pg_total    = len(existing)
+    pg_uptodate = sum(1 for v in existing.values()
+                      if v[0] == EXTRACTOR_VERSION and v[1])
+    to_process_estimate = mongo_total - pg_uptodate
+    print(f"\n========== {mailbox} ==========")
+    print(f"  v Mongu:      {mongo_total}")
+    print(f"  v PG:         {pg_total} (z toho ext_v={EXTRACTOR_VERSION} & ok=true: {pg_uptodate})")
+    print(f"  k zpracovani: ~{to_process_estimate}{' (limit=' + str(limit) + ')' if limit else ''}")
+
+    if to_process_estimate <= 0 and not index_reset and not limit:
+        # The count-based estimate cannot see S/MIME bodies unwrapped after the
+        # last enrich run, so skip the mailbox only when none awaits re-indexing.
+        unwrapped = mongo_coll.find({"smime_unwrapped": {"$nin": [None, False]}}, {"_id": 1})
+        indexed = (existing.get(d.get("_id") or "") for d in unwrapped)
+        if all(p and p[0] == EXTRACTOR_VERSION and p[1] and p[2] == "smime" for p in indexed):
+            print("  Nic noveho ke zpracovani.")
+            return {"mailbox": mailbox, "processed": 0, "ok": 0, "errors": 0,
+                    "skipped": pg_uptodate, "empty_body": 0}
+
+    proj = {
+        "_id": 1, "graph_id": 1, "conversation_id": 1, "folder_path": 1,
+        "subject": 1, "sender": 1, "recipients": 1,
+        "sent_at": 1, "received_at": 1, "modified_at": 1,
+        "is_read": 1, "is_draft": 1,
+        "has_attachments": 1, "attachment_count": 1, "attachments": 1,
+        "body_html": 1, "body_text": 1, "body_preview": 1,
+        "smime_unwrapped": 1, "smime_body_text": 1, "smime_body_html": 1,
+        "smime_subject": 1, "smime_inner_attachments": 1,
+    }
+    cursor = mongo_coll.find({}, proj, no_cursor_timeout=True)
+    if limit:
+        cursor = cursor.limit(limit)
+
+    n = processed = ok = errors = skipped = empty_body = 0
+    queue: list[dict] = []
+
+    try:
+        for doc in cursor:
+            n += 1
+            msg_id = doc.get("_id") or ""
+            prev = existing.get(msg_id)  # (extractor_version, ok, body_source)
+            mongo_mtime = doc.get("modified_at")
+
+            # Skip when PG holds the same extractor version with ok=true, except when
+            # unwrap_smime added the unwrapped text after the last enrich -> re-enrich.
+            up_to_date = prev and prev[0] == EXTRACTOR_VERSION and prev[1]
+            if up_to_date and not (doc.get("smime_unwrapped") and prev[2] != "smime"):
+                skipped += 1
+                continue
+
+            sender = doc.get("sender") or {}
+            recipients = doc.get("recipients") or []
+            attachments = doc.get("attachments") or []
+            inner = doc.get("smime_inner_attachments") or []
+            if inner:
+                attachments = list(attachments) + [
+                    {"filename": (a.get("filename") or "") + " [smime]"}
+                    for a in inner if a.get("filename")
+                ]
+
+            row = {
+                "mailbox": mailbox,
+                "message_id": msg_id,
+                "graph_id": doc.get("graph_id"),
+                "conversation_id": doc.get("conversation_id"),
+                "folder_path": doc.get("folder_path"),
+                "subject": doc.get("subject") or "",
+                "sender_email": sender.get("email"),
+                "sender_name": sender.get("name"),
+                "to_addrs": fmt_recipients(recipients, "to"),
+                "cc_addrs": fmt_recipients(recipients, "cc"),
+                "bcc_addrs": fmt_recipients(recipients, "bcc"),
+                # Mongo timestamps are naive but meant as UTC; tag them tz-aware so that
+                # PG TIMESTAMPTZ stores the right instant regardless of session time zone.
+                "sent_at":     _aware_utc(doc.get("sent_at")),
+                "received_at": _aware_utc(doc.get("received_at")),
+                "modified_at": _aware_utc(mongo_mtime),
+                "is_read": doc.get("is_read"),
+                "is_draft": doc.get("is_draft"),
+                "has_attachments": doc.get("has_attachments"),
+                "attachment_count": doc.get("attachment_count"),
+                "attachments_summary": fmt_attachments(attachments),
+                "body": None,
+                "body_length": 0,
+                "body_source": "empty",
+                "extracted_at": _now(),
+                "extractor_version": EXTRACTOR_VERSION,
+                "ok": False,
+                "error": None,
+            }
+
+            status = "OK "
+            try:
+                # fallback order: unwrapped S/MIME -> body_html -> body_text -> body_preview
+                text = ""
+                if doc.get("smime_unwrapped"):
+                    s_text = doc.get("smime_body_text") or ""
+                    s_html = doc.get("smime_body_html") or ""
+                    s_html_text = html_to_text(s_html) if s_html else ""
+                    combined = "\n\n".join(p for p in (s_text, s_html_text) if p)
+                    s_subject = doc.get("smime_subject") or ""
+                    if s_subject:
+                        combined = f"Subject: {s_subject}\n\n{combined}"
+                    if combined:
+                        text = combined
+                        row["body_source"] = "smime"
+                for source, field in (("html", "body_html"), ("text", "body_text"),
+                                      ("preview", "body_preview")):
+                    if text:
+                        break
+                    raw = doc.get(field) or ""
+                    candidate = html_to_text(raw) if source == "html" else raw
+                    if candidate:
+                        text = candidate
+                        row["body_source"] = source
+                if not text:
+                    empty_body += 1
+                body = _truncate(text)
+                row["body"] = body if body else None
+                row["body_length"] = len(body)
+                row["ok"] = True
+                ok += 1
+            except Exception as e:
+                row["error"] = f"{type(e).__name__}: {e}"[:500]
+                status = "ERR"
+                errors += 1
+
+            queue.append(row)
+            processed += 1
+
+            if processed % 200 == 0 or processed == 1:
+                subj = _short(row["subject"], 50)
+                print(f"  [{n:>6}|p={processed:>5}] {status} {row['body_source']:<7} "
+                      f"{row['body_length']:>7}ch  | {subj}", flush=True)
+
+            if len(queue) >= BATCH_SIZE:
+                _flush(pg, queue); queue.clear()
+    finally:
+        cursor.close()
+
+    if queue:
+        _flush(pg, queue)
+
+    return {"mailbox": mailbox, "processed": processed, "ok": ok,
+            "errors": errors, "skipped": skipped, "empty_body": empty_body}
+
+
+UPSERT_SQL = """
+INSERT INTO emails
+    (mailbox, message_id, graph_id, conversation_id, folder_path,
+     subject, sender_email, sender_name, to_addrs, cc_addrs, bcc_addrs,
+     sent_at, received_at, modified_at, is_read, is_draft,
+     has_attachments, attachment_count, attachments_summary,
+     body, body_length, body_source,
+     extracted_at, extractor_version, ok, error)
+VALUES
+    (%(mailbox)s, %(message_id)s, %(graph_id)s, %(conversation_id)s, %(folder_path)s,
+     %(subject)s, %(sender_email)s, %(sender_name)s, %(to_addrs)s, %(cc_addrs)s, %(bcc_addrs)s,
+     %(sent_at)s, %(received_at)s, %(modified_at)s, %(is_read)s, %(is_draft)s,
+     %(has_attachments)s, %(attachment_count)s, %(attachments_summary)s,
+     %(body)s, %(body_length)s, %(body_source)s,
+     %(extracted_at)s, %(extractor_version)s, %(ok)s, %(error)s)
+ON CONFLICT (mailbox, message_id) DO UPDATE SET
+    graph_id            = EXCLUDED.graph_id,
+    conversation_id     = EXCLUDED.conversation_id,
+    folder_path         = EXCLUDED.folder_path,
+    subject             = EXCLUDED.subject,
+    sender_email        = EXCLUDED.sender_email,
+    sender_name         = EXCLUDED.sender_name,
+    to_addrs            = EXCLUDED.to_addrs,
+    cc_addrs            = EXCLUDED.cc_addrs,
+    bcc_addrs           = EXCLUDED.bcc_addrs,
+    sent_at             = EXCLUDED.sent_at,
+    received_at         = EXCLUDED.received_at,
+    modified_at         = EXCLUDED.modified_at,
+    is_read             = EXCLUDED.is_read,
+    is_draft            = EXCLUDED.is_draft,
+    has_attachments     = EXCLUDED.has_attachments,
+    attachment_count    = EXCLUDED.attachment_count,
+    attachments_summary = EXCLUDED.attachments_summary,
+    body                = EXCLUDED.body,
+    body_length         = EXCLUDED.body_length,
+    body_source         = EXCLUDED.body_source,
+    extracted_at        = EXCLUDED.extracted_at,
+    extractor_version   = EXCLUDED.extractor_version,
+    ok                  = EXCLUDED.ok,
+    error               = EXCLUDED.error
+"""  # upsert keyed on (mailbox, message_id); on conflict every other listed column is overwritten
+
+
+def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
+    """Clean the text columns of *rows* in place, upsert the whole batch and commit."""
+    text_cols = ("subject", "sender_email", "sender_name", "to_addrs", "cc_addrs",
+                 "bcc_addrs", "attachments_summary", "body", "error", "folder_path")
+    for record in rows:
+        record.update({col: _clean_for_pg(record[col]) for col in text_cols if record.get(col)})
+    with pg.cursor() as db_cur:
+        db_cur.executemany(UPSERT_SQL, rows)
+    pg.commit()
+
+
+def discover_mailboxes(db) -> list[str]:
+    """Return the sorted names of the collections in *db* that are mailboxes.
+
+    Every collection not listed in NON_MAILBOX_COLLECTIONS counts as a mailbox.
+    """
+    names = sorted(db.list_collection_names())
+    return [name for name in names if name not in NON_MAILBOX_COLLECTIONS]
+
+
+def main() -> int:
+    """Run the enrich step for one mailbox (--mailbox) or for all discovered ones.
+
+    Returns the process exit status: 0 when no error was counted, otherwise 1.
+    """
+    ap = argparse.ArgumentParser(description="enrich_fulltext_emails v1.3")
+    ap.add_argument("--mailbox", default="",
+                    help="Jedna konkretni schranka. Bez argumentu projede vsechny.")
+    ap.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
+    ap.add_argument("--index-reset", action="store_true",
+                    help="Pred zpracovanim schranky vymaze vsechny jeji emaily z PG "
+                         "(force re-extract). Bez --mailbox SMAZE CELY index.")
+    args = ap.parse_args()
+
+    t0 = time.time()
+    print("=== enrich_fulltext_emails v1.3 ===")
+    print(f"Start: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+    print("\nPripojuji se k PostgreSQL...")
+    pg = psycopg.connect(PG_DSN, connect_timeout=10)
+    with pg.cursor() as cur:
+        cur.execute(SCHEMA_SQL)
+    pg.commit()
+    print("  Schema OK.")
+
+    print("Pripojuji se k MongoDB...")
+    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    mongo.admin.command("ping")
+    db = mongo[MONGO_DB]
+    print("  MongoDB OK.")
+
+    mailboxes = [args.mailbox] if args.mailbox else discover_mailboxes(db)
+    print(f"\nSchranky ke zpracovani ({len(mailboxes)}):")
+    for mb in mailboxes:
+        print(f"  - {mb}")
+
+    if args.index_reset and not args.mailbox:
+        print(f"\n!!! --index-reset bez --mailbox => SMAZE CELY INDEX ({len(mailboxes)} schranek) !!!")
+
+    results = []
+    for mb in mailboxes:
+        try:
+            results.append(process_mailbox(pg, db[mb], mb, limit=args.limit,
+                                           index_reset=args.index_reset))
+        except Exception as e:
+            traceback.print_exc()
+            print(f"  FATAL pri zpracovani {mb}: {e}")
+            # Roll back: a failed statement aborts the PG transaction and would break every later mailbox.
+            pg.rollback()
+            results.append({"mailbox": mb, "processed": 0, "ok": 0,
+                            "errors": 1, "skipped": 0, "empty_body": 0})
+
+    pg.close()
+
+    print("\n" + "="*60)
+    print("=== SHRNUTI ===")
+    grand = {"processed": 0, "ok": 0, "errors": 0, "skipped": 0, "empty_body": 0}
+    for r in results:
+        print(f"  {r['mailbox']:40} processed={r['processed']:>5} ok={r['ok']:>5} "
+              f"errors={r['errors']:>3} skipped={r['skipped']:>6} empty={r['empty_body']:>4}")
+        for k in grand:
+            grand[k] += r.get(k, 0)
+    print(f"  {'TOTAL':40} processed={grand['processed']:>5} ok={grand['ok']:>5} "
+          f"errors={grand['errors']:>3} skipped={grand['skipped']:>6} empty={grand['empty_body']:>4}")
+    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
+    print(f"Konec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    return 1 if grand["errors"] > 0 else 0
+
+
+if __name__ == "__main__":
+    # An uncaught exception needs no handler here: the interpreter already prints
+    # its traceback and exits with status 1.
+    try:
+        raise SystemExit(main())
+    except KeyboardInterrupt:
+        print("\nPreruseno uzivatelem")
+        sys.exit(130)  # 128 + SIGINT: an aborted run must not report success
diff --git a/Python-runner/Trash/enrich_fulltext_emails_v1.1.py b/Python-runner/Trash/enrich_fulltext_emails_v1.1.py
new file mode 100644
index 0000000..6d75ce8
--- /dev/null
+++ b/Python-runner/Trash/enrich_fulltext_emails_v1.1.py
@@ -0,0 +1,455 @@
+"""
+==============================================================================
+Skript:   enrich_fulltext_emails_v1.1.py
+Verze:    1.1
+Datum:    2026-06-03
+Autor:    vladimir.buzalka
+
+Popis:
+  Vytahne plny text z emailu ulozenych v MongoDB (db: emaily) a ulozi ho do
+  PostgreSQL (db: MongoEmaily, tabulka: emails) s GIN tsvector indexem.
+
+  Emaily se NESTAHUJI znovu - tela uz jsou v Mongo z parse_emails_graph_v1.4
+  (a refetch_text_bodies_v1.0 pro stare plain-text emaily).
+  Tento skript jen vybere prvni dostupne telo a posle text do PG na fulltext.
+
+Zmeny proti v1.0:
+  - Fallback poradi rozsireno: body_html -> body_text (novy v parse_emails_graph_v1.4)
+    -> body_preview -> empty. Drive bylo body_html -> body_preview.
+  - body_source umi novou hodnotu "text" (plne plain-text telo, max 2 MB).
+  - EXTRACTOR_VERSION=1.1 -> vsechny existujici emaily v PG se preparsuji.
+
+Zdroj:
+  MongoDB    192.168.1.76  db=emaily  kolekce=<mailbox>
+             (krome attachments_index)
+
+Cil:
+  PostgreSQL 192.168.1.76  db=MongoEmaily  tabulka=emails
+             tsvector config 'soubory' (sdileny - simple + unaccent)
+
+Inkrementalita:
+  Pokud (mailbox, message_id) jiz existuje a extractor_version je aktualni
+  a modified_at v Mongo neni novejsi -> skip. Pri zmene verze extractoru
+  se vse preparsuje.
+
+Spusteni:
+  python enrich_fulltext_emails_v1.1.py                       # vsechny schranky
+  python enrich_fulltext_emails_v1.1.py --mailbox vbuzalka@its.jnj.com
+  python enrich_fulltext_emails_v1.1.py --limit 500           # test
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import sys
+import time
+import traceback
+from datetime import datetime, timezone
+from typing import Optional
+
+import psycopg
+from bs4 import BeautifulSoup
+from pymongo import MongoClient
+
+# --- configuration ----------------------------------------------------------
+# Source MongoDB server and database; each mailbox is one collection in it.
+MONGO_URI = "mongodb://192.168.1.76:27017"
+MONGO_DB = "emaily"
+
+# Target PostgreSQL connection string.
+# NOTE(review): the user name and password are hard-coded and committed with
+# the source - consider loading the DSN from the environment or a secrets file.
+PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoEmaily "
+          "user=vladimir.buzalka password=Vlado7309208104++")
+
+# Written to emails.extractor_version; process_mailbox() re-processes every row
+# whose stored version differs from this value.
+EXTRACTOR_VERSION = "1.1"
+
+MAX_TEXT_BYTES = 5 * 1024 * 1024   # plain text max 5 MB (UTF-8 bytes, see _truncate)
+# Collections in MONGO_DB that are not mailboxes and are skipped by main().
+SKIP_COLLECTIONS = {"attachments_index"}
+
+# Number of rows queued in process_mailbox() before they are flushed to PG.
+BATCH_SIZE = 100
+
+
+# --- SCHEMA -----------------------------------------------------------------
+# DDL executed by main() on every start. Every statement is guarded
+# (IF NOT EXISTS / existence check), so re-running it is harmless.
+# It creates:
+#   - extensions unaccent and pg_trgm,
+#   - text search configuration 'soubory' (copy of 'simple' with unaccent
+#     applied to word tokens),
+#   - table emails with a generated, stored tsvector column `tsv` built from
+#     the first 800000 characters of subject + sender + to/cc + attachment
+#     names + body,
+#   - GIN / btree indexes used for searching.
+# NOTE(review): the inline SQL comment on body_source lists
+# 'html' | 'preview' | 'empty', but process_mailbox() also writes 'text'.
+
+SCHEMA_SQL = """
+CREATE EXTENSION IF NOT EXISTS unaccent;
+CREATE EXTENSION IF NOT EXISTS pg_trgm;
+
+DO $$
+BEGIN
+    IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
+        CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
+        ALTER TEXT SEARCH CONFIGURATION soubory
+            ALTER MAPPING FOR hword, hword_part, word
+            WITH unaccent, simple;
+    END IF;
+END$$;
+
+CREATE TABLE IF NOT EXISTS emails (
+    id              BIGSERIAL PRIMARY KEY,
+    mailbox         TEXT NOT NULL,
+    message_id      TEXT NOT NULL,
+    graph_id        TEXT,
+    conversation_id TEXT,
+    folder_path     TEXT,
+    subject         TEXT,
+    sender_email    TEXT,
+    sender_name     TEXT,
+    to_addrs        TEXT,
+    cc_addrs        TEXT,
+    bcc_addrs       TEXT,
+    sent_at         TIMESTAMPTZ,
+    received_at     TIMESTAMPTZ,
+    modified_at     TIMESTAMPTZ,
+    is_read         BOOLEAN,
+    is_draft        BOOLEAN,
+    has_attachments BOOLEAN,
+    attachment_count INT,
+    attachments_summary TEXT,
+    body            TEXT,
+    body_length     INT,
+    body_source     TEXT,         -- 'html' | 'preview' | 'empty'
+    tsv             tsvector GENERATED ALWAYS AS (
+        to_tsvector('soubory'::regconfig,
+            left(
+                coalesce(subject, '') || ' ' ||
+                coalesce(sender_email, '') || ' ' ||
+                coalesce(sender_name, '') || ' ' ||
+                coalesce(to_addrs, '') || ' ' ||
+                coalesce(cc_addrs, '') || ' ' ||
+                coalesce(attachments_summary, '') || ' ' ||
+                coalesce(body, ''),
+            800000)
+        )
+    ) STORED,
+    extracted_at      TIMESTAMPTZ DEFAULT now(),
+    extractor_version TEXT,
+    ok                BOOLEAN,
+    error             TEXT,
+    UNIQUE (mailbox, message_id)
+);
+
+CREATE INDEX IF NOT EXISTS emails_tsv_gin            ON emails USING gin(tsv);
+CREATE INDEX IF NOT EXISTS emails_subject_trgm       ON emails USING gin(subject gin_trgm_ops);
+CREATE INDEX IF NOT EXISTS emails_sender_email_idx   ON emails(sender_email);
+CREATE INDEX IF NOT EXISTS emails_mailbox_idx        ON emails(mailbox);
+CREATE INDEX IF NOT EXISTS emails_received_idx       ON emails(received_at DESC);
+CREATE INDEX IF NOT EXISTS emails_conv_idx           ON emails(conversation_id);
+"""
+
+
+# --- HELPERS ----------------------------------------------------------------
+
+# Control characters \x00-\x1f except tab (\x09), LF (\x0a) and CR (\x0d);
+# removed by _clean_for_pg() before text is sent to PostgreSQL.
+_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
+# Runs of spaces/tabs, collapsed to a single space in html_to_text().
+_WS_RX = re.compile(r"[ \t]+")
+# Three or more consecutive newlines, reduced to two in html_to_text().
+_NL_RX = re.compile(r"\n{3,}")
+
+
+def _clean_for_pg(s: str) -> str:
+    if not s:
+        return ""
+    return _CTRL_RX.sub("", s)
+
+
+def _truncate(s: str) -> str:
+    s = _clean_for_pg(s or "")
+    if not s:
+        return ""
+    b = s.encode("utf-8", errors="replace")
+    if len(b) <= MAX_TEXT_BYTES:
+        return s
+    return b[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
+
+
+def html_to_text(html: str) -> str:
+    """Extrahuje plain text z HTML emailu. Odstrani <script>, <style>, normalizuje whitespace."""
+    if not html:
+        return ""
+    try:
+        soup = BeautifulSoup(html, "lxml")
+    except Exception:
+        soup = BeautifulSoup(html, "html.parser")
+    for tag in soup(["script", "style", "head"]):
+        tag.decompose()
+    text = soup.get_text(separator="\n")
+    # normalizace whitespace
+    lines = [_WS_RX.sub(" ", ln).strip() for ln in text.split("\n")]
+    text = "\n".join(ln for ln in lines if ln)
+    text = _NL_RX.sub("\n\n", text)
+    return text
+
+
+def fmt_recipients(recipients: list, kind: str) -> str:
+    """Sloupec to_addrs/cc_addrs/bcc_addrs - 'Jmeno <email>; Jmeno2 <email2>'."""
+    if not recipients:
+        return ""
+    out = []
+    for r in recipients:
+        if not isinstance(r, dict):
+            continue
+        if r.get("type") != kind:
+            continue
+        name = (r.get("name") or "").strip()
+        email = (r.get("email") or "").strip()
+        if name and email:
+            out.append(f"{name} <{email}>")
+        elif email:
+            out.append(email)
+        elif name:
+            out.append(name)
+    return "; ".join(out)
+
+
+def fmt_attachments(attachments: list) -> str:
+    if not attachments:
+        return ""
+    out = []
+    for a in attachments[:20]:
+        if not isinstance(a, dict):
+            continue
+        name = a.get("name") or a.get("filename") or ""
+        if name:
+            out.append(name)
+    return " | ".join(out)
+
+
+def _short(s, n=60):
+    if not s:
+        return ""
+    s = str(s).replace("\n", " ").strip()
+    return s if len(s) <= n else s[:n] + "..."
+
+
+def _now() -> datetime:
+    return datetime.now(tz=timezone.utc)
+
+
+# --- HLAVNI SMYCKA ----------------------------------------------------------
+
+def process_mailbox(pg: psycopg.Connection, mongo_coll, mailbox: str,
+                    limit: Optional[int] = None) -> dict:
+    """Copy one mailbox from its Mongo collection into the PG ``emails`` table.
+
+    For every document the first available body is chosen in the order
+    body_html (converted to text) -> body_text -> body_preview -> empty,
+    truncated with ``_truncate`` and queued; the queue is written through
+    ``_flush`` in batches of ``BATCH_SIZE``.
+
+    A document is skipped when PG already holds a row for it with the current
+    ``EXTRACTOR_VERSION``, a truthy ``ok`` flag, and either Mongo has no
+    ``modified_at`` or the stored ``modified_at`` is not older than Mongo's.
+
+    Args:
+        pg: open PostgreSQL connection.
+        mongo_coll: Mongo collection of the mailbox.
+        mailbox: mailbox name, stored in ``emails.mailbox``.
+        limit: optional cap applied to the Mongo cursor (i.e. before skipping).
+
+    Returns:
+        Dict with keys mailbox, processed, ok, errors, skipped, empty_body.
+    """
+    # existing rows in PG (fast lookup for the incremental check)
+    with pg.cursor() as cur:
+        cur.execute(
+            "SELECT message_id, extractor_version, modified_at, ok "
+            "FROM emails WHERE mailbox = %s",
+            (mailbox,),
+        )
+        # message_id -> (extractor_version, modified_at, ok)
+        existing = {row[0]: (row[1], row[2], row[3]) for row in cur.fetchall()}
+
+    # Fetch only the fields that are actually used below.
+    proj = {
+        "_id": 1, "graph_id": 1, "conversation_id": 1, "folder_path": 1,
+        "subject": 1, "sender": 1, "recipients": 1,
+        "sent_at": 1, "received_at": 1, "modified_at": 1,
+        "is_read": 1, "is_draft": 1,
+        "has_attachments": 1, "attachment_count": 1, "attachments": 1,
+        "body_html": 1, "body_text": 1, "body_preview": 1,
+    }
+    # no_cursor_timeout cursor is closed explicitly in the finally block below.
+    cursor = mongo_coll.find({}, proj, no_cursor_timeout=True)
+    if limit:
+        cursor = cursor.limit(limit)
+
+    # Only an estimate for the log line; skipped documents are included.
+    total_pending = limit or mongo_coll.estimated_document_count()
+    print(f"[{mailbox}] kandidatu: ~{total_pending}")
+
+    processed = ok = errors = skipped = empty_body = 0
+    queue: list[dict] = []
+    n = 0  # number of Mongo documents seen, including skipped ones
+
+    try:
+        for doc in cursor:
+            n += 1
+            msg_id = doc.get("_id") or ""
+            prev = existing.get(msg_id)
+            mongo_mtime = doc.get("modified_at")
+            # NOTE(review): prev[1] comes from a TIMESTAMPTZ column and
+            # mongo_mtime from Mongo; if one is timezone-aware and the other
+            # naive (or a string), ">=" raises TypeError, and this check is
+            # outside the per-document try below - confirm the stored types.
+            if (prev and prev[0] == EXTRACTOR_VERSION and prev[2]
+                    and (mongo_mtime is None
+                         or (prev[1] and prev[1] >= mongo_mtime))):
+                skipped += 1
+                continue
+
+            sender = doc.get("sender") or {}
+            recipients = doc.get("recipients") or []
+            attachments = doc.get("attachments") or []
+
+            # Row defaults describe a failed/empty extraction; the try block
+            # below overwrites body*, ok and body_source on success.
+            row = {
+                "mailbox": mailbox,
+                "message_id": msg_id,
+                "graph_id": doc.get("graph_id"),
+                "conversation_id": doc.get("conversation_id"),
+                "folder_path": doc.get("folder_path"),
+                "subject": doc.get("subject") or "",
+                "sender_email": sender.get("email"),
+                "sender_name": sender.get("name"),
+                "to_addrs": fmt_recipients(recipients, "to"),
+                "cc_addrs": fmt_recipients(recipients, "cc"),
+                "bcc_addrs": fmt_recipients(recipients, "bcc"),
+                "sent_at": doc.get("sent_at"),
+                "received_at": doc.get("received_at"),
+                "modified_at": mongo_mtime,
+                "is_read": doc.get("is_read"),
+                "is_draft": doc.get("is_draft"),
+                "has_attachments": doc.get("has_attachments"),
+                "attachment_count": doc.get("attachment_count"),
+                "attachments_summary": fmt_attachments(attachments),
+                "body": None,
+                "body_length": 0,
+                "body_source": "empty",
+                "extracted_at": _now(),
+                "extractor_version": EXTRACTOR_VERSION,
+                "ok": False,
+                "error": None,
+            }
+
+            status = "OK "; detail = ""
+            try:
+                # fallback order (v1.1): body_html -> body_text -> body_preview
+                html = doc.get("body_html") or ""
+                text = html_to_text(html) if html else ""
+                if text:
+                    row["body_source"] = "html"
+                else:
+                    # HTML missing or it produced no text
+                    plain = doc.get("body_text") or ""
+                    if plain:
+                        text = plain
+                        row["body_source"] = "text"
+                    else:
+                        preview = doc.get("body_preview") or ""
+                        if preview:
+                            text = preview
+                            row["body_source"] = "preview"
+                        else:
+                            # An empty body still counts as ok=True below.
+                            row["body_source"] = "empty"
+                            empty_body += 1
+                body = _truncate(text)
+                row["body"] = body if body else None
+                row["body_length"] = len(body)  # characters, not bytes
+                row["ok"] = True
+                ok += 1
+                detail = f"{len(body)} znaku  {_short(body, 60)!r}"
+            except Exception as e:
+                # Extraction failed: the row is still stored, with ok=False
+                # and the error text (capped at 500 characters).
+                row["error"] = f"{type(e).__name__}: {e}"[:500]
+                status = "ERR"; detail = row["error"][:80]; errors += 1
+
+            queue.append(row)
+            processed += 1
+
+            # Progress line for the 1st and every 200th document seen
+            # (only reached when that document was not skipped).
+            if n % 200 == 0 or n == 1:
+                subj = _short(row["subject"], 50)
+                print(f"  [{n:>5}] {status} {row['body_source']:<7} "
+                      f"{row['body_length']:>7}ch  | {subj}", flush=True)
+
+            if len(queue) >= BATCH_SIZE:
+                _flush(pg, queue); queue.clear()
+    finally:
+        cursor.close()
+
+    # Write the last, incomplete batch.
+    if queue:
+        _flush(pg, queue)
+
+    return {"mailbox": mailbox, "processed": processed, "ok": ok,
+            "errors": errors, "skipped": skipped, "empty_body": empty_body}
+
+
+# Insert one email row keyed by (mailbox, message_id); when the key already
+# exists, every other listed column is overwritten with the new values.
+# Parameters are named and match the keys of the row dicts built in
+# process_mailbox().
+UPSERT_SQL = """
+INSERT INTO emails
+    (mailbox, message_id, graph_id, conversation_id, folder_path,
+     subject, sender_email, sender_name, to_addrs, cc_addrs, bcc_addrs,
+     sent_at, received_at, modified_at, is_read, is_draft,
+     has_attachments, attachment_count, attachments_summary,
+     body, body_length, body_source,
+     extracted_at, extractor_version, ok, error)
+VALUES
+    (%(mailbox)s, %(message_id)s, %(graph_id)s, %(conversation_id)s, %(folder_path)s,
+     %(subject)s, %(sender_email)s, %(sender_name)s, %(to_addrs)s, %(cc_addrs)s, %(bcc_addrs)s,
+     %(sent_at)s, %(received_at)s, %(modified_at)s, %(is_read)s, %(is_draft)s,
+     %(has_attachments)s, %(attachment_count)s, %(attachments_summary)s,
+     %(body)s, %(body_length)s, %(body_source)s,
+     %(extracted_at)s, %(extractor_version)s, %(ok)s, %(error)s)
+ON CONFLICT (mailbox, message_id) DO UPDATE SET
+    graph_id            = EXCLUDED.graph_id,
+    conversation_id     = EXCLUDED.conversation_id,
+    folder_path         = EXCLUDED.folder_path,
+    subject             = EXCLUDED.subject,
+    sender_email        = EXCLUDED.sender_email,
+    sender_name         = EXCLUDED.sender_name,
+    to_addrs            = EXCLUDED.to_addrs,
+    cc_addrs            = EXCLUDED.cc_addrs,
+    bcc_addrs           = EXCLUDED.bcc_addrs,
+    sent_at             = EXCLUDED.sent_at,
+    received_at         = EXCLUDED.received_at,
+    modified_at         = EXCLUDED.modified_at,
+    is_read             = EXCLUDED.is_read,
+    is_draft            = EXCLUDED.is_draft,
+    has_attachments     = EXCLUDED.has_attachments,
+    attachment_count    = EXCLUDED.attachment_count,
+    attachments_summary = EXCLUDED.attachments_summary,
+    body                = EXCLUDED.body,
+    body_length         = EXCLUDED.body_length,
+    body_source         = EXCLUDED.body_source,
+    extracted_at        = EXCLUDED.extracted_at,
+    extractor_version   = EXCLUDED.extractor_version,
+    ok                  = EXCLUDED.ok,
+    error               = EXCLUDED.error
+"""
+
+
+def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
+    for r in rows:
+        for k in ("subject", "sender_email", "sender_name", "to_addrs", "cc_addrs",
+                  "bcc_addrs", "attachments_summary", "body", "error", "folder_path"):
+            if r.get(k):
+                r[k] = _clean_for_pg(r[k])
+    with pg.cursor() as cur:
+        cur.executemany(UPSERT_SQL, rows)
+    pg.commit()
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)")
+    ap.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
+    args = ap.parse_args()
+
+    t0 = time.time()
+    print("Pripojuji se k PostgreSQL...")
+    # MongoEmaily DB musi existovat (create externe pres psql nebo DBeaver),
+    # protoze CREATE DATABASE nesmi byt v transakci.
+    pg = psycopg.connect(PG_DSN, connect_timeout=10)
+    with pg.cursor() as cur:
+        cur.execute(SCHEMA_SQL)
+    pg.commit()
+    print("Schema OK.")
+
+    print("Pripojuji se k MongoDB...")
+    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    mongo.admin.command("ping")
+    db = mongo[MONGO_DB]
+
+    if args.mailbox:
+        mailboxes = [args.mailbox]
+    else:
+        mailboxes = [c for c in db.list_collection_names() if c not in SKIP_COLLECTIONS]
+    print(f"Schranky ({len(mailboxes)}): {mailboxes}")
+
+    results = []
+    for mb in mailboxes:
+        results.append(process_mailbox(pg, db[mb], mb, limit=args.limit))
+
+    pg.close()
+
+    print("\n=== SHRNUTI ===")
+    for r in results:
+        print(f"  {r['mailbox']}: processed={r['processed']}  ok={r['ok']}  "
+              f"errors={r['errors']}  skipped={r['skipped']}  empty={r['empty_body']}")
+    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())
+    except KeyboardInterrupt:
+        print("\nPreruseno uzivatelem")
+    except Exception:
+        traceback.print_exc()
+        sys.exit(1)
diff --git a/Python-runner/run_pipeline.sh b/Python-runner/run_pipeline.sh
new file mode 100644
index 0000000..935f761
--- /dev/null
+++ b/Python-runner/run_pipeline.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# ============================================================================
+# Wrapper for the email pipeline. Calls Python wrapper inside python-runner
+# container. Logs to dated file. Cleans up logs older than 30 days.
+#
+# Install via User Scripts plugin or /etc/cron.d/email_pipeline:
+#   0 6,18 * * * /mnt/user/Scripts/run_pipeline.sh
+# ============================================================================
+
+# Treat use of an unset variable as an error.
+set -u
+
+LOG_DIR="/mnt/user/Scripts/logs"
+TIMESTAMP=$(date +%Y%m%d_%H%M)
+LOG_FILE="${LOG_DIR}/pipeline_${TIMESTAMP}.log"
+LATEST_LINK="${LOG_DIR}/pipeline_latest.log"
+RETENTION_DAYS=30
+
+mkdir -p "$LOG_DIR"
+
+echo "=== Email pipeline run @ $(date '+%Y-%m-%d %H:%M:%S') ===" >> "$LOG_FILE"
+
+# Update "latest" symlink for easy tailing. Done before anything can exit
+# early, so the link always points at the most recent run, even a failed one.
+ln -sf "$LOG_FILE" "$LATEST_LINK"
+
+# Make sure the container is running
+if ! docker inspect -f '{{.State.Running}}' python-runner 2>/dev/null | grep -q true; then
+    echo "ERROR: python-runner container is not running" >> "$LOG_FILE"
+    if ! docker start python-runner >> "$LOG_FILE" 2>&1; then
+        # Leave a clear trace in the log instead of exiting silently.
+        echo "ERROR: failed to start python-runner container" >> "$LOG_FILE"
+        echo "=== Wrapper finished @ $(date '+%Y-%m-%d %H:%M:%S') exit=1 ===" >> "$LOG_FILE"
+        exit 1
+    fi
+    # Give the container a moment to come up before exec-ing into it.
+    sleep 5
+fi
+
+# Run the pipeline inside the container; its exit status is the wrapper's.
+docker exec python-runner python /scripts/0_run_pipeline_v1.0.py --quiet >> "$LOG_FILE" 2>&1
+RET=$?
+
+echo "" >> "$LOG_FILE"
+echo "=== Wrapper finished @ $(date '+%Y-%m-%d %H:%M:%S') exit=$RET ===" >> "$LOG_FILE"
+
+# Cleanup logs older than RETENTION_DAYS (-type f leaves the symlink alone)
+find "$LOG_DIR" -name 'pipeline_*.log' -type f -mtime +"${RETENTION_DAYS}" -delete
+
+exit "$RET"
diff --git a/Soubory/mcp_soubory.py b/Soubory/mcp_soubory.py
new file mode 100644
index 0000000..08080e1
--- /dev/null
+++ b/Soubory/mcp_soubory.py
@@ -0,0 +1,672 @@
+#!/usr/bin/env python3
+"""
+==============================================================================
+MCP server: SOUBORY  (Dropbox studie 42847922MDD3003 + 77242113UCO3001)
+
+Hybridni dotaz nad:
+  - PostgreSQL  192.168.1.76  db=MongoSoubory  tabulka=documents
+                (fulltext tsvector index, ts_headline, ts_rank)
+  - MongoDB     192.168.1.76  db=soubory
+                kolekce=42847922MDD3003, 77242113UCO3001
+                (metadata, content.* z enrich_files_v1.0)
+
+Spusteni:
+    python mcp_soubory.py        (stdio MCP)
+
+Pridano do U:\\janssen\\.mcp.json jako "soubory".
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import sys
+import traceback
+from datetime import datetime, timezone, timedelta
+from typing import Optional, Union
+
+import psycopg
+from bson import ObjectId
+from mcp.server.fastmcp import FastMCP
+from pymongo import MongoClient
+
+# --- configuration ----------------------------------------------------------
+MONGO_URI = "mongodb://192.168.1.76:27017"
+MONGO_DB = "soubory"
+
+# Short alias -> Mongo collection name (the same value as documents.study in PG)
+STUDY_MAP = {
+    "MDD3003": "42847922MDD3003",
+    "UCO3001": "77242113UCO3001",
+}
+# Full study codes, in STUDY_MAP order.
+STUDY_ALL = list(STUDY_MAP.values())
+
+# NOTE(review): the user name and password are hard-coded and committed with
+# the source - consider loading the DSN from the environment or a secrets file.
+PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoSoubory "
+          "user=vladimir.buzalka password=Vlado7309208104++")
+
+# How much of a document body is returned by default (so that a tool response
+# does not get huge), and the hard upper bound a caller may request.
+DEFAULT_BODY_CHARS = 8000
+MAX_BODY_CHARS = 200_000
+
+
+def log(msg: str) -> None:
+    print(msg, file=sys.stderr, flush=True)
+
+
+# --- client initialisation --------------------------------------------------
+# Runs at import time. The server refuses to start (exit code 1) when either
+# database is unreachable.
+try:
+    # Module-wide Mongo client, reused by all tools below.
+    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    mongo.admin.command("ping")
+    log(f"Mongo OK ({MONGO_URI})")
+except Exception as e:
+    log(f"Mongo connection failed: {e}")
+    sys.exit(1)
+
+try:
+    # Connectivity check only: the connection is closed again right away and
+    # each tool opens its own via pg_conn().
+    _test = psycopg.connect(PG_DSN, connect_timeout=10)
+    _test.close()
+    log("Postgres OK")
+except Exception as e:
+    log(f"Postgres connection failed: {e}")
+    sys.exit(1)
+
+
+def pg_conn():
+    return psycopg.connect(PG_DSN, connect_timeout=10)
+
+
+def serialize(obj):
+    if isinstance(obj, ObjectId):
+        return str(obj)
+    if isinstance(obj, datetime):
+        return obj.isoformat()
+    if isinstance(obj, bytes):
+        return obj.decode("utf-8", errors="replace")
+    if isinstance(obj, dict):
+        return {k: serialize(v) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [serialize(v) for v in obj]
+    return obj
+
+
+def resolve_studies(study: Optional[Union[str, list]]) -> Optional[list[str]]:
+    """Alias 'MDD3003' / 'UCO3001' -> plne nazvy kolekce. None -> obe (vraci None pro PG = bez filtru)."""
+    if study is None or study == "" or study == []:
+        return None
+    if isinstance(study, str):
+        study = [study]
+    out = []
+    for s in study:
+        if s in STUDY_MAP:
+            out.append(STUDY_MAP[s])
+        elif s in STUDY_MAP.values():
+            out.append(s)
+        else:
+            raise ValueError(f"Unknown study {s!r}. Use MDD3003 / UCO3001 or full code.")
+    return out
+
+
+def normalize_exts(ext: Optional[Union[str, list]]) -> Optional[list[str]]:
+    if ext is None or ext == "" or ext == []:
+        return None
+    if isinstance(ext, str):
+        ext = [ext]
+    return [e.lower().lstrip(".") for e in ext]
+
+
+def parse_since(since: Optional[str]) -> Optional[datetime]:
+    if not since:
+        return None
+    # akceptuj YYYY-MM-DD i ISO
+    try:
+        if "T" in since:
+            return datetime.fromisoformat(since.replace("Z", "+00:00"))
+        return datetime.strptime(since, "%Y-%m-%d").replace(tzinfo=timezone.utc)
+    except Exception as e:
+        raise ValueError(f"Bad date {since!r}: {e}")
+
+
+def short_meta(content: dict) -> dict:
+    """Zhustene metadata z content.* pro tool response."""
+    if not content or not content.get("ok", True):
+        return {"ok": False, "error": (content or {}).get("error")}
+    out = {}
+    for k in ("title", "subject", "author", "last_modified_by",
+              "from", "to", "cc", "date", "pages", "slides",
+              "total_sheets", "paragraphs", "words",
+              "created", "modified", "encrypted"):
+        if k in content and content[k] not in (None, "", []):
+            v = content[k]
+            if isinstance(v, str) and len(v) > 200:
+                v = v[:200] + "..."
+            out[k] = v
+    if "sheets" in content:
+        out["sheet_names"] = [s.get("name") for s in content.get("sheets", []) if s]
+    if "attachments" in content:
+        out["attachment_count"] = len(content.get("attachments") or [])
+        if out["attachment_count"]:
+            out["attachments"] = content["attachments"][:10]
+    if "text_head" in content:
+        head = content["text_head"]
+        out["text_head"] = head[:400] + ("..." if head and len(head) > 400 else "")
+    return out
+
+
+# --- MCP --------------------------------------------------------------------
+# Server instance named "soubory"; the functions decorated with @mcp.tool()
+# below are registered on it.
+mcp = FastMCP("soubory")
+
+
+@mcp.tool()
+def ping() -> dict:
+    """Quick health check. Reports Mongo + Postgres connectivity, totals per study, and PG documents.ok count.
+    Call this first when starting an investigation to confirm everything is up.
+    """
+    try:
+        info = mongo.admin.command("buildInfo")
+        study_counts = {}
+        for code in STUDY_ALL:
+            study_counts[code] = mongo[MONGO_DB][code].estimated_document_count()
+        with pg_conn() as pg, pg.cursor() as cur:
+            cur.execute("SELECT study, ok, count(*) FROM documents GROUP BY study, ok ORDER BY study, ok")
+            rows = cur.fetchall()
+        pg_summary = {}
+        for s, ok, c in rows:
+            pg_summary.setdefault(s, {})[("ok" if ok else "error")] = c
+        return {
+            "status": "ok",
+            "mongo_version": info.get("version"),
+            "mongo_files_per_study": study_counts,
+            "pg_documents_per_study": pg_summary,
+            "studies": STUDY_MAP,
+        }
+    except Exception as e:
+        log(traceback.format_exc())
+        return {"status": "error", "error": str(e)}
+
+
+@mcp.tool()
+def list_studies() -> dict:
+    """Overview of both studies — total files, breakdown by extension, fulltext coverage,
+    earliest/latest mtime. Use this to understand the corpus before searching.
+    """
+    out = {}
+    try:
+        for alias, code in STUDY_MAP.items():
+            col = mongo[MONGO_DB][code]
+            total = col.count_documents({})
+            deleted = col.count_documents({"deleted_at": {"$exists": True}})
+            ext_breakdown = list(col.aggregate([
+                {"$match": {"deleted_at": {"$exists": False}}},
+                {"$group": {"_id": "$ext", "count": {"$sum": 1}}},
+                {"$sort": {"count": -1}},
+            ]))
+            mtime_minmax = list(col.aggregate([
+                {"$match": {"deleted_at": {"$exists": False}}},
+                {"$group": {"_id": None,
+                            "min_mtime": {"$min": "$mtime"},
+                            "max_mtime": {"$max": "$mtime"}}},
+            ]))
+            with pg_conn() as pg, pg.cursor() as cur:
+                cur.execute(
+                    "SELECT count(*) FILTER (WHERE ok), count(*) FROM documents WHERE study=%s",
+                    (code,),
+                )
+                pg_ok, pg_total = cur.fetchone()
+            out[alias] = {
+                "code": code,
+                "mongo_total": total,
+                "mongo_active": total - deleted,
+                "mongo_deleted": deleted,
+                "by_ext": {r["_id"]: r["count"] for r in ext_breakdown},
+                "fulltext_indexed": pg_ok,
+                "fulltext_failed": pg_total - pg_ok,
+                "oldest_mtime": serialize(mtime_minmax[0]["min_mtime"]) if mtime_minmax else None,
+                "newest_mtime": serialize(mtime_minmax[0]["max_mtime"]) if mtime_minmax else None,
+            }
+        return {"studies": out}
+    except Exception as e:
+        log(traceback.format_exc())
+        raise
+
+
+@mcp.tool()
+def search(
+    query: str,
+    study: Optional[Union[str, list]] = None,
+    ext: Optional[Union[str, list]] = None,
+    since: Optional[str] = None,
+    folder: Optional[str] = None,
+    limit: int = 15,
+    with_metadata: bool = True,
+) -> dict:
+    """PRIMARY TOOL — fulltext search across all parsed documents in both studies.
+
+    query: search expression in PostgreSQL websearch_to_tsquery syntax:
+        adverse event          -> AND (both must appear)
+        "adverse event"        -> exact phrase
+        adverse OR serious     -> OR
+        adverse -mild          -> exclude
+    study: "MDD3003", "UCO3001", or list. None = both.
+    ext: filter file types: ["pdf", "docx", "xlsx", "xlsm", "pptx", "eml", "msg", "txt", "csv"]
+    since: ISO date "YYYY-MM-DD" — only files modified on/after this date
+    folder: substring match against any parent folder name (e.g. "CRF", "Training")
+    limit: max results (default 15, max 100)
+    with_metadata: if True, also fetch content.* metadata from Mongo (author, pages, sheets, EML headers)
+
+    Returns ranked results with `snippet` showing matches highlighted with <<...>>.
+    Use `read_document` to fetch full body of a specific hit.
+    """
+    # Any failure (bad study/date, database error) is logged to stderr and
+    # returned as {"error": ..., "query": ...} instead of being raised.
+    try:
+        # Normalize the filters; None means "no filter" in the SQL below.
+        studies = resolve_studies(study)
+        exts = normalize_exts(ext)
+        since_dt = parse_since(since)
+        limit = min(max(1, limit), 100)  # clamp to 1..100
+
+        # Rank by ts_rank; the snippet is built from the first 200000
+        # characters of the body only.
+        sql = """
+        WITH q AS (
+            SELECT websearch_to_tsquery('soubory'::regconfig, %(query)s) AS tsq
+        )
+        SELECT
+            d.id, d.mongo_id, d.study, d.path, d.rel_path, d.name, d.ext,
+            d.size_bytes, d.mtime, d.body_length,
+            ts_rank(d.tsv, q.tsq) AS rank,
+            ts_headline('soubory'::regconfig,
+                left(d.body, 200000),
+                q.tsq,
+                'MaxFragments=3, MinWords=4, MaxWords=18, '
+                'StartSel=<<, StopSel=>>, FragmentDelimiter= ... ') AS snippet
+        FROM documents d, q
+        WHERE d.tsv @@ q.tsq
+          AND d.ok = TRUE
+          AND (%(studies)s::text[] IS NULL OR d.study = ANY(%(studies)s::text[]))
+          AND (%(exts)s::text[]    IS NULL OR d.ext   = ANY(%(exts)s::text[]))
+          AND (%(since)s::timestamptz IS NULL OR d.mtime >= %(since)s::timestamptz)
+        ORDER BY rank DESC, d.mtime DESC NULLS LAST
+        LIMIT %(limit)s
+        """
+        params = {"query": query, "studies": studies, "exts": exts,
+                  "since": since_dt, "limit": limit}
+
+        with pg_conn() as pg, pg.cursor() as cur:
+            cur.execute(sql, params)
+            cols = [c.name for c in cur.description]
+            rows = [dict(zip(cols, r)) for r in cur.fetchall()]
+
+        # filter by folder via Mongo (PG has no parent_folders)
+        # One Mongo query per study fetches the documents of all hits by path.
+        meta_by_path: dict[str, dict] = {}
+        if rows and (with_metadata or folder):
+            by_study: dict[str, list[str]] = {}
+            for r in rows:
+                by_study.setdefault(r["study"], []).append(r["path"])
+            for code, paths in by_study.items():
+                proj = {"path": 1, "parent_folders": 1, "dates_in_name": 1}
+                if with_metadata:
+                    proj["content"] = 1
+                for d in mongo[MONGO_DB][code].find({"path": {"$in": paths}}, proj):
+                    meta_by_path[d["path"]] = d
+
+        # NOTE(review): the folder filter runs after the SQL LIMIT, so it can
+        # only narrow the top `limit` hits - matching documents ranked lower
+        # are never seen and fewer than `limit` results may be returned.
+        if folder:
+            needle = folder.lower()
+            kept = []
+            for r in rows:
+                folders = (meta_by_path.get(r["path"]) or {}).get("parent_folders") or []
+                if any(needle in (f or "").lower() for f in folders):
+                    kept.append(r)
+            rows = kept
+
+        # Merge the PG hit with the Mongo metadata (if any was fetched).
+        results = []
+        for r in rows:
+            mongo_doc = meta_by_path.get(r["path"]) or {}
+            results.append({
+                "study": r["study"],
+                "path": r["path"],
+                "rel_path": r["rel_path"],
+                "name": r["name"],
+                "ext": r["ext"],
+                "size_mb": round((r["size_bytes"] or 0) / 1024 / 1024, 2),
+                "mtime": serialize(r["mtime"]),
+                "body_length": r["body_length"],
+                "rank": round(float(r["rank"]), 5),
+                "snippet": (r["snippet"] or "").strip(),
+                "mongo_id": r["mongo_id"],
+                "dates_in_name": mongo_doc.get("dates_in_name"),
+                "metadata": short_meta(mongo_doc.get("content") or {}) if with_metadata else None,
+            })
+
+        return {
+            "query": query,
+            "filters": {"study": studies, "ext": exts, "since": since,
+                        "folder": folder, "limit": limit},
+            "count": len(results),
+            "results": results,
+            "tip": "Use read_document(path=...) to fetch full body of any hit.",
+        }
+    except Exception as e:
+        log(traceback.format_exc())
+        return {"error": str(e), "query": query}
+
+
+@mcp.tool()
+def read_document(
+    path: Optional[str] = None,
+    mongo_id: Optional[str] = None,
+    offset: int = 0,
+    length: int = DEFAULT_BODY_CHARS,
+    around_match: Optional[str] = None,
+) -> dict:
+    """Read the full parsed text of one document (PG body column) + its Mongo metadata.
+
+    Identify the document by EITHER `path` (absolute) OR `mongo_id`.
+    offset, length: slice the body (default first 8000 chars). length capped at 200000.
+    around_match: if given, return up to 3 windows of ~1000 chars centered on the first matches
+                  of this substring (case-insensitive). Useful to jump to a keyword in a long doc.
+
+    Body is truncated to fit; check `body_length` vs returned length to know if more exists.
+    Use offset to page further (offset=8000, then 16000, ...).
+    """
+    try:
+        if not path and not mongo_id:
+            return {"error": "Provide either path or mongo_id."}
+
+        length = min(max(1, length), MAX_BODY_CHARS)
+
+        sql = """
+        SELECT id, mongo_id, study, path, rel_path, name, ext, sha256,
+               size_bytes, mtime, body, body_length, extractor_version,
+               extracted_at, ok, error
+        FROM documents
+        WHERE """ + ("path = %s" if path else "mongo_id = %s") + " LIMIT 1"
+
+        with pg_conn() as pg, pg.cursor() as cur:
+            cur.execute(sql, (path or mongo_id,))
+            row = cur.fetchone()
+            cols = [c.name for c in cur.description]
+        if not row:
+            return {"error": "Document not found.", "path": path, "mongo_id": mongo_id}
+        rec = dict(zip(cols, row))
+
+        body = rec.get("body") or ""
+
+        if around_match and body:
+            needle = around_match.lower()
+            hay = body.lower()
+            windows = []
+            start = 0
+            while len(windows) < 3:
+                pos = hay.find(needle, start)
+                if pos < 0:
+                    break
+                lo = max(0, pos - 400)
+                hi = min(len(body), pos + 600)
+                windows.append({"offset": lo, "text": body[lo:hi]})
+                start = pos + len(needle)
+            body_out = None
+            slice_info = {"mode": "around_match", "match": around_match,
+                          "windows": windows, "windows_found": len(windows)}
+        else:
+            body_out = body[offset:offset + length]
+            slice_info = {
+                "mode": "slice", "offset": offset,
+                "length_returned": len(body_out),
+                "has_more": offset + length < len(body),
+                "next_offset": offset + length if offset + length < len(body) else None,
+            }
+
+        # Mongo metadata
+        col_code = rec["study"]
+        mdoc = mongo[MONGO_DB][col_code].find_one(
+            {"path": rec["path"]},
+            {"content": 1, "dates_in_name": 1, "parent_folders": 1, "tokens": 1},
+        ) or {}
+
+        out = {
+            "study": rec["study"],
+            "path": rec["path"],
+            "rel_path": rec["rel_path"],
+            "name": rec["name"],
+            "ext": rec["ext"],
+            "size_mb": round((rec["size_bytes"] or 0) / 1024 / 1024, 2),
+            "mtime": serialize(rec["mtime"]),
+            "sha256": rec["sha256"],
+            "body_length": rec["body_length"],
+            "extractor_version": rec["extractor_version"],
+            "extracted_at": serialize(rec["extracted_at"]),
+            "ok": rec["ok"],
+            "error": rec["error"],
+            "parent_folders": mdoc.get("parent_folders"),
+            "dates_in_name": mdoc.get("dates_in_name"),
+            "metadata": short_meta(mdoc.get("content") or {}),
+        }
+        if body_out is not None:
+            out["body"] = body_out
+        out["slice"] = slice_info
+        return out
+    except Exception as e:
+        log(traceback.format_exc())
+        return {"error": str(e)}
+
+
+@mcp.tool()
+def get_metadata(path: str) -> dict:
+    """Return raw Mongo document for one path (full content.*, parent_folders, dates_in_name,
+    sha256, sizes, timestamps, tokens). Use when you need the full structured metadata —
+    e.g. all sheet names of an XLSX, all attachments of an email, full author info.
+    Does NOT return body text — use `read_document` for that.
+    """
+    try:
+        for code in STUDY_ALL:
+            d = mongo[MONGO_DB][code].find_one({"path": path})
+            if d:
+                return serialize(d)
+        return {"error": "Not found in any study collection.", "path": path}
+    except Exception as e:
+        log(traceback.format_exc())
+        return {"error": str(e)}
+
+
+@mcp.tool()
+def recent_files(
+    study: Optional[Union[str, list]] = None,
+    days: int = 7,
+    ext: Optional[Union[str, list]] = None,
+    limit: int = 30,
+) -> dict:
+    """List most recently modified files (no fulltext involved). Use for "what changed lately"
+    or "what did I get this week" questions.
+
+    days: window from now (default 7). Set to 0 for no time filter (just top-N newest).
+    """
+    try:
+        studies = resolve_studies(study) or STUDY_ALL
+        exts = normalize_exts(ext)
+        limit = min(max(1, limit), 200)
+
+        q: dict = {"deleted_at": {"$exists": False}}
+        if exts:
+            q["ext"] = {"$in": exts}
+        if days and days > 0:
+            since_dt = datetime.now(timezone.utc) - timedelta(days=days)
+            q["mtime"] = {"$gte": since_dt}
+
+        results = []
+        for code in studies:
+            for d in (mongo[MONGO_DB][code]
+                      .find(q, {"path": 1, "rel_path": 1, "name": 1, "ext": 1,
+                                "size_bytes": 1, "mtime": 1, "study": 1,
+                                "content.author": 1, "content.title": 1,
+                                "content.last_modified_by": 1})
+                      .sort("mtime", -1).limit(limit)):
+                results.append({
+                    "study": d.get("study"),
+                    "path": d["path"],
+                    "rel_path": d.get("rel_path"),
+                    "name": d.get("name"),
+                    "ext": d.get("ext"),
+                    "size_mb": round((d.get("size_bytes") or 0) / 1024 / 1024, 2),
+                    "mtime": serialize(d.get("mtime")),
+                    "author": (d.get("content") or {}).get("author"),
+                    "title": (d.get("content") or {}).get("title"),
+                    "last_modified_by": (d.get("content") or {}).get("last_modified_by"),
+                })
+        results.sort(key=lambda r: r["mtime"] or "", reverse=True)
+        return {"days": days, "count": len(results[:limit]), "results": results[:limit]}
+    except Exception as e:
+        log(traceback.format_exc())
+        return {"error": str(e)}
+
+
+@mcp.tool()
+def find_duplicates(
+    study: Optional[Union[str, list]] = None,
+    min_size_kb: int = 10,
+    limit: int = 30,
+) -> dict:
+    """Find groups of files with identical content (same sha256) but at different paths.
+    Reveals copies of the same document scattered across folders / studies.
+
+    min_size_kb: ignore tiny duplicate groups (default 10 KB)
+    limit: max duplicate groups returned
+    """
+    try:
+        studies = resolve_studies(study) or STUDY_ALL
+        pipeline = [
+            {"$match": {"deleted_at": {"$exists": False},
+                        "size_bytes": {"$gte": min_size_kb * 1024}}},
+            {"$group": {"_id": "$sha256",
+                        "count": {"$sum": 1},
+                        "size_bytes": {"$first": "$size_bytes"},
+                        "ext": {"$first": "$ext"},
+                        "paths": {"$push": {"study": "$study",
+                                            "path": "$path",
+                                            "rel_path": "$rel_path",
+                                            "mtime": "$mtime"}}}},
+            {"$match": {"count": {"$gte": 2}}},
+            {"$sort": {"size_bytes": -1, "count": -1}},
+            {"$limit": limit},
+        ]
+
+        all_groups: dict = {}
+        for code in studies:
+            for g in mongo[MONGO_DB][code].aggregate(pipeline):
+                sha = g["_id"]
+                if sha in all_groups:
+                    all_groups[sha]["count"] += g["count"]
+                    all_groups[sha]["paths"].extend(g["paths"])
+                else:
+                    all_groups[sha] = {
+                        "sha256": sha, "count": g["count"], "ext": g["ext"],
+                        "size_mb": round(g["size_bytes"] / 1024 / 1024, 2),
+                        "paths": g["paths"],
+                    }
+
+        groups = sorted(all_groups.values(),
+                        key=lambda x: (x["size_mb"], x["count"]), reverse=True)[:limit]
+        for g in groups:
+            for p in g["paths"]:
+                p["mtime"] = serialize(p.get("mtime"))
+        return {
+            "filters": {"study": studies, "min_size_kb": min_size_kb},
+            "group_count": len(groups),
+            "wasted_mb_estimate": round(sum(g["size_mb"] * (g["count"] - 1) for g in groups), 2),
+            "groups": groups,
+        }
+    except Exception as e:
+        log(traceback.format_exc())
+        return {"error": str(e)}
+
+
+@mcp.tool()
+def by_author(
+    name: str,
+    study: Optional[Union[str, list]] = None,
+    ext: Optional[Union[str, list]] = None,
+    limit: int = 30,
+) -> dict:
+    """Find documents where content.author OR content.last_modified_by matches `name` (case-insensitive substring).
+    Works for DOCX/XLSX/PPTX/PDF embedded metadata. Use for "what did X write" or "who edited this".
+    """
+    try:
+        studies = resolve_studies(study) or STUDY_ALL
+        exts = normalize_exts(ext)
+        limit = min(max(1, limit), 200)
+
+        rx = {"$regex": name, "$options": "i"}
+        q: dict = {"deleted_at": {"$exists": False},
+                   "$or": [{"content.author": rx},
+                           {"content.last_modified_by": rx}]}
+        if exts:
+            q["ext"] = {"$in": exts}
+
+        results = []
+        for code in studies:
+            for d in (mongo[MONGO_DB][code]
+                      .find(q, {"path": 1, "rel_path": 1, "name": 1, "ext": 1,
+                                "size_bytes": 1, "mtime": 1, "study": 1, "content": 1})
+                      .sort("mtime", -1).limit(limit)):
+                c = d.get("content") or {}
+                results.append({
+                    "study": d.get("study"),
+                    "path": d["path"],
+                    "rel_path": d.get("rel_path"),
+                    "name": d.get("name"),
+                    "ext": d.get("ext"),
+                    "mtime": serialize(d.get("mtime")),
+                    "size_mb": round((d.get("size_bytes") or 0) / 1024 / 1024, 2),
+                    "author": c.get("author"),
+                    "last_modified_by": c.get("last_modified_by"),
+                    "title": c.get("title"),
+                })
+        results.sort(key=lambda r: r["mtime"] or "", reverse=True)
+        return {"author_match": name, "count": len(results[:limit]), "results": results[:limit]}
+    except Exception as e:
+        log(traceback.format_exc())
+        return {"error": str(e)}
+
+
+@mcp.tool()
+def browse_folder(
+    folder: str,
+    study: Optional[Union[str, list]] = None,
+    ext: Optional[Union[str, list]] = None,
+    limit: int = 100,
+) -> dict:
+    """List files where any parent folder name contains `folder` (case-insensitive substring match).
+    Use for "show me what's in the CRF folder" or "what's in Training". Returns just metadata,
+    no body text. Files sorted by relative path.
+    """
+    try:
+        studies = resolve_studies(study) or STUDY_ALL
+        exts = normalize_exts(ext)
+        limit = min(max(1, limit), 500)
+
+        rx = {"$regex": folder, "$options": "i"}
+        q: dict = {"deleted_at": {"$exists": False}, "parent_folders": rx}
+        if exts:
+            q["ext"] = {"$in": exts}
+
+        results = []
+        for code in studies:
+            for d in (mongo[MONGO_DB][code]
+                      .find(q, {"path": 1, "rel_path": 1, "name": 1, "ext": 1,
+                                "size_bytes": 1, "mtime": 1, "study": 1,
+                                "parent_folders": 1, "dates_in_name": 1})
+                      .sort("rel_path", 1).limit(limit)):
+                results.append({
+                    "study": d.get("study"),
+                    "path": d["path"],
+                    "rel_path": d.get("rel_path"),
+                    "name": d.get("name"),
+                    "ext": d.get("ext"),
+                    "mtime": serialize(d.get("mtime")),
+                    "size_mb": round((d.get("size_bytes") or 0) / 1024 / 1024, 2),
+                    "parent_folders": d.get("parent_folders"),
+                    "dates_in_name": d.get("dates_in_name"),
+                })
+        return {"folder_match": folder, "count": len(results), "results": results}
+    except Exception as e:
+        log(traceback.format_exc())
+        return {"error": str(e)}
+
+
+# Script entry point: write a startup line to the log, then start the server
+# through mcp.run().
+if __name__ == "__main__":
+    log("MCP soubory server started (FastMCP)")
+    mcp.run()
diff --git a/Soubory/priklady_dotazu.md b/Soubory/priklady_dotazu.md
new file mode 100644
index 0000000..a2fef75
--- /dev/null
+++ b/Soubory/priklady_dotazu.md
@@ -0,0 +1,210 @@
+# Příklady dotazů — MCP `soubory`
+
+10 příkladů od nejjednoduššího po nejsložitější. Každý je nejdřív stručně, pak rozepsaný.
+
+Volání je přes `search(...)`. V Claude chatu se ptáš normálně česky ("najdi mi…") a Claude pod kapotou volá `mcp__soubory__search(...)`. Tady ukazuju **přímé volání** tak, abys viděl co se kombinuje.
+
+---
+
+## Přehled (od nejlehčího po nejtěžší)
+
+| # | Příklad |
+|---|---|
+| 1 | `search("randomization")` |
+| 2 | `search("adverse event")` |
+| 3 | `search('"protocol deviation"')` |
+| 4 | `search("randomization", ext=["xlsx","xlsm"])` |
+| 5 | `search("SAE", study="UCO3001", ext=["eml","msg"])` |
+| 6 | `search('"kit number"', folder="CRF", since="2025-06-01")` |
+| 7 | `search("adverse OR serious -mild")` |
+| 8 | `search('"serious adverse event" -draft -obsolete', ext=["docx","pdf"])` |
+| 9 | `search('icotrokinra placebo', study="UCO3001", folder="Training", limit=30)` |
+| 10 | `search('"lot expiration" OR "expirační" OR "expiry"', ext=["eml","msg","pdf"], since="2025-01-01")` |
+
+---
+
+## 1. Nejjednodušší — jedno slovo
+
+```python
+search("randomization")
+```
+
+**Co to dělá:** Najde všechny dokumenty z obou studií, které kdekoli v textu obsahují slovo "randomization". Bez filtru typu souboru, studie, ani data. Vrátí 15 nejlépe rankovaných.
+
+**Kdy použít:** Když máš jen obecné slovo a nevíš kde to může být. Dobré pro první nástřel — uvidíš, ve kterých typech souborů a v kterých složkách se to vyskytuje.
+
+**Trik:** Slovník indexu používá `unaccent`, takže `príloha` najde i `priloha` (diakritika se neřeší).
+
+---
+
+## 2. Dvě slova — implicitní AND
+
+```python
+search("adverse event")
+```
+
+**Co to dělá:** Najde dokumenty, kde se vyskytují **obě** slova "adverse" a "event" — ale **kdekoli v dokumentu**, nemusí být vedle sebe. Mohou být klidně na různých stranách.
+
+**Kdy použít:** Když chceš zúžit širší slovo (`adverse` samotné by našlo i `adversely`). Dvě slova = silnější rank.
+
+**Rozdíl proti #3:** "adverse" může být na straně 5 a "event" na straně 150 — pořád match.
+
+---
+
+## 3. Přesná fráze
+
+```python
+search('"protocol deviation"')
+```
+
+**Co to dělá:** Najde dokumenty, kde jsou tato dvě slova **přímo vedle sebe** v tomto pořadí. "protocol of deviation" už nematchne, "deviation from protocol" taky ne.
+
+**Kdy použít:** Pro odborné termíny, názvy formulářů, ustálené fráze. Mnohem ostřejší než AND.
+
+**Pozor:** Uvozovky musí být přesně `"..."` (PowerShell může vyžadovat escape: `'"protocol deviation"'`).
+
+---
+
+## 4. Filtr typu souboru
+
+```python
+search("randomization", ext=["xlsx", "xlsm"])
+```
+
+**Co to dělá:** Stejné jako #1, ale jen v Excelech (`.xlsx` + `.xlsm`). Užitečné když víš, že to bude v tabulce — typicky randomizační listy, IWRS exporty.
+
+**Kdy použít:** Když chceš najít data, ne dokumentaci. Excel = data tabulky, PDF/DOCX = popis.
+
+**Tip:** Metadata v odpovědi obsahují `sheet_names` — uvidíš ve kterých listech to může být. Pak otevřeš ten Excel rovnou na správném listu.
+
+---
+
+## 5. Studie + typ — kombinovaný filtr
+
+```python
+search("SAE", study="UCO3001", ext=["eml", "msg"])
+```
+
+**Co to dělá:** Najde emaily (EML i MSG) z **UCO3001 studie**, které obsahují slovo "SAE" (Serious Adverse Event). Metadata vrátí `from`, `to`, `subject`, `date`, počet příloh.
+
+**Kdy použít:** "Kdo mi psal o SAE případu" — typický audit dotaz.
+
+**Trik:** Kombinace `study + ext` je výkonná — Postgres má index `(study, ext)` přímo na to.
+
+---
+
+## 6. Tři filtry — fráze + složka + datum
+
+```python
+search('"kit number"', folder="CRF", since="2025-06-01")
+```
+
+**Co to dělá:** Najde dokumenty obsahující frázi "kit number", ale jen ty **uložené v jakékoli složce s "CRF" v názvu**, a **modifikované od 1. června 2025** dál.
+
+**Kdy použít:** Když si pamatuješ kontext ("bylo to v CRF dokumentaci po SIVu") ale ne celý text.
+
+**Jak to funguje pod kapotou:**
+1. Postgres najde fulltextové matche
+2. Mongo dotáhne `parent_folders` a `mtime`
+3. Filtruje se v Pythonu — proto se to dělá jako AND nad všemi třemi
+
+---
+
+## 7. OR + NOT — logické operátory
+
+```python
+search("adverse OR serious -mild")
+```
+
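+**Co to dělá:** Najde dokumenty, kde je **buď** "adverse", **nebo** "serious" bez slova "mild" — `-mild` se kvůli prioritě operátorů (viz níže) váže jen na "serious", takže dokument s "adverse" i "mild" se pořád najde.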
+
+**Kdy použít:** Když máš více synonym a chceš jeden dotaz místo tří. `-mild` vyloučí typicky tréninkové materiály ("mild AE example") nebo nezávažné případy.
+
+**Důležité — priorita operátorů:**
+- `A OR B C` se vyhodnotí jako `A OR (B AND C)`
+- `websearch_to_tsquery` **nemá závorky** — nemůžeš to přeskupit
+- Když potřebuješ jiné pořadí, rozděl na dva dotazy
+
+---
+
+## 8. Fráze + dvě vyloučení + ext filtr
+
+```python
+search('"serious adverse event" -draft -obsolete', ext=["docx", "pdf"])
+```
+
+**Co to dělá:** Přesná fráze "serious adverse event", ale **bez** dokumentů obsahujících slova "draft" nebo "obsolete", a jen v Wordech a PDFkách.
+
+**Kdy použít:** Když chceš jen **finální** verze dokumentů. V Dropboxu typicky najdeš 5 verzí toho samého (draft, v0.9, v1.0, v1.0_FINAL, OBSOLETE) — tohle odřízne šum.
+
+**Kombinace technik:** fráze + vícenásobné NOT + typ. Reálné dotazy v práci vypadají takhle.
+
+---
+
+## 9. Dvě AND slova + 2 filtry + víc výsledků
+
+```python
+search("icotrokinra placebo", study="UCO3001", folder="Training", limit=30)
+```
+
+**Co to dělá:** Najde tréninkové materiály z UCO3001, kde se mluví **jak o léku icotrokinra, tak o placebu** (typicky srovnání ramen studie). `limit=30` místo defaultních 15.
+
+**Kdy použít:**
+- Onboarding nového člena týmu — "dej mi všechny prezentace co srovnávají větve studie"
+- Příprava na monitorovací návštěvu
+- Hledání edukačního obsahu pro pacienty
+
+**Proč 30:** Tréninkových materiálů bývá hodně verzí (každý SIV nové), default 15 by jich pravděpodobně neukázal všechny.
+
+---
+
+## 10. Nejtěžší — vícejazyčné OR + tři typy + datum
+
+```python
+search(
+    '"lot expiration" OR "expirační" OR "expiry"',
+    ext=["eml", "msg", "pdf"],
+    since="2025-01-01"
+)
+```
+
+**Co to dělá:** Najde **dokumentaci a komunikaci o expiraci léků** z roku 2025. Hledá ve třech jazykových variantách (EN fráze "lot expiration", CZ "expirační", krátké "expiry") napříč emaily, MSG soubory a PDFkami.
+
+**Kdy použít:** Typický compliance dotaz — "ukaž mi všechno co tento rok řešilo expiraci kitů". Kombinuje:
+- **vícejazyčnost** (sponsor píše anglicky, ty notifikuješ česky)
+- **více kanálů** (emaily i oficiální PDF dokumenty)
+- **časové okno** (relevantní jen letošek)
+
+**Pod kapotou:**
+1. PG fulltext spojí 3 OR větve do jednoho tsquery
+2. Filtr `ext` IN ('eml','msg','pdf') na PG úrovni
+3. Filtr `since` na sloupec `mtime` (indexovaný)
+4. Mongo metadata: u emailů `from/to/subject`, u PDF `pages/author`
+5. Výsledky setříděné podle `ts_rank` (nejvíc relevantní nahoře)
+
+**Tohle je ten případ kdy nakombinuješ úplně všechno** a Claude ti pak v chatu napíše: *"Našel jsem 8 dokumentů. 3 emaily od monitorky z března, 1 PDF notifikace IWRS z června, ..."*. Přesně proto jsme to stavěli.
+
+---
+
+## Shrnutí — pravidla palce
+
+- **Začni jednoduše** (1 slovo) → uvidíš co je v korpusu → zužuj
+- **Fráze (`"..."`)** je vždy ostřejší než AND
+- **`-slovo`** je tvůj nejlepší kamarád proti šumu (draft, obsolete, training)
+- **`ext=[...]`** dramaticky zrychlí dotaz a vyfiltruje formátový šum
+- **`folder=...`** funguje skvěle pokud máš konzistentní strukturu složek (#190 eCRF, #200 Training, …)
+- **`since=...`** používej kdykoli tě zajímá "co je nového"
+- **Diakritika se neřeší** — `expirace` najde i `expirační` (oboje má root `expira`)
+- **Není wildcard** — `randomiz*` nefunguje, ale `randomization` a `randomized` jsou různá slova → dej je do OR
+
+---
+
+## Co když fulltext nestačí?
+
+Jiné nástroje MCP:
+
+- **`by_author("Hazzard")`** — kdo psal/upravoval (DOCX/PPTX metadata)
+- **`recent_files(days=7)`** — co se změnilo bez ohledu na obsah
+- **`find_duplicates()`** — kolikrát mám stejný soubor
+- **`browse_folder("CRF")`** — výpis složky bez fulltextu
+- **`read_document(path=..., around_match="randomization")`** — skok přímo na slovo v dlouhém dokumentu
diff --git a/Soubory/query_v0.1.py b/Soubory/query_v0.1.py
new file mode 100644
index 0000000..9605bbf
--- /dev/null
+++ b/Soubory/query_v0.1.py
@@ -0,0 +1,203 @@
+"""
+==============================================================================
+Skript:   query_v0.1.py
+Verze:    0.1
+Datum:    2026-06-03
+Autor:    vladimir.buzalka
+Popis:    Hybridni dotaz: PostgreSQL fulltext (tsv + ts_rank + ts_headline)
+          + obohaceni z MongoDB (content.* - autor, listy, EML hlavicky,
+          datumy v nazvu).
+
+Pouziti:
+    python query_v0.1.py "adverse event"
+    python query_v0.1.py "protocol deviation" --study MDD3003 --ext docx pptx
+    python query_v0.1.py "randomization" --ext xlsx xlsm --limit 20
+    python query_v0.1.py "lot expiration" --since 2026-01-01
+
+Syntaxe dotazu = websearch_to_tsquery:
+    adverse event           -> AND
+    "adverse event"         -> fraze
+    adverse OR serious      -> OR
+    adverse -mild           -> NOT
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+from datetime import datetime, timezone
+
+import psycopg
+from pymongo import MongoClient
+
+MONGO_URI = "mongodb://192.168.1.76:27017"
+MONGO_DB = "soubory"
+STUDY_COLLECTIONS = {
+    "MDD3003":  "42847922MDD3003",
+    "UCO3001":  "77242113UCO3001",
+}
+
+PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoSoubory "
+          "user=vladimir.buzalka password=Vlado7309208104++")
+
+
+SEARCH_SQL = """
+WITH q AS (
+    SELECT websearch_to_tsquery('soubory'::regconfig, %(query)s) AS tsq
+)
+SELECT
+    d.study,
+    d.path,
+    d.rel_path,
+    d.name,
+    d.ext,
+    d.size_bytes,
+    d.mtime,
+    d.body_length,
+    ts_rank(d.tsv, q.tsq)                                       AS rank,
+    ts_headline('soubory'::regconfig,
+                left(d.body, 200000),
+                q.tsq,
+                'MaxFragments=2, MinWords=4, MaxWords=18, '
+                'StartSel=<<, StopSel=>>, FragmentDelimiter= ... ') AS snippet
+FROM documents d, q
+WHERE d.tsv @@ q.tsq
+  AND d.ok = TRUE
+  AND (%(studies)s::text[] IS NULL OR d.study = ANY(%(studies)s::text[]))
+  AND (%(exts)s::text[]    IS NULL OR d.ext   = ANY(%(exts)s::text[]))
+  AND (%(since)s::timestamptz IS NULL OR d.mtime >= %(since)s::timestamptz)
+ORDER BY rank DESC, d.mtime DESC NULLS LAST
+LIMIT %(limit)s
+"""
+
+
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(
+        description="Hybridni dotaz PG fulltext + Mongo metadata",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    p.add_argument("query", help="Vyhledavaci vyraz (websearch syntaxe)")
+    p.add_argument("--study", nargs="*",
+                   choices=sorted(STUDY_COLLECTIONS.keys()),
+                   help="Filtr studie (default: obe)")
+    p.add_argument("--ext", nargs="*",
+                   help="Filtr pripon (napr. pdf docx xlsx)")
+    p.add_argument("--since",
+                   help="mtime >= datum (YYYY-MM-DD)")
+    p.add_argument("--limit", type=int, default=15,
+                   help="Pocet vysledku (default 15)")
+    p.add_argument("--no-meta", action="store_true",
+                   help="Vynechat doplneni z Mongo")
+    return p.parse_args()
+
+
+def _short(s, n=60):
+    if not s:
+        return ""
+    s = str(s).replace("\n", " ").strip()
+    return s if len(s) <= n else s[:n] + "..."
+
+
+def _fmt_meta(study_code: str, content: dict) -> str:
+    """Vrati jednoradkove shrnuti zajimavych poli z content.*"""
+    if not content:
+        return "(bez content)"
+    bits = []
+    if not content.get("ok", True):
+        return f"content.error: {content.get('error', '?')}"
+    for key in ("title", "subject", "author", "last_modified_by",
+                "from", "to", "subject", "date"):
+        v = content.get(key)
+        if v:
+            bits.append(f"{key}={_short(v, 40)}")
+    if "pages" in content:
+        bits.append(f"pages={content['pages']}")
+    if "slides" in content:
+        bits.append(f"slides={content['slides']}")
+    if "total_sheets" in content:
+        sheet_names = [s.get("name") for s in content.get("sheets", [])][:4]
+        bits.append(f"sheets={content['total_sheets']} {sheet_names}")
+    if "paragraphs" in content:
+        bits.append(f"paragraphs={content['paragraphs']}")
+    if "has_attachments" in content:
+        bits.append(f"attachments={len(content.get('attachments', []))}")
+    return " | ".join(bits) if bits else "(content bez vyznamnych poli)"
+
+
+def main() -> int:
+    args = parse_args()
+
+    studies = None
+    if args.study:
+        studies = [STUDY_COLLECTIONS[s] for s in args.study]
+
+    exts = None
+    if args.ext:
+        exts = [e.lower().lstrip(".") for e in args.ext]
+
+    since = None
+    if args.since:
+        since = datetime.strptime(args.since, "%Y-%m-%d").replace(tzinfo=timezone.utc)
+
+    params = {
+        "query":   args.query,
+        "studies": studies,
+        "exts":    exts,
+        "since":   since,
+        "limit":   args.limit,
+    }
+
+    with psycopg.connect(PG_DSN, connect_timeout=10) as pg, pg.cursor() as cur:
+        cur.execute(SEARCH_SQL, params)
+        cols = [c.name for c in cur.description]
+        rows = [dict(zip(cols, r)) for r in cur.fetchall()]
+
+    if not rows:
+        print(f"Zadne vysledky pro: {args.query!r}")
+        return 0
+
+    # obohaceni z Mongo - jeden round-trip na studii
+    meta_by_path: dict[str, dict] = {}
+    if not args.no_meta:
+        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+        db = mongo[MONGO_DB]
+        by_study: dict[str, list[str]] = {}
+        for r in rows:
+            by_study.setdefault(r["study"], []).append(r["path"])
+        for study_code, paths in by_study.items():
+            for d in db[study_code].find(
+                {"path": {"$in": paths}},
+                {"path": 1, "content": 1, "dates_in_name": 1, "parent_folders": 1},
+            ):
+                meta_by_path[d["path"]] = d
+        mongo.close()
+
+    print(f"\n=== Dotaz: {args.query!r}   vysledku: {len(rows)} ===\n")
+    for i, r in enumerate(rows, 1):
+        size_mb = (r["size_bytes"] or 0) / 1024 / 1024
+        mtime = r["mtime"].strftime("%Y-%m-%d") if r["mtime"] else "?"
+        print(f"[{i:>2}] rank={r['rank']:.4f}  {r['study']}  "
+              f"{r['ext']:<4} {size_mb:5.1f}MB  {mtime}  "
+              f"({r['body_length']} znaku)")
+        print(f"     {r['rel_path'] or r['name']}")
+        snippet = (r["snippet"] or "").replace("\n", " ").strip()
+        if snippet:
+            print(f"     >> {snippet}")
+        if not args.no_meta:
+            m = meta_by_path.get(r["path"]) or {}
+            content_line = _fmt_meta(r["study"], m.get("content") or {})
+            print(f"     meta: {content_line}")
+            if m.get("dates_in_name"):
+                print(f"     dates_in_name: {m['dates_in_name']}")
+        print()
+
+    return 0
+
+
+# Script entry point.
+if __name__ == "__main__":
+    try:
+        # Use main()'s return value as the process exit status.
+        raise SystemExit(main())
+    except KeyboardInterrupt:
+        # Ctrl+C: print a short notice and exit with status 130.
+        print("\nPreruseno uzivatelem")
+        sys.exit(130)
diff --git a/claude-memory/feedback_proactive_suggestions.md b/claude-memory/feedback_proactive_suggestions.md
new file mode 100644
index 0000000..1d549c1
--- /dev/null
+++ b/claude-memory/feedback_proactive_suggestions.md
@@ -0,0 +1,22 @@
+---
+name: feedback-proactive-suggestions
+description: "Při auditu/refaktoru pipeline aktivně nabízej lepší API/postupy, ne jen popisuj stav"
+metadata: 
+  node_type: memory
+  type: feedback
+  originSessionId: e6bd6ea2-647e-46c4-976e-dfcb5aa31269
+---
+
+Když uživatel ukazuje existující pipeline a já mám kontext o (a) co dělá, (b) jak často to běží, (c) na jakém objemu dat, a (d) vím o standardním lepším řešení v daném ekosystému (Graph delta, CDC, webhooks, server-side filtry…), **mám to proaktivně navrhnout**, ne čekat až se uživatel zeptá.
+
+**Konkrétní příklad (2026-06-04):** Při auditu emailové pipeline jsem viděl `--mode sync` který projde celou schránku jen aby přes `mid in existing` zahodil 99 % zpráv, a 9 schránek s ~268k mailů spouštěných pravidelně. Microsoft Graph `messages/delta` API je přesně pro tenhle scénář (vrací jen změny + smazání od posledního deltaLink). Nenavrhl jsem to a uživatel se musel zeptat sám — "proč jsem se musel ptát já?".
+
+**Why:** Reaktivní režim ("user se ptá → odpovídám") nestačí, když mám víc kontextu než uživatel. Pokud uvidím evidentně optimalizovatelný kus pipeline a znám standardní lepší nástroj, **mlčení je škoda** — uživatel pak žije s neoptimálním řešením, dokud na to nepřijde sám.
+
+**How to apply:**
+- Po dokončení popisu stavu (audit, rename, doc) udělat krok zpátky: *"co tady chybí / co by se dalo udělat líp?"*
+- Když popisuju jak něco funguje a vidím, že existuje server-side API které by udělalo to samé efektivněji (delta queries, CDC, webhooks, server-side filter, batch endpoint), **nabídnout to ve stejné odpovědi**, ne čekat.
+- Skryté problémy (jako "smazané zprávy v Mongo nikdy nemažeme") nezmiňovat jako poznámku pod čarou — vytáhnout je jako věc k rozhodnutí.
+- Stačí 1-2 věty: *"mimochodem, tohle by šlo udělat přes X — chceš to?"* Není to plánování, je to upozornění.
+
+Související: [[project_graph_email_import]]
diff --git a/claude-memory/feedback_use_mcp_emaily.md b/claude-memory/feedback_use_mcp_emaily.md
new file mode 100644
index 0000000..c29602a
--- /dev/null
+++ b/claude-memory/feedback_use_mcp_emaily.md
@@ -0,0 +1,31 @@
+---
+name: feedback-use-mcp-emaily
+description: "Pro statistiky/dotazy nad emaily/přílohami v Mongo používej MCP `emaily`, ne SSH+paramiko"
+metadata: 
+  node_type: memory
+  type: feedback
+  originSessionId: e6bd6ea2-647e-46c4-976e-dfcb5aa31269
+---
+
+Když potřebuju zjistit cokoliv o emailech, schránkách nebo přílohách v Mongo `emaily` db, **první volba je MCP `emaily`** — ne SSH na tower + paramiko + temp script.
+
+**Konkrétní příklad (2026-06-04):** Uživatel se ptal kolik příloh ve své schránce ještě nestaženo. Já jsem rovnou skočil na paramiko SSH → temp Python script → docker exec, zatímco `mcp__emaily__list_mailboxes` mi to dalo jedním voláním. Uživatel mě na to upozornil podruhé v té samé konverzaci.
+
+**Why:** SSH+paramiko je oklika přes 3 vrstvy (Windows → SSH → docker exec), pomalá a navíc se mi pravidelně láme Unicode (Windows cp1252 vs UTF-8 výstup s diakritikou). MCP je přímý.
+
+**How to apply:**
+
+- **Vždy nejdřív** mrkni jestli to neumí jeden z MCP `emaily` tools:
+  - `ping` → zdraví Mongo+PG
+  - `list_mailboxes` → přehled schránek (counts, top senders, date range, with_attachments count)
+  - `search` → fulltext (subject/body/recipients/attachments)
+  - `find_attachment` → emaily s přílohou daného jména
+  - `recent_emails` → poslední X dní
+  - `read_email`, `by_sender`, `conversation_thread`, `top_senders`
+- **SSH/paramiko jen když:**
+  - potřebuju raw `count_documents` s custom filtrem (např. „kolik emailů má `attachments.file_hash` chybějící" — interní stav download skriptu)
+  - admin operace co MCP nevystavuje (DROP, create_index, update_many)
+  - debugging samotné MCP serveru
+- Než sáhneš po paramiko, polož si vždy otázku: *„jde tohle přes MCP emaily?"* Pokud aspoň zhruba ano, jdi tudy.
+
+Související: [[project_mcp_emaily]]
diff --git a/claude-memory/project_mcp_emaily.md b/claude-memory/project_mcp_emaily.md
new file mode 100644
index 0000000..f819351
--- /dev/null
+++ b/claude-memory/project_mcp_emaily.md
@@ -0,0 +1,39 @@
+---
+name: project-mcp-emaily
+description: "MCP server \"emaily\" - fulltext nad 9 schrankami z Microsoft Graph importu (~268k emailu)"
+metadata: 
+  node_type: memory
+  type: project
+  originSessionId: 49aa480f-6667-4832-b091-08f333c27872
+---
+
+MCP server `emaily` v [mcp_emaily.py](EmailsImport/mcp_emaily.py), registrovan v `U:\janssen\.mcp.json` jako `emaily`.
+
+**Architektura paralelni s [[project_mcp_soubory]]:**
+- Mongo `emaily.<mailbox>` (z [1_parse_emails_graph_v1.4.py](Python-runner/1_parse_emails_graph_v1.4.py)) = body_html / body_text + headers + recipients + attachments[]
+- PG `MongoEmaily.emails` (z [enrich_fulltext_emails_v1.1.py](Python-runner/Trash/enrich_fulltext_emails_v1.1.py)) = plain text + tsvector index
+
+**Pipeline opravena 2026-06-03:** v1.3 parseru ukladal plain-text emaily JEN jako prvnich 2000 znaku do body_preview a zbytek zahazoval (17 672 emailu, 6.2% korpusu). v1.4 uklada plne plain-text telo do noveho pole body_text. Pro stare zaznamy: [2_refetch_text_bodies_v1.0.py](Python-runner/2_refetch_text_bodies_v1.0.py) - prochazi Mongo, refetchne z Graph API jen tam kde body_html i body_text chybi (cca 80 min pro 17.7k emailu). Enrich v1.1 ma fallback poradi html -> body_text -> body_preview.
+
+**9 schranek, ~268k emailu celkem:**
+vladimir.buzalka@buzalka.cz, vbuzalka@buzalka.cz, ordinace@buzalkova.cz, alica.buzalkova@buzalka.cz, mbuzalkova@buzalka.cz, jan.luxemburk@luxemburk.cz, vbuzalka@its.jnj.com, jarmila.kusinova@buzalka.cz, michaela.buzalkova@buzalka.cz
+
+**Index tsv pokryva:** subject + sender_email + sender_name + to_addrs + cc_addrs + attachments_summary + body. Takze search najde i emaily kde slovo je jen v predmetu nebo jmene odesilatele.
+
+**MCP tools:**
+- `ping`, `list_mailboxes` - prehled korpusu
+- `search(query, mailbox?, since?, until?, folder_contains?, sender_contains?, has_attachments?, limit)` - HLAVNI fulltext (websearch_to_tsquery), <<...>> snippet
+- `read_email(message_id, mailbox?, offset/length/around_match, include_html?)` - cely email, slice nebo okno
+- `by_sender(sender, mailbox?, since?, has_attachments?)` - regex na sender_email/name
+- `recent_emails(mailbox?, days, folder_contains?, has_attachments?)` - by received_at
+- `conversation_thread(conversation_id)` - cele Outlook vlakno chronologicky
+- `find_attachment(name_contains, mailbox?, since?)` - hledani podle nazvu prilohy
+- `top_senders(mailbox?, since?)` - kdo mi posila nejvic emailu
+
+**Why:** Mongo uz mela emaily (body_html), uzivatel se ptal jestli se musi znovu stahovat - nemusi. Stacilo z HTML udelat plain text pres BeautifulSoup a zaindexovat v PG.
+
+**How to apply:**
+- Pred prvnim plnym importem 268k emailu spustit: `python U:\janssen\EmailsImport\enrich_fulltext_emails_v1.0.py` (~80 min). Pro test `--limit 500 --mailbox X`.
+- Sdileny TS config `soubory` (simple + unaccent) takze diakritika a case insensitive.
+- Pri reset/zmene parseru: bump `EXTRACTOR_VERSION` -> preparsuje vse.
+- Pri dotazu na "co poslal/posilam X" pouzivat `by_sender` namisto search - rychlejsi a vyhne se false matchum v tele.
diff --git a/claude-memory/project_mcp_soubory.md b/claude-memory/project_mcp_soubory.md
new file mode 100644
index 0000000..4b81f99
--- /dev/null
+++ b/claude-memory/project_mcp_soubory.md
@@ -0,0 +1,31 @@
+---
+name: project-mcp-soubory
+description: "MCP server \"soubory\" - hybridni dotaz nad PG fulltextem a Mongo metadaty pro Dropbox soubory obou studii"
+metadata: 
+  node_type: memory
+  type: project
+  originSessionId: 49aa480f-6667-4832-b091-08f333c27872
+---
+
+MCP server `soubory` v [mcp_soubory.py](Soubory/mcp_soubory.py), registrovan v `U:\janssen\.mcp.json` jako `soubory`.
+
+**Architektura:** Postgres `MongoSoubory.documents` (fulltext tsvector + body) + MongoDB `soubory.{42847922MDD3003,77242113UCO3001}` (metadata + content.*). Source skripty: [scan_files_v1.0.py](soubory/scan_files_v1.0.py), [enrich_files_v1.0.py](soubory/enrich_files_v1.0.py), [enrich_fulltext_v1.2.py](soubory/enrich_fulltext_v1.2.py).
+
+**Tools:**
+- `ping` - health check + counts per studie
+- `list_studies` - prehled korpusu, ext breakdown, fulltext coverage
+- `search(query, study?, ext?, since?, folder?, limit, with_metadata)` - HLAVNI fulltext (websearch_to_tsquery), ranked + ts_headline snippet `<<...>>` + Mongo content.* enrichment
+- `read_document(path|mongo_id, offset, length, around_match)` - cele body, slice nebo okno kolem matche
+- `get_metadata(path)` - raw Mongo doc bez body
+- `recent_files(study?, days, ext?, limit)` - co se zmenilo (mtime)
+- `find_duplicates(study?, min_size_kb, limit)` - sha256 groups, wasted_mb_estimate
+- `by_author(name, study?, ext?)` - regex na content.author / last_modified_by
+- `browse_folder(folder, study?, ext?)` - regex na parent_folders
+
+Pro EML/MSG pouzij `search(..., ext=["eml","msg"])` + `by_author(jmeno, ext=["eml","msg"])` - samostatny email tool je redundantni a navic by se pletl s [[project_graph_email_import]].
+
+**Aliasy studii:** `MDD3003` -> `42847922MDD3003`, `UCO3001` -> `77242113UCO3001`. None = obe.
+
+**Why:** [[project_claude_learning]] - uzivatel se chce ptat v chatu ("najdi mi dokument o X", "co psal sponsor Y"), nikoli volat CLI.
+
+**How to apply:** Po restartu Claude Code budou nastroje dostupne jako `mcp__soubory__*`. Pred volanim search radeji nejdriv `list_studies` aby clovek vedel co je v korpusu. Pri velkem body pouzij `around_match` misto stahovani celeho dokumentu.