notebook

2026-06-05 21:21:30 +02:00
parent 1ec9e40196
commit a347051145
28 changed files with 7402 additions and 0 deletions
@@ -0,0 +1,77 @@
+# 0_run_pipeline_v1.0.py
+
+**Wrapper kolem celé emailové pipeline.** Spustí postupně všechny 4 kroky daily syncu, vždy přes všechny dostupné schránky:
+
+| # | Krok | Skript |
+|---|---|---|
+| 1b | Graph delta sync (inkrementální Mongo update) | `1b_parse_emails_graph_delta_v1.0.py` |
+| 3  | Download attachments | `3_download_attachments_v1.4.py` |
+| 4  | Unwrap S/MIME | `4_unwrap_smime_v1.0.py` |
+| 5  | Enrich fulltext (PG) | `5_enrich_fulltext_emails_v1.3.py` |
+
+## Politika chyb
+
+Default je **continue-on-error** — když některý krok selže, pipeline pokračuje dalším krokem (downstream se nezasekne kvůli drobnému problému). Po doběhnutí dostaneš souhrnnou tabulku s `OK / FAIL(N)` pro každý krok.
+
+Použij `--stop-on-error`, pokud chceš tvrdé zastavení při první chybě.
+
+## Logování
+
+Každý krok jde do vlastního logu v `/scripts/pipeline_<id>.log`:
+- `pipeline_1b.log`
+- `pipeline_3.log`
+- `pipeline_4.log`
+- `pipeline_5.log`
+
+Live výstup se zároveň tee-uje na konzoli (vypneš přes `--quiet`).
+
+## Argumenty
+
+| Argument | Hodnoty | Popis |
+|---|---|---|
+| `--only` | `1b 3 4 5` | Spustit jen tyto kroky |
+| `--skip` | `1b 3 4 5` | Přeskočit tyto kroky |
+| `--stop-on-error` | flag | Zastavit při první chybě (default: pokračovat) |
+| `--quiet` | flag | Necpat stdout na konzoli (zůstane v logu) |
+
+## Varianty volání
+
+```bash
+# Daily run — vše, všechny schránky:
+docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py
+
+# Jen enrich (např. po manuálním zásahu do Mongo):
+docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py --only 5
+
+# Vše mimo S/MIME (krok 4 občas vyžaduje pip install asn1crypto):
+docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py --skip 4
+
+# Test daily sync bez fulltextu:
+docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py --only 1b 3 4
+
+# Na pozadí, master log:
+docker exec -d python-runner bash -c "python /scripts/0_run_pipeline_v1.0.py --quiet > /scripts/pipeline_master.log 2>&1"
+docker exec -it python-runner tail -f /scripts/pipeline_master.log
+```
+
+## Cron / nightly automation
+
+Pro nightly se hodí jednoduchý cron na Unraidu (`/etc/cron.daily/` nebo User Scripts plugin):
+
+```bash
+#!/bin/bash
+docker exec python-runner python /scripts/0_run_pipeline_v1.0.py --quiet \
+    > /mnt/user/Scripts/pipeline_$(date +%Y%m%d).log 2>&1
+```
+
+Stačí denně, delta sync z minulého stavu trvá ~30s s prázdným backlogem.
+
+## Exit kódy wrapperu
+
+| Kód | Význam |
+|---|---|
+| 0 | Všechny kroky OK |
+| 1 | Alespoň jeden krok selhal |
+| 2 | Žádný krok k běhu (--only + --skip vyloučily vše) |
+| 127 | Per-krok kód v souhrnu (`FAIL(127)`): skript neexistuje v `/scripts/`. Wrapper samotný v tom případě končí kódem 1 |
+| 130 | Přerušeno Ctrl+C |
@@ -0,0 +1,176 @@
+"""
+==============================================================================
+Skript:   0_run_pipeline_v1.0.py
+Verze:    1.0
+Datum:    2026-06-04
+Autor:    vladimir.buzalka
+
+Popis:
+  Wrapper kolem cele emailove pipeline. Spousti postupne:
+    1b. parse_emails_graph_delta  -> delta sync z Graph API do Mongo
+     3. download_attachments      -> stahne pripojeny soubory
+     4. unwrap_smime              -> rozbali S/MIME wrapper zpravy
+     5. enrich_fulltext_emails    -> doindexuje do PG fulltext
+
+  Vzdy projizdi VSECHNY schranky (mimo SKIP_MAILBOXES v jednotlivych skriptech).
+  Per-krok merici cas + exit code. Pokud krok selze, default pokracuje dal
+  (aby se downstream nezasekl) — viz --stop-on-error.
+
+  Vsechny vystupy a chyby kazdeho kroku jsou ulozeny do /scripts/pipeline_<step>.log
+
+Spousteni:
+  python 0_run_pipeline_v1.0.py                          # vse, vsechny schranky
+  python 0_run_pipeline_v1.0.py --only 5                 # jen krok 5 (enrich)
+  python 0_run_pipeline_v1.0.py --skip 4                 # vse mimo smime unwrap
+  python 0_run_pipeline_v1.0.py --stop-on-error          # zastavit pri prvni chybe
+  python 0_run_pipeline_v1.0.py --quiet                  # bez tee na konzoli, jen logy
+
+Docker:
+  docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import subprocess
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+
+# Directory (inside the container) that holds the step scripts.
+SCRIPTS_DIR = Path("/scripts")
+LOGS_DIR    = SCRIPTS_DIR  # per-step logs are written next to the scripts in /scripts/
+
+# Pipeline definition, executed in list order: (step_id, label, executable filename).
+# step_id is what --only/--skip match against and what names the per-step log file.
+STEPS = [
+    ("1b", "Graph delta sync",      "1b_parse_emails_graph_delta_v1.0.py"),
+    ("3",  "Download attachments",  "3_download_attachments_v1.4.py"),
+    ("4",  "Unwrap S/MIME",         "4_unwrap_smime_v1.0.py"),
+    ("5",  "Enrich fulltext (PG)",  "5_enrich_fulltext_emails_v1.3.py"),
+]
+
+
+def fmt_dur(s: float) -> str:
+    if s < 60:
+        return f"{s:.1f}s"
+    m, s = divmod(int(s), 60)
+    if m < 60:
+        return f"{m}m{s:02d}s"
+    h, m = divmod(m, 60)
+    return f"{h}h{m:02d}m{s:02d}s"
+
+
+def run_step(step_id: str, label: str, script: str, *,
+             quiet: bool = False) -> tuple[int, float]:
+    script_path = SCRIPTS_DIR / script
+    log_path    = LOGS_DIR / f"pipeline_{step_id}.log"
+
+    if not script_path.exists():
+        print(f"  CHYBA: {script_path} neexistuje!")
+        return 127, 0.0
+
+    print(f"\n{'='*70}")
+    print(f"  KROK {step_id}: {label}")
+    print(f"  script: {script_path}")
+    print(f"  log:    {log_path}")
+    print(f"  start:  {datetime.now().strftime('%H:%M:%S')}")
+    print(f"{'='*70}")
+
+    t0 = time.time()
+
+    # Tee: zaroven do konzole i do logu (pokud ne --quiet)
+    with open(log_path, "w", encoding="utf-8") as logf:
+        proc = subprocess.Popen(
+            [sys.executable, str(script_path)],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
+            encoding="utf-8",
+            errors="replace",
+        )
+        for line in proc.stdout:
+            logf.write(line)
+            if not quiet:
+                print(line, end="", flush=True)
+        ret = proc.wait()
+
+    dur = time.time() - t0
+    print(f"\n  KROK {step_id} {'OK' if ret == 0 else f'FAILED ({ret})'} za {fmt_dur(dur)}")
+    return ret, dur
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Email pipeline wrapper v1.0")
+    ap.add_argument("--only", nargs="+", default=None,
+                    help="Spustit jen tyto kroky (napr. --only 3 4 5)")
+    ap.add_argument("--skip", nargs="+", default=None,
+                    help="Preskocit tyto kroky")
+    ap.add_argument("--stop-on-error", action="store_true",
+                    help="Zastavit pipeline pri prvni nenulovem exit kodu")
+    ap.add_argument("--quiet", action="store_true",
+                    help="Necpat stdout kroku na konzoli, jen do logu")
+    args = ap.parse_args()
+
+    # Filter step set
+    only_set = set(args.only) if args.only else None
+    skip_set = set(args.skip) if args.skip else set()
+
+    to_run = []
+    for sid, label, script in STEPS:
+        if only_set and sid not in only_set:
+            continue
+        if sid in skip_set:
+            continue
+        to_run.append((sid, label, script))
+
+    if not to_run:
+        print("Zadny krok k spusteni.")
+        return 2
+
+    print(f"=== Email Pipeline Wrapper v1.0 ===")
+    print(f"Start: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"Kroku k spusteni: {len(to_run)}")
+    for sid, label, _ in to_run:
+        print(f"  {sid}: {label}")
+    if args.stop_on_error:
+        print("Politika: stop-on-error")
+    else:
+        print("Politika: continue-on-error (default)")
+
+    t_all = time.time()
+    results = []
+
+    for sid, label, script in to_run:
+        ret, dur = run_step(sid, label, script, quiet=args.quiet)
+        results.append((sid, label, ret, dur))
+        if ret != 0 and args.stop_on_error:
+            print(f"\n!!! Pipeline zastavena na kroku {sid} (--stop-on-error)")
+            break
+
+    total_dur = time.time() - t_all
+
+    print(f"\n{'='*70}")
+    print("=== SHRNUTI PIPELINE ===")
+    print(f"{'='*70}")
+    failed = 0
+    for sid, label, ret, dur in results:
+        status = "OK" if ret == 0 else f"FAIL({ret})"
+        if ret != 0:
+            failed += 1
+        print(f"  [{sid:>2}] {label:30} {status:>8}  {fmt_dur(dur):>10}")
+    print(f"{'='*70}")
+    print(f"  Celkem: {len(results)} kroku, {failed} chyb, {fmt_dur(total_dur)}")
+    print(f"  Konec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"  Per-krok logy: {LOGS_DIR}/pipeline_<id>.log")
+
+    return 1 if failed else 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())
+    except KeyboardInterrupt:
+        print("\nPreruseno uzivatelem")
+        sys.exit(130)
@@ -0,0 +1,41 @@
+# 1_parse_emails_graph_v1.4.py
+
+**Krok 1 pipeline** — import emailů z libovolné schránky přes Microsoft Graph API do MongoDB (`emaily.<mailbox>`). Čte všechny složky rekurzivně. Upsert podle Message-ID → bezpečné přerušit a opakovat.
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | ano | e-mail | — | Schránka = název kolekce v Mongo |
+| `--mode` | ne | `full` / `new-only` / `sync` | `full` | `full` = plný upsert; `new-only` = jen nové; `sync` = aktualizuje `is_read`/`flag_status`/`categories`/`folder_path` u existujících + importuje nové |
+| `--folder` | ne | název složky | (všechny) | Jen jedna složka (např. `Inbox`) |
+| `--limit N` | ne | int | 0 (bez limitu) | Zpracuje jen prvních N zpráv (test) |
+| `--no-indexes` | ne | flag | false | Nevytváří indexy na konci |
+
+## Varianty volání
+
+```bash
+# První plný import schránky (vše):
+docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz
+
+# Test na 50 zprávách bez vytváření indexů:
+docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
+
+# Jen nové emaily (po prvním importu):
+docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode new-only
+
+# Pravidelný sync (nové + aktualizace flagů u existujících) na pozadí, log do souboru:
+docker exec -d python-runner bash -c "python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode sync > /scripts/parse_emails.log 2>&1"
+
+# Import jen složky Inbox:
+docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --folder Inbox
+
+# Test 10 emailů z konkrétní složky:
+docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --folder "Sent Items" --limit 10
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/parse_emails.log
+```
@@ -0,0 +1,624 @@
+"""
+parse_emails_graph_v1.4.py
+Nazev:  parse_emails_graph_v1.4.py
+Verze:  1.4
+Datum:  2026-06-03
+Autor:  vladimir.buzalka
+
+Popis:
+    Cte vsechny emaily z libovolne schranky primo pres Microsoft Graph API
+    a importuje je jako dokumenty do MongoDB.
+    Ze kazde zpravy extrahuje vsechny dostupne vlastnosti:
+
+        - predmet, odesilatel, prijemci (To/CC/BCC s typy)
+        - cas doruceni, odeslani, vytvoreni, modifikace (UTC)
+        - telo HTML (max 2 MB) + textovy preview
+        - prilohy (metadata: jmeno, velikost, MIME typ, inline flag, graph_att_id)
+        - internet headers (SPF, DKIM, Received, X-*, ...)
+        - MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno,
+          kategorie, In-Reply-To, References, ...
+        - navic: isRead, isDraft, folder_path, inferenceClassification
+
+    Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted,
+    archivni slozky, ...).
+
+    DB:       emaily
+    Kolekce:  <mailbox> (napr. ordinace@buzalkova.cz)
+    _id:      Internet Message-ID (nebo "graphid:<id>" jako fallback)
+
+    POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
+
+Spousteni:
+    # Prvni import (vsechno):
+    python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz
+
+    # Test na prvnich 50:
+    python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
+
+    # Jen jedna slozka:
+    python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --folder Inbox
+
+    # Pokracovani po preruseni (pouze nove):
+    python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode new-only
+
+    # Pravidelny sync (aktualizuje is_read, flag, slozku; importuje nove):
+    python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode sync
+
+    # Jina schranka:
+    python 1_parse_emails_graph_v1.4.py --mailbox vladimir.buzalka@buzalka.cz
+
+Rezimy (--mode):
+    full      Plny upsert vsech poli pro kazdou zpravu (vychozi)
+    new-only  Preskoci zpravy ktere uz jsou v MongoDB, importuje jen nove
+    sync      Existujici: aktualizuje jen is_read/flag_status/categories/
+              modified_at/folder_path. Nove zpravy importuje cely.
+              Idealni pro pravidelne spousteni.
+
+Zavislosti:
+    msal, requests, pymongo, python-dateutil
+    Python 3.10+
+
+Struktura dokumentu v MongoDB:
+    _id                     Internet Message-ID (nebo graphid: fallback)
+    graph_id                Graph API message ID
+    subject                 predmet zpravy
+    normalized_subject      predmet bez RE:/FW:/AW: prefixu
+    importance              0=nizka 1=normalni 2=vysoka
+    flag_status             0=bez priznaku 1=oznaceno 2=dokonceno
+    is_read                 bool — aktualni stav precteni ve schrance
+    is_draft                bool
+    has_attachments         bool
+    attachment_count        int
+    inference_classification focused / other
+    categories              [str]
+    conversation_id         Graph conversationId
+    conversation_index      base64 conversationIndex
+    conversation_topic      tema vlakna (z internet headers Thread-Topic)
+    in_reply_to             Message-ID predchozi zpravy
+    internet_references     [Message-ID]
+    received_at             datetime UTC
+    sent_at                 datetime UTC
+    created_at              datetime UTC
+    modified_at             datetime UTC
+    folder_id               Graph parentFolderId
+    folder_path             cela cesta slozky (napr. Inbox/Subfolder)
+    sender.email            emailova adresa odesilatele
+    sender.name             zobrazovane jmeno
+    to                      retezec To (joined)
+    cc                      retezec CC
+    bcc                     retezec BCC
+    recipients              [{type, email, name}]
+    body_html               HTML telo (pokud contentType=='html', max 2 MB)
+    body_text               plain-text telo (pokud contentType=='text', max 2 MB)
+    body_preview            textovy nahled z Graph bodyPreview (max 255 znaku)
+    attachments             [{filename, size_bytes, mime_type, is_inline, graph_att_id}]
+    headers                 dict internet headers
+    parsed_at               datetime UTC
+
+Indexy:
+    received_at, sent_at, sender.email, graph_id (unique),
+    conversation_id, folder_path, has_attachments, categories,
+    importance, flag_status, is_read,
+    text_search (subject + body_preview + to + cc)
+
+Historie verzi:
+    1.0  2026-06-02  Inicialni verze
+    1.1  2026-06-02  Pridany rezimy --mode full/new-only/sync;
+                     odstranen --skip-existing (nahrazen --mode new-only)
+    1.2  2026-06-02  $expand attachments s $select (bez contentBytes — rychlejsi);
+                     prilohy ukladaji graph_att_id pro prime stazeni bez name-matchingu
+    1.3  2026-06-02  --mailbox jako povinny parametr — univerzalni pouziti pro
+                     libovolnou schranku; kolekce v MongoDB = nazev schranky
+    1.4  2026-06-03  Plain-text emaily (contentType=='text') se ukladaji do
+                     noveho pole body_text (max 2 MB), drive se truncovalo na
+                     2000 znaku do body_preview a zbytek se zahazoval.
+                     body_preview ted obsahuje vzdy puvodni Graph bodyPreview.
+                     Pro existujici emaily z v1.3 lze pouzit
+                     refetch_text_bodies_v1.0.py.
+"""
+
+import argparse
+import base64
+import logging
+import os
+import re
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+import msal
+import requests
+from dateutil import parser as dtparser
+from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT
+
+# Force UTF-8 console output (replacing unencodable characters) where the
+# stdout object supports reconfigure(); the script prints non-ASCII text.
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
+# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
+GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
+GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
+GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
+GRAPH_URL           = "https://graph.microsoft.com/v1.0"
+
+MONGO_URI      = "mongodb://192.168.1.76:27017"
+MONGO_DB       = "emaily"
+BATCH_SIZE     = 100
+PAGE_SIZE      = 50
+LOG_FILE       = Path(__file__).parent / "parse_emails_errors.log"
+SCRIPT_VERSION = "1.4"
+
+# Schránka se nastavuje za behu z --mailbox parametru
+GRAPH_MAILBOX: str = ""
+# ──────────────────────────────────────────────────────────────────────────────
+
+# Only errors are logged, and they go to LOG_FILE rather than the console.
+logging.basicConfig(
+    filename=str(LOG_FILE),
+    level=logging.ERROR,
+    format="%(asctime)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    encoding="utf-8",
+)
+
+# Graph string values -> the integer codes stored in MongoDB.
+IMPORTANCE_MAP  = {"low": 0, "normal": 1, "high": 2}
+FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2}
+# Reply/forward prefix at the start of a subject (case-insensitive).
+# NOTE(review): the separator class [:\s]+ also accepts plain whitespace, so a
+# subject such as "Re search" loses its first word — confirm this is intended.
+RE_SUBJECT      = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE)
+
+# $expand attachments without contentBytes — only the metadata we need
+ATT_EXPAND = "attachments($select=id,name,contentType,size,isInline)"
+
+# Fields requested for a full message extraction.
+MSG_SELECT = (
+    "id,internetMessageId,subject,bodyPreview,body,"
+    "importance,isRead,isDraft,hasAttachments,"
+    "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
+    "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
+    "conversationId,conversationIndex,parentFolderId,"
+    "categories,flag,inferenceClassification,internetMessageHeaders"
+)
+
+# Reduced field set used when listing messages in sync mode.
+MSG_SELECT_SYNC = (
+    "id,internetMessageId,isRead,isDraft,flag,categories,"
+    "lastModifiedDateTime,parentFolderId,importance"
+)
+
+
+# ─── Graph API helpers ────────────────────────────────────────────────────────
+
+_graph_token: Optional[str] = None
+
+
+def get_token() -> str:
+    global _graph_token
+    app = msal.ConfidentialClientApplication(
+        GRAPH_CLIENT_ID,
+        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
+        client_credential=GRAPH_CLIENT_SECRET,
+    )
+    result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+    if "access_token" not in result:
+        raise RuntimeError(f"Graph auth failed: {result}")
+    _graph_token = result["access_token"]
+    return _graph_token
+
+
+def graph_get(url: str, params: dict = None) -> dict:
+    global _graph_token
+    if not _graph_token:
+        get_token()
+    for attempt in range(2):
+        r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30)
+        if r.status_code == 401:
+            get_token()
+            continue
+        r.raise_for_status()
+        return r.json()
+    raise RuntimeError(f"Graph GET failed after retry: {url}")
+
+
+def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]:
+    """Rekurzivne nacte vsechny slozky schranky. Vraci [{id, path}]."""
+    if parent_id is None:
+        url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders"
+    else:
+        url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{parent_id}/childFolders"
+
+    folders = []
+    params = {"$top": 100, "$select": "id,displayName,childFolderCount"}
+    while url:
+        data = graph_get(url, params)
+        for f in data.get("value", []):
+            path = f"{parent_path}/{f['displayName']}".lstrip("/")
+            folders.append({"id": f["id"], "path": path})
+            if f.get("childFolderCount", 0) > 0:
+                folders.extend(get_all_folders(f["id"], path))
+        url = data.get("@odata.nextLink")
+        params = None
+    return folders
+
+
+def iter_folder_messages(folder_id: str, select: str = MSG_SELECT, expand_attachments: bool = True):
+    """Generator: vraci zpravy ze slozky po strankach."""
+    url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages"
+    params = {"$top": PAGE_SIZE, "$select": select}
+    if expand_attachments:
+        params["$expand"] = ATT_EXPAND
+    while url:
+        data = graph_get(url, params)
+        for msg in data.get("value", []):
+            yield msg
+        url = data.get("@odata.nextLink")
+        params = None
+
+
+# ─── Pomocné funkce ───────────────────────────────────────────────────────────
+
+def parse_date(raw) -> Optional[datetime]:
+    if raw is None:
+        return None
+    if isinstance(raw, datetime):
+        if raw.tzinfo:
+            return raw.astimezone(timezone.utc).replace(tzinfo=None)
+        return raw
+    try:
+        dt = dtparser.parse(str(raw))
+        if dt.tzinfo:
+            return dt.astimezone(timezone.utc).replace(tzinfo=None)
+        return dt
+    except Exception:
+        return None
+
+
+def normalize_subject(subject: str) -> str:
+    s = subject.strip()
+    while True:
+        m = RE_SUBJECT.match(s)
+        if not m:
+            break
+        s = s[m.end():].strip()
+    return s
+
+
+def parse_headers(raw_headers: list) -> dict:
+    result = {}
+    for h in raw_headers:
+        k = h["name"].lower().replace("-", "_")
+        v = h["value"]
+        if k in result:
+            existing = result[k]
+            result[k] = existing + [v] if isinstance(existing, list) else [existing, v]
+        else:
+            result[k] = v
+    return result
+
+
+def format_recipients(lst: list) -> str:
+    return "; ".join(
+        f'{r["emailAddress"].get("name", "")} <{r["emailAddress"].get("address", "")}>'.strip()
+        for r in lst
+    )
+
+
+# ─── Extrakce zprávy ─────────────────────────────────────────────────────────
+
+def extract_message(msg: dict, folder_path: str) -> Optional[dict]:
+    """Plna extrakce — pouziva se pro mode full a nove zpravy v sync/new-only."""
+    try:
+        mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
+        subject = msg.get("subject") or ""
+
+        body_html = None
+        body_text = None
+        body_preview = msg.get("bodyPreview") or ""
+        body = msg.get("body", {})
+        _MAX_BODY = 2 * 1024 * 1024  # 2 MB
+        if body.get("contentType") == "html":
+            content = body.get("content") or ""
+            body_html = content if len(content) <= _MAX_BODY else content[:_MAX_BODY]
+        elif body.get("contentType") == "text":
+            content = body.get("content") or ""
+            # v1.4: ulozime PLNY plain text do body_text (drive se truncovalo na 2000 znaku
+            # do body_preview a zbytek se zahodil)
+            body_text = content if len(content) <= _MAX_BODY else content[:_MAX_BODY]
+
+        sender_ea    = (msg.get("from") or msg.get("sender") or {}).get("emailAddress", {})
+        to_list      = msg.get("toRecipients", [])
+        cc_list      = msg.get("ccRecipients", [])
+        bcc_list     = msg.get("bccRecipients", [])
+
+        recipients = (
+            [{"type": "to",  "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in to_list] +
+            [{"type": "cc",  "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in cc_list] +
+            [{"type": "bcc", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in bcc_list]
+        )
+
+        importance  = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1)
+        flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0)
+
+        raw_headers   = msg.get("internetMessageHeaders") or []
+        headers       = parse_headers(raw_headers)
+
+        in_reply_to = headers.get("in_reply_to", "")
+        if isinstance(in_reply_to, list):
+            in_reply_to = in_reply_to[0]
+
+        refs_raw = headers.get("references", "")
+        if isinstance(refs_raw, list):
+            refs_raw = " ".join(refs_raw)
+        internet_refs = [r.strip() for r in refs_raw.split() if r.strip()] if refs_raw else []
+
+        conv_topic = headers.get("thread_topic", "")
+        if isinstance(conv_topic, list):
+            conv_topic = conv_topic[0]
+
+        conv_index = ""
+        ci_raw = msg.get("conversationIndex")
+        if ci_raw:
+            try:
+                conv_index = base64.b64encode(base64.b64decode(ci_raw)).decode()
+            except Exception:
+                conv_index = ci_raw
+
+        attachments = []
+        for att in msg.get("attachments") or []:
+            fname = att.get("name") or ""
+            if not fname:
+                continue
+            attachments.append({
+                "filename":     fname,
+                "size_bytes":   att.get("size", 0),
+                "mime_type":    att.get("contentType", "application/octet-stream"),
+                "is_inline":    att.get("isInline", False),
+                "graph_att_id": att.get("id"),
+            })
+
+        return {
+            "_id":      mid,
+            "graph_id": msg["id"],
+
+            "subject":            subject,
+            "normalized_subject": normalize_subject(subject),
+            "importance":         importance,
+            "flag_status":        flag_status,
+            "is_read":            msg.get("isRead", False),
+            "is_draft":           msg.get("isDraft", False),
+            "has_attachments":    msg.get("hasAttachments", False),
+            "attachment_count":   len(attachments),
+            "inference_classification": msg.get("inferenceClassification", ""),
+            "categories":         msg.get("categories") or [],
+
+            "conversation_id":     msg.get("conversationId", ""),
+            "conversation_index":  conv_index,
+            "conversation_topic":  conv_topic,
+            "in_reply_to":         in_reply_to,
+            "internet_references": internet_refs,
+
+            "received_at": parse_date(msg.get("receivedDateTime")),
+            "sent_at":     parse_date(msg.get("sentDateTime")),
+            "created_at":  parse_date(msg.get("createdDateTime")),
+            "modified_at": parse_date(msg.get("lastModifiedDateTime")),
+
+            "folder_id":   msg.get("parentFolderId", ""),
+            "folder_path": folder_path,
+
+            "sender": {
+                "email": sender_ea.get("address", ""),
+                "name":  sender_ea.get("name", ""),
+            },
+            "to":         format_recipients(to_list),
+            "cc":         format_recipients(cc_list),
+            "bcc":        format_recipients(bcc_list),
+            "recipients": recipients,
+
+            "body_html":    body_html,
+            "body_text":    body_text,
+            "body_preview": body_preview,
+
+            "attachments": attachments,
+            "headers":     headers,
+
+            "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
+        }
+
+    except Exception as e:
+        logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e)
+        return None
+
+
+def extract_sync_fields(msg: dict, folder_path: str) -> dict:
+    """Jen menitelna pole — pouziva se v sync mode pro existujici zpravy."""
+    return {
+        "is_read":    msg.get("isRead", False),
+        "is_draft":   msg.get("isDraft", False),
+        "flag_status": FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0),
+        "importance":  IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1),
+        "categories":  msg.get("categories") or [],
+        "modified_at": parse_date(msg.get("lastModifiedDateTime")),
+        "folder_id":   msg.get("parentFolderId", ""),
+        "folder_path": folder_path,
+        "parsed_at":   datetime.now(timezone.utc).replace(tzinfo=None),
+    }
+
+
+# ─── MongoDB indexy ───────────────────────────────────────────────────────────
+
+def create_indexes(col):
+    print("  Vytvarim indexy...")
+    col.create_index([("received_at",     ASCENDING)])
+    col.create_index([("sent_at",         ASCENDING)])
+    col.create_index([("sender.email",    ASCENDING)])
+    col.create_index([("graph_id",        ASCENDING)], unique=True, sparse=True)
+    col.create_index([("conversation_id", ASCENDING)])
+    col.create_index([("folder_path",     ASCENDING)])
+    col.create_index([("has_attachments", ASCENDING)])
+    col.create_index([("categories",      ASCENDING)])
+    col.create_index([("importance",      ASCENDING)])
+    col.create_index([("flag_status",     ASCENDING)])
+    col.create_index([("is_read",         ASCENDING)])
+    col.create_index([
+        ("subject",      TEXT),
+        ("body_preview", TEXT),
+        ("to",           TEXT),
+        ("cc",           TEXT),
+    ], name="text_search", default_language="none")
+    print("  Indexy hotovy.")
+
+
+# ─── MAIN ─────────────────────────────────────────────────────────────────────
+
+def main():
+    """Import or sync one mailbox from Microsoft Graph into MongoDB.
+
+    Parses the CLI arguments, checks Graph authentication and the MongoDB
+    connection, walks the selected folders and queues one Mongo operation per
+    message (full upsert or, in sync mode, a partial update of existing
+    documents). Operations are written in batches of BATCH_SIZE. Finally a
+    summary is printed and, unless --no-indexes is given, indexes are created.
+
+    Exits with status 1 when Graph authentication or the MongoDB ping fails.
+    """
+    global GRAPH_MAILBOX
+
+    ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}")
+    ap.add_argument("--mailbox",    required=True,
+                    help="Emailova schranka (napr. ordinace@buzalkova.cz)")
+    ap.add_argument("--mode", default="full", choices=["full", "new-only", "sync"],
+                    help="full=plny upsert (vychozi) | new-only=jen nove zpravy | "
+                         "sync=existujici aktualizuje jen menitelna pole, nove importuje cely")
+    ap.add_argument("--limit",      type=int, default=0,
+                    help="Zpracovat max N zprav (0 = vse)")
+    ap.add_argument("--folder",     default="",
+                    help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)")
+    ap.add_argument("--no-indexes", action="store_true",
+                    help="Nevytvorit indexy na konci")
+    args = ap.parse_args()
+
+    # The mailbox address is both the Graph user and the Mongo collection name.
+    GRAPH_MAILBOX = args.mailbox
+    mongo_col     = args.mailbox
+
+    start = datetime.now()
+    print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===")
+    print(f"Start:    {start.strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"Schránka: {GRAPH_MAILBOX}")
+    print(f"MongoDB:  {MONGO_URI} -> {MONGO_DB}.{mongo_col}")
+    print(f"Režim:    {args.mode}")
+
+    # Fail fast when either backend is unreachable.
+    print("\nPřipojuji se k Graph API...")
+    try:
+        get_token()
+        print("  Graph API OK")
+    except Exception as e:
+        print(f"  CHYBA: {e}")
+        sys.exit(1)
+
+    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    try:
+        client.admin.command("ping")
+        print("  MongoDB OK")
+    except Exception as e:
+        print(f"  CHYBA: MongoDB neni dostupna -- {e}")
+        sys.exit(1)
+    col = client[MONGO_DB][mongo_col]
+
+    # new-only and sync need the ids already stored, to tell new from existing.
+    existing: set = set()
+    if args.mode in ("new-only", "sync"):
+        print("  Nacitam existujici zaznamy z MongoDB...")
+        existing = set(col.distinct("_id"))
+        print(f"  {len(existing)} jiz importovano")
+
+    print("\nNacitam seznam slozek...")
+    all_folders = get_all_folders()
+    if args.folder:
+        # Case-insensitive substring match against the full folder path, so
+        # subfolders and similarly named folders are selected as well.
+        all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()]
+    print(f"  Slozek ke zpracovani: {len(all_folders)}")
+    for f in all_folders:
+        print(f"    {f['path']}")
+
+    # Sync mode lists messages with the reduced field set and no attachments;
+    # new messages are re-fetched in full further below.
+    is_sync    = args.mode == "sync"
+    msg_select = MSG_SELECT_SYNC if is_sync else MSG_SELECT
+    expand_att = not is_sync
+
+    batch      = []
+    ok_count   = 0
+    sync_count = 0
+    err_count  = 0
+    skip_count = 0
+    total_i    = 0
+
+    def flush():
+        # Write the queued operations and empty the batch.
+        # NOTE(review): a failed bulk_write is only logged and printed; it is
+        # not added to err_count and the batch is discarded afterwards.
+        if not batch:
+            return
+        try:
+            col.bulk_write(batch, ordered=False)
+        except Exception as e:
+            logging.error("bulk_write: %s", e)
+            print(f"  CHYBA bulk_write: {e}")
+        batch.clear()
+
+    print()
+    for folder in all_folders:
+        print(f"--- Složka: {folder['path']} ---")
+        folder_count = 0
+
+        for msg in iter_folder_messages(folder["id"], select=msg_select, expand_attachments=expand_att):
+            # --limit counts messages across all folders.
+            if args.limit and total_i >= args.limit:
+                break
+
+            # Same _id rule as extract_message: Message-ID, else the Graph id.
+            mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
+            total_i += 1
+            folder_count += 1
+
+            if args.mode == "new-only" and mid in existing:
+                skip_count += 1
+                continue
+
+            if is_sync and mid in existing:
+                # Existing message: update only the mutable fields.
+                fields = extract_sync_fields(msg, folder["path"])
+                batch.append(UpdateOne({"_id": mid}, {"$set": fields}))
+                sync_count += 1
+                print(f"  {total_i:>6}  SYN   {mid[:80]}")
+            else:
+                if is_sync:
+                    # New message seen in sync mode: the listing carried only
+                    # the reduced field set, so fetch the full message first.
+                    full_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{msg['id']}"
+                    full_params = {"$select": MSG_SELECT, "$expand": ATT_EXPAND}
+                    try:
+                        msg = graph_get(full_url, full_params)
+                    except Exception as e:
+                        logging.error("full fetch failed [%s]: %s", msg.get("id","?"), e)
+                        err_count += 1
+                        continue
+
+                doc = extract_message(msg, folder["path"])
+                if doc is None:
+                    err_count += 1
+                    print(f"  {total_i:>6}  ERR   {mid[:80]}")
+                else:
+                    batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
+                    ok_count += 1
+                    subject_str = (doc.get("subject") or "")[:60]
+                    sender_str  = (doc.get("sender", {}).get("email") or "")[:40]
+                    print(f"  {total_i:>6}  OK    {subject_str:<60}  {sender_str}")
+
+            if len(batch) >= BATCH_SIZE:
+                flush()
+
+            # Progress line every 500 processed messages.
+            if total_i % 500 == 0:
+                elapsed = (datetime.now() - start).total_seconds()
+                rate    = total_i / elapsed if elapsed > 0 else 0
+                print(f"  {'─'*80}")
+                print(f"  Průběh: ok={ok_count}  sync={sync_count}  skip={skip_count}  err={err_count}  {rate:.1f} msg/s")
+                print(f"  {'─'*80}")
+
+        # Write whatever is left of this folder before moving on.
+        flush()
+        print(f"  → {folder_count} zprav ze slozky {folder['path']}")
+
+        # Stop walking folders once the limit has been reached.
+        if args.limit and total_i >= args.limit:
+            break
+
+    elapsed_total = (datetime.now() - start).total_seconds()
+    print(f"\n{'='*52}")
+    print(f"Vysledek:  ok={ok_count}  |  sync={sync_count}  |  skip={skip_count}  |  err={err_count}")
+    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
+    print(f"Dokumentu v kolekci: {col.count_documents({})}")
+
+    if not args.no_indexes:
+        print()
+        create_indexes(col)
+
+    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    if err_count:
+        print(f"Chyby logovany do: {LOG_FILE}")
+
+    # NOTE(review): main() returns None, so the process exit status is 0 even
+    # when err_count is non-zero.
+    client.close()
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,139 @@
+# 1b_parse_emails_graph_delta_v1.0.py
+
+**Inkrementální sync přes Microsoft Graph delta query.** Sourozenec [`1_parse_emails_graph_v1.4.py`](1_parse_emails_graph_v1.4.md) — každý řeší jiný use case:
+
+| Skript | Použití |
+|---|---|
+| `1_parse_emails_graph_v1.4.py` | **První plný import** schránky (vše od začátku) |
+| `1b_parse_emails_graph_delta_v1.0.py` | **Pravidelný sync** — jen co se od minula změnilo |
+
+## Jak funguje
+
+Graph API vystavuje `messages/delta` endpoint, který si pamatuje **záložku** (`deltaLink` s tokenem). Při dalším volání s touto záložkou vrátí jen:
+
+- **nové zprávy**
+- **změny** existujících (`isRead`, vlajka, přesun do jiné složky, kategorie)
+- **smazané** zprávy (`@removed`)
+
+Delta běží **per složka**. Skript drží stav v Mongo kolekci `emaily.sync_state`:
+
+```json
+{
+  "_id": "ordinace@buzalkova.cz|<folder_id>",
+  "mailbox": "ordinace@buzalkova.cz",
+  "folder_id": "AAA...",
+  "folder_path": "Inbox",
+  "delta_link": "https://graph.microsoft.com/.../delta?$deltatoken=...",
+  "last_run_at": "2026-06-04T10:00:00Z",
+  "cumulative_new": 1234, "cumulative_sync": 5678, "cumulative_removed": 12, "run_count": 42
+}
+```
+
+První běh = fresh delta (Graph vrátí všechno + dá `deltaLink`). Každý další = jen změny od poslední záložky.
+
+## Co se stane se smazanými zprávami
+
+Když delta vrátí `@removed` pro zprávu, skript ji **nemaže** z Mongo. Pouze nastaví:
+
+```json
+{ "permanently_deleted": true, "permanently_deleted_at": "2026-06-04T10:00:00Z" }
+```
+
+Dohledatelné (pymongo): `col.find({"permanently_deleted": True})`.
+
+**`@removed` přijde jen pro definitivně smazané** zprávy (uživatel vysypal koš / Shift+Del). Mail v `Deleted Items` je pořád normální zpráva, jen má `folder_path = "Deleted Items"`.
+
+## Extrakce zprávy
+
+Funkce `extract_message` a `extract_sync_fields` se načítají přímo z modulu `1_parse_emails_graph_v1.4.py` (přes `importlib`) — extrakční logika je jediná na celý projekt, nemůže se rozejít.
+
+## Nové vs změněné — jak skript pozná
+
+Pro každou položku z delta odpovědi:
+
+1. **Má `@removed`?** → označit `permanently_deleted` v Mongo, hotovo.
+2. **`graph_id` už je v Mongo?** → existující změna — pošle se jen `extract_sync_fields` (is_read, flag, folder, …) přes `$set`.
+3. **`graph_id` v Mongo není?** → nová zpráva — udělá se druhý GET `/messages/{id}?$expand=attachments` (delta nepodporuje `$expand`), aby přišla těla, hlavičky i přílohy, a uloží se přes `extract_message` jako klasický nový dokument.
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | **ne** | e-mail | (všechny) | Schránka = kolekce v Mongo. **Bez argumentu projede všechny** kolekce v `emaily` mimo `SKIP_MAILBOXES` a systémové (`attachments_index`, `sync_state`) |
+| `--folder` | ne | substring | (všechny) | Filtr složek (např. `Inbox` zahrne i `Inbox/Archive`) |
+| `--limit N` | ne | int | 0 (bez limitu) | Max položek na složku (test) |
+| `--reset` | ne | flag | false | Smaže všechny `deltaLink`y pro vybrané schránky → další běh začne od fresh delta |
+| `--dry-run` | ne | flag | false | Nic neuloží do Mongo, jen vypíše co by se stalo |
+
+## SKIP_MAILBOXES (hardcoded ve skriptu)
+
+| Schránka | Důvod |
+|---|---|
+| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Pro tuto schránku je nutný samostatný skript (lokální `.msg` parser nebo jiný zdroj). |
+
+Při `--mailbox vbuzalka@its.jnj.com` skript skončí s exit kódem 2. Při běhu bez `--mailbox` se schránka tiše přeskočí s hlášením `[skip]`.
+
+## Varianty volání
+
+```bash
+# VŠECHNY schránky najednou (mimo SKIP_MAILBOXES) — pro cron / pravidelný sync:
+docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py
+
+# Jedna schránka — první běh (fresh delta — projde všechno, uloží deltaLinky):
+docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz
+
+# Pravidelný sync jedné schránky (jen změny od minulého běhu):
+docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz
+
+# Dry-run — uvidíš co by se stalo, nic se neuloží:
+docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --dry-run
+
+# Test jen na složce Inbox, max 20 položek:
+docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --folder Inbox --limit 20
+
+# Reset — zahodí deltaLinky a najede znova od plné delta:
+docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --reset
+
+# Jednorázový běh na pozadí (pro pravidelný sync, např. každých 5 min, tento příkaz naplánuj v cronu):
+docker exec -d python-runner bash -c "python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz > /scripts/delta_sync.log 2>&1"
+```
+
+## Co dělat na začátek
+
+1. **První import** schránky pořád přes `1_parse_emails_graph_v1.4.py` (existující data zůstanou).
+2. **První běh** `1b_…delta_v1.0.py` — fresh delta projde znovu všechny zprávy a hlavně uloží `deltaLink`y do `sync_state`. To může chvíli trvat (podobně jako `--mode new-only` na v1.4).
+3. **Další běhy** = už jen rychlé, vrací 0-X změn za interval.
+
+## Otevřené body k otestování
+
+- Jak rychle běží první (fresh) delta na velké schránce (`vladimir.buzalka@buzalka.cz` ~80k mailů)
+- Co Graph vrátí pro nově vytvořené složky (mělo by fungovat — appendnou se do `folders` při dalším `get_all_folders`)
+- Chování při `--limit` (nový deltaLink se neuloží, drží se starý → příští běh začne znovu od něj a zpracuje i zbytek)
+
+## HTTP 410 — expirovaný deltaLink
+
+DeltaLinky drží Graph cca 30 dní. Pokud nebudeš schránku syncovat měsíc, skript dostane 410, **smaže starý state** a sám zopakuje běh jako fresh delta. Žádný manuální zásah není potřeba.
+
+## Závislosti
+
+Stejné jako `1_parse_emails_graph_v1.4.py` (msal, requests, pymongo, dateutil) — žádné nové.
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/delta_sync.log
+docker exec -it python-runner tail -f /scripts/delta_errors.log
+```
+
+## Stav sync_state v Mongo
+
+```python
+# Přehled posledních synců:
+db.sync_state.find().sort("last_run_at", -1)
+
+# Zahodit deltaLinky pro jednu schránku (= efekt --reset):
+db.sync_state.delete_many({"mailbox": "ordinace@buzalkova.cz"})
+
+# Najít všechny permanentně smazané v jedné schránce:
+db["ordinace@buzalkova.cz"].find({"permanently_deleted": True}, {"subject": 1, "permanently_deleted_at": 1})
+```
@@ -0,0 +1,514 @@
+"""
+==============================================================================
+Skript:   1b_parse_emails_graph_delta_v1.0.py
+Verze:    1.0
+Datum:    2026-06-04
+Autor:    vladimir.buzalka
+
+Popis:
+  Inkrementalni sync emailu pres Microsoft Graph DELTA QUERY.
+  Sourozenec `1_parse_emails_graph_v1.4.py` — kazdy resi jiny use case:
+
+    1_parse_emails_graph_v1.4.py   = prvni plny import schranky
+    1b_parse_emails_graph_delta_v1.0.py = pravidelny sync (zmeny od minula)
+
+  Delta query je server-side change tracking — Graph si pamatuje "zalozku"
+  (deltaLink) a vraci jen to, co se od ni zmenilo:
+    - nove zpravy
+    - zmeny existujicich (isRead, flag, presun do jine slozky, kategorie)
+    - SMAZANE zpravy (@removed) — definitivne smazane, nikoli v kosi
+
+  Pro mail v "Deleted Items" delta nic specialniho nedela — je to porad
+  normalni zprava, jen s folder_path="Deleted Items". @removed prijde az
+  kdyz uzivatel vysype kos / Shift+Del.
+
+State:
+  Kolekce `emaily.sync_state`, _id = "<mailbox>|<folder_id>".
+  {
+    mailbox, folder_id, folder_path,
+    delta_link,           # plny URL s $deltatoken na pristi beh
+    last_run_at,
+    cumulative_new, cumulative_sync, cumulative_removed, run_count
+  }
+
+Permanentne smazane zpravy:
+  Skript je NEMAZE z Mongo. Pouze nastavi:
+    permanently_deleted: True
+    permanently_deleted_at: <UTC datetime detekce>
+  Dohledani: col.find({"permanently_deleted": True})
+
+Reuse:
+  Funkce extract_message / extract_sync_fields se nactou primo z modulu
+  1_parse_emails_graph_v1.4.py (importlib, file-based), aby se logika
+  extrakce nikdy nerozesla.
+
+Spousteni:
+  python 1b_parse_emails_graph_delta_v1.0.py                                   # VSECHNY schranky (mimo SKIP_MAILBOXES)
+  python 1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz   # jedna schranka
+  python 1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --folder Inbox
+  python 1b_parse_emails_graph_delta_v1.0.py --reset                           # zahodit deltaLinky a najet znova
+  python 1b_parse_emails_graph_delta_v1.0.py --dry-run                         # nic neulozit
+
+SKIP_MAILBOXES (hardcoded):
+  vbuzalka@its.jnj.com   — JNJ tenant, nemame Graph API pristup. Pro tuto
+                            schranku je nutny samostatny skript (lokalni .msg).
+
+Zavislosti:
+  msal, requests, pymongo, python-dateutil
+  Python 3.10+
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import importlib.util
+import logging
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+import msal
+import requests
+from pymongo import MongoClient, ASCENDING
+
+# Switch stdout to UTF-8 when the stream supports reconfigure(), so the
+# non-ASCII characters in the console output cannot raise an encoding error;
+# characters that still cannot be encoded are replaced.
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
+# ─── CONFIGURATION ────────────────────────────────────────────────────────────
+# SECURITY NOTE(review): the Graph tenant id, client id and client secret below
+# are hard-coded in a version-controlled file. The secret should be rotated and
+# supplied from outside the source tree (environment variable / secret store).
+GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
+GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
+GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
+GRAPH_URL           = "https://graph.microsoft.com/v1.0"
+
+# MongoDB connection, database name and the collection holding per-folder delta state.
+MONGO_URI       = "mongodb://192.168.1.76:27017"
+MONGO_DB        = "emaily"
+SYNC_STATE_COL  = "sync_state"
+PAGE_SIZE       = 100  # the delta endpoint typically returns at most 100 items per page
+LOG_FILE        = Path(__file__).parent / "delta_errors.log"
+SCRIPT_VERSION  = "1.0"
+
+# Collections in `emaily` that are NOT mailboxes:
+NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
+
+# Mailboxes for which we have NO Graph API access — skipped during a regular run.
+# These need a separate script (e.g. a local .msg parser).
+SKIP_MAILBOXES = {
+    "vbuzalka@its.jnj.com",   # JNJ tenant — we have no Graph credentials
+}
+
+# Only ERROR-level records are written, to LOG_FILE next to this script.
+logging.basicConfig(
+    filename=str(LOG_FILE),
+    level=logging.ERROR,
+    format="%(asctime)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    encoding="utf-8",
+)
+
+# Fields requested from the delta endpoint (same as MSG_SELECT in v1.4, minus
+# internetMessageHeaders, which delta cannot return for every item — for new
+# messages they are pulled with a separate fetch).
+DELTA_SELECT = (
+    "id,internetMessageId,subject,bodyPreview,body,"
+    "importance,isRead,isDraft,hasAttachments,"
+    "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
+    "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
+    "conversationId,conversationIndex,parentFolderId,"
+    "categories,flag,inferenceClassification"
+)
+
+# For the full load of a new message (including headers + attachments) the same
+# select+expand as in v1.4 is used.
+FULL_FETCH_SELECT = (
+    "id,internetMessageId,subject,bodyPreview,body,"
+    "importance,isRead,isDraft,hasAttachments,"
+    "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
+    "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
+    "conversationId,conversationIndex,parentFolderId,"
+    "categories,flag,inferenceClassification,internetMessageHeaders"
+)
+# Attachment metadata only (id, name, type, size, inline flag) is expanded here.
+FULL_FETCH_EXPAND = "attachments($select=id,name,contentType,size,isInline)"
+
+# ─── Reuse of the extract logic from v1.4 ─────────────────────────────────────
+
+# The sibling script is loaded by file path via importlib; the script aborts
+# with exit code 1 at import time if that file is missing.
+_HERE = Path(__file__).parent
+_V14_PATH = _HERE / "1_parse_emails_graph_v1.4.py"
+if not _V14_PATH.exists():
+    print(f"CHYBA: chybi sourozenec {_V14_PATH.name} — extract logiku nelze nacist", file=sys.stderr)
+    sys.exit(1)
+
+# Executing the module runs its top-level code; only the two extract helpers
+# are re-exported under local names.
+_spec = importlib.util.spec_from_file_location("v14_parse", _V14_PATH)
+_v14 = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_v14)
+extract_message     = _v14.extract_message
+extract_sync_fields = _v14.extract_sync_fields
+
+# GRAPH_MAILBOX is module-level in v1.4 — it is not needed for extraction, but
+# for consistency it is assigned in sync_mailbox() before each mailbox is processed.
+
+# ─── Graph API ────────────────────────────────────────────────────────────────
+
+# Cached Graph bearer token: written by get_token(), read by graph_get();
+# None until the first token has been acquired.
+_graph_token: Optional[str] = None
+
+
+def get_token() -> str:
+    global _graph_token
+    app = msal.ConfidentialClientApplication(
+        GRAPH_CLIENT_ID,
+        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
+        client_credential=GRAPH_CLIENT_SECRET,
+    )
+    result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+    if "access_token" not in result:
+        raise RuntimeError(f"Graph auth failed: {result}")
+    _graph_token = result["access_token"]
+    return _graph_token
+
+
+class DeltaExpired(Exception):
+    """The deltaLink has expired (HTTP 410) — the sync must restart from a full delta."""
+
+
+def graph_get(url: str, params: dict = None, allow_410: bool = False) -> dict:
+    """GET na Graph s retry pri 401. Pri 410 a allow_410=True vyhodi DeltaExpired."""
+    global _graph_token
+    if not _graph_token:
+        get_token()
+    for attempt in range(3):
+        r = requests.get(
+            url,
+            headers={"Authorization": f"Bearer {_graph_token}"},
+            params=params,
+            timeout=60,
+        )
+        if r.status_code == 401:
+            get_token()
+            continue
+        if r.status_code == 410 and allow_410:
+            raise DeltaExpired(url)
+        if r.status_code == 429:
+            # rate limit — respect Retry-After
+            wait = int(r.headers.get("Retry-After", "5"))
+            print(f"  [429] cekam {wait}s ...")
+            time.sleep(wait)
+            continue
+        r.raise_for_status()
+        return r.json()
+    raise RuntimeError(f"Graph GET failed after retries: {url}")
+
+
+def get_all_folders(mailbox: str, parent_id: str = None, parent_path: str = "") -> list[dict]:
+    if parent_id is None:
+        url = f"{GRAPH_URL}/users/{mailbox}/mailFolders"
+    else:
+        url = f"{GRAPH_URL}/users/{mailbox}/mailFolders/{parent_id}/childFolders"
+
+    folders = []
+    params = {"$top": 100, "$select": "id,displayName,childFolderCount"}
+    while url:
+        data = graph_get(url, params)
+        for f in data.get("value", []):
+            path = f"{parent_path}/{f['displayName']}".lstrip("/")
+            folders.append({"id": f["id"], "path": path})
+            if f.get("childFolderCount", 0) > 0:
+                folders.extend(get_all_folders(mailbox, f["id"], path))
+        url = data.get("@odata.nextLink")
+        params = None
+    return folders
+
+
+def fetch_full_message(mailbox: str, msg_id: str) -> Optional[dict]:
+    """Stahne celou zpravu vcetne hlavicek a priloh — pro nove zpravy zachycene v delte."""
+    url = f"{GRAPH_URL}/users/{mailbox}/messages/{msg_id}"
+    params = {"$select": FULL_FETCH_SELECT, "$expand": FULL_FETCH_EXPAND}
+    try:
+        return graph_get(url, params)
+    except requests.HTTPError as e:
+        logging.error("fetch_full_message %s: %s", msg_id, e)
+        return None
+
+
+# ─── Delta iterace ────────────────────────────────────────────────────────────
+
+def iter_folder_delta(mailbox: str, folder_id: str, delta_link: Optional[str], limit: int = 0):
+    """
+    Generator: vraci (item, final_delta_link).
+    item je dict s polozkou (bud zmena nebo {'@removed': ...}).
+    Posledni vyhozeny tuple ma final_delta_link != None (zbytek None).
+
+    Pri HTTP 410 (expirovany deltaLink) vyhodi DeltaExpired — caller ma
+    pustit znova s delta_link=None (= fresh full delta).
+    """
+    if delta_link:
+        url = delta_link
+        params = None
+    else:
+        url = f"{GRAPH_URL}/users/{mailbox}/mailFolders/{folder_id}/messages/delta"
+        params = {"$select": DELTA_SELECT, "$top": PAGE_SIZE}
+
+    n = 0
+    while url:
+        data = graph_get(url, params, allow_410=True)
+        params = None
+        for item in data.get("value", []):
+            yield item, None
+            n += 1
+            if limit and n >= limit:
+                # ulozime aspon stavajici nextLink jako "delta" — neni to ciste,
+                # ale pri --limit jde o test, takze pristi beh proste pocnize znovu
+                return
+        next_link  = data.get("@odata.nextLink")
+        final_link = data.get("@odata.deltaLink")
+        if final_link:
+            # konec — predame final delta
+            yield None, final_link
+            return
+        url = next_link
+
+
+# ─── Per-folder sync ──────────────────────────────────────────────────────────
+
+def sync_folder(col, sync_col, mailbox: str, folder: dict, dry_run: bool, limit: int) -> dict:
+    """Vrati statistiky."""
+    fid   = folder["id"]
+    fpath = folder["path"]
+    state_id = f"{mailbox}|{fid}"
+    state = sync_col.find_one({"_id": state_id})
+    delta_link = state.get("delta_link") if state else None
+
+    is_first_run = delta_link is None
+    label = "FRESH" if is_first_run else "DELTA"
+    print(f"\n[{label}] {fpath}")
+
+    stats = {"new": 0, "sync": 0, "removed": 0, "errors": 0}
+    final_delta = None
+
+    try:
+        gen = iter_folder_delta(mailbox, fid, delta_link, limit=limit)
+        for item, fin in gen:
+            if fin:
+                final_delta = fin
+                break
+            try:
+                process_item(col, mailbox, fpath, item, stats, dry_run)
+            except Exception as e:
+                stats["errors"] += 1
+                logging.error("process_item %s: %s", item.get("id", "?"), e)
+    except DeltaExpired:
+        print(f"  [410] deltaLink expiroval — restart od fresh delta")
+        # rekurzivni restart s vymazanym statem
+        sync_col.delete_one({"_id": state_id})
+        return sync_folder(col, sync_col, mailbox, folder, dry_run, limit)
+
+    print(f"  new={stats['new']}  sync={stats['sync']}  removed={stats['removed']}  err={stats['errors']}")
+
+    # Ulozit sync_state pokud mame final_delta a neni dry run
+    if final_delta and not dry_run:
+        sync_col.update_one(
+            {"_id": state_id},
+            {
+                "$set": {
+                    "mailbox":     mailbox,
+                    "folder_id":   fid,
+                    "folder_path": fpath,
+                    "delta_link":  final_delta,
+                    "last_run_at": datetime.now(timezone.utc).replace(tzinfo=None),
+                },
+                "$inc": {
+                    "cumulative_new":     stats["new"],
+                    "cumulative_sync":    stats["sync"],
+                    "cumulative_removed": stats["removed"],
+                    "run_count":          1,
+                },
+            },
+            upsert=True,
+        )
+    elif not final_delta:
+        # neprisel deltaLink (napr. limit nebo chyba) — nemenime state, pristi beh
+        # bude pokracovat normalne podle stareho deltaLinku nebo zacne od fresh
+        if not is_first_run:
+            print(f"  [pozn] delta neukoncena — pristi beh pojede od ulozeneho deltaLinku")
+
+    return stats
+
+
+def process_item(col, mailbox: str, folder_path: str, item: dict, stats: dict, dry_run: bool):
+    """Zpracuje jednu polozku z delta odpovedi."""
+    # 1) Smazana zprava (@removed)
+    if "@removed" in item or item.get("@removed.reason"):
+        graph_id = item.get("id")
+        if not graph_id:
+            return
+        if dry_run:
+            print(f"  REMOVED  graph_id={graph_id[:30]}...")
+        else:
+            col.update_one(
+                {"graph_id": graph_id},
+                {"$set": {
+                    "permanently_deleted":    True,
+                    "permanently_deleted_at": datetime.now(timezone.utc).replace(tzinfo=None),
+                }},
+            )
+        stats["removed"] += 1
+        return
+
+    # 2) Nova nebo zmenena zprava — rozhodneme podle existence graph_id v Mongo
+    graph_id = item.get("id")
+    if not graph_id:
+        return
+
+    existing = col.find_one({"graph_id": graph_id}, {"_id": 1})
+
+    if existing:
+        # Existujici zprava — update jen sync poli (delta payload je obsahuje)
+        fields = extract_sync_fields(item, folder_path)
+        if dry_run:
+            print(f"  SYNC     {item.get('subject','')[:60]}")
+        else:
+            col.update_one({"_id": existing["_id"]}, {"$set": fields})
+        stats["sync"] += 1
+    else:
+        # Nova zprava — pro telo+attachments+headers fetchneme plnou verzi
+        full = fetch_full_message(mailbox, graph_id)
+        if full is None:
+            stats["errors"] += 1
+            return
+        doc = extract_message(full, folder_path)
+        if doc is None:
+            stats["errors"] += 1
+            return
+        if dry_run:
+            print(f"  NEW      {doc.get('subject','')[:60]}")
+        else:
+            col.update_one({"_id": doc["_id"]}, {"$set": doc}, upsert=True)
+        stats["new"] += 1
+
+
+# ─── Indexy pro sync_state ────────────────────────────────────────────────────
+
+def ensure_sync_state_indexes(sync_col):
+    sync_col.create_index([("mailbox", ASCENDING), ("folder_id", ASCENDING)])
+    sync_col.create_index([("last_run_at", ASCENDING)])
+
+
+def ensure_perm_deleted_index(col):
+    col.create_index([("permanently_deleted", ASCENDING)], sparse=True)
+
+
+# ─── Main ─────────────────────────────────────────────────────────────────────
+
+def discover_mailboxes(db) -> list[str]:
+    """Vrati seznam mailboxu = vsechny kolekce v `emaily` mimo NON_MAILBOX_COLLECTIONS
+    a SKIP_MAILBOXES."""
+    out = []
+    for name in sorted(db.list_collection_names()):
+        if name in NON_MAILBOX_COLLECTIONS:
+            continue
+        if name in SKIP_MAILBOXES:
+            print(f"  [skip] {name} — v SKIP_MAILBOXES (neni Graph pristup)")
+            continue
+        out.append(name)
+    return out
+
+
+def sync_mailbox(client, mailbox: str, args) -> dict:
+    """Sync jedne schranky. Vraci totals dict."""
+    _v14.GRAPH_MAILBOX = mailbox
+
+    print(f"\n========== {mailbox} ==========")
+
+    col      = client[MONGO_DB][mailbox]
+    sync_col = client[MONGO_DB][SYNC_STATE_COL]
+
+    if not args.dry_run:
+        ensure_sync_state_indexes(sync_col)
+        ensure_perm_deleted_index(col)
+
+    if args.reset:
+        n = sync_col.delete_many({"mailbox": mailbox}).deleted_count
+        print(f"  --reset: smazano {n} deltaLinku pro {mailbox}")
+
+    print("Nacitam seznam slozek...")
+    try:
+        folders = get_all_folders(mailbox)
+    except requests.HTTPError as e:
+        print(f"  CHYBA: nelze nacist slozky pro {mailbox}: {e}")
+        logging.error("get_all_folders %s: %s", mailbox, e)
+        return {"new": 0, "sync": 0, "removed": 0, "errors": 1}
+
+    if args.folder:
+        folders = [f for f in folders if args.folder.lower() in f["path"].lower()]
+    print(f"  Slozek ke zpracovani: {len(folders)}")
+
+    totals = {"new": 0, "sync": 0, "removed": 0, "errors": 0}
+    for folder in folders:
+        s = sync_folder(col, sync_col, mailbox, folder, args.dry_run, args.limit)
+        for k in totals:
+            totals[k] += s[k]
+    print(f"  -> mailbox total: new={totals['new']}  sync={totals['sync']}  removed={totals['removed']}  err={totals['errors']}")
+    return totals
+
+
+def main():
+    ap = argparse.ArgumentParser(description=f"parse_emails_graph delta sync v{SCRIPT_VERSION}")
+    ap.add_argument("--mailbox", default="",
+                    help="E-mail schranky (= kolekce v Mongo). "
+                         "Bez argumentu projede vsechny schranky z `emaily` (mimo SKIP_MAILBOXES).")
+    ap.add_argument("--folder",  default="",   help="Filtruje slozky obsahujici tento retezec (default: vsechny)")
+    ap.add_argument("--limit",   type=int, default=0, help="Max polozek na slozku (test)")
+    ap.add_argument("--reset",   action="store_true",
+                    help="Smaze deltaLinky pro vybrane schranky — pristi beh zacne od fresh delta")
+    ap.add_argument("--dry-run", action="store_true", help="Nic neulozi do Mongo, jen vypise co by se stalo")
+    args = ap.parse_args()
+
+    print(f"=== Delta sync v{SCRIPT_VERSION} ===")
+    if args.dry_run:
+        print("  DRY-RUN — zadne zmeny v Mongo")
+
+    print("Pripojuji se k MongoDB...")
+    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    client.admin.command("ping")
+    db = client[MONGO_DB]
+
+    if args.mailbox:
+        if args.mailbox in SKIP_MAILBOXES:
+            print(f"  CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.")
+            sys.exit(2)
+        mailboxes = [args.mailbox]
+    else:
+        mailboxes = discover_mailboxes(db)
+        print(f"  Schranky ke zpracovani: {len(mailboxes)}")
+        for m in mailboxes:
+            print(f"    {m}")
+
+    print("Token Graph API...")
+    get_token()
+    print("  OK")
+
+    t0 = time.time()
+    grand = {"new": 0, "sync": 0, "removed": 0, "errors": 0}
+    per_mailbox = []
+    for mb in mailboxes:
+        try:
+            s = sync_mailbox(client, mb, args)
+        except Exception as e:
+            print(f"  FATAL pri sync {mb}: {e}")
+            logging.error("sync_mailbox %s: %s", mb, e)
+            s = {"new": 0, "sync": 0, "removed": 0, "errors": 1}
+        per_mailbox.append((mb, s))
+        for k in grand:
+            grand[k] += s[k]
+
+    dt = time.time() - t0
+    print(f"\n=== SHRNUTI ===")
+    for mb, s in per_mailbox:
+        print(f"  {mb:40} new={s['new']:>5} sync={s['sync']:>5} removed={s['removed']:>4} err={s['errors']:>3}")
+    print(f"  {'TOTAL':40} new={grand['new']:>5} sync={grand['sync']:>5} removed={grand['removed']:>4} err={grand['errors']:>3}")
+    print(f"  trvalo: {dt:.1f} s")
+    return 1 if grand["errors"] > 0 else 0
+
+
+# Run as a script: main()'s return value (1 when errors were counted, else 0)
+# becomes the process exit code.
+if __name__ == "__main__":
+    sys.exit(main() or 0)
@@ -0,0 +1,34 @@
+# 2_refetch_text_bodies_v1.0.py
+
+**Krok 2 pipeline** — ONETIME oprava starých plain-text emailů. Starý `parse_emails_graph_v1.3` ukládal plain-text emaily jen jako prvních 2000 znaků do `body_preview`; plné tělo se zahazovalo. Tento skript najde takové emaily a re-fetchne plný obsah do nového pole `body_text` (max 2 MB).
+
+> Pro schránky importované rovnou v1.4 nemá co dělat (kandidátů 0). Drží se kvůli archivním schránkám importovaným ve v1.3.
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | ne | e-mail | (všechny) | Jedna konkrétní schránka |
+| `--limit N` | ne | int | (bez limitu) | Limit emailů na schránku (test) |
+
+## Varianty volání
+
+```bash
+# Všechny schránky:
+docker exec -it python-runner python /scripts/2_refetch_text_bodies_v1.0.py
+
+# Jedna schránka:
+docker exec -it python-runner python /scripts/2_refetch_text_bodies_v1.0.py --mailbox ordinace@buzalkova.cz
+
+# Test 20 emailů:
+docker exec -it python-runner python /scripts/2_refetch_text_bodies_v1.0.py --mailbox ordinace@buzalkova.cz --limit 20
+
+# Plný běh na pozadí, log do souboru:
+docker exec -d python-runner bash -c "python /scripts/2_refetch_text_bodies_v1.0.py > /scripts/refetch.log 2>&1"
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/refetch.log
+```
@@ -0,0 +1,270 @@
+"""
+==============================================================================
+Skript:   refetch_text_bodies_v1.0.py
+Verze:    1.0
+Datum:    2026-06-03
+Autor:    vladimir.buzalka
+
+Popis:
+  ONETIME oprava — parse_emails_graph_v1.3 ukladal plain-text emaily jen jako
+  prvnich 2000 znaku do `body_preview`. Plne telo se zahazovalo.
+
+  Tento skript:
+    1) Najde v Mongo emaily kde body_html IS NULL/missing/empty
+       a soucasne maji graph_id (lze refetch)
+    2) Pro kazdy GET /users/{mailbox}/messages/{graph_id}?$select=body,bodyPreview
+    3) Pokud body.contentType == 'text' -> ulozi PLNY obsah do noveho pole
+       body_text (max 2 MB - stejny limit jako body_html)
+    4) Pokud body.contentType == 'html' (Graph mezitim prepnul) -> ulozi do body_html
+    5) Aktualizuje body_preview na realny 255-znakovy bodyPreview z Graphu
+
+  Bezpecne prerusitelne a opakovatelne - pri dalsim behu skript refetchne jen
+  ty emaily, kde stale chybi body_html i body_text.
+
+Spusteni:
+  python refetch_text_bodies_v1.0.py                      # vsechny schranky
+  python refetch_text_bodies_v1.0.py --mailbox vladimir.buzalka@buzalka.cz
+  python refetch_text_bodies_v1.0.py --limit 100          # test
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+import msal
+import requests
+from pymongo import MongoClient, UpdateOne
+
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
+# --- konfigurace ------------------------------------------------------------
+GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
+GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
+GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
+GRAPH_URL           = "https://graph.microsoft.com/v1.0"
+
+MONGO_URI = "mongodb://192.168.1.76:27017"
+MONGO_DB  = "emaily"
+SKIP_COLLECTIONS = {"attachments_index"}
+
+MAX_BODY_BYTES = 2 * 1024 * 1024   # 2 MB - stejny limit jako body_html v parseru
+BATCH_SIZE = 50
+LOG_FILE = Path(__file__).parent / "refetch_text_bodies_errors.log"
+
+logging.basicConfig(
+    filename=str(LOG_FILE),
+    level=logging.ERROR,
+    format="%(asctime)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    encoding="utf-8",
+)
+
+
+# --- Graph auth -------------------------------------------------------------
+_token: Optional[str] = None
+
+
+def get_token() -> str:
+    global _token
+    app = msal.ConfidentialClientApplication(
+        GRAPH_CLIENT_ID,
+        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
+        client_credential=GRAPH_CLIENT_SECRET,
+    )
+    res = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+    if "access_token" not in res:
+        raise RuntimeError(f"Graph auth failed: {res}")
+    _token = res["access_token"]
+    return _token
+
+
+def graph_get(url: str, params: dict = None) -> Optional[dict]:
+    global _token
+    if not _token:
+        get_token()
+    for attempt in range(3):
+        try:
+            r = requests.get(
+                url,
+                headers={"Authorization": f"Bearer {_token}"},
+                params=params,
+                timeout=30,
+            )
+            if r.status_code == 401:
+                get_token()
+                continue
+            if r.status_code == 404:
+                return None  # zprava uz neexistuje na strane Outlook
+            if r.status_code == 429:
+                wait = int(r.headers.get("Retry-After", "5"))
+                print(f"  [429] throttled, cekam {wait}s", flush=True)
+                time.sleep(wait)
+                continue
+            r.raise_for_status()
+            return r.json()
+        except requests.RequestException as e:
+            if attempt == 2:
+                raise
+            time.sleep(2)
+    return None
+
+
+# --- hlavni smycka ----------------------------------------------------------
+
+# emaily kde chybi obe tela (body_html i body_text) - tj. jeste nezpracovane
+EMPTY_BODY_FILTER = {
+    "$and": [
+        {"$or": [
+            {"body_html": None},
+            {"body_html": {"$exists": False}},
+            {"body_html": ""},
+        ]},
+        {"$or": [
+            {"body_text": None},
+            {"body_text": {"$exists": False}},
+            {"body_text": ""},
+        ]},
+        {"graph_id": {"$exists": True, "$ne": None, "$ne": ""}},
+    ]
+}
+
+
+def process_mailbox(col, mailbox: str, limit: Optional[int]) -> dict:
+    total = col.count_documents(EMPTY_BODY_FILTER)
+    print(f"[{mailbox}] kandidatu k refetchi: {total}"
+          + (f" (limit {limit})" if limit else ""))
+    if total == 0:
+        return {"mailbox": mailbox, "candidates": 0, "refetched": 0,
+                "text": 0, "html": 0, "still_empty": 0, "errors": 0, "missing": 0}
+
+    cursor = col.find(EMPTY_BODY_FILTER, {"_id": 1, "graph_id": 1},
+                      no_cursor_timeout=True)
+    if limit:
+        cursor = cursor.limit(limit)
+
+    n = refetched = txt = html = still_empty = err = missing = 0
+    bulk: list[UpdateOne] = []
+
+    try:
+        for doc in cursor:
+            n += 1
+            mid = doc["_id"]
+            gid = doc["graph_id"]
+            url = f"{GRAPH_URL}/users/{mailbox}/messages/{gid}"
+            params = {"$select": "body,bodyPreview"}
+            try:
+                data = graph_get(url, params)
+            except Exception as e:
+                err += 1
+                logging.error("[%s] graph_get %s: %s", mailbox, gid, e)
+                continue
+
+            if data is None:
+                missing += 1
+                continue
+
+            body = data.get("body") or {}
+            ctype = body.get("contentType")
+            content = body.get("content") or ""
+            preview = data.get("bodyPreview") or ""
+
+            update: dict = {"refetched_at": datetime.now(timezone.utc).replace(tzinfo=None)}
+
+            if not content:
+                still_empty += 1
+                update["body_refetch_status"] = "graph_empty"
+            elif ctype == "html":
+                update["body_html"] = (content[:MAX_BODY_BYTES]
+                                       if len(content) > MAX_BODY_BYTES else content)
+                update["body_refetch_status"] = "html"
+                html += 1
+                refetched += 1
+            elif ctype == "text":
+                update["body_text"] = (content[:MAX_BODY_BYTES]
+                                       if len(content) > MAX_BODY_BYTES else content)
+                update["body_refetch_status"] = "text"
+                txt += 1
+                refetched += 1
+            else:
+                update["body_refetch_status"] = f"unknown_ctype:{ctype}"
+                still_empty += 1
+
+            if preview:
+                update["body_preview"] = preview[:300]
+
+            bulk.append(UpdateOne({"_id": mid}, {"$set": update}))
+
+            if len(bulk) >= BATCH_SIZE:
+                col.bulk_write(bulk, ordered=False)
+                bulk.clear()
+
+            if n % 100 == 0 or n == 1:
+                print(f"  [{n:>5}/{total}] refetched={refetched}  "
+                      f"text={txt} html={html} still_empty={still_empty} "
+                      f"missing={missing} err={err}",
+                      flush=True)
+    finally:
+        cursor.close()
+        if bulk:
+            col.bulk_write(bulk, ordered=False)
+
+    print(f"  [{n}/{total}] DONE  refetched={refetched}  text={txt} html={html} "
+          f"still_empty={still_empty} missing={missing} err={err}")
+    return {"mailbox": mailbox, "candidates": total, "refetched": refetched,
+            "text": txt, "html": html, "still_empty": still_empty,
+            "errors": err, "missing": missing}
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)")
+    ap.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
+    args = ap.parse_args()
+
+    t0 = time.time()
+    print("Pripojuji se k MongoDB...")
+    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    mongo.admin.command("ping")
+    db = mongo[MONGO_DB]
+
+    print("Token Graph API...")
+    get_token()
+    print("OK\n")
+
+    if args.mailbox:
+        mailboxes = [args.mailbox]
+    else:
+        mailboxes = [c for c in db.list_collection_names() if c not in SKIP_COLLECTIONS]
+    print(f"Schranky ({len(mailboxes)}): {mailboxes}\n")
+
+    results = []
+    for mb in mailboxes:
+        results.append(process_mailbox(db[mb], mb, limit=args.limit))
+        print()
+
+    print("=== SHRNUTI ===")
+    for r in results:
+        print(f"  {r['mailbox']}: candidates={r['candidates']}  "
+              f"refetched={r['refetched']}  text={r['text']}  html={r['html']}  "
+              f"still_empty={r['still_empty']}  missing={r['missing']}  errors={r['errors']}")
+    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())
+    except KeyboardInterrupt:
+        print("\nPreruseno uzivatelem")
+    except Exception:
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
@@ -0,0 +1,47 @@
+# 3_download_attachments_v1.3.py
+
+**Krok 3 pipeline** — stahuje skutečné přílohy (`is_inline=False`) z Mongo emailů přes Graph API do `/mnt/Emails/<schránka>/Attachments/`. Deduplikace podle **SHA256** obsahu:
+- stejný hash → soubor už existuje → přeskočí
+- kolize názvu (stejný název, jiný hash) → `faktura_2.pdf`, `faktura_3.pdf` …
+
+Po uložení doplní do Mongo `file_hash` + `local_path` a aktualizuje kolekci `emaily.attachments_index` (`_id`=hash, filename, path, size, mime, mailbox, ref_count). Emaily kde mají všechny přílohy `file_hash` → skip → **bezpečné opakovat**.
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | **ne** | e-mail | (všechny) | Schránka = kolekce v Mongo. **Bez argumentu projede všechny** kolekce v `emaily` mimo `SKIP_MAILBOXES` a systémové (`attachments_index`, `sync_state`) |
+| `--limit N` | ne | int | 0 (bez limitu) | Zpracuje jen prvních N emailů **per schránka** (test) |
+| `--force-recheck` | ne | flag | false | Znovu ověří i už stažené přílohy |
+| `--no-indexes` | ne | flag | false | Nevytváří indexy na kolekci `attachments_index` |
+
+## SKIP_MAILBOXES (hardcoded)
+
+| Schránka | Důvod |
+|---|---|
+| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Při běhu bez `--mailbox` se tiše přeskočí. S explicitním `--mailbox vbuzalka@its.jnj.com` skript skončí exit kódem 2. |
+
+## Varianty volání
+
+```bash
+# VŠECHNY schránky (mimo SKIP_MAILBOXES):
+docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py
+
+# Jedna schránka interaktivně:
+docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz
+
+# Test 50 emailů:
+docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
+
+# Force-recheck (znovu ověří všechny):
+docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --force-recheck
+
+# Na pozadí, log do souboru:
+docker exec -d python-runner bash -c "python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz > /scripts/download_attachments.log 2>&1"
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/download_attachments.log
+```
@@ -0,0 +1,546 @@
+"""
+download_attachments_v1.3.py
+Nazev:  download_attachments_v1.3.py
+Verze:  1.3
+Datum:  2026-06-02
+Autor:  vladimir.buzalka
+
+Popis:
+    Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
+    pres Microsoft Graph API a uklada je do adresare
+    /mnt/Emails/<schránka>/Attachments/.
+
+    Schránka se predava volitelnym parametrem --mailbox; bez nej skript projde
+    vsechny schranky mimo SKIP_MAILBOXES.
+
+    Deduplikace podle SHA256 hashe obsahu:
+        - stejny hash = soubor uz existuje -> preskoci
+        - prvni vyskyt souboru: ulozi pod puvodnim nazvem
+        - kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
+
+    Po ulozeni aktualizuje MongoDB:
+        - v email dokumentu: kazda priloha dostane file_hash + local_path
+        - kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes,
+          mime_type, mailbox, first_seen_at, ref_count
+
+    Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash
+    se preskoci. --force-recheck znovu overi i uz stazene.
+
+    POZOR: Skript ze schranky pouze CTE — zadny zapis do schranky!
+
+Spousteni:
+    python download_attachments_v1.3.py                                       # VSECHNY schranky (mimo SKIP_MAILBOXES)
+    python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz       # jedna schranka
+    python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50
+    python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --force-recheck
+
+SKIP_MAILBOXES (hardcoded):
+    vbuzalka@its.jnj.com   — JNJ tenant, nemame Graph API pristup.
+
+Docker:
+    docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py
+
+Zavislosti:
+    msal, requests, pymongo
+    Python 3.10+
+
+Historie verzi:
+    1.0  2026-06-02  Inicialni verze
+    1.1  2026-06-02  Schránka jako parametr --mailbox
+    1.2  2026-06-02  Oprava: Graph attachment mapa vcetne inline; normalizace nazvu;
+                     preskoceni S/MIME; inline z Graphu -> SKIP ne ERR
+    1.3  2026-06-02  Primarni stazeni pres graph_att_id (prime ID bez name-matchingu);
+                     oprava $select na attachment listu (odstranen contentId ktery
+                     zpusoboval BadRequest a vracel prazdny seznam); name-matching
+                     zustava jako fallback pro stare emaily bez graph_att_id
+"""
+
+import sys
+import re
+import hashlib
+import logging
+import argparse
+import unicodedata
+from pathlib import Path
+from datetime import datetime, timezone
+from typing import Optional
+
+import msal
+import requests
+from pymongo import MongoClient, UpdateOne
+
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
+# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
+GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
+GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
+GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
+GRAPH_URL           = "https://graph.microsoft.com/v1.0"
+
+MONGO_URI           = "mongodb://192.168.1.76:27017"
+MONGO_DB            = "emaily"
+MONGO_COL_INDEX     = "attachments_index"
+
+EMAILS_BASE_DIR     = Path("/mnt/Emails")
+LOG_FILE            = Path(__file__).parent / "parse_emails_errors.log"
+SCRIPT_VERSION      = "1.3"
+BATCH_SIZE          = 50
+
+# Typy příloh které přeskočíme (S/MIME podpisy, certifikáty)
+SKIP_EXTENSIONS = {".p7m", ".p7s", ".p7c", ".p7b"}
+
+# Kolekce v `emaily` ktere NEJSOU mailboxy
+NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
+
+# Schranky kde NEMAME Graph API pristup — pri behu bez --mailbox se preskocia
+SKIP_MAILBOXES = {
+    "vbuzalka@its.jnj.com",   # JNJ tenant — nemame Graph credentials
+}
+# ──────────────────────────────────────────────────────────────────────────────
+
+logging.basicConfig(
+    filename=str(LOG_FILE),
+    level=logging.ERROR,
+    format="%(asctime)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    encoding="utf-8",
+)
+
+_graph_token: Optional[str] = None
+
+
+# ─── Graph API ────────────────────────────────────────────────────────────────
+
+def get_token() -> str:
+    global _graph_token
+    app = msal.ConfidentialClientApplication(
+        GRAPH_CLIENT_ID,
+        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
+        client_credential=GRAPH_CLIENT_SECRET,
+    )
+    result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+    if "access_token" not in result:
+        raise RuntimeError(f"Graph auth failed: {result}")
+    _graph_token = result["access_token"]
+    return _graph_token
+
+
+def graph_get_bytes(url: str) -> bytes:
+    global _graph_token
+    if not _graph_token:
+        get_token()
+    for attempt in range(2):
+        r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, timeout=120, stream=True)
+        if r.status_code == 401:
+            get_token()
+            continue
+        r.raise_for_status()
+        return r.content
+    raise RuntimeError(f"Graph GET bytes failed: {url}")
+
+
+def graph_get_json(url: str, params: dict = None) -> dict:
+    global _graph_token
+    if not _graph_token:
+        get_token()
+    for attempt in range(2):
+        r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30)
+        if r.status_code == 401:
+            get_token()
+            continue
+        r.raise_for_status()
+        return r.json()
+    raise RuntimeError(f"Graph GET json failed: {url}")
+
+
+def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
+    """Nacte metadata vsech priloh zpravy (bez contentBytes)."""
+    url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
+    try:
+        # Pozor: contentId NENI v base attachment type — nesmi byt v $select
+        data = graph_get_json(url, {"$select": "id,name,contentType,size,isInline"})
+        return data.get("value", [])
+    except Exception as e:
+        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e)
+        return []
+
+
+def fetch_attachment_content(mailbox: str, graph_message_id: str, attachment_id: str) -> Optional[bytes]:
+    url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments/{attachment_id}/$value"
+    try:
+        return graph_get_bytes(url)
+    except Exception as e:
+        logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s",
+                      graph_message_id, attachment_id, e)
+        return None
+
+
+# ─── Pomocné funkce ───────────────────────────────────────────────────────────
+
+def normalize_name(name: str) -> str:
+    """Normalizuje název pro porovnání — lowercase, bez diakritiky, jen alnum+._-"""
+    nfkd = unicodedata.normalize("NFKD", name.lower().strip())
+    ascii_str = "".join(c for c in nfkd if not unicodedata.combining(c))
+    return re.sub(r"[^\w.\-]", "_", ascii_str)
+
+
+def find_graph_att(att_name: str, att_size: int, graph_atts: list[dict]) -> Optional[dict]:
+    """Fallback: hleda prilohu v Graph listu podle jmena (pro emaily bez graph_att_id)."""
+    # 1. Presna shoda
+    for ga in graph_atts:
+        if ga["name"] == att_name:
+            return ga
+
+    norm_want = normalize_name(att_name)
+
+    # 2. Normalizovana shoda
+    for ga in graph_atts:
+        if normalize_name(ga["name"]) == norm_want:
+            return ga
+
+    # 3. Normalizovana shoda + velikost (±10 %)
+    for ga in graph_atts:
+        if normalize_name(ga["name"]) == norm_want:
+            ga_size = ga.get("size", 0)
+            if att_size == 0 or ga_size == 0 or abs(ga_size - att_size) / max(ga_size, att_size) < 0.1:
+                return ga
+
+    # 4. Castecna shoda sufixu (posledních 20 znaků normalizovaného jména)
+    for ga in graph_atts:
+        if norm_want[-20:] and normalize_name(ga["name"]).endswith(norm_want[-20:]):
+            return ga
+
+    return None
+
+
+def sha256(data: bytes) -> str:
+    return hashlib.sha256(data).hexdigest()
+
+
+def safe_filename(name: str) -> str:
+    safe = "".join(c if c.isalnum() or c in "._- ()" else "_" for c in name).strip()
+    return safe or "attachment"
+
+
+def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
+    existing = col_index.find_one({"filename": desired_name})
+    if existing:
+        if existing["_id"] == hash_val:
+            return desired_name
+        stem   = Path(desired_name).stem
+        suffix = Path(desired_name).suffix
+        n = 2
+        while True:
+            candidate = f"{stem}_{n}{suffix}"
+            ex2 = col_index.find_one({"filename": candidate})
+            if not ex2 or ex2["_id"] == hash_val:
+                if not (att_dir / candidate).exists() or (ex2 and ex2["_id"] == hash_val):
+                    return candidate
+            n += 1
+    return desired_name
+
+
+def save_attachment(
+    content: bytes,
+    original_name: str,
+    mime_type: str,
+    mailbox: str,
+    att_dir: Path,
+    col_index,
+) -> tuple[str, str, bool]:
+    hash_val = sha256(content)
+
+    existing = col_index.find_one({"_id": hash_val})
+    if existing:
+        col_index.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}})
+        return hash_val, existing["local_path"], False
+
+    filename  = resolve_filename(safe_filename(original_name), att_dir, hash_val, col_index)
+    file_path = att_dir / filename
+    file_path.write_bytes(content)
+
+    col_index.insert_one({
+        "_id":           hash_val,
+        "filename":      filename,
+        "local_path":    filename,
+        "size_bytes":    len(content),
+        "mime_type":     mime_type,
+        "mailbox":       mailbox,
+        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
+        "ref_count":     1,
+    })
+
+    return hash_val, filename, True
+
+
+# ─── MAIN ─────────────────────────────────────────────────────────────────────
+
+def process_mailbox(client, mailbox: str, args) -> dict:
+    """Zpracuje jednu schranku. Vraci statistiky."""
+    att_dir   = EMAILS_BASE_DIR / mailbox / "Attachments"
+    mongo_col = mailbox
+
+    start = datetime.now()
+    print(f"\n========== {mailbox} ==========")
+    print(f"Cilovy adresar: {att_dir}")
+
+    att_dir.mkdir(parents=True, exist_ok=True)
+
+    col_emails = client[MONGO_DB][mongo_col]
+    col_index  = client[MONGO_DB][MONGO_COL_INDEX]
+
+    if args.force_recheck:
+        query = {"has_attachments": True}
+    else:
+        query = {
+            "has_attachments": True,
+            "attachments": {
+                "$elemMatch": {
+                    "is_inline": False,
+                    "file_hash": {"$exists": False},
+                }
+            }
+        }
+
+    total = col_emails.count_documents(query)
+    print(f"Emailu ke zpracovani: {total}")
+    if total == 0:
+        print("  Neni co stahnout.")
+        return {"mailbox": mailbox, "ok": 0, "new": 0, "dup": 0, "skip": 0, "err": 0,
+                "elapsed": 0.0}
+
+    cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
+    if args.limit:
+        cursor = cursor.limit(args.limit)
+
+    ok_count   = 0
+    new_count  = 0
+    dup_count  = 0
+    skip_count = 0
+    err_count  = 0
+    email_i    = 0
+    batch      = []
+
+    def flush():
+        if not batch:
+            return
+        try:
+            col_emails.bulk_write(batch, ordered=False)
+        except Exception as e:
+            logging.error("bulk_write: %s", e)
+            print(f"  CHYBA bulk_write: {e}")
+        batch.clear()
+
+    for email_doc in cursor:
+        email_i  += 1
+        email_id  = email_doc["_id"]
+        graph_id  = email_doc.get("graph_id", "")
+        subject   = (email_doc.get("subject") or "")[:60]
+        att_list  = email_doc.get("attachments") or []
+
+        real_atts = [a for a in att_list if not a.get("is_inline", False)]
+        if not real_atts:
+            continue
+
+        print(f"\n  {email_i:>5}/{total}  {subject}")
+
+        # Nacti attachment list z Graphu jen pokud nektere prilohy nemaji graph_att_id
+        need_listing = any(
+            not a.get("is_inline", False)
+            and not (not args.force_recheck and a.get("file_hash"))
+            and not a.get("graph_att_id")
+            for a in att_list
+        )
+        graph_atts = fetch_message_attachments(mailbox, graph_id) if need_listing else []
+
+        updated_atts = list(att_list)
+        email_ok     = True
+
+        for i, att in enumerate(updated_atts):
+            if att.get("is_inline", False):
+                continue
+            if not args.force_recheck and att.get("file_hash"):
+                continue
+
+            att_name     = att.get("filename", "")
+            att_size     = att.get("size_bytes", 0)
+            graph_att_id = att.get("graph_att_id")
+
+            # Preskoc S/MIME podpisy
+            if Path(att_name).suffix.lower() in SKIP_EXTENSIONS:
+                updated_atts[i] = {**att, "file_hash": "skip", "local_path": ""}
+                skip_count += 1
+                print(f"         SKIP  {att_name} (S/MIME)")
+                continue
+
+            # Primy pristup pres graph_att_id (emaily parsovane v1.2+)
+            if graph_att_id:
+                content = fetch_attachment_content(mailbox, graph_id, graph_att_id)
+                if content is None:
+                    err_count += 1
+                    email_ok = False
+                    print(f"         ERR   {att_name} (stazeni selhalo)")
+                    continue
+                # Zkontroluj zda jde skutecne o inline (pro edge case)
+                mime_type = att.get("mime_type", "")
+            else:
+                # Fallback: name matching pro stare emaily (parsovane pred v1.2)
+                graph_att = find_graph_att(att_name, att_size, graph_atts)
+
+                if not graph_att:
+                    logging.error("attachment not found [email=%s att=%s]", email_id, att_name)
+                    print(f"         ERR   {att_name} (nenalezeno)")
+                    err_count += 1
+                    email_ok = False
+                    continue
+
+                # Pokud Graph rika ze je inline — preskoc
+                if graph_att.get("isInline", False):
+                    updated_atts[i] = {**att, "is_inline": True, "file_hash": "skip", "local_path": ""}
+                    skip_count += 1
+                    print(f"         SKIP  {att_name} (inline obrazek)")
+                    continue
+
+                content = fetch_attachment_content(mailbox, graph_id, graph_att["id"])
+                if content is None:
+                    err_count += 1
+                    email_ok = False
+                    print(f"         ERR   {att_name} (stazeni selhalo)")
+                    continue
+
+                mime_type = att.get("mime_type") or graph_att.get("contentType", "")
+
+            hash_val, local_path, was_new = save_attachment(
+                content, att_name, mime_type, mailbox, att_dir, col_index
+            )
+
+            updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}
+
+            if was_new:
+                new_count += 1
+                print(f"         NEW   {local_path}  ({len(content):,} B)")
+            else:
+                dup_count += 1
+                print(f"         DUP   {att_name} -> {local_path}")
+
+        if email_ok:
+            ok_count += 1
+
+        batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))
+
+        if len(batch) >= BATCH_SIZE:
+            flush()
+
+        if email_i % 100 == 0:
+            elapsed = (datetime.now() - start).total_seconds()
+            print(f"  {'─'*60}")
+            print(f"  Průběh: emaily={email_i}/{total}  nove={new_count}  dup={dup_count}  skip={skip_count}  err={err_count}")
+            print(f"  {'─'*60}")
+
+    flush()
+
+    elapsed = (datetime.now() - start).total_seconds()
+    print(f"  -> mailbox total: emaily={ok_count} nove={new_count} dup={dup_count} "
+          f"skip={skip_count} err={err_count} ({elapsed:.1f} s)")
+    return {"mailbox": mailbox, "ok": ok_count, "new": new_count, "dup": dup_count,
+            "skip": skip_count, "err": err_count, "elapsed": elapsed}
+
+
+def discover_mailboxes(db) -> list[str]:
+    """Vrati seznam mailboxu = vsechny kolekce mimo NON_MAILBOX a SKIP_MAILBOXES."""
+    out = []
+    for name in sorted(db.list_collection_names()):
+        if name in NON_MAILBOX_COLLECTIONS:
+            continue
+        if name in SKIP_MAILBOXES:
+            print(f"  [skip] {name} — v SKIP_MAILBOXES (neni Graph pristup)")
+            continue
+        out.append(name)
+    return out
+
+
+def main():
+    ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
+    ap.add_argument("--mailbox",       default="",
+                    help="Emailova schranka. Bez argumentu projede vsechny schranky "
+                         "v `emaily` mimo SKIP_MAILBOXES.")
+    ap.add_argument("--limit",         type=int, default=0,
+                    help="Zpracovat max N emailu (0 = vse) — per schranka")
+    ap.add_argument("--force-recheck", action="store_true",
+                    help="Znovu overi i emaily kde prilohy uz maji file_hash")
+    ap.add_argument("--no-indexes",    action="store_true",
+                    help="Nevytvorit indexy na attachments_index kolekci")
+    args = ap.parse_args()
+
+    start_all = datetime.now()
+    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
+    print(f"Start: {start_all.strftime('%Y-%m-%d %H:%M:%S')}")
+
+    print("\nPřipojuji se k Graph API...")
+    try:
+        get_token()
+        print("  Graph API OK")
+    except Exception as e:
+        print(f"  CHYBA: {e}")
+        sys.exit(1)
+
+    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    try:
+        client.admin.command("ping")
+        print("  MongoDB OK")
+    except Exception as e:
+        print(f"  CHYBA: MongoDB neni dostupna -- {e}")
+        sys.exit(1)
+
+    col_index = client[MONGO_DB][MONGO_COL_INDEX]
+    if not args.no_indexes:
+        col_index.create_index("filename")
+        col_index.create_index("mime_type")
+        col_index.create_index("mailbox")
+
+    db = client[MONGO_DB]
+    if args.mailbox:
+        if args.mailbox in SKIP_MAILBOXES:
+            print(f"  CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.")
+            sys.exit(2)
+        mailboxes = [args.mailbox]
+    else:
+        mailboxes = discover_mailboxes(db)
+        print(f"  Schranky ke zpracovani: {len(mailboxes)}")
+        for m in mailboxes:
+            print(f"    {m}")
+
+    results = []
+    for mb in mailboxes:
+        try:
+            results.append(process_mailbox(client, mb, args))
+        except Exception as e:
+            logging.error("process_mailbox %s: %s", mb, e)
+            print(f"  FATAL pri zpracovani {mb}: {e}")
+            results.append({"mailbox": mb, "ok": 0, "new": 0, "dup": 0,
+                            "skip": 0, "err": 1, "elapsed": 0.0})
+
+    elapsed_total = (datetime.now() - start_all).total_seconds()
+    files_total   = col_index.count_documents({})
+    size_total    = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))
+
+    grand = {k: sum(r[k] for r in results) for k in ("ok", "new", "dup", "skip", "err")}
+
+    print(f"\n{'='*60}")
+    print("=== SHRNUTI ===")
+    for r in results:
+        print(f"  {r['mailbox']:40} ok={r['ok']:>5} nove={r['new']:>4} "
+              f"dup={r['dup']:>4} skip={r['skip']:>3} err={r['err']:>3}")
+    print(f"  {'TOTAL':40} ok={grand['ok']:>5} nove={grand['new']:>4} "
+          f"dup={grand['dup']:>4} skip={grand['skip']:>3} err={grand['err']:>3}")
+    print(f"Souboru v indexu: {files_total}  ({size_total / 1024 / 1024:.1f} MB)")
+    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
+    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    if grand['err']:
+        print(f"Chyby logovany do: {LOG_FILE}")
+
+    client.close()
+
+
+# Script entry point. main() has no return statement, so the exit code is 0
+# unless main() itself calls sys.exit().
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,74 @@
+# 3_download_attachments_v1.4.py
+
+**Krok 3 pipeline** — stahuje skutečné přílohy (`is_inline=False`) z Mongo emailů přes Graph API do `/mnt/Emails/<schránka>/Attachments/`. Deduplikace podle **SHA256** obsahu.
+
+## Nové ve verzi 1.4
+
+| Typ přílohy | `@odata.type` | Co skript dělá |
+|---|---|---|
+| **File** | `#microsoft.graph.fileAttachment` | Stáhne přes `/$value`, uloží binárku |
+| **Item** (vnořený email) | `#microsoft.graph.itemAttachment` | `$expand=...itemAttachment/item`, sestaví **`.eml`** z hlaviček a body vnitřní zprávy |
+| **Reference** (OneDrive/SharePoint link) | `#microsoft.graph.referenceAttachment` | Žádný file — uloží jen `reference_url` do Mongo |
+
+Plus:
+- **Retry** s exponenciálním backoffem na 429/500/502/503/504 (1s, 2s, 4s; respektuje `Retry-After`).
+- **Permanentní označení chyb v Mongo** per-attachment:
+  - `attachment_missing: True` + `attachment_missing_at: <UTC>` při 404 (email/příloha už neexistuje v mailboxu)
+  - `attachment_reference: True` + `reference_url: <URL>` u referenceAttachment
+- Tagované přílohy se při dalším běhu **automaticky přeskočí** (bez `--force-recheck`).
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | ne | e-mail | (všechny) | Schránka = kolekce v Mongo. Bez argumentu projede všechny kolekce mimo `NON_MAILBOX_COLLECTIONS` a `SKIP_MAILBOXES` |
+| `--limit N` | ne | int | 0 | Per schránka, jen prvních N emailů (test) |
+| `--force-recheck` | ne | flag | false | Znovu ověří i emaily kde přílohy mají `file_hash` **nebo** `attachment_missing` **nebo** `attachment_reference` |
+| `--no-indexes` | ne | flag | false | Nevytváří indexy na `attachments_index` |
+
+## SKIP_MAILBOXES (hardcoded)
+
+| Schránka | Důvod |
+|---|---|
+| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Bez `--mailbox` se tiše přeskočí. S explicitním `--mailbox vbuzalka@its.jnj.com` skript skončí exit kódem 2. |
+
+## Statistiky per schránka
+
+```
+ok=N nove=N dup=N skip=N miss=N ref=N err=N
+```
+
+| Kategorie | Význam |
+|---|---|
+| `ok` | emaily zpracované bez chyby (všechny přílohy hotové) |
+| `nove` | nové soubory uložené (NEW + NEW(eml)) |
+| `dup` | hash už existuje (jen ref_count++) |
+| `skip` | S/MIME (.p7m/.p7s/...) nebo inline obrázek |
+| `miss` | 404 — označeno `attachment_missing` (nepokračuje se) |
+| `ref` | referenceAttachment — uložen jen URL |
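+| `err` | příloha se nepodařila zpracovat — tranzientní chyba (5xx, timeout) po vyčerpání retry, nebo příloha nebyla nalezena v Graph listingu; zkusí se znovu při dalším běhu, detail je v `parse_emails_errors.log` vedle skriptu |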
+
+## Varianty volání
+
+```bash
+# Všechny schránky (mimo SKIP_MAILBOXES):
+docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py
+
+# Jedna schránka:
+docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz
+
+# Test 50 emailů:
+docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
+
+# Force-recheck (i missing/reference přepíše):
+docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --force-recheck
+
+# Na pozadí:
+docker exec -d python-runner bash -c "python /scripts/3_download_attachments_v1.4.py > /scripts/download_attachments.log 2>&1"
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/download_attachments.log
+```
@@ -0,0 +1,713 @@
+"""
+download_attachments_v1.4.py
+Nazev:  download_attachments_v1.4.py
+Verze:  1.4
+Datum:  2026-06-04
+Autor:  vladimir.buzalka
+
+Popis:
+    Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
+    pres Microsoft Graph API a uklada je do adresare
+    /mnt/Emails/<schranka>/Attachments/.
+
+    Bez argumentu --mailbox projede vsechny kolekce v `emaily` mimo
+    NON_MAILBOX_COLLECTIONS a SKIP_MAILBOXES.
+
+    Deduplikace podle SHA256 hashe obsahu:
+        - stejny hash = soubor uz existuje -> preskoci
+        - prvni vyskyt: ulozi pod puvodnim nazvem
+        - kolize nazvu: faktura_2.pdf, faktura_3.pdf ...
+
+    Po ulozeni aktualizuje MongoDB:
+        - v email dokumentu: kazda priloha dostane file_hash + local_path
+        - kolekce emaily.attachments_index: _id=hash, filename, ...
+
+    NOVE v 1.4:
+        - Spravne zpracovani vsech typu priloh:
+          * fileAttachment   -> /$value (jako predtim)
+          * itemAttachment   -> ?$expand=microsoft.graph.itemAttachment/item
+                                -> sestavi .eml z vnitrni zpravy
+          * referenceAttachment -> ulozi jen URL, neexistuje content
+        - Retry s exponencialnim backoffem (1s, 2s, 4s) na 429/5xx
+        - Permanentni tagging chyb v Mongo per-attachment:
+          * attachment_missing: True       (404, email/att uz neexistuje)
+          * attachment_reference: True     (referenceAttachment, jen URL)
+          * reference_url, attachment_type — diagnosticke metadata
+        - Tagovane prilohy se pri dalsim behu preskoci (bez --force-recheck)
+
+    POZOR: Skript ze schranky pouze CTE — zadny zapis do schranky!
+
+Spousteni:
+    python download_attachments_v1.4.py
+    python download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz
+    python download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50
+    python download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --force-recheck
+
+SKIP_MAILBOXES (hardcoded):
+    vbuzalka@its.jnj.com   — JNJ tenant, nemame Graph API pristup.
+
+Docker:
+    docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py
+
+Zavislosti:
+    msal, requests, pymongo
+    Python 3.10+
+
+Historie verzi:
+    1.0  2026-06-02  Inicialni verze
+    1.1  2026-06-02  Schranka jako parametr --mailbox
+    1.2  2026-06-02  Oprava: Graph attachment mapa vcetne inline; normalizace nazvu
+    1.3  2026-06-02  Primarni stazeni pres graph_att_id; --mailbox volitelny
+    1.4  2026-06-04  itemAttachment/referenceAttachment handling; retry s backoffem;
+                     permanentni tagging chyb (attachment_missing / attachment_reference)
+"""
+
+import sys
+import re
+import time
+import json
+import hashlib
+import logging
+import argparse
+import unicodedata
+from pathlib import Path
+from datetime import datetime, timezone
+from typing import Optional
+
+import msal
+import requests
+from pymongo import MongoClient, UpdateOne
+
+# Force UTF-8 on stdout (replacing unencodable characters) so the progress
+# output containing Czech text and box-drawing characters cannot raise
+# UnicodeEncodeError. Skipped when the stream object has no reconfigure().
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
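+# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
+# SECURITY(review): GRAPH_CLIENT_SECRET below is a client secret committed in
+# plain text. Rotate it and load it from the environment or a secret store
+# instead of keeping it in the source file.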
+GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
+GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
+GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
+GRAPH_URL           = "https://graph.microsoft.com/v1.0"
+
+# MongoDB connection: database that holds one collection per mailbox plus the
+# shared attachment index collection.
+MONGO_URI           = "mongodb://192.168.1.76:27017"
+MONGO_DB            = "emaily"
+MONGO_COL_INDEX     = "attachments_index"
+
+# Attachments are written to EMAILS_BASE_DIR/<mailbox>/Attachments/.
+EMAILS_BASE_DIR     = Path("/mnt/Emails")
+# Error log written next to this script.
+# NOTE(review): the file name says "parse_emails" although this is the
+# attachment downloader — confirm the shared log file is intentional.
+LOG_FILE            = Path(__file__).parent / "parse_emails_errors.log"
+SCRIPT_VERSION      = "1.4"
+# Number of per-email Mongo updates collected before one bulk_write.
+BATCH_SIZE          = 50
+
+# Attachment extensions that are skipped (S/MIME signatures, certificates)
+SKIP_EXTENSIONS = {".p7m", ".p7s", ".p7c", ".p7b"}
+
+# Collections in `emaily` that are NOT mailboxes
+NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
+
+# Mailboxes for which we do NOT have Graph API access
+SKIP_MAILBOXES = {
+    "vbuzalka@its.jnj.com",
+}
+
+# Retry configuration for transient errors
+RETRY_STATUSES   = {429, 500, 502, 503, 504}
+RETRY_BACKOFF_S  = [1, 2, 4]  # sleep before each retry: 3 retries, i.e. up to 4 attempts
+
+# Sentinel values returned by fetch_attachment_smart
+FETCH_MISSING    = "__MISSING__"     # HTTP 404
+FETCH_REFERENCE  = "__REFERENCE__"   # referenceAttachment
+# ──────────────────────────────────────────────────────────────────────────────
+
+# Only errors are logged to the file; progress information goes to stdout.
+logging.basicConfig(
+    filename=str(LOG_FILE),
+    level=logging.ERROR,
+    format="%(asctime)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    encoding="utf-8",
+)
+
+# Module-wide cache of the current Graph bearer token (set by get_token()).
+_graph_token: Optional[str] = None
+
+
+# ─── Graph API ────────────────────────────────────────────────────────────────
+
+def get_token() -> str:
+    global _graph_token
+    app = msal.ConfidentialClientApplication(
+        GRAPH_CLIENT_ID,
+        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
+        client_credential=GRAPH_CLIENT_SECRET,
+    )
+    result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+    if "access_token" not in result:
+        raise RuntimeError(f"Graph auth failed: {result}")
+    _graph_token = result["access_token"]
+    return _graph_token
+
+
+def _graph_request(method: str, url: str, *, params: dict = None,
+                   stream: bool = False, timeout: int = 60):
+    """Nizko-urovnova HTTP volani s retry na 429/5xx a auto-reauth na 401.
+    Vraci requests.Response (pro stream=True pred .content); pro 404 vraci Response."""
+    global _graph_token
+    if not _graph_token:
+        get_token()
+
+    last_exc = None
+    for attempt in range(len(RETRY_BACKOFF_S) + 1):
+        try:
+            r = requests.request(
+                method, url,
+                headers={"Authorization": f"Bearer {_graph_token}"},
+                params=params, timeout=timeout, stream=stream,
+            )
+            if r.status_code == 401:
+                get_token()
+                continue
+            if r.status_code in RETRY_STATUSES and attempt < len(RETRY_BACKOFF_S):
+                # Retry-After hlavicka ma prednost
+                ra = r.headers.get("Retry-After")
+                sleep_s = float(ra) if ra and ra.replace(".", "").isdigit() else RETRY_BACKOFF_S[attempt]
+                time.sleep(sleep_s)
+                continue
+            return r
+        except (requests.ConnectionError, requests.Timeout) as e:
+            last_exc = e
+            if attempt < len(RETRY_BACKOFF_S):
+                time.sleep(RETRY_BACKOFF_S[attempt])
+                continue
+            raise
+    raise RuntimeError(f"Graph request exhausted retries: {url} (last_exc={last_exc})")
+
+
+def graph_get_json(url: str, params: dict = None) -> dict:
+    r = _graph_request("GET", url, params=params, timeout=30)
+    r.raise_for_status()
+    return r.json()
+
+
+def graph_get_bytes(url: str) -> bytes:
+    r = _graph_request("GET", url, stream=True, timeout=120)
+    r.raise_for_status()
+    return r.content
+
+
+def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
+    """Nacte metadata vsech priloh zpravy. Vraci i @odata.type."""
+    url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
+    try:
+        # @odata.type se vraci automaticky (neni v base $select)
+        data = graph_get_json(url, {"$select": "id,name,contentType,size,isInline"})
+        return data.get("value", [])
+    except Exception as e:
+        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e)
+        return []
+
+
+def _build_eml_from_item(item: dict) -> bytes:
+    """Sestavi minimalni RFC822 .eml z itemAttachment.item (message)."""
+    def hdr(name, val):
+        return f"{name}: {val}\r\n" if val else ""
+
+    def addrs(field):
+        rec = item.get(field) or []
+        out = []
+        for r in rec:
+            ea = r.get("emailAddress") or {}
+            name = ea.get("name", "")
+            addr = ea.get("address", "")
+            if name and addr:
+                out.append(f'"{name}" <{addr}>')
+            elif addr:
+                out.append(addr)
+        return ", ".join(out)
+
+    subj = item.get("subject", "")
+    sender = item.get("from") or item.get("sender") or {}
+    sender_ea = sender.get("emailAddress") or {}
+    from_str = (f'"{sender_ea.get("name","")}" <{sender_ea.get("address","")}>'
+                if sender_ea.get("address") else "")
+    sent = item.get("sentDateTime") or item.get("receivedDateTime") or ""
+
+    body = item.get("body") or {}
+    content_type = body.get("contentType", "text")  # 'text' | 'html'
+    body_content = body.get("content", "") or ""
+
+    mime_type = "text/html" if content_type.lower() == "html" else "text/plain"
+
+    headers = (
+        hdr("From", from_str)
+        + hdr("To", addrs("toRecipients"))
+        + hdr("Cc", addrs("ccRecipients"))
+        + hdr("Subject", subj)
+        + hdr("Date", sent)
+        + f"Content-Type: {mime_type}; charset=utf-8\r\n"
+        + "MIME-Version: 1.0\r\n"
+        + "\r\n"
+    )
+    return (headers + body_content).encode("utf-8", errors="replace")
+
+
+def fetch_attachment_smart(mailbox: str, graph_message_id: str,
+                           attachment_id: str, odata_type: str = "") -> tuple:
+    """Smart fetch: rozezna typ prilohy a vrati (content_bytes, type_str, extra).
+    type_str: 'file' | 'item' | 'reference' | FETCH_MISSING | FETCH_REFERENCE
+    extra: pri 'reference' = sourceUrl; pri 'item' = puvodni subject (info)
+    Vraci (None, FETCH_MISSING, None) pri 404.
+    Vyhazuje exception pri jinych failures po vycerpani retry.
+    """
+    base = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments/{attachment_id}"
+
+    # Zname typ → optimalni cesta
+    if odata_type == "#microsoft.graph.fileAttachment":
+        r = _graph_request("GET", base + "/$value", stream=True, timeout=120)
+        if r.status_code == 404:
+            return (None, FETCH_MISSING, None)
+        r.raise_for_status()
+        return (r.content, "file", None)
+
+    if odata_type == "#microsoft.graph.itemAttachment":
+        r = _graph_request("GET", base,
+                           params={"$expand": "microsoft.graph.itemAttachment/item"},
+                           timeout=60)
+        if r.status_code == 404:
+            return (None, FETCH_MISSING, None)
+        r.raise_for_status()
+        obj = r.json()
+        item = obj.get("item") or {}
+        return (_build_eml_from_item(item), "item", item.get("subject"))
+
+    if odata_type == "#microsoft.graph.referenceAttachment":
+        r = _graph_request("GET", base, timeout=30)
+        if r.status_code == 404:
+            return (None, FETCH_MISSING, None)
+        r.raise_for_status()
+        obj = r.json()
+        return (None, FETCH_REFERENCE, obj.get("sourceUrl") or obj.get("name"))
+
+    # Neznamy typ — zkus $value, pri 405 detekuj typ a rekurzivne zpracuj
+    r = _graph_request("GET", base + "/$value", stream=True, timeout=120)
+    if r.status_code == 404:
+        return (None, FETCH_MISSING, None)
+    if r.status_code == 405:
+        # Method Not Allowed -> neni fileAttachment; zjisti typ
+        r2 = _graph_request("GET", base, timeout=30)
+        if r2.status_code == 404:
+            return (None, FETCH_MISSING, None)
+        r2.raise_for_status()
+        obj = r2.json()
+        ot = obj.get("@odata.type", "")
+        if ot == "#microsoft.graph.itemAttachment":
+            # objekt nema item bez expand → druhy request
+            return fetch_attachment_smart(mailbox, graph_message_id, attachment_id, ot)
+        if ot == "#microsoft.graph.referenceAttachment":
+            return (None, FETCH_REFERENCE, obj.get("sourceUrl") or obj.get("name"))
+        # fallback: fileAttachment ale jeho contentBytes je v JSON
+        if ot == "#microsoft.graph.fileAttachment":
+            import base64
+            cb = obj.get("contentBytes")
+            if cb:
+                return (base64.b64decode(cb), "file", None)
+        raise RuntimeError(f"unknown attachment odata.type={ot}")
+    r.raise_for_status()
+    return (r.content, "file", None)
+
+
+# ─── Pomocne funkce ───────────────────────────────────────────────────────────
+
+def normalize_name(name: str) -> str:
+    nfkd = unicodedata.normalize("NFKD", name.lower().strip())
+    ascii_str = "".join(c for c in nfkd if not unicodedata.combining(c))
+    return re.sub(r"[^\w.\-]", "_", ascii_str)
+
+
+def find_graph_att(att_name: str, att_size: int, graph_atts: list[dict]) -> Optional[dict]:
+    for ga in graph_atts:
+        if ga["name"] == att_name:
+            return ga
+    norm_want = normalize_name(att_name)
+    for ga in graph_atts:
+        if normalize_name(ga["name"]) == norm_want:
+            return ga
+    for ga in graph_atts:
+        if normalize_name(ga["name"]) == norm_want:
+            ga_size = ga.get("size", 0)
+            if att_size == 0 or ga_size == 0 or abs(ga_size - att_size) / max(ga_size, att_size) < 0.1:
+                return ga
+    for ga in graph_atts:
+        if norm_want[-20:] and normalize_name(ga["name"]).endswith(norm_want[-20:]):
+            return ga
+    return None
+
+
+def sha256(data: bytes) -> str:
+    return hashlib.sha256(data).hexdigest()
+
+
+def safe_filename(name: str) -> str:
+    safe = "".join(c if c.isalnum() or c in "._- ()" else "_" for c in name).strip()
+    return safe or "attachment"
+
+
+def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
+    existing = col_index.find_one({"filename": desired_name})
+    if existing:
+        if existing["_id"] == hash_val:
+            return desired_name
+        stem   = Path(desired_name).stem
+        suffix = Path(desired_name).suffix
+        n = 2
+        while True:
+            candidate = f"{stem}_{n}{suffix}"
+            ex2 = col_index.find_one({"filename": candidate})
+            if not ex2 or ex2["_id"] == hash_val:
+                if not (att_dir / candidate).exists() or (ex2 and ex2["_id"] == hash_val):
+                    return candidate
+            n += 1
+    return desired_name
+
+
+def save_attachment(content: bytes, original_name: str, mime_type: str,
+                    mailbox: str, att_dir: Path, col_index) -> tuple[str, str, bool]:
+    hash_val = sha256(content)
+    existing = col_index.find_one({"_id": hash_val})
+    if existing:
+        col_index.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}})
+        return hash_val, existing["local_path"], False
+
+    filename  = resolve_filename(safe_filename(original_name), att_dir, hash_val, col_index)
+    file_path = att_dir / filename
+    file_path.write_bytes(content)
+
+    col_index.insert_one({
+        "_id":           hash_val,
+        "filename":      filename,
+        "local_path":    filename,
+        "size_bytes":    len(content),
+        "mime_type":     mime_type,
+        "mailbox":       mailbox,
+        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
+        "ref_count":     1,
+    })
+    return hash_val, filename, True
+
+
+# ─── MAIN ─────────────────────────────────────────────────────────────────────
+
+def process_mailbox(client, mailbox: str, args) -> dict:
+    att_dir   = EMAILS_BASE_DIR / mailbox / "Attachments"
+    mongo_col = mailbox
+
+    start = datetime.now()
+    print(f"\n========== {mailbox} ==========")
+    print(f"Cilovy adresar: {att_dir}")
+
+    att_dir.mkdir(parents=True, exist_ok=True)
+
+    col_emails = client[MONGO_DB][mongo_col]
+    col_index  = client[MONGO_DB][MONGO_COL_INDEX]
+
+    if args.force_recheck:
+        query = {"has_attachments": True}
+    else:
+        # priloha "ke zpracovani" = neni inline, nema file_hash, neni oznacena
+        # jako missing/reference
+        query = {
+            "has_attachments": True,
+            "attachments": {
+                "$elemMatch": {
+                    "is_inline": False,
+                    "file_hash": {"$exists": False},
+                    "attachment_missing": {"$ne": True},
+                    "attachment_reference": {"$ne": True},
+                }
+            }
+        }
+
+    total = col_emails.count_documents(query)
+    print(f"Emailu ke zpracovani: {total}")
+    if total == 0:
+        print("  Neni co stahnout.")
+        return {"mailbox": mailbox, "ok": 0, "new": 0, "dup": 0, "skip": 0,
+                "miss": 0, "ref": 0, "err": 0, "elapsed": 0.0}
+
+    cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
+    if args.limit:
+        cursor = cursor.limit(args.limit)
+
+    ok_count   = 0
+    new_count  = 0
+    dup_count  = 0
+    skip_count = 0
+    miss_count = 0
+    ref_count  = 0
+    err_count  = 0
+    email_i    = 0
+    batch      = []
+
+    def flush():
+        if not batch:
+            return
+        try:
+            col_emails.bulk_write(batch, ordered=False)
+        except Exception as e:
+            logging.error("bulk_write: %s", e)
+            print(f"  CHYBA bulk_write: {e}")
+        batch.clear()
+
+    for email_doc in cursor:
+        email_i  += 1
+        email_id  = email_doc["_id"]
+        graph_id  = email_doc.get("graph_id", "")
+        subject   = (email_doc.get("subject") or "")[:60]
+        att_list  = email_doc.get("attachments") or []
+
+        real_atts = [a for a in att_list if not a.get("is_inline", False)
+                     and not a.get("attachment_missing")
+                     and not a.get("attachment_reference")]
+        if not real_atts:
+            continue
+
+        print(f"\n  {email_i:>5}/{total}  {subject}")
+
+        need_listing = any(
+            not a.get("is_inline", False)
+            and not (not args.force_recheck and a.get("file_hash"))
+            and not a.get("graph_att_id")
+            for a in att_list
+        )
+        graph_atts = fetch_message_attachments(mailbox, graph_id) if need_listing else []
+
+        # mapa graph_att_id -> @odata.type (z listingu pokud byl)
+        type_map = {ga["id"]: ga.get("@odata.type", "") for ga in graph_atts}
+
+        updated_atts = list(att_list)
+        email_ok     = True
+
+        for i, att in enumerate(updated_atts):
+            if att.get("is_inline", False):
+                continue
+            if att.get("attachment_missing") or att.get("attachment_reference"):
+                continue
+            if not args.force_recheck and att.get("file_hash"):
+                continue
+
+            att_name     = att.get("filename", "")
+            att_size     = att.get("size_bytes", 0)
+            graph_att_id = att.get("graph_att_id")
+
+            if Path(att_name).suffix.lower() in SKIP_EXTENSIONS:
+                updated_atts[i] = {**att, "file_hash": "skip", "local_path": ""}
+                skip_count += 1
+                print(f"         SKIP  {att_name} (S/MIME)")
+                continue
+
+            # Resolve graph_att_id + odata_type
+            resolved_id = graph_att_id
+            odata_type  = type_map.get(graph_att_id, "") if graph_att_id else ""
+
+            if not resolved_id:
+                # Fallback: name matching (legacy)
+                graph_att = find_graph_att(att_name, att_size, graph_atts)
+                if not graph_att:
+                    logging.error("attachment not found [email=%s att=%s]", email_id, att_name)
+                    print(f"         ERR   {att_name} (nenalezeno)")
+                    err_count += 1
+                    email_ok = False
+                    continue
+                if graph_att.get("isInline", False):
+                    updated_atts[i] = {**att, "is_inline": True, "file_hash": "skip", "local_path": ""}
+                    skip_count += 1
+                    print(f"         SKIP  {att_name} (inline obrazek)")
+                    continue
+                resolved_id = graph_att["id"]
+                odata_type  = graph_att.get("@odata.type", "")
+
+            # Smart fetch
+            try:
+                content, kind, extra = fetch_attachment_smart(
+                    mailbox, graph_id, resolved_id, odata_type
+                )
+            except Exception as e:
+                logging.error("fetch_attachment_smart failed [msg=%s att=%s type=%s]: %s",
+                              graph_id, resolved_id, odata_type, e)
+                err_count += 1
+                email_ok = False
+                print(f"         ERR   {att_name} (stazeni selhalo)")
+                continue
+
+            now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
+
+            if kind == FETCH_MISSING:
+                updated_atts[i] = {
+                    **att,
+                    "attachment_missing": True,
+                    "attachment_missing_at": now_utc,
+                }
+                miss_count += 1
+                print(f"         MISS  {att_name} (404 — oznaceno jako missing)")
+                continue
+
+            if kind == FETCH_REFERENCE:
+                updated_atts[i] = {
+                    **att,
+                    "attachment_reference": True,
+                    "attachment_type": "reference",
+                    "reference_url": extra,
+                }
+                ref_count += 1
+                print(f"         REF   {att_name} -> {extra}")
+                continue
+
+            # kind in ('file', 'item') — mame bytes
+            mime_type = att.get("mime_type") or (
+                "message/rfc822" if kind == "item" else "application/octet-stream"
+            )
+
+            # Pro itemAttachment vyrobime .eml priponu pokud chybi
+            save_name = att_name
+            if kind == "item" and not save_name.lower().endswith(".eml"):
+                save_name = (save_name or "embedded_email") + ".eml"
+
+            hash_val, local_path, was_new = save_attachment(
+                content, save_name, mime_type, mailbox, att_dir, col_index
+            )
+
+            updated_atts[i] = {
+                **att,
+                "file_hash":       hash_val,
+                "local_path":      local_path,
+                "attachment_type": kind,
+            }
+
+            if was_new:
+                new_count += 1
+                tag = "NEW(eml)" if kind == "item" else "NEW"
+                print(f"         {tag}   {local_path}  ({len(content):,} B)")
+            else:
+                dup_count += 1
+                print(f"         DUP   {att_name} -> {local_path}")
+
+        if email_ok:
+            ok_count += 1
+
+        batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))
+
+        if len(batch) >= BATCH_SIZE:
+            flush()
+
+        if email_i % 100 == 0:
+            elapsed = (datetime.now() - start).total_seconds()
+            print(f"  {'─'*60}")
+            print(f"  Průběh: emaily={email_i}/{total}  nove={new_count}  dup={dup_count}  "
+                  f"skip={skip_count} miss={miss_count} ref={ref_count} err={err_count}")
+            print(f"  {'─'*60}")
+
+    flush()
+
+    elapsed = (datetime.now() - start).total_seconds()
+    print(f"  -> mailbox total: emaily={ok_count} nove={new_count} dup={dup_count} "
+          f"skip={skip_count} miss={miss_count} ref={ref_count} err={err_count} ({elapsed:.1f} s)")
+    return {"mailbox": mailbox, "ok": ok_count, "new": new_count, "dup": dup_count,
+            "skip": skip_count, "miss": miss_count, "ref": ref_count, "err": err_count,
+            "elapsed": elapsed}
+
+
+def discover_mailboxes(db) -> list[str]:
+    out = []
+    for name in sorted(db.list_collection_names()):
+        if name in NON_MAILBOX_COLLECTIONS:
+            continue
+        if name in SKIP_MAILBOXES:
+            print(f"  [skip] {name} — v SKIP_MAILBOXES (neni Graph pristup)")
+            continue
+        out.append(name)
+    return out
+
+
+def main() -> int:
+    """Command-line entry point: download attachments for one or all mailboxes.
+
+    Steps: parse arguments, authenticate against Graph, check the MongoDB
+    connection, optionally create the index-collection indexes, process each
+    mailbox and print a summary table.
+
+    Returns:
+        1 when any mailbox reported errors, otherwise 0.
+
+    Exits directly with status 1 when Graph authentication or the MongoDB
+    ping fails, and with status 2 when --mailbox names a mailbox listed in
+    SKIP_MAILBOXES.
+    """
+    ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
+    ap.add_argument("--mailbox",       default="",
+                    help="Emailova schranka. Bez argumentu projede vsechny schranky.")
+    ap.add_argument("--limit",         type=int, default=0,
+                    help="Zpracovat max N emailu (0 = vse) — per schranka")
+    ap.add_argument("--force-recheck", action="store_true",
+                    help="Znovu overi i emaily kde prilohy uz maji file_hash / missing / reference")
+    ap.add_argument("--no-indexes",    action="store_true",
+                    help="Nevytvorit indexy na attachments_index kolekci")
+    args = ap.parse_args()
+
+    start_all = datetime.now()
+    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
+    print(f"Start: {start_all.strftime('%Y-%m-%d %H:%M:%S')}")
+
+    # Fail fast: without a Graph token nothing can be downloaded.
+    print("\nPřipojuji se k Graph API...")
+    try:
+        get_token()
+        print("  Graph API OK")
+    except Exception as e:
+        print(f"  CHYBA: {e}")
+        sys.exit(1)
+
+    # The ping verifies that the server is reachable within 5 seconds.
+    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    try:
+        client.admin.command("ping")
+        print("  MongoDB OK")
+    except Exception as e:
+        print(f"  CHYBA: MongoDB neni dostupna -- {e}")
+        sys.exit(1)
+
+    # Indexes on the fields of the attachment index; resolve_filename looks
+    # documents up by "filename".
+    col_index = client[MONGO_DB][MONGO_COL_INDEX]
+    if not args.no_indexes:
+        col_index.create_index("filename")
+        col_index.create_index("mime_type")
+        col_index.create_index("mailbox")
+
+    # Either the single requested mailbox or every discovered mailbox.
+    db = client[MONGO_DB]
+    if args.mailbox:
+        if args.mailbox in SKIP_MAILBOXES:
+            print(f"  CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.")
+            sys.exit(2)
+        mailboxes = [args.mailbox]
+    else:
+        mailboxes = discover_mailboxes(db)
+        print(f"  Schranky ke zpracovani: {len(mailboxes)}")
+        for m in mailboxes:
+            print(f"    {m}")
+
+    # A failure of one mailbox is recorded as a single error and the run
+    # continues with the next mailbox.
+    results = []
+    for mb in mailboxes:
+        try:
+            results.append(process_mailbox(client, mb, args))
+        except Exception as e:
+            logging.error("process_mailbox %s: %s", mb, e)
+            print(f"  FATAL pri zpracovani {mb}: {e}")
+            results.append({"mailbox": mb, "ok": 0, "new": 0, "dup": 0,
+                            "skip": 0, "miss": 0, "ref": 0, "err": 1, "elapsed": 0.0})
+
+    # Index-wide totals; the size is summed client-side over all index documents.
+    elapsed_total = (datetime.now() - start_all).total_seconds()
+    files_total   = col_index.count_documents({})
+    size_total    = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))
+
+    # Grand totals across all processed mailboxes.
+    grand = {k: sum(r.get(k, 0) for r in results)
+             for k in ("ok", "new", "dup", "skip", "miss", "ref", "err")}
+
+    print(f"\n{'='*60}")
+    print("=== SHRNUTI ===")
+    for r in results:
+        print(f"  {r['mailbox']:40} ok={r['ok']:>5} nove={r['new']:>4} "
+              f"dup={r['dup']:>4} skip={r['skip']:>3} miss={r.get('miss',0):>3} "
+              f"ref={r.get('ref',0):>3} err={r['err']:>3}")
+    print(f"  {'TOTAL':40} ok={grand['ok']:>5} nove={grand['new']:>4} "
+          f"dup={grand['dup']:>4} skip={grand['skip']:>3} miss={grand['miss']:>3} "
+          f"ref={grand['ref']:>3} err={grand['err']:>3}")
+    print(f"Souboru v indexu: {files_total}  ({size_total / 1024 / 1024:.1f} MB)")
+    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
+    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    if grand['err']:
+        print(f"Chyby logovany do: {LOG_FILE}")
+
+    client.close()
+    # A non-zero status lets a calling wrapper detect a run with errors.
+    return 1 if grand['err'] > 0 else 0
+
+
+# Script entry point: the process exit status is main()'s return value
+# (0 = no errors, 1 = errors were counted); `or 0` maps a None result to 0.
+if __name__ == "__main__":
+    sys.exit(main() or 0)
@@ -0,0 +1,63 @@
+# 4_unwrap_smime_v1.0.py
+
+**Krok 4 pipeline** — rozbalení S/MIME wrapper zpráv. Některé emaily (Datová schránka, mBank, ComGate, PayU, PostSignum …) mají viditelné tělo jen *"This is an S/MIME signed message"* — skutečný obsah je zabalený uvnitř přílohy `smime.p7m`.
+
+Skript najde tyto emaily, stáhne binárku `smime.p7m` z Graphu, rozbalí PKCS7 SignedData (`asn1crypto.cms`), extrahuje vnitřní MIME zprávu a doplní do Mongo:
+
+| Pole | Obsah |
+|---|---|
+| `smime_unwrapped: True` | flag — už rozbaleno |
+| `smime_subject` | Subject z vnitřní MIME hlavičky |
+| `smime_body_text` | plain text vnitřního těla |
+| `smime_body_html` | HTML vnitřního těla (pokud je) |
+| `smime_inner_attachments[]` | `{filename, content_type, size_bytes}` vnitřních příloh |
+
+## POZOR: `smime.p7m` vs `smime.p7s`
+
+| Příloha | Co to je | Skript dělá |
+|---|---|---|
+| `smime.p7m` | **Opaque signed wrapper** (`application/pkcs7-mime`, `smime-type=signed-data`) kolem celé MIME zprávy — podepsaný, nikoli šifrovaný obsah | **Rozbalí** |
+| `smime.p7s` | **Detached signature** vedle čistého emailu (tělo je normálně dostupné) | **Ignoruje** — není co rozbalovat |
+
+Filtr ve skriptu (`SMIME_FILTER`) je proto explicitně `^smime\.p7m$`. Email s přílohou `smime.p7s` a `smime_unwrapped != True` je **správný stav**.
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | ne | e-mail | (všechny) | Jedna konkrétní schránka. Bez argumentu projede všechny kolekce v `emaily` mimo `SKIP_COLLECTIONS` (`attachments_index`, `sync_state`) a `SKIP_MAILBOXES`. |
+| `--limit N` | ne | int | (bez limitu) | Limit emailů na schránku (test) |
+
+## SKIP_MAILBOXES (hardcoded)
+
+| Schránka | Důvod |
+|---|---|
+| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Při běhu bez `--mailbox` se tiše přeskočí. S explicitním `--mailbox vbuzalka@its.jnj.com` skript skončí exit kódem 2. |
+
+## Varianty volání
+
+```bash
+# Všechny schránky (mimo SKIP_MAILBOXES):
+docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py
+
+# Jedna schránka:
+docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py --mailbox ordinace@buzalkova.cz
+
+# Test 10 emailů:
+docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py --mailbox ordinace@buzalkova.cz --limit 10
+
+# Plný běh na pozadí, log do souboru:
+docker exec -d python-runner bash -c "python /scripts/4_unwrap_smime_v1.0.py > /scripts/unwrap_smime.log 2>&1"
+```
+
+## Závislosti
+
+```bash
+docker exec python-runner pip install asn1crypto
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/unwrap_smime.log
+```
@@ -0,0 +1,445 @@
+"""
+==============================================================================
+Skript:   unwrap_smime_v1.0.py
+Verze:    1.0
+Datum:    2026-06-03
+Autor:    vladimir.buzalka
+
+Popis:
+  Najde v Mongo emaily s prilohou smime.p7m (S/MIME signed-data),
+  stahne binarni obsah prilohy z Microsoft Graph API, rozbali PKCS7
+  SignedData (CMS), extrahuje vnitrni MIME message, a ulozi do Mongo:
+    - smime_unwrapped: True
+    - smime_body_text   : plain text vnitrniho tela
+    - smime_body_html   : HTML vnitrniho tela (kdyz je)
+    - smime_subject     : Subject vnitrni MIME hlavicky
+    - smime_inner_attachments : [{filename, content_type, size_bytes}]
+
+  Tato pole pak pouzije enrich_fulltext_emails_v1.2 a doplni jejich
+  obsah do PG fulltext indexu.
+
+  Typicke S/MIME odesilatele:
+    notifikace@mojedatovaschranka.cz  (844 emailu)
+    kontakt@mbank.cz                  (226)
+    payments@comgate.cz, service@payu.com  (~250)
+    info.postsignum@cpost.cz
+
+Architekturalni poznamka:
+  S/MIME priloha smime.p7m ma Content-Type application/pkcs7-mime
+  s parametrem smime-type=signed-data. Vnitrni obsah je v PKCS7
+  ContentInfo -> SignedData -> encapContentInfo.eContent. To uz je
+  primo MIME zprava (multipart nebo single body).
+
+Zavislosti (instalovat v kontejneru):
+  pip install asn1crypto
+
+Spusteni:
+  python unwrap_smime_v1.0.py                              # vsechny schranky (mimo SKIP_MAILBOXES)
+  python unwrap_smime_v1.0.py --mailbox vladimir.buzalka@buzalka.cz
+  python unwrap_smime_v1.0.py --limit 10                   # test
+
+SKIP_MAILBOXES (hardcoded):
+  vbuzalka@its.jnj.com   — JNJ tenant, nemame Graph API pristup. Pri behu
+                            bez --mailbox se tise preskoci, s --mailbox skript
+                            skonci s exit kodem 2.
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import email
+import email.policy
+import logging
+import sys
+import time
+import traceback
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+import msal
+import requests
+from asn1crypto import cms
+from pymongo import MongoClient, UpdateOne
+
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
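+# --- konfigurace ------------------------------------------------------------
+# SECURITY: GRAPH_CLIENT_SECRET below is committed in plain text. Rotate it and
+# load it from the environment or a secrets store instead of the source tree.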
+GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
+GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
+GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
+GRAPH_URL           = "https://graph.microsoft.com/v1.0"
+
+# MongoDB connection. main() treats every collection in MONGO_DB as one mailbox
+# (the collection name is used as the mailbox address), except the names listed
+# in SKIP_COLLECTIONS.
+MONGO_URI = "mongodb://192.168.1.76:27017"
+MONGO_DB  = "emaily"
+SKIP_COLLECTIONS = {"attachments_index", "sync_state"}
+
+# Mailboxes we have NO Graph API access to — skipped during a normal run.
+SKIP_MAILBOXES = {
+    "vbuzalka@its.jnj.com",   # JNJ tenant — we have no Graph credentials for it
+}
+
+# NOTE: parse_inner_mime compares MAX_BODY_BYTES against len() of the decoded
+# str, so the cap effectively counts characters, not encoded bytes.
+MAX_BODY_BYTES   = 2 * 1024 * 1024   # 2 MB cap for the extracted text
+BATCH_SIZE       = 25                # queued UpdateOne ops that trigger a bulk_write
+LOG_FILE         = Path(__file__).parent / "unwrap_smime_errors.log"  # next to this script
+
+# Only ERROR and above are recorded, and only into LOG_FILE; progress output
+# goes through print() instead.
+logging.basicConfig(
+    filename=str(LOG_FILE),
+    level=logging.ERROR,
+    format="%(asctime)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    encoding="utf-8",
+)
+
+
+# --- Graph auth -------------------------------------------------------------
+_token: Optional[str] = None
+
+
+def get_token() -> str:
+    global _token
+    app = msal.ConfidentialClientApplication(
+        GRAPH_CLIENT_ID,
+        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
+        client_credential=GRAPH_CLIENT_SECRET,
+    )
+    res = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+    if "access_token" not in res:
+        raise RuntimeError(f"Graph auth failed: {res}")
+    _token = res["access_token"]
+    return _token
+
+
+def graph_get_raw(url: str) -> Optional[bytes]:
+    """GET na Graph endpoint, vraci raw bytes (pro $value attachment endpoint)."""
+    global _token
+    if not _token:
+        get_token()
+    for attempt in range(3):
+        try:
+            r = requests.get(url, headers={"Authorization": f"Bearer {_token}"}, timeout=60)
+            if r.status_code == 401:
+                get_token(); continue
+            if r.status_code == 404:
+                return None
+            if r.status_code == 429:
+                wait = int(r.headers.get("Retry-After", "5"))
+                time.sleep(wait); continue
+            r.raise_for_status()
+            return r.content
+        except requests.RequestException:
+            if attempt == 2:
+                raise
+            time.sleep(2)
+    return None
+
+
+# --- PKCS7 / MIME unwrap ----------------------------------------------------
+
+def extract_inner_mime(content_bytes: bytes) -> bytes:
+    """Z S/MIME prilohy vytahne vnitrni MIME (signed content) jako bytes.
+
+    Dva formaty se v Graph API vyskytuji:
+      A) multipart/signed (detached signature) - bytes zacinaji 'Content-Type: multipart/signed'.
+         Obsah je rovnou citelny v prvni MIME casti (druha cast je oddeleny PKCS7 podpis).
+      B) application/pkcs7-mime (opaque, smime-type=signed-data) - vnitrni MIME je
+         schovany uvnitr PKCS7 SignedData -> encapContentInfo.eContent.
+
+    Vraci raw MIME bytes pro pripravu pro email.message_from_bytes.
+    """
+    head = content_bytes[:300].lower()
+
+    # A) multipart/signed (detached) - nejcastejsi pro maily z Graphu
+    if b"content-type:" in head and b"multipart/signed" in head:
+        try:
+            outer = email.message_from_bytes(content_bytes, policy=email.policy.default)
+        except Exception as e:
+            raise RuntimeError(f"MIME parse failed: {e}")
+        # iteruj parts - prvni non-signature je signed payload
+        signed_payload = None
+        if outer.is_multipart():
+            for part in outer.iter_parts():
+                ct = (part.get_content_type() or "").lower()
+                if "pkcs7-signature" in ct or "x-pkcs7-signature" in ct:
+                    continue
+                signed_payload = part
+                break
+        if signed_payload is None:
+            raise RuntimeError("multipart/signed: no signed payload found")
+        return signed_payload.as_bytes()
+
+    # B) opaque PKCS7 SignedData - DER nebo base64
+    data = content_bytes
+    try:
+        ci = cms.ContentInfo.load(data)
+    except Exception:
+        try:
+            import base64
+            stripped = b"".join(line for line in data.splitlines()
+                                if not line.startswith(b"-----"))
+            data = base64.b64decode(stripped, validate=False)
+            ci = cms.ContentInfo.load(data)
+        except Exception as e:
+            raise RuntimeError(f"PKCS7/MIME parse failed: {e}")
+
+    if ci["content_type"].native != "signed_data":
+        raise RuntimeError(f"Not signed-data, got {ci['content_type'].native}")
+    sd = ci["content"]
+    inner = sd["encap_content_info"]["content"]
+    if inner is None:
+        raise RuntimeError("encapContentInfo.content is null (detached without MIME wrapper)")
+    return bytes(inner.native) if hasattr(inner, "native") else bytes(inner)
+
+
+def parse_inner_mime(mime_bytes: bytes) -> dict:
+    """Z MIME bytes vytahne text, html a prilohy."""
+    msg = email.message_from_bytes(mime_bytes, policy=email.policy.default)
+
+    text_parts: list[str] = []
+    html_parts: list[str] = []
+    inner_attachments: list[dict] = []
+
+    def walk(part):
+        ctype = part.get_content_type()
+        disp = (part.get_content_disposition() or "").lower()
+        filename = part.get_filename()
+
+        if part.is_multipart():
+            for sub in part.iter_parts():
+                walk(sub)
+            return
+
+        if disp == "attachment" or filename:
+            try:
+                payload = part.get_content()
+                if isinstance(payload, str):
+                    payload_bytes = payload.encode("utf-8", errors="replace")
+                elif isinstance(payload, bytes):
+                    payload_bytes = payload
+                else:
+                    payload_bytes = b""
+                size = len(payload_bytes)
+            except Exception:
+                size = 0
+            inner_attachments.append({
+                "filename": filename or "(unnamed)",
+                "content_type": ctype,
+                "size_bytes": size,
+            })
+            return
+
+        if ctype == "text/plain":
+            try:
+                text_parts.append(part.get_content())
+            except Exception:
+                try:
+                    text_parts.append(part.get_payload(decode=True).decode(
+                        part.get_content_charset() or "utf-8", errors="replace"))
+                except Exception:
+                    pass
+        elif ctype == "text/html":
+            try:
+                html_parts.append(part.get_content())
+            except Exception:
+                try:
+                    html_parts.append(part.get_payload(decode=True).decode(
+                        part.get_content_charset() or "utf-8", errors="replace"))
+                except Exception:
+                    pass
+
+    walk(msg)
+
+    body_text = "\n\n".join(t.strip() for t in text_parts if t and t.strip())
+    body_html = "\n".join(h for h in html_parts if h and h.strip())
+    if len(body_text) > MAX_BODY_BYTES:
+        body_text = body_text[:MAX_BODY_BYTES]
+    if len(body_html) > MAX_BODY_BYTES:
+        body_html = body_html[:MAX_BODY_BYTES]
+
+    return {
+        "subject": str(msg.get("Subject") or "").strip(),
+        "from":    str(msg.get("From") or "").strip(),
+        "to":      str(msg.get("To") or "").strip(),
+        "date":    str(msg.get("Date") or "").strip(),
+        "body_text": body_text or None,
+        "body_html": body_html or None,
+        "inner_attachments": inner_attachments,
+    }
+
+
+# --- hlavni smycka ----------------------------------------------------------
+
+# Mongo query for unwrap candidates: at least one attachment named exactly
+# "smime.p7m" (case-insensitive), and not yet flagged smime_unwrapped == True.
+# Documents whose earlier attempt stored smime_unwrapped == False still match,
+# so failed messages are picked up again on the next run.
+SMIME_FILTER = {
+    "$and": [
+        {"attachments.filename": {"$regex": "^smime\\.p7m$", "$options": "i"}},
+        {"smime_unwrapped": {"$ne": True}},
+    ]
+}
+
+
+def find_p7m_graph_att_id(doc: dict) -> Optional[str]:
+    for att in doc.get("attachments") or []:
+        if (att.get("filename") or "").lower() == "smime.p7m":
+            return att.get("graph_att_id")
+    return None
+
+
+def process_mailbox(col, mailbox: str, limit: Optional[int]) -> dict:
+    total = col.count_documents(SMIME_FILTER)
+    print(f"[{mailbox}] S/MIME k rozbaleni: {total}"
+          + (f" (limit {limit})" if limit else ""))
+    if total == 0:
+        return {"mailbox": mailbox, "candidates": 0, "unwrapped": 0,
+                "errors": 0, "no_att_id": 0, "missing": 0,
+                "with_inner_att": 0, "inner_att_total": 0}
+
+    cursor = col.find(SMIME_FILTER, {"_id": 1, "graph_id": 1, "attachments": 1},
+                      no_cursor_timeout=True)
+    if limit:
+        cursor = cursor.limit(limit)
+
+    n = unwrapped = err = no_att_id = missing = with_inner = inner_total = 0
+    bulk: list[UpdateOne] = []
+
+    try:
+        for doc in cursor:
+            n += 1
+            mid = doc["_id"]
+            gid = doc.get("graph_id")
+            att_id = find_p7m_graph_att_id(doc)
+            if not gid or not att_id:
+                no_att_id += 1
+                continue
+
+            url = f"{GRAPH_URL}/users/{mailbox}/messages/{gid}/attachments/{att_id}/$value"
+            try:
+                p7m_bytes = graph_get_raw(url)
+            except Exception as e:
+                err += 1
+                logging.error("[%s] graph fetch %s: %s", mailbox, gid, e)
+                bulk.append(UpdateOne({"_id": mid}, {"$set": {
+                    "smime_unwrapped": False,
+                    "smime_error": f"fetch: {type(e).__name__}: {e}"[:300],
+                    "smime_processed_at": datetime.now(timezone.utc).replace(tzinfo=None),
+                }}))
+                continue
+            if p7m_bytes is None:
+                missing += 1
+                bulk.append(UpdateOne({"_id": mid}, {"$set": {
+                    "smime_unwrapped": False,
+                    "smime_error": "attachment_404",
+                    "smime_processed_at": datetime.now(timezone.utc).replace(tzinfo=None),
+                }}))
+                continue
+
+            try:
+                inner_bytes = extract_inner_mime(p7m_bytes)
+                parsed = parse_inner_mime(inner_bytes)
+            except Exception as e:
+                err += 1
+                logging.error("[%s] unwrap %s: %s", mailbox, mid, e)
+                bulk.append(UpdateOne({"_id": mid}, {"$set": {
+                    "smime_unwrapped": False,
+                    "smime_error": f"unwrap: {type(e).__name__}: {e}"[:300],
+                    "smime_processed_at": datetime.now(timezone.utc).replace(tzinfo=None),
+                }}))
+                continue
+
+            inner_atts = parsed["inner_attachments"]
+            inner_total += len(inner_atts)
+            if inner_atts:
+                with_inner += 1
+
+            update = {
+                "smime_unwrapped": True,
+                "smime_processed_at": datetime.now(timezone.utc).replace(tzinfo=None),
+                "smime_body_text": parsed["body_text"],
+                "smime_body_html": parsed["body_html"],
+                "smime_subject": parsed["subject"],
+                "smime_from": parsed["from"],
+                "smime_to": parsed["to"],
+                "smime_date": parsed["date"],
+                "smime_inner_attachments": inner_atts,
+                "smime_error": None,
+            }
+            bulk.append(UpdateOne({"_id": mid}, {"$set": update}))
+            unwrapped += 1
+
+            if len(bulk) >= BATCH_SIZE:
+                col.bulk_write(bulk, ordered=False)
+                bulk.clear()
+
+            if n % 50 == 0 or n == 1:
+                print(f"  [{n:>5}/{total}] unwrapped={unwrapped} err={err} "
+                      f"no_att_id={no_att_id} missing={missing} "
+                      f"inner_atts_total={inner_total}", flush=True)
+    finally:
+        cursor.close()
+        if bulk:
+            col.bulk_write(bulk, ordered=False)
+
+    print(f"  [{n}/{total}] DONE  unwrapped={unwrapped} err={err} "
+          f"no_att_id={no_att_id} missing={missing} "
+          f"with_inner_atts={with_inner} inner_atts_total={inner_total}")
+    return {"mailbox": mailbox, "candidates": total, "unwrapped": unwrapped,
+            "errors": err, "no_att_id": no_att_id, "missing": missing,
+            "with_inner_att": with_inner, "inner_att_total": inner_total}
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)")
+    ap.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
+    args = ap.parse_args()
+
+    t0 = time.time()
+    print("Pripojuji se k MongoDB...")
+    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    mongo.admin.command("ping")
+    db = mongo[MONGO_DB]
+
+    print("Token Graph API...")
+    get_token()
+    print("OK\n")
+
+    if args.mailbox:
+        if args.mailbox in SKIP_MAILBOXES:
+            print(f"CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.")
+            return 2
+        mailboxes = [args.mailbox]
+    else:
+        mailboxes = []
+        for c in db.list_collection_names():
+            if c in SKIP_COLLECTIONS:
+                continue
+            if c in SKIP_MAILBOXES:
+                print(f"  [skip] {c} — v SKIP_MAILBOXES (neni Graph pristup)")
+                continue
+            mailboxes.append(c)
+    print(f"Schranky ({len(mailboxes)}): {mailboxes}\n")
+
+    results = []
+    for mb in mailboxes:
+        results.append(process_mailbox(db[mb], mb, limit=args.limit))
+        print()
+
+    print("=== SHRNUTI ===")
+    for r in results:
+        print(f"  {r['mailbox']}: candidates={r['candidates']}  unwrapped={r['unwrapped']}  "
+              f"errors={r['errors']}  no_att_id={r['no_att_id']}  missing={r['missing']}  "
+              f"with_inner_atts={r['with_inner_att']}  inner_atts_total={r['inner_att_total']}")
+    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
+    total_errors = sum(r.get("errors", 0) for r in results)
+    return 1 if total_errors > 0 else 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())
+    except KeyboardInterrupt:
+        print("\nPreruseno uzivatelem")
+    except Exception:
+        traceback.print_exc()
+        sys.exit(1)
@@ -0,0 +1,47 @@
+# 5_enrich_fulltext_emails_v1.2.py
+
+**Krok 5 pipeline** — vytáhne plný text z emailů v MongoDB (db: `emaily`) a uloží do PostgreSQL (db: `MongoEmaily`, tabulka: `emails`) s GIN `tsvector` indexem (config `soubory` — simple + unaccent).
+
+Emaily se **nestahují znovu** — tělo už je v Mongo z kroků 1/2/4. Tento skript jen vybere první dostupné tělo podle priority a pošle text do PG na fulltext.
+
+## Priorita zdroje těla (`body_source`)
+
+1. **`smime`** — `smime_body_text` / `smime_body_html` (pokud unwrap proběhl)
+2. **`html`** — `body_html`
+3. **`text`** — `body_text` (z parse v1.4 nebo refetch v1.0)
+4. **`preview`** — `body_preview` (fallback)
+
+Názvy vnitřních S/MIME příloh (`smime_inner_attachments`) jdou do `attachments_summary` — dohledatelné přes MCP `emaily.find_attachment`.
+
+## Inkrementalita
+
+Pokud `(mailbox, message_id)` v PG existuje, `extractor_version` je aktuální (1.2) a `modified_at` v Mongo není novější → **skip**. Při bumpu `EXTRACTOR_VERSION` se vše přeparsuje.
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | ne | e-mail | (všechny) | Jedna konkrétní schránka |
+| `--limit N` | ne | int | (bez limitu) | Limit emailů na schránku (test) |
+
+## Varianty volání
+
+```bash
+# Všechny schránky:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py
+
+# Jedna schránka:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py --mailbox ordinace@buzalkova.cz
+
+# Test 500 emailů:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py --mailbox ordinace@buzalkova.cz --limit 500
+
+# Plný běh na pozadí, log do souboru:
+docker exec -d python-runner bash -c "python /scripts/5_enrich_fulltext_emails_v1.2.py > /scripts/enrich.log 2>&1"
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/enrich.log
+```
@@ -0,0 +1,489 @@
+"""
+==============================================================================
+Skript:   enrich_fulltext_emails_v1.2.py
+Verze:    1.2
+Datum:    2026-06-03
+Autor:    vladimir.buzalka
+
+Popis:
+  Vytahne plny text z emailu ulozenych v MongoDB (db: emaily) a ulozi ho do
+  PostgreSQL (db: MongoEmaily, tabulka: emails) s GIN tsvector indexem.
+
+  Emaily se NESTAHUJI znovu - tela uz jsou v Mongo z parse_emails_graph_v1.4
+  (a refetch_text_bodies_v1.0 pro stare plain-text emaily).
+  Tento skript jen vybere prvni dostupne telo a posle text do PG na fulltext.
+
+Zmeny proti v1.1:
+  - S/MIME emaily (signed-data od Datove schranky, mBank, ComGate, PayU, ...):
+    pokud unwrap_smime_v1.0 ulozil smime_body_text/smime_body_html, pouzije se
+    PREFEROVANE pred bezvyznamnym vnejsim wrapper telem ("This is an S/MIME
+    signed message"). Nazvy vnitrnich priloh (smime_inner_attachments) se
+    pridavaji do attachments_summary, tj. dohledatelne pres find_attachment.
+  - body_source: nova hodnota "smime" (rozbalene vnitrni telo).
+  - EXTRACTOR_VERSION=1.2 -> vsechny existujici emaily v PG se preparsuji.
+
+Zmeny v1.1 vs v1.0:
+  - Fallback poradi rozsireno o body_text (novy v parse_emails_graph_v1.4).
+  - body_source umi novou hodnotu "text" (plne plain-text telo, max 2 MB).
+
+Zdroj:
+  MongoDB    192.168.1.76  db=emaily  kolekce=<mailbox>
+             (krome attachments_index)
+
+Cil:
+  PostgreSQL 192.168.1.76  db=MongoEmaily  tabulka=emails
+             tsvector config 'soubory' (sdileny - simple + unaccent)
+
+Inkrementalita:
+  Pokud (mailbox, message_id) jiz existuje a extractor_version je aktualni
+  a modified_at v Mongo neni novejsi -> skip. Pri zmene verze extractoru
+  se vse preparsuje.
+
+Spusteni:
+  python enrich_fulltext_emails_v1.2.py                       # vsechny schranky
+  python enrich_fulltext_emails_v1.2.py --mailbox vbuzalka@its.jnj.com
+  python enrich_fulltext_emails_v1.2.py --limit 500           # test
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import sys
+import time
+import traceback
+from datetime import datetime, timezone
+from typing import Optional
+
+import psycopg
+from bs4 import BeautifulSoup
+from pymongo import MongoClient
+
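+# --- konfigurace ------------------------------------------------------------
+# SECURITY: PG_DSN below embeds the database password in plain text. Rotate it
+# and load the DSN from the environment or a secrets store instead.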
+MONGO_URI = "mongodb://192.168.1.76:27017"
+MONGO_DB = "emaily"
+
+PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoEmaily "
+          "user=vladimir.buzalka password=Vlado7309208104++")
+
+EXTRACTOR_VERSION = "1.2"
+
+MAX_TEXT_BYTES = 5 * 1024 * 1024   # plain text max 5 MB
+SKIP_COLLECTIONS = {"attachments_index"}
+
+BATCH_SIZE = 100
+
+
+# --- SCHEMA -----------------------------------------------------------------
+
+# Idempotent DDL: extensions, the shared 'soubory' text-search configuration
+# (simple + unaccent), the emails table with a generated tsvector column, and
+# its indexes. The tsvector input is cut to 800000 characters by left().
+SCHEMA_SQL = """
+CREATE EXTENSION IF NOT EXISTS unaccent;
+CREATE EXTENSION IF NOT EXISTS pg_trgm;
+
+DO $$
+BEGIN
+    IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
+        CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
+        ALTER TEXT SEARCH CONFIGURATION soubory
+            ALTER MAPPING FOR hword, hword_part, word
+            WITH unaccent, simple;
+    END IF;
+END$$;
+
+CREATE TABLE IF NOT EXISTS emails (
+    id              BIGSERIAL PRIMARY KEY,
+    mailbox         TEXT NOT NULL,
+    message_id      TEXT NOT NULL,
+    graph_id        TEXT,
+    conversation_id TEXT,
+    folder_path     TEXT,
+    subject         TEXT,
+    sender_email    TEXT,
+    sender_name     TEXT,
+    to_addrs        TEXT,
+    cc_addrs        TEXT,
+    bcc_addrs       TEXT,
+    sent_at         TIMESTAMPTZ,
+    received_at     TIMESTAMPTZ,
+    modified_at     TIMESTAMPTZ,
+    is_read         BOOLEAN,
+    is_draft        BOOLEAN,
+    has_attachments BOOLEAN,
+    attachment_count INT,
+    attachments_summary TEXT,
+    body            TEXT,
+    body_length     INT,
+    body_source     TEXT,         -- 'smime' | 'html' | 'text' | 'preview' | 'empty'
+    tsv             tsvector GENERATED ALWAYS AS (
+        to_tsvector('soubory'::regconfig,
+            left(
+                coalesce(subject, '') || ' ' ||
+                coalesce(sender_email, '') || ' ' ||
+                coalesce(sender_name, '') || ' ' ||
+                coalesce(to_addrs, '') || ' ' ||
+                coalesce(cc_addrs, '') || ' ' ||
+                coalesce(attachments_summary, '') || ' ' ||
+                coalesce(body, ''),
+            800000)
+        )
+    ) STORED,
+    extracted_at      TIMESTAMPTZ DEFAULT now(),
+    extractor_version TEXT,
+    ok                BOOLEAN,
+    error             TEXT,
+    UNIQUE (mailbox, message_id)
+);
+
+CREATE INDEX IF NOT EXISTS emails_tsv_gin            ON emails USING gin(tsv);
+CREATE INDEX IF NOT EXISTS emails_subject_trgm       ON emails USING gin(subject gin_trgm_ops);
+CREATE INDEX IF NOT EXISTS emails_sender_email_idx   ON emails(sender_email);
+CREATE INDEX IF NOT EXISTS emails_mailbox_idx        ON emails(mailbox);
+CREATE INDEX IF NOT EXISTS emails_received_idx       ON emails(received_at DESC);
+CREATE INDEX IF NOT EXISTS emails_conv_idx           ON emails(conversation_id);
+"""
+
+
+# --- HELPERY ----------------------------------------------------------------
+
+# Control characters U+0000-U+001F except tab, line feed and carriage return;
+# _clean_for_pg removes them before text is stored.
+_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
+# Runs of spaces/tabs, collapsed to a single space by html_to_text.
+_WS_RX = re.compile(r"[ \t]+")
+# Three or more consecutive newlines, reduced to one blank line by html_to_text.
+_NL_RX = re.compile(r"\n{3,}")
+
+
+def _clean_for_pg(s: str) -> str:
+    if not s:
+        return ""
+    return _CTRL_RX.sub("", s)
+
+
+def _truncate(s: str) -> str:
+    s = _clean_for_pg(s or "")
+    if not s:
+        return ""
+    b = s.encode("utf-8", errors="replace")
+    if len(b) <= MAX_TEXT_BYTES:
+        return s
+    return b[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
+
+
+def html_to_text(html: str) -> str:
+    """Extrahuje plain text z HTML emailu. Odstrani <script>, <style>, normalizuje whitespace."""
+    if not html:
+        return ""
+    try:
+        soup = BeautifulSoup(html, "lxml")
+    except Exception:
+        soup = BeautifulSoup(html, "html.parser")
+    for tag in soup(["script", "style", "head"]):
+        tag.decompose()
+    text = soup.get_text(separator="\n")
+    # normalizace whitespace
+    lines = [_WS_RX.sub(" ", ln).strip() for ln in text.split("\n")]
+    text = "\n".join(ln for ln in lines if ln)
+    text = _NL_RX.sub("\n\n", text)
+    return text
+
+
+def fmt_recipients(recipients: list, kind: str) -> str:
+    """Sloupec to_addrs/cc_addrs/bcc_addrs - 'Jmeno <email>; Jmeno2 <email2>'."""
+    if not recipients:
+        return ""
+    out = []
+    for r in recipients:
+        if not isinstance(r, dict):
+            continue
+        if r.get("type") != kind:
+            continue
+        name = (r.get("name") or "").strip()
+        email = (r.get("email") or "").strip()
+        if name and email:
+            out.append(f"{name} <{email}>")
+        elif email:
+            out.append(email)
+        elif name:
+            out.append(name)
+    return "; ".join(out)
+
+
+def fmt_attachments(attachments: list) -> str:
+    if not attachments:
+        return ""
+    out = []
+    for a in attachments[:20]:
+        if not isinstance(a, dict):
+            continue
+        name = a.get("name") or a.get("filename") or ""
+        if name:
+            out.append(name)
+    return " | ".join(out)
+
+
+def _short(s, n=60):
+    if not s:
+        return ""
+    s = str(s).replace("\n", " ").strip()
+    return s if len(s) <= n else s[:n] + "..."
+
+
+def _now() -> datetime:
+    return datetime.now(tz=timezone.utc)
+
+
+# --- HLAVNI SMYCKA ----------------------------------------------------------
+
+def process_mailbox(pg: psycopg.Connection, mongo_coll, mailbox: str,
+                    limit: Optional[int] = None) -> dict:
+    # existujici zaznamy v PG (rychly inkrementalni lookup)
+    with pg.cursor() as cur:
+        cur.execute(
+            "SELECT message_id, extractor_version, modified_at, ok "
+            "FROM emails WHERE mailbox = %s",
+            (mailbox,),
+        )
+        existing = {row[0]: (row[1], row[2], row[3]) for row in cur.fetchall()}
+
+    proj = {
+        "_id": 1, "graph_id": 1, "conversation_id": 1, "folder_path": 1,
+        "subject": 1, "sender": 1, "recipients": 1,
+        "sent_at": 1, "received_at": 1, "modified_at": 1,
+        "is_read": 1, "is_draft": 1,
+        "has_attachments": 1, "attachment_count": 1, "attachments": 1,
+        "body_html": 1, "body_text": 1, "body_preview": 1,
+        # S/MIME unwrapped fields (z unwrap_smime_v1.0)
+        "smime_unwrapped": 1, "smime_body_text": 1, "smime_body_html": 1,
+        "smime_subject": 1, "smime_inner_attachments": 1,
+    }
+    cursor = mongo_coll.find({}, proj, no_cursor_timeout=True)
+    if limit:
+        cursor = cursor.limit(limit)
+
+    total_pending = limit or mongo_coll.estimated_document_count()
+    print(f"[{mailbox}] kandidatu: ~{total_pending}")
+
+    processed = ok = errors = skipped = empty_body = 0
+    queue: list[dict] = []
+    n = 0
+
+    try:
+        for doc in cursor:
+            n += 1
+            msg_id = doc.get("_id") or ""
+            prev = existing.get(msg_id)
+            mongo_mtime = doc.get("modified_at")
+            if (prev and prev[0] == EXTRACTOR_VERSION and prev[2]
+                    and (mongo_mtime is None
+                         or (prev[1] and prev[1] >= mongo_mtime))):
+                skipped += 1
+                continue
+
+            sender = doc.get("sender") or {}
+            recipients = doc.get("recipients") or []
+            attachments = doc.get("attachments") or []
+            # u S/MIME prilepime nazvy SKUTECNYCH vnitrnich priloh (PDF faktura, ...)
+            # za vnejsi smime.p7m, aby je find_attachment nasel
+            inner = doc.get("smime_inner_attachments") or []
+            if inner:
+                attachments = list(attachments) + [
+                    {"filename": (a.get("filename") or "") + " [smime]"}
+                    for a in inner if a.get("filename")
+                ]
+
+            row = {
+                "mailbox": mailbox,
+                "message_id": msg_id,
+                "graph_id": doc.get("graph_id"),
+                "conversation_id": doc.get("conversation_id"),
+                "folder_path": doc.get("folder_path"),
+                "subject": doc.get("subject") or "",
+                "sender_email": sender.get("email"),
+                "sender_name": sender.get("name"),
+                "to_addrs": fmt_recipients(recipients, "to"),
+                "cc_addrs": fmt_recipients(recipients, "cc"),
+                "bcc_addrs": fmt_recipients(recipients, "bcc"),
+                "sent_at": doc.get("sent_at"),
+                "received_at": doc.get("received_at"),
+                "modified_at": mongo_mtime,
+                "is_read": doc.get("is_read"),
+                "is_draft": doc.get("is_draft"),
+                "has_attachments": doc.get("has_attachments"),
+                "attachment_count": doc.get("attachment_count"),
+                "attachments_summary": fmt_attachments(attachments),
+                "body": None,
+                "body_length": 0,
+                "body_source": "empty",
+                "extracted_at": _now(),
+                "extractor_version": EXTRACTOR_VERSION,
+                "ok": False,
+                "error": None,
+            }
+
+            status = "OK "; detail = ""
+            try:
+                # fallback poradi (v1.2):
+                #   smime_body_text/html (rozbaleny S/MIME) -> body_html -> body_text -> body_preview
+                text = ""
+                if doc.get("smime_unwrapped"):
+                    s_text = doc.get("smime_body_text") or ""
+                    s_html = doc.get("smime_body_html") or ""
+                    s_html_text = html_to_text(s_html) if s_html else ""
+                    # preferuj plain text, fallback html
+                    combined = "\n\n".join(p for p in (s_text, s_html_text) if p)
+                    s_subject = doc.get("smime_subject") or ""
+                    if s_subject:
+                        combined = f"Subject: {s_subject}\n\n{combined}"
+                    if combined:
+                        text = combined
+                        row["body_source"] = "smime"
+                if not text:
+                    html = doc.get("body_html") or ""
+                    h_text = html_to_text(html) if html else ""
+                    if h_text:
+                        text = h_text
+                        row["body_source"] = "html"
+                if not text:
+                    plain = doc.get("body_text") or ""
+                    if plain:
+                        text = plain
+                        row["body_source"] = "text"
+                if not text:
+                    preview = doc.get("body_preview") or ""
+                    if preview:
+                        text = preview
+                        row["body_source"] = "preview"
+                if not text:
+                    row["body_source"] = "empty"
+                    empty_body += 1
+                body = _truncate(text)
+                row["body"] = body if body else None
+                row["body_length"] = len(body)
+                row["ok"] = True
+                ok += 1
+                detail = f"{len(body)} znaku  {_short(body, 60)!r}"
+            except Exception as e:
+                row["error"] = f"{type(e).__name__}: {e}"[:500]
+                status = "ERR"; detail = row["error"][:80]; errors += 1
+
+            queue.append(row)
+            processed += 1
+
+            if n % 200 == 0 or n == 1:
+                subj = _short(row["subject"], 50)
+                print(f"  [{n:>5}] {status} {row['body_source']:<7} "
+                      f"{row['body_length']:>7}ch  | {subj}", flush=True)
+
+            if len(queue) >= BATCH_SIZE:
+                _flush(pg, queue); queue.clear()
+    finally:
+        cursor.close()
+
+    if queue:
+        _flush(pg, queue)
+
+    return {"mailbox": mailbox, "processed": processed, "ok": ok,
+            "errors": errors, "skipped": skipped, "empty_body": empty_body}
+
+
+# Upsert for one email row keyed by (mailbox, message_id): inserts the row or,
+# on conflict, overwrites every non-key column listed below with the new values.
+# Placeholders are named (%(name)s) and are filled from the row dicts that
+# _flush passes to cursor.executemany().
+UPSERT_SQL = """
+INSERT INTO emails
+    (mailbox, message_id, graph_id, conversation_id, folder_path,
+     subject, sender_email, sender_name, to_addrs, cc_addrs, bcc_addrs,
+     sent_at, received_at, modified_at, is_read, is_draft,
+     has_attachments, attachment_count, attachments_summary,
+     body, body_length, body_source,
+     extracted_at, extractor_version, ok, error)
+VALUES
+    (%(mailbox)s, %(message_id)s, %(graph_id)s, %(conversation_id)s, %(folder_path)s,
+     %(subject)s, %(sender_email)s, %(sender_name)s, %(to_addrs)s, %(cc_addrs)s, %(bcc_addrs)s,
+     %(sent_at)s, %(received_at)s, %(modified_at)s, %(is_read)s, %(is_draft)s,
+     %(has_attachments)s, %(attachment_count)s, %(attachments_summary)s,
+     %(body)s, %(body_length)s, %(body_source)s,
+     %(extracted_at)s, %(extractor_version)s, %(ok)s, %(error)s)
+ON CONFLICT (mailbox, message_id) DO UPDATE SET
+    graph_id            = EXCLUDED.graph_id,
+    conversation_id     = EXCLUDED.conversation_id,
+    folder_path         = EXCLUDED.folder_path,
+    subject             = EXCLUDED.subject,
+    sender_email        = EXCLUDED.sender_email,
+    sender_name         = EXCLUDED.sender_name,
+    to_addrs            = EXCLUDED.to_addrs,
+    cc_addrs            = EXCLUDED.cc_addrs,
+    bcc_addrs           = EXCLUDED.bcc_addrs,
+    sent_at             = EXCLUDED.sent_at,
+    received_at         = EXCLUDED.received_at,
+    modified_at         = EXCLUDED.modified_at,
+    is_read             = EXCLUDED.is_read,
+    is_draft            = EXCLUDED.is_draft,
+    has_attachments     = EXCLUDED.has_attachments,
+    attachment_count    = EXCLUDED.attachment_count,
+    attachments_summary = EXCLUDED.attachments_summary,
+    body                = EXCLUDED.body,
+    body_length         = EXCLUDED.body_length,
+    body_source         = EXCLUDED.body_source,
+    extracted_at        = EXCLUDED.extracted_at,
+    extractor_version   = EXCLUDED.extractor_version,
+    ok                  = EXCLUDED.ok,
+    error               = EXCLUDED.error
+"""
+
+
+def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
+    for r in rows:
+        for k in ("subject", "sender_email", "sender_name", "to_addrs", "cc_addrs",
+                  "bcc_addrs", "attachments_summary", "body", "error", "folder_path"):
+            if r.get(k):
+                r[k] = _clean_for_pg(r[k])
+    with pg.cursor() as cur:
+        cur.executemany(UPSERT_SQL, rows)
+    pg.commit()
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)")
+    ap.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
+    args = ap.parse_args()
+
+    t0 = time.time()
+    print("Pripojuji se k PostgreSQL...")
+    # MongoEmaily DB musi existovat (create externe pres psql nebo DBeaver),
+    # protoze CREATE DATABASE nesmi byt v transakci.
+    pg = psycopg.connect(PG_DSN, connect_timeout=10)
+    with pg.cursor() as cur:
+        cur.execute(SCHEMA_SQL)
+    pg.commit()
+    print("Schema OK.")
+
+    print("Pripojuji se k MongoDB...")
+    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    mongo.admin.command("ping")
+    db = mongo[MONGO_DB]
+
+    if args.mailbox:
+        mailboxes = [args.mailbox]
+    else:
+        mailboxes = [c for c in db.list_collection_names() if c not in SKIP_COLLECTIONS]
+    print(f"Schranky ({len(mailboxes)}): {mailboxes}")
+
+    results = []
+    for mb in mailboxes:
+        results.append(process_mailbox(pg, db[mb], mb, limit=args.limit))
+
+    pg.close()
+
+    print("\n=== SHRNUTI ===")
+    for r in results:
+        print(f"  {r['mailbox']}: processed={r['processed']}  ok={r['ok']}  "
+              f"errors={r['errors']}  skipped={r['skipped']}  empty={r['empty_body']}")
+    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())
+    except KeyboardInterrupt:
+        print("\nPreruseno uzivatelem")
+    except Exception:
+        traceback.print_exc()
+        sys.exit(1)
@@ -0,0 +1,79 @@
+# 5_enrich_fulltext_emails_v1.3.py
+
+**Krok 5 pipeline** — vytáhne plain-text z emailů v Mongu (`emaily.<schránka>`) a uloží do PostgreSQL (`MongoEmaily.emails`) s tsvector GIN indexem nad konfigurací `soubory` (simple + unaccent).
+
+## Co dělá
+
+1. Vybere první dostupné tělo v tomto pořadí:
+   - `smime_body_text/html` (rozbaleno krokem 4)
+   - `body_html` → strip HTML přes BeautifulSoup
+   - `body_text` (legacy plain)
+   - `body_preview` (jako fallback)
+2. Naplní řádek v PG `emails` (mailbox, subject, sender, recipients, body, attachments_summary, ...) + tsvector se vygeneruje sám.
+3. Upsert (`ON CONFLICT (mailbox, message_id) DO UPDATE`).
+
+## Inkrementální logika
+
+Pokud `(mailbox, message_id)` už je v PG a:
+- `extractor_version == EXTRACTOR_VERSION` (aktuálně `1.2`)
+- `ok = true`
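+- e-mail v Mongu nemá `smime_unwrapped`, nebo už má v PG `body_source = 'smime'` (jinak se po rozbalení S/MIME přeindexuje). Pozn.: `modified_at` se ve v1.3 při rozhodování o skipu neporovnává.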
+
+→ **skip**. Nemusíš se bát opakovaného spuštění — vladimirovo přepsání 73k záznamů co teď probíhá je proto, že `EXTRACTOR_VERSION` byl povýšen z 1.1 → 1.2, takže všechny řádky v PG jsou „zastaralé". Po doběhnutí bude další běh skipovat všechno až na nově přibyvší.
+
+## Změny v1.3 vs v1.2
+
+- **Bugfix** `NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}` — předtím `sync_state` (přibyla s delta syncem) projížděla jako mailbox.
+- **`--index-reset`** — před zpracováním schránky `DELETE FROM emails WHERE mailbox=%s`. Force re-extract bez nutnosti povyšovat verzi.
+- **Vylepšený per-mailbox header** — ukáže `v Mongu N, v PG M (uptodate K), k zpracovani ~(N − K)`.
+- Když `to_process_estimate <= 0` (a není zadán `--index-reset` ani `--limit`) → schránku přeskočí úplně (bez iterace cursorem).
+
+## Argumenty
+
+| Argument | Povinný | Hodnoty | Default | Popis |
+|---|---|---|---|---|
+| `--mailbox` | ne | e-mail | (všechny) | Bez argumentu projede všechny kolekce mimo `NON_MAILBOX_COLLECTIONS` |
+| `--limit N` | ne | int | (bez limitu) | Per schránka, jen prvních N emailů (test) |
+| `--index-reset` | ne | flag | false | Před zpracováním **smaže** všechny emaily dané schránky v PG. **Bez `--mailbox` smaže CELÝ index!** |
+
+## Varianty volání
+
+```bash
+# Všechny schránky, inkrementální:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py
+
+# Jedna schránka:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py --mailbox ordinace@buzalkova.cz
+
+# Test 500 emailů:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py --mailbox ordinace@buzalkova.cz --limit 500
+
+# Force reindex jedné schránky:
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py --mailbox ordinace@buzalkova.cz --index-reset
+
+# DANGEROUS: smaže celý index a postaví znovu (POMALÉ — typicky 30+ minut):
+docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py --index-reset
+
+# Na pozadí, log do souboru:
+docker exec -d python-runner bash -c "python /scripts/5_enrich_fulltext_emails_v1.3.py > /scripts/enrich_fulltext.log 2>&1"
+```
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/enrich_fulltext.log
+```
+
+V průběhu skript vypisuje každých 200 zpracovaných emailů:
+```
+[ 38800|p=  5800] OK  html       2831ch  | CLEAR/RA payment information for invoice #22FV049
+```
+- první číslo = pozice v cursoru (počet všech emailů co prošlo)
+- `p=N` = počet skutečně zprocesovaných (zbytek byl skipnut jako už-aktuální)
+- `OK / ERR`, `body_source`, délka, subject
+
+## Závislosti
+
+```bash
+docker exec python-runner pip install psycopg[binary] beautifulsoup4 lxml pymongo
+```
@@ -0,0 +1,567 @@
+"""
+==============================================================================
+Skript:   enrich_fulltext_emails_v1.3.py
+Verze:    1.3
+Datum:    2026-06-04
+Autor:    vladimir.buzalka
+
+Popis:
+  Vytahne plny text z emailu ulozenych v MongoDB (db: emaily) a ulozi ho do
+  PostgreSQL (db: MongoEmaily, tabulka: emails) s GIN tsvector indexem.
+
+  Emaily se NESTAHUJI znovu - tela uz jsou v Mongo z parse_emails_graph_v1.4
+  (a refetch_text_bodies_v1.0 pro stare plain-text emaily).
+  Tento skript jen vybere prvni dostupne telo a posle text do PG na fulltext.
+
+Zmeny v1.3 vs v1.2:
+  - Bugfix: NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
+    (sync_state pribyla v delta syncu, predtim ji v1.2 brala jako mailbox).
+  - --index-reset: pred zpracovanim schranky vymaze vsechny jeji emaily z PG
+    (force re-extract; pouzij kdyz povysis EXTRACTOR_VERSION nebo chces ciste).
+  - Vylepseny header per-mailbox: ukaze pocet v Mongu, v PG a k zpracovani.
+
+Zmeny v1.2 vs v1.1:
+  - S/MIME emaily: pokud unwrap_smime_v1.0 ulozil smime_body_text/smime_body_html,
+    pouzije se PREFEROVANE pred bezvyznamnym wrapper telem.
+  - body_source: nova hodnota "smime".
+  - EXTRACTOR_VERSION=1.2 -> vsechny existujici emaily v PG se preparsuji.
+
+Zmeny v1.1 vs v1.0:
+  - Fallback poradi rozsireno o body_text.
+  - body_source umi novou hodnotu "text" (plne plain-text telo, max 2 MB).
+
+Zdroj:
+  MongoDB    192.168.1.76  db=emaily  kolekce=<mailbox>
+             (krome NON_MAILBOX_COLLECTIONS)
+
+Cil:
+  PostgreSQL 192.168.1.76  db=MongoEmaily  tabulka=emails
+             tsvector config 'soubory' (sdileny - simple + unaccent)
+
+Inkrementalita:
+  Pokud (mailbox, message_id) jiz existuje, extractor_version je aktualni
+  a ok=true -> skip (modified_at se ve v1.3 neporovnava). Vyjimka: email ma
+  v Mongu smime_unwrapped, ale v PG body_source != 'smime' -> re-enrich.
+  Pri zmene verze extractoru se vse preparsuje. --index-reset to obejde
+  a smaze PG pred behom.
+
+Spusteni:
+  python enrich_fulltext_emails_v1.3.py                           # vsechny schranky
+  python enrich_fulltext_emails_v1.3.py --mailbox ordinace@buzalkova.cz
+  python enrich_fulltext_emails_v1.3.py --limit 500               # test
+  python enrich_fulltext_emails_v1.3.py --mailbox X --index-reset # smaze PG schranky a re-extrahuje vsechno
+  python enrich_fulltext_emails_v1.3.py --index-reset             # smaze CELY index a postavi znovu (POMALE!)
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import sys
+import time
+import traceback
+from datetime import datetime, timezone
+from typing import Optional
+
+import psycopg
+from bs4 import BeautifulSoup
+from pymongo import MongoClient
+
+# --- configuration ----------------------------------------------------------
+MONGO_URI = "mongodb://192.168.1.76:27017"
+MONGO_DB = "emaily"
+
+# NOTE(review): the PostgreSQL password is committed in plain text here;
+# consider loading the DSN from the environment or a secrets store and
+# rotating this credential.
+PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoEmaily "
+          "user=vladimir.buzalka password=Vlado7309208104++")
+
+EXTRACTOR_VERSION = "1.2"   # DO NOT change unless the fallback logic changes!
+
+MAX_TEXT_BYTES = 5 * 1024 * 1024   # plain text max 5 MB
+
+# Collections in `emaily` that are NOT mailboxes (never processed)
+NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
+
+BATCH_SIZE = 100   # rows queued in process_mailbox before each _flush
+
+
+# --- SCHEMA -----------------------------------------------------------------
+
+# DDL executed by main() on every start: creates the unaccent and pg_trgm
+# extensions, the 'soubory' text-search configuration, the emails table and
+# its indexes. Every statement is guarded (IF NOT EXISTS / pg_ts_config check),
+# so objects that already exist are left alone.
+# NOTE(review): the inline SQL comment on body_source lists only
+# 'html' | 'preview' | 'empty'; process_mailbox also writes 'text' and 'smime'.
+SCHEMA_SQL = """
+CREATE EXTENSION IF NOT EXISTS unaccent;
+CREATE EXTENSION IF NOT EXISTS pg_trgm;
+
+DO $$
+BEGIN
+    IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
+        CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
+        ALTER TEXT SEARCH CONFIGURATION soubory
+            ALTER MAPPING FOR hword, hword_part, word
+            WITH unaccent, simple;
+    END IF;
+END$$;
+
+CREATE TABLE IF NOT EXISTS emails (
+    id              BIGSERIAL PRIMARY KEY,
+    mailbox         TEXT NOT NULL,
+    message_id      TEXT NOT NULL,
+    graph_id        TEXT,
+    conversation_id TEXT,
+    folder_path     TEXT,
+    subject         TEXT,
+    sender_email    TEXT,
+    sender_name     TEXT,
+    to_addrs        TEXT,
+    cc_addrs        TEXT,
+    bcc_addrs       TEXT,
+    sent_at         TIMESTAMPTZ,
+    received_at     TIMESTAMPTZ,
+    modified_at     TIMESTAMPTZ,
+    is_read         BOOLEAN,
+    is_draft        BOOLEAN,
+    has_attachments BOOLEAN,
+    attachment_count INT,
+    attachments_summary TEXT,
+    body            TEXT,
+    body_length     INT,
+    body_source     TEXT,         -- 'html' | 'preview' | 'empty'
+    tsv             tsvector GENERATED ALWAYS AS (
+        to_tsvector('soubory'::regconfig,
+            left(
+                coalesce(subject, '') || ' ' ||
+                coalesce(sender_email, '') || ' ' ||
+                coalesce(sender_name, '') || ' ' ||
+                coalesce(to_addrs, '') || ' ' ||
+                coalesce(cc_addrs, '') || ' ' ||
+                coalesce(attachments_summary, '') || ' ' ||
+                coalesce(body, ''),
+            800000)
+        )
+    ) STORED,
+    extracted_at      TIMESTAMPTZ DEFAULT now(),
+    extractor_version TEXT,
+    ok                BOOLEAN,
+    error             TEXT,
+    UNIQUE (mailbox, message_id)
+);
+
+CREATE INDEX IF NOT EXISTS emails_tsv_gin            ON emails USING gin(tsv);
+CREATE INDEX IF NOT EXISTS emails_subject_trgm       ON emails USING gin(subject gin_trgm_ops);
+CREATE INDEX IF NOT EXISTS emails_sender_email_idx   ON emails(sender_email);
+CREATE INDEX IF NOT EXISTS emails_mailbox_idx        ON emails(mailbox);
+CREATE INDEX IF NOT EXISTS emails_received_idx       ON emails(received_at DESC);
+CREATE INDEX IF NOT EXISTS emails_conv_idx           ON emails(conversation_id);
+"""
+
+
+# --- HELPERY ----------------------------------------------------------------
+
+# Control characters other than tab, LF and CR (includes NUL); removed by _clean_for_pg.
+_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
+# Runs of spaces/tabs; html_to_text collapses each run to one space.
+_WS_RX = re.compile(r"[ \t]+")
+# Three or more consecutive newlines; html_to_text reduces them to two.
+_NL_RX = re.compile(r"\n{3,}")
+
+
+def _clean_for_pg(s: str) -> str:
+    if not s:
+        return ""
+    return _CTRL_RX.sub("", s)
+
+
+def _truncate(s: str) -> str:
+    s = _clean_for_pg(s or "")
+    if not s:
+        return ""
+    b = s.encode("utf-8", errors="replace")
+    if len(b) <= MAX_TEXT_BYTES:
+        return s
+    return b[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
+
+
+def html_to_text(html: str) -> str:
+    if not html:
+        return ""
+    try:
+        soup = BeautifulSoup(html, "lxml")
+    except Exception:
+        soup = BeautifulSoup(html, "html.parser")
+    for tag in soup(["script", "style", "head"]):
+        tag.decompose()
+    text = soup.get_text(separator="\n")
+    lines = [_WS_RX.sub(" ", ln).strip() for ln in text.split("\n")]
+    text = "\n".join(ln for ln in lines if ln)
+    text = _NL_RX.sub("\n\n", text)
+    return text
+
+
+def fmt_recipients(recipients: list, kind: str) -> str:
+    if not recipients:
+        return ""
+    out = []
+    for r in recipients:
+        if not isinstance(r, dict):
+            continue
+        if r.get("type") != kind:
+            continue
+        name = (r.get("name") or "").strip()
+        email = (r.get("email") or "").strip()
+        if name and email:
+            out.append(f"{name} <{email}>")
+        elif email:
+            out.append(email)
+        elif name:
+            out.append(name)
+    return "; ".join(out)
+
+
+def fmt_attachments(attachments: list) -> str:
+    if not attachments:
+        return ""
+    out = []
+    for a in attachments[:20]:
+        if not isinstance(a, dict):
+            continue
+        name = a.get("name") or a.get("filename") or ""
+        if name:
+            out.append(name)
+    return " | ".join(out)
+
+
+def _short(s, n=60):
+    if not s:
+        return ""
+    s = str(s).replace("\n", " ").strip()
+    return s if len(s) <= n else s[:n] + "..."
+
+
+def _now() -> datetime:
+    return datetime.now(tz=timezone.utc)
+
+
+def _aware_utc(dt: Optional[datetime]) -> Optional[datetime]:
+    """Sjednoceni: PG TIMESTAMPTZ -> tz-aware UTC; Mongo datetime -> naive (UTC).
+    Vrati tz-aware UTC datetime nebo None."""
+    if dt is None:
+        return None
+    if dt.tzinfo is None:
+        return dt.replace(tzinfo=timezone.utc)
+    return dt.astimezone(timezone.utc)
+
+
+# --- HLAVNI SMYCKA ----------------------------------------------------------
+
+def process_mailbox(pg: psycopg.Connection, mongo_coll, mailbox: str,
+                    limit: Optional[int] = None,
+                    index_reset: bool = False) -> dict:
+    """Extract plain text for one mailbox collection and upsert it into PG.
+
+    Args:
+        pg: Open psycopg connection; this function commits on it.
+        mongo_coll: Mongo collection holding the mailbox's email documents.
+        mailbox: Mailbox name written to the ``mailbox`` column.
+        limit: Optional cap applied to the Mongo cursor.
+        index_reset: When true, delete the mailbox's PG rows before processing.
+
+    Returns:
+        Dict with the keys ``mailbox``, ``processed``, ``ok``, ``errors``,
+        ``skipped`` and ``empty_body``.
+    """
+    # --index-reset: delete every PG row of this mailbox
+    if index_reset:
+        with pg.cursor() as cur:
+            cur.execute("DELETE FROM emails WHERE mailbox = %s", (mailbox,))
+            deleted = cur.rowcount
+        pg.commit()
+        print(f"[{mailbox}] --index-reset: smazano {deleted} radku v PG")
+
+    # rows already present in PG (fast in-memory lookup for the incremental skip)
+    # tuple = (extractor_version, ok, body_source)
+    with pg.cursor() as cur:
+        cur.execute(
+            "SELECT message_id, extractor_version, ok, body_source "
+            "FROM emails WHERE mailbox = %s",
+            (mailbox,),
+        )
+        existing = {row[0]: (row[1], row[2], row[3]) for row in cur.fetchall()}
+
+    mongo_total = mongo_coll.estimated_document_count()
+    pg_total    = len(existing)
+    pg_uptodate = sum(1 for v in existing.values()
+                      if v[0] == EXTRACTOR_VERSION and v[1])
+    to_process_estimate = mongo_total - pg_uptodate
+    print(f"\n========== {mailbox} ==========")
+    print(f"  v Mongu:      {mongo_total}")
+    print(f"  v PG:         {pg_total} (z toho ext_v={EXTRACTOR_VERSION} & ok=true: {pg_uptodate})")
+    print(f"  k zpracovani: ~{to_process_estimate}{' (limit=' + str(limit) + ')' if limit else ''}")
+
+    # NOTE(review): returning here also bypasses the per-document S/MIME
+    # re-index check in the loop below, and mongo_total is only an estimate
+    # (estimated_document_count) - confirm this shortcut is acceptable.
+    if to_process_estimate <= 0 and not index_reset and not limit:
+        print("  Nic noveho ke zpracovani.")
+        return {"mailbox": mailbox, "processed": 0, "ok": 0, "errors": 0,
+                "skipped": pg_uptodate, "empty_body": 0}
+
+    proj = {
+        "_id": 1, "graph_id": 1, "conversation_id": 1, "folder_path": 1,
+        "subject": 1, "sender": 1, "recipients": 1,
+        "sent_at": 1, "received_at": 1, "modified_at": 1,
+        "is_read": 1, "is_draft": 1,
+        "has_attachments": 1, "attachment_count": 1, "attachments": 1,
+        "body_html": 1, "body_text": 1, "body_preview": 1,
+        "smime_unwrapped": 1, "smime_body_text": 1, "smime_body_html": 1,
+        "smime_subject": 1, "smime_inner_attachments": 1,
+    }
+    cursor = mongo_coll.find({}, proj, no_cursor_timeout=True)
+    if limit:
+        cursor = cursor.limit(limit)
+
+    processed = ok = errors = skipped = empty_body = 0
+    queue: list[dict] = []
+    n = 0
+
+    try:
+        for doc in cursor:
+            n += 1
+            msg_id = doc.get("_id") or ""
+            prev = existing.get(msg_id)  # (extractor_version, ok, body_source)
+            mongo_mtime = doc.get("modified_at")
+
+            # Skip when PG already holds this message with the current
+            # extractor version and ok=true.
+            # Exception: smime_unwrapped is set in Mongo but PG body_source != 'smime'
+            #            -> the unwrapped text was added after the last enrich -> re-enrich.
+            # NOTE(review): modified_at is not compared here, so later Mongo-side
+            # changes to an already indexed message are not picked up.
+            if prev and prev[0] == EXTRACTOR_VERSION and prev[1]:
+                needs_smime_reindex = (
+                    bool(doc.get("smime_unwrapped"))
+                    and prev[2] != "smime"
+                )
+                if not needs_smime_reindex:
+                    skipped += 1
+                    continue
+
+            sender = doc.get("sender") or {}
+            recipients = doc.get("recipients") or []
+            attachments = doc.get("attachments") or []
+            # Append the inner S/MIME attachment names (tagged " [smime]") so
+            # they end up in attachments_summary next to the outer ones.
+            inner = doc.get("smime_inner_attachments") or []
+            if inner:
+                attachments = list(attachments) + [
+                    {"filename": (a.get("filename") or "") + " [smime]"}
+                    for a in inner if a.get("filename")
+                ]
+
+            row = {
+                "mailbox": mailbox,
+                "message_id": msg_id,
+                "graph_id": doc.get("graph_id"),
+                "conversation_id": doc.get("conversation_id"),
+                "folder_path": doc.get("folder_path"),
+                "subject": doc.get("subject") or "",
+                "sender_email": sender.get("email"),
+                "sender_name": sender.get("name"),
+                "to_addrs": fmt_recipients(recipients, "to"),
+                "cc_addrs": fmt_recipients(recipients, "cc"),
+                "bcc_addrs": fmt_recipients(recipients, "bcc"),
+                # All timestamps coming from Mongo are naive but are interpreted as UTC.
+                # Tag them tz-aware so PG TIMESTAMPTZ stores the correct UTC value
+                # instead of applying the session timezone offset.
+                "sent_at":     _aware_utc(doc.get("sent_at")),
+                "received_at": _aware_utc(doc.get("received_at")),
+                "modified_at": _aware_utc(mongo_mtime),
+                "is_read": doc.get("is_read"),
+                "is_draft": doc.get("is_draft"),
+                "has_attachments": doc.get("has_attachments"),
+                "attachment_count": doc.get("attachment_count"),
+                "attachments_summary": fmt_attachments(attachments),
+                "body": None,
+                "body_length": 0,
+                "body_source": "empty",
+                "extracted_at": _now(),
+                "extractor_version": EXTRACTOR_VERSION,
+                "ok": False,
+                "error": None,
+            }
+
+            # NOTE(review): 'detail' is assigned below but never read in this function.
+            status = "OK "; detail = ""
+            try:
+                # Body fallback order: unwrapped S/MIME text+html -> body_html
+                # -> body_text -> body_preview -> empty.
+                text = ""
+                if doc.get("smime_unwrapped"):
+                    s_text = doc.get("smime_body_text") or ""
+                    s_html = doc.get("smime_body_html") or ""
+                    s_html_text = html_to_text(s_html) if s_html else ""
+                    combined = "\n\n".join(p for p in (s_text, s_html_text) if p)
+                    s_subject = doc.get("smime_subject") or ""
+                    if s_subject:
+                        combined = f"Subject: {s_subject}\n\n{combined}"
+                    if combined:
+                        text = combined
+                        row["body_source"] = "smime"
+                if not text:
+                    html = doc.get("body_html") or ""
+                    h_text = html_to_text(html) if html else ""
+                    if h_text:
+                        text = h_text
+                        row["body_source"] = "html"
+                if not text:
+                    plain = doc.get("body_text") or ""
+                    if plain:
+                        text = plain
+                        row["body_source"] = "text"
+                if not text:
+                    preview = doc.get("body_preview") or ""
+                    if preview:
+                        text = preview
+                        row["body_source"] = "preview"
+                if not text:
+                    row["body_source"] = "empty"
+                    empty_body += 1
+                body = _truncate(text)
+                row["body"] = body if body else None
+                row["body_length"] = len(body)
+                row["ok"] = True
+                ok += 1
+                detail = f"{len(body)} znaku  {_short(body, 60)!r}"
+            except Exception as e:
+                # Extraction failures are recorded on the row (ok stays False)
+                # and the row is still upserted.
+                row["error"] = f"{type(e).__name__}: {e}"[:500]
+                status = "ERR"; detail = row["error"][:80]; errors += 1
+
+            queue.append(row)
+            processed += 1
+
+            # Progress line for the first processed email and every 200th after it.
+            if processed % 200 == 0 or processed == 1:
+                subj = _short(row["subject"], 50)
+                print(f"  [{n:>6}|p={processed:>5}] {status} {row['body_source']:<7} "
+                      f"{row['body_length']:>7}ch  | {subj}", flush=True)
+
+            if len(queue) >= BATCH_SIZE:
+                _flush(pg, queue); queue.clear()
+    finally:
+        cursor.close()
+
+    # Write whatever is left from the last, partially filled batch.
+    if queue:
+        _flush(pg, queue)
+
+    return {"mailbox": mailbox, "processed": processed, "ok": ok,
+            "errors": errors, "skipped": skipped, "empty_body": empty_body}
+
+
+# Upsert for one email row keyed by (mailbox, message_id): inserts the row or,
+# on conflict, overwrites every non-key column listed below with the new values.
+# Placeholders are named (%(name)s) and are filled from the row dicts that
+# _flush passes to cursor.executemany().
+UPSERT_SQL = """
+INSERT INTO emails
+    (mailbox, message_id, graph_id, conversation_id, folder_path,
+     subject, sender_email, sender_name, to_addrs, cc_addrs, bcc_addrs,
+     sent_at, received_at, modified_at, is_read, is_draft,
+     has_attachments, attachment_count, attachments_summary,
+     body, body_length, body_source,
+     extracted_at, extractor_version, ok, error)
+VALUES
+    (%(mailbox)s, %(message_id)s, %(graph_id)s, %(conversation_id)s, %(folder_path)s,
+     %(subject)s, %(sender_email)s, %(sender_name)s, %(to_addrs)s, %(cc_addrs)s, %(bcc_addrs)s,
+     %(sent_at)s, %(received_at)s, %(modified_at)s, %(is_read)s, %(is_draft)s,
+     %(has_attachments)s, %(attachment_count)s, %(attachments_summary)s,
+     %(body)s, %(body_length)s, %(body_source)s,
+     %(extracted_at)s, %(extractor_version)s, %(ok)s, %(error)s)
+ON CONFLICT (mailbox, message_id) DO UPDATE SET
+    graph_id            = EXCLUDED.graph_id,
+    conversation_id     = EXCLUDED.conversation_id,
+    folder_path         = EXCLUDED.folder_path,
+    subject             = EXCLUDED.subject,
+    sender_email        = EXCLUDED.sender_email,
+    sender_name         = EXCLUDED.sender_name,
+    to_addrs            = EXCLUDED.to_addrs,
+    cc_addrs            = EXCLUDED.cc_addrs,
+    bcc_addrs           = EXCLUDED.bcc_addrs,
+    sent_at             = EXCLUDED.sent_at,
+    received_at         = EXCLUDED.received_at,
+    modified_at         = EXCLUDED.modified_at,
+    is_read             = EXCLUDED.is_read,
+    is_draft            = EXCLUDED.is_draft,
+    has_attachments     = EXCLUDED.has_attachments,
+    attachment_count    = EXCLUDED.attachment_count,
+    attachments_summary = EXCLUDED.attachments_summary,
+    body                = EXCLUDED.body,
+    body_length         = EXCLUDED.body_length,
+    body_source         = EXCLUDED.body_source,
+    extracted_at        = EXCLUDED.extracted_at,
+    extractor_version   = EXCLUDED.extractor_version,
+    ok                  = EXCLUDED.ok,
+    error               = EXCLUDED.error
+"""
+
+
+def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
+    for r in rows:
+        for k in ("subject", "sender_email", "sender_name", "to_addrs", "cc_addrs",
+                  "bcc_addrs", "attachments_summary", "body", "error", "folder_path"):
+            if r.get(k):
+                r[k] = _clean_for_pg(r[k])
+    with pg.cursor() as cur:
+        cur.executemany(UPSERT_SQL, rows)
+    pg.commit()
+
+
+def discover_mailboxes(db) -> list[str]:
+    out = []
+    for name in sorted(db.list_collection_names()):
+        if name in NON_MAILBOX_COLLECTIONS:
+            continue
+        out.append(name)
+    return out
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="enrich_fulltext_emails v1.3")
+    ap.add_argument("--mailbox", default="",
+                    help="Jedna konkretni schranka. Bez argumentu projede vsechny.")
+    ap.add_argument("--limit", type=int,
+                    help="Limit emailu na schranku (test)")
+    ap.add_argument("--index-reset", action="store_true",
+                    help="Pred zpracovanim schranky vymaze vsechny jeji emaily z PG "
+                         "(force re-extract). Bez --mailbox SMAZE CELY index.")
+    args = ap.parse_args()
+
+    t0 = time.time()
+    print(f"=== enrich_fulltext_emails v1.3 ===")
+    print(f"Start: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+    print("\nPripojuji se k PostgreSQL...")
+    pg = psycopg.connect(PG_DSN, connect_timeout=10)
+    with pg.cursor() as cur:
+        cur.execute(SCHEMA_SQL)
+    pg.commit()
+    print("  Schema OK.")
+
+    print("Pripojuji se k MongoDB...")
+    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    mongo.admin.command("ping")
+    db = mongo[MONGO_DB]
+    print("  MongoDB OK.")
+
+    if args.mailbox:
+        mailboxes = [args.mailbox]
+    else:
+        mailboxes = discover_mailboxes(db)
+    print(f"\nSchranky ke zpracovani ({len(mailboxes)}):")
+    for mb in mailboxes:
+        print(f"  - {mb}")
+
+    if args.index_reset and not args.mailbox:
+        print(f"\n!!! --index-reset bez --mailbox => SMAZE CELY INDEX ({len(mailboxes)} schranek) !!!")
+
+    results = []
+    for mb in mailboxes:
+        try:
+            results.append(process_mailbox(pg, db[mb], mb,
+                                           limit=args.limit,
+                                           index_reset=args.index_reset))
+        except Exception as e:
+            traceback.print_exc()
+            print(f"  FATAL pri zpracovani {mb}: {e}")
+            results.append({"mailbox": mb, "processed": 0, "ok": 0,
+                            "errors": 1, "skipped": 0, "empty_body": 0})
+
+    pg.close()
+
+    print("\n" + "="*60)
+    print("=== SHRNUTI ===")
+    grand = {"processed": 0, "ok": 0, "errors": 0, "skipped": 0, "empty_body": 0}
+    for r in results:
+        print(f"  {r['mailbox']:40} processed={r['processed']:>5} ok={r['ok']:>5} "
+              f"errors={r['errors']:>3} skipped={r['skipped']:>6} empty={r['empty_body']:>4}")
+        for k in grand:
+            grand[k] += r.get(k, 0)
+    print(f"  {'TOTAL':40} processed={grand['processed']:>5} ok={grand['ok']:>5} "
+          f"errors={grand['errors']:>3} skipped={grand['skipped']:>6} empty={grand['empty_body']:>4}")
+    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
+    print(f"Konec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    # exit code: 0 jen kdyz vsechny schranky probehly bez chyby
+    return 1 if grand["errors"] > 0 else 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())
+    except KeyboardInterrupt:
+        print("\nPreruseno uzivatelem")
+    except Exception:
+        traceback.print_exc()
+        sys.exit(1)
@@ -0,0 +1,455 @@
+"""
+==============================================================================
+Skript:   enrich_fulltext_emails_v1.1.py
+Verze:    1.1
+Datum:    2026-06-03
+Autor:    vladimir.buzalka
+
+Popis:
+  Vytahne plny text z emailu ulozenych v MongoDB (db: emaily) a ulozi ho do
+  PostgreSQL (db: MongoEmaily, tabulka: emails) s GIN tsvector indexem.
+
+  Emaily se NESTAHUJI znovu - tela uz jsou v Mongo z parse_emails_graph_v1.4
+  (a refetch_text_bodies_v1.0 pro stare plain-text emaily).
+  Tento skript jen vybere prvni dostupne telo a posle text do PG na fulltext.
+
+Zmeny proti v1.0:
+  - Fallback poradi rozsireno: body_html -> body_text (novy v parse_emails_graph_v1.4)
+    -> body_preview -> empty. Drive bylo body_html -> body_preview.
+  - body_source umi novou hodnotu "text" (plne plain-text telo, max 2 MB).
+  - EXTRACTOR_VERSION=1.1 -> vsechny existujici emaily v PG se preparsuji.
+
+Zdroj:
+  MongoDB    192.168.1.76  db=emaily  kolekce=<mailbox>
+             (krome attachments_index)
+
+Cil:
+  PostgreSQL 192.168.1.76  db=MongoEmaily  tabulka=emails
+             tsvector config 'soubory' (sdileny - simple + unaccent)
+
+Inkrementalita:
+  Pokud (mailbox, message_id) jiz existuje a extractor_version je aktualni
+  a modified_at v Mongo neni novejsi -> skip. Pri zmene verze extractoru
+  se vse preparsuje.
+
+Spusteni:
+  python enrich_fulltext_emails_v1.1.py                       # vsechny schranky
+  python enrich_fulltext_emails_v1.1.py --mailbox vbuzalka@its.jnj.com
+  python enrich_fulltext_emails_v1.1.py --limit 500           # test
+==============================================================================
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import re
+import sys
+import time
+import traceback
+from datetime import datetime, timezone
+from typing import Optional
+
+import psycopg
+from bs4 import BeautifulSoup
+from pymongo import MongoClient
+
+# --- configuration ----------------------------------------------------------
+# Connection settings can be overridden through environment variables so that
+# credentials do not have to live in the source tree. The literals are kept
+# only as backward-compatible fallbacks for existing deployments.
+MONGO_URI = os.environ.get("EMAILS_MONGO_URI", "mongodb://192.168.1.76:27017")
+MONGO_DB = "emaily"
+
+# NOTE(review): the fallback DSN embeds a plaintext password that is now part
+# of version-control history - rotate it and supply EMAILS_PG_DSN instead.
+PG_DSN = os.environ.get(
+    "EMAILS_PG_DSN",
+    "host=192.168.1.76 port=5432 dbname=MongoEmaily "
+    "user=vladimir.buzalka password=Vlado7309208104++",
+)
+
+# Stored with every row; a row whose stored version differs is re-extracted.
+EXTRACTOR_VERSION = "1.1"
+
+MAX_TEXT_BYTES = 5 * 1024 * 1024   # plain text max 5 MB
+# Collections in the Mongo database that are not mailboxes.
+SKIP_COLLECTIONS = {"attachments_index"}
+
+# Number of rows buffered before they are upserted and committed.
+BATCH_SIZE = 100
+
+
+# --- SCHEMA -----------------------------------------------------------------
+
+# DDL executed by main() on every start. Every statement is guarded with
+# IF NOT EXISTS (or an explicit existence check), so re-running it is harmless,
+# but it also means changes to the CREATE TABLE text do not alter a table that
+# already exists.
+#
+# - 'soubory' is a text search configuration copied from 'simple' with
+#   unaccent applied to word tokens.
+# - tsv is a stored generated column built from subject, sender, to/cc
+#   recipients, attachment names and body, cut to the first 800000 characters.
+#   bcc_addrs is stored but is not part of tsv.
+# - NOTE: the inline SQL comment on body_source lists 'html' | 'preview' |
+#   'empty'; process_mailbox() also writes 'text'.
+SCHEMA_SQL = """
+CREATE EXTENSION IF NOT EXISTS unaccent;
+CREATE EXTENSION IF NOT EXISTS pg_trgm;
+
+DO $$
+BEGIN
+    IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
+        CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
+        ALTER TEXT SEARCH CONFIGURATION soubory
+            ALTER MAPPING FOR hword, hword_part, word
+            WITH unaccent, simple;
+    END IF;
+END$$;
+
+CREATE TABLE IF NOT EXISTS emails (
+    id              BIGSERIAL PRIMARY KEY,
+    mailbox         TEXT NOT NULL,
+    message_id      TEXT NOT NULL,
+    graph_id        TEXT,
+    conversation_id TEXT,
+    folder_path     TEXT,
+    subject         TEXT,
+    sender_email    TEXT,
+    sender_name     TEXT,
+    to_addrs        TEXT,
+    cc_addrs        TEXT,
+    bcc_addrs       TEXT,
+    sent_at         TIMESTAMPTZ,
+    received_at     TIMESTAMPTZ,
+    modified_at     TIMESTAMPTZ,
+    is_read         BOOLEAN,
+    is_draft        BOOLEAN,
+    has_attachments BOOLEAN,
+    attachment_count INT,
+    attachments_summary TEXT,
+    body            TEXT,
+    body_length     INT,
+    body_source     TEXT,         -- 'html' | 'preview' | 'empty'
+    tsv             tsvector GENERATED ALWAYS AS (
+        to_tsvector('soubory'::regconfig,
+            left(
+                coalesce(subject, '') || ' ' ||
+                coalesce(sender_email, '') || ' ' ||
+                coalesce(sender_name, '') || ' ' ||
+                coalesce(to_addrs, '') || ' ' ||
+                coalesce(cc_addrs, '') || ' ' ||
+                coalesce(attachments_summary, '') || ' ' ||
+                coalesce(body, ''),
+            800000)
+        )
+    ) STORED,
+    extracted_at      TIMESTAMPTZ DEFAULT now(),
+    extractor_version TEXT,
+    ok                BOOLEAN,
+    error             TEXT,
+    UNIQUE (mailbox, message_id)
+);
+
+CREATE INDEX IF NOT EXISTS emails_tsv_gin            ON emails USING gin(tsv);
+CREATE INDEX IF NOT EXISTS emails_subject_trgm       ON emails USING gin(subject gin_trgm_ops);
+CREATE INDEX IF NOT EXISTS emails_sender_email_idx   ON emails(sender_email);
+CREATE INDEX IF NOT EXISTS emails_mailbox_idx        ON emails(mailbox);
+CREATE INDEX IF NOT EXISTS emails_received_idx       ON emails(received_at DESC);
+CREATE INDEX IF NOT EXISTS emails_conv_idx           ON emails(conversation_id);
+"""
+
+
+# --- HELPERY ----------------------------------------------------------------
+
+# ASCII control characters except tab (\x09), LF (\x0a) and CR (\x0d).
+# The range includes NUL (\x00) - presumably the main reason for the filter,
+# since PostgreSQL text values cannot contain it (confirm if this matters).
+_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
+# One or more consecutive spaces/tabs (collapsed to one space in html_to_text).
+_WS_RX = re.compile(r"[ \t]+")
+# Three or more consecutive newlines (collapsed to two in html_to_text).
+_NL_RX = re.compile(r"\n{3,}")
+
+
+def _clean_for_pg(s: str) -> str:
+    """Return *s* without the control characters matched by _CTRL_RX.
+
+    Falsy input (None or an empty string) yields an empty string.
+    """
+    return _CTRL_RX.sub("", s) if s else ""
+
+
+def _truncate(s: str) -> str:
+    """Clean *s* and cap its UTF-8 encoding at MAX_TEXT_BYTES.
+
+    The text is first passed through _clean_for_pg. If the encoded form is
+    longer than the limit, it is cut at the byte limit and decoded again.
+    """
+    cleaned = _clean_for_pg(s or "")
+    if not cleaned:
+        return ""
+    encoded = cleaned.encode("utf-8", errors="replace")
+    if len(encoded) > MAX_TEXT_BYTES:
+        # errors="ignore" drops a multi-byte character split by the byte slice
+        return encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
+    return cleaned
+
+
+def html_to_text(html: str) -> str:
+    """Extract plain text from an HTML email body.
+
+    <script>, <style> and <head> elements are removed. Whitespace is then
+    normalised: runs of spaces/tabs become one space, every line is stripped,
+    and empty lines are dropped. Falsy input yields an empty string.
+    """
+    if not html:
+        return ""
+    try:
+        soup = BeautifulSoup(html, "lxml")
+    except Exception:
+        # fall back to the parser bundled with Python when "lxml" fails
+        soup = BeautifulSoup(html, "html.parser")
+    for node in soup(["script", "style", "head"]):
+        node.decompose()
+
+    kept = []
+    for raw_line in soup.get_text(separator="\n").split("\n"):
+        squeezed = _WS_RX.sub(" ", raw_line).strip()
+        if squeezed:
+            kept.append(squeezed)
+    return _NL_RX.sub("\n\n", "\n".join(kept))
+
+
+def fmt_recipients(recipients: list, kind: str) -> str:
+    """Build the to_addrs/cc_addrs/bcc_addrs column value.
+
+    Keeps only dict entries whose "type" equals *kind* and renders each as
+    'Name <email>', or as just the email or just the name when only one of
+    them is present. Entries are joined with '; '. Entries with neither a
+    name nor an email are left out.
+    """
+    if not recipients:
+        return ""
+    parts = []
+    for rcpt in recipients:
+        if not isinstance(rcpt, dict) or rcpt.get("type") != kind:
+            continue
+        name = (rcpt.get("name") or "").strip()
+        email = (rcpt.get("email") or "").strip()
+        if name and email:
+            parts.append(f"{name} <{email}>")
+        elif email or name:
+            parts.append(email or name)
+    return "; ".join(parts)
+
+
+def fmt_attachments(attachments: list) -> str:
+    """Join the names of the first 20 attachments with ' | '.
+
+    For each dict entry the "name" key is used, falling back to "filename".
+    Non-dict entries and entries without a name are left out.
+    """
+    if not attachments:
+        return ""
+    names = (
+        att.get("name") or att.get("filename") or ""
+        for att in attachments[:20]
+        if isinstance(att, dict)
+    )
+    return " | ".join(nm for nm in names if nm)
+
+
+def _short(s, n=60):
+    """Return a single-line preview of *s*, at most *n* characters plus '...'.
+
+    Newlines are replaced with spaces and the result is stripped. Falsy
+    input yields an empty string.
+    """
+    if not s:
+        return ""
+    flat = str(s).replace("\n", " ").strip()
+    if len(flat) > n:
+        return flat[:n] + "..."
+    return flat
+
+
+def _now() -> datetime:
+    """Return the current time as a timezone-aware datetime in UTC."""
+    utc = timezone.utc
+    return datetime.now(tz=utc)
+
+
+# --- HLAVNI SMYCKA ----------------------------------------------------------
+
+def _as_utc(value):
+    """Return *value* with UTC attached when it is a naive datetime.
+
+    pymongo hands back naive datetimes (documented as UTC) unless the client
+    is created with tz_aware=True, whereas psycopg returns TIMESTAMPTZ columns
+    as aware datetimes. Comparing the two raises TypeError, and a naive value
+    written to TIMESTAMPTZ is interpreted in the session time zone. Anything
+    that is not a naive datetime is returned unchanged.
+    """
+    if isinstance(value, datetime) and value.tzinfo is None:
+        return value.replace(tzinfo=timezone.utc)
+    return value
+
+
+def _is_current(prev, mongo_mtime) -> bool:
+    """Tell whether the PG row *prev* is still up to date.
+
+    *prev* is the (extractor_version, modified_at, ok) tuple read from PG, or
+    None when the message has no row yet. The row is current when it was
+    written successfully by this extractor version and Mongo has no newer
+    modification time.
+    """
+    if not prev:
+        return False
+    version, pg_mtime, was_ok = prev
+    if version != EXTRACTOR_VERSION or not was_ok:
+        return False
+    if mongo_mtime is None:
+        return True
+    if not pg_mtime:
+        return False
+    try:
+        return _as_utc(pg_mtime) >= mongo_mtime
+    except TypeError:
+        # values that cannot be compared (e.g. a non-datetime in Mongo):
+        # re-extract rather than abort the whole mailbox
+        return False
+
+
+def _pick_body(doc: dict) -> tuple[str, str]:
+    """Return (text, body_source) for the first non-empty body variant.
+
+    Fallback order (v1.1): body_html -> body_text -> body_preview -> empty.
+    HTML that yields no text after extraction counts as empty and falls
+    through to the next variant.
+    """
+    html = doc.get("body_html") or ""
+    text = html_to_text(html) if html else ""
+    if text:
+        return text, "html"
+    plain = doc.get("body_text") or ""
+    if plain:
+        return plain, "text"
+    preview = doc.get("body_preview") or ""
+    if preview:
+        return preview, "preview"
+    return "", "empty"
+
+
+def process_mailbox(pg: psycopg.Connection, mongo_coll, mailbox: str,
+                    limit: Optional[int] = None) -> dict:
+    """Extract body text for one mailbox collection and upsert it into PG.
+
+    Args:
+        pg: open PostgreSQL connection; rows are written through _flush().
+        mongo_coll: Mongo collection holding the emails of this mailbox.
+        mailbox: mailbox name stored in the ``mailbox`` column.
+        limit: optional cap on the number of Mongo documents read. The cap is
+            applied before the up-to-date check, so skipped documents count
+            towards it.
+
+    Returns:
+        Counters for the run: ``mailbox``, ``processed``, ``ok``, ``errors``,
+        ``skipped`` and ``empty_body``.
+
+    A failure while extracting one body is recorded on that row (ok=False,
+    error=...) and does not stop the run. Database errors propagate.
+    """
+    # existing PG rows for this mailbox (fast incremental lookup)
+    with pg.cursor() as cur:
+        cur.execute(
+            "SELECT message_id, extractor_version, modified_at, ok "
+            "FROM emails WHERE mailbox = %s",
+            (mailbox,),
+        )
+        existing = {row[0]: (row[1], row[2], row[3]) for row in cur.fetchall()}
+
+    proj = {
+        "_id": 1, "graph_id": 1, "conversation_id": 1, "folder_path": 1,
+        "subject": 1, "sender": 1, "recipients": 1,
+        "sent_at": 1, "received_at": 1, "modified_at": 1,
+        "is_read": 1, "is_draft": 1,
+        "has_attachments": 1, "attachment_count": 1, "attachments": 1,
+        "body_html": 1, "body_text": 1, "body_preview": 1,
+    }
+    # no_cursor_timeout=True: the cursor is closed explicitly in the finally
+    # block below.
+    cursor = mongo_coll.find({}, proj, no_cursor_timeout=True)
+    if limit:
+        cursor = cursor.limit(limit)
+
+    total_pending = limit or mongo_coll.estimated_document_count()
+    print(f"[{mailbox}] kandidatu: ~{total_pending}")
+
+    processed = ok = errors = skipped = empty_body = 0
+    queue: list[dict] = []
+
+    try:
+        for n, doc in enumerate(cursor, 1):
+            msg_id = doc.get("_id") or ""
+            mongo_mtime = _as_utc(doc.get("modified_at"))
+            if _is_current(existing.get(msg_id), mongo_mtime):
+                skipped += 1
+                continue
+
+            sender = doc.get("sender") or {}
+            recipients = doc.get("recipients") or []
+            attachments = doc.get("attachments") or []
+
+            row = {
+                "mailbox": mailbox,
+                "message_id": msg_id,
+                "graph_id": doc.get("graph_id"),
+                "conversation_id": doc.get("conversation_id"),
+                "folder_path": doc.get("folder_path"),
+                "subject": doc.get("subject") or "",
+                "sender_email": sender.get("email"),
+                "sender_name": sender.get("name"),
+                "to_addrs": fmt_recipients(recipients, "to"),
+                "cc_addrs": fmt_recipients(recipients, "cc"),
+                "bcc_addrs": fmt_recipients(recipients, "bcc"),
+                # naive Mongo datetimes are UTC; make that explicit so PG does
+                # not reinterpret them in the session time zone
+                "sent_at": _as_utc(doc.get("sent_at")),
+                "received_at": _as_utc(doc.get("received_at")),
+                "modified_at": mongo_mtime,
+                "is_read": doc.get("is_read"),
+                "is_draft": doc.get("is_draft"),
+                "has_attachments": doc.get("has_attachments"),
+                "attachment_count": doc.get("attachment_count"),
+                "attachments_summary": fmt_attachments(attachments),
+                "body": None,
+                "body_length": 0,
+                "body_source": "empty",
+                "extracted_at": _now(),
+                "extractor_version": EXTRACTOR_VERSION,
+                "ok": False,
+                "error": None,
+            }
+
+            status = "OK "
+            try:
+                text, source = _pick_body(doc)
+                row["body_source"] = source
+                if source == "empty":
+                    empty_body += 1
+                body = _truncate(text)
+                row["body"] = body if body else None
+                row["body_length"] = len(body)
+                row["ok"] = True
+                ok += 1
+            except Exception as e:
+                # keep the row (ok=False) so the failure is visible in PG
+                row["error"] = f"{type(e).__name__}: {e}"[:500]
+                status = "ERR"
+                errors += 1
+
+            queue.append(row)
+            processed += 1
+
+            # progress line for the first document and then every 200th
+            if n % 200 == 0 or n == 1:
+                subj = _short(row["subject"], 50)
+                print(f"  [{n:>5}] {status} {row['body_source']:<7} "
+                      f"{row['body_length']:>7}ch  | {subj}", flush=True)
+
+            if len(queue) >= BATCH_SIZE:
+                _flush(pg, queue)
+                queue.clear()
+    finally:
+        cursor.close()
+
+    if queue:
+        _flush(pg, queue)
+
+    return {"mailbox": mailbox, "processed": processed, "ok": ok,
+            "errors": errors, "skipped": skipped, "empty_body": empty_body}
+
+
+# Upsert for one emails row, keyed by (mailbox, message_id). The named
+# placeholders match the keys of the row dict built in process_mailbox().
+# On conflict every column except the key (and the generated id/tsv columns,
+# which are not listed) is overwritten with the new value.
+UPSERT_SQL = """
+INSERT INTO emails
+    (mailbox, message_id, graph_id, conversation_id, folder_path,
+     subject, sender_email, sender_name, to_addrs, cc_addrs, bcc_addrs,
+     sent_at, received_at, modified_at, is_read, is_draft,
+     has_attachments, attachment_count, attachments_summary,
+     body, body_length, body_source,
+     extracted_at, extractor_version, ok, error)
+VALUES
+    (%(mailbox)s, %(message_id)s, %(graph_id)s, %(conversation_id)s, %(folder_path)s,
+     %(subject)s, %(sender_email)s, %(sender_name)s, %(to_addrs)s, %(cc_addrs)s, %(bcc_addrs)s,
+     %(sent_at)s, %(received_at)s, %(modified_at)s, %(is_read)s, %(is_draft)s,
+     %(has_attachments)s, %(attachment_count)s, %(attachments_summary)s,
+     %(body)s, %(body_length)s, %(body_source)s,
+     %(extracted_at)s, %(extractor_version)s, %(ok)s, %(error)s)
+ON CONFLICT (mailbox, message_id) DO UPDATE SET
+    graph_id            = EXCLUDED.graph_id,
+    conversation_id     = EXCLUDED.conversation_id,
+    folder_path         = EXCLUDED.folder_path,
+    subject             = EXCLUDED.subject,
+    sender_email        = EXCLUDED.sender_email,
+    sender_name         = EXCLUDED.sender_name,
+    to_addrs            = EXCLUDED.to_addrs,
+    cc_addrs            = EXCLUDED.cc_addrs,
+    bcc_addrs           = EXCLUDED.bcc_addrs,
+    sent_at             = EXCLUDED.sent_at,
+    received_at         = EXCLUDED.received_at,
+    modified_at         = EXCLUDED.modified_at,
+    is_read             = EXCLUDED.is_read,
+    is_draft            = EXCLUDED.is_draft,
+    has_attachments     = EXCLUDED.has_attachments,
+    attachment_count    = EXCLUDED.attachment_count,
+    attachments_summary = EXCLUDED.attachments_summary,
+    body                = EXCLUDED.body,
+    body_length         = EXCLUDED.body_length,
+    body_source         = EXCLUDED.body_source,
+    extracted_at        = EXCLUDED.extracted_at,
+    extractor_version   = EXCLUDED.extractor_version,
+    ok                  = EXCLUDED.ok,
+    error               = EXCLUDED.error
+"""
+
+
+def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
+    """Upsert *rows* into the emails table and commit.
+
+    The text columns of every row are passed through _clean_for_pg first;
+    the row dicts are modified in place.
+    """
+    text_cols = ("subject", "sender_email", "sender_name", "to_addrs", "cc_addrs",
+                 "bcc_addrs", "attachments_summary", "body", "error", "folder_path")
+    for record in rows:
+        for col in text_cols:
+            value = record.get(col)
+            if value:
+                record[col] = _clean_for_pg(value)
+
+    with pg.cursor() as cur:
+        cur.executemany(UPSERT_SQL, rows)
+    pg.commit()
+
+
+def main() -> int:
+    """Run the extraction for one mailbox or for all of them.
+
+    Command-line arguments:
+        --mailbox  process only this collection (default: every collection
+                   except those in SKIP_COLLECTIONS)
+        --limit    cap on the number of emails read per mailbox (testing)
+
+    Returns:
+        Process exit status: 0 when no row failed, 1 when at least one
+        mailbox reported errors.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)")
+    ap.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
+    args = ap.parse_args()
+
+    t0 = time.time()
+    print("Pripojuji se k PostgreSQL...")
+    # The MongoEmaily database must already exist (create it externally via
+    # psql or DBeaver), because CREATE DATABASE cannot run in a transaction.
+    pg = psycopg.connect(PG_DSN, connect_timeout=10)
+    mongo = None
+    results = []
+    try:
+        with pg.cursor() as cur:
+            cur.execute(SCHEMA_SQL)
+        pg.commit()
+        print("Schema OK.")
+
+        print("Pripojuji se k MongoDB...")
+        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+        # fail fast when the server is unreachable
+        mongo.admin.command("ping")
+        db = mongo[MONGO_DB]
+
+        if args.mailbox:
+            mailboxes = [args.mailbox]
+        else:
+            mailboxes = [c for c in db.list_collection_names() if c not in SKIP_COLLECTIONS]
+        print(f"Schranky ({len(mailboxes)}): {mailboxes}")
+
+        for mb in mailboxes:
+            results.append(process_mailbox(pg, db[mb], mb, limit=args.limit))
+    finally:
+        # release both connections even when a step above raised
+        if mongo is not None:
+            mongo.close()
+        pg.close()
+
+    print("\n=== SHRNUTI ===")
+    for r in results:
+        print(f"  {r['mailbox']}: processed={r['processed']}  ok={r['ok']}  "
+              f"errors={r['errors']}  skipped={r['skipped']}  empty={r['empty_body']}")
+    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
+    # exit status 0 only when every mailbox finished without row errors
+    return 1 if any(r["errors"] for r in results) else 0
+
+
+if __name__ == "__main__":
+    # Script entry point: the process exit status is whatever main() returns.
+    try:
+        raise SystemExit(main())
+    except KeyboardInterrupt:
+        print("\nPreruseno uzivatelem")
+        # An interrupted run must not look like a success to the caller.
+        # Without an explicit status the interpreter would exit with 0;
+        # 130 is the shell convention for termination by SIGINT (128 + 2).
+        sys.exit(130)
+    except Exception:
+        # Last-resort boundary: show the traceback and fail the step.
+        traceback.print_exc()
+        sys.exit(1)
@@ -0,0 +1,41 @@
+#!/bin/bash
+# ============================================================================
+# Cron wrapper for the email pipeline. Runs the Python pipeline wrapper inside
+# the python-runner container, writes everything to a timestamped log file,
+# keeps a "latest" symlink and deletes logs older than RETENTION_DAYS.
+# The exit status is the status of the docker exec call.
+#
+# Install via User Scripts plugin or /etc/cron.d/email_pipeline:
+#   0 6,18 * * * /mnt/user/Scripts/run_pipeline.sh
+# ============================================================================
+
+set -u
+
+LOG_DIR="/mnt/user/Scripts/logs"
+RETENTION_DAYS=30
+TIMESTAMP=$(date +%Y%m%d_%H%M)
+LOG_FILE="${LOG_DIR}/pipeline_${TIMESTAMP}.log"
+LATEST_LINK="${LOG_DIR}/pipeline_latest.log"
+
+# Append one line to this run's log file.
+log() {
+    echo "$1" >> "$LOG_FILE"
+}
+
+# Succeeds when docker reports the python-runner container as running.
+container_running() {
+    docker inspect -f '{{.State.Running}}' python-runner 2>/dev/null | grep -q true
+}
+
+mkdir -p "$LOG_DIR"
+
+log "=== Email pipeline run @ $(date '+%Y-%m-%d %H:%M:%S') ==="
+
+# Start the container when it is down; give up if it cannot be started.
+if ! container_running; then
+    log "ERROR: python-runner container is not running"
+    docker start python-runner >> "$LOG_FILE" 2>&1 || exit 1
+    sleep 5
+fi
+
+docker exec python-runner python /scripts/0_run_pipeline_v1.0.py --quiet >> "$LOG_FILE" 2>&1
+RET=$?
+
+log ""
+log "=== Wrapper finished @ $(date '+%Y-%m-%d %H:%M:%S') exit=$RET ==="
+
+# Point the "latest" symlink at this run for easy tailing.
+ln -sf "$LOG_FILE" "$LATEST_LINK"
+
+# Remove regular log files older than RETENTION_DAYS (the symlink is skipped
+# by -type f).
+find "$LOG_DIR" -name 'pipeline_*.log' -type f -mtime "+${RETENTION_DAYS}" -delete
+
+exit "$RET"