This commit is contained in:
2026-06-05 21:21:30 +02:00
parent 1ec9e40196
commit a347051145
28 changed files with 7402 additions and 0 deletions
+77
View File
@@ -0,0 +1,77 @@
# 0_run_pipeline_v1.0.py
**Wrapper kolem celé emailové pipeline.** Spustí postupně všechny 4 kroky daily syncu, vždy přes všechny dostupné schránky:
| # | Krok | Skript |
|---|---|---|
| 1b | Graph delta sync (inkrementální Mongo update) | `1b_parse_emails_graph_delta_v1.0.py` |
| 3 | Download attachments | `3_download_attachments_v1.4.py` |
| 4 | Unwrap S/MIME | `4_unwrap_smime_v1.0.py` |
| 5 | Enrich fulltext (PG) | `5_enrich_fulltext_emails_v1.3.py` |
## Politika chyb
Default je **continue-on-error** — když některý krok selže, pipeline pokračuje dalším (downstream se nezasekne kvůli minor problému). Po vyběhnutí dostaneš souhrnnou tabulku s `OK / FAIL(N)` per krok.
Použij `--stop-on-error`, pokud chceš tvrdý abort při první chybě.
## Logování
Každý krok jde do vlastního logu v `/scripts/pipeline_<id>.log`:
- `pipeline_1b.log`
- `pipeline_3.log`
- `pipeline_4.log`
- `pipeline_5.log`
Live výstup se zároveň tee-uje na konzoli (vypneš přes `--quiet`).
## Argumenty
| Argument | Hodnoty | Popis |
|---|---|---|
| `--only` | `1b 3 4 5` | Spustit jen tyto kroky |
| `--skip` | `1b 3 4 5` | Přeskočit tyto kroky |
| `--stop-on-error` | flag | Zastavit při první chybě (default: pokračovat) |
| `--quiet` | flag | Necpat stdout na konzoli (zůstane v logu) |
## Varianty volání
```bash
# Daily run — vše, všechny schránky:
docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py
# Jen enrich (např. po manuálním zásahu do Mongo):
docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py --only 5
# Vše mimo S/MIME (krok 4 občas vyžaduje pip install asn1crypto):
docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py --skip 4
# Test daily sync bez fulltextu:
docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py --only 1b 3 4
# Na pozadí, master log:
docker exec -d python-runner bash -c "python /scripts/0_run_pipeline_v1.0.py --quiet > /scripts/pipeline_master.log 2>&1"
docker exec -it python-runner tail -f /scripts/pipeline_master.log
```
## Cron / nightly automation
Pro nightly se hodí jednoduchý cron na Unraidu (`/etc/cron.daily/` nebo User Scripts plugin):
```bash
#!/bin/bash
docker exec python-runner python /scripts/0_run_pipeline_v1.0.py --quiet \
> /mnt/user/Scripts/pipeline_$(date +%Y%m%d).log 2>&1
```
Stačí denně, delta sync z minulého stavu trvá ~30s s prázdným backlogem.
## Exit kódy wrapperu
| Kód | Význam |
|---|---|
| 0 | Všechny kroky OK |
| 1 | Alespoň jeden krok selhal |
| 2 | Žádný krok k běhu (--only + --skip vyloučily vše) |
| 127 | Některý skript neexistuje v `/scripts/` |
| 130 | Přerušeno Ctrl+C |
+176
View File
@@ -0,0 +1,176 @@
"""
==============================================================================
Skript: 0_run_pipeline_v1.0.py
Verze: 1.0
Datum: 2026-06-04
Autor: vladimir.buzalka
Popis:
Wrapper kolem cele emailove pipeline. Spousti postupne:
1b. parse_emails_graph_delta -> delta sync z Graph API do Mongo
3. download_attachments -> stahne pripojeny soubory
4. unwrap_smime -> rozbali S/MIME wrapper zpravy
5. enrich_fulltext_emails -> doindexuje do PG fulltext
Vzdy projizdi VSECHNY schranky (mimo SKIP_MAILBOXES v jednotlivych skriptech).
Per-krok merici cas + exit code. Pokud krok selze, default pokracuje dal
(aby se downstream nezasekl) — viz --stop-on-error.
Vsechny vystupy a chyby kazdeho kroku jsou ulozeny do /scripts/pipeline_<step>.log
Spousteni:
python 0_run_pipeline_v1.0.py # vse, vsechny schranky
python 0_run_pipeline_v1.0.py --only 5 # jen krok 5 (enrich)
python 0_run_pipeline_v1.0.py --skip 4 # vse mimo smime unwrap
python 0_run_pipeline_v1.0.py --stop-on-error # zastavit pri prvni chybe
python 0_run_pipeline_v1.0.py --quiet # bez tee na konzoli, jen logy
Docker:
docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py
==============================================================================
"""
from __future__ import annotations
import argparse
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
# Directory that holds every pipeline script.
SCRIPTS_DIR = Path("/scripts")
LOGS_DIR = SCRIPTS_DIR  # per-step logs all go to /scripts/ as well
# Pipeline definition in execution order: (step_id, label, script filename).
STEPS = [
    ("1b", "Graph delta sync", "1b_parse_emails_graph_delta_v1.0.py"),
    ("3", "Download attachments", "3_download_attachments_v1.4.py"),
    ("4", "Unwrap S/MIME", "4_unwrap_smime_v1.0.py"),
    ("5", "Enrich fulltext (PG)", "5_enrich_fulltext_emails_v1.3.py"),
]
def fmt_dur(s: float) -> str:
    """Render a duration given in seconds as a compact string.

    Durations under one minute keep a single decimal place (``12.3s``).
    Longer ones are truncated to whole seconds and shown as ``MmSSs`` or,
    from one hour up, ``HhMMmSSs``.
    """
    if s < 60:
        return f"{s:.1f}s"

    whole_seconds = int(s)
    total_minutes = whole_seconds // 60
    seconds = whole_seconds % 60

    if total_minutes >= 60:
        hours = total_minutes // 60
        minutes = total_minutes % 60
        return f"{hours}h{minutes:02d}m{seconds:02d}s"
    return f"{total_minutes}m{seconds:02d}s"
def run_step(step_id: str, label: str, script: str, *,
             quiet: bool = False) -> tuple[int, float]:
    """Run one pipeline script as a child process and tee its output.

    The child's stdout and stderr are merged, written line by line to
    ``LOGS_DIR / pipeline_<step_id>.log`` (overwritten on every run) and,
    unless ``quiet`` is set, echoed to the console as well.

    Args:
        step_id: Short step identifier, used in the banner and the log name.
        label: Human-readable step name for the banner.
        script: File name of the script inside ``SCRIPTS_DIR``.
        quiet: When True, the child's output goes to the log only.

    Returns:
        ``(exit_code, duration_seconds)``. A missing script is reported as
        ``(127, 0.0)`` without starting any process.
    """
    script_path = SCRIPTS_DIR / script
    log_path = LOGS_DIR / f"pipeline_{step_id}.log"
    if not script_path.exists():
        print(f" CHYBA: {script_path} neexistuje!")
        return 127, 0.0

    print(f"\n{'='*70}")
    print(f" KROK {step_id}: {label}")
    print(f" script: {script_path}")
    print(f" log: {log_path}")
    print(f" start: {datetime.now().strftime('%H:%M:%S')}")
    print(f"{'='*70}")

    t0 = time.time()
    # Tee: write to the log and (unless --quiet) to the console at once.
    with open(log_path, "w", encoding="utf-8") as logf:
        proc = subprocess.Popen(
            # -u: a Python child writing to a pipe is block-buffered by
            # default, which would delay the "live" output by whole buffers.
            [sys.executable, "-u", str(script_path)],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
            encoding="utf-8",
            errors="replace",
        )
        try:
            for line in proc.stdout:
                logf.write(line)
                if not quiet:
                    print(line, end="", flush=True)
            ret = proc.wait()
        finally:
            # If the loop was interrupted (e.g. Ctrl+C), do not leave the
            # child running behind the wrapper's back.
            if proc.poll() is None:
                proc.kill()
                proc.wait()
            proc.stdout.close()

    dur = time.time() - t0
    status = "OK" if ret == 0 else f"FAILED ({ret})"
    print(f"\n KROK {step_id} {status} za {fmt_dur(dur)}")
    return ret, dur
def main() -> int:
    """Parse CLI options, run the selected pipeline steps and print a summary.

    Steps run in the order given by ``STEPS``, filtered by ``--only`` and
    ``--skip``. By default a failing step does not stop the pipeline;
    ``--stop-on-error`` aborts after the first non-zero exit code.

    Returns:
        The wrapper's exit code:
        0 when every executed step succeeded,
        1 when at least one step failed,
        2 when the filters left no step to run,
        127 when a step's script file was missing.
    """
    ap = argparse.ArgumentParser(description="Email pipeline wrapper v1.0")
    ap.add_argument("--only", nargs="+", default=None,
                    help="Spustit jen tyto kroky (napr. --only 3 4 5)")
    ap.add_argument("--skip", nargs="+", default=None,
                    help="Preskocit tyto kroky")
    ap.add_argument("--stop-on-error", action="store_true",
                    help="Zastavit pipeline pri prvni nenulovem exit kodu")
    ap.add_argument("--quiet", action="store_true",
                    help="Necpat stdout kroku na konzoli, jen do logu")
    args = ap.parse_args()

    # Filter the step set.
    only_set = set(args.only) if args.only else None
    skip_set = set(args.skip) if args.skip else set()

    # A mistyped step id used to be ignored silently; at least say so.
    known_ids = {sid for sid, _, _ in STEPS}
    unknown_ids = sorted(((only_set or set()) | skip_set) - known_ids)
    if unknown_ids:
        print(f"VAROVANI: nezname kroky ignorovany: {', '.join(unknown_ids)}")

    to_run = [
        (sid, label, script)
        for sid, label, script in STEPS
        if (only_set is None or sid in only_set) and sid not in skip_set
    ]
    if not to_run:
        print("Zadny krok k spusteni.")
        return 2

    print("=== Email Pipeline Wrapper v1.0 ===")
    print(f"Start: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Kroku k spusteni: {len(to_run)}")
    for sid, label, _ in to_run:
        print(f" {sid}: {label}")
    if args.stop_on_error:
        print("Politika: stop-on-error")
    else:
        print("Politika: continue-on-error (default)")

    t_all = time.time()
    results = []
    for sid, label, script in to_run:
        ret, dur = run_step(sid, label, script, quiet=args.quiet)
        results.append((sid, label, ret, dur))
        if ret != 0 and args.stop_on_error:
            print(f"\n!!! Pipeline zastavena na kroku {sid} (--stop-on-error)")
            break
    total_dur = time.time() - t_all

    print(f"\n{'='*70}")
    print("=== SHRNUTI PIPELINE ===")
    print(f"{'='*70}")
    failed = 0
    for sid, label, ret, dur in results:
        status = "OK" if ret == 0 else f"FAIL({ret})"
        if ret != 0:
            failed += 1
        print(f" [{sid:>2}] {label:30} {status:>8} {fmt_dur(dur):>10}")
    print(f"{'='*70}")
    print(f" Celkem: {len(results)} kroku, {failed} chyb, {fmt_dur(total_dur)}")
    print(f" Konec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f" Per-krok logy: {LOGS_DIR}/pipeline_<id>.log")

    # A missing script (run_step reports 127) gets its own exit code so that
    # callers can tell a broken deployment from a step that ran and failed.
    if any(ret == 127 for _, _, ret, _ in results):
        return 127
    return 1 if failed else 0
if __name__ == "__main__":
    try:
        # Turn main()'s return value into the process exit code.
        raise SystemExit(main())
    except KeyboardInterrupt:
        # Ctrl+C: report the interruption and exit with code 130.
        print("\nPreruseno uzivatelem")
        sys.exit(130)
@@ -0,0 +1,41 @@
# 1_parse_emails_graph_v1.4.py
**Krok 1 pipeline** — import emailů z libovolné schránky přes Microsoft Graph API do MongoDB (`emaily.<mailbox>`). Čte všechny složky rekurzivně. Upsert podle Message-ID → bezpečné přerušit a opakovat.
## Argumenty
| Argument | Povinný | Hodnoty | Default | Popis |
|---|---|---|---|---|
| `--mailbox` | ano | e-mail | — | Schránka = název kolekce v Mongo |
| `--mode` | ne | `full` / `new-only` / `sync` | `full` | `full` = plný upsert; `new-only` = jen nové; `sync` = aktualizuje `is_read`/`flag_status`/`categories`/`folder_path` u existujících + importuje nové |
| `--folder` | ne | název složky | (všechny) | Jen jedna složka (např. `Inbox`) |
| `--limit N` | ne | int | 0 (bez limitu) | Zpracuje jen prvních N zpráv (test) |
| `--no-indexes` | ne | flag | false | Nevytváří indexy na konci |
## Varianty volání
```bash
# První plný import schránky (vše):
docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz
# Test na 50 zprávách bez vytváření indexů:
docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
# Jen nové emaily (po prvním importu):
docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode new-only
# Pravidelný sync (nové + aktualizace flagů u existujících) na pozadí, log do souboru:
docker exec -d python-runner bash -c "python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode sync > /scripts/parse_emails.log 2>&1"
# Import jen složky Inbox:
docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --folder Inbox
# Test 10 emailů z konkrétní složky:
docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --folder "Sent Items" --limit 10
```
## Sledování průběhu
```bash
docker exec -it python-runner tail -f /scripts/parse_emails.log
```
+624
View File
@@ -0,0 +1,624 @@
"""
parse_emails_graph_v1.4.py
Nazev: parse_emails_graph_v1.4.py
Verze: 1.4
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis:
Cte vsechny emaily z libovolne schranky primo pres Microsoft Graph API
a importuje je jako dokumenty do MongoDB.
Ze kazde zpravy extrahuje vsechny dostupne vlastnosti:
- predmet, odesilatel, prijemci (To/CC/BCC s typy)
- cas doruceni, odeslani, vytvoreni, modifikace (UTC)
- telo HTML (max 2 MB) + textovy preview
- prilohy (metadata: jmeno, velikost, MIME typ, inline flag, graph_att_id)
- internet headers (SPF, DKIM, Received, X-*, ...)
- MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno,
kategorie, In-Reply-To, References, ...
- navic: isRead, isDraft, folder_path, inferenceClassification
Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted,
archivni slozky, ...).
DB: emaily
Kolekce: <mailbox> (napr. ordinace@buzalkova.cz)
_id: Internet Message-ID (nebo "graphid:<id>" jako fallback)
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
Spousteni:
# Prvni import (vsechno):
python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz
# Test na prvnich 50:
python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
# Jen jedna slozka:
python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --folder Inbox
# Pokracovani po preruseni (pouze nove):
python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode new-only
# Pravidelny sync (aktualizuje is_read, flag, slozku; importuje nove):
python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode sync
# Jina schranka:
python 1_parse_emails_graph_v1.4.py --mailbox vladimir.buzalka@buzalka.cz
Rezimy (--mode):
full Plny upsert vsech poli pro kazdou zpravu (vychozi)
new-only Preskoci zpravy ktere uz jsou v MongoDB, importuje jen nove
sync Existujici: aktualizuje jen is_read/flag_status/categories/
modified_at/folder_path. Nove zpravy importuje cely.
Idealni pro pravidelne spousteni.
Zavislosti:
msal, requests, pymongo, python-dateutil
Python 3.10+
Struktura dokumentu v MongoDB:
_id Internet Message-ID (nebo graphid: fallback)
graph_id Graph API message ID
subject predmet zpravy
normalized_subject predmet bez RE:/FW:/AW: prefixu
importance 0=nizka 1=normalni 2=vysoka
flag_status 0=bez priznaku 1=oznaceno 2=dokonceno
is_read bool — aktualni stav precteni ve schrance
is_draft bool
has_attachments bool
attachment_count int
inference_classification focused / other
categories [str]
conversation_id Graph conversationId
conversation_index base64 conversationIndex
conversation_topic tema vlakna (z internet headers Thread-Topic)
in_reply_to Message-ID predchozi zpravy
internet_references [Message-ID]
received_at datetime UTC
sent_at datetime UTC
created_at datetime UTC
modified_at datetime UTC
folder_id Graph parentFolderId
folder_path cela cesta slozky (napr. Inbox/Subfolder)
sender.email emailova adresa odesilatele
sender.name zobrazovane jmeno
to retezec To (joined)
cc retezec CC
bcc retezec BCC
recipients [{type, email, name}]
body_html HTML telo (pokud contentType=='html', max 2 MB)
body_text plain-text telo (pokud contentType=='text', max 2 MB)
body_preview textovy nahled z Graph bodyPreview (max 255 znaku)
attachments [{filename, size_bytes, mime_type, is_inline, graph_att_id}]
headers dict internet headers
parsed_at datetime UTC
Indexy:
received_at, sent_at, sender.email, graph_id (unique),
conversation_id, folder_path, has_attachments, categories,
importance, flag_status, is_read,
text_search (subject + body_preview + to + cc)
Historie verzi:
1.0 2026-06-02 Inicialni verze
1.1 2026-06-02 Pridany rezimy --mode full/new-only/sync;
odstranen --skip-existing (nahrazen --mode new-only)
1.2 2026-06-02 $expand attachments s $select (bez contentBytes — rychlejsi);
prilohy ukladaji graph_att_id pro prime stazeni bez name-matchingu
1.3 2026-06-02 --mailbox jako povinny parametr — univerzalni pouziti pro
libovolnou schranku; kolekce v MongoDB = nazev schranky
1.4 2026-06-03 Plain-text emaily (contentType=='text') se ukladaji do
noveho pole body_text (max 2 MB), drive se truncovalo na
2000 znaku do body_preview a zbytek se zahazoval.
body_preview ted obsahuje vzdy puvodni Graph bodyPreview.
Pro existujici emaily z v1.3 lze pouzit
refetch_text_bodies_v1.0.py.
"""
import argparse
import base64
import logging
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import msal
import requests
from dateutil import parser as dtparser
from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT
# Switch stdout to UTF-8 with replacement of unencodable characters; guarded
# because not every stdout object provides reconfigure().
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
# Credentials and connection strings can be overridden with an environment
# variable of the same name; the literals below are only fallbacks that keep
# existing deployments working.
# SECURITY NOTE(review): the Graph client secret is committed in plain text.
# Rotate it, supply it via the GRAPH_CLIENT_SECRET environment variable and
# then remove the literal fallback from source control.
GRAPH_TENANT_ID = os.environ.get("GRAPH_TENANT_ID", "7d269944-37a4-43a1-8140-c7517dc426e9")
GRAPH_CLIENT_ID = os.environ.get("GRAPH_CLIENT_ID", "4b222bfd-78c9-4239-a53f-43006b3ed07f")
GRAPH_CLIENT_SECRET = os.environ.get("GRAPH_CLIENT_SECRET", "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk")
GRAPH_URL = "https://graph.microsoft.com/v1.0"
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://192.168.1.76:27017")
MONGO_DB = "emaily"
BATCH_SIZE = 100  # bulk_write is flushed once this many operations are queued
PAGE_SIZE = 50  # $top used when paging through a folder's messages
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.4"
# The mailbox is set at runtime from the --mailbox argument.
GRAPH_MAILBOX: str = ""
# ──────────────────────────────────────────────────────────────────────────────
# Only errors are logged, and they go to LOG_FILE rather than the console.
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)
# Graph string values -> the integer codes stored in MongoDB.
IMPORTANCE_MAP = {"low": 0, "normal": 1, "high": 2}
FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2}
# One leading reply/forward prefix (RE:, FW:, AW:, ...) followed by colons/whitespace.
RE_SUBJECT = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE)
# $expand attachments without contentBytes — only the metadata we need.
ATT_EXPAND = "attachments($select=id,name,contentType,size,isInline)"
# Fields requested for a full message extraction.
MSG_SELECT = (
    "id,internetMessageId,subject,bodyPreview,body,"
    "importance,isRead,isDraft,hasAttachments,"
    "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
    "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
    "conversationId,conversationIndex,parentFolderId,"
    "categories,flag,inferenceClassification,internetMessageHeaders"
)
# Reduced field set used when listing messages in sync mode.
MSG_SELECT_SYNC = (
    "id,internetMessageId,isRead,isDraft,flag,categories,"
    "lastModifiedDateTime,parentFolderId,importance"
)
# ─── Graph API helpers ────────────────────────────────────────────────────────
# Most recently acquired Graph access token (None until get_token() has run).
_graph_token: Optional[str] = None


def get_token() -> str:
    """Acquire an app-only Graph access token and cache it module-wide.

    Uses the client-credentials flow with the configured tenant, client id
    and client secret. The token is stored in ``_graph_token`` and returned.

    Raises:
        RuntimeError: when the MSAL response carries no ``access_token``.
    """
    global _graph_token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    confidential_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    auth_result = confidential_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )
    if "access_token" in auth_result:
        _graph_token = auth_result["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {auth_result}")
def graph_get(url: str, params: Optional[dict] = None) -> dict:
    """GET a Graph API URL and return the decoded JSON body.

    Handles the two transient conditions a long import runs into:

    * 401 — the cached token is refreshed once and the request repeated.
    * 429 / 503 / 504 — the request is repeated after the delay given in the
      ``Retry-After`` header (exponential backoff when the header is missing
      or not a plain number of seconds), at most five times.

    Args:
        url: Absolute Graph URL (may be an ``@odata.nextLink``).
        params: Optional query parameters.

    Raises:
        RuntimeError: when the request is still unauthorized after the token
            refresh.
        requests.HTTPError: for any other non-success status, including
            throttling that outlasts the retries.
    """
    global _graph_token
    if not _graph_token:
        get_token()

    max_throttle_retries = 5
    token_refreshed = False
    throttle_attempts = 0
    while True:
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=30,
        )
        if r.status_code == 401:
            if token_refreshed:
                break
            token_refreshed = True
            get_token()
            continue
        if r.status_code in (429, 503, 504) and throttle_attempts < max_throttle_retries:
            throttle_attempts += 1
            try:
                delay = float(r.headers.get("Retry-After", ""))
            except ValueError:
                delay = float(2 ** throttle_attempts)
            # Clamp so that a bogus header can neither spin nor stall the run.
            time.sleep(min(max(delay, 1.0), 120.0))
            continue
        r.raise_for_status()
        return r.json()
    raise RuntimeError(f"Graph GET failed after retry: {url}")
def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]:
    """Recursively list the mailbox's folders as ``[{"id", "path"}, ...]``.

    With ``parent_id`` left at None the mailbox's top-level folders are
    listed; otherwise the children of that folder. ``path`` is the
    slash-joined chain of display names, prefixed by ``parent_path``. Each
    folder is followed directly by its own descendants.
    """
    folders_root = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders"
    if parent_id is None:
        next_url = folders_root
    else:
        next_url = f"{folders_root}/{parent_id}/childFolders"

    query = {"$top": 100, "$select": "id,displayName,childFolderCount"}
    collected = []
    while next_url:
        page = graph_get(next_url, query)
        for entry in page.get("value", []):
            full_path = f"{parent_path}/{entry['displayName']}".lstrip("/")
            collected.append({"id": entry["id"], "path": full_path})
            if entry.get("childFolderCount", 0) > 0:
                collected += get_all_folders(entry["id"], full_path)
        next_url = page.get("@odata.nextLink")
        # The nextLink already carries the query string.
        query = None
    return collected
def iter_folder_messages(folder_id: str, select: str = MSG_SELECT, expand_attachments: bool = True):
    """Yield every message of one folder, fetching page by page.

    Args:
        folder_id: Graph id of the mail folder.
        select: Comma-separated ``$select`` field list.
        expand_attachments: When True, attachment metadata is requested
            together with each message via ``$expand``.
    """
    query = {"$top": PAGE_SIZE, "$select": select}
    if expand_attachments:
        query["$expand"] = ATT_EXPAND

    next_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages"
    while next_url:
        page = graph_get(next_url, query)
        yield from page.get("value", [])
        next_url = page.get("@odata.nextLink")
        # Follow-up pages are addressed by the nextLink alone.
        query = None
# ─── Pomocné funkce ───────────────────────────────────────────────────────────
def parse_date(raw) -> Optional[datetime]:
    """Convert a date value to a naive datetime expressed in UTC.

    Accepts None, a ``datetime`` or anything whose string form dateutil can
    parse. Timezone-aware values are converted to UTC and stripped of their
    tzinfo; naive values are returned unchanged. Unparseable input yields
    None.
    """
    def to_naive_utc(moment: datetime) -> datetime:
        return moment.astimezone(timezone.utc).replace(tzinfo=None) if moment.tzinfo else moment

    if raw is None:
        return None
    if isinstance(raw, datetime):
        return to_naive_utc(raw)
    try:
        return to_naive_utc(dtparser.parse(str(raw)))
    except Exception:
        return None
def normalize_subject(subject: str) -> str:
    """Return the subject with all leading reply/forward prefixes removed.

    Prefixes matched by ``RE_SUBJECT`` are stripped repeatedly, so stacked
    ones such as ``RE: FW: RE:`` disappear entirely; surrounding whitespace
    is trimmed as well.
    """
    remaining = subject.strip()
    prefix = RE_SUBJECT.match(remaining)
    while prefix:
        remaining = remaining[prefix.end():].strip()
        prefix = RE_SUBJECT.match(remaining)
    return remaining
def parse_headers(raw_headers: list) -> dict:
    """Fold a list of ``{"name", "value"}`` header entries into a dict.

    Keys are the header names lower-cased with ``-`` replaced by ``_``. A
    header seen once maps to its value; a header seen several times maps to
    the list of its values in order of appearance.
    """
    collected = {}
    for header in raw_headers:
        key = header["name"].lower().replace("-", "_")
        value = header["value"]
        if key not in collected:
            collected[key] = value
            continue
        previous = collected[key]
        if isinstance(previous, list):
            collected[key] = previous + [value]
        else:
            collected[key] = [previous, value]
    return collected
def format_recipients(lst: list) -> str:
    """Join Graph recipient entries into one ``Name <address>; ...`` string.

    Each entry must carry an ``emailAddress`` dict; a missing name or
    address is rendered as an empty string.
    """
    rendered = []
    for entry in lst:
        address = entry["emailAddress"]
        rendered.append(f'{address.get("name", "")} <{address.get("address", "")}>'.strip())
    return "; ".join(rendered)
# ─── Extrakce zprávy ─────────────────────────────────────────────────────────
def extract_message(msg: dict, folder_path: str) -> Optional[dict]:
    """Build the complete MongoDB document for one Graph message.

    Used for full mode and for messages that are new in sync/new-only mode.

    Args:
        msg: Message resource as returned by Graph.
        folder_path: Path of the folder the message was listed in.

    Returns:
        The document to store, keyed by the Internet Message-ID (or
        ``graphid:<id>`` when the message has none). Any exception raised
        while extracting is logged and None is returned instead.
    """
    def first_value(value):
        # A repeated header arrives as a list; only its first value is used.
        return value[0] if isinstance(value, list) else value

    try:
        message_id = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
        subject = msg.get("subject") or ""

        # Body: HTML and plain text go to separate fields, each capped at
        # 2 * 1024 * 1024 characters; the preview is always Graph's own.
        max_body = 2 * 1024 * 1024
        preview = msg.get("bodyPreview") or ""
        html_body = None
        text_body = None
        body = msg.get("body", {})
        content_type = body.get("contentType")
        if content_type == "html":
            html_body = (body.get("content") or "")[:max_body]
        elif content_type == "text":
            text_body = (body.get("content") or "")[:max_body]

        sender_address = (msg.get("from") or msg.get("sender") or {}).get("emailAddress", {})

        to_list = msg.get("toRecipients", [])
        cc_list = msg.get("ccRecipients", [])
        bcc_list = msg.get("bccRecipients", [])
        recipients = [
            {
                "type": kind,
                "email": entry["emailAddress"].get("address", ""),
                "name": entry["emailAddress"].get("name", ""),
            }
            for kind, entries in (("to", to_list), ("cc", cc_list), ("bcc", bcc_list))
            for entry in entries
        ]

        importance = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1)
        flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0)

        headers = parse_headers(msg.get("internetMessageHeaders") or [])
        in_reply_to = first_value(headers.get("in_reply_to", ""))
        references = headers.get("references", "")
        if isinstance(references, list):
            references = " ".join(references)
        internet_refs = references.split() if references else []
        conv_topic = first_value(headers.get("thread_topic", ""))

        # Re-encode the conversation index; keep the raw value if it is not
        # valid base64.
        conv_index = ""
        raw_index = msg.get("conversationIndex")
        if raw_index:
            try:
                conv_index = base64.b64encode(base64.b64decode(raw_index)).decode()
            except Exception:
                conv_index = raw_index

        # Attachments without a name are skipped.
        attachments = [
            {
                "filename": att.get("name"),
                "size_bytes": att.get("size", 0),
                "mime_type": att.get("contentType", "application/octet-stream"),
                "is_inline": att.get("isInline", False),
                "graph_att_id": att.get("id"),
            }
            for att in msg.get("attachments") or []
            if att.get("name")
        ]

        return {
            "_id": message_id,
            "graph_id": msg["id"],
            "subject": subject,
            "normalized_subject": normalize_subject(subject),
            "importance": importance,
            "flag_status": flag_status,
            "is_read": msg.get("isRead", False),
            "is_draft": msg.get("isDraft", False),
            "has_attachments": msg.get("hasAttachments", False),
            "attachment_count": len(attachments),
            "inference_classification": msg.get("inferenceClassification", ""),
            "categories": msg.get("categories") or [],
            "conversation_id": msg.get("conversationId", ""),
            "conversation_index": conv_index,
            "conversation_topic": conv_topic,
            "in_reply_to": in_reply_to,
            "internet_references": internet_refs,
            "received_at": parse_date(msg.get("receivedDateTime")),
            "sent_at": parse_date(msg.get("sentDateTime")),
            "created_at": parse_date(msg.get("createdDateTime")),
            "modified_at": parse_date(msg.get("lastModifiedDateTime")),
            "folder_id": msg.get("parentFolderId", ""),
            "folder_path": folder_path,
            "sender": {
                "email": sender_address.get("address", ""),
                "name": sender_address.get("name", ""),
            },
            "to": format_recipients(to_list),
            "cc": format_recipients(cc_list),
            "bcc": format_recipients(bcc_list),
            "recipients": recipients,
            "body_html": html_body,
            "body_text": text_body,
            "body_preview": preview,
            "attachments": attachments,
            "headers": headers,
            "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
        }
    except Exception as e:
        logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e)
        return None
def extract_sync_fields(msg: dict, folder_path: str) -> dict:
    """Return only the mutable fields of a message.

    Used in sync mode for messages that already exist in MongoDB: read and
    draft state, flag, importance, categories, modification time and folder,
    plus a fresh ``parsed_at`` timestamp.
    """
    flag_name = (msg.get("flag") or {}).get("flagStatus", "notFlagged")
    importance_name = msg.get("importance", "normal")
    now_utc_naive = datetime.now(timezone.utc).replace(tzinfo=None)

    fields = {}
    fields["is_read"] = msg.get("isRead", False)
    fields["is_draft"] = msg.get("isDraft", False)
    fields["flag_status"] = FLAG_STATUS_MAP.get(flag_name, 0)
    fields["importance"] = IMPORTANCE_MAP.get(importance_name, 1)
    fields["categories"] = msg.get("categories") or []
    fields["modified_at"] = parse_date(msg.get("lastModifiedDateTime"))
    fields["folder_id"] = msg.get("parentFolderId", "")
    fields["folder_path"] = folder_path
    fields["parsed_at"] = now_utc_naive
    return fields
# ─── MongoDB indexy ───────────────────────────────────────────────────────────
def create_indexes(col):
    """Create the collection's lookup indexes and its text-search index.

    Single-field ascending indexes cover the date, sender, conversation,
    folder and status fields; ``graph_id`` is additionally unique and sparse.
    The ``text_search`` index spans subject, body preview, To and CC.
    """
    print(" Vytvarim indexy...")

    for field in ("received_at", "sent_at", "sender.email"):
        col.create_index([(field, ASCENDING)])

    col.create_index([("graph_id", ASCENDING)], unique=True, sparse=True)

    remaining_fields = (
        "conversation_id",
        "folder_path",
        "has_attachments",
        "categories",
        "importance",
        "flag_status",
        "is_read",
    )
    for field in remaining_fields:
        col.create_index([(field, ASCENDING)])

    text_fields = [(field, TEXT) for field in ("subject", "body_preview", "to", "cc")]
    col.create_index(text_fields, name="text_search", default_language="none")

    print(" Indexy hotovy.")
# ─── MAIN ─────────────────────────────────────────────────────────────────────
def main():
    """Import one mailbox from Microsoft Graph into MongoDB.

    Parses the command line, verifies the Graph and MongoDB connections,
    walks the selected folders and writes the messages in bulk batches:

    * ``full``     — every message is extracted and upserted.
    * ``new-only`` — messages whose ``_id`` is already stored are skipped.
    * ``sync``     — stored messages only get their mutable fields updated;
      unknown ones are fetched in full and inserted.

    The process exits with status 1 when Graph authentication or the MongoDB
    ping fails. Unless ``--no-indexes`` is given, indexes are created at the
    end.
    """
    global GRAPH_MAILBOX
    ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}")
    ap.add_argument("--mailbox", required=True,
                    help="Emailova schranka (napr. ordinace@buzalkova.cz)")
    ap.add_argument("--mode", default="full", choices=["full", "new-only", "sync"],
                    help="full=plny upsert (vychozi) | new-only=jen nove zpravy | "
                         "sync=existujici aktualizuje jen menitelna pole, nove importuje cely")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N zprav (0 = vse)")
    ap.add_argument("--folder", default="",
                    help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na konci")
    args = ap.parse_args()

    GRAPH_MAILBOX = args.mailbox
    mongo_col = args.mailbox  # the collection is named after the mailbox
    start = datetime.now()
    print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Schránka: {GRAPH_MAILBOX}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}")
    print(f"Režim: {args.mode}")

    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        print(" MongoDB OK")
    except Exception as e:
        print(f" CHYBA: MongoDB neni dostupna -- {e}")
        sys.exit(1)
    col = client[MONGO_DB][mongo_col]

    existing: set = set()
    if args.mode in ("new-only", "sync"):
        print(" Nacitam existujici zaznamy z MongoDB...")
        # Stream the ids through a projection cursor. distinct() returns its
        # whole result as a single document and fails once the id list
        # outgrows the maximum BSON document size.
        existing = {row["_id"] for row in col.find({}, {"_id": 1})}
        print(f" {len(existing)} jiz importovano")

    print("\nNacitam seznam slozek...")
    all_folders = get_all_folders()
    if args.folder:
        # Case-insensitive substring match against the full folder path.
        all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()]
    print(f" Slozek ke zpracovani: {len(all_folders)}")
    for f in all_folders:
        print(f" {f['path']}")

    is_sync = args.mode == "sync"
    # Sync mode lists messages with the reduced field set and no attachments;
    # messages that turn out to be new are re-fetched in full below.
    msg_select = MSG_SELECT_SYNC if is_sync else MSG_SELECT
    expand_att = not is_sync

    batch = []
    ok_count = 0
    sync_count = 0
    err_count = 0
    skip_count = 0
    total_i = 0

    def flush():
        # Write the queued operations. A failure is logged and reported, but
        # the import carries on with the next batch.
        if not batch:
            return
        try:
            col.bulk_write(batch, ordered=False)
        except Exception as e:
            logging.error("bulk_write: %s", e)
            print(f" CHYBA bulk_write: {e}")
        batch.clear()

    print()
    for folder in all_folders:
        print(f"--- Složka: {folder['path']} ---")
        folder_count = 0
        for msg in iter_folder_messages(folder["id"], select=msg_select, expand_attachments=expand_att):
            if args.limit and total_i >= args.limit:
                break
            mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
            total_i += 1
            folder_count += 1

            if args.mode == "new-only" and mid in existing:
                skip_count += 1
                continue

            if is_sync and mid in existing:
                fields = extract_sync_fields(msg, folder["path"])
                batch.append(UpdateOne({"_id": mid}, {"$set": fields}))
                sync_count += 1
                print(f" {total_i:>6} SYN {mid[:80]}")
            else:
                if is_sync:
                    # New message in sync mode: the listing lacks most
                    # fields, so fetch the complete message first.
                    full_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{msg['id']}"
                    full_params = {"$select": MSG_SELECT, "$expand": ATT_EXPAND}
                    try:
                        msg = graph_get(full_url, full_params)
                    except Exception as e:
                        logging.error("full fetch failed [%s]: %s", msg.get("id", "?"), e)
                        err_count += 1
                        # Report it like every other failed message.
                        print(f" {total_i:>6} ERR {mid[:80]}")
                        continue
                doc = extract_message(msg, folder["path"])
                if doc is None:
                    err_count += 1
                    print(f" {total_i:>6} ERR {mid[:80]}")
                else:
                    batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
                    ok_count += 1
                    subject_str = (doc.get("subject") or "")[:60]
                    sender_str = (doc.get("sender", {}).get("email") or "")[:40]
                    print(f" {total_i:>6} OK {subject_str:<60} {sender_str}")

            if len(batch) >= BATCH_SIZE:
                flush()

            if total_i % 500 == 0:
                elapsed = (datetime.now() - start).total_seconds()
                rate = total_i / elapsed if elapsed > 0 else 0
                # The separator used to be ''*80, i.e. an empty string.
                print(f" {'-'*80}")
                print(f" Průběh: ok={ok_count} sync={sync_count} skip={skip_count} err={err_count} {rate:.1f} msg/s")
                print(f" {'-'*80}")

        flush()
        print(f"{folder_count} zprav ze slozky {folder['path']}")
        if args.limit and total_i >= args.limit:
            break

    elapsed_total = (datetime.now() - start).total_seconds()
    print(f"\n{'='*52}")
    print(f"Vysledek: ok={ok_count} | sync={sync_count} | skip={skip_count} | err={err_count}")
    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
    print(f"Dokumentu v kolekci: {col.count_documents({})}")

    if not args.no_indexes:
        print()
        create_indexes(col)

    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if err_count:
        print(f"Chyby logovany do: {LOG_FILE}")
    client.close()
# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
@@ -0,0 +1,139 @@
# 1b_parse_emails_graph_delta_v1.0.py
**Inkrementální sync přes Microsoft Graph delta query.** Sourozenec [`1_parse_emails_graph_v1.4.py`](1_parse_emails_graph_v1.4.md) — každý řeší jiný use case:
| Skript | Použití |
|---|---|
| `1_parse_emails_graph_v1.4.py` | **První plný import** schránky (vše od začátku) |
| `1b_parse_emails_graph_delta_v1.0.py` | **Pravidelný sync** — jen co se od minula změnilo |
## Jak funguje
Graph API vystavuje `messages/delta` endpoint, který si pamatuje **záložku** (`deltaLink` s tokenem). Při dalším volání s touto záložkou vrátí jen:
- **nové zprávy**
- **změny** existujících (`isRead`, vlajka, přesun do jiné složky, kategorie)
- **smazané** zprávy (`@removed`)
Delta běží **per složka**. Skript drží stav v Mongo kolekci `emaily.sync_state`:
```json
{
"_id": "ordinace@buzalkova.cz|<folder_id>",
"mailbox": "ordinace@buzalkova.cz",
"folder_id": "AAA...",
"folder_path": "Inbox",
"delta_link": "https://graph.microsoft.com/.../delta?$deltatoken=...",
"last_run_at": "2026-06-04T10:00:00Z",
"cumulative_new": 1234, "cumulative_sync": 5678, "cumulative_removed": 12, "run_count": 42
}
```
První běh = fresh delta (Graph vrátí všechno + dá `deltaLink`). Každý další = jen změny od poslední záložky.
## Co se stane se smazanými zprávami
Když delta vrátí `@removed` pro zprávu, skript ji **nemaže** z Mongo. Pouze nastaví:
```json
{ "permanently_deleted": true, "permanently_deleted_at": "2026-06-04T10:00:00Z" }
```
Dohledatelné: `col.find({"permanently_deleted": true})`.
**`@removed` přijde jen pro definitivně smazané** zprávy (uživatel vysypal koš / Shift+Del). Mail v `Deleted Items` je pořád normální zpráva, jen má `folder_path = "Deleted Items"`.
## Extrakce zprávy
Funkce `extract_message` a `extract_sync_fields` se načítají přímo z modulu `1_parse_emails_graph_v1.4.py` (přes `importlib`) — extrakční logika je jediná na celý projekt, nemůže se rozejít.
## Nové vs změněné — jak skript pozná
Pro každou položku z delta odpovědi:
1. **Má `@removed`?** → označit `permanently_deleted` v Mongo, hotovo.
2. **`graph_id` už je v Mongo?** → existující změna — pošle se jen `extract_sync_fields` (is_read, flag, folder, …) přes `$set`.
3. **`graph_id` v Mongo není?** → nová zpráva — udělá se druhý GET `/messages/{id}?$expand=attachments` (delta nepodporuje `$expand`), aby přišla těla, hlavičky i přílohy, a uloží se přes `extract_message` jako klasický nový dokument.
## Argumenty
| Argument | Povinný | Hodnoty | Default | Popis |
|---|---|---|---|---|
| `--mailbox` | **ne** | e-mail | (všechny) | Schránka = kolekce v Mongo. **Bez argumentu projede všechny** kolekce v `emaily` mimo `SKIP_MAILBOXES` a systémové (`attachments_index`, `sync_state`) |
| `--folder` | ne | substring | (všechny) | Filtr složek (např. `Inbox` zahrne i `Inbox/Archive`) |
| `--limit N` | ne | int | 0 (bez limitu) | Max položek na složku (test) |
| `--reset` | ne | flag | false | Smaže všechny `deltaLink`y pro vybrané schránky → další běh začne od fresh delta |
| `--dry-run` | ne | flag | false | Nic neuloží do Mongo, jen vypíše co by se stalo |
## SKIP_MAILBOXES (hardcoded ve skriptu)
| Schránka | Důvod |
|---|---|
| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Pro tuto schránku je nutný samostatný skript (lokální `.msg` parser nebo jiný zdroj). |
Při `--mailbox vbuzalka@its.jnj.com` skript skončí s exit kódem 2. Při běhu bez `--mailbox` se schránka tiše přeskočí s hlášením `[skip]`.
## Varianty volání
```bash
# VŠECHNY schránky najednou (mimo SKIP_MAILBOXES) — pro cron / pravidelný sync:
docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py
# Jedna schránka — první běh (fresh delta — projde všechno, uloží deltaLinky):
docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz
# Pravidelný sync jedné schránky (jen změny od minulého běhu):
docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz
# Dry-run — uvidíš co by se stalo, nic se neuloží:
docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --dry-run
# Test jen na složce Inbox, max 20 položek:
docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --folder Inbox --limit 20
# Reset — zahodí deltaLinky a najede znova od plné delta:
docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --reset
# Cron / na pozadí (každých 5 min):
docker exec -d python-runner bash -c "python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz > /scripts/delta_sync.log 2>&1"
```
## Co dělat na začátek
1. **První import** schránky pořád přes `1_parse_emails_graph_v1.4.py` (existující data zůstanou).
2. **První běh** `1b_…delta_v1.0.py` — fresh delta projde znovu všechny zprávy a hlavně uloží `deltaLink`y do `sync_state`. To může chvíli trvat (podobně jako `--mode new-only` na v1.4).
3. **Další běhy** = už jen rychlé, vrací 0-X změn za interval.
## Otevřené body k otestování
- Jak rychle běží první (fresh) delta na velké schránce (`vladimir.buzalka@buzalka.cz` ~80k mailů)
- Co Graph vrátí pro nově vytvořené složky (mělo by fungovat — appendnou se do `folders` při dalším `get_all_folders`)
- Chování při `--limit` (nový deltaLink se neuloží, drží se starý → příští běh projde změny znovu od něj)
## HTTP 410 — expirovaný deltaLink
DeltaLinky drží Graph cca 30 dní. Pokud nebudeš schránku syncovat měsíc, skript dostane 410, **smaže starý state** a sám zopakuje běh jako fresh delta. Žádný manuální zásah není potřeba.
## Závislosti
Stejné jako `1_parse_emails_graph_v1.4.py` (msal, requests, pymongo, dateutil) — žádné nové.
## Sledování průběhu
```bash
docker exec -it python-runner tail -f /scripts/delta_sync.log
docker exec -it python-runner tail -f /scripts/delta_errors.log
```
## Stav sync_state v Mongo
```python
# Přehled posledních synců:
db.sync_state.find().sort("last_run_at", -1)
# Zahodit deltaLinky pro jednu schránku (= efekt --reset):
db.sync_state.delete_many({"mailbox": "ordinace@buzalkova.cz"})
# Najít všechny permanentně smazané v jedné schránce:
db["ordinace@buzalkova.cz"].find({"permanently_deleted": true}, {"subject": 1, "permanently_deleted_at": 1})
```
@@ -0,0 +1,514 @@
"""
==============================================================================
Skript: 1b_parse_emails_graph_delta_v1.0.py
Verze: 1.0
Datum: 2026-06-04
Autor: vladimir.buzalka
Popis:
Inkrementalni sync emailu pres Microsoft Graph DELTA QUERY.
Sourozenec `1_parse_emails_graph_v1.4.py` — kazdy resi jiny use case:
1_parse_emails_graph_v1.4.py = prvni plny import schranky
1b_parse_emails_graph_delta_v1.0.py = pravidelny sync (zmeny od minula)
Delta query je server-side change tracking — Graph si pamatuje "zalozku"
(deltaLink) a vraci jen to, co se od ni zmenilo:
- nove zpravy
- zmeny existujicich (isRead, flag, presun do jine slozky, kategorie)
- SMAZANE zpravy (@removed) — definitivne smazane, nikoli v kosi
Pro mail v "Deleted Items" delta nic specialniho nedela — je to porad
normalni zprava, jen s folder_path="Deleted Items". @removed prijde az
kdyz uzivatel vysype kos / Shift+Del.
State:
Kolekce `emaily.sync_state`, _id = "<mailbox>|<folder_id>".
{
mailbox, folder_id, folder_path,
delta_link, # plny URL s $deltatoken na pristi beh
last_run_at,
cumulative_new, cumulative_sync, cumulative_removed, run_count
}
Permanentne smazane zpravy:
Skript je NEMAZE z Mongo. Pouze nastavi:
permanently_deleted: True
permanently_deleted_at: <UTC datetime detekce>
Dohledani: col.find({"permanently_deleted": True})
Reuse:
Funkce extract_message / extract_sync_fields se nactou primo z modulu
1_parse_emails_graph_v1.4.py (importlib, file-based), aby se logika
extrakce nikdy nerozesla.
Spousteni:
python 1b_parse_emails_graph_delta_v1.0.py # VSECHNY schranky (mimo SKIP_MAILBOXES)
python 1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz # jedna schranka
python 1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --folder Inbox
python 1b_parse_emails_graph_delta_v1.0.py --reset # zahodit deltaLinky a najet znova
python 1b_parse_emails_graph_delta_v1.0.py --dry-run # nic neulozit
SKIP_MAILBOXES (hardcoded):
vbuzalka@its.jnj.com — JNJ tenant, nemame Graph API pristup. Pro tuto
schranku je nutny samostatny skript (lokalni .msg).
Zavislosti:
msal, requests, pymongo, python-dateutil
Python 3.10+
==============================================================================
"""
from __future__ import annotations
import argparse
import importlib.util
import logging
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import msal
import requests
from pymongo import MongoClient, ASCENDING
# Switch stdout to UTF-8 when the stream supports reconfiguration; characters
# that cannot be encoded are replaced instead of raising.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
# ─── CONFIGURATION ────────────────────────────────────────────────────────────
# NOTE(review): GRAPH_CLIENT_SECRET is committed here in plain text. Anyone with
# access to the repository can authenticate as this application — consider
# rotating the secret and loading it from the environment or a secret store.
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
GRAPH_URL = "https://graph.microsoft.com/v1.0"
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
SYNC_STATE_COL = "sync_state"
PAGE_SIZE = 100 # the delta endpoint typically returns at most 100 items per page
LOG_FILE = Path(__file__).parent / "delta_errors.log"
SCRIPT_VERSION = "1.0"
# Collections in `emaily` that are NOT mailboxes:
NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
# Mailboxes for which we have NO Graph API access — skipped during a normal run.
# These need a separate script (e.g. a local .msg parser).
SKIP_MAILBOXES = {
    "vbuzalka@its.jnj.com", # JNJ tenant — we have no Graph credentials
}
# Only ERROR-level records are written, to LOG_FILE next to this script.
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)
# Fields requested from the delta endpoint. internetMessageHeaders is left out
# here (per the original note, delta cannot return it for every item); for new
# messages the headers are fetched by a separate full request.
_DELTA_FIELDS = (
    "id", "internetMessageId", "subject", "bodyPreview", "body",
    "importance", "isRead", "isDraft", "hasAttachments",
    "receivedDateTime", "sentDateTime", "createdDateTime", "lastModifiedDateTime",
    "sender", "from", "toRecipients", "ccRecipients", "bccRecipients", "replyTo",
    "conversationId", "conversationIndex", "parentFolderId",
    "categories", "flag", "inferenceClassification",
)
DELTA_SELECT = ",".join(_DELTA_FIELDS)
# Full fetch of a new message: the same fields plus the headers, with the
# attachment metadata expanded inline.
FULL_FETCH_SELECT = ",".join(_DELTA_FIELDS + ("internetMessageHeaders",))
FULL_FETCH_EXPAND = "attachments($select=id,name,contentType,size,isInline)"
# ─── Reuse of the extraction logic from v1.4 ──────────────────────────────────
# The sibling script is loaded by file path through importlib and its
# extract_* functions are re-exported here, so both scripts share one
# implementation. The script aborts with exit code 1 if the sibling is missing.
_HERE = Path(__file__).parent
_V14_PATH = _HERE / "1_parse_emails_graph_v1.4.py"
if not _V14_PATH.exists():
    print(f"CHYBA: chybi sourozenec {_V14_PATH.name} — extract logiku nelze nacist", file=sys.stderr)
    sys.exit(1)
_spec = importlib.util.spec_from_file_location("v14_parse", _V14_PATH)
_v14 = importlib.util.module_from_spec(_spec)
# Runs the sibling module's top-level code under the module name "v14_parse".
_spec.loader.exec_module(_v14)
extract_message = _v14.extract_message
extract_sync_fields = _v14.extract_sync_fields
# GRAPH_MAILBOX is a module-level name in v1.4 — extraction does not need it,
# but for consistency it is assigned per mailbox in sync_mailbox().
# ─── Graph API ────────────────────────────────────────────────────────────────
_graph_token: Optional[str] = None


def get_token() -> str:
    """Acquire an app-only Graph access token and cache it in ``_graph_token``.

    Uses the MSAL client-credentials flow with the module-level tenant,
    client id and client secret.

    Returns:
        The access token string (also stored in the module-level cache).

    Raises:
        RuntimeError: if the MSAL response carries no ``access_token``.
    """
    global _graph_token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    confidential_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    response = confidential_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )
    if "access_token" in response:
        _graph_token = response["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {response}")
class DeltaExpired(Exception):
    """Raised when Graph answers HTTP 410 for a delta request.

    The stored deltaLink is no longer usable and the folder has to be
    synchronised again from a fresh (full) delta.
    """
def graph_get(url: str, params: Optional[dict] = None, allow_410: bool = False) -> dict:
    """GET a Graph URL and return the decoded JSON body.

    Makes at most three attempts. A 401 refreshes the token and retries; a 429
    sleeps for the advertised ``Retry-After`` interval and retries.

    Args:
        url: Absolute Graph URL (may already carry a query string, e.g. a
            nextLink/deltaLink).
        params: Optional query parameters passed to ``requests.get``.
        allow_410: When true, an HTTP 410 raises ``DeltaExpired`` instead of
            going through ``raise_for_status``.

    Returns:
        The parsed JSON response.

    Raises:
        DeltaExpired: on HTTP 410 when ``allow_410`` is set.
        requests.HTTPError: for any other non-success status.
        RuntimeError: when all attempts were consumed by 401/429 responses.
    """
    global _graph_token
    if not _graph_token:
        get_token()
    for _attempt in range(3):
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=60,
        )
        if r.status_code == 401:
            get_token()
            continue
        if r.status_code == 410 and allow_410:
            raise DeltaExpired(url)
        if r.status_code == 429:
            # Rate limit — respect Retry-After. The header may legally be an
            # HTTP-date instead of a number of seconds; fall back to the
            # default wait rather than crashing the whole sync on ValueError.
            try:
                wait = max(0, int(r.headers.get("Retry-After", "5")))
            except (TypeError, ValueError):
                wait = 5
            print(f" [429] cekam {wait}s ...")
            time.sleep(wait)
            continue
        r.raise_for_status()
        return r.json()
    raise RuntimeError(f"Graph GET failed after retries: {url}")
def get_all_folders(mailbox: str, parent_id: str = None, parent_path: str = "") -> list[dict]:
    """Return every mail folder below ``parent_id`` as ``{"id", "path"}`` dicts.

    With ``parent_id=None`` the listing starts at the mailbox's top-level
    folders. Each folder is emitted before its children (depth-first), and
    ``path`` is the slash-joined chain of display names without a leading
    slash. Paging follows ``@odata.nextLink``.
    """
    root = f"{GRAPH_URL}/users/{mailbox}/mailFolders"
    page_url = root if parent_id is None else f"{root}/{parent_id}/childFolders"
    # Query parameters apply to the first page only; nextLink already embeds them.
    query = {"$top": 100, "$select": "id,displayName,childFolderCount"}
    collected = []
    while page_url:
        page = graph_get(page_url, query)
        query = None
        for entry in page.get("value", []):
            full_path = f"{parent_path}/{entry['displayName']}".lstrip("/")
            collected.append({"id": entry["id"], "path": full_path})
            if entry.get("childFolderCount", 0) > 0:
                collected += get_all_folders(mailbox, entry["id"], full_path)
        page_url = page.get("@odata.nextLink")
    return collected
def fetch_full_message(mailbox: str, msg_id: str) -> Optional[dict]:
    """Download one message with headers and attachment metadata.

    Used for messages that the delta feed reported as new.

    Returns:
        The Graph message dict, or ``None`` when the request ended with an
        HTTP error (which is written to the error log).
    """
    try:
        return graph_get(
            f"{GRAPH_URL}/users/{mailbox}/messages/{msg_id}",
            {"$select": FULL_FETCH_SELECT, "$expand": FULL_FETCH_EXPAND},
        )
    except requests.HTTPError as exc:
        logging.error("fetch_full_message %s: %s", msg_id, exc)
    return None
# ─── Delta iterace ────────────────────────────────────────────────────────────
def iter_folder_delta(mailbox: str, folder_id: str, delta_link: Optional[str], limit: int = 0):
    """Generate ``(item, final_delta_link)`` tuples for one folder's delta feed.

    For every entry of the feed a tuple ``(item, None)`` is yielded, where
    ``item`` is the raw dict from Graph (a change or an ``@removed`` marker).
    When the page carrying ``@odata.deltaLink`` is reached, a last tuple
    ``(None, delta_link)`` is yielded and the generator ends.

    Args:
        mailbox: Mailbox address used in the request URL.
        folder_id: Graph id of the folder.
        delta_link: Stored deltaLink to resume from, or ``None`` for a fresh
            (full) delta.
        limit: When non-zero, stop after this many items. In that case NO
            deltaLink is yielded, so the caller keeps its previous state.

    Raises:
        DeltaExpired: on HTTP 410 — the caller should restart with
            ``delta_link=None``.
    """
    if delta_link:
        url, params = delta_link, None
    else:
        url = f"{GRAPH_URL}/users/{mailbox}/mailFolders/{folder_id}/messages/delta"
        params = {"$select": DELTA_SELECT, "$top": PAGE_SIZE}
    yielded = 0
    while url:
        page = graph_get(url, params, allow_410=True)
        params = None  # follow-up links already contain the query
        for entry in page.get("value", []):
            yield entry, None
            yielded += 1
            if limit and yielded >= limit:
                # Test mode: stop early without handing out any link.
                return
        closing_link = page.get("@odata.deltaLink")
        if closing_link:
            # End of the feed — hand the final deltaLink to the caller.
            yield None, closing_link
            return
        url = page.get("@odata.nextLink")
# ─── Per-folder sync ──────────────────────────────────────────────────────────
def sync_folder(col, sync_col, mailbox: str, folder: dict, dry_run: bool, limit: int) -> dict:
    """Synchronise one folder through its delta feed and return statistics.

    Reads the stored deltaLink for ``"<mailbox>|<folder_id>"`` from
    ``sync_col``, processes every delta item with ``process_item`` and, when
    the feed was read to its end (and this is not a dry run), stores the new
    deltaLink together with cumulative counters.

    If the stored deltaLink has expired (HTTP 410) the folder is re-run ONCE
    from a fresh delta. The restart is driven by an in-memory flag rather than
    by re-reading state, so it is bounded and performs no write in dry-run
    mode. A 410 on an already fresh delta is recorded as an error instead of
    restarting again.

    Args:
        col: Mongo collection of the mailbox.
        sync_col: Mongo collection holding the sync state.
        mailbox: Mailbox address.
        folder: Dict with ``"id"`` and ``"path"`` keys.
        dry_run: When true, nothing is written to Mongo.
        limit: Maximum number of items to process (0 = no limit).

    Returns:
        Dict with ``new``, ``sync``, ``removed`` and ``errors`` counters.
    """
    fid = folder["id"]
    fpath = folder["path"]
    state_id = f"{mailbox}|{fid}"
    state = sync_col.find_one({"_id": state_id})
    delta_link = state.get("delta_link") if state else None

    while True:
        is_first_run = delta_link is None
        label = "FRESH" if is_first_run else "DELTA"
        print(f"\n[{label}] {fpath}")
        # Counters restart with every attempt: a fresh delta reports all
        # messages again, so results of an expired attempt would be duplicated.
        stats = {"new": 0, "sync": 0, "removed": 0, "errors": 0}
        final_delta = None
        try:
            for item, fin in iter_folder_delta(mailbox, fid, delta_link, limit=limit):
                if fin:
                    final_delta = fin
                    break
                try:
                    process_item(col, mailbox, fpath, item, stats, dry_run)
                except Exception as e:
                    # One bad item must not abort the whole folder.
                    stats["errors"] += 1
                    logging.error("process_item %s: %s", item.get("id", "?"), e)
            break
        except DeltaExpired:
            if is_first_run:
                # Restarting a fresh delta again could loop forever.
                print(f" [410] i fresh delta vratila 410 — slozka preskocena")
                logging.error("sync_folder %s|%s: HTTP 410 on a fresh delta", mailbox, fpath)
                stats["errors"] += 1
                break
            print(f" [410] deltaLink expiroval — restart od fresh delta")
            if not dry_run:
                sync_col.delete_one({"_id": state_id})
            delta_link = None

    print(f" new={stats['new']} sync={stats['sync']} removed={stats['removed']} err={stats['errors']}")
    # Persist the state only when the feed was read to its end and this is a real run.
    if final_delta and not dry_run:
        sync_col.update_one(
            {"_id": state_id},
            {
                "$set": {
                    "mailbox": mailbox,
                    "folder_id": fid,
                    "folder_path": fpath,
                    "delta_link": final_delta,
                    "last_run_at": datetime.now(timezone.utc).replace(tzinfo=None),
                },
                "$inc": {
                    "cumulative_new": stats["new"],
                    "cumulative_sync": stats["sync"],
                    "cumulative_removed": stats["removed"],
                    "run_count": 1,
                },
            },
            upsert=True,
        )
    elif not final_delta:
        # No deltaLink arrived (e.g. --limit or an error) — the state stays
        # untouched, so the next run resumes from the stored deltaLink or
        # starts fresh again.
        if not is_first_run:
            print(f" [pozn] delta neukoncena — pristi beh pojede od ulozeneho deltaLinku")
    return stats
def process_item(col, mailbox: str, folder_path: str, item: dict, stats: dict, dry_run: bool):
    """Apply one entry of a delta response to the mailbox collection.

    Three cases are distinguished:

    1. ``@removed`` marker — the document with that ``graph_id`` is flagged
       ``permanently_deleted`` (it is never deleted from Mongo).
    2. ``graph_id`` already stored — only the sync fields are updated.
    3. Unknown ``graph_id`` — the full message is fetched and upserted.

    In cases 2 and 3 the message is live on the server, so a leftover
    ``permanently_deleted`` marker is cleared.
    NOTE(review): Graph message ids presumably change when a message is moved
    between folders (unless immutable ids are requested), so a move can show up
    as ``@removed`` in the source folder and as a new id in the target folder;
    clearing the marker keeps such messages from staying flagged — confirm
    against the Graph documentation.

    Args:
        col: Mongo collection of the mailbox (unused for writes in dry run).
        mailbox: Mailbox address, needed for the full fetch.
        folder_path: Path of the folder whose delta is being processed.
        item: Raw delta entry.
        stats: Counter dict, updated in place (``new``/``sync``/``removed``/``errors``).
        dry_run: When true, only prints what would happen.
    """
    graph_id = item.get("id")

    # 1) Removed message (@removed)
    if "@removed" in item or item.get("@removed.reason"):
        if not graph_id:
            return
        if dry_run:
            print(f" REMOVED graph_id={graph_id[:30]}...")
        else:
            col.update_one(
                {"graph_id": graph_id},
                {"$set": {
                    "permanently_deleted": True,
                    "permanently_deleted_at": datetime.now(timezone.utc).replace(tzinfo=None),
                }},
            )
        stats["removed"] += 1
        return

    # 2) New or changed message — decided by presence of graph_id in Mongo
    if not graph_id:
        return

    def live_update(fields: dict) -> dict:
        """Build the update for a live message, clearing stale deletion markers."""
        update = {"$set": fields}
        # Never $unset a path that $set also writes — Mongo rejects that.
        stale = {
            key: ""
            for key in ("permanently_deleted", "permanently_deleted_at")
            if key not in fields
        }
        if stale:
            update["$unset"] = stale
        return update

    existing = col.find_one({"graph_id": graph_id}, {"_id": 1})
    if existing:
        # Known message — the delta payload carries the sync fields.
        fields = extract_sync_fields(item, folder_path)
        if dry_run:
            # Graph may return subject=null; `or` keeps the slice safe.
            print(f" SYNC {(item.get('subject') or '')[:60]}")
        else:
            col.update_one({"_id": existing["_id"]}, live_update(fields))
        stats["sync"] += 1
        return

    # New message — body, attachments and headers need the full fetch.
    full = fetch_full_message(mailbox, graph_id)
    if full is None:
        stats["errors"] += 1
        return
    doc = extract_message(full, folder_path)
    if doc is None:
        stats["errors"] += 1
        return
    if dry_run:
        print(f" NEW {(doc.get('subject') or '')[:60]}")
    else:
        col.update_one({"_id": doc["_id"]}, live_update(doc), upsert=True)
    stats["new"] += 1
# ─── Indexy pro sync_state ────────────────────────────────────────────────────
def ensure_sync_state_indexes(sync_col):
    """Create the indexes used on the sync-state collection.

    One compound index on (mailbox, folder_id) and one on last_run_at, both
    ascending.
    """
    index_specs = (
        [("mailbox", ASCENDING), ("folder_id", ASCENDING)],
        [("last_run_at", ASCENDING)],
    )
    for keys in index_specs:
        sync_col.create_index(keys)
def ensure_perm_deleted_index(col):
    """Create a sparse ascending index on ``permanently_deleted`` for a mailbox collection."""
    index_keys = [("permanently_deleted", ASCENDING)]
    col.create_index(index_keys, sparse=True)
# ─── Main ─────────────────────────────────────────────────────────────────────
def discover_mailboxes(db) -> list[str]:
    """List the mailbox collections of ``db`` in sorted order.

    Every collection name counts as a mailbox except the names in
    NON_MAILBOX_COLLECTIONS (silently ignored) and the names in SKIP_MAILBOXES
    (ignored with a ``[skip]`` notice on stdout).
    """
    mailboxes = []
    for coll_name in sorted(db.list_collection_names()):
        if coll_name in NON_MAILBOX_COLLECTIONS:
            continue
        if coll_name not in SKIP_MAILBOXES:
            mailboxes.append(coll_name)
            continue
        print(f" [skip] {coll_name} — v SKIP_MAILBOXES (neni Graph pristup)")
    return mailboxes
def sync_mailbox(client, mailbox: str, args) -> dict:
    """Synchronise every (selected) folder of one mailbox.

    Args:
        client: Connected ``MongoClient``.
        mailbox: Mailbox address; also the name of its Mongo collection.
        args: Parsed CLI arguments; reads ``dry_run``, ``reset``, ``folder``
            and ``limit``.

    Returns:
        Totals dict with ``new``, ``sync``, ``removed`` and ``errors``. When the
        folder listing fails with an HTTP error, a dict with ``errors == 1`` is
        returned and nothing is synchronised.
    """
    # The shared v1.4 module keeps the current mailbox in a module-level name.
    _v14.GRAPH_MAILBOX = mailbox
    print(f"\n========== {mailbox} ==========")
    col = client[MONGO_DB][mailbox]
    sync_col = client[MONGO_DB][SYNC_STATE_COL]
    if not args.dry_run:
        ensure_sync_state_indexes(sync_col)
        ensure_perm_deleted_index(col)
    if args.reset:
        if args.dry_run:
            # --dry-run promises no changes in Mongo, so only report what
            # --reset would remove; the stored deltaLinks stay in place.
            n = sync_col.count_documents({"mailbox": mailbox})
            print(f" --reset: DRY-RUN — smazalo by se {n} deltaLinku pro {mailbox} (nic se nemaze)")
        else:
            n = sync_col.delete_many({"mailbox": mailbox}).deleted_count
            print(f" --reset: smazano {n} deltaLinku pro {mailbox}")
    print("Nacitam seznam slozek...")
    try:
        folders = get_all_folders(mailbox)
    except requests.HTTPError as e:
        print(f" CHYBA: nelze nacist slozky pro {mailbox}: {e}")
        logging.error("get_all_folders %s: %s", mailbox, e)
        return {"new": 0, "sync": 0, "removed": 0, "errors": 1}
    if args.folder:
        # Case-insensitive substring filter on the folder path.
        wanted = args.folder.lower()
        folders = [f for f in folders if wanted in f["path"].lower()]
    print(f" Slozek ke zpracovani: {len(folders)}")
    totals = {"new": 0, "sync": 0, "removed": 0, "errors": 0}
    for folder in folders:
        s = sync_folder(col, sync_col, mailbox, folder, args.dry_run, args.limit)
        for k in totals:
            totals[k] += s[k]
    print(f" -> mailbox total: new={totals['new']} sync={totals['sync']} removed={totals['removed']} err={totals['errors']}")
    return totals
def main():
    """CLI entry point: run the delta sync for one or all mailboxes.

    Connects to MongoDB, resolves the mailbox list (``--mailbox`` or every
    mailbox collection), acquires a Graph token and synchronises the mailboxes
    one by one. A failure of one mailbox is logged and counted, the remaining
    mailboxes are still processed. The Mongo client is closed on every exit
    path.

    Returns:
        1 when any error was counted, otherwise 0. Exits with code 2 when the
        requested mailbox is listed in SKIP_MAILBOXES.
    """
    ap = argparse.ArgumentParser(description=f"parse_emails_graph delta sync v{SCRIPT_VERSION}")
    ap.add_argument("--mailbox", default="",
                    help="E-mail schranky (= kolekce v Mongo). "
                         "Bez argumentu projede vsechny schranky z `emaily` (mimo SKIP_MAILBOXES).")
    ap.add_argument("--folder", default="", help="Filtruje slozky obsahujici tento retezec (default: vsechny)")
    ap.add_argument("--limit", type=int, default=0, help="Max polozek na slozku (test)")
    ap.add_argument("--reset", action="store_true",
                    help="Smaze deltaLinky pro vybrane schranky — pristi beh zacne od fresh delta")
    ap.add_argument("--dry-run", action="store_true", help="Nic neulozi do Mongo, jen vypise co by se stalo")
    args = ap.parse_args()
    print(f"=== Delta sync v{SCRIPT_VERSION} ===")
    if args.dry_run:
        print(" DRY-RUN — zadne zmeny v Mongo")
    print("Pripojuji se k MongoDB...")
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        db = client[MONGO_DB]
        if args.mailbox:
            if args.mailbox in SKIP_MAILBOXES:
                print(f" CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.")
                sys.exit(2)
            mailboxes = [args.mailbox]
        else:
            mailboxes = discover_mailboxes(db)
            print(f" Schranky ke zpracovani: {len(mailboxes)}")
            for m in mailboxes:
                print(f" {m}")
        print("Token Graph API...")
        get_token()
        print(" OK")
        t0 = time.time()
        grand = {"new": 0, "sync": 0, "removed": 0, "errors": 0}
        per_mailbox = []
        for mb in mailboxes:
            try:
                s = sync_mailbox(client, mb, args)
            except Exception as e:
                # Top-level boundary: one broken mailbox must not stop the rest.
                print(f" FATAL pri sync {mb}: {e}")
                logging.error("sync_mailbox %s: %s", mb, e)
                s = {"new": 0, "sync": 0, "removed": 0, "errors": 1}
            per_mailbox.append((mb, s))
            for k in grand:
                grand[k] += s[k]
        dt = time.time() - t0
        print(f"\n=== SHRNUTI ===")
        for mb, s in per_mailbox:
            print(f" {mb:40} new={s['new']:>5} sync={s['sync']:>5} removed={s['removed']:>4} err={s['errors']:>3}")
        print(f" {'TOTAL':40} new={grand['new']:>5} sync={grand['sync']:>5} removed={grand['removed']:>4} err={grand['errors']:>3}")
        print(f" trvalo: {dt:.1f} s")
        return 1 if grand["errors"] > 0 else 0
    finally:
        # Release the connection pool on every exit path, including sys.exit().
        client.close()


if __name__ == "__main__":
    sys.exit(main() or 0)
@@ -0,0 +1,34 @@
# 2_refetch_text_bodies_v1.0.py
**Krok 2 pipeline** — ONETIME oprava starých plain-text emailů. Starý `parse_emails_graph_v1.3` ukládal plain-text emaily jen jako prvních 2000 znaků do `body_preview`; plné tělo se zahazovalo. Tento skript najde takové emaily a re-fetchne plný obsah do nového pole `body_text` (max 2 MB).
> Pro schránky importované rovnou v1.4 nemá co dělat (kandidátů 0). Drží se kvůli archivním schránkám importovaným ve v1.3.
## Argumenty
| Argument | Povinný | Hodnoty | Default | Popis |
|---|---|---|---|---|
| `--mailbox` | ne | e-mail | (všechny) | Jedna konkrétní schránka |
| `--limit N` | ne | int | (bez limitu) | Limit emailů na schránku (test) |
## Varianty volání
```bash
# Všechny schránky:
docker exec -it python-runner python /scripts/2_refetch_text_bodies_v1.0.py
# Jedna schránka:
docker exec -it python-runner python /scripts/2_refetch_text_bodies_v1.0.py --mailbox ordinace@buzalkova.cz
# Test 20 emailů:
docker exec -it python-runner python /scripts/2_refetch_text_bodies_v1.0.py --mailbox ordinace@buzalkova.cz --limit 20
# Plný běh na pozadí, log do souboru:
docker exec -d python-runner bash -c "python /scripts/2_refetch_text_bodies_v1.0.py > /scripts/refetch.log 2>&1"
```
## Sledování průběhu
```bash
docker exec -it python-runner tail -f /scripts/refetch.log
```
+270
View File
@@ -0,0 +1,270 @@
"""
==============================================================================
Skript: refetch_text_bodies_v1.0.py
Verze: 1.0
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis:
ONETIME oprava — parse_emails_graph_v1.3 ukladal plain-text emaily jen jako
prvnich 2000 znaku do `body_preview`. Plne telo se zahazovalo.
Tento skript:
1) Najde v Mongo emaily kde body_html IS NULL/missing/empty
a soucasne maji graph_id (lze refetch)
2) Pro kazdy GET /users/{mailbox}/messages/{graph_id}?$select=body,bodyPreview
3) Pokud body.contentType == 'text' -> ulozi PLNY obsah do noveho pole
body_text (max 2 MB - stejny limit jako body_html)
4) Pokud body.contentType == 'html' (Graph mezitim prepnul) -> ulozi do body_html
5) Aktualizuje body_preview na realny 255-znakovy bodyPreview z Graphu
Bezpecne prerusitelne a opakovatelne - skript znovu refetchne jen ty kde
stale chybi body_html i body_text.
Spusteni:
python refetch_text_bodies_v1.0.py # vsechny schranky
python refetch_text_bodies_v1.0.py --mailbox vladimir.buzalka@buzalka.cz
python refetch_text_bodies_v1.0.py --limit 100 # test
==============================================================================
"""
from __future__ import annotations
import argparse
import logging
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import msal
import requests
from pymongo import MongoClient, UpdateOne
# Switch stdout to UTF-8 when the stream supports reconfiguration; characters
# that cannot be encoded are replaced instead of raising.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
# --- konfigurace ------------------------------------------------------------
# NOTE(review): GRAPH_CLIENT_SECRET is committed here in plain text. Anyone with
# access to the repository can authenticate as this application — consider
# rotating the secret and loading it from the environment or a secret store.
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
GRAPH_URL = "https://graph.microsoft.com/v1.0"
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
# Collections in `emaily` that are not mailboxes. `sync_state` (written by the
# delta-sync script into the same database) must be skipped as well, otherwise
# it is listed and processed as if it were a mailbox.
SKIP_COLLECTIONS = {"attachments_index", "sync_state"}
MAX_BODY_BYTES = 2 * 1024 * 1024 # 2 MB - same limit as body_html in the parser
BATCH_SIZE = 50
LOG_FILE = Path(__file__).parent / "refetch_text_bodies_errors.log"
# Only ERROR-level records are written, to LOG_FILE next to this script.
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)
# --- Graph auth -------------------------------------------------------------
_token: Optional[str] = None


def get_token() -> str:
    """Acquire an app-only Graph access token and cache it in ``_token``.

    Uses the MSAL client-credentials flow with the module-level tenant,
    client id and client secret.

    Returns:
        The access token string (also stored in the module-level cache).

    Raises:
        RuntimeError: if the MSAL response carries no ``access_token``.
    """
    global _token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    confidential_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    response = confidential_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )
    if "access_token" in response:
        _token = response["access_token"]
        return _token
    raise RuntimeError(f"Graph auth failed: {response}")
def graph_get(url: str, params: Optional[dict] = None) -> Optional[dict]:
    """GET a Graph URL and return the decoded JSON body.

    Makes at most three attempts. A 401 refreshes the token and retries, a 429
    sleeps for the advertised ``Retry-After`` interval and retries, and a
    ``requests`` exception is retried after a 2 s pause (re-raised on the last
    attempt).

    Returns:
        The parsed JSON response, or ``None`` ONLY for HTTP 404 (the message no
        longer exists on the server).

    Raises:
        requests.RequestException: when the last attempt fails.
        RuntimeError: when all attempts were consumed by 401/429 responses.
            Previously this case returned ``None`` and was indistinguishable
            from a 404, so throttled messages were counted as missing.
    """
    global _token
    if not _token:
        get_token()
    last_status = None
    for attempt in range(3):
        try:
            r = requests.get(
                url,
                headers={"Authorization": f"Bearer {_token}"},
                params=params,
                timeout=30,
            )
            last_status = r.status_code
            if r.status_code == 401:
                get_token()
                continue
            if r.status_code == 404:
                return None # the message no longer exists on the Outlook side
            if r.status_code == 429:
                # Retry-After may legally be an HTTP-date; fall back to the
                # default wait instead of failing on int().
                try:
                    wait = max(0, int(r.headers.get("Retry-After", "5")))
                except (TypeError, ValueError):
                    wait = 5
                print(f" [429] throttled, cekam {wait}s", flush=True)
                time.sleep(wait)
                continue
            r.raise_for_status()
            return r.json()
        except requests.RequestException:
            if attempt == 2:
                raise
            time.sleep(2)
    raise RuntimeError(f"Graph GET failed after retries (last HTTP {last_status}): {url}")
# --- hlavni smycka ----------------------------------------------------------
# Emails where both bodies (body_html and body_text) are missing — i.e. not
# processed yet — and which carry a usable graph_id for the refetch.
EMPTY_BODY_FILTER = {
    "$and": [
        {"$or": [
            {"body_html": None},
            {"body_html": {"$exists": False}},
            {"body_html": ""},
        ]},
        {"$or": [
            {"body_text": None},
            {"body_text": {"$exists": False}},
            {"body_text": ""},
        ]},
        # $nin instead of two "$ne" keys: a dict literal keeps only the LAST
        # duplicate key, so the former `"$ne": None` check was silently lost
        # and documents with graph_id=null were selected.
        {"graph_id": {"$exists": True, "$nin": [None, ""]}},
    ]
}
def process_mailbox(col, mailbox: str, limit: Optional[int]) -> dict:
    """Refetch the bodies of all body-less emails in one mailbox collection.

    Selects documents matching EMPTY_BODY_FILTER, asks Graph for ``body`` and
    ``bodyPreview`` of each and stores the result through batched bulk writes.
    Every processed document gets ``refetched_at`` and ``body_refetch_status``;
    HTML content goes to ``body_html``, plain text to ``body_text``.

    Args:
        col: Mongo collection of the mailbox.
        mailbox: Mailbox address used in the Graph URL.
        limit: Optional cap on the number of documents read.

    Returns:
        Dict with ``mailbox``, ``candidates`` and the counters ``refetched``,
        ``text``, ``html``, ``still_empty``, ``errors`` and ``missing``.
    """
    total = col.count_documents(EMPTY_BODY_FILTER)
    limit_note = f" (limit {limit})" if limit else ""
    print(f"[{mailbox}] kandidatu k refetchi: {total}" + limit_note)
    tally = {"refetched": 0, "text": 0, "html": 0,
             "still_empty": 0, "errors": 0, "missing": 0}
    if total == 0:
        return {"mailbox": mailbox, "candidates": 0, **tally}

    def counters_text() -> str:
        """Render the counters the way both progress lines print them."""
        return (f"refetched={tally['refetched']} text={tally['text']} html={tally['html']} "
                f"still_empty={tally['still_empty']} missing={tally['missing']} "
                f"err={tally['errors']}")

    body_field = {"html": "body_html", "text": "body_text"}
    pending: list[UpdateOne] = []
    seen = 0
    cursor = col.find(EMPTY_BODY_FILTER, {"_id": 1, "graph_id": 1},
                      no_cursor_timeout=True)
    if limit:
        cursor = cursor.limit(limit)
    try:
        for seen, row in enumerate(cursor, start=1):
            doc_id = row["_id"]
            graph_id = row["graph_id"]
            try:
                data = graph_get(f"{GRAPH_URL}/users/{mailbox}/messages/{graph_id}",
                                 {"$select": "body,bodyPreview"})
            except Exception as exc:
                tally["errors"] += 1
                logging.error("[%s] graph_get %s: %s", mailbox, graph_id, exc)
                continue
            if data is None:
                tally["missing"] += 1
                continue

            body = data.get("body") or {}
            ctype = body.get("contentType")
            content = body.get("content") or ""
            changes: dict = {"refetched_at": datetime.now(timezone.utc).replace(tzinfo=None)}
            if content and ctype in body_field:
                # Slicing is a no-op for content within the limit.
                changes[body_field[ctype]] = content[:MAX_BODY_BYTES]
                changes["body_refetch_status"] = ctype
                tally[ctype] += 1
                tally["refetched"] += 1
            else:
                changes["body_refetch_status"] = (
                    f"unknown_ctype:{ctype}" if content else "graph_empty"
                )
                tally["still_empty"] += 1
            preview = data.get("bodyPreview") or ""
            if preview:
                changes["body_preview"] = preview[:300]

            pending.append(UpdateOne({"_id": doc_id}, {"$set": changes}))
            if len(pending) >= BATCH_SIZE:
                col.bulk_write(pending, ordered=False)
                pending.clear()
            if seen == 1 or seen % 100 == 0:
                print(f" [{seen:>5}/{total}] {counters_text()}", flush=True)
    finally:
        # The cursor was opened without a timeout, so it must be closed
        # explicitly; whatever is still queued is flushed even on an abort.
        cursor.close()
        if pending:
            col.bulk_write(pending, ordered=False)
    print(f" [{seen}/{total}] DONE {counters_text()}")
    return {"mailbox": mailbox, "candidates": total, **tally}
def main() -> int:
    """CLI entry point: refetch missing bodies for one or all mailboxes.

    Connects to MongoDB, acquires a Graph token, processes the requested
    mailbox (``--mailbox``) or every collection not listed in SKIP_COLLECTIONS
    and prints a per-mailbox summary. The Mongo client is closed on every exit
    path.

    Returns:
        0 on completion.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)")
    ap.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
    args = ap.parse_args()
    t0 = time.time()
    print("Pripojuji se k MongoDB...")
    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        mongo.admin.command("ping")
        db = mongo[MONGO_DB]
        print("Token Graph API...")
        get_token()
        print("OK\n")
        if args.mailbox:
            mailboxes = [args.mailbox]
        else:
            mailboxes = [c for c in db.list_collection_names() if c not in SKIP_COLLECTIONS]
        print(f"Schranky ({len(mailboxes)}): {mailboxes}\n")
        results = []
        for mb in mailboxes:
            results.append(process_mailbox(db[mb], mb, limit=args.limit))
            print()
        print("=== SHRNUTI ===")
        for r in results:
            print(f" {r['mailbox']}: candidates={r['candidates']} "
                  f"refetched={r['refetched']} text={r['text']} html={r['html']} "
                  f"still_empty={r['still_empty']} missing={r['missing']} errors={r['errors']}")
        print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
        return 0
    finally:
        # Release the connection pool even when a mailbox run raises.
        mongo.close()


if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
    except Exception:
        import traceback
        traceback.print_exc()
        sys.exit(1)
@@ -0,0 +1,47 @@
# 3_download_attachments_v1.3.py
**Krok 3 pipeline** — stahuje skutečné přílohy (`is_inline=False`) z Mongo emailů přes Graph API do `/mnt/Emails/<schránka>/Attachments/`. Deduplikace podle **SHA256** obsahu:
- stejný hash → soubor už existuje → přeskočí
- kolize názvu (stejný název, jiný hash) → `faktura_2.pdf`, `faktura_3.pdf`
Po uložení doplní do Mongo `file_hash` + `local_path` a aktualizuje kolekci `emaily.attachments_index` (`_id`=hash, filename, path, size, mime, mailbox, ref_count). Emaily kde mají všechny přílohy `file_hash` → skip → **bezpečné opakovat**.
## Argumenty
| Argument | Povinný | Hodnoty | Default | Popis |
|---|---|---|---|---|
| `--mailbox` | **ne** | e-mail | (všechny) | Schránka = kolekce v Mongo. **Bez argumentu projede všechny** kolekce v `emaily` mimo `SKIP_MAILBOXES` a systémové (`attachments_index`, `sync_state`) |
| `--limit N` | ne | int | 0 (bez limitu) | Zpracuje jen prvních N emailů **per schránka** (test) |
| `--force-recheck` | ne | flag | false | Znovu ověří i už stažené přílohy |
| `--no-indexes` | ne | flag | false | Nevytváří indexy na kolekci `attachments_index` (jinak se vytvářejí na začátku běhu) |
## SKIP_MAILBOXES (hardcoded)
| Schránka | Důvod |
|---|---|
| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Při běhu bez `--mailbox` se tiše přeskočí. S explicitním `--mailbox vbuzalka@its.jnj.com` skript skončí exit kódem 2. |
## Varianty volání
```bash
# VŠECHNY schránky (mimo SKIP_MAILBOXES):
docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py
# Jedna schránka interaktivně:
docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz
# Test 50 emailů:
docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
# Force-recheck (znovu ověří všechny):
docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --force-recheck
# Na pozadí, log do souboru:
docker exec -d python-runner bash -c "python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz > /scripts/download_attachments.log 2>&1"
```
## Sledování průběhu
```bash
docker exec -it python-runner tail -f /scripts/download_attachments.log
```
@@ -0,0 +1,546 @@
"""
download_attachments_v1.3.py
Nazev: download_attachments_v1.3.py
Verze: 1.3
Datum: 2026-06-02
Autor: vladimir.buzalka
Popis:
Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
pres Microsoft Graph API a uklada je do adresare
/mnt/Emails/<schránka>/Attachments/.
Schránka se predava volitelnym parametrem --mailbox; bez nej se zpracuji vsechny schranky mimo SKIP_MAILBOXES.
Deduplikace podle SHA256 hashe obsahu:
- stejny hash = soubor uz existuje -> preskoci
- prvni vyskyt souboru: ulozi pod puvodnim nazvem
- kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
Po ulozeni aktualizuje MongoDB:
- v email dokumentu: kazda priloha dostane file_hash + local_path
- kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes,
mime_type, mailbox, first_seen_at, ref_count
Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash
se preskoci. --force-recheck znovu overi i uz stazene.
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
Spousteni:
python download_attachments_v1.3.py # VSECHNY schranky (mimo SKIP_MAILBOXES)
python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz # jedna schranka
python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50
python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --force-recheck
SKIP_MAILBOXES (hardcoded):
vbuzalka@its.jnj.com — JNJ tenant, nemame Graph API pristup.
Docker:
docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py
Zavislosti:
msal, requests, pymongo
Python 3.10+
Historie verzi:
1.0 2026-06-02 Inicialni verze
1.1 2026-06-02 Schránka jako parametr --mailbox
1.2 2026-06-02 Oprava: Graph attachment mapa vcetne inline; normalizace nazvu;
preskoceni S/MIME; inline z Graphu -> SKIP ne ERR
1.3 2026-06-02 Primarni stazeni pres graph_att_id (prime ID bez name-matchingu);
oprava $select na attachment listu (odstranen contentId ktery
zpusoboval BadRequest a vracel prazdny seznam); name-matching
zustava jako fallback pro stare emaily bez graph_att_id
"""
import sys
import re
import hashlib
import logging
import argparse
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional
import msal
import requests
from pymongo import MongoClient, UpdateOne
# Force UTF-8 console output where the stream supports it; characters that
# cannot be encoded are replaced instead of raising.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

# ─── CONFIGURATION ────────────────────────────────────────────────────────────
# Imported here so that this configuration block is self-contained.
import os

# SECURITY: the Graph credentials used to be hard-coded only. They can now be
# supplied through environment variables of the same name; the literals remain
# solely as a backward-compatible fallback.
# TODO(review): rotate this client secret and remove the literal from source control.
GRAPH_TENANT_ID = os.environ.get("GRAPH_TENANT_ID") or "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = os.environ.get("GRAPH_CLIENT_ID") or "4b222bfd-78c9-4239-a53f-43006b3ed07f"
GRAPH_CLIENT_SECRET = os.environ.get("GRAPH_CLIENT_SECRET") or "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
GRAPH_URL = "https://graph.microsoft.com/v1.0"

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
MONGO_COL_INDEX = "attachments_index"

EMAILS_BASE_DIR = Path("/mnt/Emails")
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.3"
BATCH_SIZE = 50  # number of queued e-mail updates that triggers a bulk_write

# Attachment types that are skipped (S/MIME signatures, certificates)
SKIP_EXTENSIONS = {".p7m", ".p7s", ".p7c", ".p7b"}

# Collections in the `emaily` database that are NOT mailboxes
NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}

# Mailboxes without Graph API access — skipped when running without --mailbox
SKIP_MAILBOXES = {
    "vbuzalka@its.jnj.com",  # JNJ tenant — no Graph credentials
}
# ──────────────────────────────────────────────────────────────────────────────

# Only errors are written to the log file; progress goes to stdout via print().
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)

# Cached Graph access token; filled in by get_token().
_graph_token: Optional[str] = None
# ─── Graph API ────────────────────────────────────────────────────────────────
def get_token() -> str:
    """Request an app-only Graph access token and cache it in _graph_token.

    Returns:
        The access token string.

    Raises:
        RuntimeError: when the MSAL response carries no "access_token".
    """
    global _graph_token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    msal_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    auth_result = msal_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )
    if "access_token" in auth_result:
        _graph_token = auth_result["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {auth_result}")
def graph_get_bytes(url: str) -> bytes:
    """GET *url* with the cached bearer token and return the raw response body.

    A 401 response triggers one token refresh followed by one more request.

    Raises:
        requests.HTTPError: for any other non-success status.
        RuntimeError: when both requests were answered with 401.
    """
    global _graph_token
    if not _graph_token:
        get_token()
    tries_left = 2
    while tries_left > 0:
        tries_left -= 1
        response = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            timeout=120,
            stream=True,
        )
        if response.status_code != 401:
            response.raise_for_status()
            return response.content
        # Token rejected: fetch a fresh one and try again.
        get_token()
    raise RuntimeError(f"Graph GET bytes failed: {url}")
def graph_get_json(url: str, params: dict = None) -> dict:
    """GET *url* with the cached bearer token and return the decoded JSON body.

    A 401 response triggers one token refresh followed by one more request.

    Raises:
        requests.HTTPError: for any other non-success status.
        RuntimeError: when both requests were answered with 401.
    """
    global _graph_token
    if not _graph_token:
        get_token()
    tries_left = 2
    while tries_left > 0:
        tries_left -= 1
        response = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=30,
        )
        if response.status_code != 401:
            response.raise_for_status()
            return response.json()
        # Token rejected: fetch a fresh one and try again.
        get_token()
    raise RuntimeError(f"Graph GET json failed: {url}")
def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
    """Load the attachment metadata of one message (without contentBytes).

    Best effort: any failure is written to the error log and an empty list is
    returned instead of raising.
    """
    listing_url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
    # Note: contentId is NOT part of the base attachment type — it must not be in $select.
    query = {"$select": "id,name,contentType,size,isInline"}
    try:
        payload = graph_get_json(listing_url, query)
        return payload.get("value", [])
    except Exception as e:
        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e)
        return []
def fetch_attachment_content(mailbox: str, graph_message_id: str, attachment_id: str) -> Optional[bytes]:
    """Download the raw bytes of one attachment through its /$value endpoint.

    Returns:
        The attachment content, or None when the download failed (the failure
        is written to the error log).
    """
    value_url = (
        f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}"
        f"/attachments/{attachment_id}/$value"
    )
    try:
        payload = graph_get_bytes(value_url)
    except Exception as e:
        logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s",
                      graph_message_id, attachment_id, e)
        return None
    return payload
# ─── Pomocné funkce ───────────────────────────────────────────────────────────
def normalize_name(name: str) -> str:
    """Normalize a file name for comparison.

    The name is lower-cased and stripped, diacritics are removed (NFKD
    decomposition with combining marks dropped) and every character other than
    a word character, '.' or '-' is replaced by '_'.
    """
    lowered = name.lower().strip()
    decomposed = unicodedata.normalize("NFKD", lowered)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return re.sub(r"[^\w.\-]", "_", "".join(base_chars))
def find_graph_att(att_name: str, att_size: int, graph_atts: list[dict]) -> Optional[dict]:
    """Fallback lookup of an attachment in a Graph listing by name.

    Used for e-mails that carry no graph_att_id. Matching tiers, first hit wins:
      1. exact name
      2. normalized name (see normalize_name)
      3. suffix match on the last 20 characters of the normalized name

    Within tiers 1 and 2 a candidate whose size is within 10 % of *att_size*
    is preferred, so several attachments sharing one name are told apart by
    size; when no candidate has a close size the first one is returned.
    (The original size tier could never run because the plain normalized
    match returned first.)

    Args:
        att_name: file name stored with the e-mail document.
        att_size: stored size in bytes; 0 or None means unknown.
        graph_atts: attachment metadata dicts ("name", optional "size").

    Returns:
        The matching Graph attachment dict, or None when nothing matches.
    """
    want_size = att_size or 0

    def size_close(ga: dict) -> bool:
        # An unknown size on either side cannot contradict the match.
        ga_size = ga.get("size") or 0
        if want_size == 0 or ga_size == 0:
            return True
        return abs(ga_size - want_size) / max(ga_size, want_size) < 0.1

    def pick(candidates: list[dict]) -> dict:
        for ga in candidates:
            if size_close(ga):
                return ga
        return candidates[0]

    # 1. Exact name
    exact = [ga for ga in graph_atts if ga.get("name") == att_name]
    if exact:
        return pick(exact)

    # 2. Normalized name
    norm_want = normalize_name(att_name)
    normalized = [ga for ga in graph_atts
                  if normalize_name(ga.get("name") or "") == norm_want]
    if normalized:
        return pick(normalized)

    # 3. Partial suffix match (last 20 characters of the normalized name)
    tail = norm_want[-20:]
    if tail:
        for ga in graph_atts:
            if normalize_name(ga.get("name") or "").endswith(tail):
                return ga
    return None
def sha256(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
def safe_filename(name: str, max_bytes: int = 240) -> str:
    """Turn an attachment name into a name that is safe to create on disk.

    Every character that is neither alphanumeric nor one of "._- ()" becomes
    "_", and surrounding whitespace is stripped. Attachment names come from
    e-mail senders, so two cases that made the later file write fail are
    handled as well:

      * names consisting only of dots/spaces ("", ".", "..") -> "attachment"
        ("." and ".." would address a directory instead of a file);
      * names longer than *max_bytes* UTF-8 bytes are shortened, keeping a
        short extension. The default leaves room below the common 255-byte
        file-name limit for a collision suffix such as "_12".

    Args:
        name: original attachment name.
        max_bytes: maximum length of the result in UTF-8 bytes.

    Returns:
        The sanitized, non-empty file name.
    """
    safe = "".join(c if c.isalnum() or c in "._- ()" else "_" for c in name).strip()
    if not safe.strip(". "):
        return "attachment"

    if len(safe.encode("utf-8")) > max_bytes:
        stem, dot, ext = safe.rpartition(".")
        # Keep the extension only when there is a real stem and the extension is short.
        suffix = dot + ext if stem and len(ext.encode("utf-8")) <= 16 else ""
        if not suffix:
            stem = safe
        budget = max(1, max_bytes - len(suffix.encode("utf-8")))
        # Cut on a byte budget; errors="ignore" drops a character split by the cut.
        stem = stem.encode("utf-8")[:budget].decode("utf-8", errors="ignore").rstrip()
        safe = (stem or "attachment") + suffix
    return safe
def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
    """Pick the file name under which content with hash *hash_val* is stored.

    Candidates are tried in order: desired_name, then "<stem>_2<suffix>",
    "<stem>_3<suffix>", ... A candidate is accepted when

      * the index already maps that name to the same hash, or
      * the index does not know the name and either no such file exists in
        *att_dir* or the existing file has exactly this content.

    The same rule now applies to desired_name itself; previously a name that
    was missing from the index was returned without looking at the disk, so a
    different file already lying there would have been overwritten.

    Args:
        desired_name: sanitized name the attachment should ideally get.
        att_dir: target directory of the mailbox.
        hash_val: SHA-256 hex digest of the content to be stored.
        col_index: attachment index collection (queried with find_one on "filename").

    Returns:
        A file name (not a path) that is safe to write in *att_dir*.
    """
    stem = Path(desired_name).stem
    suffix = Path(desired_name).suffix

    def usable(candidate: str) -> bool:
        indexed = col_index.find_one({"filename": candidate})
        if indexed:
            return indexed["_id"] == hash_val
        on_disk = att_dir / candidate
        if not on_disk.exists():
            return True
        # Unindexed file on disk: reuse the name only for identical content.
        try:
            return (on_disk.is_file()
                    and hashlib.sha256(on_disk.read_bytes()).hexdigest() == hash_val)
        except OSError:
            return False

    if usable(desired_name):
        return desired_name
    n = 2
    while True:
        candidate = f"{stem}_{n}{suffix}"
        if usable(candidate):
            return candidate
        n += 1
def save_attachment(
    content: bytes,
    original_name: str,
    mime_type: str,
    mailbox: str,
    att_dir: Path,
    col_index,
) -> tuple[str, str, bool]:
    """Store attachment content once per SHA-256 hash and index it.

    When the hash is already in the index only its ref_count is incremented.
    Otherwise the content is written to *att_dir* under a collision-free name
    and a new index document is inserted.

    Returns:
        (hash, local_path, was_new) — local_path is the stored file name;
        was_new is False when the hash was already indexed.
    """
    content_hash = sha256(content)

    known = col_index.find_one({"_id": content_hash})
    if known:
        col_index.update_one({"_id": content_hash}, {"$inc": {"ref_count": 1}})
        return content_hash, known["local_path"], False

    wanted_name = safe_filename(original_name)
    stored_name = resolve_filename(wanted_name, att_dir, content_hash, col_index)
    (att_dir / stored_name).write_bytes(content)

    index_doc = {
        "_id": content_hash,
        "filename": stored_name,
        "local_path": stored_name,
        "size_bytes": len(content),
        "mime_type": mime_type,
        "mailbox": mailbox,
        # stored as a naive datetime holding the UTC wall-clock time
        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
        "ref_count": 1,
    }
    col_index.insert_one(index_doc)
    return content_hash, stored_name, True
# ─── MAIN ─────────────────────────────────────────────────────────────────────
def process_mailbox(client, mailbox: str, args) -> dict:
    """Download and index the pending attachments of one mailbox.

    Selects e-mails that still have a non-inline attachment without file_hash
    (or every e-mail with attachments when args.force_recheck is set),
    downloads each such attachment through Graph, stores it via
    save_attachment() and queues an update of the e-mail's "attachments" array.

    Compared with the previous revision:
      * queued updates are flushed in a ``finally`` block, so an exception in
        the middle of the run no longer discards results already obtained;
      * an OSError while saving one attachment is counted as an error for that
        attachment instead of aborting the whole mailbox;
      * the progress separator is a visible line (it was an empty string).

    Args:
        client: connected MongoClient.
        mailbox: mailbox address; also the name of its collection.
        args: parsed CLI arguments (reads force_recheck and limit).

    Returns:
        Statistics dict with keys mailbox, ok, new, dup, skip, err, elapsed.
    """
    att_dir = EMAILS_BASE_DIR / mailbox / "Attachments"
    mongo_col = mailbox
    start = datetime.now()

    print(f"\n========== {mailbox} ==========")
    print(f"Cilovy adresar: {att_dir}")
    att_dir.mkdir(parents=True, exist_ok=True)

    col_emails = client[MONGO_DB][mongo_col]
    col_index = client[MONGO_DB][MONGO_COL_INDEX]

    if args.force_recheck:
        query = {"has_attachments": True}
    else:
        # Only e-mails with at least one real attachment that has no file_hash yet.
        query = {
            "has_attachments": True,
            "attachments": {
                "$elemMatch": {
                    "is_inline": False,
                    "file_hash": {"$exists": False},
                }
            }
        }

    total = col_emails.count_documents(query)
    print(f"Emailu ke zpracovani: {total}")
    if total == 0:
        print(" Neni co stahnout.")
        return {"mailbox": mailbox, "ok": 0, "new": 0, "dup": 0, "skip": 0, "err": 0,
                "elapsed": 0.0}

    cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
    if args.limit:
        cursor = cursor.limit(args.limit)

    ok_count = 0
    new_count = 0
    dup_count = 0
    skip_count = 0
    err_count = 0
    email_i = 0
    batch = []

    def flush():
        """Write the queued e-mail updates; a failure is logged, not raised."""
        if not batch:
            return
        try:
            col_emails.bulk_write(batch, ordered=False)
        except Exception as e:
            logging.error("bulk_write: %s", e)
            print(f" CHYBA bulk_write: {e}")
        batch.clear()

    try:
        for email_doc in cursor:
            email_i += 1
            email_id = email_doc["_id"]
            graph_id = email_doc.get("graph_id", "")
            subject = (email_doc.get("subject") or "")[:60]
            att_list = email_doc.get("attachments") or []
            real_atts = [a for a in att_list if not a.get("is_inline", False)]
            if not real_atts:
                continue

            print(f"\n {email_i:>5}/{total} {subject}")

            # Fetch the Graph attachment listing only when some pending
            # attachment has no graph_att_id and needs the name-matching fallback.
            need_listing = any(
                not a.get("is_inline", False)
                and not (not args.force_recheck and a.get("file_hash"))
                and not a.get("graph_att_id")
                for a in att_list
            )
            graph_atts = fetch_message_attachments(mailbox, graph_id) if need_listing else []

            updated_atts = list(att_list)
            email_ok = True

            for i, att in enumerate(updated_atts):
                if att.get("is_inline", False):
                    continue
                if not args.force_recheck and att.get("file_hash"):
                    continue

                att_name = att.get("filename", "")
                att_size = att.get("size_bytes", 0)
                graph_att_id = att.get("graph_att_id")

                # Skip S/MIME signatures
                if Path(att_name).suffix.lower() in SKIP_EXTENSIONS:
                    updated_atts[i] = {**att, "file_hash": "skip", "local_path": ""}
                    skip_count += 1
                    print(f" SKIP {att_name} (S/MIME)")
                    continue

                if graph_att_id:
                    # Direct download by graph_att_id (e-mails parsed by v1.2+)
                    content = fetch_attachment_content(mailbox, graph_id, graph_att_id)
                    if content is None:
                        err_count += 1
                        email_ok = False
                        print(f" ERR {att_name} (stazeni selhalo)")
                        continue
                    mime_type = att.get("mime_type", "")
                else:
                    # Fallback: name matching for old e-mails (parsed before v1.2)
                    graph_att = find_graph_att(att_name, att_size, graph_atts)
                    if not graph_att:
                        logging.error("attachment not found [email=%s att=%s]", email_id, att_name)
                        print(f" ERR {att_name} (nenalezeno)")
                        err_count += 1
                        email_ok = False
                        continue
                    # Graph reports the attachment as inline — mark and skip it
                    if graph_att.get("isInline", False):
                        updated_atts[i] = {**att, "is_inline": True, "file_hash": "skip", "local_path": ""}
                        skip_count += 1
                        print(f" SKIP {att_name} (inline obrazek)")
                        continue
                    content = fetch_attachment_content(mailbox, graph_id, graph_att["id"])
                    if content is None:
                        err_count += 1
                        email_ok = False
                        print(f" ERR {att_name} (stazeni selhalo)")
                        continue
                    mime_type = att.get("mime_type") or graph_att.get("contentType", "")

                try:
                    hash_val, local_path, was_new = save_attachment(
                        content, att_name, mime_type, mailbox, att_dir, col_index
                    )
                except OSError as e:
                    # A file that cannot be written must not abort the mailbox;
                    # the attachment keeps no file_hash and is retried next run.
                    logging.error("save_attachment failed [email=%s att=%s]: %s",
                                  email_id, att_name, e)
                    print(f" ERR {att_name} (ulozeni selhalo)")
                    err_count += 1
                    email_ok = False
                    continue

                updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}
                if was_new:
                    new_count += 1
                    print(f" NEW {local_path} ({len(content):,} B)")
                else:
                    dup_count += 1
                    print(f" DUP {att_name} -> {local_path}")

            if email_ok:
                ok_count += 1

            # Queue the update even for partially failed e-mails so that the
            # attachments that did succeed are not downloaded again.
            batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))
            if len(batch) >= BATCH_SIZE:
                flush()

            if email_i % 100 == 0:
                print(f" {'-' * 60}")
                print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} skip={skip_count} err={err_count}")
                print(f" {'-' * 60}")
    finally:
        # Persist whatever was queued, also when the loop ended with an exception.
        flush()
        cursor.close()

    elapsed = (datetime.now() - start).total_seconds()
    print(f" -> mailbox total: emaily={ok_count} nove={new_count} dup={dup_count} "
          f"skip={skip_count} err={err_count} ({elapsed:.1f} s)")
    return {"mailbox": mailbox, "ok": ok_count, "new": new_count, "dup": dup_count,
            "skip": skip_count, "err": err_count, "elapsed": elapsed}
def discover_mailboxes(db) -> list[str]:
    """Return the mailbox collections of *db* in sorted order.

    Collections listed in NON_MAILBOX_COLLECTIONS are dropped silently;
    mailboxes listed in SKIP_MAILBOXES are dropped with a console notice.
    """
    candidates = [
        collection
        for collection in sorted(db.list_collection_names())
        if collection not in NON_MAILBOX_COLLECTIONS
    ]
    mailboxes = []
    for collection in candidates:
        if collection not in SKIP_MAILBOXES:
            mailboxes.append(collection)
        else:
            print(f" [skip] {collection} — v SKIP_MAILBOXES (neni Graph pristup)")
    return mailboxes
def main():
    """Command-line entry point of the attachment downloader.

    Parses the arguments, verifies Graph and MongoDB connectivity, optionally
    creates the indexes on the attachment index collection, processes the
    selected mailbox(es) and prints a summary.

    Exit codes set through sys.exit(): 1 when Graph authentication or the
    MongoDB ping fails, 2 when --mailbox names a mailbox in SKIP_MAILBOXES.
    """
    ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
    ap.add_argument("--mailbox", default="",
                    help="Emailova schranka. Bez argumentu projede vsechny schranky "
                         "v `emaily` mimo SKIP_MAILBOXES.")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N emailu (0 = vse) — per schranka")
    ap.add_argument("--force-recheck", action="store_true",
                    help="Znovu overi i emaily kde prilohy uz maji file_hash")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na attachments_index kolekci")
    args = ap.parse_args()

    start_all = datetime.now()
    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
    print(f"Start: {start_all.strftime('%Y-%m-%d %H:%M:%S')}")

    # Acquire the Graph token up front so bad credentials fail before any work.
    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        print(" MongoDB OK")
    except Exception as e:
        print(f" CHYBA: MongoDB neni dostupna -- {e}")
        sys.exit(1)

    # Indexes are created before processing starts (skipped with --no-indexes).
    col_index = client[MONGO_DB][MONGO_COL_INDEX]
    if not args.no_indexes:
        col_index.create_index("filename")
        col_index.create_index("mime_type")
        col_index.create_index("mailbox")

    db = client[MONGO_DB]
    if args.mailbox:
        # An explicitly requested mailbox without Graph access is a usage error.
        if args.mailbox in SKIP_MAILBOXES:
            print(f" CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.")
            sys.exit(2)
        mailboxes = [args.mailbox]
    else:
        mailboxes = discover_mailboxes(db)
        print(f" Schranky ke zpracovani: {len(mailboxes)}")
        for m in mailboxes:
            print(f" {m}")

    results = []
    for mb in mailboxes:
        try:
            results.append(process_mailbox(client, mb, args))
        except Exception as e:
            # A failing mailbox is recorded as one error; the others still run.
            logging.error("process_mailbox %s: %s", mb, e)
            print(f" FATAL pri zpracovani {mb}: {e}")
            results.append({"mailbox": mb, "ok": 0, "new": 0, "dup": 0,
                            "skip": 0, "err": 1, "elapsed": 0.0})

    elapsed_total = (datetime.now() - start_all).total_seconds()
    # Totals over the whole attachment index, not only over this run.
    files_total = col_index.count_documents({})
    size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))
    grand = {k: sum(r[k] for r in results) for k in ("ok", "new", "dup", "skip", "err")}

    print(f"\n{'='*60}")
    print("=== SHRNUTI ===")
    for r in results:
        print(f" {r['mailbox']:40} ok={r['ok']:>5} nove={r['new']:>4} "
              f"dup={r['dup']:>4} skip={r['skip']:>3} err={r['err']:>3}")
    print(f" {'TOTAL':40} ok={grand['ok']:>5} nove={grand['new']:>4} "
          f"dup={grand['dup']:>4} skip={grand['skip']:>3} err={grand['err']:>3}")
    print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)")
    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if grand['err']:
        print(f"Chyby logovany do: {LOG_FILE}")
    client.close()
# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
@@ -0,0 +1,74 @@
# 3_download_attachments_v1.4.py
**Krok 3 pipeline** — stahuje skutečné přílohy (`is_inline=False`) z Mongo emailů přes Graph API do `/mnt/Emails/<schránka>/Attachments/`. Deduplikace podle **SHA256** obsahu.
## Nové ve verzi 1.4
| Typ přílohy | `@odata.type` | Co skript dělá |
|---|---|---|
| **File** | `#microsoft.graph.fileAttachment` | Stáhne přes `/$value`, uloží binárku |
| **Item** (vnořený email) | `#microsoft.graph.itemAttachment` | `$expand=...itemAttachment/item`, sestaví **`.eml`** z hlaviček a body vnitřní zprávy |
| **Reference** (OneDrive/SharePoint link) | `#microsoft.graph.referenceAttachment` | Žádný file — uloží jen `reference_url` do Mongo |
Plus:
- **Retry** s exponenciálním backoffem na 429/500/502/503/504 (1s, 2s, 4s; respektuje `Retry-After`).
- **Permanentní označení chyb v Mongo** per-attachment:
- `attachment_missing: True` + `attachment_missing_at: <UTC>` při 404 (email/příloha už neexistuje v mailboxu)
- `attachment_reference: True` + `reference_url: <URL>` u referenceAttachment
- Tagované přílohy se při dalším běhu **automaticky přeskočí** (bez `--force-recheck`).
## Argumenty
| Argument | Povinný | Hodnoty | Default | Popis |
|---|---|---|---|---|
| `--mailbox` | ne | e-mail | (všechny) | Schránka = kolekce v Mongo. Bez argumentu projede všechny kolekce mimo `NON_MAILBOX_COLLECTIONS` a `SKIP_MAILBOXES` |
| `--limit N` | ne | int | 0 | Per schránka, jen prvních N emailů (test) |
| `--force-recheck` | ne | flag | false | Znovu ověří i emaily kde přílohy mají `file_hash` **nebo** `attachment_missing` **nebo** `attachment_reference` |
| `--no-indexes` | ne | flag | false | Nevytváří indexy na `attachments_index` |
## SKIP_MAILBOXES (hardcoded)
| Schránka | Důvod |
|---|---|
| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Bez `--mailbox` se tiše přeskočí. S explicitním `--mailbox vbuzalka@its.jnj.com` skript skončí exit kódem 2. |
## Statistiky per schránka
```
ok=N nove=N dup=N skip=N miss=N ref=N err=N
```
| Kategorie | Význam |
|---|---|
| `ok` | emaily zpracované bez chyby (všechny přílohy hotové) |
| `nove` | nové soubory uložené (NEW + NEW(eml)) |
| `dup` | hash už existuje (jen ref_count++) |
| `skip` | S/MIME (.p7m/.p7s/...) nebo inline obrázek |
| `miss` | 404 — označeno `attachment_missing` (při dalším běhu se už nestahuje) |
| `ref` | referenceAttachment — uložen jen URL |
| `err` | tranzientní chyba (5xx, timeout) — bude retry při dalším běhu |
## Varianty volání
```bash
# Všechny schránky (mimo SKIP_MAILBOXES):
docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py
# Jedna schránka:
docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz
# Test 50 emailů:
docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
# Force-recheck (i missing/reference přepíše):
docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --force-recheck
# Na pozadí:
docker exec -d python-runner bash -c "python /scripts/3_download_attachments_v1.4.py > /scripts/download_attachments.log 2>&1"
```
## Sledování průběhu
```bash
docker exec -it python-runner tail -f /scripts/download_attachments.log
```
@@ -0,0 +1,713 @@
"""
download_attachments_v1.4.py
Nazev: download_attachments_v1.4.py
Verze: 1.4
Datum: 2026-06-04
Autor: vladimir.buzalka
Popis:
Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
pres Microsoft Graph API a uklada je do adresare
/mnt/Emails/<schranka>/Attachments/.
Bez argumentu --mailbox projede vsechny kolekce v `emaily` mimo
NON_MAILBOX_COLLECTIONS a SKIP_MAILBOXES.
Deduplikace podle SHA256 hashe obsahu:
- stejny hash = soubor uz existuje -> preskoci
- prvni vyskyt: ulozi pod puvodnim nazvem
- kolize nazvu: faktura_2.pdf, faktura_3.pdf ...
Po ulozeni aktualizuje MongoDB:
- v email dokumentu: kazda priloha dostane file_hash + local_path
- kolekce emaily.attachments_index: _id=hash, filename, ...
NOVE v 1.4:
- Spravne zpracovani vsech typu priloh:
* fileAttachment -> /$value (jako predtim)
* itemAttachment -> ?$expand=microsoft.graph.itemAttachment/item
-> sestavi .eml z vnitrni zpravy
* referenceAttachment -> ulozi jen URL, neexistuje content
- Retry s exponencialnim backoffem (1s, 2s, 4s) na 429/5xx
- Permanentni tagging chyb v Mongo per-attachment:
* attachment_missing: True (404, email/att uz neexistuje)
* attachment_reference: True (referenceAttachment, jen URL)
* reference_url, attachment_type — diagnosticke metadata
- Tagovane prilohy se pri dalsim behu preskoci (bez --force-recheck)
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
Spousteni:
python download_attachments_v1.4.py
python download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz
python download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50
python download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --force-recheck
SKIP_MAILBOXES (hardcoded):
vbuzalka@its.jnj.com — JNJ tenant, nemame Graph API pristup.
Docker:
docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py
Zavislosti:
msal, requests, pymongo
Python 3.10+
Historie verzi:
1.0 2026-06-02 Inicialni verze
1.1 2026-06-02 Schranka jako parametr --mailbox
1.2 2026-06-02 Oprava: Graph attachment mapa vcetne inline; normalizace nazvu
1.3 2026-06-02 Primarni stazeni pres graph_att_id; --mailbox volitelny
1.4 2026-06-04 itemAttachment/referenceAttachment handling; retry s backoffem;
permanentni tagging chyb (attachment_missing / attachment_reference)
"""
import sys
import re
import time
import json
import hashlib
import logging
import argparse
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional
import msal
import requests
from pymongo import MongoClient, UpdateOne
# Force UTF-8 console output where the stream supports it; characters that
# cannot be encoded are replaced instead of raising.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

# ─── CONFIGURATION ────────────────────────────────────────────────────────────
# Imported here so that this configuration block is self-contained.
import os

# SECURITY: the Graph credentials used to be hard-coded only. They can now be
# supplied through environment variables of the same name; the literals remain
# solely as a backward-compatible fallback.
# TODO(review): rotate this client secret and remove the literal from source control.
GRAPH_TENANT_ID = os.environ.get("GRAPH_TENANT_ID") or "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = os.environ.get("GRAPH_CLIENT_ID") or "4b222bfd-78c9-4239-a53f-43006b3ed07f"
GRAPH_CLIENT_SECRET = os.environ.get("GRAPH_CLIENT_SECRET") or "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
GRAPH_URL = "https://graph.microsoft.com/v1.0"

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
MONGO_COL_INDEX = "attachments_index"

EMAILS_BASE_DIR = Path("/mnt/Emails")
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.4"
BATCH_SIZE = 50

# Attachment types that are skipped (S/MIME signatures, certificates)
SKIP_EXTENSIONS = {".p7m", ".p7s", ".p7c", ".p7b"}

# Collections in the `emaily` database that are NOT mailboxes
NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}

# Mailboxes without Graph API access
SKIP_MAILBOXES = {
    "vbuzalka@its.jnj.com",
}

# Retry configuration for transient failures
RETRY_STATUSES = {429, 500, 502, 503, 504}
# Sleep (seconds) before retry 1, 2 and 3 — i.e. at most 4 attempts in total.
RETRY_BACKOFF_S = [1, 2, 4]

# Sentinel values returned by fetch_attachment_smart
FETCH_MISSING = "__MISSING__"      # 404
FETCH_REFERENCE = "__REFERENCE__"  # referenceAttachment
# ──────────────────────────────────────────────────────────────────────────────

# Only errors are written to the log file.
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)

# Cached Graph access token; filled in by get_token().
_graph_token: Optional[str] = None
# ─── Graph API ────────────────────────────────────────────────────────────────
def get_token() -> str:
    """Request an app-only Graph access token and cache it in _graph_token.

    Returns:
        The access token string.

    Raises:
        RuntimeError: when the MSAL response carries no "access_token".
    """
    global _graph_token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    msal_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    auth_result = msal_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )
    if "access_token" in auth_result:
        _graph_token = auth_result["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {auth_result}")
def _retry_after_seconds(header_value, default: float) -> float:
    """Return the delay announced by a Retry-After header, or *default*.

    Only the plain delta-seconds form (digits with an optional fraction) is
    accepted; a missing header, an HTTP-date or any malformed value yields
    *default*.
    """
    if header_value and re.fullmatch(r"\d+(\.\d+)?", header_value.strip()):
        return float(header_value)
    return default


def _graph_request(method: str, url: str, *, params: dict = None,
                   stream: bool = False, timeout: int = 60):
    """Low-level Graph HTTP call with retry and automatic re-authentication.

    * 401: a new token is requested and the call repeated.
    * Status in RETRY_STATUSES, ConnectionError or Timeout: retried after a
      pause from RETRY_BACKOFF_S; a valid Retry-After header takes precedence.

    The caller inspects the returned response: no status (404 included) is
    turned into an exception here. A response that is about to be retried is
    closed first. A malformed Retry-After value now falls back to the
    configured backoff instead of raising ValueError.

    Returns:
        requests.Response of the last attempt.

    Raises:
        requests.ConnectionError / requests.Timeout: when the last attempt
            failed at connection level.
        RuntimeError: when the last attempt was still answered with 401.
    """
    global _graph_token
    if not _graph_token:
        get_token()
    max_retries = len(RETRY_BACKOFF_S)
    last_exc = None
    for attempt in range(max_retries + 1):
        try:
            r = requests.request(
                method, url,
                headers={"Authorization": f"Bearer {_graph_token}"},
                params=params, timeout=timeout, stream=stream,
            )
            if r.status_code == 401:
                r.close()
                get_token()
                continue
            if r.status_code in RETRY_STATUSES and attempt < max_retries:
                # The Retry-After header takes precedence over the fixed backoff.
                sleep_s = _retry_after_seconds(r.headers.get("Retry-After"),
                                               RETRY_BACKOFF_S[attempt])
                r.close()
                time.sleep(sleep_s)
                continue
            return r
        except (requests.ConnectionError, requests.Timeout) as e:
            last_exc = e
            if attempt < max_retries:
                time.sleep(RETRY_BACKOFF_S[attempt])
                continue
            raise
    raise RuntimeError(f"Graph request exhausted retries: {url} (last_exc={last_exc})")
def graph_get_json(url: str, params: dict = None) -> dict:
    """GET *url* through _graph_request and return the decoded JSON body.

    Raises:
        requests.HTTPError: for a non-success status.
    """
    response = _graph_request("GET", url, params=params, timeout=30)
    response.raise_for_status()
    payload = response.json()
    return payload
def graph_get_bytes(url: str) -> bytes:
    """GET *url* through _graph_request and return the raw response body.

    Raises:
        requests.HTTPError: for a non-success status.
    """
    response = _graph_request("GET", url, stream=True, timeout=120)
    response.raise_for_status()
    raw = response.content
    return raw
def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
    """Load the attachment metadata of one message.

    Best effort: any failure is written to the error log and an empty list is
    returned instead of raising.
    """
    listing_url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
    # @odata.type is returned automatically (it is not part of the base $select)
    query = {"$select": "id,name,contentType,size,isInline"}
    try:
        payload = graph_get_json(listing_url, query)
        return payload.get("value", [])
    except Exception as e:
        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e)
        return []
def _build_eml_from_item(item: dict) -> bytes:
    """Build an RFC 5322 .eml from the ``item`` of an itemAttachment (a message).

    The message is assembled with the standard ``email`` package instead of
    string concatenation, which fixes several defects of the hand-built form:

      * non-ASCII subjects and display names are RFC 2047 encoded;
      * CR/LF inside header values is collapsed to spaces (no header injection);
      * the Date header is RFC 5322 formatted instead of raw ISO 8601
        (a value that cannot be parsed is kept as received);
      * the UTF-8 body carries a Content-Transfer-Encoding (base64).

    Args:
        item: message resource as returned by Graph; reads subject, from,
            sender, toRecipients, ccRecipients, sentDateTime,
            receivedDateTime and body.

    Returns:
        The serialized message with CRLF line endings.
    """
    # Function-scope imports: the module does not import the email package.
    from email.header import Header
    from email.mime.text import MIMEText
    from email.utils import format_datetime, formataddr

    def clean(value) -> str:
        """Single-line, UTF-8 encodable text for a header value."""
        text = str(value or "").encode("utf-8", errors="replace").decode("utf-8")
        return " ".join(text.split())

    def one_address(email_address: dict) -> str:
        addr = clean(email_address.get("address"))
        if not addr:
            return ""
        name = clean(email_address.get("name"))
        try:
            return formataddr((name, addr), "utf-8")
        except UnicodeError:
            # formataddr rejects a non-ASCII address; keep the bare address.
            return addr

    def recipients(field: str) -> str:
        formatted = (one_address(rec.get("emailAddress") or {})
                     for rec in item.get(field) or [])
        return ", ".join(a for a in formatted if a)

    def rfc5322_date(raw) -> str:
        iso = clean(raw)
        if not iso:
            return ""
        if iso[-1] in "Zz":
            # datetime.fromisoformat() accepts "Z" only from Python 3.11 on.
            iso = iso[:-1] + "+00:00"
        # Trim fractional seconds to the microseconds fromisoformat() can parse.
        iso = re.sub(r"(\.\d{6})\d+", r"\1", iso)
        try:
            return format_datetime(datetime.fromisoformat(iso))
        except ValueError:
            return clean(raw)

    body = item.get("body") or {}
    subtype = "html" if str(body.get("contentType") or "text").lower() == "html" else "plain"
    payload = str(body.get("content") or "").encode("utf-8", errors="replace").decode("utf-8")
    # MIMEText adds Content-Type, MIME-Version and Content-Transfer-Encoding.
    msg = MIMEText(payload, subtype, "utf-8")

    def add_header(name: str, value: str) -> None:
        if not value:
            return
        msg[name] = value if value.isascii() else Header(value, "utf-8", header_name=name)

    sender = item.get("from") or item.get("sender") or {}
    add_header("From", one_address(sender.get("emailAddress") or {}))
    add_header("To", recipients("toRecipients"))
    add_header("Cc", recipients("ccRecipients"))
    add_header("Subject", clean(item.get("subject")))
    add_header("Date", rfc5322_date(item.get("sentDateTime") or item.get("receivedDateTime")))

    return msg.as_bytes(policy=msg.policy.clone(linesep="\r\n"))
def fetch_attachment_smart(mailbox: str, graph_message_id: str,
                           attachment_id: str, odata_type: str = "") -> tuple:
    """Fetch one attachment, choosing the request by attachment type.

    Returns a 3-tuple ``(content, kind, extra)``:
      * ``(bytes, "file", None)`` — fileAttachment content
      * ``(bytes, "item", subject)`` — itemAttachment rendered as .eml
      * ``(None, FETCH_REFERENCE, url_or_name)`` — referenceAttachment
      * ``(None, FETCH_MISSING, None)`` — Graph answered 404

    When *odata_type* is empty or unknown, ``/$value`` is tried first; on 405
    the attachment metadata is read to detect the real type.

    Raises:
        requests.HTTPError: for other error statuses.
        RuntimeError: when the detected type cannot be handled.
    """
    att_url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments/{attachment_id}"
    value_url = att_url + "/$value"
    gone = (None, FETCH_MISSING, None)

    # Known type -> direct route
    if odata_type == "#microsoft.graph.fileAttachment":
        resp = _graph_request("GET", value_url, stream=True, timeout=120)
        if resp.status_code == 404:
            return gone
        resp.raise_for_status()
        return (resp.content, "file", None)

    if odata_type == "#microsoft.graph.itemAttachment":
        resp = _graph_request("GET", att_url,
                              params={"$expand": "microsoft.graph.itemAttachment/item"},
                              timeout=60)
        if resp.status_code == 404:
            return gone
        resp.raise_for_status()
        inner = resp.json().get("item") or {}
        return (_build_eml_from_item(inner), "item", inner.get("subject"))

    if odata_type == "#microsoft.graph.referenceAttachment":
        resp = _graph_request("GET", att_url, timeout=30)
        if resp.status_code == 404:
            return gone
        resp.raise_for_status()
        meta = resp.json()
        return (None, FETCH_REFERENCE, meta.get("sourceUrl") or meta.get("name"))

    # Unknown type — try $value first
    probe = _graph_request("GET", value_url, stream=True, timeout=120)
    if probe.status_code == 404:
        return gone
    if probe.status_code != 405:
        probe.raise_for_status()
        return (probe.content, "file", None)

    # 405 Method Not Allowed -> not a plain file; read the metadata for the type
    meta_resp = _graph_request("GET", att_url, timeout=30)
    if meta_resp.status_code == 404:
        return gone
    meta_resp.raise_for_status()
    meta = meta_resp.json()
    detected = meta.get("@odata.type", "")

    if detected == "#microsoft.graph.itemAttachment":
        # The metadata has no item without $expand -> second request
        return fetch_attachment_smart(mailbox, graph_message_id, attachment_id, detected)
    if detected == "#microsoft.graph.referenceAttachment":
        return (None, FETCH_REFERENCE, meta.get("sourceUrl") or meta.get("name"))
    if detected == "#microsoft.graph.fileAttachment":
        # Fallback: take the content from the contentBytes field of the JSON
        import base64
        encoded = meta.get("contentBytes")
        if encoded:
            return (base64.b64decode(encoded), "file", None)
    raise RuntimeError(f"unknown attachment odata.type={detected}")
# ─── Pomocne funkce ───────────────────────────────────────────────────────────
def normalize_name(name: str) -> str:
    """Return a lower-cased, accent-stripped form of *name* for loose matching.

    The name is lower-cased and trimmed, decomposed with NFKD so combining
    marks can be dropped, and every character that is not a word character,
    ``.`` or ``-`` is replaced by ``_``.
    """
    decomposed = unicodedata.normalize("NFKD", name.lower().strip())
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return re.sub(r"[^\w.\-]", "_", "".join(base_chars))
def find_graph_att(att_name: str, att_size: int, graph_atts: list[dict]) -> Optional[dict]:
    """Find the Graph attachment entry that corresponds to a stored attachment.

    Matching tiers, first hit wins:
      1. exact name match,
      2. match on ``normalize_name()`` of both names,
      3. Graph name (normalized) ends with the last 20 characters of the
         normalized wanted name.

    Within tiers 1 and 2 an entry whose size is compatible with *att_size*
    (either size unknown/0, or relative difference below 10 %) is preferred;
    if no candidate is size-compatible the first candidate of the tier is
    returned, which is what the previous implementation always did.

    Args:
        att_name: attachment file name as stored with the email document.
        att_size: stored size in bytes; 0 or None means "unknown".
        graph_atts: attachment dicts from the Graph listing (``name``, ``size``).

    Returns:
        The matching Graph attachment dict, or None when nothing matches.
    """
    want_size = att_size or 0

    def size_compatible(ga: dict) -> bool:
        got = ga.get("size", 0) or 0
        if want_size == 0 or got == 0:
            return True
        return abs(got - want_size) / max(got, want_size) < 0.1

    def pick(candidates: list[dict]) -> dict:
        # Several attachments can share one name (e.g. two "scan.pdf");
        # the size is then the only thing that tells them apart.
        for ga in candidates:
            if size_compatible(ga):
                return ga
        return candidates[0]

    exact = [ga for ga in graph_atts if ga.get("name") == att_name]
    if exact:
        return pick(exact)

    norm_want = normalize_name(att_name or "")
    normalized = [(ga, normalize_name(ga.get("name") or "")) for ga in graph_atts]

    same_norm = [ga for ga, norm in normalized if norm == norm_want]
    if same_norm:
        return pick(same_norm)

    tail = norm_want[-20:]
    if tail:
        for ga, norm in normalized:
            if norm.endswith(tail):
                return ga
    return None
def sha256(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
def safe_filename(name: str) -> str:
    """Turn an attachment name from an email into a safe local file name.

    Every character that is not alphanumeric or one of ``._- ()`` is replaced
    by ``_`` (path separators included) and surrounding whitespace is
    trimmed.

    Returns ``"attachment"`` when nothing usable remains: an empty or None
    name, or a name made only of dots. ``"."`` and ``".."`` would otherwise
    resolve to a directory instead of a file inside the target folder.
    """
    cleaned = "".join(
        ch if ch.isalnum() or ch in "._- ()" else "_" for ch in (name or "")
    ).strip()
    if not cleaned.strip("."):
        return "attachment"
    return cleaned
def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
    """Pick a file name for content *hash_val* that does not clash in the index.

    *desired_name* is returned unchanged when no index record uses it, or when
    the record using it belongs to the same hash. Otherwise ``<stem>_2<suffix>``,
    ``<stem>_3<suffix>``, ... are tried until a candidate is found that is
    either already registered for this hash, or is neither in the index nor
    present on disk in *att_dir*.
    """
    owner = col_index.find_one({"filename": desired_name})
    if not owner or owner["_id"] == hash_val:
        return desired_name

    wanted = Path(desired_name)
    counter = 2
    while True:
        candidate = f"{wanted.stem}_{counter}{wanted.suffix}"
        record = col_index.find_one({"filename": candidate})
        if not record or record["_id"] == hash_val:
            # A truthy record here necessarily carries our own hash.
            free_on_disk = not (att_dir / candidate).exists()
            if free_on_disk or record:
                return candidate
        counter += 1
def save_attachment(content: bytes, original_name: str, mime_type: str,
                    mailbox: str, att_dir: Path, col_index) -> tuple[str, str, bool]:
    """Store attachment bytes once per distinct content and register them in the index.

    The SHA-256 of *content* is the index ``_id``. If a record with that hash
    exists, only its ``ref_count`` is incremented and the stored ``local_path``
    is returned. Otherwise the bytes are written into *att_dir* under a
    collision-free name and a new index record is inserted.

    Returns:
        ``(hash, local_path, was_new)``; ``was_new`` is False for a duplicate.
    """
    digest = sha256(content)
    known = col_index.find_one({"_id": digest})
    if known:
        col_index.update_one({"_id": digest}, {"$inc": {"ref_count": 1}})
        return digest, known["local_path"], False

    cleaned_name = safe_filename(original_name)
    filename = resolve_filename(cleaned_name, att_dir, digest, col_index)
    (att_dir / filename).write_bytes(content)

    index_record = {
        "_id": digest,
        "filename": filename,
        "local_path": filename,
        "size_bytes": len(content),
        "mime_type": mime_type,
        "mailbox": mailbox,
        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
        "ref_count": 1,
    }
    col_index.insert_one(index_record)
    return digest, filename, True
# ─── MAIN ─────────────────────────────────────────────────────────────────────
def process_mailbox(client, mailbox: str, args) -> dict:
    """Download and index the pending attachments of one mailbox collection.

    For every email matching the query, each non-inline attachment that is not
    yet hashed (or every one, with ``args.force_recheck``) is resolved to a
    Graph attachment id, fetched via ``fetch_attachment_smart`` and stored via
    ``save_attachment``. The email's ``attachments`` array is rewritten in
    batches of ``BATCH_SIZE`` updates.

    Failures are isolated: a failed attachment listing, download, save or
    ``bulk_write`` is logged, counted in ``err`` and processing continues.
    Updates already queued are flushed even if the loop is left by an
    exception, so attachments that were saved are not processed again on the
    next run.

    Args:
        client: MongoDB client; ``client[MONGO_DB]`` holds the collections.
        mailbox: mailbox address, also the name of its email collection.
        args: parsed CLI arguments; ``force_recheck`` and ``limit`` are read.

    Returns:
        Dict with keys ``mailbox, ok, new, dup, skip, miss, ref, err, elapsed``.
    """
    att_dir = EMAILS_BASE_DIR / mailbox / "Attachments"
    mongo_col = mailbox
    start = datetime.now()
    print(f"\n========== {mailbox} ==========")
    print(f"Cilovy adresar: {att_dir}")
    att_dir.mkdir(parents=True, exist_ok=True)
    col_emails = client[MONGO_DB][mongo_col]
    col_index = client[MONGO_DB][MONGO_COL_INDEX]
    if args.force_recheck:
        query = {"has_attachments": True}
    else:
        # An attachment is "pending" when it is not inline, has no file_hash
        # and is not marked as missing / reference.
        query = {
            "has_attachments": True,
            "attachments": {
                "$elemMatch": {
                    "is_inline": False,
                    "file_hash": {"$exists": False},
                    "attachment_missing": {"$ne": True},
                    "attachment_reference": {"$ne": True},
                }
            }
        }
    total = col_emails.count_documents(query)
    print(f"Emailu ke zpracovani: {total}")
    if total == 0:
        print(" Neni co stahnout.")
        return {"mailbox": mailbox, "ok": 0, "new": 0, "dup": 0, "skip": 0,
                "miss": 0, "ref": 0, "err": 0, "elapsed": 0.0}
    cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
    if args.limit:
        cursor = cursor.limit(args.limit)

    ok_count = 0
    new_count = 0
    dup_count = 0
    skip_count = 0
    miss_count = 0
    ref_count = 0
    err_count = 0
    email_i = 0
    batch = []

    def flush():
        """Write queued email updates; a failed write is logged and counted."""
        nonlocal err_count
        if not batch:
            return
        try:
            col_emails.bulk_write(batch, ordered=False)
        except Exception as e:
            err_count += 1
            logging.error("bulk_write: %s", e)
            print(f" CHYBA bulk_write: {e}")
        batch.clear()

    try:
        for email_doc in cursor:
            email_i += 1
            email_id = email_doc["_id"]
            graph_id = email_doc.get("graph_id", "")
            subject = (email_doc.get("subject") or "")[:60]
            att_list = email_doc.get("attachments") or []
            real_atts = [a for a in att_list if not a.get("is_inline", False)
                         and not a.get("attachment_missing")
                         and not a.get("attachment_reference")]
            if not real_atts:
                continue
            print(f"\n {email_i:>5}/{total} {subject}")

            # The Graph listing is only needed when some attachment still
            # lacks a stored graph_att_id and has to be matched by name.
            need_listing = any(
                not a.get("is_inline", False)
                and not (not args.force_recheck and a.get("file_hash"))
                and not a.get("graph_att_id")
                for a in att_list
            )
            try:
                graph_atts = fetch_message_attachments(mailbox, graph_id) if need_listing else []
            except Exception as e:
                # One unreadable message must not abort the whole mailbox.
                logging.error("fetch_message_attachments failed [email=%s msg=%s]: %s",
                              email_id, graph_id, e)
                print(" ERR (vypis priloh selhal)")
                err_count += 1
                continue
            # graph_att_id -> @odata.type (only known when the listing ran)
            type_map = {ga["id"]: ga.get("@odata.type", "") for ga in graph_atts}

            updated_atts = list(att_list)
            email_ok = True
            for i, att in enumerate(updated_atts):
                if att.get("is_inline", False):
                    continue
                # NOTE(review): the --force-recheck help text mentions
                # missing/reference attachments, but they are skipped here
                # regardless of the flag — confirm which is intended.
                if att.get("attachment_missing") or att.get("attachment_reference"):
                    continue
                if not args.force_recheck and att.get("file_hash"):
                    continue
                att_name = att.get("filename") or ""
                att_size = att.get("size_bytes", 0)
                graph_att_id = att.get("graph_att_id")
                if Path(att_name).suffix.lower() in SKIP_EXTENSIONS:
                    updated_atts[i] = {**att, "file_hash": "skip", "local_path": ""}
                    skip_count += 1
                    print(f" SKIP {att_name} (S/MIME)")
                    continue

                # Resolve graph_att_id + odata_type
                resolved_id = graph_att_id
                odata_type = type_map.get(graph_att_id, "") if graph_att_id else ""
                if not resolved_id:
                    # Fallback: match by name (legacy documents without an id)
                    graph_att = find_graph_att(att_name, att_size, graph_atts)
                    if not graph_att:
                        logging.error("attachment not found [email=%s att=%s]", email_id, att_name)
                        print(f" ERR {att_name} (nenalezeno)")
                        err_count += 1
                        email_ok = False
                        continue
                    if graph_att.get("isInline", False):
                        updated_atts[i] = {**att, "is_inline": True, "file_hash": "skip", "local_path": ""}
                        skip_count += 1
                        print(f" SKIP {att_name} (inline obrazek)")
                        continue
                    resolved_id = graph_att["id"]
                    odata_type = graph_att.get("@odata.type", "")

                # Smart fetch
                try:
                    content, kind, extra = fetch_attachment_smart(
                        mailbox, graph_id, resolved_id, odata_type
                    )
                except Exception as e:
                    logging.error("fetch_attachment_smart failed [msg=%s att=%s type=%s]: %s",
                                  graph_id, resolved_id, odata_type, e)
                    err_count += 1
                    email_ok = False
                    print(f" ERR {att_name} (stazeni selhalo)")
                    continue

                now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
                if kind == FETCH_MISSING:
                    updated_atts[i] = {
                        **att,
                        "attachment_missing": True,
                        "attachment_missing_at": now_utc,
                    }
                    miss_count += 1
                    print(f" MISS {att_name} (404 — oznaceno jako missing)")
                    continue
                if kind == FETCH_REFERENCE:
                    updated_atts[i] = {
                        **att,
                        "attachment_reference": True,
                        "attachment_type": "reference",
                        "reference_url": extra,
                    }
                    ref_count += 1
                    print(f" REF {att_name} -> {extra}")
                    continue

                # kind in ('file', 'item') — we have the bytes
                mime_type = att.get("mime_type") or (
                    "message/rfc822" if kind == "item" else "application/octet-stream"
                )
                # An itemAttachment is saved as .eml; add the suffix if absent
                save_name = att_name
                if kind == "item" and not save_name.lower().endswith(".eml"):
                    save_name = (save_name or "embedded_email") + ".eml"
                try:
                    hash_val, local_path, was_new = save_attachment(
                        content, save_name, mime_type, mailbox, att_dir, col_index
                    )
                except Exception as e:
                    # Disk or index write failure: report it for this
                    # attachment and keep going with the rest.
                    logging.error("save_attachment failed [email=%s att=%s]: %s",
                                  email_id, att_name, e)
                    err_count += 1
                    email_ok = False
                    print(f" ERR {att_name} (ulozeni selhalo)")
                    continue
                updated_atts[i] = {
                    **att,
                    "file_hash": hash_val,
                    "local_path": local_path,
                    "attachment_type": kind,
                }
                if was_new:
                    new_count += 1
                    tag = "NEW(eml)" if kind == "item" else "NEW"
                    print(f" {tag} {local_path} ({len(content):,} B)")
                else:
                    dup_count += 1
                    print(f" DUP {att_name} -> {local_path}")

            if email_ok:
                ok_count += 1
            # Persist partial progress too, so successful attachments of a
            # partly failed email are not downloaded again.
            batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))
            if len(batch) >= BATCH_SIZE:
                flush()
            if email_i % 100 == 0:
                print(f" {'-' * 60}")
                print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} "
                      f"skip={skip_count} miss={miss_count} ref={ref_count} err={err_count}")
                print(f" {'-' * 60}")
    finally:
        # Runs on normal completion and on an unexpected exception alike.
        flush()

    elapsed = (datetime.now() - start).total_seconds()
    print(f" -> mailbox total: emaily={ok_count} nove={new_count} dup={dup_count} "
          f"skip={skip_count} miss={miss_count} ref={ref_count} err={err_count} ({elapsed:.1f} s)")
    return {"mailbox": mailbox, "ok": ok_count, "new": new_count, "dup": dup_count,
            "skip": skip_count, "miss": miss_count, "ref": ref_count, "err": err_count,
            "elapsed": elapsed}
def discover_mailboxes(db) -> list[str]:
    """Return the collection names of *db* that are treated as mailboxes, sorted.

    Names listed in ``NON_MAILBOX_COLLECTIONS`` are dropped silently; names in
    ``SKIP_MAILBOXES`` are dropped with a ``[skip]`` line on stdout.
    """
    mailboxes = []
    for collection in sorted(db.list_collection_names()):
        if collection in NON_MAILBOX_COLLECTIONS:
            continue
        if collection not in SKIP_MAILBOXES:
            mailboxes.append(collection)
        else:
            print(f" [skip] {collection} — v SKIP_MAILBOXES (neni Graph pristup)")
    return mailboxes
def main():
    """Command-line entry point: download attachments for one or all mailboxes.

    Checks Graph authentication and MongoDB connectivity (exit status 1 on
    failure), optionally creates indexes on the attachment index collection,
    processes every selected mailbox and prints a summary table.

    Returns:
        1 when any mailbox reported an error, otherwise 0. Exits with status 2
        when ``--mailbox`` names a mailbox listed in ``SKIP_MAILBOXES``.
    """
    parser = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
    parser.add_argument(
        "--mailbox", default="",
        help="Emailova schranka. Bez argumentu projede vsechny schranky.")
    parser.add_argument(
        "--limit", type=int, default=0,
        help="Zpracovat max N emailu (0 = vse) — per schranka")
    parser.add_argument(
        "--force-recheck", action="store_true",
        help="Znovu overi i emaily kde prilohy uz maji file_hash / missing / reference")
    parser.add_argument(
        "--no-indexes", action="store_true",
        help="Nevytvorit indexy na attachments_index kolekci")
    args = parser.parse_args()

    run_started = datetime.now()
    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
    print(f"Start: {run_started.strftime('%Y-%m-%d %H:%M:%S')}")

    # Fail fast when either backend is unreachable.
    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        print(" MongoDB OK")
    except Exception as e:
        print(f" CHYBA: MongoDB neni dostupna -- {e}")
        sys.exit(1)

    col_index = client[MONGO_DB][MONGO_COL_INDEX]
    if not args.no_indexes:
        for indexed_field in ("filename", "mime_type", "mailbox"):
            col_index.create_index(indexed_field)

    db = client[MONGO_DB]
    if not args.mailbox:
        mailboxes = discover_mailboxes(db)
        print(f" Schranky ke zpracovani: {len(mailboxes)}")
        for name in mailboxes:
            print(f" {name}")
    else:
        if args.mailbox in SKIP_MAILBOXES:
            print(f" CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.")
            sys.exit(2)
        mailboxes = [args.mailbox]

    results = []
    for box in mailboxes:
        try:
            outcome = process_mailbox(client, box, args)
        except Exception as e:
            logging.error("process_mailbox %s: %s", box, e)
            print(f" FATAL pri zpracovani {box}: {e}")
            outcome = {"mailbox": box, "ok": 0, "new": 0, "dup": 0,
                       "skip": 0, "miss": 0, "ref": 0, "err": 1, "elapsed": 0.0}
        results.append(outcome)

    elapsed_total = (datetime.now() - run_started).total_seconds()
    files_total = col_index.count_documents({})
    size_total = 0
    for record in col_index.find({}, {"size_bytes": 1}):
        size_total += record.get("size_bytes", 0)

    grand = {}
    for key in ("ok", "new", "dup", "skip", "miss", "ref", "err"):
        grand[key] = sum(r.get(key, 0) for r in results)

    print(f"\n{'='*60}")
    print("=== SHRNUTI ===")
    for r in results:
        print(f" {r['mailbox']:40} ok={r['ok']:>5} nove={r['new']:>4} "
              f"dup={r['dup']:>4} skip={r['skip']:>3} miss={r.get('miss',0):>3} "
              f"ref={r.get('ref',0):>3} err={r['err']:>3}")
    print(f" {'TOTAL':40} ok={grand['ok']:>5} nove={grand['new']:>4} "
          f"dup={grand['dup']:>4} skip={grand['skip']:>3} miss={grand['miss']:>3} "
          f"ref={grand['ref']:>3} err={grand['err']:>3}")
    print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)")

    hours, remainder = divmod(elapsed_total, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f"Celkovy cas: {int(hours)}h {int(minutes)}m {int(seconds)}s")
    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if grand['err']:
        print(f"Chyby logovany do: {LOG_FILE}")
    client.close()
    return int(grand['err'] > 0)
# Script entry point: propagate main()'s result as the process exit status.
if __name__ == "__main__":
    exit_status = main() or 0
    sys.exit(exit_status)
+63
View File
@@ -0,0 +1,63 @@
# 4_unwrap_smime_v1.0.py
**Krok 4 pipeline** — rozbalení S/MIME wrapper zpráv. Některé emaily (Datová schránka, mBank, ComGate, PayU, PostSignum …) mají viditelné tělo jen *"This is an S/MIME signed message"* — skutečný obsah je zabalený uvnitř přílohy `smime.p7m`.
Skript najde tyto emaily, stáhne binárku `smime.p7m` z Graphu, rozbalí PKCS7 SignedData (`asn1crypto.cms`), extrahuje vnitřní MIME zprávu a doplní do Mongo:
| Pole | Obsah |
|---|---|
| `smime_unwrapped: True` | flag — už rozbaleno |
| `smime_subject` | Subject z vnitřní MIME hlavičky |
| `smime_body_text` | plain text vnitřního těla |
| `smime_body_html` | HTML vnitřního těla (pokud je) |
| `smime_inner_attachments[]` | `{filename, content_type, size_bytes}` vnitřních příloh |
## POZOR: `smime.p7m` vs `smime.p7s`
| Příloha | Co to je | Skript dělá |
|---|---|---|
| `smime.p7m` | **Opaque signed wrapper** (PKCS7 SignedData, nejde o šifrovanou EnvelopedData) kolem celé MIME zprávy | **Rozbalí** |
| `smime.p7s` | **Detached signature** vedle čistého emailu (tělo je normálně dostupné) | **Ignoruje** — není co rozbalovat |
Filtr ve skriptu (`SMIME_FILTER`) je proto explicitně `^smime\.p7m$`. Email s přílohou `smime.p7s` a `smime_unwrapped != True` je **správný stav**.
## Argumenty
| Argument | Povinný | Hodnoty | Default | Popis |
|---|---|---|---|---|
| `--mailbox` | ne | e-mail | (všechny) | Jedna konkrétní schránka. Bez argumentu projede všechny kolekce v `emaily` mimo `SKIP_COLLECTIONS` (`attachments_index`, `sync_state`) a `SKIP_MAILBOXES`. |
| `--limit N` | ne | int | (bez limitu) | Limit emailů na schránku (test) |
## SKIP_MAILBOXES (hardcoded)
| Schránka | Důvod |
|---|---|
| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Při běhu bez `--mailbox` se tiše přeskočí. S explicitním `--mailbox vbuzalka@its.jnj.com` skript skončí exit kódem 2. |
## Varianty volání
```bash
# Všechny schránky (mimo SKIP_MAILBOXES):
docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py
# Jedna schránka:
docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py --mailbox ordinace@buzalkova.cz
# Test 10 emailů:
docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py --mailbox ordinace@buzalkova.cz --limit 10
# Plný běh na pozadí, log do souboru:
docker exec -d python-runner bash -c "python /scripts/4_unwrap_smime_v1.0.py > /scripts/unwrap_smime.log 2>&1"
```
## Závislosti
```bash
docker exec python-runner pip install asn1crypto
```
## Sledování průběhu
```bash
docker exec -it python-runner tail -f /scripts/unwrap_smime.log
```
+445
View File
@@ -0,0 +1,445 @@
"""
==============================================================================
Skript: unwrap_smime_v1.0.py
Verze: 1.0
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis:
Najde v Mongo emaily s prilohou smime.p7m (S/MIME signed-data),
stahne binarni obsah prilohy z Microsoft Graph API, rozbali PKCS7
SignedData (CMS), extrahuje vnitrni MIME message, a ulozi do Mongo:
- smime_unwrapped: True
- smime_body_text : plain text vnitrniho tela
- smime_body_html : HTML vnitrniho tela (kdyz je)
- smime_subject : Subject vnitrni MIME hlavicky
- smime_inner_attachments : [{filename, content_type, size_bytes}]
Tato pole pak pouzije enrich_fulltext_emails_v1.2 a doplni jejich
obsah do PG fulltext indexu.
Typicke S/MIME odesilatele:
notifikace@mojedatovaschranka.cz (844 emailu)
kontakt@mbank.cz (226)
payments@comgate.cz, service@payu.com (~250)
info.postsignum@cpost.cz
Architektonicka poznamka:
S/MIME priloha smime.p7m ma Content-Type application/pkcs7-mime
s parametrem smime-type=signed-data. Vnitrni obsah je v PKCS7
ContentInfo -> SignedData -> encapContentInfo.eContent. To uz je
primo MIME zprava (multipart nebo single body).
Zavislosti (instalovat v kontejneru):
pip install asn1crypto
Spusteni:
python unwrap_smime_v1.0.py # vsechny schranky (mimo SKIP_MAILBOXES)
python unwrap_smime_v1.0.py --mailbox vladimir.buzalka@buzalka.cz
python unwrap_smime_v1.0.py --limit 10 # test
SKIP_MAILBOXES (hardcoded):
vbuzalka@its.jnj.com — JNJ tenant, nemame Graph API pristup. Pri behu
bez --mailbox se tise preskoci, s --mailbox skript
skonci s exit kodem 2.
==============================================================================
"""
from __future__ import annotations
import argparse
import email
import email.policy
import logging
import sys
import time
import traceback
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import msal
import requests
from asn1crypto import cms
from pymongo import MongoClient, UpdateOne
import os

if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
# --- configuration ----------------------------------------------------------
# SECURITY: the Graph credentials below are committed in source control.
# They can be supplied through the environment (GRAPH_TENANT_ID,
# GRAPH_CLIENT_ID, GRAPH_CLIENT_SECRET); the literals remain only as a
# fallback so existing deployments keep running. The client secret should be
# rotated and the fallback removed once the environment is configured.
GRAPH_TENANT_ID = os.environ.get("GRAPH_TENANT_ID", "7d269944-37a4-43a1-8140-c7517dc426e9")
GRAPH_CLIENT_ID = os.environ.get("GRAPH_CLIENT_ID", "4b222bfd-78c9-4239-a53f-43006b3ed07f")
GRAPH_CLIENT_SECRET = os.environ.get("GRAPH_CLIENT_SECRET", "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk")
GRAPH_URL = "https://graph.microsoft.com/v1.0"
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
# Collections in MONGO_DB that are not mailboxes.
SKIP_COLLECTIONS = {"attachments_index", "sync_state"}
# Mailboxes without Graph API access; skipped when all mailboxes are processed.
SKIP_MAILBOXES = {
    "vbuzalka@its.jnj.com",  # JNJ tenant: no Graph credentials
}
MAX_BODY_BYTES = 2 * 1024 * 1024  # 2 MB cap for the extracted text
BATCH_SIZE = 25
LOG_FILE = Path(__file__).parent / "unwrap_smime_errors.log"
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)
# --- Graph auth -------------------------------------------------------------
# Most recently acquired Graph access token; shared with graph_get_raw().
_token: Optional[str] = None


def get_token() -> str:
    """Acquire a Graph access token with the client-credentials flow.

    The token is stored in the module-level ``_token`` and also returned.

    Raises:
        RuntimeError: when the MSAL response contains no ``access_token``.
    """
    global _token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    msal_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    response = msal_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )
    if "access_token" in response:
        _token = response["access_token"]
        return _token
    raise RuntimeError(f"Graph auth failed: {response}")
def graph_get_raw(url: str) -> Optional[bytes]:
    """GET a Graph endpoint and return the raw response body.

    Used for the attachment ``$value`` endpoint. Up to three attempts are
    made: a 401 refreshes the token, a 429 waits for ``Retry-After`` seconds,
    and a ``requests`` error (including HTTP error statuses raised by
    ``raise_for_status``) is retried after 2 seconds.

    Returns:
        The response bytes, or None only when the server answered 404.

    Raises:
        requests.RequestException: when the last attempt failed with a
            transport or HTTP error.
        RuntimeError: when every attempt ended in 401 or 429. Returning None
            here would make the caller record the attachment as missing (404).
    """
    global _token
    if not _token:
        get_token()
    max_attempts = 3
    last_status = None
    for attempt in range(max_attempts):
        try:
            r = requests.get(url, headers={"Authorization": f"Bearer {_token}"}, timeout=60)
            last_status = r.status_code
            if r.status_code == 401:
                get_token()
                continue
            if r.status_code == 404:
                return None
            if r.status_code == 429:
                # Retry-After may also be an HTTP date; fall back to 5 s then.
                try:
                    wait = int(r.headers.get("Retry-After", "5"))
                except ValueError:
                    wait = 5
                time.sleep(wait)
                continue
            r.raise_for_status()
            return r.content
        except requests.RequestException:
            if attempt == max_attempts - 1:
                raise
            time.sleep(2)
    raise RuntimeError(
        f"Graph request gave up after {max_attempts} attempts "
        f"(last status {last_status}): {url}"
    )
# --- PKCS7 / MIME unwrap ----------------------------------------------------
def extract_inner_mime(content_bytes: bytes) -> bytes:
    """Extract the inner (signed) MIME message from an S/MIME attachment.

    Two formats are handled:
      A) ``multipart/signed`` (detached signature): the data starts with a
         ``Content-Type: multipart/signed`` header; the first part that is
         not a PKCS7 signature is the signed payload.
      B) opaque ``application/pkcs7-mime`` (signed-data): the inner MIME is
         stored in PKCS7 ``SignedData -> encapContentInfo.eContent``. The
         input is tried as DER first, then as base64/PEM text.

    Returns:
        Raw MIME bytes suitable for ``email.message_from_bytes``.

    Raises:
        RuntimeError: when the data cannot be parsed, is not signed-data, or
            carries no encapsulated content.
    """
    head = content_bytes[:300].lower()

    # A) multipart/signed (detached)
    if b"content-type:" in head and b"multipart/signed" in head:
        try:
            outer = email.message_from_bytes(content_bytes, policy=email.policy.default)
        except Exception as e:
            raise RuntimeError(f"MIME parse failed: {e}") from e
        signed_payload = None
        if outer.is_multipart():
            for part in outer.iter_parts():
                ct = (part.get_content_type() or "").lower()
                if "pkcs7-signature" in ct or "x-pkcs7-signature" in ct:
                    continue
                signed_payload = part
                break
        if signed_payload is None:
            raise RuntimeError("multipart/signed: no signed payload found")
        return signed_payload.as_bytes()

    # B) opaque PKCS7 SignedData: DER, or base64 with optional PEM armor
    data = content_bytes
    try:
        ci = cms.ContentInfo.load(data)
    except Exception:
        try:
            import base64
            stripped = b"".join(line for line in data.splitlines()
                                if not line.startswith(b"-----"))
            data = base64.b64decode(stripped, validate=False)
            ci = cms.ContentInfo.load(data)
        except Exception as e:
            raise RuntimeError(f"PKCS7/MIME parse failed: {e}") from e
    if ci["content_type"].native != "signed_data":
        raise RuntimeError(f"Not signed-data, got {ci['content_type'].native}")
    sd = ci["content"]
    inner = sd["encap_content_info"]["content"]
    # asn1crypto represents an absent OPTIONAL field by a Void object whose
    # .native is None, so the unwrapped value must be checked, not the
    # wrapper; otherwise bytes(None) raises TypeError.
    payload = getattr(inner, "native", inner)
    if payload is None:
        raise RuntimeError("encapContentInfo.content is null (detached without MIME wrapper)")
    return bytes(payload)
def _decode_text_part(part) -> Optional[str]:
    """Return the text of a leaf text/* part, or None when it cannot be read.

    ``get_content()`` is tried first. If it fails (for example because the
    declared charset is unknown to Python), the transfer-decoded payload is
    decoded with the declared charset and finally with UTF-8, so a wrong
    charset label does not drop the whole body.
    """
    try:
        return part.get_content()
    except Exception:
        pass  # fall back to manual decoding below
    try:
        raw = part.get_payload(decode=True)
    except Exception:
        return None
    if not isinstance(raw, bytes):
        return None
    charset = part.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        return raw.decode("utf-8", errors="replace")


def parse_inner_mime(mime_bytes: bytes, max_body_len: Optional[int] = None) -> dict:
    """Parse MIME bytes into headers, text body, HTML body and attachment list.

    Multipart containers are walked recursively. A leaf part with an
    ``attachment`` disposition or a filename is listed as an attachment
    (name, content type, size); other ``text/plain`` / ``text/html`` leaves
    are collected into the bodies.

    Args:
        mime_bytes: raw MIME message.
        max_body_len: maximum length, in characters, kept for each of the two
            bodies; defaults to the module constant ``MAX_BODY_BYTES``.

    Returns:
        Dict with ``subject``, ``from``, ``to``, ``date``, ``body_text``,
        ``body_html`` (None when empty) and ``inner_attachments``.
    """
    limit = MAX_BODY_BYTES if max_body_len is None else max_body_len
    msg = email.message_from_bytes(mime_bytes, policy=email.policy.default)
    text_parts: list[str] = []
    html_parts: list[str] = []
    inner_attachments: list[dict] = []

    def walk(part):
        ctype = part.get_content_type()
        disp = (part.get_content_disposition() or "").lower()
        filename = part.get_filename()
        if part.is_multipart():
            for sub in part.iter_parts():
                walk(sub)
            return
        if disp == "attachment" or filename:
            try:
                payload = part.get_content()
                if isinstance(payload, str):
                    payload_bytes = payload.encode("utf-8", errors="replace")
                elif isinstance(payload, bytes):
                    payload_bytes = payload
                else:
                    payload_bytes = b""
                size = len(payload_bytes)
            except Exception:
                # The size is informational; an undecodable part is still listed.
                size = 0
            inner_attachments.append({
                "filename": filename or "(unnamed)",
                "content_type": ctype,
                "size_bytes": size,
            })
            return
        if ctype == "text/plain":
            target = text_parts
        elif ctype == "text/html":
            target = html_parts
        else:
            return
        decoded = _decode_text_part(part)
        if decoded is not None:
            target.append(decoded)

    walk(msg)
    body_text = "\n\n".join(t.strip() for t in text_parts if t and t.strip())
    body_html = "\n".join(h for h in html_parts if h and h.strip())
    if len(body_text) > limit:
        body_text = body_text[:limit]
    if len(body_html) > limit:
        body_html = body_html[:limit]
    return {
        "subject": str(msg.get("Subject") or "").strip(),
        "from": str(msg.get("From") or "").strip(),
        "to": str(msg.get("To") or "").strip(),
        "date": str(msg.get("Date") or "").strip(),
        "body_text": body_text or None,
        "body_html": body_html or None,
        "inner_attachments": inner_attachments,
    }
# --- hlavni smycka ----------------------------------------------------------
# Emails that carry an attachment named exactly "smime.p7m" (case-insensitive)
# and have not been marked as successfully unwrapped.
SMIME_FILTER = {
    "$and": [
        {"attachments.filename": {"$regex": "^smime\\.p7m$", "$options": "i"}},
        {"smime_unwrapped": {"$ne": True}},
    ]
}


def find_p7m_graph_att_id(doc: dict) -> Optional[str]:
    """Return the Graph attachment id of the email's ``smime.p7m`` attachment.

    The file name is compared case-insensitively. Entries without a
    ``graph_att_id`` are passed over, so a later ``smime.p7m`` entry that does
    have an id is still found.

    Returns:
        The id, or None when no ``smime.p7m`` attachment with an id exists.
    """
    for att in doc.get("attachments") or []:
        if (att.get("filename") or "").lower() != "smime.p7m":
            continue
        att_id = att.get("graph_att_id")
        if att_id:
            return att_id
    return None
def process_mailbox(col, mailbox: str, limit: Optional[int]) -> dict:
    """Unwrap every pending S/MIME email of one mailbox collection.

    For each document matching ``SMIME_FILTER`` the ``smime.p7m`` attachment
    is fetched from Graph, the inner MIME is extracted and parsed, and the
    result is queued as an update. A failed fetch, a 404 and a failed unwrap
    each queue an update with ``smime_unwrapped: False`` and a ``smime_error``
    text. Queued updates are written in bulk.

    Args:
        col: MongoDB collection of the mailbox.
        mailbox: mailbox address used in the Graph URL and in log lines.
        limit: optional cap on the number of documents read.

    Returns:
        Dict with ``mailbox, candidates, unwrapped, errors, no_att_id,
        missing, with_inner_att, inner_att_total``.
    """
    total = col.count_documents(SMIME_FILTER)
    limit_note = f" (limit {limit})" if limit else ""
    print(f"[{mailbox}] S/MIME k rozbaleni: {total}" + limit_note)
    if total == 0:
        return {"mailbox": mailbox, "candidates": 0, "unwrapped": 0,
                "errors": 0, "no_att_id": 0, "missing": 0,
                "with_inner_att": 0, "inner_att_total": 0}

    def utc_now_naive():
        # Timestamps are stored as naive datetimes carrying UTC wall time.
        return datetime.now(timezone.utc).replace(tzinfo=None)

    def failure_update(doc_id, message: str) -> UpdateOne:
        return UpdateOne({"_id": doc_id}, {"$set": {
            "smime_unwrapped": False,
            "smime_error": message,
            "smime_processed_at": utc_now_naive(),
        }})

    cursor = col.find(SMIME_FILTER, {"_id": 1, "graph_id": 1, "attachments": 1},
                      no_cursor_timeout=True)
    if limit:
        cursor = cursor.limit(limit)

    seen = 0
    unwrapped = 0
    errors = 0
    no_att_id = 0
    missing = 0
    with_inner = 0
    inner_total = 0
    pending: list[UpdateOne] = []
    try:
        for doc in cursor:
            seen += 1
            doc_id = doc["_id"]
            graph_id = doc.get("graph_id")
            att_id = find_p7m_graph_att_id(doc)
            if not (graph_id and att_id):
                no_att_id += 1
                continue

            url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_id}/attachments/{att_id}/$value"
            try:
                p7m_bytes = graph_get_raw(url)
            except Exception as exc:
                errors += 1
                logging.error("[%s] graph fetch %s: %s", mailbox, graph_id, exc)
                pending.append(failure_update(
                    doc_id, f"fetch: {type(exc).__name__}: {exc}"[:300]))
                continue

            if p7m_bytes is None:
                missing += 1
                pending.append(failure_update(doc_id, "attachment_404"))
                continue

            try:
                parsed = parse_inner_mime(extract_inner_mime(p7m_bytes))
            except Exception as exc:
                errors += 1
                logging.error("[%s] unwrap %s: %s", mailbox, doc_id, exc)
                pending.append(failure_update(
                    doc_id, f"unwrap: {type(exc).__name__}: {exc}"[:300]))
                continue

            inner_atts = parsed["inner_attachments"]
            inner_total += len(inner_atts)
            if inner_atts:
                with_inner += 1
            pending.append(UpdateOne({"_id": doc_id}, {"$set": {
                "smime_unwrapped": True,
                "smime_processed_at": utc_now_naive(),
                "smime_body_text": parsed["body_text"],
                "smime_body_html": parsed["body_html"],
                "smime_subject": parsed["subject"],
                "smime_from": parsed["from"],
                "smime_to": parsed["to"],
                "smime_date": parsed["date"],
                "smime_inner_attachments": inner_atts,
                "smime_error": None,
            }}))
            unwrapped += 1

            if len(pending) >= BATCH_SIZE:
                col.bulk_write(pending, ordered=False)
                pending.clear()
            if seen == 1 or seen % 50 == 0:
                print(f" [{seen:>5}/{total}] unwrapped={unwrapped} err={errors} "
                      f"no_att_id={no_att_id} missing={missing} "
                      f"inner_atts_total={inner_total}", flush=True)
    finally:
        cursor.close()

    if pending:
        col.bulk_write(pending, ordered=False)
    print(f" [{seen}/{total}] DONE unwrapped={unwrapped} err={errors} "
          f"no_att_id={no_att_id} missing={missing} "
          f"with_inner_atts={with_inner} inner_atts_total={inner_total}")
    return {"mailbox": mailbox, "candidates": total, "unwrapped": unwrapped,
            "errors": errors, "no_att_id": no_att_id, "missing": missing,
            "with_inner_att": with_inner, "inner_att_total": inner_total}
def main() -> int:
    """Command-line entry point: unwrap S/MIME emails in one or all mailboxes.

    Connects to MongoDB and Graph, selects the mailboxes (a single
    ``--mailbox``, or every collection not in ``SKIP_COLLECTIONS`` /
    ``SKIP_MAILBOXES``, in sorted order) and processes them one by one. A
    mailbox that raises is logged, reported with one error and the run
    continues with the next mailbox. The MongoDB client is always closed.

    Returns:
        2 when ``--mailbox`` names a mailbox in ``SKIP_MAILBOXES``, 1 when any
        mailbox reported an error, otherwise 0.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)")
    ap.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
    args = ap.parse_args()
    t0 = time.time()
    print("Pripojuji se k MongoDB...")
    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        mongo.admin.command("ping")
        db = mongo[MONGO_DB]
        print("Token Graph API...")
        get_token()
        print("OK\n")

        if args.mailbox:
            if args.mailbox in SKIP_MAILBOXES:
                print(f"CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.")
                return 2
            mailboxes = [args.mailbox]
        else:
            mailboxes = []
            # Sorted so the processing order is the same on every run.
            for c in sorted(db.list_collection_names()):
                if c in SKIP_COLLECTIONS:
                    continue
                if c in SKIP_MAILBOXES:
                    print(f" [skip] {c} — v SKIP_MAILBOXES (neni Graph pristup)")
                    continue
                mailboxes.append(c)
        print(f"Schranky ({len(mailboxes)}): {mailboxes}\n")

        results = []
        for mb in mailboxes:
            try:
                results.append(process_mailbox(db[mb], mb, limit=args.limit))
            except Exception as e:
                # One broken mailbox must not stop the remaining ones.
                logging.error("[%s] process_mailbox failed: %s", mb, e)
                print(f" FATAL pri zpracovani {mb}: {e}")
                results.append({"mailbox": mb, "candidates": 0, "unwrapped": 0,
                                "errors": 1, "no_att_id": 0, "missing": 0,
                                "with_inner_att": 0, "inner_att_total": 0})
            print()

        print("=== SHRNUTI ===")
        for r in results:
            print(f" {r['mailbox']}: candidates={r['candidates']} unwrapped={r['unwrapped']} "
                  f"errors={r['errors']} no_att_id={r['no_att_id']} missing={r['missing']} "
                  f"with_inner_atts={r['with_inner_att']} inner_atts_total={r['inner_att_total']}")
        print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
        total_errors = sum(r.get("errors", 0) for r in results)
        return 1 if total_errors > 0 else 0
    finally:
        mongo.close()
# Script entry point. Every abnormal end must yield a non-zero exit status so
# a calling wrapper does not record an interrupted or crashed run as success.
if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
        sys.exit(130)  # conventional status for termination by Ctrl+C
    except Exception:
        traceback.print_exc()
        sys.exit(1)
@@ -0,0 +1,47 @@
# 5_enrich_fulltext_emails_v1.2.py
**Krok 5 pipeline** — vytáhne plný text z emailů v MongoDB (db: `emaily`) a uloží do PostgreSQL (db: `MongoEmaily`, tabulka: `emails`) s GIN `tsvector` indexem (config `soubory` — simple + unaccent).
Emaily se **nestahují znovu** — tělo už je v Mongo z kroků 1/2/4. Tento skript jen vybere první dostupné tělo podle priority a pošle text do PG na fulltext.
## Priorita zdroje těla (`body_source`)
1. **`smime`** — `smime_body_text` / `smime_body_html` (pokud unwrap proběhl)
2. **`html`** — `body_html`
3. **`text`** — `body_text` (z parse v1.4 nebo refetch v1.0)
4. **`preview`** — `body_preview` (fallback)
Názvy vnitřních S/MIME příloh (`smime_inner_attachments`) jdou do `attachments_summary` — dohledatelné přes MCP `emaily.find_attachment`.
## Inkrementalita
Pokud `(mailbox, message_id)` v PG existuje, `extractor_version` je aktuální (1.2) a `modified_at` v Mongo není novější → **skip**. Při bumpu `EXTRACTOR_VERSION` se vše přeparsuje.
## Argumenty
| Argument | Povinný | Hodnoty | Default | Popis |
|---|---|---|---|---|
| `--mailbox` | ne | e-mail | (všechny) | Jedna konkrétní schránka |
| `--limit N` | ne | int | (bez limitu) | Limit emailů na schránku (test) |
## Varianty volání
```bash
# Všechny schránky:
docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py
# Jedna schránka:
docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py --mailbox ordinace@buzalkova.cz
# Test 500 emailů:
docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py --mailbox ordinace@buzalkova.cz --limit 500
# Plný běh na pozadí, log do souboru:
docker exec -d python-runner bash -c "python /scripts/5_enrich_fulltext_emails_v1.2.py > /scripts/enrich.log 2>&1"
```
## Sledování průběhu
```bash
docker exec -it python-runner tail -f /scripts/enrich.log
```
@@ -0,0 +1,489 @@
"""
==============================================================================
Skript: enrich_fulltext_emails_v1.2.py
Verze: 1.2
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis:
Vytahne plny text z emailu ulozenych v MongoDB (db: emaily) a ulozi ho do
PostgreSQL (db: MongoEmaily, tabulka: emails) s GIN tsvector indexem.
Emaily se NESTAHUJI znovu - tela uz jsou v Mongo z parse_emails_graph_v1.4
(a refetch_text_bodies_v1.0 pro stare plain-text emaily).
Tento skript jen vybere prvni dostupne telo a posle text do PG na fulltext.
Zmeny proti v1.1:
- S/MIME emaily (signed-data od Datove schranky, mBank, ComGate, PayU, ...):
pokud unwrap_smime_v1.0 ulozil smime_body_text/smime_body_html, pouzije se
PREFEROVANE pred bezvyznamnym vnejsim wrapper telem ("This is an S/MIME
signed message"). Nazvy vnitrnich priloh (smime_inner_attachments) se
pridavaji do attachments_summary, tj. dohledatelne pres find_attachment.
- body_source: nova hodnota "smime" (rozbalene vnitrni telo).
- EXTRACTOR_VERSION=1.2 -> vsechny existujici emaily v PG se preparsuji.
Zmeny v1.1 vs v1.0:
- Fallback poradi rozsireno o body_text (novy v parse_emails_graph_v1.4).
- body_source umi novou hodnotu "text" (plne plain-text telo, max 2 MB).
Zdroj:
MongoDB 192.168.1.76 db=emaily kolekce=<mailbox>
(krome attachments_index)
Cil:
PostgreSQL 192.168.1.76 db=MongoEmaily tabulka=emails
tsvector config 'soubory' (sdileny - simple + unaccent)
Inkrementalita:
Pokud (mailbox, message_id) jiz existuje a extractor_version je aktualni
a modified_at v Mongo neni novejsi -> skip. Pri zmene verze extractoru
se vse preparsuje.
Spusteni:
python enrich_fulltext_emails_v1.2.py # vsechny schranky
python enrich_fulltext_emails_v1.2.py --mailbox vbuzalka@its.jnj.com
python enrich_fulltext_emails_v1.2.py --limit 500 # test
==============================================================================
"""
from __future__ import annotations
import argparse
import re
import sys
import time
import traceback
from datetime import datetime, timezone
from typing import Optional
import psycopg
from bs4 import BeautifulSoup
from pymongo import MongoClient
# --- configuration ----------------------------------------------------------
# Source MongoDB server and the database that is scanned for mailbox collections.
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
# Target PostgreSQL connection string (libpq keyword/value format).
# NOTE(review): the credentials are hard-coded in source control - consider
# loading them from the environment or a secrets file and rotating the password.
PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoEmaily "
"user=vladimir.buzalka password=Vlado7309208104++")
# Stored with every row; rows written under a different version are re-extracted.
EXTRACTOR_VERSION = "1.2"
MAX_TEXT_BYTES = 5 * 1024 * 1024 # cap for the stored plain text: 5 MB of UTF-8
# Collections in MONGO_DB that are not mailboxes and are never processed.
SKIP_COLLECTIONS = {"attachments_index"}
# Number of rows buffered before they are upserted into PostgreSQL.
BATCH_SIZE = 100
# --- SCHEMA -----------------------------------------------------------------
# Idempotent DDL executed by main() at startup. It creates the unaccent and
# pg_trgm extensions, the 'soubory' text search configuration (a copy of
# 'simple' with unaccent applied to word tokens), the emails table with a
# generated tsvector column over the first 800000 characters of the
# concatenated searchable fields, and the supporting indexes.
# NOTE: the inline SQL comment on body_source is stale - process_mailbox also
# writes the values 'text' and 'smime'.
SCHEMA_SQL = """
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
ALTER TEXT SEARCH CONFIGURATION soubory
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, simple;
END IF;
END$$;
CREATE TABLE IF NOT EXISTS emails (
id BIGSERIAL PRIMARY KEY,
mailbox TEXT NOT NULL,
message_id TEXT NOT NULL,
graph_id TEXT,
conversation_id TEXT,
folder_path TEXT,
subject TEXT,
sender_email TEXT,
sender_name TEXT,
to_addrs TEXT,
cc_addrs TEXT,
bcc_addrs TEXT,
sent_at TIMESTAMPTZ,
received_at TIMESTAMPTZ,
modified_at TIMESTAMPTZ,
is_read BOOLEAN,
is_draft BOOLEAN,
has_attachments BOOLEAN,
attachment_count INT,
attachments_summary TEXT,
body TEXT,
body_length INT,
body_source TEXT, -- 'html' | 'preview' | 'empty'
tsv tsvector GENERATED ALWAYS AS (
to_tsvector('soubory'::regconfig,
left(
coalesce(subject, '') || ' ' ||
coalesce(sender_email, '') || ' ' ||
coalesce(sender_name, '') || ' ' ||
coalesce(to_addrs, '') || ' ' ||
coalesce(cc_addrs, '') || ' ' ||
coalesce(attachments_summary, '') || ' ' ||
coalesce(body, ''),
800000)
)
) STORED,
extracted_at TIMESTAMPTZ DEFAULT now(),
extractor_version TEXT,
ok BOOLEAN,
error TEXT,
UNIQUE (mailbox, message_id)
);
CREATE INDEX IF NOT EXISTS emails_tsv_gin ON emails USING gin(tsv);
CREATE INDEX IF NOT EXISTS emails_subject_trgm ON emails USING gin(subject gin_trgm_ops);
CREATE INDEX IF NOT EXISTS emails_sender_email_idx ON emails(sender_email);
CREATE INDEX IF NOT EXISTS emails_mailbox_idx ON emails(mailbox);
CREATE INDEX IF NOT EXISTS emails_received_idx ON emails(received_at DESC);
CREATE INDEX IF NOT EXISTS emails_conv_idx ON emails(conversation_id);
"""
# --- HELPERY ----------------------------------------------------------------
_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
_WS_RX = re.compile(r"[ \t]+")
_NL_RX = re.compile(r"\n{3,}")
def _clean_for_pg(s: str) -> str:
if not s:
return ""
return _CTRL_RX.sub("", s)
def _truncate(s: str) -> str:
    """Clean *s* and cap it at MAX_TEXT_BYTES bytes of UTF-8.

    The limit is applied to the encoded bytes; a multi-byte character that is
    cut by the limit is dropped instead of being emitted partially.
    """
    cleaned = _clean_for_pg(s or "")
    if not cleaned:
        return ""
    encoded = cleaned.encode("utf-8", errors="replace")
    if len(encoded) > MAX_TEXT_BYTES:
        return encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
    return cleaned
def html_to_text(html: str) -> str:
    """Convert an HTML e-mail body to plain text.

    <script>, <style> and <head> elements are removed, runs of spaces/tabs are
    collapsed to one space and empty lines are dropped.
    """
    if not html:
        return ""
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # Fall back to the built-in parser when the lxml one cannot be used.
        soup = BeautifulSoup(html, "html.parser")
    for element in soup(["script", "style", "head"]):
        element.decompose()
    raw = soup.get_text(separator="\n")
    # Normalise whitespace line by line and keep only non-empty lines.
    kept = []
    for piece in raw.split("\n"):
        normalized = _WS_RX.sub(" ", piece).strip()
        if normalized:
            kept.append(normalized)
    return _NL_RX.sub("\n\n", "\n".join(kept))
def fmt_recipients(recipients: list, kind: str) -> str:
    """Format the recipients of one *kind* as 'Name <email>; Name2 <email2>'.

    Used for the to_addrs/cc_addrs/bcc_addrs columns. Entries that are not
    dicts, whose "type" differs from *kind*, or that carry neither a name nor
    an e-mail address are skipped.
    """
    formatted = []
    for entry in recipients or []:
        if not isinstance(entry, dict) or entry.get("type") != kind:
            continue
        display = (entry.get("name") or "").strip()
        address = (entry.get("email") or "").strip()
        if display and address:
            formatted.append(f"{display} <{address}>")
        elif address or display:
            formatted.append(address or display)
    return "; ".join(formatted)
def fmt_attachments(attachments: list) -> str:
    """Join the names of the first 20 attachment entries with ' | '.

    Each entry's "name" is used, falling back to "filename"; non-dict entries
    and entries without a name are skipped.
    """
    if not attachments:
        return ""
    names = (
        item.get("name") or item.get("filename") or ""
        for item in attachments[:20]
        if isinstance(item, dict)
    )
    return " | ".join(label for label in names if label)
def _short(s, n=60):
if not s:
return ""
s = str(s).replace("\n", " ").strip()
return s if len(s) <= n else s[:n] + "..."
def _now() -> datetime:
return datetime.now(tz=timezone.utc)
# --- HLAVNI SMYCKA ----------------------------------------------------------
def process_mailbox(pg: psycopg.Connection, mongo_coll, mailbox: str,
                    limit: Optional[int] = None) -> dict:
    """Extract the text of every e-mail in one Mongo collection into PG.

    A document is skipped when its PG row was written by the current
    EXTRACTOR_VERSION with ok = true and the Mongo modified_at is missing or
    not newer than the stored one. Every other document is turned into a row
    (body taken from the unwrapped S/MIME content, then body_html, body_text,
    body_preview) and upserted in batches of BATCH_SIZE.

    Args:
        pg: open PostgreSQL connection; _flush commits after each batch.
        mongo_coll: Mongo collection holding the documents of the mailbox.
        mailbox: mailbox name written to the "mailbox" column.
        limit: optional cap on the number of documents read (test runs).

    Returns:
        Dict with the counters mailbox, processed, ok, errors, skipped and
        empty_body.
    """

    def as_utc(value):
        # The Mongo client is created without tz_aware, so its datetimes are
        # naive (UTC), while PG TIMESTAMPTZ values come back tz-aware.
        # Comparing the two directly raises TypeError, and a naive value
        # written to TIMESTAMPTZ is interpreted in the session time zone,
        # so naive datetimes are tagged as UTC here.
        if isinstance(value, datetime) and value.tzinfo is None:
            return value.replace(tzinfo=timezone.utc)
        return value

    # Existing PG rows for the incremental lookup:
    # message_id -> (extractor_version, modified_at, ok)
    with pg.cursor() as cur:
        cur.execute(
            "SELECT message_id, extractor_version, modified_at, ok "
            "FROM emails WHERE mailbox = %s",
            (mailbox,),
        )
        existing = {row[0]: (row[1], row[2], row[3]) for row in cur.fetchall()}
    proj = {
        "_id": 1, "graph_id": 1, "conversation_id": 1, "folder_path": 1,
        "subject": 1, "sender": 1, "recipients": 1,
        "sent_at": 1, "received_at": 1, "modified_at": 1,
        "is_read": 1, "is_draft": 1,
        "has_attachments": 1, "attachment_count": 1, "attachments": 1,
        "body_html": 1, "body_text": 1, "body_preview": 1,
        # S/MIME unwrapped fields (from unwrap_smime_v1.0)
        "smime_unwrapped": 1, "smime_body_text": 1, "smime_body_html": 1,
        "smime_subject": 1, "smime_inner_attachments": 1,
    }
    cursor = mongo_coll.find({}, proj, no_cursor_timeout=True)
    if limit:
        cursor = cursor.limit(limit)
    total_pending = limit or mongo_coll.estimated_document_count()
    print(f"[{mailbox}] kandidatu: ~{total_pending}")
    processed = ok = errors = skipped = empty_body = 0
    queue: list[dict] = []
    n = 0
    try:
        for doc in cursor:
            n += 1
            msg_id = doc.get("_id") or ""
            prev = existing.get(msg_id)
            mongo_mtime = as_utc(doc.get("modified_at"))
            if prev and prev[0] == EXTRACTOR_VERSION and prev[2]:
                pg_mtime = as_utc(prev[1])
                if mongo_mtime is None or (pg_mtime and pg_mtime >= mongo_mtime):
                    skipped += 1
                    continue
            sender = doc.get("sender") or {}
            recipients = doc.get("recipients") or []
            attachments = doc.get("attachments") or []
            # For S/MIME mails append the names of the real inner attachments
            # after the outer smime.p7m so that they become searchable.
            inner = doc.get("smime_inner_attachments") or []
            if inner:
                attachments = list(attachments) + [
                    {"filename": (a.get("filename") or "") + " [smime]"}
                    for a in inner
                    if isinstance(a, dict) and a.get("filename")
                ]
            row = {
                "mailbox": mailbox,
                "message_id": msg_id,
                "graph_id": doc.get("graph_id"),
                "conversation_id": doc.get("conversation_id"),
                "folder_path": doc.get("folder_path"),
                "subject": doc.get("subject") or "",
                "sender_email": sender.get("email"),
                "sender_name": sender.get("name"),
                "to_addrs": fmt_recipients(recipients, "to"),
                "cc_addrs": fmt_recipients(recipients, "cc"),
                "bcc_addrs": fmt_recipients(recipients, "bcc"),
                "sent_at": as_utc(doc.get("sent_at")),
                "received_at": as_utc(doc.get("received_at")),
                "modified_at": mongo_mtime,
                "is_read": doc.get("is_read"),
                "is_draft": doc.get("is_draft"),
                "has_attachments": doc.get("has_attachments"),
                "attachment_count": doc.get("attachment_count"),
                "attachments_summary": fmt_attachments(attachments),
                "body": None,
                "body_length": 0,
                "body_source": "empty",
                "extracted_at": _now(),
                "extractor_version": EXTRACTOR_VERSION,
                "ok": False,
                "error": None,
            }
            status = "OK "
            try:
                # Fallback order (v1.2): unwrapped S/MIME text/html ->
                # body_html -> body_text -> body_preview.
                text = ""
                if doc.get("smime_unwrapped"):
                    s_text = doc.get("smime_body_text") or ""
                    s_html = doc.get("smime_body_html") or ""
                    s_html_text = html_to_text(s_html) if s_html else ""
                    # Plain text first, followed by the text of the HTML part.
                    combined = "\n\n".join(p for p in (s_text, s_html_text) if p)
                    s_subject = doc.get("smime_subject") or ""
                    if s_subject:
                        combined = f"Subject: {s_subject}\n\n{combined}"
                    if combined:
                        text = combined
                        row["body_source"] = "smime"
                if not text:
                    html = doc.get("body_html") or ""
                    h_text = html_to_text(html) if html else ""
                    if h_text:
                        text = h_text
                        row["body_source"] = "html"
                if not text:
                    plain = doc.get("body_text") or ""
                    if plain:
                        text = plain
                        row["body_source"] = "text"
                if not text:
                    preview = doc.get("body_preview") or ""
                    if preview:
                        text = preview
                        row["body_source"] = "preview"
                if not text:
                    row["body_source"] = "empty"
                    empty_body += 1
                body = _truncate(text)
                row["body"] = body if body else None
                row["body_length"] = len(body)
                row["ok"] = True
                ok += 1
            except Exception as e:
                # Keep the row (ok = false) so the failure is visible in PG.
                row["error"] = f"{type(e).__name__}: {e}"[:500]
                status = "ERR"
                errors += 1
            queue.append(row)
            processed += 1
            if n % 200 == 0 or n == 1:
                subj = _short(row["subject"], 50)
                print(f" [{n:>5}] {status} {row['body_source']:<7} "
                      f"{row['body_length']:>7}ch | {subj}", flush=True)
            if len(queue) >= BATCH_SIZE:
                _flush(pg, queue)
                queue.clear()
    finally:
        # The cursor was opened with no_cursor_timeout, so close it explicitly.
        cursor.close()
    if queue:
        _flush(pg, queue)
    return {"mailbox": mailbox, "processed": processed, "ok": ok,
            "errors": errors, "skipped": skipped, "empty_body": empty_body}
# Parameterised upsert keyed on (mailbox, message_id): inserts a new row or, on
# conflict, overwrites every non-key column with the incoming values. The named
# placeholders correspond to the keys of the row dicts built in process_mailbox.
UPSERT_SQL = """
INSERT INTO emails
(mailbox, message_id, graph_id, conversation_id, folder_path,
subject, sender_email, sender_name, to_addrs, cc_addrs, bcc_addrs,
sent_at, received_at, modified_at, is_read, is_draft,
has_attachments, attachment_count, attachments_summary,
body, body_length, body_source,
extracted_at, extractor_version, ok, error)
VALUES
(%(mailbox)s, %(message_id)s, %(graph_id)s, %(conversation_id)s, %(folder_path)s,
%(subject)s, %(sender_email)s, %(sender_name)s, %(to_addrs)s, %(cc_addrs)s, %(bcc_addrs)s,
%(sent_at)s, %(received_at)s, %(modified_at)s, %(is_read)s, %(is_draft)s,
%(has_attachments)s, %(attachment_count)s, %(attachments_summary)s,
%(body)s, %(body_length)s, %(body_source)s,
%(extracted_at)s, %(extractor_version)s, %(ok)s, %(error)s)
ON CONFLICT (mailbox, message_id) DO UPDATE SET
graph_id = EXCLUDED.graph_id,
conversation_id = EXCLUDED.conversation_id,
folder_path = EXCLUDED.folder_path,
subject = EXCLUDED.subject,
sender_email = EXCLUDED.sender_email,
sender_name = EXCLUDED.sender_name,
to_addrs = EXCLUDED.to_addrs,
cc_addrs = EXCLUDED.cc_addrs,
bcc_addrs = EXCLUDED.bcc_addrs,
sent_at = EXCLUDED.sent_at,
received_at = EXCLUDED.received_at,
modified_at = EXCLUDED.modified_at,
is_read = EXCLUDED.is_read,
is_draft = EXCLUDED.is_draft,
has_attachments = EXCLUDED.has_attachments,
attachment_count = EXCLUDED.attachment_count,
attachments_summary = EXCLUDED.attachments_summary,
body = EXCLUDED.body,
body_length = EXCLUDED.body_length,
body_source = EXCLUDED.body_source,
extracted_at = EXCLUDED.extracted_at,
extractor_version = EXCLUDED.extractor_version,
ok = EXCLUDED.ok,
error = EXCLUDED.error
"""
def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
    """Upsert *rows* with UPSERT_SQL and commit.

    The text fields of each row are stripped of control characters in place
    before the batch is sent.
    """
    text_columns = (
        "subject", "sender_email", "sender_name", "to_addrs", "cc_addrs",
        "bcc_addrs", "attachments_summary", "body", "error", "folder_path",
    )
    for record in rows:
        for column in text_columns:
            value = record.get(column)
            if value:
                record[column] = _clean_for_pg(value)
    with pg.cursor() as cur:
        cur.executemany(UPSERT_SQL, rows)
    pg.commit()
def main() -> int:
    """Parse the CLI arguments, ensure the PG schema and process the mailboxes.

    Both database connections are closed even when processing fails.

    Returns:
        Process exit code; always 0, failures propagate as exceptions.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)")
    ap.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
    args = ap.parse_args()
    t0 = time.time()
    print("Pripojuji se k PostgreSQL...")
    # The MongoEmaily database must already exist (create it externally via
    # psql or DBeaver) because CREATE DATABASE cannot run inside a transaction.
    pg = psycopg.connect(PG_DSN, connect_timeout=10)
    mongo = None
    results = []
    try:
        with pg.cursor() as cur:
            cur.execute(SCHEMA_SQL)
        pg.commit()
        print("Schema OK.")
        print("Pripojuji se k MongoDB...")
        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        # Fail fast when the server is unreachable.
        mongo.admin.command("ping")
        db = mongo[MONGO_DB]
        if args.mailbox:
            mailboxes = [args.mailbox]
        else:
            mailboxes = [c for c in db.list_collection_names() if c not in SKIP_COLLECTIONS]
        print(f"Schranky ({len(mailboxes)}): {mailboxes}")
        for mb in mailboxes:
            results.append(process_mailbox(pg, db[mb], mb, limit=args.limit))
    finally:
        # Release both connections on success and on failure alike.
        if mongo is not None:
            mongo.close()
        pg.close()
    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['mailbox']}: processed={r['processed']} ok={r['ok']} "
              f"errors={r['errors']} skipped={r['skipped']} empty={r['empty_body']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    try:
        raise SystemExit(main())
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
        # 128 + SIGINT, so an interrupted run is not reported as a success.
        sys.exit(130)
    except Exception:
        traceback.print_exc()
        sys.exit(1)
@@ -0,0 +1,79 @@
# 5_enrich_fulltext_emails_v1.3.py
**Krok 5 pipeline** — vytáhne plain-text z emailů v Mongu (`emaily.<schránka>`) a uloží do PostgreSQL (`MongoEmaily.emails`) s tsvector GIN indexem nad konfigurací `soubory` (simple + unaccent).
## Co dělá
1. Vybere první dostupné tělo v tomto pořadí:
- `smime_body_text/html` (rozbaleno krokem 4)
- `body_html` → strip HTML přes BeautifulSoup
- `body_text` (legacy plain)
- `body_preview` (jako fallback)
2. Naplní řádek v PG `emails` (mailbox, subject, sender, recipients, body, attachments_summary, ...) + tsvector se vygeneruje sám.
3. Upsert (`ON CONFLICT (mailbox, message_id) DO UPDATE`).
## Inkrementální logika
Pokud `(mailbox, message_id)` už je v PG a:
- `extractor_version == EXTRACTOR_VERSION` (aktuálně `1.2`)
- `ok = true`
- `modified_at` v Mongo není novější než v PG
→ **skip**. Opakovaného spuštění se nemusíš bát — právě probíhající přepis ~73k záznamů (spuštěný Vladimírem) je způsoben tím, že `EXTRACTOR_VERSION` byl povýšen z 1.1 na 1.2, takže všechny řádky v PG jsou „zastaralé". Po doběhnutí bude další běh přeskakovat všechno kromě nově přibylých emailů.
## Změny v1.3 vs v1.2
- **Bugfix** `NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}` — předtím `sync_state` (přibyla s delta syncem) projížděla jako mailbox.
- **`--index-reset`** — před zpracováním schránky `DELETE FROM emails WHERE mailbox=%s`. Force re-extract bez nutnosti povyšovat verzi.
- **Vylepšený per-mailbox header** — ukáže `v Mongu N, v PG M (uptodate K), k zpracovani ~P`.
- Když `to_process_estimate == 0` → schránku přeskočí úplně (bez iterace cursorem).
## Argumenty
| Argument | Povinný | Hodnoty | Default | Popis |
|---|---|---|---|---|
| `--mailbox` | ne | e-mail | (všechny) | Bez argumentu projede všechny kolekce mimo `NON_MAILBOX_COLLECTIONS` |
| `--limit N` | ne | int | (bez limitu) | Per schránka, jen prvních N emailů (test) |
| `--index-reset` | ne | flag | false | Před zpracováním **smaže** všechny emaily dané schránky v PG. **Bez `--mailbox` smaže CELÝ index!** |
## Varianty volání
```bash
# Všechny schránky, inkrementální:
docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py
# Jedna schránka:
docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py --mailbox ordinace@buzalkova.cz
# Test 500 emailů:
docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py --mailbox ordinace@buzalkova.cz --limit 500
# Force reindex jedné schránky:
docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py --mailbox ordinace@buzalkova.cz --index-reset
# DANGEROUS: smaže celý index a postaví znovu (POMALÉ — typicky 30+ minut):
docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.3.py --index-reset
# Na pozadí, log do souboru:
docker exec -d python-runner bash -c "python /scripts/5_enrich_fulltext_emails_v1.3.py > /scripts/enrich_fulltext.log 2>&1"
```
## Sledování průběhu
```bash
docker exec -it python-runner tail -f /scripts/enrich_fulltext.log
```
V průběhu skript vypisuje každých 200 zpracovaných emailů:
```
[ 38800|p= 5800] OK html 2831ch | CLEAR/RA payment information for invoice #22FV049
```
- první číslo = pozice v cursoru (počet všech emailů co prošlo)
- `p=N` = počet skutečně zprocesovaných (zbytek byl skipnut jako už-aktuální)
- `OK / ERR`, `body_source`, délka, subject
## Závislosti
```bash
docker exec python-runner pip install psycopg[binary] beautifulsoup4 lxml pymongo
```
@@ -0,0 +1,567 @@
"""
==============================================================================
Skript: enrich_fulltext_emails_v1.3.py
Verze: 1.3
Datum: 2026-06-04
Autor: vladimir.buzalka
Popis:
Vytahne plny text z emailu ulozenych v MongoDB (db: emaily) a ulozi ho do
PostgreSQL (db: MongoEmaily, tabulka: emails) s GIN tsvector indexem.
Emaily se NESTAHUJI znovu - tela uz jsou v Mongo z parse_emails_graph_v1.4
(a refetch_text_bodies_v1.0 pro stare plain-text emaily).
Tento skript jen vybere prvni dostupne telo a posle text do PG na fulltext.
Zmeny v1.3 vs v1.2:
- Bugfix: NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
(sync_state pribyla v delta syncu, predtim ji v1.2 brala jako mailbox).
- --index-reset: pred zpracovanim schranky vymaze vsechny jeji emaily z PG
(force re-extract; pouzij kdyz povysis EXTRACTOR_VERSION nebo chces ciste).
- Vylepseny header per-mailbox: ukaze pocet v Mongu, v PG a k zpracovani.
Zmeny v1.2 vs v1.1:
- S/MIME emaily: pokud unwrap_smime_v1.0 ulozil smime_body_text/smime_body_html,
pouzije se PREFEROVANE pred bezvyznamnym wrapper telem.
- body_source: nova hodnota "smime".
- EXTRACTOR_VERSION=1.2 -> vsechny existujici emaily v PG se preparsuji.
Zmeny v1.1 vs v1.0:
- Fallback poradi rozsireno o body_text.
- body_source umi novou hodnotu "text" (plne plain-text telo, max 2 MB).
Zdroj:
MongoDB 192.168.1.76 db=emaily kolekce=<mailbox>
(krome NON_MAILBOX_COLLECTIONS)
Cil:
PostgreSQL 192.168.1.76 db=MongoEmaily tabulka=emails
tsvector config 'soubory' (sdileny - simple + unaccent)
Inkrementalita:
Pokud (mailbox, message_id) jiz existuje a extractor_version je aktualni
a modified_at v Mongo neni novejsi -> skip. Pri zmene verze extractoru
se vse preparsuje. --index-reset to obejde a smaze PG pred behom.
Spusteni:
python enrich_fulltext_emails_v1.3.py # vsechny schranky
python enrich_fulltext_emails_v1.3.py --mailbox ordinace@buzalkova.cz
python enrich_fulltext_emails_v1.3.py --limit 500 # test
python enrich_fulltext_emails_v1.3.py --mailbox X --index-reset # smaze PG schranky a re-extrahuje vsechno
python enrich_fulltext_emails_v1.3.py --index-reset # smaze CELY index a postavi znovu (POMALE!)
==============================================================================
"""
from __future__ import annotations
import argparse
import re
import sys
import time
import traceback
from datetime import datetime, timezone
from typing import Optional
import psycopg
from bs4 import BeautifulSoup
from pymongo import MongoClient
# --- configuration ----------------------------------------------------------
# Source MongoDB server and the database that is scanned for mailbox collections.
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
# Target PostgreSQL connection string (libpq keyword/value format).
# NOTE(review): the credentials are hard-coded in source control - consider
# loading them from the environment or a secrets file and rotating the password.
PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoEmaily "
"user=vladimir.buzalka password=Vlado7309208104++")
EXTRACTOR_VERSION = "1.2" # do NOT change unless the fallback logic changes!
MAX_TEXT_BYTES = 5 * 1024 * 1024 # cap for the stored plain text: 5 MB of UTF-8
# Collections in `emaily` that are NOT mailboxes (never processed)
NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
# Number of rows buffered before they are upserted into PostgreSQL.
BATCH_SIZE = 100
# --- SCHEMA -----------------------------------------------------------------
# Idempotent DDL executed by main() at startup. It creates the unaccent and
# pg_trgm extensions, the 'soubory' text search configuration (a copy of
# 'simple' with unaccent applied to word tokens), the emails table with a
# generated tsvector column over the first 800000 characters of the
# concatenated searchable fields, and the supporting indexes.
# NOTE: the inline SQL comment on body_source is stale - process_mailbox also
# writes the values 'text' and 'smime'.
SCHEMA_SQL = """
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
ALTER TEXT SEARCH CONFIGURATION soubory
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, simple;
END IF;
END$$;
CREATE TABLE IF NOT EXISTS emails (
id BIGSERIAL PRIMARY KEY,
mailbox TEXT NOT NULL,
message_id TEXT NOT NULL,
graph_id TEXT,
conversation_id TEXT,
folder_path TEXT,
subject TEXT,
sender_email TEXT,
sender_name TEXT,
to_addrs TEXT,
cc_addrs TEXT,
bcc_addrs TEXT,
sent_at TIMESTAMPTZ,
received_at TIMESTAMPTZ,
modified_at TIMESTAMPTZ,
is_read BOOLEAN,
is_draft BOOLEAN,
has_attachments BOOLEAN,
attachment_count INT,
attachments_summary TEXT,
body TEXT,
body_length INT,
body_source TEXT, -- 'html' | 'preview' | 'empty'
tsv tsvector GENERATED ALWAYS AS (
to_tsvector('soubory'::regconfig,
left(
coalesce(subject, '') || ' ' ||
coalesce(sender_email, '') || ' ' ||
coalesce(sender_name, '') || ' ' ||
coalesce(to_addrs, '') || ' ' ||
coalesce(cc_addrs, '') || ' ' ||
coalesce(attachments_summary, '') || ' ' ||
coalesce(body, ''),
800000)
)
) STORED,
extracted_at TIMESTAMPTZ DEFAULT now(),
extractor_version TEXT,
ok BOOLEAN,
error TEXT,
UNIQUE (mailbox, message_id)
);
CREATE INDEX IF NOT EXISTS emails_tsv_gin ON emails USING gin(tsv);
CREATE INDEX IF NOT EXISTS emails_subject_trgm ON emails USING gin(subject gin_trgm_ops);
CREATE INDEX IF NOT EXISTS emails_sender_email_idx ON emails(sender_email);
CREATE INDEX IF NOT EXISTS emails_mailbox_idx ON emails(mailbox);
CREATE INDEX IF NOT EXISTS emails_received_idx ON emails(received_at DESC);
CREATE INDEX IF NOT EXISTS emails_conv_idx ON emails(conversation_id);
"""
# --- HELPERY ----------------------------------------------------------------
_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
_WS_RX = re.compile(r"[ \t]+")
_NL_RX = re.compile(r"\n{3,}")
def _clean_for_pg(s: str) -> str:
if not s:
return ""
return _CTRL_RX.sub("", s)
def _truncate(s: str) -> str:
    """Clean *s* and cap it at MAX_TEXT_BYTES bytes of UTF-8.

    The limit is applied to the encoded bytes; a multi-byte character that is
    cut by the limit is dropped instead of being emitted partially.
    """
    cleaned = _clean_for_pg(s or "")
    if not cleaned:
        return ""
    encoded = cleaned.encode("utf-8", errors="replace")
    if len(encoded) > MAX_TEXT_BYTES:
        return encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
    return cleaned
def html_to_text(html: str) -> str:
    """Convert an HTML e-mail body to plain text.

    <script>, <style> and <head> elements are removed, runs of spaces/tabs are
    collapsed to one space and empty lines are dropped.
    """
    if not html:
        return ""
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # Fall back to the built-in parser when the lxml one cannot be used.
        soup = BeautifulSoup(html, "html.parser")
    for element in soup(["script", "style", "head"]):
        element.decompose()
    raw = soup.get_text(separator="\n")
    # Normalise whitespace line by line and keep only non-empty lines.
    kept = []
    for piece in raw.split("\n"):
        normalized = _WS_RX.sub(" ", piece).strip()
        if normalized:
            kept.append(normalized)
    return _NL_RX.sub("\n\n", "\n".join(kept))
def fmt_recipients(recipients: list, kind: str) -> str:
    """Format the recipients of one *kind* as 'Name <email>; Name2 <email2>'.

    Entries that are not dicts, whose "type" differs from *kind*, or that
    carry neither a name nor an e-mail address are skipped.
    """
    formatted = []
    for entry in recipients or []:
        if not isinstance(entry, dict) or entry.get("type") != kind:
            continue
        display = (entry.get("name") or "").strip()
        address = (entry.get("email") or "").strip()
        if display and address:
            formatted.append(f"{display} <{address}>")
        elif address or display:
            formatted.append(address or display)
    return "; ".join(formatted)
def fmt_attachments(attachments: list) -> str:
    """Join the names of the first 20 attachment entries with ' | '.

    Each entry's "name" is used, falling back to "filename"; non-dict entries
    and entries without a name are skipped.
    """
    if not attachments:
        return ""
    names = (
        item.get("name") or item.get("filename") or ""
        for item in attachments[:20]
        if isinstance(item, dict)
    )
    return " | ".join(label for label in names if label)
def _short(s, n=60):
if not s:
return ""
s = str(s).replace("\n", " ").strip()
return s if len(s) <= n else s[:n] + "..."
def _now() -> datetime:
return datetime.now(tz=timezone.utc)
def _aware_utc(dt: Optional[datetime]) -> Optional[datetime]:
"""Sjednoceni: PG TIMESTAMPTZ -> tz-aware UTC; Mongo datetime -> naive (UTC).
Vrati tz-aware UTC datetime nebo None."""
if dt is None:
return None
if dt.tzinfo is None:
return dt.replace(tzinfo=timezone.utc)
return dt.astimezone(timezone.utc)
# --- HLAVNI SMYCKA ----------------------------------------------------------
def process_mailbox(pg: psycopg.Connection, mongo_coll, mailbox: str,
                    limit: Optional[int] = None,
                    index_reset: bool = False) -> dict:
    """Extract the text of every e-mail in one Mongo collection into PG.

    A document is skipped when its PG row was written by the current
    EXTRACTOR_VERSION with ok = true, unless the document is marked
    smime_unwrapped while the PG row's body_source is not 'smime' (the S/MIME
    content was unwrapped after the row had been indexed). modified_at is
    stored but not compared. Every other document is turned into a row (body
    taken from the unwrapped S/MIME content, then body_html, body_text,
    body_preview) and upserted in batches of BATCH_SIZE.

    Args:
        pg: open PostgreSQL connection; _flush commits after each batch.
        mongo_coll: Mongo collection holding the documents of the mailbox.
        mailbox: mailbox name written to the "mailbox" column.
        limit: optional cap on the number of documents read (test runs).
        index_reset: delete all PG rows of the mailbox before processing.

    Returns:
        Dict with the counters mailbox, processed, ok, errors, skipped and
        empty_body.
    """

    def up_to_date(prev) -> bool:
        # prev = (extractor_version, ok, body_source) or None
        return bool(prev) and prev[0] == EXTRACTOR_VERSION and bool(prev[1])

    # --index-reset: drop everything stored for this mailbox in PG
    if index_reset:
        with pg.cursor() as cur:
            cur.execute("DELETE FROM emails WHERE mailbox = %s", (mailbox,))
            deleted = cur.rowcount
        pg.commit()
        print(f"[{mailbox}] --index-reset: smazano {deleted} radku v PG")
    # Existing PG rows for the incremental lookup:
    # message_id -> (extractor_version, ok, body_source)
    with pg.cursor() as cur:
        cur.execute(
            "SELECT message_id, extractor_version, ok, body_source "
            "FROM emails WHERE mailbox = %s",
            (mailbox,),
        )
        existing = {row[0]: (row[1], row[2], row[3]) for row in cur.fetchall()}
    mongo_total = mongo_coll.estimated_document_count()
    pg_total = len(existing)
    pg_uptodate = sum(1 for v in existing.values() if up_to_date(v))
    to_process_estimate = mongo_total - pg_uptodate
    print(f"\n========== {mailbox} ==========")
    print(f" v Mongu: {mongo_total}")
    print(f" v PG: {pg_total} (z toho ext_v={EXTRACTOR_VERSION} & ok=true: {pg_uptodate})")
    print(f" k zpracovani: ~{to_process_estimate}{' (limit=' + str(limit) + ')' if limit else ''}")
    if to_process_estimate <= 0 and not index_reset and not limit:
        # The count-based estimate cannot see documents whose S/MIME content
        # was unwrapped after they had been indexed; returning here without
        # looking would skip the re-index the main loop performs for them.
        # NOTE: a document marked smime_unwrapped that yields no S/MIME text
        # never gets body_source 'smime' and therefore keeps this shortcut
        # disabled for its mailbox.
        smime_cursor = mongo_coll.find(
            {"smime_unwrapped": {"$exists": True}},
            {"_id": 1, "smime_unwrapped": 1},
        )
        try:
            smime_pending = False
            for smime_doc in smime_cursor:
                if not smime_doc.get("smime_unwrapped"):
                    continue
                prev = existing.get(smime_doc.get("_id") or "")
                if not (up_to_date(prev) and prev[2] == "smime"):
                    smime_pending = True
                    break
        finally:
            smime_cursor.close()
        if not smime_pending:
            print(" Nic noveho ke zpracovani.")
            return {"mailbox": mailbox, "processed": 0, "ok": 0, "errors": 0,
                    "skipped": pg_uptodate, "empty_body": 0}
        print(" Cekaji rozbalene S/MIME emaily -> prochazim schranku.")
    proj = {
        "_id": 1, "graph_id": 1, "conversation_id": 1, "folder_path": 1,
        "subject": 1, "sender": 1, "recipients": 1,
        "sent_at": 1, "received_at": 1, "modified_at": 1,
        "is_read": 1, "is_draft": 1,
        "has_attachments": 1, "attachment_count": 1, "attachments": 1,
        "body_html": 1, "body_text": 1, "body_preview": 1,
        "smime_unwrapped": 1, "smime_body_text": 1, "smime_body_html": 1,
        "smime_subject": 1, "smime_inner_attachments": 1,
    }
    cursor = mongo_coll.find({}, proj, no_cursor_timeout=True)
    if limit:
        cursor = cursor.limit(limit)
    processed = ok = errors = skipped = empty_body = 0
    queue: list[dict] = []
    n = 0
    try:
        for doc in cursor:
            n += 1
            msg_id = doc.get("_id") or ""
            prev = existing.get(msg_id)  # (extractor_version, ok, body_source)
            mongo_mtime = doc.get("modified_at")
            # Skip when PG holds a row of the same extractor version with
            # ok = true. Exception: the document is smime_unwrapped but the PG
            # row's body_source is not 'smime' - the unwrapped text was added
            # after the row had been indexed, so re-enrich it.
            if up_to_date(prev):
                needs_smime_reindex = (
                    bool(doc.get("smime_unwrapped"))
                    and prev[2] != "smime"
                )
                if not needs_smime_reindex:
                    skipped += 1
                    continue
            sender = doc.get("sender") or {}
            recipients = doc.get("recipients") or []
            attachments = doc.get("attachments") or []
            # Append the names of the inner S/MIME attachments so that they
            # end up in attachments_summary as well.
            inner = doc.get("smime_inner_attachments") or []
            if inner:
                attachments = list(attachments) + [
                    {"filename": (a.get("filename") or "") + " [smime]"}
                    for a in inner
                    if isinstance(a, dict) and a.get("filename")
                ]
            row = {
                "mailbox": mailbox,
                "message_id": msg_id,
                "graph_id": doc.get("graph_id"),
                "conversation_id": doc.get("conversation_id"),
                "folder_path": doc.get("folder_path"),
                "subject": doc.get("subject") or "",
                "sender_email": sender.get("email"),
                "sender_name": sender.get("name"),
                "to_addrs": fmt_recipients(recipients, "to"),
                "cc_addrs": fmt_recipients(recipients, "cc"),
                "bcc_addrs": fmt_recipients(recipients, "bcc"),
                # Timestamps from Mongo are naive but meant as UTC. Tag them
                # tz-aware so that PG TIMESTAMPTZ stores the right instant
                # instead of shifting by the session time zone.
                "sent_at": _aware_utc(doc.get("sent_at")),
                "received_at": _aware_utc(doc.get("received_at")),
                "modified_at": _aware_utc(mongo_mtime),
                "is_read": doc.get("is_read"),
                "is_draft": doc.get("is_draft"),
                "has_attachments": doc.get("has_attachments"),
                "attachment_count": doc.get("attachment_count"),
                "attachments_summary": fmt_attachments(attachments),
                "body": None,
                "body_length": 0,
                "body_source": "empty",
                "extracted_at": _now(),
                "extractor_version": EXTRACTOR_VERSION,
                "ok": False,
                "error": None,
            }
            status = "OK "
            try:
                # Fallback order: unwrapped S/MIME text/html -> body_html ->
                # body_text -> body_preview.
                text = ""
                if doc.get("smime_unwrapped"):
                    s_text = doc.get("smime_body_text") or ""
                    s_html = doc.get("smime_body_html") or ""
                    s_html_text = html_to_text(s_html) if s_html else ""
                    combined = "\n\n".join(p for p in (s_text, s_html_text) if p)
                    s_subject = doc.get("smime_subject") or ""
                    if s_subject:
                        combined = f"Subject: {s_subject}\n\n{combined}"
                    if combined:
                        text = combined
                        row["body_source"] = "smime"
                if not text:
                    html = doc.get("body_html") or ""
                    h_text = html_to_text(html) if html else ""
                    if h_text:
                        text = h_text
                        row["body_source"] = "html"
                if not text:
                    plain = doc.get("body_text") or ""
                    if plain:
                        text = plain
                        row["body_source"] = "text"
                if not text:
                    preview = doc.get("body_preview") or ""
                    if preview:
                        text = preview
                        row["body_source"] = "preview"
                if not text:
                    row["body_source"] = "empty"
                    empty_body += 1
                body = _truncate(text)
                row["body"] = body if body else None
                row["body_length"] = len(body)
                row["ok"] = True
                ok += 1
            except Exception as e:
                # Keep the row (ok = false) so the failure is visible in PG.
                row["error"] = f"{type(e).__name__}: {e}"[:500]
                status = "ERR"
                errors += 1
            queue.append(row)
            processed += 1
            if processed % 200 == 0 or processed == 1:
                subj = _short(row["subject"], 50)
                print(f" [{n:>6}|p={processed:>5}] {status} {row['body_source']:<7} "
                      f"{row['body_length']:>7}ch | {subj}", flush=True)
            if len(queue) >= BATCH_SIZE:
                _flush(pg, queue)
                queue.clear()
    finally:
        # The cursor was opened with no_cursor_timeout, so close it explicitly.
        cursor.close()
    if queue:
        _flush(pg, queue)
    return {"mailbox": mailbox, "processed": processed, "ok": ok,
            "errors": errors, "skipped": skipped, "empty_body": empty_body}
# Parameterised upsert keyed on (mailbox, message_id): inserts a new row or, on
# conflict, overwrites every non-key column with the incoming values. The named
# placeholders correspond to the keys of the row dicts built in process_mailbox.
UPSERT_SQL = """
INSERT INTO emails
(mailbox, message_id, graph_id, conversation_id, folder_path,
subject, sender_email, sender_name, to_addrs, cc_addrs, bcc_addrs,
sent_at, received_at, modified_at, is_read, is_draft,
has_attachments, attachment_count, attachments_summary,
body, body_length, body_source,
extracted_at, extractor_version, ok, error)
VALUES
(%(mailbox)s, %(message_id)s, %(graph_id)s, %(conversation_id)s, %(folder_path)s,
%(subject)s, %(sender_email)s, %(sender_name)s, %(to_addrs)s, %(cc_addrs)s, %(bcc_addrs)s,
%(sent_at)s, %(received_at)s, %(modified_at)s, %(is_read)s, %(is_draft)s,
%(has_attachments)s, %(attachment_count)s, %(attachments_summary)s,
%(body)s, %(body_length)s, %(body_source)s,
%(extracted_at)s, %(extractor_version)s, %(ok)s, %(error)s)
ON CONFLICT (mailbox, message_id) DO UPDATE SET
graph_id = EXCLUDED.graph_id,
conversation_id = EXCLUDED.conversation_id,
folder_path = EXCLUDED.folder_path,
subject = EXCLUDED.subject,
sender_email = EXCLUDED.sender_email,
sender_name = EXCLUDED.sender_name,
to_addrs = EXCLUDED.to_addrs,
cc_addrs = EXCLUDED.cc_addrs,
bcc_addrs = EXCLUDED.bcc_addrs,
sent_at = EXCLUDED.sent_at,
received_at = EXCLUDED.received_at,
modified_at = EXCLUDED.modified_at,
is_read = EXCLUDED.is_read,
is_draft = EXCLUDED.is_draft,
has_attachments = EXCLUDED.has_attachments,
attachment_count = EXCLUDED.attachment_count,
attachments_summary = EXCLUDED.attachments_summary,
body = EXCLUDED.body,
body_length = EXCLUDED.body_length,
body_source = EXCLUDED.body_source,
extracted_at = EXCLUDED.extracted_at,
extractor_version = EXCLUDED.extractor_version,
ok = EXCLUDED.ok,
error = EXCLUDED.error
"""
def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
    """Upsert *rows* with UPSERT_SQL and commit.

    The text fields of each row are stripped of control characters in place
    before the batch is sent.
    """
    text_columns = (
        "subject", "sender_email", "sender_name", "to_addrs", "cc_addrs",
        "bcc_addrs", "attachments_summary", "body", "error", "folder_path",
    )
    for record in rows:
        for column in text_columns:
            value = record.get(column)
            if value:
                record[column] = _clean_for_pg(value)
    with pg.cursor() as cur:
        cur.executemany(UPSERT_SQL, rows)
    pg.commit()
def discover_mailboxes(db) -> list[str]:
    """Return the sorted names of the collections in *db* that are mailboxes,
    i.e. everything not listed in NON_MAILBOX_COLLECTIONS."""
    return [
        name
        for name in sorted(db.list_collection_names())
        if name not in NON_MAILBOX_COLLECTIONS
    ]
def main() -> int:
    """Parse the CLI arguments, ensure the PG schema and process the mailboxes.

    A failure in one mailbox is reported and counted as an error, the PG
    transaction is rolled back, and processing continues with the next
    mailbox. Both database connections are closed on every exit path.

    Returns:
        0 when no mailbox reported an error, 1 otherwise.
    """
    ap = argparse.ArgumentParser(description="enrich_fulltext_emails v1.3")
    ap.add_argument("--mailbox", default="",
                    help="Jedna konkretni schranka. Bez argumentu projede vsechny.")
    ap.add_argument("--limit", type=int,
                    help="Limit emailu na schranku (test)")
    ap.add_argument("--index-reset", action="store_true",
                    help="Pred zpracovanim schranky vymaze vsechny jeji emaily z PG "
                         "(force re-extract). Bez --mailbox SMAZE CELY index.")
    args = ap.parse_args()
    t0 = time.time()
    print("=== enrich_fulltext_emails v1.3 ===")
    print(f"Start: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("\nPripojuji se k PostgreSQL...")
    pg = psycopg.connect(PG_DSN, connect_timeout=10)
    mongo = None
    results = []
    try:
        with pg.cursor() as cur:
            cur.execute(SCHEMA_SQL)
        pg.commit()
        print(" Schema OK.")
        print("Pripojuji se k MongoDB...")
        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        # Fail fast when the server is unreachable.
        mongo.admin.command("ping")
        db = mongo[MONGO_DB]
        print(" MongoDB OK.")
        if args.mailbox:
            mailboxes = [args.mailbox]
        else:
            mailboxes = discover_mailboxes(db)
        print(f"\nSchranky ke zpracovani ({len(mailboxes)}):")
        for mb in mailboxes:
            print(f" - {mb}")
        if args.index_reset and not args.mailbox:
            print(f"\n!!! --index-reset bez --mailbox => SMAZE CELY INDEX ({len(mailboxes)} schranek) !!!")
        for mb in mailboxes:
            try:
                results.append(process_mailbox(pg, db[mb], mb,
                                               limit=args.limit,
                                               index_reset=args.index_reset))
            except Exception as e:
                traceback.print_exc()
                print(f" FATAL pri zpracovani {mb}: {e}")
                # A failed statement leaves the transaction aborted; without a
                # rollback every statement for the next mailbox would fail too.
                try:
                    pg.rollback()
                except Exception:
                    traceback.print_exc()
                results.append({"mailbox": mb, "processed": 0, "ok": 0,
                                "errors": 1, "skipped": 0, "empty_body": 0})
    finally:
        # Release both connections on success and on failure alike.
        if mongo is not None:
            mongo.close()
        pg.close()
    print("\n" + "="*60)
    print("=== SHRNUTI ===")
    grand = {"processed": 0, "ok": 0, "errors": 0, "skipped": 0, "empty_body": 0}
    for r in results:
        print(f" {r['mailbox']:40} processed={r['processed']:>5} ok={r['ok']:>5} "
              f"errors={r['errors']:>3} skipped={r['skipped']:>6} empty={r['empty_body']:>4}")
        for k in grand:
            grand[k] += r.get(k, 0)
    print(f" {'TOTAL':40} processed={grand['processed']:>5} ok={grand['ok']:>5} "
          f"errors={grand['errors']:>3} skipped={grand['skipped']:>6} empty={grand['empty_body']:>4}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    print(f"Konec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    # Exit code: 0 only when every mailbox ran without an error.
    return 1 if grand["errors"] > 0 else 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    try:
        raise SystemExit(main())
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
        # 128 + SIGINT, so an interrupted run is not reported as a success.
        sys.exit(130)
    except Exception:
        traceback.print_exc()
        sys.exit(1)
@@ -0,0 +1,455 @@
"""
==============================================================================
Skript: enrich_fulltext_emails_v1.1.py
Verze: 1.1
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis:
Vytahne plny text z emailu ulozenych v MongoDB (db: emaily) a ulozi ho do
PostgreSQL (db: MongoEmaily, tabulka: emails) s GIN tsvector indexem.
Emaily se NESTAHUJI znovu - tela uz jsou v Mongo z parse_emails_graph_v1.4
(a refetch_text_bodies_v1.0 pro stare plain-text emaily).
Tento skript jen vybere prvni dostupne telo a posle text do PG na fulltext.
Zmeny proti v1.0:
- Fallback poradi rozsireno: body_html -> body_text (novy v parse_emails_graph_v1.4)
-> body_preview -> empty. Drive bylo body_html -> body_preview.
- body_source umi novou hodnotu "text" (plne plain-text telo, max 2 MB).
- EXTRACTOR_VERSION=1.1 -> vsechny existujici emaily v PG se preparsuji.
Zdroj:
MongoDB 192.168.1.76 db=emaily kolekce=<mailbox>
(krome attachments_index)
Cil:
PostgreSQL 192.168.1.76 db=MongoEmaily tabulka=emails
tsvector config 'soubory' (sdileny - simple + unaccent)
Inkrementalita:
Pokud (mailbox, message_id) jiz existuje a extractor_version je aktualni
a modified_at v Mongo neni novejsi -> skip. Pri zmene verze extractoru
se vse preparsuje.
Spusteni:
python enrich_fulltext_emails_v1.1.py # vsechny schranky
python enrich_fulltext_emails_v1.1.py --mailbox vbuzalka@its.jnj.com
python enrich_fulltext_emails_v1.1.py --limit 500 # test
==============================================================================
"""
from __future__ import annotations

import argparse
import os
import re
import sys
import time
import traceback
from datetime import datetime, timezone
from typing import Optional

import psycopg
from bs4 import BeautifulSoup
from pymongo import MongoClient
# --- configuration ----------------------------------------------------------
# Connection settings may be overridden through environment variables so that
# endpoints and credentials do not have to be edited in (or committed with)
# the source. The literals are the backward-compatible defaults.
MONGO_URI = os.environ.get("EMAILS_MONGO_URI", "mongodb://192.168.1.76:27017")
MONGO_DB = os.environ.get("EMAILS_MONGO_DB", "emaily")
# SECURITY NOTE(review): the default DSN embeds a real password. Prefer setting
# EMAILS_PG_DSN in the container environment, then rotate the password and
# drop this literal.
PG_DSN = os.environ.get(
    "EMAILS_PG_DSN",
    "host=192.168.1.76 port=5432 dbname=MongoEmaily "
    "user=vladimir.buzalka password=Vlado7309208104++",
)
# Stored with every row; rows written by a different version are re-extracted.
EXTRACTOR_VERSION = "1.1"
# Upper bound (UTF-8 bytes) for the extracted body text kept per email.
MAX_TEXT_BYTES = 5 * 1024 * 1024  # plain text max 5 MB
# Mongo collections that are not mailboxes and must not be indexed.
SKIP_COLLECTIONS = {"attachments_index"}
# Number of rows buffered before they are written to PostgreSQL.
BATCH_SIZE = 100
# --- SCHEMA -----------------------------------------------------------------
# Idempotent DDL executed once at start-up by main():
#   * enables the unaccent and pg_trgm extensions,
#   * creates the text search configuration 'soubory' (a copy of 'simple'
#     whose word mappings run through unaccent first) if it does not exist,
#   * creates the `emails` table, unique on (mailbox, message_id), with a
#     stored generated `tsv` column built from subject, sender, to/cc
#     addresses, attachment names and body (first 800000 characters of the
#     concatenation; bcc_addrs is not part of it),
#   * creates the GIN / trigram / b-tree indexes used for searching.
# NOTE(review): the inline SQL comment on body_source lists
# 'html' | 'preview' | 'empty', but process_mailbox also writes 'text'.
SCHEMA_SQL = """
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
ALTER TEXT SEARCH CONFIGURATION soubory
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, simple;
END IF;
END$$;
CREATE TABLE IF NOT EXISTS emails (
id BIGSERIAL PRIMARY KEY,
mailbox TEXT NOT NULL,
message_id TEXT NOT NULL,
graph_id TEXT,
conversation_id TEXT,
folder_path TEXT,
subject TEXT,
sender_email TEXT,
sender_name TEXT,
to_addrs TEXT,
cc_addrs TEXT,
bcc_addrs TEXT,
sent_at TIMESTAMPTZ,
received_at TIMESTAMPTZ,
modified_at TIMESTAMPTZ,
is_read BOOLEAN,
is_draft BOOLEAN,
has_attachments BOOLEAN,
attachment_count INT,
attachments_summary TEXT,
body TEXT,
body_length INT,
body_source TEXT, -- 'html' | 'preview' | 'empty'
tsv tsvector GENERATED ALWAYS AS (
to_tsvector('soubory'::regconfig,
left(
coalesce(subject, '') || ' ' ||
coalesce(sender_email, '') || ' ' ||
coalesce(sender_name, '') || ' ' ||
coalesce(to_addrs, '') || ' ' ||
coalesce(cc_addrs, '') || ' ' ||
coalesce(attachments_summary, '') || ' ' ||
coalesce(body, ''),
800000)
)
) STORED,
extracted_at TIMESTAMPTZ DEFAULT now(),
extractor_version TEXT,
ok BOOLEAN,
error TEXT,
UNIQUE (mailbox, message_id)
);
CREATE INDEX IF NOT EXISTS emails_tsv_gin ON emails USING gin(tsv);
CREATE INDEX IF NOT EXISTS emails_subject_trgm ON emails USING gin(subject gin_trgm_ops);
CREATE INDEX IF NOT EXISTS emails_sender_email_idx ON emails(sender_email);
CREATE INDEX IF NOT EXISTS emails_mailbox_idx ON emails(mailbox);
CREATE INDEX IF NOT EXISTS emails_received_idx ON emails(received_at DESC);
CREATE INDEX IF NOT EXISTS emails_conv_idx ON emails(conversation_id);
"""
# --- HELPERS ----------------------------------------------------------------
# ASCII control characters except tab (\x09), newline (\x0a) and carriage
# return (\x0d); _clean_for_pg strips every match.
_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
# A run of spaces and/or tabs; html_to_text collapses each run to one space.
_WS_RX = re.compile(r"[ \t]+")
# Three or more consecutive newlines; html_to_text reduces them to two.
_NL_RX = re.compile(r"\n{3,}")
def _clean_for_pg(s: str) -> str:
    """Return *s* with the characters matched by ``_CTRL_RX`` removed.

    Any falsy input (``None``, ``""``) yields an empty string.
    """
    return _CTRL_RX.sub("", s) if s else ""
def _truncate(s: str) -> str:
    """Clean *s* and cap it at ``MAX_TEXT_BYTES`` bytes of UTF-8.

    The text is first passed through ``_clean_for_pg``. When its UTF-8
    encoding exceeds the limit, the byte string is cut at the limit and decoded
    again, dropping a multi-byte character that the cut split in half.
    """
    cleaned = _clean_for_pg(s or "")
    if not cleaned:
        return ""
    encoded = cleaned.encode("utf-8", errors="replace")
    if len(encoded) > MAX_TEXT_BYTES:
        # errors="ignore" discards the incomplete trailing sequence, if any.
        return encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
    return cleaned
def html_to_text(html: str) -> str:
    """Extract plain text from an HTML email body.

    ``<script>``, ``<style>`` and ``<head>`` elements are removed, runs of
    spaces/tabs inside a line are collapsed, lines that end up empty are
    dropped, and the remaining lines are joined with newlines. Falsy input
    yields an empty string.
    """
    if not html:
        return ""
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # Fall back to the parser bundled with Python when "lxml" cannot be used.
        soup = BeautifulSoup(html, "html.parser")
    for node in soup(["script", "style", "head"]):
        node.decompose()
    raw = soup.get_text(separator="\n")
    # Whitespace normalisation: squeeze each line, keep only non-empty ones.
    kept = []
    for line in raw.split("\n"):
        squeezed = _WS_RX.sub(" ", line).strip()
        if squeezed:
            kept.append(squeezed)
    return _NL_RX.sub("\n\n", "\n".join(kept))
def fmt_recipients(recipients: list, kind: str) -> str:
    """Render the recipients of one kind as ``'Name <email>; Name2 <email2>'``.

    Used for the to_addrs / cc_addrs / bcc_addrs columns. Only dict entries
    whose ``"type"`` equals *kind* are considered. An entry with just an email
    or just a name contributes that single value; an entry with neither is
    skipped. Returns ``""`` when there is nothing to render.
    """
    if not recipients:
        return ""
    formatted = []
    for entry in recipients:
        if not isinstance(entry, dict) or entry.get("type") != kind:
            continue
        display = (entry.get("name") or "").strip()
        address = (entry.get("email") or "").strip()
        if display and address:
            formatted.append(f"{display} <{address}>")
        elif address or display:
            # Exactly one of the two is present here.
            formatted.append(address or display)
    return "; ".join(formatted)
def fmt_attachments(attachments: list) -> str:
    """Join the names of the first 20 attachments with ``' | '``.

    For each dict entry the ``"name"`` key is preferred, falling back to
    ``"filename"``. Non-dict entries and entries without a name are skipped.
    Returns ``""`` when there are no attachments.
    """
    if not attachments:
        return ""
    candidates = (
        item.get("name") or item.get("filename") or ""
        for item in attachments[:20]
        if isinstance(item, dict)
    )
    return " | ".join([label for label in candidates if label])
def _short(s, n=60):
if not s:
return ""
s = str(s).replace("\n", " ").strip()
return s if len(s) <= n else s[:n] + "..."
def _now() -> datetime:
return datetime.now(tz=timezone.utc)
# --- MAIN LOOP --------------------------------------------------------------
def _to_utc(value):
    """Return *value* as a timezone-aware UTC datetime when it is a datetime.

    A naive datetime is taken to be UTC. This assumes the Mongo client decodes
    dates with its default settings (naive values representing UTC) - confirm
    if the client is ever created with ``tz_aware`` or a custom ``tzinfo``.
    Anything that is not a ``datetime`` is returned unchanged.
    """
    if not isinstance(value, datetime):
        return value
    if value.tzinfo is None:
        return value.replace(tzinfo=timezone.utc)
    return value.astimezone(timezone.utc)


def _is_up_to_date(prev, mongo_mtime) -> bool:
    """Tell whether an already indexed email can be skipped.

    *prev* is the ``(extractor_version, modified_at, ok)`` tuple read from
    PostgreSQL, or ``None`` when the email has not been indexed yet. The email
    is current when it was extracted successfully by the current
    ``EXTRACTOR_VERSION`` and the Mongo document is not newer than the stored
    ``modified_at``.
    """
    if not prev:
        return False
    version, pg_mtime, was_ok = prev
    if version != EXTRACTOR_VERSION or not was_ok:
        return False
    if mongo_mtime is None:
        return True
    if not pg_mtime:
        return False
    try:
        # Both sides are normalised first: comparing a timezone-aware value
        # with a naive one raises TypeError.
        return _to_utc(pg_mtime) >= _to_utc(mongo_mtime)
    except TypeError:
        # Values that cannot be ordered (e.g. datetime vs. str): re-extracting
        # is the safe answer.
        return False


def _select_body(doc: dict) -> tuple[str, str]:
    """Pick the first available body of *doc*; return ``(text, body_source)``.

    Fallback order (v1.1): body_html -> body_text -> body_preview -> empty.
    An HTML body that yields no text counts as missing.
    """
    html = doc.get("body_html") or ""
    text = html_to_text(html) if html else ""
    if text:
        return text, "html"
    plain = doc.get("body_text") or ""
    if plain:
        return plain, "text"
    preview = doc.get("body_preview") or ""
    if preview:
        return preview, "preview"
    return "", "empty"


def process_mailbox(pg: psycopg.Connection, mongo_coll, mailbox: str,
                    limit: Optional[int] = None) -> dict:
    """Index the emails of one Mongo collection into the PostgreSQL table.

    Emails that are already indexed and current are skipped; all others are
    converted to a row and upserted in batches of ``BATCH_SIZE`` via
    ``_flush``. An exception raised while extracting a single body is recorded
    in that row's ``error`` column and does not stop the loop.

    Args:
        pg: open PostgreSQL connection.
        mongo_coll: Mongo collection holding the mailbox's emails.
        mailbox: mailbox name stored in the ``mailbox`` column.
        limit: optional cap on the number of Mongo documents read.

    Returns:
        Dict with the keys ``mailbox``, ``processed``, ``ok``, ``errors``,
        ``skipped`` and ``empty_body``.

    Raises:
        Whatever the Mongo cursor or ``_flush`` raises; the cursor is closed
        in every case.
    """
    # Existing rows in PG, keyed by message_id (fast incremental lookup).
    with pg.cursor() as cur:
        cur.execute(
            "SELECT message_id, extractor_version, modified_at, ok "
            "FROM emails WHERE mailbox = %s",
            (mailbox,),
        )
        existing = {row[0]: (row[1], row[2], row[3]) for row in cur.fetchall()}
    proj = {
        "_id": 1, "graph_id": 1, "conversation_id": 1, "folder_path": 1,
        "subject": 1, "sender": 1, "recipients": 1,
        "sent_at": 1, "received_at": 1, "modified_at": 1,
        "is_read": 1, "is_draft": 1,
        "has_attachments": 1, "attachment_count": 1, "attachments": 1,
        "body_html": 1, "body_text": 1, "body_preview": 1,
    }
    cursor = mongo_coll.find({}, proj, no_cursor_timeout=True)
    if limit:
        cursor = cursor.limit(limit)
    total_pending = limit or mongo_coll.estimated_document_count()
    print(f"[{mailbox}] kandidatu: ~{total_pending}")
    processed = ok = errors = skipped = empty_body = 0
    queue: list[dict] = []
    n = 0
    try:
        for doc in cursor:
            n += 1
            msg_id = doc.get("_id") or ""
            mongo_mtime = doc.get("modified_at")
            if _is_up_to_date(existing.get(msg_id), mongo_mtime):
                skipped += 1
                continue
            sender = doc.get("sender") or {}
            recipients = doc.get("recipients") or []
            attachments = doc.get("attachments") or []
            row = {
                "mailbox": mailbox,
                "message_id": msg_id,
                "graph_id": doc.get("graph_id"),
                "conversation_id": doc.get("conversation_id"),
                "folder_path": doc.get("folder_path"),
                "subject": doc.get("subject") or "",
                "sender_email": sender.get("email"),
                "sender_name": sender.get("name"),
                "to_addrs": fmt_recipients(recipients, "to"),
                "cc_addrs": fmt_recipients(recipients, "cc"),
                "bcc_addrs": fmt_recipients(recipients, "bcc"),
                # Timestamps are sent as aware UTC values so that what lands in
                # the TIMESTAMPTZ columns does not depend on the PG session
                # time zone and compares correctly on the next run.
                "sent_at": _to_utc(doc.get("sent_at")),
                "received_at": _to_utc(doc.get("received_at")),
                "modified_at": _to_utc(mongo_mtime),
                "is_read": doc.get("is_read"),
                "is_draft": doc.get("is_draft"),
                "has_attachments": doc.get("has_attachments"),
                "attachment_count": doc.get("attachment_count"),
                "attachments_summary": fmt_attachments(attachments),
                "body": None,
                "body_length": 0,
                "body_source": "empty",
                "extracted_at": _now(),
                "extractor_version": EXTRACTOR_VERSION,
                "ok": False,
                "error": None,
            }
            status = "OK "
            try:
                text, source = _select_body(doc)
                row["body_source"] = source
                if source == "empty":
                    empty_body += 1
                body = _truncate(text)
                row["body"] = body if body else None
                row["body_length"] = len(body)
                row["ok"] = True
                ok += 1
            except Exception as e:
                # Keep going: the failure is stored with the row (ok = False).
                row["error"] = f"{type(e).__name__}: {e}"[:500]
                status = "ERR"
                errors += 1
            queue.append(row)
            processed += 1
            # Progress line for the first document and then every 200th.
            if n % 200 == 0 or n == 1:
                subj = _short(row["subject"], 50)
                print(f" [{n:>5}] {status} {row['body_source']:<7} "
                      f"{row['body_length']:>7}ch | {subj}", flush=True)
            if len(queue) >= BATCH_SIZE:
                _flush(pg, queue)
                queue.clear()
    finally:
        # The cursor was opened with no_cursor_timeout, so close it explicitly.
        cursor.close()
    if queue:
        _flush(pg, queue)
    return {"mailbox": mailbox, "processed": processed, "ok": ok,
            "errors": errors, "skipped": skipped, "empty_body": empty_body}
# Upsert keyed on (mailbox, message_id): inserts a new row, or overwrites every
# non-key column of the existing row with the incoming values. The named
# placeholders (%(name)s) match the keys of the row dicts built in
# process_mailbox. The `id` and generated `tsv` columns are not listed here.
UPSERT_SQL = """
INSERT INTO emails
(mailbox, message_id, graph_id, conversation_id, folder_path,
subject, sender_email, sender_name, to_addrs, cc_addrs, bcc_addrs,
sent_at, received_at, modified_at, is_read, is_draft,
has_attachments, attachment_count, attachments_summary,
body, body_length, body_source,
extracted_at, extractor_version, ok, error)
VALUES
(%(mailbox)s, %(message_id)s, %(graph_id)s, %(conversation_id)s, %(folder_path)s,
%(subject)s, %(sender_email)s, %(sender_name)s, %(to_addrs)s, %(cc_addrs)s, %(bcc_addrs)s,
%(sent_at)s, %(received_at)s, %(modified_at)s, %(is_read)s, %(is_draft)s,
%(has_attachments)s, %(attachment_count)s, %(attachments_summary)s,
%(body)s, %(body_length)s, %(body_source)s,
%(extracted_at)s, %(extractor_version)s, %(ok)s, %(error)s)
ON CONFLICT (mailbox, message_id) DO UPDATE SET
graph_id = EXCLUDED.graph_id,
conversation_id = EXCLUDED.conversation_id,
folder_path = EXCLUDED.folder_path,
subject = EXCLUDED.subject,
sender_email = EXCLUDED.sender_email,
sender_name = EXCLUDED.sender_name,
to_addrs = EXCLUDED.to_addrs,
cc_addrs = EXCLUDED.cc_addrs,
bcc_addrs = EXCLUDED.bcc_addrs,
sent_at = EXCLUDED.sent_at,
received_at = EXCLUDED.received_at,
modified_at = EXCLUDED.modified_at,
is_read = EXCLUDED.is_read,
is_draft = EXCLUDED.is_draft,
has_attachments = EXCLUDED.has_attachments,
attachment_count = EXCLUDED.attachment_count,
attachments_summary = EXCLUDED.attachments_summary,
body = EXCLUDED.body,
body_length = EXCLUDED.body_length,
body_source = EXCLUDED.body_source,
extracted_at = EXCLUDED.extracted_at,
extractor_version = EXCLUDED.extractor_version,
ok = EXCLUDED.ok,
error = EXCLUDED.error
"""
def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
    """Upsert *rows* into the ``emails`` table and commit.

    The text fields of every row are cleaned in place with ``_clean_for_pg``
    before the batch is sent with ``UPSERT_SQL``.

    Raises:
        Any database error from the batch. The open transaction is rolled
        back first, so the connection stays usable for the caller instead of
        being left in an aborted transaction.
    """
    text_fields = ("subject", "sender_email", "sender_name", "to_addrs", "cc_addrs",
                   "bcc_addrs", "attachments_summary", "body", "error", "folder_path")
    for r in rows:
        for k in text_fields:
            if r.get(k):
                r[k] = _clean_for_pg(r[k])
    try:
        with pg.cursor() as cur:
            cur.executemany(UPSERT_SQL, rows)
        pg.commit()
    except Exception:
        pg.rollback()
        raise
def main() -> int:
    """Run the fulltext enrichment for one mailbox or for all of them.

    Command line:
        --mailbox  process only this Mongo collection (default: every
                   collection except those in ``SKIP_COLLECTIONS``)
        --limit    cap on the number of emails read per mailbox (testing)

    A mailbox that fails with an exception is reported and counted as one
    error; the remaining mailboxes are still processed. Both database
    connections are closed even when the run aborts.

    Returns:
        0 when no error was recorded, 1 otherwise, so that a calling pipeline
        can detect a failed run from the exit status.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)")
    ap.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
    args = ap.parse_args()
    t0 = time.time()
    print("Pripojuji se k PostgreSQL...")
    # The MongoEmaily database must already exist (create it externally with
    # psql or DBeaver), because CREATE DATABASE cannot run inside a transaction.
    pg = psycopg.connect(PG_DSN, connect_timeout=10)
    mongo = None
    results = []
    try:
        with pg.cursor() as cur:
            cur.execute(SCHEMA_SQL)
        pg.commit()
        print("Schema OK.")
        print("Pripojuji se k MongoDB...")
        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        # Fail fast when the server is unreachable.
        mongo.admin.command("ping")
        db = mongo[MONGO_DB]
        if args.mailbox:
            mailboxes = [args.mailbox]
        else:
            mailboxes = [c for c in db.list_collection_names() if c not in SKIP_COLLECTIONS]
        print(f"Schranky ({len(mailboxes)}): {mailboxes}")
        for mb in mailboxes:
            try:
                results.append(process_mailbox(pg, db[mb], mb, limit=args.limit))
            except Exception as e:
                traceback.print_exc()
                print(f" FATAL pri zpracovani {mb}: {e}")
                # Discard whatever the failed mailbox left open so the next
                # one starts with a clean transaction.
                pg.rollback()
                results.append({"mailbox": mb, "processed": 0, "ok": 0,
                                "errors": 1, "skipped": 0, "empty_body": 0})
    finally:
        pg.close()
        if mongo is not None:
            mongo.close()
    print("\n=== SHRNUTI ===")
    for r in results:
        print(f" {r['mailbox']}: processed={r['processed']} ok={r['ok']} "
              f"errors={r['errors']} skipped={r['skipped']} empty={r['empty_body']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    # Exit code: 0 only when every mailbox finished without an error.
    return 1 if any(r["errors"] for r in results) else 0
if __name__ == "__main__":
    # Script entry point: the process exit status is whatever main() returns.
    try:
        raise SystemExit(main())
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
        # Exit non-zero (128 + SIGINT) so a calling wrapper that inspects the
        # exit status does not record an interrupted run as a success.
        sys.exit(130)
    except Exception:
        # Last-resort boundary: show the traceback and signal failure.
        traceback.print_exc()
        sys.exit(1)
+41
View File
@@ -0,0 +1,41 @@
#!/bin/bash
# ============================================================================
# Wrapper for the email pipeline. Calls Python wrapper inside python-runner
# container. Logs to dated file. Cleans up logs older than 30 days.
#
# Install via User Scripts plugin or /etc/cron.d/email_pipeline:
# 0 6,18 * * * /mnt/user/Scripts/run_pipeline.sh
#
# Exit status: that of the Python wrapper, or 1 when the container could not
# be started.
# ============================================================================
set -u

LOG_DIR="/mnt/user/Scripts/logs"
TIMESTAMP=$(date +%Y%m%d_%H%M)
LOG_FILE="${LOG_DIR}/pipeline_${TIMESTAMP}.log"
LATEST_LINK="${LOG_DIR}/pipeline_latest.log"
RETENTION_DAYS=30

mkdir -p "$LOG_DIR"

# finish RET: write the footer, prune old logs and exit with status RET.
# Every exit path goes through here so that failed runs are logged and
# cleaned up the same way as successful ones.
finish() {
    local ret=$1
    echo "" >> "$LOG_FILE"
    echo "=== Wrapper finished @ $(date '+%Y-%m-%d %H:%M:%S') exit=$ret ===" >> "$LOG_FILE"
    # Cleanup logs older than RETENTION_DAYS (-type f leaves the symlink alone)
    find "$LOG_DIR" -name 'pipeline_*.log' -type f -mtime +"${RETENTION_DAYS}" -delete
    exit "$ret"
}

echo "=== Email pipeline run @ $(date '+%Y-%m-%d %H:%M:%S') ===" >> "$LOG_FILE"

# Point the "latest" symlink at this run right away, so the run can be
# followed with tail -f while it is still in progress.
ln -sf "$LOG_FILE" "$LATEST_LINK"

# Make sure the container is running; try to start it when it is not.
if ! docker inspect -f '{{.State.Running}}' python-runner 2>/dev/null | grep -q true; then
    echo "ERROR: python-runner container is not running" >> "$LOG_FILE"
    docker start python-runner >> "$LOG_FILE" 2>&1 || finish 1
    # Give the container a moment to come up before exec-ing into it.
    sleep 5
fi

docker exec python-runner python /scripts/0_run_pipeline_v1.0.py --quiet >> "$LOG_FILE" 2>&1
finish $?