From a3470511452f55fbf50a48263013b363d34d280d Mon Sep 17 00:00:00 2001 From: Vladimir Buzalka Date: Fri, 5 Jun 2026 21:21:30 +0200 Subject: [PATCH] notebook --- EmailsImport/mcp_emaily.py | 753 ++++++++++++++++++ Python-runner/0_run_pipeline_v1.0.md | 77 ++ Python-runner/0_run_pipeline_v1.0.py | 176 ++++ Python-runner/1_parse_emails_graph_v1.4.md | 41 + Python-runner/1_parse_emails_graph_v1.4.py | 624 +++++++++++++++ .../1b_parse_emails_graph_delta_v1.0.md | 139 ++++ .../1b_parse_emails_graph_delta_v1.0.py | 514 ++++++++++++ Python-runner/2_refetch_text_bodies_v1.0.md | 34 + Python-runner/2_refetch_text_bodies_v1.0.py | 270 +++++++ Python-runner/3_download_attachments_v1.3.md | 47 ++ Python-runner/3_download_attachments_v1.3.py | 546 +++++++++++++ Python-runner/3_download_attachments_v1.4.md | 74 ++ Python-runner/3_download_attachments_v1.4.py | 713 +++++++++++++++++ Python-runner/4_unwrap_smime_v1.0.md | 63 ++ Python-runner/4_unwrap_smime_v1.0.py | 445 +++++++++++ .../5_enrich_fulltext_emails_v1.2.md | 47 ++ .../5_enrich_fulltext_emails_v1.2.py | 489 ++++++++++++ .../5_enrich_fulltext_emails_v1.3.md | 79 ++ .../5_enrich_fulltext_emails_v1.3.py | 567 +++++++++++++ .../Trash/enrich_fulltext_emails_v1.1.py | 455 +++++++++++ Python-runner/run_pipeline.sh | 41 + Soubory/mcp_soubory.py | 672 ++++++++++++++++ Soubory/priklady_dotazu.md | 210 +++++ Soubory/query_v0.1.py | 203 +++++ .../feedback_proactive_suggestions.md | 22 + claude-memory/feedback_use_mcp_emaily.md | 31 + claude-memory/project_mcp_emaily.md | 39 + claude-memory/project_mcp_soubory.md | 31 + 28 files changed, 7402 insertions(+) create mode 100644 EmailsImport/mcp_emaily.py create mode 100644 Python-runner/0_run_pipeline_v1.0.md create mode 100644 Python-runner/0_run_pipeline_v1.0.py create mode 100644 Python-runner/1_parse_emails_graph_v1.4.md create mode 100644 Python-runner/1_parse_emails_graph_v1.4.py create mode 100644 Python-runner/1b_parse_emails_graph_delta_v1.0.md create mode 100644 
Python-runner/1b_parse_emails_graph_delta_v1.0.py create mode 100644 Python-runner/2_refetch_text_bodies_v1.0.md create mode 100644 Python-runner/2_refetch_text_bodies_v1.0.py create mode 100644 Python-runner/3_download_attachments_v1.3.md create mode 100644 Python-runner/3_download_attachments_v1.3.py create mode 100644 Python-runner/3_download_attachments_v1.4.md create mode 100644 Python-runner/3_download_attachments_v1.4.py create mode 100644 Python-runner/4_unwrap_smime_v1.0.md create mode 100644 Python-runner/4_unwrap_smime_v1.0.py create mode 100644 Python-runner/5_enrich_fulltext_emails_v1.2.md create mode 100644 Python-runner/5_enrich_fulltext_emails_v1.2.py create mode 100644 Python-runner/5_enrich_fulltext_emails_v1.3.md create mode 100644 Python-runner/5_enrich_fulltext_emails_v1.3.py create mode 100644 Python-runner/Trash/enrich_fulltext_emails_v1.1.py create mode 100644 Python-runner/run_pipeline.sh create mode 100644 Soubory/mcp_soubory.py create mode 100644 Soubory/priklady_dotazu.md create mode 100644 Soubory/query_v0.1.py create mode 100644 claude-memory/feedback_proactive_suggestions.md create mode 100644 claude-memory/feedback_use_mcp_emaily.md create mode 100644 claude-memory/project_mcp_emaily.md create mode 100644 claude-memory/project_mcp_soubory.md diff --git a/EmailsImport/mcp_emaily.py b/EmailsImport/mcp_emaily.py new file mode 100644 index 0000000..3f193b7 --- /dev/null +++ b/EmailsImport/mcp_emaily.py @@ -0,0 +1,753 @@ +#!/usr/bin/env python3 +""" +============================================================================== +MCP server: EMAILY (vsechny schranky importovane z Microsoft Graph) + +Hybridni dotaz nad: + - PostgreSQL 192.168.1.76 db=MongoEmaily tabulka=emails + (fulltext tsvector - subject + sender + recipients + + attachments + body, GIN index, ts_headline, ts_rank) + - MongoDB 192.168.1.76 db=emaily kolekce= + (puvodni dokumenty z parse_emails_graph_v1.3.py: + headers, body_html, recipients[], attachments[], ...) 
+ +Source: U:\\janssen\\EmailsImport\\enrich_fulltext_emails_v1.0.py + +Spusteni: + python mcp_emaily.py (stdio MCP) +============================================================================== +""" + +from __future__ import annotations + +import sys +import traceback +from datetime import datetime, timezone, timedelta +from typing import Optional, Union + +import psycopg +from mcp.server.fastmcp import FastMCP +from pymongo import MongoClient + +MONGO_URI = "mongodb://192.168.1.76:27017" +MONGO_DB = "emaily" + +PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoEmaily " + "user=vladimir.buzalka password=Vlado7309208104++") + +DEFAULT_BODY_CHARS = 8000 +MAX_BODY_CHARS = 200_000 + +SKIP_COLLECTIONS = {"attachments_index", "sync_state"} + + +def log(msg: str) -> None: + print(msg, file=sys.stderr, flush=True) + + +try: + mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) + mongo.admin.command("ping") + log(f"Mongo OK ({MONGO_URI})") +except Exception as e: + log(f"Mongo connection failed: {e}") + sys.exit(1) + +try: + _t = psycopg.connect(PG_DSN, connect_timeout=10) + _t.close() + log("Postgres OK") +except Exception as e: + log(f"Postgres connection failed: {e}") + sys.exit(1) + + +def pg_conn(): + return psycopg.connect(PG_DSN, connect_timeout=10) + + +def serialize(obj): + if isinstance(obj, datetime): + return obj.isoformat() + if isinstance(obj, bytes): + return obj.decode("utf-8", errors="replace") + if isinstance(obj, dict): + return {k: serialize(v) for k, v in obj.items()} + if isinstance(obj, list): + return [serialize(v) for v in obj] + return obj + + +def normalize_mailbox(mailbox: Optional[Union[str, list]]) -> Optional[list[str]]: + if mailbox is None or mailbox == "" or mailbox == []: + return None + if isinstance(mailbox, str): + return [mailbox] + return list(mailbox) + + +def parse_since(s: Optional[str]) -> Optional[datetime]: + if not s: + return None + try: + if "T" in s: + return datetime.fromisoformat(s.replace("Z", "+00:00")) + 
return datetime.strptime(s, "%Y-%m-%d").replace(tzinfo=timezone.utc) + except Exception as e: + raise ValueError(f"Bad date {s!r}: {e}") + + +# --- MCP -------------------------------------------------------------------- +mcp = FastMCP("emaily") + + +@mcp.tool() +def ping() -> dict: + """Quick health check. Reports Mongo + Postgres connectivity, total mailboxes, + PG indexed emails count, ok/error breakdown. + """ + try: + info = mongo.admin.command("buildInfo") + mailboxes = [c for c in mongo[MONGO_DB].list_collection_names() + if c not in SKIP_COLLECTIONS] + mongo_counts = {} + for mb in mailboxes: + mongo_counts[mb] = mongo[MONGO_DB][mb].estimated_document_count() + with pg_conn() as pg, pg.cursor() as cur: + cur.execute("SELECT mailbox, ok, count(*) FROM emails " + "GROUP BY mailbox, ok ORDER BY mailbox, ok") + rows = cur.fetchall() + pg_summary: dict = {} + for mb, ok, c in rows: + pg_summary.setdefault(mb, {})[("ok" if ok else "error")] = c + return { + "status": "ok", + "mongo_version": info.get("version"), + "mailboxes": mailboxes, + "mongo_email_count": mongo_counts, + "pg_indexed_per_mailbox": pg_summary, + } + except Exception as e: + log(traceback.format_exc()) + return {"status": "error", "error": str(e)} + + +@mcp.tool() +def list_mailboxes() -> dict: + """Overview of all mailboxes — totals, indexed coverage, earliest/latest received_at, + top senders by volume. Use to understand the corpus before searching. 
+ """ + out = {} + try: + mailboxes = [c for c in mongo[MONGO_DB].list_collection_names() + if c not in SKIP_COLLECTIONS] + for mb in mailboxes: + with pg_conn() as pg, pg.cursor() as cur: + cur.execute(""" + SELECT count(*) FILTER (WHERE ok) AS ok, + count(*) AS total, + min(received_at) AS first_at, + max(received_at) AS last_at, + count(*) FILTER (WHERE has_attachments) AS with_att + FROM emails WHERE mailbox = %s + """, (mb,)) + ok, total, first_at, last_at, with_att = cur.fetchone() + cur.execute(""" + SELECT sender_email, count(*) c FROM emails + WHERE mailbox = %s AND sender_email IS NOT NULL + GROUP BY sender_email ORDER BY c DESC LIMIT 5 + """, (mb,)) + top_senders = [{"email": s, "count": c} for s, c in cur.fetchall()] + out[mb] = { + "indexed_ok": ok, + "indexed_total": total, + "with_attachments": with_att, + "first_received": serialize(first_at), + "last_received": serialize(last_at), + "top_senders": top_senders, + } + return {"mailboxes": out} + except Exception as e: + log(traceback.format_exc()) + return {"error": str(e)} + + +@mcp.tool() +def search( + query: str, + mailbox: Optional[Union[str, list]] = None, + since: Optional[str] = None, + until: Optional[str] = None, + folder_contains: Optional[str] = None, + sender_contains: Optional[str] = None, + has_attachments: Optional[bool] = None, + limit: int = 20, +) -> dict: + """PRIMARY TOOL — fulltext search across all indexed emails. + + Index includes: subject, sender (email + name), recipients (to/cc), + attachment filenames, AND full body text. + + query: websearch_to_tsquery syntax: + invoice payment -> AND + "lot expiration" -> phrase + SAE OR "serious adverse" -> OR + urgent -newsletter -> exclude + mailbox: one mailbox string or list (e.g. "vbuzalka@its.jnj.com"). None = all. 
+ since/until: ISO date "YYYY-MM-DD" on received_at + folder_contains: substring match against folder_path (case-insensitive) + sender_contains: substring match against sender_email OR sender_name (case-insensitive) + has_attachments: True / False / None (any) + limit: max 100 + + Returns ranked results with `snippet` showing matches highlighted as <<...>>. + Use `read_email` to fetch full body of any hit. + """ + try: + mboxes = normalize_mailbox(mailbox) + since_dt = parse_since(since) + until_dt = parse_since(until) + limit = min(max(1, limit), 100) + + sql = """ + WITH q AS ( + SELECT websearch_to_tsquery('soubory'::regconfig, %(query)s) AS tsq + ) + SELECT + e.id, e.mailbox, e.message_id, e.conversation_id, e.folder_path, + e.subject, e.sender_email, e.sender_name, + e.to_addrs, e.cc_addrs, + e.received_at, e.sent_at, e.is_read, + e.has_attachments, e.attachment_count, e.attachments_summary, + e.body_length, e.body_source, + ts_rank(e.tsv, q.tsq) AS rank, + ts_headline('soubory'::regconfig, + left(coalesce(e.body, e.subject), 200000), + q.tsq, + 'MaxFragments=3, MinWords=4, MaxWords=18, ' + 'StartSel=<<, StopSel=>>, FragmentDelimiter= ... 
') AS snippet + FROM emails e, q + WHERE e.tsv @@ q.tsq + AND e.ok = TRUE + AND (%(mboxes)s::text[] IS NULL OR e.mailbox = ANY(%(mboxes)s::text[])) + AND (%(since)s::timestamptz IS NULL OR e.received_at >= %(since)s::timestamptz) + AND (%(until)s::timestamptz IS NULL OR e.received_at < %(until)s::timestamptz) + AND (%(folder)s::text IS NULL OR e.folder_path ILIKE %(folder_like)s) + AND (%(sender)s::text IS NULL + OR e.sender_email ILIKE %(sender_like)s + OR e.sender_name ILIKE %(sender_like)s) + AND (%(has_att)s::boolean IS NULL OR e.has_attachments = %(has_att)s::boolean) + ORDER BY rank DESC, e.received_at DESC NULLS LAST + LIMIT %(limit)s + """ + params = { + "query": query, "mboxes": mboxes, + "since": since_dt, "until": until_dt, + "folder": folder_contains, + "folder_like": f"%{folder_contains}%" if folder_contains else None, + "sender": sender_contains, + "sender_like": f"%{sender_contains}%" if sender_contains else None, + "has_att": has_attachments, + "limit": limit, + } + with pg_conn() as pg, pg.cursor() as cur: + cur.execute(sql, params) + cols = [c.name for c in cur.description] + rows = [dict(zip(cols, r)) for r in cur.fetchall()] + + results = [] + for r in rows: + results.append({ + "mailbox": r["mailbox"], + "message_id": r["message_id"], + "conversation_id": r["conversation_id"], + "folder": r["folder_path"], + "subject": r["subject"], + "from": (f"{r['sender_name']} <{r['sender_email']}>" + if r["sender_name"] else r["sender_email"]), + "to": r["to_addrs"], + "cc": r["cc_addrs"], + "received_at": serialize(r["received_at"]), + "is_read": r["is_read"], + "has_attachments": r["has_attachments"], + "attachment_count": r["attachment_count"], + "attachments": r["attachments_summary"], + "body_length": r["body_length"], + "body_source": r["body_source"], + "rank": round(float(r["rank"]), 5), + "snippet": (r["snippet"] or "").strip(), + }) + + return { + "query": query, + "filters": {"mailbox": mboxes, "since": since, "until": until, + 
"folder_contains": folder_contains, + "sender_contains": sender_contains, + "has_attachments": has_attachments, + "limit": limit}, + "count": len(results), + "results": results, + "tip": "Use read_email(mailbox=..., message_id=...) for full body or thread.", + } + except Exception as e: + log(traceback.format_exc()) + return {"error": str(e), "query": query} + + +@mcp.tool() +def read_email( + message_id: Optional[str] = None, + mailbox: Optional[str] = None, + offset: int = 0, + length: int = DEFAULT_BODY_CHARS, + around_match: Optional[str] = None, + include_html: bool = False, +) -> dict: + """Read one email — full plain text body + metadata. + + Identify by `message_id` (Internet Message-ID, the _id in Mongo). + `mailbox` narrows the lookup if the same Message-ID appears in multiple mailboxes + (e.g. you got copies in both work and personal accounts). + + offset, length: slice the body. length max 200000. + around_match: case-insensitive substring; returns up to 3 windows of ~1000 chars + centered on matches, instead of a flat slice. + include_html: also return raw body_html from Mongo (typically large — only if you + really need the original markup). 
+ """ + if not message_id: + return {"error": "Provide message_id."} + try: + length = min(max(1, length), MAX_BODY_CHARS) + + sql = """ + SELECT id, mailbox, message_id, graph_id, conversation_id, folder_path, + subject, sender_email, sender_name, + to_addrs, cc_addrs, bcc_addrs, + sent_at, received_at, modified_at, is_read, is_draft, + has_attachments, attachment_count, attachments_summary, + body, body_length, body_source, + extractor_version, extracted_at, ok, error + FROM emails WHERE message_id = %s + """ + params = [message_id] + if mailbox: + sql += " AND mailbox = %s" + params.append(mailbox) + sql += " LIMIT 1" + + with pg_conn() as pg, pg.cursor() as cur: + cur.execute(sql, params) + row = cur.fetchone() + cols = [c.name for c in cur.description] + if not row: + return {"error": "Email not found.", + "message_id": message_id, "mailbox": mailbox} + rec = dict(zip(cols, row)) + + body = rec.get("body") or "" + if around_match and body: + needle = around_match.lower() + hay = body.lower() + windows = [] + start = 0 + while len(windows) < 3: + pos = hay.find(needle, start) + if pos < 0: + break + lo = max(0, pos - 400) + hi = min(len(body), pos + 600) + windows.append({"offset": lo, "text": body[lo:hi]}) + start = pos + len(needle) + body_out = None + slice_info = {"mode": "around_match", "match": around_match, + "windows_found": len(windows), "windows": windows} + else: + body_out = body[offset:offset + length] + slice_info = { + "mode": "slice", "offset": offset, + "length_returned": len(body_out), + "has_more": offset + length < len(body), + "next_offset": offset + length if offset + length < len(body) else None, + } + + out = { + "mailbox": rec["mailbox"], + "message_id": rec["message_id"], + "conversation_id": rec["conversation_id"], + "folder": rec["folder_path"], + "subject": rec["subject"], + "from": (f"{rec['sender_name']} <{rec['sender_email']}>" + if rec["sender_name"] else rec["sender_email"]), + "to": rec["to_addrs"], + "cc": rec["cc_addrs"], + 
"bcc": rec["bcc_addrs"], + "received_at": serialize(rec["received_at"]), + "sent_at": serialize(rec["sent_at"]), + "is_read": rec["is_read"], + "is_draft": rec["is_draft"], + "has_attachments": rec["has_attachments"], + "attachment_count": rec["attachment_count"], + "attachments": rec["attachments_summary"], + "body_length": rec["body_length"], + "body_source": rec["body_source"], + "extractor_version": rec["extractor_version"], + "ok": rec["ok"], + "error": rec["error"], + } + if body_out is not None: + out["body"] = body_out + out["slice"] = slice_info + + if include_html: + mdoc = mongo[MONGO_DB][rec["mailbox"]].find_one( + {"_id": rec["message_id"]}, {"body_html": 1, "attachments": 1}) + if mdoc: + out["body_html"] = mdoc.get("body_html") + out["attachments_detail"] = mdoc.get("attachments") + return out + except Exception as e: + log(traceback.format_exc()) + return {"error": str(e)} + + +@mcp.tool() +def by_sender( + sender: str, + mailbox: Optional[Union[str, list]] = None, + since: Optional[str] = None, + has_attachments: Optional[bool] = None, + limit: int = 30, +) -> dict: + """List emails from a specific sender (substring match on sender_email or sender_name, + case-insensitive). Use for "what did X send me" or "all newsletters from Y". + + Returned sorted by received_at DESC. 
+ """ + try: + mboxes = normalize_mailbox(mailbox) + since_dt = parse_since(since) + limit = min(max(1, limit), 200) + sql = """ + SELECT mailbox, message_id, subject, sender_email, sender_name, + to_addrs, folder_path, received_at, has_attachments, attachment_count, + attachments_summary, body_length + FROM emails + WHERE ok = TRUE + AND (sender_email ILIKE %(s)s OR sender_name ILIKE %(s)s) + AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[])) + AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz) + AND (%(has_att)s::boolean IS NULL OR has_attachments = %(has_att)s::boolean) + ORDER BY received_at DESC NULLS LAST + LIMIT %(limit)s + """ + with pg_conn() as pg, pg.cursor() as cur: + cur.execute(sql, {"s": f"%{sender}%", "mboxes": mboxes, + "since": since_dt, "has_att": has_attachments, + "limit": limit}) + cols = [c.name for c in cur.description] + rows = [dict(zip(cols, r)) for r in cur.fetchall()] + for r in rows: + r["received_at"] = serialize(r["received_at"]) + return {"sender_match": sender, "count": len(rows), "results": rows} + except Exception as e: + log(traceback.format_exc()) + return {"error": str(e)} + + +@mcp.tool() +def recent_emails( + mailbox: Optional[Union[str, list]] = None, + days: int = 7, + folder_contains: Optional[str] = None, + has_attachments: Optional[bool] = None, + limit: int = 30, +) -> dict: + """List recent emails (by received_at). Use for "what came in today/this week". + days=0 to ignore time window (just top-N newest). 
+ """ + try: + mboxes = normalize_mailbox(mailbox) + limit = min(max(1, limit), 200) + since_dt = None + if days and days > 0: + since_dt = datetime.now(timezone.utc) - timedelta(days=days) + sql = """ + SELECT mailbox, message_id, subject, sender_email, sender_name, + folder_path, received_at, has_attachments, attachment_count, + attachments_summary, body_length, is_read + FROM emails + WHERE ok = TRUE + AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[])) + AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz) + AND (%(folder)s::text IS NULL OR folder_path ILIKE %(folder_like)s) + AND (%(has_att)s::boolean IS NULL OR has_attachments = %(has_att)s::boolean) + ORDER BY received_at DESC NULLS LAST + LIMIT %(limit)s + """ + with pg_conn() as pg, pg.cursor() as cur: + cur.execute(sql, { + "mboxes": mboxes, "since": since_dt, + "folder": folder_contains, + "folder_like": f"%{folder_contains}%" if folder_contains else None, + "has_att": has_attachments, "limit": limit, + }) + cols = [c.name for c in cur.description] + rows = [dict(zip(cols, r)) for r in cur.fetchall()] + for r in rows: + r["received_at"] = serialize(r["received_at"]) + return {"days": days, "count": len(rows), "results": rows} + except Exception as e: + log(traceback.format_exc()) + return {"error": str(e)} + + +@mcp.tool() +def conversation_thread(conversation_id: str, limit: int = 50) -> dict: + """Return all emails in one Outlook conversation thread (conversation_id from Graph). + Ordered chronologically. Use to see the full back-and-forth on a topic. 
+ """ + try: + limit = min(max(1, limit), 200) + with pg_conn() as pg, pg.cursor() as cur: + cur.execute(""" + SELECT mailbox, message_id, subject, sender_email, sender_name, + to_addrs, received_at, folder_path, body_length, has_attachments, + attachments_summary + FROM emails + WHERE conversation_id = %s AND ok = TRUE + ORDER BY received_at ASC NULLS LAST + LIMIT %s + """, (conversation_id, limit)) + cols = [c.name for c in cur.description] + rows = [dict(zip(cols, r)) for r in cur.fetchall()] + for r in rows: + r["received_at"] = serialize(r["received_at"]) + return {"conversation_id": conversation_id, "count": len(rows), "thread": rows} + except Exception as e: + log(traceback.format_exc()) + return {"error": str(e)} + + +@mcp.tool() +def find_attachment( + name_contains: str, + mailbox: Optional[Union[str, list]] = None, + since: Optional[str] = None, + limit: int = 30, +) -> dict: + """Find emails whose attachment filename contains the substring (case-insensitive). + Use for "find emails with that protocol PDF" or "any invoice attachments". + Returns emails ordered by received_at DESC. 
+ """ + try: + mboxes = normalize_mailbox(mailbox) + since_dt = parse_since(since) + limit = min(max(1, limit), 200) + sql = """ + SELECT mailbox, message_id, subject, sender_email, sender_name, + received_at, attachment_count, attachments_summary, folder_path + FROM emails + WHERE ok = TRUE + AND has_attachments = TRUE + AND attachments_summary ILIKE %(s)s + AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[])) + AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz) + ORDER BY received_at DESC NULLS LAST + LIMIT %(limit)s + """ + with pg_conn() as pg, pg.cursor() as cur: + cur.execute(sql, {"s": f"%{name_contains}%", + "mboxes": mboxes, "since": since_dt, "limit": limit}) + cols = [c.name for c in cur.description] + rows = [dict(zip(cols, r)) for r in cur.fetchall()] + for r in rows: + r["received_at"] = serialize(r["received_at"]) + return {"name_match": name_contains, "count": len(rows), "results": rows} + except Exception as e: + log(traceback.format_exc()) + return {"error": str(e)} + + +@mcp.tool() +def top_senders( + mailbox: Optional[Union[str, list]] = None, + since: Optional[str] = None, + limit: int = 20, +) -> dict: + """Top senders by volume (count of received emails). Optionally limit by mailbox or date window. + Use for "who emails me most" or "top senders this month". 
+ """ + try: + mboxes = normalize_mailbox(mailbox) + since_dt = parse_since(since) + limit = min(max(1, limit), 100) + sql = """ + SELECT sender_email, count(*) AS c, max(received_at) AS last_at + FROM emails + WHERE ok = TRUE AND sender_email IS NOT NULL + AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[])) + AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz) + GROUP BY sender_email + ORDER BY c DESC + LIMIT %(limit)s + """ + with pg_conn() as pg, pg.cursor() as cur: + cur.execute(sql, {"mboxes": mboxes, "since": since_dt, "limit": limit}) + rows = [{"sender_email": s, "count": c, "last_at": serialize(t)} + for s, c, t in cur.fetchall()] + return {"count": len(rows), "results": rows} + except Exception as e: + log(traceback.format_exc()) + return {"error": str(e)} + + +@mcp.tool() +def pipeline_status(mailbox: Optional[Union[str, list]] = None) -> dict: + """End-to-end status of the email-ingest pipeline per mailbox. + + Reports, for each mailbox, where it stands in the 5-step pipeline: + 1. parse_emails_graph -> mongo_total + 2. (refetch text bodies) -> body_text_missing (legacy v1.3 emails) + 3. download_attachments -> attach_done / attach_pending + attach_missing (404 — marked, won't retry) + attach_reference (OneDrive/SharePoint link, no content) + 4. unwrap_smime -> smime_p7m_total / smime_unwrapped / smime_pending + smime_p7s_count (informational; not unwrapped by design) + 5. enrich_fulltext -> pg_indexed + + Plus: + - permanently_deleted (marked by delta sync) + + Use this instead of running multiple Mongo count queries by hand. Returns + one row per mailbox; if `mailbox` is given, returns just those rows. 
+ """ + try: + mbs = normalize_mailbox(mailbox) + all_mb = [c for c in mongo[MONGO_DB].list_collection_names() + if c not in SKIP_COLLECTIONS] + targets = [m for m in all_mb if (mbs is None or m in mbs)] + + # PG counts in one pass + pg_counts: dict[str, int] = {} + with pg_conn() as pg, pg.cursor() as cur: + cur.execute("SELECT mailbox, count(*) FROM emails " + "WHERE ok = true GROUP BY mailbox") + for mb, c in cur.fetchall(): + pg_counts[mb] = c + + out = {} + for mb in targets: + col = mongo[MONGO_DB][mb] + mongo_total = col.estimated_document_count() + with_att = col.count_documents({"has_attachments": True}) + attach_pending = col.count_documents({ + "has_attachments": True, + "attachments": {"$elemMatch": { + "is_inline": False, + "file_hash": {"$exists": False}, + "attachment_missing": {"$ne": True}, + "attachment_reference": {"$ne": True}, + }}, + }) + attach_missing = col.count_documents({ + "attachments.attachment_missing": True, + }) + attach_reference = col.count_documents({ + "attachments.attachment_reference": True, + }) + attach_done = with_att - attach_pending - attach_missing - attach_reference + + smime_p7m_total = col.count_documents( + {"attachments.filename": {"$regex": r"^smime\.p7m$", "$options": "i"}} + ) + smime_unwrapped = col.count_documents({ + "attachments.filename": {"$regex": r"^smime\.p7m$", "$options": "i"}, + "smime_unwrapped": True, + }) + smime_p7s_count = col.count_documents( + {"attachments.filename": {"$regex": r"^smime\.p7s$", "$options": "i"}} + ) + + body_text_missing = col.count_documents({ + "body_html": {"$in": [None, ""]}, + "body_text": {"$exists": False}, + "graph_id": {"$exists": True}, + }) + + permanently_deleted = col.count_documents({"permanently_deleted": True}) + + out[mb] = { + "mongo_total": mongo_total, + "with_attachments": with_att, + "attach_done": attach_done, + "attach_pending": attach_pending, + "attach_missing": attach_missing, + "attach_reference": attach_reference, + "smime_p7m_total": 
smime_p7m_total, + "smime_unwrapped": smime_unwrapped, + "smime_pending": smime_p7m_total - smime_unwrapped, + "smime_p7s_count": smime_p7s_count, + "body_text_missing": body_text_missing, + "pg_indexed": pg_counts.get(mb, 0), + "permanently_deleted": permanently_deleted, + } + return {"mailboxes": out} + except Exception as e: + log(traceback.format_exc()) + return {"error": str(e)} + + +@mcp.tool() +def sync_state_overview(mailbox: Optional[Union[str, list]] = None) -> dict: + """Delta-sync state across mailboxes (collection `emaily.sync_state`). + + For each (mailbox, folder) pair shows: deltaLink present?, last_run_at, + cumulative new/sync/removed/run_count. Use to confirm a mailbox is + incrementally synced and to spot folders that haven't run in a while. + """ + try: + sync_col = mongo[MONGO_DB]["sync_state"] + q: dict = {} + mbs = normalize_mailbox(mailbox) + if mbs: + q["mailbox"] = {"$in": mbs} + cursor = sync_col.find(q, { + "mailbox": 1, "folder_path": 1, "folder_id": 1, + "delta_link": 1, "last_run_at": 1, + "cumulative_new": 1, "cumulative_sync": 1, + "cumulative_removed": 1, "run_count": 1, + }).sort([("mailbox", 1), ("folder_path", 1)]) + + by_mailbox: dict[str, list] = {} + for d in cursor: + row = { + "folder_path": d.get("folder_path"), + "folder_id": d.get("folder_id"), + "has_delta_link": bool(d.get("delta_link")), + "last_run_at": serialize(d.get("last_run_at")), + "cumulative_new": d.get("cumulative_new", 0), + "cumulative_sync": d.get("cumulative_sync", 0), + "cumulative_removed": d.get("cumulative_removed", 0), + "run_count": d.get("run_count", 0), + } + by_mailbox.setdefault(d["mailbox"], []).append(row) + + # mailboxes that have collections but ZERO sync_state entries + all_mb = {c for c in mongo[MONGO_DB].list_collection_names() + if c not in SKIP_COLLECTIONS} + not_synced = sorted(all_mb - set(by_mailbox.keys())) + if mbs: + not_synced = [m for m in not_synced if m in mbs] + return { + "mailboxes": by_mailbox, + "never_delta_synced": 
not_synced, + } + except Exception as e: + log(traceback.format_exc()) + return {"error": str(e)} + + +if __name__ == "__main__": + log("MCP emaily server started (FastMCP)") + mcp.run() diff --git a/Python-runner/0_run_pipeline_v1.0.md b/Python-runner/0_run_pipeline_v1.0.md new file mode 100644 index 0000000..5f45d12 --- /dev/null +++ b/Python-runner/0_run_pipeline_v1.0.md @@ -0,0 +1,77 @@ +# 0_run_pipeline_v1.0.py + +**Wrapper kolem celé emailové pipeline.** Spustí postupně všechny 4 kroky daily syncu, vždy přes všechny dostupné schránky: + +| # | Krok | Skript | +|---|---|---| +| 1b | Graph delta sync (inkrementální Mongo update) | `1b_parse_emails_graph_delta_v1.0.py` | +| 3 | Download attachments | `3_download_attachments_v1.4.py` | +| 4 | Unwrap S/MIME | `4_unwrap_smime_v1.0.py` | +| 5 | Enrich fulltext (PG) | `5_enrich_fulltext_emails_v1.3.py` | + +## Politika chyb + +Default je **continue-on-error** — když některý krok selže, pipeline pokračuje dalším (downstream se nezasekne kvůli minor problému). Po vyběhnutí dostaneš souhrnnou tabulku s `OK / FAIL(N)` per krok. + +Použij `--stop-on-error` pokud chceš tvrdou abort při první chybě. + +## Logování + +Každý krok jde do vlastního logu v `/scripts/pipeline_.log`: +- `pipeline_1b.log` +- `pipeline_3.log` +- `pipeline_4.log` +- `pipeline_5.log` + +Live výstup se zároveň tee-uje na konzoli (vypneš přes `--quiet`). + +## Argumenty + +| Argument | Hodnoty | Popis | +|---|---|---| +| `--only` | `1b 3 4 5` | Spustit jen tyto kroky | +| `--skip` | `1b 3 4 5` | Přeskočit tyto kroky | +| `--stop-on-error` | flag | Zastavit při první chybě (default: pokračovat) | +| `--quiet` | flag | Necpat stdout na konzoli (zůstane v logu) | + +## Varianty volání + +```bash +# Daily run — vše, všechny schránky: +docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py + +# Jen enrich (např. 
po manuálním zásahu do Mongo): +docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py --only 5 + +# Vše mimo S/MIME (krok 4 občas vyžaduje pip install asn1crypto): +docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py --skip 4 + +# Test daily sync bez fulltextu: +docker exec -it python-runner python /scripts/0_run_pipeline_v1.0.py --only 1b 3 4 + +# Na pozadí, master log: +docker exec -d python-runner bash -c "python /scripts/0_run_pipeline_v1.0.py --quiet > /scripts/pipeline_master.log 2>&1" +docker exec -it python-runner tail -f /scripts/pipeline_master.log +``` + +## Cron / nightly automation + +Pro nightly se hodí jednoduchý cron na Unraidu (`/etc/cron.daily/` nebo User Scripts plugin): + +```bash +#!/bin/bash +docker exec python-runner python /scripts/0_run_pipeline_v1.0.py --quiet \ + > /mnt/user/Scripts/pipeline_$(date +%Y%m%d).log 2>&1 +``` + +Stačí denně, delta sync z minulého stavu trvá ~30s s prázdným backlogem. + +## Exit kódy wrapperu + +| Kód | Význam | +|---|---| +| 0 | Všechny kroky OK | +| 1 | Alespoň jeden krok selhal | +| 2 | Žádný krok k běhu (--only + --skip vyloučily vše) | +| 127 | Některý skript neexistuje v `/scripts/` | +| 130 | Přerušeno Ctrl+C | diff --git a/Python-runner/0_run_pipeline_v1.0.py b/Python-runner/0_run_pipeline_v1.0.py new file mode 100644 index 0000000..eec6988 --- /dev/null +++ b/Python-runner/0_run_pipeline_v1.0.py @@ -0,0 +1,176 @@ +""" +============================================================================== +Skript: 0_run_pipeline_v1.0.py +Verze: 1.0 +Datum: 2026-06-04 +Autor: vladimir.buzalka + +Popis: + Wrapper kolem cele emailove pipeline. Spousti postupne: + 1b. parse_emails_graph_delta -> delta sync z Graph API do Mongo + 3. download_attachments -> stahne pripojeny soubory + 4. unwrap_smime -> rozbali S/MIME wrapper zpravy + 5. enrich_fulltext_emails -> doindexuje do PG fulltext + + Vzdy projizdi VSECHNY schranky (mimo SKIP_MAILBOXES v jednotlivych skriptech). 
SCRIPTS_DIR = Path("/scripts")
LOGS_DIR = SCRIPTS_DIR  # all per-step logs are written next to the scripts

# Pipeline definition in execution order: (step_id, label, executable filename).
STEPS = [
    ("1b", "Graph delta sync", "1b_parse_emails_graph_delta_v1.0.py"),
    ("3", "Download attachments", "3_download_attachments_v1.4.py"),
    ("4", "Unwrap S/MIME", "4_unwrap_smime_v1.0.py"),
    ("5", "Enrich fulltext (PG)", "5_enrich_fulltext_emails_v1.3.py"),
]


def fmt_dur(s: float) -> str:
    """Format a duration in seconds as a short human-readable string.

    Durations under one minute keep one decimal place (``"12.3s"``); longer
    ones are truncated to whole seconds and rendered as ``"3m07s"`` or
    ``"1h02m05s"``.
    """
    if s < 60:
        return f"{s:.1f}s"
    m, s = divmod(int(s), 60)
    if m < 60:
        return f"{m}m{s:02d}s"
    h, m = divmod(m, 60)
    return f"{h}h{m:02d}m{s:02d}s"


def run_step(step_id: str, label: str, script: str, *,
             quiet: bool = False) -> tuple[int, float]:
    """Run one pipeline script as a child process and tee its output.

    The child's stdout and stderr are merged, written line by line to
    ``LOGS_DIR/pipeline_<step_id>.log`` and, unless ``quiet`` is set, echoed
    to the console as well.

    Args:
        step_id: Short step identifier; used in the banner and log file name.
        label: Human-readable step name shown in the banner.
        script: File name of the script, resolved relative to ``SCRIPTS_DIR``.
        quiet: When true, output goes to the log file only.

    Returns:
        ``(exit_code, duration_seconds)``. A missing script yields
        ``(127, 0.0)`` without starting any process.
    """
    script_path = SCRIPTS_DIR / script
    log_path = LOGS_DIR / f"pipeline_{step_id}.log"

    if not script_path.exists():
        print(f" CHYBA: {script_path} neexistuje!")
        return 127, 0.0

    banner = "=" * 70
    print(f"\n{banner}")
    print(f" KROK {step_id}: {label}")
    print(f" script: {script_path}")
    print(f" log: {log_path}")
    print(f" start: {datetime.now().strftime('%H:%M:%S')}")
    print(banner)

    t0 = time.time()

    # Tee: write to the log and (unless --quiet) to the console at the same time.
    # Popen is used as a context manager so the pipe is closed and the child
    # reaped on every exit path.
    with open(log_path, "w", encoding="utf-8") as logf, subprocess.Popen(
        [sys.executable, str(script_path)],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
        encoding="utf-8",
        errors="replace",
    ) as proc:
        try:
            for line in proc.stdout:
                logf.write(line)
                if not quiet:
                    print(line, end="", flush=True)
        except BaseException:
            # Interrupted (Ctrl-C) or the log write failed: do not leave the
            # step running in the background after the wrapper gives up.
            proc.kill()
            raise
        ret = proc.wait()

    dur = time.time() - t0
    print(f"\n KROK {step_id} {'OK' if ret == 0 else f'FAILED ({ret})'} za {fmt_dur(dur)}")
    return ret, dur


def main() -> int:
    """Parse the command line, run the selected steps and print a summary.

    Returns:
        ``0`` when every executed step exited with code 0, ``1`` when at
        least one step failed, ``2`` when the filters left nothing to run.
    """
    ap = argparse.ArgumentParser(description="Email pipeline wrapper v1.0")
    ap.add_argument("--only", nargs="+", default=None,
                    help="Spustit jen tyto kroky (napr. --only 3 4 5)")
    ap.add_argument("--skip", nargs="+", default=None,
                    help="Preskocit tyto kroky")
    ap.add_argument("--stop-on-error", action="store_true",
                    help="Zastavit pipeline pri prvni nenulovem exit kodu")
    ap.add_argument("--quiet", action="store_true",
                    help="Necpat stdout kroku na konzoli, jen do logu")
    args = ap.parse_args()

    # Filter step set
    only_set = set(args.only) if args.only else None
    skip_set = set(args.skip) if args.skip else set()

    # A mistyped id (e.g. "--skip 1" instead of "1b") would otherwise be
    # ignored silently and the step would run anyway, so report it.
    known_ids = [sid for sid, _, _ in STEPS]
    unknown = sorted(((only_set or set()) | skip_set) - set(known_ids))
    if unknown:
        print(f"VAROVANI: nezname kroky: {', '.join(unknown)} "
              f"(platne: {', '.join(known_ids)})")

    to_run = [
        (sid, label, script)
        for sid, label, script in STEPS
        if (only_set is None or sid in only_set) and sid not in skip_set
    ]

    if not to_run:
        print("Zadny krok k spusteni.")
        return 2

    print("=== Email Pipeline Wrapper v1.0 ===")
    print(f"Start: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Kroku k spusteni: {len(to_run)}")
    for sid, label, _ in to_run:
        print(f" {sid}: {label}")
    if args.stop_on_error:
        print("Politika: stop-on-error")
    else:
        print("Politika: continue-on-error (default)")

    t_all = time.time()
    results = []

    for sid, label, script in to_run:
        ret, dur = run_step(sid, label, script, quiet=args.quiet)
        results.append((sid, label, ret, dur))
        if ret != 0 and args.stop_on_error:
            print(f"\n!!! Pipeline zastavena na kroku {sid} (--stop-on-error)")
            break

    total_dur = time.time() - t_all

    banner = "=" * 70
    print(f"\n{banner}")
    print("=== SHRNUTI PIPELINE ===")
    print(banner)
    failed = 0
    for sid, label, ret, dur in results:
        status = "OK" if ret == 0 else f"FAIL({ret})"
        if ret != 0:
            failed += 1
        print(f" [{sid:>2}] {label:30} {status:>8} {fmt_dur(dur):>10}")
    print(banner)
    print(f" Celkem: {len(results)} kroku, {failed} chyb, {fmt_dur(total_dur)}")
    print(f" Konec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    # Point at the file name pattern run_step() actually writes.
    print(f" Per-krok logy: {LOGS_DIR}/pipeline_<krok>.log")

    return 1 if failed else 0


if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
        sys.exit(130)
`Inbox`) | +| `--limit N` | ne | int | 0 (bez limitu) | Zpracuje jen prvních N zpráv (test) | +| `--no-indexes` | ne | flag | false | Nevytváří indexy na konci | + +## Varianty volání + +```bash +# První plný import schránky (vše): +docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz + +# Test na 50 zprávách bez vytváření indexů: +docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes + +# Jen nové emaily (po prvním importu): +docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode new-only + +# Pravidelný sync (nové + aktualizace flagů u existujících) na pozadí, log do souboru: +docker exec -d python-runner bash -c "python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode sync > /scripts/parse_emails.log 2>&1" + +# Import jen složky Inbox: +docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --folder Inbox + +# Test 10 emailů z konkrétní složky: +docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --folder "Sent Items" --limit 10 +``` + +## Sledování průběhu + +```bash +docker exec -it python-runner tail -f /scripts/parse_emails.log +``` diff --git a/Python-runner/1_parse_emails_graph_v1.4.py b/Python-runner/1_parse_emails_graph_v1.4.py new file mode 100644 index 0000000..6836f9a --- /dev/null +++ b/Python-runner/1_parse_emails_graph_v1.4.py @@ -0,0 +1,624 @@ +""" +parse_emails_graph_v1.4.py +Nazev: parse_emails_graph_v1.4.py +Verze: 1.4 +Datum: 2026-06-03 +Autor: vladimir.buzalka + +Popis: + Cte vsechny emaily z libovolne schranky primo pres Microsoft Graph API + a importuje je jako dokumenty do MongoDB. 
+ Ze kazde zpravy extrahuje vsechny dostupne vlastnosti: + + - predmet, odesilatel, prijemci (To/CC/BCC s typy) + - cas doruceni, odeslani, vytvoreni, modifikace (UTC) + - telo HTML (max 2 MB) + textovy preview + - prilohy (metadata: jmeno, velikost, MIME typ, inline flag, graph_att_id) + - internet headers (SPF, DKIM, Received, X-*, ...) + - MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno, + kategorie, In-Reply-To, References, ... + - navic: isRead, isDraft, folder_path, inferenceClassification + + Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted, + archivni slozky, ...). + + DB: emaily + Kolekce: <mailbox> (napr. ordinace@buzalkova.cz) + _id: Internet Message-ID (nebo "graphid:<graph_id>" jako fallback) + + POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky! + +Spousteni: + # Prvni import (vsechno): + python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz + + # Test na prvnich 50: + python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes + + # Jen jedna slozka: + python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --folder Inbox + + # Pokracovani po preruseni (pouze nove): + python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode new-only + + # Pravidelny sync (aktualizuje is_read, flag, slozku; importuje nove): + python 1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode sync + + # Jina schranka: + python 1_parse_emails_graph_v1.4.py --mailbox vladimir.buzalka@buzalka.cz + +Rezimy (--mode): + full Plny upsert vsech poli pro kazdou zpravu (vychozi) + new-only Preskoci zpravy ktere uz jsou v MongoDB, importuje jen nove + sync Existujici: aktualizuje jen is_read/flag_status/categories/ + modified_at/folder_path. Nove zpravy importuje cely. + Idealni pro pravidelne spousteni. 
+ +Zavislosti: + msal, requests, pymongo, python-dateutil + Python 3.10+ + +Struktura dokumentu v MongoDB: + _id Internet Message-ID (nebo graphid: fallback) + graph_id Graph API message ID + subject predmet zpravy + normalized_subject predmet bez RE:/FW:/AW: prefixu + importance 0=nizka 1=normalni 2=vysoka + flag_status 0=bez priznaku 1=oznaceno 2=dokonceno + is_read bool — aktualni stav precteni ve schrance + is_draft bool + has_attachments bool + attachment_count int + inference_classification focused / other + categories [str] + conversation_id Graph conversationId + conversation_index base64 conversationIndex + conversation_topic tema vlakna (z internet headers Thread-Topic) + in_reply_to Message-ID predchozi zpravy + internet_references [Message-ID] + received_at datetime UTC + sent_at datetime UTC + created_at datetime UTC + modified_at datetime UTC + folder_id Graph parentFolderId + folder_path cela cesta slozky (napr. Inbox/Subfolder) + sender.email emailova adresa odesilatele + sender.name zobrazovane jmeno + to retezec To (joined) + cc retezec CC + bcc retezec BCC + recipients [{type, email, name}] + body_html HTML telo (pokud contentType=='html', max 2 MB) + body_text plain-text telo (pokud contentType=='text', max 2 MB) + body_preview textovy nahled z Graph bodyPreview (max 255 znaku) + attachments [{filename, size_bytes, mime_type, is_inline, graph_att_id}] + headers dict internet headers + parsed_at datetime UTC + +Indexy: + received_at, sent_at, sender.email, graph_id (unique), + conversation_id, folder_path, has_attachments, categories, + importance, flag_status, is_read, + text_search (subject + body_preview + to + cc) + +Historie verzi: + 1.0 2026-06-02 Inicialni verze + 1.1 2026-06-02 Pridany rezimy --mode full/new-only/sync; + odstranen --skip-existing (nahrazen --mode new-only) + 1.2 2026-06-02 $expand attachments s $select (bez contentBytes — rychlejsi); + prilohy ukladaji graph_att_id pro prime stazeni bez name-matchingu + 1.3 2026-06-02 
--mailbox jako povinny parametr — univerzalni pouziti pro + libovolnou schranku; kolekce v MongoDB = nazev schranky + 1.4 2026-06-03 Plain-text emaily (contentType=='text') se ukladaji do + noveho pole body_text (max 2 MB), drive se truncovalo na + 2000 znaku do body_preview a zbytek se zahazoval. + body_preview ted obsahuje vzdy puvodni Graph bodyPreview. + Pro existujici emaily z v1.3 lze pouzit + refetch_text_bodies_v1.0.py. +""" + +import sys +import re +import logging +import argparse +import base64 +from pathlib import Path +from datetime import datetime, timezone +from typing import Optional + +import msal +import requests +from dateutil import parser as dtparser +from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT + +if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(encoding="utf-8", errors="replace") + +# ─── KONFIGURACE ────────────────────────────────────────────────────────────── +GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9" +GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f" +GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk" +GRAPH_URL = "https://graph.microsoft.com/v1.0" + +MONGO_URI = "mongodb://192.168.1.76:27017" +MONGO_DB = "emaily" +BATCH_SIZE = 100 +PAGE_SIZE = 50 +LOG_FILE = Path(__file__).parent / "parse_emails_errors.log" +SCRIPT_VERSION = "1.4" + +# Schránka se nastavuje za behu z --mailbox parametru +GRAPH_MAILBOX: str = "" +# ────────────────────────────────────────────────────────────────────────────── + +logging.basicConfig( + filename=str(LOG_FILE), + level=logging.ERROR, + format="%(asctime)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + encoding="utf-8", +) + +IMPORTANCE_MAP = {"low": 0, "normal": 1, "high": 2} +FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2} +RE_SUBJECT = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE) + +# $expand prilohy bez contentBytes — jen metadata co potrebujeme +ATT_EXPAND = 
"attachments($select=id,name,contentType,size,isInline)" + +MSG_SELECT = ( + "id,internetMessageId,subject,bodyPreview,body," + "importance,isRead,isDraft,hasAttachments," + "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime," + "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo," + "conversationId,conversationIndex,parentFolderId," + "categories,flag,inferenceClassification,internetMessageHeaders" +) + +MSG_SELECT_SYNC = ( + "id,internetMessageId,isRead,isDraft,flag,categories," + "lastModifiedDateTime,parentFolderId,importance" +) + + +# ─── Graph API helpers ──────────────────────────────────────────────────────── + +_graph_token: Optional[str] = None + + +def get_token() -> str: + global _graph_token + app = msal.ConfidentialClientApplication( + GRAPH_CLIENT_ID, + authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}", + client_credential=GRAPH_CLIENT_SECRET, + ) + result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"]) + if "access_token" not in result: + raise RuntimeError(f"Graph auth failed: {result}") + _graph_token = result["access_token"] + return _graph_token + + +def graph_get(url: str, params: dict = None) -> dict: + global _graph_token + if not _graph_token: + get_token() + for attempt in range(2): + r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30) + if r.status_code == 401: + get_token() + continue + r.raise_for_status() + return r.json() + raise RuntimeError(f"Graph GET failed after retry: {url}") + + +def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]: + """Rekurzivne nacte vsechny slozky schranky. 
Vraci [{id, path}].""" + if parent_id is None: + url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders" + else: + url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{parent_id}/childFolders" + + folders = [] + params = {"$top": 100, "$select": "id,displayName,childFolderCount"} + while url: + data = graph_get(url, params) + for f in data.get("value", []): + path = f"{parent_path}/{f['displayName']}".lstrip("/") + folders.append({"id": f["id"], "path": path}) + if f.get("childFolderCount", 0) > 0: + folders.extend(get_all_folders(f["id"], path)) + url = data.get("@odata.nextLink") + params = None + return folders + + +def iter_folder_messages(folder_id: str, select: str = MSG_SELECT, expand_attachments: bool = True): + """Generator: vraci zpravy ze slozky po strankach.""" + url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages" + params = {"$top": PAGE_SIZE, "$select": select} + if expand_attachments: + params["$expand"] = ATT_EXPAND + while url: + data = graph_get(url, params) + for msg in data.get("value", []): + yield msg + url = data.get("@odata.nextLink") + params = None + + +# ─── Pomocné funkce ─────────────────────────────────────────────────────────── + +def parse_date(raw) -> Optional[datetime]: + if raw is None: + return None + if isinstance(raw, datetime): + if raw.tzinfo: + return raw.astimezone(timezone.utc).replace(tzinfo=None) + return raw + try: + dt = dtparser.parse(str(raw)) + if dt.tzinfo: + return dt.astimezone(timezone.utc).replace(tzinfo=None) + return dt + except Exception: + return None + + +def normalize_subject(subject: str) -> str: + s = subject.strip() + while True: + m = RE_SUBJECT.match(s) + if not m: + break + s = s[m.end():].strip() + return s + + +def parse_headers(raw_headers: list) -> dict: + result = {} + for h in raw_headers: + k = h["name"].lower().replace("-", "_") + v = h["value"] + if k in result: + existing = result[k] + result[k] = existing + [v] if isinstance(existing, list) else [existing, v] + 
else: + result[k] = v + return result + + +def format_recipients(lst: list) -> str: + return "; ".join( + f'{r["emailAddress"].get("name", "")} <{r["emailAddress"].get("address", "")}>'.strip() + for r in lst + ) + + +# ─── Extrakce zprávy ───────────────────────────────────────────────────────── + +def extract_message(msg: dict, folder_path: str) -> Optional[dict]: + """Plna extrakce — pouziva se pro mode full a nove zpravy v sync/new-only.""" + try: + mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}" + subject = msg.get("subject") or "" + + body_html = None + body_text = None + body_preview = msg.get("bodyPreview") or "" + body = msg.get("body", {}) + _MAX_BODY = 2 * 1024 * 1024 # 2 MB + if body.get("contentType") == "html": + content = body.get("content") or "" + body_html = content if len(content) <= _MAX_BODY else content[:_MAX_BODY] + elif body.get("contentType") == "text": + content = body.get("content") or "" + # v1.4: ulozime PLNY plain text do body_text (drive se truncovalo na 2000 znaku + # do body_preview a zbytek se zahodil) + body_text = content if len(content) <= _MAX_BODY else content[:_MAX_BODY] + + sender_ea = (msg.get("from") or msg.get("sender") or {}).get("emailAddress", {}) + to_list = msg.get("toRecipients", []) + cc_list = msg.get("ccRecipients", []) + bcc_list = msg.get("bccRecipients", []) + + recipients = ( + [{"type": "to", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in to_list] + + [{"type": "cc", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in cc_list] + + [{"type": "bcc", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in bcc_list] + ) + + importance = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1) + flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0) + + raw_headers = msg.get("internetMessageHeaders") or [] + 
headers = parse_headers(raw_headers) + + in_reply_to = headers.get("in_reply_to", "") + if isinstance(in_reply_to, list): + in_reply_to = in_reply_to[0] + + refs_raw = headers.get("references", "") + if isinstance(refs_raw, list): + refs_raw = " ".join(refs_raw) + internet_refs = [r.strip() for r in refs_raw.split() if r.strip()] if refs_raw else [] + + conv_topic = headers.get("thread_topic", "") + if isinstance(conv_topic, list): + conv_topic = conv_topic[0] + + conv_index = "" + ci_raw = msg.get("conversationIndex") + if ci_raw: + try: + conv_index = base64.b64encode(base64.b64decode(ci_raw)).decode() + except Exception: + conv_index = ci_raw + + attachments = [] + for att in msg.get("attachments") or []: + fname = att.get("name") or "" + if not fname: + continue + attachments.append({ + "filename": fname, + "size_bytes": att.get("size", 0), + "mime_type": att.get("contentType", "application/octet-stream"), + "is_inline": att.get("isInline", False), + "graph_att_id": att.get("id"), + }) + + return { + "_id": mid, + "graph_id": msg["id"], + + "subject": subject, + "normalized_subject": normalize_subject(subject), + "importance": importance, + "flag_status": flag_status, + "is_read": msg.get("isRead", False), + "is_draft": msg.get("isDraft", False), + "has_attachments": msg.get("hasAttachments", False), + "attachment_count": len(attachments), + "inference_classification": msg.get("inferenceClassification", ""), + "categories": msg.get("categories") or [], + + "conversation_id": msg.get("conversationId", ""), + "conversation_index": conv_index, + "conversation_topic": conv_topic, + "in_reply_to": in_reply_to, + "internet_references": internet_refs, + + "received_at": parse_date(msg.get("receivedDateTime")), + "sent_at": parse_date(msg.get("sentDateTime")), + "created_at": parse_date(msg.get("createdDateTime")), + "modified_at": parse_date(msg.get("lastModifiedDateTime")), + + "folder_id": msg.get("parentFolderId", ""), + "folder_path": folder_path, + + "sender": { 
+ "email": sender_ea.get("address", ""), + "name": sender_ea.get("name", ""), + }, + "to": format_recipients(to_list), + "cc": format_recipients(cc_list), + "bcc": format_recipients(bcc_list), + "recipients": recipients, + + "body_html": body_html, + "body_text": body_text, + "body_preview": body_preview, + + "attachments": attachments, + "headers": headers, + + "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None), + } + + except Exception as e: + logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e) + return None + + +def extract_sync_fields(msg: dict, folder_path: str) -> dict: + """Jen menitelna pole — pouziva se v sync mode pro existujici zpravy.""" + return { + "is_read": msg.get("isRead", False), + "is_draft": msg.get("isDraft", False), + "flag_status": FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0), + "importance": IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1), + "categories": msg.get("categories") or [], + "modified_at": parse_date(msg.get("lastModifiedDateTime")), + "folder_id": msg.get("parentFolderId", ""), + "folder_path": folder_path, + "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None), + } + + +# ─── MongoDB indexy ─────────────────────────────────────────────────────────── + +def create_indexes(col): + print(" Vytvarim indexy...") + col.create_index([("received_at", ASCENDING)]) + col.create_index([("sent_at", ASCENDING)]) + col.create_index([("sender.email", ASCENDING)]) + col.create_index([("graph_id", ASCENDING)], unique=True, sparse=True) + col.create_index([("conversation_id", ASCENDING)]) + col.create_index([("folder_path", ASCENDING)]) + col.create_index([("has_attachments", ASCENDING)]) + col.create_index([("categories", ASCENDING)]) + col.create_index([("importance", ASCENDING)]) + col.create_index([("flag_status", ASCENDING)]) + col.create_index([("is_read", ASCENDING)]) + col.create_index([ + ("subject", TEXT), + ("body_preview", TEXT), + ("to", TEXT), + ("cc", 
TEXT), + ], name="text_search", default_language="none") + print(" Indexy hotovy.") + + +# ─── MAIN ───────────────────────────────────────────────────────────────────── + +def main(): + global GRAPH_MAILBOX + + ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}") + ap.add_argument("--mailbox", required=True, + help="Emailova schranka (napr. ordinace@buzalkova.cz)") + ap.add_argument("--mode", default="full", choices=["full", "new-only", "sync"], + help="full=plny upsert (vychozi) | new-only=jen nove zpravy | " + "sync=existujici aktualizuje jen menitelna pole, nove importuje cely") + ap.add_argument("--limit", type=int, default=0, + help="Zpracovat max N zprav (0 = vse)") + ap.add_argument("--folder", default="", + help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)") + ap.add_argument("--no-indexes", action="store_true", + help="Nevytvorit indexy na konci") + args = ap.parse_args() + + GRAPH_MAILBOX = args.mailbox + mongo_col = args.mailbox + + start = datetime.now() + print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===") + print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}") + print(f"Schránka: {GRAPH_MAILBOX}") + print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}") + print(f"Režim: {args.mode}") + + print("\nPřipojuji se k Graph API...") + try: + get_token() + print(" Graph API OK") + except Exception as e: + print(f" CHYBA: {e}") + sys.exit(1) + + client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) + try: + client.admin.command("ping") + print(" MongoDB OK") + except Exception as e: + print(f" CHYBA: MongoDB neni dostupna -- {e}") + sys.exit(1) + col = client[MONGO_DB][mongo_col] + + existing: set = set() + if args.mode in ("new-only", "sync"): + print(" Nacitam existujici zaznamy z MongoDB...") + existing = set(col.distinct("_id")) + print(f" {len(existing)} jiz importovano") + + print("\nNacitam seznam slozek...") + all_folders = get_all_folders() + if args.folder: + all_folders = [f for f in all_folders 
if args.folder.lower() in f["path"].lower()] + print(f" Slozek ke zpracovani: {len(all_folders)}") + for f in all_folders: + print(f" {f['path']}") + + is_sync = args.mode == "sync" + msg_select = MSG_SELECT_SYNC if is_sync else MSG_SELECT + expand_att = not is_sync + + batch = [] + ok_count = 0 + sync_count = 0 + err_count = 0 + skip_count = 0 + total_i = 0 + + def flush(): + if not batch: + return + try: + col.bulk_write(batch, ordered=False) + except Exception as e: + logging.error("bulk_write: %s", e) + print(f" CHYBA bulk_write: {e}") + batch.clear() + + print() + for folder in all_folders: + print(f"--- Složka: {folder['path']} ---") + folder_count = 0 + + for msg in iter_folder_messages(folder["id"], select=msg_select, expand_attachments=expand_att): + if args.limit and total_i >= args.limit: + break + + mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}" + total_i += 1 + folder_count += 1 + + if args.mode == "new-only" and mid in existing: + skip_count += 1 + continue + + if is_sync and mid in existing: + fields = extract_sync_fields(msg, folder["path"]) + batch.append(UpdateOne({"_id": mid}, {"$set": fields})) + sync_count += 1 + print(f" {total_i:>6} SYN {mid[:80]}") + else: + if is_sync: + full_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{msg['id']}" + full_params = {"$select": MSG_SELECT, "$expand": ATT_EXPAND} + try: + msg = graph_get(full_url, full_params) + except Exception as e: + logging.error("full fetch failed [%s]: %s", msg.get("id","?"), e) + err_count += 1 + continue + + doc = extract_message(msg, folder["path"]) + if doc is None: + err_count += 1 + print(f" {total_i:>6} ERR {mid[:80]}") + else: + batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True)) + ok_count += 1 + subject_str = (doc.get("subject") or "")[:60] + sender_str = (doc.get("sender", {}).get("email") or "")[:40] + print(f" {total_i:>6} OK {subject_str:<60} {sender_str}") + + if len(batch) >= BATCH_SIZE: + flush() + + if total_i % 
500 == 0: + elapsed = (datetime.now() - start).total_seconds() + rate = total_i / elapsed if elapsed > 0 else 0 + print(f" {'─'*80}") + print(f" Průběh: ok={ok_count} sync={sync_count} skip={skip_count} err={err_count} {rate:.1f} msg/s") + print(f" {'─'*80}") + + flush() + print(f" → {folder_count} zprav ze slozky {folder['path']}") + + if args.limit and total_i >= args.limit: + break + + elapsed_total = (datetime.now() - start).total_seconds() + print(f"\n{'='*52}") + print(f"Vysledek: ok={ok_count} | sync={sync_count} | skip={skip_count} | err={err_count}") + print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s") + print(f"Dokumentu v kolekci: {col.count_documents({})}") + + if not args.no_indexes: + print() + create_indexes(col) + + print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + if err_count: + print(f"Chyby logovany do: {LOG_FILE}") + + client.close() + + +if __name__ == "__main__": + main() diff --git a/Python-runner/1b_parse_emails_graph_delta_v1.0.md b/Python-runner/1b_parse_emails_graph_delta_v1.0.md new file mode 100644 index 0000000..83e02d9 --- /dev/null +++ b/Python-runner/1b_parse_emails_graph_delta_v1.0.md @@ -0,0 +1,139 @@ +# 1b_parse_emails_graph_delta_v1.0.py + +**Inkrementalní sync přes Microsoft Graph delta query.** Sourozenec [`1_parse_emails_graph_v1.4.py`](1_parse_emails_graph_v1.4.md) — každý řeší jiný use case: + +| Skript | Použití | +|---|---| +| `1_parse_emails_graph_v1.4.py` | **První plný import** schránky (vše od začátku) | +| `1b_parse_emails_graph_delta_v1.0.py` | **Pravidelný sync** — jen co se od minula změnilo | + +## Jak funguje + +Graph API vystavuje `messages/delta` endpoint, který si pamatuje **záložku** (`deltaLink` s tokenem). Při dalším volání s touto záložkou vrátí jen: + +- **nové zprávy** +- **změny** existujících (`isRead`, vlajka, přesun do jiné složky, kategorie) +- **smazané** zprávy (`@removed`) + +Delta běží **per složka**. 
Skript drží stav v Mongo kolekci `emaily.sync_state`: + +```json +{ + "_id": "ordinace@buzalkova.cz|", + "mailbox": "ordinace@buzalkova.cz", + "folder_id": "AAA...", + "folder_path": "Inbox", + "delta_link": "https://graph.microsoft.com/.../delta?$deltatoken=...", + "last_run_at": "2026-06-04T10:00:00Z", + "cumulative_new": 1234, "cumulative_sync": 5678, "cumulative_removed": 12, "run_count": 42 +} +``` + +První běh = fresh delta (Graph vrátí všechno + dá `deltaLink`). Každý další = jen změny od poslední záložky. + +## Co se stane se smazanými zprávami + +Když delta vrátí `@removed` pro zprávu, skript ji **nemaže** z Mongo. Pouze nastaví: + +```json +{ "permanently_deleted": true, "permanently_deleted_at": "2026-06-04T10:00:00Z" } +``` + +Dohledatelné: `col.find({"permanently_deleted": true})`. + +**`@removed` přijde jen pro definitivně smazané** zprávy (uživatel vysypal koš / Shift+Del). Mail v `Deleted Items` je pořád normální zpráva, jen má `folder_path = "Deleted Items"`. + +## Extrakce zprávy + +Funkce `extract_message` a `extract_sync_fields` se načítají přímo z modulu `1_parse_emails_graph_v1.4.py` (přes `importlib`) — extrakční logika je jediná na celý projekt, nemůže se rozejít. + +## Nové vs změněné — jak skript pozná + +Pro každou položku z delta odpovědi: + +1. **Má `@removed`?** → označit `permanently_deleted` v Mongo, hotovo. +2. **`graph_id` už je v Mongo?** → existující změna — pošle se jen `extract_sync_fields` (is_read, flag, folder, …) přes `$set`. +3. **`graph_id` v Mongo není?** → nová zpráva — udělá se druhý GET `/messages/{id}?$expand=attachments` (delta nepodporuje `$expand`), aby přišla těla, hlavičky i přílohy, a uloží se přes `extract_message` jako klasický nový dokument. + +## Argumenty + +| Argument | Povinný | Hodnoty | Default | Popis | +|---|---|---|---|---| +| `--mailbox` | **ne** | e-mail | (všechny) | Schránka = kolekce v Mongo. 
**Bez argumentu projede všechny** kolekce v `emaily` mimo `SKIP_MAILBOXES` a systémové (`attachments_index`, `sync_state`) | +| `--folder` | ne | substring | (všechny) | Filtr složek (např. `Inbox` zahrne i `Inbox/Archive`) | +| `--limit N` | ne | int | 0 (bez limitu) | Max položek na složku (test) | +| `--reset` | ne | flag | false | Smaže všechny `deltaLink`y pro vybrané schránky → další běh začne od fresh delta | +| `--dry-run` | ne | flag | false | Nic neuloží do Mongo, jen vypíše co by se stalo | + +## SKIP_MAILBOXES (hardcoded ve skriptu) + +| Schránka | Důvod | +|---|---| +| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Pro tuto schránku je nutný samostatný skript (lokální `.msg` parser nebo jiný zdroj). | + +Při `--mailbox vbuzalka@its.jnj.com` skript skončí s exit kódem 2. Při běhu bez `--mailbox` se schránka tiše přeskočí s hlášením `[skip]`. + +## Varianty volání + +```bash +# VŠECHNY schránky najednou (mimo SKIP_MAILBOXES) — pro cron / pravidelný sync: +docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py + +# Jedna schránka — první běh (fresh delta — projde všechno, uloží deltaLinky): +docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz + +# Pravidelný sync jedné schránky (jen změny od minulého běhu): +docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz + +# Dry-run — uvidíš co by se stalo, nic se neuloží: +docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --dry-run + +# Test jen na složce Inbox, max 20 položek: +docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --folder Inbox --limit 20 + +# Reset — zahodí deltaLinky a najede znova od plné delta: +docker exec -it python-runner python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz 
--reset + +# Cron / na pozadí (každých 5 min): +docker exec -d python-runner bash -c "python /scripts/1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz > /scripts/delta_sync.log 2>&1" +``` + +## Co dělat na začátek + +1. **První import** schránky pořád přes `1_parse_emails_graph_v1.4.py` (existující data zůstanou). +2. **První běh** `1b_…delta_v1.0.py` — fresh delta projde znovu všechny zprávy a hlavně uloží `deltaLink`y do `sync_state`. To může chvíli trvat (podobně jako `--mode new-only` na v1.4). +3. **Další běhy** = už jen rychlé, vrací 0-X změn za interval. + +## Otevřené body k otestování + +- Jak rychle běží první (fresh) delta na velké schránce (`vladimir.buzalka@buzalka.cz` ~80k mailů) +- Co Graph vrátí pro nově vytvořené složky (mělo by fungovat — appendnou se do `folders` při dalším `get_all_folders`) +- Chování při `--limit` (drží se starý deltaLink → příští běh dokončí zbytek) + +## HTTP 410 — expirovaný deltaLink + +DeltaLinky drží Graph cca 30 dní. Pokud nebudeš schránku syncovat měsíc, skript dostane 410, **smaže starý state** a sám zopakuje běh jako fresh delta. Žádný manuální zásah není potřeba. + +## Závislosti + +Stejné jako `1_parse_emails_graph_v1.4.py` (msal, requests, pymongo, dateutil) — žádné nové. 
+ +## Sledování průběhu + +```bash +docker exec -it python-runner tail -f /scripts/delta_sync.log +docker exec -it python-runner tail -f /scripts/delta_errors.log +``` + +## Stav sync_state v Mongo + +```python +# Přehled posledních synců: +db.sync_state.find().sort("last_run_at", -1) + +# Zahodit deltaLinky pro jednu schránku (= efekt --reset): +db.sync_state.delete_many({"mailbox": "ordinace@buzalkova.cz"}) + +# Najít všechny permanentně smazané v jedné schránce: +db["ordinace@buzalkova.cz"].find({"permanently_deleted": true}, {"subject": 1, "permanently_deleted_at": 1}) +``` diff --git a/Python-runner/1b_parse_emails_graph_delta_v1.0.py b/Python-runner/1b_parse_emails_graph_delta_v1.0.py new file mode 100644 index 0000000..b9a8ae7 --- /dev/null +++ b/Python-runner/1b_parse_emails_graph_delta_v1.0.py @@ -0,0 +1,514 @@ +""" +============================================================================== +Skript: 1b_parse_emails_graph_delta_v1.0.py +Verze: 1.0 +Datum: 2026-06-04 +Autor: vladimir.buzalka + +Popis: + Inkrementalni sync emailu pres Microsoft Graph DELTA QUERY. + Sourozenec `1_parse_emails_graph_v1.4.py` — kazdy resi jiny use case: + + 1_parse_emails_graph_v1.4.py = prvni plny import schranky + 1b_parse_emails_graph_delta_v1.0.py = pravidelny sync (zmeny od minula) + + Delta query je server-side change tracking — Graph si pamatuje "zalozku" + (deltaLink) a vraci jen to, co se od ni zmenilo: + - nove zpravy + - zmeny existujicich (isRead, flag, presun do jine slozky, kategorie) + - SMAZANE zpravy (@removed) — definitivne smazane, nikoli v kosi + + Pro mail v "Deleted Items" delta nic specialniho nedela — je to porad + normalni zprava, jen s folder_path="Deleted Items". @removed prijde az + kdyz uzivatel vysype kos / Shift+Del. + +State: + Kolekce `emaily.sync_state`, _id = "|". 
+ { + mailbox, folder_id, folder_path, + delta_link, # plny URL s $deltatoken na pristi beh + last_run_at, + cumulative_new, cumulative_sync, cumulative_removed + } + +Permanentne smazane zpravy: + Skript je NEMAZE z Mongo. Pouze nastavi: + permanently_deleted: True + permanently_deleted_at: + Dohledani: col.find({"permanently_deleted": True}) + +Reuse: + Funkce extract_message / extract_sync_fields se nactou primo z modulu + 1_parse_emails_graph_v1.4.py (importlib, file-based), aby se logika + extrahce nikdy nerozesla. + +Spousteni: + python 1b_parse_emails_graph_delta_v1.0.py # VSECHNY schranky (mimo SKIP_MAILBOXES) + python 1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz # jedna schranka + python 1b_parse_emails_graph_delta_v1.0.py --mailbox ordinace@buzalkova.cz --folder Inbox + python 1b_parse_emails_graph_delta_v1.0.py --reset # zahodit deltaLinky a najet znova + python 1b_parse_emails_graph_delta_v1.0.py --dry-run # nic neulozit + +SKIP_MAILBOXES (hardcoded): + vbuzalka@its.jnj.com — JNJ tenant, nemame Graph API pristup. Pro tuto + schranku je nutny samostatny skript (lokalni .msg). 
+ +Zavislosti: + msal, requests, pymongo, python-dateutil + Python 3.10+ +============================================================================== +""" + +from __future__ import annotations + +import argparse +import importlib.util +import logging +import sys +import time +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional + +import msal +import requests +from pymongo import MongoClient, ASCENDING + +if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(encoding="utf-8", errors="replace") + +# ─── KONFIGURACE ────────────────────────────────────────────────────────────── +GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9" +GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f" +GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk" +GRAPH_URL = "https://graph.microsoft.com/v1.0" + +MONGO_URI = "mongodb://192.168.1.76:27017" +MONGO_DB = "emaily" +SYNC_STATE_COL = "sync_state" +PAGE_SIZE = 100 # delta endpoint typicky vraci max 100/stranka +LOG_FILE = Path(__file__).parent / "delta_errors.log" +SCRIPT_VERSION = "1.0" + +# Kolekce v `emaily` ktere NEJSOU mailboxy: +NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"} + +# Schranky, kde NEMAME Graph API pristup — pri bezneho behu se preskoci. +# Pro tyto je nutny separatni skript (napr. lokalni .msg parser). +SKIP_MAILBOXES = { + "vbuzalka@its.jnj.com", # JNJ tenant — nemame Graph credentials +} + +logging.basicConfig( + filename=str(LOG_FILE), + level=logging.ERROR, + format="%(asctime)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + encoding="utf-8", +) + +# Co tahnout z delta endpointu (stejne jako MSG_SELECT v v1.4, mimo internetMessageHeaders +# ktere delta neumi vratit pro vsechny polozky — pro nove zpravy si je dotahneme +# samostatnym fetchem). 
+DELTA_SELECT = ( + "id,internetMessageId,subject,bodyPreview,body," + "importance,isRead,isDraft,hasAttachments," + "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime," + "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo," + "conversationId,conversationIndex,parentFolderId," + "categories,flag,inferenceClassification" +) + +# Pro plne nacteni nove zpravy (vcetne hlavicek + priloh) pouzijeme stejny +# select+expand jako v1.4 +FULL_FETCH_SELECT = ( + "id,internetMessageId,subject,bodyPreview,body," + "importance,isRead,isDraft,hasAttachments," + "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime," + "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo," + "conversationId,conversationIndex,parentFolderId," + "categories,flag,inferenceClassification,internetMessageHeaders" +) +FULL_FETCH_EXPAND = "attachments($select=id,name,contentType,size,isInline)" + +# ─── Reuse extract logiky z v1.4 ────────────────────────────────────────────── + +_HERE = Path(__file__).parent +_V14_PATH = _HERE / "1_parse_emails_graph_v1.4.py" +if not _V14_PATH.exists(): + print(f"CHYBA: chybi sourozenec {_V14_PATH.name} — extract logiku nelze nacist", file=sys.stderr) + sys.exit(1) + +_spec = importlib.util.spec_from_file_location("v14_parse", _V14_PATH) +_v14 = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_v14) +extract_message = _v14.extract_message +extract_sync_fields = _v14.extract_sync_fields + +# GRAPH_MAILBOX modul-level v v1.4 — pro extract neni potreba, ale pro +# konzistenci nastavujeme ho v main() + +# ─── Graph API ──────────────────────────────────────────────────────────────── + +_graph_token: Optional[str] = None + + +def get_token() -> str: + global _graph_token + app = msal.ConfidentialClientApplication( + GRAPH_CLIENT_ID, + authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}", + client_credential=GRAPH_CLIENT_SECRET, + ) + result = 
class DeltaExpired(Exception):
    """Raised when a stored deltaLink has expired (HTTP 410).

    The caller is expected to drop the stored link and restart the folder
    from a fresh (full) delta round.
    """


# Retry budgets for graph_get(). Token refreshes and back-off retries are
# counted separately so a burst of throttling cannot eat the re-auth budget.
_MAX_AUTH_RETRIES = 2
_MAX_THROTTLE_RETRIES = 5
_DEFAULT_RETRY_AFTER_S = 5
# Statuses answered with "wait and try again" instead of failing immediately.
_RETRYABLE_STATUSES = frozenset({429, 503, 504})


def _retry_after_seconds(value, default: int = _DEFAULT_RETRY_AFTER_S) -> int:
    """Turn a Retry-After header value into a non-negative number of seconds.

    A missing header or a value that is not a plain integer (e.g. the
    HTTP-date form) falls back to *default* instead of raising ValueError.
    """
    try:
        seconds = int(value)
    except (TypeError, ValueError):
        return default
    # time.sleep() rejects negative values.
    return max(seconds, 0)


def graph_get(url: str, params: Optional[dict] = None, allow_410: bool = False) -> dict:
    """GET a Graph URL and return the decoded JSON body.

    Handling per status code:
      * 401 - the token is refreshed via get_token() and the request repeated
        (at most ``_MAX_AUTH_RETRIES`` times).
      * 410 - raises DeltaExpired when ``allow_410`` is true.
      * 429 / 503 / 504 - waits for the Retry-After interval and retries
        (at most ``_MAX_THROTTLE_RETRIES`` times).
      * anything else - ``raise_for_status()`` decides.

    Raises:
        DeltaExpired: HTTP 410 with ``allow_410=True``.
        RuntimeError: the 401 or 429 retry budget was exhausted.
        requests.HTTPError: any other error status (including a 503/504 that
            outlived its retries).
    """
    global _graph_token
    if not _graph_token:
        get_token()

    auth_retries = 0
    throttle_retries = 0
    while True:
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=60,
        )
        status = r.status_code

        if status == 401:
            if auth_retries >= _MAX_AUTH_RETRIES:
                break
            auth_retries += 1
            get_token()
            continue

        if status == 410 and allow_410:
            raise DeltaExpired(url)

        if status in _RETRYABLE_STATUSES and throttle_retries < _MAX_THROTTLE_RETRIES:
            throttle_retries += 1
            wait = _retry_after_seconds(r.headers.get("Retry-After"))
            print(f" [{status}] cekam {wait}s ...")
            time.sleep(wait)
            continue

        if status == 429:
            # Still throttled after all retries: same RuntimeError as before.
            break

        r.raise_for_status()
        return r.json()

    raise RuntimeError(f"Graph GET failed after retries: {url}")


def get_all_folders(mailbox: str, parent_id: Optional[str] = None, parent_path: str = "") -> list[dict]:
    """Return every mail folder of *mailbox* as ``{"id", "path"}`` dicts.

    Starts at the mailbox root (or below *parent_id*), follows
    ``@odata.nextLink`` paging and recurses into every folder that reports
    ``childFolderCount > 0``. ``path`` is the display names joined with "/".
    """
    if parent_id is None:
        url = f"{GRAPH_URL}/users/{mailbox}/mailFolders"
    else:
        url = f"{GRAPH_URL}/users/{mailbox}/mailFolders/{parent_id}/childFolders"

    folders = []
    params = {"$top": 100, "$select": "id,displayName,childFolderCount"}
    while url:
        data = graph_get(url, params)
        for f in data.get("value", []):
            path = f"{parent_path}/{f['displayName']}".lstrip("/")
            folders.append({"id": f["id"], "path": path})
            if f.get("childFolderCount", 0) > 0:
                folders.extend(get_all_folders(mailbox, f["id"], path))
        url = data.get("@odata.nextLink")
        # The nextLink already carries the query options.
        params = None
    return folders
def iter_folder_delta(mailbox: str, folder_id: str, delta_link: Optional[str], limit: int = 0):
    """Iterate one delta round of a single mail folder.

    Yields ``(item, None)`` for every entry of the delta feed (a changed
    message or a ``{"@removed": ...}`` tombstone). When Graph finally hands
    out an ``@odata.deltaLink`` one last ``(None, delta_link)`` tuple is
    yielded and the generator ends.

    With ``delta_link`` set the round resumes from that link; otherwise a
    fresh delta request is built from DELTA_SELECT / PAGE_SIZE.

    ``limit`` (0 = unlimited) stops after that many items WITHOUT yielding a
    final link, so the caller keeps whatever state it already had.

    DeltaExpired raised by graph_get (HTTP 410) propagates to the caller,
    which should retry with ``delta_link=None``.
    """
    if delta_link:
        url = delta_link
        params = None
    else:
        url = f"{GRAPH_URL}/users/{mailbox}/mailFolders/{folder_id}/messages/delta"
        params = {"$select": DELTA_SELECT, "$top": PAGE_SIZE}

    n = 0
    while url:
        data = graph_get(url, params, allow_410=True)
        # Follow-up links already carry the query options.
        params = None
        for item in data.get("value", []):
            yield item, None
            n += 1
            if limit and n >= limit:
                # Test mode: stop early; nothing is persisted for this round.
                return
        next_link = data.get("@odata.nextLink")
        final_link = data.get("@odata.deltaLink")
        if final_link:
            yield None, final_link
            return
        url = next_link


def sync_folder(col, sync_col, mailbox: str, folder: dict, dry_run: bool, limit: int) -> dict:
    """Run one delta round for *folder* and return its statistics.

    Args:
        col: Mongo collection of the mailbox (message documents).
        sync_col: Mongo collection holding the per-folder delta state.
        folder: ``{"id", "path"}`` dict as produced by get_all_folders().
        dry_run: when true, nothing is written to either collection.
        limit: max items per folder (0 = unlimited), passed to the iterator.

    Returns:
        ``{"new", "sync", "removed", "errors"}`` counters for this folder.

    An expired deltaLink (HTTP 410) restarts the folder once from a fresh
    delta; the stored state is deleted only when this is not a dry run.
    """
    fid = folder["id"]
    fpath = folder["path"]
    state_id = f"{mailbox}|{fid}"
    state = sync_col.find_one({"_id": state_id})
    delta_link = state.get("delta_link") if state else None

    while True:
        is_first_run = delta_link is None
        label = "FRESH" if is_first_run else "DELTA"
        print(f"\n[{label}] {fpath}")

        stats = {"new": 0, "sync": 0, "removed": 0, "errors": 0}
        final_delta = None

        try:
            for item, fin in iter_folder_delta(mailbox, fid, delta_link, limit=limit):
                if fin:
                    final_delta = fin
                    break
                try:
                    process_item(col, mailbox, fpath, item, stats, dry_run)
                except Exception as e:
                    # Best effort: one broken item must not stop the folder.
                    stats["errors"] += 1
                    logging.error("process_item %s: %s", item.get("id", "?"), e)
            break
        except DeltaExpired:
            if is_first_run:
                # A 410 without any stored link cannot be fixed by restarting;
                # report it instead of retrying forever.
                stats["errors"] += 1
                logging.error("sync_folder %s|%s: HTTP 410 on a fresh delta", mailbox, fpath)
                break
            print(f" [410] deltaLink expiroval — restart od fresh delta")
            if not dry_run:
                sync_col.delete_one({"_id": state_id})
            delta_link = None

    print(f" new={stats['new']} sync={stats['sync']} removed={stats['removed']} err={stats['errors']}")

    # Persist the new deltaLink only for a completed, non-dry round.
    # NOTE(review): the link is advanced even when stats["errors"] > 0, so
    # items that failed in this round are not offered again — confirm that
    # this best-effort behaviour is intended.
    if final_delta and not dry_run:
        sync_col.update_one(
            {"_id": state_id},
            {
                "$set": {
                    "mailbox": mailbox,
                    "folder_id": fid,
                    "folder_path": fpath,
                    "delta_link": final_delta,
                    "last_run_at": datetime.now(timezone.utc).replace(tzinfo=None),
                },
                "$inc": {
                    "cumulative_new": stats["new"],
                    "cumulative_sync": stats["sync"],
                    "cumulative_removed": stats["removed"],
                    "run_count": 1,
                },
            },
            upsert=True,
        )
    elif not final_delta:
        # No deltaLink arrived (limit hit or error): state stays untouched.
        if not is_first_run:
            print(f" [pozn] delta neukoncena — pristi beh pojede od ulozeneho deltaLinku")

    return stats


# Fields set by the @removed branch; cleared again when the message is seen alive.
_DELETED_FLAG_FIELDS = ("permanently_deleted", "permanently_deleted_at")


def _subject_preview(doc: dict) -> str:
    """Return at most 60 characters of the subject; a null subject yields ''."""
    return (doc.get("subject") or "")[:60]


def _live_update(fields: dict) -> dict:
    """Build the update for a message that is present in the delta feed.

    Besides ``$set``-ting *fields* it ``$unset``s a stale deleted flag. Keys
    that *fields* itself sets are left out of ``$unset`` so the two operators
    never touch the same path.
    """
    update = {"$set": fields}
    stale = {name: "" for name in _DELETED_FLAG_FIELDS if name not in fields}
    if stale:
        update["$unset"] = stale
    return update


def process_item(col, mailbox: str, folder_path: str, item: dict, stats: dict, dry_run: bool):
    """Apply one delta entry to the mailbox collection and bump *stats*.

    * ``@removed`` entry: the document with that ``graph_id`` is flagged
      ``permanently_deleted`` (never deleted from Mongo).
    * known ``graph_id``: only the sync fields from the delta payload are
      updated.
    * unknown ``graph_id``: the full message is fetched and upserted.

    In dry-run mode nothing is written; the action is only printed.
    """
    graph_id = item.get("id")

    # 1) Removed message
    if "@removed" in item or item.get("@removed.reason"):
        if not graph_id:
            return
        if dry_run:
            print(f" REMOVED graph_id={graph_id[:30]}...")
        else:
            col.update_one(
                {"graph_id": graph_id},
                {"$set": {
                    "permanently_deleted": True,
                    "permanently_deleted_at": datetime.now(timezone.utc).replace(tzinfo=None),
                }},
            )
        stats["removed"] += 1
        return

    # 2) New or changed message, decided by graph_id presence in Mongo
    if not graph_id:
        return

    existing = col.find_one({"graph_id": graph_id}, {"_id": 1})

    if existing:
        fields = extract_sync_fields(item, folder_path)
        if dry_run:
            print(f" SYNC {_subject_preview(item)}")
        else:
            col.update_one({"_id": existing["_id"]}, _live_update(fields))
        stats["sync"] += 1
        return

    # New message: body, attachments and headers need the full fetch.
    full = fetch_full_message(mailbox, graph_id)
    if full is None:
        stats["errors"] += 1
        return
    doc = extract_message(full, folder_path)
    if doc is None:
        stats["errors"] += 1
        return
    if dry_run:
        print(f" NEW {_subject_preview(doc)}")
    else:
        col.update_one({"_id": doc["_id"]}, _live_update(doc), upsert=True)
    stats["new"] += 1


# ─── Indexes for sync_state ───────────────────────────────────────────────────

def ensure_sync_state_indexes(sync_col):
    """Create the (mailbox, folder_id) and last_run_at indexes on sync_state."""
    sync_col.create_index([("mailbox", ASCENDING), ("folder_id", ASCENDING)])
    sync_col.create_index([("last_run_at", ASCENDING)])
def ensure_perm_deleted_index(col):
    """Create a sparse index on ``permanently_deleted`` in a mailbox collection."""
    col.create_index([("permanently_deleted", ASCENDING)], sparse=True)


# ─── Main ─────────────────────────────────────────────────────────────────────

def discover_mailboxes(db) -> list[str]:
    """Return the mailbox collections of *db*, sorted by name.

    Every collection is treated as a mailbox except the ones listed in
    NON_MAILBOX_COLLECTIONS; mailboxes in SKIP_MAILBOXES are reported with a
    ``[skip]`` line and left out.
    """
    out = []
    for name in sorted(db.list_collection_names()):
        if name in NON_MAILBOX_COLLECTIONS:
            continue
        if name in SKIP_MAILBOXES:
            print(f" [skip] {name} — v SKIP_MAILBOXES (neni Graph pristup)")
            continue
        out.append(name)
    return out


def sync_mailbox(client, mailbox: str, args) -> dict:
    """Sync every (matching) folder of one mailbox and return the totals.

    Args:
        client: MongoClient; the mailbox collection and the sync-state
            collection are looked up in MONGO_DB.
        mailbox: mailbox address, which is also the collection name.
        args: parsed CLI options; ``dry_run``, ``reset``, ``folder`` and
            ``limit`` are read.

    Returns:
        ``{"new", "sync", "removed", "errors"}`` summed over all folders. If
        the folder list cannot be loaded, a dict with ``errors == 1``.

    With ``--dry-run`` nothing is written: indexes are not created and
    ``--reset`` only reports how many deltaLinks it would delete. The stored
    links therefore stay in use for that preview.
    """
    # The reused v1.4 module keeps the mailbox in a module-level global.
    _v14.GRAPH_MAILBOX = mailbox

    print(f"\n========== {mailbox} ==========")

    col = client[MONGO_DB][mailbox]
    sync_col = client[MONGO_DB][SYNC_STATE_COL]

    if not args.dry_run:
        ensure_sync_state_indexes(sync_col)
        ensure_perm_deleted_index(col)

    if args.reset:
        if args.dry_run:
            n = sync_col.count_documents({"mailbox": mailbox})
            print(f" --reset (dry-run): smazalo by se {n} deltaLinku pro {mailbox}")
        else:
            n = sync_col.delete_many({"mailbox": mailbox}).deleted_count
            print(f" --reset: smazano {n} deltaLinku pro {mailbox}")

    print("Nacitam seznam slozek...")
    try:
        folders = get_all_folders(mailbox)
    except requests.HTTPError as e:
        print(f" CHYBA: nelze nacist slozky pro {mailbox}: {e}")
        logging.error("get_all_folders %s: %s", mailbox, e)
        return {"new": 0, "sync": 0, "removed": 0, "errors": 1}

    if args.folder:
        # Case-insensitive substring match on the folder path.
        wanted = args.folder.lower()
        folders = [f for f in folders if wanted in f["path"].lower()]
    print(f" Slozek ke zpracovani: {len(folders)}")

    totals = {"new": 0, "sync": 0, "removed": 0, "errors": 0}
    for folder in folders:
        s = sync_folder(col, sync_col, mailbox, folder, args.dry_run, args.limit)
        for k in totals:
            totals[k] += s[k]
    print(f" -> mailbox total: new={totals['new']} sync={totals['sync']} removed={totals['removed']} err={totals['errors']}")
    return totals
ap.add_argument("--mailbox", default="", + help="E-mail schranky (= kolekce v Mongo). " + "Bez argumentu projede vsechny schranky z `emaily` (mimo SKIP_MAILBOXES).") + ap.add_argument("--folder", default="", help="Filtruje slozky obsahujici tento retezec (default: vsechny)") + ap.add_argument("--limit", type=int, default=0, help="Max polozek na slozku (test)") + ap.add_argument("--reset", action="store_true", + help="Smaze deltaLinky pro vybrane schranky — pristi beh zacne od fresh delta") + ap.add_argument("--dry-run", action="store_true", help="Nic neulozi do Mongo, jen vypise co by se stalo") + args = ap.parse_args() + + print(f"=== Delta sync v{SCRIPT_VERSION} ===") + if args.dry_run: + print(" DRY-RUN — zadne zmeny v Mongo") + + print("Pripojuji se k MongoDB...") + client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) + client.admin.command("ping") + db = client[MONGO_DB] + + if args.mailbox: + if args.mailbox in SKIP_MAILBOXES: + print(f" CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.") + sys.exit(2) + mailboxes = [args.mailbox] + else: + mailboxes = discover_mailboxes(db) + print(f" Schranky ke zpracovani: {len(mailboxes)}") + for m in mailboxes: + print(f" {m}") + + print("Token Graph API...") + get_token() + print(" OK") + + t0 = time.time() + grand = {"new": 0, "sync": 0, "removed": 0, "errors": 0} + per_mailbox = [] + for mb in mailboxes: + try: + s = sync_mailbox(client, mb, args) + except Exception as e: + print(f" FATAL pri sync {mb}: {e}") + logging.error("sync_mailbox %s: %s", mb, e) + s = {"new": 0, "sync": 0, "removed": 0, "errors": 1} + per_mailbox.append((mb, s)) + for k in grand: + grand[k] += s[k] + + dt = time.time() - t0 + print(f"\n=== SHRNUTI ===") + for mb, s in per_mailbox: + print(f" {mb:40} new={s['new']:>5} sync={s['sync']:>5} removed={s['removed']:>4} err={s['errors']:>3}") + print(f" {'TOTAL':40} new={grand['new']:>5} sync={grand['sync']:>5} removed={grand['removed']:>4} err={grand['errors']:>3}") + 
print(f" trvalo: {dt:.1f} s") + return 1 if grand["errors"] > 0 else 0 + + +if __name__ == "__main__": + sys.exit(main() or 0) diff --git a/Python-runner/2_refetch_text_bodies_v1.0.md b/Python-runner/2_refetch_text_bodies_v1.0.md new file mode 100644 index 0000000..6203ae0 --- /dev/null +++ b/Python-runner/2_refetch_text_bodies_v1.0.md @@ -0,0 +1,34 @@ +# 2_refetch_text_bodies_v1.0.py + +**Krok 2 pipeline** — ONETIME oprava starých plain-text emailů. Starý `parse_emails_graph_v1.3` ukládal plain-text emaily jen jako prvních 2000 znaků do `body_preview`; plné tělo se zahazovalo. Tento skript najde takové emaily a re-fetchne plný obsah do nového pole `body_text` (max 2 MB). + +> Pro schránky importované rovnou v1.4 nemá co dělat (kandidátů 0). Drží se kvůli archivním schránkám importovaným ve v1.3. + +## Argumenty + +| Argument | Povinný | Hodnoty | Default | Popis | +|---|---|---|---|---| +| `--mailbox` | ne | e-mail | (všechny) | Jedna konkrétní schránka | +| `--limit N` | ne | int | (bez limitu) | Limit emailů na schránku (test) | + +## Varianty volání + +```bash +# Všechny schránky: +docker exec -it python-runner python /scripts/2_refetch_text_bodies_v1.0.py + +# Jedna schránka: +docker exec -it python-runner python /scripts/2_refetch_text_bodies_v1.0.py --mailbox ordinace@buzalkova.cz + +# Test 20 emailů: +docker exec -it python-runner python /scripts/2_refetch_text_bodies_v1.0.py --mailbox ordinace@buzalkova.cz --limit 20 + +# Plný běh na pozadí, log do souboru: +docker exec -d python-runner bash -c "python /scripts/2_refetch_text_bodies_v1.0.py > /scripts/refetch.log 2>&1" +``` + +## Sledování průběhu + +```bash +docker exec -it python-runner tail -f /scripts/refetch.log +``` diff --git a/Python-runner/2_refetch_text_bodies_v1.0.py b/Python-runner/2_refetch_text_bodies_v1.0.py new file mode 100644 index 0000000..64ba811 --- /dev/null +++ b/Python-runner/2_refetch_text_bodies_v1.0.py @@ -0,0 +1,270 @@ +""" 
+============================================================================== +Skript: refetch_text_bodies_v1.0.py +Verze: 1.0 +Datum: 2026-06-03 +Autor: vladimir.buzalka + +Popis: + ONETIME oprava — parse_emails_graph_v1.3 ukladal plain-text emaily jen jako + prvnich 2000 znaku do `body_preview`. Plne telo se zahazovalo. + + Tento skript: + 1) Najde v Mongo emaily kde body_html IS NULL/missing/empty + a soucasne maji graph_id (lze refetch) + 2) Pro kazdy GET /users/{mailbox}/messages/{graph_id}?$select=body,bodyPreview + 3) Pokud body.contentType == 'text' -> ulozi PLNY obsah do noveho pole + body_text (max 2 MB - stejny limit jako body_html) + 4) Pokud body.contentType == 'html' (Graph mezitim prepnul) -> ulozi do body_html + 5) Aktualizuje body_preview na realny 255-znakovy bodyPreview z Graphu + + Bezpecne preusitelne a opakovatelne - skript znovu refetchne jen ty kde + stale chybi body_html i body_text. + +Spusteni: + python refetch_text_bodies_v1.0.py # vsechny schranky + python refetch_text_bodies_v1.0.py --mailbox vladimir.buzalka@buzalka.cz + python refetch_text_bodies_v1.0.py --limit 100 # test +============================================================================== +""" + +from __future__ import annotations + +import argparse +import logging +import sys +import time +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional + +import msal +import requests +from pymongo import MongoClient, UpdateOne + +if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(encoding="utf-8", errors="replace") + +# --- konfigurace ------------------------------------------------------------ +GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9" +GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f" +GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk" +GRAPH_URL = "https://graph.microsoft.com/v1.0" + +MONGO_URI = "mongodb://192.168.1.76:27017" +MONGO_DB = "emaily" +SKIP_COLLECTIONS = 
# --- Graph auth -------------------------------------------------------------
_token: Optional[str] = None


def get_token() -> str:
    """Acquire an app-only Graph token (client credentials) and cache it.

    The token is stored in the module-level ``_token`` and also returned.

    Raises:
        RuntimeError: the MSAL response carries no ``access_token``.
    """
    # NOTE(review): GRAPH_CLIENT_SECRET is a literal in this file; consider
    # loading it from the environment or a secret store instead.
    global _token
    app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
        client_credential=GRAPH_CLIENT_SECRET,
    )
    res = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
    if "access_token" not in res:
        raise RuntimeError(f"Graph auth failed: {res}")
    _token = res["access_token"]
    return _token


def _retry_after_seconds(value, default: int = 5) -> int:
    """Turn a Retry-After header value into a non-negative number of seconds.

    A missing or non-integer value (e.g. the HTTP-date form) yields *default*.
    """
    try:
        seconds = int(value)
    except (TypeError, ValueError):
        return default
    return max(seconds, 0)


def graph_get(url: str, params: Optional[dict] = None) -> Optional[dict]:
    """GET a Graph URL; return the JSON body, or None for HTTP 404.

    ``None`` means exactly "the message no longer exists" (404). Up to three
    attempts are made: a 401 refreshes the token, a 429 waits for
    Retry-After, and a ``requests.RequestException`` is retried after 2 s.

    Raises:
        requests.RequestException: the third attempt failed with it.
        RuntimeError: all attempts ended in 401/429. Previously this case
            returned None and was indistinguishable from a 404.
    """
    global _token
    if not _token:
        get_token()
    for attempt in range(3):
        try:
            r = requests.get(
                url,
                headers={"Authorization": f"Bearer {_token}"},
                params=params,
                timeout=30,
            )
            if r.status_code == 401:
                get_token()
                continue
            if r.status_code == 404:
                return None  # the message no longer exists on the Outlook side
            if r.status_code == 429:
                wait = _retry_after_seconds(r.headers.get("Retry-After"))
                print(f" [429] throttled, cekam {wait}s", flush=True)
                time.sleep(wait)
                continue
            r.raise_for_status()
            return r.json()
        except requests.RequestException:
            if attempt == 2:
                raise
            time.sleep(2)
    raise RuntimeError(f"Graph GET failed after retries: {url}")
# Emails where both bodies (body_html and body_text) are missing, i.e. not
# processed yet, and which carry a usable graph_id to refetch by.
# "$nin" is used because a dict literal cannot hold two "$ne" keys: the
# second silently replaces the first, which let graph_id=None through.
EMPTY_BODY_FILTER = {
    "$and": [
        {"$or": [
            {"body_html": None},
            {"body_html": {"$exists": False}},
            {"body_html": ""},
        ]},
        {"$or": [
            {"body_text": None},
            {"body_text": {"$exists": False}},
            {"body_text": ""},
        ]},
        {"graph_id": {"$exists": True, "$nin": [None, ""]}},
    ]
}


def process_mailbox(col, mailbox: str, limit: Optional[int]) -> dict:
    """Refetch the body of every candidate email in one mailbox collection.

    Candidates are the documents matching EMPTY_BODY_FILTER. For each one the
    ``body`` and ``bodyPreview`` are fetched from Graph and written back in
    batches of BATCH_SIZE:

    * ``contentType == "html"`` is stored in ``body_html``,
    * ``contentType == "text"`` is stored in ``body_text``,
    * an empty or unknown body only records ``body_refetch_status``.

    Bodies are cut to MAX_BODY_BYTES characters and previews to 300.

    Args:
        col: Mongo collection of the mailbox.
        mailbox: mailbox address used in the Graph URL.
        limit: max number of documents to process (None/0 = all).

    Returns:
        Counters ``candidates, refetched, text, html, still_empty, errors,
        missing`` plus ``mailbox``.
    """
    total = col.count_documents(EMPTY_BODY_FILTER)
    print(f"[{mailbox}] kandidatu k refetchi: {total}"
          + (f" (limit {limit})" if limit else ""))
    if total == 0:
        return {"mailbox": mailbox, "candidates": 0, "refetched": 0,
                "text": 0, "html": 0, "still_empty": 0, "errors": 0, "missing": 0}

    cursor = col.find(EMPTY_BODY_FILTER, {"_id": 1, "graph_id": 1},
                      no_cursor_timeout=True)
    if limit:
        cursor = cursor.limit(limit)

    n = refetched = txt = html = still_empty = err = missing = 0
    bulk: list = []

    try:
        for doc in cursor:
            n += 1
            mid = doc["_id"]
            gid = doc["graph_id"]
            url = f"{GRAPH_URL}/users/{mailbox}/messages/{gid}"
            params = {"$select": "body,bodyPreview"}
            try:
                data = graph_get(url, params)
            except Exception as e:
                # Best effort: log and move on to the next message.
                err += 1
                logging.error("[%s] graph_get %s: %s", mailbox, gid, e)
                continue

            if data is None:
                missing += 1
                continue

            body = data.get("body") or {}
            ctype = body.get("contentType")
            content = body.get("content") or ""
            preview = data.get("bodyPreview") or ""

            update: dict = {"refetched_at": datetime.now(timezone.utc).replace(tzinfo=None)}

            if not content:
                still_empty += 1
                update["body_refetch_status"] = "graph_empty"
            elif ctype == "html":
                # Slicing is a no-op for content shorter than the cap.
                update["body_html"] = content[:MAX_BODY_BYTES]
                update["body_refetch_status"] = "html"
                html += 1
                refetched += 1
            elif ctype == "text":
                update["body_text"] = content[:MAX_BODY_BYTES]
                update["body_refetch_status"] = "text"
                txt += 1
                refetched += 1
            else:
                update["body_refetch_status"] = f"unknown_ctype:{ctype}"
                still_empty += 1

            if preview:
                update["body_preview"] = preview[:300]

            bulk.append(UpdateOne({"_id": mid}, {"$set": update}))

            if len(bulk) >= BATCH_SIZE:
                col.bulk_write(bulk, ordered=False)
                bulk.clear()

            if n % 100 == 0 or n == 1:
                print(f" [{n:>5}/{total}] refetched={refetched} "
                      f"text={txt} html={html} still_empty={still_empty} "
                      f"missing={missing} err={err}",
                      flush=True)
    finally:
        cursor.close()
        # Flush the last partial batch even when the loop was interrupted.
        if bulk:
            col.bulk_write(bulk, ordered=False)

    print(f" [{n}/{total}] DONE refetched={refetched} text={txt} html={html} "
          f"still_empty={still_empty} missing={missing} err={err}")
    return {"mailbox": mailbox, "candidates": total, "refetched": refetched,
            "text": txt, "html": html, "still_empty": still_empty,
            "errors": err, "missing": missing}


def main() -> int:
    """CLI entry point: refetch bodies for one mailbox or for all of them.

    Without ``--mailbox`` every collection of MONGO_DB that is not listed in
    SKIP_COLLECTIONS is processed. Returns 0.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)")
    ap.add_argument("--limit", type=int, help="Limit emailu na schranku (test)")
    args = ap.parse_args()

    t0 = time.time()
    print("Pripojuji se k MongoDB...")
    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    mongo.admin.command("ping")
    db = mongo[MONGO_DB]

    print("Token Graph API...")
    get_token()
    print("OK\n")

    if args.mailbox:
        mailboxes = [args.mailbox]
    else:
        mailboxes = [c for c in db.list_collection_names() if c not in SKIP_COLLECTIONS]
    print(f"Schranky ({len(mailboxes)}): {mailboxes}\n")

    results = []
    for mb in mailboxes:
        results.append(process_mailbox(db[mb], mb, limit=args.limit))
        print()

    print("=== SHRNUTI ===")
    for r in results:
        print(f" {r['mailbox']}: candidates={r['candidates']} "
              f"refetched={r['refetched']} text={r['text']} html={r['html']} "
              f"still_empty={r['still_empty']} missing={r['missing']} errors={r['errors']}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    return 0
+ try: + raise SystemExit(main()) + except KeyboardInterrupt: + print("\nPreruseno uzivatelem") + except Exception: + import traceback + traceback.print_exc() + sys.exit(1) diff --git a/Python-runner/3_download_attachments_v1.3.md b/Python-runner/3_download_attachments_v1.3.md new file mode 100644 index 0000000..dc4701c --- /dev/null +++ b/Python-runner/3_download_attachments_v1.3.md @@ -0,0 +1,47 @@ +# 3_download_attachments_v1.3.py + +**Krok 3 pipeline** — stahuje skutečné přílohy (`is_inline=False`) z Mongo emailů přes Graph API do `/mnt/Emails//Attachments/`. Deduplikace podle **SHA256** obsahu: +- stejný hash → soubor už existuje → přeskočí +- kolize názvu (stejný název, jiný hash) → `faktura_2.pdf`, `faktura_3.pdf` … + +Po uložení doplní do Mongo `file_hash` + `local_path` a aktualizuje kolekci `emaily.attachments_index` (`_id`=hash, filename, path, size, mime, mailbox, ref_count). Emaily kde mají všechny přílohy `file_hash` → skip → **bezpečné opakovat**. + +## Argumenty + +| Argument | Povinný | Hodnoty | Default | Popis | +|---|---|---|---|---| +| `--mailbox` | **ne** | e-mail | (všechny) | Schránka = kolekce v Mongo. **Bez argumentu projede všechny** kolekce v `emaily` mimo `SKIP_MAILBOXES` a systémové (`attachments_index`, `sync_state`) | +| `--limit N` | ne | int | 0 (bez limitu) | Zpracuje jen prvních N emailů **per schránka** (test) | +| `--force-recheck` | ne | flag | false | Znovu ověří i už stažené přílohy | +| `--no-indexes` | ne | flag | false | Nevytváří indexy na konci | + +## SKIP_MAILBOXES (hardcoded) + +| Schránka | Důvod | +|---|---| +| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Při běhu bez `--mailbox` se tiše přeskočí. S explicitním `--mailbox vbuzalka@its.jnj.com` skript skončí exit kódem 2. 
| + +## Varianty volání + +```bash +# VŠECHNY schránky (mimo SKIP_MAILBOXES): +docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py + +# Jedna schránka interaktivně: +docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz + +# Test 50 emailů: +docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes + +# Force-recheck (znovu ověří všechny): +docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --force-recheck + +# Na pozadí, log do souboru: +docker exec -d python-runner bash -c "python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz > /scripts/download_attachments.log 2>&1" +``` + +## Sledování průběhu + +```bash +docker exec -it python-runner tail -f /scripts/download_attachments.log +``` diff --git a/Python-runner/3_download_attachments_v1.3.py b/Python-runner/3_download_attachments_v1.3.py new file mode 100644 index 0000000..93e544e --- /dev/null +++ b/Python-runner/3_download_attachments_v1.3.py @@ -0,0 +1,546 @@ +""" +download_attachments_v1.3.py +Nazev: download_attachments_v1.3.py +Verze: 1.3 +Datum: 2026-06-02 +Autor: vladimir.buzalka + +Popis: + Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB + pres Microsoft Graph API a uklada je do adresare + /mnt/Emails//Attachments/. + + Schránka se predava jako povinny parametr --mailbox. + + Deduplikace podle SHA256 hashe obsahu: + - stejny hash = soubor uz existuje -> preskoci + - prvni vyskytu souboru: ulozi pod puvodnimnazvem + - kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ... 
+ + Po ulozeni aktualizuje MongoDB: + - v email dokumentu: kazda priloha dostane file_hash + local_path + - kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes, + mime_type, mailbox, first_seen_at, ref_count + + Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash + se preskoci. --force-recheck znovu overi i uz stazene. + + POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky! + +Spousteni: + python download_attachments_v1.3.py # VSECHNY schranky (mimo SKIP_MAILBOXES) + python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz # jedna schranka + python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 + python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --force-recheck + +SKIP_MAILBOXES (hardcoded): + vbuzalka@its.jnj.com — JNJ tenant, nemame Graph API pristup. + +Docker: + docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py + +Zavislosti: + msal, requests, pymongo + Python 3.10+ + +Historie verzi: + 1.0 2026-06-02 Inicialni verze + 1.1 2026-06-02 Schránka jako parametr --mailbox + 1.2 2026-06-02 Oprava: Graph attachment mapa vcetne inline; normalizace nazvu; + preskoceni S/MIME; inline z Graphu -> SKIP ne ERR + 1.3 2026-06-02 Primarni stazeni pres graph_att_id (prime ID bez name-matchingu); + oprava $select na attachment listu (odstranen contentId ktery + zpusoboval BadRequest a vracel prazdny seznam); name-matching + zustava jako fallback pro stare emaily bez graph_att_id +""" + +import sys +import re +import hashlib +import logging +import argparse +import unicodedata +from pathlib import Path +from datetime import datetime, timezone +from typing import Optional + +import msal +import requests +from pymongo import MongoClient, UpdateOne + +if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(encoding="utf-8", errors="replace") + +# ─── KONFIGURACE ────────────────────────────────────────────────────────────── +GRAPH_TENANT_ID = 
def get_token() -> str:
    """Acquire a new app-only Microsoft Graph token and cache it in ``_graph_token``.

    A ``msal.ConfidentialClientApplication`` is built from the module-level
    tenant id, client id and client secret, and a token is requested for the
    ``https://graph.microsoft.com/.default`` scope.

    Returns:
        The access token string (also stored in the module global).

    Raises:
        RuntimeError: if the MSAL response carries no ``access_token``; the
            whole response is included in the message.
    """
    global _graph_token

    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    client_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    auth_response = client_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )

    if "access_token" not in auth_response:
        raise RuntimeError(f"Graph auth failed: {auth_response}")

    # Cache first, then hand the same value back to the caller.
    _graph_token = auth_response["access_token"]
    return _graph_token
def normalize_name(name: str) -> str:
    """Return a comparison key for an attachment name.

    The name is lower-cased and stripped of surrounding whitespace, decomposed
    with NFKD so that diacritics become separate combining marks, and those
    marks are dropped. Every remaining character that is not a word character,
    a dot or a hyphen is replaced by an underscore.
    """
    decomposed = unicodedata.normalize("NFKD", name.lower().strip())
    # combining() == 0 means "not a combining mark", i.e. a base character.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    without_marks = "".join(base_chars)
    return re.sub(r"[^\w.\-]", "_", without_marks)
def safe_filename(name: str, max_bytes: int = 240) -> str:
    """Turn an attachment name from an email into a name safe to create on disk.

    Characters other than alphanumerics and ``._- ()`` are replaced by ``_``
    and surrounding whitespace is stripped, exactly as before. On top of that:

    * ``None``/empty input, and the directory references ``.`` and ``..``,
      yield ``"attachment"`` (writing to ``att_dir / ".."`` would raise and
      abort the whole mailbox run);
    * names whose UTF-8 encoding exceeds ``max_bytes`` are shortened while
      keeping a short extension, so the write cannot fail on an over-long
      file name.

    Args:
        name: Original attachment file name (untrusted input).
        max_bytes: Upper bound for the UTF-8 length of the result. The default
            stays below the common 255-byte file-name limit so that a caller
            can still append a short ``_N`` collision suffix.

    Returns:
        The sanitized file name, never empty.
    """
    safe = "".join(
        c if c.isalnum() or c in "._- ()" else "_" for c in (name or "")
    ).strip()

    if len(safe.encode("utf-8")) > max_bytes:
        stem, dot, ext = safe.rpartition(".")
        # Only treat a short trailing part as an extension worth preserving.
        keep_ext = bool(dot) and bool(stem) and len(ext.encode("utf-8")) <= 16
        suffix = f".{ext}" if keep_ext else ""
        if not keep_ext:
            stem = safe
        budget = max(max_bytes - len(suffix.encode("utf-8")), 1)
        # Cut on a byte budget; errors="ignore" drops a split multi-byte char.
        stem = stem.encode("utf-8")[:budget].decode("utf-8", errors="ignore").strip()
        safe = stem + suffix

    if safe in ("", ".", ".."):
        return "attachment"
    return safe
file_path.write_bytes(content) + + col_index.insert_one({ + "_id": hash_val, + "filename": filename, + "local_path": filename, + "size_bytes": len(content), + "mime_type": mime_type, + "mailbox": mailbox, + "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None), + "ref_count": 1, + }) + + return hash_val, filename, True + + +# ─── MAIN ───────────────────────────────────────────────────────────────────── + +def process_mailbox(client, mailbox: str, args) -> dict: + """Zpracuje jednu schranku. Vraci statistiky.""" + att_dir = EMAILS_BASE_DIR / mailbox / "Attachments" + mongo_col = mailbox + + start = datetime.now() + print(f"\n========== {mailbox} ==========") + print(f"Cilovy adresar: {att_dir}") + + att_dir.mkdir(parents=True, exist_ok=True) + + col_emails = client[MONGO_DB][mongo_col] + col_index = client[MONGO_DB][MONGO_COL_INDEX] + + if args.force_recheck: + query = {"has_attachments": True} + else: + query = { + "has_attachments": True, + "attachments": { + "$elemMatch": { + "is_inline": False, + "file_hash": {"$exists": False}, + } + } + } + + total = col_emails.count_documents(query) + print(f"Emailu ke zpracovani: {total}") + if total == 0: + print(" Neni co stahnout.") + return {"mailbox": mailbox, "ok": 0, "new": 0, "dup": 0, "skip": 0, "err": 0, + "elapsed": 0.0} + + cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1}) + if args.limit: + cursor = cursor.limit(args.limit) + + ok_count = 0 + new_count = 0 + dup_count = 0 + skip_count = 0 + err_count = 0 + email_i = 0 + batch = [] + + def flush(): + if not batch: + return + try: + col_emails.bulk_write(batch, ordered=False) + except Exception as e: + logging.error("bulk_write: %s", e) + print(f" CHYBA bulk_write: {e}") + batch.clear() + + for email_doc in cursor: + email_i += 1 + email_id = email_doc["_id"] + graph_id = email_doc.get("graph_id", "") + subject = (email_doc.get("subject") or "")[:60] + att_list = email_doc.get("attachments") or [] + + real_atts 
= [a for a in att_list if not a.get("is_inline", False)] + if not real_atts: + continue + + print(f"\n {email_i:>5}/{total} {subject}") + + # Nacti attachment list z Graphu jen pokud nektere prilohy nemaji graph_att_id + need_listing = any( + not a.get("is_inline", False) + and not (not args.force_recheck and a.get("file_hash")) + and not a.get("graph_att_id") + for a in att_list + ) + graph_atts = fetch_message_attachments(mailbox, graph_id) if need_listing else [] + + updated_atts = list(att_list) + email_ok = True + + for i, att in enumerate(updated_atts): + if att.get("is_inline", False): + continue + if not args.force_recheck and att.get("file_hash"): + continue + + att_name = att.get("filename", "") + att_size = att.get("size_bytes", 0) + graph_att_id = att.get("graph_att_id") + + # Preskoc S/MIME podpisy + if Path(att_name).suffix.lower() in SKIP_EXTENSIONS: + updated_atts[i] = {**att, "file_hash": "skip", "local_path": ""} + skip_count += 1 + print(f" SKIP {att_name} (S/MIME)") + continue + + # Primy pristup pres graph_att_id (emaily parsovane v1.2+) + if graph_att_id: + content = fetch_attachment_content(mailbox, graph_id, graph_att_id) + if content is None: + err_count += 1 + email_ok = False + print(f" ERR {att_name} (stazeni selhalo)") + continue + # Zkontroluj zda jde skutecne o inline (pro edge case) + mime_type = att.get("mime_type", "") + else: + # Fallback: name matching pro stare emaily (parsovane pred v1.2) + graph_att = find_graph_att(att_name, att_size, graph_atts) + + if not graph_att: + logging.error("attachment not found [email=%s att=%s]", email_id, att_name) + print(f" ERR {att_name} (nenalezeno)") + err_count += 1 + email_ok = False + continue + + # Pokud Graph rika ze je inline — preskoc + if graph_att.get("isInline", False): + updated_atts[i] = {**att, "is_inline": True, "file_hash": "skip", "local_path": ""} + skip_count += 1 + print(f" SKIP {att_name} (inline obrazek)") + continue + + content = fetch_attachment_content(mailbox, 
def discover_mailboxes(db) -> list[str]:
    """List the collections of ``db`` that should be processed as mailboxes.

    Collection names are visited in sorted order. Names listed in
    ``NON_MAILBOX_COLLECTIONS`` are dropped silently; names listed in
    ``SKIP_MAILBOXES`` are dropped with a ``[skip]`` line printed to stdout.
    Everything else is returned.
    """
    mailboxes: list[str] = []
    for coll_name in sorted(db.list_collection_names()):
        if coll_name in NON_MAILBOX_COLLECTIONS:
            # Bookkeeping collection, not a mailbox - nothing to report.
            continue
        if coll_name not in SKIP_MAILBOXES:
            mailboxes.append(coll_name)
        else:
            print(f" [skip] {coll_name} — v SKIP_MAILBOXES (neni Graph pristup)")
    return mailboxes
Bez argumentu projede vsechny schranky " + "v `emaily` mimo SKIP_MAILBOXES.") + ap.add_argument("--limit", type=int, default=0, + help="Zpracovat max N emailu (0 = vse) — per schranka") + ap.add_argument("--force-recheck", action="store_true", + help="Znovu overi i emaily kde prilohy uz maji file_hash") + ap.add_argument("--no-indexes", action="store_true", + help="Nevytvorit indexy na attachments_index kolekci") + args = ap.parse_args() + + start_all = datetime.now() + print(f"=== download_attachments v{SCRIPT_VERSION} ===") + print(f"Start: {start_all.strftime('%Y-%m-%d %H:%M:%S')}") + + print("\nPřipojuji se k Graph API...") + try: + get_token() + print(" Graph API OK") + except Exception as e: + print(f" CHYBA: {e}") + sys.exit(1) + + client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) + try: + client.admin.command("ping") + print(" MongoDB OK") + except Exception as e: + print(f" CHYBA: MongoDB neni dostupna -- {e}") + sys.exit(1) + + col_index = client[MONGO_DB][MONGO_COL_INDEX] + if not args.no_indexes: + col_index.create_index("filename") + col_index.create_index("mime_type") + col_index.create_index("mailbox") + + db = client[MONGO_DB] + if args.mailbox: + if args.mailbox in SKIP_MAILBOXES: + print(f" CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.") + sys.exit(2) + mailboxes = [args.mailbox] + else: + mailboxes = discover_mailboxes(db) + print(f" Schranky ke zpracovani: {len(mailboxes)}") + for m in mailboxes: + print(f" {m}") + + results = [] + for mb in mailboxes: + try: + results.append(process_mailbox(client, mb, args)) + except Exception as e: + logging.error("process_mailbox %s: %s", mb, e) + print(f" FATAL pri zpracovani {mb}: {e}") + results.append({"mailbox": mb, "ok": 0, "new": 0, "dup": 0, + "skip": 0, "err": 1, "elapsed": 0.0}) + + elapsed_total = (datetime.now() - start_all).total_seconds() + files_total = col_index.count_documents({}) + size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, 
{"size_bytes": 1})) + + grand = {k: sum(r[k] for r in results) for k in ("ok", "new", "dup", "skip", "err")} + + print(f"\n{'='*60}") + print("=== SHRNUTI ===") + for r in results: + print(f" {r['mailbox']:40} ok={r['ok']:>5} nove={r['new']:>4} " + f"dup={r['dup']:>4} skip={r['skip']:>3} err={r['err']:>3}") + print(f" {'TOTAL':40} ok={grand['ok']:>5} nove={grand['new']:>4} " + f"dup={grand['dup']:>4} skip={grand['skip']:>3} err={grand['err']:>3}") + print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)") + print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s") + print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + if grand['err']: + print(f"Chyby logovany do: {LOG_FILE}") + + client.close() + + +if __name__ == "__main__": + main() diff --git a/Python-runner/3_download_attachments_v1.4.md b/Python-runner/3_download_attachments_v1.4.md new file mode 100644 index 0000000..f721798 --- /dev/null +++ b/Python-runner/3_download_attachments_v1.4.md @@ -0,0 +1,74 @@ +# 3_download_attachments_v1.4.py + +**Krok 3 pipeline** — stahuje skutečné přílohy (`is_inline=False`) z Mongo emailů přes Graph API do `/mnt/Emails//Attachments/`. Deduplikace podle **SHA256** obsahu. + +## Nové ve verzi 1.4 + +| Typ přílohy | `@odata.type` | Co skript dělá | +|---|---|---| +| **File** | `#microsoft.graph.fileAttachment` | Stáhne přes `/$value`, uloží binárku | +| **Item** (vnořený email) | `#microsoft.graph.itemAttachment` | `$expand=...itemAttachment/item`, sestaví **`.eml`** z hlaviček a body vnitřní zprávy | +| **Reference** (OneDrive/SharePoint link) | `#microsoft.graph.referenceAttachment` | Žádný file — uloží jen `reference_url` do Mongo | + +Plus: +- **Retry** s exponenciálním backoffem na 429/500/502/503/504 (1s, 2s, 4s; respektuje `Retry-After`). 
+- **Permanentní označení chyb v Mongo** per-attachment: + - `attachment_missing: True` + `attachment_missing_at: ` při 404 (email/příloha už neexistuje v mailboxu) + - `attachment_reference: True` + `reference_url: ` u referenceAttachment +- Tagované přílohy se při dalším běhu **automaticky přeskočí** (bez `--force-recheck`). + +## Argumenty + +| Argument | Povinný | Hodnoty | Default | Popis | +|---|---|---|---|---| +| `--mailbox` | ne | e-mail | (všechny) | Schránka = kolekce v Mongo. Bez argumentu projede všechny kolekce mimo `NON_MAILBOX_COLLECTIONS` a `SKIP_MAILBOXES` | +| `--limit N` | ne | int | 0 | Per schránka, jen prvních N emailů (test) | +| `--force-recheck` | ne | flag | false | Znovu ověří i emaily kde přílohy mají `file_hash` **nebo** `attachment_missing` **nebo** `attachment_reference` | +| `--no-indexes` | ne | flag | false | Nevytváří indexy na `attachments_index` | + +## SKIP_MAILBOXES (hardcoded) + +| Schránka | Důvod | +|---|---| +| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Bez `--mailbox` se tiše přeskočí. S explicitním `--mailbox vbuzalka@its.jnj.com` skript skončí exit kódem 2. | + +## Statistiky per schránka + +``` +ok=N nove=N dup=N skip=N miss=N ref=N err=N +``` + +| Kategorie | Význam | +|---|---| +| `ok` | emaily zpracované bez chyby (všechny přílohy hotové) | +| `nove` | nové soubory uložené (NEW + NEW(eml)) | +| `dup` | hash už existuje (jen ref_count++) | +| `skip` | S/MIME (.p7m/.p7s/...) 
nebo inline obrázek | +| `miss` | 404 — označeno `attachment_missing` (nepokračuje se) | +| `ref` | referenceAttachment — uložen jen URL | +| `err` | tranzientní chyba (5xx, timeout) — bude retry při dalším běhu | + +## Varianty volání + +```bash +# Všechny schránky (mimo SKIP_MAILBOXES): +docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py + +# Jedna schránka: +docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz + +# Test 50 emailů: +docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes + +# Force-recheck (i missing/reference přepíše): +docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --force-recheck + +# Na pozadí: +docker exec -d python-runner bash -c "python /scripts/3_download_attachments_v1.4.py > /scripts/download_attachments.log 2>&1" +``` + +## Sledování průběhu + +```bash +docker exec -it python-runner tail -f /scripts/download_attachments.log +``` diff --git a/Python-runner/3_download_attachments_v1.4.py b/Python-runner/3_download_attachments_v1.4.py new file mode 100644 index 0000000..487852f --- /dev/null +++ b/Python-runner/3_download_attachments_v1.4.py @@ -0,0 +1,713 @@ +""" +download_attachments_v1.4.py +Nazev: download_attachments_v1.4.py +Verze: 1.4 +Datum: 2026-06-04 +Autor: vladimir.buzalka + +Popis: + Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB + pres Microsoft Graph API a uklada je do adresare + /mnt/Emails//Attachments/. + + Bez argumentu --mailbox projede vsechny kolekce v `emaily` mimo + NON_MAILBOX_COLLECTIONS a SKIP_MAILBOXES. + + Deduplikace podle SHA256 hashe obsahu: + - stejny hash = soubor uz existuje -> preskoci + - prvni vyskyt: ulozi pod puvodnim nazvem + - kolize nazvu: faktura_2.pdf, faktura_3.pdf ... 
+ + Po ulozeni aktualizuje MongoDB: + - v email dokumentu: kazda priloha dostane file_hash + local_path + - kolekce emaily.attachments_index: _id=hash, filename, ... + + NOVE v 1.4: + - Spravne zpracovani vsech typu priloh: + * fileAttachment -> /$value (jako predtim) + * itemAttachment -> /$expand=microsoft.graph.itemAttachment/item + -> sestavi .eml z vnitrni zpravy + * referenceAttachment -> ulozi jen URL, neexistuje content + - Retry s exponencialnim backoffem (1s, 2s, 4s) na 429/5xx + - Permanentni tagging chyb v Mongo per-attachment: + * attachment_missing: True (404, email/att uz neexistuje) + * attachment_reference: True (referenceAttachment, jen URL) + * reference_url, attachment_type — diagnosticke metadata + - Tagovane prilohy se pri dalsim behu preskocia (bez --force-recheck) + + POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky! + +Spousteni: + python download_attachments_v1.4.py + python download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz + python download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 + python download_attachments_v1.4.py --mailbox ordinace@buzalkova.cz --force-recheck + +SKIP_MAILBOXES (hardcoded): + vbuzalka@its.jnj.com — JNJ tenant, nemame Graph API pristup. 
+ +Docker: + docker exec -it python-runner python /scripts/3_download_attachments_v1.4.py + +Zavislosti: + msal, requests, pymongo + Python 3.10+ + +Historie verzi: + 1.0 2026-06-02 Inicialni verze + 1.1 2026-06-02 Schranka jako parametr --mailbox + 1.2 2026-06-02 Oprava: Graph attachment mapa vcetne inline; normalizace nazvu + 1.3 2026-06-02 Primarni stazeni pres graph_att_id; --mailbox volitelny + 1.4 2026-06-04 itemAttachment/referenceAttachment handling; retry s backoffem; + permanentni tagging chyb (attachment_missing / attachment_reference) +""" + +import sys +import re +import time +import json +import hashlib +import logging +import argparse +import unicodedata +from pathlib import Path +from datetime import datetime, timezone +from typing import Optional + +import msal +import requests +from pymongo import MongoClient, UpdateOne + +if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(encoding="utf-8", errors="replace") + +# ─── KONFIGURACE ────────────────────────────────────────────────────────────── +GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9" +GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f" +GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk" +GRAPH_URL = "https://graph.microsoft.com/v1.0" + +MONGO_URI = "mongodb://192.168.1.76:27017" +MONGO_DB = "emaily" +MONGO_COL_INDEX = "attachments_index" + +EMAILS_BASE_DIR = Path("/mnt/Emails") +LOG_FILE = Path(__file__).parent / "parse_emails_errors.log" +SCRIPT_VERSION = "1.4" +BATCH_SIZE = 50 + +# Typy příloh které přeskočíme (S/MIME podpisy, certifikáty) +SKIP_EXTENSIONS = {".p7m", ".p7s", ".p7c", ".p7b"} + +# Kolekce v `emaily` ktere NEJSOU mailboxy +NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"} + +# Schranky kde NEMAME Graph API pristup +SKIP_MAILBOXES = { + "vbuzalka@its.jnj.com", +} + +# Retry konfigurace pro tranzientni chyby +RETRY_STATUSES = {429, 500, 502, 503, 504} +RETRY_BACKOFF_S = [1, 2, 4] # max 3 pokusy + +# Sentinel hodnoty pro 
def _graph_request(method: str, url: str, *, params: Optional[dict] = None,
                   stream: bool = False, timeout: int = 60):
    """Low-level Graph HTTP call with retry on 429/5xx and re-auth on 401.

    Makes up to ``len(RETRY_BACKOFF_S) + 1`` attempts:

    * 401 -> a new token is fetched via ``get_token()`` and the request is
      repeated (this consumes one attempt);
    * a status in ``RETRY_STATUSES`` -> sleep and repeat; a numeric
      ``Retry-After`` header takes precedence over ``RETRY_BACKOFF_S``;
    * ``requests.ConnectionError`` / ``requests.Timeout`` -> sleep and repeat,
      re-raised once the attempts are used up.

    Responses that are discarded before a retry are closed explicitly so a
    ``stream=True`` request does not keep its connection open.

    Args:
        method: HTTP method name passed to ``requests.request``.
        url: Absolute request URL.
        params: Optional query parameters.
        stream: Passed through to ``requests.request``.
        timeout: Request timeout in seconds.

    Returns:
        The ``requests.Response`` of the last attempt. No status check is done
        here, so e.g. a 404 (or a retryable status on the final attempt) is
        returned to the caller as-is.

    Raises:
        requests.ConnectionError, requests.Timeout: after the last attempt.
        RuntimeError: when the attempts run out without a response to return
            (the final attempt answered 401).
    """
    if not _graph_token:
        get_token()

    max_retries = len(RETRY_BACKOFF_S)
    last_exc = None
    for attempt in range(max_retries + 1):
        try:
            r = requests.request(
                method, url,
                headers={"Authorization": f"Bearer {_graph_token}"},
                params=params, timeout=timeout, stream=stream,
            )
            if r.status_code == 401:
                r.close()
                get_token()
                continue
            if r.status_code in RETRY_STATUSES and attempt < max_retries:
                sleep_s = RETRY_BACKOFF_S[attempt]
                # Retry-After wins, but only the delay-seconds form is used;
                # anything else (HTTP-date, garbage) falls back to the backoff
                # table instead of blowing up in float().
                retry_after = (r.headers.get("Retry-After") or "").strip()
                if re.fullmatch(r"\d+(?:\.\d*)?|\.\d+", retry_after):
                    sleep_s = float(retry_after)
                r.close()
                time.sleep(sleep_s)
                continue
            return r
        except (requests.ConnectionError, requests.Timeout) as e:
            last_exc = e
            if attempt < max_retries:
                time.sleep(RETRY_BACKOFF_S[attempt])
                continue
            raise
    raise RuntimeError(f"Graph request exhausted retries: {url} (last_exc={last_exc})")
Vraci i @odata.type.""" + url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments" + try: + # @odata.type se vraci automaticky (neni v base $select) + data = graph_get_json(url, {"$select": "id,name,contentType,size,isInline"}) + return data.get("value", []) + except Exception as e: + logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e) + return [] + + +def _build_eml_from_item(item: dict) -> bytes: + """Sestavi minimalni RFC822 .eml z itemAttachment.item (message).""" + def hdr(name, val): + return f"{name}: {val}\r\n" if val else "" + + def addrs(field): + rec = item.get(field) or [] + out = [] + for r in rec: + ea = r.get("emailAddress") or {} + name = ea.get("name", "") + addr = ea.get("address", "") + if name and addr: + out.append(f'"{name}" <{addr}>') + elif addr: + out.append(addr) + return ", ".join(out) + + subj = item.get("subject", "") + sender = item.get("from") or item.get("sender") or {} + sender_ea = sender.get("emailAddress") or {} + from_str = (f'"{sender_ea.get("name","")}" <{sender_ea.get("address","")}>' + if sender_ea.get("address") else "") + sent = item.get("sentDateTime") or item.get("receivedDateTime") or "" + + body = item.get("body") or {} + content_type = body.get("contentType", "text") # 'text' | 'html' + body_content = body.get("content", "") or "" + + mime_type = "text/html" if content_type.lower() == "html" else "text/plain" + + headers = ( + hdr("From", from_str) + + hdr("To", addrs("toRecipients")) + + hdr("Cc", addrs("ccRecipients")) + + hdr("Subject", subj) + + hdr("Date", sent) + + f"Content-Type: {mime_type}; charset=utf-8\r\n" + + "MIME-Version: 1.0\r\n" + + "\r\n" + ) + return (headers + body_content).encode("utf-8", errors="replace") + + +def fetch_attachment_smart(mailbox: str, graph_message_id: str, + attachment_id: str, odata_type: str = "") -> tuple: + """Smart fetch: rozezna typ prilohy a vrati (content_bytes, type_str, extra). 
+ type_str: 'file' | 'item' | 'reference' | FETCH_MISSING | FETCH_REFERENCE + extra: pri 'reference' = sourceUrl; pri 'item' = puvodni subject (info) + Vraci (None, FETCH_MISSING, None) pri 404. + Vyhazuje exception pri jinych failures po vycerpani retry. + """ + base = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments/{attachment_id}" + + # Zname typ → optimalni cesta + if odata_type == "#microsoft.graph.fileAttachment": + r = _graph_request("GET", base + "/$value", stream=True, timeout=120) + if r.status_code == 404: + return (None, FETCH_MISSING, None) + r.raise_for_status() + return (r.content, "file", None) + + if odata_type == "#microsoft.graph.itemAttachment": + r = _graph_request("GET", base, + params={"$expand": "microsoft.graph.itemAttachment/item"}, + timeout=60) + if r.status_code == 404: + return (None, FETCH_MISSING, None) + r.raise_for_status() + obj = r.json() + item = obj.get("item") or {} + return (_build_eml_from_item(item), "item", item.get("subject")) + + if odata_type == "#microsoft.graph.referenceAttachment": + r = _graph_request("GET", base, timeout=30) + if r.status_code == 404: + return (None, FETCH_MISSING, None) + r.raise_for_status() + obj = r.json() + return (None, FETCH_REFERENCE, obj.get("sourceUrl") or obj.get("name")) + + # Neznamy typ — zkus $value, pri 405 detekuj typ a rekurzivne zpracuj + r = _graph_request("GET", base + "/$value", stream=True, timeout=120) + if r.status_code == 404: + return (None, FETCH_MISSING, None) + if r.status_code == 405: + # Method Not Allowed -> neni fileAttachment; zjisti typ + r2 = _graph_request("GET", base, timeout=30) + if r2.status_code == 404: + return (None, FETCH_MISSING, None) + r2.raise_for_status() + obj = r2.json() + ot = obj.get("@odata.type", "") + if ot == "#microsoft.graph.itemAttachment": + # objekt nema item bez expand → druhy request + return fetch_attachment_smart(mailbox, graph_message_id, attachment_id, ot) + if ot == "#microsoft.graph.referenceAttachment": + 
def find_graph_att(att_name: str, att_size: int, graph_atts: list[dict]) -> Optional[dict]:
    """Fallback lookup of an attachment in a Graph listing by name.

    Used for emails whose stored attachment has no ``graph_att_id``. Matching
    is tried in tiers, and the first tier that has any candidate decides:

    1. exact name match,
    2. match on ``normalize_name()`` of both names,
    3. Graph's normalized name ends with the last 20 characters of the
       wanted normalized name.

    Within a tier, when several entries match (e.g. two attachments with the
    same file name), the first one whose ``size`` is within 10 % of
    ``att_size`` is preferred; if sizes are unknown or none is close, the
    first candidate is returned. Previously the size comparison sat in a tier
    of its own that could never be reached, so duplicates always resolved to
    the first entry.

    Entries with a missing or null ``name`` are treated as having an empty
    name instead of raising.

    Args:
        att_name: File name stored with the email document.
        att_size: Stored size in bytes; ``0``/``None`` means unknown.
        graph_atts: Attachment metadata dicts from the Graph listing.

    Returns:
        The matching Graph attachment dict, or ``None``.
    """
    if not graph_atts:
        return None

    wanted = att_name or ""

    def size_close(ga: dict) -> bool:
        ga_size = ga.get("size") or 0
        if not att_size or not ga_size:
            return True  # nothing to compare - do not reject on size
        return abs(ga_size - att_size) / max(ga_size, att_size) < 0.1

    def pick(candidates: list[dict]) -> Optional[dict]:
        for ga in candidates:
            if size_close(ga):
                return ga
        return candidates[0] if candidates else None

    # Tier 1: exact name.
    hit = pick([ga for ga in graph_atts if (ga.get("name") or "") == wanted])
    if hit is not None:
        return hit

    # Tier 2: normalized name (case, diacritics, punctuation ignored).
    norm_want = normalize_name(wanted)
    normed = [(ga, normalize_name(ga.get("name") or "")) for ga in graph_atts]
    hit = pick([ga for ga, norm in normed if norm == norm_want])
    if hit is not None:
        return hit

    # Tier 3: suffix match on the last 20 normalized characters.
    tail = norm_want[-20:]
    if tail:
        return pick([ga for ga, norm in normed if norm.endswith(tail)])
    return None
def save_attachment(content: bytes, original_name: str, mime_type: str,
                    mailbox: str, att_dir: Path, col_index) -> tuple[str, str, bool]:
    """Store attachment bytes once per content hash and record them in the index.

    If the index already has a document whose ``_id`` equals the content hash,
    only its ``ref_count`` is incremented and nothing is written to disk.
    Otherwise the file is written into ``att_dir`` under a sanitized,
    collision-resolved name and a new index document is inserted.

    Args:
        content: Raw attachment bytes.
        original_name: File name as stored with the email.
        mime_type: MIME type recorded in the index document.
        mailbox: Mailbox recorded in the index document.
        att_dir: Directory the file is written to.
        col_index: Index collection (``find_one`` / ``update_one`` /
            ``insert_one`` are used).

    Returns:
        ``(hash, local_path, was_new)``. For a new file ``local_path`` is the
        bare file name; for a duplicate it is whatever the existing index
        document holds.
    """
    digest = sha256(content)
    by_hash = {"_id": digest}

    known = col_index.find_one(by_hash)
    if known:
        # Same content seen before: count the reference, keep the stored file.
        col_index.update_one(by_hash, {"$inc": {"ref_count": 1}})
        return digest, known["local_path"], False

    sanitized = safe_filename(original_name)
    stored_name = resolve_filename(sanitized, att_dir, digest, col_index)
    (att_dir / stored_name).write_bytes(content)

    index_doc = {
        "_id": digest,
        "filename": stored_name,
        "local_path": stored_name,
        "size_bytes": len(content),
        "mime_type": mime_type,
        "mailbox": mailbox,
        # Timestamp taken in UTC, stored without tzinfo.
        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
        "ref_count": 1,
    }
    col_index.insert_one(index_doc)
    return digest, stored_name, True
"dup": 0, "skip": 0, + "miss": 0, "ref": 0, "err": 0, "elapsed": 0.0} + + cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1}) + if args.limit: + cursor = cursor.limit(args.limit) + + ok_count = 0 + new_count = 0 + dup_count = 0 + skip_count = 0 + miss_count = 0 + ref_count = 0 + err_count = 0 + email_i = 0 + batch = [] + + def flush(): + if not batch: + return + try: + col_emails.bulk_write(batch, ordered=False) + except Exception as e: + logging.error("bulk_write: %s", e) + print(f" CHYBA bulk_write: {e}") + batch.clear() + + for email_doc in cursor: + email_i += 1 + email_id = email_doc["_id"] + graph_id = email_doc.get("graph_id", "") + subject = (email_doc.get("subject") or "")[:60] + att_list = email_doc.get("attachments") or [] + + real_atts = [a for a in att_list if not a.get("is_inline", False) + and not a.get("attachment_missing") + and not a.get("attachment_reference")] + if not real_atts: + continue + + print(f"\n {email_i:>5}/{total} {subject}") + + need_listing = any( + not a.get("is_inline", False) + and not (not args.force_recheck and a.get("file_hash")) + and not a.get("graph_att_id") + for a in att_list + ) + graph_atts = fetch_message_attachments(mailbox, graph_id) if need_listing else [] + + # mapa graph_att_id -> @odata.type (z listingu pokud byl) + type_map = {ga["id"]: ga.get("@odata.type", "") for ga in graph_atts} + + updated_atts = list(att_list) + email_ok = True + + for i, att in enumerate(updated_atts): + if att.get("is_inline", False): + continue + if att.get("attachment_missing") or att.get("attachment_reference"): + continue + if not args.force_recheck and att.get("file_hash"): + continue + + att_name = att.get("filename", "") + att_size = att.get("size_bytes", 0) + graph_att_id = att.get("graph_att_id") + + if Path(att_name).suffix.lower() in SKIP_EXTENSIONS: + updated_atts[i] = {**att, "file_hash": "skip", "local_path": ""} + skip_count += 1 + print(f" SKIP {att_name} (S/MIME)") + continue + + 
# Resolve graph_att_id + odata_type + resolved_id = graph_att_id + odata_type = type_map.get(graph_att_id, "") if graph_att_id else "" + + if not resolved_id: + # Fallback: name matching (legacy) + graph_att = find_graph_att(att_name, att_size, graph_atts) + if not graph_att: + logging.error("attachment not found [email=%s att=%s]", email_id, att_name) + print(f" ERR {att_name} (nenalezeno)") + err_count += 1 + email_ok = False + continue + if graph_att.get("isInline", False): + updated_atts[i] = {**att, "is_inline": True, "file_hash": "skip", "local_path": ""} + skip_count += 1 + print(f" SKIP {att_name} (inline obrazek)") + continue + resolved_id = graph_att["id"] + odata_type = graph_att.get("@odata.type", "") + + # Smart fetch + try: + content, kind, extra = fetch_attachment_smart( + mailbox, graph_id, resolved_id, odata_type + ) + except Exception as e: + logging.error("fetch_attachment_smart failed [msg=%s att=%s type=%s]: %s", + graph_id, resolved_id, odata_type, e) + err_count += 1 + email_ok = False + print(f" ERR {att_name} (stazeni selhalo)") + continue + + now_utc = datetime.now(timezone.utc).replace(tzinfo=None) + + if kind == FETCH_MISSING: + updated_atts[i] = { + **att, + "attachment_missing": True, + "attachment_missing_at": now_utc, + } + miss_count += 1 + print(f" MISS {att_name} (404 — oznaceno jako missing)") + continue + + if kind == FETCH_REFERENCE: + updated_atts[i] = { + **att, + "attachment_reference": True, + "attachment_type": "reference", + "reference_url": extra, + } + ref_count += 1 + print(f" REF {att_name} -> {extra}") + continue + + # kind in ('file', 'item') — mame bytes + mime_type = att.get("mime_type") or ( + "message/rfc822" if kind == "item" else "application/octet-stream" + ) + + # Pro itemAttachment vyrobime .eml priponu pokud chybi + save_name = att_name + if kind == "item" and not save_name.lower().endswith(".eml"): + save_name = (save_name or "embedded_email") + ".eml" + + hash_val, local_path, was_new = save_attachment( 
+ content, save_name, mime_type, mailbox, att_dir, col_index + ) + + updated_atts[i] = { + **att, + "file_hash": hash_val, + "local_path": local_path, + "attachment_type": kind, + } + + if was_new: + new_count += 1 + tag = "NEW(eml)" if kind == "item" else "NEW" + print(f" {tag} {local_path} ({len(content):,} B)") + else: + dup_count += 1 + print(f" DUP {att_name} -> {local_path}") + + if email_ok: + ok_count += 1 + + batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}})) + + if len(batch) >= BATCH_SIZE: + flush() + + if email_i % 100 == 0: + elapsed = (datetime.now() - start).total_seconds() + print(f" {'─'*60}") + print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} " + f"skip={skip_count} miss={miss_count} ref={ref_count} err={err_count}") + print(f" {'─'*60}") + + flush() + + elapsed = (datetime.now() - start).total_seconds() + print(f" -> mailbox total: emaily={ok_count} nove={new_count} dup={dup_count} " + f"skip={skip_count} miss={miss_count} ref={ref_count} err={err_count} ({elapsed:.1f} s)") + return {"mailbox": mailbox, "ok": ok_count, "new": new_count, "dup": dup_count, + "skip": skip_count, "miss": miss_count, "ref": ref_count, "err": err_count, + "elapsed": elapsed} + + +def discover_mailboxes(db) -> list[str]: + out = [] + for name in sorted(db.list_collection_names()): + if name in NON_MAILBOX_COLLECTIONS: + continue + if name in SKIP_MAILBOXES: + print(f" [skip] {name} — v SKIP_MAILBOXES (neni Graph pristup)") + continue + out.append(name) + return out + + +def main(): + ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}") + ap.add_argument("--mailbox", default="", + help="Emailova schranka. 
Bez argumentu projede vsechny schranky.") + ap.add_argument("--limit", type=int, default=0, + help="Zpracovat max N emailu (0 = vse) — per schranka") + ap.add_argument("--force-recheck", action="store_true", + help="Znovu overi i emaily kde prilohy uz maji file_hash / missing / reference") + ap.add_argument("--no-indexes", action="store_true", + help="Nevytvorit indexy na attachments_index kolekci") + args = ap.parse_args() + + start_all = datetime.now() + print(f"=== download_attachments v{SCRIPT_VERSION} ===") + print(f"Start: {start_all.strftime('%Y-%m-%d %H:%M:%S')}") + + print("\nPřipojuji se k Graph API...") + try: + get_token() + print(" Graph API OK") + except Exception as e: + print(f" CHYBA: {e}") + sys.exit(1) + + client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) + try: + client.admin.command("ping") + print(" MongoDB OK") + except Exception as e: + print(f" CHYBA: MongoDB neni dostupna -- {e}") + sys.exit(1) + + col_index = client[MONGO_DB][MONGO_COL_INDEX] + if not args.no_indexes: + col_index.create_index("filename") + col_index.create_index("mime_type") + col_index.create_index("mailbox") + + db = client[MONGO_DB] + if args.mailbox: + if args.mailbox in SKIP_MAILBOXES: + print(f" CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.") + sys.exit(2) + mailboxes = [args.mailbox] + else: + mailboxes = discover_mailboxes(db) + print(f" Schranky ke zpracovani: {len(mailboxes)}") + for m in mailboxes: + print(f" {m}") + + results = [] + for mb in mailboxes: + try: + results.append(process_mailbox(client, mb, args)) + except Exception as e: + logging.error("process_mailbox %s: %s", mb, e) + print(f" FATAL pri zpracovani {mb}: {e}") + results.append({"mailbox": mb, "ok": 0, "new": 0, "dup": 0, + "skip": 0, "miss": 0, "ref": 0, "err": 1, "elapsed": 0.0}) + + elapsed_total = (datetime.now() - start_all).total_seconds() + files_total = col_index.count_documents({}) + size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, 
{"size_bytes": 1})) + + grand = {k: sum(r.get(k, 0) for r in results) + for k in ("ok", "new", "dup", "skip", "miss", "ref", "err")} + + print(f"\n{'='*60}") + print("=== SHRNUTI ===") + for r in results: + print(f" {r['mailbox']:40} ok={r['ok']:>5} nove={r['new']:>4} " + f"dup={r['dup']:>4} skip={r['skip']:>3} miss={r.get('miss',0):>3} " + f"ref={r.get('ref',0):>3} err={r['err']:>3}") + print(f" {'TOTAL':40} ok={grand['ok']:>5} nove={grand['new']:>4} " + f"dup={grand['dup']:>4} skip={grand['skip']:>3} miss={grand['miss']:>3} " + f"ref={grand['ref']:>3} err={grand['err']:>3}") + print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)") + print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s") + print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + if grand['err']: + print(f"Chyby logovany do: {LOG_FILE}") + + client.close() + return 1 if grand['err'] > 0 else 0 + + +if __name__ == "__main__": + sys.exit(main() or 0) diff --git a/Python-runner/4_unwrap_smime_v1.0.md b/Python-runner/4_unwrap_smime_v1.0.md new file mode 100644 index 0000000..db8e420 --- /dev/null +++ b/Python-runner/4_unwrap_smime_v1.0.md @@ -0,0 +1,63 @@ +# 4_unwrap_smime_v1.0.py + +**Krok 4 pipeline** — rozbalení S/MIME wrapper zpráv. Některé emaily (Datová schránka, mBank, ComGate, PayU, PostSignum …) mají viditelné tělo jen *"This is an S/MIME signed message"* — skutečný obsah je zabalený uvnitř přílohy `smime.p7m`. 
+ +Skript najde tyto emaily, stáhne binárku `smime.p7m` z Graphu, rozbalí PKCS7 SignedData (`asn1crypto.cms`), extrahuje vnitřní MIME zprávu a doplní do Mongo: + +| Pole | Obsah | +|---|---| +| `smime_unwrapped: True` | flag — už rozbaleno | +| `smime_subject` | Subject z vnitřní MIME hlavičky | +| `smime_body_text` | plain text vnitřního těla | +| `smime_body_html` | HTML vnitřního těla (pokud je) | +| `smime_inner_attachments[]` | `{filename, content_type, size_bytes}` vnitřních příloh | + +## POZOR: `smime.p7m` vs `smime.p7s` + +| Příloha | Co to je | Skript dělá | +|---|---|---| +| `smime.p7m` | **Enveloped wrapper** kolem celé MIME zprávy | **Rozbalí** | +| `smime.p7s` | **Detached signature** vedle čistého emailu (tělo je normálně dostupné) | **Ignoruje** — není co rozbalovat | + +Filtr ve skriptu (`SMIME_FILTER`) je proto explicitně `^smime\.p7m$`. Email s přílohou `smime.p7s` a `smime_unwrapped != True` je **správný stav**. + +## Argumenty + +| Argument | Povinný | Hodnoty | Default | Popis | +|---|---|---|---|---| +| `--mailbox` | ne | e-mail | (všechny) | Jedna konkrétní schránka. Bez argumentu projede všechny kolekce v `emaily` mimo `SKIP_COLLECTIONS` (`attachments_index`, `sync_state`) a `SKIP_MAILBOXES`. | +| `--limit N` | ne | int | (bez limitu) | Limit emailů na schránku (test) | + +## SKIP_MAILBOXES (hardcoded) + +| Schránka | Důvod | +|---|---| +| `vbuzalka@its.jnj.com` | JNJ tenant, nemáme Graph API přístup. Při běhu bez `--mailbox` se tiše přeskočí. S explicitním `--mailbox vbuzalka@its.jnj.com` skript skončí exit kódem 2. 
| + +## Varianty volání + +```bash +# Všechny schránky (mimo SKIP_MAILBOXES): +docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py + +# Jedna schránka: +docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py --mailbox ordinace@buzalkova.cz + +# Test 10 emailů: +docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py --mailbox ordinace@buzalkova.cz --limit 10 + +# Plný běh na pozadí, log do souboru: +docker exec -d python-runner bash -c "python /scripts/4_unwrap_smime_v1.0.py > /scripts/unwrap_smime.log 2>&1" +``` + +## Závislosti + +```bash +docker exec python-runner pip install asn1crypto +``` + +## Sledování průběhu + +```bash +docker exec -it python-runner tail -f /scripts/unwrap_smime.log +``` diff --git a/Python-runner/4_unwrap_smime_v1.0.py b/Python-runner/4_unwrap_smime_v1.0.py new file mode 100644 index 0000000..6d79340 --- /dev/null +++ b/Python-runner/4_unwrap_smime_v1.0.py @@ -0,0 +1,445 @@ +""" +============================================================================== +Skript: unwrap_smime_v1.0.py +Verze: 1.0 +Datum: 2026-06-03 +Autor: vladimir.buzalka + +Popis: + Najde v Mongo emaily s prilohou smime.p7m (S/MIME signed-data), + stahne binarni obsah prilohy z Microsoft Graph API, rozbali PKCS7 + SignedData (CMS), extrahuje vnitrni MIME message, a ulozi do Mongo: + - smime_unwrapped: True + - smime_body_text : plain text vnitrniho tela + - smime_body_html : HTML vnitrniho tela (kdyz je) + - smime_subject : Subject vnitrni MIME hlavicky + - smime_inner_attachments : [{filename, content_type, size_bytes}] + + Tyto pole pak pouzije enrich_fulltext_emails_v1.2 a doplni jejich + obsah do PG fulltext indexu. 
+ + Typicke S/MIME odesilatele: + notifikace@mojedatovaschranka.cz (844 emailu) + kontakt@mbank.cz (226) + payments@comgate.cz, service@payu.com (~250) + info.postsignum@cpost.cz + +Architekturalni poznamka: + S/MIME priloha smime.p7m ma Content-Type application/pkcs7-mime + s parametrem smime-type=signed-data. Vnitrni obsah je v PKCS7 + ContentInfo -> SignedData -> encapContentInfo.eContent. To uz je + primo MIME zprava (multipart nebo single body). + +Zavislosti (instalovat v kontejneru): + pip install asn1crypto + +Spusteni: + python unwrap_smime_v1.0.py # vsechny schranky (mimo SKIP_MAILBOXES) + python unwrap_smime_v1.0.py --mailbox vladimir.buzalka@buzalka.cz + python unwrap_smime_v1.0.py --limit 10 # test + +SKIP_MAILBOXES (hardcoded): + vbuzalka@its.jnj.com — JNJ tenant, nemame Graph API pristup. Pri behu + bez --mailbox se tise preskoci, s --mailbox skript + skonci s exit kodem 2. +============================================================================== +""" + +from __future__ import annotations + +import argparse +import email +import email.policy +import logging +import sys +import time +import traceback +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional + +import msal +import requests +from asn1crypto import cms +from pymongo import MongoClient, UpdateOne + +if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(encoding="utf-8", errors="replace") + +# --- konfigurace ------------------------------------------------------------ +GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9" +GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f" +GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk" +GRAPH_URL = "https://graph.microsoft.com/v1.0" + +MONGO_URI = "mongodb://192.168.1.76:27017" +MONGO_DB = "emaily" +SKIP_COLLECTIONS = {"attachments_index", "sync_state"} + +# Schranky kde NEMAME Graph API pristup — pri bezne behu se preskocia. 
+SKIP_MAILBOXES = { + "vbuzalka@its.jnj.com", # JNJ tenant — nemame Graph credentials +} + +MAX_BODY_BYTES = 2 * 1024 * 1024 # 2 MB strop pro extrahovany text +BATCH_SIZE = 25 +LOG_FILE = Path(__file__).parent / "unwrap_smime_errors.log" + +logging.basicConfig( + filename=str(LOG_FILE), + level=logging.ERROR, + format="%(asctime)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + encoding="utf-8", +) + + +# --- Graph auth ------------------------------------------------------------- +_token: Optional[str] = None + + +def get_token() -> str: + global _token + app = msal.ConfidentialClientApplication( + GRAPH_CLIENT_ID, + authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}", + client_credential=GRAPH_CLIENT_SECRET, + ) + res = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"]) + if "access_token" not in res: + raise RuntimeError(f"Graph auth failed: {res}") + _token = res["access_token"] + return _token + + +def graph_get_raw(url: str) -> Optional[bytes]: + """GET na Graph endpoint, vraci raw bytes (pro $value attachment endpoint).""" + global _token + if not _token: + get_token() + for attempt in range(3): + try: + r = requests.get(url, headers={"Authorization": f"Bearer {_token}"}, timeout=60) + if r.status_code == 401: + get_token(); continue + if r.status_code == 404: + return None + if r.status_code == 429: + wait = int(r.headers.get("Retry-After", "5")) + time.sleep(wait); continue + r.raise_for_status() + return r.content + except requests.RequestException: + if attempt == 2: + raise + time.sleep(2) + return None + + +# --- PKCS7 / MIME unwrap ---------------------------------------------------- + +def extract_inner_mime(content_bytes: bytes) -> bytes: + """Z S/MIME prilohy vytahne vnitrni MIME (signed content) jako bytes. + + Dva formaty se v Graph API vyskytuji: + A) multipart/signed (detached signature) - bytes zacinaji 'Content-Type: multipart/signed'. 
+ Obsah je rovnou citelny v prvni MIME casti (druha cast je oddeleny PKCS7 podpis). + B) application/pkcs7-mime (opaque, smime-type=signed-data) - vnitrni MIME je + schovany uvnitr PKCS7 SignedData -> encapContentInfo.eContent. + + Vraci raw MIME bytes pro pripravu pro email.message_from_bytes. + """ + head = content_bytes[:300].lower() + + # A) multipart/signed (detached) - nejcastejsi pro maily z Graphu + if b"content-type:" in head and b"multipart/signed" in head: + try: + outer = email.message_from_bytes(content_bytes, policy=email.policy.default) + except Exception as e: + raise RuntimeError(f"MIME parse failed: {e}") + # iteruj parts - prvni non-signature je signed payload + signed_payload = None + if outer.is_multipart(): + for part in outer.iter_parts(): + ct = (part.get_content_type() or "").lower() + if "pkcs7-signature" in ct or "x-pkcs7-signature" in ct: + continue + signed_payload = part + break + if signed_payload is None: + raise RuntimeError("multipart/signed: no signed payload found") + return signed_payload.as_bytes() + + # B) opaque PKCS7 SignedData - DER nebo base64 + data = content_bytes + try: + ci = cms.ContentInfo.load(data) + except Exception: + try: + import base64 + stripped = b"".join(line for line in data.splitlines() + if not line.startswith(b"-----")) + data = base64.b64decode(stripped, validate=False) + ci = cms.ContentInfo.load(data) + except Exception as e: + raise RuntimeError(f"PKCS7/MIME parse failed: {e}") + + if ci["content_type"].native != "signed_data": + raise RuntimeError(f"Not signed-data, got {ci['content_type'].native}") + sd = ci["content"] + inner = sd["encap_content_info"]["content"] + if inner is None: + raise RuntimeError("encapContentInfo.content is null (detached without MIME wrapper)") + return bytes(inner.native) if hasattr(inner, "native") else bytes(inner) + + +def parse_inner_mime(mime_bytes: bytes) -> dict: + """Z MIME bytes vytahne text, html a prilohy.""" + msg = email.message_from_bytes(mime_bytes, 
policy=email.policy.default) + + text_parts: list[str] = [] + html_parts: list[str] = [] + inner_attachments: list[dict] = [] + + def walk(part): + ctype = part.get_content_type() + disp = (part.get_content_disposition() or "").lower() + filename = part.get_filename() + + if part.is_multipart(): + for sub in part.iter_parts(): + walk(sub) + return + + if disp == "attachment" or filename: + try: + payload = part.get_content() + if isinstance(payload, str): + payload_bytes = payload.encode("utf-8", errors="replace") + elif isinstance(payload, bytes): + payload_bytes = payload + else: + payload_bytes = b"" + size = len(payload_bytes) + except Exception: + size = 0 + inner_attachments.append({ + "filename": filename or "(unnamed)", + "content_type": ctype, + "size_bytes": size, + }) + return + + if ctype == "text/plain": + try: + text_parts.append(part.get_content()) + except Exception: + try: + text_parts.append(part.get_payload(decode=True).decode( + part.get_content_charset() or "utf-8", errors="replace")) + except Exception: + pass + elif ctype == "text/html": + try: + html_parts.append(part.get_content()) + except Exception: + try: + html_parts.append(part.get_payload(decode=True).decode( + part.get_content_charset() or "utf-8", errors="replace")) + except Exception: + pass + + walk(msg) + + body_text = "\n\n".join(t.strip() for t in text_parts if t and t.strip()) + body_html = "\n".join(h for h in html_parts if h and h.strip()) + if len(body_text) > MAX_BODY_BYTES: + body_text = body_text[:MAX_BODY_BYTES] + if len(body_html) > MAX_BODY_BYTES: + body_html = body_html[:MAX_BODY_BYTES] + + return { + "subject": str(msg.get("Subject") or "").strip(), + "from": str(msg.get("From") or "").strip(), + "to": str(msg.get("To") or "").strip(), + "date": str(msg.get("Date") or "").strip(), + "body_text": body_text or None, + "body_html": body_html or None, + "inner_attachments": inner_attachments, + } + + +# --- hlavni smycka 
---------------------------------------------------------- + +SMIME_FILTER = { + "$and": [ + {"attachments.filename": {"$regex": "^smime\\.p7m$", "$options": "i"}}, + {"smime_unwrapped": {"$ne": True}}, + ] +} + + +def find_p7m_graph_att_id(doc: dict) -> Optional[str]: + for att in doc.get("attachments") or []: + if (att.get("filename") or "").lower() == "smime.p7m": + return att.get("graph_att_id") + return None + + +def process_mailbox(col, mailbox: str, limit: Optional[int]) -> dict: + total = col.count_documents(SMIME_FILTER) + print(f"[{mailbox}] S/MIME k rozbaleni: {total}" + + (f" (limit {limit})" if limit else "")) + if total == 0: + return {"mailbox": mailbox, "candidates": 0, "unwrapped": 0, + "errors": 0, "no_att_id": 0, "missing": 0, + "with_inner_att": 0, "inner_att_total": 0} + + cursor = col.find(SMIME_FILTER, {"_id": 1, "graph_id": 1, "attachments": 1}, + no_cursor_timeout=True) + if limit: + cursor = cursor.limit(limit) + + n = unwrapped = err = no_att_id = missing = with_inner = inner_total = 0 + bulk: list[UpdateOne] = [] + + try: + for doc in cursor: + n += 1 + mid = doc["_id"] + gid = doc.get("graph_id") + att_id = find_p7m_graph_att_id(doc) + if not gid or not att_id: + no_att_id += 1 + continue + + url = f"{GRAPH_URL}/users/{mailbox}/messages/{gid}/attachments/{att_id}/$value" + try: + p7m_bytes = graph_get_raw(url) + except Exception as e: + err += 1 + logging.error("[%s] graph fetch %s: %s", mailbox, gid, e) + bulk.append(UpdateOne({"_id": mid}, {"$set": { + "smime_unwrapped": False, + "smime_error": f"fetch: {type(e).__name__}: {e}"[:300], + "smime_processed_at": datetime.now(timezone.utc).replace(tzinfo=None), + }})) + continue + if p7m_bytes is None: + missing += 1 + bulk.append(UpdateOne({"_id": mid}, {"$set": { + "smime_unwrapped": False, + "smime_error": "attachment_404", + "smime_processed_at": datetime.now(timezone.utc).replace(tzinfo=None), + }})) + continue + + try: + inner_bytes = extract_inner_mime(p7m_bytes) + parsed = 
parse_inner_mime(inner_bytes) + except Exception as e: + err += 1 + logging.error("[%s] unwrap %s: %s", mailbox, mid, e) + bulk.append(UpdateOne({"_id": mid}, {"$set": { + "smime_unwrapped": False, + "smime_error": f"unwrap: {type(e).__name__}: {e}"[:300], + "smime_processed_at": datetime.now(timezone.utc).replace(tzinfo=None), + }})) + continue + + inner_atts = parsed["inner_attachments"] + inner_total += len(inner_atts) + if inner_atts: + with_inner += 1 + + update = { + "smime_unwrapped": True, + "smime_processed_at": datetime.now(timezone.utc).replace(tzinfo=None), + "smime_body_text": parsed["body_text"], + "smime_body_html": parsed["body_html"], + "smime_subject": parsed["subject"], + "smime_from": parsed["from"], + "smime_to": parsed["to"], + "smime_date": parsed["date"], + "smime_inner_attachments": inner_atts, + "smime_error": None, + } + bulk.append(UpdateOne({"_id": mid}, {"$set": update})) + unwrapped += 1 + + if len(bulk) >= BATCH_SIZE: + col.bulk_write(bulk, ordered=False) + bulk.clear() + + if n % 50 == 0 or n == 1: + print(f" [{n:>5}/{total}] unwrapped={unwrapped} err={err} " + f"no_att_id={no_att_id} missing={missing} " + f"inner_atts_total={inner_total}", flush=True) + finally: + cursor.close() + if bulk: + col.bulk_write(bulk, ordered=False) + + print(f" [{n}/{total}] DONE unwrapped={unwrapped} err={err} " + f"no_att_id={no_att_id} missing={missing} " + f"with_inner_atts={with_inner} inner_atts_total={inner_total}") + return {"mailbox": mailbox, "candidates": total, "unwrapped": unwrapped, + "errors": err, "no_att_id": no_att_id, "missing": missing, + "with_inner_att": with_inner, "inner_att_total": inner_total} + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--mailbox", help="Jedna konkretni schranka (default: vsechny)") + ap.add_argument("--limit", type=int, help="Limit emailu na schranku (test)") + args = ap.parse_args() + + t0 = time.time() + print("Pripojuji se k MongoDB...") + mongo = MongoClient(MONGO_URI, 
serverSelectionTimeoutMS=5000) + mongo.admin.command("ping") + db = mongo[MONGO_DB] + + print("Token Graph API...") + get_token() + print("OK\n") + + if args.mailbox: + if args.mailbox in SKIP_MAILBOXES: + print(f"CHYBA: {args.mailbox} je v SKIP_MAILBOXES — neni Graph pristup.") + return 2 + mailboxes = [args.mailbox] + else: + mailboxes = [] + for c in db.list_collection_names(): + if c in SKIP_COLLECTIONS: + continue + if c in SKIP_MAILBOXES: + print(f" [skip] {c} — v SKIP_MAILBOXES (neni Graph pristup)") + continue + mailboxes.append(c) + print(f"Schranky ({len(mailboxes)}): {mailboxes}\n") + + results = [] + for mb in mailboxes: + results.append(process_mailbox(db[mb], mb, limit=args.limit)) + print() + + print("=== SHRNUTI ===") + for r in results: + print(f" {r['mailbox']}: candidates={r['candidates']} unwrapped={r['unwrapped']} " + f"errors={r['errors']} no_att_id={r['no_att_id']} missing={r['missing']} " + f"with_inner_atts={r['with_inner_att']} inner_atts_total={r['inner_att_total']}") + print(f"\nCelkem trvalo: {time.time() - t0:.1f} s") + total_errors = sum(r.get("errors", 0) for r in results) + return 1 if total_errors > 0 else 0 + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except KeyboardInterrupt: + print("\nPreruseno uzivatelem") + except Exception: + traceback.print_exc() + sys.exit(1) diff --git a/Python-runner/5_enrich_fulltext_emails_v1.2.md b/Python-runner/5_enrich_fulltext_emails_v1.2.md new file mode 100644 index 0000000..d6392fb --- /dev/null +++ b/Python-runner/5_enrich_fulltext_emails_v1.2.md @@ -0,0 +1,47 @@ +# 5_enrich_fulltext_emails_v1.2.py + +**Krok 5 pipeline** — vytáhne plný text z emailů v MongoDB (db: `emaily`) a uloží do PostgreSQL (db: `MongoEmaily`, tabulka: `emails`) s GIN `tsvector` indexem (config `soubory` — simple + unaccent). + +Emaily se **nestahují znovu** — tělo už je v Mongo z kroků 1/2/4. Tento skript jen vybere první dostupné tělo podle priority a pošle text do PG na fulltext. 
+ +## Priorita zdroje těla (`body_source`) + +1. **`smime`** — `smime_body_text` / `smime_body_html` (pokud unwrap proběhl) +2. **`html`** — `body_html` +3. **`text`** — `body_text` (z parse v1.4 nebo refetch v1.0) +4. **`preview`** — `body_preview` (fallback) + +Názvy vnitřních S/MIME příloh (`smime_inner_attachments`) jdou do `attachments_summary` — dohledatelné přes MCP `emaily.find_attachment`. + +## Inkrementalita + +Pokud `(mailbox, message_id)` v PG existuje, `extractor_version` je aktuální (1.2) a `modified_at` v Mongo není novější → **skip**. Při bumpu `EXTRACTOR_VERSION` se vše přeparsuje. + +## Argumenty + +| Argument | Povinný | Hodnoty | Default | Popis | +|---|---|---|---|---| +| `--mailbox` | ne | e-mail | (všechny) | Jedna konkrétní schránka | +| `--limit N` | ne | int | (bez limitu) | Limit emailů na schránku (test) | + +## Varianty volání + +```bash +# Všechny schránky: +docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py + +# Jedna schránka: +docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py --mailbox ordinace@buzalkova.cz + +# Test 500 emailů: +docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py --mailbox ordinace@buzalkova.cz --limit 500 + +# Plný běh na pozadí, log do souboru: +docker exec -d python-runner bash -c "python /scripts/5_enrich_fulltext_emails_v1.2.py > /scripts/enrich.log 2>&1" +``` + +## Sledování průběhu + +```bash +docker exec -it python-runner tail -f /scripts/enrich.log +``` diff --git a/Python-runner/5_enrich_fulltext_emails_v1.2.py b/Python-runner/5_enrich_fulltext_emails_v1.2.py new file mode 100644 index 0000000..530d19c --- /dev/null +++ b/Python-runner/5_enrich_fulltext_emails_v1.2.py @@ -0,0 +1,489 @@ +""" +============================================================================== +Skript: enrich_fulltext_emails_v1.2.py +Verze: 1.2 +Datum: 2026-06-03 +Autor: vladimir.buzalka + +Popis: + Vytahne plny text z emailu ulozenych v MongoDB 
(db: emaily) a ulozi ho do + PostgreSQL (db: MongoEmaily, tabulka: emails) s GIN tsvector indexem. + + Emaily se NESTAHUJI znovu - tela uz jsou v Mongo z parse_emails_graph_v1.4 + (a refetch_text_bodies_v1.0 pro stare plain-text emaily). + Tento skript jen vybere prvni dostupne telo a posle text do PG na fulltext. + +Zmeny proti v1.1: + - S/MIME emaily (signed-data od Datove schranky, mBank, ComGate, PayU, ...): + pokud unwrap_smime_v1.0 ulozil smime_body_text/smime_body_html, pouzije se + PREFEROVANE pred bezvyznamnym vnejsim wrapper telem ("This is an S/MIME + signed message"). Nazvy vnitrnich priloh (smime_inner_attachments) se + pridavaji do attachments_summary, tj. dohledatelne pres find_attachment. + - body_source: nova hodnota "smime" (rozbalene vnitrni telo). + - EXTRACTOR_VERSION=1.2 -> vsechny existujici emaily v PG se preparsuji. + +Zmeny v1.1 vs v1.0: + - Fallback poradi rozsireno o body_text (novy v parse_emails_graph_v1.4). + - body_source umi novou hodnotu "text" (plne plain-text telo, max 2 MB). + +Zdroj: + MongoDB 192.168.1.76 db=emaily kolekce= + (krome attachments_index) + +Cil: + PostgreSQL 192.168.1.76 db=MongoEmaily tabulka=emails + tsvector config 'soubory' (sdileny - simple + unaccent) + +Inkrementalita: + Pokud (mailbox, message_id) jiz existuje a extractor_version je aktualni + a modified_at v Mongo neni novejsi -> skip. Pri zmene verze extractoru + se vse preparsuje. 
+ +Spusteni: + python enrich_fulltext_emails_v1.0.py # vsechny schranky + python enrich_fulltext_emails_v1.0.py --mailbox vbuzalka@its.jnj.com + python enrich_fulltext_emails_v1.0.py --limit 500 # test +============================================================================== +""" + +from __future__ import annotations + +import argparse +import re +import sys +import time +import traceback +from datetime import datetime, timezone +from typing import Optional + +import psycopg +from bs4 import BeautifulSoup +from pymongo import MongoClient + +# --- konfigurace ------------------------------------------------------------ +MONGO_URI = "mongodb://192.168.1.76:27017" +MONGO_DB = "emaily" + +PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoEmaily " + "user=vladimir.buzalka password=Vlado7309208104++") + +EXTRACTOR_VERSION = "1.2" + +MAX_TEXT_BYTES = 5 * 1024 * 1024 # plain text max 5 MB +SKIP_COLLECTIONS = {"attachments_index"} + +BATCH_SIZE = 100 + + +# --- SCHEMA ----------------------------------------------------------------- + +SCHEMA_SQL = """ +CREATE EXTENSION IF NOT EXISTS unaccent; +CREATE EXTENSION IF NOT EXISTS pg_trgm; + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN + CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple ); + ALTER TEXT SEARCH CONFIGURATION soubory + ALTER MAPPING FOR hword, hword_part, word + WITH unaccent, simple; + END IF; +END$$; + +CREATE TABLE IF NOT EXISTS emails ( + id BIGSERIAL PRIMARY KEY, + mailbox TEXT NOT NULL, + message_id TEXT NOT NULL, + graph_id TEXT, + conversation_id TEXT, + folder_path TEXT, + subject TEXT, + sender_email TEXT, + sender_name TEXT, + to_addrs TEXT, + cc_addrs TEXT, + bcc_addrs TEXT, + sent_at TIMESTAMPTZ, + received_at TIMESTAMPTZ, + modified_at TIMESTAMPTZ, + is_read BOOLEAN, + is_draft BOOLEAN, + has_attachments BOOLEAN, + attachment_count INT, + attachments_summary TEXT, + body TEXT, + body_length INT, + body_source TEXT, -- 'html' | 'preview' | 
'empty' + tsv tsvector GENERATED ALWAYS AS ( + to_tsvector('soubory'::regconfig, + left( + coalesce(subject, '') || ' ' || + coalesce(sender_email, '') || ' ' || + coalesce(sender_name, '') || ' ' || + coalesce(to_addrs, '') || ' ' || + coalesce(cc_addrs, '') || ' ' || + coalesce(attachments_summary, '') || ' ' || + coalesce(body, ''), + 800000) + ) + ) STORED, + extracted_at TIMESTAMPTZ DEFAULT now(), + extractor_version TEXT, + ok BOOLEAN, + error TEXT, + UNIQUE (mailbox, message_id) +); + +CREATE INDEX IF NOT EXISTS emails_tsv_gin ON emails USING gin(tsv); +CREATE INDEX IF NOT EXISTS emails_subject_trgm ON emails USING gin(subject gin_trgm_ops); +CREATE INDEX IF NOT EXISTS emails_sender_email_idx ON emails(sender_email); +CREATE INDEX IF NOT EXISTS emails_mailbox_idx ON emails(mailbox); +CREATE INDEX IF NOT EXISTS emails_received_idx ON emails(received_at DESC); +CREATE INDEX IF NOT EXISTS emails_conv_idx ON emails(conversation_id); +""" + + +# --- HELPERY ---------------------------------------------------------------- + +_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]") +_WS_RX = re.compile(r"[ \t]+") +_NL_RX = re.compile(r"\n{3,}") + + +def _clean_for_pg(s: str) -> str: + if not s: + return "" + return _CTRL_RX.sub("", s) + + +def _truncate(s: str) -> str: + s = _clean_for_pg(s or "") + if not s: + return "" + b = s.encode("utf-8", errors="replace") + if len(b) <= MAX_TEXT_BYTES: + return s + return b[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore") + + +def html_to_text(html: str) -> str: + """Extrahuje plain text z HTML emailu. Odstrani