From 9b12745e1d100f7107cbc5751e5524ec089ef3f6 Mon Sep 17 00:00:00 2001 From: Vladimir Buzalka Date: Thu, 4 Jun 2026 07:15:17 +0200 Subject: [PATCH] notebook --- .mcp.json | 10 + Python-runner/download_attachments_v1.3.py | 483 ---------------- Python-runner/parse_emails_errors.log | 0 Python-runner/parse_emails_graph_v1.3.py | 611 --------------------- Python-runner/python_runner.md | 158 +++++- claude-memory/MEMORY.md | 4 + 6 files changed, 147 insertions(+), 1119 deletions(-) delete mode 100644 Python-runner/download_attachments_v1.3.py delete mode 100644 Python-runner/parse_emails_errors.log delete mode 100644 Python-runner/parse_emails_graph_v1.3.py diff --git a/.mcp.json b/.mcp.json index 7ce92f3..b003afd 100644 --- a/.mcp.json +++ b/.mcp.json @@ -9,6 +9,16 @@ "command": "python", "args": ["U:\\PythonProject\\Janssen\\EmailsImport\\mcp_jnjemails.py"], "cwd": "U:\\PythonProject\\Janssen\\EmailsImport" + }, + "soubory": { + "command": "U:\\janssen\\.venv\\Scripts\\python.exe", + "args": ["U:\\janssen\\soubory\\mcp_soubory.py"], + "cwd": "U:\\janssen\\soubory" + }, + "emaily": { + "command": "U:\\janssen\\.venv\\Scripts\\python.exe", + "args": ["U:\\janssen\\EmailsImport\\mcp_emaily.py"], + "cwd": "U:\\janssen\\EmailsImport" } } } diff --git a/Python-runner/download_attachments_v1.3.py b/Python-runner/download_attachments_v1.3.py deleted file mode 100644 index 15beeb1..0000000 --- a/Python-runner/download_attachments_v1.3.py +++ /dev/null @@ -1,483 +0,0 @@ -""" -download_attachments_v1.3.py -Nazev: download_attachments_v1.3.py -Verze: 1.3 -Datum: 2026-06-02 -Autor: vladimir.buzalka - -Popis: - Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB - pres Microsoft Graph API a uklada je do adresare - /mnt/Emails//Attachments/. - - Schránka se predava jako povinny parametr --mailbox. - - Deduplikace podle SHA256 hashe obsahu: - - stejny hash = soubor uz existuje -> preskoci - - prvni vyskytu souboru: ulozi pod puvodnimnazvem - - kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ... - - Po ulozeni aktualizuje MongoDB: - - v email dokumentu: kazda priloha dostane file_hash + local_path - - kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes, - mime_type, mailbox, first_seen_at, ref_count - - Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash - se preskoci. --force-recheck znovu overi i uz stazene. - - POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky! - -Spousteni: - python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz - python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 - python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --force-recheck - -Docker: - docker exec -it python-runner python /scripts/download_attachments_v1.3.py \\ - --mailbox ordinace@buzalkova.cz - -Zavislosti: - msal, requests, pymongo - Python 3.10+ - -Historie verzi: - 1.0 2026-06-02 Inicialni verze - 1.1 2026-06-02 Schránka jako parametr --mailbox - 1.2 2026-06-02 Oprava: Graph attachment mapa vcetne inline; normalizace nazvu; - preskoceni S/MIME; inline z Graphu -> SKIP ne ERR - 1.3 2026-06-02 Primarni stazeni pres graph_att_id (prime ID bez name-matchingu); - oprava $select na attachment listu (odstranen contentId ktery - zpusoboval BadRequest a vracel prazdny seznam); name-matching - zustava jako fallback pro stare emaily bez graph_att_id -""" - -import sys -import re -import hashlib -import logging -import argparse -import unicodedata -from pathlib import Path -from datetime import datetime, timezone -from typing import Optional - -import msal -import requests -from pymongo import MongoClient, UpdateOne - -if hasattr(sys.stdout, "reconfigure"): - sys.stdout.reconfigure(encoding="utf-8", errors="replace") - -# ─── KONFIGURACE ────────────────────────────────────────────────────────────── -GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9" -GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f" -GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk" -GRAPH_URL = "https://graph.microsoft.com/v1.0" - -MONGO_URI = "mongodb://192.168.1.76:27017" -MONGO_DB = "emaily" -MONGO_COL_INDEX = "attachments_index" - -EMAILS_BASE_DIR = Path("/mnt/Emails") -LOG_FILE = Path(__file__).parent / "parse_emails_errors.log" -SCRIPT_VERSION = "1.3" -BATCH_SIZE = 50 - -# Typy příloh které přeskočíme (S/MIME podpisy, certifikáty) -SKIP_EXTENSIONS = {".p7m", ".p7s", ".p7c", ".p7b"} -# ────────────────────────────────────────────────────────────────────────────── - -logging.basicConfig( - filename=str(LOG_FILE), - level=logging.ERROR, - format="%(asctime)s | %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - encoding="utf-8", -) - -_graph_token: Optional[str] = None - - -# ─── Graph API ──────────────────────────────────────────────────────────────── - -def get_token() -> str: - global _graph_token - app = msal.ConfidentialClientApplication( - GRAPH_CLIENT_ID, - authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}", - client_credential=GRAPH_CLIENT_SECRET, - ) - result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"]) - if "access_token" not in result: - raise RuntimeError(f"Graph auth failed: {result}") - _graph_token = result["access_token"] - return _graph_token - - -def graph_get_bytes(url: str) -> bytes: - global _graph_token - if not _graph_token: - get_token() - for attempt in range(2): - r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, timeout=120, stream=True) - if r.status_code == 401: - get_token() - continue - r.raise_for_status() - return r.content - raise RuntimeError(f"Graph GET bytes failed: {url}") - - -def graph_get_json(url: str, params: dict = None) -> dict: - global _graph_token - if not _graph_token: - get_token() - for attempt in range(2): - r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30) - if r.status_code == 401: - get_token() - continue - r.raise_for_status() - return r.json() - raise RuntimeError(f"Graph GET json failed: {url}") - - -def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]: - """Nacte metadata vsech priloh zpravy (bez contentBytes).""" - url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments" - try: - # Pozor: contentId NENI v base attachment type — nesmi byt v $select - data = graph_get_json(url, {"$select": "id,name,contentType,size,isInline"}) - return data.get("value", []) - except Exception as e: - logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e) - return [] - - -def fetch_attachment_content(mailbox: str, graph_message_id: str, attachment_id: str) -> Optional[bytes]: - url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments/{attachment_id}/$value" - try: - return graph_get_bytes(url) - except Exception as e: - logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s", - graph_message_id, attachment_id, e) - return None - - -# ─── Pomocné funkce ─────────────────────────────────────────────────────────── - -def normalize_name(name: str) -> str: - """Normalizuje název pro porovnání — lowercase, bez diakritiky, jen alnum+._-""" - nfkd = unicodedata.normalize("NFKD", name.lower().strip()) - ascii_str = "".join(c for c in nfkd if not unicodedata.combining(c)) - return re.sub(r"[^\w.\-]", "_", ascii_str) - - -def find_graph_att(att_name: str, att_size: int, graph_atts: list[dict]) -> Optional[dict]: - """Fallback: hleda prilohu v Graph listu podle jmena (pro emaily bez graph_att_id).""" - # 1. Presna shoda - for ga in graph_atts: - if ga["name"] == att_name: - return ga - - norm_want = normalize_name(att_name) - - # 2. Normalizovana shoda - for ga in graph_atts: - if normalize_name(ga["name"]) == norm_want: - return ga - - # 3. Normalizovana shoda + velikost (±10 %) - for ga in graph_atts: - if normalize_name(ga["name"]) == norm_want: - ga_size = ga.get("size", 0) - if att_size == 0 or ga_size == 0 or abs(ga_size - att_size) / max(ga_size, att_size) < 0.1: - return ga - - # 4. Castecna shoda sufixu (posledních 20 znaků normalizovaného jména) - for ga in graph_atts: - if norm_want[-20:] and normalize_name(ga["name"]).endswith(norm_want[-20:]): - return ga - - return None - - -def sha256(data: bytes) -> str: - return hashlib.sha256(data).hexdigest() - - -def safe_filename(name: str) -> str: - safe = "".join(c if c.isalnum() or c in "._- ()" else "_" for c in name).strip() - return safe or "attachment" - - -def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str: - existing = col_index.find_one({"filename": desired_name}) - if existing: - if existing["_id"] == hash_val: - return desired_name - stem = Path(desired_name).stem - suffix = Path(desired_name).suffix - n = 2 - while True: - candidate = f"{stem}_{n}{suffix}" - ex2 = col_index.find_one({"filename": candidate}) - if not ex2 or ex2["_id"] == hash_val: - if not (att_dir / candidate).exists() or (ex2 and ex2["_id"] == hash_val): - return candidate - n += 1 - return desired_name - - -def save_attachment( - content: bytes, - original_name: str, - mime_type: str, - mailbox: str, - att_dir: Path, - col_index, -) -> tuple[str, str, bool]: - hash_val = sha256(content) - - existing = col_index.find_one({"_id": hash_val}) - if existing: - col_index.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}}) - return hash_val, existing["local_path"], False - - filename = resolve_filename(safe_filename(original_name), att_dir, hash_val, col_index) - file_path = att_dir / filename - file_path.write_bytes(content) - - col_index.insert_one({ - "_id": hash_val, - "filename": filename, - "local_path": filename, - "size_bytes": len(content), - "mime_type": mime_type, - "mailbox": mailbox, - "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None), - "ref_count": 1, - }) - - return hash_val, filename, True - - -# ─── MAIN ───────────────────────────────────────────────────────────────────── - -def main(): - ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}") - ap.add_argument("--mailbox", required=True, - help="Emailova schranka (napr. ordinace@buzalkova.cz)") - ap.add_argument("--limit", type=int, default=0, - help="Zpracovat max N emailu (0 = vse)") - ap.add_argument("--force-recheck", action="store_true", - help="Znovu overi i emaily kde prilohy uz maji file_hash") - ap.add_argument("--no-indexes", action="store_true", - help="Nevytvorit indexy na attachments_index kolekci") - args = ap.parse_args() - - mailbox = args.mailbox - att_dir = EMAILS_BASE_DIR / mailbox / "Attachments" - mongo_col = mailbox - - start = datetime.now() - print(f"=== download_attachments v{SCRIPT_VERSION} ===") - print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}") - print(f"Schránka: {mailbox}") - print(f"Cilovy adresar: {att_dir}") - print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}") - - att_dir.mkdir(parents=True, exist_ok=True) - print(" Adresar OK") - - print("\nPřipojuji se k Graph API...") - try: - get_token() - print(" Graph API OK") - except Exception as e: - print(f" CHYBA: {e}") - sys.exit(1) - - client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) - try: - client.admin.command("ping") - print(" MongoDB OK") - except Exception as e: - print(f" CHYBA: MongoDB neni dostupna -- {e}") - sys.exit(1) - - col_emails = client[MONGO_DB][mongo_col] - col_index = client[MONGO_DB][MONGO_COL_INDEX] - - if not args.no_indexes: - col_index.create_index("filename") - col_index.create_index("mime_type") - col_index.create_index("mailbox") - - if args.force_recheck: - query = {"has_attachments": True} - else: - query = { - "has_attachments": True, - "attachments": { - "$elemMatch": { - "is_inline": False, - "file_hash": {"$exists": False}, - } - } - } - - total = col_emails.count_documents(query) - print(f"\nEmailu ke zpracovani: {total}") - if total == 0: - print("Neni co stahnout.") - client.close() - return - - cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1}) - if args.limit: - cursor = cursor.limit(args.limit) - - ok_count = 0 - new_count = 0 - dup_count = 0 - skip_count = 0 - err_count = 0 - email_i = 0 - batch = [] - - def flush(): - if not batch: - return - try: - col_emails.bulk_write(batch, ordered=False) - except Exception as e: - logging.error("bulk_write: %s", e) - print(f" CHYBA bulk_write: {e}") - batch.clear() - - for email_doc in cursor: - email_i += 1 - email_id = email_doc["_id"] - graph_id = email_doc.get("graph_id", "") - subject = (email_doc.get("subject") or "")[:60] - att_list = email_doc.get("attachments") or [] - - real_atts = [a for a in att_list if not a.get("is_inline", False)] - if not real_atts: - continue - - print(f"\n {email_i:>5}/{total} {subject}") - - # Nacti attachment list z Graphu jen pokud nektere prilohy nemaji graph_att_id - need_listing = any( - not a.get("is_inline", False) - and not (not args.force_recheck and a.get("file_hash")) - and not a.get("graph_att_id") - for a in att_list - ) - graph_atts = fetch_message_attachments(mailbox, graph_id) if need_listing else [] - - updated_atts = list(att_list) - email_ok = True - - for i, att in enumerate(updated_atts): - if att.get("is_inline", False): - continue - if not args.force_recheck and att.get("file_hash"): - continue - - att_name = att.get("filename", "") - att_size = att.get("size_bytes", 0) - graph_att_id = att.get("graph_att_id") - - # Preskoc S/MIME podpisy - if Path(att_name).suffix.lower() in SKIP_EXTENSIONS: - updated_atts[i] = {**att, "file_hash": "skip", "local_path": ""} - skip_count += 1 - print(f" SKIP {att_name} (S/MIME)") - continue - - # Primy pristup pres graph_att_id (emaily parsovane v1.2+) - if graph_att_id: - content = fetch_attachment_content(mailbox, graph_id, graph_att_id) - if content is None: - err_count += 1 - email_ok = False - print(f" ERR {att_name} (stazeni selhalo)") - continue - # Zkontroluj zda jde skutecne o inline (pro edge case) - mime_type = att.get("mime_type", "") - else: - # Fallback: name matching pro stare emaily (parsovane pred v1.2) - graph_att = find_graph_att(att_name, att_size, graph_atts) - - if not graph_att: - logging.error("attachment not found [email=%s att=%s]", email_id, att_name) - print(f" ERR {att_name} (nenalezeno)") - err_count += 1 - email_ok = False - continue - - # Pokud Graph rika ze je inline — preskoc - if graph_att.get("isInline", False): - updated_atts[i] = {**att, "is_inline": True, "file_hash": "skip", "local_path": ""} - skip_count += 1 - print(f" SKIP {att_name} (inline obrazek)") - continue - - content = fetch_attachment_content(mailbox, graph_id, graph_att["id"]) - if content is None: - err_count += 1 - email_ok = False - print(f" ERR {att_name} (stazeni selhalo)") - continue - - mime_type = att.get("mime_type") or graph_att.get("contentType", "") - - hash_val, local_path, was_new = save_attachment( - content, att_name, mime_type, mailbox, att_dir, col_index - ) - - updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path} - - if was_new: - new_count += 1 - print(f" NEW {local_path} ({len(content):,} B)") - else: - dup_count += 1 - print(f" DUP {att_name} -> {local_path}") - - if email_ok: - ok_count += 1 - - batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}})) - - if len(batch) >= BATCH_SIZE: - flush() - - if email_i % 100 == 0: - elapsed = (datetime.now() - start).total_seconds() - print(f" {'─'*60}") - print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} skip={skip_count} err={err_count}") - print(f" {'─'*60}") - - flush() - - elapsed_total = (datetime.now() - start).total_seconds() - files_total = col_index.count_documents({}) - size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1})) - - print(f"\n{'='*52}") - print(f"Vysledek: emaily={ok_count} | nove={new_count} | dup={dup_count} | skip={skip_count} | err={err_count}") - print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)") - print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s") - print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") - if err_count: - print(f"Chyby logovany do: {LOG_FILE}") - - client.close() - - -if __name__ == "__main__": - main() diff --git a/Python-runner/parse_emails_errors.log b/Python-runner/parse_emails_errors.log deleted file mode 100644 index e69de29..0000000 diff --git a/Python-runner/parse_emails_graph_v1.3.py b/Python-runner/parse_emails_graph_v1.3.py deleted file mode 100644 index 9719a8b..0000000 --- a/Python-runner/parse_emails_graph_v1.3.py +++ /dev/null @@ -1,611 +0,0 @@ -""" -parse_emails_graph_v1.3.py -Nazev: parse_emails_graph_v1.3.py -Verze: 1.3 -Datum: 2026-06-02 -Autor: vladimir.buzalka - -Popis: - Cte vsechny emaily z libovolne schranky primo pres Microsoft Graph API - a importuje je jako dokumenty do MongoDB. - Ze kazde zpravy extrahuje vsechny dostupne vlastnosti: - - - predmet, odesilatel, prijemci (To/CC/BCC s typy) - - cas doruceni, odeslani, vytvoreni, modifikace (UTC) - - telo HTML (max 2 MB) + textovy preview - - prilohy (metadata: jmeno, velikost, MIME typ, inline flag, graph_att_id) - - internet headers (SPF, DKIM, Received, X-*, ...) - - MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno, - kategorie, In-Reply-To, References, ... - - navic: isRead, isDraft, folder_path, inferenceClassification - - Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted, - archivni slozky, ...). - - DB: emaily - Kolekce: (napr. ordinace@buzalkova.cz) - _id: Internet Message-ID (nebo "graphid:" jako fallback) - - POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky! - -Spousteni: - # Prvni import (vsechno): - python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz - - # Test na prvnich 50: - python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes - - # Jen jedna slozka: - python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --folder Inbox - - # Pokracovani po preruseni (pouze nove): - python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --mode new-only - - # Pravidelny sync (aktualizuje is_read, flag, slozku; importuje nove): - python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --mode sync - - # Jina schranka: - python parse_emails_graph_v1.3.py --mailbox vladimir.buzalka@buzalka.cz - -Rezimy (--mode): - full Plny upsert vsech poli pro kazdou zpravu (vychozi) - new-only Preskoci zpravy ktere uz jsou v MongoDB, importuje jen nove - sync Existujici: aktualizuje jen is_read/flag_status/categories/ - modified_at/folder_path. Nove zpravy importuje cely. - Idealni pro pravidelne spousteni. - -Zavislosti: - msal, requests, pymongo, python-dateutil - Python 3.10+ - -Struktura dokumentu v MongoDB: - _id Internet Message-ID (nebo graphid: fallback) - graph_id Graph API message ID - subject predmet zpravy - normalized_subject predmet bez RE:/FW:/AW: prefixu - importance 0=nizka 1=normalni 2=vysoka - flag_status 0=bez priznaku 1=oznaceno 2=dokonceno - is_read bool — aktualni stav precteni ve schrance - is_draft bool - has_attachments bool - attachment_count int - inference_classification focused / other - categories [str] - conversation_id Graph conversationId - conversation_index base64 conversationIndex - conversation_topic tema vlakna (z internet headers Thread-Topic) - in_reply_to Message-ID predchozi zpravy - internet_references [Message-ID] - received_at datetime UTC - sent_at datetime UTC - created_at datetime UTC - modified_at datetime UTC - folder_id Graph parentFolderId - folder_path cela cesta slozky (napr. Inbox/Subfolder) - sender.email emailova adresa odesilatele - sender.name zobrazovane jmeno - to retezec To (joined) - cc retezec CC - bcc retezec BCC - recipients [{type, email, name}] - body_html HTML telo (max 2 MB) - body_preview textovy nahled (max 255 znaku) - attachments [{filename, size_bytes, mime_type, is_inline, graph_att_id}] - headers dict internet headers - parsed_at datetime UTC - -Indexy: - received_at, sent_at, sender.email, graph_id (unique), - conversation_id, folder_path, has_attachments, categories, - importance, flag_status, is_read, - text_search (subject + body_preview + to + cc) - -Historie verzi: - 1.0 2026-06-02 Inicialni verze - 1.1 2026-06-02 Pridany rezimy --mode full/new-only/sync; - odstranen --skip-existing (nahrazen --mode new-only) - 1.2 2026-06-02 $expand attachments s $select (bez contentBytes — rychlejsi); - prilohy ukladaji graph_att_id pro prime stazeni bez name-matchingu - 1.3 2026-06-02 --mailbox jako povinny parametr — univerzalni pouziti pro - libovolnou schranku; kolekce v MongoDB = nazev schranky -""" - -import sys -import re -import logging -import argparse -import base64 -from pathlib import Path -from datetime import datetime, timezone -from typing import Optional - -import msal -import requests -from dateutil import parser as dtparser -from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT - -if hasattr(sys.stdout, "reconfigure"): - sys.stdout.reconfigure(encoding="utf-8", errors="replace") - -# ─── KONFIGURACE ────────────────────────────────────────────────────────────── -GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9" -GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f" -GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk" -GRAPH_URL = "https://graph.microsoft.com/v1.0" - -MONGO_URI = "mongodb://192.168.1.76:27017" -MONGO_DB = "emaily" -BATCH_SIZE = 100 -PAGE_SIZE = 50 -LOG_FILE = Path(__file__).parent / "parse_emails_errors.log" -SCRIPT_VERSION = "1.3" - -# Schránka se nastavuje za behu z --mailbox parametru -GRAPH_MAILBOX: str = "" -# ────────────────────────────────────────────────────────────────────────────── - -logging.basicConfig( - filename=str(LOG_FILE), - level=logging.ERROR, - format="%(asctime)s | %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - encoding="utf-8", -) - -IMPORTANCE_MAP = {"low": 0, "normal": 1, "high": 2} -FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2} -RE_SUBJECT = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE) - -# $expand prilohy bez contentBytes — jen metadata co potrebujeme -ATT_EXPAND = "attachments($select=id,name,contentType,size,isInline)" - -MSG_SELECT = ( - "id,internetMessageId,subject,bodyPreview,body," - "importance,isRead,isDraft,hasAttachments," - "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime," - "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo," - "conversationId,conversationIndex,parentFolderId," - "categories,flag,inferenceClassification,internetMessageHeaders" -) - -MSG_SELECT_SYNC = ( - "id,internetMessageId,isRead,isDraft,flag,categories," - "lastModifiedDateTime,parentFolderId,importance" -) - - -# ─── Graph API helpers ──────────────────────────────────────────────────────── - -_graph_token: Optional[str] = None - - -def get_token() -> str: - global _graph_token - app = msal.ConfidentialClientApplication( - GRAPH_CLIENT_ID, - authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}", - client_credential=GRAPH_CLIENT_SECRET, - ) - result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"]) - if "access_token" not in result: - raise RuntimeError(f"Graph auth failed: {result}") - _graph_token = result["access_token"] - return _graph_token - - -def graph_get(url: str, params: dict = None) -> dict: - global _graph_token - if not _graph_token: - get_token() - for attempt in range(2): - r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30) - if r.status_code == 401: - get_token() - continue - r.raise_for_status() - return r.json() - raise RuntimeError(f"Graph GET failed after retry: {url}") - - -def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]: - """Rekurzivne nacte vsechny slozky schranky. Vraci [{id, path}].""" - if parent_id is None: - url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders" - else: - url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{parent_id}/childFolders" - - folders = [] - params = {"$top": 100, "$select": "id,displayName,childFolderCount"} - while url: - data = graph_get(url, params) - for f in data.get("value", []): - path = f"{parent_path}/{f['displayName']}".lstrip("/") - folders.append({"id": f["id"], "path": path}) - if f.get("childFolderCount", 0) > 0: - folders.extend(get_all_folders(f["id"], path)) - url = data.get("@odata.nextLink") - params = None - return folders - - -def iter_folder_messages(folder_id: str, select: str = MSG_SELECT, expand_attachments: bool = True): - """Generator: vraci zpravy ze slozky po strankach.""" - url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages" - params = {"$top": PAGE_SIZE, "$select": select} - if expand_attachments: - params["$expand"] = ATT_EXPAND - while url: - data = graph_get(url, params) - for msg in data.get("value", []): - yield msg - url = data.get("@odata.nextLink") - params = None - - -# ─── Pomocné funkce ─────────────────────────────────────────────────────────── - -def parse_date(raw) -> Optional[datetime]: - if raw is None: - return None - if isinstance(raw, datetime): - if raw.tzinfo: - return raw.astimezone(timezone.utc).replace(tzinfo=None) - return raw - try: - dt = dtparser.parse(str(raw)) - if dt.tzinfo: - return dt.astimezone(timezone.utc).replace(tzinfo=None) - return dt - except Exception: - return None - - -def normalize_subject(subject: str) -> str: - s = subject.strip() - while True: - m = RE_SUBJECT.match(s) - if not m: - break - s = s[m.end():].strip() - return s - - -def parse_headers(raw_headers: list) -> dict: - result = {} - for h in raw_headers: - k = h["name"].lower().replace("-", "_") - v = h["value"] - if k in result: - existing = result[k] - result[k] = existing + [v] if isinstance(existing, list) else [existing, v] - else: - result[k] = v - return result - - -def format_recipients(lst: list) -> str: - return "; ".join( - f'{r["emailAddress"].get("name", "")} <{r["emailAddress"].get("address", "")}>'.strip() - for r in lst - ) - - -# ─── Extrakce zprávy ───────────────────────────────────────────────────────── - -def extract_message(msg: dict, folder_path: str) -> Optional[dict]: - """Plna extrakce — pouziva se pro mode full a nove zpravy v sync/new-only.""" - try: - mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}" - subject = msg.get("subject") or "" - - body_html = None - body_preview = msg.get("bodyPreview") or "" - body = msg.get("body", {}) - if body.get("contentType") == "html": - content = body.get("content") or "" - body_html = content if len(content) <= 2 * 1024 * 1024 else content[:2 * 1024 * 1024] - elif body.get("contentType") == "text": - body_preview = (body.get("content") or "")[:2000] - - sender_ea = (msg.get("from") or msg.get("sender") or {}).get("emailAddress", {}) - to_list = msg.get("toRecipients", []) - cc_list = msg.get("ccRecipients", []) - bcc_list = msg.get("bccRecipients", []) - - recipients = ( - [{"type": "to", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in to_list] + - [{"type": "cc", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in cc_list] + - [{"type": "bcc", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in bcc_list] - ) - - importance = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1) - flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0) - - raw_headers = msg.get("internetMessageHeaders") or [] - headers = parse_headers(raw_headers) - - in_reply_to = headers.get("in_reply_to", "") - if isinstance(in_reply_to, list): - in_reply_to = in_reply_to[0] - - refs_raw = headers.get("references", "") - if isinstance(refs_raw, list): - refs_raw = " ".join(refs_raw) - internet_refs = [r.strip() for r in refs_raw.split() if r.strip()] if refs_raw else [] - - conv_topic = headers.get("thread_topic", "") - if isinstance(conv_topic, list): - conv_topic = conv_topic[0] - - conv_index = "" - ci_raw = msg.get("conversationIndex") - if ci_raw: - try: - conv_index = base64.b64encode(base64.b64decode(ci_raw)).decode() - except Exception: - conv_index = ci_raw - - attachments = [] - for att in msg.get("attachments") or []: - fname = att.get("name") or "" - if not fname: - continue - attachments.append({ - "filename": fname, - "size_bytes": att.get("size", 0), - "mime_type": att.get("contentType", "application/octet-stream"), - "is_inline": att.get("isInline", False), - "graph_att_id": att.get("id"), - }) - - return { - "_id": mid, - "graph_id": msg["id"], - - "subject": subject, - "normalized_subject": normalize_subject(subject), - "importance": importance, - "flag_status": flag_status, - "is_read": msg.get("isRead", False), - "is_draft": msg.get("isDraft", False), - "has_attachments": msg.get("hasAttachments", False), - "attachment_count": len(attachments), - "inference_classification": msg.get("inferenceClassification", ""), - "categories": msg.get("categories") or [], - - "conversation_id": msg.get("conversationId", ""), - "conversation_index": conv_index, - "conversation_topic": conv_topic, - "in_reply_to": in_reply_to, - "internet_references": internet_refs, - - "received_at": parse_date(msg.get("receivedDateTime")), - "sent_at": parse_date(msg.get("sentDateTime")), - "created_at": parse_date(msg.get("createdDateTime")), - "modified_at": parse_date(msg.get("lastModifiedDateTime")), - - "folder_id": msg.get("parentFolderId", ""), - "folder_path": folder_path, - - "sender": { - "email": sender_ea.get("address", ""), - "name": sender_ea.get("name", ""), - }, - "to": format_recipients(to_list), - "cc": format_recipients(cc_list), - "bcc": format_recipients(bcc_list), - "recipients": recipients, - - "body_html": body_html, - "body_preview": body_preview, - - "attachments": attachments, - "headers": headers, - - "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None), - } - - except Exception as e: - logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e) - return None - - -def extract_sync_fields(msg: dict, folder_path: str) -> dict: - """Jen menitelna pole — pouziva se v sync mode pro existujici zpravy.""" - return { - "is_read": msg.get("isRead", False), - "is_draft": msg.get("isDraft", False), - "flag_status": FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0), - "importance": IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1), - "categories": msg.get("categories") or [], - "modified_at": parse_date(msg.get("lastModifiedDateTime")), - "folder_id": msg.get("parentFolderId", ""), - "folder_path": folder_path, - "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None), - } - - -# ─── MongoDB indexy ─────────────────────────────────────────────────────────── - -def create_indexes(col): - print(" Vytvarim indexy...") - col.create_index([("received_at", ASCENDING)]) - col.create_index([("sent_at", ASCENDING)]) - col.create_index([("sender.email", ASCENDING)]) - col.create_index([("graph_id", ASCENDING)], unique=True, sparse=True) - col.create_index([("conversation_id", ASCENDING)]) - col.create_index([("folder_path", ASCENDING)]) - col.create_index([("has_attachments", ASCENDING)]) - col.create_index([("categories", ASCENDING)]) - col.create_index([("importance", ASCENDING)]) - col.create_index([("flag_status", ASCENDING)]) - col.create_index([("is_read", ASCENDING)]) - col.create_index([ - ("subject", TEXT), - ("body_preview", TEXT), - ("to", TEXT), - ("cc", TEXT), - ], name="text_search", default_language="none") - print(" Indexy hotovy.") - - -# ─── MAIN ───────────────────────────────────────────────────────────────────── - -def main(): - global GRAPH_MAILBOX - - ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}") - ap.add_argument("--mailbox", required=True, - help="Emailova schranka (napr. ordinace@buzalkova.cz)") - ap.add_argument("--mode", default="full", choices=["full", "new-only", "sync"], - help="full=plny upsert (vychozi) | new-only=jen nove zpravy | " - "sync=existujici aktualizuje jen menitelna pole, nove importuje cely") - ap.add_argument("--limit", type=int, default=0, - help="Zpracovat max N zprav (0 = vse)") - ap.add_argument("--folder", default="", - help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)") - ap.add_argument("--no-indexes", action="store_true", - help="Nevytvorit indexy na konci") - args = ap.parse_args() - - GRAPH_MAILBOX = args.mailbox - mongo_col = args.mailbox - - start = datetime.now() - print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===") - print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}") - print(f"Schránka: {GRAPH_MAILBOX}") - print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}") - print(f"Režim: {args.mode}") - - print("\nPřipojuji se k Graph API...") - try: - get_token() - print(" Graph API OK") - except Exception as e: - print(f" CHYBA: {e}") - sys.exit(1) - - client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) - try: - client.admin.command("ping") - print(" MongoDB OK") - except Exception as e: - print(f" CHYBA: MongoDB neni dostupna -- {e}") - sys.exit(1) - col = client[MONGO_DB][mongo_col] - - existing: set = set() - if args.mode in ("new-only", "sync"): - print(" Nacitam existujici zaznamy z MongoDB...") - existing = set(col.distinct("_id")) - print(f" {len(existing)} jiz importovano") - - print("\nNacitam seznam slozek...") - all_folders = get_all_folders() - if args.folder: - all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()] - print(f" Slozek ke zpracovani: {len(all_folders)}") - for f in all_folders: - print(f" {f['path']}") - - is_sync = args.mode == "sync" - msg_select = MSG_SELECT_SYNC if is_sync else MSG_SELECT - expand_att = not is_sync - - batch = [] - ok_count = 0 - sync_count = 0 - err_count = 0 - skip_count = 0 - total_i = 0 - - def flush(): - if not batch: - return - try: - col.bulk_write(batch, ordered=False) - except Exception as e: - logging.error("bulk_write: %s", e) - print(f" CHYBA bulk_write: {e}") - batch.clear() - - print() - for folder in all_folders: - print(f"--- Složka: {folder['path']} ---") - folder_count = 0 - - for msg in iter_folder_messages(folder["id"], select=msg_select, expand_attachments=expand_att): - if args.limit and total_i >= args.limit: - break - - mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}" - total_i += 1 - folder_count += 1 - - if args.mode == "new-only" and mid in existing: - skip_count += 1 - continue - - if is_sync and mid in existing: - fields = extract_sync_fields(msg, folder["path"]) - batch.append(UpdateOne({"_id": mid}, {"$set": fields})) - sync_count += 1 - print(f" {total_i:>6} SYN {mid[:80]}") - else: - if is_sync: - full_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{msg['id']}" - full_params = {"$select": MSG_SELECT, "$expand": ATT_EXPAND} - try: - msg = graph_get(full_url, full_params) - except Exception as e: - logging.error("full fetch failed [%s]: %s", msg.get("id","?"), e) - err_count += 1 - continue - - doc = extract_message(msg, folder["path"]) - if doc is None: - err_count += 1 - print(f" {total_i:>6} ERR {mid[:80]}") - else: - batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True)) - ok_count += 1 - subject_str = (doc.get("subject") or "")[:60] - sender_str = (doc.get("sender", {}).get("email") or "")[:40] - print(f" {total_i:>6} OK {subject_str:<60} {sender_str}") - - if len(batch) >= BATCH_SIZE: - flush() - - if total_i % 500 == 0: - elapsed = (datetime.now() - start).total_seconds() - rate = total_i / elapsed if elapsed > 0 else 0 - print(f" {'─'*80}") - print(f" Průběh: ok={ok_count} sync={sync_count} skip={skip_count} err={err_count} {rate:.1f} msg/s") - print(f" {'─'*80}") - - flush() - print(f" → {folder_count} zprav ze slozky {folder['path']}") - - if args.limit and total_i >= args.limit: - break - - elapsed_total = (datetime.now() - start).total_seconds() - print(f"\n{'='*52}") - print(f"Vysledek: ok={ok_count} | sync={sync_count} | skip={skip_count} | err={err_count}") - print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s") - print(f"Dokumentu v kolekci: {col.count_documents({})}") - - if not args.no_indexes: - print() - create_indexes(col) - - print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") - if err_count: - print(f"Chyby logovany do: {LOG_FILE}") - - client.close() - - -if __name__ == "__main__": - main() diff --git a/Python-runner/python_runner.md b/Python-runner/python_runner.md index 415e09d..89ffa97 100644 --- a/Python-runner/python_runner.md +++ b/Python-runner/python_runner.md @@ -48,20 +48,32 @@ c.close() --- -## Aktuální skripty v /scripts +## Pipeline — 5 skriptů v pořadí spouštění -| Soubor | Popis | -|---------------------------------|--------------------------------------------------------------| -| `parse_emails_graph_v1.3.py` | Import emailů ze schránky přes Graph API → MongoDB | -| `download_attachments_v1.3.py` | Stažení skutečných příloh emailů (Graph API) → `/mnt/Emails` | -| `python_runner.md` | Tato dokumentace | -| `parse_emails_errors.log` | Log chyb (soubory/zprávy které selhaly) | +Prefix `1_` … `5_` indikuje pořadí v pipeline. Bezpečné opakovat každý krok (idempotentní upserty). -> **POZOR:** oba skripty pouze **čtou** ze schránky — žádný zápis do schránky. +| # | Skript | Účel | Zdroj → Cíl | +|---|---|---|---| +| 1 | `1_parse_emails_graph_v1.4.py` | Import emailů z Graph API | Graph → Mongo `emaily.` | +| 2 | `2_refetch_text_bodies_v1.0.py` | ONETIME oprava starých plain-text emailů (v1.3 ukládal jen 2000 znaků do `body_preview`) | Graph → Mongo `body_text` | +| 3 | `3_download_attachments_v1.3.py` | Stažení binárek příloh + SHA256 dedup | Graph → `/mnt/Emails//Attachments/` + Mongo `attachments_index` | +| 4 | `4_unwrap_smime_v1.0.py` | Rozbalení S/MIME wrapper (`smime.p7m`) na vnitřní MIME tělo | Graph → Mongo `smime_body_text/html`, `smime_inner_attachments` | +| 5 | `5_enrich_fulltext_emails_v1.2.py`| Plný text emailů do PG fulltext indexu | Mongo → PG `MongoEmaily.emails` | + +Doplňkové soubory v `/scripts/`: + +| Soubor | Popis | +|---|---| +| `python_runner.md` | Tato dokumentace | +| `*.log` | Výstupy běhů (`parse_emails.log`, `download_attachments.log`, `unwrap_smime.log`, `refetch.log`) | +| `*_errors.log` | Chyby konkrétních zpráv/příloh | +| `Trash/` | Staré verze skriptů | + +> **POZOR:** všechny skripty pouze **čtou** ze schránky — žádný zápis do schránky. --- -## Microsoft Graph API — konfigurace (v obou skriptech) +## Microsoft Graph API — konfigurace (sdílená všemi skripty) | Parametr | Hodnota | |-----------------|----------------------------------------| @@ -77,29 +89,35 @@ c.close() | Kolekce emailů | `` (např. `ordinace@buzalkova.cz`) | | Index příloh | `attachments_index` | +| PostgreSQL | Hodnota | +|-----------------|----------------------------------------| +| Host | `192.168.1.76` | +| DB | `MongoEmaily` | +| Tabulka | `emails` (GIN tsvector, config `soubory`) | + --- -## 1) parse_emails_graph_v1.3.py — import emailů → MongoDB +## 1) `1_parse_emails_graph_v1.4.py` — import emailů → MongoDB Čte **všechny složky** schránky rekurzivně (Inbox, Sent, Deleted, archivy …) přes Graph API a importuje každou zprávu jako dokument do MongoDB. `_id` = Internet Message-ID (fallback `graphid:`). Upsert → bezpečné přerušit a opakovat. Z každé zprávy extrahuje: předmět, odesílatel, příjemci To/CC/BCC, časy (UTC), -HTML tělo (max 2 MB) + text preview, přílohy (metadata + `graph_att_id`), -internet headers (SPF/DKIM/Received/X-*), MAPI-ekvivalenty (důležitost, příznak, -konverzační vlákno, kategorie, In-Reply-To, References), `isRead`, `isDraft`, -`folder_path`, `inferenceClassification`. +HTML tělo (max 2 MB) + text preview, **plné plain-text tělo (`body_text`, max 2 MB)**, +přílohy (metadata + `graph_att_id`), internet headers (SPF/DKIM/Received/X-*), +MAPI-ekvivalenty (důležitost, příznak, konverzační vlákno, kategorie, +In-Reply-To, References), `isRead`, `isDraft`, `folder_path`, `inferenceClassification`. ```bash # První import (vše): -docker exec -it python-runner python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz +docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz # Test na 50 zprávách bez indexů: -docker exec -it python-runner python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes +docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes # Pravidelný sync na pozadí (log do souboru): -docker exec -d python-runner bash -c "python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --mode sync > /scripts/parse_emails.log 2>&1" +docker exec -d python-runner bash -c "python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode sync > /scripts/parse_emails.log 2>&1" ``` > **`-d` = detached:** příkaz se hned vrátí a skript běží dál v kontejneru i po @@ -119,7 +137,23 @@ docker exec -d python-runner bash -c "python /scripts/parse_emails_graph_v1.3.py --- -## 2) download_attachments_v1.3.py — stažení příloh → /mnt/Emails +## 2) `2_refetch_text_bodies_v1.0.py` — dohnání plain-text těl + +**ONETIME oprava.** Starý `parse_emails_graph_v1.3` ukládal plain-text emaily +jen jako prvních 2000 znaků do `body_preview` — plné tělo se zahazovalo. +Tenhle skript v Mongo najde emaily kde `body_html` chybí a re-fetchne plné +tělo z Graphu do nového pole `body_text` (max 2 MB). + +```bash +docker exec -d python-runner bash -c "python /scripts/2_refetch_text_bodies_v1.0.py --mailbox ordinace@buzalkova.cz > /scripts/refetch.log 2>&1" +``` + +> Po importu schránky přes v1.4 už tenhle skript prakticky nemá co dělat +> (kandidátů 0). Drží se kvůli archivním schránkám, které byly importovány v1.3. + +--- + +## 3) `3_download_attachments_v1.3.py` — stažení příloh → /mnt/Emails Stahuje skutečné přílohy (`is_inline=False`) všech emailů z MongoDB přes Graph API do `/mnt/Emails//Attachments/`. Primárně přes `graph_att_id` (přímé ID), @@ -135,15 +169,13 @@ mime_type, mailbox, first_seen_at, ref_count). Emaily kde mají všechny přílo `file_hash` se přeskočí → bezpečné opakovat. ```bash -# Interaktivně (vidíš výstup, skončí zavřením terminálu): -docker exec -it python-runner python /scripts/download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz +# Interaktivně: +docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz -# Na pozadí (běží dál i po zavření terminálu, log do souboru): -docker exec -d python-runner bash -c "python /scripts/download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz > /scripts/download_attachments.log 2>&1" +# Na pozadí: +docker exec -d python-runner bash -c "python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz > /scripts/download_attachments.log 2>&1" ``` -> `-d` = detached — stejné chování jako u skriptu výše (viz poznámka v sekci 1). - ### Parametry | Parametr | Popis | @@ -155,10 +187,82 @@ docker exec -d python-runner bash -c "python /scripts/download_attachments_v1.3. --- +## 4) `4_unwrap_smime_v1.0.py` — rozbalení S/MIME zpráv + +Některé emaily (Datová schránka, mBank, ComGate, PayU, PostSignum …) přicházejí +jako S/MIME signed-data wrapper: viditelné tělo je jen *"This is an S/MIME +signed message"*, skutečný obsah je zabalený uvnitř přílohy `smime.p7m`. + +Skript najde tyto emaily, stáhne binárku `smime.p7m` z Graphu, rozbalí PKCS7 +SignedData (`asn1crypto.cms`), extrahuje vnitřní MIME zprávu a doplní do Mongo: + +| Pole | Obsah | +|---|---| +| `smime_unwrapped: True` | flag — už rozbaleno | +| `smime_subject` | Subject z vnitřní MIME hlavičky | +| `smime_body_text` | plain text vnitřního těla | +| `smime_body_html` | HTML vnitřního těla (pokud je) | +| `smime_inner_attachments[]` | `{filename, content_type, size_bytes}` vnitřních příloh | + +Pole pak používá `5_enrich_fulltext_emails_v1.2` — preferuje `smime_body_*` před +prázdným wrapper tělem a názvy vnitřních příloh přidá do `attachments_summary` +(takže je najde MCP `emaily.find_attachment`). + +```bash +docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py # vsechny schránky +docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py --mailbox ordinace@buzalkova.cz +docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py --limit 10 # test +``` + +### POZOR: `smime.p7m` vs `smime.p7s` — dva různé typy + +| Příloha | Co to je | Skript dělá | +|---|---|---| +| `smime.p7m` | **Enveloped/signed-data wrapper** — vnější obal kolem celé MIME zprávy. Bez rozbalení je viditelné jen *"This is an S/MIME signed message"*. | **Rozbalí** → extrahuje vnitřní tělo + přílohy do Mongo. | +| `smime.p7s` | **Detached signature** — jen digitální podpis vedle čistého emailu. Vlastní `body_html` / `body_text` je normálně dostupné. | **Ignoruje** — není co rozbalovat. Mail je už čitelný. | + +Filtr ve skriptu (`SMIME_FILTER`) je proto explicitně `^smime\.p7m$`. Pokud při +auditu vidíš email s přílohou `smime.p7s` a `smime_unwrapped != True`, je to +**správně** — žádná akce není potřeba. + +### Závislosti + +```bash +pip install asn1crypto +``` + +--- + +## 5) `5_enrich_fulltext_emails_v1.2.py` — fulltext do PostgreSQL + +Vytáhne plný text z emailů v MongoDB a uloží do PostgreSQL +(`MongoEmaily.emails`) s GIN `tsvector` indexem (config `soubory` — simple + unaccent). +Emaily se **nestahují znovu** — tělo už je v Mongo z kroků 1/2/4. + +**Priorita zdroje těla** (`body_source`): +1. `smime` — `smime_body_text` / `smime_body_html` (pokud unwrap proběhl) +2. `html` — `body_html` +3. `text` — `body_text` (z parse v1.4 nebo refetch v1.0) +4. `preview` — `body_preview` (fallback) + +Inkrementalita: pokud `(mailbox, message_id)` existuje a `extractor_version` +je aktuální a `modified_at` v Mongo není novější → skip. Bump `EXTRACTOR_VERSION` +přeparsuje vše. + +```bash +docker exec -d python-runner bash -c "python /scripts/5_enrich_fulltext_emails_v1.2.py > /scripts/enrich.log 2>&1" +docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py --mailbox ordinace@buzalkova.cz +docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py --limit 500 # test +``` + +--- + ## Sledování průběhu ```bash docker exec -it python-runner tail -f /scripts/parse_emails.log +docker exec -it python-runner tail -f /scripts/download_attachments.log +docker exec -it python-runner tail -f /scripts/unwrap_smime.log ``` --- @@ -172,6 +276,7 @@ pymongo 4.17.0 python-dateutil 2.9.0.post0 extract-msg 0.55.0 cryptography 48.0.0 +asn1crypto (S/MIME unwrap) beautifulsoup4 4.13.5 oletools 0.60.2 msoffcrypto-tool 6.0.0 @@ -183,6 +288,7 @@ pcodedmp 1.2.6 tzlocal 5.3.1 six 1.17.0 pip 25.0.1 +psycopg (PG klient pro krok 5) ``` --- @@ -201,4 +307,6 @@ docker exec python-runner pip install | Datum | Změna | |---|---| -| 2026-06-02 | Přechod z `.msg` souborů na Microsoft Graph API. Skript `parse_emails_tower_v1.1.py` (import lokálních `.msg`) nahrazen `parse_emails_graph_v1.3.py`; přidán `download_attachments_v1.3.py`. Staré verze v `Trash/`. | +| 2026-06-02 | Přechod z `.msg` souborů na Microsoft Graph API. `parse_emails_tower_v1.1.py` (import lokálních `.msg`) nahrazen `parse_emails_graph_v1.3.py`; přidán `download_attachments_v1.3.py`. Staré verze v `Trash/`. | +| 2026-06-03 | `parse_emails_graph_v1.4` (ukládá i plné plain-text tělo do `body_text`). Přidán `refetch_text_bodies_v1.0` (dohnání starých plain-text). Přidán `unwrap_smime_v1.0` (rozbalení `smime.p7m`). `enrich_fulltext_emails_v1.2` (preferuje `smime_body_*`, body_source `smime`/`text`). | +| 2026-06-04 | Skripty přejmenovány s prefixem `1_…5_` podle pořadí v pipeline. `enrich_v1.1` + `parse_emails_tower_v1.1*` do `Trash/`. | diff --git a/claude-memory/MEMORY.md b/claude-memory/MEMORY.md index ac65006..698d8d8 100644 --- a/claude-memory/MEMORY.md +++ b/claude-memory/MEMORY.md @@ -1,10 +1,14 @@ # Memory Index - [Pracovat v maintree](feedback_worktree.md) — vždy pracuj v `U:/janssen/`, ne ve worktree větvích +- [Proaktivně navrhuj lepší API](feedback_proactive_suggestions.md) — když vidím optimalizovatelný kus pipeline a znám lepší standard (delta, CDC, webhooks…), nabídnu to, ne čekám +- [Statistiky emailů přes MCP, ne SSH](feedback_use_mcp_emaily.md) — pro dotazy nad `emaily` db první volba MCP `emaily`, ne paramiko gymnastika - [Projekt Covance UCO3001](project_covance.md) — report vzorků studie 77242113UCO3001, skript `create_report.py`, zdroj + logika OK statusů - [EDC import do MongoDB](project_edc_mongo.md) — skript `medidata/edc_import.py`, import Data Listing + QueryDetails CSV do MongoDB (192.168.1.76), kolekce `queries` + `queries_snapshots` pro tracking vývoje queries v čase - [IWRS notifikace v Mongo](project_iwrs_mongo.md) — parser `IWRS/Patients/parse_notifications_to_mongo.py` čte texty notifikací z MySQL a ukládá strukturovaná data do `studie.iwrs` (lot, expirace, clinical response, audit trail) - [Dropbox file transfer](project_dropbox_file_transfer.md) — přenos souborů z JNJ PC do Dropboxu přes msgreceiver kontejner na Unraidu - [Graph email import](project_graph_email_import.md) — import JNJ emailů do schránky vladimir.buzalka@buzalka.cz přes Graph API - [Memory sync přes Giteu](setup_memory_sync.md) — paměť je v `claude-memory/` v janssen repu, junction + git push synchronizuje mezi PC +- [MCP soubory](project_mcp_soubory.md) — MCP server nad PG fulltextem + Mongo metadaty pro soubory studií (search/read/duplicates/by_author/...) +- [MCP emaily](project_mcp_emaily.md) — MCP server nad PG fulltextem + Mongo emailů z Graph importu (9 schránek, ~268k mailů; search/read_email/by_sender/conversation_thread/find_attachment/...) - [Claude Code learning path](project_claude_learning.md) — Level 2 Intermediate, mezery: Skills/Subagenty/Hooks/Print mode, tutoriál v `claude-howto/`