z230

2026-06-02 17:20:20 +02:00
parent ec187e673a
commit b433ef0446
58 changed files with 9247 additions and 0 deletions
@@ -0,0 +1,560 @@
+"""
+parse_emails_graph_v1.0.py
+Nazev:  parse_emails_graph_v1.0.py
+Verze:  1.0
+Datum:  2026-06-02
+Autor:  vladimir.buzalka
+
+Popis:
+    Cte vsechny emaily ze schranky ordinace@buzalkova.cz primo pres
+    Microsoft Graph API a importuje je jako dokumenty do MongoDB.
+    Ze kazde zpravy extrahuje vsechny dostupne vlastnosti:
+
+        - predmet, odesilatel, prijemci (To/CC/BCC s typy)
+        - cas doruceni, odeslani, vytvoreni, modifikace (UTC)
+        - telo HTML (max 2 MB) + textovy preview
+        - prilohy (metadata: jmeno, velikost, MIME typ, inline flag)
+        - internet headers (SPF, DKIM, Received, X-*, ...)
+        - MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno,
+          kategorie, In-Reply-To, References, ...
+        - navic: isRead, isDraft, folder_path, inferenceClassification
+
+    Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted,
+    archivni slozky, ...).
+
+    DB:       emaily
+    Kolekce:  ordinace@buzalkova.cz
+    _id:      Internet Message-ID (nebo "graphid:<id>" jako fallback)
+
+    Bezpecne prerusit a opakovat:
+        - upsert podle _id — duplicity se automaticky prepisi
+        - --skip-existing nacte seznam hotovych _id z MongoDB a preskoci je
+
+    POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
+
+Spousteni:
+    python parse_emails_graph_v1.0.py                    # kompletni import
+    python parse_emails_graph_v1.0.py --limit 50         # test na prvnich 50
+    python parse_emails_graph_v1.0.py --skip-existing    # pokracovani po preruseni
+    python parse_emails_graph_v1.0.py --folder Inbox     # jen jedna slozka
+    python parse_emails_graph_v1.0.py --no-indexes       # bez indexu na konci
+
+Zavislosti:
+    msal, requests, pymongo, python-dateutil
+    Python 3.10+
+
+Struktura dokumentu v MongoDB:
+    _id                     Internet Message-ID (nebo graphid: fallback)
+    graph_id                Graph API message ID (pro pripadne dalsi operace)
+    subject                 predmet zpravy
+    normalized_subject      predmet bez RE:/FW:/AW: prefixu
+    importance              0=nizka 1=normalni 2=vysoka
+    flag_status             0=bez priznaku 1=oznaceno 2=dokonceno
+    is_read                 bool — aktualni stav precteni ve schrance
+    is_draft                bool
+    has_attachments         bool
+    attachment_count        int
+    inference_classification focused / other (Outlook AI trideni)
+    categories              [str]
+    conversation_id         Graph conversationId
+    conversation_index      base64 conversationIndex
+    conversation_topic      tema vlakna (z internet headers Thread-Topic)
+    in_reply_to             Message-ID predchozi zpravy
+    internet_references     [Message-ID] — cela historia vlakna
+    received_at             datetime UTC
+    sent_at                 datetime UTC
+    created_at              datetime UTC — cas vytvoreni zaznamu v M365
+    modified_at             datetime UTC — cas posledni modifikace
+    folder_id               Graph parentFolderId
+    folder_path             cela cesta slozky (napr. Inbox/Subfolder)
+    sender.email            emailova adresa odesilatele
+    sender.name             zobrazovane jmeno odesilatele
+    to                      retezec To (joined)
+    cc                      retezec CC
+    bcc                     retezec BCC
+    recipients              [{type, email, name}] — to/cc/bcc s typy
+    body_html               HTML telo (max 2 MB)
+    body_preview            textovy nahled (max 255 znaku z Graph)
+    attachments             [{filename, size_bytes, mime_type,
+                              content_id, is_inline}]
+    headers                 dict internet headers (lowercase_s_podtrzitky)
+    parsed_at               datetime UTC — cas parsovani
+
+Indexy:
+    received_at, sent_at, sender.email, graph_id (unique),
+    conversation_id, folder_path, has_attachments, categories,
+    importance, flag_status, is_read,
+    text_search (subject + body_preview + to + cc)
+
+Historie verzi:
+    1.0  2026-06-02  Inicialni verze — Graph API jako zdroj
+"""
+
+import sys
+import re
+import logging
+import argparse
+import base64
+from pathlib import Path
+from datetime import datetime, timezone
+from typing import Optional
+
+import msal
+import requests
+from dateutil import parser as dtparser
+from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT
+
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
+# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
+GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
+GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
+GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
+GRAPH_MAILBOX       = "ordinace@buzalkova.cz"
+GRAPH_URL           = "https://graph.microsoft.com/v1.0"
+
+MONGO_URI      = "mongodb://192.168.1.76:27017"
+MONGO_DB       = "emaily"
+MONGO_COL      = "ordinace@buzalkova.cz"
+BATCH_SIZE     = 100
+PAGE_SIZE      = 50
+LOG_FILE       = Path(__file__).parent / "parse_emails_errors.log"
+SCRIPT_VERSION = "1.0"
+# ──────────────────────────────────────────────────────────────────────────────
+
+logging.basicConfig(
+    filename=str(LOG_FILE),
+    level=logging.ERROR,
+    format="%(asctime)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    encoding="utf-8",
+)
+
+IMPORTANCE_MAP  = {"low": 0, "normal": 1, "high": 2}
+FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2}
+RE_SUBJECT      = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE)
+
+MSG_SELECT = (
+    "id,internetMessageId,subject,bodyPreview,body,"
+    "importance,isRead,isDraft,hasAttachments,"
+    "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
+    "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
+    "conversationId,conversationIndex,parentFolderId,"
+    "categories,flag,inferenceClassification,internetMessageHeaders"
+)
+
+
+# ─── Graph API helpers ────────────────────────────────────────────────────────
+
+_graph_token: Optional[str] = None
+
+
+def get_token() -> str:
+    global _graph_token
+    app = msal.ConfidentialClientApplication(
+        GRAPH_CLIENT_ID,
+        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
+        client_credential=GRAPH_CLIENT_SECRET,
+    )
+    result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+    if "access_token" not in result:
+        raise RuntimeError(f"Graph auth failed: {result}")
+    _graph_token = result["access_token"]
+    return _graph_token
+
+
+def graph_get(url: str, params: dict = None) -> dict:
+    global _graph_token
+    if not _graph_token:
+        get_token()
+    for attempt in range(2):
+        r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30)
+        if r.status_code == 401:
+            get_token()
+            continue
+        r.raise_for_status()
+        return r.json()
+    raise RuntimeError(f"Graph GET failed after retry: {url}")
+
+
+def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]:
+    """Rekurzivne nacte vsechny slozky schranky. Vraci [{id, path}]."""
+    if parent_id is None:
+        url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders"
+    else:
+        url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{parent_id}/childFolders"
+
+    folders = []
+    params = {"$top": 100, "$select": "id,displayName,childFolderCount"}
+    while url:
+        data = graph_get(url, params)
+        for f in data.get("value", []):
+            path = f"{parent_path}/{f['displayName']}".lstrip("/")
+            folders.append({"id": f["id"], "path": path})
+            if f.get("childFolderCount", 0) > 0:
+                folders.extend(get_all_folders(f["id"], path))
+        url = data.get("@odata.nextLink")
+        params = None
+    return folders
+
+
+def iter_folder_messages(folder_id: str):
+    """Generator: vraci zpravy ze slozky po strankach."""
+    url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages"
+    params = {"$top": PAGE_SIZE, "$select": MSG_SELECT, "$expand": "attachments"}
+    while url:
+        data = graph_get(url, params)
+        for msg in data.get("value", []):
+            yield msg
+        url = data.get("@odata.nextLink")
+        params = None
+
+
+# ─── Pomocné funkce ───────────────────────────────────────────────────────────
+
+def parse_date(raw) -> Optional[datetime]:
+    if raw is None:
+        return None
+    if isinstance(raw, datetime):
+        if raw.tzinfo:
+            return raw.astimezone(timezone.utc).replace(tzinfo=None)
+        return raw
+    try:
+        dt = dtparser.parse(str(raw))
+        if dt.tzinfo:
+            return dt.astimezone(timezone.utc).replace(tzinfo=None)
+        return dt
+    except Exception:
+        return None
+
+
+def normalize_subject(subject: str) -> str:
+    s = subject.strip()
+    while True:
+        m = RE_SUBJECT.match(s)
+        if not m:
+            break
+        s = s[m.end():].strip()
+    return s
+
+
+def parse_headers(raw_headers: list) -> dict:
+    result = {}
+    for h in raw_headers:
+        k = h["name"].lower().replace("-", "_")
+        v = h["value"]
+        if k in result:
+            existing = result[k]
+            if isinstance(existing, list):
+                existing.append(v)
+            else:
+                result[k] = [existing, v]
+        else:
+            result[k] = v
+    return result
+
+
+def format_recipients(lst: list) -> str:
+    return "; ".join(
+        f'{r["emailAddress"].get("name", "")} <{r["emailAddress"].get("address", "")}>'.strip()
+        for r in lst
+    )
+
+
+# ─── Hlavní extrakce ─────────────────────────────────────────────────────────
+
+def extract_message(msg: dict, folder_path: str) -> Optional[dict]:
+    try:
+        # _id
+        mid = (msg.get("internetMessageId") or "").strip()
+        if not mid:
+            mid = f"graphid:{msg['id']}"
+
+        subject = msg.get("subject") or ""
+        norm_subject = normalize_subject(subject)
+
+        # tělo
+        body_html = None
+        body_preview = msg.get("bodyPreview") or ""
+        body = msg.get("body", {})
+        if body.get("contentType") == "html":
+            content = body.get("content") or ""
+            body_html = content if len(content) <= 2 * 1024 * 1024 else content[:2 * 1024 * 1024]
+        elif body.get("contentType") == "text":
+            body_preview = (body.get("content") or "")[:2000]
+
+        # odesílatel
+        sender_ea = (msg.get("from") or msg.get("sender") or {}).get("emailAddress", {})
+        sender_email = sender_ea.get("address", "")
+        sender_name  = sender_ea.get("name", "")
+
+        # příjemci
+        to_list  = msg.get("toRecipients", [])
+        cc_list  = msg.get("ccRecipients", [])
+        bcc_list = msg.get("bccRecipients", [])
+
+        recipients = (
+            [{"type": "to",  "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in to_list] +
+            [{"type": "cc",  "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in cc_list] +
+            [{"type": "bcc", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in bcc_list]
+        )
+
+        # příznaky
+        importance  = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1)
+        flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0)
+
+        # internet headers
+        raw_headers = msg.get("internetMessageHeaders") or []
+        headers = parse_headers(raw_headers)
+
+        in_reply_to = headers.get("in_reply_to", "")
+        if isinstance(in_reply_to, list):
+            in_reply_to = in_reply_to[0]
+
+        refs_raw = headers.get("references", "")
+        if isinstance(refs_raw, list):
+            refs_raw = " ".join(refs_raw)
+        internet_refs = [r.strip() for r in refs_raw.split() if r.strip()] if refs_raw else []
+
+        conv_topic = headers.get("thread_topic", "")
+        if isinstance(conv_topic, list):
+            conv_topic = conv_topic[0]
+
+        # conversation index
+        conv_index = ""
+        ci_raw = msg.get("conversationIndex")
+        if ci_raw:
+            try:
+                conv_index = base64.b64encode(base64.b64decode(ci_raw)).decode()
+            except Exception:
+                conv_index = ci_raw
+
+        # přílohy (jen metadata, bez obsahu)
+        attachments = []
+        for att in msg.get("attachments") or []:
+            fname = att.get("name") or ""
+            if not fname:
+                continue
+            attachments.append({
+                "filename":   fname,
+                "size_bytes": att.get("size", 0),
+                "mime_type":  att.get("contentType", "application/octet-stream"),
+                "content_id": att.get("contentId"),
+                "is_inline":  att.get("isInline", False),
+            })
+
+        return {
+            "_id":     mid,
+            "graph_id": msg["id"],
+
+            "subject":            subject,
+            "normalized_subject": norm_subject,
+            "importance":         importance,
+            "flag_status":        flag_status,
+            "is_read":            msg.get("isRead", False),
+            "is_draft":           msg.get("isDraft", False),
+            "has_attachments":    msg.get("hasAttachments", False),
+            "attachment_count":   len(attachments),
+            "inference_classification": msg.get("inferenceClassification", ""),
+            "categories":         msg.get("categories") or [],
+
+            "conversation_id":    msg.get("conversationId", ""),
+            "conversation_index": conv_index,
+            "conversation_topic": conv_topic,
+            "in_reply_to":        in_reply_to,
+            "internet_references": internet_refs,
+
+            "received_at": parse_date(msg.get("receivedDateTime")),
+            "sent_at":     parse_date(msg.get("sentDateTime")),
+            "created_at":  parse_date(msg.get("createdDateTime")),
+            "modified_at": parse_date(msg.get("lastModifiedDateTime")),
+
+            "folder_id":   msg.get("parentFolderId", ""),
+            "folder_path": folder_path,
+
+            "sender": {
+                "email": sender_email,
+                "name":  sender_name,
+            },
+            "to":         format_recipients(to_list),
+            "cc":         format_recipients(cc_list),
+            "bcc":        format_recipients(bcc_list),
+            "recipients": recipients,
+
+            "body_html":    body_html,
+            "body_preview": body_preview,
+
+            "attachments": attachments,
+            "headers":     headers,
+
+            "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
+        }
+
+    except Exception as e:
+        logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e)
+        return None
+
+
+# ─── MongoDB indexy ───────────────────────────────────────────────────────────
+
+def create_indexes(col):
+    print("  Vytvarim indexy...")
+    col.create_index([("received_at",    ASCENDING)])
+    col.create_index([("sent_at",        ASCENDING)])
+    col.create_index([("sender.email",   ASCENDING)])
+    col.create_index([("graph_id",       ASCENDING)], unique=True, sparse=True)
+    col.create_index([("conversation_id", ASCENDING)])
+    col.create_index([("folder_path",    ASCENDING)])
+    col.create_index([("has_attachments", ASCENDING)])
+    col.create_index([("categories",     ASCENDING)])
+    col.create_index([("importance",     ASCENDING)])
+    col.create_index([("flag_status",    ASCENDING)])
+    col.create_index([("is_read",        ASCENDING)])
+    col.create_index([
+        ("subject",       TEXT),
+        ("body_preview",  TEXT),
+        ("to",            TEXT),
+        ("cc",            TEXT),
+    ], name="text_search", default_language="none")
+    print("  Indexy hotovy.")
+
+
+# ─── MAIN ─────────────────────────────────────────────────────────────────────
+
+def main():
+    ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}")
+    ap.add_argument("--limit",         type=int, default=0,
+                    help="Zpracovat max N zprav (0 = vse)")
+    ap.add_argument("--skip-existing", action="store_true",
+                    help="Preskocit zpravy ktere jiz jsou v MongoDB")
+    ap.add_argument("--folder",        default="",
+                    help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)")
+    ap.add_argument("--no-indexes",    action="store_true",
+                    help="Nevytvorit indexy na konci")
+    args = ap.parse_args()
+
+    start = datetime.now()
+    print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===")
+    print(f"Start:    {start.strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"Schránka: {GRAPH_MAILBOX}")
+    print(f"MongoDB:  {MONGO_URI} -> {MONGO_DB}.{MONGO_COL}")
+
+    # Graph token
+    print("\nPřipojuji se k Graph API...")
+    try:
+        get_token()
+        print("  Graph API OK")
+    except Exception as e:
+        print(f"  CHYBA: {e}")
+        sys.exit(1)
+
+    # MongoDB
+    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    try:
+        client.admin.command("ping")
+        print("  MongoDB OK")
+    except Exception as e:
+        print(f"  CHYBA: MongoDB neni dostupna -- {e}")
+        sys.exit(1)
+    col = client[MONGO_DB][MONGO_COL]
+
+    # Skip existing
+    existing: set = set()
+    if args.skip_existing:
+        print("  Nacitam existujici zaznamy z MongoDB...")
+        existing = set(col.distinct("_id"))
+        print(f"  {len(existing)} jiz importovano")
+
+    # Slozky
+    print("\nNacitam seznam slozek...")
+    all_folders = get_all_folders()
+    if args.folder:
+        all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()]
+    print(f"  Slozek ke zpracovani: {len(all_folders)}")
+    for f in all_folders:
+        print(f"    {f['path']}")
+
+    # Import
+    batch     = []
+    ok_count  = 0
+    err_count = 0
+    skip_count = 0
+    total_i   = 0
+
+    def flush():
+        if not batch:
+            return
+        try:
+            col.bulk_write(batch, ordered=False)
+        except Exception as e:
+            logging.error("bulk_write: %s", e)
+            print(f"  CHYBA bulk_write: {e}")
+        batch.clear()
+
+    print()
+    for folder in all_folders:
+        print(f"--- Složka: {folder['path']} ---")
+        folder_count = 0
+
+        for msg in iter_folder_messages(folder["id"]):
+            if args.limit and total_i >= args.limit:
+                break
+
+            mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
+
+            if mid in existing:
+                skip_count += 1
+                total_i += 1
+                continue
+
+            doc = extract_message(msg, folder["path"])
+            total_i += 1
+            folder_count += 1
+
+            if doc is None:
+                err_count += 1
+            else:
+                batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
+                ok_count += 1
+
+            if len(batch) >= BATCH_SIZE:
+                flush()
+
+            status      = "ERR " if doc is None else "OK  "
+            subject_str = (doc.get("subject") or "")[:60] if doc else "?"
+            sender_str  = (doc.get("sender", {}).get("email") or "")[:40] if doc else "?"
+            print(f"  {total_i:>6}  {status}  {subject_str:<60}  {sender_str}")
+
+            if total_i % 500 == 0:
+                elapsed = (datetime.now() - start).total_seconds()
+                rate    = total_i / elapsed if elapsed > 0 else 0
+                print(f"  {'─'*80}")
+                print(f"  Průběh: ok={ok_count}  skip={skip_count}  err={err_count}  {rate:.1f} msg/s")
+                print(f"  {'─'*80}")
+
+        flush()
+        print(f"  → {folder_count} zprav ze slozky {folder['path']}")
+
+        if args.limit and total_i >= args.limit:
+            break
+
+    elapsed_total = (datetime.now() - start).total_seconds()
+    print(f"\n{'='*52}")
+    print(f"Vysledek:  ok={ok_count}  |  skip={skip_count}  |  err={err_count}")
+    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
+    print(f"Dokumentu v kolekci: {col.count_documents({})}")
+
+    if not args.no_indexes:
+        print()
+        create_indexes(col)
+
+    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    if err_count:
+        print(f"Chyby logovany do: {LOG_FILE}")
+
+    client.close()
+
+
+if __name__ == "__main__":
+    main()