diff --git a/EmailsImport/DOCKER_NAVOD.md b/EmailsImport/DOCKER_NAVOD.md new file mode 100644 index 0000000..f6c350c --- /dev/null +++ b/EmailsImport/DOCKER_NAVOD.md @@ -0,0 +1,115 @@ +# Python Runner na Unraid — návod + +## Příprava + +Zkopíruj `Dockerfile` a `docker-compose.yml` na tower do složky, např.: +``` +/mnt/user/Scripts/python-runner/ +``` + +Přes Unraid terminál (Tools → Terminal): +```bash +mkdir -p /mnt/user/Scripts/python-runner +``` + +Pak přes Windows zkopíruj oba soubory do `\\tower\Scripts\python-runner\`. + +--- + +## Jednorázové sestavení image + +```bash +cd /mnt/user/Scripts/python-runner +docker compose build +``` + +Trvá 1–2 minuty, stáhne Python 3.12 a nainstaluje balíčky. + +--- + +## Spuštění containeru + +```bash +docker compose up -d +``` + +Container poběží na pozadí pořád (restartuje se i po rebootu tower). + +--- + +## Spuštění skriptu + +```bash +docker exec -it python-runner python /scripts/parse_emails_v1.0.py +``` + +S parametry (pokračování po přerušení): +```bash +docker exec -it python-runner python /scripts/parse_emails_v1.0.py --skip-existing +``` + +Test na 50 emailech: +```bash +docker exec -it python-runner python /scripts/parse_emails_v1.0.py --limit 50 --no-indexes +``` + +--- + +## Spuštění na pozadí (doporučeno pro 48h import) + +```bash +docker exec -d python-runner python /scripts/parse_emails_v1.0.py --skip-existing +``` + +Přepínač `-d` pustí skript na pozadí — terminál můžeš zavřít. 
+ +### Sledování průběhu (log) + +```bash +# Spusť skript s logováním do souboru: +docker exec -d python-runner bash -c \ + "python /scripts/parse_emails_v1.0.py --skip-existing > /scripts/parse_emails.log 2>&1" + +# Sleduj průběh živě: +docker exec -it python-runner tail -f /scripts/parse_emails.log +``` + +--- + +## Jakýkoliv jiný skript + +Stačí skript hodit do `\\tower\Scripts\` a spustit: +```bash +docker exec -it python-runner python /scripts/nazev_skriptu.py +``` + +Pokud skript potřebuje nový pip balíček: +```bash +docker exec -it python-runner pip install nazev-balicku +``` + +(Nainstaluje se do běžícího containeru. Po `docker compose down/up` zmizí — pro trvalé přidej do Dockerfile a znovu `docker compose build`.) + +--- + +## Zastavení / restart + +```bash +docker compose stop # zastaví container +docker compose start # znovu spustí +docker compose down # zastaví a odstraní container (image zůstane) +``` + +--- + +## Cesty uvnitř containeru + +| Na tower | Uvnitř containeru | +|---|---| +| `/mnt/user/Scripts` | `/scripts` | +| `/mnt/user/JNJEMAILS` | `/mnt/JNJEMAILS` | + +Skript má v kódu `MSGS_DIR = Path(r"\\tower\JNJEMAILS")` — **před spuštěním v Dockeru změň** na: +```python +MSGS_DIR = Path("/mnt/JNJEMAILS") +``` diff --git a/EmailsImport/Dockerfile b/EmailsImport/Dockerfile new file mode 100644 index 0000000..25f154d --- /dev/null +++ b/EmailsImport/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.12-slim + +# Závislosti OS (pro extract-msg) +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + && rm -rf /var/lib/apt/lists/* + +# Python závislosti +RUN pip install --no-cache-dir \ + extract-msg==0.55.0 \ + pymongo \ + python-dateutil + +# Pracovní adresář +WORKDIR /scripts + +# Container poběží pořád — skripty se spouštějí přes docker exec +CMD ["tail", "-f", "/dev/null"] diff --git a/EmailsImport/docker-compose.yml b/EmailsImport/docker-compose.yml new file mode 100644 index 0000000..e503ebe --- /dev/null +++ 
b/EmailsImport/docker-compose.yml @@ -0,0 +1,10 @@ +services: + python-runner: + build: . + container_name: python-runner + restart: unless-stopped + volumes: + - /mnt/user/Scripts:/scripts # tvoje skripty + - /mnt/user/JNJEMAILS:/mnt/JNJEMAILS # .msg soubory (read-only) + extra_hosts: + - "host.docker.internal:host-gateway" # přístup na MongoDB na 192.168.1.76 diff --git a/EmailsImport/parse_emails_tower_v1.1.md b/EmailsImport/parse_emails_tower_v1.1.md new file mode 100644 index 0000000..8222abd --- /dev/null +++ b/EmailsImport/parse_emails_tower_v1.1.md @@ -0,0 +1,248 @@ +# parse_emails_tower_v1.1 + +## Spuštění + +**První spuštění:** +```bash +docker exec -d python-runner bash -c \ + "python /scripts/parse_emails_tower_v1.1.py > /scripts/parse_emails.log 2>&1" +``` + +**Pokračování po přerušení (přeskočí už importované):** +```bash +docker exec -d python-runner bash -c \ + "python /scripts/parse_emails_tower_v1.1.py --skip-existing > /scripts/parse_emails.log 2>&1" +``` + +--- + +## Stav importu + +**Sledování průběhu (live log):** +```bash +docker exec -it python-runner tail -f /scripts/parse_emails.log +``` + +**Počet emailů v MongoDB:** +```bash +docker exec -it python-runner python -c \ + "from pymongo import MongoClient; c=MongoClient('mongodb://192.168.1.76:27017'); print(c['emaily']['vbuzalka@its.jnj.com'].count_documents({}))" +``` + +--- + +**Název:** parse_emails_tower_v1.1.py +**Verze:** 1.1 +**Datum:** 2026-06-02 +**Autor:** vladimir.buzalka + +--- + +## Účel + +Import všech `.msg` souborů do MongoDB. Z každého souboru extrahuje **všechny dostupné vlastnosti** — podobně jako EXIF u fotek. + +- **DB:** `emaily` +- **Kolekce:** `vbuzalka@its.jnj.com` +- `_id` = Internet Message-ID (nebo `filename:` jako fallback) +- Bezpečné přerušit a opakovat — upsert podle `_id` + +--- + +## Prostředí + +Běží v Docker containeru **python-runner** na **Unraid Tower**. 
+ +| Komponenta | Umístění | +|---|---| +| Container | `python-runner` (Docker na Unraid Tower) | +| .msg soubory | `/mnt/user/JNJEMAILS` → `/mnt/JNJEMAILS` uvnitř containeru | +| Skripty | `/mnt/user/Scripts` → `/scripts` uvnitř containeru | +| MongoDB | `192.168.1.76:27017` (externí, mimo container) | + +--- + +## Spouštění (z Unraid terminálu) + +**Test na 50 emailech:** +```bash +docker exec -it python-runner python /scripts/parse_emails_tower_v1.1.py --limit 50 --no-indexes +``` + +**Kompletní import na pozadí (log do souboru):** +```bash +docker exec -d python-runner bash -c \ + "python /scripts/parse_emails_tower_v1.1.py > /scripts/parse_emails.log 2>&1" +``` + +**Pokračování po přerušení:** +```bash +docker exec -d python-runner bash -c \ + "python /scripts/parse_emails_tower_v1.1.py --skip-existing > /scripts/parse_emails.log 2>&1" +``` + +**Sledování průběhu (Ctrl+C ukončí sledování, import běží dál):** +```bash +docker exec -it python-runner tail -f /scripts/parse_emails.log +``` + +### Všechny parametry + +| Parametr | Popis | +|---|---| +| `--skip-existing` | Načte seznam hotových souborů z MongoDB a přeskočí je. Použij pro pokračování po přerušení. | +| `--limit N` | Zpracuje jen prvních N souborů. Vhodné pro test. | +| `--no-indexes` | Nevytváří indexy na konci. Použij pokud přerušíš uprostřed — indexy vytvoř ručně až je vše hotové. | +| `--msgs-dir PATH` | Přepíše výchozí cestu k .msg souborům (výchozí: `/mnt/JNJEMAILS`). | + +--- + +## Průběh na konzoli + +Každý email na jednom řádku: +``` + 1/69371 OK RE: Protocol deviation CZ10022 jan.novak@its.jnj.com + 2/69371 OK UCO3001: Draft FUL pro DD5-CZ10022 monitor@4gclinical.com + 3/69371 ERR ? ? 
+``` + +Každých 500 emailů oddělovač s průběhem: +``` + ──────────────────────────────────────────────────────────────────────────────── + Průběh: ok=498 err=2 0.4 msg/s ETA 47h12m + ──────────────────────────────────────────────────────────────────────────────── +``` + +Na konci souhrn: +``` +==================================================== +Vysledek: ok=69300 | skip=0 | err=71 +Celkovy cas: 47h 23m 10s +Dokumentu v kolekci: 69300 +``` + +--- + +## Zdroje dat z každého .msg + +| Pole | Popis | +|---|---| +| Předmět, normalized subject | | +| Odesílatel | email, jméno, SMTP adresa | +| Příjemci To/CC/BCC | strukturovaně `[{type, email, name}]` | +| Čas doručení a odeslání | UTC | +| Tělo | plaintext + HTML (max 2 MB) | +| Přílohy | metadata: jméno, velikost, MIME typ, inline flag | +| Internet headers | X-Originating-IP, Received, DKIM, X-Mailer, ... | +| MAPI | důležitost, citlivost, příznak, konverzační vlákno, kategorie | +| In-Reply-To, References | pro rekonstrukci vlákna | +| Raw MAPI properties | `{0xXXXX: value}` | + +--- + +## Hodnotové kódy + +| Pole | Hodnota | Význam | +|---|---|---| +| `importance` | 0 | Nízká | +| | 1 | Normální | +| | 2 | Vysoká | +| `sensitivity` | 0 | Normální | +| | 1 | Osobní | +| | 2 | Soukromé | +| | 3 | Důvěrné | +| `flag_status` | 0 | Bez příznaku | +| | 1 | Označeno (follow up) | +| | 2 | Dokončeno | + +--- + +## MongoDB indexy + +Automaticky vytvořeny na konci importu (`--no-indexes` přeskočí): + +| Index | Pole | +|---|---| +| Chronologický | `received_at`, `sent_at` | +| Odesílatel | `sender.email` | +| Soubor | `filename` (unique) | +| Konverzace | `conversation_topic` | +| Filtry | `has_attachments`, `categories`, `importance`, `flag_status` | +| Full-text | `subject` + `body_text` + `to` + `cc` (text index `text_search`) | + +--- + +## Ukázkové dotazy (MongoDB shell / MCP) + +**Emaily o UCO3001 s přílohou:** +```javascript +db["vbuzalka@its.jnj.com"].find({ + $text: { $search: "UCO3001" }, + has_attachments: true 
+}).sort({ received_at: -1 }) +``` + +**Emaily od konkrétního odesílatele:** +```javascript +db["vbuzalka@its.jnj.com"].find({ + "sender.email": /covance/i +}).sort({ received_at: -1 }) +``` + +**Celé konverzační vlákno:** +```javascript +db["vbuzalka@its.jnj.com"].find({ + conversation_topic: "Protocol deviation CZ10022" +}).sort({ received_at: 1 }) +``` + +**Statistiky podle odesílatele (top 20):** +```javascript +db["vbuzalka@its.jnj.com"].aggregate([ + { $group: { _id: "$sender.email", count: { $sum: 1 } } }, + { $sort: { count: -1 } }, + { $limit: 20 } +]) +``` + +--- + +## Chybový log + +Soubory, které selhaly, jsou zalogovány do `parse_emails_errors.log` vedle skriptu (tj. `/scripts/parse_emails_errors.log` → `\\tower\Scripts\parse_emails_errors.log`): +``` +2026-06-02 20:14:33 | open failed [7A3F...0000.msg]: (text výjimky) +``` + +--- + +## Výkon + +| Parametr | Hodnota | +|---|---| +| Počet souborů | ~69 000 | +| Rychlost | ~0.4 msg/s (htmlBody dekódování) | +| Odhadovaný čas | 48 hodin | +| Batch size | 200 dokumentů / bulk_write | +| Odhadovaná velikost DB | 2–5 GB | + +--- + +## Závislosti (v Docker image python-runner) + +``` +extract-msg==0.55.0 +pymongo +python-dateutil +``` + +Image sestaven z `Dockerfile` v `/mnt/user/Scripts/python-runner/`. 
+ +--- + +## Historie verzí + +| Verze | Datum | Změna | +|---|---|---| +| 1.0 | 2026-06-01 | Iniciální verze | +| 1.1 | 2026-06-02 | Nasazení na Unraid Tower v Docker containeru python-runner; MSGS_DIR změněno z SMB share (`\\tower\JNJEMAILS`) na lokální mount (`/mnt/JNJEMAILS`); aktualizován popis spouštění pro `docker exec` | diff --git a/EmailsImport/parse_emails_tower_v1.1.py b/EmailsImport/parse_emails_tower_v1.1.py new file mode 100644 index 0000000..97118a7 --- /dev/null +++ b/EmailsImport/parse_emails_tower_v1.1.py @@ -0,0 +1,660 @@ +""" +parse_emails_tower_v1.1.py +Nazev: parse_emails_tower_v1.1.py +Verze: 1.1 +Datum: 2026-06-02 +Autor: vladimir.buzalka + +Popis: + Parsuje vsechny .msg soubory z MSGS_DIR a importuje je jako dokumenty + do MongoDB. Z kazdeho souboru extrahuje VSECHNY dostupne vlastnosti — + podobne jako EXIF u fotek: + + - predmet, odesilatel, prijemci (To/CC/BCC s typy) + - cas doruceni a odeslani (UTC) + - telo plaintext + HTML (max 2 MB) + - prilohy (metadata: jmeno, velikost, MIME typ, inline flag) + - internet headers (X-Originating-IP, Received, DKIM, ...) + - MAPI vlastnosti: dulezitost, citlivost, priznak, konverzacni vlakno, + kategorie, In-Reply-To, References, ... + - vsechny raw MAPI properties jako {0xXXXX: value} + + DB: emaily + Kolekce: vbuzalka@its.jnj.com + _id: Internet Message-ID (nebo "filename:" jako fallback) + + Bezpecne prerusit a opakovat: + - upsert podle _id — duplicity se automaticky prepisi + - --skip-existing nacte seznam hotovych souboru z MongoDB a + preskoci je => pokracovani po preruseni bez ztraty prace + +Prostredi: + Bezi v Docker containeru "python-runner" na Unraid Tower. + .msg soubory jsou dostupne jako lokalni disk (volume mount): + /mnt/user/JNJEMAILS -> /mnt/JNJEMAILS (uvnitr containeru) + MongoDB na 192.168.1.76:27017 (externi, bezi mimo container). 
+ +Spousteni (z Unraid terminalu): + # Test na 50 emailech: + docker exec -it python-runner python /scripts/parse_emails_tower_v1.1.py --limit 50 --no-indexes + + # Kompletni import na pozadi (log do souboru): + docker exec -d python-runner bash -c \ + "python /scripts/parse_emails_tower_v1.1.py > /scripts/parse_emails.log 2>&1" + + # Pokracovani po preruseni: + docker exec -d python-runner bash -c \ + "python /scripts/parse_emails_tower_v1.1.py --skip-existing > /scripts/parse_emails.log 2>&1" + + # Sledovani prubehu: + docker exec -it python-runner tail -f /scripts/parse_emails.log + +Vystup na konzoli: + Kazdy email na jednom radku: + / OK/ERR + Kazych 500 emailu: oddelovac s prubehem, rychlosti a ETA. + Na konci: souhrn ok/skip/err, celkovy cas, pocet dokumentu v kolekci. + +Zavislosti (nainstalovane v Docker image python-runner): + extract-msg==0.55.0, pymongo, python-dateutil + Python 3.12, Linux (Docker container na Unraid Tower) + +Struktura dokumentu v MongoDB: + _id Internet Message-ID (nebo filename: fallback) + filename jmeno .msg souboru (20znakovy hex + .msg) + subject predmet zpravy + normalized_subject predmet bez RE:/FW: prefixu + importance 0=nizka 1=normalni 2=vysoka + sensitivity 0=normalni 1=osobni 2=soukrome 3=duverne + flag_status 0=bez priznaku 1=oznaceno 2=dokonceno + read_receipt_requested bool + delivery_receipt_requested bool + has_attachments bool + attachment_count int + message_size_bytes velikost .msg souboru na disku + conversation_topic tema vlakna (PR_CONVERSATION_TOPIC) + conversation_index base64 PR_CONVERSATION_INDEX + in_reply_to Message-ID predchozi zpravy + internet_references [Message-ID] — cela historia vlakna + categories [str] — MAPI kategorie / stitky + read_receipt_requested bool + delivery_receipt_requested bool + received_at datetime UTC — cas doruceni + sent_at datetime UTC — cas odeslani + sender.email emailova adresa odesilatele + sender.name zobrazovane jmeno odesilatele + sender.smtp SMTP adresa (pro interni EX 
adresy) + to retezec To (tak jak v Outlooku) + cc retezec CC + bcc retezec BCC + display_to PR_DISPLAY_TO (zkraceny seznam) + display_cc PR_DISPLAY_CC + recipients [{type, email, name}] — to/cc/bcc s typy + body_text plain text telo + body_html HTML telo (max 2 MB, None pokud neni) + attachments [{filename, size_bytes, mime_type, + content_id, is_inline}] + headers dict internet headers (lowercase_s_podtrzitky) + mapi dict vsech raw MAPI properties {0xXXXX: value} + parsed_at datetime UTC — cas parsovani + +Indexy (vytvoreny automaticky na konci): + received_at, sent_at, sender.email, filename (unique), + conversation_topic, has_attachments, categories, importance, + flag_status, text_search (subject + body_text + to + cc) + +Chyby: + Soubory ktere selhaly jsou zalogiovany do parse_emails_errors.log + v adresari skriptu. Radek: timestamp | open/extract failed | duvod. + +Historie verzi: + 1.0 2026-06-01 Inicialni verze + 1.1 2026-06-02 Nasazeni na Unraid Tower v Docker containeru python-runner; + MSGS_DIR zmeneno z SMB share na lokalni mount /mnt/JNJEMAILS; + aktualizovany popis spousteni pro docker exec +""" + +import sys +import re +import logging +import argparse +import base64 +from pathlib import Path +from datetime import datetime, timezone +from typing import Optional + +import extract_msg +from dateutil import parser as dtparser +from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT + +if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(encoding="utf-8", errors="replace") + +# ─── KONFIGURACE ────────────────────────────────────────────────────────────── +MSGS_DIR = Path("/mnt/JNJEMAILS") +MONGO_URI = "mongodb://192.168.1.76:27017" +MONGO_DB = "emaily" +MONGO_COL = "vbuzalka@its.jnj.com" +BATCH_SIZE = 200 +LOG_FILE = Path(__file__).parent / "parse_emails_errors.log" +SCRIPT_VERSION = "1.1" +# ────────────────────────────────────────────────────────────────────────────── + +logging.basicConfig( + filename=str(LOG_FILE), + 
# ─── Helper functions ─────────────────────────────────────────────────────────

# Largest bytes payload that to_bson() stores inline (hex-encoded); anything
# longer is replaced by a short placeholder.
_MAX_INLINE_BYTES = 128

# BSON integers are signed 64-bit at most; larger Python ints cannot be
# encoded and would make the whole bulk_write() batch fail.
_BSON_INT_MIN = -(2 ** 63)
_BSON_INT_MAX = 2 ** 63 - 1

# Upper bound for the stored HTML body, counted in characters.
_MAX_HTML_CHARS = 2 * 1024 * 1024


def safe(obj, *attrs, default=None):
    """Return the first usable attribute of *obj* among *attrs*.

    An attribute is skipped when it is missing, is ``None``, is a
    blank/whitespace-only string, or when reading it raises.

    Args:
        obj: Object to read from.
        *attrs: Attribute names, tried in order.
        default: Value returned when no attribute is usable.

    Returns:
        The first usable attribute value, otherwise *default*.
    """
    for name in attrs:
        try:
            value = getattr(obj, name, None)
            if value is None:
                continue
            if isinstance(value, str) and not value.strip():
                continue
            return value
        except Exception:
            # Reading the attribute itself failed; try the next candidate.
            continue
    return default


def parse_date(raw) -> Optional[datetime]:
    """Convert *raw* to a naive UTC ``datetime``.

    Args:
        raw: ``None``, a ``datetime``, or anything whose ``str()`` can be
            parsed by ``dateutil``.

    Returns:
        A ``datetime`` without ``tzinfo``. Aware values are converted to UTC
        first; naive values are returned unchanged. ``None`` when *raw* is
        ``None`` or cannot be parsed/converted.
    """
    if raw is None:
        return None
    try:
        moment = raw if isinstance(raw, datetime) else dtparser.parse(str(raw))
        if moment.tzinfo:
            return moment.astimezone(timezone.utc).replace(tzinfo=None)
        return moment
    except Exception:
        return None


def to_bson(val):
    """Convert *val* into a value that can be BSON-encoded.

    Rules:
        * bytes of at most 128 bytes -> hex string; longer -> the placeholder
          ``"<bytes len=N>"`` (previously an empty string, which could not be
          told apart from a genuinely empty value)
        * datetime -> naive UTC datetime (see ``parse_date``)
        * str / float / bool / None -> unchanged
        * int -> unchanged when it fits into a signed 64-bit integer,
          otherwise its decimal string
        * list / tuple / set -> list with every element converted
        * dict -> dict with string keys and converted values
        * anything else -> ``int(val)`` when possible, else ``str(val)``
    """
    if isinstance(val, (bytes, bytearray)):
        if len(val) <= _MAX_INLINE_BYTES:
            return val.hex()
        return f"<bytes len={len(val)}>"
    if isinstance(val, datetime):
        return parse_date(val)
    # bool is tested before int on purpose: bool is an int subclass.
    if val is None or isinstance(val, (bool, str, float)):
        return val
    if isinstance(val, int):
        return val if _BSON_INT_MIN <= val <= _BSON_INT_MAX else str(val)
    if isinstance(val, (list, tuple, set, frozenset)):
        return [to_bson(item) for item in val]
    if isinstance(val, dict):
        return {str(key): to_bson(item) for key, item in val.items()}
    try:
        # Re-enter to_bson so the 64-bit range check also applies here.
        return to_bson(int(val))
    except Exception:
        return str(val)


# ─── Extraction of message parts ──────────────────────────────────────────────

def extract_headers(msg) -> dict:
    """Return the internet headers of *msg* as a dict.

    Keys are lower-cased with ``-`` replaced by ``_``. A header that occurs
    once maps to a string, a repeated header maps to a list of strings.
    RFC 2047 encoded words are decoded. Any failure is logged and whatever
    was collected so far is returned.
    """
    from email.header import decode_header

    def _decode(raw) -> str:
        try:
            return "".join(
                chunk.decode(charset or "utf-8", errors="replace")
                if isinstance(chunk, bytes) else chunk
                for chunk, charset in decode_header(raw)
            )
        except Exception:
            # Always hand back a string so the result stays BSON-encodable.
            return raw if isinstance(raw, str) else str(raw)

    headers = {}
    try:
        hdr = msg.header
        if not hdr:
            return {}
        # set(): a repeated header name must be processed only once,
        # get_all() already returns every occurrence.
        for key in set(hdr.keys()):
            values = [_decode(v) for v in hdr.get_all(key, [])]
            if len(values) > 1:
                entry = values
            else:
                entry = values[0] if values else ""
            headers[key.lower().replace("-", "_")] = entry
    except Exception as e:
        logging.error("extract_headers: %s", e)
    return headers


def _recipient_type(raw) -> int:
    """Return the numeric recipient type, falling back to 1 ("to")."""
    for candidate in (raw, getattr(raw, "value", None)):
        try:
            return int(candidate)
        except Exception:
            continue
    return 1


def extract_recipients(msg) -> list:
    """Return the recipients of *msg* as ``[{type, email, name}]``.

    ``type`` is ``"to"``, ``"cc"`` or ``"bcc"``; unknown or unreadable type
    codes are reported as ``"to"``. A failure while iterating is logged and
    the recipients collected so far are returned.
    """
    type_map = {1: "to", 2: "cc", 3: "bcc"}
    result = []
    try:
        for recipient in msg.recipients:
            code = _recipient_type(getattr(recipient, "type", 1))
            result.append({
                "type": type_map.get(code, "to"),
                "email": safe(recipient, "email", default=""),
                "name": safe(recipient, "name", default=""),
            })
    except Exception as e:
        logging.error("extract_recipients: %s", e)
    return result


def extract_attachments(msg) -> list:
    """Return attachment metadata of *msg* (the content itself is not stored).

    Attachments without a file name are skipped. ``size_bytes`` is the length
    of the attachment data, or 0 when the data cannot be read or measured.
    """
    result = []
    try:
        for att in msg.attachments:
            fname = safe(att, "longFilename", "shortFilename", default="")
            if not fname:
                continue
            try:
                payload = att.data
                size = len(payload) if payload else 0
            except Exception:
                size = 0
            result.append({
                "filename": fname,
                "size_bytes": size,
                "mime_type": safe(att, "mimetype", "mimeType",
                                  default="application/octet-stream"),
                "content_id": safe(att, "cid", default=None),
                "is_inline": bool(safe(att, "isInline", default=False)),
            })
    except Exception as e:
        logging.error("extract_attachments: %s", e)
    return result


def extract_mapi_props(msg) -> dict:
    """Return all raw MAPI properties of *msg* as ``{"0xXXXX": value}``.

    The key is built from the first four characters of the property name
    (upper-cased); values go through ``to_bson``. A property that cannot be
    read is skipped.
    """
    result = {}
    try:
        props = msg.props
        if not hasattr(props, "items"):
            return {}
        for key, prop in props.items():
            try:
                name = str(key)  # non-string keys used to be dropped silently
                prop_id = f"0x{name[:4].upper()}" if len(name) >= 4 else f"0x{name.upper()}"
                result[prop_id] = to_bson(prop.value)
            except Exception:
                continue
    except Exception as e:
        logging.error("extract_mapi_props: %s", e)
    return result


# ─── Main extraction ─────────────────────────────────────────────────────────

def _text_attr(msg, name) -> str:
    """Return ``msg.<name>`` or ``""`` when it is missing, falsy or raises."""
    try:
        return getattr(msg, name, None) or ""
    except Exception:
        return ""


def _int_attr(msg, default, *names) -> int:
    """Return the first usable attribute in *names* as int, else *default*."""
    try:
        value = safe(msg, *names)
        return default if value is None else int(value)
    except Exception:
        return default


def _first_truthy(msg, *names):
    """Return the first truthy ``safe(msg, name)`` value, else ``None``."""
    for name in names:
        value = safe(msg, name)
        if value:
            return value
    return None


def _build_document(msg, msg_path: Path) -> dict:
    """Build the MongoDB document for an already opened message."""
    # ── Message-ID (falls back to the file name) ──────────────────────────
    mid = _first_truthy(msg, "messageId", "message_id", "internetMessageId")
    if not mid:
        mid = f"filename:{msg_path.stem}"
    mid = str(mid).strip()

    # ── Subject and body ─────────────────────────────────────────────────
    subject = _text_attr(msg, "subject")
    normalized_subject = safe(msg, "normalizedSubject", "normalized_subject", default="")
    body_text = _text_attr(msg, "body")

    body_html = None
    try:
        html = msg.htmlBody
        if isinstance(html, bytes):
            html = html.decode("utf-8", errors="replace")
        if html:
            # The cap counts characters, not encoded bytes.
            body_html = html if len(html) <= _MAX_HTML_CHARS else html[:_MAX_HTML_CHARS]
    except Exception:
        pass

    # ── Sender ───────────────────────────────────────────────────────────
    sender_email = _text_attr(msg, "sender")
    sender_name = safe(msg, "senderName", "sender_name", default="")
    sender_smtp = safe(msg, "senderSmtpAddress", "sent_representing_smtp_address", default="")

    # ── Recipients ───────────────────────────────────────────────────────
    recipients = extract_recipients(msg)
    to_raw = _text_attr(msg, "to")
    cc_raw = _text_attr(msg, "cc")
    bcc_raw = _text_attr(msg, "bcc")
    display_to = safe(msg, "displayTo", "display_to", default="")
    display_cc = safe(msg, "displayCc", "display_cc", default="")

    # ── Times ────────────────────────────────────────────────────────────
    try:
        received_at = parse_date(msg.date)
    except Exception:
        received_at = None

    sent_raw = _first_truthy(msg, "clientSubmitTime", "client_submit_time", "sentOn")
    sent_at = parse_date(sent_raw) if sent_raw else None

    # ── MAPI properties ──────────────────────────────────────────────────
    importance = _int_attr(msg, 1, "importance")
    sensitivity = _int_attr(msg, 0, "sensitivity")
    flag_status = _int_attr(msg, 0, "flagStatus", "flag_status")

    conversation_topic = safe(msg, "conversationTopic", "conversation_topic", default="")

    conversation_index = ""
    try:
        ci = safe(msg, "conversationIndex", "conversation_index")
        if isinstance(ci, bytes):
            conversation_index = base64.b64encode(ci).decode()
        elif ci:
            conversation_index = str(ci)
    except Exception:
        pass

    in_reply_to = safe(msg, "inReplyTo", "in_reply_to", default="")

    internet_refs = []
    try:
        refs = safe(msg, "internetReferences", "internet_references")
        if isinstance(refs, list):
            internet_refs = refs
        elif isinstance(refs, str) and refs:
            internet_refs = [r.strip() for r in refs.split() if r.strip()]
    except Exception:
        pass

    categories = []
    try:
        cats = safe(msg, "categories")
        if isinstance(cats, list):
            categories = [str(c) for c in cats if c]
        elif isinstance(cats, str) and cats:
            categories = [c.strip() for c in re.split(r"[;,]", cats) if c.strip()]
    except Exception:
        pass

    read_receipt = bool(safe(msg, "readReceiptRequested", "read_receipt_requested", default=False))
    delivery_receipt = bool(safe(msg, "deliveryReceiptRequested", "delivery_receipt_requested", default=False))

    # ── Internet headers (also a fallback source for threading fields) ───
    headers = extract_headers(msg)
    if not in_reply_to:
        in_reply_to = headers.get("in_reply_to", "")
    if not internet_refs:
        refs_str = headers.get("references", "")
        if isinstance(refs_str, str) and refs_str:
            internet_refs = [r.strip() for r in refs_str.split() if r.strip()]

    attachments = extract_attachments(msg)
    mapi_raw = extract_mapi_props(msg)

    return {
        "_id": mid,
        "filename": msg_path.name,

        "subject": subject,
        "normalized_subject": normalized_subject,
        "importance": importance,
        "sensitivity": sensitivity,
        "flag_status": flag_status,
        "read_receipt_requested": read_receipt,
        "delivery_receipt_requested": delivery_receipt,
        "has_attachments": len(attachments) > 0,
        "attachment_count": len(attachments),
        "message_size_bytes": msg_path.stat().st_size,

        "conversation_topic": conversation_topic,
        "conversation_index": conversation_index,
        "in_reply_to": in_reply_to,
        "internet_references": internet_refs,
        "categories": categories,

        "received_at": received_at,
        "sent_at": sent_at,

        "sender": {
            "email": sender_email,
            "name": sender_name,
            "smtp": sender_smtp,
        },
        "to": to_raw,
        "cc": cc_raw,
        "bcc": bcc_raw,
        "display_to": display_to,
        "display_cc": display_cc,
        "recipients": recipients,

        "body_text": body_text,
        "body_html": body_html,

        "attachments": attachments,
        "headers": headers,
        "mapi": mapi_raw,

        "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
    }


def extract_message(msg_path: Path) -> Optional[dict]:
    """Parse one .msg file into a MongoDB document.

    Args:
        msg_path: Path of the .msg file.

    Returns:
        The document dict, or ``None`` when the file cannot be opened or the
        extraction fails (the reason is written to the error log).
    """
    try:
        msg = extract_msg.Message(str(msg_path))
    except Exception as e:
        logging.error("open failed [%s]: %s", msg_path.name, e)
        return None

    try:
        return _build_document(msg, msg_path)
    except Exception as e:
        logging.error("extract_message failed [%s]: %s", msg_path.name, e)
        return None
    finally:
        # Close on every path; previously a failed extraction left the
        # message open, which adds up over a long import run.
        try:
            msg.close()
        except Exception as e:
            logging.error("close failed [%s]: %s", msg_path.name, e)


# ─── MongoDB indexes ──────────────────────────────────────────────────────────

def create_indexes(col):
    """Create the query indexes on collection *col*.

    Single-field ascending indexes, a unique sparse index on ``filename`` and
    one text index named ``text_search`` over subject, body_text, to and cc.
    """
    print(" Vytvarim indexy...")
    for field in ("received_at", "sent_at", "sender.email"):
        col.create_index([(field, ASCENDING)])
    col.create_index([("filename", ASCENDING)], unique=True, sparse=True)
    for field in ("conversation_topic", "has_attachments", "categories",
                  "importance", "flag_status"):
        col.create_index([(field, ASCENDING)])
    col.create_index(
        [(field, TEXT) for field in ("subject", "body_text", "to", "cc")],
        name="text_search",
        default_language="none",
    )
    print(" Indexy hotovy.")


# ─── MAIN ─────────────────────────────────────────────────────────────────────

def main():
    """Command-line entry point: import .msg files into MongoDB.

    Exits with status 1 when the source directory does not exist or MongoDB
    is unreachable. ``--limit`` is applied after the ``--skip-existing``
    filter, so a resumed run with a limit picks up the next N files that are
    not imported yet. Documents whose batch write failed are reported as
    errors in the progress and summary lines.
    """
    ap = argparse.ArgumentParser(description=f"parse_emails v{SCRIPT_VERSION}")
    ap.add_argument("--msgs-dir", default=str(MSGS_DIR),
                    help="Cesta k .msg souborum")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N souboru (0 = vse)")
    ap.add_argument("--skip-existing", action="store_true",
                    help="Preskocit soubory ktere jiz jsou v MongoDB (pokracovani)")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na konci")
    args = ap.parse_args()

    msgs_dir = Path(args.msgs_dir)
    start = datetime.now()

    print(f"=== parse_emails v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Zdroj: {msgs_dir}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{MONGO_COL}")

    # A missing volume mount must not look like an empty mailbox.
    if not msgs_dir.is_dir():
        print(f" CHYBA: adresar {msgs_dir} neexistuje nebo neni pripojen")
        sys.exit(1)

    # MongoDB
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        print(" MongoDB OK")
    except Exception as e:
        print(f" CHYBA: MongoDB neni dostupna -- {e}")
        sys.exit(1)

    col = client[MONGO_DB][MONGO_COL]

    # Skip existing: load the file names that are already imported.
    existing: set = set()
    if args.skip_existing:
        print(" Nacitam existujici zaznamy z MongoDB...")
        existing = set(col.distinct("filename"))
        print(f" {len(existing)} jiz importovano")

    # Scan
    print(f"\nSkenuji {msgs_dir} ...")
    all_files = sorted(msgs_dir.glob("*.msg"))
    to_process = [f for f in all_files if f.name not in existing]
    skipped = len(all_files) - len(to_process)
    if args.limit > 0:
        to_process = to_process[:args.limit]
    total = len(to_process)

    print(f" Celkem .msg: {len(all_files)}")
    print(f" Preskoceno: {skipped}")
    print(f" Ke zpracovani: {total}\n")

    if total == 0:
        print("Neni co importovat.")
        client.close()
        return

    batch = []
    parsed_count = 0   # documents extracted successfully
    parse_errors = 0   # files that could not be opened/extracted
    write_errors = 0   # extracted documents MongoDB did not accept

    def flush():
        """Write the pending batch and account for rejected documents."""
        nonlocal write_errors
        if not batch:
            return
        try:
            col.bulk_write(batch, ordered=False)
        except Exception as e:
            # A BulkWriteError lists the rejected operations in
            # details["writeErrors"]; for any other failure assume that
            # nothing from the batch was stored.
            details = getattr(e, "details", None)
            if isinstance(details, dict) and details.get("writeErrors"):
                write_errors += len(details["writeErrors"])
            else:
                write_errors += len(batch)
            logging.error("bulk_write: %s", e)
            print(f" CHYBA bulk_write: {e}")
        batch.clear()

    for i, msg_path in enumerate(to_process, 1):
        doc = extract_message(msg_path)

        if doc is None:
            parse_errors += 1
        else:
            batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
            parsed_count += 1

        if len(batch) >= BATCH_SIZE:
            flush()

        # One line per e-mail
        status = "ERR " if doc is None else "OK "
        subject_str = (doc.get("subject") or "")[:60] if doc else "?"
        sender_str = (doc.get("sender", {}).get("email") or "")[:40] if doc else "?"
        print(f" {i:>6}/{total} {status} {subject_str:<60} {sender_str}")

        if i % 500 == 0:
            elapsed = (datetime.now() - start).total_seconds()
            rate = i / elapsed if elapsed > 0 else 0
            eta_s = int((total - i) / rate) if rate > 0 else 0
            print(f" {'─'*80}")
            print(f" Průběh: ok={parsed_count - write_errors} err={parse_errors + write_errors} "
                  f"{rate:.1f} msg/s ETA {eta_s//3600}h{(eta_s%3600)//60}m")
            print(f" {'─'*80}")

    flush()

    ok_count = parsed_count - write_errors
    err_count = parse_errors + write_errors

    elapsed_total = (datetime.now() - start).total_seconds()
    print(f"\n{'='*52}")
    print(f"Vysledek: ok={ok_count} | skip={skipped} | err={err_count}")
    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
    print(f"Dokumentu v kolekci: {col.count_documents({})}")

    if not args.no_indexes:
        print()
        create_indexes(col)

    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if err_count:
        print(f"Chyby logovany do: {LOG_FILE}")

    client.close()


if __name__ == "__main__":
    main()
b/EmailsImport/parse_emails_v1.0.py @@ -115,7 +115,7 @@ if hasattr(sys.stdout, "reconfigure"): sys.stdout.reconfigure(encoding="utf-8", errors="replace") # ─── KONFIGURACE ────────────────────────────────────────────────────────────── -MSGS_DIR = Path(r"\\tower\JNJEMAILS") +MSGS_DIR = Path("/mnt/JNJEMAILS") MONGO_URI = "mongodb://192.168.1.76:27017" MONGO_DB = "emaily" MONGO_COL = "vbuzalka@its.jnj.com"