notebook
This commit is contained in:
@@ -0,0 +1,427 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
================================================================================
|
||||
Nazev: mailstore_ingest_v1.0.py
|
||||
Verze: 1.0
|
||||
Datum: 2026-06-11
|
||||
Autor: Vladimir Buzalka (asistovano Claude)
|
||||
Popis: Backfill stare historie z MailStore archivu do MongoDB `emaily`.
|
||||
Dobere do existujici kolekce schranky JEN zpravy, ktere tam jeste
|
||||
nejsou - dedup podle internet Message-ID (= _id v Mongu).
|
||||
|
||||
Cilove schema dokumentu = stejne jako Graph import, takze navazujici
|
||||
enrich_fulltext_emails + MCP `emaily` search funguji bez uprav.
|
||||
|
||||
Strategie:
|
||||
1. Nacti SET vsech Message-ID (_id) co uz v Mongu pro schranku jsou.
|
||||
2. Projdi slozky schranky (API GetChildFolders).
|
||||
3. Per slozka davkove stahni hlavicky (UID, DATE, MESSAGE-ID) - rychle.
|
||||
4. Kandidat = Message-ID neni v setu AND rok(DATE) >= --since.
|
||||
5. Pro kandidaty stahni cele telo (RFC822), naparsuj, upsert do Mongo.
|
||||
|
||||
Filtr data je client-side z DATE headeru (IMAP SEARCH je u MailStme 78s/k nicemu).
|
||||
|
||||
Spusteni:
|
||||
# KOLIK by se dobralo (nic nezapise) - delej VZDY prvni:
|
||||
python mailstore_ingest_v1.0.py "vladimir.buzalka@buzalka.cz" --since 2020 --dry-run
|
||||
# ostry beh:
|
||||
python mailstore_ingest_v1.0.py "vladimir.buzalka@buzalka.cz" --since 2020
|
||||
# test na jedne slozce / s limitem:
|
||||
python mailstore_ingest_v1.0.py "vladimir.buzalka@buzalka.cz" --since 2020 \
|
||||
--folder "vladimir.buzalka@buzalka.cz/Exchange vladimir.buzalka/Sent Items" --limit 50
|
||||
================================================================================
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import email
|
||||
import imaplib
|
||||
import json
|
||||
import re
|
||||
import ssl
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from base64 import b64encode
|
||||
from datetime import datetime, timezone
|
||||
from email.header import decode_header
|
||||
from email.utils import getaddresses, parsedate_to_datetime
|
||||
|
||||
from pymongo import MongoClient, UpdateOne
|
||||
|
||||
# --- konfigurace ------------------------------------------------------------
|
||||
MS_HOST = "192.168.1.53"
|
||||
IMAP_PORT = 143
|
||||
API_PORT = 8463
|
||||
MS_USER = "admin"
|
||||
MS_PASS = "*$N(B)vMUym!%"
|
||||
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
MONGO_DB = "emaily"
|
||||
|
||||
HEADER_BATCH = 2000 # kolik hlavicek FETCHovat naraz
|
||||
UPSERT_BATCH = 100 # kolik dokumentu zapsat naraz do Mongo
|
||||
|
||||
# --- API (jen GetChildFolders na seznam slozek) -----------------------------
|
||||
_API_BASE = f"https://{MS_HOST}:{API_PORT}/api"
|
||||
_API_AUTH = "Basic " + b64encode(f"{MS_USER}:{MS_PASS}".encode()).decode()
|
||||
_CTX = ssl.create_default_context()
|
||||
_CTX.check_hostname = False
|
||||
_CTX.verify_mode = ssl.CERT_NONE
|
||||
|
||||
|
||||
def api_result(method: str, params: dict | None = None):
|
||||
data = urllib.parse.urlencode(params or {}).encode()
|
||||
req = urllib.request.Request(f"{_API_BASE}/invoke/{method}", data=data, method="POST",
|
||||
headers={"Authorization": _API_AUTH,
|
||||
"Content-Type": "application/x-www-form-urlencoded"})
|
||||
with urllib.request.urlopen(req, context=_CTX, timeout=30) as resp:
|
||||
r = json.loads(resp.read().decode("utf-8-sig"))
|
||||
if r.get("statusCode") != "succeeded":
|
||||
raise RuntimeError(f"{method}: {(r.get('error') or {}).get('message')}")
|
||||
return r.get("result")
|
||||
|
||||
|
||||
def collect_folders(mailbox: str) -> list[str]:
|
||||
"""Vrati seznam plnych cest vsech slozek schranky (rekurzivne)."""
|
||||
tree = api_result("GetChildFolders", {"folder": mailbox, "maxLevels": 20})
|
||||
out: list[str] = []
|
||||
|
||||
def walk(node):
|
||||
for ch in node.get("childFolders") or []:
|
||||
out.append(ch["fullName"])
|
||||
walk(ch)
|
||||
|
||||
walk(tree)
|
||||
return out
|
||||
|
||||
|
||||
# --- IMAP --------------------------------------------------------------------
|
||||
|
||||
def imap_connect() -> imaplib.IMAP4:
|
||||
ctx = ssl.create_default_context()
|
||||
ctx.check_hostname = False
|
||||
ctx.verify_mode = ssl.CERT_NONE
|
||||
M = imaplib.IMAP4(MS_HOST, IMAP_PORT)
|
||||
M.starttls(ssl_context=ctx)
|
||||
M.login(MS_USER, MS_PASS)
|
||||
return M
|
||||
|
||||
|
||||
_SEQ_RX = re.compile(rb"^(\d+)\s")
|
||||
_UID_RX = re.compile(rb"UID (\d+)")
|
||||
|
||||
|
||||
def dec(s) -> str:
|
||||
if not s:
|
||||
return ""
|
||||
out = []
|
||||
for txt, enc in decode_header(s):
|
||||
out.append(txt.decode(enc or "utf-8", errors="replace") if isinstance(txt, bytes) else txt)
|
||||
return "".join(out).replace("\r", " ").replace("\n", " ").strip()
|
||||
|
||||
|
||||
def parse_date(raw) -> datetime | None:
|
||||
if not raw:
|
||||
return None
|
||||
try:
|
||||
dt = parsedate_to_datetime(raw)
|
||||
if dt.tzinfo:
|
||||
dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
|
||||
return dt
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def encode_mutf7(s: str) -> str:
|
||||
"""Nazev IMAP slozky -> modified UTF-7 (RFC 3501). MailStore neumi
|
||||
UTF8=ACCEPT, takze slozky s diakritikou (Dorucena posta) musi byt mUTF-7.
|
||||
Vysledek je cisty ASCII -> bezpecne projde imaplib (ascii encoding)."""
|
||||
res = []
|
||||
i, n = 0, len(s)
|
||||
while i < n:
|
||||
ch = s[i]
|
||||
o = ord(ch)
|
||||
if 0x20 <= o <= 0x7e:
|
||||
res.append("&-" if ch == "&" else ch)
|
||||
i += 1
|
||||
else:
|
||||
j = i
|
||||
while j < n and not (0x20 <= ord(s[j]) <= 0x7e):
|
||||
j += 1
|
||||
import base64 as _b64
|
||||
b = s[i:j].encode("utf-16-be")
|
||||
enc = _b64.b64encode(b).decode("ascii").rstrip("=").replace("/", ",")
|
||||
res.append("&" + enc + "-")
|
||||
i = j
|
||||
return "".join(res)
|
||||
|
||||
|
||||
def imap_select(M: imaplib.IMAP4, folder: str):
|
||||
"""SELECT slozky s mUTF-7 enkodovanim nazvu (kvuli diakritice)."""
|
||||
return M.select(f'"{encode_mutf7(folder)}"', readonly=True)
|
||||
|
||||
|
||||
def scan_folder_headers(M: imaplib.IMAP4, folder: str):
|
||||
"""Davkove stahne (seq, uid, msgid, date) vsech zprav slozky."""
|
||||
typ, data = imap_select(M, folder)
|
||||
if typ != "OK":
|
||||
return None, []
|
||||
total = int(data[0]) if data and data[0] else 0
|
||||
if total == 0:
|
||||
return 0, []
|
||||
items = []
|
||||
lo = 1
|
||||
while lo <= total:
|
||||
hi = min(lo + HEADER_BATCH - 1, total)
|
||||
typ, msgs = M.fetch(f"{lo}:{hi}",
|
||||
"(UID BODY.PEEK[HEADER.FIELDS (MESSAGE-ID DATE)])")
|
||||
for it in msgs:
|
||||
if not isinstance(it, tuple):
|
||||
continue
|
||||
meta, hdr = it[0], it[1]
|
||||
mseq = _SEQ_RX.match(meta or b"")
|
||||
muid = _UID_RX.search(meta or b"")
|
||||
h = email.message_from_bytes(hdr or b"")
|
||||
mid = (h.get("Message-ID") or "").strip()
|
||||
items.append((int(mseq.group(1)) if mseq else 0,
|
||||
int(muid.group(1)) if muid else 0,
|
||||
mid, parse_date(h.get("Date"))))
|
||||
lo = hi + 1
|
||||
return total, items
|
||||
|
||||
|
||||
def fetch_full(M: imaplib.IMAP4, seq: int) -> bytes | None:
|
||||
typ, data = M.fetch(str(seq), "(RFC822)")
|
||||
if typ != "OK" or not data or not isinstance(data[0], tuple):
|
||||
return None
|
||||
return data[0][1]
|
||||
|
||||
|
||||
# --- mapovani EML -> Mongo dokument -----------------------------------------
|
||||
|
||||
def relativize(folder: str, mailbox: str) -> str:
|
||||
"""schranka/Exchange X/Sent Items -> Sent Items (jako Graph folder_path)."""
|
||||
parts = folder.split("/")
|
||||
# odstran prefix schranky a 'Exchange ...' uroven
|
||||
if len(parts) >= 2 and parts[0] == mailbox:
|
||||
rest = parts[2:] if len(parts) > 2 else parts[1:]
|
||||
return "/".join(rest) if rest else parts[-1]
|
||||
return parts[-1]
|
||||
|
||||
|
||||
def parse_addr_one(raw) -> dict:
|
||||
if not raw:
|
||||
return {"email": None, "name": None}
|
||||
pairs = getaddresses([raw])
|
||||
if not pairs:
|
||||
return {"email": None, "name": None}
|
||||
name, addr = pairs[0]
|
||||
return {"email": (addr or "").lower() or None, "name": dec(name) or (addr or None)}
|
||||
|
||||
|
||||
def parse_recipients(msg) -> list[dict]:
|
||||
out = []
|
||||
for kind, hdr in (("to", "To"), ("cc", "Cc"), ("bcc", "Bcc")):
|
||||
val = msg.get(hdr)
|
||||
if not val:
|
||||
continue
|
||||
for name, addr in getaddresses([val]):
|
||||
if addr:
|
||||
out.append({"type": kind, "email": addr.lower(),
|
||||
"name": dec(name) or addr})
|
||||
return out
|
||||
|
||||
|
||||
def extract_bodies(msg):
|
||||
body_text = body_html = ""
|
||||
atts = []
|
||||
for part in msg.walk():
|
||||
if part.is_multipart():
|
||||
continue
|
||||
ct = part.get_content_type()
|
||||
disp = str(part.get("Content-Disposition") or "")
|
||||
payload = part.get_payload(decode=True)
|
||||
is_att = "attachment" in disp or (part.get_filename() and ct not in ("text/plain", "text/html"))
|
||||
if is_att:
|
||||
atts.append({
|
||||
"filename": dec(part.get_filename()) or "(bez nazvu)",
|
||||
"size_bytes": len(payload or b""),
|
||||
"mime_type": ct,
|
||||
"is_inline": "inline" in disp,
|
||||
})
|
||||
elif ct == "text/plain" and not body_text:
|
||||
body_text = (payload or b"").decode(part.get_content_charset() or "utf-8", errors="replace")
|
||||
elif ct == "text/html" and not body_html:
|
||||
body_html = (payload or b"").decode(part.get_content_charset() or "utf-8", errors="replace")
|
||||
return body_text, body_html, atts
|
||||
|
||||
|
||||
def build_doc(raw: bytes, uid: int, folder: str, mailbox: str) -> dict | None:
|
||||
msg = email.message_from_bytes(raw)
|
||||
mid = (msg.get("Message-ID") or "").strip()
|
||||
if not mid:
|
||||
return None
|
||||
dt = parse_date(msg.get("Date"))
|
||||
body_text, body_html, atts = extract_bodies(msg)
|
||||
now = datetime.now(timezone.utc).replace(tzinfo=None)
|
||||
preview = (body_text or "")[:255]
|
||||
return {
|
||||
"_id": mid,
|
||||
"source": "mailstore",
|
||||
"mailstore_uid": uid,
|
||||
"mailstore_folder": folder,
|
||||
# graph_id zamerne VYNECHANO: kolekce ma unique+sparse index na graph_id,
|
||||
# explicitni None by kolidoval (sparse ignoruje jen CHYBEJICI pole).
|
||||
"conversation_id": None,
|
||||
"folder_path": relativize(folder, mailbox),
|
||||
"subject": dec(msg.get("Subject")),
|
||||
"sender": parse_addr_one(msg.get("From")),
|
||||
"recipients": parse_recipients(msg),
|
||||
"to": dec(msg.get("To")),
|
||||
"cc": dec(msg.get("Cc")),
|
||||
"bcc": dec(msg.get("Bcc")),
|
||||
"sent_at": dt,
|
||||
"received_at": dt,
|
||||
"modified_at": now,
|
||||
"created_at": now,
|
||||
"parsed_at": now,
|
||||
"is_read": True,
|
||||
"is_draft": "draft" in folder.lower() or "koncept" in folder.lower(),
|
||||
"has_attachments": bool(atts),
|
||||
"attachment_count": len(atts),
|
||||
"attachments": atts,
|
||||
"body_html": body_html or None,
|
||||
"body_text": body_text or None,
|
||||
"body_preview": preview,
|
||||
}
|
||||
|
||||
|
||||
# --- hlavni ------------------------------------------------------------------
|
||||
|
||||
def main() -> int:
|
||||
ap = argparse.ArgumentParser(description="MailStore -> Mongo backfill (dedup dle Message-ID)")
|
||||
ap.add_argument("mailbox", help="Schranka (top-level slozka MailStore = Mongo kolekce)")
|
||||
ap.add_argument("--since", type=int, default=None,
|
||||
help="Ber jen zpravy s rokem >= SINCE (napr. 2020)")
|
||||
ap.add_argument("--until", type=int, default=None,
|
||||
help="Ber jen zpravy s rokem <= UNTIL")
|
||||
ap.add_argument("--folder", default=None, help="Jen jedna konkretni slozka (plna cesta)")
|
||||
ap.add_argument("--limit", type=int, default=None, help="Max zprav k ingestu (test)")
|
||||
ap.add_argument("--max-folders", type=int, default=None, help="Max slozek (diagnostika)")
|
||||
ap.add_argument("--dry-run", action="store_true",
|
||||
help="Jen spocitej kolik by se dobralo, NIC nezapisuj")
|
||||
args = ap.parse_args()
|
||||
|
||||
t0 = time.time()
|
||||
print(f"=== MailStore ingest v1.0 | schranka: {args.mailbox} ===")
|
||||
print(f"Filtr: rok >= {args.since or '-'}{' a <= ' + str(args.until) if args.until else ''}"
|
||||
f"{' [DRY-RUN]' if args.dry_run else ''}")
|
||||
|
||||
# Mongo + set znamych Message-ID
|
||||
mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
|
||||
mongo.admin.command("ping")
|
||||
coll = mongo[MONGO_DB][args.mailbox]
|
||||
print("Nacitam existujici Message-ID z Mongo...", flush=True)
|
||||
known = set(coll.distinct("_id"))
|
||||
print(f" v Mongu uz mam: {len(known):,} zprav")
|
||||
|
||||
# slozky
|
||||
if args.folder:
|
||||
folders = [args.folder]
|
||||
else:
|
||||
folders = collect_folders(args.mailbox)
|
||||
print(f"Slozek ke kontrole: {len(folders)}")
|
||||
|
||||
M = imap_connect()
|
||||
|
||||
grand_seen = grand_cand = grand_ingested = 0
|
||||
queue: list[UpdateOne] = []
|
||||
|
||||
def flush():
|
||||
nonlocal queue
|
||||
if queue and not args.dry_run:
|
||||
coll.bulk_write(queue, ordered=False)
|
||||
queue = []
|
||||
|
||||
nonlocal_M = {"M": M}
|
||||
for fidx, folder in enumerate(folders):
|
||||
if args.max_folders and fidx >= args.max_folders:
|
||||
print(f" (--max-folders {args.max_folders} dosazeno)")
|
||||
break
|
||||
try:
|
||||
total, items = scan_folder_headers(nonlocal_M["M"], folder)
|
||||
except Exception as ex:
|
||||
# jedna chybna slozka nesmi shodit cely beh - zaloguj a pokracuj.
|
||||
# Pri chybe IMAP spojeni (abort) se prepoj.
|
||||
print(f" [{relativize(folder, args.mailbox)[:45]:45}] CHYBA: {type(ex).__name__}: {str(ex)[:80]}", flush=True)
|
||||
try:
|
||||
nonlocal_M["M"].logout()
|
||||
except Exception:
|
||||
pass
|
||||
nonlocal_M["M"] = imap_connect()
|
||||
continue
|
||||
M = nonlocal_M["M"]
|
||||
if not total:
|
||||
continue
|
||||
# kandidati: rok ok, neni v known, ma msgid
|
||||
cands = []
|
||||
for seq, uid, mid, dt in items:
|
||||
if not mid or mid in known:
|
||||
continue
|
||||
yr = dt.year if dt else None
|
||||
if args.since and (yr is None or yr < args.since):
|
||||
continue
|
||||
if args.until and (yr is None or yr > args.until):
|
||||
continue
|
||||
cands.append((seq, uid, mid))
|
||||
grand_seen += total
|
||||
grand_cand += len(cands)
|
||||
rel = relativize(folder, args.mailbox)
|
||||
print(f" [{rel[:45]:45}] zprav={total:>6} k dobrani={len(cands):>6}", flush=True)
|
||||
|
||||
if args.dry_run:
|
||||
continue
|
||||
|
||||
for seq, uid, mid in cands:
|
||||
if args.limit and grand_ingested >= args.limit:
|
||||
break
|
||||
raw = fetch_full(M, seq)
|
||||
if not raw:
|
||||
continue
|
||||
doc = build_doc(raw, uid, folder, args.mailbox)
|
||||
if not doc:
|
||||
continue
|
||||
queue.append(UpdateOne({"_id": doc["_id"]}, {"$setOnInsert": doc}, upsert=True))
|
||||
known.add(doc["_id"])
|
||||
grand_ingested += 1
|
||||
if len(queue) >= UPSERT_BATCH:
|
||||
flush()
|
||||
flush()
|
||||
if args.limit and grand_ingested >= args.limit:
|
||||
print(f" (dosazen limit {args.limit})")
|
||||
break
|
||||
|
||||
M.logout()
|
||||
flush()
|
||||
|
||||
print("-" * 64)
|
||||
print(f"Zprav proskenovano: {grand_seen:,}")
|
||||
print(f"K dobrani (chybi, v okne): {grand_cand:,}")
|
||||
if args.dry_run:
|
||||
print(">>> DRY-RUN: nic nezapsano. Pro ostry beh spust bez --dry-run.")
|
||||
else:
|
||||
print(f"Zapsano do Mongo: {grand_ingested:,}")
|
||||
print(f"Trvalo: {time.time()-t0:.1f}s")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
sys.exit(main())
|
||||
except KeyboardInterrupt:
|
||||
print("\nPreruseno", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
Reference in New Issue
Block a user