z230
This commit is contained in:
@@ -0,0 +1,587 @@
|
||||
"""
|
||||
==============================================================================
|
||||
Skript: enrich_fulltext_emails_v1.4.py
|
||||
Verze: 1.4
|
||||
Datum: 2026-06-10
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Zmeny v1.4 (2026-06-10):
|
||||
- Bugfix: NON_MAILBOX_COLLECTIONS rozsireno o "jnj_messages" a
|
||||
"jnj_sync_state" (pomocne kolekce JNJ folder trackingu). Predtim je
|
||||
discover_mailboxes bral jako schranky (jiny schema dokumentu) ->
|
||||
errors=1 -> cely krok 5 FAIL(1) pri kazdem behu pipeline.
|
||||
|
||||
Popis:
|
||||
Vytahne plny text z emailu ulozenych v MongoDB (db: emaily) a ulozi ho do
|
||||
PostgreSQL (db: MongoEmaily, tabulka: emails) s GIN tsvector indexem.
|
||||
|
||||
Emaily se NESTAHUJI znovu - tela uz jsou v Mongo z parse_emails_graph_v1.4
|
||||
(a refetch_text_bodies_v1.0 pro stare plain-text emaily).
|
||||
Tento skript jen vybere prvni dostupne telo a posle text do PG na fulltext.
|
||||
|
||||
Zmeny v1.3.1 (2026-06-09):
|
||||
- Bugfix: _clean_for_pg nahrazuje osamocene surrogate (\\ud800-\\udfff) za U+FFFD.
|
||||
Drive jeden mail se surrogaty (napr. JNJ .msg) shodil celou davku a krok 5
|
||||
skoncil FAIL. EXTRACTOR_VERSION zustava 1.2 (neni zmena fallback logiky).
|
||||
|
||||
Zmeny v1.3 vs v1.2:
|
||||
- Bugfix: NON_MAILBOX_COLLECTIONS = {"attachments_index", "sync_state"}
|
||||
(sync_state pribyla v delta syncu, predtim ji v1.2 brala jako mailbox).
|
||||
- --index-reset: pred zpracovanim schranky vymaze vsechny jeji emaily z PG
|
||||
(force re-extract; pouzij kdyz povysis EXTRACTOR_VERSION nebo chces ciste).
|
||||
- Vylepseny header per-mailbox: ukaze pocet v Mongu, v PG a k zpracovani.
|
||||
|
||||
Zmeny v1.2 vs v1.1:
|
||||
- S/MIME emaily: pokud unwrap_smime_v1.0 ulozil smime_body_text/smime_body_html,
|
||||
pouzije se PREFEROVANE pred bezvyznamnym wrapper telem.
|
||||
- body_source: nova hodnota "smime".
|
||||
- EXTRACTOR_VERSION=1.2 -> vsechny existujici emaily v PG se preparsuji.
|
||||
|
||||
Zmeny v1.1 vs v1.0:
|
||||
- Fallback poradi rozsireno o body_text.
|
||||
- body_source umi novou hodnotu "text" (plne plain-text telo, max 2 MB).
|
||||
|
||||
Zdroj:
|
||||
MongoDB 192.168.1.76 db=emaily kolekce=<mailbox>
|
||||
(krome NON_MAILBOX_COLLECTIONS)
|
||||
|
||||
Cil:
|
||||
PostgreSQL 192.168.1.76 db=MongoEmaily tabulka=emails
|
||||
tsvector config 'soubory' (sdileny - simple + unaccent)
|
||||
|
||||
Inkrementalita:
|
||||
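Pokud (mailbox, message_id) jiz v PG existuje, extractor_version je aktualni
a ok=true -> skip (modified_at se pri rozhodovani NEPOROVNAVA, jen se uklada).
Vyjimka: email ma v Mongu smime_unwrapped, ale v PG body_source != 'smime'
-> re-enrich. Pri zmene verze extractoru se vse preparsuje.
--index-reset to obejde a smaze PG pred behem.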
|
||||
|
||||
Spusteni:
|
||||
python enrich_fulltext_emails_v1.4.py # vsechny schranky
|
||||
python enrich_fulltext_emails_v1.4.py --mailbox ordinace@buzalkova.cz
|
||||
python enrich_fulltext_emails_v1.4.py --limit 500 # test
|
||||
python enrich_fulltext_emails_v1.4.py --mailbox X --index-reset # smaze PG schranky a re-extrahuje vsechno
|
||||
python enrich_fulltext_emails_v1.4.py --index-reset # smaze CELY index a postavi znovu (POMALE!)
|
||||
==============================================================================
|
||||
"""
|
||||
|
||||
from __future__ import annotations

import argparse
import os
import re
import sys
import time
import traceback
from datetime import datetime, timezone
from typing import Optional

import psycopg
from bs4 import BeautifulSoup
from pymongo import MongoClient
|
||||
|
||||
# --- konfigurace ------------------------------------------------------------
|
||||
# Connection settings. Each endpoint can be overridden through an environment
# variable so that hosts/credentials do not have to live in the source tree;
# the literals below are only the backward-compatible defaults.
MONGO_URI = os.environ.get("ENRICH_MONGO_URI") or "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"

# SECURITY(review): the fallback DSN embeds a real password in version control.
# Provide ENRICH_PG_DSN (or rely on ~/.pgpass) and rotate/remove this literal.
PG_DSN = os.environ.get("ENRICH_PG_DSN") or (
    "host=192.168.1.76 port=5432 dbname=MongoEmaily "
    "user=vladimir.buzalka password=Vlado7309208104++"
)

# Stored with every row; rows whose extractor_version differs are re-extracted.
# Do NOT bump unless the body fallback logic changes.
EXTRACTOR_VERSION = "1.2"

# Upper bound (in UTF-8 bytes) for the body text stored in PG: 5 MB.
MAX_TEXT_BYTES = 5 * 1024 * 1024

# Collections in the `emaily` database that are NOT mailboxes and must not be
# processed (jnj_messages + jnj_sync_state are JNJ folder-tracking helpers).
NON_MAILBOX_COLLECTIONS = {
    "attachments_index",
    "sync_state",
    "jnj_messages",
    "jnj_sync_state",
}

# Number of rows buffered before one executemany()/commit round trip.
BATCH_SIZE = 100
|
||||
|
||||
|
||||
# --- SCHEMA -----------------------------------------------------------------
|
||||
|
||||
# Idempotent DDL executed on every start: extensions, the shared 'soubory'
# text-search configuration (simple + unaccent), the `emails` table with a
# generated tsvector column, and its indexes.
SCHEMA_SQL = """
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pg_trgm;

DO $$
BEGIN
    IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'soubory') THEN
        CREATE TEXT SEARCH CONFIGURATION soubory ( COPY = simple );
        ALTER TEXT SEARCH CONFIGURATION soubory
            ALTER MAPPING FOR hword, hword_part, word
            WITH unaccent, simple;
    END IF;
END$$;

CREATE TABLE IF NOT EXISTS emails (
    id                  BIGSERIAL PRIMARY KEY,
    mailbox             TEXT NOT NULL,
    message_id          TEXT NOT NULL,
    graph_id            TEXT,
    conversation_id     TEXT,
    folder_path         TEXT,
    subject             TEXT,
    sender_email        TEXT,
    sender_name         TEXT,
    to_addrs            TEXT,
    cc_addrs            TEXT,
    bcc_addrs           TEXT,
    sent_at             TIMESTAMPTZ,
    received_at         TIMESTAMPTZ,
    modified_at         TIMESTAMPTZ,
    is_read             BOOLEAN,
    is_draft            BOOLEAN,
    has_attachments     BOOLEAN,
    attachment_count    INT,
    attachments_summary TEXT,
    body                TEXT,
    body_length         INT,
    body_source         TEXT,   -- 'smime' | 'html' | 'text' | 'preview' | 'empty'
    tsv                 tsvector GENERATED ALWAYS AS (
        to_tsvector('soubory'::regconfig,
            left(
                coalesce(subject, '') || ' ' ||
                coalesce(sender_email, '') || ' ' ||
                coalesce(sender_name, '') || ' ' ||
                coalesce(to_addrs, '') || ' ' ||
                coalesce(cc_addrs, '') || ' ' ||
                coalesce(attachments_summary, '') || ' ' ||
                coalesce(body, ''),
                800000)
        )
    ) STORED,
    extracted_at        TIMESTAMPTZ DEFAULT now(),
    extractor_version   TEXT,
    ok                  BOOLEAN,
    error               TEXT,
    UNIQUE (mailbox, message_id)
);

CREATE INDEX IF NOT EXISTS emails_tsv_gin ON emails USING gin(tsv);
CREATE INDEX IF NOT EXISTS emails_subject_trgm ON emails USING gin(subject gin_trgm_ops);
CREATE INDEX IF NOT EXISTS emails_sender_email_idx ON emails(sender_email);
CREATE INDEX IF NOT EXISTS emails_mailbox_idx ON emails(mailbox);
CREATE INDEX IF NOT EXISTS emails_received_idx ON emails(received_at DESC);
CREATE INDEX IF NOT EXISTS emails_conv_idx ON emails(conversation_id);
"""
|
||||
|
||||
|
||||
# --- HELPERY ----------------------------------------------------------------
|
||||
|
||||
_CTRL_RX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
|
||||
_WS_RX = re.compile(r"[ \t]+")
|
||||
_NL_RX = re.compile(r"\n{3,}")
|
||||
# Osamocene surrogate (\ud800-\udfff) jsou neplatne v UTF-8 -> psycopg pri zapisu
|
||||
# vyhodi UnicodeEncodeError ("surrogates not allowed") a shodi celou davku.
|
||||
# Vznikaji ze spatne dekodovanych tel (napr. nektere JNJ .msg). Nahradime je U+FFFD.
|
||||
_SURROGATE_RX = re.compile(r"[\ud800-\udfff]")
|
||||
|
||||
|
||||
def _clean_for_pg(s: str) -> str:
|
||||
if not s:
|
||||
return ""
|
||||
s = _CTRL_RX.sub("", s)
|
||||
if _SURROGATE_RX.search(s):
|
||||
s = _SURROGATE_RX.sub("�", s)
|
||||
return s
|
||||
|
||||
|
||||
def _truncate(s: str) -> str:
    """Clean `s` and cap it at MAX_TEXT_BYTES of UTF-8.

    The cut is made on the encoded bytes; a multi-byte character split by the
    cut is dropped (decode with errors="ignore") instead of raising.
    """
    cleaned = _clean_for_pg(s or "")
    if not cleaned:
        return ""
    encoded = cleaned.encode("utf-8", errors="replace")
    if len(encoded) > MAX_TEXT_BYTES:
        return encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
    return cleaned
|
||||
|
||||
|
||||
def html_to_text(html: str) -> str:
    """Convert an HTML body to plain text, one non-empty line per text line.

    Parses with the lxml backend and falls back to html.parser if constructing
    the lxml-based soup raises. <script>, <style> and <head> are dropped,
    runs of spaces/tabs are collapsed and blank lines are removed.
    """
    if not html:
        return ""
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        soup = BeautifulSoup(html, "html.parser")

    for unwanted in soup(["script", "style", "head"]):
        unwanted.decompose()

    kept = []
    for raw_line in soup.get_text(separator="\n").split("\n"):
        line = _WS_RX.sub(" ", raw_line).strip()
        if line:
            kept.append(line)

    return _NL_RX.sub("\n\n", "\n".join(kept))
|
||||
|
||||
|
||||
def fmt_recipients(recipients: list, kind: str) -> str:
    """Format the recipients whose "type" equals `kind` as one "; "-joined string.

    Each entry becomes "Name <email>", or just the email or the name when only
    one of them is present. Non-dict entries and entries with neither field are
    skipped. Returns "" for an empty/None list.
    """
    if not recipients:
        return ""
    formatted = []
    for rcpt in recipients:
        if not (isinstance(rcpt, dict) and rcpt.get("type") == kind):
            continue
        name = (rcpt.get("name") or "").strip()
        email = (rcpt.get("email") or "").strip()
        if name and email:
            formatted.append(f"{name} <{email}>")
        elif email or name:
            formatted.append(email or name)
    return "; ".join(formatted)
|
||||
|
||||
|
||||
def fmt_attachments(attachments: list) -> str:
    """Join the names of the first 20 attachments with " | ".

    The name is taken from "name", falling back to "filename"; non-dict entries
    and entries without a name are skipped. Returns "" for an empty/None list.
    """
    if not attachments:
        return ""
    candidates = (
        att.get("name") or att.get("filename") or ""
        for att in attachments[:20]
        if isinstance(att, dict)
    )
    return " | ".join(label for label in candidates if label)
|
||||
|
||||
|
||||
def _short(s, n=60):
|
||||
if not s:
|
||||
return ""
|
||||
s = str(s).replace("\n", " ").strip()
|
||||
return s if len(s) <= n else s[:n] + "..."
|
||||
|
||||
|
||||
def _now() -> datetime:
|
||||
return datetime.now(tz=timezone.utc)
|
||||
|
||||
|
||||
def _aware_utc(dt: Optional[datetime]) -> Optional[datetime]:
|
||||
"""Sjednoceni: PG TIMESTAMPTZ -> tz-aware UTC; Mongo datetime -> naive (UTC).
|
||||
Vrati tz-aware UTC datetime nebo None."""
|
||||
if dt is None:
|
||||
return None
|
||||
if dt.tzinfo is None:
|
||||
return dt.replace(tzinfo=timezone.utc)
|
||||
return dt.astimezone(timezone.utc)
|
||||
|
||||
|
||||
# --- HLAVNI SMYCKA ----------------------------------------------------------
|
||||
|
||||
def _is_up_to_date(prev, doc: dict) -> bool:
    """Return True when the cached PG state makes re-extracting `doc` unnecessary.

    `prev` is the (extractor_version, ok, body_source) tuple loaded from PG for
    this message, or None when the message has no row yet.
    """
    if not (prev and prev[0] == EXTRACTOR_VERSION and prev[1]):
        return False
    # Exception: the Mongo doc is flagged smime_unwrapped but the PG row was
    # built from another source -> unwrap_smime added the unwrapped text after
    # the message had been enriched, so it must be re-enriched.
    return not (bool(doc.get("smime_unwrapped")) and prev[2] != "smime")


def _has_pending_docs(mongo_coll, existing: dict) -> bool:
    """Scan only `_id` and `smime_unwrapped` and report whether any doc needs work."""
    light = mongo_coll.find({}, {"_id": 1, "smime_unwrapped": 1})
    try:
        return any(
            not _is_up_to_date(existing.get(d.get("_id") or ""), d)
            for d in light
        )
    finally:
        light.close()


def _extract_body(doc: dict) -> tuple[str, str]:
    """Pick the first available body of `doc`; return (text, body_source).

    Order: unwrapped S/MIME content, HTML body converted to text, plain-text
    body, body preview. Returns ("", "empty") when none is available.
    """
    if doc.get("smime_unwrapped"):
        s_text = doc.get("smime_body_text") or ""
        s_html = doc.get("smime_body_html") or ""
        s_html_text = html_to_text(s_html) if s_html else ""
        combined = "\n\n".join(p for p in (s_text, s_html_text) if p)
        s_subject = doc.get("smime_subject") or ""
        if s_subject:
            combined = f"Subject: {s_subject}\n\n{combined}"
        if combined:
            return combined, "smime"

    html = doc.get("body_html") or ""
    h_text = html_to_text(html) if html else ""
    if h_text:
        return h_text, "html"

    plain = doc.get("body_text") or ""
    if plain:
        return plain, "text"

    preview = doc.get("body_preview") or ""
    if preview:
        return preview, "preview"

    return "", "empty"


def process_mailbox(pg: psycopg.Connection, mongo_coll, mailbox: str,
                    limit: Optional[int] = None,
                    index_reset: bool = False) -> dict:
    """Extract the text of one mailbox's emails from Mongo and upsert it into PG.

    Args:
        pg: open PostgreSQL connection; rows are written through `_flush`.
        mongo_coll: Mongo collection holding the mailbox's messages.
        mailbox: value stored in the `mailbox` column of every row.
        limit: if truthy, cap on the number of Mongo documents scanned
            (skipped documents count towards it).
        index_reset: if True, delete all PG rows of this mailbox first.

    Returns:
        Dict with keys mailbox, processed, ok, errors, skipped, empty_body.

    Raises:
        Whatever PG/Mongo raise; per-message body extraction errors are caught
        and recorded in the row (ok=False, error=...) instead.
    """
    # --index-reset: wipe everything stored for this mailbox in PG.
    if index_reset:
        with pg.cursor() as cur:
            cur.execute("DELETE FROM emails WHERE mailbox = %s", (mailbox,))
            deleted = cur.rowcount
        pg.commit()
        print(f"[{mailbox}] --index-reset: smazano {deleted} radku v PG")

    # Existing PG rows for a fast incremental lookup:
    # message_id -> (extractor_version, ok, body_source)
    with pg.cursor() as cur:
        cur.execute(
            "SELECT message_id, extractor_version, ok, body_source "
            "FROM emails WHERE mailbox = %s",
            (mailbox,),
        )
        existing = {row[0]: (row[1], row[2], row[3]) for row in cur.fetchall()}

    mongo_total = mongo_coll.estimated_document_count()
    pg_total = len(existing)
    pg_uptodate = sum(1 for v in existing.values()
                      if v[0] == EXTRACTOR_VERSION and v[1])
    to_process_estimate = mongo_total - pg_uptodate
    print(f"\n========== {mailbox} ==========")
    print(f" v Mongu: {mongo_total}")
    print(f" v PG: {pg_total} (z toho ext_v={EXTRACTOR_VERSION} & ok=true: {pg_uptodate})")
    print(f" k zpracovani: ~{to_process_estimate}{' (limit=' + str(limit) + ')' if limit else ''}")

    if to_process_estimate <= 0 and not index_reset and not limit:
        # Matching counts alone do not prove there is nothing to do: a message
        # may have gained S/MIME text since it was indexed, and PG rows of
        # messages no longer in Mongo can mask new ones. Confirm with a cheap
        # id-only scan before taking the shortcut.
        if not _has_pending_docs(mongo_coll, existing):
            print(" Nic noveho ke zpracovani.")
            return {"mailbox": mailbox, "processed": 0, "ok": 0, "errors": 0,
                    "skipped": pg_uptodate, "empty_body": 0}
        print(" Pocty sedi, ale nektere emaily vyzaduji (re)index -> plny pruchod.")

    proj = {
        "_id": 1, "graph_id": 1, "conversation_id": 1, "folder_path": 1,
        "subject": 1, "sender": 1, "recipients": 1,
        "sent_at": 1, "received_at": 1, "modified_at": 1,
        "is_read": 1, "is_draft": 1,
        "has_attachments": 1, "attachment_count": 1, "attachments": 1,
        "body_html": 1, "body_text": 1, "body_preview": 1,
        "smime_unwrapped": 1, "smime_body_text": 1, "smime_body_html": 1,
        "smime_subject": 1, "smime_inner_attachments": 1,
    }
    cursor = mongo_coll.find({}, proj, no_cursor_timeout=True)
    if limit:
        cursor = cursor.limit(limit)

    processed = ok = errors = skipped = empty_body = 0
    queue: list[dict] = []
    n = 0

    try:
        for doc in cursor:
            n += 1
            msg_id = doc.get("_id") or ""
            # NOTE(review): modified_at is stored in PG but not compared here,
            # so a document changed in Mongo is not re-indexed on that basis.
            mongo_mtime = doc.get("modified_at")

            if _is_up_to_date(existing.get(msg_id), doc):
                skipped += 1
                continue

            sender = doc.get("sender") or {}
            recipients = doc.get("recipients") or []
            attachments = doc.get("attachments") or []
            inner = doc.get("smime_inner_attachments") or []
            if inner:
                # Attachments found inside the S/MIME envelope are listed too,
                # tagged so they can be told apart in the summary.
                attachments = list(attachments) + [
                    {"filename": (a.get("filename") or "") + " [smime]"}
                    for a in inner
                    if isinstance(a, dict) and a.get("filename")
                ]

            row = {
                "mailbox": mailbox,
                "message_id": msg_id,
                "graph_id": doc.get("graph_id"),
                "conversation_id": doc.get("conversation_id"),
                "folder_path": doc.get("folder_path"),
                "subject": doc.get("subject") or "",
                "sender_email": sender.get("email"),
                "sender_name": sender.get("name"),
                "to_addrs": fmt_recipients(recipients, "to"),
                "cc_addrs": fmt_recipients(recipients, "cc"),
                "bcc_addrs": fmt_recipients(recipients, "bcc"),
                # Timestamps from Mongo are naive but meant as UTC. Tag them
                # tz-aware so PG TIMESTAMPTZ stores the right UTC value and
                # does not apply the session time zone offset.
                "sent_at": _aware_utc(doc.get("sent_at")),
                "received_at": _aware_utc(doc.get("received_at")),
                "modified_at": _aware_utc(mongo_mtime),
                "is_read": doc.get("is_read"),
                "is_draft": doc.get("is_draft"),
                "has_attachments": doc.get("has_attachments"),
                "attachment_count": doc.get("attachment_count"),
                "attachments_summary": fmt_attachments(attachments),
                "body": None,
                "body_length": 0,
                "body_source": "empty",
                "extracted_at": _now(),
                "extractor_version": EXTRACTOR_VERSION,
                "ok": False,
                "error": None,
            }

            status = "OK "
            try:
                text, source = _extract_body(doc)
                row["body_source"] = source
                if not text:
                    empty_body += 1
                body = _truncate(text)
                row["body"] = body if body else None
                row["body_length"] = len(body)
                row["ok"] = True
                ok += 1
            except Exception as e:
                # One broken message must not stop the mailbox: record the
                # failure in the row (ok stays False) and carry on.
                row["error"] = f"{type(e).__name__}: {e}"[:500]
                status = "ERR"
                errors += 1

            queue.append(row)
            processed += 1

            # Progress line for the first processed message and every 200th.
            if processed % 200 == 0 or processed == 1:
                subj = _short(row["subject"], 50)
                print(f" [{n:>6}|p={processed:>5}] {status} {row['body_source']:<7} "
                      f"{row['body_length']:>7}ch | {subj}", flush=True)

            if len(queue) >= BATCH_SIZE:
                _flush(pg, queue)
                queue.clear()
    finally:
        # The cursor was opened with no_cursor_timeout, so close it explicitly.
        cursor.close()

    if queue:
        _flush(pg, queue)

    return {"mailbox": mailbox, "processed": processed, "ok": ok,
            "errors": errors, "skipped": skipped, "empty_body": empty_body}
|
||||
|
||||
|
||||
# Columns written by the upsert, in INSERT order. The column list, the named
# placeholders and the ON CONFLICT update list are all generated from this one
# tuple so they cannot drift apart when a column is added or removed.
_UPSERT_COLUMNS = (
    "mailbox", "message_id", "graph_id", "conversation_id", "folder_path",
    "subject", "sender_email", "sender_name", "to_addrs", "cc_addrs", "bcc_addrs",
    "sent_at", "received_at", "modified_at", "is_read", "is_draft",
    "has_attachments", "attachment_count", "attachments_summary",
    "body", "body_length", "body_source",
    "extracted_at", "extractor_version", "ok", "error",
)
# Conflict target (the table's UNIQUE constraint); never overwritten on update.
_UPSERT_KEY = ("mailbox", "message_id")

# INSERT ... ON CONFLICT DO UPDATE using psycopg named placeholders
# (%(column)s), to be executed with a dict per row.
UPSERT_SQL = (
    "\nINSERT INTO emails\n    ("
    + ", ".join(_UPSERT_COLUMNS)
    + ")\nVALUES\n    ("
    + ", ".join(f"%({col})s" for col in _UPSERT_COLUMNS)
    + ")\nON CONFLICT ("
    + ", ".join(_UPSERT_KEY)
    + ") DO UPDATE SET\n    "
    + ",\n    ".join(
        f"{col} = EXCLUDED.{col}"
        for col in _UPSERT_COLUMNS
        if col not in _UPSERT_KEY
    )
    + "\n"
)
|
||||
|
||||
|
||||
def _flush(pg: psycopg.Connection, rows: list[dict]) -> None:
|
||||
for r in rows:
|
||||
for k in ("subject", "sender_email", "sender_name", "to_addrs", "cc_addrs",
|
||||
"bcc_addrs", "attachments_summary", "body", "error", "folder_path"):
|
||||
if r.get(k):
|
||||
r[k] = _clean_for_pg(r[k])
|
||||
with pg.cursor() as cur:
|
||||
cur.executemany(UPSERT_SQL, rows)
|
||||
pg.commit()
|
||||
|
||||
|
||||
def discover_mailboxes(db) -> list[str]:
    """Return the sorted names of the collections in `db` that are mailboxes.

    Helper collections listed in NON_MAILBOX_COLLECTIONS are excluded, and so
    are MongoDB's own `system.*` collections (e.g. system.views), which do not
    hold message documents either.
    """
    return [
        name
        for name in sorted(db.list_collection_names())
        if name not in NON_MAILBOX_COLLECTIONS and not name.startswith("system.")
    ]
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: enrich one mailbox or all discovered ones.

    Ensures the PG schema exists, processes each mailbox with
    `process_mailbox`, prints a summary and returns the process exit code:
    0 when no errors were counted, 1 otherwise.
    """
    ap = argparse.ArgumentParser(description="enrich_fulltext_emails v1.4")
    ap.add_argument("--mailbox", default="",
                    help="Jedna konkretni schranka. Bez argumentu projede vsechny.")
    ap.add_argument("--limit", type=int,
                    help="Limit emailu na schranku (test)")
    ap.add_argument("--index-reset", action="store_true",
                    help="Pred zpracovanim schranky vymaze vsechny jeji emaily z PG "
                         "(force re-extract). Bez --mailbox SMAZE CELY index.")
    args = ap.parse_args()

    t0 = time.time()
    print("=== enrich_fulltext_emails v1.4 ===")
    print(f"Start: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    print("\nPripojuji se k PostgreSQL...")
    pg = psycopg.connect(PG_DSN, connect_timeout=10)
    mongo = None
    results = []
    try:
        with pg.cursor() as cur:
            cur.execute(SCHEMA_SQL)
        pg.commit()
        print(" Schema OK.")

        print("Pripojuji se k MongoDB...")
        mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
        mongo.admin.command("ping")
        db = mongo[MONGO_DB]
        print(" MongoDB OK.")

        if args.mailbox:
            mailboxes = [args.mailbox]
        else:
            mailboxes = discover_mailboxes(db)
        print(f"\nSchranky ke zpracovani ({len(mailboxes)}):")
        for mb in mailboxes:
            print(f" - {mb}")

        if args.index_reset and not args.mailbox:
            print(f"\n!!! --index-reset bez --mailbox => SMAZE CELY INDEX ({len(mailboxes)} schranek) !!!")

        for mb in mailboxes:
            try:
                results.append(process_mailbox(pg, db[mb], mb,
                                               limit=args.limit,
                                               index_reset=args.index_reset))
            except Exception as e:
                traceback.print_exc()
                print(f" FATAL pri zpracovani {mb}: {e}")
                results.append({"mailbox": mb, "processed": 0, "ok": 0,
                                "errors": 1, "skipped": 0, "empty_body": 0})
                # A failed statement leaves the shared connection in an aborted
                # transaction; roll back so the remaining mailboxes can still
                # be processed instead of all failing on the first query.
                try:
                    pg.rollback()
                except Exception:
                    traceback.print_exc()
    finally:
        # Release both connections even when setup or processing raised.
        if mongo is not None:
            mongo.close()
        pg.close()

    print("\n" + "="*60)
    print("=== SHRNUTI ===")
    grand = {"processed": 0, "ok": 0, "errors": 0, "skipped": 0, "empty_body": 0}
    for r in results:
        print(f" {r['mailbox']:40} processed={r['processed']:>5} ok={r['ok']:>5} "
              f"errors={r['errors']:>3} skipped={r['skipped']:>6} empty={r['empty_body']:>4}")
        for k in grand:
            grand[k] += r.get(k, 0)
    print(f" {'TOTAL':40} processed={grand['processed']:>5} ok={grand['ok']:>5} "
          f"errors={grand['errors']:>3} skipped={grand['skipped']:>6} empty={grand['empty_body']:>4}")
    print(f"\nCelkem trvalo: {time.time() - t0:.1f} s")
    print(f"Konec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    # Exit code: 0 only when every mailbox went through without an error.
    return 1 if grand["errors"] > 0 else 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
        # Do not report success for an interrupted run: 130 = 128 + SIGINT,
        # the conventional exit status for Ctrl+C.
        sys.exit(130)
    except Exception:
        traceback.print_exc()
        sys.exit(1)
|
||||
Reference in New Issue
Block a user