This commit is contained in:
2026-06-04 07:15:17 +02:00
parent 6c57ab3ae6
commit 9b12745e1d
6 changed files with 147 additions and 1119 deletions
-483
View File
@@ -1,483 +0,0 @@
"""
download_attachments_v1.3.py
Nazev: download_attachments_v1.3.py
Verze: 1.3
Datum: 2026-06-02
Autor: vladimir.buzalka
Popis:
Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
pres Microsoft Graph API a uklada je do adresare
/mnt/Emails/<schránka>/Attachments/.
Schránka se predava jako povinny parametr --mailbox.
Deduplikace podle SHA256 hashe obsahu:
- stejny hash = soubor uz existuje -> preskoci
- prvni vyskytu souboru: ulozi pod puvodnimnazvem
- kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
Po ulozeni aktualizuje MongoDB:
- v email dokumentu: kazda priloha dostane file_hash + local_path
- kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes,
mime_type, mailbox, first_seen_at, ref_count
Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash
se preskoci. --force-recheck znovu overi i uz stazene.
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
Spousteni:
python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz
python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50
python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --force-recheck
Docker:
docker exec -it python-runner python /scripts/download_attachments_v1.3.py \\
--mailbox ordinace@buzalkova.cz
Zavislosti:
msal, requests, pymongo
Python 3.10+
Historie verzi:
1.0 2026-06-02 Inicialni verze
1.1 2026-06-02 Schránka jako parametr --mailbox
1.2 2026-06-02 Oprava: Graph attachment mapa vcetne inline; normalizace nazvu;
preskoceni S/MIME; inline z Graphu -> SKIP ne ERR
1.3 2026-06-02 Primarni stazeni pres graph_att_id (prime ID bez name-matchingu);
oprava $select na attachment listu (odstranen contentId ktery
zpusoboval BadRequest a vracel prazdny seznam); name-matching
zustava jako fallback pro stare emaily bez graph_att_id
"""
import sys
import re
import hashlib
import logging
import argparse
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional
import msal
import requests
from pymongo import MongoClient, UpdateOne
if hasattr(sys.stdout, "reconfigure"):
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
GRAPH_URL = "https://graph.microsoft.com/v1.0"
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
MONGO_COL_INDEX = "attachments_index"
EMAILS_BASE_DIR = Path("/mnt/Emails")
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.3"
BATCH_SIZE = 50
# Typy příloh které přeskočíme (S/MIME podpisy, certifikáty)
SKIP_EXTENSIONS = {".p7m", ".p7s", ".p7c", ".p7b"}
# ──────────────────────────────────────────────────────────────────────────────
logging.basicConfig(
filename=str(LOG_FILE),
level=logging.ERROR,
format="%(asctime)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
encoding="utf-8",
)
_graph_token: Optional[str] = None
# ─── Graph API ────────────────────────────────────────────────────────────────
def get_token() -> str:
global _graph_token
app = msal.ConfidentialClientApplication(
GRAPH_CLIENT_ID,
authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
client_credential=GRAPH_CLIENT_SECRET,
)
result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
if "access_token" not in result:
raise RuntimeError(f"Graph auth failed: {result}")
_graph_token = result["access_token"]
return _graph_token
def graph_get_bytes(url: str) -> bytes:
global _graph_token
if not _graph_token:
get_token()
for attempt in range(2):
r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, timeout=120, stream=True)
if r.status_code == 401:
get_token()
continue
r.raise_for_status()
return r.content
raise RuntimeError(f"Graph GET bytes failed: {url}")
def graph_get_json(url: str, params: dict = None) -> dict:
global _graph_token
if not _graph_token:
get_token()
for attempt in range(2):
r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30)
if r.status_code == 401:
get_token()
continue
r.raise_for_status()
return r.json()
raise RuntimeError(f"Graph GET json failed: {url}")
def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
"""Nacte metadata vsech priloh zpravy (bez contentBytes)."""
url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
try:
# Pozor: contentId NENI v base attachment type — nesmi byt v $select
data = graph_get_json(url, {"$select": "id,name,contentType,size,isInline"})
return data.get("value", [])
except Exception as e:
logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e)
return []
def fetch_attachment_content(mailbox: str, graph_message_id: str, attachment_id: str) -> Optional[bytes]:
url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments/{attachment_id}/$value"
try:
return graph_get_bytes(url)
except Exception as e:
logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s",
graph_message_id, attachment_id, e)
return None
# ─── Pomocné funkce ───────────────────────────────────────────────────────────
def normalize_name(name: str) -> str:
"""Normalizuje název pro porovnání — lowercase, bez diakritiky, jen alnum+._-"""
nfkd = unicodedata.normalize("NFKD", name.lower().strip())
ascii_str = "".join(c for c in nfkd if not unicodedata.combining(c))
return re.sub(r"[^\w.\-]", "_", ascii_str)
def find_graph_att(att_name: str, att_size: int, graph_atts: list[dict]) -> Optional[dict]:
"""Fallback: hleda prilohu v Graph listu podle jmena (pro emaily bez graph_att_id)."""
# 1. Presna shoda
for ga in graph_atts:
if ga["name"] == att_name:
return ga
norm_want = normalize_name(att_name)
# 2. Normalizovana shoda
for ga in graph_atts:
if normalize_name(ga["name"]) == norm_want:
return ga
# 3. Normalizovana shoda + velikost (±10 %)
for ga in graph_atts:
if normalize_name(ga["name"]) == norm_want:
ga_size = ga.get("size", 0)
if att_size == 0 or ga_size == 0 or abs(ga_size - att_size) / max(ga_size, att_size) < 0.1:
return ga
# 4. Castecna shoda sufixu (posledních 20 znaků normalizovaného jména)
for ga in graph_atts:
if norm_want[-20:] and normalize_name(ga["name"]).endswith(norm_want[-20:]):
return ga
return None
def sha256(data: bytes) -> str:
return hashlib.sha256(data).hexdigest()
def safe_filename(name: str) -> str:
safe = "".join(c if c.isalnum() or c in "._- ()" else "_" for c in name).strip()
return safe or "attachment"
def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
existing = col_index.find_one({"filename": desired_name})
if existing:
if existing["_id"] == hash_val:
return desired_name
stem = Path(desired_name).stem
suffix = Path(desired_name).suffix
n = 2
while True:
candidate = f"{stem}_{n}{suffix}"
ex2 = col_index.find_one({"filename": candidate})
if not ex2 or ex2["_id"] == hash_val:
if not (att_dir / candidate).exists() or (ex2 and ex2["_id"] == hash_val):
return candidate
n += 1
return desired_name
def save_attachment(
content: bytes,
original_name: str,
mime_type: str,
mailbox: str,
att_dir: Path,
col_index,
) -> tuple[str, str, bool]:
hash_val = sha256(content)
existing = col_index.find_one({"_id": hash_val})
if existing:
col_index.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}})
return hash_val, existing["local_path"], False
filename = resolve_filename(safe_filename(original_name), att_dir, hash_val, col_index)
file_path = att_dir / filename
file_path.write_bytes(content)
col_index.insert_one({
"_id": hash_val,
"filename": filename,
"local_path": filename,
"size_bytes": len(content),
"mime_type": mime_type,
"mailbox": mailbox,
"first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
"ref_count": 1,
})
return hash_val, filename, True
# ─── MAIN ─────────────────────────────────────────────────────────────────────
def main():
ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
ap.add_argument("--mailbox", required=True,
help="Emailova schranka (napr. ordinace@buzalkova.cz)")
ap.add_argument("--limit", type=int, default=0,
help="Zpracovat max N emailu (0 = vse)")
ap.add_argument("--force-recheck", action="store_true",
help="Znovu overi i emaily kde prilohy uz maji file_hash")
ap.add_argument("--no-indexes", action="store_true",
help="Nevytvorit indexy na attachments_index kolekci")
args = ap.parse_args()
mailbox = args.mailbox
att_dir = EMAILS_BASE_DIR / mailbox / "Attachments"
mongo_col = mailbox
start = datetime.now()
print(f"=== download_attachments v{SCRIPT_VERSION} ===")
print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Schránka: {mailbox}")
print(f"Cilovy adresar: {att_dir}")
print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}")
att_dir.mkdir(parents=True, exist_ok=True)
print(" Adresar OK")
print("\nPřipojuji se k Graph API...")
try:
get_token()
print(" Graph API OK")
except Exception as e:
print(f" CHYBA: {e}")
sys.exit(1)
client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
try:
client.admin.command("ping")
print(" MongoDB OK")
except Exception as e:
print(f" CHYBA: MongoDB neni dostupna -- {e}")
sys.exit(1)
col_emails = client[MONGO_DB][mongo_col]
col_index = client[MONGO_DB][MONGO_COL_INDEX]
if not args.no_indexes:
col_index.create_index("filename")
col_index.create_index("mime_type")
col_index.create_index("mailbox")
if args.force_recheck:
query = {"has_attachments": True}
else:
query = {
"has_attachments": True,
"attachments": {
"$elemMatch": {
"is_inline": False,
"file_hash": {"$exists": False},
}
}
}
total = col_emails.count_documents(query)
print(f"\nEmailu ke zpracovani: {total}")
if total == 0:
print("Neni co stahnout.")
client.close()
return
cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
if args.limit:
cursor = cursor.limit(args.limit)
ok_count = 0
new_count = 0
dup_count = 0
skip_count = 0
err_count = 0
email_i = 0
batch = []
def flush():
if not batch:
return
try:
col_emails.bulk_write(batch, ordered=False)
except Exception as e:
logging.error("bulk_write: %s", e)
print(f" CHYBA bulk_write: {e}")
batch.clear()
for email_doc in cursor:
email_i += 1
email_id = email_doc["_id"]
graph_id = email_doc.get("graph_id", "")
subject = (email_doc.get("subject") or "")[:60]
att_list = email_doc.get("attachments") or []
real_atts = [a for a in att_list if not a.get("is_inline", False)]
if not real_atts:
continue
print(f"\n {email_i:>5}/{total} {subject}")
# Nacti attachment list z Graphu jen pokud nektere prilohy nemaji graph_att_id
need_listing = any(
not a.get("is_inline", False)
and not (not args.force_recheck and a.get("file_hash"))
and not a.get("graph_att_id")
for a in att_list
)
graph_atts = fetch_message_attachments(mailbox, graph_id) if need_listing else []
updated_atts = list(att_list)
email_ok = True
for i, att in enumerate(updated_atts):
if att.get("is_inline", False):
continue
if not args.force_recheck and att.get("file_hash"):
continue
att_name = att.get("filename", "")
att_size = att.get("size_bytes", 0)
graph_att_id = att.get("graph_att_id")
# Preskoc S/MIME podpisy
if Path(att_name).suffix.lower() in SKIP_EXTENSIONS:
updated_atts[i] = {**att, "file_hash": "skip", "local_path": ""}
skip_count += 1
print(f" SKIP {att_name} (S/MIME)")
continue
# Primy pristup pres graph_att_id (emaily parsovane v1.2+)
if graph_att_id:
content = fetch_attachment_content(mailbox, graph_id, graph_att_id)
if content is None:
err_count += 1
email_ok = False
print(f" ERR {att_name} (stazeni selhalo)")
continue
# Zkontroluj zda jde skutecne o inline (pro edge case)
mime_type = att.get("mime_type", "")
else:
# Fallback: name matching pro stare emaily (parsovane pred v1.2)
graph_att = find_graph_att(att_name, att_size, graph_atts)
if not graph_att:
logging.error("attachment not found [email=%s att=%s]", email_id, att_name)
print(f" ERR {att_name} (nenalezeno)")
err_count += 1
email_ok = False
continue
# Pokud Graph rika ze je inline — preskoc
if graph_att.get("isInline", False):
updated_atts[i] = {**att, "is_inline": True, "file_hash": "skip", "local_path": ""}
skip_count += 1
print(f" SKIP {att_name} (inline obrazek)")
continue
content = fetch_attachment_content(mailbox, graph_id, graph_att["id"])
if content is None:
err_count += 1
email_ok = False
print(f" ERR {att_name} (stazeni selhalo)")
continue
mime_type = att.get("mime_type") or graph_att.get("contentType", "")
hash_val, local_path, was_new = save_attachment(
content, att_name, mime_type, mailbox, att_dir, col_index
)
updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}
if was_new:
new_count += 1
print(f" NEW {local_path} ({len(content):,} B)")
else:
dup_count += 1
print(f" DUP {att_name} -> {local_path}")
if email_ok:
ok_count += 1
batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))
if len(batch) >= BATCH_SIZE:
flush()
if email_i % 100 == 0:
elapsed = (datetime.now() - start).total_seconds()
print(f" {''*60}")
print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} skip={skip_count} err={err_count}")
print(f" {''*60}")
flush()
elapsed_total = (datetime.now() - start).total_seconds()
files_total = col_index.count_documents({})
size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))
print(f"\n{'='*52}")
print(f"Vysledek: emaily={ok_count} | nove={new_count} | dup={dup_count} | skip={skip_count} | err={err_count}")
print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)")
print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
if err_count:
print(f"Chyby logovany do: {LOG_FILE}")
client.close()
if __name__ == "__main__":
main()
-611
View File
@@ -1,611 +0,0 @@
"""
parse_emails_graph_v1.3.py
Nazev: parse_emails_graph_v1.3.py
Verze: 1.3
Datum: 2026-06-02
Autor: vladimir.buzalka
Popis:
Cte vsechny emaily z libovolne schranky primo pres Microsoft Graph API
a importuje je jako dokumenty do MongoDB.
Ze kazde zpravy extrahuje vsechny dostupne vlastnosti:
- predmet, odesilatel, prijemci (To/CC/BCC s typy)
- cas doruceni, odeslani, vytvoreni, modifikace (UTC)
- telo HTML (max 2 MB) + textovy preview
- prilohy (metadata: jmeno, velikost, MIME typ, inline flag, graph_att_id)
- internet headers (SPF, DKIM, Received, X-*, ...)
- MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno,
kategorie, In-Reply-To, References, ...
- navic: isRead, isDraft, folder_path, inferenceClassification
Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted,
archivni slozky, ...).
DB: emaily
Kolekce: <mailbox> (napr. ordinace@buzalkova.cz)
_id: Internet Message-ID (nebo "graphid:<id>" jako fallback)
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
Spousteni:
# Prvni import (vsechno):
python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz
# Test na prvnich 50:
python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
# Jen jedna slozka:
python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --folder Inbox
# Pokracovani po preruseni (pouze nove):
python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --mode new-only
# Pravidelny sync (aktualizuje is_read, flag, slozku; importuje nove):
python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --mode sync
# Jina schranka:
python parse_emails_graph_v1.3.py --mailbox vladimir.buzalka@buzalka.cz
Rezimy (--mode):
full Plny upsert vsech poli pro kazdou zpravu (vychozi)
new-only Preskoci zpravy ktere uz jsou v MongoDB, importuje jen nove
sync Existujici: aktualizuje jen is_read/flag_status/categories/
modified_at/folder_path. Nove zpravy importuje cely.
Idealni pro pravidelne spousteni.
Zavislosti:
msal, requests, pymongo, python-dateutil
Python 3.10+
Struktura dokumentu v MongoDB:
_id Internet Message-ID (nebo graphid: fallback)
graph_id Graph API message ID
subject predmet zpravy
normalized_subject predmet bez RE:/FW:/AW: prefixu
importance 0=nizka 1=normalni 2=vysoka
flag_status 0=bez priznaku 1=oznaceno 2=dokonceno
is_read bool — aktualni stav precteni ve schrance
is_draft bool
has_attachments bool
attachment_count int
inference_classification focused / other
categories [str]
conversation_id Graph conversationId
conversation_index base64 conversationIndex
conversation_topic tema vlakna (z internet headers Thread-Topic)
in_reply_to Message-ID predchozi zpravy
internet_references [Message-ID]
received_at datetime UTC
sent_at datetime UTC
created_at datetime UTC
modified_at datetime UTC
folder_id Graph parentFolderId
folder_path cela cesta slozky (napr. Inbox/Subfolder)
sender.email emailova adresa odesilatele
sender.name zobrazovane jmeno
to retezec To (joined)
cc retezec CC
bcc retezec BCC
recipients [{type, email, name}]
body_html HTML telo (max 2 MB)
body_preview textovy nahled (max 255 znaku)
attachments [{filename, size_bytes, mime_type, is_inline, graph_att_id}]
headers dict internet headers
parsed_at datetime UTC
Indexy:
received_at, sent_at, sender.email, graph_id (unique),
conversation_id, folder_path, has_attachments, categories,
importance, flag_status, is_read,
text_search (subject + body_preview + to + cc)
Historie verzi:
1.0 2026-06-02 Inicialni verze
1.1 2026-06-02 Pridany rezimy --mode full/new-only/sync;
odstranen --skip-existing (nahrazen --mode new-only)
1.2 2026-06-02 $expand attachments s $select (bez contentBytes — rychlejsi);
prilohy ukladaji graph_att_id pro prime stazeni bez name-matchingu
1.3 2026-06-02 --mailbox jako povinny parametr — univerzalni pouziti pro
libovolnou schranku; kolekce v MongoDB = nazev schranky
"""
import sys
import re
import logging
import argparse
import base64
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional
import msal
import requests
from dateutil import parser as dtparser
from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT
if hasattr(sys.stdout, "reconfigure"):
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
GRAPH_URL = "https://graph.microsoft.com/v1.0"
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
BATCH_SIZE = 100
PAGE_SIZE = 50
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.3"
# Schránka se nastavuje za behu z --mailbox parametru
GRAPH_MAILBOX: str = ""
# ──────────────────────────────────────────────────────────────────────────────
logging.basicConfig(
filename=str(LOG_FILE),
level=logging.ERROR,
format="%(asctime)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
encoding="utf-8",
)
IMPORTANCE_MAP = {"low": 0, "normal": 1, "high": 2}
FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2}
RE_SUBJECT = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE)
# $expand prilohy bez contentBytes — jen metadata co potrebujeme
ATT_EXPAND = "attachments($select=id,name,contentType,size,isInline)"
MSG_SELECT = (
"id,internetMessageId,subject,bodyPreview,body,"
"importance,isRead,isDraft,hasAttachments,"
"receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
"sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
"conversationId,conversationIndex,parentFolderId,"
"categories,flag,inferenceClassification,internetMessageHeaders"
)
MSG_SELECT_SYNC = (
"id,internetMessageId,isRead,isDraft,flag,categories,"
"lastModifiedDateTime,parentFolderId,importance"
)
# ─── Graph API helpers ────────────────────────────────────────────────────────
_graph_token: Optional[str] = None
def get_token() -> str:
global _graph_token
app = msal.ConfidentialClientApplication(
GRAPH_CLIENT_ID,
authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
client_credential=GRAPH_CLIENT_SECRET,
)
result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
if "access_token" not in result:
raise RuntimeError(f"Graph auth failed: {result}")
_graph_token = result["access_token"]
return _graph_token
def graph_get(url: str, params: dict = None) -> dict:
global _graph_token
if not _graph_token:
get_token()
for attempt in range(2):
r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30)
if r.status_code == 401:
get_token()
continue
r.raise_for_status()
return r.json()
raise RuntimeError(f"Graph GET failed after retry: {url}")
def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]:
"""Rekurzivne nacte vsechny slozky schranky. Vraci [{id, path}]."""
if parent_id is None:
url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders"
else:
url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{parent_id}/childFolders"
folders = []
params = {"$top": 100, "$select": "id,displayName,childFolderCount"}
while url:
data = graph_get(url, params)
for f in data.get("value", []):
path = f"{parent_path}/{f['displayName']}".lstrip("/")
folders.append({"id": f["id"], "path": path})
if f.get("childFolderCount", 0) > 0:
folders.extend(get_all_folders(f["id"], path))
url = data.get("@odata.nextLink")
params = None
return folders
def iter_folder_messages(folder_id: str, select: str = MSG_SELECT, expand_attachments: bool = True):
"""Generator: vraci zpravy ze slozky po strankach."""
url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages"
params = {"$top": PAGE_SIZE, "$select": select}
if expand_attachments:
params["$expand"] = ATT_EXPAND
while url:
data = graph_get(url, params)
for msg in data.get("value", []):
yield msg
url = data.get("@odata.nextLink")
params = None
# ─── Pomocné funkce ───────────────────────────────────────────────────────────
def parse_date(raw) -> Optional[datetime]:
if raw is None:
return None
if isinstance(raw, datetime):
if raw.tzinfo:
return raw.astimezone(timezone.utc).replace(tzinfo=None)
return raw
try:
dt = dtparser.parse(str(raw))
if dt.tzinfo:
return dt.astimezone(timezone.utc).replace(tzinfo=None)
return dt
except Exception:
return None
def normalize_subject(subject: str) -> str:
s = subject.strip()
while True:
m = RE_SUBJECT.match(s)
if not m:
break
s = s[m.end():].strip()
return s
def parse_headers(raw_headers: list) -> dict:
result = {}
for h in raw_headers:
k = h["name"].lower().replace("-", "_")
v = h["value"]
if k in result:
existing = result[k]
result[k] = existing + [v] if isinstance(existing, list) else [existing, v]
else:
result[k] = v
return result
def format_recipients(lst: list) -> str:
return "; ".join(
f'{r["emailAddress"].get("name", "")} <{r["emailAddress"].get("address", "")}>'.strip()
for r in lst
)
# ─── Extrakce zprávy ─────────────────────────────────────────────────────────
def extract_message(msg: dict, folder_path: str) -> Optional[dict]:
"""Plna extrakce — pouziva se pro mode full a nove zpravy v sync/new-only."""
try:
mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
subject = msg.get("subject") or ""
body_html = None
body_preview = msg.get("bodyPreview") or ""
body = msg.get("body", {})
if body.get("contentType") == "html":
content = body.get("content") or ""
body_html = content if len(content) <= 2 * 1024 * 1024 else content[:2 * 1024 * 1024]
elif body.get("contentType") == "text":
body_preview = (body.get("content") or "")[:2000]
sender_ea = (msg.get("from") or msg.get("sender") or {}).get("emailAddress", {})
to_list = msg.get("toRecipients", [])
cc_list = msg.get("ccRecipients", [])
bcc_list = msg.get("bccRecipients", [])
recipients = (
[{"type": "to", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in to_list] +
[{"type": "cc", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in cc_list] +
[{"type": "bcc", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in bcc_list]
)
importance = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1)
flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0)
raw_headers = msg.get("internetMessageHeaders") or []
headers = parse_headers(raw_headers)
in_reply_to = headers.get("in_reply_to", "")
if isinstance(in_reply_to, list):
in_reply_to = in_reply_to[0]
refs_raw = headers.get("references", "")
if isinstance(refs_raw, list):
refs_raw = " ".join(refs_raw)
internet_refs = [r.strip() for r in refs_raw.split() if r.strip()] if refs_raw else []
conv_topic = headers.get("thread_topic", "")
if isinstance(conv_topic, list):
conv_topic = conv_topic[0]
conv_index = ""
ci_raw = msg.get("conversationIndex")
if ci_raw:
try:
conv_index = base64.b64encode(base64.b64decode(ci_raw)).decode()
except Exception:
conv_index = ci_raw
attachments = []
for att in msg.get("attachments") or []:
fname = att.get("name") or ""
if not fname:
continue
attachments.append({
"filename": fname,
"size_bytes": att.get("size", 0),
"mime_type": att.get("contentType", "application/octet-stream"),
"is_inline": att.get("isInline", False),
"graph_att_id": att.get("id"),
})
return {
"_id": mid,
"graph_id": msg["id"],
"subject": subject,
"normalized_subject": normalize_subject(subject),
"importance": importance,
"flag_status": flag_status,
"is_read": msg.get("isRead", False),
"is_draft": msg.get("isDraft", False),
"has_attachments": msg.get("hasAttachments", False),
"attachment_count": len(attachments),
"inference_classification": msg.get("inferenceClassification", ""),
"categories": msg.get("categories") or [],
"conversation_id": msg.get("conversationId", ""),
"conversation_index": conv_index,
"conversation_topic": conv_topic,
"in_reply_to": in_reply_to,
"internet_references": internet_refs,
"received_at": parse_date(msg.get("receivedDateTime")),
"sent_at": parse_date(msg.get("sentDateTime")),
"created_at": parse_date(msg.get("createdDateTime")),
"modified_at": parse_date(msg.get("lastModifiedDateTime")),
"folder_id": msg.get("parentFolderId", ""),
"folder_path": folder_path,
"sender": {
"email": sender_ea.get("address", ""),
"name": sender_ea.get("name", ""),
},
"to": format_recipients(to_list),
"cc": format_recipients(cc_list),
"bcc": format_recipients(bcc_list),
"recipients": recipients,
"body_html": body_html,
"body_preview": body_preview,
"attachments": attachments,
"headers": headers,
"parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
}
except Exception as e:
logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e)
return None
def extract_sync_fields(msg: dict, folder_path: str) -> dict:
"""Jen menitelna pole — pouziva se v sync mode pro existujici zpravy."""
return {
"is_read": msg.get("isRead", False),
"is_draft": msg.get("isDraft", False),
"flag_status": FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0),
"importance": IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1),
"categories": msg.get("categories") or [],
"modified_at": parse_date(msg.get("lastModifiedDateTime")),
"folder_id": msg.get("parentFolderId", ""),
"folder_path": folder_path,
"parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
}
# ─── MongoDB indexy ───────────────────────────────────────────────────────────
def create_indexes(col):
print(" Vytvarim indexy...")
col.create_index([("received_at", ASCENDING)])
col.create_index([("sent_at", ASCENDING)])
col.create_index([("sender.email", ASCENDING)])
col.create_index([("graph_id", ASCENDING)], unique=True, sparse=True)
col.create_index([("conversation_id", ASCENDING)])
col.create_index([("folder_path", ASCENDING)])
col.create_index([("has_attachments", ASCENDING)])
col.create_index([("categories", ASCENDING)])
col.create_index([("importance", ASCENDING)])
col.create_index([("flag_status", ASCENDING)])
col.create_index([("is_read", ASCENDING)])
col.create_index([
("subject", TEXT),
("body_preview", TEXT),
("to", TEXT),
("cc", TEXT),
], name="text_search", default_language="none")
print(" Indexy hotovy.")
# ─── MAIN ─────────────────────────────────────────────────────────────────────
def main():
global GRAPH_MAILBOX
ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}")
ap.add_argument("--mailbox", required=True,
help="Emailova schranka (napr. ordinace@buzalkova.cz)")
ap.add_argument("--mode", default="full", choices=["full", "new-only", "sync"],
help="full=plny upsert (vychozi) | new-only=jen nove zpravy | "
"sync=existujici aktualizuje jen menitelna pole, nove importuje cely")
ap.add_argument("--limit", type=int, default=0,
help="Zpracovat max N zprav (0 = vse)")
ap.add_argument("--folder", default="",
help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)")
ap.add_argument("--no-indexes", action="store_true",
help="Nevytvorit indexy na konci")
args = ap.parse_args()
GRAPH_MAILBOX = args.mailbox
mongo_col = args.mailbox
start = datetime.now()
print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===")
print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Schránka: {GRAPH_MAILBOX}")
print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}")
print(f"Režim: {args.mode}")
print("\nPřipojuji se k Graph API...")
try:
get_token()
print(" Graph API OK")
except Exception as e:
print(f" CHYBA: {e}")
sys.exit(1)
client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
try:
client.admin.command("ping")
print(" MongoDB OK")
except Exception as e:
print(f" CHYBA: MongoDB neni dostupna -- {e}")
sys.exit(1)
col = client[MONGO_DB][mongo_col]
existing: set = set()
if args.mode in ("new-only", "sync"):
print(" Nacitam existujici zaznamy z MongoDB...")
existing = set(col.distinct("_id"))
print(f" {len(existing)} jiz importovano")
print("\nNacitam seznam slozek...")
all_folders = get_all_folders()
if args.folder:
all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()]
print(f" Slozek ke zpracovani: {len(all_folders)}")
for f in all_folders:
print(f" {f['path']}")
is_sync = args.mode == "sync"
msg_select = MSG_SELECT_SYNC if is_sync else MSG_SELECT
expand_att = not is_sync
batch = []
ok_count = 0
sync_count = 0
err_count = 0
skip_count = 0
total_i = 0
def flush():
if not batch:
return
try:
col.bulk_write(batch, ordered=False)
except Exception as e:
logging.error("bulk_write: %s", e)
print(f" CHYBA bulk_write: {e}")
batch.clear()
print()
for folder in all_folders:
print(f"--- Složka: {folder['path']} ---")
folder_count = 0
for msg in iter_folder_messages(folder["id"], select=msg_select, expand_attachments=expand_att):
if args.limit and total_i >= args.limit:
break
mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
total_i += 1
folder_count += 1
if args.mode == "new-only" and mid in existing:
skip_count += 1
continue
if is_sync and mid in existing:
fields = extract_sync_fields(msg, folder["path"])
batch.append(UpdateOne({"_id": mid}, {"$set": fields}))
sync_count += 1
print(f" {total_i:>6} SYN {mid[:80]}")
else:
if is_sync:
full_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{msg['id']}"
full_params = {"$select": MSG_SELECT, "$expand": ATT_EXPAND}
try:
msg = graph_get(full_url, full_params)
except Exception as e:
logging.error("full fetch failed [%s]: %s", msg.get("id","?"), e)
err_count += 1
continue
doc = extract_message(msg, folder["path"])
if doc is None:
err_count += 1
print(f" {total_i:>6} ERR {mid[:80]}")
else:
batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
ok_count += 1
subject_str = (doc.get("subject") or "")[:60]
sender_str = (doc.get("sender", {}).get("email") or "")[:40]
print(f" {total_i:>6} OK {subject_str:<60} {sender_str}")
if len(batch) >= BATCH_SIZE:
flush()
if total_i % 500 == 0:
elapsed = (datetime.now() - start).total_seconds()
rate = total_i / elapsed if elapsed > 0 else 0
print(f" {''*80}")
print(f" Průběh: ok={ok_count} sync={sync_count} skip={skip_count} err={err_count} {rate:.1f} msg/s")
print(f" {''*80}")
flush()
print(f"{folder_count} zprav ze slozky {folder['path']}")
if args.limit and total_i >= args.limit:
break
elapsed_total = (datetime.now() - start).total_seconds()
print(f"\n{'='*52}")
print(f"Vysledek: ok={ok_count} | sync={sync_count} | skip={skip_count} | err={err_count}")
print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
print(f"Dokumentu v kolekci: {col.count_documents({})}")
if not args.no_indexes:
print()
create_indexes(col)
print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
if err_count:
print(f"Chyby logovany do: {LOG_FILE}")
client.close()
if __name__ == "__main__":
main()
+133 -25
View File
@@ -48,20 +48,32 @@ c.close()
---
## Aktuální skripty v /scripts
## Pipeline — 5 skriptů v pořadí spouštění
| Soubor | Popis |
|---------------------------------|--------------------------------------------------------------|
| `parse_emails_graph_v1.3.py` | Import emailů ze schránky přes Graph API → MongoDB |
| `download_attachments_v1.3.py` | Stažení skutečných příloh emailů (Graph API) → `/mnt/Emails` |
| `python_runner.md` | Tato dokumentace |
| `parse_emails_errors.log` | Log chyb (soubory/zprávy které selhaly) |
Prefix `1_``5_` indikuje pořadí v pipeline. Bezpečné opakovat každý krok (idempotentní upserty).
> **POZOR:** oba skripty pouze **čtou** ze schránky — žádný zápis do schránky.
| # | Skript | Účel | Zdroj → Cíl |
|---|---|---|---|
| 1 | `1_parse_emails_graph_v1.4.py` | Import emailů z Graph API | Graph → Mongo `emaily.<mailbox>` |
| 2 | `2_refetch_text_bodies_v1.0.py` | ONETIME oprava starých plain-text emailů (v1.3 ukládal jen 2000 znaků do `body_preview`) | Graph → Mongo `body_text` |
| 3 | `3_download_attachments_v1.3.py` | Stažení binárek příloh + SHA256 dedup | Graph → `/mnt/Emails/<mailbox>/Attachments/` + Mongo `attachments_index` |
| 4 | `4_unwrap_smime_v1.0.py` | Rozbalení S/MIME wrapper (`smime.p7m`) na vnitřní MIME tělo | Graph → Mongo `smime_body_text/html`, `smime_inner_attachments` |
| 5 | `5_enrich_fulltext_emails_v1.2.py`| Plný text emailů do PG fulltext indexu | Mongo → PG `MongoEmaily.emails` |
Doplňkové soubory v `/scripts/`:
| Soubor | Popis |
|---|---|
| `python_runner.md` | Tato dokumentace |
| `*.log` | Výstupy běhů (`parse_emails.log`, `download_attachments.log`, `unwrap_smime.log`, `refetch.log`) |
| `*_errors.log` | Chyby konkrétních zpráv/příloh |
| `Trash/` | Staré verze skriptů |
> **POZOR:** všechny skripty pouze **čtou** ze schránky — žádný zápis do schránky.
---
## Microsoft Graph API — konfigurace (v obou skriptech)
## Microsoft Graph API — konfigurace (sdílená všemi skripty)
| Parametr | Hodnota |
|-----------------|----------------------------------------|
@@ -77,29 +89,35 @@ c.close()
| Kolekce emailů | `<mailbox>` (např. `ordinace@buzalkova.cz`) |
| Index příloh | `attachments_index` |
| PostgreSQL | Hodnota |
|-----------------|----------------------------------------|
| Host | `192.168.1.76` |
| DB | `MongoEmaily` |
| Tabulka | `emails` (GIN tsvector, config `soubory`) |
---
## 1) parse_emails_graph_v1.3.py — import emailů → MongoDB
## 1) `1_parse_emails_graph_v1.4.py` — import emailů → MongoDB
Čte **všechny složky** schránky rekurzivně (Inbox, Sent, Deleted, archivy …) přes
Graph API a importuje každou zprávu jako dokument do MongoDB. `_id` = Internet
Message-ID (fallback `graphid:<id>`). Upsert → bezpečné přerušit a opakovat.
Z každé zprávy extrahuje: předmět, odesílatel, příjemci To/CC/BCC, časy (UTC),
HTML tělo (max 2 MB) + text preview, přílohy (metadata + `graph_att_id`),
internet headers (SPF/DKIM/Received/X-*), MAPI-ekvivalenty (důležitost, příznak,
konverzační vlákno, kategorie, In-Reply-To, References), `isRead`, `isDraft`,
`folder_path`, `inferenceClassification`.
HTML tělo (max 2 MB) + text preview, **plné plain-text tělo (`body_text`, max 2 MB)**,
přílohy (metadata + `graph_att_id`), internet headers (SPF/DKIM/Received/X-*),
MAPI-ekvivalenty (důležitost, příznak, konverzační vlákno, kategorie,
In-Reply-To, References), `isRead`, `isDraft`, `folder_path`, `inferenceClassification`.
```bash
# První import (vše):
docker exec -it python-runner python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz
docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz
# Test na 50 zprávách bez indexů:
docker exec -it python-runner python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
docker exec -it python-runner python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
# Pravidelný sync na pozadí (log do souboru):
docker exec -d python-runner bash -c "python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --mode sync > /scripts/parse_emails.log 2>&1"
docker exec -d python-runner bash -c "python /scripts/1_parse_emails_graph_v1.4.py --mailbox ordinace@buzalkova.cz --mode sync > /scripts/parse_emails.log 2>&1"
```
> **`-d` = detached:** příkaz se hned vrátí a skript běží dál v kontejneru i po
@@ -119,7 +137,23 @@ docker exec -d python-runner bash -c "python /scripts/parse_emails_graph_v1.3.py
---
## 2) download_attachments_v1.3.py — stažení příloh → /mnt/Emails
## 2) `2_refetch_text_bodies_v1.0.py`dohnání plain-text těl
**ONETIME oprava.** Starý `parse_emails_graph_v1.3` ukládal plain-text emaily
jen jako prvních 2000 znaků do `body_preview` — plné tělo se zahazovalo.
Tenhle skript v Mongo najde emaily kde `body_html` chybí a re-fetchne plné
tělo z Graphu do nového pole `body_text` (max 2 MB).
```bash
docker exec -d python-runner bash -c "python /scripts/2_refetch_text_bodies_v1.0.py --mailbox ordinace@buzalkova.cz > /scripts/refetch.log 2>&1"
```
> Po importu schránky přes v1.4 už tenhle skript prakticky nemá co dělat
> (kandidátů 0). Drží se kvůli archivním schránkám, které byly importovány v1.3.
---
## 3) `3_download_attachments_v1.3.py` — stažení příloh → /mnt/Emails
Stahuje skutečné přílohy (`is_inline=False`) všech emailů z MongoDB přes Graph API
do `/mnt/Emails/<schránka>/Attachments/`. Primárně přes `graph_att_id` (přímé ID),
@@ -135,15 +169,13 @@ mime_type, mailbox, first_seen_at, ref_count). Emaily kde mají všechny přílo
`file_hash` se přeskočí → bezpečné opakovat.
```bash
# Interaktivně (vidíš výstup, skončí zavřením terminálu):
docker exec -it python-runner python /scripts/download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz
# Interaktivně:
docker exec -it python-runner python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz
# Na pozadí (běží dál i po zavření terminálu, log do souboru):
docker exec -d python-runner bash -c "python /scripts/download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz > /scripts/download_attachments.log 2>&1"
# Na pozadí:
docker exec -d python-runner bash -c "python /scripts/3_download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz > /scripts/download_attachments.log 2>&1"
```
> `-d` = detached — stejné chování jako u skriptu výše (viz poznámka v sekci 1).
### Parametry
| Parametr | Popis |
@@ -155,10 +187,82 @@ docker exec -d python-runner bash -c "python /scripts/download_attachments_v1.3.
---
## 4) `4_unwrap_smime_v1.0.py` — rozbalení S/MIME zpráv
Některé emaily (Datová schránka, mBank, ComGate, PayU, PostSignum …) přicházejí
jako S/MIME signed-data wrapper: viditelné tělo je jen *"This is an S/MIME
signed message"*, skutečný obsah je zabalený uvnitř přílohy `smime.p7m`.
Skript najde tyto emaily, stáhne binárku `smime.p7m` z Graphu, rozbalí PKCS7
SignedData (`asn1crypto.cms`), extrahuje vnitřní MIME zprávu a doplní do Mongo:
| Pole | Obsah |
|---|---|
| `smime_unwrapped: True` | flag — už rozbaleno |
| `smime_subject` | Subject z vnitřní MIME hlavičky |
| `smime_body_text` | plain text vnitřního těla |
| `smime_body_html` | HTML vnitřního těla (pokud je) |
| `smime_inner_attachments[]` | `{filename, content_type, size_bytes}` vnitřních příloh |
Pole pak používá `5_enrich_fulltext_emails_v1.2` — preferuje `smime_body_*` před
prázdným wrapper tělem a názvy vnitřních příloh přidá do `attachments_summary`
(takže je najde MCP `emaily.find_attachment`).
```bash
docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py # vsechny schránky
docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py --mailbox ordinace@buzalkova.cz
docker exec -it python-runner python /scripts/4_unwrap_smime_v1.0.py --limit 10 # test
```
### POZOR: `smime.p7m` vs `smime.p7s` — dva různé typy
| Příloha | Co to je | Skript dělá |
|---|---|---|
| `smime.p7m` | **Enveloped/signed-data wrapper** — vnější obal kolem celé MIME zprávy. Bez rozbalení je viditelné jen *"This is an S/MIME signed message"*. | **Rozbalí** → extrahuje vnitřní tělo + přílohy do Mongo. |
| `smime.p7s` | **Detached signature** — jen digitální podpis vedle čistého emailu. Vlastní `body_html` / `body_text` je normálně dostupné. | **Ignoruje** — není co rozbalovat. Mail je už čitelný. |
Filtr ve skriptu (`SMIME_FILTER`) je proto explicitně `^smime\.p7m$`. Pokud při
auditu vidíš email s přílohou `smime.p7s` a `smime_unwrapped != True`, je to
**správně** — žádná akce není potřeba.
### Závislosti
```bash
pip install asn1crypto
```
---
## 5) `5_enrich_fulltext_emails_v1.2.py` — fulltext do PostgreSQL
Vytáhne plný text z emailů v MongoDB a uloží do PostgreSQL
(`MongoEmaily.emails`) s GIN `tsvector` indexem (config `soubory` — simple + unaccent).
Emaily se **nestahují znovu** — tělo už je v Mongo z kroků 1/2/4.
**Priorita zdroje těla** (`body_source`):
1. `smime``smime_body_text` / `smime_body_html` (pokud unwrap proběhl)
2. `html``body_html`
3. `text``body_text` (z parse v1.4 nebo refetch v1.0)
4. `preview``body_preview` (fallback)
Inkrementalita: pokud `(mailbox, message_id)` existuje a `extractor_version`
je aktuální a `modified_at` v Mongo není novější → skip. Bump `EXTRACTOR_VERSION`
přeparsuje vše.
```bash
docker exec -d python-runner bash -c "python /scripts/5_enrich_fulltext_emails_v1.2.py > /scripts/enrich.log 2>&1"
docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py --mailbox ordinace@buzalkova.cz
docker exec -it python-runner python /scripts/5_enrich_fulltext_emails_v1.2.py --limit 500 # test
```
---
## Sledování průběhu
```bash
docker exec -it python-runner tail -f /scripts/parse_emails.log
docker exec -it python-runner tail -f /scripts/download_attachments.log
docker exec -it python-runner tail -f /scripts/unwrap_smime.log
```
---
@@ -172,6 +276,7 @@ pymongo 4.17.0
python-dateutil 2.9.0.post0
extract-msg 0.55.0
cryptography 48.0.0
asn1crypto (S/MIME unwrap)
beautifulsoup4 4.13.5
oletools 0.60.2
msoffcrypto-tool 6.0.0
@@ -183,6 +288,7 @@ pcodedmp 1.2.6
tzlocal 5.3.1
six 1.17.0
pip 25.0.1
psycopg (PG klient pro krok 5)
```
---
@@ -201,4 +307,6 @@ docker exec python-runner pip install <balicek>
| Datum | Změna |
|---|---|
| 2026-06-02 | Přechod z `.msg` souborů na Microsoft Graph API. Skript `parse_emails_tower_v1.1.py` (import lokálních `.msg`) nahrazen `parse_emails_graph_v1.3.py`; přidán `download_attachments_v1.3.py`. Staré verze v `Trash/`. |
| 2026-06-02 | Přechod z `.msg` souborů na Microsoft Graph API. `parse_emails_tower_v1.1.py` (import lokálních `.msg`) nahrazen `parse_emails_graph_v1.3.py`; přidán `download_attachments_v1.3.py`. Staré verze v `Trash/`. |
| 2026-06-03 | `parse_emails_graph_v1.4` (ukládá i plné plain-text tělo do `body_text`). Přidán `refetch_text_bodies_v1.0` (dohnání starých plain-text). Přidán `unwrap_smime_v1.0` (rozbalení `smime.p7m`). `enrich_fulltext_emails_v1.2` (preferuje `smime_body_*`, body_source `smime`/`text`). |
| 2026-06-04 | Skripty přejmenovány s prefixem `1_…5_` podle pořadí v pipeline. `enrich_v1.1` + `parse_emails_tower_v1.1*` do `Trash/`. |