z230
This commit is contained in:
@@ -0,0 +1,449 @@
|
||||
"""
|
||||
download_attachments_v1.0.py
|
||||
Nazev: download_attachments_v1.0.py
|
||||
Verze: 1.0
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB kolekce
|
||||
ordinace@buzalkova.cz primo pres Microsoft Graph API a uklada je do
|
||||
adresare /mnt/Emails/ordinace@buzalkova.cz/Attachments/.
|
||||
|
||||
Deduplikace podle SHA256 hashe obsahu:
|
||||
- stejny hash = soubor uz existuje -> preskoci
|
||||
- prvni vyskyt souboru: ulozi pod puvodnim nazvem
|
||||
- kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
|
||||
|
||||
Po ulozeni aktualizuje MongoDB:
|
||||
- v email dokumentu: kazda priloha dostane file_hash + local_path
|
||||
- kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes,
|
||||
mime_type, first_seen_at, ref_count (pocet emailu ktery ji obsahuje)
|
||||
|
||||
Bezpecne prerusit a opakovat:
|
||||
- zpravy kde jsou vsechny prilohy uz stazene (maji file_hash) se preskoci
|
||||
- --force-recheck znovu overi i uz stazene (pro pripad zmen na disku)
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
python download_attachments_v1.0.py # stahni vse co chybi
|
||||
python download_attachments_v1.0.py --limit 50 # test na prvnich 50 emailech
|
||||
python download_attachments_v1.0.py --force-recheck # overi i uz stazene
|
||||
|
||||
Docker (po pridani mountu /mnt/user/Emails -> /mnt/Emails):
|
||||
docker exec -it python-runner python /scripts/download_attachments_v1.0.py
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo
|
||||
Python 3.10+
|
||||
|
||||
Struktura na disku:
|
||||
/mnt/Emails/
|
||||
└── ordinace@buzalkova.cz/
|
||||
└── Attachments/
|
||||
├── faktura_2026.pdf
|
||||
├── vysledky_lab.pdf
|
||||
├── vysledky_lab_2.pdf <- kolize nazvu, jiny obsah
|
||||
└── ...
|
||||
|
||||
Kolekce emaily.attachments_index:
|
||||
_id SHA256 hash (hex)
|
||||
filename nazev souboru na disku (prvni vyskytu)
|
||||
local_path relativni cesta od Attachments/ (zatim = filename)
|
||||
size_bytes velikost souboru
|
||||
mime_type MIME typ
|
||||
first_seen_at datetime UTC
|
||||
ref_count v kolika emailech se tato priloha vyskytuje
|
||||
|
||||
Aktualizace v email dokumentu (kolekce ordinace@buzalkova.cz):
|
||||
attachments[i].file_hash SHA256 hash
|
||||
attachments[i].local_path cesta relativni od Attachments/
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze
|
||||
"""
|
||||
|
||||
import sys
|
||||
import hashlib
|
||||
import logging
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from pymongo import MongoClient, UpdateOne
|
||||
|
||||
# Force UTF-8 on stdout so the accented status messages this script prints
# cannot fail on a console with a narrower default encoding; characters that
# still cannot be encoded are replaced. The hasattr guard skips stream objects
# that do not offer reconfigure().
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
||||
|
||||
# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
|
||||
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
|
||||
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
|
||||
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
|
||||
GRAPH_MAILBOX = "ordinace@buzalkova.cz"
|
||||
GRAPH_URL = "https://graph.microsoft.com/v1.0"
|
||||
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
MONGO_DB = "emaily"
|
||||
MONGO_COL_EMAILS = "ordinace@buzalkova.cz"
|
||||
MONGO_COL_INDEX = "attachments_index"
|
||||
|
||||
ATTACHMENTS_DIR = Path("/mnt/Emails/ordinace@buzalkova.cz/Attachments")
|
||||
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
|
||||
SCRIPT_VERSION = "1.0"
|
||||
BATCH_SIZE = 50
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Root logger writes to LOG_FILE only; with level=ERROR nothing below ERROR is
# recorded, so the log contains failures exclusively.
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)

# Bearer token shared by the Graph request helpers; stays None until
# get_token() has stored one.
_graph_token: Optional[str] = None
|
||||
|
||||
|
||||
# ─── Graph API ────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_token() -> str:
    """Acquire a Microsoft Graph access token and cache it in ``_graph_token``.

    Builds an MSAL confidential client for GRAPH_CLIENT_ID / GRAPH_TENANT_ID
    and requests a token for the ``https://graph.microsoft.com/.default``
    scope.

    The client secret is read from the ``GRAPH_CLIENT_SECRET`` environment
    variable when that is set and non-empty; otherwise the module constant of
    the same name is used, so existing deployments keep working.

    Returns:
        The access token string (also stored in the module-level cache).

    Raises:
        RuntimeError: if the MSAL response contains no ``access_token``.
    """
    import os  # local import: the file's import block does not provide os

    global _graph_token

    # SECURITY: the module constant is a credential committed to source
    # control. Prefer the environment variable, rotate the committed secret
    # and then blank the constant.
    client_secret = os.environ.get("GRAPH_CLIENT_SECRET") or GRAPH_CLIENT_SECRET

    app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
        client_credential=client_secret,
    )
    result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
    if "access_token" not in result:
        raise RuntimeError(f"Graph auth failed: {result}")
    _graph_token = result["access_token"]
    return _graph_token
|
||||
|
||||
|
||||
def graph_get_bytes(url: str) -> bytes:
    """Download the raw response body of a Graph GET request.

    A token is fetched first if none is cached. The request is tried at most
    twice: a 401 answer triggers one token refresh and a retry.

    Args:
        url: Absolute Graph URL to fetch.

    Returns:
        The complete response body.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts were answered with 401.
    """
    if not _graph_token:
        get_token()
    for _ in range(2):
        # Deliberately no stream=True: the whole body is read into memory
        # anyway, and a streamed response that is never consumed (the 401
        # branch) would keep its connection open until garbage collection.
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            timeout=120,
        )
        if r.status_code == 401:
            get_token()  # cached token rejected: refresh and retry once
            continue
        r.raise_for_status()
        return r.content
    raise RuntimeError(f"Graph GET bytes failed: {url}")
|
||||
|
||||
|
||||
def graph_get_json(url: str, params: Optional[dict] = None) -> dict:
    """Perform a Graph GET request and return the decoded JSON body.

    A token is fetched first if none is cached. The request is tried at most
    twice: a 401 answer triggers one token refresh and a retry.

    Args:
        url: Absolute Graph URL to fetch.
        params: Optional query-string parameters.

    Returns:
        The parsed JSON document.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts were answered with 401.
    """
    if not _graph_token:
        get_token()
    for _ in range(2):
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=30,
        )
        if r.status_code == 401:
            get_token()  # cached token rejected: refresh and retry once
            continue
        r.raise_for_status()
        return r.json()
    raise RuntimeError(f"Graph GET json failed: {url}")
|
||||
|
||||
|
||||
def fetch_attachment_content(graph_message_id: str, attachment_id: str) -> Optional[bytes]:
    """Return the raw bytes of one attachment, or None when the download fails.

    The content is read from the attachment's ``$value`` endpoint in the
    GRAPH_MAILBOX mailbox. Any exception is written to the error log instead
    of being propagated.
    """
    url = (
        f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{graph_message_id}"
        f"/attachments/{attachment_id}/$value"
    )
    try:
        payload = graph_get_bytes(url)
    except Exception as exc:
        logging.error(
            "fetch_attachment_content failed [msg=%s att=%s]: %s",
            graph_message_id, attachment_id, exc,
        )
        return None
    else:
        return payload
|
||||
|
||||
|
||||
def fetch_message_attachments(graph_message_id: str) -> list[dict]:
    """List the attachment metadata of one message in GRAPH_MAILBOX.

    Only id, name, contentType, size, isInline and contentId are requested.
    On any failure the error is logged and an empty list is returned, so the
    caller cannot tell "no attachments" from "lookup failed".
    """
    endpoint = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{graph_message_id}/attachments"
    wanted_fields = {"$select": "id,name,contentType,size,isInline,contentId"}
    try:
        response = graph_get_json(endpoint, wanted_fields)
        items = response.get("value", [])
    except Exception as exc:
        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, exc)
        items = []
    return items
|
||||
|
||||
|
||||
# ─── Dedup + ukládání ─────────────────────────────────────────────────────────
|
||||
|
||||
def sha256(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, index_col) -> str:
    """Pick the file name under which content with hash *hash_val* is stored.

    *desired_name* is used unchanged when nothing conflicts with it. It counts
    as taken when the index maps it to a different hash, or when a file of
    that name already sits in *att_dir* without an index entry and its content
    hashes differently. In the second case the old code overwrote the file
    silently. On a conflict the first free ``<stem>_<n><suffix>`` with
    n = 2, 3, ... is returned.

    Args:
        desired_name: Sanitised file name to try first.
        att_dir: Directory the attachment will be written to.
        hash_val: Hex SHA-256 of the content about to be stored.
        index_col: Collection-like object offering ``find_one``; its documents
            carry the content hash in ``_id`` and the name in ``filename``.

    Returns:
        The file name to use, relative to *att_dir*.
    """
    existing = index_col.find_one({"filename": desired_name})
    if existing:
        if existing["_id"] == hash_val:
            return desired_name  # same name, same content
    else:
        target = att_dir / desired_name
        if not target.exists():
            return desired_name
        # Un-indexed leftover (e.g. from an interrupted run): reuse the name
        # only when the bytes are identical, never clobber different content.
        if target.is_file() and hashlib.sha256(target.read_bytes()).hexdigest() == hash_val:
            return desired_name

    # Name collision: look for the first free numbered variant.
    stem = Path(desired_name).stem
    suffix = Path(desired_name).suffix
    n = 2
    while True:
        candidate = f"{stem}_{n}{suffix}"
        if not (att_dir / candidate).exists():
            # Free on disk; also make sure the index does not reserve it for
            # different content.
            ex2 = index_col.find_one({"filename": candidate})
            if not ex2 or ex2["_id"] == hash_val:
                return candidate
        n += 1
|
||||
|
||||
|
||||
def save_attachment(content: bytes, original_name: str, att_dir: Path, index_col) -> tuple[str, str, bool]:
    """Store one attachment in *att_dir*, de-duplicated by content hash.

    Args:
        content: Raw attachment bytes.
        original_name: File name as recorded for the e-mail; may be empty.
        att_dir: Target directory (must already exist).
        index_col: The attachments index collection (``_id`` = SHA-256 hex).

    Returns:
        ``(hash, local_path, was_new)``. ``was_new`` is True when a new index
        entry and file were created, and False when the hash was already
        indexed.

    Notes:
        Every dedup hit increments ``ref_count``, so re-processing the same
        e-mail (for example with --force-recheck) counts it again.
    """
    hash_val = sha256(content)

    existing = index_col.find_one({"_id": hash_val})
    if existing:
        # The index says we have this content; make that true on disk as well,
        # so that a re-check restores files that were deleted in the meantime.
        known_path = att_dir / existing["local_path"]
        if not known_path.exists():
            known_path.write_bytes(content)
        index_col.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}})
        return hash_val, existing["local_path"], False

    # New content: keep letters, digits and "._- ", replace everything else.
    safe_name = "".join(
        c if c.isalnum() or c in "._- " else "_" for c in (original_name or "")
    ).strip()
    # "." and ".." survive the filter but name directories, not files.
    if safe_name in ("", ".", ".."):
        safe_name = f"attachment_{hash_val[:8]}"

    filename = resolve_filename(safe_name, att_dir, hash_val, index_col)
    (att_dir / filename).write_bytes(content)

    index_col.insert_one({
        "_id": hash_val,
        "filename": filename,
        "local_path": filename,
        "size_bytes": len(content),
        "mime_type": "",
        # stored as naive UTC
        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
        "ref_count": 1,
    })

    return hash_val, filename, True
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Download every missing non-inline attachment and record it in MongoDB.

    Steps:
      1. Parse the command line (--limit, --force-recheck, --no-indexes).
      2. Ensure ATTACHMENTS_DIR exists, authenticate against Graph and ping
         MongoDB; the process exits with status 1 if either service is
         unavailable.
      3. Select e-mails that have at least one non-inline attachment without
         ``file_hash`` (or all e-mails with attachments when --force-recheck
         is given).
      4. For each attachment: find it in Graph by name, download it, store it
         through save_attachment() and write ``file_hash`` / ``local_path``
         back into the e-mail document (bulk-written in batches of
         BATCH_SIZE).
      5. Print a summary.

    Failures on a single attachment are logged and counted; they do not stop
    the run. Pending e-mail updates are flushed even when the loop is
    interrupted.
    """
    ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N emailu (0 = vse)")
    ap.add_argument("--force-recheck", action="store_true",
                    help="Znovu overi i emaily kde prilohy uz maji file_hash")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na konci")
    args = ap.parse_args()

    start = datetime.now()
    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Schránka: {GRAPH_MAILBOX}")
    print(f"Cilovy adresar: {ATTACHMENTS_DIR}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}")

    # Target directory
    ATTACHMENTS_DIR.mkdir(parents=True, exist_ok=True)
    print(f" Adresar OK")

    # Graph
    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    # MongoDB
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        print(" MongoDB OK")
    except Exception as e:
        print(f" CHYBA: MongoDB neni dostupna -- {e}")
        sys.exit(1)

    col_emails = client[MONGO_DB][MONGO_COL_EMAILS]
    col_index = client[MONGO_DB][MONGO_COL_INDEX]

    # Indexes on the attachment index collection (created up front, before
    # any processing).
    if not args.no_indexes:
        col_index.create_index("filename")
        col_index.create_index("mime_type")

    # E-mails that still have something to download
    if args.force_recheck:
        query = {"has_attachments": True}
    else:
        query = {
            "has_attachments": True,
            "attachments": {
                "$elemMatch": {
                    "is_inline": False,
                    "file_hash": {"$exists": False},
                }
            }
        }

    total = col_emails.count_documents(query)
    print(f"\nEmailu ke zpracovani: {total}")
    if total == 0:
        print("Neni co stahnout.")
        client.close()
        return

    cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
    if args.limit:
        cursor = cursor.limit(args.limit)

    ok_count = 0
    new_count = 0
    skip_count = 0  # counts both SKIP (already hashed) and DUP (dedup hit)
    err_count = 0
    email_i = 0
    batch = []

    def flush():
        """Write the queued e-mail updates; errors are logged, never raised."""
        if not batch:
            return
        try:
            col_emails.bulk_write(batch, ordered=False)
        except Exception as e:
            logging.error("bulk_write: %s", e)
            print(f" CHYBA bulk_write: {e}")
        batch.clear()

    try:
        for email_doc in cursor:
            email_i += 1
            email_id = email_doc["_id"]
            graph_id = email_doc.get("graph_id", "")
            subject = (email_doc.get("subject") or "")[:60]
            att_list = email_doc.get("attachments") or []

            # Only real (non-inline) attachments are of interest
            real_atts = [a for a in att_list if not a.get("is_inline", False)]
            if not real_atts:
                continue

            print(f"\n {email_i:>5}/{total} {subject}")

            # Attachment ids come from Graph. Inline items are kept in the map
            # too, because MongoDB may record an inline image as
            # is_inline=False; a non-inline item wins when names clash.
            graph_atts = fetch_message_attachments(graph_id)
            graph_att_map = {}
            for ga in graph_atts:
                gkey = ga.get("name") or ""
                if not ga.get("isInline", False) or gkey not in graph_att_map:
                    graph_att_map[gkey] = ga

            updated_atts = list(att_list)
            email_ok = True

            for i, att in enumerate(updated_atts):
                if att.get("is_inline", False):
                    continue
                if not args.force_recheck and att.get("file_hash"):
                    skip_count += 1
                    print(f" SKIP {att.get('filename', '')}")
                    continue

                att_name = att.get("filename") or ""
                graph_att = graph_att_map.get(att_name)

                # Fallback: case-insensitive substring match. Skipped for an
                # empty name, which would match every Graph attachment.
                if not graph_att and att_name:
                    needle = att_name.lower()
                    for gname, ga in graph_att_map.items():
                        if needle in gname.lower():
                            graph_att = ga
                            break

                if not graph_att:
                    logging.error("attachment not found in Graph [email=%s att=%s]", email_id, att_name)
                    print(f" ERR {att_name} (nenalezeno v Graph)")
                    err_count += 1
                    email_ok = False
                    continue

                # Download
                content = fetch_attachment_content(graph_id, graph_att["id"])
                if content is None:
                    err_count += 1
                    email_ok = False
                    print(f" ERR {att_name} (stazeni selhalo)")
                    continue

                # Store with de-duplication; a disk or database error on one
                # attachment must not abort the whole run.
                try:
                    hash_val, local_path, was_new = save_attachment(
                        content, att_name, ATTACHMENTS_DIR, col_index
                    )
                except Exception as e:
                    logging.error("save_attachment failed [email=%s att=%s]: %s", email_id, att_name, e)
                    print(f" ERR {att_name} (ulozeni selhalo)")
                    err_count += 1
                    email_ok = False
                    continue

                # Record the MIME type in the index; fall back to the Graph
                # value when the e-mail document has none (missing, None or "").
                col_index.update_one(
                    {"_id": hash_val},
                    {"$set": {"mime_type": att.get("mime_type") or graph_att.get("contentType", "")}},
                )

                # Record hash and path in the e-mail's attachment entry
                updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}

                if was_new:
                    new_count += 1
                    print(f" NEW {local_path} ({len(content):,} B)")
                else:
                    skip_count += 1
                    print(f" DUP {att_name} -> {local_path}")

            if email_ok:
                ok_count += 1

            # Queue the updated attachment list for this e-mail (partial
            # results are saved even when some attachments failed).
            batch.append(UpdateOne(
                {"_id": email_id},
                {"$set": {"attachments": updated_atts}}
            ))

            if len(batch) >= BATCH_SIZE:
                flush()

            if email_i % 100 == 0:
                print(f" {'─'*60}")
                print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={skip_count} err={err_count}")
                print(f" {'─'*60}")
    finally:
        # Also runs on Ctrl-C or an unexpected error, so work already done is
        # not repeated (and ref_count not incremented again) on the next run.
        flush()

    elapsed_total = (datetime.now() - start).total_seconds()
    files_total = col_index.count_documents({})
    size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))

    print(f"\n{'='*52}")
    print(f"Vysledek: emaily={ok_count} | nove soubory={new_count} | duplikaty={skip_count} | err={err_count}")
    print(f"Souboru v indexu: {files_total} ({size_total/1024/1024:.1f} MB)")
    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if err_count:
        print(f"Chyby logovany do: {LOG_FILE}")

    client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,428 @@
|
||||
"""
|
||||
download_attachments_v1.1.py
|
||||
Nazev: download_attachments_v1.1.py
|
||||
Verze: 1.1
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
|
||||
pres Microsoft Graph API a uklada je do adresare
|
||||
/mnt/Emails/<schránka>/Attachments/.
|
||||
|
||||
Schránka se predava jako povinny parametr --mailbox.
|
||||
|
||||
Deduplikace podle SHA256 hashe obsahu:
|
||||
- stejny hash = soubor uz existuje -> preskoci
|
||||
- prvni vyskyt souboru: ulozi pod puvodnim nazvem
|
||||
- kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
|
||||
|
||||
Po ulozeni aktualizuje MongoDB:
|
||||
- v email dokumentu: kazda priloha dostane file_hash + local_path
|
||||
- kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes,
|
||||
mime_type, mailbox, first_seen_at, ref_count
|
||||
|
||||
Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash
|
||||
se preskoci. --force-recheck znovu overi i uz stazene.
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
python download_attachments_v1.1.py --mailbox ordinace@buzalkova.cz
|
||||
python download_attachments_v1.1.py --mailbox vladimir.buzalka@buzalka.cz --limit 50
|
||||
python download_attachments_v1.1.py --mailbox ordinace@buzalkova.cz --force-recheck
|
||||
|
||||
Docker:
|
||||
docker exec -it python-runner python /scripts/download_attachments_v1.1.py \\
|
||||
--mailbox ordinace@buzalkova.cz
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo
|
||||
Python 3.10+
|
||||
|
||||
Struktura na disku:
|
||||
/mnt/Emails/
|
||||
└── <mailbox>/
|
||||
└── Attachments/
|
||||
├── faktura_2026.pdf
|
||||
├── vysledky_lab.pdf
|
||||
├── vysledky_lab_2.pdf
|
||||
└── ...
|
||||
|
||||
Kolekce emaily.attachments_index:
|
||||
_id SHA256 hash (hex)
|
||||
filename nazev souboru na disku
|
||||
local_path relativni cesta od Attachments/
|
||||
size_bytes velikost souboru
|
||||
mime_type MIME typ
|
||||
mailbox schránka ze ktere pochazi prvni vyskytu
|
||||
first_seen_at datetime UTC
|
||||
ref_count v kolika emailech se tato priloha vyskytuje
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze
|
||||
1.1 2026-06-02 Schránka jako parametr --mailbox (univerzalni pouziti)
|
||||
"""
|
||||
|
||||
import sys
|
||||
import hashlib
|
||||
import logging
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from pymongo import MongoClient, UpdateOne
|
||||
|
||||
# Force UTF-8 on stdout so the accented status messages this script prints
# cannot fail on a console with a narrower default encoding; characters that
# still cannot be encoded are replaced. The hasattr guard skips stream objects
# that do not offer reconfigure().
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
||||
|
||||
# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
|
||||
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
|
||||
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
|
||||
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
|
||||
GRAPH_URL = "https://graph.microsoft.com/v1.0"
|
||||
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
MONGO_DB = "emaily"
|
||||
MONGO_COL_INDEX = "attachments_index"
|
||||
|
||||
EMAILS_BASE_DIR = Path("/mnt/Emails")
|
||||
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
|
||||
SCRIPT_VERSION = "1.1"
|
||||
BATCH_SIZE = 50
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Root logger writes to LOG_FILE only; with level=ERROR nothing below ERROR is
# recorded, so the log contains failures exclusively.
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)

# Bearer token shared by the Graph request helpers; stays None until
# get_token() has stored one.
_graph_token: Optional[str] = None
|
||||
|
||||
|
||||
# ─── Graph API ────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_token() -> str:
    """Acquire a Microsoft Graph access token and cache it in ``_graph_token``.

    Builds an MSAL confidential client for GRAPH_CLIENT_ID / GRAPH_TENANT_ID
    and requests a token for the ``https://graph.microsoft.com/.default``
    scope.

    The client secret is read from the ``GRAPH_CLIENT_SECRET`` environment
    variable when that is set and non-empty; otherwise the module constant of
    the same name is used, so existing deployments keep working.

    Returns:
        The access token string (also stored in the module-level cache).

    Raises:
        RuntimeError: if the MSAL response contains no ``access_token``.
    """
    import os  # local import: the file's import block does not provide os

    global _graph_token

    # SECURITY: the module constant is a credential committed to source
    # control. Prefer the environment variable, rotate the committed secret
    # and then blank the constant.
    client_secret = os.environ.get("GRAPH_CLIENT_SECRET") or GRAPH_CLIENT_SECRET

    app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
        client_credential=client_secret,
    )
    result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
    if "access_token" not in result:
        raise RuntimeError(f"Graph auth failed: {result}")
    _graph_token = result["access_token"]
    return _graph_token
|
||||
|
||||
|
||||
def graph_get_bytes(url: str) -> bytes:
    """Download the raw response body of a Graph GET request.

    A token is fetched first if none is cached. The request is tried at most
    twice: a 401 answer triggers one token refresh and a retry.

    Args:
        url: Absolute Graph URL to fetch.

    Returns:
        The complete response body.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts were answered with 401.
    """
    if not _graph_token:
        get_token()
    for _ in range(2):
        # Deliberately no stream=True: the whole body is read into memory
        # anyway, and a streamed response that is never consumed (the 401
        # branch) would keep its connection open until garbage collection.
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            timeout=120,
        )
        if r.status_code == 401:
            get_token()  # cached token rejected: refresh and retry once
            continue
        r.raise_for_status()
        return r.content
    raise RuntimeError(f"Graph GET bytes failed: {url}")
|
||||
|
||||
|
||||
def graph_get_json(url: str, params: Optional[dict] = None) -> dict:
    """Perform a Graph GET request and return the decoded JSON body.

    A token is fetched first if none is cached. The request is tried at most
    twice: a 401 answer triggers one token refresh and a retry.

    Args:
        url: Absolute Graph URL to fetch.
        params: Optional query-string parameters.

    Returns:
        The parsed JSON document.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts were answered with 401.
    """
    if not _graph_token:
        get_token()
    for _ in range(2):
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=30,
        )
        if r.status_code == 401:
            get_token()  # cached token rejected: refresh and retry once
            continue
        r.raise_for_status()
        return r.json()
    raise RuntimeError(f"Graph GET json failed: {url}")
|
||||
|
||||
|
||||
def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
    """List the attachment metadata of one message in *mailbox*.

    Only id, name, contentType, size, isInline and contentId are requested.
    On any failure the error is logged and an empty list is returned, so the
    caller cannot tell "no attachments" from "lookup failed".
    """
    endpoint = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
    wanted_fields = {"$select": "id,name,contentType,size,isInline,contentId"}
    try:
        response = graph_get_json(endpoint, wanted_fields)
        items = response.get("value", [])
    except Exception as exc:
        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, exc)
        items = []
    return items
|
||||
|
||||
|
||||
def fetch_attachment_content(mailbox: str, graph_message_id: str, attachment_id: str) -> Optional[bytes]:
    """Return the raw bytes of one attachment, or None when the download fails.

    The content is read from the attachment's ``$value`` endpoint in
    *mailbox*. Any exception is written to the error log instead of being
    propagated.
    """
    url = (
        f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}"
        f"/attachments/{attachment_id}/$value"
    )
    try:
        payload = graph_get_bytes(url)
    except Exception as exc:
        logging.error(
            "fetch_attachment_content failed [msg=%s att=%s]: %s",
            graph_message_id, attachment_id, exc,
        )
        return None
    else:
        return payload
|
||||
|
||||
|
||||
# ─── Dedup + ukládání ─────────────────────────────────────────────────────────
|
||||
|
||||
def sha256(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def safe_filename(name: str) -> str:
    """Turn an attachment name into a name that is safe to create in a directory.

    Letters, digits and the characters ``. _ -`` and space are kept; every
    other character (path separators included) becomes ``_``. Surrounding
    whitespace is stripped.

    Returns ``"attachment"`` when nothing usable remains. That covers an
    empty or missing name and the names ``.`` and ``..``, which pass the
    character filter but refer to directories rather than files.
    """
    safe = "".join(c if c.isalnum() or c in "._- " else "_" for c in (name or "")).strip()
    if safe in ("", ".", ".."):
        return "attachment"
    return safe
|
||||
|
||||
|
||||
def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
    """Pick the file name under which content with hash *hash_val* is stored.

    *desired_name* is used unchanged when nothing conflicts with it. It counts
    as taken when the index maps it to a different hash, or when a file of
    that name already sits in *att_dir* without an index entry and its content
    hashes differently. In the second case the old code overwrote the file
    silently. On a conflict the first usable ``<stem>_<n><suffix>`` with
    n = 2, 3, ... is returned.

    Args:
        desired_name: Sanitised file name to try first.
        att_dir: Directory the attachment will be written to.
        hash_val: Hex SHA-256 of the content about to be stored.
        col_index: Collection-like object offering ``find_one``; its documents
            carry the content hash in ``_id`` and the name in ``filename``.

    Returns:
        The file name to use, relative to *att_dir*.
    """
    existing = col_index.find_one({"filename": desired_name})
    if existing:
        if existing["_id"] == hash_val:
            return desired_name  # same name, same content
    else:
        target = att_dir / desired_name
        if not target.exists():
            return desired_name
        # Un-indexed leftover (e.g. from an interrupted run): reuse the name
        # only when the bytes are identical, never clobber different content.
        if target.is_file() and hashlib.sha256(target.read_bytes()).hexdigest() == hash_val:
            return desired_name

    # Name collision: look for the first usable numbered variant.
    stem = Path(desired_name).stem
    suffix = Path(desired_name).suffix
    n = 2
    while True:
        candidate = f"{stem}_{n}{suffix}"
        ex2 = col_index.find_one({"filename": candidate})
        if ex2:
            # Indexed: usable only if it already stands for this content.
            if ex2["_id"] == hash_val:
                return candidate
        elif not (att_dir / candidate).exists():
            # Neither indexed nor present on disk.
            return candidate
        n += 1
|
||||
|
||||
|
||||
def save_attachment(
    content: bytes,
    original_name: str,
    mime_type: str,
    mailbox: str,
    att_dir: Path,
    col_index,
) -> tuple[str, str, bool]:
    """Store one attachment in *att_dir*, de-duplicated by content hash.

    Args:
        content: Raw attachment bytes.
        original_name: File name as recorded for the e-mail.
        mime_type: MIME type stored with a newly created index entry.
        mailbox: Mailbox stored with a newly created index entry.
        att_dir: Target directory (must already exist).
        col_index: The attachments index collection (``_id`` = SHA-256 hex).

    Returns:
        ``(hash, local_path, was_new)``. ``was_new`` is True when a new index
        entry and file were created, and False when the hash was already
        indexed.

    Notes:
        Every dedup hit increments ``ref_count``, so re-processing the same
        e-mail (for example with --force-recheck) counts it again.
    """
    hash_val = sha256(content)

    existing = col_index.find_one({"_id": hash_val})
    if existing:
        # The index is shared by all mailboxes while files live in a
        # per-mailbox directory, so a hit may refer to a file that was only
        # ever written elsewhere (or was deleted since). Make sure the
        # returned local_path really exists below *this* att_dir.
        known_path = att_dir / existing["local_path"]
        if not known_path.exists():
            known_path.write_bytes(content)
        col_index.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}})
        return hash_val, existing["local_path"], False

    filename = resolve_filename(safe_filename(original_name), att_dir, hash_val, col_index)
    (att_dir / filename).write_bytes(content)

    col_index.insert_one({
        "_id": hash_val,
        "filename": filename,
        "local_path": filename,
        "size_bytes": len(content),
        "mime_type": mime_type,
        "mailbox": mailbox,
        # stored as naive UTC
        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
        "ref_count": 1,
    })

    return hash_val, filename, True
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Download every missing non-inline attachment of one mailbox.

    Steps:
      1. Parse the command line (--mailbox, --limit, --force-recheck,
         --no-indexes). The mailbox name selects the Graph user, the MongoDB
         collection and the directory EMAILS_BASE_DIR/<mailbox>/Attachments.
      2. Ensure the directory exists, authenticate against Graph and ping
         MongoDB; the process exits with status 1 if either service is
         unavailable.
      3. Select e-mails that have at least one non-inline attachment without
         ``file_hash`` (or all e-mails with attachments when --force-recheck
         is given).
      4. For each attachment: find it in Graph by name, download it, store it
         through save_attachment() and write ``file_hash`` / ``local_path``
         back into the e-mail document (bulk-written in batches of
         BATCH_SIZE).
      5. Print a summary.

    Failures on a single attachment are logged and counted; they do not stop
    the run. Pending e-mail updates are flushed even when the loop is
    interrupted.
    """
    ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
    ap.add_argument("--mailbox", required=True,
                    help="Emailova schranka (napr. ordinace@buzalkova.cz)")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N emailu (0 = vse)")
    ap.add_argument("--force-recheck", action="store_true",
                    help="Znovu overi i emaily kde prilohy uz maji file_hash")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na attachments_index kolekci")
    args = ap.parse_args()

    mailbox = args.mailbox
    att_dir = EMAILS_BASE_DIR / mailbox / "Attachments"
    mongo_col = mailbox  # the e-mail collection is named after the mailbox

    start = datetime.now()
    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Schránka: {mailbox}")
    print(f"Cilovy adresar: {att_dir}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}")

    att_dir.mkdir(parents=True, exist_ok=True)
    print(" Adresar OK")

    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        print(" MongoDB OK")
    except Exception as e:
        print(f" CHYBA: MongoDB neni dostupna -- {e}")
        sys.exit(1)

    col_emails = client[MONGO_DB][mongo_col]
    col_index = client[MONGO_DB][MONGO_COL_INDEX]

    if not args.no_indexes:
        col_index.create_index("filename")
        col_index.create_index("mime_type")
        col_index.create_index("mailbox")

    # E-mails that still have something to download
    if args.force_recheck:
        query = {"has_attachments": True}
    else:
        query = {
            "has_attachments": True,
            "attachments": {
                "$elemMatch": {
                    "is_inline": False,
                    "file_hash": {"$exists": False},
                }
            }
        }

    total = col_emails.count_documents(query)
    print(f"\nEmailu ke zpracovani: {total}")
    if total == 0:
        print("Neni co stahnout.")
        client.close()
        return

    cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
    if args.limit:
        cursor = cursor.limit(args.limit)

    ok_count = 0
    new_count = 0
    dup_count = 0
    err_count = 0
    email_i = 0
    batch = []

    def flush():
        """Write the queued e-mail updates; errors are logged, never raised."""
        if not batch:
            return
        try:
            col_emails.bulk_write(batch, ordered=False)
        except Exception as e:
            logging.error("bulk_write: %s", e)
            print(f" CHYBA bulk_write: {e}")
        batch.clear()

    try:
        for email_doc in cursor:
            email_i += 1
            email_id = email_doc["_id"]
            graph_id = email_doc.get("graph_id", "")
            subject = (email_doc.get("subject") or "")[:60]
            att_list = email_doc.get("attachments") or []

            real_atts = [a for a in att_list if not a.get("is_inline", False)]
            if not real_atts:
                continue

            print(f"\n {email_i:>5}/{total} {subject}")

            # Attachment ids come from Graph. Inline items are kept in the map
            # too, because MongoDB may record an inline image as
            # is_inline=False; a non-inline item wins when names clash.
            graph_atts = fetch_message_attachments(mailbox, graph_id)
            graph_att_map = {}
            for ga in graph_atts:
                gkey = ga.get("name") or ""
                if not ga.get("isInline", False) or gkey not in graph_att_map:
                    graph_att_map[gkey] = ga

            updated_atts = list(att_list)
            email_ok = True

            for i, att in enumerate(updated_atts):
                if att.get("is_inline", False):
                    continue
                if not args.force_recheck and att.get("file_hash"):
                    print(f" SKIP {att.get('filename', '')}")
                    continue

                att_name = att.get("filename") or ""
                graph_att = graph_att_map.get(att_name)

                # Fallback: case-insensitive substring match. Skipped for an
                # empty name, which would match every Graph attachment.
                if not graph_att and att_name:
                    needle = att_name.lower()
                    for gname, ga in graph_att_map.items():
                        if needle in gname.lower():
                            graph_att = ga
                            break

                if not graph_att:
                    logging.error("attachment not found in Graph [email=%s att=%s]", email_id, att_name)
                    print(f" ERR {att_name} (nenalezeno v Graph)")
                    err_count += 1
                    email_ok = False
                    continue

                content = fetch_attachment_content(mailbox, graph_id, graph_att["id"])
                if content is None:
                    err_count += 1
                    email_ok = False
                    print(f" ERR {att_name} (stazeni selhalo)")
                    continue

                mime_type = att.get("mime_type") or graph_att.get("contentType", "")

                # A disk or database error on one attachment must not abort
                # the whole run.
                try:
                    hash_val, local_path, was_new = save_attachment(
                        content, att_name, mime_type, mailbox, att_dir, col_index
                    )
                except Exception as e:
                    logging.error("save_attachment failed [email=%s att=%s]: %s", email_id, att_name, e)
                    print(f" ERR {att_name} (ulozeni selhalo)")
                    err_count += 1
                    email_ok = False
                    continue

                updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}

                if was_new:
                    new_count += 1
                    print(f" NEW {local_path} ({len(content):,} B)")
                else:
                    dup_count += 1
                    print(f" DUP {att_name} -> {local_path}")

            if email_ok:
                ok_count += 1

            # Partial results are saved even when some attachments failed.
            batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))

            if len(batch) >= BATCH_SIZE:
                flush()

            if email_i % 100 == 0:
                print(f" {'─'*60}")
                print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} err={err_count}")
                print(f" {'─'*60}")
    finally:
        # Also runs on Ctrl-C or an unexpected error, so work already done is
        # not repeated (and ref_count not incremented again) on the next run.
        flush()

    elapsed_total = (datetime.now() - start).total_seconds()
    files_total = col_index.count_documents({})
    size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))

    print(f"\n{'='*52}")
    print(f"Vysledek: emaily={ok_count} | nove={new_count} | dup={dup_count} | err={err_count}")
    print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)")
    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if err_count:
        print(f"Chyby logovany do: {LOG_FILE}")

    client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,466 @@
|
||||
"""
|
||||
download_attachments_v1.2.py
|
||||
Nazev: download_attachments_v1.2.py
|
||||
Verze: 1.2
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
|
||||
pres Microsoft Graph API a uklada je do adresare
|
||||
/mnt/Emails/<schránka>/Attachments/.
|
||||
|
||||
Schránka se predava jako povinny parametr --mailbox.
|
||||
|
||||
Deduplikace podle SHA256 hashe obsahu:
|
||||
- stejny hash = soubor uz existuje -> preskoci
|
||||
- prvni vyskyt souboru: ulozi pod puvodnim nazvem
|
||||
- kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
|
||||
|
||||
Po ulozeni aktualizuje MongoDB:
|
||||
- v email dokumentu: kazda priloha dostane file_hash + local_path
|
||||
- kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes,
|
||||
mime_type, mailbox, first_seen_at, ref_count
|
||||
|
||||
Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash
|
||||
se preskoci. --force-recheck znovu overi i uz stazene.
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
python download_attachments_v1.2.py --mailbox ordinace@buzalkova.cz
|
||||
python download_attachments_v1.2.py --mailbox ordinace@buzalkova.cz --limit 50
|
||||
python download_attachments_v1.2.py --mailbox ordinace@buzalkova.cz --force-recheck
|
||||
|
||||
Docker:
|
||||
docker exec -it python-runner python /scripts/download_attachments_v1.2.py \\
|
||||
--mailbox ordinace@buzalkova.cz
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo
|
||||
Python 3.10+
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze
|
||||
1.1 2026-06-02 Schránka jako parametr --mailbox
|
||||
1.2 2026-06-02 Oprava: Graph attachment mapa vcetne inline (fix ERR pri
|
||||
inline obrazcich ulozených jako is_inline=False v MongoDB);
|
||||
normalizace nazvu pro robustni porovnani; preskoceni S/MIME
|
||||
(.p7m/.p7s); pokud Graph oznaci jako inline -> SKIP ne ERR
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
import hashlib
|
||||
import logging
|
||||
import argparse
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from pymongo import MongoClient, UpdateOne
|
||||
|
||||
if hasattr(sys.stdout, "reconfigure"):
|
||||
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
||||
|
||||
# ─── CONFIGURATION ────────────────────────────────────────────────────────────
# SECURITY NOTE(review): the Graph client secret below is hard-coded in a file
# that is committed to version control. It should be rotated and then loaded
# from the environment or a secret store instead of living in the source.
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
GRAPH_URL = "https://graph.microsoft.com/v1.0"

# MongoDB connection; the e-mail collection name is the mailbox address passed
# via --mailbox, the attachment index is one collection shared by all runs.
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
MONGO_COL_INDEX = "attachments_index"

# Attachments are written to EMAILS_BASE_DIR/<mailbox>/Attachments/.
EMAILS_BASE_DIR = Path("/mnt/Emails")
# NOTE(review): the log file name is the same one the parse_emails script
# uses — presumably intentional (shared error log); confirm.
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.2"
# Number of per-e-mail MongoDB updates buffered before a bulk write.
BATCH_SIZE = 50

# Attachment types that are skipped (S/MIME signatures, certificates)
SKIP_EXTENSIONS = {".p7m", ".p7s", ".p7c", ".p7b"}
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
logging.basicConfig(
|
||||
filename=str(LOG_FILE),
|
||||
level=logging.ERROR,
|
||||
format="%(asctime)s | %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
_graph_token: Optional[str] = None
|
||||
|
||||
|
||||
# ─── Graph API ────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_token() -> str:
    """Acquire an app-only Microsoft Graph token and cache it in ``_graph_token``.

    Runs the MSAL client-credentials flow with the module-level tenant id,
    client id and client secret.

    Returns:
        The access token (also stored in the module-level ``_graph_token``).

    Raises:
        RuntimeError: if the MSAL response does not contain ``access_token``.
    """
    global _graph_token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    msal_client = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    response = msal_client.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )
    if "access_token" in response:
        _graph_token = response["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {response}")
|
||||
|
||||
|
||||
def graph_get_bytes(url: str) -> bytes:
    """Download the raw body of a Graph resource.

    Sends an authenticated GET. On HTTP 401 the token is refreshed once and
    the request repeated.

    Args:
        url: Absolute Graph URL to fetch.

    Returns:
        The complete response body.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts were answered with 401.
    """
    global _graph_token
    if not _graph_token:
        get_token()
    for _ in range(2):
        # With stream=True the connection stays checked out until the body is
        # read or the response is closed. The context manager releases it on
        # the 401 and raise_for_status() paths as well, where the body is
        # never consumed.
        with requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            timeout=120,
            stream=True,
        ) as r:
            if r.status_code == 401:
                get_token()
                continue
            r.raise_for_status()
            return r.content
    raise RuntimeError(f"Graph GET bytes failed: {url}")
|
||||
|
||||
|
||||
def graph_get_json(url: str, params: Optional[dict] = None) -> dict:
    """GET a Graph URL and return the decoded JSON body.

    Behaviour on error statuses:
      * 401 - the token is refreshed once and the request repeated; a second
        401 raises ``RuntimeError``.
      * 429 / 503 / 504 - the request is retried up to five times, waiting for
        the server's ``Retry-After`` value (seconds) or, when that header is
        missing or not numeric, an exponential back-off. Each wait is capped
        at 120 s.
      * anything else - ``requests.HTTPError`` via ``raise_for_status()``.

    Args:
        url: Absolute Graph URL.
        params: Optional query parameters.

    Returns:
        The parsed JSON document.
    """
    global _graph_token
    import time  # function-scope import: only the throttling branch needs it

    if not _graph_token:
        get_token()

    max_throttle_retries = 5
    throttled = 0
    token_refreshed = False
    while True:
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=30,
        )
        if r.status_code == 401:
            if token_refreshed:
                break  # a fresh token was rejected too -> give up
            token_refreshed = True
            get_token()
            continue
        if r.status_code in (429, 503, 504) and throttled < max_throttle_retries:
            throttled += 1
            try:
                delay = float(r.headers.get("Retry-After", ""))
            except ValueError:
                # Header absent or given as an HTTP date: back off exponentially.
                delay = float(2 ** throttled)
            time.sleep(min(max(delay, 0.0), 120.0))
            continue
        r.raise_for_status()
        return r.json()
    raise RuntimeError(f"Graph GET json failed: {url}")
|
||||
|
||||
|
||||
def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
    """List ALL attachments of a message (inline ones included).

    Only metadata is requested (id, name, contentType, size, isInline,
    contentId); filtering is left to the caller. Result pages are followed via
    ``@odata.nextLink`` so a paged response is not silently truncated.

    Args:
        mailbox: Mailbox address used in the ``/users/{mailbox}`` path.
        graph_message_id: Graph id of the message.

    Returns:
        The attachment metadata dicts, or ``[]`` if any request failed (the
        failure is written to the error log).
    """
    url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
    params = {"$select": "id,name,contentType,size,isInline,contentId"}
    found: list[dict] = []
    try:
        while url:
            data = graph_get_json(url, params)
            found.extend(data.get("value", []))
            # A nextLink already carries its own query string.
            url = data.get("@odata.nextLink")
            params = None
    except Exception as e:
        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e)
        return []
    return found
|
||||
|
||||
|
||||
def fetch_attachment_content(mailbox: str, graph_message_id: str, attachment_id: str) -> Optional[bytes]:
    """Download the raw bytes of one attachment through its ``/$value`` endpoint.

    Returns:
        The attachment content, or ``None`` when the download failed (the
        failure is written to the error log).
    """
    message_url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}"
    endpoint = f"{message_url}/attachments/{attachment_id}/$value"
    try:
        payload = graph_get_bytes(endpoint)
    except Exception as exc:
        logging.error(
            "fetch_attachment_content failed [msg=%s att=%s]: %s",
            graph_message_id,
            attachment_id,
            exc,
        )
        return None
    return payload
|
||||
|
||||
|
||||
# ─── Pomocné funkce ───────────────────────────────────────────────────────────
|
||||
|
||||
def normalize_name(name: str) -> str:
    """Normalize a file name for tolerant comparison.

    The name is lower-cased, stripped of surrounding whitespace, decomposed
    (NFKD) and stripped of combining marks, and every character that is not a
    word character, ``.`` or ``-`` is replaced by ``_``. ``\\w`` is
    Unicode-aware here, so letters without a decomposition (e.g. non-Latin
    scripts) are kept as they are.

    A missing name (``None``) is treated as the empty string instead of
    raising, because attachment names coming from the API may be null.
    """
    lowered = (name or "").lower().strip()
    decomposed = unicodedata.normalize("NFKD", lowered)
    without_marks = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub(r"[^\w.\-]", "_", without_marks)
|
||||
|
||||
|
||||
def find_graph_att(att_name: str, att_size: int, graph_atts: list[dict]) -> Optional[dict]:
    """Find the Graph attachment that corresponds to a stored attachment record.

    Matching tiers; the first tier that yields a candidate wins:
      1. exact name match
      2. normalized name match (see ``normalize_name``)
      3. the normalized Graph name ends with the last 20 characters of the
         wanted normalized name

    When tier 1 or 2 yields several candidates (the same file name attached
    more than once), the candidate whose ``size`` is closest to ``att_size``
    is chosen; on a tie, or when ``att_size`` is unknown (0/None), the first
    one is returned. The previous separate "name + size within 10 %" tier was
    removed: it tested the same name condition as tier 2 and therefore could
    never be reached.

    Args:
        att_name: File name stored with the e-mail document.
        att_size: Stored size in bytes, 0/None if unknown.
        graph_atts: Attachment metadata dicts as returned by Graph.

    Returns:
        The matching Graph attachment dict, or ``None`` if nothing matches.
    """

    def _closest_by_size(candidates: list[dict]) -> dict:
        # min() returns the first minimal element, so ties keep list order.
        if len(candidates) == 1 or not att_size:
            return candidates[0]
        return min(candidates, key=lambda ga: abs((ga.get("size") or 0) - att_size))

    # 1. Exact name
    exact = [ga for ga in graph_atts if ga.get("name") == att_name]
    if exact:
        return _closest_by_size(exact)

    # Normalization is only needed once the exact tier has failed.
    norm_want = normalize_name(att_name)
    normalized = [(normalize_name(ga.get("name") or ""), ga) for ga in graph_atts]

    # 2. Normalized name
    same_norm = [ga for norm, ga in normalized if norm == norm_want]
    if same_norm:
        return _closest_by_size(same_norm)

    # 3. Partial match on the last 20 characters of the normalized name
    tail = norm_want[-20:]
    if tail:
        for norm, ga in normalized:
            if norm.endswith(tail):
                return ga

    return None
|
||||
|
||||
|
||||
def sha256(data: bytes) -> str:
    """Return the lowercase hexadecimal SHA-256 digest of ``data``."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def safe_filename(name: str, max_bytes: int = 200) -> str:
    """Turn an attachment name from an e-mail into a safe single-component file name.

    Every character that is not alphanumeric and not one of ``._- ()`` is
    replaced by ``_`` (this includes path separators), then surrounding
    whitespace is stripped.

    Additional guards, because the name is untrusted input:
      * an empty result, ``.`` or ``..`` becomes ``"attachment"`` - the dot
        names refer to directories and cannot be written as a file;
      * a result longer than ``max_bytes`` UTF-8 bytes is shortened while a
        short extension (up to 16 bytes) is kept. The default stays below the
        common 255-byte file-name limit so that a ``_N`` collision suffix can
        still be appended.

    Args:
        name: Original attachment name (``None`` is treated as empty).
        max_bytes: Maximum UTF-8 length of the returned name.

    Returns:
        The sanitized file name, never empty.
    """
    cleaned = "".join(
        ch if ch.isalnum() or ch in "._- ()" else "_" for ch in (name or "")
    ).strip()
    if cleaned in ("", ".", ".."):
        return "attachment"
    if len(cleaned.encode("utf-8")) <= max_bytes:
        return cleaned

    stem, dot, ext = cleaned.rpartition(".")
    if dot and stem and len(ext.encode("utf-8")) <= 16:
        tail = dot + ext
    else:
        # No usable extension: shorten the whole name.
        stem, tail = cleaned, ""
    budget = max(max_bytes - len(tail.encode("utf-8")), 1)
    # errors="ignore" drops a multi-byte character that the slice cut in half.
    stem = stem.encode("utf-8")[:budget].decode("utf-8", errors="ignore").rstrip()
    return (stem + tail) or "attachment"
|
||||
|
||||
|
||||
def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
    """Pick the file name under which content with ``hash_val`` is stored.

    ``desired_name`` is returned when the index has no entry with that file
    name, or the entry belongs to the same hash. Otherwise ``<stem>_2<suffix>``,
    ``<stem>_3<suffix>`` ... are tried; a candidate is accepted when the index
    maps it to the same hash, or when it is neither in the index nor present
    in ``att_dir``.

    Args:
        desired_name: Sanitized file name the caller would like to use.
        att_dir: Directory the attachments are written to.
        hash_val: Content hash of the file being stored.
        col_index: Index collection; only ``find_one({"filename": ...})`` is used.

    Returns:
        The file name to use (name only, no directory).
    """
    owner = col_index.find_one({"filename": desired_name})
    if not owner or owner["_id"] == hash_val:
        return desired_name

    base = Path(desired_name)
    counter = 1
    while True:
        counter += 1
        candidate = f"{base.stem}_{counter}{base.suffix}"
        taken_by = col_index.find_one({"filename": candidate})
        if taken_by:
            if taken_by["_id"] == hash_val:
                return candidate
            # Name belongs to different content: try the next number.
            continue
        if not (att_dir / candidate).exists():
            return candidate
|
||||
|
||||
|
||||
def save_attachment(
    content: bytes,
    original_name: str,
    mime_type: str,
    mailbox: str,
    att_dir: Path,
    col_index,
) -> tuple[str, str, bool]:
    """Store attachment content on disk, de-duplicated by SHA-256 hash.

    Known hash: the index entry's ``ref_count`` is incremented and its
    ``local_path`` returned. If the file it points to is missing from
    ``att_dir`` it is written again from ``content``, so a re-check run
    repairs files that disappeared from disk.

    New hash: the content is written under a collision-free name and a new
    index document is inserted.

    NOTE(review): the index is keyed by hash only and ``local_path`` is a bare
    file name, while ``att_dir`` depends on the mailbox. The restore step
    above also covers a hash first seen under a different ``att_dir``; confirm
    whether a per-mailbox index was intended.
    NOTE(review): ``ref_count`` is incremented on every call for a known hash,
    including repeated runs over the same e-mail.

    Args:
        content: Raw attachment bytes.
        original_name: Attachment name as stored with the e-mail.
        mime_type: MIME type recorded in the index.
        mailbox: Mailbox recorded in the index.
        att_dir: Target directory (must exist).
        col_index: Attachment index collection.

    Returns:
        ``(hash, local_path, was_new)`` where ``local_path`` is the file name
        inside ``att_dir`` and ``was_new`` tells whether a new index entry was
        created.
    """
    hash_val = sha256(content)

    existing = col_index.find_one({"_id": hash_val})
    if existing:
        known_path = att_dir / existing["local_path"]
        if not known_path.exists():
            # Indexed but gone from this directory: restore from the fresh download.
            known_path.write_bytes(content)
        col_index.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}})
        return hash_val, existing["local_path"], False

    filename = resolve_filename(safe_filename(original_name), att_dir, hash_val, col_index)
    file_path = att_dir / filename
    file_path.write_bytes(content)

    col_index.insert_one({
        "_id": hash_val,
        "filename": filename,
        "local_path": filename,
        "size_bytes": len(content),
        "mime_type": mime_type,
        "mailbox": mailbox,
        # stored as naive UTC
        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
        "ref_count": 1,
    })

    return hash_val, filename, True
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Download the real (non-inline) attachments of all e-mails of one mailbox.

    Flow:
      1. parse the CLI arguments, create the target directory;
      2. authenticate against Graph and ping MongoDB (exit code 1 on failure);
      3. select the e-mails to process - all with ``has_attachments`` when
         ``--force-recheck`` is given, otherwise only those that still have a
         non-inline attachment without ``file_hash``;
      4. per e-mail: list its Graph attachments, match every stored attachment
         record, download and store it, then queue one update of the
         ``attachments`` array;
      5. print a summary.

    Queued updates are flushed in a ``finally`` block and the MongoDB client is
    always closed. The files and index entries are written immediately by
    ``save_attachment``; flushing on interruption keeps the e-mail documents
    in step with them, so a later run does not process those e-mails again.
    """
    ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
    ap.add_argument("--mailbox", required=True,
                    help="Emailova schranka (napr. ordinace@buzalkova.cz)")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N emailu (0 = vse)")
    ap.add_argument("--force-recheck", action="store_true",
                    help="Znovu overi i emaily kde prilohy uz maji file_hash")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na attachments_index kolekci")
    args = ap.parse_args()

    mailbox = args.mailbox
    att_dir = EMAILS_BASE_DIR / mailbox / "Attachments"
    mongo_col = mailbox  # the e-mail collection is named after the mailbox

    start = datetime.now()
    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Schránka: {mailbox}")
    print(f"Cilovy adresar: {att_dir}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}")

    att_dir.mkdir(parents=True, exist_ok=True)
    print(" Adresar OK")

    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        try:
            client.admin.command("ping")
            print(" MongoDB OK")
        except Exception as e:
            print(f" CHYBA: MongoDB neni dostupna -- {e}")
            sys.exit(1)

        col_emails = client[MONGO_DB][mongo_col]
        col_index = client[MONGO_DB][MONGO_COL_INDEX]

        if not args.no_indexes:
            col_index.create_index("filename")
            col_index.create_index("mime_type")
            col_index.create_index("mailbox")

        if args.force_recheck:
            query = {"has_attachments": True}
        else:
            query = {
                "has_attachments": True,
                "attachments": {
                    "$elemMatch": {
                        "is_inline": False,
                        "file_hash": {"$exists": False},
                    }
                }
            }

        total = col_emails.count_documents(query)
        print(f"\nEmailu ke zpracovani: {total}")
        if total == 0:
            print("Neni co stahnout.")
            return

        cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
        if args.limit:
            cursor = cursor.limit(args.limit)

        ok_count = 0
        new_count = 0
        dup_count = 0
        skip_count = 0
        err_count = 0
        email_i = 0
        batch = []

        def flush():
            """Write the queued e-mail updates; failures are logged, not raised."""
            if not batch:
                return
            try:
                col_emails.bulk_write(batch, ordered=False)
            except Exception as e:
                logging.error("bulk_write: %s", e)
                print(f" CHYBA bulk_write: {e}")
            batch.clear()

        try:
            for email_doc in cursor:
                email_i += 1
                email_id = email_doc["_id"]
                graph_id = email_doc.get("graph_id", "")
                subject = (email_doc.get("subject") or "")[:60]
                att_list = email_doc.get("attachments") or []

                real_atts = [a for a in att_list if not a.get("is_inline", False)]
                if not real_atts:
                    continue

                print(f"\n {email_i:>5}/{total} {subject}")

                # Load ALL attachments from Graph (inline ones too - they are
                # needed for matching).
                graph_atts = fetch_message_attachments(mailbox, graph_id)

                updated_atts = list(att_list)
                email_ok = True

                for i, att in enumerate(updated_atts):
                    if att.get("is_inline", False):
                        continue
                    if not args.force_recheck and att.get("file_hash"):
                        continue

                    att_name = att.get("filename", "")
                    att_size = att.get("size_bytes", 0)

                    # Skip S/MIME signatures
                    if Path(att_name).suffix.lower() in SKIP_EXTENSIONS:
                        updated_atts[i] = {**att, "file_hash": "skip", "local_path": ""}
                        skip_count += 1
                        print(f" SKIP {att_name} (S/MIME)")
                        continue

                    # Find the attachment in the Graph listing
                    graph_att = find_graph_att(att_name, att_size, graph_atts)

                    if not graph_att:
                        logging.error("attachment not found [email=%s att=%s]", email_id, att_name)
                        print(f" ERR {att_name} (nenalezeno)")
                        err_count += 1
                        email_ok = False
                        continue

                    # Graph says it is inline - mark it and do not download
                    if graph_att.get("isInline", False):
                        updated_atts[i] = {**att, "is_inline": True, "file_hash": "skip", "local_path": ""}
                        skip_count += 1
                        print(f" SKIP {att_name} (inline obrazek)")
                        continue

                    content = fetch_attachment_content(mailbox, graph_id, graph_att["id"])
                    if content is None:
                        err_count += 1
                        email_ok = False
                        print(f" ERR {att_name} (stazeni selhalo)")
                        continue

                    mime_type = att.get("mime_type") or graph_att.get("contentType", "")
                    hash_val, local_path, was_new = save_attachment(
                        content, att_name, mime_type, mailbox, att_dir, col_index
                    )

                    updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}

                    if was_new:
                        new_count += 1
                        print(f" NEW {local_path} ({len(content):,} B)")
                    else:
                        dup_count += 1
                        print(f" DUP {att_name} -> {local_path}")

                if email_ok:
                    ok_count += 1

                batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))

                if len(batch) >= BATCH_SIZE:
                    flush()

                if email_i % 100 == 0:
                    print(f" {'─'*60}")
                    print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} skip={skip_count} err={err_count}")
                    print(f" {'─'*60}")
        finally:
            # Also runs on Ctrl+C or an unexpected error, so work that is
            # already on disk is recorded in the e-mail documents.
            flush()

        elapsed_total = (datetime.now() - start).total_seconds()
        files_total = col_index.count_documents({})
        size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))

        print(f"\n{'='*52}")
        print(f"Vysledek: emaily={ok_count} | nove={new_count} | dup={dup_count} | skip={skip_count} | err={err_count}")
        print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)")
        print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
        print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        if err_count:
            print(f"Chyby logovany do: {LOG_FILE}")
    finally:
        client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,560 @@
|
||||
"""
|
||||
parse_emails_graph_v1.0.py
|
||||
Nazev: parse_emails_graph_v1.0.py
|
||||
Verze: 1.0
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Cte vsechny emaily ze schranky ordinace@buzalkova.cz primo pres
|
||||
Microsoft Graph API a importuje je jako dokumenty do MongoDB.
|
||||
Z kazde zpravy extrahuje vsechny dostupne vlastnosti:
|
||||
|
||||
- predmet, odesilatel, prijemci (To/CC/BCC s typy)
|
||||
- cas doruceni, odeslani, vytvoreni, modifikace (UTC)
|
||||
- telo HTML (max 2 MB) + textovy preview
|
||||
- prilohy (metadata: jmeno, velikost, MIME typ, inline flag)
|
||||
- internet headers (SPF, DKIM, Received, X-*, ...)
|
||||
- MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno,
|
||||
kategorie, In-Reply-To, References, ...
|
||||
- navic: isRead, isDraft, folder_path, inferenceClassification
|
||||
|
||||
Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted,
|
||||
archivni slozky, ...).
|
||||
|
||||
DB: emaily
|
||||
Kolekce: ordinace@buzalkova.cz
|
||||
_id: Internet Message-ID (nebo "graphid:<id>" jako fallback)
|
||||
|
||||
Bezpecne prerusit a opakovat:
|
||||
- upsert podle _id — duplicity se automaticky prepisi
|
||||
- --skip-existing nacte seznam hotovych _id z MongoDB a preskoci je
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
python parse_emails_graph_v1.0.py # kompletni import
|
||||
python parse_emails_graph_v1.0.py --limit 50 # test na prvnich 50
|
||||
python parse_emails_graph_v1.0.py --skip-existing # pokracovani po preruseni
|
||||
python parse_emails_graph_v1.0.py --folder Inbox # jen jedna slozka
|
||||
python parse_emails_graph_v1.0.py --no-indexes # bez indexu na konci
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo, python-dateutil
|
||||
Python 3.10+
|
||||
|
||||
Struktura dokumentu v MongoDB:
|
||||
_id Internet Message-ID (nebo graphid: fallback)
|
||||
graph_id Graph API message ID (pro pripadne dalsi operace)
|
||||
subject predmet zpravy
|
||||
normalized_subject predmet bez RE:/FW:/AW: prefixu
|
||||
importance 0=nizka 1=normalni 2=vysoka
|
||||
flag_status 0=bez priznaku 1=oznaceno 2=dokonceno
|
||||
is_read bool — aktualni stav precteni ve schrance
|
||||
is_draft bool
|
||||
has_attachments bool
|
||||
attachment_count int
|
||||
inference_classification focused / other (Outlook AI trideni)
|
||||
categories [str]
|
||||
conversation_id Graph conversationId
|
||||
conversation_index base64 conversationIndex
|
||||
conversation_topic tema vlakna (z internet headers Thread-Topic)
|
||||
in_reply_to Message-ID predchozi zpravy
|
||||
internet_references [Message-ID] — cela historie vlakna
|
||||
received_at datetime UTC
|
||||
sent_at datetime UTC
|
||||
created_at datetime UTC — cas vytvoreni zaznamu v M365
|
||||
modified_at datetime UTC — cas posledni modifikace
|
||||
folder_id Graph parentFolderId
|
||||
folder_path cela cesta slozky (napr. Inbox/Subfolder)
|
||||
sender.email emailova adresa odesilatele
|
||||
sender.name zobrazovane jmeno odesilatele
|
||||
to retezec To (joined)
|
||||
cc retezec CC
|
||||
bcc retezec BCC
|
||||
recipients [{type, email, name}] — to/cc/bcc s typy
|
||||
body_html HTML telo (max 2 MB)
|
||||
body_preview textovy nahled (max 255 znaku z Graph)
|
||||
attachments [{filename, size_bytes, mime_type,
|
||||
content_id, is_inline}]
|
||||
headers dict internet headers (lowercase_s_podtrzitky)
|
||||
parsed_at datetime UTC — cas parsovani
|
||||
|
||||
Indexy:
|
||||
received_at, sent_at, sender.email, graph_id (unique),
|
||||
conversation_id, folder_path, has_attachments, categories,
|
||||
importance, flag_status, is_read,
|
||||
text_search (subject + body_preview + to + cc)
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze — Graph API jako zdroj
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
import logging
|
||||
import argparse
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from dateutil import parser as dtparser
|
||||
from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT
|
||||
|
||||
if hasattr(sys.stdout, "reconfigure"):
|
||||
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
||||
|
||||
# ─── CONFIGURATION ────────────────────────────────────────────────────────────
# SECURITY NOTE(review): the Graph client secret below is hard-coded in a file
# that is committed to version control. It should be rotated and then loaded
# from the environment or a secret store instead of living in the source.
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
GRAPH_MAILBOX = "ordinace@buzalkova.cz"
GRAPH_URL = "https://graph.microsoft.com/v1.0"

# Target database and collection (the collection is named after the mailbox).
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
MONGO_COL = "ordinace@buzalkova.cz"
# PAGE_SIZE is sent to Graph as $top when listing messages.
# NOTE(review): BATCH_SIZE is presumably the MongoDB bulk-write size — its use
# lies outside this view; confirm.
BATCH_SIZE = 100
PAGE_SIZE = 50
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.0"
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
logging.basicConfig(
|
||||
filename=str(LOG_FILE),
|
||||
level=logging.ERROR,
|
||||
format="%(asctime)s | %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
IMPORTANCE_MAP = {"low": 0, "normal": 1, "high": 2}
|
||||
FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2}
|
||||
RE_SUBJECT = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE)
|
||||
|
||||
MSG_SELECT = (
|
||||
"id,internetMessageId,subject,bodyPreview,body,"
|
||||
"importance,isRead,isDraft,hasAttachments,"
|
||||
"receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
|
||||
"sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
|
||||
"conversationId,conversationIndex,parentFolderId,"
|
||||
"categories,flag,inferenceClassification,internetMessageHeaders"
|
||||
)
|
||||
|
||||
|
||||
# ─── Graph API helpers ────────────────────────────────────────────────────────
|
||||
|
||||
_graph_token: Optional[str] = None
|
||||
|
||||
|
||||
def get_token() -> str:
    """Fetch a Graph access token via client credentials and cache it globally.

    Returns:
        The access token, which is also stored in ``_graph_token``.

    Raises:
        RuntimeError: when MSAL returns no ``access_token``.
    """
    global _graph_token
    confidential_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
        client_credential=GRAPH_CLIENT_SECRET,
    )
    default_scope = ["https://graph.microsoft.com/.default"]
    outcome = confidential_app.acquire_token_for_client(scopes=default_scope)
    if "access_token" not in outcome:
        raise RuntimeError(f"Graph auth failed: {outcome}")
    token = outcome["access_token"]
    _graph_token = token
    return token
|
||||
|
||||
|
||||
def graph_get(url: str, params: Optional[dict] = None) -> dict:
    """GET a Graph URL and return the decoded JSON body.

    Behaviour on error statuses:
      * 401 - the token is refreshed once and the request repeated; a second
        401 raises ``RuntimeError``.
      * 429 / 503 / 504 - the request is retried up to five times, waiting for
        the server's ``Retry-After`` value (seconds) or, when that header is
        missing or not numeric, an exponential back-off. Each wait is capped
        at 120 s. Without this a single throttled page would abort a whole
        mailbox crawl.
      * anything else - ``requests.HTTPError`` via ``raise_for_status()``.

    Args:
        url: Absolute Graph URL (may be an ``@odata.nextLink``).
        params: Optional query parameters.

    Returns:
        The parsed JSON document.
    """
    global _graph_token
    import time  # function-scope import: only the throttling branch needs it

    if not _graph_token:
        get_token()

    max_throttle_retries = 5
    throttled = 0
    token_refreshed = False
    while True:
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=30,
        )
        if r.status_code == 401:
            if token_refreshed:
                break  # a fresh token was rejected too -> give up
            token_refreshed = True
            get_token()
            continue
        if r.status_code in (429, 503, 504) and throttled < max_throttle_retries:
            throttled += 1
            try:
                delay = float(r.headers.get("Retry-After", ""))
            except ValueError:
                # Header absent or given as an HTTP date: back off exponentially.
                delay = float(2 ** throttled)
            time.sleep(min(max(delay, 0.0), 120.0))
            continue
        r.raise_for_status()
        return r.json()
    raise RuntimeError(f"Graph GET failed after retry: {url}")
|
||||
|
||||
|
||||
def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]:
    """Recursively list the mail folders of the configured mailbox.

    Args:
        parent_id: Folder whose children are listed; ``None`` lists the
            top-level folders.
        parent_path: Path of the parent folder, used as prefix.

    Returns:
        ``[{"id": ..., "path": ...}]`` with every folder followed directly by
        its descendants.
    """
    root = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders"
    next_url = root if parent_id is None else f"{root}/{parent_id}/childFolders"
    query = {"$top": 100, "$select": "id,displayName,childFolderCount"}

    collected = []
    while next_url:
        page = graph_get(next_url, query)
        for entry in page.get("value", []):
            full_path = f"{parent_path}/{entry['displayName']}".lstrip("/")
            collected.append({"id": entry["id"], "path": full_path})
            if entry.get("childFolderCount", 0) > 0:
                collected += get_all_folders(entry["id"], full_path)
        # The nextLink carries its own query string, so params are dropped.
        next_url, query = page.get("@odata.nextLink"), None
    return collected
|
||||
|
||||
|
||||
def iter_folder_messages(folder_id: str):
    """Yield the messages of one folder, fetching them page by page.

    Each page requests ``PAGE_SIZE`` messages with the fields in
    ``MSG_SELECT`` and the attachments expanded.

    NOTE(review): ``$expand=attachments`` presumably also transfers the
    attachment content although only metadata is used downstream; confirm and
    consider narrowing the expand.
    """
    query = {"$top": PAGE_SIZE, "$select": MSG_SELECT, "$expand": "attachments"}
    next_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages"
    while next_url:
        page = graph_get(next_url, query)
        yield from page.get("value", [])
        next_url = page.get("@odata.nextLink")
        query = None  # the nextLink already contains the query string
|
||||
|
||||
|
||||
# ─── Pomocné funkce ───────────────────────────────────────────────────────────
|
||||
|
||||
def parse_date(raw) -> Optional[datetime]:
    """Convert a date value to a naive ``datetime`` expressed in UTC.

    ``None`` stays ``None``. A ``datetime`` is used as is; anything else is
    converted with ``str()`` and parsed by dateutil. Timezone-aware values are
    shifted to UTC and their tzinfo dropped; naive values are returned
    unchanged. A value that cannot be parsed yields ``None``.
    """

    def _naive_utc(moment: datetime) -> datetime:
        if moment.tzinfo:
            return moment.astimezone(timezone.utc).replace(tzinfo=None)
        return moment

    if raw is None:
        return None
    if isinstance(raw, datetime):
        return _naive_utc(raw)
    try:
        return _naive_utc(dtparser.parse(str(raw)))
    except Exception:
        return None
|
||||
|
||||
|
||||
def normalize_subject(subject: str) -> str:
    """Strip all leading reply/forward prefixes matched by ``RE_SUBJECT``.

    Prefixes are removed repeatedly, so stacked ones ("RE: FW: ...") disappear
    completely; surrounding whitespace is trimmed after every step.
    """
    remainder = subject.strip()
    while (prefix := RE_SUBJECT.match(remainder)) is not None:
        remainder = remainder[prefix.end():].strip()
    return remainder
|
||||
|
||||
|
||||
def parse_headers(raw_headers: list) -> dict:
    """Collapse a list of ``{"name", "value"}`` headers into one dict.

    Keys are the header names lower-cased with ``-`` replaced by ``_``. A
    header that occurs once maps to its value; a repeated header maps to the
    list of its values in order of appearance.
    """
    merged = {}
    for header in raw_headers:
        key = header["name"].lower().replace("-", "_")
        value = header["value"]
        if key not in merged:
            merged[key] = value
            continue
        previous = merged[key]
        if isinstance(previous, list):
            previous.append(value)
        else:
            # Second occurrence: promote the single value to a list.
            merged[key] = [previous, value]
    return merged
|
||||
|
||||
|
||||
def format_recipients(lst: list) -> str:
    """Render Graph recipients as ``"Name <address>; Name <address>"``.

    A recipient without a name is rendered as ``"<address>"``. An empty list
    gives an empty string.
    """
    rendered = []
    for recipient in lst:
        address_info = recipient["emailAddress"]
        display = address_info.get("name", "")
        address = address_info.get("address", "")
        rendered.append(f"{display} <{address}>".strip())
    return "; ".join(rendered)
|
||||
|
||||
|
||||
# ─── Hlavní extrakce ─────────────────────────────────────────────────────────
|
||||
|
||||
def extract_message(msg: dict, folder_path: str) -> Optional[dict]:
    """Convert one Graph message resource into the MongoDB document.

    Fields that Graph may deliver as explicit ``null`` (``body``, the
    recipient lists, ``emailAddress`` of the sender or of a recipient in the
    structured list) are treated like missing ones, so such a message is
    still imported instead of being dropped by the catch-all handler.

    Args:
        msg: Message dict as returned by Graph (with expanded attachments).
        folder_path: Path of the folder the message was read from.

    Returns:
        The document to upsert, or ``None`` if extraction failed (the failure
        is written to the error log).
    """
    max_body_chars = 2 * 1024 * 1024

    def _recipient_rows(kind: str, entries: list) -> list[dict]:
        # One {"type", "email", "name"} row per recipient of the given kind.
        rows = []
        for entry in entries:
            ea = entry.get("emailAddress") or {}
            rows.append({"type": kind, "email": ea.get("address", ""), "name": ea.get("name", "")})
        return rows

    def _first(value):
        # A repeated header is a list; only its first value is kept.
        return value[0] if isinstance(value, list) else value

    try:
        # _id: Internet Message-ID, falling back to the Graph id
        mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"

        subject = msg.get("subject") or ""
        norm_subject = normalize_subject(subject)

        # body
        body_html = None
        body_preview = msg.get("bodyPreview") or ""
        body = msg.get("body") or {}
        body_type = body.get("contentType")
        if body_type == "html":
            # the limit counts characters, not encoded bytes
            body_html = (body.get("content") or "")[:max_body_chars]
        elif body_type == "text":
            body_preview = (body.get("content") or "")[:2000]

        # sender: "from" is preferred over "sender"
        sender_ea = (msg.get("from") or msg.get("sender") or {}).get("emailAddress") or {}
        sender_email = sender_ea.get("address", "")
        sender_name = sender_ea.get("name", "")

        # recipients
        to_list = msg.get("toRecipients") or []
        cc_list = msg.get("ccRecipients") or []
        bcc_list = msg.get("bccRecipients") or []
        recipients = (
            _recipient_rows("to", to_list)
            + _recipient_rows("cc", cc_list)
            + _recipient_rows("bcc", bcc_list)
        )

        # flags
        importance = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1)
        flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0)

        # internet headers
        raw_headers = msg.get("internetMessageHeaders") or []
        headers = parse_headers(raw_headers)

        in_reply_to = _first(headers.get("in_reply_to", ""))

        refs_raw = headers.get("references", "")
        if isinstance(refs_raw, list):
            refs_raw = " ".join(refs_raw)
        # str.split() already yields stripped, non-empty tokens
        internet_refs = refs_raw.split() if refs_raw else []

        conv_topic = _first(headers.get("thread_topic", ""))

        # conversation index: re-encoded base64, raw value if it does not decode
        conv_index = ""
        ci_raw = msg.get("conversationIndex")
        if ci_raw:
            try:
                conv_index = base64.b64encode(base64.b64decode(ci_raw)).decode()
            except Exception:
                conv_index = ci_raw

        # attachments (metadata only, no content); unnamed ones are skipped
        attachments = []
        for att in msg.get("attachments") or []:
            fname = att.get("name") or ""
            if not fname:
                continue
            attachments.append({
                "filename": fname,
                "size_bytes": att.get("size", 0),
                "mime_type": att.get("contentType", "application/octet-stream"),
                "content_id": att.get("contentId"),
                "is_inline": att.get("isInline", False),
            })

        return {
            "_id": mid,
            "graph_id": msg["id"],

            "subject": subject,
            "normalized_subject": norm_subject,
            "importance": importance,
            "flag_status": flag_status,
            "is_read": msg.get("isRead", False),
            "is_draft": msg.get("isDraft", False),
            "has_attachments": msg.get("hasAttachments", False),
            "attachment_count": len(attachments),
            "inference_classification": msg.get("inferenceClassification", ""),
            "categories": msg.get("categories") or [],

            "conversation_id": msg.get("conversationId", ""),
            "conversation_index": conv_index,
            "conversation_topic": conv_topic,
            "in_reply_to": in_reply_to,
            "internet_references": internet_refs,

            "received_at": parse_date(msg.get("receivedDateTime")),
            "sent_at": parse_date(msg.get("sentDateTime")),
            "created_at": parse_date(msg.get("createdDateTime")),
            "modified_at": parse_date(msg.get("lastModifiedDateTime")),

            "folder_id": msg.get("parentFolderId", ""),
            "folder_path": folder_path,

            "sender": {
                "email": sender_email,
                "name": sender_name,
            },
            "to": format_recipients(to_list),
            "cc": format_recipients(cc_list),
            "bcc": format_recipients(bcc_list),
            "recipients": recipients,

            "body_html": body_html,
            "body_preview": body_preview,

            "attachments": attachments,
            "headers": headers,

            "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
        }

    except Exception as e:
        # Best effort: one malformed message must not stop the import.
        logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e)
        return None
|
||||
|
||||
|
||||
# ─── MongoDB indexy ───────────────────────────────────────────────────────────
|
||||
|
||||
def create_indexes(col):
    """Create the query indexes on the e-mail collection.

    Issues one ascending single-field index per entry of the table below
    (``graph_id`` additionally unique and sparse) followed by one compound
    text index named ``text_search``.

    Args:
        col: Collection object exposing ``create_index``.
    """
    print(" Vytvarim indexy...")

    # (field, extra create_index keyword arguments), in creation order.
    ascending_specs = (
        ("received_at", {}),
        ("sent_at", {}),
        ("sender.email", {}),
        ("graph_id", {"unique": True, "sparse": True}),
        ("conversation_id", {}),
        ("folder_path", {}),
        ("has_attachments", {}),
        ("categories", {}),
        ("importance", {}),
        ("flag_status", {}),
        ("is_read", {}),
    )
    for field_name, extra in ascending_specs:
        col.create_index([(field_name, ASCENDING)], **extra)

    text_fields = ("subject", "body_preview", "to", "cc")
    col.create_index(
        [(field_name, TEXT) for field_name in text_fields],
        name="text_search",
        default_language="none",
    )

    print(" Indexy hotovy.")
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Command-line entry point: import the mailbox into MongoDB.

    Parses ``--limit``, ``--skip-existing``, ``--folder`` and
    ``--no-indexes``, checks that Graph and MongoDB are reachable (exits with
    status 1 otherwise), walks the selected folders, upserts every extracted
    message in batches of ``BATCH_SIZE``, prints a summary and, unless
    disabled, creates the indexes.

    The pending batch is flushed and the MongoDB client closed even when the
    run is aborted by an exception or Ctrl+C, so already parsed messages are
    not lost and a later ``--skip-existing`` run can resume.
    """
    ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N zprav (0 = vse)")
    ap.add_argument("--skip-existing", action="store_true",
                    help="Preskocit zpravy ktere jiz jsou v MongoDB")
    ap.add_argument("--folder", default="",
                    help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na konci")
    args = ap.parse_args()

    start = datetime.now()
    print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Schránka: {GRAPH_MAILBOX}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{MONGO_COL}")

    # Graph token: fail fast before touching the database.
    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    # MongoDB
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        try:
            client.admin.command("ping")
            print(" MongoDB OK")
        except Exception as e:
            print(f" CHYBA: MongoDB neni dostupna -- {e}")
            sys.exit(1)
        col = client[MONGO_DB][MONGO_COL]

        # Skip existing
        existing: set = set()
        if args.skip_existing:
            print(" Nacitam existujici zaznamy z MongoDB...")
            # Stream the ids through a projected cursor: a distinct() result
            # has to fit into a single BSON document, which a large mailbox
            # can exceed.
            existing = {d["_id"] for d in col.find({}, {"_id": 1})}
            print(f" {len(existing)} jiz importovano")

        # Folders
        print("\nNacitam seznam slozek...")
        all_folders = get_all_folders()
        if args.folder:
            # Case-insensitive substring match on the full folder path.
            all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()]
        print(f" Slozek ke zpracovani: {len(all_folders)}")
        for f in all_folders:
            print(f" {f['path']}")

        # Import
        batch = []
        ok_count = 0
        err_count = 0
        skip_count = 0
        total_i = 0

        def flush():
            """Write the pending upserts; log (do not raise) on failure."""
            if not batch:
                return
            try:
                col.bulk_write(batch, ordered=False)
            except Exception as e:
                logging.error("bulk_write: %s", e)
                print(f" CHYBA bulk_write: {e}")
            batch.clear()

        print()
        try:
            for folder in all_folders:
                print(f"--- Složka: {folder['path']} ---")
                folder_count = 0

                for msg in iter_folder_messages(folder["id"]):
                    if args.limit and total_i >= args.limit:
                        break

                    mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"

                    if mid in existing:
                        # Skipped messages still count towards --limit.
                        skip_count += 1
                        total_i += 1
                        continue

                    doc = extract_message(msg, folder["path"])
                    total_i += 1
                    folder_count += 1

                    if doc is None:
                        err_count += 1
                    else:
                        batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
                        ok_count += 1

                    if len(batch) >= BATCH_SIZE:
                        flush()

                    status = "ERR " if doc is None else "OK "
                    subject_str = (doc.get("subject") or "")[:60] if doc else "?"
                    sender_str = (doc.get("sender", {}).get("email") or "")[:40] if doc else "?"
                    print(f" {total_i:>6} {status} {subject_str:<60} {sender_str}")

                    if total_i % 500 == 0:
                        elapsed = (datetime.now() - start).total_seconds()
                        rate = total_i / elapsed if elapsed > 0 else 0
                        print(f" {'─'*80}")
                        print(f" Průběh: ok={ok_count} skip={skip_count} err={err_count} {rate:.1f} msg/s")
                        print(f" {'─'*80}")

                flush()
                print(f" → {folder_count} zprav ze slozky {folder['path']}")

                if args.limit and total_i >= args.limit:
                    break
        finally:
            # A Graph error or Ctrl+C in the middle of a folder must not drop
            # the messages that were already parsed.
            flush()

        elapsed_total = (datetime.now() - start).total_seconds()
        print(f"\n{'='*52}")
        print(f"Vysledek: ok={ok_count} | skip={skip_count} | err={err_count}")
        print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
        print(f"Dokumentu v kolekci: {col.count_documents({})}")

        if not args.no_indexes:
            print()
            create_indexes(col)

        print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        if err_count:
            print(f"Chyby logovany do: {LOG_FILE}")
    finally:
        # Release the connection pool on every exit path, including sys.exit.
        client.close()


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,605 @@
|
||||
"""
|
||||
parse_emails_graph_v1.1.py
|
||||
Nazev: parse_emails_graph_v1.1.py
|
||||
Verze: 1.1
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Cte vsechny emaily ze schranky ordinace@buzalkova.cz primo pres
|
||||
Microsoft Graph API a importuje je jako dokumenty do MongoDB.
|
||||
Z kazde zpravy extrahuje vsechny dostupne vlastnosti:
|
||||
|
||||
- predmet, odesilatel, prijemci (To/CC/BCC s typy)
|
||||
- cas doruceni, odeslani, vytvoreni, modifikace (UTC)
|
||||
- telo HTML (max 2 MB) + textovy preview
|
||||
- prilohy (metadata: jmeno, velikost, MIME typ, inline flag)
|
||||
- internet headers (SPF, DKIM, Received, X-*, ...)
|
||||
- MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno,
|
||||
kategorie, In-Reply-To, References, ...
|
||||
- navic: isRead, isDraft, folder_path, inferenceClassification
|
||||
|
||||
Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted,
|
||||
archivni slozky, ...).
|
||||
|
||||
DB: emaily
|
||||
Kolekce: ordinace@buzalkova.cz
|
||||
_id: Internet Message-ID (nebo "graphid:<id>" jako fallback)
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
# Prvni import (vsechno):
|
||||
python parse_emails_graph_v1.1.py
|
||||
|
||||
# Test na prvnich 50:
|
||||
python parse_emails_graph_v1.1.py --limit 50 --no-indexes
|
||||
|
||||
# Jen jedna slozka:
|
||||
python parse_emails_graph_v1.1.py --folder Inbox
|
||||
|
||||
# Pokracovani po preruseni (pouze nove):
|
||||
python parse_emails_graph_v1.1.py --mode new-only
|
||||
|
||||
# Pravidelny sync (aktualizuje is_read, flag, slozku; importuje nove):
|
||||
python parse_emails_graph_v1.1.py --mode sync
|
||||
|
||||
# Plny reimport vsech dat:
|
||||
python parse_emails_graph_v1.1.py --mode full
|
||||
|
||||
Rezimy (--mode):
|
||||
full Plny upsert vsech poli pro kazdou zpravu (vychozi)
|
||||
new-only Preskoci zpravy ktere uz jsou v MongoDB, importuje jen nove
|
||||
sync Existujici: aktualizuje jen is_read/flag_status/categories/
|
||||
modified_at/folder_path/folder_id/importance/is_draft. Nove zpravy importuje cele.
|
||||
Idealni pro pravidelne spousteni.
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo, python-dateutil
|
||||
Python 3.10+
|
||||
|
||||
Struktura dokumentu v MongoDB:
|
||||
_id Internet Message-ID (nebo graphid: fallback)
|
||||
graph_id Graph API message ID
|
||||
subject predmet zpravy
|
||||
normalized_subject predmet bez RE:/FW:/AW: prefixu
|
||||
importance 0=nizka 1=normalni 2=vysoka
|
||||
flag_status 0=bez priznaku 1=oznaceno 2=dokonceno
|
||||
is_read bool — aktualni stav precteni ve schrance
|
||||
is_draft bool
|
||||
has_attachments bool
|
||||
attachment_count int
|
||||
inference_classification focused / other
|
||||
categories [str]
|
||||
conversation_id Graph conversationId
|
||||
conversation_index base64 conversationIndex
|
||||
conversation_topic tema vlakna (z internet headers Thread-Topic)
|
||||
in_reply_to Message-ID predchozi zpravy
|
||||
internet_references [Message-ID]
|
||||
received_at datetime UTC
|
||||
sent_at datetime UTC
|
||||
created_at datetime UTC
|
||||
modified_at datetime UTC
|
||||
folder_id Graph parentFolderId
|
||||
folder_path cela cesta slozky (napr. Inbox/Subfolder)
|
||||
sender.email emailova adresa odesilatele
|
||||
sender.name zobrazovane jmeno
|
||||
to retezec To (joined)
|
||||
cc retezec CC
|
||||
bcc retezec BCC
|
||||
recipients [{type, email, name}]
|
||||
body_html HTML telo (max 2 MB)
|
||||
body_preview textovy nahled (max 255 znaku; u textovych zprav prvnich 2000 znaku tela)
|
||||
attachments [{filename, size_bytes, mime_type, content_id, is_inline}]
|
||||
headers dict internet headers
|
||||
parsed_at datetime UTC
|
||||
|
||||
Indexy:
|
||||
received_at, sent_at, sender.email, graph_id (unique),
|
||||
conversation_id, folder_path, has_attachments, categories,
|
||||
importance, flag_status, is_read,
|
||||
text_search (subject + body_preview + to + cc)
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze
|
||||
1.1 2026-06-02 Pridany rezimy --mode full/new-only/sync;
|
||||
odstranen --skip-existing (nahrazen --mode new-only)
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
import logging
|
||||
import argparse
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from dateutil import parser as dtparser
|
||||
from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT
|
||||
|
||||
if hasattr(sys.stdout, "reconfigure"):
|
||||
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
||||
|
||||
# ─── CONFIGURATION ────────────────────────────────────────────────────────────
# Credentials and the MongoDB URI can be overridden with an environment
# variable of the same name. The literals are only fallbacks so existing
# deployments keep working unchanged.
import os  # kept next to its only use: the environment overrides below

GRAPH_TENANT_ID = os.environ.get("GRAPH_TENANT_ID", "7d269944-37a4-43a1-8140-c7517dc426e9")
GRAPH_CLIENT_ID = os.environ.get("GRAPH_CLIENT_ID", "4b222bfd-78c9-4239-a53f-43006b3ed07f")
# SECURITY(review): this client secret is committed in plain text. Rotate it
# in the app registration and then delete the fallback literal, so the secret
# is supplied only through the GRAPH_CLIENT_SECRET environment variable.
GRAPH_CLIENT_SECRET = os.environ.get("GRAPH_CLIENT_SECRET", "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk")
GRAPH_MAILBOX = "ordinace@buzalkova.cz"
GRAPH_URL = "https://graph.microsoft.com/v1.0"

MONGO_URI = os.environ.get("MONGO_URI", "mongodb://192.168.1.76:27017")
MONGO_DB = "emaily"
MONGO_COL = "ordinace@buzalkova.cz"
BATCH_SIZE = 100  # upserts per bulk_write
PAGE_SIZE = 50  # messages requested per Graph page ($top)
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.1"
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
logging.basicConfig(
|
||||
filename=str(LOG_FILE),
|
||||
level=logging.ERROR,
|
||||
format="%(asctime)s | %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
IMPORTANCE_MAP = {"low": 0, "normal": 1, "high": 2}
|
||||
FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2}
|
||||
RE_SUBJECT = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE)
|
||||
|
||||
MSG_SELECT = (
|
||||
"id,internetMessageId,subject,bodyPreview,body,"
|
||||
"importance,isRead,isDraft,hasAttachments,"
|
||||
"receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
|
||||
"sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
|
||||
"conversationId,conversationIndex,parentFolderId,"
|
||||
"categories,flag,inferenceClassification,internetMessageHeaders"
|
||||
)
|
||||
|
||||
# Pro sync mode staci jen menitelna pole — rychlejsi fetch
|
||||
MSG_SELECT_SYNC = (
|
||||
"id,internetMessageId,isRead,isDraft,flag,categories,"
|
||||
"lastModifiedDateTime,parentFolderId,importance"
|
||||
)
|
||||
|
||||
|
||||
# ─── Graph API helpers ────────────────────────────────────────────────────────
|
||||
|
||||
_graph_token: Optional[str] = None
|
||||
|
||||
|
||||
def get_token() -> str:
    """Acquire an app-only Graph access token and cache it module-wide.

    Builds an MSAL confidential client from the configured tenant, client id
    and client secret, requests a token for the Graph ``.default`` scope and
    stores it in ``_graph_token``.

    Returns:
        The access token string.

    Raises:
        RuntimeError: The MSAL result contains no ``access_token``.
    """
    global _graph_token

    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    confidential_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    auth_result = confidential_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )

    if "access_token" in auth_result:
        _graph_token = auth_result["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {auth_result}")
|
||||
|
||||
|
||||
def graph_get(url: str, params: dict = None) -> dict:
    """GET a Graph URL with the cached bearer token and return the JSON body.

    A single HTTP 401 triggers one token refresh and a retry. Throttling and
    transient gateway responses (429, 503, 504) are retried a bounded number
    of times, waiting for the server's ``Retry-After`` value when it is a
    number of seconds and backing off exponentially otherwise.

    Args:
        url: Absolute Graph URL (may be an ``@odata.nextLink``).
        params: Optional query parameters.

    Returns:
        The decoded JSON response.

    Raises:
        RuntimeError: The request is still answered with 401 after the token
            was refreshed.
        requests.HTTPError: Any other error status, including throttling that
            persists after the retries are used up.
    """
    global _graph_token
    import time  # function-scope import: only the back-off below needs it

    max_throttle_retries = 5
    max_wait_seconds = 300.0  # never sleep longer than this per attempt

    if not _graph_token:
        get_token()

    reauthenticated = False
    throttled = 0
    while True:
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=30,
        )

        if r.status_code == 401:
            if reauthenticated:
                raise RuntimeError(f"Graph GET failed after retry: {url}")
            reauthenticated = True
            get_token()
            continue

        if r.status_code in (429, 503, 504) and throttled < max_throttle_retries:
            throttled += 1
            try:
                delay = float(r.headers.get("Retry-After", ""))
            except ValueError:
                # Header missing or not a plain number of seconds.
                delay = float(2 ** throttled)
            time.sleep(min(max(delay, 0.0), max_wait_seconds))
            continue

        r.raise_for_status()
        return r.json()
|
||||
|
||||
|
||||
def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]:
    """Recursively list the mail folders of the configured mailbox.

    Args:
        parent_id: Folder whose children are listed; ``None`` lists the
            top-level folders.
        parent_path: Path of the parent folder, used as prefix for the
            returned paths.

    Returns:
        A list of ``{"id", "path"}`` dicts; each folder is directly followed
        by its descendants.
    """
    folders_root = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders"
    if parent_id is None:
        next_url = folders_root
    else:
        next_url = f"{folders_root}/{parent_id}/childFolders"

    query = {"$top": 100, "$select": "id,displayName,childFolderCount"}
    collected = []
    while next_url:
        page = graph_get(next_url, query)
        for entry in page.get("value", []):
            full_path = f"{parent_path}/{entry['displayName']}".lstrip("/")
            collected.append({"id": entry["id"], "path": full_path})
            if entry.get("childFolderCount", 0) > 0:
                collected += get_all_folders(entry["id"], full_path)
        next_url = page.get("@odata.nextLink")
        # The nextLink already carries the query string.
        query = None
    return collected
|
||||
|
||||
|
||||
def iter_folder_messages(folder_id: str, select: str = MSG_SELECT, expand_attachments: bool = True):
    """Yield the messages of one folder, following Graph paging links.

    Args:
        folder_id: Graph id of the mail folder.
        select: Comma-separated ``$select`` field list.
        expand_attachments: When true, request ``$expand=attachments``.

    Yields:
        One message dict per item of each page's ``value`` array.
    """
    query = {"$top": PAGE_SIZE, "$select": select}
    if expand_attachments:
        query["$expand"] = "attachments"

    page_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages"
    while page_url:
        page = graph_get(page_url, query)
        yield from page.get("value", [])
        page_url = page.get("@odata.nextLink")
        # Follow-up pages use the nextLink verbatim, without extra parameters.
        query = None
|
||||
|
||||
|
||||
# ─── Pomocné funkce ───────────────────────────────────────────────────────────
|
||||
|
||||
def parse_date(raw) -> Optional[datetime]:
    """Convert a date value into a naive ``datetime`` expressed in UTC.

    Args:
        raw: ``None``, a ``datetime``, or any value whose ``str()`` is parsed
            with ``dateutil``.

    Returns:
        ``None`` for ``None`` input or when parsing fails. Timezone-aware
        values are converted to UTC and stripped of their tzinfo; naive
        values are returned unchanged.
    """
    def _naive_utc(moment):
        if moment.tzinfo:
            return moment.astimezone(timezone.utc).replace(tzinfo=None)
        return moment

    if raw is None:
        return None
    if isinstance(raw, datetime):
        return _naive_utc(raw)
    try:
        return _naive_utc(dtparser.parse(str(raw)))
    except Exception:
        return None
|
||||
|
||||
|
||||
def normalize_subject(subject: str) -> str:
    """Strip surrounding whitespace and every leading ``RE_SUBJECT`` prefix.

    The prefix pattern is applied repeatedly, so stacked prefixes are all
    removed.

    Args:
        subject: Raw subject line.

    Returns:
        The subject without leading prefixes and surrounding whitespace.
    """
    remainder = subject.strip()
    while (hit := RE_SUBJECT.match(remainder)) is not None:
        remainder = remainder[hit.end():].strip()
    return remainder
|
||||
|
||||
|
||||
def parse_headers(raw_headers: list) -> dict:
    """Fold a list of ``{"name", "value"}`` headers into a dict.

    Keys are the header names lower-cased with ``-`` replaced by ``_``. A
    header that occurs once maps to its value; a repeated header maps to the
    list of its values in order of appearance.

    Args:
        raw_headers: Header dicts, each with ``name`` and ``value``.

    Returns:
        The folded header dict.
    """
    collected = {}
    for header in raw_headers:
        key = header["name"].lower().replace("-", "_")
        value = header["value"]
        if key not in collected:
            collected[key] = value
            continue
        previous = collected[key]
        if isinstance(previous, list):
            collected[key] = previous + [value]
        else:
            collected[key] = [previous, value]
    return collected
|
||||
|
||||
|
||||
def format_recipients(lst: list) -> str:
    """Render recipients as ``"Name <address>"`` entries joined by ``"; "``.

    A missing or ``null`` name, address or ``emailAddress`` object is treated
    as empty, so it is never rendered as the text ``None`` and never raises.

    Args:
        lst: Graph recipient dicts (``{"emailAddress": {"name", "address"}}``);
            ``None`` is treated as an empty list.

    Returns:
        The joined string, empty when there are no recipients.
    """
    rendered = []
    for r in lst or []:
        ea = r.get("emailAddress") or {}
        name = ea.get("name") or ""
        address = ea.get("address") or ""
        rendered.append(f"{name} <{address}>".strip())
    return "; ".join(rendered)
|
||||
|
||||
|
||||
# ─── Extrakce zprávy ─────────────────────────────────────────────────────────
|
||||
|
||||
def extract_message(msg: dict, folder_path: str) -> Optional[dict]:
    """Convert one Graph message resource into the MongoDB document.

    Used for ``full`` mode and for messages not yet stored in ``sync`` and
    ``new-only`` mode. ``null`` values for ``body``, the recipient lists or
    an ``emailAddress`` object are treated as empty instead of failing the
    whole message.

    Args:
        msg: Message dict as returned by Graph.
        folder_path: Path of the folder the message was listed in.

    Returns:
        The document to upsert, whose ``_id`` is the Internet Message-ID or
        ``graphid:<id>`` when that is missing or blank. Returns ``None`` when
        extraction raised; the error is written to the log.
    """
    max_html_chars = 2 * 1024 * 1024
    try:
        mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
        subject = msg.get("subject") or ""

        # HTML bodies are stored (truncated); plain-text bodies replace the
        # Graph preview with their first 2000 characters.
        body_html = None
        body_preview = msg.get("bodyPreview") or ""
        body = msg.get("body") or {}
        body_type = body.get("contentType")
        if body_type == "html":
            body_html = (body.get("content") or "")[:max_html_chars]
        elif body_type == "text":
            body_preview = (body.get("content") or "")[:2000]

        # Prefer "from"; fall back to "sender" when "from" is absent.
        sender_ea = (msg.get("from") or msg.get("sender") or {}).get("emailAddress") or {}
        to_list = msg.get("toRecipients") or []
        cc_list = msg.get("ccRecipients") or []
        bcc_list = msg.get("bccRecipients") or []

        recipients = []
        for kind, group in (("to", to_list), ("cc", cc_list), ("bcc", bcc_list)):
            for r in group:
                ea = r.get("emailAddress") or {}
                recipients.append({
                    "type": kind,
                    "email": ea.get("address", ""),
                    "name": ea.get("name", ""),
                })

        importance = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1)
        flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0)

        raw_headers = msg.get("internetMessageHeaders") or []
        headers = parse_headers(raw_headers)

        # Repeated headers arrive as lists; threading fields need one value.
        in_reply_to = headers.get("in_reply_to", "")
        if isinstance(in_reply_to, list):
            in_reply_to = in_reply_to[0]

        refs_raw = headers.get("references", "")
        if isinstance(refs_raw, list):
            refs_raw = " ".join(refs_raw)
        internet_refs = [r.strip() for r in refs_raw.split() if r.strip()] if refs_raw else []

        conv_topic = headers.get("thread_topic", "")
        if isinstance(conv_topic, list):
            conv_topic = conv_topic[0]

        # Re-encode the conversation index; keep the raw value when it is not
        # valid base64.
        conv_index = ""
        ci_raw = msg.get("conversationIndex")
        if ci_raw:
            try:
                conv_index = base64.b64encode(base64.b64decode(ci_raw)).decode()
            except Exception:
                conv_index = ci_raw

        # Attachment metadata only; attachments without a name are skipped.
        attachments = []
        for att in msg.get("attachments") or []:
            fname = att.get("name") or ""
            if not fname:
                continue
            attachments.append({
                "filename": fname,
                "size_bytes": att.get("size", 0),
                "mime_type": att.get("contentType", "application/octet-stream"),
                "content_id": att.get("contentId"),
                "is_inline": att.get("isInline", False),
            })

        return {
            "_id": mid,
            "graph_id": msg["id"],

            "subject": subject,
            "normalized_subject": normalize_subject(subject),
            "importance": importance,
            "flag_status": flag_status,
            "is_read": msg.get("isRead", False),
            "is_draft": msg.get("isDraft", False),
            "has_attachments": msg.get("hasAttachments", False),
            "attachment_count": len(attachments),
            "inference_classification": msg.get("inferenceClassification", ""),
            "categories": msg.get("categories") or [],

            "conversation_id": msg.get("conversationId", ""),
            "conversation_index": conv_index,
            "conversation_topic": conv_topic,
            "in_reply_to": in_reply_to,
            "internet_references": internet_refs,

            "received_at": parse_date(msg.get("receivedDateTime")),
            "sent_at": parse_date(msg.get("sentDateTime")),
            "created_at": parse_date(msg.get("createdDateTime")),
            "modified_at": parse_date(msg.get("lastModifiedDateTime")),

            "folder_id": msg.get("parentFolderId", ""),
            "folder_path": folder_path,

            "sender": {
                "email": sender_ea.get("address", ""),
                "name": sender_ea.get("name", ""),
            },
            "to": format_recipients(to_list),
            "cc": format_recipients(cc_list),
            "bcc": format_recipients(bcc_list),
            "recipients": recipients,

            "body_html": body_html,
            "body_preview": body_preview,

            "attachments": attachments,
            "headers": headers,

            "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
        }

    except Exception as e:
        # Best effort: one malformed message must not stop the import.
        logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e)
        return None
|
||||
|
||||
|
||||
def extract_sync_fields(msg: dict, folder_path: str) -> dict:
    """Build the ``$set`` payload for an already stored message in sync mode.

    Only fields that can change after delivery are included.

    Args:
        msg: Message dict as returned by Graph.
        folder_path: Path of the folder the message was listed in.

    Returns:
        Dict with ``is_read``, ``is_draft``, ``flag_status``, ``importance``,
        ``categories``, ``modified_at``, ``folder_id``, ``folder_path`` and
        ``parsed_at``.
    """
    flag_name = (msg.get("flag") or {}).get("flagStatus", "notFlagged")
    importance_name = msg.get("importance", "normal")
    return dict(
        is_read=msg.get("isRead", False),
        is_draft=msg.get("isDraft", False),
        flag_status=FLAG_STATUS_MAP.get(flag_name, 0),
        importance=IMPORTANCE_MAP.get(importance_name, 1),
        categories=msg.get("categories") or [],
        modified_at=parse_date(msg.get("lastModifiedDateTime")),
        folder_id=msg.get("parentFolderId", ""),
        folder_path=folder_path,
        parsed_at=datetime.now(timezone.utc).replace(tzinfo=None),
    )
|
||||
|
||||
|
||||
# ─── MongoDB indexy ───────────────────────────────────────────────────────────
|
||||
|
||||
def create_indexes(col):
    """Create the query indexes on the e-mail collection.

    Issues one ascending single-field index per entry of the table below
    (``graph_id`` additionally unique and sparse) followed by one compound
    text index named ``text_search``.

    Args:
        col: Collection object exposing ``create_index``.
    """
    print(" Vytvarim indexy...")

    # (field, extra create_index keyword arguments), in creation order.
    ascending_specs = (
        ("received_at", {}),
        ("sent_at", {}),
        ("sender.email", {}),
        ("graph_id", {"unique": True, "sparse": True}),
        ("conversation_id", {}),
        ("folder_path", {}),
        ("has_attachments", {}),
        ("categories", {}),
        ("importance", {}),
        ("flag_status", {}),
        ("is_read", {}),
    )
    for field_name, extra in ascending_specs:
        col.create_index([(field_name, ASCENDING)], **extra)

    text_fields = ("subject", "body_preview", "to", "cc")
    col.create_index(
        [(field_name, TEXT) for field_name in text_fields],
        name="text_search",
        default_language="none",
    )

    print(" Indexy hotovy.")
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Command-line entry point: import or synchronise the mailbox.

    ``--mode`` selects the strategy:

    * ``full``: upsert every message with all extracted fields.
    * ``new-only``: skip messages whose ``_id`` is already stored.
    * ``sync``: list messages with the reduced ``MSG_SELECT_SYNC`` field set;
      stored messages get only their mutable fields updated, unknown
      messages are re-fetched in full and upserted.

    Exits with status 1 when Graph or MongoDB is unreachable. The pending
    batch is flushed and the MongoDB client closed even when the run is
    aborted by an exception or Ctrl+C.
    """
    ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}")
    ap.add_argument("--mode", default="full", choices=["full", "new-only", "sync"],
                    help="full=plny upsert (vychozi) | new-only=jen nove zpravy | "
                         "sync=existujici aktualizuje jen menitelna pole, nove importuje cely")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N zprav (0 = vse)")
    ap.add_argument("--folder", default="",
                    help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na konci")
    args = ap.parse_args()

    start = datetime.now()
    print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Schránka: {GRAPH_MAILBOX}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{MONGO_COL}")
    print(f"Režim: {args.mode}")

    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        try:
            client.admin.command("ping")
            print(" MongoDB OK")
        except Exception as e:
            print(f" CHYBA: MongoDB neni dostupna -- {e}")
            sys.exit(1)
        col = client[MONGO_DB][MONGO_COL]

        # Stored _id values (needed for new-only and sync).
        existing: set = set()
        if args.mode in ("new-only", "sync"):
            print(" Nacitam existujici zaznamy z MongoDB...")
            # Stream the ids through a projected cursor: a distinct() result
            # has to fit into a single BSON document, which a large mailbox
            # can exceed.
            existing = {d["_id"] for d in col.find({}, {"_id": 1})}
            print(f" {len(existing)} jiz importovano")

        print("\nNacitam seznam slozek...")
        all_folders = get_all_folders()
        if args.folder:
            # Case-insensitive substring match on the full folder path.
            all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()]
        print(f" Slozek ke zpracovani: {len(all_folders)}")
        for f in all_folders:
            print(f" {f['path']}")

        # Sync mode lists only the mutable fields and skips attachments.
        is_sync = args.mode == "sync"
        msg_select = MSG_SELECT_SYNC if is_sync else MSG_SELECT
        expand_att = not is_sync

        batch = []
        ok_count = 0
        sync_count = 0
        err_count = 0
        skip_count = 0
        total_i = 0

        def flush():
            """Write the pending operations; log (do not raise) on failure."""
            if not batch:
                return
            try:
                col.bulk_write(batch, ordered=False)
            except Exception as e:
                logging.error("bulk_write: %s", e)
                print(f" CHYBA bulk_write: {e}")
            batch.clear()

        print()
        try:
            for folder in all_folders:
                print(f"--- Složka: {folder['path']} ---")
                folder_count = 0

                for msg in iter_folder_messages(folder["id"], select=msg_select, expand_attachments=expand_att):
                    if args.limit and total_i >= args.limit:
                        break

                    mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
                    total_i += 1
                    folder_count += 1

                    if args.mode == "new-only" and mid in existing:
                        skip_count += 1
                        continue

                    if is_sync and mid in existing:
                        # Known message: update the mutable fields only.
                        fields = extract_sync_fields(msg, folder["path"])
                        batch.append(UpdateOne({"_id": mid}, {"$set": fields}))
                        sync_count += 1
                        status = "SYN "
                        print(f" {total_i:>6} {status} {mid[:80]}")
                    else:
                        # Full extract: every message in full mode, unknown
                        # messages in new-only and sync mode.
                        if is_sync:
                            # The sync listing lacks most fields, so fetch
                            # the complete message first.
                            full_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{msg['id']}"
                            full_params = {"$select": MSG_SELECT, "$expand": "attachments"}
                            try:
                                msg = graph_get(full_url, full_params)
                            except Exception as e:
                                logging.error("full fetch failed [%s]: %s", msg.get("id","?"), e)
                                err_count += 1
                                continue

                        doc = extract_message(msg, folder["path"])
                        if doc is None:
                            err_count += 1
                            status = "ERR "
                            print(f" {total_i:>6} {status} {mid[:80]}")
                        else:
                            batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
                            ok_count += 1
                            status = "OK "
                            subject_str = (doc.get("subject") or "")[:60]
                            sender_str = (doc.get("sender", {}).get("email") or "")[:40]
                            print(f" {total_i:>6} {status} {subject_str:<60} {sender_str}")

                    if len(batch) >= BATCH_SIZE:
                        flush()

                    if total_i % 500 == 0:
                        elapsed = (datetime.now() - start).total_seconds()
                        rate = total_i / elapsed if elapsed > 0 else 0
                        print(f" {'─'*80}")
                        print(f" Průběh: ok={ok_count} sync={sync_count} skip={skip_count} err={err_count} {rate:.1f} msg/s")
                        print(f" {'─'*80}")

                flush()
                print(f" → {folder_count} zprav ze slozky {folder['path']}")

                if args.limit and total_i >= args.limit:
                    break
        finally:
            # A Graph error or Ctrl+C in the middle of a folder must not drop
            # the operations that were already prepared.
            flush()

        elapsed_total = (datetime.now() - start).total_seconds()
        print(f"\n{'='*52}")
        print(f"Vysledek: ok={ok_count} | sync={sync_count} | skip={skip_count} | err={err_count}")
        print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
        print(f"Dokumentu v kolekci: {col.count_documents({})}")

        if not args.no_indexes:
            print()
            create_indexes(col)

        print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        if err_count:
            print(f"Chyby logovany do: {LOG_FILE}")
    finally:
        # Release the connection pool on every exit path, including sys.exit.
        client.close()


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,610 @@
|
||||
"""
|
||||
parse_emails_graph_v1.2.py
|
||||
Nazev: parse_emails_graph_v1.2.py
|
||||
Verze: 1.2
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Cte vsechny emaily ze schranky ordinace@buzalkova.cz primo pres
|
||||
Microsoft Graph API a importuje je jako dokumenty do MongoDB.
|
||||
Z kazde zpravy extrahuje vsechny dostupne vlastnosti:
|
||||
|
||||
- predmet, odesilatel, prijemci (To/CC/BCC s typy)
|
||||
- cas doruceni, odeslani, vytvoreni, modifikace (UTC)
|
||||
- telo HTML (max 2 MB) + textovy preview
|
||||
- prilohy (metadata: jmeno, velikost, MIME typ, inline flag, graph_att_id)
|
||||
- internet headers (SPF, DKIM, Received, X-*, ...)
|
||||
- MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno,
|
||||
kategorie, In-Reply-To, References, ...
|
||||
- navic: isRead, isDraft, folder_path, inferenceClassification
|
||||
|
||||
Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted,
|
||||
archivni slozky, ...).
|
||||
|
||||
DB: emaily
|
||||
Kolekce: ordinace@buzalkova.cz
|
||||
_id: Internet Message-ID (nebo "graphid:<id>" jako fallback)
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
# Prvni import (vsechno):
|
||||
python parse_emails_graph_v1.2.py
|
||||
|
||||
# Test na prvnich 50:
|
||||
python parse_emails_graph_v1.2.py --limit 50 --no-indexes
|
||||
|
||||
# Jen jedna slozka:
|
||||
python parse_emails_graph_v1.2.py --folder Inbox
|
||||
|
||||
# Pokracovani po preruseni (pouze nove):
|
||||
python parse_emails_graph_v1.2.py --mode new-only
|
||||
|
||||
# Pravidelny sync (aktualizuje is_read, flag, slozku; importuje nove):
|
||||
python parse_emails_graph_v1.2.py --mode sync
|
||||
|
||||
# Plny reimport vsech dat:
|
||||
python parse_emails_graph_v1.2.py --mode full
|
||||
|
||||
Rezimy (--mode):
|
||||
full Plny upsert vsech poli pro kazdou zpravu (vychozi)
|
||||
new-only Preskoci zpravy ktere uz jsou v MongoDB, importuje jen nove
|
||||
sync Existujici: aktualizuje jen is_read/flag_status/categories/
|
||||
modified_at/folder_path. Nove zpravy importuje cely.
|
||||
Idealni pro pravidelne spousteni.
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo, python-dateutil
|
||||
Python 3.10+
|
||||
|
||||
Struktura dokumentu v MongoDB:
|
||||
_id Internet Message-ID (nebo graphid: fallback)
|
||||
graph_id Graph API message ID
|
||||
subject predmet zpravy
|
||||
normalized_subject predmet bez RE:/FW:/AW: prefixu
|
||||
importance 0=nizka 1=normalni 2=vysoka
|
||||
flag_status 0=bez priznaku 1=oznaceno 2=dokonceno
|
||||
is_read bool — aktualni stav precteni ve schrance
|
||||
is_draft bool
|
||||
has_attachments bool
|
||||
attachment_count int
|
||||
inference_classification focused / other
|
||||
categories [str]
|
||||
conversation_id Graph conversationId
|
||||
conversation_index base64 conversationIndex
|
||||
conversation_topic tema vlakna (z internet headers Thread-Topic)
|
||||
in_reply_to Message-ID predchozi zpravy
|
||||
internet_references [Message-ID]
|
||||
received_at datetime UTC
|
||||
sent_at datetime UTC
|
||||
created_at datetime UTC
|
||||
modified_at datetime UTC
|
||||
folder_id Graph parentFolderId
|
||||
folder_path cela cesta slozky (napr. Inbox/Subfolder)
|
||||
sender.email emailova adresa odesilatele
|
||||
sender.name zobrazovane jmeno
|
||||
to retezec To (joined)
|
||||
cc retezec CC
|
||||
bcc retezec BCC
|
||||
recipients [{type, email, name}]
|
||||
body_html HTML telo (max 2 MB)
|
||||
body_preview textovy nahled (max 255 znaku)
|
||||
attachments [{filename, size_bytes, mime_type, is_inline, graph_att_id}]
|
||||
headers dict internet headers
|
||||
parsed_at datetime UTC
|
||||
|
||||
Indexy:
|
||||
received_at, sent_at, sender.email, graph_id (unique),
|
||||
conversation_id, folder_path, has_attachments, categories,
|
||||
importance, flag_status, is_read,
|
||||
text_search (subject + body_preview + to + cc)
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze
|
||||
1.1 2026-06-02 Pridany rezimy --mode full/new-only/sync;
|
||||
odstranen --skip-existing (nahrazen --mode new-only)
|
||||
1.2 2026-06-02 $expand attachments s $select (bez contentBytes — rychlejsi);
|
||||
prilohy ukladaji graph_att_id pro prime stazeni bez name-matchingu
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
import logging
|
||||
import argparse
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from dateutil import parser as dtparser
|
||||
from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT
|
||||
|
||||
if hasattr(sys.stdout, "reconfigure"):
    # Force UTF-8 on stdout so the accented status lines printed by main()
    # do not crash on consoles with a legacy code page.
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

# ─── CONFIGURATION ────────────────────────────────────────────────────────────
import os  # kept next to its only use: the secret lookup below

GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
# SECURITY: the client secret must not live in version control. It is read
# from the environment; an empty value makes get_token() fail at startup.
# NOTE(review): the value that was previously committed here should be
# rotated in Entra ID, since it remains readable in the repository history.
GRAPH_CLIENT_SECRET = os.environ.get("GRAPH_CLIENT_SECRET", "")
GRAPH_MAILBOX = "ordinace@buzalkova.cz"
GRAPH_URL = "https://graph.microsoft.com/v1.0"

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
MONGO_COL = "ordinace@buzalkova.cz"
BATCH_SIZE = 100   # bulk_write operations buffered before a flush
PAGE_SIZE = 50     # $top used when paging messages of a folder
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.2"
# ──────────────────────────────────────────────────────────────────────────────

# Only errors go to the log file; progress is printed to stdout.
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)

# Graph string enums -> small integers stored in MongoDB.
IMPORTANCE_MAP = {"low": 0, "normal": 1, "high": 2}
FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2}
# Reply/forward prefixes removed by normalize_subject().
# NOTE(review): the separator class [:\s]+ also accepts plain whitespace, so
# a subject such as "VS Code update" loses its first word — confirm whether a
# colon should be mandatory.
RE_SUBJECT = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE)

# $expand of attachments without contentBytes — only the metadata we need.
ATT_EXPAND = "attachments($select=id,name,contentType,size,isInline)"

MSG_SELECT = (
    "id,internetMessageId,subject,bodyPreview,body,"
    "importance,isRead,isDraft,hasAttachments,"
    "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
    "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
    "conversationId,conversationIndex,parentFolderId,"
    "categories,flag,inferenceClassification,internetMessageHeaders"
)

# Sync mode only needs the mutable fields — faster fetch.
MSG_SELECT_SYNC = (
    "id,internetMessageId,isRead,isDraft,flag,categories,"
    "lastModifiedDateTime,parentFolderId,importance"
)


# ─── Graph API helpers ────────────────────────────────────────────────────────

# Bearer token cached by get_token() and reused by graph_get().
_graph_token: Optional[str] = None
|
||||
|
||||
|
||||
def get_token() -> str:
    """Acquire an app-only Graph access token and cache it in ``_graph_token``.

    Uses the client-credentials flow with the module-level tenant, client id
    and secret.

    Returns:
        The access token string.

    Raises:
        RuntimeError: if the MSAL response carries no ``access_token``.
    """
    global _graph_token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    client_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    auth_result = client_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )
    if "access_token" in auth_result:
        _graph_token = auth_result["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {auth_result}")
|
||||
|
||||
|
||||
def graph_get(url: str, params: dict = None) -> dict:
    """GET a Graph URL and return the decoded JSON body.

    A token is fetched first if none is cached. A 401 answer triggers one
    token refresh and a single retry.

    Args:
        url: Absolute Graph URL (may be an ``@odata.nextLink``).
        params: Optional query parameters.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts were answered with 401.
    """
    global _graph_token
    if not _graph_token:
        get_token()
    attempts_left = 2
    while attempts_left > 0:
        attempts_left -= 1
        response = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=30,
        )
        if response.status_code != 401:
            response.raise_for_status()
            return response.json()
        # Token rejected: refresh it and try once more.
        get_token()
    raise RuntimeError(f"Graph GET failed after retry: {url}")
|
||||
|
||||
|
||||
def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]:
    """Recursively list the mail folders of the mailbox.

    Args:
        parent_id: Folder whose children are listed; ``None`` lists the
            top-level folders.
        parent_path: Path of the parent, used as prefix for child paths.

    Returns:
        A flat list of ``{"id", "path"}`` dicts, parents before children.
    """
    folders_root = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders"
    if parent_id is None:
        next_url = folders_root
    else:
        next_url = f"{folders_root}/{parent_id}/childFolders"

    query = {"$top": 100, "$select": "id,displayName,childFolderCount"}
    collected = []
    while next_url:
        page = graph_get(next_url, query)
        for entry in page.get("value", []):
            full_path = f"{parent_path}/{entry['displayName']}".lstrip("/")
            collected.append({"id": entry["id"], "path": full_path})
            if entry.get("childFolderCount", 0) > 0:
                collected += get_all_folders(entry["id"], full_path)
        next_url = page.get("@odata.nextLink")
        # The next link already carries the query string.
        query = None
    return collected
|
||||
|
||||
|
||||
def iter_folder_messages(folder_id: str, select: str = MSG_SELECT, expand_attachments: bool = True):
    """Yield every message of one folder, following Graph pagination.

    Args:
        folder_id: Graph id of the mail folder.
        select: Comma-separated ``$select`` field list.
        expand_attachments: When true, attachment metadata is expanded inline.
    """
    next_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages"
    query = {"$top": PAGE_SIZE, "$select": select}
    if expand_attachments:
        query["$expand"] = ATT_EXPAND
    while next_url:
        page = graph_get(next_url, query)
        yield from page.get("value", [])
        next_url = page.get("@odata.nextLink")
        # The next link already carries the query string.
        query = None
|
||||
|
||||
|
||||
# ─── Pomocné funkce ───────────────────────────────────────────────────────────
|
||||
|
||||
def parse_date(raw) -> Optional[datetime]:
    """Convert a datetime or date string to a naive UTC ``datetime``.

    Timezone-aware values are shifted to UTC and stripped of tzinfo; naive
    values are returned unchanged. ``None`` and unparsable strings give
    ``None``.
    """
    def _as_naive_utc(value: datetime) -> datetime:
        if value.tzinfo:
            return value.astimezone(timezone.utc).replace(tzinfo=None)
        return value

    if raw is None:
        return None
    if isinstance(raw, datetime):
        return _as_naive_utc(raw)
    try:
        return _as_naive_utc(dtparser.parse(str(raw)))
    except Exception:
        # Best effort: an unreadable date must not drop the whole message.
        return None
|
||||
|
||||
|
||||
def normalize_subject(subject: str) -> str:
    """Strip leading reply/forward prefixes matched by ``RE_SUBJECT``.

    Prefixes are removed repeatedly, so stacked ones are all dropped, and the
    result is whitespace-trimmed.
    """
    remainder = subject.strip()
    while (prefix := RE_SUBJECT.match(remainder)) is not None:
        remainder = remainder[prefix.end():].strip()
    return remainder
|
||||
|
||||
|
||||
def parse_headers(raw_headers: list) -> dict:
    """Turn Graph ``internetMessageHeaders`` into a plain dict.

    Header names are lower-cased and ``-`` becomes ``_``. A header that occurs
    once maps to its string value; repeated headers map to a list of values in
    order of appearance.
    """
    collected = {}
    for header in raw_headers:
        key = header["name"].lower().replace("-", "_")
        value = header["value"]
        if key not in collected:
            collected[key] = value
            continue
        previous = collected[key]
        if isinstance(previous, list):
            collected[key] = previous + [value]
        else:
            # Second occurrence: promote the scalar to a list.
            collected[key] = [previous, value]
    return collected
|
||||
|
||||
|
||||
def format_recipients(lst: list) -> str:
    """Render Graph recipients as ``Name <address>`` joined by ``"; "``.

    Missing name or address is treated as an empty string; each rendered
    entry is whitespace-trimmed.
    """
    rendered = []
    for recipient in lst:
        address = recipient["emailAddress"]
        entry = f'{address.get("name", "")} <{address.get("address", "")}>'
        rendered.append(entry.strip())
    return "; ".join(rendered)
|
||||
|
||||
|
||||
# ─── Extrakce zprávy ─────────────────────────────────────────────────────────
|
||||
|
||||
def extract_message(msg: dict, folder_path: str) -> Optional[dict]:
    """Build the full MongoDB document for one Graph message.

    Used for mode ``full`` and for new messages in ``sync`` / ``new-only``.

    Args:
        msg: Message resource as returned by Graph.
        folder_path: Path of the folder the message was read from.

    Returns:
        The document to upsert, or ``None`` when extraction failed (the error
        is written to the log).
    """
    html_cap = 2 * 1024 * 1024  # stored HTML body is cut to this many characters

    def _recipient_rows(kind: str, people: list) -> list:
        rows = []
        for person in people:
            address = person["emailAddress"]
            rows.append({
                "type": kind,
                "email": address.get("address", ""),
                "name": address.get("name", ""),
            })
        return rows

    def _first_if_list(value):
        # Repeated headers arrive as lists; keep only the first value.
        return value[0] if isinstance(value, list) else value

    try:
        message_id = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
        subject = msg.get("subject") or ""

        body = msg.get("body", {})
        body_kind = body.get("contentType")
        html_body = None
        preview = msg.get("bodyPreview") or ""
        if body_kind == "html":
            html_body = (body.get("content") or "")[:html_cap]
        elif body_kind == "text":
            # Plain-text bodies replace the Graph preview with a longer excerpt.
            preview = (body.get("content") or "")[:2000]

        sender_address = (msg.get("from") or msg.get("sender") or {}).get("emailAddress", {})
        to_list = msg.get("toRecipients", [])
        cc_list = msg.get("ccRecipients", [])
        bcc_list = msg.get("bccRecipients", [])
        recipients = (
            _recipient_rows("to", to_list)
            + _recipient_rows("cc", cc_list)
            + _recipient_rows("bcc", bcc_list)
        )

        importance = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1)
        flag_name = (msg.get("flag") or {}).get("flagStatus", "notFlagged")
        flag_status = FLAG_STATUS_MAP.get(flag_name, 0)

        headers = parse_headers(msg.get("internetMessageHeaders") or [])
        in_reply_to = _first_if_list(headers.get("in_reply_to", ""))
        conv_topic = _first_if_list(headers.get("thread_topic", ""))

        refs_raw = headers.get("references", "")
        if isinstance(refs_raw, list):
            refs_raw = " ".join(refs_raw)
        # str.split() already drops blanks and surrounding whitespace.
        internet_refs = refs_raw.split() if refs_raw else []

        conv_index = ""
        raw_index = msg.get("conversationIndex")
        if raw_index:
            try:
                # Round trip through bytes to normalise the base64 text.
                conv_index = base64.b64encode(base64.b64decode(raw_index)).decode()
            except Exception:
                conv_index = raw_index

        # Attachments without a name are ignored.
        attachments = [
            {
                "filename": item.get("name"),
                "size_bytes": item.get("size", 0),
                "mime_type": item.get("contentType", "application/octet-stream"),
                "is_inline": item.get("isInline", False),
                "graph_att_id": item.get("id"),
            }
            for item in (msg.get("attachments") or [])
            if item.get("name")
        ]

        return {
            "_id": message_id,
            "graph_id": msg["id"],

            "subject": subject,
            "normalized_subject": normalize_subject(subject),
            "importance": importance,
            "flag_status": flag_status,
            "is_read": msg.get("isRead", False),
            "is_draft": msg.get("isDraft", False),
            "has_attachments": msg.get("hasAttachments", False),
            "attachment_count": len(attachments),
            "inference_classification": msg.get("inferenceClassification", ""),
            "categories": msg.get("categories") or [],

            "conversation_id": msg.get("conversationId", ""),
            "conversation_index": conv_index,
            "conversation_topic": conv_topic,
            "in_reply_to": in_reply_to,
            "internet_references": internet_refs,

            "received_at": parse_date(msg.get("receivedDateTime")),
            "sent_at": parse_date(msg.get("sentDateTime")),
            "created_at": parse_date(msg.get("createdDateTime")),
            "modified_at": parse_date(msg.get("lastModifiedDateTime")),

            "folder_id": msg.get("parentFolderId", ""),
            "folder_path": folder_path,

            "sender": {
                "email": sender_address.get("address", ""),
                "name": sender_address.get("name", ""),
            },
            "to": format_recipients(to_list),
            "cc": format_recipients(cc_list),
            "bcc": format_recipients(bcc_list),
            "recipients": recipients,

            "body_html": html_body,
            "body_preview": preview,

            "attachments": attachments,
            "headers": headers,

            "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
        }

    except Exception as e:
        logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e)
        return None
|
||||
|
||||
|
||||
def extract_sync_fields(msg: dict, folder_path: str) -> dict:
    """Return only the mutable fields of a message.

    Used in sync mode for messages that already exist in MongoDB; the result
    is applied with ``$set``.
    """
    flag_name = (msg.get("flag") or {}).get("flagStatus", "notFlagged")
    importance_name = msg.get("importance", "normal")
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)

    fields = {}
    fields["is_read"] = msg.get("isRead", False)
    fields["is_draft"] = msg.get("isDraft", False)
    fields["flag_status"] = FLAG_STATUS_MAP.get(flag_name, 0)
    fields["importance"] = IMPORTANCE_MAP.get(importance_name, 1)
    fields["categories"] = msg.get("categories") or []
    fields["modified_at"] = parse_date(msg.get("lastModifiedDateTime"))
    fields["folder_id"] = msg.get("parentFolderId", "")
    fields["folder_path"] = folder_path
    fields["parsed_at"] = now_utc
    return fields
|
||||
|
||||
|
||||
# ─── MongoDB indexy ───────────────────────────────────────────────────────────
|
||||
|
||||
def create_indexes(col):
    """Create the single-field indexes and the ``text_search`` index on *col*."""
    print(" Vytvarim indexy...")

    # (field, extra create_index options), in creation order.
    single_field_specs = (
        ("received_at", {}),
        ("sent_at", {}),
        ("sender.email", {}),
        ("graph_id", {"unique": True, "sparse": True}),
        ("conversation_id", {}),
        ("folder_path", {}),
        ("has_attachments", {}),
        ("categories", {}),
        ("importance", {}),
        ("flag_status", {}),
        ("is_read", {}),
    )
    for field_name, options in single_field_specs:
        col.create_index([(field_name, ASCENDING)], **options)

    text_fields = ("subject", "body_preview", "to", "cc")
    col.create_index(
        [(field_name, TEXT) for field_name in text_fields],
        name="text_search",
        default_language="none",
    )
    print(" Indexy hotovy.")
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Import or synchronise the mailbox into MongoDB.

    Command-line modes:
        full      upsert every message with the full document
        new-only  import only messages whose _id is not yet stored
        sync      update mutable fields of stored messages and import new
                  ones with a full fetch

    Exits with status 1 when Graph authentication or the MongoDB ping fails.
    Whatever happens afterwards, buffered writes are flushed and the MongoDB
    client is closed before the function returns or re-raises.
    """
    ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}")
    ap.add_argument("--mode", default="full", choices=["full", "new-only", "sync"],
                    help="full=plny upsert (vychozi) | new-only=jen nove zpravy | "
                         "sync=existujici aktualizuje jen menitelna pole, nove importuje cely")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N zprav (0 = vse)")
    ap.add_argument("--folder", default="",
                    help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na konci")
    args = ap.parse_args()

    start = datetime.now()
    print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Schránka: {GRAPH_MAILBOX}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{MONGO_COL}")
    print(f"Režim: {args.mode}")

    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        print(" MongoDB OK")
    except Exception as e:
        print(f" CHYBA: MongoDB neni dostupna -- {e}")
        sys.exit(1)
    col = client[MONGO_DB][MONGO_COL]

    batch = []

    def flush():
        """Write the buffered operations; errors are logged, not raised."""
        if not batch:
            return
        try:
            col.bulk_write(batch, ordered=False)
        except Exception as e:
            logging.error("bulk_write: %s", e)
            print(f" CHYBA bulk_write: {e}")
        batch.clear()

    try:
        # Existing _id values (needed for new-only and sync).
        existing: set = set()
        if args.mode in ("new-only", "sync"):
            print(" Nacitam existujici zaznamy z MongoDB...")
            # A projected cursor instead of distinct(): distinct returns one
            # BSON document and fails once the id list exceeds 16 MB.
            existing = {row["_id"] for row in col.find({}, {"_id": 1})}
            print(f" {len(existing)} jiz importovano")

        print("\nNacitam seznam slozek...")
        all_folders = get_all_folders()
        if args.folder:
            # Case-insensitive substring match on the folder path.
            all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()]
        print(f" Slozek ke zpracovani: {len(all_folders)}")
        for f in all_folders:
            print(f" {f['path']}")

        # In sync mode only the mutable fields are fetched.
        is_sync = args.mode == "sync"
        msg_select = MSG_SELECT_SYNC if is_sync else MSG_SELECT
        expand_att = not is_sync

        ok_count = 0
        sync_count = 0
        err_count = 0
        skip_count = 0
        total_i = 0

        print()
        for folder in all_folders:
            print(f"--- Složka: {folder['path']} ---")
            folder_count = 0

            for msg in iter_folder_messages(folder["id"], select=msg_select, expand_attachments=expand_att):
                if args.limit and total_i >= args.limit:
                    break

                mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
                total_i += 1
                folder_count += 1

                if args.mode == "new-only" and mid in existing:
                    skip_count += 1
                    continue

                if is_sync and mid in existing:
                    # Known message in sync mode — mutable fields only.
                    fields = extract_sync_fields(msg, folder["path"])
                    batch.append(UpdateOne({"_id": mid}, {"$set": fields}))
                    sync_count += 1
                    status = "SYN "
                    print(f" {total_i:>6} {status} {mid[:80]}")
                else:
                    # Full extract (new in new-only, new in sync, all in full).
                    if is_sync:
                        # The listing used the reduced $select, so a new
                        # message has to be fetched again in full.
                        full_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{msg['id']}"
                        full_params = {"$select": MSG_SELECT, "$expand": ATT_EXPAND}
                        try:
                            msg = graph_get(full_url, full_params)
                        except Exception as e:
                            logging.error("full fetch failed [%s]: %s", msg.get("id","?"), e)
                            err_count += 1
                            continue

                    doc = extract_message(msg, folder["path"])
                    if doc is None:
                        err_count += 1
                        status = "ERR "
                        print(f" {total_i:>6} {status} {mid[:80]}")
                    else:
                        batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
                        ok_count += 1
                        status = "OK "
                        subject_str = (doc.get("subject") or "")[:60]
                        sender_str = (doc.get("sender", {}).get("email") or "")[:40]
                        print(f" {total_i:>6} {status} {subject_str:<60} {sender_str}")

                if len(batch) >= BATCH_SIZE:
                    flush()

                if total_i % 500 == 0:
                    elapsed = (datetime.now() - start).total_seconds()
                    rate = total_i / elapsed if elapsed > 0 else 0
                    print(f" {'─'*80}")
                    print(f" Průběh: ok={ok_count} sync={sync_count} skip={skip_count} err={err_count} {rate:.1f} msg/s")
                    print(f" {'─'*80}")

            flush()
            print(f" → {folder_count} zprav ze slozky {folder['path']}")

            if args.limit and total_i >= args.limit:
                break

        elapsed_total = (datetime.now() - start).total_seconds()
        print(f"\n{'='*52}")
        print(f"Vysledek: ok={ok_count} | sync={sync_count} | skip={skip_count} | err={err_count}")
        print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
        print(f"Dokumentu v kolekci: {col.count_documents({})}")

        if not args.no_indexes:
            print()
            create_indexes(col)

        print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        if err_count:
            print(f"Chyby logovany do: {LOG_FILE}")
    finally:
        # A Graph or MongoDB error mid-crawl must not lose the operations
        # already buffered, nor leave the connection open.
        flush()
        client.close()


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,449 @@
|
||||
"""
|
||||
download_attachments_v1.0.py
|
||||
Nazev: download_attachments_v1.0.py
|
||||
Verze: 1.0
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB kolekce
|
||||
ordinace@buzalkova.cz primo pres Microsoft Graph API a uklada je do
|
||||
adresare /mnt/Emails/ordinace@buzalkova.cz/Attachments/.
|
||||
|
||||
Deduplikace podle SHA256 hashe obsahu:
|
||||
- stejny hash = soubor uz existuje -> preskoci
|
||||
- prvni vyskyt souboru: ulozi pod puvodnim nazvem
|
||||
- kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
|
||||
|
||||
Po ulozeni aktualizuje MongoDB:
|
||||
- v email dokumentu: kazda priloha dostane file_hash + local_path
|
||||
- kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes,
|
||||
mime_type, first_seen_at, ref_count (pocet emailu ktery ji obsahuje)
|
||||
|
||||
Bezpecne prerusit a opakovat:
|
||||
- zpravy kde jsou vsechny prilohy uz stazene (maji file_hash) se preskoci
|
||||
- --force-recheck znovu overi i uz stazene (pro pripad zmen na disku)
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
python download_attachments_v1.0.py # stahni vse co chybi
|
||||
python download_attachments_v1.0.py --limit 50 # test na prvnich 50 emailech
|
||||
python download_attachments_v1.0.py --force-recheck # overi i uz stazene
|
||||
|
||||
Docker (po pridani mountu /mnt/user/Emails -> /mnt/Emails):
|
||||
docker exec -it python-runner python /scripts/download_attachments_v1.0.py
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo, python-dateutil
|
||||
Python 3.10+
|
||||
|
||||
Struktura na disku:
|
||||
/mnt/Emails/
|
||||
└── ordinace@buzalkova.cz/
|
||||
└── Attachments/
|
||||
├── faktura_2026.pdf
|
||||
├── vysledky_lab.pdf
|
||||
├── vysledky_lab_2.pdf <- kolize nazvu, jiny obsah
|
||||
└── ...
|
||||
|
||||
Kolekce emaily.attachments_index:
|
||||
_id SHA256 hash (hex)
|
||||
filename nazev souboru na disku (prvni vyskyt)
|
||||
local_path relativni cesta od Attachments/ (zatim = filename)
|
||||
size_bytes velikost souboru
|
||||
mime_type MIME typ
|
||||
first_seen_at datetime UTC
|
||||
ref_count v kolika emailech se tato priloha vyskytuje
|
||||
|
||||
Aktualizace v email dokumentu (kolekce ordinace@buzalkova.cz):
|
||||
attachments[i].file_hash SHA256 hash
|
||||
attachments[i].local_path cesta relativni od Attachments/
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze
|
||||
"""
|
||||
|
||||
import sys
|
||||
import hashlib
|
||||
import logging
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from pymongo import MongoClient, UpdateOne
|
||||
|
||||
if hasattr(sys.stdout, "reconfigure"):
    # Force UTF-8 on stdout so the accented status lines printed by main()
    # do not crash on consoles with a legacy code page.
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

# ─── CONFIGURATION ────────────────────────────────────────────────────────────
import os  # kept next to its only use: the secret lookup below

GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
# SECURITY: the client secret must not live in version control. It is read
# from the environment; an empty value makes get_token() fail at startup.
# NOTE(review): the value that was previously committed here should be
# rotated in Entra ID, since it remains readable in the repository history.
GRAPH_CLIENT_SECRET = os.environ.get("GRAPH_CLIENT_SECRET", "")
GRAPH_MAILBOX = "ordinace@buzalkova.cz"
GRAPH_URL = "https://graph.microsoft.com/v1.0"

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
MONGO_COL_EMAILS = "ordinace@buzalkova.cz"   # e-mail documents
MONGO_COL_INDEX = "attachments_index"        # one document per content hash

ATTACHMENTS_DIR = Path("/mnt/Emails/ordinace@buzalkova.cz/Attachments")
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.0"
BATCH_SIZE = 50
# ──────────────────────────────────────────────────────────────────────────────

# Only errors go to the log file; progress is printed to stdout.
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)

# Bearer token cached by get_token() and reused by the graph_get_* helpers.
_graph_token: Optional[str] = None
|
||||
|
||||
|
||||
# ─── Graph API ────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_token() -> str:
    """Acquire an app-only Graph access token and cache it in ``_graph_token``.

    Uses the client-credentials flow with the module-level tenant, client id
    and secret.

    Returns:
        The access token string.

    Raises:
        RuntimeError: if the MSAL response carries no ``access_token``.
    """
    global _graph_token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    client_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    auth_result = client_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )
    if "access_token" in auth_result:
        _graph_token = auth_result["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {auth_result}")
|
||||
|
||||
|
||||
def graph_get_bytes(url: str) -> bytes:
    """Download the binary body behind a Graph URL (attachment content).

    A token is fetched first if none is cached. A 401 answer triggers one
    token refresh and a single retry.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts were answered with 401.
    """
    global _graph_token
    if not _graph_token:
        get_token()
    for attempt in range(2):
        # No stream=True: the body is read in full via .content anyway, and a
        # streamed response that is never consumed (401 or error status)
        # keeps its connection from being released.
        r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, timeout=120)
        if r.status_code == 401:
            get_token()
            continue
        r.raise_for_status()
        return r.content
    raise RuntimeError(f"Graph GET bytes failed: {url}")
|
||||
|
||||
|
||||
def graph_get_json(url: str, params: dict = None) -> dict:
    """GET a Graph URL and return the decoded JSON body.

    A token is fetched first if none is cached. A 401 answer triggers one
    token refresh and a single retry.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts were answered with 401.
    """
    global _graph_token
    if not _graph_token:
        get_token()
    attempts_left = 2
    while attempts_left > 0:
        attempts_left -= 1
        response = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=30,
        )
        if response.status_code != 401:
            response.raise_for_status()
            return response.json()
        # Token rejected: refresh it and try once more.
        get_token()
    raise RuntimeError(f"Graph GET json failed: {url}")
|
||||
|
||||
|
||||
def fetch_attachment_content(graph_message_id: str, attachment_id: str) -> Optional[bytes]:
    """Download the raw content of one attachment via its ``$value`` endpoint.

    Returns:
        The attachment bytes, or ``None`` when the download failed (the error
        is written to the log).
    """
    message_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{graph_message_id}"
    url = f"{message_url}/attachments/{attachment_id}/$value"
    try:
        payload = graph_get_bytes(url)
    except Exception as e:
        logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s", graph_message_id, attachment_id, e)
        return None
    return payload
|
||||
|
||||
|
||||
def fetch_message_attachments(graph_message_id: str) -> list[dict]:
    """Fetch attachment metadata (including attachment ids) of one message.

    Returns:
        The ``value`` list of the Graph response, or ``[]`` when the request
        failed (the error is written to the log).
    """
    url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{graph_message_id}/attachments"
    wanted_fields = {"$select": "id,name,contentType,size,isInline,contentId"}
    try:
        listing = graph_get_json(url, wanted_fields)
        return listing.get("value", [])
    except Exception as e:
        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e)
    return []
|
||||
|
||||
|
||||
# ─── Dedup + ukládání ─────────────────────────────────────────────────────────
|
||||
|
||||
def sha256(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, index_col) -> str:
    """Pick the file name under which content with *hash_val* is stored.

    A name is usable when the index already maps it to the same hash, or when
    it is neither indexed nor present on disk. If *desired_name* is not
    usable, ``_2``, ``_3`` … is inserted before the extension until a usable
    name is found.

    Args:
        desired_name: Preferred (already sanitised) file name.
        att_dir: Directory holding the stored attachments.
        hash_val: SHA-256 hex digest of the content to store.
        index_col: Index collection; only ``find_one({"filename": …})`` is used.

    Returns:
        The file name to use, relative to *att_dir*.
    """
    def _usable(name: str) -> bool:
        entry = index_col.find_one({"filename": name})
        if entry:
            # Indexed name: fine only if it already denotes this content.
            return entry["_id"] == hash_val
        # Not indexed: a file on disk (e.g. left by an interrupted run) must
        # not be overwritten, so the name counts as taken.
        return not (att_dir / name).exists()

    if _usable(desired_name):
        return desired_name

    original = Path(desired_name)
    stem, suffix = original.stem, original.suffix
    n = 2
    while True:
        candidate = f"{stem}_{n}{suffix}"
        if _usable(candidate):
            return candidate
        n += 1
|
||||
|
||||
|
||||
def save_attachment(content: bytes, original_name: str, att_dir: Path, index_col,
                    mime_type: str = "") -> tuple[str, str, bool]:
    """Store an attachment on disk, de-duplicated by SHA-256 of its content.

    Args:
        content: Raw attachment bytes.
        original_name: File name as given in the e-mail (untrusted).
        att_dir: Target directory; must already exist.
        index_col: Index collection keyed by content hash.
        mime_type: MIME type recorded in the index for new content.

    Returns:
        ``(hash, local_path, was_new)``. ``was_new`` is True when the content
        was not indexed yet and a new file and index entry were created. It is
        False when the hash was already indexed; ``ref_count`` is then
        incremented and the file is written again only if it is missing from
        disk.
    """
    hash_val = sha256(content)

    # Known content: bump the reference counter and reuse the stored path.
    existing = index_col.find_one({"_id": hash_val})
    if existing:
        index_col.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}})
        stored_path = att_dir / existing["local_path"]
        if not stored_path.exists():
            # The index says we have it but the file is gone — restore it.
            stored_path.write_bytes(content)
        return hash_val, existing["local_path"], False

    # New content: derive a safe file name from the sender-supplied one.
    safe_name = "".join(
        c if c.isalnum() or c in "._- " else "_" for c in (original_name or "")
    ).strip()
    # Names made only of dots/spaces ("", ".", "..") would address a
    # directory instead of a file, so they get a generated name.
    if not safe_name.strip(". "):
        safe_name = f"attachment_{hash_val[:8]}"

    filename = resolve_filename(safe_name, att_dir, hash_val, index_col)
    file_path = att_dir / filename

    file_path.write_bytes(content)

    index_col.insert_one({
        "_id": hash_val,
        "filename": filename,
        "local_path": filename,
        "size_bytes": len(content),
        "mime_type": mime_type,
        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
        "ref_count": 1,
    })

    return hash_val, filename, True
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
|
||||
ap.add_argument("--limit", type=int, default=0,
|
||||
help="Zpracovat max N emailu (0 = vse)")
|
||||
ap.add_argument("--force-recheck", action="store_true",
|
||||
help="Znovu overi i emaily kde prilohy uz maji file_hash")
|
||||
ap.add_argument("--no-indexes", action="store_true",
|
||||
help="Nevytvorit indexy na konci")
|
||||
args = ap.parse_args()
|
||||
|
||||
start = datetime.now()
|
||||
print(f"=== download_attachments v{SCRIPT_VERSION} ===")
|
||||
print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
print(f"Schránka: {GRAPH_MAILBOX}")
|
||||
print(f"Cilovy adresar: {ATTACHMENTS_DIR}")
|
||||
print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}")
|
||||
|
||||
# Adresar
|
||||
ATTACHMENTS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
print(f" Adresar OK")
|
||||
|
||||
# Graph
|
||||
print("\nPřipojuji se k Graph API...")
|
||||
try:
|
||||
get_token()
|
||||
print(" Graph API OK")
|
||||
except Exception as e:
|
||||
print(f" CHYBA: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# MongoDB
|
||||
client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
|
||||
try:
|
||||
client.admin.command("ping")
|
||||
print(" MongoDB OK")
|
||||
except Exception as e:
|
||||
print(f" CHYBA: MongoDB neni dostupna -- {e}")
|
||||
sys.exit(1)
|
||||
|
||||
col_emails = client[MONGO_DB][MONGO_COL_EMAILS]
|
||||
col_index = client[MONGO_DB][MONGO_COL_INDEX]
|
||||
|
||||
# Indexy na attachment index kolekci
|
||||
if not args.no_indexes:
|
||||
col_index.create_index("filename")
|
||||
col_index.create_index("mime_type")
|
||||
|
||||
# Dotaz — emaily s prilohou ktere jeste nebyly zpracovany
|
||||
if args.force_recheck:
|
||||
query = {"has_attachments": True}
|
||||
else:
|
||||
query = {
|
||||
"has_attachments": True,
|
||||
"attachments": {
|
||||
"$elemMatch": {
|
||||
"is_inline": False,
|
||||
"file_hash": {"$exists": False},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
total = col_emails.count_documents(query)
|
||||
print(f"\nEmailu ke zpracovani: {total}")
|
||||
if total == 0:
|
||||
print("Neni co stahnout.")
|
||||
client.close()
|
||||
return
|
||||
|
||||
cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
|
||||
if args.limit:
|
||||
cursor = cursor.limit(args.limit)
|
||||
|
||||
ok_count = 0
|
||||
new_count = 0
|
||||
skip_count = 0
|
||||
err_count = 0
|
||||
email_i = 0
|
||||
batch = []
|
||||
|
||||
def flush():
|
||||
if not batch:
|
||||
return
|
||||
try:
|
||||
col_emails.bulk_write(batch, ordered=False)
|
||||
except Exception as e:
|
||||
logging.error("bulk_write: %s", e)
|
||||
print(f" CHYBA bulk_write: {e}")
|
||||
batch.clear()
|
||||
|
||||
for email_doc in cursor:
|
||||
email_i += 1
|
||||
email_id = email_doc["_id"]
|
||||
graph_id = email_doc.get("graph_id", "")
|
||||
subject = (email_doc.get("subject") or "")[:60]
|
||||
att_list = email_doc.get("attachments") or []
|
||||
|
||||
# Jen skutecne prilohy
|
||||
real_atts = [a for a in att_list if not a.get("is_inline", False)]
|
||||
if not real_atts:
|
||||
continue
|
||||
|
||||
print(f"\n {email_i:>5}/{total} {subject}")
|
||||
|
||||
# Nacti attachment IDs z Graph API
|
||||
graph_atts = fetch_message_attachments(graph_id)
|
||||
graph_att_map = {a["name"]: a for a in graph_atts if not a.get("isInline", False)}
|
||||
|
||||
updated_atts = list(att_list)
|
||||
email_ok = True
|
||||
|
||||
for i, att in enumerate(updated_atts):
|
||||
if att.get("is_inline", False):
|
||||
continue
|
||||
if not args.force_recheck and att.get("file_hash"):
|
||||
skip_count += 1
|
||||
print(f" SKIP {att['filename']}")
|
||||
continue
|
||||
|
||||
att_name = att.get("filename", "")
|
||||
graph_att = graph_att_map.get(att_name)
|
||||
|
||||
if not graph_att:
|
||||
# Zkus najit podle casti nazvu
|
||||
for gname, ga in graph_att_map.items():
|
||||
if att_name.lower() in gname.lower():
|
||||
graph_att = ga
|
||||
break
|
||||
|
||||
if not graph_att:
|
||||
logging.error("attachment not found in Graph [email=%s att=%s]", email_id, att_name)
|
||||
print(f" ERR {att_name} (nenalezeno v Graph)")
|
||||
err_count += 1
|
||||
email_ok = False
|
||||
continue
|
||||
|
||||
# Stahni obsah
|
||||
content = fetch_attachment_content(graph_id, graph_att["id"])
|
||||
if content is None:
|
||||
err_count += 1
|
||||
email_ok = False
|
||||
print(f" ERR {att_name} (stazeni selhalo)")
|
||||
continue
|
||||
|
||||
# Uloz s dedupem
|
||||
hash_val, local_path, was_new = save_attachment(content, att_name, ATTACHMENTS_DIR, col_index)
|
||||
|
||||
# Aktualizuj MIME typ v indexu
|
||||
col_index.update_one(
|
||||
{"_id": hash_val},
|
||||
{"$set": {"mime_type": att.get("mime_type", graph_att.get("contentType", ""))}},
|
||||
)
|
||||
|
||||
# Zaznamenej do emailu
|
||||
updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}
|
||||
|
||||
if was_new:
|
||||
new_count += 1
|
||||
print(f" NEW {local_path} ({len(content):,} B)")
|
||||
else:
|
||||
skip_count += 1
|
||||
print(f" DUP {att_name} -> {local_path}")
|
||||
|
||||
if email_ok:
|
||||
ok_count += 1
|
||||
|
||||
# Uloz aktualizovane prilohy zpet do emailu
|
||||
batch.append(UpdateOne(
|
||||
{"_id": email_id},
|
||||
{"$set": {"attachments": updated_atts}}
|
||||
))
|
||||
|
||||
if len(batch) >= BATCH_SIZE:
|
||||
flush()
|
||||
|
||||
if email_i % 100 == 0:
|
||||
elapsed = (datetime.now() - start).total_seconds()
|
||||
print(f" {'─'*60}")
|
||||
print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={skip_count} err={err_count}")
|
||||
print(f" {'─'*60}")
|
||||
|
||||
flush()
|
||||
|
||||
elapsed_total = (datetime.now() - start).total_seconds()
|
||||
files_total = col_index.count_documents({})
|
||||
size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))
|
||||
|
||||
print(f"\n{'='*52}")
|
||||
print(f"Vysledek: emaily={ok_count} | nove soubory={new_count} | duplikaty={skip_count} | err={err_count}")
|
||||
print(f"Souboru v indexu: {files_total} ({size_total/1024/1024:.1f} MB)")
|
||||
print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
|
||||
print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
if err_count:
|
||||
print(f"Chyby logovany do: {LOG_FILE}")
|
||||
|
||||
client.close()
|
||||
|
||||
|
||||
# Script entry point: run the downloader only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,428 @@
|
||||
"""
|
||||
download_attachments_v1.1.py
|
||||
Nazev: download_attachments_v1.1.py
|
||||
Verze: 1.1
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
|
||||
pres Microsoft Graph API a uklada je do adresare
|
||||
/mnt/Emails/<schránka>/Attachments/.
|
||||
|
||||
Schránka se predava jako povinny parametr --mailbox.
|
||||
|
||||
Deduplikace podle SHA256 hashe obsahu:
|
||||
- stejny hash = soubor uz existuje -> preskoci
|
||||
- prvni vyskyt souboru: ulozi pod puvodnim nazvem
|
||||
- kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
|
||||
|
||||
Po ulozeni aktualizuje MongoDB:
|
||||
- v email dokumentu: kazda priloha dostane file_hash + local_path
|
||||
- kolekce emaily.attachments_index: _id=hash, filename, local_path, size_bytes,
|
||||
mime_type, mailbox, first_seen_at, ref_count
|
||||
|
||||
Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash
|
||||
se preskoci. --force-recheck znovu overi i uz stazene.
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
python download_attachments_v1.1.py --mailbox ordinace@buzalkova.cz
|
||||
python download_attachments_v1.1.py --mailbox vladimir.buzalka@buzalka.cz --limit 50
|
||||
python download_attachments_v1.1.py --mailbox ordinace@buzalkova.cz --force-recheck
|
||||
|
||||
Docker:
|
||||
docker exec -it python-runner python /scripts/download_attachments_v1.1.py \\
|
||||
--mailbox ordinace@buzalkova.cz
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo
|
||||
Python 3.10+
|
||||
|
||||
Struktura na disku:
|
||||
/mnt/Emails/
|
||||
└── <mailbox>/
|
||||
└── Attachments/
|
||||
├── faktura_2026.pdf
|
||||
├── vysledky_lab.pdf
|
||||
├── vysledky_lab_2.pdf
|
||||
└── ...
|
||||
|
||||
Kolekce emaily.attachments_index:
|
||||
_id SHA256 hash (hex)
|
||||
filename nazev souboru na disku
|
||||
local_path relativni cesta od Attachments/
|
||||
size_bytes velikost souboru
|
||||
mime_type MIME typ
|
||||
mailbox schránka ze ktere pochazi prvni vyskytu
|
||||
first_seen_at datetime UTC
|
||||
ref_count v kolika emailech se tato priloha vyskytuje
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze
|
||||
1.1 2026-06-02 Schránka jako parametr --mailbox (univerzalni pouziti)
|
||||
"""
|
||||
|
||||
import sys
|
||||
import hashlib
|
||||
import logging
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from pymongo import MongoClient, UpdateOne
|
||||
|
||||
# Force UTF-8 console output; characters the terminal cannot encode are
# replaced instead of raising UnicodeEncodeError.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

# ─── CONFIGURATION ────────────────────────────────────────────────────────────
import os  # needed only for the environment override of the client secret

GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
# SECURITY: a client secret committed to version control must be treated as
# leaked. Provide it through the GRAPH_CLIENT_SECRET environment variable; the
# literal below is kept only as a backward-compatible fallback and should be
# rotated and then removed.
GRAPH_CLIENT_SECRET = os.environ.get(
    "GRAPH_CLIENT_SECRET", "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
)
GRAPH_URL = "https://graph.microsoft.com/v1.0"

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
MONGO_COL_INDEX = "attachments_index"

EMAILS_BASE_DIR = Path("/mnt/Emails")
# NOTE(review): the log file name mentions "parse_emails" although this is the
# attachment downloader — presumably shared with another script; confirm.
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.1"
BATCH_SIZE = 50  # number of email updates buffered before one bulk_write
# ──────────────────────────────────────────────────────────────────────────────

# Only errors are written to the log file; progress goes to stdout via print().
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)

# Cached Graph bearer token; populated and refreshed by get_token().
_graph_token: Optional[str] = None
|
||||
|
||||
|
||||
# ─── Graph API ────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_token() -> str:
    """Acquire an app-only Graph access token and cache it module-wide.

    Uses the client-credentials flow with the configured tenant, client id
    and client secret.

    Returns:
        The access token, which is also stored in ``_graph_token``.

    Raises:
        RuntimeError: if the MSAL response carries no ``access_token``.
    """
    global _graph_token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    msal_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    result = msal_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )
    if "access_token" in result:
        _graph_token = result["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {result}")
|
||||
|
||||
|
||||
def graph_get_bytes(url: str) -> bytes:
    """Download the raw body of a Graph API resource.

    The cached bearer token is used (fetched first if absent). When Graph
    answers 401 the token is refreshed and the request is retried once.

    Args:
        url: Absolute Graph URL to GET.

    Returns:
        The complete response body.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts were answered with 401.
    """
    if not _graph_token:
        get_token()
    for _ in range(2):
        # No stream=True: the body is read in full anyway, and a streamed
        # response that is never consumed (the 401 retry path) would keep its
        # connection checked out of the pool.
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            timeout=120,
        )
        if r.status_code == 401:
            get_token()
            continue
        r.raise_for_status()
        return r.content
    raise RuntimeError(f"Graph GET bytes failed: {url}")
|
||||
|
||||
|
||||
def graph_get_json(url: str, params: dict = None) -> dict:
    """GET a Graph API URL and return the decoded JSON body.

    A 401 answer triggers one token refresh followed by one retry.

    Args:
        url: Absolute Graph URL.
        params: Optional query-string parameters.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts were answered with 401.
    """
    if not _graph_token:
        get_token()
    attempts_left = 2
    while attempts_left > 0:
        attempts_left -= 1
        auth_header = {"Authorization": f"Bearer {_graph_token}"}
        response = requests.get(url, headers=auth_header, params=params, timeout=30)
        if response.status_code != 401:
            response.raise_for_status()
            return response.json()
        # Token expired or rejected: refresh it and try again.
        get_token()
    raise RuntimeError(f"Graph GET json failed: {url}")
|
||||
|
||||
|
||||
def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
    """List attachment metadata (without content) for one message.

    Args:
        mailbox: Mailbox address used in the ``/users/{mailbox}`` path.
        graph_message_id: Graph id of the message.

    Returns:
        The ``value`` list of the Graph response, or an empty list when the
        request fails (the failure is written to the error log).
    """
    url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
    try:
        # contentId is not a property of the base attachment type; selecting
        # it makes Graph answer BadRequest, which this function would turn
        # into an empty list — so it must stay out of $select.
        data = graph_get_json(url, {"$select": "id,name,contentType,size,isInline"})
        return data.get("value", [])
    except Exception as e:
        # Best effort: the caller treats an empty list as "nothing found".
        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e)
        return []
|
||||
|
||||
|
||||
def fetch_attachment_content(mailbox: str, graph_message_id: str, attachment_id: str) -> Optional[bytes]:
    """Download the raw bytes of one attachment via its ``$value`` endpoint.

    Returns:
        The attachment content, or ``None`` when the download fails (the
        failure is written to the error log).
    """
    message_url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}"
    value_url = f"{message_url}/attachments/{attachment_id}/$value"
    try:
        payload = graph_get_bytes(value_url)
    except Exception as err:
        logging.error(
            "fetch_attachment_content failed [msg=%s att=%s]: %s",
            graph_message_id,
            attachment_id,
            err,
        )
        return None
    return payload
|
||||
|
||||
|
||||
# ─── Dedup + ukládání ─────────────────────────────────────────────────────────
|
||||
|
||||
def sha256(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def safe_filename(name: str) -> str:
    """Turn an attachment name into a name that is safe to create on disk.

    Every character that is not alphanumeric or one of ``._- `` is replaced
    by ``_`` and surrounding whitespace is stripped.

    Args:
        name: Attachment name as stored with the email (untrusted input).

    Returns:
        The sanitised name, or ``"attachment"`` when nothing usable remains.
    """
    safe = "".join(c if c.isalnum() or c in "._- " else "_" for c in name).strip()
    # "." and ".." consist only of allowed characters but name directories;
    # writing to att_dir / ".." would fail, so fall back to the default name.
    if safe in ("", ".", ".."):
        return "attachment"
    return safe
|
||||
|
||||
|
||||
def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
    """Choose the on-disk name for an attachment, resolving name collisions.

    A collision is an index entry with the same filename but a different hash;
    it is resolved by appending ``_2``, ``_3``, ... to the stem.

    Args:
        desired_name: Preferred (already sanitised) filename.
        att_dir: Directory the file will be written to.
        hash_val: SHA-256 hex digest of the content to be stored.
        col_index: Index collection, queried with ``find_one({"filename": ...})``.

    Returns:
        The filename to use.
    """
    indexed = col_index.find_one({"filename": desired_name})
    # Name unknown to the index, or already bound to this very content.
    if not indexed or indexed["_id"] == hash_val:
        return desired_name

    wanted = Path(desired_name)
    base, ext = wanted.stem, wanted.suffix
    counter = 1
    while True:
        counter += 1
        candidate = f"{base}_{counter}{ext}"
        holder = col_index.find_one({"filename": candidate})
        if holder:
            if holder["_id"] == hash_val:
                # Candidate is already registered for the same content.
                return candidate
            continue
        # Not indexed: usable only if no stray file occupies the name.
        if not (att_dir / candidate).exists():
            return candidate
|
||||
|
||||
|
||||
def save_attachment(
    content: bytes,
    original_name: str,
    mime_type: str,
    mailbox: str,
    att_dir: Path,
    col_index,
) -> tuple[str, str, bool]:
    """Store an attachment on disk, deduplicated by content hash.

    If the hash is already present in the index, only its ``ref_count`` is
    incremented and nothing is written to disk. Otherwise the file is written
    to *att_dir* and a new index document is inserted.

    Returns:
        ``(hash, local_path, was_new)`` — ``was_new`` is False on a dedup hit.
    """
    digest = sha256(content)

    known = col_index.find_one({"_id": digest})
    if known:
        # NOTE(review): the index is not keyed by mailbox, so the returned
        # path may refer to a file stored under another mailbox's directory;
        # confirm this is intended.
        col_index.update_one({"_id": digest}, {"$inc": {"ref_count": 1}})
        return digest, known["local_path"], False

    stored_name = resolve_filename(safe_filename(original_name), att_dir, digest, col_index)
    (att_dir / stored_name).write_bytes(content)

    index_doc = {
        "_id": digest,
        "filename": stored_name,
        "local_path": stored_name,
        "size_bytes": len(content),
        "mime_type": mime_type,
        "mailbox": mailbox,
        # Stored as a naive datetime holding the UTC wall-clock time.
        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
        "ref_count": 1,
    }
    col_index.insert_one(index_doc)

    return digest, stored_name, True
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _match_graph_attachment(att_name: str, graph_att_map: dict) -> Optional[dict]:
    """Find the Graph attachment for a stored filename: exact name, then substring."""
    graph_att = graph_att_map.get(att_name)
    if graph_att or not att_name:
        # An empty name is a substring of every name, so the fuzzy pass below
        # would wrongly return the first attachment of the message.
        return graph_att
    wanted = att_name.lower()
    for gname, ga in graph_att_map.items():
        if gname and wanted in gname.lower():
            return ga
    return None


def _download_mailbox(args, mailbox: str, att_dir: Path, client, start: datetime) -> None:
    """Process all pending emails of one mailbox and print the final summary."""
    col_emails = client[MONGO_DB][mailbox]
    col_index = client[MONGO_DB][MONGO_COL_INDEX]

    if not args.no_indexes:
        col_index.create_index("filename")
        col_index.create_index("mime_type")
        col_index.create_index("mailbox")

    # Without --force-recheck only emails that still have at least one
    # non-inline attachment lacking a file_hash are selected.
    if args.force_recheck:
        query = {"has_attachments": True}
    else:
        query = {
            "has_attachments": True,
            "attachments": {
                "$elemMatch": {
                    "is_inline": False,
                    "file_hash": {"$exists": False},
                }
            }
        }

    total = col_emails.count_documents(query)
    print(f"\nEmailu ke zpracovani: {total}")
    if total == 0:
        print("Neni co stahnout.")
        return

    cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
    if args.limit:
        cursor = cursor.limit(args.limit)
    if args.limit > 0:
        # Keep the "i/total" progress display consistent with --limit.
        total = min(total, args.limit)

    ok_count = 0
    new_count = 0
    dup_count = 0
    err_count = 0
    batch = []

    def flush():
        """Write buffered email updates; failures are logged, not raised."""
        if not batch:
            return
        try:
            col_emails.bulk_write(batch, ordered=False)
        except Exception as e:
            logging.error("bulk_write: %s", e)
            print(f" CHYBA bulk_write: {e}")
        batch.clear()

    for email_i, email_doc in enumerate(cursor, start=1):
        email_id = email_doc["_id"]
        graph_id = email_doc.get("graph_id", "")
        subject = (email_doc.get("subject") or "")[:60]
        att_list = email_doc.get("attachments") or []

        # Only real (non-inline) attachments are downloaded.
        if all(a.get("is_inline", False) for a in att_list):
            continue

        print(f"\n {email_i:>5}/{total} {subject}")

        graph_atts = fetch_message_attachments(mailbox, graph_id)
        # NOTE: keyed by name, so several attachments sharing one name within
        # a message collapse to a single entry.
        graph_att_map = {a["name"]: a for a in graph_atts if not a.get("isInline", False)}

        updated_atts = list(att_list)
        email_ok = True

        for i, att in enumerate(updated_atts):
            if att.get("is_inline", False):
                continue
            if not args.force_recheck and att.get("file_hash"):
                print(f" SKIP {att.get('filename', '')}")
                continue

            att_name = att.get("filename", "")
            graph_att = _match_graph_attachment(att_name, graph_att_map)
            if not graph_att:
                logging.error("attachment not found in Graph [email=%s att=%s]", email_id, att_name)
                print(f" ERR {att_name} (nenalezeno v Graph)")
                err_count += 1
                email_ok = False
                continue

            content = fetch_attachment_content(mailbox, graph_id, graph_att["id"])
            if content is None:
                err_count += 1
                email_ok = False
                print(f" ERR {att_name} (stazeni selhalo)")
                continue

            mime_type = att.get("mime_type") or graph_att.get("contentType", "")
            hash_val, local_path, was_new = save_attachment(
                content, att_name, mime_type, mailbox, att_dir, col_index
            )

            updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}

            if was_new:
                new_count += 1
                print(f" NEW {local_path} ({len(content):,} B)")
            else:
                dup_count += 1
                print(f" DUP {att_name} -> {local_path}")

        if email_ok:
            ok_count += 1

        # Partial results are persisted too, so a rerun only retries failures.
        batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))

        if len(batch) >= BATCH_SIZE:
            flush()

        if email_i % 100 == 0:
            print(f" {'─'*60}")
            print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} err={err_count}")
            print(f" {'─'*60}")

    flush()

    elapsed_total = (datetime.now() - start).total_seconds()
    files_total = col_index.count_documents({})
    size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))

    print(f"\n{'='*52}")
    print(f"Vysledek: emaily={ok_count} | nove={new_count} | dup={dup_count} | err={err_count}")
    print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)")
    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if err_count:
        print(f"Chyby logovany do: {LOG_FILE}")


def main():
    """Command-line entry point: download and deduplicate one mailbox's attachments.

    Parses the arguments, creates the target directory, verifies Graph and
    MongoDB connectivity (exiting with status 1 if either fails) and then
    processes the mailbox. The MongoDB client is closed on every exit path.
    """
    ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
    ap.add_argument("--mailbox", required=True,
                    help="Emailova schranka (napr. ordinace@buzalkova.cz)")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N emailu (0 = vse)")
    ap.add_argument("--force-recheck", action="store_true",
                    help="Znovu overi i emaily kde prilohy uz maji file_hash")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na attachments_index kolekci")
    args = ap.parse_args()

    mailbox = args.mailbox
    att_dir = EMAILS_BASE_DIR / mailbox / "Attachments"
    mongo_col = mailbox  # the email collection is named after the mailbox

    start = datetime.now()
    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Schránka: {mailbox}")
    print(f"Cilovy adresar: {att_dir}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}")

    att_dir.mkdir(parents=True, exist_ok=True)
    print(" Adresar OK")

    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        try:
            client.admin.command("ping")
            print(" MongoDB OK")
        except Exception as e:
            print(f" CHYBA: MongoDB neni dostupna -- {e}")
            sys.exit(1)

        _download_mailbox(args, mongo_col, att_dir, client, start)
    finally:
        # Release the connection even when processing raises or exits.
        client.close()
|
||||
|
||||
|
||||
# Script entry point: run the downloader only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,483 @@
|
||||
"""
|
||||
download_attachments_v1.3.py
|
||||
Nazev: download_attachments_v1.3.py
|
||||
Verze: 1.3
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
|
||||
pres Microsoft Graph API a uklada je do adresare
|
||||
/mnt/Emails/<schránka>/Attachments/.
|
||||
|
||||
Schránka se predava jako povinny parametr --mailbox.
|
||||
|
||||
Deduplikace podle SHA256 hashe obsahu:
|
||||
- stejny hash = soubor uz existuje -> preskoci
|
||||
- prvni vyskyt souboru: ulozi pod puvodnim nazvem
|
||||
- kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
|
||||
|
||||
Po ulozeni aktualizuje MongoDB:
|
||||
- v email dokumentu: kazda priloha dostane file_hash + local_path
|
||||
- kolekce emaily.attachments_index: _id=hash, filename, local_path, size_bytes,
|
||||
mime_type, mailbox, first_seen_at, ref_count
|
||||
|
||||
Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash
|
||||
se preskoci. --force-recheck znovu overi i uz stazene.
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz
|
||||
python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50
|
||||
python download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz --force-recheck
|
||||
|
||||
Docker:
|
||||
docker exec -it python-runner python /scripts/download_attachments_v1.3.py \\
|
||||
--mailbox ordinace@buzalkova.cz
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo
|
||||
Python 3.10+
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze
|
||||
1.1 2026-06-02 Schránka jako parametr --mailbox
|
||||
1.2 2026-06-02 Oprava: Graph attachment mapa vcetne inline; normalizace nazvu;
|
||||
preskoceni S/MIME; inline z Graphu -> SKIP ne ERR
|
||||
1.3 2026-06-02 Primarni stazeni pres graph_att_id (prime ID bez name-matchingu);
|
||||
oprava $select na attachment listu (odstranen contentId ktery
|
||||
zpusoboval BadRequest a vracel prazdny seznam); name-matching
|
||||
zustava jako fallback pro stare emaily bez graph_att_id
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
import hashlib
|
||||
import logging
|
||||
import argparse
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from pymongo import MongoClient, UpdateOne
|
||||
|
||||
# Force UTF-8 console output; characters the terminal cannot encode are
# replaced instead of raising UnicodeEncodeError.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

# ─── CONFIGURATION ────────────────────────────────────────────────────────────
import os  # needed only for the environment override of the client secret

GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
# SECURITY: a client secret committed to version control must be treated as
# leaked. Provide it through the GRAPH_CLIENT_SECRET environment variable; the
# literal below is kept only as a backward-compatible fallback and should be
# rotated and then removed.
GRAPH_CLIENT_SECRET = os.environ.get(
    "GRAPH_CLIENT_SECRET", "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
)
GRAPH_URL = "https://graph.microsoft.com/v1.0"

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
MONGO_COL_INDEX = "attachments_index"

EMAILS_BASE_DIR = Path("/mnt/Emails")
# NOTE(review): the log file name mentions "parse_emails" although this is the
# attachment downloader — presumably shared with another script; confirm.
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.3"
BATCH_SIZE = 50  # number of email updates buffered before one bulk_write

# Attachment types that are skipped (S/MIME signatures, certificates).
SKIP_EXTENSIONS = {".p7m", ".p7s", ".p7c", ".p7b"}
# ──────────────────────────────────────────────────────────────────────────────

# Only errors are written to the log file; progress goes to stdout via print().
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)

# Cached Graph bearer token; populated and refreshed by get_token().
_graph_token: Optional[str] = None
|
||||
|
||||
|
||||
# ─── Graph API ────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_token() -> str:
    """Obtain an app-only Graph access token and remember it in ``_graph_token``.

    Returns:
        The freshly acquired access token.

    Raises:
        RuntimeError: if the MSAL response contains no ``access_token``.
    """
    global _graph_token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    confidential_client = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    result = confidential_client.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )
    if "access_token" in result:
        _graph_token = result["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {result}")
|
||||
|
||||
|
||||
def graph_get_bytes(url: str) -> bytes:
    """Download the raw body of a Graph API resource.

    The cached bearer token is used (fetched first if absent). When Graph
    answers 401 the token is refreshed and the request is retried once.

    Args:
        url: Absolute Graph URL to GET.

    Returns:
        The complete response body.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts were answered with 401.
    """
    if not _graph_token:
        get_token()
    for _ in range(2):
        # No stream=True: the body is read in full anyway, and a streamed
        # response that is never consumed (the 401 retry path) would keep its
        # connection checked out of the pool.
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            timeout=120,
        )
        if r.status_code == 401:
            get_token()
            continue
        r.raise_for_status()
        return r.content
    raise RuntimeError(f"Graph GET bytes failed: {url}")
|
||||
|
||||
|
||||
def graph_get_json(url: str, params: dict = None) -> dict:
    """Fetch a Graph API URL and decode the JSON response.

    One token refresh and one retry are performed when Graph answers 401.

    Args:
        url: Absolute Graph URL.
        params: Optional query-string parameters.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts were answered with 401.
    """
    if not _graph_token:
        get_token()
    tries_remaining = 2
    while tries_remaining > 0:
        tries_remaining -= 1
        bearer = {"Authorization": f"Bearer {_graph_token}"}
        reply = requests.get(url, headers=bearer, params=params, timeout=30)
        if reply.status_code != 401:
            reply.raise_for_status()
            return reply.json()
        # Unauthorized: renew the token before the next attempt.
        get_token()
    raise RuntimeError(f"Graph GET json failed: {url}")
|
||||
|
||||
|
||||
def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
    """Load the metadata of all attachments of a message (no contentBytes).

    Returns:
        The ``value`` list of the Graph response, or an empty list when the
        request fails (the failure is written to the error log).
    """
    listing_url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
    # Caution: contentId is NOT part of the base attachment type and must not
    # appear in $select.
    selection = {"$select": "id,name,contentType,size,isInline"}
    try:
        return graph_get_json(listing_url, selection).get("value", [])
    except Exception as err:
        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, err)
        return []
|
||||
|
||||
|
||||
def fetch_attachment_content(mailbox: str, graph_message_id: str, attachment_id: str) -> Optional[bytes]:
    """Download one attachment's raw bytes through its ``$value`` endpoint.

    Returns:
        The attachment content, or ``None`` when the download fails (the
        failure is written to the error log).
    """
    message_url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}"
    value_url = f"{message_url}/attachments/{attachment_id}/$value"
    try:
        payload = graph_get_bytes(value_url)
    except Exception as err:
        logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s",
                      graph_message_id, attachment_id, err)
        return None
    return payload
|
||||
|
||||
|
||||
# ─── Pomocné funkce ───────────────────────────────────────────────────────────
|
||||
|
||||
def normalize_name(name: str) -> str:
    """Normalise a filename for comparison.

    The name is lower-cased and stripped, combining marks (diacritics) are
    dropped after NFKD decomposition, and every character other than a word
    character, ``.`` or ``-`` becomes ``_``.
    """
    decomposed = unicodedata.normalize("NFKD", name.lower().strip())
    kept_chars = []
    for ch in decomposed:
        if unicodedata.combining(ch):
            continue  # drop the accent, keep the base letter
        kept_chars.append(ch)
    return re.sub(r"[^\w.\-]", "_", "".join(kept_chars))
|
||||
|
||||
|
||||
def find_graph_att(att_name: str, att_size: int, graph_atts: list[dict]) -> Optional[dict]:
    """Fallback lookup of an attachment in a Graph listing by name.

    Used for emails whose stored attachments carry no ``graph_att_id``.
    Matching tiers, in order:

    1. exact name,
    2. normalised name (see ``normalize_name``),
    3. the last 20 characters of the normalised name as a suffix.

    Within tiers 1 and 2, a candidate whose size is within 10 % of *att_size*
    is preferred when several attachments share the name; if none agrees on
    size, the first name match is returned. An unknown size (0 or missing) on
    either side counts as agreeing.

    Args:
        att_name: Filename stored with the email.
        att_size: Stored size in bytes; 0 or None when unknown.
        graph_atts: Attachment metadata dicts from Graph (``name``, ``size``).

    Returns:
        The matching Graph attachment dict, or ``None``.
    """
    wanted_size = att_size or 0

    def size_agrees(ga: dict) -> bool:
        ga_size = ga.get("size", 0) or 0
        if wanted_size == 0 or ga_size == 0:
            return True
        return abs(ga_size - wanted_size) / max(ga_size, wanted_size) < 0.1

    def pick(candidates: list[dict]) -> Optional[dict]:
        # Size only disambiguates; it never rejects the sole name match.
        for ga in candidates:
            if size_agrees(ga):
                return ga
        return candidates[0] if candidates else None

    # 1. Exact match
    hit = pick([ga for ga in graph_atts if ga.get("name") == att_name])
    if hit is not None:
        return hit

    norm_want = normalize_name(att_name)
    # A missing/null Graph name is treated as an empty string.
    normalized = [(normalize_name(ga.get("name") or ""), ga) for ga in graph_atts]

    # 2. Normalised match, size-aware
    hit = pick([ga for norm, ga in normalized if norm == norm_want])
    if hit is not None:
        return hit

    # 3. Partial suffix match (last 20 characters of the normalised name)
    tail = norm_want[-20:]
    if tail:
        for norm, ga in normalized:
            if norm.endswith(tail):
                return ga

    return None
|
||||
|
||||
|
||||
def sha256(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def safe_filename(name: str) -> str:
    """Turn an attachment name into a name that is safe to create on disk.

    Every character that is not alphanumeric or one of ``._- ()`` is replaced
    by ``_`` and surrounding whitespace is stripped.

    Args:
        name: Attachment name as stored with the email (untrusted input).

    Returns:
        The sanitised name, or ``"attachment"`` when nothing usable remains.
    """
    safe = "".join(c if c.isalnum() or c in "._- ()" else "_" for c in name).strip()
    # "." and ".." consist only of allowed characters but name directories;
    # writing to att_dir / ".." would fail, so fall back to the default name.
    if safe in ("", ".", ".."):
        return "attachment"
    return safe
|
||||
|
||||
|
||||
def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
    """Return the filename to store an attachment under, resolving collisions.

    A collision (same filename in the index, different hash) is resolved by
    appending ``_2``, ``_3``, ... to the stem until a name is found that is
    either indexed for the same hash, or neither indexed nor present on disk.
    """
    first_hit = col_index.find_one({"filename": desired_name})
    # Free name, or the name already belongs to this exact content.
    if not first_hit or first_hit["_id"] == hash_val:
        return desired_name

    parts = Path(desired_name)
    base, ext = parts.stem, parts.suffix
    number = 1
    while True:
        number += 1
        candidate = f"{base}_{number}{ext}"
        owner = col_index.find_one({"filename": candidate})
        if owner:
            if owner["_id"] == hash_val:
                return candidate
            continue
        # Unindexed candidate: accept it only if no file occupies the name.
        if not (att_dir / candidate).exists():
            return candidate
|
||||
|
||||
|
||||
def save_attachment(
    content: bytes,
    original_name: str,
    mime_type: str,
    mailbox: str,
    att_dir: Path,
    col_index,
) -> tuple[str, str, bool]:
    """Store an attachment on disk, deduplicated by content hash.

    On a dedup hit (hash already in the index) only ``ref_count`` is
    incremented. Otherwise the content is written to *att_dir* and a new
    index document is inserted.

    Returns:
        ``(hash, local_path, was_new)`` — ``was_new`` is False on a dedup hit.
    """
    digest = sha256(content)

    known = col_index.find_one({"_id": digest})
    if known:
        # NOTE(review): the index is not keyed by mailbox, so the returned
        # path may refer to a file stored under another mailbox's directory;
        # confirm this is intended.
        col_index.update_one({"_id": digest}, {"$inc": {"ref_count": 1}})
        return digest, known["local_path"], False

    stored_name = resolve_filename(safe_filename(original_name), att_dir, digest, col_index)
    target = att_dir / stored_name
    target.write_bytes(content)

    index_doc = {
        "_id": digest,
        "filename": stored_name,
        "local_path": stored_name,
        "size_bytes": len(content),
        "mime_type": mime_type,
        "mailbox": mailbox,
        # Stored as a naive datetime holding the UTC wall-clock time.
        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
        "ref_count": 1,
    }
    col_index.insert_one(index_doc)

    return digest, stored_name, True
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
|
||||
ap.add_argument("--mailbox", required=True,
|
||||
help="Emailova schranka (napr. ordinace@buzalkova.cz)")
|
||||
ap.add_argument("--limit", type=int, default=0,
|
||||
help="Zpracovat max N emailu (0 = vse)")
|
||||
ap.add_argument("--force-recheck", action="store_true",
|
||||
help="Znovu overi i emaily kde prilohy uz maji file_hash")
|
||||
ap.add_argument("--no-indexes", action="store_true",
|
||||
help="Nevytvorit indexy na attachments_index kolekci")
|
||||
args = ap.parse_args()
|
||||
|
||||
mailbox = args.mailbox
|
||||
att_dir = EMAILS_BASE_DIR / mailbox / "Attachments"
|
||||
mongo_col = mailbox
|
||||
|
||||
start = datetime.now()
|
||||
print(f"=== download_attachments v{SCRIPT_VERSION} ===")
|
||||
print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
print(f"Schránka: {mailbox}")
|
||||
print(f"Cilovy adresar: {att_dir}")
|
||||
print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}")
|
||||
|
||||
att_dir.mkdir(parents=True, exist_ok=True)
|
||||
print(" Adresar OK")
|
||||
|
||||
print("\nPřipojuji se k Graph API...")
|
||||
try:
|
||||
get_token()
|
||||
print(" Graph API OK")
|
||||
except Exception as e:
|
||||
print(f" CHYBA: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
|
||||
try:
|
||||
client.admin.command("ping")
|
||||
print(" MongoDB OK")
|
||||
except Exception as e:
|
||||
print(f" CHYBA: MongoDB neni dostupna -- {e}")
|
||||
sys.exit(1)
|
||||
|
||||
col_emails = client[MONGO_DB][mongo_col]
|
||||
col_index = client[MONGO_DB][MONGO_COL_INDEX]
|
||||
|
||||
if not args.no_indexes:
|
||||
col_index.create_index("filename")
|
||||
col_index.create_index("mime_type")
|
||||
col_index.create_index("mailbox")
|
||||
|
||||
if args.force_recheck:
|
||||
query = {"has_attachments": True}
|
||||
else:
|
||||
query = {
|
||||
"has_attachments": True,
|
||||
"attachments": {
|
||||
"$elemMatch": {
|
||||
"is_inline": False,
|
||||
"file_hash": {"$exists": False},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
total = col_emails.count_documents(query)
|
||||
print(f"\nEmailu ke zpracovani: {total}")
|
||||
if total == 0:
|
||||
print("Neni co stahnout.")
|
||||
client.close()
|
||||
return
|
||||
|
||||
cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
|
||||
if args.limit:
|
||||
cursor = cursor.limit(args.limit)
|
||||
|
||||
ok_count = 0
|
||||
new_count = 0
|
||||
dup_count = 0
|
||||
skip_count = 0
|
||||
err_count = 0
|
||||
email_i = 0
|
||||
batch = []
|
||||
|
||||
def flush():
|
||||
if not batch:
|
||||
return
|
||||
try:
|
||||
col_emails.bulk_write(batch, ordered=False)
|
||||
except Exception as e:
|
||||
logging.error("bulk_write: %s", e)
|
||||
print(f" CHYBA bulk_write: {e}")
|
||||
batch.clear()
|
||||
|
||||
for email_doc in cursor:
|
||||
email_i += 1
|
||||
email_id = email_doc["_id"]
|
||||
graph_id = email_doc.get("graph_id", "")
|
||||
subject = (email_doc.get("subject") or "")[:60]
|
||||
att_list = email_doc.get("attachments") or []
|
||||
|
||||
real_atts = [a for a in att_list if not a.get("is_inline", False)]
|
||||
if not real_atts:
|
||||
continue
|
||||
|
||||
print(f"\n {email_i:>5}/{total} {subject}")
|
||||
|
||||
# Nacti attachment list z Graphu jen pokud nektere prilohy nemaji graph_att_id
|
||||
need_listing = any(
|
||||
not a.get("is_inline", False)
|
||||
and not (not args.force_recheck and a.get("file_hash"))
|
||||
and not a.get("graph_att_id")
|
||||
for a in att_list
|
||||
)
|
||||
graph_atts = fetch_message_attachments(mailbox, graph_id) if need_listing else []
|
||||
|
||||
updated_atts = list(att_list)
|
||||
email_ok = True
|
||||
|
||||
for i, att in enumerate(updated_atts):
|
||||
if att.get("is_inline", False):
|
||||
continue
|
||||
if not args.force_recheck and att.get("file_hash"):
|
||||
continue
|
||||
|
||||
att_name = att.get("filename", "")
|
||||
att_size = att.get("size_bytes", 0)
|
||||
graph_att_id = att.get("graph_att_id")
|
||||
|
||||
# Preskoc S/MIME podpisy
|
||||
if Path(att_name).suffix.lower() in SKIP_EXTENSIONS:
|
||||
updated_atts[i] = {**att, "file_hash": "skip", "local_path": ""}
|
||||
skip_count += 1
|
||||
print(f" SKIP {att_name} (S/MIME)")
|
||||
continue
|
||||
|
||||
# Primy pristup pres graph_att_id (emaily parsovane v1.2+)
|
||||
if graph_att_id:
|
||||
content = fetch_attachment_content(mailbox, graph_id, graph_att_id)
|
||||
if content is None:
|
||||
err_count += 1
|
||||
email_ok = False
|
||||
print(f" ERR {att_name} (stazeni selhalo)")
|
||||
continue
|
||||
# Zkontroluj zda jde skutecne o inline (pro edge case)
|
||||
mime_type = att.get("mime_type", "")
|
||||
else:
|
||||
# Fallback: name matching pro stare emaily (parsovane pred v1.2)
|
||||
graph_att = find_graph_att(att_name, att_size, graph_atts)
|
||||
|
||||
if not graph_att:
|
||||
logging.error("attachment not found [email=%s att=%s]", email_id, att_name)
|
||||
print(f" ERR {att_name} (nenalezeno)")
|
||||
err_count += 1
|
||||
email_ok = False
|
||||
continue
|
||||
|
||||
# Pokud Graph rika ze je inline — preskoc
|
||||
if graph_att.get("isInline", False):
|
||||
updated_atts[i] = {**att, "is_inline": True, "file_hash": "skip", "local_path": ""}
|
||||
skip_count += 1
|
||||
print(f" SKIP {att_name} (inline obrazek)")
|
||||
continue
|
||||
|
||||
content = fetch_attachment_content(mailbox, graph_id, graph_att["id"])
|
||||
if content is None:
|
||||
err_count += 1
|
||||
email_ok = False
|
||||
print(f" ERR {att_name} (stazeni selhalo)")
|
||||
continue
|
||||
|
||||
mime_type = att.get("mime_type") or graph_att.get("contentType", "")
|
||||
|
||||
hash_val, local_path, was_new = save_attachment(
|
||||
content, att_name, mime_type, mailbox, att_dir, col_index
|
||||
)
|
||||
|
||||
updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}
|
||||
|
||||
if was_new:
|
||||
new_count += 1
|
||||
print(f" NEW {local_path} ({len(content):,} B)")
|
||||
else:
|
||||
dup_count += 1
|
||||
print(f" DUP {att_name} -> {local_path}")
|
||||
|
||||
if email_ok:
|
||||
ok_count += 1
|
||||
|
||||
batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))
|
||||
|
||||
if len(batch) >= BATCH_SIZE:
|
||||
flush()
|
||||
|
||||
if email_i % 100 == 0:
|
||||
elapsed = (datetime.now() - start).total_seconds()
|
||||
print(f" {'─'*60}")
|
||||
print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} skip={skip_count} err={err_count}")
|
||||
print(f" {'─'*60}")
|
||||
|
||||
flush()
|
||||
|
||||
elapsed_total = (datetime.now() - start).total_seconds()
|
||||
files_total = col_index.count_documents({})
|
||||
size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))
|
||||
|
||||
print(f"\n{'='*52}")
|
||||
print(f"Vysledek: emaily={ok_count} | nove={new_count} | dup={dup_count} | skip={skip_count} | err={err_count}")
|
||||
print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)")
|
||||
print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
|
||||
print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
if err_count:
|
||||
print(f"Chyby logovany do: {LOG_FILE}")
|
||||
|
||||
client.close()
|
||||
|
||||
|
||||
# Script entry point: run main() only when the file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,560 @@
|
||||
"""
|
||||
parse_emails_graph_v1.0.py
|
||||
Nazev: parse_emails_graph_v1.0.py
|
||||
Verze: 1.0
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Cte vsechny emaily ze schranky ordinace@buzalkova.cz primo pres
|
||||
Microsoft Graph API a importuje je jako dokumenty do MongoDB.
|
||||
Z kazde zpravy extrahuje vsechny dostupne vlastnosti:
|
||||
|
||||
- predmet, odesilatel, prijemci (To/CC/BCC s typy)
|
||||
- cas doruceni, odeslani, vytvoreni, modifikace (UTC)
|
||||
- telo HTML (max 2 MB) + textovy preview
|
||||
- prilohy (metadata: jmeno, velikost, MIME typ, inline flag)
|
||||
- internet headers (SPF, DKIM, Received, X-*, ...)
|
||||
- MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno,
|
||||
kategorie, In-Reply-To, References, ...
|
||||
- navic: isRead, isDraft, folder_path, inferenceClassification
|
||||
|
||||
Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted,
|
||||
archivni slozky, ...).
|
||||
|
||||
DB: emaily
|
||||
Kolekce: ordinace@buzalkova.cz
|
||||
_id: Internet Message-ID (nebo "graphid:<id>" jako fallback)
|
||||
|
||||
Bezpecne prerusit a opakovat:
|
||||
- upsert podle _id — duplicity se automaticky prepisi
|
||||
- --skip-existing nacte seznam hotovych _id z MongoDB a preskoci je
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
python parse_emails_graph_v1.0.py # kompletni import
|
||||
python parse_emails_graph_v1.0.py --limit 50 # test na prvnich 50
|
||||
python parse_emails_graph_v1.0.py --skip-existing # pokracovani po preruseni
|
||||
python parse_emails_graph_v1.0.py --folder Inbox # jen jedna slozka
|
||||
python parse_emails_graph_v1.0.py --no-indexes # bez indexu na konci
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo, python-dateutil
|
||||
Python 3.10+
|
||||
|
||||
Struktura dokumentu v MongoDB:
|
||||
_id Internet Message-ID (nebo graphid: fallback)
|
||||
graph_id Graph API message ID (pro pripadne dalsi operace)
|
||||
subject predmet zpravy
|
||||
normalized_subject predmet bez RE:/FW:/AW: prefixu
|
||||
importance 0=nizka 1=normalni 2=vysoka
|
||||
flag_status 0=bez priznaku 1=oznaceno 2=dokonceno
|
||||
is_read bool — aktualni stav precteni ve schrance
|
||||
is_draft bool
|
||||
has_attachments bool
|
||||
attachment_count int
|
||||
inference_classification focused / other (Outlook AI trideni)
|
||||
categories [str]
|
||||
conversation_id Graph conversationId
|
||||
conversation_index base64 conversationIndex
|
||||
conversation_topic tema vlakna (z internet headers Thread-Topic)
|
||||
in_reply_to Message-ID predchozi zpravy
|
||||
internet_references [Message-ID] — cela historie vlakna
|
||||
received_at datetime UTC
|
||||
sent_at datetime UTC
|
||||
created_at datetime UTC — cas vytvoreni zaznamu v M365
|
||||
modified_at datetime UTC — cas posledni modifikace
|
||||
folder_id Graph parentFolderId
|
||||
folder_path cela cesta slozky (napr. Inbox/Subfolder)
|
||||
sender.email emailova adresa odesilatele
|
||||
sender.name zobrazovane jmeno odesilatele
|
||||
to retezec To (joined)
|
||||
cc retezec CC
|
||||
bcc retezec BCC
|
||||
recipients [{type, email, name}] — to/cc/bcc s typy
|
||||
body_html HTML telo (max 2 MB)
|
||||
body_preview textovy nahled (max 255 znaku z Graph)
|
||||
attachments [{filename, size_bytes, mime_type,
|
||||
content_id, is_inline}]
|
||||
headers dict internet headers (lowercase_s_podtrzitky)
|
||||
parsed_at datetime UTC — cas parsovani
|
||||
|
||||
Indexy:
|
||||
received_at, sent_at, sender.email, graph_id (unique),
|
||||
conversation_id, folder_path, has_attachments, categories,
|
||||
importance, flag_status, is_read,
|
||||
text_search (subject + body_preview + to + cc)
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze — Graph API jako zdroj
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
import logging
|
||||
import argparse
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from dateutil import parser as dtparser
|
||||
from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT
|
||||
|
||||
# Switch stdout to UTF-8 when the stream supports reconfigure(), replacing
# unencodable characters, so the non-ASCII status messages can be printed.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

# ─── CONFIGURATION ────────────────────────────────────────────────────────────
# Azure AD application credentials used by get_token() for the
# client-credentials flow.
# SECURITY NOTE(review): the client secret below is committed in plain text.
# It should be rotated and loaded from the environment or a secret store.
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
# Mailbox that is read; it is interpolated into every Graph URL.
GRAPH_MAILBOX = "ordinace@buzalkova.cz"
GRAPH_URL = "https://graph.microsoft.com/v1.0"

# Target MongoDB deployment, database and collection for the imported messages.
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
MONGO_COL = "ordinace@buzalkova.cz"
# Number of queued upserts that triggers a bulk_write in main().
BATCH_SIZE = 100
# Page size ($top) requested per Graph call in iter_folder_messages().
PAGE_SIZE = 50
# Error log file, located next to this script.
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.0"
# ──────────────────────────────────────────────────────────────────────────────

# Only ERROR-level records are written to the log file.
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)

# Graph string values mapped to the integer codes stored in MongoDB.
IMPORTANCE_MAP = {"low": 0, "normal": 1, "high": 2}
FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2}
# Matches one leading reply/forward prefix (RE:, FW:, AW:, ...) followed by
# colons and/or whitespace; normalize_subject() applies it repeatedly.
RE_SUBJECT = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE)

# Message properties requested from Graph ($select) for every message.
MSG_SELECT = (
    "id,internetMessageId,subject,bodyPreview,body,"
    "importance,isRead,isDraft,hasAttachments,"
    "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
    "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
    "conversationId,conversationIndex,parentFolderId,"
    "categories,flag,inferenceClassification,internetMessageHeaders"
)
|
||||
|
||||
|
||||
# ─── Graph API helpers ────────────────────────────────────────────────────────
|
||||
|
||||
# Most recently acquired Graph access token; None until get_token() has run.
_graph_token: Optional[str] = None


def get_token() -> str:
    """Acquire an app-only Graph access token and cache it in ``_graph_token``.

    Uses the MSAL client-credentials flow with the configured tenant,
    client id and client secret.

    Returns:
        The access token string.

    Raises:
        RuntimeError: if the MSAL response contains no ``access_token``.
    """
    global _graph_token

    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    client_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    token_response = client_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )

    if "access_token" in token_response:
        _graph_token = token_response["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {token_response}")
|
||||
|
||||
|
||||
def graph_get(url: str, params: Optional[dict] = None) -> dict:
    """Perform an authenticated GET against Microsoft Graph and return the JSON body.

    Args:
        url: Absolute Graph URL (an ``@odata.nextLink`` may be passed as-is).
        params: Optional query parameters; pass ``None`` for nextLink URLs.

    Returns:
        The decoded JSON response.

    Raises:
        RuntimeError: if the request is still rejected with 401 after the
            token has been refreshed once.
        requests.HTTPError: for any other non-success status, including
            throttling responses once the retry budget is exhausted.
    """
    # Function-scope import: ``time`` is not imported at file level.
    import time

    if not _graph_token:
        get_token()

    max_throttle_retries = 5
    throttle_retries = 0
    token_refreshed = False

    while True:
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=30,
        )

        if r.status_code == 401:
            # The token has probably expired: refresh once, then give up.
            if token_refreshed:
                raise RuntimeError(f"Graph GET failed after retry: {url}")
            token_refreshed = True
            get_token()
            continue

        if r.status_code in (429, 503, 504) and throttle_retries < max_throttle_retries:
            # Throttled or temporarily unavailable: wait as long as the server
            # asks (Retry-After, in seconds), else back off exponentially.
            # Without this a single throttled page aborts the whole crawl.
            throttle_retries += 1
            try:
                delay = int(r.headers.get("Retry-After", ""))
            except (TypeError, ValueError):
                delay = 2 ** throttle_retries
            time.sleep(min(max(delay, 1), 120))
            continue

        r.raise_for_status()
        return r.json()
|
||||
|
||||
|
||||
def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]:
    """Recursively list all mail folders of the mailbox.

    Args:
        parent_id: Graph id of the folder whose children are listed;
            ``None`` lists the top-level folders.
        parent_path: Path of the parent folder, used as prefix for child paths.

    Returns:
        A list of ``{"id", "path"}`` dicts; each folder is followed directly
        by its descendants.
    """
    root_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders"
    next_url = root_url if parent_id is None else f"{root_url}/{parent_id}/childFolders"

    query = {"$top": 100, "$select": "id,displayName,childFolderCount"}
    collected = []
    while next_url:
        page = graph_get(next_url, query)
        for entry in page.get("value", []):
            full_path = f"{parent_path}/{entry['displayName']}".lstrip("/")
            collected.append({"id": entry["id"], "path": full_path})
            if entry.get("childFolderCount", 0) > 0:
                collected.extend(get_all_folders(entry["id"], full_path))
        next_url = page.get("@odata.nextLink")
        # The nextLink already carries the query string.
        query = None
    return collected
|
||||
|
||||
|
||||
def iter_folder_messages(folder_id: str):
    """Yield every message of one folder, following Graph paging links.

    Args:
        folder_id: Graph id of the mail folder to read.

    Yields:
        Message dicts as returned by Graph, with attachments expanded.
    """
    next_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages"
    # NOTE(review): attachments are expanded without a nested $select, so the
    # full attachment objects are presumably returned — confirm payload size.
    query = {"$top": PAGE_SIZE, "$select": MSG_SELECT, "$expand": "attachments"}
    while next_url:
        page = graph_get(next_url, query)
        yield from page.get("value", [])
        next_url = page.get("@odata.nextLink")
        # The nextLink already carries the query string.
        query = None
|
||||
|
||||
|
||||
# ─── Pomocné funkce ───────────────────────────────────────────────────────────
|
||||
|
||||
def parse_date(raw) -> Optional[datetime]:
    """Convert a datetime or date string to a naive datetime in UTC.

    Args:
        raw: ``None``, a ``datetime``, or any value whose ``str()`` form can
            be parsed by ``dateutil``.

    Returns:
        A naive ``datetime``; timezone-aware values are converted to UTC
        first, naive values are returned unchanged. ``None`` when ``raw`` is
        ``None`` or parsing fails.
    """
    def _to_naive_utc(moment):
        # Naive values pass through; aware ones are shifted to UTC and stripped.
        if not moment.tzinfo:
            return moment
        return moment.astimezone(timezone.utc).replace(tzinfo=None)

    if raw is None:
        return None
    if isinstance(raw, datetime):
        return _to_naive_utc(raw)
    try:
        return _to_naive_utc(dtparser.parse(str(raw)))
    except Exception:
        return None
|
||||
|
||||
|
||||
def normalize_subject(subject: str) -> str:
    """Strip all leading reply/forward prefixes from a subject line.

    Prefixes matched by ``RE_SUBJECT`` are removed repeatedly, trimming
    surrounding whitespace after each removal.

    Args:
        subject: The raw subject.

    Returns:
        The subject without leading prefixes.
    """
    remainder = subject.strip()
    while (prefix := RE_SUBJECT.match(remainder)):
        remainder = remainder[prefix.end():].strip()
    return remainder
|
||||
|
||||
|
||||
def parse_headers(raw_headers: list) -> dict:
    """Turn a list of ``{"name", "value"}`` headers into a dict.

    Header names are lower-cased and ``-`` is replaced by ``_``. A header
    occurring once maps to its value; a repeated header maps to a list of
    its values in order of appearance.

    Args:
        raw_headers: List of header dicts with ``name`` and ``value`` keys.

    Returns:
        The normalised header dict.
    """
    collected = {}
    for header in raw_headers:
        key = header["name"].lower().replace("-", "_")
        value = header["value"]
        if key not in collected:
            collected[key] = value
            continue
        previous = collected[key]
        if isinstance(previous, list):
            previous.append(value)
        else:
            # Second occurrence: promote the scalar to a list.
            collected[key] = [previous, value]
    return collected
|
||||
|
||||
|
||||
def format_recipients(lst: list) -> str:
    """Join Graph recipient entries into one ``Name <address>; ...`` string.

    Args:
        lst: Recipient dicts, each holding an ``emailAddress`` dict.

    Returns:
        The entries joined by ``"; "``; missing name or address is rendered
        as an empty string and each entry is stripped of outer whitespace.
    """
    formatted = []
    for entry in lst:
        mailbox = entry["emailAddress"]
        display = mailbox.get("name", "")
        address = mailbox.get("address", "")
        formatted.append(f"{display} <{address}>".strip())
    return "; ".join(formatted)
|
||||
|
||||
|
||||
# ─── Hlavní extrakce ─────────────────────────────────────────────────────────
|
||||
|
||||
def _recipient_entries(kind: str, entries: list) -> list:
    """Build the typed ``{type, email, name}`` records for one recipient list."""
    return [
        {
            "type": kind,
            "email": r["emailAddress"].get("address", ""),
            "name": r["emailAddress"].get("name", ""),
        }
        for r in entries
    ]


def extract_message(msg: dict, folder_path: str) -> Optional[dict]:
    """Convert one Graph message into the MongoDB document layout.

    Args:
        msg: Message dict as returned by Graph.
        folder_path: Path of the folder the message was read from.

    Returns:
        The document to upsert, or ``None`` when extraction fails; the
        failure is written to the error log.
    """
    try:
        # _id: Internet Message-ID, falling back to the Graph id.
        mid = (msg.get("internetMessageId") or "").strip()
        if not mid:
            mid = f"graphid:{msg['id']}"

        subject = msg.get("subject") or ""
        norm_subject = normalize_subject(subject)

        # Body. ``or {}`` also covers an explicit JSON null, which
        # ``dict.get(key, default)`` would pass through as None.
        max_html_chars = 2 * 1024 * 1024
        body_html = None
        body_preview = msg.get("bodyPreview") or ""
        body = msg.get("body") or {}
        content_type = body.get("contentType")
        if content_type == "html":
            body_html = (body.get("content") or "")[:max_html_chars]
        elif content_type == "text":
            # Plain-text messages: the body itself replaces the Graph preview.
            body_preview = (body.get("content") or "")[:2000]

        # Sender: prefer "from", fall back to "sender".
        sender_ea = (msg.get("from") or msg.get("sender") or {}).get("emailAddress") or {}
        sender_email = sender_ea.get("address", "")
        sender_name = sender_ea.get("name", "")

        # Recipients (null-safe: a null list is treated as empty).
        to_list = msg.get("toRecipients") or []
        cc_list = msg.get("ccRecipients") or []
        bcc_list = msg.get("bccRecipients") or []
        recipients = (
            _recipient_entries("to", to_list)
            + _recipient_entries("cc", cc_list)
            + _recipient_entries("bcc", bcc_list)
        )

        # Flags mapped to integer codes; unknown values use the default code.
        importance = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1)
        flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0)

        # Internet headers
        raw_headers = msg.get("internetMessageHeaders") or []
        headers = parse_headers(raw_headers)

        # Repeated headers arrive as lists: take the first / join them.
        in_reply_to = headers.get("in_reply_to", "")
        if isinstance(in_reply_to, list):
            in_reply_to = in_reply_to[0]

        refs_raw = headers.get("references", "")
        if isinstance(refs_raw, list):
            refs_raw = " ".join(refs_raw)
        internet_refs = [r.strip() for r in refs_raw.split() if r.strip()] if refs_raw else []

        conv_topic = headers.get("thread_topic", "")
        if isinstance(conv_topic, list):
            conv_topic = conv_topic[0]

        # Conversation index: re-encode as canonical base64, keep the raw
        # value when it cannot be decoded.
        conv_index = ""
        ci_raw = msg.get("conversationIndex")
        if ci_raw:
            try:
                conv_index = base64.b64encode(base64.b64decode(ci_raw)).decode()
            except Exception:
                conv_index = ci_raw

        # Attachments: metadata only; unnamed attachments are skipped.
        attachments = []
        for att in msg.get("attachments") or []:
            fname = att.get("name") or ""
            if not fname:
                continue
            attachments.append({
                "filename": fname,
                "size_bytes": att.get("size", 0),
                "mime_type": att.get("contentType", "application/octet-stream"),
                "content_id": att.get("contentId"),
                "is_inline": att.get("isInline", False),
            })

        return {
            "_id": mid,
            "graph_id": msg["id"],

            "subject": subject,
            "normalized_subject": norm_subject,
            "importance": importance,
            "flag_status": flag_status,
            "is_read": msg.get("isRead", False),
            "is_draft": msg.get("isDraft", False),
            "has_attachments": msg.get("hasAttachments", False),
            "attachment_count": len(attachments),
            "inference_classification": msg.get("inferenceClassification", ""),
            "categories": msg.get("categories") or [],

            "conversation_id": msg.get("conversationId", ""),
            "conversation_index": conv_index,
            "conversation_topic": conv_topic,
            "in_reply_to": in_reply_to,
            "internet_references": internet_refs,

            "received_at": parse_date(msg.get("receivedDateTime")),
            "sent_at": parse_date(msg.get("sentDateTime")),
            "created_at": parse_date(msg.get("createdDateTime")),
            "modified_at": parse_date(msg.get("lastModifiedDateTime")),

            "folder_id": msg.get("parentFolderId", ""),
            "folder_path": folder_path,

            "sender": {
                "email": sender_email,
                "name": sender_name,
            },
            "to": format_recipients(to_list),
            "cc": format_recipients(cc_list),
            "bcc": format_recipients(bcc_list),
            "recipients": recipients,

            "body_html": body_html,
            "body_preview": body_preview,

            "attachments": attachments,
            "headers": headers,

            "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
        }

    except Exception as e:
        # Best effort: one malformed message must not stop the import.
        logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e)
        return None
|
||||
|
||||
|
||||
# ─── MongoDB indexy ───────────────────────────────────────────────────────────
|
||||
|
||||
def create_indexes(col):
    """Create the single-field indexes and the text index on the collection.

    Args:
        col: The PyMongo collection holding the imported messages.
    """
    print(" Vytvarim indexy...")

    # (field, extra create_index options), in creation order.
    single_field_indexes = [
        ("received_at", {}),
        ("sent_at", {}),
        ("sender.email", {}),
        ("graph_id", {"unique": True, "sparse": True}),
        ("conversation_id", {}),
        ("folder_path", {}),
        ("has_attachments", {}),
        ("categories", {}),
        ("importance", {}),
        ("flag_status", {}),
        ("is_read", {}),
    ]
    for field, options in single_field_indexes:
        col.create_index([(field, ASCENDING)], **options)

    text_fields = ("subject", "body_preview", "to", "cc")
    col.create_index(
        [(field, TEXT) for field in text_fields],
        name="text_search",
        default_language="none",
    )

    print(" Indexy hotovy.")
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Import all messages of the configured mailbox from Graph into MongoDB.

    Parses the command line, verifies the Graph and MongoDB connections,
    walks every (optionally filtered) mail folder, upserts the extracted
    documents in batches and finally creates the indexes unless
    ``--no-indexes`` is given. Exits with status 1 when Graph authentication
    or the MongoDB ping fails.
    """
    ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N zprav (0 = vse)")
    ap.add_argument("--skip-existing", action="store_true",
                    help="Preskocit zpravy ktere jiz jsou v MongoDB")
    ap.add_argument("--folder", default="",
                    help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na konci")
    args = ap.parse_args()

    start = datetime.now()
    print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Schránka: {GRAPH_MAILBOX}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{MONGO_COL}")

    # Graph token
    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    # MongoDB
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        print(" MongoDB OK")
    except Exception as e:
        print(f" CHYBA: MongoDB neni dostupna -- {e}")
        sys.exit(1)
    col = client[MONGO_DB][MONGO_COL]

    batch = []

    def flush():
        """Write the queued upserts; log a failure and drop the batch either way."""
        if not batch:
            return
        try:
            col.bulk_write(batch, ordered=False)
        except Exception as e:
            logging.error("bulk_write: %s", e)
            print(f" CHYBA bulk_write: {e}")
        batch.clear()

    try:
        # Skip existing: stream the ids through a cursor. distinct("_id")
        # returns them in a single reply capped at the 16 MB BSON limit.
        existing: set = set()
        if args.skip_existing:
            print(" Nacitam existujici zaznamy z MongoDB...")
            existing = {d["_id"] for d in col.find({}, {"_id": 1})}
            print(f" {len(existing)} jiz importovano")

        # Folders (--folder is a case-insensitive substring match on the path)
        print("\nNacitam seznam slozek...")
        all_folders = get_all_folders()
        if args.folder:
            all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()]
        print(f" Slozek ke zpracovani: {len(all_folders)}")
        for f in all_folders:
            print(f" {f['path']}")

        # Import
        ok_count = 0
        err_count = 0
        skip_count = 0
        total_i = 0

        print()
        for folder in all_folders:
            print(f"--- Složka: {folder['path']} ---")
            folder_count = 0

            for msg in iter_folder_messages(folder["id"]):
                if args.limit and total_i >= args.limit:
                    break

                mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"

                if mid in existing:
                    skip_count += 1
                    total_i += 1
                    continue

                doc = extract_message(msg, folder["path"])
                total_i += 1
                folder_count += 1

                if doc is None:
                    err_count += 1
                else:
                    batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
                    ok_count += 1

                if len(batch) >= BATCH_SIZE:
                    flush()

                status = "ERR " if doc is None else "OK "
                subject_str = (doc.get("subject") or "")[:60] if doc else "?"
                sender_str = (doc.get("sender", {}).get("email") or "")[:40] if doc else "?"
                print(f" {total_i:>6} {status} {subject_str:<60} {sender_str}")

                if total_i % 500 == 0:
                    elapsed = (datetime.now() - start).total_seconds()
                    rate = total_i / elapsed if elapsed > 0 else 0
                    print(f" {'─'*80}")
                    print(f" Průběh: ok={ok_count} skip={skip_count} err={err_count} {rate:.1f} msg/s")
                    print(f" {'─'*80}")

            flush()
            print(f" → {folder_count} zprav ze slozky {folder['path']}")

            if args.limit and total_i >= args.limit:
                break

        elapsed_total = (datetime.now() - start).total_seconds()
        print(f"\n{'='*52}")
        print(f"Vysledek: ok={ok_count} | skip={skip_count} | err={err_count}")
        print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
        print(f"Dokumentu v kolekci: {col.count_documents({})}")

        if not args.no_indexes:
            print()
            create_indexes(col)

        print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        if err_count:
            print(f"Chyby logovany do: {LOG_FILE}")
    finally:
        # If the crawl is interrupted or a Graph call fails, still persist
        # what was already extracted and release the connection.
        flush()
        client.close()
|
||||
|
||||
|
||||
# Script entry point: run main() only when the file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,611 @@
|
||||
"""
|
||||
parse_emails_graph_v1.3.py
|
||||
Nazev: parse_emails_graph_v1.3.py
|
||||
Verze: 1.3
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Cte vsechny emaily z libovolne schranky primo pres Microsoft Graph API
|
||||
a importuje je jako dokumenty do MongoDB.
|
||||
Z kazde zpravy extrahuje vsechny dostupne vlastnosti:
|
||||
|
||||
- predmet, odesilatel, prijemci (To/CC/BCC s typy)
|
||||
- cas doruceni, odeslani, vytvoreni, modifikace (UTC)
|
||||
- telo HTML (max 2 MB) + textovy preview
|
||||
- prilohy (metadata: jmeno, velikost, MIME typ, inline flag, graph_att_id)
|
||||
- internet headers (SPF, DKIM, Received, X-*, ...)
|
||||
- MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno,
|
||||
kategorie, In-Reply-To, References, ...
|
||||
- navic: isRead, isDraft, folder_path, inferenceClassification
|
||||
|
||||
Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted,
|
||||
archivni slozky, ...).
|
||||
|
||||
DB: emaily
|
||||
Kolekce: <mailbox> (napr. ordinace@buzalkova.cz)
|
||||
_id: Internet Message-ID (nebo "graphid:<id>" jako fallback)
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
# Prvni import (vsechno):
|
||||
python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz
|
||||
|
||||
# Test na prvnich 50:
|
||||
python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
|
||||
|
||||
# Jen jedna slozka:
|
||||
python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --folder Inbox
|
||||
|
||||
# Pokracovani po preruseni (pouze nove):
|
||||
python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --mode new-only
|
||||
|
||||
# Pravidelny sync (aktualizuje is_read, flag, slozku; importuje nove):
|
||||
python parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --mode sync
|
||||
|
||||
# Jina schranka:
|
||||
python parse_emails_graph_v1.3.py --mailbox vladimir.buzalka@buzalka.cz
|
||||
|
||||
Rezimy (--mode):
|
||||
full Plny upsert vsech poli pro kazdou zpravu (vychozi)
|
||||
new-only Preskoci zpravy ktere uz jsou v MongoDB, importuje jen nove
|
||||
sync Existujici: aktualizuje jen is_read/flag_status/categories/
|
||||
modified_at/folder_path. Nove zpravy importuje cely.
|
||||
Idealni pro pravidelne spousteni.
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo, python-dateutil
|
||||
Python 3.10+
|
||||
|
||||
Struktura dokumentu v MongoDB:
|
||||
_id Internet Message-ID (nebo graphid: fallback)
|
||||
graph_id Graph API message ID
|
||||
subject predmet zpravy
|
||||
normalized_subject predmet bez RE:/FW:/AW: prefixu
|
||||
importance 0=nizka 1=normalni 2=vysoka
|
||||
flag_status 0=bez priznaku 1=oznaceno 2=dokonceno
|
||||
is_read bool — aktualni stav precteni ve schrance
|
||||
is_draft bool
|
||||
has_attachments bool
|
||||
attachment_count int
|
||||
inference_classification focused / other
|
||||
categories [str]
|
||||
conversation_id Graph conversationId
|
||||
conversation_index base64 conversationIndex
|
||||
conversation_topic tema vlakna (z internet headers Thread-Topic)
|
||||
in_reply_to Message-ID predchozi zpravy
|
||||
internet_references [Message-ID]
|
||||
received_at datetime UTC
|
||||
sent_at datetime UTC
|
||||
created_at datetime UTC
|
||||
modified_at datetime UTC
|
||||
folder_id Graph parentFolderId
|
||||
folder_path cela cesta slozky (napr. Inbox/Subfolder)
|
||||
sender.email emailova adresa odesilatele
|
||||
sender.name zobrazovane jmeno
|
||||
to retezec To (joined)
|
||||
cc retezec CC
|
||||
bcc retezec BCC
|
||||
recipients [{type, email, name}]
|
||||
body_html HTML telo (max 2 MB)
|
||||
body_preview textovy nahled (max 255 znaku)
|
||||
attachments [{filename, size_bytes, mime_type, is_inline, graph_att_id}]
|
||||
headers dict internet headers
|
||||
parsed_at datetime UTC
|
||||
|
||||
Indexy:
|
||||
received_at, sent_at, sender.email, graph_id (unique),
|
||||
conversation_id, folder_path, has_attachments, categories,
|
||||
importance, flag_status, is_read,
|
||||
text_search (subject + body_preview + to + cc)
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze
|
||||
1.1 2026-06-02 Pridany rezimy --mode full/new-only/sync;
|
||||
odstranen --skip-existing (nahrazen --mode new-only)
|
||||
1.2 2026-06-02 $expand attachments s $select (bez contentBytes — rychlejsi);
|
||||
prilohy ukladaji graph_att_id pro prime stazeni bez name-matchingu
|
||||
1.3 2026-06-02 --mailbox jako povinny parametr — univerzalni pouziti pro
|
||||
libovolnou schranku; kolekce v MongoDB = nazev schranky
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
import logging
|
||||
import argparse
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from dateutil import parser as dtparser
|
||||
from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT
|
||||
|
||||
# Switch stdout to UTF-8 when the stream supports reconfigure(), replacing
# unencodable characters, so the non-ASCII status messages can be printed.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

# ─── CONFIGURATION ────────────────────────────────────────────────────────────
# Azure AD application credentials used by get_token() for the
# client-credentials flow.
# SECURITY NOTE(review): the client secret below is committed in plain text.
# It should be rotated and loaded from the environment or a secret store.
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
GRAPH_URL = "https://graph.microsoft.com/v1.0"

# Target MongoDB deployment and database.
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
BATCH_SIZE = 100
PAGE_SIZE = 50
# Error log file, located next to this script.
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.3"

# The mailbox is set at runtime from the --mailbox parameter; it is
# interpolated into the Graph URLs built by get_all_folders().
GRAPH_MAILBOX: str = ""
# ──────────────────────────────────────────────────────────────────────────────

# Only ERROR-level records are written to the log file.
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)

# Graph string values mapped to integer codes.
IMPORTANCE_MAP = {"low": 0, "normal": 1, "high": 2}
FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2}
# Matches one leading reply/forward prefix (RE:, FW:, AW:, ...) followed by
# colons and/or whitespace.
RE_SUBJECT = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE)

# $expand attachments without contentBytes — only the metadata we need.
ATT_EXPAND = "attachments($select=id,name,contentType,size,isInline)"

# Message properties requested from Graph ($select); default for
# iter_folder_messages().
MSG_SELECT = (
    "id,internetMessageId,subject,bodyPreview,body,"
    "importance,isRead,isDraft,hasAttachments,"
    "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
    "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
    "conversationId,conversationIndex,parentFolderId,"
    "categories,flag,inferenceClassification,internetMessageHeaders"
)

# Reduced property set; presumably used by the "sync" mode, which only
# refreshes mutable state — TODO confirm against the caller.
MSG_SELECT_SYNC = (
    "id,internetMessageId,isRead,isDraft,flag,categories,"
    "lastModifiedDateTime,parentFolderId,importance"
)
|
||||
|
||||
|
||||
# ─── Graph API helpers ────────────────────────────────────────────────────────
|
||||
|
||||
# Most recently acquired Graph access token; None until get_token() has run.
_graph_token: Optional[str] = None


def get_token() -> str:
    """Acquire an app-only Graph access token and cache it in ``_graph_token``.

    Uses the MSAL client-credentials flow with the configured tenant,
    client id and client secret.

    Returns:
        The access token string.

    Raises:
        RuntimeError: if the MSAL response contains no ``access_token``.
    """
    global _graph_token

    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    client_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    token_response = client_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )

    if "access_token" in token_response:
        _graph_token = token_response["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {token_response}")
|
||||
|
||||
|
||||
def graph_get(url: str, params: Optional[dict] = None) -> dict:
    """Perform an authenticated GET against Microsoft Graph and return the JSON body.

    Args:
        url: Absolute request URL.
        params: Optional query parameters passed to ``requests.get``.

    Returns:
        The decoded JSON response.

    Raises:
        RuntimeError: when the request is still answered with 401 after the
            token has been refreshed once.
        requests.HTTPError: for any other error status (via ``raise_for_status``),
            including throttling responses that persist after all retries.
    """
    global _graph_token
    import time  # function-scope import: the file's import block is not part of this block

    max_throttle_retries = 5

    if not _graph_token:
        get_token()

    token_refreshed = False
    throttle_retries = 0
    while True:
        r = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=30,
        )

        if r.status_code == 401:
            # An expired token is refreshed exactly once; a second 401 is fatal.
            if token_refreshed:
                raise RuntimeError(f"Graph GET failed after retry: {url}")
            get_token()
            token_refreshed = True
            continue

        if r.status_code in (429, 503, 504) and throttle_retries < max_throttle_retries:
            # Throttled or temporarily unavailable: wait as long as the server
            # asks for, otherwise back off exponentially, then try again.
            throttle_retries += 1
            try:
                delay = float(r.headers.get("Retry-After", ""))
            except ValueError:
                delay = float(2 ** throttle_retries)
            time.sleep(max(delay, 0.0))
            continue

        r.raise_for_status()
        return r.json()
|
||||
|
||||
|
||||
def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]:
    """Recursively list all mail folders of the mailbox.

    Args:
        parent_id: Folder whose child folders are listed; None lists the
            top-level folders of GRAPH_MAILBOX.
        parent_path: Path of the parent folder, used as prefix of child paths.

    Returns:
        A flat list of ``{"id": ..., "path": ...}`` dicts, each parent
        directly followed by its descendants.
    """
    root = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders"
    next_url = root if parent_id is None else f"{root}/{parent_id}/childFolders"
    query = {"$top": 100, "$select": "id,displayName,childFolderCount"}

    collected = []
    while next_url:
        page = graph_get(next_url, query)
        for entry in page.get("value", []):
            full_path = f"{parent_path}/{entry['displayName']}".lstrip("/")
            collected.append({"id": entry["id"], "path": full_path})
            if entry.get("childFolderCount", 0) > 0:
                collected += get_all_folders(entry["id"], full_path)
        next_url = page.get("@odata.nextLink")
        # The nextLink URL is requested as-is, without the initial query parameters.
        query = None
    return collected
|
||||
|
||||
|
||||
def iter_folder_messages(folder_id: str, select: str = MSG_SELECT, expand_attachments: bool = True):
    """Yield the messages of one folder, fetching them page by page.

    Args:
        folder_id: Graph id of the mail folder.
        select: Value of the ``$select`` query parameter.
        expand_attachments: When true, attachment metadata (ATT_EXPAND) is
            requested together with each message.

    Yields:
        One message dict per item of each page's ``value`` list.
    """
    query = {"$top": PAGE_SIZE, "$select": select}
    if expand_attachments:
        query["$expand"] = ATT_EXPAND

    page_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages"
    while page_url:
        page = graph_get(page_url, query)
        yield from page.get("value", [])
        page_url = page.get("@odata.nextLink")
        # Follow-up pages are requested through the nextLink URL alone.
        query = None
|
||||
|
||||
|
||||
# ─── Pomocné funkce ───────────────────────────────────────────────────────────
|
||||
|
||||
def parse_date(raw) -> Optional[datetime]:
    """Convert *raw* to a datetime without tzinfo, shifted to UTC when aware.

    Args:
        raw: None, a datetime, or any value whose ``str()`` dateutil can parse.

    Returns:
        None for None or unparseable input. Timezone-aware values are
        converted to UTC and returned naive; naive values are returned
        unchanged.
    """

    def _drop_tz(moment):
        if moment.tzinfo:
            return moment.astimezone(timezone.utc).replace(tzinfo=None)
        return moment

    if raw is None:
        return None
    if isinstance(raw, datetime):
        return _drop_tz(raw)
    try:
        return _drop_tz(dtparser.parse(str(raw)))
    except Exception:
        return None
|
||||
|
||||
|
||||
def normalize_subject(subject: str) -> str:
    """Return *subject* without its leading reply/forward prefixes.

    Prefixes matched by RE_SUBJECT are removed repeatedly, so stacked
    prefixes are stripped as well; surrounding whitespace is trimmed.
    """
    remaining = subject.strip()
    while (prefix := RE_SUBJECT.match(remaining)) is not None:
        remaining = remaining[prefix.end():].strip()
    return remaining
|
||||
|
||||
|
||||
def parse_headers(raw_headers: list) -> dict:
    """Turn a list of ``{"name", "value"}`` header dicts into one dict.

    Header names are lower-cased and ``-`` is replaced by ``_``. A header
    that occurs once maps to its value; a repeated header maps to the list
    of its values in order of appearance.
    """
    merged = {}
    for header in raw_headers:
        key = header["name"].lower().replace("-", "_")
        value = header["value"]
        if key not in merged:
            merged[key] = value
            continue
        previous = merged[key]
        if isinstance(previous, list):
            merged[key] = previous + [value]
        else:
            merged[key] = [previous, value]
    return merged
|
||||
|
||||
|
||||
def format_recipients(lst: list) -> str:
    """Render a Graph recipient list as ``Name <address>; Name <address>``.

    Args:
        lst: List of recipient dicts with an ``emailAddress`` sub-dict holding
            ``name`` and ``address``. None is treated as an empty list.

    Returns:
        The entries joined by ``"; "``. A missing or null name or address is
        rendered as an empty string, so an entry without a name becomes
        ``<address>``.
    """
    rendered = []
    for r in lst or []:
        # A key can be present with a null value, so .get(key, default) is not enough.
        ea = (r or {}).get("emailAddress") or {}
        name = ea.get("name") or ""
        address = ea.get("address") or ""
        rendered.append(f"{name} <{address}>".strip())
    return "; ".join(rendered)
|
||||
|
||||
|
||||
# ─── Extrakce zprávy ─────────────────────────────────────────────────────────
|
||||
|
||||
def extract_message(msg: dict, folder_path: str) -> Optional[dict]:
    """Build the full MongoDB document for one Graph message.

    Used for mode ``full`` and for new messages in ``sync`` / ``new-only``.

    Args:
        msg: Message resource as returned by Graph (dict parsed from JSON).
        folder_path: Path of the mail folder the message was listed from.

    Returns:
        The document to upsert. Its ``_id`` is the internetMessageId, or
        ``graphid:<id>`` when that is empty. None is returned when extraction
        fails; the failure is written to the error log.
    """
    max_html_chars = 2 * 1024 * 1024

    def _first(value):
        # Repeated headers come back from parse_headers() as a list; keep the first.
        return value[0] if isinstance(value, list) else value

    def _address_of(entry):
        # A key can be present with a null value, so .get(key, {}) is not enough.
        return (entry or {}).get("emailAddress") or {}

    try:
        mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
        subject = msg.get("subject") or ""

        # Body: HTML is stored (truncated to 2 MB); a plain-text body replaces
        # the preview with its first 2000 characters.
        body_html = None
        body_preview = msg.get("bodyPreview") or ""
        body = msg.get("body") or {}
        content_type = body.get("contentType")
        if content_type == "html":
            body_html = (body.get("content") or "")[:max_html_chars]
        elif content_type == "text":
            body_preview = (body.get("content") or "")[:2000]

        sender_ea = _address_of(msg.get("from") or msg.get("sender"))
        to_list = msg.get("toRecipients") or []
        cc_list = msg.get("ccRecipients") or []
        bcc_list = msg.get("bccRecipients") or []

        recipients = []
        for kind, group in (("to", to_list), ("cc", cc_list), ("bcc", bcc_list)):
            for entry in group:
                ea = _address_of(entry)
                recipients.append({
                    "type": kind,
                    "email": ea.get("address") or "",
                    "name": ea.get("name") or "",
                })

        importance = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1)
        flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0)

        headers = parse_headers(msg.get("internetMessageHeaders") or [])

        in_reply_to = _first(headers.get("in_reply_to", ""))

        refs_raw = headers.get("references", "")
        if isinstance(refs_raw, list):
            refs_raw = " ".join(refs_raw)
        internet_refs = refs_raw.split() if refs_raw else []

        conv_topic = _first(headers.get("thread_topic", ""))

        # Re-encode the conversation index to canonical base64; keep the raw
        # value when it cannot be decoded.
        conv_index = ""
        ci_raw = msg.get("conversationIndex")
        if ci_raw:
            try:
                conv_index = base64.b64encode(base64.b64decode(ci_raw)).decode()
            except Exception:
                conv_index = ci_raw

        # Attachment metadata only; entries without a name are skipped.
        attachments = []
        for att in msg.get("attachments") or []:
            fname = att.get("name") or ""
            if not fname:
                continue
            attachments.append({
                "filename": fname,
                "size_bytes": att.get("size", 0),
                "mime_type": att.get("contentType", "application/octet-stream"),
                "is_inline": att.get("isInline", False),
                "graph_att_id": att.get("id"),
            })

        return {
            "_id": mid,
            "graph_id": msg["id"],

            "subject": subject,
            "normalized_subject": normalize_subject(subject),
            "importance": importance,
            "flag_status": flag_status,
            "is_read": msg.get("isRead", False),
            "is_draft": msg.get("isDraft", False),
            "has_attachments": msg.get("hasAttachments", False),
            "attachment_count": len(attachments),
            "inference_classification": msg.get("inferenceClassification", ""),
            "categories": msg.get("categories") or [],

            "conversation_id": msg.get("conversationId", ""),
            "conversation_index": conv_index,
            "conversation_topic": conv_topic,
            "in_reply_to": in_reply_to,
            "internet_references": internet_refs,

            "received_at": parse_date(msg.get("receivedDateTime")),
            "sent_at": parse_date(msg.get("sentDateTime")),
            "created_at": parse_date(msg.get("createdDateTime")),
            "modified_at": parse_date(msg.get("lastModifiedDateTime")),

            "folder_id": msg.get("parentFolderId", ""),
            "folder_path": folder_path,

            "sender": {
                "email": sender_ea.get("address") or "",
                "name": sender_ea.get("name") or "",
            },
            "to": format_recipients(to_list),
            "cc": format_recipients(cc_list),
            "bcc": format_recipients(bcc_list),
            "recipients": recipients,

            "body_html": body_html,
            "body_preview": body_preview,

            "attachments": attachments,
            "headers": headers,

            "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
        }

    except Exception as e:
        # Best effort: one broken message must not stop the import.
        logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e)
        return None
|
||||
|
||||
|
||||
def extract_sync_fields(msg: dict, folder_path: str) -> dict:
    """Return only the mutable fields of a message.

    Used in sync mode for messages that already exist in the collection.

    Args:
        msg: Message dict from Graph (the reduced sync field set suffices).
        folder_path: Path of the folder the message was listed from.

    Returns:
        Dict of field updates suitable for a ``$set``.
    """
    flag_name = (msg.get("flag") or {}).get("flagStatus", "notFlagged")
    importance_name = msg.get("importance", "normal")
    return {
        "is_read": msg.get("isRead", False),
        "is_draft": msg.get("isDraft", False),
        "flag_status": FLAG_STATUS_MAP.get(flag_name, 0),
        "importance": IMPORTANCE_MAP.get(importance_name, 1),
        "categories": msg.get("categories") or [],
        "modified_at": parse_date(msg.get("lastModifiedDateTime")),
        "folder_id": msg.get("parentFolderId", ""),
        "folder_path": folder_path,
        "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
    }
|
||||
|
||||
|
||||
# ─── MongoDB indexy ───────────────────────────────────────────────────────────
|
||||
|
||||
def create_indexes(col):
    """Create the query indexes on collection *col*.

    Creates one ascending single-field index per listed field (``graph_id``
    is unique and sparse) and one text index named ``text_search`` over
    subject, body_preview, to and cc with ``default_language="none"``.
    """
    print(" Vytvarim indexy...")

    # (field, extra create_index options) in creation order
    single_field_indexes = (
        ("received_at", {}),
        ("sent_at", {}),
        ("sender.email", {}),
        ("graph_id", {"unique": True, "sparse": True}),
        ("conversation_id", {}),
        ("folder_path", {}),
        ("has_attachments", {}),
        ("categories", {}),
        ("importance", {}),
        ("flag_status", {}),
        ("is_read", {}),
    )
    for field, options in single_field_indexes:
        col.create_index([(field, ASCENDING)], **options)

    text_fields = ("subject", "body_preview", "to", "cc")
    col.create_index(
        [(field, TEXT) for field in text_fields],
        name="text_search",
        default_language="none",
    )

    print(" Indexy hotovy.")
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Command-line entry point: import one mailbox from Graph into MongoDB.

    Parses the arguments, authenticates against Graph, connects to MongoDB,
    walks the mail folders and queues one update per message:

    - ``full``: every message is fully extracted and upserted.
    - ``new-only``: messages whose ``_id`` already exists are skipped.
    - ``sync``: existing messages get only their mutable fields updated;
      new ones are fetched in full and upserted.

    Queued operations are written with ``bulk_write`` every BATCH_SIZE
    operations and after each folder. Whatever is still queued is also
    written when the run is aborted by an exception or Ctrl+C, and the
    MongoDB client is always closed.

    Exits with status 1 when Graph authentication or the MongoDB ping fails.
    """
    global GRAPH_MAILBOX

    ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}")
    ap.add_argument("--mailbox", required=True,
                    help="Emailova schranka (napr. ordinace@buzalkova.cz)")
    ap.add_argument("--mode", default="full", choices=["full", "new-only", "sync"],
                    help="full=plny upsert (vychozi) | new-only=jen nove zpravy | "
                         "sync=existujici aktualizuje jen menitelna pole, nove importuje cely")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N zprav (0 = vse)")
    ap.add_argument("--folder", default="",
                    help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na konci")
    args = ap.parse_args()

    GRAPH_MAILBOX = args.mailbox
    mongo_col = args.mailbox  # the collection is named after the mailbox

    start = datetime.now()
    print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Schránka: {GRAPH_MAILBOX}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}")
    print(f"Režim: {args.mode}")

    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        print(" MongoDB OK")
    except Exception as e:
        print(f" CHYBA: MongoDB neni dostupna -- {e}")
        client.close()
        sys.exit(1)
    col = client[MONGO_DB][mongo_col]

    batch = []
    ok_count = 0
    sync_count = 0
    err_count = 0
    skip_count = 0
    total_i = 0

    def flush():
        """Write the queued operations; a failure is logged and printed, not raised."""
        if not batch:
            return
        try:
            col.bulk_write(batch, ordered=False)
        except Exception as e:
            logging.error("bulk_write: %s", e)
            print(f" CHYBA bulk_write: {e}")
        batch.clear()

    try:
        existing: set = set()
        if args.mode in ("new-only", "sync"):
            print(" Nacitam existujici zaznamy z MongoDB...")
            existing = set(col.distinct("_id"))
            print(f" {len(existing)} jiz importovano")

        print("\nNacitam seznam slozek...")
        all_folders = get_all_folders()
        if args.folder:
            # Case-insensitive substring match against the full folder path
            all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()]
        print(f" Slozek ke zpracovani: {len(all_folders)}")
        for f in all_folders:
            print(f" {f['path']}")

        is_sync = args.mode == "sync"
        # Sync mode lists messages with the reduced field set and no attachments;
        # messages that turn out to be new are re-fetched in full below.
        msg_select = MSG_SELECT_SYNC if is_sync else MSG_SELECT
        expand_att = not is_sync

        print()
        for folder in all_folders:
            print(f"--- Složka: {folder['path']} ---")
            folder_count = 0

            for msg in iter_folder_messages(folder["id"], select=msg_select, expand_attachments=expand_att):
                if args.limit and total_i >= args.limit:
                    break

                mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
                total_i += 1
                folder_count += 1

                if args.mode == "new-only" and mid in existing:
                    skip_count += 1
                    continue

                if is_sync and mid in existing:
                    fields = extract_sync_fields(msg, folder["path"])
                    batch.append(UpdateOne({"_id": mid}, {"$set": fields}))
                    sync_count += 1
                    print(f" {total_i:>6} SYN {mid[:80]}")
                else:
                    if is_sync:
                        full_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{msg['id']}"
                        full_params = {"$select": MSG_SELECT, "$expand": ATT_EXPAND}
                        try:
                            msg = graph_get(full_url, full_params)
                        except Exception as e:
                            logging.error("full fetch failed [%s]: %s", msg.get("id","?"), e)
                            err_count += 1
                            continue

                    doc = extract_message(msg, folder["path"])
                    if doc is None:
                        err_count += 1
                        print(f" {total_i:>6} ERR {mid[:80]}")
                    else:
                        batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
                        ok_count += 1
                        subject_str = (doc.get("subject") or "")[:60]
                        sender_str = (doc.get("sender", {}).get("email") or "")[:40]
                        print(f" {total_i:>6} OK {subject_str:<60} {sender_str}")

                if len(batch) >= BATCH_SIZE:
                    flush()

                if total_i % 500 == 0:
                    elapsed = (datetime.now() - start).total_seconds()
                    rate = total_i / elapsed if elapsed > 0 else 0
                    print(f" {'─'*80}")
                    print(f" Průběh: ok={ok_count} sync={sync_count} skip={skip_count} err={err_count} {rate:.1f} msg/s")
                    print(f" {'─'*80}")

            flush()
            print(f" → {folder_count} zprav ze slozky {folder['path']}")

            if args.limit and total_i >= args.limit:
                break

        elapsed_total = (datetime.now() - start).total_seconds()
        total_docs = col.count_documents({})
        print(f"\n{'='*52}")
        print(f"Vysledek: ok={ok_count} | sync={sync_count} | skip={skip_count} | err={err_count}")
        print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
        print(f"Dokumentu v kolekci: {total_docs}")

        if not args.no_indexes:
            print()
            create_indexes(col)

        print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        if err_count:
            print(f"Chyby logovany do: {LOG_FILE}")
    finally:
        # Also reached on an exception or Ctrl+C: do not lose the queued
        # operations and do not leave the connection open.
        flush()
        client.close()


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,248 @@
|
||||
# parse_emails_tower_v1.1
|
||||
|
||||
## Spuštění
|
||||
|
||||
**První spuštění:**
|
||||
```bash
|
||||
docker exec -d python-runner bash -c \
|
||||
"python /scripts/parse_emails_tower_v1.1.py > /scripts/parse_emails.log 2>&1"
|
||||
```
|
||||
|
||||
**Pokračování po přerušení (přeskočí už importované):**
|
||||
```bash
|
||||
docker exec -d python-runner bash -c \
|
||||
"python /scripts/parse_emails_tower_v1.1.py --skip-existing > /scripts/parse_emails.log 2>&1"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Stav importu
|
||||
|
||||
**Sledování průběhu (live log):**
|
||||
```bash
|
||||
docker exec -it python-runner tail -f /scripts/parse_emails.log
|
||||
```
|
||||
|
||||
**Počet emailů v MongoDB:**
|
||||
```bash
|
||||
docker exec -it python-runner python -c \
|
||||
"from pymongo import MongoClient; c=MongoClient('mongodb://192.168.1.76:27017'); print(c['emaily']['vbuzalka@its.jnj.com'].count_documents({}))"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
**Název:** parse_emails_tower_v1.1.py
|
||||
**Verze:** 1.1
|
||||
**Datum:** 2026-06-02
|
||||
**Autor:** vladimir.buzalka
|
||||
|
||||
---
|
||||
|
||||
## Účel
|
||||
|
||||
Import všech `.msg` souborů do MongoDB. Z každého souboru extrahuje **všechny dostupné vlastnosti** — podobně jako EXIF u fotek.
|
||||
|
||||
- **DB:** `emaily`
|
||||
- **Kolekce:** `vbuzalka@its.jnj.com`
|
||||
- `_id` = Internet Message-ID (nebo `filename:<stem>` jako fallback)
|
||||
- Bezpečné přerušit a opakovat — upsert podle `_id`
|
||||
|
||||
---
|
||||
|
||||
## Prostředí
|
||||
|
||||
Běží v Docker containeru **python-runner** na **Unraid Tower**.
|
||||
|
||||
| Komponenta | Umístění |
|
||||
|---|---|
|
||||
| Container | `python-runner` (Docker na Unraid Tower) |
|
||||
| .msg soubory | `/mnt/user/JNJEMAILS` → `/mnt/JNJEMAILS` uvnitř containeru |
|
||||
| Skripty | `/mnt/user/Scripts` → `/scripts` uvnitř containeru |
|
||||
| MongoDB | `192.168.1.76:27017` (externí, mimo container) |
|
||||
|
||||
---
|
||||
|
||||
## Spouštění (z Unraid terminálu)
|
||||
|
||||
**Test na 50 emailech:**
|
||||
```bash
|
||||
docker exec -it python-runner python /scripts/parse_emails_tower_v1.1.py --limit 50 --no-indexes
|
||||
```
|
||||
|
||||
**Kompletní import na pozadí (log do souboru):**
|
||||
```bash
|
||||
docker exec -d python-runner bash -c \
|
||||
"python /scripts/parse_emails_tower_v1.1.py > /scripts/parse_emails.log 2>&1"
|
||||
```
|
||||
|
||||
**Pokračování po přerušení:**
|
||||
```bash
|
||||
docker exec -d python-runner bash -c \
|
||||
"python /scripts/parse_emails_tower_v1.1.py --skip-existing > /scripts/parse_emails.log 2>&1"
|
||||
```
|
||||
|
||||
**Sledování průběhu (Ctrl+C ukončí sledování, import běží dál):**
|
||||
```bash
|
||||
docker exec -it python-runner tail -f /scripts/parse_emails.log
|
||||
```
|
||||
|
||||
### Všechny parametry
|
||||
|
||||
| Parametr | Popis |
|
||||
|---|---|
|
||||
| `--skip-existing` | Načte seznam hotových souborů z MongoDB a přeskočí je. Použij pro pokračování po přerušení. |
|
||||
| `--limit N` | Zpracuje jen prvních N souborů. Vhodné pro test. |
|
||||
| `--no-indexes` | Nevytváří indexy na konci. Použij pokud přerušíš uprostřed — indexy vytvoř ručně až je vše hotové. |
|
||||
| `--msgs-dir PATH` | Přepíše výchozí cestu k .msg souborům (výchozí: `/mnt/JNJEMAILS`). |
|
||||
|
||||
---
|
||||
|
||||
## Průběh na konzoli
|
||||
|
||||
Každý email na jednom řádku:
|
||||
```
|
||||
1/69371 OK RE: Protocol deviation CZ10022 jan.novak@its.jnj.com
|
||||
2/69371 OK UCO3001: Draft FUL pro DD5-CZ10022 monitor@4gclinical.com
|
||||
3/69371 ERR ? ?
|
||||
```
|
||||
|
||||
Každých 500 emailů oddělovač s průběhem:
|
||||
```
|
||||
────────────────────────────────────────────────────────────────────────────────
|
||||
Průběh: ok=498 err=2 0.4 msg/s ETA 47h12m
|
||||
────────────────────────────────────────────────────────────────────────────────
|
||||
```
|
||||
|
||||
Na konci souhrn:
|
||||
```
|
||||
====================================================
|
||||
Vysledek: ok=69300 | skip=0 | err=71
|
||||
Celkovy cas: 47h 23m 10s
|
||||
Dokumentu v kolekci: 69300
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Zdroje dat z každého .msg
|
||||
|
||||
| Pole | Popis |
|
||||
|---|---|
|
||||
| Předmět, normalized subject | |
|
||||
| Odesílatel | email, jméno, SMTP adresa |
|
||||
| Příjemci To/CC/BCC | strukturovaně `[{type, email, name}]` |
|
||||
| Čas doručení a odeslání | UTC |
|
||||
| Tělo | plaintext + HTML (max 2 MB) |
|
||||
| Přílohy | metadata: jméno, velikost, MIME typ, inline flag |
|
||||
| Internet headers | X-Originating-IP, Received, DKIM, X-Mailer, ... |
|
||||
| MAPI | důležitost, citlivost, příznak, konverzační vlákno, kategorie |
|
||||
| In-Reply-To, References | pro rekonstrukci vlákna |
|
||||
| Raw MAPI properties | `{0xXXXX: value}` |
|
||||
|
||||
---
|
||||
|
||||
## Hodnotové kódy
|
||||
|
||||
| Pole | Hodnota | Význam |
|
||||
|---|---|---|
|
||||
| `importance` | 0 | Nízká |
|
||||
| | 1 | Normální |
|
||||
| | 2 | Vysoká |
|
||||
| `sensitivity` | 0 | Normální |
|
||||
| | 1 | Osobní |
|
||||
| | 2 | Soukromé |
|
||||
| | 3 | Důvěrné |
|
||||
| `flag_status` | 0 | Bez příznaku |
|
||||
| | 1 | Označeno (follow up) |
|
||||
| | 2 | Dokončeno |
|
||||
|
||||
---
|
||||
|
||||
## MongoDB indexy
|
||||
|
||||
Automaticky vytvořeny na konci importu (`--no-indexes` přeskočí):
|
||||
|
||||
| Index | Pole |
|
||||
|---|---|
|
||||
| Chronologický | `received_at`, `sent_at` |
|
||||
| Odesílatel | `sender.email` |
|
||||
| Soubor | `filename` (unique) |
|
||||
| Konverzace | `conversation_topic` |
|
||||
| Filtry | `has_attachments`, `categories`, `importance`, `flag_status` |
|
||||
| Full-text | `subject` + `body_text` + `to` + `cc` (text index `text_search`) |
|
||||
|
||||
---
|
||||
|
||||
## Ukázkové dotazy (MongoDB shell / MCP)
|
||||
|
||||
**Emaily o UCO3001 s přílohou:**
|
||||
```javascript
|
||||
db["vbuzalka@its.jnj.com"].find({
|
||||
$text: { $search: "UCO3001" },
|
||||
has_attachments: true
|
||||
}).sort({ received_at: -1 })
|
||||
```
|
||||
|
||||
**Emaily od konkrétního odesílatele:**
|
||||
```javascript
|
||||
db["vbuzalka@its.jnj.com"].find({
|
||||
"sender.email": /covance/i
|
||||
}).sort({ received_at: -1 })
|
||||
```
|
||||
|
||||
**Celé konverzační vlákno:**
|
||||
```javascript
|
||||
db["vbuzalka@its.jnj.com"].find({
|
||||
conversation_topic: "Protocol deviation CZ10022"
|
||||
}).sort({ received_at: 1 })
|
||||
```
|
||||
|
||||
**Statistiky podle odesílatele (top 20):**
|
||||
```javascript
|
||||
db["vbuzalka@its.jnj.com"].aggregate([
|
||||
{ $group: { _id: "$sender.email", count: { $sum: 1 } } },
|
||||
{ $sort: { count: -1 } },
|
||||
{ $limit: 20 }
|
||||
])
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Chybový log
|
||||
|
||||
Soubory, které selhaly, jsou zalogovány do `parse_emails_errors.log` vedle skriptu (tj. `/scripts/parse_emails_errors.log` → `\\tower\Scripts\parse_emails_errors.log`):
|
||||
```
|
||||
2026-06-02 20:14:33 | open failed [7A3F...0000.msg]: <důvod>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Výkon
|
||||
|
||||
| Parametr | Hodnota |
|
||||
|---|---|
|
||||
| Počet souborů | ~69 000 |
|
||||
| Rychlost | ~0.4 msg/s (htmlBody dekódování) |
|
||||
| Odhadovaný čas | 48 hodin |
|
||||
| Batch size | 200 dokumentů / bulk_write |
|
||||
| Odhadovaná velikost DB | 2–5 GB |
|
||||
|
||||
---
|
||||
|
||||
## Závislosti (v Docker image python-runner)
|
||||
|
||||
```
|
||||
extract-msg==0.55.0
|
||||
pymongo
|
||||
python-dateutil
|
||||
```
|
||||
|
||||
Image sestaven z `Dockerfile` v `/mnt/user/Scripts/python-runner/`.
|
||||
|
||||
---
|
||||
|
||||
## Historie verzí
|
||||
|
||||
| Verze | Datum | Změna |
|
||||
|---|---|---|
|
||||
| 1.0 | 2026-06-01 | Iniciální verze |
|
||||
| 1.1 | 2026-06-02 | Nasazení na Unraid Tower v Docker containeru python-runner; MSGS_DIR změněno z SMB share (`\\tower\JNJEMAILS`) na lokální mount (`/mnt/JNJEMAILS`); aktualizován popis spouštění pro `docker exec` |
|
||||
@@ -0,0 +1,660 @@
|
||||
"""
|
||||
parse_emails_tower_v1.1.py
|
||||
Nazev: parse_emails_tower_v1.1.py
|
||||
Verze: 1.1
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Parsuje vsechny .msg soubory z MSGS_DIR a importuje je jako dokumenty
|
||||
do MongoDB. Z kazdeho souboru extrahuje VSECHNY dostupne vlastnosti —
|
||||
podobne jako EXIF u fotek:
|
||||
|
||||
- predmet, odesilatel, prijemci (To/CC/BCC s typy)
|
||||
- cas doruceni a odeslani (UTC)
|
||||
- telo plaintext + HTML (max 2 MB)
|
||||
- prilohy (metadata: jmeno, velikost, MIME typ, inline flag)
|
||||
- internet headers (X-Originating-IP, Received, DKIM, ...)
|
||||
- MAPI vlastnosti: dulezitost, citlivost, priznak, konverzacni vlakno,
|
||||
kategorie, In-Reply-To, References, ...
|
||||
- vsechny raw MAPI properties jako {0xXXXX: value}
|
||||
|
||||
DB: emaily
|
||||
Kolekce: vbuzalka@its.jnj.com
|
||||
_id: Internet Message-ID (nebo "filename:<stem>" jako fallback)
|
||||
|
||||
Bezpecne prerusit a opakovat:
|
||||
- upsert podle _id — duplicity se automaticky prepisi
|
||||
- --skip-existing nacte seznam hotovych souboru z MongoDB a
|
||||
preskoci je => pokracovani po preruseni bez ztraty prace
|
||||
|
||||
Prostredi:
|
||||
Bezi v Docker containeru "python-runner" na Unraid Tower.
|
||||
.msg soubory jsou dostupne jako lokalni disk (volume mount):
|
||||
/mnt/user/JNJEMAILS -> /mnt/JNJEMAILS (uvnitr containeru)
|
||||
MongoDB na 192.168.1.76:27017 (externi, bezi mimo container).
|
||||
|
||||
Spousteni (z Unraid terminalu):
|
||||
# Test na 50 emailech:
|
||||
docker exec -it python-runner python /scripts/parse_emails_tower_v1.1.py --limit 50 --no-indexes
|
||||
|
||||
# Kompletni import na pozadi (log do souboru):
|
||||
docker exec -d python-runner bash -c \
|
||||
"python /scripts/parse_emails_tower_v1.1.py > /scripts/parse_emails.log 2>&1"
|
||||
|
||||
# Pokracovani po preruseni:
|
||||
docker exec -d python-runner bash -c \
|
||||
"python /scripts/parse_emails_tower_v1.1.py --skip-existing > /scripts/parse_emails.log 2>&1"
|
||||
|
||||
# Sledovani prubehu:
|
||||
docker exec -it python-runner tail -f /scripts/parse_emails.log
|
||||
|
||||
Vystup na konzoli:
|
||||
Kazdy email na jednom radku:
|
||||
<poradi>/<celkem> OK/ERR <predmet 60 znaku> <odesilatel>
|
||||
Kazdych 500 emailu: oddelovac s prubehem, rychlosti a ETA.
|
||||
Na konci: souhrn ok/skip/err, celkovy cas, pocet dokumentu v kolekci.
|
||||
|
||||
Zavislosti (nainstalovane v Docker image python-runner):
|
||||
extract-msg==0.55.0, pymongo, python-dateutil
|
||||
Python 3.12, Linux (Docker container na Unraid Tower)
|
||||
|
||||
Struktura dokumentu v MongoDB:
|
||||
_id Internet Message-ID (nebo filename: fallback)
|
||||
filename jmeno .msg souboru (20znakovy hex + .msg)
|
||||
subject predmet zpravy
|
||||
normalized_subject predmet bez RE:/FW: prefixu
|
||||
importance 0=nizka 1=normalni 2=vysoka
|
||||
sensitivity 0=normalni 1=osobni 2=soukrome 3=duverne
|
||||
flag_status 0=bez priznaku 1=oznaceno 2=dokonceno
|
||||
read_receipt_requested bool
|
||||
delivery_receipt_requested bool
|
||||
has_attachments bool
|
||||
attachment_count int
|
||||
message_size_bytes velikost .msg souboru na disku
|
||||
conversation_topic tema vlakna (PR_CONVERSATION_TOPIC)
|
||||
conversation_index base64 PR_CONVERSATION_INDEX
|
||||
in_reply_to Message-ID predchozi zpravy
|
||||
internet_references [Message-ID] — cela historie vlakna
|
||||
categories [str] — MAPI kategorie / stitky
|
||||
read_receipt_requested bool
|
||||
delivery_receipt_requested bool
|
||||
received_at datetime UTC — cas doruceni
|
||||
sent_at datetime UTC — cas odeslani
|
||||
sender.email emailova adresa odesilatele
|
||||
sender.name zobrazovane jmeno odesilatele
|
||||
sender.smtp SMTP adresa (pro interni EX adresy)
|
||||
to retezec To (tak jak v Outlooku)
|
||||
cc retezec CC
|
||||
bcc retezec BCC
|
||||
display_to PR_DISPLAY_TO (zkraceny seznam)
|
||||
display_cc PR_DISPLAY_CC
|
||||
recipients [{type, email, name}] — to/cc/bcc s typy
|
||||
body_text plain text telo
|
||||
body_html HTML telo (max 2 MB, None pokud neni)
|
||||
attachments [{filename, size_bytes, mime_type,
|
||||
content_id, is_inline}]
|
||||
headers dict internet headers (lowercase_s_podtrzitky)
|
||||
mapi dict vsech raw MAPI properties {0xXXXX: value}
|
||||
parsed_at datetime UTC — cas parsovani
|
||||
|
||||
Indexy (vytvoreny automaticky na konci):
|
||||
received_at, sent_at, sender.email, filename (unique),
|
||||
conversation_topic, has_attachments, categories, importance,
|
||||
flag_status, text_search (subject + body_text + to + cc)
|
||||
|
||||
Chyby:
|
||||
Soubory ktere selhaly jsou zalogovany do parse_emails_errors.log
|
||||
v adresari skriptu. Radek: timestamp | open/extract failed | duvod.
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-01 Inicialni verze
|
||||
1.1 2026-06-02 Nasazeni na Unraid Tower v Docker containeru python-runner;
|
||||
MSGS_DIR zmeneno z SMB share na lokalni mount /mnt/JNJEMAILS;
|
||||
aktualizovany popis spousteni pro docker exec
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
import logging
|
||||
import argparse
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import extract_msg
|
||||
from dateutil import parser as dtparser
|
||||
from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT
|
||||
|
||||
# Switch console output to UTF-8 when the stream supports reconfiguration;
# characters that cannot be encoded are replaced.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(errors="replace", encoding="utf-8")

# ─── CONFIGURATION ────────────────────────────────────────────────────────────
MSGS_DIR = Path("/mnt/JNJEMAILS")
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
MONGO_COL = "vbuzalka@its.jnj.com"
BATCH_SIZE = 200
LOG_FILE = Path(__file__).with_name("parse_emails_errors.log")  # next to this script
SCRIPT_VERSION = "1.1"
# ──────────────────────────────────────────────────────────────────────────────

# Only ERROR records are written, one "timestamp | message" line each, to LOG_FILE.
logging.basicConfig(
    level=logging.ERROR,
    filename=str(LOG_FILE),
    encoding="utf-8",
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s | %(message)s",
)
|
||||
|
||||
|
||||
# ─── Pomocné funkce ───────────────────────────────────────────────────────────
|
||||
|
||||
def safe(obj, *attrs, default=None):
    """Return the first usable attribute value of *obj* among *attrs*.

    A value is skipped when it is None, when it is a blank string, or when
    reading it raises. *default* is returned if no attribute qualifies.
    """
    for name in attrs:
        try:
            candidate = getattr(obj, name, None)
            unusable = candidate is None or (
                isinstance(candidate, str) and not candidate.strip()
            )
        except Exception:
            continue
        if not unusable:
            return candidate
    return default
|
||||
|
||||
|
||||
def parse_date(raw) -> Optional[datetime]:
    """Any date value -> UTC datetime without tzinfo (for MongoDB).

    None and unparseable values give None. Timezone-aware values are
    converted to UTC before the tzinfo is dropped; naive values are returned
    unchanged.
    """

    def _as_naive_utc(moment):
        if not moment.tzinfo:
            return moment
        return moment.astimezone(timezone.utc).replace(tzinfo=None)

    if raw is None:
        return None
    if isinstance(raw, datetime):
        return _as_naive_utc(raw)
    try:
        return _as_naive_utc(dtparser.parse(str(raw)))
    except Exception:
        return None
|
||||
|
||||
|
||||
def to_bson(val):
    """Convert *val* to a BSON-serializable value.

    - bytes: hex string when at most 128 bytes long, otherwise the
      placeholder ``<bytes:N>``
    - datetime: passed through ``parse_date``
    - str, int, float, bool, None: returned unchanged
    - list: converted element by element
    - anything else: ``int(val)`` when that works, otherwise ``str(val)``
    """
    if isinstance(val, bytes):
        if len(val) <= 128:
            return val.hex()
        return f"<bytes:{len(val)}>"
    if isinstance(val, datetime):
        return parse_date(val)
    if val is None or isinstance(val, (str, int, float, bool)):
        return val
    if isinstance(val, list):
        return list(map(to_bson, val))
    try:
        return int(val)
    except Exception:
        return str(val)
|
||||
|
||||
|
||||
# ─── Extrakce částí zprávy ────────────────────────────────────────────────────
|
||||
|
||||
def extract_headers(msg) -> dict:
    """Collect the internet headers of *msg* into a dict.

    Reads ``msg.header``. Keys are the header names lower-cased with ``-``
    replaced by ``_``; encoded words in the values are decoded. A header
    that occurs once maps to a string, a repeated header to the list of its
    values. Errors are logged and whatever was collected so far is returned.
    """
    collected = {}
    try:
        raw = msg.header
        if not raw:
            return {}
        from email.header import decode_header as _dh

        def _decode(v: str) -> str:
            # Fall back to the undecoded text when decoding is not possible.
            try:
                return "".join(
                    chunk.decode(charset or "utf-8", errors="replace")
                    if isinstance(chunk, bytes)
                    else chunk
                    for chunk, charset in _dh(v)
                )
            except Exception:
                return v

        for name in set(raw.keys()):
            normalized = name.lower().replace("-", "_")
            values = [_decode(v) for v in raw.get_all(name, [])]
            if len(values) > 1:
                collected[normalized] = values
            elif values:
                collected[normalized] = values[0]
            else:
                collected[normalized] = ""
    except Exception as e:
        logging.error("extract_headers: %s", e)
    return collected
|
||||
|
||||
|
||||
def extract_recipients(msg) -> list:
    """Return the recipients of *msg* as ``{"type", "email", "name"}`` dicts.

    The recipient ``type`` attribute is mapped 1 -> "to", 2 -> "cc",
    3 -> "bcc"; a missing, unconvertible or unknown type is reported as
    "to". Any error while iterating is logged and the recipients gathered
    so far are returned.
    """
    kinds = {1: "to", 2: "cc", 3: "bcc"}

    def _type_code(person) -> int:
        # The type may be a plain number or an object exposing ``.value``.
        code = getattr(person, "type", 1)
        try:
            return int(code)
        except Exception:
            try:
                return int(code.value)
            except Exception:
                return 1

    found = []
    try:
        for person in msg.recipients:
            code = _type_code(person)
            found.append({
                "type": kinds.get(code, "to"),
                "email": safe(person, "email", default=""),
                "name": safe(person, "name", default=""),
            })
    except Exception as e:
        logging.error("extract_recipients: %s", e)
    return found
|
||||
|
||||
|
||||
def extract_attachments(msg) -> list:
    """Describe the attachments of *msg* (metadata only, no content).

    Attachments without a file name are skipped. ``size_bytes`` is the
    length of ``att.data`` and falls back to 0 when the data is empty or
    cannot be read. Any error while iterating is logged and the entries
    gathered so far are returned.
    """
    described = []
    try:
        for att in msg.attachments:
            name = safe(att, "longFilename", "shortFilename", default="")
            if not name:
                continue

            try:
                payload = att.data
                length = len(payload) if payload else 0
            except Exception:
                length = 0

            entry = {
                "filename": name,
                "size_bytes": length,
                "mime_type": safe(att, "mimetype", "mimeType", default="application/octet-stream"),
                "content_id": safe(att, "cid", default=None),
                "is_inline": bool(safe(att, "isInline", default=False)),
            }
            described.append(entry)
    except Exception as e:
        logging.error("extract_attachments: %s", e)
    return described
|
||||
|
||||
|
||||
def extract_mapi_props(msg) -> dict:
    """Return all raw MAPI properties of *msg* as ``{"0xXXXX": value}``.

    The dict key is built from the first four characters of the property
    key, upper-cased; values are converted with ``to_bson``. A property
    that fails to convert is skipped. If ``msg.props`` has no ``items``
    attribute an empty dict is returned; other errors are logged.
    """
    collected = {}
    try:
        props = msg.props
        if not hasattr(props, "items"):
            return {}
        for raw_key, entry in props.items():
            try:
                converted = to_bson(entry.value)
                short_key = raw_key[:4] if len(raw_key) >= 4 else raw_key
                collected[f"0x{short_key.upper()}"] = converted
            except Exception:
                # Best effort: one unreadable property must not drop the rest.
                continue
    except Exception as e:
        logging.error("extract_mapi_props: %s", e)
    return collected
|
||||
|
||||
|
||||
# ─── Hlavní extrakce ─────────────────────────────────────────────────────────
|
||||
|
||||
def extract_message(msg_path: Path) -> Optional[dict]:
    """Parse a single .msg file into a MongoDB document.

    Args:
        msg_path: Path of the .msg file, opened with ``extract_msg.Message``.

    Returns:
        The document dict. ``_id`` is the message id, or
        ``filename:<stem>`` when the message exposes none. ``None`` is
        returned when the file cannot be opened or extraction fails; both
        cases are written to the error log.
    """
    try:
        msg = extract_msg.Message(str(msg_path))
    except Exception as e:
        logging.error("open failed [%s]: %s", msg_path.name, e)
        return None

    try:
        try:
            # ── Message-ID ────────────────────────────────────────────────
            mid = None
            for attr in ("messageId", "message_id", "internetMessageId"):
                mid = safe(msg, attr)
                if mid:
                    break
            if not mid:
                mid = f"filename:{msg_path.stem}"
            mid = str(mid).strip()

            # ── Subject ───────────────────────────────────────────────────
            try:
                subject = msg.subject or ""
            except Exception:
                subject = ""

            normalized_subject = safe(msg, "normalizedSubject", "normalized_subject", default="")

            # ── Body ──────────────────────────────────────────────────────
            try:
                body_text = msg.body or ""
            except Exception:
                body_text = ""

            body_html = None
            try:
                bh = msg.htmlBody
                if isinstance(bh, bytes):
                    bh = bh.decode("utf-8", errors="replace")
                if bh:
                    # Keep at most the first 2 * 1024 * 1024 characters.
                    body_html = bh[:2 * 1024 * 1024]
            except Exception:
                pass

            # ── Sender ────────────────────────────────────────────────────
            try:
                sender_email = msg.sender or ""
            except Exception:
                sender_email = ""

            sender_name = safe(msg, "senderName", "sender_name", default="")
            sender_smtp = safe(msg, "senderSmtpAddress", "sent_representing_smtp_address", default="")

            # ── Recipients ────────────────────────────────────────────────
            recipients = extract_recipients(msg)

            try:
                to_raw = msg.to or ""
            except Exception:
                to_raw = ""
            try:
                cc_raw = msg.cc or ""
            except Exception:
                cc_raw = ""
            try:
                bcc_raw = getattr(msg, "bcc", None) or ""
            except Exception:
                bcc_raw = ""

            display_to = safe(msg, "displayTo", "display_to", default="")
            display_cc = safe(msg, "displayCc", "display_cc", default="")

            # ── Timestamps ────────────────────────────────────────────────
            try:
                received_at = parse_date(msg.date)
            except Exception:
                received_at = None

            sent_at = None
            for attr in ("clientSubmitTime", "client_submit_time", "sentOn"):
                v = safe(msg, attr)
                if v:
                    sent_at = parse_date(v)
                    break

            # ── MAPI properties ───────────────────────────────────────────
            importance = 1
            try:
                v = msg.importance
                if v is not None:
                    importance = int(v)
            except Exception:
                pass

            sensitivity = 0
            try:
                v = getattr(msg, "sensitivity", None)
                if v is not None:
                    sensitivity = int(v)
            except Exception:
                pass

            flag_status = 0
            try:
                v = safe(msg, "flagStatus", "flag_status")
                if v is not None:
                    flag_status = int(v)
            except Exception:
                pass

            conversation_topic = safe(msg, "conversationTopic", "conversation_topic", default="")

            conversation_index = ""
            try:
                ci = safe(msg, "conversationIndex", "conversation_index")
                if isinstance(ci, bytes):
                    # Binary index is stored as base64 text.
                    conversation_index = base64.b64encode(ci).decode()
                elif ci:
                    conversation_index = str(ci)
            except Exception:
                pass

            in_reply_to = safe(msg, "inReplyTo", "in_reply_to", default="")

            internet_refs = []
            try:
                refs = safe(msg, "internetReferences", "internet_references")
                if isinstance(refs, list):
                    internet_refs = refs
                elif isinstance(refs, str) and refs:
                    internet_refs = [r.strip() for r in refs.split() if r.strip()]
            except Exception:
                pass

            categories = []
            try:
                cats = safe(msg, "categories")
                if isinstance(cats, list):
                    categories = [str(c) for c in cats if c]
                elif isinstance(cats, str) and cats:
                    categories = [c.strip() for c in re.split(r"[;,]", cats) if c.strip()]
            except Exception:
                pass

            read_receipt = bool(safe(msg, "readReceiptRequested", "read_receipt_requested", default=False))
            delivery_receipt = bool(safe(msg, "deliveryReceiptRequested", "delivery_receipt_requested", default=False))

            # ── Internet headers ──────────────────────────────────────────
            headers = extract_headers(msg)

            # Fall back to the raw headers when the message object did not
            # provide threading information.
            if not in_reply_to:
                in_reply_to = headers.get("in_reply_to", "")
            if not internet_refs:
                refs_str = headers.get("references", "")
                if isinstance(refs_str, str) and refs_str:
                    internet_refs = [r.strip() for r in refs_str.split() if r.strip()]

            # ── Attachments ───────────────────────────────────────────────
            attachments = extract_attachments(msg)

            # ── Raw MAPI ──────────────────────────────────────────────────
            mapi_raw = extract_mapi_props(msg)
        finally:
            # Always release the message, including when extraction fails
            # part-way; otherwise the open .msg handle would leak.
            msg.close()

        # ── Document ──────────────────────────────────────────────────────
        return {
            "_id": mid,
            "filename": msg_path.name,

            "subject": subject,
            "normalized_subject": normalized_subject,
            "importance": importance,
            "sensitivity": sensitivity,
            "flag_status": flag_status,
            "read_receipt_requested": read_receipt,
            "delivery_receipt_requested": delivery_receipt,
            "has_attachments": len(attachments) > 0,
            "attachment_count": len(attachments),
            "message_size_bytes": msg_path.stat().st_size,

            "conversation_topic": conversation_topic,
            "conversation_index": conversation_index,
            "in_reply_to": in_reply_to,
            "internet_references": internet_refs,
            "categories": categories,

            "received_at": received_at,
            "sent_at": sent_at,

            "sender": {
                "email": sender_email,
                "name": sender_name,
                "smtp": sender_smtp,
            },
            "to": to_raw,
            "cc": cc_raw,
            "bcc": bcc_raw,
            "display_to": display_to,
            "display_cc": display_cc,
            "recipients": recipients,

            "body_text": body_text,
            "body_html": body_html,

            "attachments": attachments,
            "headers": headers,
            "mapi": mapi_raw,

            # Naive UTC timestamp of this parse run.
            "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
        }

    except Exception as e:
        logging.error("extract_message failed [%s]: %s", msg_path.name, e)
        return None
|
||||
|
||||
|
||||
# ─── MongoDB indexy ───────────────────────────────────────────────────────────
|
||||
|
||||
def create_indexes(col):
    """Create the single-field indexes and the text index on *col*.

    ``filename`` gets a unique, sparse index; every other single-field
    index is a plain ascending one. The text index ``text_search`` covers
    subject, body_text, to and cc with ``default_language="none"``.
    """
    print(" Vytvarim indexy...")

    # (field, extra create_index options), created in this order.
    ascending_specs = (
        ("received_at", {}),
        ("sent_at", {}),
        ("sender.email", {}),
        ("filename", {"unique": True, "sparse": True}),
        ("conversation_topic", {}),
        ("has_attachments", {}),
        ("categories", {}),
        ("importance", {}),
        ("flag_status", {}),
    )
    for field, options in ascending_specs:
        col.create_index([(field, ASCENDING)], **options)

    text_fields = ("subject", "body_text", "to", "cc")
    col.create_index(
        [(field, TEXT) for field in text_fields],
        name="text_search",
        default_language="none",
    )

    print(" Indexy hotovy.")
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Import .msg files from a directory into the MongoDB collection.

    Parses the command line, verifies that MongoDB answers a ping (exits
    with status 1 otherwise), optionally skips files whose names are
    already stored, upserts the parsed documents in batches of
    ``BATCH_SIZE`` and, unless ``--no-indexes`` is given, creates the
    collection indexes at the end. The MongoDB client is closed on every
    exit path.
    """
    ap = argparse.ArgumentParser(description=f"parse_emails v{SCRIPT_VERSION}")
    ap.add_argument("--msgs-dir", default=str(MSGS_DIR),
                    help="Cesta k .msg souborum")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N souboru (0 = vse)")
    ap.add_argument("--skip-existing", action="store_true",
                    help="Preskocit soubory ktere jiz jsou v MongoDB (pokracovani)")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na konci")
    args = ap.parse_args()

    msgs_dir = Path(args.msgs_dir)
    start = datetime.now()

    print(f"=== parse_emails v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Zdroj: {msgs_dir}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{MONGO_COL}")

    # MongoDB
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        try:
            client.admin.command("ping")
            print(" MongoDB OK")
        except Exception as e:
            print(f" CHYBA: MongoDB neni dostupna -- {e}")
            sys.exit(1)

        col = client[MONGO_DB][MONGO_COL]

        # Skip existing: load the names of the files that were already imported.
        existing: set = set()
        if args.skip_existing:
            print(" Nacitam existujici zaznamy z MongoDB...")
            # Stream the names through a cursor. distinct() returns a single
            # result document, which is limited by the maximum BSON size and
            # therefore fails on large collections.
            cursor = col.find({"filename": {"$exists": True}},
                              {"_id": 0, "filename": 1})
            existing = {rec["filename"] for rec in cursor if "filename" in rec}
            print(f" {len(existing)} jiz importovano")

        # Scan
        print(f"\nSkenuji {msgs_dir} ...")
        all_files = sorted(msgs_dir.glob("*.msg"))
        if args.limit:
            # The limit applies to the sorted file list, before skipping.
            all_files = all_files[:args.limit]

        to_process = [f for f in all_files if f.name not in existing]
        skipped = len(all_files) - len(to_process)
        total = len(to_process)

        print(f" Celkem .msg: {len(all_files)}")
        print(f" Preskoceno: {skipped}")
        print(f" Ke zpracovani: {total}\n")

        if total == 0:
            print("Neni co importovat.")
            return

        batch = []
        # NOTE: ok_count counts successfully parsed files; a failed
        # bulk_write is reported separately and does not decrease it.
        ok_count = 0
        err_count = 0

        def flush():
            """Write the pending upserts and empty the batch."""
            if not batch:
                return
            try:
                col.bulk_write(batch, ordered=False)
            except Exception as e:
                logging.error("bulk_write: %s", e)
                print(f" CHYBA bulk_write: {e}")
            batch.clear()

        for i, msg_path in enumerate(to_process, 1):
            doc = extract_message(msg_path)

            if doc is None:
                err_count += 1
            else:
                batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
                ok_count += 1

            if len(batch) >= BATCH_SIZE:
                flush()

            # One status line per e-mail.
            status = "ERR " if doc is None else "OK "
            subject_str = (doc.get("subject") or "")[:60] if doc else "?"
            sender_str = (doc.get("sender", {}).get("email") or "")[:40] if doc else "?"
            print(f" {i:>6}/{total} {status} {subject_str:<60} {sender_str}")

            if i % 500 == 0:
                # Periodic summary with throughput and estimated time left.
                elapsed = (datetime.now() - start).total_seconds()
                rate = i / elapsed if elapsed > 0 else 0
                eta_s = int((total - i) / rate) if rate > 0 else 0
                print(f" {'─'*80}")
                print(f" Průběh: ok={ok_count} err={err_count} "
                      f"{rate:.1f} msg/s ETA {eta_s//3600}h{(eta_s%3600)//60}m")
                print(f" {'─'*80}")

        flush()

        elapsed_total = (datetime.now() - start).total_seconds()
        print(f"\n{'='*52}")
        print(f"Vysledek: ok={ok_count} | skip={skipped} | err={err_count}")
        print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
        print(f"Dokumentu v kolekci: {col.count_documents({})}")

        if not args.no_indexes:
            print()
            create_indexes(col)

        print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        if err_count:
            print(f"Chyby logovany do: {LOG_FILE}")
    finally:
        # Runs on normal return, on sys.exit() and on unexpected errors.
        client.close()
|
||||
|
||||
|
||||
# Run the import only when the file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,122 @@
|
||||
# python-runner — Docker kontejner na Tower
|
||||
|
||||
## Základní info
|
||||
|
||||
| Parametr | Hodnota |
|
||||
|----------------|----------------------------------------------|
|
||||
| Název | python-runner |
|
||||
| Image | python-runner (vlastní) |
|
||||
| Status | running (unless-stopped) |
|
||||
| Python | 3.12.13 |
|
||||
| Spouštěcí cmd | `tail -f /dev/null` — container jen běží, skripty se spouštějí ručně |
|
||||
| Working dir | `/scripts` |
|
||||
| Vytvořen | 2026-06-02 |
|
||||
|
||||
---
|
||||
|
||||
## Tower — SSH přístup
|
||||
|
||||
| Parametr | Hodnota |
|
||||
|----------|------------------|
|
||||
| Host | tower / 192.168.1.76 |
|
||||
| Port | 22 |
|
||||
| User | root |
|
||||
| Heslo | 7309208104 |
|
||||
|
||||
**Připojení přes Python (paramiko)** — Docker CLI není lokálně dostupný:
|
||||
|
||||
```python
|
||||
import paramiko
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect('192.168.1.76', username='root', password='7309208104')
|
||||
_, out, _ = c.exec_command('...')
|
||||
print(out.read().decode())
|
||||
c.close()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Volume mounty
|
||||
|
||||
| Host (Unraid) | Kontejner | Popis |
|
||||
|-----------------------|-------------------|------------------------------|
|
||||
| `/mnt/user/Scripts` | `/scripts` | Skripty, logy — working dir |
|
||||
| `/mnt/user/JNJEMAILS` | `/mnt/JNJEMAILS` | .msg soubory emailů (JNJ) |
|
||||
|
||||
---
|
||||
|
||||
## Spouštění skriptů
|
||||
|
||||
```bash
|
||||
# Interaktivně (vidíš výstup):
|
||||
docker exec -it python-runner python /scripts/parse_emails_tower_v1.1.py --limit 50 --no-indexes
|
||||
|
||||
# Na pozadí (log do souboru):
|
||||
docker exec -d python-runner bash -c \
|
||||
"python /scripts/parse_emails_tower_v1.1.py > /scripts/parse_emails.log 2>&1"
|
||||
|
||||
# Pokračování po přerušení (skip hotových):
|
||||
docker exec -d python-runner bash -c \
|
||||
"python /scripts/parse_emails_tower_v1.1.py --skip-existing > /scripts/parse_emails.log 2>&1"
|
||||
|
||||
# Sledování průběhu:
|
||||
docker exec -it python-runner tail -f /scripts/parse_emails.log
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Aktuální skripty v /scripts
|
||||
|
||||
| Soubor | Popis |
|
||||
|-------------------------------|------------------------------------------------|
|
||||
| `parse_emails_tower_v1.1.py` | Import .msg → MongoDB (db: emaily, kolekce: vbuzalka@its.jnj.com) |
|
||||
| `parse_emails_tower_v1.1.md` | Dokumentace ke skriptu |
|
||||
| `parse_emails.log` | Log průběhu importu |
|
||||
| `parse_emails_errors.log` | Log chyb (soubory které selhaly) |
|
||||
|
||||
Lokální protějšek: `EmailsImport/parse_emails_v1.0.py` — identický kód, liší se jen cestou
|
||||
(`\\tower\JNJEMAILS` SMB vs. `/mnt/JNJEMAILS` lokální mount) a verzí hlavičky.
|
||||
|
||||
---
|
||||
|
||||
## Nainstalované Python balíčky
|
||||
|
||||
```
|
||||
extract-msg 0.55.0
|
||||
pymongo 4.17.0
|
||||
python-dateutil 2.9.0.post0
|
||||
cryptography 48.0.0
|
||||
beautifulsoup4 4.13.5
|
||||
oletools 0.60.2
|
||||
msoffcrypto-tool 6.0.0
|
||||
olefile 0.47
|
||||
RTFDE 0.1.2.2
|
||||
compressed-rtf 1.0.7
|
||||
lark 1.3.1
|
||||
pcodedmp 1.2.6
|
||||
tzlocal 5.3.1
|
||||
six 1.17.0
|
||||
pip 25.0.1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Přidání nového balíčku
|
||||
|
||||
```bash
|
||||
docker exec python-runner pip install <balicek>
|
||||
```
|
||||
|
||||
> Pozor: instalace se ztratí při recreate kontejneru — je třeba přidat do Dockerfile nebo do setup skriptu.
|
||||
|
||||
---
|
||||
|
||||
## Logika parse_emails (oba skripty)
|
||||
|
||||
- Čte všechny `.msg` soubory z MSGS_DIR
|
||||
- Extrahuje: předmět, odesílatel, příjemci (To/CC/BCC), tělo (text+HTML), přílohy, internet headers, všechny raw MAPI properties
|
||||
- Ukládá do MongoDB: `emaily` → `vbuzalka@its.jnj.com`
|
||||
- `_id` = Internet Message-ID (nebo `filename:<stem>` jako fallback)
|
||||
- Upsert → bezpečné opakování, `--skip-existing` pro pokračování
|
||||
- Indexy: received_at, sent_at, sender.email, filename (unique), full-text (subject+body+to+cc)
|
||||
Reference in New Issue
Block a user