Files
janssen/Python-runner/download_attachments_v1.1.py
T
2026-06-02 17:20:20 +02:00

429 lines
15 KiB
Python

"""
download_attachments_v1.1.py
Nazev: download_attachments_v1.1.py
Verze: 1.1
Datum: 2026-06-02
Autor: vladimir.buzalka
Popis:
Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
pres Microsoft Graph API a uklada je do adresare
/mnt/Emails/<schránka>/Attachments/.
Schránka se predava jako povinny parametr --mailbox.
Deduplikace podle SHA256 hashe obsahu:
- stejny hash = soubor uz existuje -> preskoci
- prvni vyskytu souboru: ulozi pod puvodnimnazvem
- kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
Po ulozeni aktualizuje MongoDB:
- v email dokumentu: kazda priloha dostane file_hash + local_path
- kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes,
mime_type, mailbox, first_seen_at, ref_count
Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash
se preskoci. --force-recheck znovu overi i uz stazene.
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
Spousteni:
python download_attachments_v1.1.py --mailbox ordinace@buzalkova.cz
python download_attachments_v1.1.py --mailbox vladimir.buzalka@buzalka.cz --limit 50
python download_attachments_v1.1.py --mailbox ordinace@buzalkova.cz --force-recheck
Docker:
docker exec -it python-runner python /scripts/download_attachments_v1.1.py \\
--mailbox ordinace@buzalkova.cz
Zavislosti:
msal, requests, pymongo
Python 3.10+
Struktura na disku:
/mnt/Emails/
└── <mailbox>/
└── Attachments/
├── faktura_2026.pdf
├── vysledky_lab.pdf
├── vysledky_lab_2.pdf
└── ...
Kolekce emaily.attachments_index:
_id SHA256 hash (hex)
filename nazev souboru na disku
local_path relativni cesta od Attachments/
size_bytes velikost souboru
mime_type MIME typ
mailbox schránka ze ktere pochazi prvni vyskytu
first_seen_at datetime UTC
ref_count v kolika emailech se tato priloha vyskytuje
Historie verzi:
1.0 2026-06-02 Inicialni verze
1.1 2026-06-02 Schránka jako parametr --mailbox (univerzalni pouziti)
"""
import sys
import hashlib
import logging
import argparse
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional
import msal
import requests
from pymongo import MongoClient, UpdateOne
if hasattr(sys.stdout, "reconfigure"):
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
GRAPH_URL = "https://graph.microsoft.com/v1.0"
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
MONGO_COL_INDEX = "attachments_index"
EMAILS_BASE_DIR = Path("/mnt/Emails")
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.1"
BATCH_SIZE = 50
# ──────────────────────────────────────────────────────────────────────────────
logging.basicConfig(
filename=str(LOG_FILE),
level=logging.ERROR,
format="%(asctime)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
encoding="utf-8",
)
_graph_token: Optional[str] = None
# ─── Graph API ────────────────────────────────────────────────────────────────
def get_token() -> str:
global _graph_token
app = msal.ConfidentialClientApplication(
GRAPH_CLIENT_ID,
authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
client_credential=GRAPH_CLIENT_SECRET,
)
result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
if "access_token" not in result:
raise RuntimeError(f"Graph auth failed: {result}")
_graph_token = result["access_token"]
return _graph_token
def graph_get_bytes(url: str) -> bytes:
global _graph_token
if not _graph_token:
get_token()
for attempt in range(2):
r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, timeout=120, stream=True)
if r.status_code == 401:
get_token()
continue
r.raise_for_status()
return r.content
raise RuntimeError(f"Graph GET bytes failed: {url}")
def graph_get_json(url: str, params: dict = None) -> dict:
global _graph_token
if not _graph_token:
get_token()
for attempt in range(2):
r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30)
if r.status_code == 401:
get_token()
continue
r.raise_for_status()
return r.json()
raise RuntimeError(f"Graph GET json failed: {url}")
def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
try:
data = graph_get_json(url, {"$select": "id,name,contentType,size,isInline,contentId"})
return data.get("value", [])
except Exception as e:
logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e)
return []
def fetch_attachment_content(mailbox: str, graph_message_id: str, attachment_id: str) -> Optional[bytes]:
url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments/{attachment_id}/$value"
try:
return graph_get_bytes(url)
except Exception as e:
logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s", graph_message_id, attachment_id, e)
return None
# ─── Dedup + ukládání ─────────────────────────────────────────────────────────
def sha256(data: bytes) -> str:
return hashlib.sha256(data).hexdigest()
def safe_filename(name: str) -> str:
safe = "".join(c if c.isalnum() or c in "._- " else "_" for c in name).strip()
return safe or "attachment"
def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
"""Vrati nazev souboru pro ulozeni — resi kolize (stejny nazev, jiny hash)."""
existing = col_index.find_one({"filename": desired_name})
if existing:
if existing["_id"] == hash_val:
return desired_name # Dedup hit — stejny hash
# Kolize — hledej volny suffix
stem = Path(desired_name).stem
suffix = Path(desired_name).suffix
n = 2
while True:
candidate = f"{stem}_{n}{suffix}"
ex2 = col_index.find_one({"filename": candidate})
if not ex2 or ex2["_id"] == hash_val:
if not (att_dir / candidate).exists() or (ex2 and ex2["_id"] == hash_val):
return candidate
n += 1
return desired_name
def save_attachment(
content: bytes,
original_name: str,
mime_type: str,
mailbox: str,
att_dir: Path,
col_index,
) -> tuple[str, str, bool]:
"""
Ulozi prilohu s deduplikaci.
Vraci (hash, local_path, was_new).
"""
hash_val = sha256(content)
existing = col_index.find_one({"_id": hash_val})
if existing:
col_index.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}})
return hash_val, existing["local_path"], False
filename = resolve_filename(safe_filename(original_name), att_dir, hash_val, col_index)
file_path = att_dir / filename
file_path.write_bytes(content)
col_index.insert_one({
"_id": hash_val,
"filename": filename,
"local_path": filename,
"size_bytes": len(content),
"mime_type": mime_type,
"mailbox": mailbox,
"first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
"ref_count": 1,
})
return hash_val, filename, True
# ─── MAIN ─────────────────────────────────────────────────────────────────────
def main():
ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
ap.add_argument("--mailbox", required=True,
help="Emailova schranka (napr. ordinace@buzalkova.cz)")
ap.add_argument("--limit", type=int, default=0,
help="Zpracovat max N emailu (0 = vse)")
ap.add_argument("--force-recheck", action="store_true",
help="Znovu overi i emaily kde prilohy uz maji file_hash")
ap.add_argument("--no-indexes", action="store_true",
help="Nevytvorit indexy na attachments_index kolekci")
args = ap.parse_args()
mailbox = args.mailbox
att_dir = EMAILS_BASE_DIR / mailbox / "Attachments"
mongo_col = mailbox
start = datetime.now()
print(f"=== download_attachments v{SCRIPT_VERSION} ===")
print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Schránka: {mailbox}")
print(f"Cilovy adresar: {att_dir}")
print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}")
att_dir.mkdir(parents=True, exist_ok=True)
print(" Adresar OK")
print("\nPřipojuji se k Graph API...")
try:
get_token()
print(" Graph API OK")
except Exception as e:
print(f" CHYBA: {e}")
sys.exit(1)
client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
try:
client.admin.command("ping")
print(" MongoDB OK")
except Exception as e:
print(f" CHYBA: MongoDB neni dostupna -- {e}")
sys.exit(1)
col_emails = client[MONGO_DB][mongo_col]
col_index = client[MONGO_DB][MONGO_COL_INDEX]
if not args.no_indexes:
col_index.create_index("filename")
col_index.create_index("mime_type")
col_index.create_index("mailbox")
# Dotaz
if args.force_recheck:
query = {"has_attachments": True}
else:
query = {
"has_attachments": True,
"attachments": {
"$elemMatch": {
"is_inline": False,
"file_hash": {"$exists": False},
}
}
}
total = col_emails.count_documents(query)
print(f"\nEmailu ke zpracovani: {total}")
if total == 0:
print("Neni co stahnout.")
client.close()
return
cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
if args.limit:
cursor = cursor.limit(args.limit)
ok_count = 0
new_count = 0
dup_count = 0
err_count = 0
email_i = 0
batch = []
def flush():
if not batch:
return
try:
col_emails.bulk_write(batch, ordered=False)
except Exception as e:
logging.error("bulk_write: %s", e)
print(f" CHYBA bulk_write: {e}")
batch.clear()
for email_doc in cursor:
email_i += 1
email_id = email_doc["_id"]
graph_id = email_doc.get("graph_id", "")
subject = (email_doc.get("subject") or "")[:60]
att_list = email_doc.get("attachments") or []
real_atts = [a for a in att_list if not a.get("is_inline", False)]
if not real_atts:
continue
print(f"\n {email_i:>5}/{total} {subject}")
graph_atts = fetch_message_attachments(mailbox, graph_id)
graph_att_map = {a["name"]: a for a in graph_atts if not a.get("isInline", False)}
updated_atts = list(att_list)
email_ok = True
for i, att in enumerate(updated_atts):
if att.get("is_inline", False):
continue
if not args.force_recheck and att.get("file_hash"):
print(f" SKIP {att['filename']}")
continue
att_name = att.get("filename", "")
graph_att = graph_att_map.get(att_name)
if not graph_att:
for gname, ga in graph_att_map.items():
if att_name.lower() in gname.lower():
graph_att = ga
break
if not graph_att:
logging.error("attachment not found in Graph [email=%s att=%s]", email_id, att_name)
print(f" ERR {att_name} (nenalezeno v Graph)")
err_count += 1
email_ok = False
continue
content = fetch_attachment_content(mailbox, graph_id, graph_att["id"])
if content is None:
err_count += 1
email_ok = False
print(f" ERR {att_name} (stazeni selhalo)")
continue
mime_type = att.get("mime_type") or graph_att.get("contentType", "")
hash_val, local_path, was_new = save_attachment(
content, att_name, mime_type, mailbox, att_dir, col_index
)
updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}
if was_new:
new_count += 1
print(f" NEW {local_path} ({len(content):,} B)")
else:
dup_count += 1
print(f" DUP {att_name} -> {local_path}")
if email_ok:
ok_count += 1
batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))
if len(batch) >= BATCH_SIZE:
flush()
if email_i % 100 == 0:
elapsed = (datetime.now() - start).total_seconds()
print(f" {''*60}")
print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} err={err_count}")
print(f" {''*60}")
flush()
elapsed_total = (datetime.now() - start).total_seconds()
files_total = col_index.count_documents({})
size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))
print(f"\n{'='*52}")
print(f"Vysledek: emaily={ok_count} | nove={new_count} | dup={dup_count} | err={err_count}")
print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)")
print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
if err_count:
print(f"Chyby logovany do: {LOG_FILE}")
client.close()
if __name__ == "__main__":
main()