This commit is contained in:
2026-06-14 08:25:15 +02:00
parent f94573ea6e
commit ed6455787a
7 changed files with 876 additions and 20 deletions
+48 -11
View File
@@ -69,6 +69,13 @@ Historie verzi:
1.5 2026-06-13 Nova priloha se zaroven nahrava do SeaweedFS (Tower1) pres
sdileny seaweed_store.py; index dostane seaweed_path/url/synced_at.
Vypadek SeaweedFS pipeline neshodi (fallback = backfill skript).
1.5.1 2026-06-14 Pojistka proti emailum s prazdnym graph_id (legacy/mailstore
importy jeste neoznacene jako source=mailstore). Drive se z nich
sestavila URL .../messages//attachments -> 400 a kazda priloha
se zapocitala jako chyba -> cely pipeline report padal na FAIL
(priklad: 2026-06-13, 22 666 chyb). Nyni se takove prilohy
trvale oznaci attachment_no_graph_id (jako missing/reference),
preskoci se a NEpocitaji jako chyba.
"""
import sys
@@ -105,7 +112,7 @@ MONGO_COL_INDEX = "attachments_index"
EMAILS_BASE_DIR = Path("/mnt/Emails")
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.5"
SCRIPT_VERSION = "1.5.1"
BATCH_SIZE = 50
# Typy příloh které přeskočíme (S/MIME podpisy, certifikáty)
@@ -448,6 +455,7 @@ def process_mailbox(client, mailbox: str, args) -> dict:
"file_hash": {"$exists": False},
"attachment_missing": {"$ne": True},
"attachment_reference": {"$ne": True},
"attachment_no_graph_id": {"$ne": True},
}
}
}
@@ -457,7 +465,7 @@ def process_mailbox(client, mailbox: str, args) -> dict:
if total == 0:
print(" Neni co stahnout.")
return {"mailbox": mailbox, "ok": 0, "new": 0, "dup": 0, "skip": 0,
"miss": 0, "ref": 0, "err": 0, "elapsed": 0.0}
"miss": 0, "ref": 0, "nogid": 0, "err": 0, "elapsed": 0.0}
cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
if args.limit:
@@ -469,6 +477,7 @@ def process_mailbox(client, mailbox: str, args) -> dict:
skip_count = 0
miss_count = 0
ref_count = 0
nogid_count = 0
err_count = 0
email_i = 0
batch = []
@@ -492,10 +501,36 @@ def process_mailbox(client, mailbox: str, args) -> dict:
real_atts = [a for a in att_list if not a.get("is_inline", False)
and not a.get("attachment_missing")
and not a.get("attachment_reference")]
and not a.get("attachment_reference")
and not a.get("attachment_no_graph_id")]
if not real_atts:
continue
# Email bez graph_id nelze stahnout z Graphu (legacy/mailstore import,
# jeste neoznaceny jako source=mailstore). Bez teto pojistky se sestavi
# URL .../messages//attachments -> 400 Bad Request a KAZDA priloha se
# zapocita jako chyba -> cely pipeline report spadne na FAIL.
# Oznacime prilohy attachment_no_graph_id (permanentni, jako
# missing/reference), aby se v dalsich bezich preskocily a NEpocitaly
# jako chyba.
if not graph_id:
now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
marked = list(att_list)
for i, a in enumerate(marked):
if (a.get("is_inline", False) or a.get("file_hash")
or a.get("attachment_missing") or a.get("attachment_reference")
or a.get("attachment_no_graph_id")):
continue
marked[i] = {**a, "attachment_no_graph_id": True,
"attachment_no_graph_id_at": now_utc}
nogid_count += len(real_atts)
batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": marked}}))
if len(batch) >= BATCH_SIZE:
flush()
print(f"\n {email_i:>5}/{total} NOGID {subject} "
f"({len(real_atts)} priloh bez graph_id — oznaceno)")
continue
print(f"\n {email_i:>5}/{total} {subject}")
need_listing = any(
@@ -517,6 +552,8 @@ def process_mailbox(client, mailbox: str, args) -> dict:
continue
if att.get("attachment_missing") or att.get("attachment_reference"):
continue
if att.get("attachment_no_graph_id"):
continue
if not args.force_recheck and att.get("file_hash"):
continue
@@ -628,17 +665,17 @@ def process_mailbox(client, mailbox: str, args) -> dict:
elapsed = (datetime.now() - start).total_seconds()
print(f" {''*60}")
print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} "
f"skip={skip_count} miss={miss_count} ref={ref_count} err={err_count}")
f"skip={skip_count} miss={miss_count} ref={ref_count} nogid={nogid_count} err={err_count}")
print(f" {''*60}")
flush()
elapsed = (datetime.now() - start).total_seconds()
print(f" -> mailbox total: emaily={ok_count} nove={new_count} dup={dup_count} "
f"skip={skip_count} miss={miss_count} ref={ref_count} err={err_count} ({elapsed:.1f} s)")
f"skip={skip_count} miss={miss_count} ref={ref_count} nogid={nogid_count} err={err_count} ({elapsed:.1f} s)")
return {"mailbox": mailbox, "ok": ok_count, "new": new_count, "dup": dup_count,
"skip": skip_count, "miss": miss_count, "ref": ref_count, "err": err_count,
"elapsed": elapsed}
"skip": skip_count, "miss": miss_count, "ref": ref_count, "nogid": nogid_count,
"err": err_count, "elapsed": elapsed}
def discover_mailboxes(db) -> list[str]:
@@ -711,24 +748,24 @@ def main():
logging.error("process_mailbox %s: %s", mb, e)
print(f" FATAL pri zpracovani {mb}: {e}")
results.append({"mailbox": mb, "ok": 0, "new": 0, "dup": 0,
"skip": 0, "miss": 0, "ref": 0, "err": 1, "elapsed": 0.0})
"skip": 0, "miss": 0, "ref": 0, "nogid": 0, "err": 1, "elapsed": 0.0})
elapsed_total = (datetime.now() - start_all).total_seconds()
files_total = col_index.count_documents({})
size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))
grand = {k: sum(r.get(k, 0) for r in results)
for k in ("ok", "new", "dup", "skip", "miss", "ref", "err")}
for k in ("ok", "new", "dup", "skip", "miss", "ref", "nogid", "err")}
print(f"\n{'='*60}")
print("=== SHRNUTI ===")
for r in results:
print(f" {r['mailbox']:40} ok={r['ok']:>5} nove={r['new']:>4} "
f"dup={r['dup']:>4} skip={r['skip']:>3} miss={r.get('miss',0):>3} "
f"ref={r.get('ref',0):>3} err={r['err']:>3}")
f"ref={r.get('ref',0):>3} nogid={r.get('nogid',0):>4} err={r['err']:>3}")
print(f" {'TOTAL':40} ok={grand['ok']:>5} nove={grand['new']:>4} "
f"dup={grand['dup']:>4} skip={grand['skip']:>3} miss={grand['miss']:>3} "
f"ref={grand['ref']:>3} err={grand['err']:>3}")
f"ref={grand['ref']:>3} nogid={grand['nogid']:>4} err={grand['err']:>3}")
print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)")
print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")