notebook
This commit is contained in:
@@ -69,6 +69,13 @@ Historie verzi:
|
||||
1.5 2026-06-13 Nova priloha se zaroven nahrava do SeaweedFS (Tower1) pres
|
||||
sdileny seaweed_store.py; index dostane seaweed_path/url/synced_at.
|
||||
Vypadek SeaweedFS pipeline neshodi (fallback = backfill skript).
|
||||
1.5.1 2026-06-14 Pojistka proti emailum s prazdnym graph_id (legacy/mailstore
|
||||
importy jeste neoznacene jako source=mailstore). Drive se z nich
|
||||
sestavila URL .../messages//attachments -> 400 a kazda priloha
|
||||
se zapocitala jako chyba -> cely pipeline report padal na FAIL
|
||||
(priklad: 2026-06-13, 22 666 chyb). Nyni se takove prilohy
|
||||
trvale oznaci attachment_no_graph_id (jako missing/reference),
|
||||
preskoci se a NEpocitaji jako chyba.
|
||||
"""
|
||||
|
||||
import sys
|
||||
@@ -105,7 +112,7 @@ MONGO_COL_INDEX = "attachments_index"
|
||||
|
||||
EMAILS_BASE_DIR = Path("/mnt/Emails")
|
||||
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
|
||||
SCRIPT_VERSION = "1.5"
|
||||
SCRIPT_VERSION = "1.5.1"
|
||||
BATCH_SIZE = 50
|
||||
|
||||
# Typy příloh které přeskočíme (S/MIME podpisy, certifikáty)
|
||||
@@ -448,6 +455,7 @@ def process_mailbox(client, mailbox: str, args) -> dict:
|
||||
"file_hash": {"$exists": False},
|
||||
"attachment_missing": {"$ne": True},
|
||||
"attachment_reference": {"$ne": True},
|
||||
"attachment_no_graph_id": {"$ne": True},
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -457,7 +465,7 @@ def process_mailbox(client, mailbox: str, args) -> dict:
|
||||
if total == 0:
|
||||
print(" Neni co stahnout.")
|
||||
return {"mailbox": mailbox, "ok": 0, "new": 0, "dup": 0, "skip": 0,
|
||||
"miss": 0, "ref": 0, "err": 0, "elapsed": 0.0}
|
||||
"miss": 0, "ref": 0, "nogid": 0, "err": 0, "elapsed": 0.0}
|
||||
|
||||
cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
|
||||
if args.limit:
|
||||
@@ -469,6 +477,7 @@ def process_mailbox(client, mailbox: str, args) -> dict:
|
||||
skip_count = 0
|
||||
miss_count = 0
|
||||
ref_count = 0
|
||||
nogid_count = 0
|
||||
err_count = 0
|
||||
email_i = 0
|
||||
batch = []
|
||||
@@ -492,10 +501,36 @@ def process_mailbox(client, mailbox: str, args) -> dict:
|
||||
|
||||
real_atts = [a for a in att_list if not a.get("is_inline", False)
|
||||
and not a.get("attachment_missing")
|
||||
and not a.get("attachment_reference")]
|
||||
and not a.get("attachment_reference")
|
||||
and not a.get("attachment_no_graph_id")]
|
||||
if not real_atts:
|
||||
continue
|
||||
|
||||
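# An email without a graph_id cannot be fetched from Graph (legacy/mailstore
|
||||
# import that is not yet tagged source=mailstore). Without this guard the
|
||||
# request URL becomes .../messages//attachments -> 400 Bad Request, and
|
||||
# EVERY attachment is counted as an error -> the whole pipeline report
|
||||
# drops to FAIL. Such attachments are marked attachment_no_graph_id
|
||||
# (permanent, same as missing/reference) so that later runs skip them and
|
||||
# do NOT count them as errors.
|
||||
# NOTE(review): nogid_count grows by len(real_atts), which still includes
|
||||
# attachments that already have file_hash and are not marked below, so the
|
||||
# counter can exceed the number of attachments actually marked - confirm.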
|
||||
if not graph_id:
|
||||
now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
|
||||
marked = list(att_list)
|
||||
for i, a in enumerate(marked):
|
||||
if (a.get("is_inline", False) or a.get("file_hash")
|
||||
or a.get("attachment_missing") or a.get("attachment_reference")
|
||||
or a.get("attachment_no_graph_id")):
|
||||
continue
|
||||
marked[i] = {**a, "attachment_no_graph_id": True,
|
||||
"attachment_no_graph_id_at": now_utc}
|
||||
nogid_count += len(real_atts)
|
||||
batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": marked}}))
|
||||
if len(batch) >= BATCH_SIZE:
|
||||
flush()
|
||||
print(f"\n {email_i:>5}/{total} NOGID {subject} "
|
||||
f"({len(real_atts)} priloh bez graph_id — oznaceno)")
|
||||
continue
|
||||
|
||||
print(f"\n {email_i:>5}/{total} {subject}")
|
||||
|
||||
need_listing = any(
|
||||
@@ -517,6 +552,8 @@ def process_mailbox(client, mailbox: str, args) -> dict:
|
||||
continue
|
||||
if att.get("attachment_missing") or att.get("attachment_reference"):
|
||||
continue
|
||||
if att.get("attachment_no_graph_id"):
|
||||
continue
|
||||
if not args.force_recheck and att.get("file_hash"):
|
||||
continue
|
||||
|
||||
@@ -628,17 +665,17 @@ def process_mailbox(client, mailbox: str, args) -> dict:
|
||||
elapsed = (datetime.now() - start).total_seconds()
|
||||
print(f" {'─'*60}")
|
||||
print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} "
|
||||
f"skip={skip_count} miss={miss_count} ref={ref_count} err={err_count}")
|
||||
f"skip={skip_count} miss={miss_count} ref={ref_count} nogid={nogid_count} err={err_count}")
|
||||
print(f" {'─'*60}")
|
||||
|
||||
flush()
|
||||
|
||||
elapsed = (datetime.now() - start).total_seconds()
|
||||
print(f" -> mailbox total: emaily={ok_count} nove={new_count} dup={dup_count} "
|
||||
f"skip={skip_count} miss={miss_count} ref={ref_count} err={err_count} ({elapsed:.1f} s)")
|
||||
f"skip={skip_count} miss={miss_count} ref={ref_count} nogid={nogid_count} err={err_count} ({elapsed:.1f} s)")
|
||||
return {"mailbox": mailbox, "ok": ok_count, "new": new_count, "dup": dup_count,
|
||||
"skip": skip_count, "miss": miss_count, "ref": ref_count, "err": err_count,
|
||||
"elapsed": elapsed}
|
||||
"skip": skip_count, "miss": miss_count, "ref": ref_count, "nogid": nogid_count,
|
||||
"err": err_count, "elapsed": elapsed}
|
||||
|
||||
|
||||
def discover_mailboxes(db) -> list[str]:
|
||||
@@ -711,24 +748,24 @@ def main():
|
||||
logging.error("process_mailbox %s: %s", mb, e)
|
||||
print(f" FATAL pri zpracovani {mb}: {e}")
|
||||
results.append({"mailbox": mb, "ok": 0, "new": 0, "dup": 0,
|
||||
"skip": 0, "miss": 0, "ref": 0, "err": 1, "elapsed": 0.0})
|
||||
"skip": 0, "miss": 0, "ref": 0, "nogid": 0, "err": 1, "elapsed": 0.0})
|
||||
|
||||
elapsed_total = (datetime.now() - start_all).total_seconds()
|
||||
files_total = col_index.count_documents({})
|
||||
size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))
|
||||
|
||||
grand = {k: sum(r.get(k, 0) for r in results)
|
||||
for k in ("ok", "new", "dup", "skip", "miss", "ref", "err")}
|
||||
for k in ("ok", "new", "dup", "skip", "miss", "ref", "nogid", "err")}
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print("=== SHRNUTI ===")
|
||||
for r in results:
|
||||
print(f" {r['mailbox']:40} ok={r['ok']:>5} nove={r['new']:>4} "
|
||||
f"dup={r['dup']:>4} skip={r['skip']:>3} miss={r.get('miss',0):>3} "
|
||||
f"ref={r.get('ref',0):>3} err={r['err']:>3}")
|
||||
f"ref={r.get('ref',0):>3} nogid={r.get('nogid',0):>4} err={r['err']:>3}")
|
||||
print(f" {'TOTAL':40} ok={grand['ok']:>5} nove={grand['new']:>4} "
|
||||
f"dup={grand['dup']:>4} skip={grand['skip']:>3} miss={grand['miss']:>3} "
|
||||
f"ref={grand['ref']:>3} err={grand['err']:>3}")
|
||||
f"ref={grand['ref']:>3} nogid={grand['nogid']:>4} err={grand['err']:>3}")
|
||||
print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)")
|
||||
print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
|
||||
print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
Reference in New Issue
Block a user