This commit is contained in:
2026-06-13 21:45:28 +02:00
parent 6bcb721eb4
commit f94573ea6e
473 changed files with 7805 additions and 31 deletions
+10
View File
@@ -0,0 +1,10 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Kolik slozek je v checkpointu vs kolik jich ma schranka, a ktere chybi."""
import sys, paramiko
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
i, o, e = c.exec_command("wc -l < /mnt/user/Scripts/MailStore/ingest_done.txt 2>/dev/null")
print("slozek v checkpointu:", o.read().decode().strip())
c.close()
+14
View File
@@ -0,0 +1,14 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Nahraj ingest skript na server pres SFTP."""
import paramiko
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
sftp = c.open_sftp()
local = r"U:\janssen\mailstore\mailstore_ingest_v1.0.py"
remote = "/mnt/user/Scripts/MailStore/mailstore_ingest_v1.0.py"
sftp.put(local, remote)
print("nahrano:", sftp.stat(remote).st_size, "bytu")
sftp.close()
c.close()
+22
View File
@@ -0,0 +1,22 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Nahraj PoC skript na server + over zavislosti/konektivitu v kontejneru."""
import paramiko
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
sftp = c.open_sftp()
sftp.put(r"U:\janssen\mailstore\mailstore_attachments_poc.py",
"/mnt/user/Scripts/MailStore/mailstore_attachments_poc.py")
print("nahrano OK")
sftp.close()
def sh(cmd):
i, o, e = c.exec_command(cmd)
return (o.read() + e.read()).decode("utf-8", "replace").strip()
# requests v kontejneru?
print("requests:", sh("docker exec python-runner python -c \"import requests; print(requests.__version__)\""))
# dosah na SeaweedFS Filer z kontejneru?
print("filer:", sh("docker exec python-runner python -c \"import urllib.request as u; print(u.urlopen('http://192.168.1.50:8888/', timeout=8).status)\""))
c.close()
+28
View File
@@ -0,0 +1,28 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Kolik 2020+ zprav z problemovych slozek uz je v Mongo (dle mailstore_folder)."""
import sys, paramiko
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
py = r'''
from pymongo import MongoClient
from datetime import datetime, timezone
m=MongoClient("mongodb://192.168.1.76:27017")
col=m["emaily"]["vladimir.buzalka@buzalka.cz"]
since=datetime(2020,1,1,tzinfo=timezone.utc)
# vsechny distinct mailstore_folder hodnoty obsahujici tyto nazvy
for needle in ["Odstranen", "/Doru", "OPTUM_PATROc", "27646"]:
folders=[f for f in col.distinct("mailstore_folder") if f and needle in f]
print("== needle:", needle, "->", len(folders), "slozek")
for label, q in [
("Odstranena posta (top)", {"mailstore_folder":{"$regex":"/Odstran[^/]*$"}}),
("Dorucena posta (top)", {"mailstore_folder":{"$regex":"/Doru[^/]*posta$"}}),
]:
tot=col.count_documents(dict(q, source="mailstore"))
new=col.count_documents(dict(q, source="mailstore", sent_at={"$gte":since}))
print(f"{label}: mailstore={tot} 2020+={new}")
'''
i, o, e = c.exec_command('docker exec python-runner python -c %r' % py)
sys.stdout.buffer.write(o.read()); sys.stdout.buffer.write(e.read())
c.close()
+16
View File
@@ -0,0 +1,16 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Rychly test: odpovida MailStore IMAP?"""
import imaplib, ssl, time
MS_HOST="192.168.1.53"; IMAP_PORT=143
MS_USER="admin"; MS_PASS='*$N(B)vMUym!%'
for i in range(1,4):
try:
ctx=ssl.create_default_context(); ctx.check_hostname=False; ctx.verify_mode=ssl.CERT_NONE
t0=time.time()
M=imaplib.IMAP4(MS_HOST,IMAP_PORT); M.starttls(ssl_context=ctx); M.login(MS_USER,MS_PASS)
print(f"pokus {i}: OK za {time.time()-t0:.1f}s, CAP={M.capabilities}")
M.logout(); break
except Exception as ex:
print(f"pokus {i}: SELHAL {type(ex).__name__}: {ex}")
time.sleep(3)
+11
View File
@@ -0,0 +1,11 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys, paramiko
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
# posledni zpracovane slozky s nenulovym k dobrani z posledniho behu
cmd = "grep 'k dobrani=' /mnt/user/Scripts/MailStore/ingest_full.log | tail -15"
i, o, e = c.exec_command(cmd)
sys.stdout.buffer.write(o.read())
c.close()
+22
View File
@@ -0,0 +1,22 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Kolik zprav z mailstore uz je v Mongo kolekci schranky."""
import sys
import paramiko
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
py = (
"from pymongo import MongoClient; "
"m=MongoClient('mongodb://192.168.1.76:27017'); "
"col=m['emaily']['vladimir.buzalka@buzalka.cz']; "
"print('celkem v kolekci:', col.count_documents({})); "
"print('z mailstore:', col.count_documents({'source':'mailstore'}))"
)
cmd = "docker exec python-runner python -c \"%s\"" % py
i, o, e = c.exec_command(cmd)
sys.stdout.buffer.write(o.read())
sys.stdout.buffer.write(e.read())
c.close()
+37
View File
@@ -0,0 +1,37 @@
# Orchestrator: pousti ingest dokola; po wedgi MailStore IMAP restartne sluzbu
# a zkusi znovu. Diky checkpointu kazdy pokus navaze tam, kde wedge utnul.
# Konci kdyz ingest skonci ciste
$env:PYTHONIOENCODING = "utf-8"
$py = "U:/janssen/.venv/Scripts/python.exe"
$maxAttempts = 10
$pass = ConvertTo-SecureString "Vlado7309208104++" -AsPlainText -Force
$cred = New-Object System.Management.Automation.PSCredential("administrator", $pass)
function Restart-MailStore {
Write-Output ">>> restart MailStoreServer sluzby na .53"
Invoke-Command -ComputerName 192.168.1.53 -Credential $cred -ScriptBlock {
Restart-Service -Name MailStoreServer -Force
}
Start-Sleep -Seconds 25
# pockej az IMAP zacne odpovidat (delsi warmup - obri slozky chteji zahraty server)
for ($i = 1; $i -le 12; $i++) {
$ok = & $py "U:/janssen/mailstore/_imap_test.py" 2>&1 | Select-String "OK za"
if ($ok) { Start-Sleep -Seconds 8; Write-Output ">>> IMAP zpet"; return }
Start-Sleep -Seconds 6
}
Write-Output ">>> IMAP po restartu stale neodpovida (jedu dal stejne)"
}
for ($attempt = 1; $attempt -le $maxAttempts; $attempt++) {
Write-Output "================ POKUS $attempt / $maxAttempts ================"
& $py "U:/janssen/mailstore/_run_ingest.py"
$rc = $LASTEXITCODE
Write-Output ">>> ingest skoncil rc=$rc"
if ($rc -eq 0) {
Write-Output "================ HOTOVO (cisty beh) ================"
break
}
# wedge / chyba -> restart MailStore a zkus znovu
Restart-MailStore
}
+40
View File
@@ -0,0 +1,40 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Spusti blokujici ingest na serveru (docker exec drzeny pres paramiko).
Loguje progress kazdych 30s. Idempotentni - hotove slozky preskoci."""
import sys
import time
import paramiko
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
cmd = (
"docker exec -e PYTHONIOENCODING=utf-8 python-runner "
"python -u /scripts/MailStore/mailstore_ingest_v1.0.py "
"vladimir.buzalka@buzalka.cz --since 2020 "
"--log-file /scripts/MailStore/ingest_full.log "
"--checkpoint /scripts/MailStore/ingest_done.txt"
)
tr = c.get_transport()
ch = tr.open_session()
ch.exec_command(cmd)
t0 = time.time()
buf = b""
while not ch.exit_status_ready():
while ch.recv_ready():
buf += ch.recv(65536)
time.sleep(30)
mins = (time.time() - t0) / 60
print("[%5.1f min] bezi..." % mins, flush=True)
while ch.recv_ready():
buf += ch.recv(65536)
rc = ch.recv_exit_status()
print("=== EXIT %d po %.1f min ===" % (rc, (time.time() - t0) / 60), flush=True)
sys.stdout.buffer.write(buf[-2000:])
c.close()
sys.exit(rc)
+29
View File
@@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Ingest rok 2019 z MailStore -> Mongo (fresh checkpoint, blokujici docker exec).
Stavajici checkpoint 2020+ by 2019 minul (preskoci slozky pred scanem), proto
samostatny ingest_done_2019.txt. Dedup Message-ID pohlida duplicity."""
import sys, time, paramiko
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
cmd = (
"docker exec -e PYTHONIOENCODING=utf-8 python-runner "
"python -u /scripts/MailStore/mailstore_ingest_v1.0.py "
"vladimir.buzalka@buzalka.cz --since 2019 --until 2019 "
"--log-file /scripts/MailStore/ingest_2019.log "
"--checkpoint /scripts/MailStore/ingest_done_2019.txt"
)
tr = c.get_transport()
ch = tr.open_session()
ch.exec_command(cmd)
t0 = time.time()
while not ch.exit_status_ready():
time.sleep(30)
print("[%5.1f min] bezi..." % ((time.time() - t0) / 60), flush=True)
rc = ch.recv_exit_status()
print("=== EXIT %d po %.1f min ===" % (rc, (time.time() - t0) / 60), flush=True)
c.close()
sys.exit(rc)
+16
View File
@@ -0,0 +1,16 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Spusti PoC prilohy->SeaweedFS v python-runner kontejneru (blokujici)."""
import sys, paramiko
limit = sys.argv[1] if len(sys.argv) > 1 else "50"
extra = " ".join(sys.argv[2:]) if len(sys.argv) > 2 else "--write-back"
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
cmd = ("docker exec -e PYTHONIOENCODING=utf-8 python-runner "
"python -u /scripts/MailStore/mailstore_attachments_poc.py "
f"vladimir.buzalka@buzalka.cz --limit {limit} {extra}")
i, o, e = c.exec_command(cmd)
sys.stdout.buffer.write(o.read())
sys.stdout.buffer.write(e.read())
c.close()
+29
View File
@@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Prida obri slozky (Odstranena/Dorucena posta) do checkpointu - prijimame je
jako hotove na ~99%, protoze re-scan 170k/47k hlavicek deterministicky wedgne
MailStore IMAP. Data uz v Mongu jsou z drivejsich behu."""
import paramiko
GIANTS = [
"vladimir.buzalka@buzalka.cz/Exchange vladimir.buzalka/Odstraněná pošta",
"vladimir.buzalka@buzalka.cz/Exchange vladimir.buzalka/Doručená pošta",
]
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
sftp = c.open_sftp()
path = "/mnt/user/Scripts/MailStore/ingest_done.txt"
# nacti stavajici
with sftp.open(path, "r") as f:
existing = {ln.strip() for ln in f.read().decode("utf-8").splitlines() if ln.strip()}
added = 0
with sftp.open(path, "a") as f:
for g in GIANTS:
if g not in existing:
f.write(g + "\n")
added += 1
print("pridano:", g)
else:
print("uz tam je:", g)
print("celkem pridano:", added)
sftp.close(); c.close()
+22
View File
@@ -0,0 +1,22 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Jednorazovy status check ingestu na serveru."""
import sys
import paramiko
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
cmd = (
"date; "
"echo ---PROC---; "
"docker exec python-runner ps aux 2>/dev/null | grep mailstore_ingest | grep -v grep; "
"echo ---FOLDERS_DONE---; "
"grep -c 'k dobrani=' /mnt/user/Scripts/MailStore/ingest_full.log; "
"echo ---TAIL---; "
"tail -3 /mnt/user/Scripts/MailStore/ingest_full.log"
)
i, o, e = c.exec_command(cmd)
sys.stdout.buffer.write(o.read())
c.close()
+24
View File
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Stav archivnich storu MailStore pres Management API (GetStores)."""
import json, ssl, urllib.request, urllib.parse
from base64 import b64encode
MS_HOST="192.168.1.53"; API_PORT=8463
MS_USER="admin"; MS_PASS='*$N(B)vMUym!%'
ctx=ssl.create_default_context(); ctx.check_hostname=False; ctx.verify_mode=ssl.CERT_NONE
auth=b64encode(f"{MS_USER}:{MS_PASS}".encode()).decode()
def call(fn, **params):
url=f"https://{MS_HOST}:{API_PORT}/api/invoke/{fn}"
data=urllib.parse.urlencode(params).encode() if params else b""
req=urllib.request.Request(url, data=data, headers={"Authorization":f"Basic {auth}"})
with urllib.request.urlopen(req, context=ctx, timeout=30) as r:
return json.loads(r.read().decode("utf-8-sig"))
for fn in ("GetStores", "GetServerInfo"):
try:
print(f"===== {fn} =====")
print(json.dumps(call(fn), indent=2, ensure_ascii=False)[:2500])
except Exception as ex:
print(f"{fn}: CHYBA {type(ex).__name__}: {ex}")
+9
View File
@@ -0,0 +1,9 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys, paramiko
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
i, o, e = c.exec_command("tail -40 /mnt/user/Scripts/MailStore/ingest_full.log")
sys.stdout.buffer.write(o.read())
c.close()
+22
View File
@@ -0,0 +1,22 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Vyjme obri slozky z checkpointu, aby se znovu proскenovaly (uz bez wedge,
kdyz je C: opravene). Male slozky v checkpointu nejsou -> zpracuji se samy."""
import paramiko
REMOVE = {
"vladimir.buzalka@buzalka.cz/Exchange vladimir.buzalka/Odstraněná pošta",
"vladimir.buzalka@buzalka.cz/Exchange vladimir.buzalka/Doručená pošta",
}
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
sftp = c.open_sftp()
path = "/mnt/user/Scripts/MailStore/ingest_done.txt"
with sftp.open(path, "r") as f:
lines = [ln.rstrip("\n") for ln in f.read().decode("utf-8").splitlines()]
kept = [l for l in lines if l.strip() and l.strip() not in REMOVE]
removed = len(lines) - len(kept)
with sftp.open(path, "w") as f:
f.write("\n".join(kept) + "\n")
print(f"puvodne radku: {len(lines)} odebrano: {removed} zbylo: {len(kept)}")
sftp.close(); c.close()
+129 -29
View File
@@ -62,7 +62,7 @@ MS_PASS = "*$N(B)vMUym!%"
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
HEADER_BATCH = 2000 # kolik hlavicek FETCHovat naraz
HEADER_BATCH = 500 # kolik hlavicek FETCHovat naraz (mensi davka = setrnejsi k MailStore IMAP u obrich slozek)
UPSERT_BATCH = 100 # kolik dokumentu zapsat naraz do Mongo
# --- API (jen GetChildFolders na seznam slozek) -----------------------------
@@ -101,26 +101,51 @@ def collect_folders(mailbox: str) -> list[str]:
# --- IMAP --------------------------------------------------------------------
def imap_connect() -> imaplib.IMAP4:
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
M = imaplib.IMAP4(MS_HOST, IMAP_PORT)
M.starttls(ssl_context=ctx)
M.login(MS_USER, MS_PASS)
return M
def imap_connect(retries: int = 6, delay: float = 5.0) -> imaplib.IMAP4:
"""Pripoj se k IMAP; MailStore obcas utne spojeni i behem handshake
(CAPABILITY => EOF) -> retry s kratkym spankem, aby transientni vypadek
neshodil cely beh."""
last = None
for attempt in range(1, retries + 1):
try:
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
M = imaplib.IMAP4(MS_HOST, IMAP_PORT)
M.starttls(ssl_context=ctx)
M.login(MS_USER, MS_PASS)
return M
except (imaplib.IMAP4.abort, imaplib.IMAP4.error, OSError) as ex:
last = ex
print(" ! imap_connect pokus %d/%d selhal: %s -> cekam %.0fs"
% (attempt, retries, ex, delay), flush=True)
time.sleep(delay)
raise last
_SEQ_RX = re.compile(rb"^(\d+)\s")
_UID_RX = re.compile(rb"UID (\d+)")
def _safe_decode(b: bytes, enc) -> str:
"""Dekoduj bytes; nestandardni/nezname charsety (napr. 'unknown-8bit')
nesmi shodit beh -> fallback na utf-8, pak latin-1."""
for e in (enc, "utf-8", "latin-1"):
if not e:
continue
try:
return b.decode(e, errors="replace")
except (LookupError, TypeError):
continue
return b.decode("utf-8", errors="replace")
def dec(s) -> str:
if not s:
return ""
out = []
for txt, enc in decode_header(s):
out.append(txt.decode(enc or "utf-8", errors="replace") if isinstance(txt, bytes) else txt)
out.append(_safe_decode(txt, enc) if isinstance(txt, bytes) else txt)
return "".join(out).replace("\r", " ").replace("\n", " ").strip()
@@ -254,9 +279,9 @@ def extract_bodies(msg):
"is_inline": "inline" in disp,
})
elif ct == "text/plain" and not body_text:
body_text = (payload or b"").decode(part.get_content_charset() or "utf-8", errors="replace")
body_text = _safe_decode(payload or b"", part.get_content_charset())
elif ct == "text/html" and not body_html:
body_html = (payload or b"").decode(part.get_content_charset() or "utf-8", errors="replace")
body_html = _safe_decode(payload or b"", part.get_content_charset())
return body_text, body_html, atts
@@ -314,8 +339,22 @@ def main() -> int:
ap.add_argument("--max-folders", type=int, default=None, help="Max slozek (diagnostika)")
ap.add_argument("--dry-run", action="store_true",
help="Jen spocitej kolik by se dobralo, NIC nezapisuj")
ap.add_argument("--log-file", default=None,
help="Presmeruj vystup do souboru (line-buffered). Pro detached beh "
"v kontejneru bez shell redirectu (ten by docker exec cleanup zabil).")
ap.add_argument("--checkpoint", default=None,
help="Soubor s hotovymi slozkami (jedna cesta na radek). Hotove slozky "
"se pri dalsim behu preskoci BEZ FETCH -> rychle navazani po wedgi "
"MailStore IMAP. Idempotentni.")
args = ap.parse_args()
# Vlastni log do souboru - aby detached `docker exec -d python ...` mohl bezet
# bez shell wrapperu (sh -c '... &' docker exec cleanup zabije).
if args.log_file:
_f = open(args.log_file, "a", buffering=1, encoding="utf-8")
sys.stdout = _f
sys.stderr = _f
t0 = time.time()
print(f"=== MailStore ingest v1.0 | schranka: {args.mailbox} ===")
print(f"Filtr: rok >= {args.since or '-'}{' a <= ' + str(args.until) if args.until else ''}"
@@ -326,7 +365,8 @@ def main() -> int:
mongo.admin.command("ping")
coll = mongo[MONGO_DB][args.mailbox]
print("Nacitam existujici Message-ID z Mongo...", flush=True)
known = set(coll.distinct("_id"))
# distinct('_id') prekroci 16MB cap u velkych kolekci -> kurzor po davkach
known = {d["_id"] for d in coll.find({}, {"_id": 1}).batch_size(5000)}
print(f" v Mongu uz mam: {len(known):,} zprav")
# slozky
@@ -336,9 +376,21 @@ def main() -> int:
folders = collect_folders(args.mailbox)
print(f"Slozek ke kontrole: {len(folders)}")
# checkpoint hotovych slozek (preskoci se bez FETCH)
done_folders: set[str] = set()
cp_fh = None
if args.checkpoint and not args.dry_run:
try:
with open(args.checkpoint, "r", encoding="utf-8") as _cf:
done_folders = {ln.strip() for ln in _cf if ln.strip()}
except FileNotFoundError:
pass
cp_fh = open(args.checkpoint, "a", buffering=1, encoding="utf-8")
print(f" checkpoint: {len(done_folders)} slozek uz hotovo (preskocim)")
M = imap_connect()
grand_seen = grand_cand = grand_ingested = 0
grand_seen = grand_cand = grand_ingested = grand_errors = 0
queue: list[UpdateOne] = []
def flush():
@@ -348,16 +400,30 @@ def main() -> int:
queue = []
nonlocal_M = {"M": M}
consec_aborts = 0 # po sobe jdouci aborty = MailStore zwedgoval -> exit(1) pro orchestrator
for fidx, folder in enumerate(folders):
if args.max_folders and fidx >= args.max_folders:
print(f" (--max-folders {args.max_folders} dosazeno)")
break
if folder in done_folders:
continue
try:
total, items = scan_folder_headers(nonlocal_M["M"], folder)
except Exception as ex:
# jedna chybna slozka nesmi shodit cely beh - zaloguj a pokracuj.
# Pri chybe IMAP spojeni (abort) se prepoj.
print(f" [{relativize(folder, args.mailbox)[:45]:45}] CHYBA: {type(ex).__name__}: {str(ex)[:80]}", flush=True)
consec_aborts += 1
if consec_aborts >= 4:
# MailStore IMAP je zwedgovany (login projde, ale FETCH hned EOF) ->
# nedet nastavanou kaskadu falesnych preskoku, skonci NEnulovym kodem,
# at orchestrator restartne sluzbu MailStore a navaze z checkpointu.
flush()
print("!!! %d po sobe jdoucich abortu -> MailStore wedge, koncim rc=2 pro restart"
% consec_aborts, flush=True)
if cp_fh:
cp_fh.close()
return 2
try:
nonlocal_M["M"].logout()
except Exception:
@@ -386,21 +452,53 @@ def main() -> int:
if args.dry_run:
continue
for seq, uid, mid in cands:
if args.limit and grand_ingested >= args.limit:
break
raw = fetch_full(M, seq)
if not raw:
continue
doc = build_doc(raw, uid, folder, args.mailbox)
if not doc:
continue
queue.append(UpdateOne({"_id": doc["_id"]}, {"$setOnInsert": doc}, upsert=True))
known.add(doc["_id"])
grand_ingested += 1
if len(queue) >= UPSERT_BATCH:
flush()
flush()
try:
for seq, uid, mid in cands:
if args.limit and grand_ingested >= args.limit:
break
try:
raw = fetch_full(M, seq)
if not raw:
continue
doc = build_doc(raw, uid, folder, args.mailbox)
if not doc:
continue
except imaplib.IMAP4.abort:
# spojeni umrelo (MailStore wedge) -> ven, prepoj, slozku zopakuje
# az dalsi run (NEoznacit hotovou)
raise
except Exception as ex:
# jedna vadna zprava nesmi shodit beh - preskoc a pokracuj
grand_errors += 1
print(f" ! zprava seq={seq} CHYBA: {type(ex).__name__}: {str(ex)[:60]}", flush=True)
continue
queue.append(UpdateOne({"_id": doc["_id"]}, {"$setOnInsert": doc}, upsert=True))
known.add(doc["_id"])
grand_ingested += 1
if len(queue) >= UPSERT_BATCH:
flush()
flush()
except imaplib.IMAP4.abort as ex:
flush()
print(f" [{rel[:45]:45}] IMAP abort behem fetch: {str(ex)[:50]} -> reconnect", flush=True)
consec_aborts += 1
if consec_aborts >= 4:
print("!!! %d po sobe jdoucich abortu -> MailStore wedge, koncim rc=2 pro restart"
% consec_aborts, flush=True)
if cp_fh:
cp_fh.close()
return 2
try:
nonlocal_M["M"].logout()
except Exception:
pass
nonlocal_M["M"] = imap_connect()
continue # slozka NENI hotova -> zopakuje ji dalsi run
# slozka uspesne dokoncena -> zapis do checkpointu
consec_aborts = 0
if cp_fh:
cp_fh.write(folder + "\n")
done_folders.add(folder)
if args.limit and grand_ingested >= args.limit:
print(f" (dosazen limit {args.limit})")
break
@@ -415,6 +513,8 @@ def main() -> int:
print(">>> DRY-RUN: nic nezapsano. Pro ostry beh spust bez --dry-run.")
else:
print(f"Zapsano do Mongo: {grand_ingested:,}")
if grand_errors:
print(f"Preskoceno zprav s chybou: {grand_errors:,}")
print(f"Trvalo: {time.time()-t0:.1f}s")
return 0