notebook
This commit is contained in:
@@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Kolik slozek je v checkpointu vs kolik jich ma schranka, a ktere chybi."""
|
||||
import sys, paramiko
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
|
||||
i, o, e = c.exec_command("wc -l < /mnt/user/Scripts/MailStore/ingest_done.txt 2>/dev/null")
|
||||
print("slozek v checkpointu:", o.read().decode().strip())
|
||||
c.close()
|
||||
@@ -0,0 +1,14 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Nahraj ingest skript na server pres SFTP."""
|
||||
import paramiko
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
|
||||
sftp = c.open_sftp()
|
||||
local = r"U:\janssen\mailstore\mailstore_ingest_v1.0.py"
|
||||
remote = "/mnt/user/Scripts/MailStore/mailstore_ingest_v1.0.py"
|
||||
sftp.put(local, remote)
|
||||
print("nahrano:", sftp.stat(remote).st_size, "bytu")
|
||||
sftp.close()
|
||||
c.close()
|
||||
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Nahraj PoC skript na server + over zavislosti/konektivitu v kontejneru."""
|
||||
import paramiko
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
|
||||
sftp = c.open_sftp()
|
||||
sftp.put(r"U:\janssen\mailstore\mailstore_attachments_poc.py",
|
||||
"/mnt/user/Scripts/MailStore/mailstore_attachments_poc.py")
|
||||
print("nahrano OK")
|
||||
sftp.close()
|
||||
|
||||
def sh(cmd):
|
||||
i, o, e = c.exec_command(cmd)
|
||||
return (o.read() + e.read()).decode("utf-8", "replace").strip()
|
||||
|
||||
# requests v kontejneru?
|
||||
print("requests:", sh("docker exec python-runner python -c \"import requests; print(requests.__version__)\""))
|
||||
# dosah na SeaweedFS Filer z kontejneru?
|
||||
print("filer:", sh("docker exec python-runner python -c \"import urllib.request as u; print(u.urlopen('http://192.168.1.50:8888/', timeout=8).status)\""))
|
||||
c.close()
|
||||
@@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Kolik 2020+ zprav z problemovych slozek uz je v Mongo (dle mailstore_folder)."""
|
||||
import sys, paramiko
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
|
||||
py = r'''
|
||||
from pymongo import MongoClient
|
||||
from datetime import datetime, timezone
|
||||
m=MongoClient("mongodb://192.168.1.76:27017")
|
||||
col=m["emaily"]["vladimir.buzalka@buzalka.cz"]
|
||||
since=datetime(2020,1,1,tzinfo=timezone.utc)
|
||||
# vsechny distinct mailstore_folder hodnoty obsahujici tyto nazvy
|
||||
for needle in ["Odstranen", "/Doru", "OPTUM_PATROc", "27646"]:
|
||||
folders=[f for f in col.distinct("mailstore_folder") if f and needle in f]
|
||||
print("== needle:", needle, "->", len(folders), "slozek")
|
||||
for label, q in [
|
||||
("Odstranena posta (top)", {"mailstore_folder":{"$regex":"/Odstran[^/]*$"}}),
|
||||
("Dorucena posta (top)", {"mailstore_folder":{"$regex":"/Doru[^/]*posta$"}}),
|
||||
]:
|
||||
tot=col.count_documents(dict(q, source="mailstore"))
|
||||
new=col.count_documents(dict(q, source="mailstore", sent_at={"$gte":since}))
|
||||
print(f"{label}: mailstore={tot} 2020+={new}")
|
||||
'''
|
||||
i, o, e = c.exec_command('docker exec python-runner python -c %r' % py)
|
||||
sys.stdout.buffer.write(o.read()); sys.stdout.buffer.write(e.read())
|
||||
c.close()
|
||||
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Rychly test: odpovida MailStore IMAP?"""
|
||||
import imaplib, ssl, time
|
||||
MS_HOST="192.168.1.53"; IMAP_PORT=143
|
||||
MS_USER="admin"; MS_PASS='*$N(B)vMUym!%'
|
||||
for i in range(1,4):
|
||||
try:
|
||||
ctx=ssl.create_default_context(); ctx.check_hostname=False; ctx.verify_mode=ssl.CERT_NONE
|
||||
t0=time.time()
|
||||
M=imaplib.IMAP4(MS_HOST,IMAP_PORT); M.starttls(ssl_context=ctx); M.login(MS_USER,MS_PASS)
|
||||
print(f"pokus {i}: OK za {time.time()-t0:.1f}s, CAP={M.capabilities}")
|
||||
M.logout(); break
|
||||
except Exception as ex:
|
||||
print(f"pokus {i}: SELHAL {type(ex).__name__}: {ex}")
|
||||
time.sleep(3)
|
||||
@@ -0,0 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import sys, paramiko
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
|
||||
# posledni zpracovane slozky s nenulovym k dobrani z posledniho behu
|
||||
cmd = "grep 'k dobrani=' /mnt/user/Scripts/MailStore/ingest_full.log | tail -15"
|
||||
i, o, e = c.exec_command(cmd)
|
||||
sys.stdout.buffer.write(o.read())
|
||||
c.close()
|
||||
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Kolik zprav z mailstore uz je v Mongo kolekci schranky."""
|
||||
import sys
|
||||
import paramiko
|
||||
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
|
||||
|
||||
py = (
|
||||
"from pymongo import MongoClient; "
|
||||
"m=MongoClient('mongodb://192.168.1.76:27017'); "
|
||||
"col=m['emaily']['vladimir.buzalka@buzalka.cz']; "
|
||||
"print('celkem v kolekci:', col.count_documents({})); "
|
||||
"print('z mailstore:', col.count_documents({'source':'mailstore'}))"
|
||||
)
|
||||
cmd = "docker exec python-runner python -c \"%s\"" % py
|
||||
i, o, e = c.exec_command(cmd)
|
||||
sys.stdout.buffer.write(o.read())
|
||||
sys.stdout.buffer.write(e.read())
|
||||
c.close()
|
||||
@@ -0,0 +1,37 @@
|
||||
# Orchestrator: pousti ingest dokola; po wedgi MailStore IMAP restartne sluzbu
|
||||
# a zkusi znovu. Diky checkpointu kazdy pokus navaze tam, kde wedge utnul.
|
||||
# Konci kdyz ingest skonci ciste
|
||||
$env:PYTHONIOENCODING = "utf-8"
|
||||
$py = "U:/janssen/.venv/Scripts/python.exe"
|
||||
$maxAttempts = 10
|
||||
|
||||
$pass = ConvertTo-SecureString "Vlado7309208104++" -AsPlainText -Force
|
||||
$cred = New-Object System.Management.Automation.PSCredential("administrator", $pass)
|
||||
|
||||
function Restart-MailStore {
|
||||
Write-Output ">>> restart MailStoreServer sluzby na .53"
|
||||
Invoke-Command -ComputerName 192.168.1.53 -Credential $cred -ScriptBlock {
|
||||
Restart-Service -Name MailStoreServer -Force
|
||||
}
|
||||
Start-Sleep -Seconds 25
|
||||
# pockej az IMAP zacne odpovidat (delsi warmup - obri slozky chteji zahraty server)
|
||||
for ($i = 1; $i -le 12; $i++) {
|
||||
$ok = & $py "U:/janssen/mailstore/_imap_test.py" 2>&1 | Select-String "OK za"
|
||||
if ($ok) { Start-Sleep -Seconds 8; Write-Output ">>> IMAP zpet"; return }
|
||||
Start-Sleep -Seconds 6
|
||||
}
|
||||
Write-Output ">>> IMAP po restartu stale neodpovida (jedu dal stejne)"
|
||||
}
|
||||
|
||||
for ($attempt = 1; $attempt -le $maxAttempts; $attempt++) {
|
||||
Write-Output "================ POKUS $attempt / $maxAttempts ================"
|
||||
& $py "U:/janssen/mailstore/_run_ingest.py"
|
||||
$rc = $LASTEXITCODE
|
||||
Write-Output ">>> ingest skoncil rc=$rc"
|
||||
if ($rc -eq 0) {
|
||||
Write-Output "================ HOTOVO (cisty beh) ================"
|
||||
break
|
||||
}
|
||||
# wedge / chyba -> restart MailStore a zkus znovu
|
||||
Restart-MailStore
|
||||
}
|
||||
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Spusti blokujici ingest na serveru (docker exec drzeny pres paramiko).
|
||||
Loguje progress kazdych 30s. Idempotentni - hotove slozky preskoci."""
|
||||
import sys
|
||||
import time
|
||||
import paramiko
|
||||
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
|
||||
|
||||
cmd = (
|
||||
"docker exec -e PYTHONIOENCODING=utf-8 python-runner "
|
||||
"python -u /scripts/MailStore/mailstore_ingest_v1.0.py "
|
||||
"vladimir.buzalka@buzalka.cz --since 2020 "
|
||||
"--log-file /scripts/MailStore/ingest_full.log "
|
||||
"--checkpoint /scripts/MailStore/ingest_done.txt"
|
||||
)
|
||||
|
||||
tr = c.get_transport()
|
||||
ch = tr.open_session()
|
||||
ch.exec_command(cmd)
|
||||
|
||||
t0 = time.time()
|
||||
buf = b""
|
||||
while not ch.exit_status_ready():
|
||||
while ch.recv_ready():
|
||||
buf += ch.recv(65536)
|
||||
time.sleep(30)
|
||||
mins = (time.time() - t0) / 60
|
||||
print("[%5.1f min] bezi..." % mins, flush=True)
|
||||
|
||||
while ch.recv_ready():
|
||||
buf += ch.recv(65536)
|
||||
rc = ch.recv_exit_status()
|
||||
print("=== EXIT %d po %.1f min ===" % (rc, (time.time() - t0) / 60), flush=True)
|
||||
sys.stdout.buffer.write(buf[-2000:])
|
||||
c.close()
|
||||
sys.exit(rc)
|
||||
@@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Ingest rok 2019 z MailStore -> Mongo (fresh checkpoint, blokujici docker exec).
|
||||
Stavajici checkpoint 2020+ by 2019 minul (preskoci slozky pred scanem), proto
|
||||
samostatny ingest_done_2019.txt. Dedup Message-ID pohlida duplicity."""
|
||||
import sys, time, paramiko
|
||||
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
|
||||
|
||||
cmd = (
|
||||
"docker exec -e PYTHONIOENCODING=utf-8 python-runner "
|
||||
"python -u /scripts/MailStore/mailstore_ingest_v1.0.py "
|
||||
"vladimir.buzalka@buzalka.cz --since 2019 --until 2019 "
|
||||
"--log-file /scripts/MailStore/ingest_2019.log "
|
||||
"--checkpoint /scripts/MailStore/ingest_done_2019.txt"
|
||||
)
|
||||
tr = c.get_transport()
|
||||
ch = tr.open_session()
|
||||
ch.exec_command(cmd)
|
||||
t0 = time.time()
|
||||
while not ch.exit_status_ready():
|
||||
time.sleep(30)
|
||||
print("[%5.1f min] bezi..." % ((time.time() - t0) / 60), flush=True)
|
||||
rc = ch.recv_exit_status()
|
||||
print("=== EXIT %d po %.1f min ===" % (rc, (time.time() - t0) / 60), flush=True)
|
||||
c.close()
|
||||
sys.exit(rc)
|
||||
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Spusti PoC prilohy->SeaweedFS v python-runner kontejneru (blokujici)."""
|
||||
import sys, paramiko
|
||||
limit = sys.argv[1] if len(sys.argv) > 1 else "50"
|
||||
extra = " ".join(sys.argv[2:]) if len(sys.argv) > 2 else "--write-back"
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
|
||||
cmd = ("docker exec -e PYTHONIOENCODING=utf-8 python-runner "
|
||||
"python -u /scripts/MailStore/mailstore_attachments_poc.py "
|
||||
f"vladimir.buzalka@buzalka.cz --limit {limit} {extra}")
|
||||
i, o, e = c.exec_command(cmd)
|
||||
sys.stdout.buffer.write(o.read())
|
||||
sys.stdout.buffer.write(e.read())
|
||||
c.close()
|
||||
@@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Prida obri slozky (Odstranena/Dorucena posta) do checkpointu - prijimame je
|
||||
jako hotove na ~99%, protoze re-scan 170k/47k hlavicek deterministicky wedgne
|
||||
MailStore IMAP. Data uz v Mongu jsou z drivejsich behu."""
|
||||
import paramiko
|
||||
GIANTS = [
|
||||
"vladimir.buzalka@buzalka.cz/Exchange vladimir.buzalka/Odstraněná pošta",
|
||||
"vladimir.buzalka@buzalka.cz/Exchange vladimir.buzalka/Doručená pošta",
|
||||
]
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
|
||||
sftp = c.open_sftp()
|
||||
path = "/mnt/user/Scripts/MailStore/ingest_done.txt"
|
||||
# nacti stavajici
|
||||
with sftp.open(path, "r") as f:
|
||||
existing = {ln.strip() for ln in f.read().decode("utf-8").splitlines() if ln.strip()}
|
||||
added = 0
|
||||
with sftp.open(path, "a") as f:
|
||||
for g in GIANTS:
|
||||
if g not in existing:
|
||||
f.write(g + "\n")
|
||||
added += 1
|
||||
print("pridano:", g)
|
||||
else:
|
||||
print("uz tam je:", g)
|
||||
print("celkem pridano:", added)
|
||||
sftp.close(); c.close()
|
||||
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Jednorazovy status check ingestu na serveru."""
|
||||
import sys
|
||||
import paramiko
|
||||
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
|
||||
|
||||
cmd = (
|
||||
"date; "
|
||||
"echo ---PROC---; "
|
||||
"docker exec python-runner ps aux 2>/dev/null | grep mailstore_ingest | grep -v grep; "
|
||||
"echo ---FOLDERS_DONE---; "
|
||||
"grep -c 'k dobrani=' /mnt/user/Scripts/MailStore/ingest_full.log; "
|
||||
"echo ---TAIL---; "
|
||||
"tail -3 /mnt/user/Scripts/MailStore/ingest_full.log"
|
||||
)
|
||||
i, o, e = c.exec_command(cmd)
|
||||
sys.stdout.buffer.write(o.read())
|
||||
c.close()
|
||||
@@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Stav archivnich storu MailStore pres Management API (GetStores)."""
|
||||
import json, ssl, urllib.request, urllib.parse
|
||||
from base64 import b64encode
|
||||
|
||||
MS_HOST="192.168.1.53"; API_PORT=8463
|
||||
MS_USER="admin"; MS_PASS='*$N(B)vMUym!%'
|
||||
ctx=ssl.create_default_context(); ctx.check_hostname=False; ctx.verify_mode=ssl.CERT_NONE
|
||||
auth=b64encode(f"{MS_USER}:{MS_PASS}".encode()).decode()
|
||||
|
||||
def call(fn, **params):
|
||||
url=f"https://{MS_HOST}:{API_PORT}/api/invoke/{fn}"
|
||||
data=urllib.parse.urlencode(params).encode() if params else b""
|
||||
req=urllib.request.Request(url, data=data, headers={"Authorization":f"Basic {auth}"})
|
||||
with urllib.request.urlopen(req, context=ctx, timeout=30) as r:
|
||||
return json.loads(r.read().decode("utf-8-sig"))
|
||||
|
||||
for fn in ("GetStores", "GetServerInfo"):
|
||||
try:
|
||||
print(f"===== {fn} =====")
|
||||
print(json.dumps(call(fn), indent=2, ensure_ascii=False)[:2500])
|
||||
except Exception as ex:
|
||||
print(f"{fn}: CHYBA {type(ex).__name__}: {ex}")
|
||||
@@ -0,0 +1,9 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import sys, paramiko
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
|
||||
i, o, e = c.exec_command("tail -40 /mnt/user/Scripts/MailStore/ingest_full.log")
|
||||
sys.stdout.buffer.write(o.read())
|
||||
c.close()
|
||||
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Vyjme obri slozky z checkpointu, aby se znovu proскenovaly (uz bez wedge,
|
||||
kdyz je C: opravene). Male slozky v checkpointu nejsou -> zpracuji se samy."""
|
||||
import paramiko
|
||||
REMOVE = {
|
||||
"vladimir.buzalka@buzalka.cz/Exchange vladimir.buzalka/Odstraněná pošta",
|
||||
"vladimir.buzalka@buzalka.cz/Exchange vladimir.buzalka/Doručená pošta",
|
||||
}
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("192.168.1.76", username="root", password="7309208104", timeout=10)
|
||||
sftp = c.open_sftp()
|
||||
path = "/mnt/user/Scripts/MailStore/ingest_done.txt"
|
||||
with sftp.open(path, "r") as f:
|
||||
lines = [ln.rstrip("\n") for ln in f.read().decode("utf-8").splitlines()]
|
||||
kept = [l for l in lines if l.strip() and l.strip() not in REMOVE]
|
||||
removed = len(lines) - len(kept)
|
||||
with sftp.open(path, "w") as f:
|
||||
f.write("\n".join(kept) + "\n")
|
||||
print(f"puvodne radku: {len(lines)} odebrano: {removed} zbylo: {len(kept)}")
|
||||
sftp.close(); c.close()
|
||||
@@ -62,7 +62,7 @@ MS_PASS = "*$N(B)vMUym!%"
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
MONGO_DB = "emaily"
|
||||
|
||||
HEADER_BATCH = 2000 # kolik hlavicek FETCHovat naraz
|
||||
HEADER_BATCH = 500 # kolik hlavicek FETCHovat naraz (mensi davka = setrnejsi k MailStore IMAP u obrich slozek)
|
||||
UPSERT_BATCH = 100 # kolik dokumentu zapsat naraz do Mongo
|
||||
|
||||
# --- API (jen GetChildFolders na seznam slozek) -----------------------------
|
||||
@@ -101,26 +101,51 @@ def collect_folders(mailbox: str) -> list[str]:
|
||||
|
||||
# --- IMAP --------------------------------------------------------------------
|
||||
|
||||
def imap_connect() -> imaplib.IMAP4:
|
||||
ctx = ssl.create_default_context()
|
||||
ctx.check_hostname = False
|
||||
ctx.verify_mode = ssl.CERT_NONE
|
||||
M = imaplib.IMAP4(MS_HOST, IMAP_PORT)
|
||||
M.starttls(ssl_context=ctx)
|
||||
M.login(MS_USER, MS_PASS)
|
||||
return M
|
||||
def imap_connect(retries: int = 6, delay: float = 5.0) -> imaplib.IMAP4:
|
||||
"""Pripoj se k IMAP; MailStore obcas utne spojeni i behem handshake
|
||||
(CAPABILITY => EOF) -> retry s kratkym spankem, aby transientni vypadek
|
||||
neshodil cely beh."""
|
||||
last = None
|
||||
for attempt in range(1, retries + 1):
|
||||
try:
|
||||
ctx = ssl.create_default_context()
|
||||
ctx.check_hostname = False
|
||||
ctx.verify_mode = ssl.CERT_NONE
|
||||
M = imaplib.IMAP4(MS_HOST, IMAP_PORT)
|
||||
M.starttls(ssl_context=ctx)
|
||||
M.login(MS_USER, MS_PASS)
|
||||
return M
|
||||
except (imaplib.IMAP4.abort, imaplib.IMAP4.error, OSError) as ex:
|
||||
last = ex
|
||||
print(" ! imap_connect pokus %d/%d selhal: %s -> cekam %.0fs"
|
||||
% (attempt, retries, ex, delay), flush=True)
|
||||
time.sleep(delay)
|
||||
raise last
|
||||
|
||||
|
||||
_SEQ_RX = re.compile(rb"^(\d+)\s")
|
||||
_UID_RX = re.compile(rb"UID (\d+)")
|
||||
|
||||
|
||||
def _safe_decode(b: bytes, enc) -> str:
|
||||
"""Dekoduj bytes; nestandardni/nezname charsety (napr. 'unknown-8bit')
|
||||
nesmi shodit beh -> fallback na utf-8, pak latin-1."""
|
||||
for e in (enc, "utf-8", "latin-1"):
|
||||
if not e:
|
||||
continue
|
||||
try:
|
||||
return b.decode(e, errors="replace")
|
||||
except (LookupError, TypeError):
|
||||
continue
|
||||
return b.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def dec(s) -> str:
|
||||
if not s:
|
||||
return ""
|
||||
out = []
|
||||
for txt, enc in decode_header(s):
|
||||
out.append(txt.decode(enc or "utf-8", errors="replace") if isinstance(txt, bytes) else txt)
|
||||
out.append(_safe_decode(txt, enc) if isinstance(txt, bytes) else txt)
|
||||
return "".join(out).replace("\r", " ").replace("\n", " ").strip()
|
||||
|
||||
|
||||
@@ -254,9 +279,9 @@ def extract_bodies(msg):
|
||||
"is_inline": "inline" in disp,
|
||||
})
|
||||
elif ct == "text/plain" and not body_text:
|
||||
body_text = (payload or b"").decode(part.get_content_charset() or "utf-8", errors="replace")
|
||||
body_text = _safe_decode(payload or b"", part.get_content_charset())
|
||||
elif ct == "text/html" and not body_html:
|
||||
body_html = (payload or b"").decode(part.get_content_charset() or "utf-8", errors="replace")
|
||||
body_html = _safe_decode(payload or b"", part.get_content_charset())
|
||||
return body_text, body_html, atts
|
||||
|
||||
|
||||
@@ -314,8 +339,22 @@ def main() -> int:
|
||||
ap.add_argument("--max-folders", type=int, default=None, help="Max slozek (diagnostika)")
|
||||
ap.add_argument("--dry-run", action="store_true",
|
||||
help="Jen spocitej kolik by se dobralo, NIC nezapisuj")
|
||||
ap.add_argument("--log-file", default=None,
|
||||
help="Presmeruj vystup do souboru (line-buffered). Pro detached beh "
|
||||
"v kontejneru bez shell redirectu (ten by docker exec cleanup zabil).")
|
||||
ap.add_argument("--checkpoint", default=None,
|
||||
help="Soubor s hotovymi slozkami (jedna cesta na radek). Hotove slozky "
|
||||
"se pri dalsim behu preskoci BEZ FETCH -> rychle navazani po wedgi "
|
||||
"MailStore IMAP. Idempotentni.")
|
||||
args = ap.parse_args()
|
||||
|
||||
# Vlastni log do souboru - aby detached `docker exec -d python ...` mohl bezet
|
||||
# bez shell wrapperu (sh -c '... &' docker exec cleanup zabije).
|
||||
if args.log_file:
|
||||
_f = open(args.log_file, "a", buffering=1, encoding="utf-8")
|
||||
sys.stdout = _f
|
||||
sys.stderr = _f
|
||||
|
||||
t0 = time.time()
|
||||
print(f"=== MailStore ingest v1.0 | schranka: {args.mailbox} ===")
|
||||
print(f"Filtr: rok >= {args.since or '-'}{' a <= ' + str(args.until) if args.until else ''}"
|
||||
@@ -326,7 +365,8 @@ def main() -> int:
|
||||
mongo.admin.command("ping")
|
||||
coll = mongo[MONGO_DB][args.mailbox]
|
||||
print("Nacitam existujici Message-ID z Mongo...", flush=True)
|
||||
known = set(coll.distinct("_id"))
|
||||
# distinct('_id') prekroci 16MB cap u velkych kolekci -> kurzor po davkach
|
||||
known = {d["_id"] for d in coll.find({}, {"_id": 1}).batch_size(5000)}
|
||||
print(f" v Mongu uz mam: {len(known):,} zprav")
|
||||
|
||||
# slozky
|
||||
@@ -336,9 +376,21 @@ def main() -> int:
|
||||
folders = collect_folders(args.mailbox)
|
||||
print(f"Slozek ke kontrole: {len(folders)}")
|
||||
|
||||
# checkpoint hotovych slozek (preskoci se bez FETCH)
|
||||
done_folders: set[str] = set()
|
||||
cp_fh = None
|
||||
if args.checkpoint and not args.dry_run:
|
||||
try:
|
||||
with open(args.checkpoint, "r", encoding="utf-8") as _cf:
|
||||
done_folders = {ln.strip() for ln in _cf if ln.strip()}
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
cp_fh = open(args.checkpoint, "a", buffering=1, encoding="utf-8")
|
||||
print(f" checkpoint: {len(done_folders)} slozek uz hotovo (preskocim)")
|
||||
|
||||
M = imap_connect()
|
||||
|
||||
grand_seen = grand_cand = grand_ingested = 0
|
||||
grand_seen = grand_cand = grand_ingested = grand_errors = 0
|
||||
queue: list[UpdateOne] = []
|
||||
|
||||
def flush():
|
||||
@@ -348,16 +400,30 @@ def main() -> int:
|
||||
queue = []
|
||||
|
||||
nonlocal_M = {"M": M}
|
||||
consec_aborts = 0 # po sobe jdouci aborty = MailStore zwedgoval -> exit(1) pro orchestrator
|
||||
for fidx, folder in enumerate(folders):
|
||||
if args.max_folders and fidx >= args.max_folders:
|
||||
print(f" (--max-folders {args.max_folders} dosazeno)")
|
||||
break
|
||||
if folder in done_folders:
|
||||
continue
|
||||
try:
|
||||
total, items = scan_folder_headers(nonlocal_M["M"], folder)
|
||||
except Exception as ex:
|
||||
# jedna chybna slozka nesmi shodit cely beh - zaloguj a pokracuj.
|
||||
# Pri chybe IMAP spojeni (abort) se prepoj.
|
||||
print(f" [{relativize(folder, args.mailbox)[:45]:45}] CHYBA: {type(ex).__name__}: {str(ex)[:80]}", flush=True)
|
||||
consec_aborts += 1
|
||||
if consec_aborts >= 4:
|
||||
# MailStore IMAP je zwedgovany (login projde, ale FETCH hned EOF) ->
|
||||
# nedet nastavanou kaskadu falesnych preskoku, skonci NEnulovym kodem,
|
||||
# at orchestrator restartne sluzbu MailStore a navaze z checkpointu.
|
||||
flush()
|
||||
print("!!! %d po sobe jdoucich abortu -> MailStore wedge, koncim rc=2 pro restart"
|
||||
% consec_aborts, flush=True)
|
||||
if cp_fh:
|
||||
cp_fh.close()
|
||||
return 2
|
||||
try:
|
||||
nonlocal_M["M"].logout()
|
||||
except Exception:
|
||||
@@ -386,21 +452,53 @@ def main() -> int:
|
||||
if args.dry_run:
|
||||
continue
|
||||
|
||||
for seq, uid, mid in cands:
|
||||
if args.limit and grand_ingested >= args.limit:
|
||||
break
|
||||
raw = fetch_full(M, seq)
|
||||
if not raw:
|
||||
continue
|
||||
doc = build_doc(raw, uid, folder, args.mailbox)
|
||||
if not doc:
|
||||
continue
|
||||
queue.append(UpdateOne({"_id": doc["_id"]}, {"$setOnInsert": doc}, upsert=True))
|
||||
known.add(doc["_id"])
|
||||
grand_ingested += 1
|
||||
if len(queue) >= UPSERT_BATCH:
|
||||
flush()
|
||||
flush()
|
||||
try:
|
||||
for seq, uid, mid in cands:
|
||||
if args.limit and grand_ingested >= args.limit:
|
||||
break
|
||||
try:
|
||||
raw = fetch_full(M, seq)
|
||||
if not raw:
|
||||
continue
|
||||
doc = build_doc(raw, uid, folder, args.mailbox)
|
||||
if not doc:
|
||||
continue
|
||||
except imaplib.IMAP4.abort:
|
||||
# spojeni umrelo (MailStore wedge) -> ven, prepoj, slozku zopakuje
|
||||
# az dalsi run (NEoznacit hotovou)
|
||||
raise
|
||||
except Exception as ex:
|
||||
# jedna vadna zprava nesmi shodit beh - preskoc a pokracuj
|
||||
grand_errors += 1
|
||||
print(f" ! zprava seq={seq} CHYBA: {type(ex).__name__}: {str(ex)[:60]}", flush=True)
|
||||
continue
|
||||
queue.append(UpdateOne({"_id": doc["_id"]}, {"$setOnInsert": doc}, upsert=True))
|
||||
known.add(doc["_id"])
|
||||
grand_ingested += 1
|
||||
if len(queue) >= UPSERT_BATCH:
|
||||
flush()
|
||||
flush()
|
||||
except imaplib.IMAP4.abort as ex:
|
||||
flush()
|
||||
print(f" [{rel[:45]:45}] IMAP abort behem fetch: {str(ex)[:50]} -> reconnect", flush=True)
|
||||
consec_aborts += 1
|
||||
if consec_aborts >= 4:
|
||||
print("!!! %d po sobe jdoucich abortu -> MailStore wedge, koncim rc=2 pro restart"
|
||||
% consec_aborts, flush=True)
|
||||
if cp_fh:
|
||||
cp_fh.close()
|
||||
return 2
|
||||
try:
|
||||
nonlocal_M["M"].logout()
|
||||
except Exception:
|
||||
pass
|
||||
nonlocal_M["M"] = imap_connect()
|
||||
continue # slozka NENI hotova -> zopakuje ji dalsi run
|
||||
# slozka uspesne dokoncena -> zapis do checkpointu
|
||||
consec_aborts = 0
|
||||
if cp_fh:
|
||||
cp_fh.write(folder + "\n")
|
||||
done_folders.add(folder)
|
||||
if args.limit and grand_ingested >= args.limit:
|
||||
print(f" (dosazen limit {args.limit})")
|
||||
break
|
||||
@@ -415,6 +513,8 @@ def main() -> int:
|
||||
print(">>> DRY-RUN: nic nezapsano. Pro ostry beh spust bez --dry-run.")
|
||||
else:
|
||||
print(f"Zapsano do Mongo: {grand_ingested:,}")
|
||||
if grand_errors:
|
||||
print(f"Preskoceno zprav s chybou: {grand_errors:,}")
|
||||
print(f"Trvalo: {time.time()-t0:.1f}s")
|
||||
return 0
|
||||
|
||||
|
||||
Reference in New Issue
Block a user