This commit is contained in:
2026-06-11 21:49:04 +02:00
parent 8e760d3adf
commit 8ef7d1cfd1
15 changed files with 1621 additions and 0 deletions
+49
View File
@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Hlida serverovy log (na Unraidu) dokud dany beh neskonci.
Poluje pres SSH, tiskne ridky progress, skonci na koncovem markeru.
Pouziti: _watch_server_log.py <vzdalena_cesta_logu> [marker]
"""
import sys
import time
import paramiko
# SSH target whose log file is polled (per the module docstring: an Unraid server).
HOST = "192.168.1.76"
USER = "root"
# NOTE(review): the password is hardcoded in a committed file - consider an
# environment variable or key-based authentication instead.
PASS = "7309208104"
# Optional CLI arguments: remote log path, then the end-of-run marker text.
logpath = sys.argv[1] if len(sys.argv) > 1 else "/mnt/user/Scripts/MailStore/dryrun_full.log"
marker = sys.argv[2] if len(sys.argv) > 2 else "Zprav proskenovano"
c = paramiko.SSHClient()
# AutoAddPolicy: host keys that are not yet known are accepted automatically.
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect(HOST, username=USER, password=PASS, timeout=10)
def sh(cmd):
    """Run *cmd* through the module-level SSH client and return its stdout as text.

    Only stdout is read; bytes that are not valid UTF-8 are replaced.
    """
    _stdin, stdout, _stderr = c.exec_command(cmd)
    captured = stdout.read()
    return captured.decode("utf-8", "replace")
import shlex  # needed only for building the remote command line below

t0 = time.time()
last_count = -1
# shlex.quote yields POSIX-shell quoting. repr() is Python quoting and would let
# "$", backticks or embedded quotes in the path be interpreted by the remote shell.
cat_cmd = f"cat {shlex.quote(logpath)} 2>/dev/null"
try:
    while True:
        content = sh(cat_cmd)
        log_lines = content.splitlines()
        # The run is over once the end marker shows up - or a Python traceback.
        done = (marker in content) or ("Traceback" in content)
        folders = content.count("k dobrani=")
        if folders != last_count:
            # Report only when the number of processed folders has changed.
            mins = (time.time() - t0) / 60
            # most recently processed folder
            progress = [ln for ln in log_lines if "k dobrani=" in ln]
            last = progress[-1].strip() if progress else ""
            print(f"[{mins:4.1f} min] slozek hotovo: {folders:4} | {last[:70]}", flush=True)
            last_count = folders
        if done:
            print("=== HOTOVO ===", flush=True)
            tail = "\n".join(log_lines[-10:])
            print(tail, flush=True)
            break
        time.sleep(30)
finally:
    # Close the SSH session even when polling is interrupted or fails.
    c.close()
+54
View File
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""Test IMAP SEARCH proti MailStore serveru — ověření rychlosti a funkčnosti."""
import imaplib
import ssl
import sys
import time
# IMAP endpoint and login used by connect().
HOST = "192.168.1.53"
PORT = 143
USER = "admin"
# NOTE(review): the password is hardcoded in a committed file - consider
# reading it from the environment instead.
PASS = "*$N(B)vMUym!%"
def connect():
    """Open an IMAP session to HOST:PORT, upgrade it with STARTTLS and log in.

    The TLS context has hostname checking and certificate verification
    switched off.
    """
    tls = ssl.create_default_context()
    tls.check_hostname = False  # must be cleared before verify_mode may be CERT_NONE
    tls.verify_mode = ssl.CERT_NONE
    session = imaplib.IMAP4(HOST, PORT)
    session.starttls(ssl_context=tls)
    session.login(USER, PASS)
    return session
def main():
    """Time a handful of IMAP SEARCH commands against one hard-coded folder.

    Connects, SELECTs the target folder read-only and, when it is not empty,
    runs each search criterion and prints the hit count with timings.
    """
    t0 = time.time()
    M = connect()
    try:
        print(f"[{time.time()-t0:.1f}s] připojeno + login", flush=True)
        # Direct SELECT of a specific folder (LIST does not show foreign archives, SELECT does)
        target = "vladimir.buzalka@buzalka.cz/Exchange vladimir.buzalka/Sent Items"
        typ, data = M.select(f'"{target}"', readonly=True)
        count = int(data[0]) if typ == "OK" and data and data[0] else 0
        print(f"[{time.time()-t0:.1f}s] SELECT '{target}' = {count} zpráv (typ={typ})", flush=True)
        if count == 0:
            return
        # Try SEARCH with several criteria
        for crit, val in [("ALL", None), ("SUBJECT", "re"), ("FROM", "cz"), ("TEXT", "objednávka")]:
            ts = time.time()
            label = crit if val is None else f'{crit} "{val}"'
            try:
                if val is None:
                    typ, data = M.search(None, crit)
                elif all(ord(ch) < 128 for ch in val):
                    typ, data = M.search(None, crit, f'"{val}"')
                else:
                    # imaplib encodes str arguments as ASCII, so a non-ASCII
                    # value would raise UnicodeEncodeError. Send it as a UTF-8
                    # literal together with an explicit CHARSET instead.
                    M.literal = val.encode("utf-8")
                    typ, data = M.search("UTF-8", crit)
            except imaplib.IMAP4.error as ex:
                print(f"[{time.time()-t0:.1f}s] SEARCH {label}: CHYBA {ex} ({time.time()-ts:.2f}s)", flush=True)
                continue
            nums = data[0].split() if data and data[0] else []
            print(f"[{time.time()-t0:.1f}s] SEARCH {label}: {len(nums)} výsledků ({time.time()-ts:.2f}s)", flush=True)
    finally:
        # Best-effort logout: the connection may already be gone after an error.
        try:
            M.logout()
        except (imaplib.IMAP4.error, OSError):
            pass
    print(f"[{time.time()-t0:.1f}s] hotovo", flush=True)


if __name__ == "__main__":
    main()
+176
View File
@@ -0,0 +1,176 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
================================================================================
Nazev: mailstore_folder_v1.0.py
Verze: 1.0
Datum: 2026-06-11
Autor: Vladimir Buzalka (asistovano Claude)
Popis: Vypise obsah jedne MailStore slozky jako seznam zprav
(datum | od | predmet) pres davkovy IMAP FETCH hlavicek.
Predstupen ingestu - overuje davkove cteni hlavicek.
Argument = plna cesta slozky (fullName z mapy), napr.:
"vladimir.buzalka@buzalka.cz/Exchange vladimir.buzalka/Sent Items"
Zdroj: MailStore IMAP server, port 143, STARTTLS, auth Prosty text (LOGIN).
IMAP FETCH BODY.PEEK[HEADER.FIELDS (...)] = hlavicky bez oznaceni
jako precteno. Davkove jednim prikazem, ne po jedne zprave.
Spusteni:
python mailstore_folder_v1.0.py "...slozka..." # poslednich 50
python mailstore_folder_v1.0.py "...slozka..." --limit 200
python mailstore_folder_v1.0.py "...slozka..." --all # vse (pozor velke slozky)
python mailstore_folder_v1.0.py "...slozka..." --oldest # od nejstarsich
================================================================================
"""
from __future__ import annotations
import argparse
import email
import imaplib
import re
import ssl
import sys
from email.header import decode_header
from email.utils import parsedate_to_datetime
# --- configuration -----------------------------------------------------------
HOST = "192.168.1.53"
PORT = 143
USER = "admin"
# NOTE(review): the password is hardcoded in a committed file - consider
# reading it from the environment instead.
PASS = "*$N(B)vMUym!%"
# Number of messages listed when neither --limit nor --all is given.
DEFAULT_LIMIT = 50
# --- helpery ----------------------------------------------------------------
def connect() -> imaplib.IMAP4:
    """Return a logged-in IMAP connection to HOST:PORT secured with STARTTLS.

    Certificate and hostname verification are disabled on the TLS context.
    """
    tls_context = ssl.create_default_context()
    tls_context.check_hostname = False  # must be cleared before CERT_NONE is accepted
    tls_context.verify_mode = ssl.CERT_NONE
    imap = imaplib.IMAP4(HOST, PORT)
    imap.starttls(ssl_context=tls_context)
    imap.login(USER, PASS)
    return imap
def encode_mutf7(s: str) -> str:
    """Encode an IMAP folder name as modified UTF-7 (RFC 3501).

    Printable ASCII (0x20-0x7e) passes through, "&" becomes "&-", and every
    run of other characters becomes "&<base64 of UTF-16BE, '/' -> ','>-".
    The result is pure ASCII.
    """
    import base64 as _b64

    def _shift(match):
        utf16 = match.group(0).encode("utf-16-be")
        token = _b64.b64encode(utf16).decode("ascii")
        return "&" + token.rstrip("=").replace("/", ",") + "-"

    # Escape literal ampersands first; the regex below never touches ASCII.
    return re.sub(r"[^\x20-\x7e]+", _shift, s.replace("&", "&-"))
def dec(s: str | None) -> str:
    """Decode a MIME-encoded header (=?utf-8?...?=) into readable text.

    Returns "" for an empty or missing header. CR and LF are replaced by
    spaces and the result is stripped.
    """
    if not s:
        return ""
    out = []
    for txt, enc in decode_header(s):
        if isinstance(txt, bytes):
            try:
                out.append(txt.decode(enc or "utf-8", errors="replace"))
            except LookupError:
                # Charset label without a Python codec (e.g. "unknown-8bit",
                # which the email package reports for raw 8-bit headers).
                out.append(txt.decode("utf-8", errors="replace"))
        else:
            out.append(txt)
    return "".join(out).replace("\r", " ").replace("\n", " ").strip()
def fmt_date(raw: str | None) -> str:
    """Format a Date header as "YYYY-MM-DD HH:MM".

    Returns "?" when the header is missing. When it cannot be parsed, the
    first 16 characters of its text are returned instead.
    """
    if not raw:
        return "?"
    # msg.get() may hand back an email.header.Header object rather than str
    # (raw 8-bit bytes in the header); slicing that object would raise.
    text = str(raw)
    try:
        return parsedate_to_datetime(text).strftime("%Y-%m-%d %H:%M")
    except Exception:
        # Display helper: never fail on a malformed date. The encode/decode
        # round-trip drops lone surrogates that could not be printed later.
        return text.encode("utf-8", "replace").decode("utf-8")[:16]
def short(s: str, n: int) -> str:
    """Return *s* limited to *n* characters, marking a cut with an ellipsis.

    None is treated as "". A truncated result is exactly *n* characters long:
    n - 1 characters of text plus the one-character marker.
    """
    s = s or ""
    if len(s) <= n:
        return s
    if n <= 0:
        return ""
    # The n - 1 slice leaves room for the marker; without a visible marker the
    # reader cannot tell that the text was cut.
    return s[: n - 1] + "\u2026"
# IMAP FETCH header blocks arrive as a tuple (b'N (BODY[...] {len}', b'<headers>');
# this pattern captures the leading number N of the first element.
_NUM_RX = re.compile(rb"^(\d+)\s")
def main() -> int:
    """List messages of one MailStore folder as "# | date | from | subject".

    Returns 0 on success and 1 when the folder cannot be opened or the
    header FETCH fails.
    """
    ap = argparse.ArgumentParser(description="Vypis obsahu MailStore slozky")
    ap.add_argument("folder", help="Plna cesta slozky (fullName z mapy)")
    ap.add_argument("--limit", type=int, default=DEFAULT_LIMIT,
                    help=f"Pocet zprav (default {DEFAULT_LIMIT})")
    ap.add_argument("--all", action="store_true", help="Vsechny zpravy (ignoruje --limit)")
    ap.add_argument("--oldest", action="store_true",
                    help="Od nejstarsich (default: od nejnovejsich)")
    args = ap.parse_args()
    if not args.all and args.limit < 1:
        # A zero or negative limit would produce an inverted range such as "101:100".
        ap.error("--limit musi byt >= 1")

    M = connect()
    try:
        typ, data = M.select(f'"{encode_mutf7(args.folder)}"', readonly=True)
        if typ != "OK":
            print(f"Slozku nelze otevrit: {data}", file=sys.stderr)
            return 1
        total = int(data[0]) if data and data[0] else 0
        print(f"Slozka: {args.folder}")
        print(f"Zprav celkem: {total:,}")
        if total == 0:
            return 0

        # pick the sequence-number range (1 = oldest, total = newest)
        if args.all:
            lo, hi = 1, total
        else:
            n = min(args.limit, total)
            lo, hi = (1, n) if args.oldest else (total - n + 1, total)
        rng = f"{lo}:{hi}"
        shown = hi - lo + 1
        order = "nejstarsi" if args.oldest else "nejnovejsi"
        print(f"Zobrazuji {shown} zprav ({order} prvni), rozsah #{rng}")
        print("=" * 100)

        # batched FETCH of the headers
        typ, msgs = M.fetch(rng, "(BODY.PEEK[HEADER.FIELDS (DATE FROM SUBJECT)])")
        if typ != "OK":
            print(f"FETCH {rng} selhal: {msgs}", file=sys.stderr)
            return 1
    finally:
        # Always release the connection, including on the early returns above.
        try:
            M.logout()
        except (imaplib.IMAP4.error, OSError):
            pass

    rows = []
    for item in msgs:
        # Non-tuple entries are the closing b")" fragments of the response.
        if not isinstance(item, tuple):
            continue
        meta, hdr_bytes = item[0], item[1]
        m = _NUM_RX.match(meta or b"")
        seqno = int(m.group(1)) if m else 0
        hdr = email.message_from_bytes(hdr_bytes or b"")
        rows.append((seqno, fmt_date(hdr.get("Date")),
                     dec(hdr.get("From")), dec(hdr.get("Subject"))))
    rows.sort(key=lambda r: r[0], reverse=not args.oldest)

    print(f"{'#':>6} {'Datum':<16} {'Od':<32} Predmet")
    print("-" * 100)
    for seqno, d, frm, subj in rows:
        print(f"{seqno:>6} {d:<16} {short(frm, 32):<32} {short(subj, 40)}")
    print("=" * 100)
    print(f"Vypsano {len(rows)} zprav.")
    return 0


if __name__ == "__main__":
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        print("\nPreruseno", file=sys.stderr)
        sys.exit(1)
+427
View File
@@ -0,0 +1,427 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
================================================================================
Nazev: mailstore_ingest_v1.0.py
Verze: 1.0
Datum: 2026-06-11
Autor: Vladimir Buzalka (asistovano Claude)
Popis: Backfill stare historie z MailStore archivu do MongoDB `emaily`.
Dobere do existujici kolekce schranky JEN zpravy, ktere tam jeste
nejsou - dedup podle internet Message-ID (= _id v Mongu).
Cilove schema dokumentu = stejne jako Graph import, takze navazujici
enrich_fulltext_emails + MCP `emaily` search funguji bez uprav.
Strategie:
1. Nacti SET vsech Message-ID (_id) co uz v Mongu pro schranku jsou.
2. Projdi slozky schranky (API GetChildFolders).
3. Per slozka davkove stahni hlavicky (UID, DATE, MESSAGE-ID) - rychle.
4. Kandidat = Message-ID neni v setu AND rok(DATE) >= --since.
5. Pro kandidaty stahni cele telo (RFC822), naparsuj, upsert do Mongo.
Filtr data je client-side z DATE headeru (IMAP SEARCH je u MailStore 78s/k nicemu).
Spusteni:
# KOLIK by se dobralo (nic nezapise) - delej VZDY prvni:
python mailstore_ingest_v1.0.py "vladimir.buzalka@buzalka.cz" --since 2020 --dry-run
# ostry beh:
python mailstore_ingest_v1.0.py "vladimir.buzalka@buzalka.cz" --since 2020
# test na jedne slozce / s limitem:
python mailstore_ingest_v1.0.py "vladimir.buzalka@buzalka.cz" --since 2020 \
--folder "vladimir.buzalka@buzalka.cz/Exchange vladimir.buzalka/Sent Items" --limit 50
================================================================================
"""
from __future__ import annotations
import argparse
import email
import imaplib
import json
import re
import ssl
import sys
import time
import urllib.parse
import urllib.request
from base64 import b64encode
from datetime import datetime, timezone
from email.header import decode_header
from email.utils import getaddresses, parsedate_to_datetime
from pymongo import MongoClient, UpdateOne
# --- configuration -----------------------------------------------------------
MS_HOST = "192.168.1.53"
IMAP_PORT = 143
API_PORT = 8463
MS_USER = "admin"
# NOTE(review): the password is hardcoded in a committed file - consider
# reading it from the environment instead.
MS_PASS = "*$N(B)vMUym!%"
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
HEADER_BATCH = 2000 # how many headers to FETCH in one command
UPSERT_BATCH = 100 # how many documents to write to Mongo in one bulk
# --- API (only GetChildFolders, to obtain the folder list) -------------------
_API_BASE = f"https://{MS_HOST}:{API_PORT}/api"
_API_AUTH = "Basic " + b64encode(f"{MS_USER}:{MS_PASS}".encode()).decode()
# TLS context for the API calls; hostname and certificate checks are disabled.
_CTX = ssl.create_default_context()
_CTX.check_hostname = False
_CTX.verify_mode = ssl.CERT_NONE
def api_result(method: str, params: dict | None = None):
    """POST one Administration API call and return its "result" field.

    Parameters are sent as a form body. Raises RuntimeError when the reply's
    statusCode is anything other than "succeeded".
    """
    form_body = urllib.parse.urlencode(params or {}).encode()
    request = urllib.request.Request(
        f"{_API_BASE}/invoke/{method}",
        data=form_body,
        method="POST",
        headers={
            "Authorization": _API_AUTH,
            "Content-Type": "application/x-www-form-urlencoded",
        },
    )
    with urllib.request.urlopen(request, context=_CTX, timeout=30) as resp:
        # utf-8-sig drops a leading BOM if the server sends one
        reply = json.loads(resp.read().decode("utf-8-sig"))
    if reply.get("statusCode") == "succeeded":
        return reply.get("result")
    raise RuntimeError(f"{method}: {(reply.get('error') or {}).get('message')}")
def collect_folders(mailbox: str) -> list[str]:
    """Return the full paths of every folder below *mailbox*, parents first.

    The tree comes from the GetChildFolders API call (up to 20 levels) and
    is flattened depth-first in the order the API lists the children.
    """
    tree = api_result("GetChildFolders", {"folder": mailbox, "maxLevels": 20})
    paths: list[str] = []
    # Explicit stack instead of recursion; children are pushed reversed so
    # they are popped in their original order.
    stack = list(reversed(tree.get("childFolders") or []))
    while stack:
        node = stack.pop()
        paths.append(node["fullName"])
        stack.extend(reversed(node.get("childFolders") or []))
    return paths
# --- IMAP --------------------------------------------------------------------
def imap_connect() -> imaplib.IMAP4:
    """Return a logged-in IMAP connection to MailStore secured with STARTTLS.

    Certificate and hostname verification are disabled on the TLS context.
    If STARTTLS or LOGIN fails, the socket is closed before the exception
    propagates, so repeated reconnect attempts do not leak connections.
    """
    ctx = ssl.create_default_context()
    ctx.check_hostname = False  # must be cleared before CERT_NONE is accepted
    ctx.verify_mode = ssl.CERT_NONE
    M = imaplib.IMAP4(MS_HOST, IMAP_PORT)
    try:
        M.starttls(ssl_context=ctx)
        M.login(MS_USER, MS_PASS)
    except Exception:
        M.shutdown()
        raise
    return M
# Leading message sequence number of a FETCH response's metadata line.
_SEQ_RX = re.compile(rb"^(\d+)\s")
# UID value reported inside the FETCH response's metadata line.
_UID_RX = re.compile(rb"UID (\d+)")
def dec(s) -> str:
    """Decode a MIME-encoded header value into readable text.

    Returns "" for an empty or missing header. CR and LF are replaced by
    spaces and the result is stripped.
    """
    if not s:
        return ""
    out = []
    for txt, enc in decode_header(s):
        if not isinstance(txt, bytes):
            out.append(txt)
            continue
        try:
            out.append(txt.decode(enc or "utf-8", errors="replace"))
        except LookupError:
            # Charset label without a Python codec (e.g. "unknown-8bit", which
            # the email package reports for raw 8-bit headers). One odd header
            # must not abort the whole ingest.
            out.append(txt.decode("utf-8", errors="replace"))
    return "".join(out).replace("\r", " ").replace("\n", " ").strip()
def parse_date(raw) -> datetime | None:
    """Parse an RFC 2822 date header into a naive datetime.

    Timezone-aware values are converted to UTC and made naive; values
    without timezone information are returned as parsed. Missing or
    unparseable input gives None.
    """
    if not raw:
        return None
    try:
        parsed = parsedate_to_datetime(raw)
        if not parsed.tzinfo:
            return parsed
        return parsed.astimezone(timezone.utc).replace(tzinfo=None)
    except Exception:
        return None
def encode_mutf7(s: str) -> str:
    """Encode an IMAP folder name as modified UTF-7 (RFC 3501).

    Printable ASCII (0x20-0x7e) passes through, "&" becomes "&-", and every
    run of other characters becomes "&<base64 of UTF-16BE, '/' -> ','>-".
    The result is pure ASCII, so imaplib can send it as-is.
    """
    def _shift(match):
        encoded = b64encode(match.group(0).encode("utf-16-be")).decode("ascii")
        return "&" + encoded.rstrip("=").replace("/", ",") + "-"

    # Escape literal ampersands first; the regex below never touches ASCII.
    escaped = s.replace("&", "&-")
    return re.sub(r"[^\x20-\x7e]+", _shift, escaped)
def imap_select(M: imaplib.IMAP4, folder: str):
    """SELECT *folder* read-only, sending its name mUTF-7 encoded and double-quoted."""
    quoted_name = '"' + encode_mutf7(folder) + '"'
    return M.select(quoted_name, readonly=True)
def scan_folder_headers(M: imaplib.IMAP4, folder: str):
    """Fetch (seq, uid, msgid, date) for every message of *folder* in batches.

    Returns (total, items). total is None when the folder cannot be selected
    and 0 when it is empty; items is then an empty list.

    Raises imaplib.IMAP4.error when a header FETCH batch is rejected, so a
    failed batch is reported instead of being read as "no messages".
    """
    typ, data = imap_select(M, folder)
    if typ != "OK":
        return None, []
    total = int(data[0]) if data and data[0] else 0
    if total == 0:
        return 0, []
    items = []
    for lo in range(1, total + 1, HEADER_BATCH):
        hi = min(lo + HEADER_BATCH - 1, total)
        typ, msgs = M.fetch(f"{lo}:{hi}",
                            "(UID BODY.PEEK[HEADER.FIELDS (MESSAGE-ID DATE)])")
        if typ != "OK":
            raise imaplib.IMAP4.error(f"FETCH {lo}:{hi} failed: {msgs!r}")
        for it in msgs:
            # Non-tuple entries are the closing b")" fragments of the response.
            if not isinstance(it, tuple):
                continue
            meta, hdr = it[0], it[1]
            mseq = _SEQ_RX.match(meta or b"")
            muid = _UID_RX.search(meta or b"")
            h = email.message_from_bytes(hdr or b"")
            raw_mid = h.get("Message-ID")
            # A header with raw 8-bit bytes comes back as a Header object, not
            # str. Such an id is unusable as a key, so report it as missing
            # rather than letting .strip() fail for the whole folder.
            mid = raw_mid.strip() if isinstance(raw_mid, str) else ""
            items.append((int(mseq.group(1)) if mseq else 0,
                          int(muid.group(1)) if muid else 0,
                          mid, parse_date(h.get("Date"))))
    return total, items
def fetch_full(M: imaplib.IMAP4, seq: int) -> bytes | None:
    """Fetch the complete raw message (RFC822) with sequence number *seq*.

    Returns None when the server does not answer OK or the response does not
    start with a (metadata, payload) tuple.
    """
    status, response = M.fetch(str(seq), "(RFC822)")
    if status == "OK" and response and isinstance(response[0], tuple):
        return response[0][1]
    return None
# --- mapovani EML -> Mongo dokument -----------------------------------------
def relativize(folder: str, mailbox: str) -> str:
    """Turn a full MailStore path into a mailbox-relative folder path.

    mailbox/Exchange X/Sent Items -> Sent Items. For paths that start with
    *mailbox*, the mailbox segment and the segment after it are dropped when
    there are more than two segments; with exactly two, only the mailbox
    segment is dropped. Any other path yields its last segment.
    """
    segments = folder.split("/")
    if len(segments) < 2 or segments[0] != mailbox:
        return segments[-1]
    skip = 2 if len(segments) > 2 else 1
    return "/".join(segments[skip:])
def parse_addr_one(raw) -> dict:
    """Parse the first address of a header into {"email", "name"}.

    Both values are None when the header is empty or yields no address. The
    e-mail is lower-cased; the name falls back to the address when no display
    name is present.
    """
    parsed = getaddresses([raw]) if raw else []
    if not parsed:
        return {"email": None, "name": None}
    display, address = parsed[0]
    return {
        "email": (address or "").lower() or None,
        "name": dec(display) or (address or None),
    }
def parse_recipients(msg) -> list[dict]:
    """Collect To/Cc/Bcc recipients as {"type", "email", "name"} dicts.

    Entries without an address are skipped; the e-mail is lower-cased and the
    name falls back to the address.
    """
    found = []
    for kind, header_name in (("to", "To"), ("cc", "Cc"), ("bcc", "Bcc")):
        value = msg.get(header_name)
        if value:
            found.extend(
                {"type": kind, "email": address.lower(), "name": dec(display) or address}
                for display, address in getaddresses([value])
                if address
            )
    return found
def _payload_text(part, payload) -> str:
    """Decode a text part's bytes, falling back to UTF-8 for unknown charset labels."""
    data = payload or b""
    try:
        return data.decode(part.get_content_charset() or "utf-8", errors="replace")
    except LookupError:
        # A charset label Python has no codec for must not abort the ingest.
        return data.decode("utf-8", errors="replace")


def extract_bodies(msg):
    """Return (body_text, body_html, attachments) for a parsed message.

    The first text/plain and the first text/html leaf part become the bodies.
    A part counts as an attachment when its Content-Disposition contains
    "attachment", or when it has a filename and is not text/plain or
    text/html. Attachments are described by filename, size, MIME type and an
    inline flag; their content is not kept.
    """
    body_text = body_html = ""
    atts = []
    for part in msg.walk():
        if part.is_multipart():
            continue
        ct = part.get_content_type()
        # Disposition types are case-insensitive ("Attachment" is valid).
        disp = str(part.get("Content-Disposition") or "").lower()
        payload = part.get_payload(decode=True)
        is_att = "attachment" in disp or (part.get_filename() and ct not in ("text/plain", "text/html"))
        if is_att:
            atts.append({
                "filename": dec(part.get_filename()) or "(bez nazvu)",
                "size_bytes": len(payload or b""),
                "mime_type": ct,
                "is_inline": "inline" in disp,
            })
        elif ct == "text/plain" and not body_text:
            body_text = _payload_text(part, payload)
        elif ct == "text/html" and not body_html:
            body_html = _payload_text(part, payload)
    return body_text, body_html, atts
def build_doc(raw: bytes, uid: int, folder: str, mailbox: str) -> dict | None:
    """Build the Mongo document for one raw message (EML bytes).

    Returns None when the message has no usable Message-ID, because that
    value is the document's _id. The Date header fills both sent_at and
    received_at; the bookkeeping timestamps are set to the current UTC time
    (naive).
    """
    msg = email.message_from_bytes(raw)
    raw_mid = msg.get("Message-ID")
    # A header with raw 8-bit bytes comes back as a Header object, not str;
    # it cannot serve as a key, so treat it like a missing id.
    mid = raw_mid.strip() if isinstance(raw_mid, str) else ""
    if not mid:
        return None
    dt = parse_date(msg.get("Date"))
    body_text, body_html, atts = extract_bodies(msg)
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    preview = (body_text or "")[:255]
    return {
        "_id": mid,
        "source": "mailstore",
        "mailstore_uid": uid,
        "mailstore_folder": folder,
        # graph_id is deliberately OMITTED: the collection has a unique+sparse
        # index on graph_id, and an explicit None would collide (sparse only
        # ignores a MISSING field).
        "conversation_id": None,
        "folder_path": relativize(folder, mailbox),
        "subject": dec(msg.get("Subject")),
        "sender": parse_addr_one(msg.get("From")),
        "recipients": parse_recipients(msg),
        "to": dec(msg.get("To")),
        "cc": dec(msg.get("Cc")),
        "bcc": dec(msg.get("Bcc")),
        "sent_at": dt,
        "received_at": dt,
        "modified_at": now,
        "created_at": now,
        "parsed_at": now,
        "is_read": True,
        "is_draft": "draft" in folder.lower() or "koncept" in folder.lower(),
        "has_attachments": bool(atts),
        "attachment_count": len(atts),
        "attachments": atts,
        "body_html": body_html or None,
        "body_text": body_text or None,
        "body_preview": preview,
    }
# --- hlavni ------------------------------------------------------------------
def main() -> int:
    """Backfill one mailbox from MailStore into MongoDB, deduplicated by Message-ID.

    Loads the ids already stored, scans the headers of every folder, picks
    the messages that are missing and inside the year window, then fetches
    and upserts them. With --dry-run only the counts are printed.
    """
    ap = argparse.ArgumentParser(description="MailStore -> Mongo backfill (dedup dle Message-ID)")
    ap.add_argument("mailbox", help="Schranka (top-level slozka MailStore = Mongo kolekce)")
    ap.add_argument("--since", type=int, default=None,
                    help="Ber jen zpravy s rokem >= SINCE (napr. 2020)")
    ap.add_argument("--until", type=int, default=None,
                    help="Ber jen zpravy s rokem <= UNTIL")
    ap.add_argument("--folder", default=None, help="Jen jedna konkretni slozka (plna cesta)")
    ap.add_argument("--limit", type=int, default=None, help="Max zprav k ingestu (test)")
    ap.add_argument("--max-folders", type=int, default=None, help="Max slozek (diagnostika)")
    ap.add_argument("--dry-run", action="store_true",
                    help="Jen spocitej kolik by se dobralo, NIC nezapisuj")
    args = ap.parse_args()

    t0 = time.time()
    print(f"=== MailStore ingest v1.0 | schranka: {args.mailbox} ===")
    print(f"Filtr: rok >= {args.since or '-'}{' a <= ' + str(args.until) if args.until else ''}"
          f"{' [DRY-RUN]' if args.dry_run else ''}")

    # Mongo + set of known Message-IDs
    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    mongo.admin.command("ping")
    coll = mongo[MONGO_DB][args.mailbox]
    print("Nacitam existujici Message-ID z Mongo...", flush=True)
    # distinct() answers with a single document, capped at the 16 MB BSON
    # limit; a projected cursor streams the ids and scales to large mailboxes.
    known = {d["_id"] for d in coll.find({}, {"_id": 1})}
    print(f" v Mongu uz mam: {len(known):,} zprav")

    # folders
    if args.folder:
        folders = [args.folder]
    else:
        folders = collect_folders(args.mailbox)
    print(f"Slozek ke kontrole: {len(folders)}")

    grand_seen = grand_cand = grand_ingested = 0
    queue: list[UpdateOne] = []

    def flush():
        """Write the queued upserts (unless dry-run) and empty the queue."""
        nonlocal queue
        # Detach the batch first so a failing write is never retried by the
        # final flush in the cleanup path.
        batch, queue = queue, []
        if batch and not args.dry_run:
            coll.bulk_write(batch, ordered=False)

    def close_quietly(conn):
        """Best-effort logout; the connection may already be broken."""
        try:
            conn.logout()
        except Exception:
            pass

    M = imap_connect()
    try:
        for fidx, folder in enumerate(folders):
            if args.max_folders and fidx >= args.max_folders:
                print(f" (--max-folders {args.max_folders} dosazeno)")
                break
            try:
                total, items = scan_folder_headers(M, folder)
            except Exception as ex:
                # One bad folder must not bring down the whole run - log it
                # and move on. Reconnect, since the IMAP session may be dead.
                print(f" [{relativize(folder, args.mailbox)[:45]:45}] CHYBA: {type(ex).__name__}: {str(ex)[:80]}", flush=True)
                close_quietly(M)
                # Rebind the one handle everything below uses, so the final
                # logout never targets a stale connection.
                M = imap_connect()
                continue
            if not total:
                continue

            # candidates: year in window, not in known, has a message id
            cands = []
            for seq, uid, mid, dt in items:
                if not mid or mid in known:
                    continue
                yr = dt.year if dt else None
                if args.since and (yr is None or yr < args.since):
                    continue
                if args.until and (yr is None or yr > args.until):
                    continue
                cands.append((seq, uid, mid))
            grand_seen += total
            grand_cand += len(cands)
            rel = relativize(folder, args.mailbox)
            print(f" [{rel[:45]:45}] zprav={total:>6} k dobrani={len(cands):>6}", flush=True)
            if args.dry_run:
                continue

            for seq, uid, mid in cands:
                if args.limit and grand_ingested >= args.limit:
                    break
                if mid in known:
                    # Same Message-ID already queued earlier in this run.
                    continue
                raw = fetch_full(M, seq)
                if not raw:
                    continue
                doc = build_doc(raw, uid, folder, args.mailbox)
                if not doc:
                    continue
                queue.append(UpdateOne({"_id": doc["_id"]}, {"$setOnInsert": doc}, upsert=True))
                known.add(doc["_id"])
                grand_ingested += 1
                if len(queue) >= UPSERT_BATCH:
                    flush()
            flush()
            if args.limit and grand_ingested >= args.limit:
                print(f" (dosazen limit {args.limit})")
                break
    finally:
        # Persist what is already queued even when a fetch or parse failed,
        # then release the IMAP session.
        try:
            flush()
        finally:
            close_quietly(M)

    print("-" * 64)
    print(f"Zprav proskenovano: {grand_seen:,}")
    print(f"K dobrani (chybi, v okne): {grand_cand:,}")
    if args.dry_run:
        print(">>> DRY-RUN: nic nezapsano. Pro ostry beh spust bez --dry-run.")
    else:
        print(f"Zapsano do Mongo: {grand_ingested:,}")
    print(f"Trvalo: {time.time()-t0:.1f}s")
    return 0


if __name__ == "__main__":
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        print("\nPreruseno", file=sys.stderr)
        sys.exit(1)
+176
View File
@@ -0,0 +1,176 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
================================================================================
Nazev: mailstore_map_v1.0.py
Verze: 1.0
Datum: 2026-06-11
Autor: Vladimir Buzalka (asistovano Claude)
Popis: Vykresli "mapu" jedne MailStore schranky - strom slozek z
Administration API (GetChildFolders) + celkovy pocet zprav schranky
z GetFolderStatistics.
Argument = nazev schranky (top-level slozka v MailStore archivu),
napr. "vladimir.buzalka@buzalka.cz" nebo "lenka.hanzalova".
Seznam dostupnych schranek: --list (vola GetUsers/GetChildFolders root).
Zdroj: MailStore Server Administration API, HTTPS port 8463.
Auth: admin / heslo (Basic). Parametry jako form-body. Async operace
(GetFolderStatistics) se poluji pres /api/get-status.
Pozn.: API umi jen strukturu + souhrnne pocty per schranka. Pocty zprav per
jednotliva slozka API levne nedava - to bude dalsi krok (IMAP STATUS).
Spusteni:
python mailstore_map_v1.0.py "lenka.hanzalova"
python mailstore_map_v1.0.py "vladimir.buzalka@buzalka.cz" --no-stats
python mailstore_map_v1.0.py --list
================================================================================
"""
from __future__ import annotations
import argparse
import json
import ssl
import sys
import time
import urllib.parse
import urllib.request
from base64 import b64encode
# --- configuration -----------------------------------------------------------
HOST = "192.168.1.53"
PORT = 8463
USER = "admin"
# NOTE(review): the password is hardcoded in a committed file - consider
# reading it from the environment instead.
PASS = "*$N(B)vMUym!%"
BASE = f"https://{HOST}:{PORT}/api"
# HTTP Basic credentials sent with every API request.
_AUTH = "Basic " + b64encode(f"{USER}:{PASS}".encode()).decode()
# TLS context for the API calls; hostname and certificate checks are disabled.
_CTX = ssl.create_default_context()
_CTX.check_hostname = False
_CTX.verify_mode = ssl.CERT_NONE
# --- API helper -------------------------------------------------------------
def _post(path: str, params: dict | None = None) -> dict:
    """Send one form-encoded POST to the API and return the parsed JSON reply."""
    form_body = urllib.parse.urlencode(params or {}).encode()
    request_headers = {
        "Authorization": _AUTH,
        "Content-Type": "application/x-www-form-urlencoded",
    }
    request = urllib.request.Request(
        f"{BASE}/{path}", data=form_body, method="POST", headers=request_headers,
    )
    with urllib.request.urlopen(request, context=_CTX, timeout=30) as resp:
        text = resp.read().decode("utf-8-sig")  # utf-8-sig swallows a leading BOM
    return json.loads(text)
def api(method: str, params: dict | None = None, poll_timeout: int = 120) -> dict:
    """Invoke an API function and return the complete response object.

    When the call is asynchronous (statusCode "running"), /api/get-status is
    polled until the status changes. Raises TimeoutError once polling has
    taken longer than *poll_timeout* seconds.
    """
    reply = _post(f"invoke/{method}", params)
    if reply.get("statusCode") != "running":
        return reply
    token = reply.get("token")
    version = reply.get("statusVersion", 0)
    started = time.time()
    while True:
        if time.time() - started > poll_timeout:
            raise TimeoutError(f"{method}: polling prekrocil {poll_timeout}s")
        reply = _post("get-status", {"token": token,
                                     "lastKnownStatusVersion": version,
                                     "millisecondsTimeout": 5000})
        version = reply.get("statusVersion", version)
        if reply.get("statusCode") != "running":
            return reply
def api_result(method: str, params: dict | None = None):
    """Invoke an API function and return only its "result" field.

    Raises RuntimeError carrying the API's error message when the final
    statusCode is not "succeeded".
    """
    reply = api(method, params)
    if reply.get("statusCode") == "succeeded":
        return reply.get("result")
    detail = (reply.get("error") or {}).get("message", "neznama chyba")
    raise RuntimeError(f"{method} selhalo: {detail}")
# --- formatovani ------------------------------------------------------------
def human_size(n: int) -> str:
    """Format a byte count with one decimal and a unit from B to TB (1024-based)."""
    units = ("B", "KB", "MB", "GB", "TB")
    value = float(n)
    idx = 0
    # Step up a unit while the value is not below 1024; TB is the ceiling.
    while not value < 1024 and idx < len(units) - 1:
        value /= 1024
        idx += 1
    return f"{value:.1f} {units[idx]}"
def print_tree(node: dict, indent: int = 0) -> int:
    """Print the folder tree below *node* depth-first and return how many folders were printed.

    Each folder is marked "+" when it reports child folders and "-" otherwise.
    """
    printed = 0
    pad = ' ' * indent
    for child in node.get("childFolders") or []:
        sign = "+" if child.get("hasChildFolders") else "-"
        print(f" {pad}{sign} {child.get('name')}")
        printed += 1 + print_tree(child, indent + 1)
    return printed
# --- akce -------------------------------------------------------------------
def list_mailboxes() -> None:
    """Print the names of the archive's top-level folders (the mailboxes)."""
    root = api_result("GetChildFolders", {"maxLevels": 1})
    print("Dostupne schranky (top-level slozky archivu):")
    children = root.get("childFolders") or []
    for child in children:
        print(f" - {child.get('name')}")
def map_mailbox(mailbox: str, with_stats: bool = True) -> None:
    """Print the folder tree of *mailbox*, optionally with its message count and size.

    The totals come from GetFolderStatistics (slow, hence optional); the
    tree comes from GetChildFolders.
    """
    # 1) mailbox totals (optional - GetFolderStatistics takes ~20s)
    total = size = None
    if with_stats:
        print("Nacitam statistiky (GetFolderStatistics, muze trvat ~20s)...",
              file=sys.stderr, flush=True)
        entries = api_result("GetFolderStatistics") or []
        entry = next((s for s in entries if s.get("folder") == mailbox), None)
        if entry is not None:
            total, size = entry.get("count"), entry.get("size")

    # 2) folder tree
    tree = api_result("GetChildFolders", {"folder": mailbox, "maxLevels": 20})
    rule = "=" * 64
    print(rule)
    print(f"MAILSTORE MAPA SCHRANKY: {mailbox}")
    if total is not None:
        print(f"Celkem zprav: {total:,} Velikost: {human_size(size)}")
    print(rule)
    folder_count = print_tree(tree)
    print("-" * 64)
    print(f"Slozek celkem: {folder_count}")
def main() -> int:
    """Parse the command line, then list the mailboxes or map one of them.

    Returns 0. A missing mailbox name (without --list) ends in an argparse
    usage error.
    """
    parser = argparse.ArgumentParser(description="MailStore mapa schranky (API)")
    parser.add_argument("mailbox", nargs="?", help="Nazev schranky (top-level slozka)")
    parser.add_argument("--list", action="store_true",
                        help="Vypsat dostupne schranky a skoncit")
    parser.add_argument("--no-stats", action="store_true",
                        help="Preskocit celkovy pocet zprav (rychlejsi, bez ~20s GetFolderStatistics)")
    opts = parser.parse_args()

    if opts.list:
        list_mailboxes()
    elif opts.mailbox:
        map_mailbox(opts.mailbox, with_stats=not opts.no_stats)
    else:
        parser.error("zadej nazev schranky, nebo --list pro seznam")
    return 0


if __name__ == "__main__":
    try:
        exit_code = main()
    except KeyboardInterrupt:
        print("\nPreruseno", file=sys.stderr)
        sys.exit(1)
    sys.exit(exit_code)
+212
View File
@@ -0,0 +1,212 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
================================================================================
Nazev: mailstore_read_v1.0.py
Verze: 1.0
Datum: 2026-06-11
Autor: Vladimir Buzalka (asistovano Claude)
Popis: Precte JEDNU konkretni zpravu z MailStore slozky a vypise jeji plny
obsah - hlavicky, telo (text), seznam priloh. Volitelne ulozi
prilohy na disk. Posledni dilek rucniho prohlizece archivu.
Argumenty: <slozka> <cislo>
slozka = plna cesta (fullName z mapy / vystupu mailstore_folder)
cislo = poradove cislo zpravy (# z mailstore_folder), nebo UID s --uid
Zdroj: MailStore IMAP, port 143, STARTTLS, auth Prosty text (LOGIN).
FETCH <n> (RFC822) = cely syrovy EML, naparsovan emailem.
Spusteni:
python mailstore_read_v1.0.py "...slozka..." 63627
python mailstore_read_v1.0.py "...slozka..." 12345 --uid # cislo je UID
python mailstore_read_v1.0.py "...slozka..." 63627 --save .\att # ulozi prilohy
python mailstore_read_v1.0.py "...slozka..." 63627 --raw # vypise cely EML
================================================================================
"""
from __future__ import annotations
import argparse
import email
import imaplib
import os
import ssl
import sys
from email.header import decode_header
from email.utils import parsedate_to_datetime
# --- configuration -----------------------------------------------------------
HOST = "192.168.1.53"
PORT = 143
USER = "admin"
# NOTE(review): the password is hardcoded in a committed file - consider
# reading it from the environment instead.
PASS = "*$N(B)vMUym!%"
BODY_PREVIEW_CHARS = 4000 # how many characters of the body to print on screen
def connect() -> imaplib.IMAP4:
    """Return a logged-in IMAP connection to HOST:PORT secured with STARTTLS.

    Certificate and hostname verification are disabled on the TLS context.
    """
    unverified = ssl.create_default_context()
    unverified.check_hostname = False  # must be cleared before CERT_NONE is accepted
    unverified.verify_mode = ssl.CERT_NONE
    client = imaplib.IMAP4(HOST, PORT)
    client.starttls(ssl_context=unverified)
    client.login(USER, PASS)
    return client
def encode_mutf7(s: str) -> str:
    """Encode an IMAP folder name as modified UTF-7 (RFC 3501).

    Printable ASCII (0x20-0x7e) passes through, "&" becomes "&-", and every
    run of other characters becomes "&<base64 of UTF-16BE, '/' -> ','>-".
    """
    import base64 as _b64

    pieces = []
    pending = []  # run of characters waiting to be base64 shift-encoded

    def flush_pending():
        if pending:
            utf16 = "".join(pending).encode("utf-16-be")
            token = _b64.b64encode(utf16).decode("ascii").rstrip("=").replace("/", ",")
            pieces.append("&" + token + "-")
            pending.clear()

    for ch in s:
        if " " <= ch <= "~":
            flush_pending()
            pieces.append("&-" if ch == "&" else ch)
        else:
            pending.append(ch)
    flush_pending()
    return "".join(pieces)
def dec(s: str | None) -> str:
    """Decode a MIME-encoded header value into readable text.

    Returns "" for an empty or missing header. CR and LF are replaced by
    spaces and the result is stripped.
    """
    if not s:
        return ""
    out = []
    for txt, enc in decode_header(s):
        if isinstance(txt, bytes):
            try:
                out.append(txt.decode(enc or "utf-8", errors="replace"))
            except LookupError:
                # Charset label without a Python codec (e.g. "unknown-8bit",
                # which the email package reports for raw 8-bit headers).
                out.append(txt.decode("utf-8", errors="replace"))
        else:
            out.append(txt)
    return "".join(out).replace("\r", " ").replace("\n", " ").strip()
def html_to_text(html: str) -> str:
    """Convert HTML to plain text with blank lines removed.

    Uses BeautifulSoup when it can be imported (lxml parser, else
    html.parser). Otherwise a regex fallback drops script/style/head blocks,
    turns common block endings into line breaks, strips the remaining tags
    and decodes HTML entities.
    """
    try:
        from bs4 import BeautifulSoup
        try:
            soup = BeautifulSoup(html, "lxml")
        except Exception:
            soup = BeautifulSoup(html, "html.parser")
        for t in soup(["script", "style", "head"]):
            t.decompose()
        text = soup.get_text(separator="\n")
    except Exception:
        import re
        # The parameter is named "html", so import the function, not the module.
        from html import unescape
        # Remove non-content blocks together with their text.
        text = re.sub(r"(?is)<(script|style|head)\b.*?</\1\s*>", "", html)
        # Keep paragraph structure: block endings become line breaks.
        text = re.sub(r"(?i)<br\s*/?>|</(?:p|div|tr|li|h[1-6])\s*>", "\n", text)
        text = unescape(re.sub(r"<[^>]+>", "", text))
    lines = [ln.strip() for ln in text.splitlines()]
    return "\n".join(ln for ln in lines if ln)
def _part_text(part, payload) -> str:
    """Decode a text part's bytes, falling back to UTF-8 for unknown charset labels."""
    data = payload or b""
    try:
        return data.decode(part.get_content_charset() or "utf-8", errors="replace")
    except LookupError:
        return data.decode("utf-8", errors="replace")


def main() -> int:
    """Read one message from a MailStore folder and print headers, attachments and body.

    With --raw the complete EML is written to stdout instead; with --save the
    attachments are stored in the given directory. Returns 0 on success and 1
    when the folder or the message cannot be loaded.
    """
    ap = argparse.ArgumentParser(description="Precist jednu zpravu z MailStore")
    ap.add_argument("folder", help="Plna cesta slozky")
    ap.add_argument("number", help="Poradove cislo zpravy (nebo UID s --uid)")
    ap.add_argument("--uid", action="store_true", help="Cislo je IMAP UID, ne poradi")
    ap.add_argument("--save", metavar="DIR", help="Ulozit prilohy do adresare")
    ap.add_argument("--raw", action="store_true", help="Vypsat cely syrovy EML a skoncit")
    args = ap.parse_args()

    M = connect()
    try:
        typ, data = M.select(f'"{encode_mutf7(args.folder)}"', readonly=True)
        if typ != "OK":
            print(f"Slozku nelze otevrit: {data}", file=sys.stderr)
            return 1
        # FETCH the whole message (RFC822). UID FETCH when --uid is given.
        if args.uid:
            typ, msg_data = M.uid("FETCH", args.number, "(RFC822)")
        else:
            typ, msg_data = M.fetch(args.number, "(RFC822)")
        if typ != "OK" or not msg_data or not isinstance(msg_data[0], tuple):
            print(f"Zpravu #{args.number} nelze nacist (typ={typ})", file=sys.stderr)
            return 1
        raw = msg_data[0][1]
    finally:
        # Release the connection on every path, including the early returns.
        try:
            M.logout()
        except (imaplib.IMAP4.error, OSError):
            pass

    if args.raw:
        sys.stdout.buffer.write(raw)
        return 0

    msg = email.message_from_bytes(raw)
    # --- headers ---
    print("=" * 80)
    print(f"Slozka : {args.folder}")
    print(f"{'UID' if args.uid else 'Cislo'} : {args.number}")
    print("-" * 80)
    print(f"Datum : {msg.get('Date')}")
    print(f"Od : {dec(msg.get('From'))}")
    print(f"Komu : {dec(msg.get('To'))}")
    if msg.get("Cc"):
        print(f"Kopie : {dec(msg.get('Cc'))}")
    print(f"Predmet : {dec(msg.get('Subject'))}")
    print(f"Msg-ID : {msg.get('Message-ID')}")
    print(f"EML velikost: {len(raw):,} bytu")

    # --- body + attachments ---
    body_text = body_html = ""
    attachments = []  # (filename, size, payload)
    for part in msg.walk():
        if part.is_multipart():
            continue
        ct = part.get_content_type()
        # Disposition types are case-insensitive ("Attachment" is valid).
        disp = str(part.get("Content-Disposition") or "").lower()
        payload = part.get_payload(decode=True)
        if "attachment" in disp or (part.get_filename() and ct not in ("text/plain", "text/html")):
            attachments.append((dec(part.get_filename()) or "(bez nazvu)",
                                len(payload or b""), payload or b""))
        elif ct == "text/plain" and not body_text:
            body_text = _part_text(part, payload)
        elif ct == "text/html" and not body_html:
            body_html = _part_text(part, payload)

    print("-" * 80)
    if attachments:
        print(f"Prilohy ({len(attachments)}):")
        for name, size, _ in attachments:
            print(f" - {name} ({size:,} B)")
    else:
        print("Prilohy: zadne")

    # body: prefer plain text, otherwise html -> text
    text = body_text or (html_to_text(body_html) if body_html else "")
    src = "text/plain" if body_text else ("text/html->text" if body_html else "(zadne)")
    print("-" * 80)
    print(f"TELO ({src}, {len(text):,} znaku):")
    print("-" * 80)
    if text:
        print(text[:BODY_PREVIEW_CHARS])
        if len(text) > BODY_PREVIEW_CHARS:
            print(f"\n... [zkraceno, celkem {len(text):,} znaku] ...")
    else:
        print("(prazdne telo)")

    # --- saving attachments ---
    if args.save and attachments:
        os.makedirs(args.save, exist_ok=True)
        print("-" * 80)
        used = set()  # names written in this run, compared case-insensitively
        for name, size, payload in attachments:
            safe = name.replace("/", "_").replace("\\", "_") or "att.bin"
            if safe in (".", ".."):
                # These would resolve to a directory, not a file.
                safe = "att.bin"
            # Two attachments with the same name must not overwrite each other.
            stem, ext = os.path.splitext(safe)
            candidate, k = safe, 1
            while candidate.lower() in used:
                candidate = f"{stem}_{k}{ext}"
                k += 1
            used.add(candidate.lower())
            path = os.path.join(args.save, candidate)
            with open(path, "wb") as f:
                f.write(payload)
            print(f"Ulozeno: {path} ({size:,} B)")
    print("=" * 80)
    return 0


if __name__ == "__main__":
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        print("\nPreruseno", file=sys.stderr)
        sys.exit(1)