This commit is contained in:
2026-06-14 08:25:15 +02:00
parent f94573ea6e
commit ed6455787a
7 changed files with 876 additions and 20 deletions
+48 -7
View File
@@ -20,6 +20,7 @@ Spusteni:
from __future__ import annotations
import re
import sys
import traceback
from datetime import datetime, timezone, timedelta
@@ -178,6 +179,8 @@ def search(
mailbox: Optional[Union[str, list]] = None,
since: Optional[str] = None,
until: Optional[str] = None,
days: Optional[int] = None,
inflect: bool = False,
folder_contains: Optional[str] = None,
sender_contains: Optional[str] = None,
has_attachments: Optional[bool] = None,
@@ -186,15 +189,23 @@ def search(
"""PRIMARY TOOL — fulltext search across all indexed emails.
Index includes: subject, sender (email + name), recipients (to/cc),
attachment filenames, AND full body text.
attachment filenames, AND full body text. Diacritics are stripped at index
time, so search is already accent-insensitive (recept == řecept == RECEPT).
query: websearch_to_tsquery syntax:
query: websearch_to_tsquery syntax (only when inflect=False):
invoice payment -> AND
"lot expiration" -> phrase
SAE OR "serious adverse" -> OR
urgent -newsletter -> exclude
mailbox: one mailbox string or list (e.g. "vbuzalka@its.jnj.com"). None = all.
since/until: ISO date "YYYY-MM-DD" on received_at
days: convenience window — only the last N days (overrides `since`). "za posledních X dní".
inflect: Czech declension. The index uses no stemmer, so a plain search for
"recept" misses "recepty/receptu/receptem/...". With inflect=True each word
in `query` is prefix-matched (recept -> recept:*) and AND-ed, catching the
other grammatical cases. Set this for Czech-word searches. Trade-off: a
prefix also matches unrelated longer words (recept:* also hits "receptor").
In this mode the query is treated as plain words (operators/quotes ignored).
folder_contains: substring match against folder_path (case-insensitive)
sender_contains: substring match against sender_email OR sender_name (case-insensitive)
has_attachments: True / False / None (any)
@@ -207,11 +218,23 @@ def search(
mboxes = normalize_mailbox(mailbox)
since_dt = parse_since(since)
until_dt = parse_since(until)
if days and days > 0:
since_dt = datetime.now(timezone.utc) - timedelta(days=days)
limit = min(max(1, limit), 100)
sql = """
# Build the tsquery. inflect=True → prefix-match each word (Czech cases)
# via to_tsquery; otherwise use websearch_to_tsquery for full operator support.
tsq_func = "websearch_to_tsquery"
tsq_text = query
if inflect:
tokens = re.findall(r"\w+", query, flags=re.UNICODE)
if tokens:
tsq_func = "to_tsquery"
tsq_text = " & ".join(f"{t}:*" for t in tokens)
sql = f"""
WITH q AS (
SELECT websearch_to_tsquery('soubory'::regconfig, %(query)s) AS tsq
SELECT {tsq_func}('soubory'::regconfig, %(query)s) AS tsq
)
SELECT
e.id, e.mailbox, e.message_id, e.conversation_id, e.folder_path,
@@ -573,14 +596,26 @@ def find_attachment(
def top_senders(
mailbox: Optional[Union[str, list]] = None,
since: Optional[str] = None,
days: Optional[int] = None,
folder_contains: Optional[str] = None,
limit: int = 20,
) -> dict:
"""Top senders by volume (count of received emails). Optionally limit by mailbox or date window.
Use for "who emails me most" or "top senders this month".
"""Unique senders grouped by sender_email, counted, sorted by count DESC.
Use for "who emails me most" / "top senders this month".
mailbox: one mailbox string or list. None = all.
since: ISO date "YYYY-MM-DD" lower bound on received_at.
days: convenience window — count only the last N days (overrides `since`
when both given). Use for "za posledních X dní".
folder_contains: substring match against folder_path (case-insensitive).
Pass "Inbox" to count ONLY received/incoming mail and exclude the
mailbox owner's own Sent Items, Drafts, etc. Default None = all folders.
"""
try:
mboxes = normalize_mailbox(mailbox)
since_dt = parse_since(since)
if days and days > 0:
since_dt = datetime.now(timezone.utc) - timedelta(days=days)
limit = min(max(1, limit), 100)
sql = """
SELECT sender_email, count(*) AS c, max(received_at) AS last_at
@@ -588,12 +623,18 @@ def top_senders(
WHERE ok = TRUE AND sender_email IS NOT NULL
AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[]))
AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz)
AND (%(folder)s::text IS NULL OR folder_path ILIKE %(folder_like)s)
GROUP BY sender_email
ORDER BY c DESC
LIMIT %(limit)s
"""
with pg_conn() as pg, pg.cursor() as cur:
cur.execute(sql, {"mboxes": mboxes, "since": since_dt, "limit": limit})
cur.execute(sql, {
"mboxes": mboxes, "since": since_dt,
"folder": folder_contains,
"folder_like": f"%{folder_contains}%" if folder_contains else None,
"limit": limit,
})
rows = [{"sender_email": s, "count": c, "last_at": serialize(t)}
for s, c, t in cur.fetchall()]
return {"count": len(rows), "results": rows}