notebook

2026-06-14 08:25:15 +02:00
parent f94573ea6e
commit ed6455787a
7 changed files with 876 additions and 20 deletions
@@ -20,6 +20,7 @@ Spusteni:

 from __future__ import annotations

+import re
 import sys
 import traceback
 from datetime import datetime, timezone, timedelta
@@ -178,6 +179,8 @@ def search(
    mailbox: Optional[Union[str, list]] = None,
    since: Optional[str] = None,
    until: Optional[str] = None,
+    days: Optional[int] = None,
+    inflect: bool = False,
    folder_contains: Optional[str] = None,
    sender_contains: Optional[str] = None,
    has_attachments: Optional[bool] = None,
@@ -186,15 +189,23 @@ def search(
    """PRIMARY TOOL — fulltext search across all indexed emails.

    Index includes: subject, sender (email + name), recipients (to/cc),
-    attachment filenames, AND full body text.
+    attachment filenames, AND full body text. Diacritics are stripped at index
+    time, so search is already accent-insensitive (recept == řecept == RECEPT).

-    query: websearch_to_tsquery syntax:
+    query: websearch_to_tsquery syntax (only when inflect=False):
        invoice payment           -> AND
        "lot expiration"          -> phrase
        SAE OR "serious adverse"  -> OR
        urgent -newsletter        -> exclude
    mailbox: one mailbox string or list (e.g. "vbuzalka@its.jnj.com"). None = all.
    since/until: ISO date "YYYY-MM-DD" on received_at
+    days: convenience window — only the last N days (overrides `since` when both are given). Use for "za posledních X dní" ("in the last X days") requests.
+    inflect: Czech declension. The index uses no stemmer, so a plain search for
+        "recept" misses "recepty/receptu/receptem/...". With inflect=True each word
+        in `query` is prefix-matched (recept -> recept:*) and AND-ed, catching the
+        other grammatical cases. Set this for Czech-word searches. Trade-off: a
+        prefix also matches unrelated longer words (recept:* also hits "receptor").
+        In this mode the query is treated as plain words (operators/quotes ignored).
    folder_contains: substring match against folder_path (case-insensitive)
    sender_contains: substring match against sender_email OR sender_name (case-insensitive)
    has_attachments: True / False / None (any)
@@ -207,11 +218,23 @@ def search(
        mboxes = normalize_mailbox(mailbox)
        since_dt = parse_since(since)
        until_dt = parse_since(until)
+        if days and days > 0:
+            since_dt = datetime.now(timezone.utc) - timedelta(days=days)
        limit = min(max(1, limit), 100)

-        sql = """
+        # Build the tsquery. inflect=True → prefix-match each word (Czech cases) via to_tsquery;
+        # otherwise, or when the query has no word tokens, use websearch_to_tsquery for full operator support.
+        tsq_func = "websearch_to_tsquery"
+        tsq_text = query
+        if inflect:
+            tokens = re.findall(r"\w+", query, flags=re.UNICODE)
+            if tokens:
+                tsq_func = "to_tsquery"
+                tsq_text = " & ".join(f"{t}:*" for t in tokens)
+
+        sql = f"""
        WITH q AS (
-            SELECT websearch_to_tsquery('soubory'::regconfig, %(query)s) AS tsq
+            SELECT {tsq_func}('soubory'::regconfig, %(query)s) AS tsq
        )
        SELECT
            e.id, e.mailbox, e.message_id, e.conversation_id, e.folder_path,
@@ -573,14 +596,26 @@ def find_attachment(
 def top_senders(
    mailbox: Optional[Union[str, list]] = None,
    since: Optional[str] = None,
+    days: Optional[int] = None,
+    folder_contains: Optional[str] = None,
    limit: int = 20,
 ) -> dict:
-    """Top senders by volume (count of received emails). Optionally limit by mailbox or date window.
-    Use for "who emails me most" or "top senders this month".
+    """Unique senders grouped by sender_email, counted, sorted by count DESC.
+    Use for "who emails me most" / "top senders this month".
+
+    mailbox: one mailbox string or list. None = all.
+    since: ISO date "YYYY-MM-DD" lower bound on received_at.
+    days: convenience window — count only the last N days (overrides `since`
+          when both given). Use for "za posledních X dní" ("in the last X days").
+    folder_contains: substring match against folder_path (case-insensitive).
+          Pass "Inbox" to count ONLY received/incoming mail and exclude the
+          mailbox owner's own Sent Items, Drafts, etc. Default None = all folders.
    """
    try:
        mboxes = normalize_mailbox(mailbox)
        since_dt = parse_since(since)
+        if days and days > 0:
+            since_dt = datetime.now(timezone.utc) - timedelta(days=days)
        limit = min(max(1, limit), 100)
        sql = """
        SELECT sender_email, count(*) AS c, max(received_at) AS last_at
@@ -588,12 +623,18 @@ def top_senders(
        WHERE ok = TRUE AND sender_email IS NOT NULL
          AND (%(mboxes)s::text[] IS NULL OR mailbox = ANY(%(mboxes)s::text[]))
          AND (%(since)s::timestamptz IS NULL OR received_at >= %(since)s::timestamptz)
+          AND (%(folder)s::text IS NULL OR folder_path ILIKE %(folder_like)s)
        GROUP BY sender_email
        ORDER BY c DESC
        LIMIT %(limit)s
        """
        with pg_conn() as pg, pg.cursor() as cur:
-            cur.execute(sql, {"mboxes": mboxes, "since": since_dt, "limit": limit})
+            cur.execute(sql, {
+                "mboxes": mboxes, "since": since_dt,
+                "folder": folder_contains,
+                "folder_like": f"%{folder_contains}%" if folder_contains else None,
+                "limit": limit,
+            })
            rows = [{"sender_email": s, "count": c, "last_at": serialize(t)}
                    for s, c, t in cur.fetchall()]
        return {"count": len(rows), "results": rows}