""" ============================================================================== Skript: query_v0.1.py Verze: 0.1 Datum: 2026-06-03 Autor: vladimir.buzalka Popis: Hybridni dotaz: PostgreSQL fulltext (tsv + ts_rank + ts_headline) + obohaceni z MongoDB (content.* - autor, listy, EML hlavicky, datumy v nazvu). Pouziti: python query_v0.1.py "adverse event" python query_v0.1.py "protocol deviation" --study MDD3003 --ext docx pptx python query_v0.1.py "randomization" --ext xlsx xlsm --limit 20 python query_v0.1.py "lot expiration" --since 2026-01-01 Syntaxe dotazu = websearch_to_tsquery: adverse event -> AND "adverse event" -> fraze adverse OR serious -> OR adverse -mild -> NOT ============================================================================== """ from __future__ import annotations import argparse import sys from datetime import datetime, timezone import psycopg from pymongo import MongoClient MONGO_URI = "mongodb://192.168.1.76:27017" MONGO_DB = "soubory" STUDY_COLLECTIONS = { "MDD3003": "42847922MDD3003", "UCO3001": "77242113UCO3001", } PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoSoubory " "user=vladimir.buzalka password=Vlado7309208104++") SEARCH_SQL = """ WITH q AS ( SELECT websearch_to_tsquery('soubory'::regconfig, %(query)s) AS tsq ) SELECT d.study, d.path, d.rel_path, d.name, d.ext, d.size_bytes, d.mtime, d.body_length, ts_rank(d.tsv, q.tsq) AS rank, ts_headline('soubory'::regconfig, left(d.body, 200000), q.tsq, 'MaxFragments=2, MinWords=4, MaxWords=18, ' 'StartSel=<<, StopSel=>>, FragmentDelimiter= ... ') AS snippet FROM documents d, q WHERE d.tsv @@ q.tsq AND d.ok = TRUE AND (%(studies)s::text[] IS NULL OR d.study = ANY(%(studies)s::text[])) AND (%(exts)s::text[] IS NULL OR d.ext = ANY(%(exts)s::text[])) AND (%(since)s::timestamptz IS NULL OR d.mtime >= %(since)s::timestamptz) ORDER BY rank DESC, d.mtime DESC NULLS LAST LIMIT %(limit)s """ def parse_args() -> argparse.Namespace: p = argparse.ArgumentParser( description="Hybridni dotaz PG fulltext + Mongo metadata", formatter_class=argparse.RawDescriptionHelpFormatter, ) p.add_argument("query", help="Vyhledavaci vyraz (websearch syntaxe)") p.add_argument("--study", nargs="*", choices=sorted(STUDY_COLLECTIONS.keys()), help="Filtr studie (default: obe)") p.add_argument("--ext", nargs="*", help="Filtr pripon (napr. pdf docx xlsx)") p.add_argument("--since", help="mtime >= datum (YYYY-MM-DD)") p.add_argument("--limit", type=int, default=15, help="Pocet vysledku (default 15)") p.add_argument("--no-meta", action="store_true", help="Vynechat doplneni z Mongo") return p.parse_args() def _short(s, n=60): if not s: return "" s = str(s).replace("\n", " ").strip() return s if len(s) <= n else s[:n] + "..." def _fmt_meta(study_code: str, content: dict) -> str: """Vrati jednoradkove shrnuti zajimavych poli z content.*""" if not content: return "(bez content)" bits = [] if not content.get("ok", True): return f"content.error: {content.get('error', '?')}" for key in ("title", "subject", "author", "last_modified_by", "from", "to", "subject", "date"): v = content.get(key) if v: bits.append(f"{key}={_short(v, 40)}") if "pages" in content: bits.append(f"pages={content['pages']}") if "slides" in content: bits.append(f"slides={content['slides']}") if "total_sheets" in content: sheet_names = [s.get("name") for s in content.get("sheets", [])][:4] bits.append(f"sheets={content['total_sheets']} {sheet_names}") if "paragraphs" in content: bits.append(f"paragraphs={content['paragraphs']}") if "has_attachments" in content: bits.append(f"attachments={len(content.get('attachments', []))}") return " | ".join(bits) if bits else "(content bez vyznamnych poli)" def main() -> int: args = parse_args() studies = None if args.study: studies = [STUDY_COLLECTIONS[s] for s in args.study] exts = None if args.ext: exts = [e.lower().lstrip(".") for e in args.ext] since = None if args.since: since = datetime.strptime(args.since, "%Y-%m-%d").replace(tzinfo=timezone.utc) params = { "query": args.query, "studies": studies, "exts": exts, "since": since, "limit": args.limit, } with psycopg.connect(PG_DSN, connect_timeout=10) as pg, pg.cursor() as cur: cur.execute(SEARCH_SQL, params) cols = [c.name for c in cur.description] rows = [dict(zip(cols, r)) for r in cur.fetchall()] if not rows: print(f"Zadne vysledky pro: {args.query!r}") return 0 # obohaceni z Mongo - jeden round-trip na studii meta_by_path: dict[str, dict] = {} if not args.no_meta: mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) db = mongo[MONGO_DB] by_study: dict[str, list[str]] = {} for r in rows: by_study.setdefault(r["study"], []).append(r["path"]) for study_code, paths in by_study.items(): for d in db[study_code].find( {"path": {"$in": paths}}, {"path": 1, "content": 1, "dates_in_name": 1, "parent_folders": 1}, ): meta_by_path[d["path"]] = d mongo.close() print(f"\n=== Dotaz: {args.query!r} vysledku: {len(rows)} ===\n") for i, r in enumerate(rows, 1): size_mb = (r["size_bytes"] or 0) / 1024 / 1024 mtime = r["mtime"].strftime("%Y-%m-%d") if r["mtime"] else "?" print(f"[{i:>2}] rank={r['rank']:.4f} {r['study']} " f"{r['ext']:<4} {size_mb:5.1f}MB {mtime} " f"({r['body_length']} znaku)") print(f" {r['rel_path'] or r['name']}") snippet = (r["snippet"] or "").replace("\n", " ").strip() if snippet: print(f" >> {snippet}") if not args.no_meta: m = meta_by_path.get(r["path"]) or {} content_line = _fmt_meta(r["study"], m.get("content") or {}) print(f" meta: {content_line}") if m.get("dates_in_name"): print(f" dates_in_name: {m['dates_in_name']}") print() return 0 if __name__ == "__main__": try: raise SystemExit(main()) except KeyboardInterrupt: print("\nPreruseno uzivatelem") sys.exit(130)