204 lines
6.7 KiB
Python
204 lines
6.7 KiB
Python
"""
|
|
==============================================================================
|
|
Skript: query_v0.1.py
|
|
Verze: 0.1
|
|
Datum: 2026-06-03
|
|
Autor: vladimir.buzalka
|
|
Popis: Hybridni dotaz: PostgreSQL fulltext (tsv + ts_rank + ts_headline)
|
|
+ obohaceni z MongoDB (content.* - autor, listy, EML hlavicky,
|
|
datumy v nazvu).
|
|
|
|
Pouziti:
|
|
python query_v0.1.py "adverse event"
|
|
python query_v0.1.py "protocol deviation" --study MDD3003 --ext docx pptx
|
|
python query_v0.1.py "randomization" --ext xlsx xlsm --limit 20
|
|
python query_v0.1.py "lot expiration" --since 2026-01-01
|
|
|
|
Syntaxe dotazu = websearch_to_tsquery:
|
|
adverse event -> AND
|
|
"adverse event" -> fraze
|
|
adverse OR serious -> OR
|
|
adverse -mild -> NOT
|
|
==============================================================================
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
import os
import sys
from datetime import datetime, timezone

import psycopg
from pymongo import MongoClient
|
|
|
|
# Connection settings.
#
# Each value can be overridden through an environment variable so that hosts
# and credentials do not have to be edited in (or committed with) the source
# file. The literals are kept as defaults so existing invocations keep working.
MONGO_URI = os.environ.get("SOUBORY_MONGO_URI", "mongodb://192.168.1.76:27017")
MONGO_DB = os.environ.get("SOUBORY_MONGO_DB", "soubory")

# Maps the short study name accepted by --study to the identifier that is
# matched against documents.study in PostgreSQL and used as the MongoDB
# collection name when looking up metadata.
STUDY_COLLECTIONS = {
    "MDD3003": "42847922MDD3003",
    "UCO3001": "77242113UCO3001",
}

# libpq connection string for the full-text index database.
# NOTE(review): the default below embeds a real password; prefer setting
# SOUBORY_PG_DSN (or using a .pgpass file) and rotate this credential.
PG_DSN = os.environ.get(
    "SOUBORY_PG_DSN",
    "host=192.168.1.76 port=5432 dbname=MongoSoubory "
    "user=vladimir.buzalka password=Vlado7309208104++",
)
|
|
|
|
|
|
# Full-text search over the ``documents`` table.
#
# Named parameters:
#   query   - search expression, parsed by websearch_to_tsquery
#   studies - text[] of study identifiers, or NULL for no study filter
#   exts    - text[] of file extensions, or NULL for no extension filter
#   since   - lower bound for mtime, or NULL for no date filter
#   limit   - maximum number of rows returned
#
# The snippet is generated from the first 200000 characters of the body only.
# FragmentDelimiter is double-quoted because ts_headline option values that
# contain spaces must be quoted; unquoted, the surrounding spaces are dropped.
SEARCH_SQL = """
WITH q AS (
    SELECT websearch_to_tsquery('soubory'::regconfig, %(query)s) AS tsq
)
SELECT
    d.study,
    d.path,
    d.rel_path,
    d.name,
    d.ext,
    d.size_bytes,
    d.mtime,
    d.body_length,
    ts_rank(d.tsv, q.tsq) AS rank,
    ts_headline('soubory'::regconfig,
                left(d.body, 200000),
                q.tsq,
                'MaxFragments=2, MinWords=4, MaxWords=18, '
                'StartSel=<<, StopSel=>>, FragmentDelimiter=" ... "') AS snippet
FROM documents d, q
WHERE d.tsv @@ q.tsq
  AND d.ok = TRUE
  AND (%(studies)s::text[] IS NULL OR d.study = ANY(%(studies)s::text[]))
  AND (%(exts)s::text[] IS NULL OR d.ext = ANY(%(exts)s::text[]))
  AND (%(since)s::timestamptz IS NULL OR d.mtime >= %(since)s::timestamptz)
ORDER BY rank DESC, d.mtime DESC NULLS LAST
LIMIT %(limit)s
"""
|
|
|
|
|
|
def _positive_int(value: str) -> int:
    """Argparse ``type`` callback: parse *value* as an integer >= 1.

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is < 1.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"neplatne cele cislo: {value!r}"
        ) from None
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"musi byt kladne cele cislo: {value!r}"
        )
    return number


def parse_args() -> argparse.Namespace:
    """Parse the command line of the hybrid search script.

    Returns:
        Namespace with the attributes ``query`` (str), ``study`` (list of
        study names or None), ``ext`` (list of extensions or None),
        ``since`` (raw ``YYYY-MM-DD`` string or None), ``limit`` (int >= 1,
        default 15) and ``no_meta`` (bool).

    Invalid arguments make argparse print a usage error and exit with
    status 2.
    """
    p = argparse.ArgumentParser(
        description="Hybridni dotaz PG fulltext + Mongo metadata",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("query", help="Vyhledavaci vyraz (websearch syntaxe)")
    p.add_argument("--study", nargs="*",
                   choices=sorted(STUDY_COLLECTIONS.keys()),
                   help="Filtr studie (default: obe)")
    p.add_argument("--ext", nargs="*",
                   help="Filtr pripon (napr. pdf docx xlsx)")
    p.add_argument("--since",
                   help="mtime >= datum (YYYY-MM-DD)")
    # Reject zero/negative values here: a negative LIMIT would otherwise only
    # fail later, as a database error during query execution.
    p.add_argument("--limit", type=_positive_int, default=15,
                   help="Pocet vysledku (default 15)")
    p.add_argument("--no-meta", action="store_true",
                   help="Vynechat doplneni z Mongo")
    return p.parse_args()
|
|
|
|
|
|
def _short(s, n=60):
    """Return *s* as a single-line string capped at *n* characters.

    Falsy input yields an empty string. Otherwise the value is converted
    with ``str()``, newlines become spaces and surrounding whitespace is
    stripped. Text longer than *n* is cut to its first *n* characters and
    suffixed with ``"..."``.
    """
    if not s:
        return ""
    flat = str(s).replace("\n", " ").strip()
    if len(flat) > n:
        return flat[:n] + "..."
    return flat
|
|
|
|
|
|
def _fmt_meta(study_code: str, content: dict) -> str:
    """Return a one-line summary of the interesting fields in ``content``.

    Args:
        study_code: Study identifier of the document. Not used by the
            formatting; kept so existing callers keep working.
        content: The ``content`` sub-document of a Mongo record. May be
            empty or None.

    Returns:
        ``"(bez content)"`` for empty input, ``"content.error: ..."`` when
        the record is flagged ``ok == False``, otherwise the present fields
        joined by ``" | "`` (or a placeholder when none are present).
    """
    if not content:
        return "(bez content)"
    if not content.get("ok", True):
        return f"content.error: {content.get('error', '?')}"

    bits = []
    # Each key is listed once so that e.g. "subject" is not printed twice.
    for key in ("title", "subject", "author", "last_modified_by",
                "from", "to", "date"):
        v = content.get(key)
        if v:
            bits.append(f"{key}={_short(v, 40)}")
    if "pages" in content:
        bits.append(f"pages={content['pages']}")
    if "slides" in content:
        bits.append(f"slides={content['slides']}")
    if "total_sheets" in content:
        # "sheets" may be missing or stored as null; show at most four names.
        sheets = content.get("sheets") or []
        sheet_names = [s.get("name") for s in sheets][:4]
        bits.append(f"sheets={content['total_sheets']} {sheet_names}")
    if "paragraphs" in content:
        bits.append(f"paragraphs={content['paragraphs']}")
    if "has_attachments" in content:
        # Treat a null "attachments" value as an empty list.
        bits.append(f"attachments={len(content.get('attachments') or [])}")
    return " | ".join(bits) if bits else "(content bez vyznamnych poli)"
|
|
|
|
|
|
def _load_mongo_meta(rows: list[dict]) -> dict[str, dict]:
    """Fetch the Mongo documents for the result rows, keyed by ``path``.

    Rows are grouped by their ``study`` value, which is used as the
    collection name, so that one ``find`` is issued per study.
    """
    paths_by_study: dict[str, list[str]] = {}
    for r in rows:
        paths_by_study.setdefault(r["study"], []).append(r["path"])

    projection = {"path": 1, "content": 1, "dates_in_name": 1, "parent_folders": 1}
    meta_by_path: dict[str, dict] = {}
    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        db = mongo[MONGO_DB]
        for study_code, paths in paths_by_study.items():
            for doc in db[study_code].find({"path": {"$in": paths}}, projection):
                meta_by_path[doc["path"]] = doc
    finally:
        # Close the client even when the lookup fails.
        mongo.close()
    return meta_by_path


def _print_results(query: str, rows: list[dict],
                   meta_by_path: dict[str, dict], show_meta: bool) -> None:
    """Print the ranked result list, optionally with Mongo metadata lines."""
    print(f"\n=== Dotaz: {query!r} vysledku: {len(rows)} ===\n")
    for i, r in enumerate(rows, 1):
        size_mb = (r["size_bytes"] or 0) / 1024 / 1024
        mtime = r["mtime"].strftime("%Y-%m-%d") if r["mtime"] else "?"
        # A NULL extension would make the "<4" format spec raise TypeError.
        ext = r["ext"] or ""
        print(f"[{i:>2}] rank={r['rank']:.4f} {r['study']} "
              f"{ext:<4} {size_mb:5.1f}MB {mtime} "
              f"({r['body_length']} znaku)")
        print(f" {r['rel_path'] or r['name']}")
        snippet = (r["snippet"] or "").replace("\n", " ").strip()
        if snippet:
            print(f" >> {snippet}")
        if show_meta:
            m = meta_by_path.get(r["path"]) or {}
            content_line = _fmt_meta(r["study"], m.get("content") or {})
            print(f" meta: {content_line}")
            if m.get("dates_in_name"):
                print(f" dates_in_name: {m['dates_in_name']}")
        print()


def main() -> int:
    """Run the search described by the command line and print the results.

    Executes SEARCH_SQL against PostgreSQL, then (unless ``--no-meta`` is
    given) enriches the hits with metadata from MongoDB and prints them.

    Returns:
        Process exit status: 0 on success (including "no results"), 2 when
        ``--since`` is not a valid ``YYYY-MM-DD`` date.
    """
    args = parse_args()

    # Empty or missing filters are passed as None so the SQL skips them.
    studies = [STUDY_COLLECTIONS[s] for s in args.study] if args.study else None
    exts = [e.lower().lstrip(".") for e in args.ext] if args.ext else None

    since = None
    if args.since:
        try:
            # The date is interpreted as midnight UTC.
            since = datetime.strptime(args.since, "%Y-%m-%d").replace(
                tzinfo=timezone.utc
            )
        except ValueError:
            print(f"Neplatne datum pro --since: {args.since!r} "
                  f"(ocekavany format YYYY-MM-DD)", file=sys.stderr)
            return 2

    params = {
        "query": args.query,
        "studies": studies,
        "exts": exts,
        "since": since,
        "limit": args.limit,
    }

    with psycopg.connect(PG_DSN, connect_timeout=10) as pg, pg.cursor() as cur:
        cur.execute(SEARCH_SQL, params)
        cols = [c.name for c in cur.description]
        rows = [dict(zip(cols, r)) for r in cur.fetchall()]

    if not rows:
        print(f"Zadne vysledky pro: {args.query!r}")
        return 0

    # Enrichment from Mongo - one round-trip per study.
    meta_by_path = {} if args.no_meta else _load_mongo_meta(rows)

    _print_results(args.query, rows, meta_by_path, show_meta=not args.no_meta)
    return 0
|
|
|
|
|
|
# Script entry point: exit with main()'s status, or with 130 when the user
# interrupts the run with Ctrl+C.
if __name__ == "__main__":
    try:
        exit_status = main()
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
        sys.exit(130)
    sys.exit(exit_status)
|