Files
janssen/Soubory/query_v0.1.py
T
2026-06-05 21:21:30 +02:00

204 lines
6.7 KiB
Python

"""
==============================================================================
Skript: query_v0.1.py
Verze: 0.1
Datum: 2026-06-03
Autor: vladimir.buzalka
Popis: Hybridni dotaz: PostgreSQL fulltext (tsv + ts_rank + ts_headline)
+ obohaceni z MongoDB (content.* - autor, listy, EML hlavicky,
datumy v nazvu).
Pouziti:
python query_v0.1.py "adverse event"
python query_v0.1.py "protocol deviation" --study MDD3003 --ext docx pptx
python query_v0.1.py "randomization" --ext xlsx xlsm --limit 20
python query_v0.1.py "lot expiration" --since 2026-01-01
Syntaxe dotazu = websearch_to_tsquery:
adverse event -> AND
"adverse event" -> fraze
adverse OR serious -> OR
adverse -mild -> NOT
==============================================================================
"""
from __future__ import annotations
import argparse
import sys
from datetime import datetime, timezone
import psycopg
from pymongo import MongoClient
MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "soubory"
STUDY_COLLECTIONS = {
"MDD3003": "42847922MDD3003",
"UCO3001": "77242113UCO3001",
}
PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoSoubory "
"user=vladimir.buzalka password=Vlado7309208104++")
SEARCH_SQL = """
WITH q AS (
SELECT websearch_to_tsquery('soubory'::regconfig, %(query)s) AS tsq
)
SELECT
d.study,
d.path,
d.rel_path,
d.name,
d.ext,
d.size_bytes,
d.mtime,
d.body_length,
ts_rank(d.tsv, q.tsq) AS rank,
ts_headline('soubory'::regconfig,
left(d.body, 200000),
q.tsq,
'MaxFragments=2, MinWords=4, MaxWords=18, '
'StartSel=<<, StopSel=>>, FragmentDelimiter= ... ') AS snippet
FROM documents d, q
WHERE d.tsv @@ q.tsq
AND d.ok = TRUE
AND (%(studies)s::text[] IS NULL OR d.study = ANY(%(studies)s::text[]))
AND (%(exts)s::text[] IS NULL OR d.ext = ANY(%(exts)s::text[]))
AND (%(since)s::timestamptz IS NULL OR d.mtime >= %(since)s::timestamptz)
ORDER BY rank DESC, d.mtime DESC NULLS LAST
LIMIT %(limit)s
"""
def parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(
description="Hybridni dotaz PG fulltext + Mongo metadata",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
p.add_argument("query", help="Vyhledavaci vyraz (websearch syntaxe)")
p.add_argument("--study", nargs="*",
choices=sorted(STUDY_COLLECTIONS.keys()),
help="Filtr studie (default: obe)")
p.add_argument("--ext", nargs="*",
help="Filtr pripon (napr. pdf docx xlsx)")
p.add_argument("--since",
help="mtime >= datum (YYYY-MM-DD)")
p.add_argument("--limit", type=int, default=15,
help="Pocet vysledku (default 15)")
p.add_argument("--no-meta", action="store_true",
help="Vynechat doplneni z Mongo")
return p.parse_args()
def _short(s, n=60):
if not s:
return ""
s = str(s).replace("\n", " ").strip()
return s if len(s) <= n else s[:n] + "..."
def _fmt_meta(study_code: str, content: dict) -> str:
"""Vrati jednoradkove shrnuti zajimavych poli z content.*"""
if not content:
return "(bez content)"
bits = []
if not content.get("ok", True):
return f"content.error: {content.get('error', '?')}"
for key in ("title", "subject", "author", "last_modified_by",
"from", "to", "subject", "date"):
v = content.get(key)
if v:
bits.append(f"{key}={_short(v, 40)}")
if "pages" in content:
bits.append(f"pages={content['pages']}")
if "slides" in content:
bits.append(f"slides={content['slides']}")
if "total_sheets" in content:
sheet_names = [s.get("name") for s in content.get("sheets", [])][:4]
bits.append(f"sheets={content['total_sheets']} {sheet_names}")
if "paragraphs" in content:
bits.append(f"paragraphs={content['paragraphs']}")
if "has_attachments" in content:
bits.append(f"attachments={len(content.get('attachments', []))}")
return " | ".join(bits) if bits else "(content bez vyznamnych poli)"
def main() -> int:
args = parse_args()
studies = None
if args.study:
studies = [STUDY_COLLECTIONS[s] for s in args.study]
exts = None
if args.ext:
exts = [e.lower().lstrip(".") for e in args.ext]
since = None
if args.since:
since = datetime.strptime(args.since, "%Y-%m-%d").replace(tzinfo=timezone.utc)
params = {
"query": args.query,
"studies": studies,
"exts": exts,
"since": since,
"limit": args.limit,
}
with psycopg.connect(PG_DSN, connect_timeout=10) as pg, pg.cursor() as cur:
cur.execute(SEARCH_SQL, params)
cols = [c.name for c in cur.description]
rows = [dict(zip(cols, r)) for r in cur.fetchall()]
if not rows:
print(f"Zadne vysledky pro: {args.query!r}")
return 0
# obohaceni z Mongo - jeden round-trip na studii
meta_by_path: dict[str, dict] = {}
if not args.no_meta:
mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
db = mongo[MONGO_DB]
by_study: dict[str, list[str]] = {}
for r in rows:
by_study.setdefault(r["study"], []).append(r["path"])
for study_code, paths in by_study.items():
for d in db[study_code].find(
{"path": {"$in": paths}},
{"path": 1, "content": 1, "dates_in_name": 1, "parent_folders": 1},
):
meta_by_path[d["path"]] = d
mongo.close()
print(f"\n=== Dotaz: {args.query!r} vysledku: {len(rows)} ===\n")
for i, r in enumerate(rows, 1):
size_mb = (r["size_bytes"] or 0) / 1024 / 1024
mtime = r["mtime"].strftime("%Y-%m-%d") if r["mtime"] else "?"
print(f"[{i:>2}] rank={r['rank']:.4f} {r['study']} "
f"{r['ext']:<4} {size_mb:5.1f}MB {mtime} "
f"({r['body_length']} znaku)")
print(f" {r['rel_path'] or r['name']}")
snippet = (r["snippet"] or "").replace("\n", " ").strip()
if snippet:
print(f" >> {snippet}")
if not args.no_meta:
m = meta_by_path.get(r["path"]) or {}
content_line = _fmt_meta(r["study"], m.get("content") or {})
print(f" meta: {content_line}")
if m.get("dates_in_name"):
print(f" dates_in_name: {m['dates_in_name']}")
print()
return 0
if __name__ == "__main__":
try:
raise SystemExit(main())
except KeyboardInterrupt:
print("\nPreruseno uzivatelem")
sys.exit(130)