notebook
This commit is contained in:
@@ -0,0 +1,203 @@
|
||||
"""
|
||||
==============================================================================
|
||||
Skript: query_v0.1.py
|
||||
Verze: 0.1
|
||||
Datum: 2026-06-03
|
||||
Autor: vladimir.buzalka
|
||||
Popis: Hybridni dotaz: PostgreSQL fulltext (tsv + ts_rank + ts_headline)
|
||||
+ obohaceni z MongoDB (content.* - autor, listy, EML hlavicky,
|
||||
datumy v nazvu).
|
||||
|
||||
Pouziti:
|
||||
python query_v0.1.py "adverse event"
|
||||
python query_v0.1.py "protocol deviation" --study MDD3003 --ext docx pptx
|
||||
python query_v0.1.py "randomization" --ext xlsx xlsm --limit 20
|
||||
python query_v0.1.py "lot expiration" --since 2026-01-01
|
||||
|
||||
Syntaxe dotazu = websearch_to_tsquery:
|
||||
adverse event -> AND
|
||||
"adverse event" -> fraze
|
||||
adverse OR serious -> OR
|
||||
adverse -mild -> NOT
|
||||
==============================================================================
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import psycopg
|
||||
from pymongo import MongoClient
|
||||
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
MONGO_DB = "soubory"
|
||||
STUDY_COLLECTIONS = {
|
||||
"MDD3003": "42847922MDD3003",
|
||||
"UCO3001": "77242113UCO3001",
|
||||
}
|
||||
|
||||
PG_DSN = ("host=192.168.1.76 port=5432 dbname=MongoSoubory "
|
||||
"user=vladimir.buzalka password=Vlado7309208104++")
|
||||
|
||||
|
||||
SEARCH_SQL = """
|
||||
WITH q AS (
|
||||
SELECT websearch_to_tsquery('soubory'::regconfig, %(query)s) AS tsq
|
||||
)
|
||||
SELECT
|
||||
d.study,
|
||||
d.path,
|
||||
d.rel_path,
|
||||
d.name,
|
||||
d.ext,
|
||||
d.size_bytes,
|
||||
d.mtime,
|
||||
d.body_length,
|
||||
ts_rank(d.tsv, q.tsq) AS rank,
|
||||
ts_headline('soubory'::regconfig,
|
||||
left(d.body, 200000),
|
||||
q.tsq,
|
||||
'MaxFragments=2, MinWords=4, MaxWords=18, '
|
||||
'StartSel=<<, StopSel=>>, FragmentDelimiter= ... ') AS snippet
|
||||
FROM documents d, q
|
||||
WHERE d.tsv @@ q.tsq
|
||||
AND d.ok = TRUE
|
||||
AND (%(studies)s::text[] IS NULL OR d.study = ANY(%(studies)s::text[]))
|
||||
AND (%(exts)s::text[] IS NULL OR d.ext = ANY(%(exts)s::text[]))
|
||||
AND (%(since)s::timestamptz IS NULL OR d.mtime >= %(since)s::timestamptz)
|
||||
ORDER BY rank DESC, d.mtime DESC NULLS LAST
|
||||
LIMIT %(limit)s
|
||||
"""
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
p = argparse.ArgumentParser(
|
||||
description="Hybridni dotaz PG fulltext + Mongo metadata",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
p.add_argument("query", help="Vyhledavaci vyraz (websearch syntaxe)")
|
||||
p.add_argument("--study", nargs="*",
|
||||
choices=sorted(STUDY_COLLECTIONS.keys()),
|
||||
help="Filtr studie (default: obe)")
|
||||
p.add_argument("--ext", nargs="*",
|
||||
help="Filtr pripon (napr. pdf docx xlsx)")
|
||||
p.add_argument("--since",
|
||||
help="mtime >= datum (YYYY-MM-DD)")
|
||||
p.add_argument("--limit", type=int, default=15,
|
||||
help="Pocet vysledku (default 15)")
|
||||
p.add_argument("--no-meta", action="store_true",
|
||||
help="Vynechat doplneni z Mongo")
|
||||
return p.parse_args()
|
||||
|
||||
|
||||
def _short(s, n=60):
|
||||
if not s:
|
||||
return ""
|
||||
s = str(s).replace("\n", " ").strip()
|
||||
return s if len(s) <= n else s[:n] + "..."
|
||||
|
||||
|
||||
def _fmt_meta(study_code: str, content: dict) -> str:
|
||||
"""Vrati jednoradkove shrnuti zajimavych poli z content.*"""
|
||||
if not content:
|
||||
return "(bez content)"
|
||||
bits = []
|
||||
if not content.get("ok", True):
|
||||
return f"content.error: {content.get('error', '?')}"
|
||||
for key in ("title", "subject", "author", "last_modified_by",
|
||||
"from", "to", "subject", "date"):
|
||||
v = content.get(key)
|
||||
if v:
|
||||
bits.append(f"{key}={_short(v, 40)}")
|
||||
if "pages" in content:
|
||||
bits.append(f"pages={content['pages']}")
|
||||
if "slides" in content:
|
||||
bits.append(f"slides={content['slides']}")
|
||||
if "total_sheets" in content:
|
||||
sheet_names = [s.get("name") for s in content.get("sheets", [])][:4]
|
||||
bits.append(f"sheets={content['total_sheets']} {sheet_names}")
|
||||
if "paragraphs" in content:
|
||||
bits.append(f"paragraphs={content['paragraphs']}")
|
||||
if "has_attachments" in content:
|
||||
bits.append(f"attachments={len(content.get('attachments', []))}")
|
||||
return " | ".join(bits) if bits else "(content bez vyznamnych poli)"
|
||||
|
||||
|
||||
def main() -> int:
|
||||
args = parse_args()
|
||||
|
||||
studies = None
|
||||
if args.study:
|
||||
studies = [STUDY_COLLECTIONS[s] for s in args.study]
|
||||
|
||||
exts = None
|
||||
if args.ext:
|
||||
exts = [e.lower().lstrip(".") for e in args.ext]
|
||||
|
||||
since = None
|
||||
if args.since:
|
||||
since = datetime.strptime(args.since, "%Y-%m-%d").replace(tzinfo=timezone.utc)
|
||||
|
||||
params = {
|
||||
"query": args.query,
|
||||
"studies": studies,
|
||||
"exts": exts,
|
||||
"since": since,
|
||||
"limit": args.limit,
|
||||
}
|
||||
|
||||
with psycopg.connect(PG_DSN, connect_timeout=10) as pg, pg.cursor() as cur:
|
||||
cur.execute(SEARCH_SQL, params)
|
||||
cols = [c.name for c in cur.description]
|
||||
rows = [dict(zip(cols, r)) for r in cur.fetchall()]
|
||||
|
||||
if not rows:
|
||||
print(f"Zadne vysledky pro: {args.query!r}")
|
||||
return 0
|
||||
|
||||
# obohaceni z Mongo - jeden round-trip na studii
|
||||
meta_by_path: dict[str, dict] = {}
|
||||
if not args.no_meta:
|
||||
mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
|
||||
db = mongo[MONGO_DB]
|
||||
by_study: dict[str, list[str]] = {}
|
||||
for r in rows:
|
||||
by_study.setdefault(r["study"], []).append(r["path"])
|
||||
for study_code, paths in by_study.items():
|
||||
for d in db[study_code].find(
|
||||
{"path": {"$in": paths}},
|
||||
{"path": 1, "content": 1, "dates_in_name": 1, "parent_folders": 1},
|
||||
):
|
||||
meta_by_path[d["path"]] = d
|
||||
mongo.close()
|
||||
|
||||
print(f"\n=== Dotaz: {args.query!r} vysledku: {len(rows)} ===\n")
|
||||
for i, r in enumerate(rows, 1):
|
||||
size_mb = (r["size_bytes"] or 0) / 1024 / 1024
|
||||
mtime = r["mtime"].strftime("%Y-%m-%d") if r["mtime"] else "?"
|
||||
print(f"[{i:>2}] rank={r['rank']:.4f} {r['study']} "
|
||||
f"{r['ext']:<4} {size_mb:5.1f}MB {mtime} "
|
||||
f"({r['body_length']} znaku)")
|
||||
print(f" {r['rel_path'] or r['name']}")
|
||||
snippet = (r["snippet"] or "").replace("\n", " ").strip()
|
||||
if snippet:
|
||||
print(f" >> {snippet}")
|
||||
if not args.no_meta:
|
||||
m = meta_by_path.get(r["path"]) or {}
|
||||
content_line = _fmt_meta(r["study"], m.get("content") or {})
|
||||
print(f" meta: {content_line}")
|
||||
if m.get("dates_in_name"):
|
||||
print(f" dates_in_name: {m['dates_in_name']}")
|
||||
print()
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
raise SystemExit(main())
|
||||
except KeyboardInterrupt:
|
||||
print("\nPreruseno uzivatelem")
|
||||
sys.exit(130)
|
||||
Reference in New Issue
Block a user