notebook
This commit is contained in:
@@ -0,0 +1,672 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
==============================================================================
|
||||
MCP server: SOUBORY (Dropbox studie 42847922MDD3003 + 77242113UCO3001)
|
||||
|
||||
Hybridni dotaz nad:
|
||||
- PostgreSQL 192.168.1.76 db=MongoSoubory tabulka=documents
|
||||
(fulltext tsvector index, ts_headline, ts_rank)
|
||||
- MongoDB 192.168.1.76 db=soubory
|
||||
kolekce=42847922MDD3003, 77242113UCO3001
|
||||
(metadata, content.* z enrich_files_v1.0)
|
||||
|
||||
Spusteni:
|
||||
python mcp_soubory.py (stdio MCP)
|
||||
|
||||
Pridano do U:\\janssen\\.mcp.json jako "soubory".
|
||||
==============================================================================
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
import re
import sys
import traceback
from datetime import datetime, timezone, timedelta
from typing import Optional, Union

import psycopg
from bson import ObjectId
from mcp.server.fastmcp import FastMCP
from pymongo import MongoClient
|
||||
|
||||
# --- configuration ------------------------------------------------------------
# Connection strings may be overridden through environment variables
# (SOUBORY_MONGO_URI / SOUBORY_PG_DSN) so credentials do not have to be edited
# in the source. The literals below are the fallbacks the server has always used.
MONGO_URI = os.environ.get("SOUBORY_MONGO_URI", "mongodb://192.168.1.76:27017")
MONGO_DB = "soubory"

# Short alias -> Mongo collection name (identical to the PG `study` column).
STUDY_MAP = {
    "MDD3003": "42847922MDD3003",
    "UCO3001": "77242113UCO3001",
}
STUDY_ALL = list(STUDY_MAP.values())

# SECURITY NOTE(review): the fallback DSN embeds a plaintext password that is
# committed to version control. Rotate it, supply SOUBORY_PG_DSN from the
# environment, and then remove the literal default.
PG_DSN = os.environ.get(
    "SOUBORY_PG_DSN",
    "host=192.168.1.76 port=5432 dbname=MongoSoubory "
    "user=vladimir.buzalka password=Vlado7309208104++",
)

# How much of a document body is returned by default (keeps tool responses small)
# and the hard upper bound a caller may request.
DEFAULT_BODY_CHARS = 8000
MAX_BODY_CHARS = 200_000
|
||||
|
||||
|
||||
def log(msg: str) -> None:
    """Write one diagnostic line to stderr and flush it immediately.

    NOTE(review): stderr is presumably used because the server speaks MCP over
    stdio, which would make stdout unavailable for logging — confirm.
    """
    sys.stderr.write(f"{msg}\n")
    sys.stderr.flush()
|
||||
|
||||
|
||||
# --- client initialisation ---------------------------------------------------
# Both backends are probed once at import time. If either one is unreachable
# the reason is logged and the process exits with status 1.
try:
    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    mongo.admin.command("ping")
    log(f"Mongo OK ({MONGO_URI})")
except Exception as mongo_err:
    log(f"Mongo connection failed: {mongo_err}")
    sys.exit(1)

# Postgres is only test-connected here; tools open their own short-lived
# connections through pg_conn().
try:
    _test = psycopg.connect(PG_DSN, connect_timeout=10)
    _test.close()
    log("Postgres OK")
except Exception as pg_err:
    log(f"Postgres connection failed: {pg_err}")
    sys.exit(1)
|
||||
|
||||
|
||||
def pg_conn():
    """Open and return a new Postgres connection built from PG_DSN (10 s connect timeout)."""
    connection = psycopg.connect(PG_DSN, connect_timeout=10)
    return connection
|
||||
|
||||
|
||||
def serialize(obj):
    """Recursively convert a Mongo/PG value into plain JSON-friendly Python.

    Dicts and lists are walked; ObjectId becomes its string form, datetime its
    ISO-8601 string, and bytes are decoded as UTF-8 with replacement characters.
    Any other value is returned unchanged.
    """
    # Containers first: recurse into their members.
    if isinstance(obj, dict):
        return {key: serialize(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [serialize(item) for item in obj]
    # Scalar leaf conversions.
    if isinstance(obj, ObjectId):
        return str(obj)
    if isinstance(obj, datetime):
        return obj.isoformat()
    if isinstance(obj, bytes):
        return obj.decode("utf-8", errors="replace")
    return obj
|
||||
|
||||
|
||||
def resolve_studies(study: Optional[Union[str, list]]) -> Optional[list[str]]:
    """Translate study aliases ('MDD3003' / 'UCO3001') or full codes into collection names.

    Returns None when no study was requested (None, "" or []), which callers
    treat as "no study filter". Raises ValueError for an unrecognised entry.
    """
    if study is None or study == "" or study == []:
        return None

    requested = [study] if isinstance(study, str) else study
    resolved = []
    for item in requested:
        if item in STUDY_MAP:
            # Short alias -> full collection name.
            resolved.append(STUDY_MAP[item])
            continue
        if item in STUDY_MAP.values():
            # Already a full code.
            resolved.append(item)
            continue
        raise ValueError(f"Unknown study {item!r}. Use MDD3003 / UCO3001 or full code.")
    return resolved
|
||||
|
||||
|
||||
def normalize_exts(ext: Optional[Union[str, list]]) -> Optional[list[str]]:
    """Normalise an extension filter to a list of lower-case names without leading dots.

    Accepts a single string or a list. Returns None when nothing was requested
    (None, "" or []).
    """
    if ext is None or ext == "" or ext == []:
        return None

    items = [ext] if isinstance(ext, str) else ext
    cleaned = []
    for item in items:
        cleaned.append(item.lower().lstrip("."))
    return cleaned
|
||||
|
||||
|
||||
def parse_since(since: Optional[str]) -> Optional[datetime]:
    """Parse a lower date bound into a timezone-aware datetime.

    Accepts "YYYY-MM-DD" or an ISO-8601 timestamp containing "T" (a trailing
    "Z" is read as UTC). Values without an explicit offset are interpreted as
    UTC, so both input forms yield an aware datetime.

    Returns None for an empty/None input. Raises ValueError if the text cannot
    be parsed.
    """
    if not since:
        return None
    try:
        if "T" in since:
            parsed = datetime.fromisoformat(since.replace("Z", "+00:00"))
        else:
            parsed = datetime.strptime(since, "%Y-%m-%d")
    except Exception as e:
        raise ValueError(f"Bad date {since!r}: {e}") from e
    # An ISO timestamp without an offset used to come back naive, unlike the
    # date-only form; pin it to UTC so the database sees one consistent meaning.
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
|
||||
|
||||
|
||||
def short_meta(content: dict) -> dict:
    """Condense a Mongo `content` sub-document into a small dict for tool responses.

    Returns {"ok": False, "error": ...} when `content` is empty or carries a
    falsy "ok" flag. Otherwise copies a fixed set of scalar fields (strings cut
    to 200 chars), and adds `sheet_names`, `attachment_count` / `attachments`
    (first 10) and `text_head` (first 400 chars) when those keys are present.
    Keys that are present but hold None are tolerated.
    """
    if not content or not content.get("ok", True):
        return {"ok": False, "error": (content or {}).get("error")}

    out = {}
    for k in ("title", "subject", "author", "last_modified_by",
              "from", "to", "cc", "date", "pages", "slides",
              "total_sheets", "paragraphs", "words",
              "created", "modified", "encrypted"):
        v = content.get(k)
        if v in (None, "", []):
            continue
        if isinstance(v, str) and len(v) > 200:
            v = v[:200] + "..."
        out[k] = v

    if "sheets" in content:
        # `sheets` may be stored as null, and entries may be missing or malformed.
        sheets = content.get("sheets") or []
        out["sheet_names"] = [s.get("name") for s in sheets if s and isinstance(s, dict)]

    if "attachments" in content:
        attachments = content.get("attachments") or []
        out["attachment_count"] = len(attachments)
        if attachments:
            out["attachments"] = attachments[:10]

    if "text_head" in content:
        # A null text_head previously raised TypeError on slicing.
        head = content.get("text_head") or ""
        out["text_head"] = head[:400] + ("..." if len(head) > 400 else "")

    return out
|
||||
|
||||
|
||||
# --- MCP --------------------------------------------------------------------
# FastMCP server instance named "soubory"; functions decorated with
# @mcp.tool() are registered on this object.
mcp = FastMCP("soubory")
|
||||
|
||||
|
||||
@mcp.tool()
def ping() -> dict:
    """Quick health check. Reports Mongo + Postgres connectivity, totals per study, and PG documents.ok count.
    Call this first when starting an investigation to confirm everything is up.
    """
    try:
        build = mongo.admin.command("buildInfo")

        # Estimated (metadata-based) document count per study collection.
        database = mongo[MONGO_DB]
        per_study = {}
        for collection_name in STUDY_ALL:
            per_study[collection_name] = database[collection_name].estimated_document_count()

        # Per-study split of PG rows into ok / error buckets.
        with pg_conn() as conn, conn.cursor() as cursor:
            cursor.execute("SELECT study, ok, count(*) FROM documents GROUP BY study, ok ORDER BY study, ok")
            grouped = cursor.fetchall()
        pg_summary = {}
        for study_code, is_ok, n_rows in grouped:
            bucket = "ok" if is_ok else "error"
            pg_summary.setdefault(study_code, {})[bucket] = n_rows

        return {
            "status": "ok",
            "mongo_version": build.get("version"),
            "mongo_files_per_study": per_study,
            "pg_documents_per_study": pg_summary,
            "studies": STUDY_MAP,
        }
    except Exception as exc:
        # Full traceback goes to stderr; the caller only gets the message.
        log(traceback.format_exc())
        return {"status": "error", "error": str(exc)}
|
||||
|
||||
|
||||
@mcp.tool()
def list_studies() -> dict:
    """Overview of both studies — total files, breakdown by extension, fulltext coverage,
    earliest/latest mtime. Use this to understand the corpus before searching.
    """
    # Mongo filter for files that have not been marked as deleted.
    active = {"deleted_at": {"$exists": False}}
    out = {}
    try:
        # One Postgres round-trip for all studies instead of a new connection
        # per study: (ok rows, total rows) keyed by study code.
        with pg_conn() as pg, pg.cursor() as cur:
            cur.execute(
                "SELECT study, count(*) FILTER (WHERE ok), count(*) "
                "FROM documents WHERE study = ANY(%s::text[]) GROUP BY study",
                (STUDY_ALL,),
            )
            pg_counts = {s: (ok_n, total_n) for s, ok_n, total_n in cur.fetchall()}

        for alias, code in STUDY_MAP.items():
            col = mongo[MONGO_DB][code]
            total = col.count_documents({})
            deleted = col.count_documents({"deleted_at": {"$exists": True}})
            ext_breakdown = list(col.aggregate([
                {"$match": active},
                {"$group": {"_id": "$ext", "count": {"$sum": 1}}},
                {"$sort": {"count": -1}},
            ]))
            mtime_minmax = list(col.aggregate([
                {"$match": active},
                {"$group": {"_id": None,
                            "min_mtime": {"$min": "$mtime"},
                            "max_mtime": {"$max": "$mtime"}}},
            ]))
            # A study with no PG rows is absent from the GROUP BY result.
            pg_ok, pg_total = pg_counts.get(code, (0, 0))
            out[alias] = {
                "code": code,
                "mongo_total": total,
                "mongo_active": total - deleted,
                "mongo_deleted": deleted,
                "by_ext": {r["_id"]: r["count"] for r in ext_breakdown},
                "fulltext_indexed": pg_ok,
                "fulltext_failed": pg_total - pg_ok,
                "oldest_mtime": serialize(mtime_minmax[0]["min_mtime"]) if mtime_minmax else None,
                "newest_mtime": serialize(mtime_minmax[0]["max_mtime"]) if mtime_minmax else None,
            }
        return {"studies": out}
    except Exception:
        # Log the traceback, then let the error propagate to the caller.
        log(traceback.format_exc())
        raise
|
||||
|
||||
|
||||
def _mongo_docs_by_path(rows: list, projection: dict) -> dict:
    """Fetch the Mongo documents for PG result rows (one query per study), keyed by path."""
    paths_by_study: dict = {}
    for r in rows:
        paths_by_study.setdefault(r["study"], []).append(r["path"])
    found: dict = {}
    for code, paths in paths_by_study.items():
        for d in mongo[MONGO_DB][code].find({"path": {"$in": paths}}, projection):
            found[d["path"]] = d
    return found


@mcp.tool()
def search(
    query: str,
    study: Optional[Union[str, list]] = None,
    ext: Optional[Union[str, list]] = None,
    since: Optional[str] = None,
    folder: Optional[str] = None,
    limit: int = 15,
    with_metadata: bool = True,
) -> dict:
    """PRIMARY TOOL — fulltext search across all parsed documents in both studies.

    query: search expression in PostgreSQL websearch_to_tsquery syntax:
        adverse event          -> AND (both must appear)
        "adverse event"        -> exact phrase
        adverse OR serious     -> OR
        adverse -mild          -> exclude
    study: "MDD3003", "UCO3001", or list. None = both.
    ext: filter file types: ["pdf", "docx", "xlsx", "xlsm", "pptx", "eml", "msg", "txt", "csv"]
    since: ISO date "YYYY-MM-DD" — only files modified on/after this date
    folder: substring match against any parent folder name (e.g. "CRF", "Training");
        checked against an enlarged set of top-ranked hits (up to 500)
    limit: max results (default 15, max 100)
    with_metadata: if True, also fetch content.* metadata from Mongo (author, pages, sheets, EML headers)

    Returns ranked results with `snippet` showing matches highlighted with <<...>>.
    Use `read_document` to fetch full body of a specific hit.
    """
    try:
        studies = resolve_studies(study)
        exts = normalize_exts(ext)
        since_dt = parse_since(since)
        limit = min(max(1, limit), 100)

        # The folder filter runs in Python after the SQL query (PG has no
        # parent_folders column). Fetching only `limit` rows and filtering
        # afterwards could drop every hit, so over-fetch candidates and trim
        # to `limit` once the folder filter has been applied.
        fetch_limit = min(max(limit * 10, 100), 500) if folder else limit

        sql = """
            WITH q AS (
                SELECT websearch_to_tsquery('soubory'::regconfig, %(query)s) AS tsq
            )
            SELECT
                d.id, d.mongo_id, d.study, d.path, d.rel_path, d.name, d.ext,
                d.size_bytes, d.mtime, d.body_length,
                ts_rank(d.tsv, q.tsq) AS rank,
                ts_headline('soubory'::regconfig,
                            left(d.body, 200000),
                            q.tsq,
                            'MaxFragments=3, MinWords=4, MaxWords=18, '
                            'StartSel=<<, StopSel=>>, FragmentDelimiter= ... ') AS snippet
            FROM documents d, q
            WHERE d.tsv @@ q.tsq
              AND d.ok = TRUE
              AND (%(studies)s::text[] IS NULL OR d.study = ANY(%(studies)s::text[]))
              AND (%(exts)s::text[] IS NULL OR d.ext = ANY(%(exts)s::text[]))
              AND (%(since)s::timestamptz IS NULL OR d.mtime >= %(since)s::timestamptz)
            ORDER BY rank DESC, d.mtime DESC NULLS LAST
            LIMIT %(limit)s
        """
        params = {"query": query, "studies": studies, "exts": exts,
                  "since": since_dt, "limit": fetch_limit}

        with pg_conn() as pg, pg.cursor() as cur:
            cur.execute(sql, params)
            cols = [c.name for c in cur.description]
            rows = [dict(zip(cols, r)) for r in cur.fetchall()]

        if rows and folder:
            # Light-weight lookup (no content.*) for the whole candidate set.
            folder_docs = _mongo_docs_by_path(rows, {"path": 1, "parent_folders": 1})
            needle = folder.lower()
            kept = []
            for r in rows:
                folders = (folder_docs.get(r["path"]) or {}).get("parent_folders") or []
                if any(needle in (f or "").lower() for f in folders):
                    kept.append(r)
            rows = kept[:limit]

        # Enrichment only for the rows that are actually returned.
        meta_by_path: dict = {}
        if rows and (with_metadata or folder):
            proj = {"path": 1, "dates_in_name": 1}
            if with_metadata:
                proj["content"] = 1
            meta_by_path = _mongo_docs_by_path(rows, proj)

        results = []
        for r in rows:
            mongo_doc = meta_by_path.get(r["path"]) or {}
            results.append({
                "study": r["study"],
                "path": r["path"],
                "rel_path": r["rel_path"],
                "name": r["name"],
                "ext": r["ext"],
                "size_mb": round((r["size_bytes"] or 0) / 1024 / 1024, 2),
                "mtime": serialize(r["mtime"]),
                "body_length": r["body_length"],
                "rank": round(float(r["rank"]), 5),
                "snippet": (r["snippet"] or "").strip(),
                "mongo_id": r["mongo_id"],
                "dates_in_name": mongo_doc.get("dates_in_name"),
                "metadata": short_meta(mongo_doc.get("content") or {}) if with_metadata else None,
            })

        return {
            "query": query,
            "filters": {"study": studies, "ext": exts, "since": since,
                        "folder": folder, "limit": limit},
            "count": len(results),
            "results": results,
            "tip": "Use read_document(path=...) to fetch full body of any hit.",
        }
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e), "query": query}
|
||||
|
||||
|
||||
def _match_windows(body: str, needle: str, max_windows: int = 3) -> list:
    """Return up to `max_windows` text windows (400 chars before, 600 after) around matches of `needle`."""
    # Matching on the original text keeps offsets valid; lower-casing a copy
    # can change its length for some Unicode characters.
    pattern = re.compile(re.escape(needle), re.IGNORECASE)
    windows = []
    covered_until = 0
    for m in pattern.finditer(body):
        # Skip matches already visible inside the previous window.
        if windows and m.start() < covered_until:
            continue
        lo = max(0, m.start() - 400)
        hi = min(len(body), m.start() + 600)
        windows.append({"offset": lo, "text": body[lo:hi]})
        covered_until = hi
        if len(windows) >= max_windows:
            break
    return windows


@mcp.tool()
def read_document(
    path: Optional[str] = None,
    mongo_id: Optional[str] = None,
    offset: int = 0,
    length: int = DEFAULT_BODY_CHARS,
    around_match: Optional[str] = None,
) -> dict:
    """Read the full parsed text of one document (PG body column) + its Mongo metadata.

    Identify the document by EITHER `path` (absolute) OR `mongo_id`.
    offset, length: slice the body (default first 8000 chars). length capped at 200000.
        A negative offset is treated as 0.
    around_match: if given, return up to 3 windows of ~1000 chars centered on the first matches
        of this substring (case-insensitive). Matches already covered by the previous window
        are skipped. Useful to jump to a keyword in a long doc.

    Body is truncated to fit; check `body_length` vs returned length to know if more exists.
    Use offset to page further (offset=8000, then 16000, ...).
    """
    try:
        if not path and not mongo_id:
            return {"error": "Provide either path or mongo_id."}

        length = min(max(1, length), MAX_BODY_CHARS)
        # A negative offset would slice from the end of the body and break
        # the has_more / next_offset bookkeeping.
        offset = max(0, offset)

        # Only the column name varies; the value itself is always bound as a parameter.
        key_column = "path" if path else "mongo_id"
        sql = """
            SELECT id, mongo_id, study, path, rel_path, name, ext, sha256,
                   size_bytes, mtime, body, body_length, extractor_version,
                   extracted_at, ok, error
            FROM documents
            WHERE """ + key_column + " = %s LIMIT 1"

        with pg_conn() as pg, pg.cursor() as cur:
            cur.execute(sql, (path or mongo_id,))
            row = cur.fetchone()
            cols = [c.name for c in cur.description]
        if not row:
            return {"error": "Document not found.", "path": path, "mongo_id": mongo_id}
        rec = dict(zip(cols, row))

        body = rec.get("body") or ""

        if around_match and body:
            windows = _match_windows(body, around_match)
            body_out = None
            slice_info = {"mode": "around_match", "match": around_match,
                          "windows": windows, "windows_found": len(windows)}
        else:
            end = offset + length
            body_out = body[offset:end]
            has_more = end < len(body)
            slice_info = {
                "mode": "slice", "offset": offset,
                "length_returned": len(body_out),
                "has_more": has_more,
                "next_offset": end if has_more else None,
            }

        # Mongo metadata lives in the collection named after the study code.
        mdoc = mongo[MONGO_DB][rec["study"]].find_one(
            {"path": rec["path"]},
            {"content": 1, "dates_in_name": 1, "parent_folders": 1, "tokens": 1},
        ) or {}

        out = {
            "study": rec["study"],
            "path": rec["path"],
            "rel_path": rec["rel_path"],
            "name": rec["name"],
            "ext": rec["ext"],
            "size_mb": round((rec["size_bytes"] or 0) / 1024 / 1024, 2),
            "mtime": serialize(rec["mtime"]),
            "sha256": rec["sha256"],
            "body_length": rec["body_length"],
            "extractor_version": rec["extractor_version"],
            "extracted_at": serialize(rec["extracted_at"]),
            "ok": rec["ok"],
            "error": rec["error"],
            "parent_folders": mdoc.get("parent_folders"),
            "dates_in_name": mdoc.get("dates_in_name"),
            "metadata": short_meta(mdoc.get("content") or {}),
        }
        if body_out is not None:
            out["body"] = body_out
        out["slice"] = slice_info
        return out
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e)}
|
||||
|
||||
|
||||
@mcp.tool()
def get_metadata(path: str) -> dict:
    """Return raw Mongo document for one path (full content.*, parent_folders, dates_in_name,
    sha256, sizes, timestamps, tokens). Use when you need the full structured metadata —
    e.g. all sheet names of an XLSX, all attachments of an email, full author info.
    Does NOT return body text — use `read_document` for that.
    """
    try:
        # Probe each study collection in turn; the first hit wins.
        for collection_name in STUDY_ALL:
            found = mongo[MONGO_DB][collection_name].find_one({"path": path})
            if not found:
                continue
            return serialize(found)
        return {"error": "Not found in any study collection.", "path": path}
    except Exception as exc:
        log(traceback.format_exc())
        return {"error": str(exc)}
|
||||
|
||||
|
||||
@mcp.tool()
def recent_files(
    study: Optional[Union[str, list]] = None,
    days: int = 7,
    ext: Optional[Union[str, list]] = None,
    limit: int = 30,
) -> dict:
    """List most recently modified files (no fulltext involved). Use for "what changed lately"
    or "what did I get this week" questions.

    days: window from now (default 7). Set to 0 for no time filter (just top-N newest).
    """
    try:
        collections = resolve_studies(study) or STUDY_ALL
        wanted_exts = normalize_exts(ext)
        limit = min(max(1, limit), 200)

        # Mongo filter: live files only, optionally narrowed by extension and age.
        mongo_filter: dict = {"deleted_at": {"$exists": False}}
        if wanted_exts:
            mongo_filter["ext"] = {"$in": wanted_exts}
        if days and days > 0:
            cutoff = datetime.now(timezone.utc) - timedelta(days=days)
            mongo_filter["mtime"] = {"$gte": cutoff}

        fields = {"path": 1, "rel_path": 1, "name": 1, "ext": 1,
                  "size_bytes": 1, "mtime": 1, "study": 1,
                  "content.author": 1, "content.title": 1,
                  "content.last_modified_by": 1}

        results = []
        for code in collections:
            # Each collection contributes at most `limit` of its newest files.
            cursor = mongo[MONGO_DB][code].find(mongo_filter, fields).sort("mtime", -1).limit(limit)
            for doc in cursor:
                embedded = doc.get("content") or {}
                results.append({
                    "study": doc.get("study"),
                    "path": doc["path"],
                    "rel_path": doc.get("rel_path"),
                    "name": doc.get("name"),
                    "ext": doc.get("ext"),
                    "size_mb": round((doc.get("size_bytes") or 0) / 1024 / 1024, 2),
                    "mtime": serialize(doc.get("mtime")),
                    "author": embedded.get("author"),
                    "title": embedded.get("title"),
                    "last_modified_by": embedded.get("last_modified_by"),
                })

        # Merge the per-collection lists: newest first, then cut to the overall limit.
        results.sort(key=lambda item: item["mtime"] or "", reverse=True)
        newest = results[:limit]
        return {"days": days, "count": len(newest), "results": newest}
    except Exception as exc:
        log(traceback.format_exc())
        return {"error": str(exc)}
|
||||
|
||||
|
||||
@mcp.tool()
def find_duplicates(
    study: Optional[Union[str, list]] = None,
    min_size_kb: int = 10,
    limit: int = 30,
) -> dict:
    """Find groups of files with identical content (same sha256) but at different paths.
    Reveals copies of the same document scattered across folders / studies.

    min_size_kb: ignore tiny duplicate groups (default 10 KB)
    limit: max duplicate groups returned (clamped to 1..200)
    """
    try:
        studies = resolve_studies(study) or STUDY_ALL
        # `$limit: 0` is rejected by MongoDB, so clamp like the sibling tools do.
        limit = min(max(1, limit), 200)

        # Live files above the size threshold that actually carry a hash.
        # Without the sha256 condition, files lacking a hash would all group together.
        match = {"deleted_at": {"$exists": False},
                 "size_bytes": {"$gte": min_size_kb * 1024},
                 "sha256": {"$nin": [None, ""]}}

        # Pass 1: per-hash counts from every collection, merged BEFORE the
        # "count >= 2" test. Filtering per collection would miss a file that
        # exists exactly once in each study.
        count_pipeline = [
            {"$match": match},
            {"$group": {"_id": "$sha256",
                        "count": {"$sum": 1},
                        "size_bytes": {"$first": "$size_bytes"},
                        "ext": {"$first": "$ext"}}},
        ]
        totals: dict = {}
        for code in studies:
            for g in mongo[MONGO_DB][code].aggregate(count_pipeline, allowDiskUse=True):
                entry = totals.setdefault(g["_id"], {
                    "sha256": g["_id"], "count": 0,
                    "ext": g.get("ext"), "size_bytes": g.get("size_bytes") or 0,
                })
                entry["count"] += g["count"]

        duplicated = [e for e in totals.values() if e["count"] >= 2]
        duplicated.sort(key=lambda e: (e["size_bytes"], e["count"]), reverse=True)
        duplicated = duplicated[:limit]

        # Pass 2: fetch the paths only for the groups that are reported.
        wanted = [e["sha256"] for e in duplicated]
        paths_by_sha: dict = {sha: [] for sha in wanted}
        if wanted:
            path_query = dict(match, sha256={"$in": wanted})
            fields = {"sha256": 1, "study": 1, "path": 1, "rel_path": 1, "mtime": 1}
            for code in studies:
                for d in mongo[MONGO_DB][code].find(path_query, fields):
                    paths_by_sha[d["sha256"]].append({
                        "study": d.get("study") or code,
                        "path": d.get("path"),
                        "rel_path": d.get("rel_path"),
                        "mtime": serialize(d.get("mtime")),
                    })

        groups = [{
            "sha256": e["sha256"],
            "count": e["count"],
            "ext": e["ext"],
            "size_mb": round(e["size_bytes"] / 1024 / 1024, 2),
            "paths": paths_by_sha[e["sha256"]],
        } for e in duplicated]

        # Every copy beyond the first counts as wasted space.
        wasted_bytes = sum(e["size_bytes"] * (e["count"] - 1) for e in duplicated)
        return {
            "filters": {"study": studies, "min_size_kb": min_size_kb},
            "group_count": len(groups),
            "wasted_mb_estimate": round(wasted_bytes / 1024 / 1024, 2),
            "groups": groups,
        }
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e)}
|
||||
|
||||
|
||||
@mcp.tool()
def by_author(
    name: str,
    study: Optional[Union[str, list]] = None,
    ext: Optional[Union[str, list]] = None,
    limit: int = 30,
) -> dict:
    """Find documents where content.author OR content.last_modified_by matches `name` (case-insensitive substring).
    Works for DOCX/XLSX/PPTX/PDF embedded metadata. Use for "what did X write" or "who edited this".
    """
    try:
        studies = resolve_studies(study) or STUDY_ALL
        exts = normalize_exts(ext)
        limit = min(max(1, limit), 200)

        # `name` is documented as a plain substring, so escape it: otherwise
        # characters such as "(", "+" or "." are read as regex syntax and
        # either fail or match the wrong documents.
        rx = {"$regex": re.escape(name), "$options": "i"}
        q: dict = {"deleted_at": {"$exists": False},
                   "$or": [{"content.author": rx},
                           {"content.last_modified_by": rx}]}
        if exts:
            q["ext"] = {"$in": exts}

        # Only the three content fields that are returned are fetched.
        fields = {"path": 1, "rel_path": 1, "name": 1, "ext": 1,
                  "size_bytes": 1, "mtime": 1, "study": 1,
                  "content.author": 1, "content.last_modified_by": 1,
                  "content.title": 1}

        results = []
        for code in studies:
            for d in (mongo[MONGO_DB][code]
                      .find(q, fields)
                      .sort("mtime", -1).limit(limit)):
                c = d.get("content") or {}
                results.append({
                    "study": d.get("study"),
                    "path": d["path"],
                    "rel_path": d.get("rel_path"),
                    "name": d.get("name"),
                    "ext": d.get("ext"),
                    "mtime": serialize(d.get("mtime")),
                    "size_mb": round((d.get("size_bytes") or 0) / 1024 / 1024, 2),
                    "author": c.get("author"),
                    "last_modified_by": c.get("last_modified_by"),
                    "title": c.get("title"),
                })

        # Merge the per-study lists: newest first, then cut to the overall limit.
        results.sort(key=lambda r: r["mtime"] or "", reverse=True)
        top = results[:limit]
        return {"author_match": name, "count": len(top), "results": top}
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e)}
|
||||
|
||||
|
||||
@mcp.tool()
def browse_folder(
    folder: str,
    study: Optional[Union[str, list]] = None,
    ext: Optional[Union[str, list]] = None,
    limit: int = 100,
) -> dict:
    """List files where any parent folder name contains `folder` (case-insensitive substring match).
    Use for "show me what's in the CRF folder" or "what's in Training". Returns just metadata,
    no body text. Files sorted by relative path; at most `limit` files in total.
    """
    try:
        studies = resolve_studies(study) or STUDY_ALL
        exts = normalize_exts(ext)
        limit = min(max(1, limit), 500)

        # Escape the user text so it is matched literally; folder names often
        # contain regex metacharacters such as "#", "(" or ".".
        rx = {"$regex": re.escape(folder), "$options": "i"}
        q: dict = {"deleted_at": {"$exists": False}, "parent_folders": rx}
        if exts:
            q["ext"] = {"$in": exts}

        results = []
        for code in studies:
            for d in (mongo[MONGO_DB][code]
                      .find(q, {"path": 1, "rel_path": 1, "name": 1, "ext": 1,
                                "size_bytes": 1, "mtime": 1, "study": 1,
                                "parent_folders": 1, "dates_in_name": 1})
                      .sort("rel_path", 1).limit(limit)):
                results.append({
                    "study": d.get("study"),
                    "path": d["path"],
                    "rel_path": d.get("rel_path"),
                    "name": d.get("name"),
                    "ext": d.get("ext"),
                    "mtime": serialize(d.get("mtime")),
                    "size_mb": round((d.get("size_bytes") or 0) / 1024 / 1024, 2),
                    "parent_folders": d.get("parent_folders"),
                    "dates_in_name": d.get("dates_in_name"),
                })

        # Each study contributed up to `limit` rows; sort the merged list by
        # relative path and enforce `limit` on the total.
        results.sort(key=lambda r: r["rel_path"] or "")
        results = results[:limit]
        return {"folder_match": folder, "count": len(results), "results": results}
    except Exception as e:
        log(traceback.format_exc())
        return {"error": str(e)}
|
||||
|
||||
|
||||
# Script entry point: announce start-up on stderr, then hand control to FastMCP.
if __name__ == "__main__":
    log("MCP soubory server started (FastMCP)")
    mcp.run()
|
||||
@@ -0,0 +1,210 @@
|
||||
# Příklady dotazů — MCP `soubory`
|
||||
|
||||
10 příkladů od nejjednoduššího po nejsložitější. Každý je nejdřív stručně, pak rozepsaný.
|
||||
|
||||
Volání je přes `search(...)`. V Claude chatu se ptáš normálně česky ("najdi mi…") a Claude pod kapotou volá `mcp__soubory__search(...)`. Tady ukazuju **přímé volání** tak, abys viděl co se kombinuje.
|
||||
|
||||
---
|
||||
|
||||
## Přehled (od nejlehčího po nejtěžší)
|
||||
|
||||
| # | Příklad |
|
||||
|---|---|
|
||||
| 1 | `search("randomization")` |
|
||||
| 2 | `search("adverse event")` |
|
||||
| 3 | `search('"protocol deviation"')` |
|
||||
| 4 | `search("randomization", ext=["xlsx","xlsm"])` |
|
||||
| 5 | `search("SAE", study="UCO3001", ext=["eml","msg"])` |
|
||||
| 6 | `search('"kit number"', folder="CRF", since="2025-06-01")` |
|
||||
| 7 | `search("adverse OR serious -mild")` |
|
||||
| 8 | `search('"serious adverse event" -draft -obsolete', ext=["docx","pdf"])` |
|
||||
| 9 | `search('icotrokinra placebo', study="UCO3001", folder="Training", limit=30)` |
|
||||
| 10 | `search('"lot expiration" OR "expirační" OR "expiry"', ext=["eml","msg","pdf"], since="2025-01-01")` |
|
||||
|
||||
---
|
||||
|
||||
## 1. Nejjednodušší — jedno slovo
|
||||
|
||||
```python
|
||||
search("randomization")
|
||||
```
|
||||
|
||||
**Co to dělá:** Najde všechny dokumenty z obou studií, které kdekoli v textu obsahují slovo "randomization". Bez filtru typu souboru, studie, ani data. Vrátí 15 nejlépe rankovaných.
|
||||
|
||||
**Kdy použít:** Když máš jen obecné slovo a nevíš kde to může být. Dobré pro první nástřel — uvidíš, ve kterých typech souborů a v kterých složkách se to vyskytuje.
|
||||
|
||||
**Trik:** Slovník indexu používá `unaccent`, takže `příloha` najde i `priloha` (na diakritice nezáleží).
|
||||
|
||||
---
|
||||
|
||||
## 2. Dvě slova — implicitní AND
|
||||
|
||||
```python
|
||||
search("adverse event")
|
||||
```
|
||||
|
||||
**Co to dělá:** Najde dokumenty, kde se vyskytují **obě** slova "adverse" a "event" — ale **kdekoli v dokumentu**, nemusí být vedle sebe. Mohou být klidně na různých stranách.
|
||||
|
||||
**Kdy použít:** Když chceš zúžit širší slovo (`adverse` samotné by našlo i `adversely`). Dvě slova = silnější rank.
|
||||
|
||||
**Rozdíl proti #3:** "adverse" může být na straně 5 a "event" na straně 150 — pořád match.
|
||||
|
||||
---
|
||||
|
||||
## 3. Přesná fráze
|
||||
|
||||
```python
|
||||
search('"protocol deviation"')
|
||||
```
|
||||
|
||||
**Co to dělá:** Najde dokumenty, kde jsou tato dvě slova **přímo vedle sebe** v tomto pořadí. "protocol of deviation" už nematchne, "deviation from protocol" taky ne.
|
||||
|
||||
**Kdy použít:** Pro odborné termíny, názvy formulářů, ustálené fráze. Mnohem ostřejší než AND.
|
||||
|
||||
**Pozor:** Uvozovky musí být přesně `"..."` (PowerShell může vyžadovat escape: `'"protocol deviation"'`).
|
||||
|
||||
---
|
||||
|
||||
## 4. Filtr typu souboru
|
||||
|
||||
```python
|
||||
search("randomization", ext=["xlsx", "xlsm"])
|
||||
```
|
||||
|
||||
**Co to dělá:** Stejné jako #1, ale jen v Excelech (`.xlsx` + `.xlsm`). Užitečné když víš, že to bude v tabulce — typicky randomizační listy, IWRS exporty.
|
||||
|
||||
**Kdy použít:** Když chceš najít data, ne dokumentaci. Excel = data tabulky, PDF/DOCX = popis.
|
||||
|
||||
**Tip:** Metadata v odpovědi obsahují `sheet_names` — uvidíš ve kterých listech to může být. Pak otevřeš ten Excel rovnou na správném listu.
|
||||
|
||||
---
|
||||
|
||||
## 5. Studie + typ — kombinovaný filtr
|
||||
|
||||
```python
|
||||
search("SAE", study="UCO3001", ext=["eml", "msg"])
|
||||
```
|
||||
|
||||
**Co to dělá:** Najde emaily (EML i MSG) z **UCO3001 studie**, které obsahují slovo "SAE" (Serious Adverse Event). Metadata vrátí `from`, `to`, `subject`, `date`, počet příloh.
|
||||
|
||||
**Kdy použít:** "Kdo mi psal o SAE případu" — typický audit dotaz.
|
||||
|
||||
**Trik:** Kombinace `study + ext` je výkonná — Postgres má index `(study, ext)` přímo na to.
|
||||
|
||||
---
|
||||
|
||||
## 6. Tři filtry — fráze + složka + datum
|
||||
|
||||
```python
|
||||
search('"kit number"', folder="CRF", since="2025-06-01")
|
||||
```
|
||||
|
||||
**Co to dělá:** Najde dokumenty obsahující frázi "kit number", ale jen ty **uložené v jakékoli složce s "CRF" v názvu**, a **modifikované od 1. června 2025** dál.
|
||||
|
||||
**Kdy použít:** Když si pamatuješ kontext ("bylo to v CRF dokumentaci po SIVu") ale ne celý text.
|
||||
|
||||
**Jak to funguje pod kapotou:**
|
||||
1. Postgres najde fulltextové matche a rovnou na ně aplikuje filtr `since` (sloupec `mtime`)
2. Mongo k nalezeným cestám dotáhne `parent_folders`
3. Filtr `folder` se aplikuje až v Pythonu nad výsledky z Postgresu — výsledkem je AND nad všemi třemi podmínkami
|
||||
|
||||
---
|
||||
|
||||
## 7. OR + NOT — logické operátory
|
||||
|
||||
```python
|
||||
search("adverse OR serious -mild")
|
||||
```
|
||||
|
||||
**Co to dělá:** Najde dokumenty, které obsahují "adverse", **nebo** obsahují "serious" a zároveň **neobsahují** "mild". Vyloučení `-mild` se kvůli prioritě operátorů (viz níže) váže jen na větev se "serious".
|
||||
|
||||
**Kdy použít:** Když máš více synonym a chceš jeden dotaz místo tří. `-mild` vyloučí typicky tréninkové materiály ("mild AE example") nebo nezávažné případy.
|
||||
|
||||
**Důležité — priorita operátorů:**
|
||||
- `A OR B C` se vyhodnotí jako `A OR (B AND C)`
|
||||
- `websearch_to_tsquery` **nemá závorky** — nemůžeš to přeskupit
|
||||
- Když potřebuješ jiné pořadí, rozděl na dva dotazy
|
||||
|
||||
---
|
||||
|
||||
## 8. Fráze + dvě vyloučení + ext filtr
|
||||
|
||||
```python
|
||||
search('"serious adverse event" -draft -obsolete', ext=["docx", "pdf"])
|
||||
```
|
||||
|
||||
**Co to dělá:** Přesná fráze "serious adverse event", ale **bez** dokumentů obsahujících slova "draft" nebo "obsolete", a jen v Wordech a PDFkách.
|
||||
|
||||
**Kdy použít:** Když chceš jen **finální** verze dokumentů. V Dropboxu typicky najdeš 5 verzí toho samého (draft, v0.9, v1.0, v1.0_FINAL, OBSOLETE) — tohle odřízne šum.
|
||||
|
||||
**Kombinace technik:** fráze + vícenásobné NOT + typ. Reálné dotazy v práci vypadají takhle.
|
||||
|
||||
---
|
||||
|
||||
## 9. Dvě AND slova + 2 filtry + víc výsledků
|
||||
|
||||
```python
|
||||
search("icotrokinra placebo", study="UCO3001", folder="Training", limit=30)
|
||||
```
|
||||
|
||||
**Co to dělá:** Najde tréninkové materiály z UCO3001, kde se mluví **jak o léku icotrokinra, tak o placebu** (typicky srovnání ramen studie). `limit=30` místo defaultních 15.
|
||||
|
||||
**Kdy použít:**
|
||||
- Onboarding nového člena týmu — "dej mi všechny prezentace co srovnávají větve studie"
|
||||
- Příprava na monitorovací návštěvu
|
||||
- Hledání edukačního obsahu pro pacienty
|
||||
|
||||
**Proč 30:** Tréninkových materiálů bývá hodně verzí (každý SIV nové), default 15 by jich pravděpodobně neukázal všechny.
|
||||
|
||||
---
|
||||
|
||||
## 10. Nejtěžší — vícejazyčné OR + tři typy + datum
|
||||
|
||||
```python
|
||||
search(
|
||||
'"lot expiration" OR "expirační" OR "expiry"',
|
||||
ext=["eml", "msg", "pdf"],
|
||||
since="2025-01-01"
|
||||
)
|
||||
```
|
||||
|
||||
**Co to dělá:** Najde **dokumentaci a komunikaci o expiraci léků** z roku 2025. Hledá ve třech jazykových variantách (EN fráze "lot expiration", CZ "expirační", krátké "expiry") napříč emaily, MSG soubory a PDFkami.
|
||||
|
||||
**Kdy použít:** Typický compliance dotaz — "ukaž mi všechno co tento rok řešilo expiraci kitů". Kombinuje:
|
||||
- **vícejazyčnost** (sponsor píše anglicky, ty notifikuješ česky)
|
||||
- **více kanálů** (emaily i oficiální PDF dokumenty)
|
||||
- **časové okno** (relevantní jen letošek)
|
||||
|
||||
**Pod kapotou:**
|
||||
1. PG fulltext spojí 3 OR větve do jednoho tsquery
|
||||
2. Filtr `ext` IN ('eml','msg','pdf') na PG úrovni
|
||||
3. Filtr `since` na sloupec `mtime` (indexovaný)
|
||||
4. Mongo metadata: u emailů `from/to/subject`, u PDF `pages/author`
|
||||
5. Výsledky setřízené podle `ts_rank` (nejvíc relevantní nahoře)
|
||||
|
||||
**Tohle je ten případ kdy nakombinuješ úplně všechno** a Claude ti pak v chatu napíše: *"Našel jsem 8 dokumentů. 3 emaily od monitorky z března, 1 PDF notifikace IWRS z června, ..."*. Přesně proto jsme to stavěli.
|
||||
|
||||
---
|
||||
|
||||
## Shrnutí — pravidla palce
|
||||
|
||||
- **Začni jednoduše** (1 slovo) → uvidíš co je v korpusu → zužuj
|
||||
- **Fráze (`"..."`)** je vždy ostřejší než AND
|
||||
- **`-slovo`** je tvůj nejlepší kamarád proti šumu (draft, obsolete, training)
|
||||
- **`ext=[...]`** dramaticky zrychlí dotaz a vyfiltruje formátový šum
|
||||
- **`folder=...`** funguje skvěle pokud máš konzistentní strukturu složek (#190 eCRF, #200 Training, …)
|
||||
- **`since=...`** používej kdykoli tě zajímá "co je nového"
|
||||
- **Diakritiku neřeš** — `expirace` najde i `expirační` (oboje má root `expira`)
|
||||
- **Není wildcard** — `randomiz*` nefunguje; `randomization` a `randomized` jsou různá slova → dej je do OR
|
||||
|
||||
---
|
||||
|
||||
## Co když fulltext nestačí?
|
||||
|
||||
Jiné nástroje MCP:
|
||||
|
||||
- **`by_author("Hazzard")`** — kdo psal/upravoval (DOCX/PPTX metadata)
|
||||
- **`recent_files(days=7)`** — co se změnilo bez ohledu na obsah
|
||||
- **`find_duplicates()`** — kolikrát mám stejný soubor
|
||||
- **`browse_folder("CRF")`** — výpis složky bez fulltextu
|
||||
- **`read_document(path=..., around_match="randomization")`** — skok přímo na slovo v dlouhém dokumentu
|
||||
@@ -0,0 +1,203 @@
|
||||
"""
|
||||
==============================================================================
|
||||
Skript: query_v0.1.py
|
||||
Verze: 0.1
|
||||
Datum: 2026-06-03
|
||||
Autor: vladimir.buzalka
|
||||
Popis: Hybridni dotaz: PostgreSQL fulltext (tsv + ts_rank + ts_headline)
|
||||
+ obohaceni z MongoDB (content.* - autor, listy, EML hlavicky,
|
||||
datumy v nazvu).
|
||||
|
||||
Pouziti:
|
||||
python query_v0.1.py "adverse event"
|
||||
python query_v0.1.py "protocol deviation" --study MDD3003 --ext docx pptx
|
||||
python query_v0.1.py "randomization" --ext xlsx xlsm --limit 20
|
||||
python query_v0.1.py "lot expiration" --since 2026-01-01
|
||||
|
||||
Syntaxe dotazu = websearch_to_tsquery:
|
||||
adverse event -> AND
|
||||
"adverse event" -> fraze
|
||||
adverse OR serious -> OR
|
||||
adverse -mild -> NOT
|
||||
==============================================================================
|
||||
"""
|
||||
|
||||
from __future__ import annotations

import argparse
import os
import sys
from datetime import datetime, timezone

import psycopg
from pymongo import MongoClient
|
||||
|
||||
# --- configuration -----------------------------------------------------------
# Every connection setting can be overridden through an environment variable,
# so the script does not have to be edited per deployment. The defaults are
# the original hard-coded values, which keeps existing invocations working.
MONGO_URI = os.environ.get("SOUBORY_MONGO_URI", "mongodb://192.168.1.76:27017")
MONGO_DB = os.environ.get("SOUBORY_MONGO_DB", "soubory")

# Short CLI alias -> full study identifier. main() uses the identifier both as
# the value matched against documents.study in PostgreSQL and as the name of
# the MongoDB collection it reads metadata from.
STUDY_COLLECTIONS = {
    "MDD3003": "42847922MDD3003",
    "UCO3001": "77242113UCO3001",
}

# NOTE(security): the fallback DSN below embeds a plaintext password that is
# committed to version control. Prefer setting SOUBORY_PG_DSN (or dropping the
# password from the default and relying on PGPASSWORD / ~/.pgpass) and rotate
# the credential.
PG_DSN = os.environ.get(
    "SOUBORY_PG_DSN",
    "host=192.168.1.76 port=5432 dbname=MongoSoubory "
    "user=vladimir.buzalka password=Vlado7309208104++",
)

# Full-text search over documents.tsv.
# Named parameters: query (websearch syntax), studies / exts (text[] or NULL),
# since (timestamptz or NULL), limit. A NULL filter parameter disables that
# filter. The snippet is built by ts_headline from the first 200000 characters
# of body only; rows are ordered by ts_rank, then by mtime (newest first).
SEARCH_SQL = """
WITH q AS (
    SELECT websearch_to_tsquery('soubory'::regconfig, %(query)s) AS tsq
)
SELECT
    d.study,
    d.path,
    d.rel_path,
    d.name,
    d.ext,
    d.size_bytes,
    d.mtime,
    d.body_length,
    ts_rank(d.tsv, q.tsq) AS rank,
    ts_headline('soubory'::regconfig,
                left(d.body, 200000),
                q.tsq,
                'MaxFragments=2, MinWords=4, MaxWords=18, '
                'StartSel=<<, StopSel=>>, FragmentDelimiter= ... ') AS snippet
FROM documents d, q
WHERE d.tsv @@ q.tsq
  AND d.ok = TRUE
  AND (%(studies)s::text[] IS NULL OR d.study = ANY(%(studies)s::text[]))
  AND (%(exts)s::text[] IS NULL OR d.ext = ANY(%(exts)s::text[]))
  AND (%(since)s::timestamptz IS NULL OR d.mtime >= %(since)s::timestamptz)
ORDER BY rank DESC, d.mtime DESC NULLS LAST
LIMIT %(limit)s
"""
|
||||
|
||||
|
||||
def _positive_int(text: str) -> int:
    """argparse ``type=`` callable: parse *text* as an integer >= 1."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"neplatne cele cislo: {text!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"musi byt >= 1 (zadano {value})")
    return value


def _iso_date(text: str) -> str:
    """argparse ``type=`` callable: check *text* is YYYY-MM-DD, return it as is.

    The string itself is returned (not a datetime) so that code reading
    ``args.since`` keeps receiving the same type as before.
    """
    try:
        datetime.strptime(text, "%Y-%m-%d")
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"neplatne datum {text!r}, ocekavam YYYY-MM-DD") from None
    return text


def parse_args() -> argparse.Namespace:
    """Parse the command line (``sys.argv``) of the query script.

    Returns:
        Namespace with ``query`` (str), ``study`` (list of aliases from
        STUDY_COLLECTIONS or None), ``ext`` (list of str or None),
        ``since`` (``YYYY-MM-DD`` string or None), ``limit`` (int >= 1,
        default 15) and ``no_meta`` (bool).

    Invalid ``--limit`` / ``--since`` values are rejected by argparse with a
    usage message instead of failing later inside the database call or date
    parsing.
    """
    p = argparse.ArgumentParser(
        description="Hybridni dotaz PG fulltext + Mongo metadata",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("query", help="Vyhledavaci vyraz (websearch syntaxe)")
    p.add_argument("--study", nargs="*",
                   choices=sorted(STUDY_COLLECTIONS.keys()),
                   help="Filtr studie (default: obe)")
    p.add_argument("--ext", nargs="*",
                   help="Filtr pripon (napr. pdf docx xlsx)")
    p.add_argument("--since", type=_iso_date,
                   help="mtime >= datum (YYYY-MM-DD)")
    p.add_argument("--limit", type=_positive_int, default=15,
                   help="Pocet vysledku (default 15)")
    p.add_argument("--no-meta", action="store_true",
                   help="Vynechat doplneni z Mongo")
    return p.parse_args()
|
||||
|
||||
|
||||
def _short(s, n=60):
    """Return *s* as a one-line string cut to *n* characters.

    Falsy input (None, "", 0, ...) yields "". Otherwise the value is converted
    with ``str()``, every run of whitespace is collapsed to a single space,
    and text longer than *n* characters is truncated with a trailing "...".
    """
    if not s:
        return ""
    # split() without a separator splits on any whitespace run, so "\r", tabs
    # and newlines (e.g. from folded e-mail headers) cannot break the
    # single-line output; it also strips leading/trailing whitespace.
    text = " ".join(str(s).split())
    return text if len(text) <= n else text[:n] + "..."
|
||||
|
||||
|
||||
def _fmt_meta(study_code: str, content: dict) -> str:
    """Return a one-line summary of the interesting fields of ``content``.

    Args:
        study_code: Study identifier of the document. Not used by the
            formatting itself; kept so the call signature stays unchanged.
        content: The ``content`` sub-document of a Mongo record (may be empty).

    Returns:
        ``"(bez content)"`` for empty input, ``"content.error: ..."`` when
        ``content["ok"]`` is falsy, otherwise ``key=value`` items joined by
        ``" | "`` (or a placeholder when no known field is present).
    """
    if not content:
        return "(bez content)"
    if not content.get("ok", True):
        return f"content.error: {content.get('error', '?')}"

    # Each key is listed exactly once so no field is printed twice.
    text_keys = ("title", "subject", "author", "last_modified_by",
                 "from", "to", "date")
    bits = [f"{key}={_short(content[key], 40)}"
            for key in text_keys if content.get(key)]

    for key in ("pages", "slides"):
        if key in content:
            bits.append(f"{key}={content[key]}")

    if "total_sheets" in content:
        # Show at most the first four sheet names; tolerate sheets=None and
        # entries that are not dicts.
        sheets = content.get("sheets") or []
        sheet_names = [s.get("name") if isinstance(s, dict) else None
                       for s in sheets[:4]]
        bits.append(f"sheets={content['total_sheets']} {sheet_names}")

    if "paragraphs" in content:
        bits.append(f"paragraphs={content['paragraphs']}")

    if "has_attachments" in content:
        bits.append(f"attachments={len(content.get('attachments') or [])}")

    return " | ".join(bits) if bits else "(content bez vyznamnych poli)"
|
||||
|
||||
|
||||
def _load_meta(rows: list) -> dict:
    """Fetch Mongo metadata for the PostgreSQL hits, keyed by document path.

    One ``find`` is issued per study (collection name == ``row["study"]``).
    The client is always closed, even when a query raises.
    """
    paths_by_study: dict = {}
    for row in rows:
        paths_by_study.setdefault(row["study"], []).append(row["path"])

    projection = {"path": 1, "content": 1, "dates_in_name": 1,
                  "parent_folders": 1}
    meta_by_path: dict = {}
    mongo = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        db = mongo[MONGO_DB]
        for study_code, paths in paths_by_study.items():
            for doc in db[study_code].find({"path": {"$in": paths}}, projection):
                meta_by_path[doc["path"]] = doc
    finally:
        mongo.close()
    return meta_by_path


def main() -> int:
    """Run one search: query PostgreSQL, enrich from MongoDB, print results.

    Returns:
        Process exit code: 0 on success (including "no results"), 2 when
        ``--since`` is not a valid ``YYYY-MM-DD`` date.
    """
    args = parse_args()

    # None (not an empty list) disables the corresponding filter in SEARCH_SQL.
    studies = [STUDY_COLLECTIONS[s] for s in args.study] if args.study else None
    exts = [e.lower().lstrip(".") for e in args.ext] if args.ext else None

    since = None
    if args.since:
        try:
            since = datetime.strptime(args.since, "%Y-%m-%d").replace(
                tzinfo=timezone.utc)
        except ValueError:
            print(f"Neplatne datum --since: {args.since!r} "
                  f"(ocekavam YYYY-MM-DD)", file=sys.stderr)
            return 2

    params = {
        "query": args.query,
        "studies": studies,
        "exts": exts,
        "since": since,
        "limit": args.limit,
    }

    with psycopg.connect(PG_DSN, connect_timeout=10) as pg, pg.cursor() as cur:
        cur.execute(SEARCH_SQL, params)
        cols = [c.name for c in cur.description]
        rows = [dict(zip(cols, r)) for r in cur.fetchall()]

    if not rows:
        print(f"Zadne vysledky pro: {args.query!r}")
        return 0

    # Enrichment from Mongo - one round-trip per study.
    meta_by_path = {} if args.no_meta else _load_meta(rows)

    print(f"\n=== Dotaz: {args.query!r} vysledku: {len(rows)} ===\n")
    for i, r in enumerate(rows, 1):
        size_mb = (r["size_bytes"] or 0) / 1024 / 1024
        mtime = r["mtime"].strftime("%Y-%m-%d") if r["mtime"] else "?"
        # ext is None-guarded like the other nullable columns; formatting
        # None with a width spec would raise TypeError.
        ext = r["ext"] or ""
        print(f"[{i:>2}] rank={r['rank']:.4f} {r['study']} "
              f"{ext:<4} {size_mb:5.1f}MB {mtime} "
              f"({r['body_length']} znaku)")
        print(f" {r['rel_path'] or r['name']}")
        snippet = (r["snippet"] or "").replace("\n", " ").strip()
        if snippet:
            print(f" >> {snippet}")
        if not args.no_meta:
            m = meta_by_path.get(r["path"]) or {}
            content_line = _fmt_meta(r["study"], m.get("content") or {})
            print(f" meta: {content_line}")
            if m.get("dates_in_name"):
                print(f" dates_in_name: {m['dates_in_name']}")
        print()

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: exit with main()'s return code, or with 130 when
    # the user interrupts the run with Ctrl+C.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        print("\nPreruseno uzivatelem")
        exit_code = 130
    sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user