Fix pgvector integration: VECTOR(512) for voyage-3-lite, register_vector on connect

- voyage-3-lite returns 512 dims (not 1024) — migrated column + schema
- register_vector now called once at connection time, not per-query
- Removes per-function register_vector calls that caused type cast conflicts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-07 06:16:16 +02:00
parent 66475d48d2
commit 82d7bc375f
3 changed files with 98 additions and 11 deletions
+86
View File
@@ -0,0 +1,86 @@
import win32com.client
import pandas as pd
from pathlib import Path
# Destination workbook that the exported address-list rows are written to.
OUT_XLSX = Path(r"C:\Temp\GAL_export.xlsx")
def safe_get(obj, attr):
    """Return ``getattr(obj, attr)``, or ``None`` if reading it raises.

    Args:
        obj: Object to read from (in this script, an Outlook COM object).
        attr: Name of the attribute/property to read.

    Returns:
        The attribute value, or ``None`` when the lookup raises any
        ``Exception``.
    """
    try:
        return getattr(obj, attr)
    except Exception:
        # Deliberately broad: a property read may raise something other than
        # AttributeError (e.g. an error from the property getter itself), and
        # a missing field must not abort the export of the whole row.
        return None
# Export every entry of the Outlook Global Address List (GAL) to an Excel file.
# One row is produced per address entry; entries that fail are recorded with
# an "error" column instead of stopping the export.
outlook = win32com.client.Dispatch("Outlook.Application")
ns = outlook.GetNamespace("MAPI")
gal = ns.GetGlobalAddressList()
entries = gal.AddressEntries

rows = []
print(f"Počet položek v GAL: {entries.Count}")

for i in range(1, entries.Count + 1):  # Outlook COM collections are 1-based
    try:
        entry = entries.Item(i)
        name = safe_get(entry, "Name")
        address = safe_get(entry, "Address")
        entry_type = safe_get(entry, "AddressEntryUserType")

        # Fields that are only filled in when the entry resolves to an
        # Exchange user (or, for smtp, a distribution list).
        smtp = None
        job_title = None
        department = None
        company = None
        office = None
        phone = None
        mobile = None

        # Exchange user
        try:
            exch_user = entry.GetExchangeUser()
        except Exception:
            exch_user = None
        if exch_user:
            smtp = safe_get(exch_user, "PrimarySmtpAddress")
            job_title = safe_get(exch_user, "JobTitle")
            department = safe_get(exch_user, "Department")
            company = safe_get(exch_user, "CompanyName")
            office = safe_get(exch_user, "OfficeLocation")
            phone = safe_get(exch_user, "BusinessTelephoneNumber")
            mobile = safe_get(exch_user, "MobileTelephoneNumber")

        # Distribution list: only used as a fallback source for the SMTP
        # address when the Exchange-user lookup did not provide one.
        try:
            exch_dl = entry.GetExchangeDistributionList()
        except Exception:
            exch_dl = None
        if exch_dl and not smtp:
            smtp = safe_get(exch_dl, "PrimarySmtpAddress")

        rows.append({
            "name": name,
            "smtp": smtp,
            "address": address,
            "entry_type": entry_type,
            "job_title": job_title,
            "department": department,
            "company": company,
            "office": office,
            "phone": phone,
            "mobile": mobile,
        })
    except Exception as e:
        # Keep going on a broken entry; record the failure as its own row.
        rows.append({
            "name": None,
            "smtp": None,
            "error": str(e),
        })

# Make sure the target folder exists so the write cannot fail on a missing
# directory after the whole address list has already been enumerated.
OUT_XLSX.parent.mkdir(parents=True, exist_ok=True)
df = pd.DataFrame(rows)
df.to_excel(OUT_XLSX, index=False)
print(f"Hotovo: {OUT_XLSX}")
+4 -2
View File
@@ -2,6 +2,7 @@
-- PostgreSQL s pgvector (embeddingy jako vector(512))
-- pg_trgm pro fuzzy matching (volitelné)
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
-- ─── Conversation sessions ────────────────────────────────────────────────────
@@ -56,7 +57,7 @@ CREATE TABLE IF NOT EXISTS kb_memories (
-- search
tags TEXT[] DEFAULT '{}',
importance FLOAT DEFAULT 0.5, -- 0..1
embedding double precision[], -- Voyage AI voyage-3-lite (1024-dim), Python-side similarity
embedding vector(512), -- Voyage AI voyage-3-lite (512-dim)
fts TSVECTOR,
-- lifecycle
@@ -94,4 +95,5 @@ CREATE INDEX IF NOT EXISTS kb_memories_project_idx ON kb_memories(project);
CREATE INDEX IF NOT EXISTS kb_memories_importance_idx ON kb_memories(importance DESC);
CREATE INDEX IF NOT EXISTS kb_memories_created_idx ON kb_memories(created_at DESC);
CREATE INDEX IF NOT EXISTS kb_memories_session_idx ON kb_memories(session_id);
-- Note: embedding je double precision[] — similarity se počítá Python-side po FTS pre-filtru
-- Vector index (aktivovat po ~1k řádcích pro lepší recall)
-- CREATE INDEX kb_memories_vec_idx ON kb_memories USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
+8 -9
View File
@@ -34,7 +34,8 @@ PG_PASSWORD = os.getenv("PG_PASSWORD", "Vlado7309208104++")
PG_DB = os.getenv("PG_DB", "knowledgebase")
VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY", "")
EMBED_MODEL = "voyage-3-lite" # 1024-dim, fast & cheap
EMBED_MODEL = "voyage-3-lite" # 512-dim, fast & cheap
EMBED_DIM = 512
# ─── Logging ─────────────────────────────────────────────────────────────────
@@ -60,6 +61,12 @@ def get_conn() -> psycopg.Connection:
row_factory=dict_row,
autocommit=False,
)
# register_vector jednou na connection — umožní psycopg správně serializovat np.array jako vector
try:
from pgvector.psycopg import register_vector
register_vector(_conn)
except Exception as e:
log(f"pgvector register warning: {e}")
log(f"Connected to {PG_DB}@{PG_HOST}")
return _conn
@@ -156,9 +163,7 @@ def store_memory(
try:
with conn.transaction():
if embedding:
from pgvector.psycopg import register_vector
import numpy as np
register_vector(conn)
row = conn.execute(
"""
INSERT INTO kb_memories
@@ -312,9 +317,7 @@ def _insert_memory_in_tx(conn, data: dict):
"""Helper: insert memory within an existing transaction."""
embedding = data.get("embedding")
if embedding:
from pgvector.psycopg import register_vector
import numpy as np
register_vector(conn)
conn.execute(
"""
INSERT INTO kb_memories
@@ -420,9 +423,7 @@ def search(
query_emb = get_embedding(query)
if query_emb:
try:
from pgvector.psycopg import register_vector
import numpy as np
register_vector(conn)
vec_conditions = ["deleted = FALSE", "embedding IS NOT NULL"]
vec_params2: list[Any] = []
@@ -700,9 +701,7 @@ def update_memory(
params.append(content)
new_emb = get_embedding(f"{title or ''} {content}")
if new_emb:
from pgvector.psycopg import register_vector
import numpy as np
register_vector(conn)
updates.append("embedding = %s")
params.append(np.array(new_emb))
if title is not None: