Fix pgvector integration: VECTOR(512) for voyage-3-lite, register_vector on connect
- voyage-3-lite returns 512 dims (not 1024) — migrated column + schema
- register_vector now called once at connection time, not per-query
- Removes per-function register_vector calls that caused type cast conflicts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,86 @@
|
||||
import win32com.client
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
|
||||
OUT_XLSX = Path(r"C:\Temp\GAL_export.xlsx")
|
||||
|
||||
def safe_get(obj, attr, default=None):
    """Return ``getattr(obj, attr)``, or *default* if the lookup raises.

    Args:
        obj: Object to read the attribute from.
        attr: Name of the attribute to read.
        default: Value returned when the attribute access raises.
            Defaults to ``None``.

    Returns:
        The attribute value, or *default* when reading it raised any
        ``Exception``.
    """
    try:
        return getattr(obj, attr)
    except Exception:
        # Deliberately broad: any failure while reading the attribute
        # (not only AttributeError) is treated as "value unavailable".
        return default
|
||||
|
||||
# Export every entry of the Outlook Global Address List (GAL) to OUT_XLSX.

# Connect to Outlook through COM and open the MAPI namespace.
outlook = win32com.client.Dispatch("Outlook.Application")
ns = outlook.GetNamespace("MAPI")

gal = ns.GetGlobalAddressList()
entries = gal.AddressEntries

# One dict per GAL entry; turned into a DataFrame at the end.
rows = []

print(f"Počet položek v GAL: {entries.Count}")

for i in range(1, entries.Count + 1):  # Outlook COM collections are 1-based
    try:
        entry = entries.Item(i)

        name = safe_get(entry, "Name")
        address = safe_get(entry, "Address")
        entry_type = safe_get(entry, "AddressEntryUserType")

        smtp = None
        job_title = None
        department = None
        company = None
        office = None
        phone = None
        mobile = None

        # Exchange user: a failing lookup is treated as "no user details".
        try:
            exch_user = entry.GetExchangeUser()
        except Exception:
            exch_user = None

        if exch_user:
            smtp = safe_get(exch_user, "PrimarySmtpAddress")
            job_title = safe_get(exch_user, "JobTitle")
            department = safe_get(exch_user, "Department")
            company = safe_get(exch_user, "CompanyName")
            office = safe_get(exch_user, "OfficeLocation")
            phone = safe_get(exch_user, "BusinessTelephoneNumber")
            mobile = safe_get(exch_user, "MobileTelephoneNumber")

        # Distribution list: only consulted as a fallback source for the
        # SMTP address, so the COM call is skipped when one is already known.
        if not smtp:
            try:
                exch_dl = entry.GetExchangeDistributionList()
            except Exception:
                exch_dl = None

            if exch_dl:
                smtp = safe_get(exch_dl, "PrimarySmtpAddress")

        rows.append({
            "name": name,
            "smtp": smtp,
            "address": address,
            "entry_type": entry_type,
            "job_title": job_title,
            "department": department,
            "company": company,
            "office": office,
            "phone": phone,
            "mobile": mobile,
        })

    except Exception as e:
        # Keep going on a broken entry; record the failure as its own row.
        rows.append({
            "name": None,
            "smtp": None,
            "error": str(e),
        })

df = pd.DataFrame(rows)

# Make sure the target directory exists; to_excel does not create it.
OUT_XLSX.parent.mkdir(parents=True, exist_ok=True)
df.to_excel(OUT_XLSX, index=False)

print(f"Hotovo: {OUT_XLSX}")
|
||||
@@ -2,6 +2,7 @@
|
||||
-- PostgreSQL, bez pgvector (embeddingy jako double precision[])
|
||||
-- pg_trgm pro fuzzy matching (volitelné)
|
||||
|
||||
CREATE EXTENSION IF NOT EXISTS vector;
|
||||
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||||
|
||||
-- ─── Conversation sessions ────────────────────────────────────────────────────
|
||||
@@ -56,7 +57,7 @@ CREATE TABLE IF NOT EXISTS kb_memories (
|
||||
-- search
|
||||
tags TEXT[] DEFAULT '{}',
|
||||
importance FLOAT DEFAULT 0.5, -- 0..1
|
||||
embedding double precision[], -- Voyage AI voyage-3-lite (1024-dim), Python-side similarity
|
||||
embedding vector(512), -- Voyage AI voyage-3-lite (512-dim)
|
||||
fts TSVECTOR,
|
||||
|
||||
-- lifecycle
|
||||
@@ -94,4 +95,5 @@ CREATE INDEX IF NOT EXISTS kb_memories_project_idx ON kb_memories(project);
|
||||
CREATE INDEX IF NOT EXISTS kb_memories_importance_idx ON kb_memories(importance DESC);
|
||||
CREATE INDEX IF NOT EXISTS kb_memories_created_idx ON kb_memories(created_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS kb_memories_session_idx ON kb_memories(session_id);
|
||||
-- Note: embedding je double precision[] — similarity se počítá Python-side po FTS pre-filtru
|
||||
-- Vector index (aktivovat po ~1k řádcích pro lepší recall)
|
||||
-- CREATE INDEX kb_memories_vec_idx ON kb_memories USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
|
||||
|
||||
@@ -34,7 +34,8 @@ PG_PASSWORD = os.getenv("PG_PASSWORD", "Vlado7309208104++")
|
||||
PG_DB = os.getenv("PG_DB", "knowledgebase")
|
||||
|
||||
VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY", "")
|
||||
EMBED_MODEL = "voyage-3-lite" # 1024-dim, fast & cheap
|
||||
EMBED_MODEL = "voyage-3-lite" # 512-dim, fast & cheap
|
||||
EMBED_DIM = 512
|
||||
|
||||
# ─── Logging ─────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -60,6 +61,12 @@ def get_conn() -> psycopg.Connection:
|
||||
row_factory=dict_row,
|
||||
autocommit=False,
|
||||
)
|
||||
# register_vector jednou na connection — umožní psycopg správně serializovat np.array jako vector
|
||||
try:
|
||||
from pgvector.psycopg import register_vector
|
||||
register_vector(_conn)
|
||||
except Exception as e:
|
||||
log(f"pgvector register warning: {e}")
|
||||
log(f"Connected to {PG_DB}@{PG_HOST}")
|
||||
return _conn
|
||||
|
||||
@@ -156,9 +163,7 @@ def store_memory(
|
||||
try:
|
||||
with conn.transaction():
|
||||
if embedding:
|
||||
from pgvector.psycopg import register_vector
|
||||
import numpy as np
|
||||
register_vector(conn)
|
||||
row = conn.execute(
|
||||
"""
|
||||
INSERT INTO kb_memories
|
||||
@@ -312,9 +317,7 @@ def _insert_memory_in_tx(conn, data: dict):
|
||||
"""Helper: insert memory within an existing transaction."""
|
||||
embedding = data.get("embedding")
|
||||
if embedding:
|
||||
from pgvector.psycopg import register_vector
|
||||
import numpy as np
|
||||
register_vector(conn)
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT INTO kb_memories
|
||||
@@ -420,9 +423,7 @@ def search(
|
||||
query_emb = get_embedding(query)
|
||||
if query_emb:
|
||||
try:
|
||||
from pgvector.psycopg import register_vector
|
||||
import numpy as np
|
||||
register_vector(conn)
|
||||
|
||||
vec_conditions = ["deleted = FALSE", "embedding IS NOT NULL"]
|
||||
vec_params2: list[Any] = []
|
||||
@@ -700,9 +701,7 @@ def update_memory(
|
||||
params.append(content)
|
||||
new_emb = get_embedding(f"{title or ''} {content}")
|
||||
if new_emb:
|
||||
from pgvector.psycopg import register_vector
|
||||
import numpy as np
|
||||
register_vector(conn)
|
||||
updates.append("embedding = %s")
|
||||
params.append(np.array(new_emb))
|
||||
if title is not None:
|
||||
|
||||
Reference in New Issue
Block a user