"""Export the Outlook Global Address List (GAL) to an Excel workbook.

The script drives a locally installed Outlook through COM (``win32com``) and
writes one row per address entry to ``OUT_XLSX``. Run it as a script; importing
the module only defines the helpers and performs no COM or file access.
"""
from pathlib import Path

import pandas as pd

OUT_XLSX = Path(r"C:\Temp\GAL_export.xlsx")

# Output column -> ExchangeUser property it is read from.
_EXCHANGE_USER_FIELDS = {
    "smtp": "PrimarySmtpAddress",
    "job_title": "JobTitle",
    "department": "Department",
    "company": "CompanyName",
    "office": "OfficeLocation",
    "phone": "BusinessTelephoneNumber",
    "mobile": "MobileTelephoneNumber",
}


def safe_get(obj, attr):
    """Return ``getattr(obj, attr)``, or ``None`` if reading it raises.

    Any ``Exception`` is swallowed on purpose: the export is best-effort and a
    single unreadable property must not abort the whole run.
    """
    try:
        return getattr(obj, attr)
    except Exception:
        return None


def _safe_call(obj, method):
    """Call the zero-argument ``obj.method()``; return ``None`` if it raises."""
    try:
        return getattr(obj, method)()
    except Exception:
        return None


def _entry_row(entry):
    """Build the export row (a plain dict) for one GAL address entry."""
    name = safe_get(entry, "Name")
    address = safe_get(entry, "Address")
    entry_type = safe_get(entry, "AddressEntryUserType")

    # Every Exchange-specific column defaults to None.
    details = dict.fromkeys(_EXCHANGE_USER_FIELDS)

    exch_user = _safe_call(entry, "GetExchangeUser")
    if exch_user:
        details = {
            column: safe_get(exch_user, prop)
            for column, prop in _EXCHANGE_USER_FIELDS.items()
        }

    # Ask for the distribution list only when no SMTP address has been found
    # yet; its result is not used otherwise, so the extra COM call is skipped.
    if not details["smtp"]:
        exch_dl = _safe_call(entry, "GetExchangeDistributionList")
        if exch_dl:
            details["smtp"] = safe_get(exch_dl, "PrimarySmtpAddress")

    smtp = details.pop("smtp")
    return {
        "name": name,
        "smtp": smtp,
        "address": address,
        "entry_type": entry_type,
        **details,
    }


def _collect_rows(entries):
    """Return one row per item of the ``AddressEntries`` collection.

    An entry that cannot be fetched yields a row holding only the error text,
    so one bad entry does not stop the export.
    """
    rows = []
    for i in range(1, entries.Count + 1):  # Outlook COM collections are 1-based
        try:
            rows.append(_entry_row(entries.Item(i)))
        except Exception as e:
            rows.append({
                "name": None,
                "smtp": None,
                "error": str(e),
            })
    return rows


def main() -> None:
    """Read the GAL from Outlook and write it to ``OUT_XLSX``."""
    # Imported here rather than at module level so that the helpers above can
    # be imported on machines where win32com is not installed.
    import win32com.client

    # Create the target directory up front: to_excel() does not create missing
    # parent directories, and failing after the full enumeration wastes the run.
    OUT_XLSX.parent.mkdir(parents=True, exist_ok=True)

    outlook = win32com.client.Dispatch("Outlook.Application")
    ns = outlook.GetNamespace("MAPI")
    entries = ns.GetGlobalAddressList().AddressEntries

    print(f"Počet položek v GAL: {entries.Count}")

    df = pd.DataFrame(_collect_rows(entries))
    df.to_excel(OUT_XLSX, index=False)

    print(f"Hotovo: {OUT_XLSX}")


if __name__ == "__main__":
    main()
je double precision[] — similarity se počítá Python-side po FTS pre-filtru +-- Vector index (aktivovat po ~1k řádcích pro lepší recall) +-- CREATE INDEX kb_memories_vec_idx ON kb_memories USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100); diff --git a/Knowledgebase/server.py b/Knowledgebase/server.py index 35f3b15..06fd0c1 100644 --- a/Knowledgebase/server.py +++ b/Knowledgebase/server.py @@ -34,7 +34,8 @@ PG_PASSWORD = os.getenv("PG_PASSWORD", "Vlado7309208104++") PG_DB = os.getenv("PG_DB", "knowledgebase") VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY", "") -EMBED_MODEL = "voyage-3-lite" # 1024-dim, fast & cheap +EMBED_MODEL = "voyage-3-lite" # 512-dim, fast & cheap +EMBED_DIM = 512 # ─── Logging ───────────────────────────────────────────────────────────────── @@ -60,6 +61,12 @@ def get_conn() -> psycopg.Connection: row_factory=dict_row, autocommit=False, ) + # register_vector jednou na connection — umožní psycopg správně serializovat np.array jako vector + try: + from pgvector.psycopg import register_vector + register_vector(_conn) + except Exception as e: + log(f"pgvector register warning: {e}") log(f"Connected to {PG_DB}@{PG_HOST}") return _conn @@ -156,9 +163,7 @@ def store_memory( try: with conn.transaction(): if embedding: - from pgvector.psycopg import register_vector import numpy as np - register_vector(conn) row = conn.execute( """ INSERT INTO kb_memories @@ -312,9 +317,7 @@ def _insert_memory_in_tx(conn, data: dict): """Helper: insert memory within an existing transaction.""" embedding = data.get("embedding") if embedding: - from pgvector.psycopg import register_vector import numpy as np - register_vector(conn) conn.execute( """ INSERT INTO kb_memories @@ -420,9 +423,7 @@ def search( query_emb = get_embedding(query) if query_emb: try: - from pgvector.psycopg import register_vector import numpy as np - register_vector(conn) vec_conditions = ["deleted = FALSE", "embedding IS NOT NULL"] vec_params2: list[Any] = [] @@ -700,9 +701,7 @@ def 
update_memory( params.append(content) new_emb = get_embedding(f"{title or ''} {content}") if new_emb: - from pgvector.psycopg import register_vector import numpy as np - register_vector(conn) updates.append("embedding = %s") params.append(np.array(new_emb)) if title is not None: