import json
import pickle
from pathlib import Path

import numpy as np
import faiss

from app.config import VECTOR_INDEX_PATH, EMBEDDING_DIM, DATA_DIR


class VectorStore:
    """FAISS vector store with persistence."""

    def __init__(self):
        self.index_path = Path(VECTOR_INDEX_PATH)
        self.metadata_path = self.index_path.with_suffix(".meta")
        self.index = None
        self.metadata = []  # one dict per stored vector, aligned by insertion order
        self._initialize()

    def _initialize(self):
        """Load the index from disk if present, otherwise create a new one."""
        if self.index_path.exists():
            self._load()
        else:
            self._create_new()

    def _create_new(self):
        """Create a new FAISS index."""
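        # IndexFlatIP ranks by inner product; because add() and search()
        # L2-normalize their vectors, the returned scores are cosine similarities.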
        self.index = faiss.IndexFlatIP(EMBEDDING_DIM)
        self.metadata = []

    def _load(self):
        """Load the existing index and metadata from disk."""
        try:
            self.index = faiss.read_index(str(self.index_path))
            if self.metadata_path.exists():
                with open(self.metadata_path, "rb") as f:
                    self.metadata = pickle.load(f)
        except Exception as e:
            print(f"Could not load index: {e}")
            self._create_new()

    def save(self):
        """Persist the index and metadata to disk."""
        if self.index is not None:
            self.index_path.parent.mkdir(parents=True, exist_ok=True)
            faiss.write_index(self.index, str(self.index_path))
            with open(self.metadata_path, "wb") as f:
                pickle.dump(self.metadata, f)

    def add(self, embeddings: np.ndarray, metadata: list):
        """Add a batch of embeddings with one metadata dict per row."""
        if self.index is None:
            self._create_new()
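        # L2-normalize so the inner-product index scores by cosine similarity;
        # the small epsilon avoids division by zero for all-zero rows.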
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        normalized = embeddings / (norms + 1e-10)

        self.index.add(normalized.astype(np.float32))
        self.metadata.extend(metadata)
        self.save()

    def search(self, query_embedding: np.ndarray, k: int = 5):
        """Return metadata for the k most similar vectors, each with a "score" field."""
        if self.index is None or self.index.ntotal == 0:
            return []
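        # Normalize the query the same way as the stored vectors.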
        norm = np.linalg.norm(query_embedding)
        normalized = query_embedding / (norm + 1e-10)

        scores, indices = self.index.search(
            normalized.reshape(1, -1).astype(np.float32),
            min(k, self.index.ntotal),
        )

        results = []
        for score, idx in zip(scores[0], indices[0]):
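            # FAISS pads missing hits with -1; guard the lower bound so we
            # never return metadata[-1] by accident.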
            if 0 <= idx < len(self.metadata):
                result = self.metadata[idx].copy()
                result["score"] = float(score)
                results.append(result)

        return results

    def rebuild_index(self):
        """Rebuild the index from scratch from the companies data file."""
        self._create_new()

        companies_file = DATA_DIR / "companies.json"
        if companies_file.exists():
            with open(companies_file) as f:
                companies = json.load(f)
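            # Imported lazily so the embedding model is only loaded when a
            # rebuild is actually requested.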
            from vector.embeddings import get_embedding_model
            model = get_embedding_model()

            texts = []
            metadata = []

            for company in companies:
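                # One summary sentence per company.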
                desc = f"{company['name']} is a {company['industry']} company with {company['size']} employees"
                texts.append(desc)
                metadata.append({
                    "company_id": company["id"],
                    "type": "description",
                    "text": desc
                })
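                # One entry per recorded pain point, prefixed with the
                # company name for retrieval context.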
                for pain in company.get("pains", []):
                    text = f"{company['name']} pain point: {pain}"
                    texts.append(text)
                    metadata.append({
                        "company_id": company["id"],
                        "type": "pain",
                        "text": text
                    })
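                # Free-form notes are indexed verbatim.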
                for note in company.get("notes", []):
                    texts.append(note)
                    metadata.append({
                        "company_id": company["id"],
                        "type": "note",
                        "text": note
                    })

            if texts:
                embeddings = model.encode(texts)
                self.add(embeddings, metadata)

    def is_initialized(self):
        """Return True if the index exists and contains at least one vector."""
        return self.index is not None and self.index.ntotal > 0
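

# Minimal usage sketch (assumes app.config points at writable paths and that
# vector.embeddings.get_embedding_model returns a model with an encode() method):
#
#     store = VectorStore()
#     if not store.is_initialized():
#         store.rebuild_index()
#     query_vec = get_embedding_model().encode(["fintech startups"])[0]
#     for hit in store.search(query_vec, k=3):
#         print(hit["company_id"], hit["type"], round(hit["score"], 3))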