"""LanceDB vector store integration (``application/vectorstore/lancedb.py``).

Configuration is read from ``application.core.settings``:

* ``LANCEDB_PATH`` (declared default ``"/tmp/lancedb"``) -- where LanceDB
  stores its local data; used as the connection target.
* ``LANCEDB_TABLE_NAME`` (declared default ``"gptcache"``) -- table that holds
  the vectors.
* ``EMBEDDINGS_NAME`` -- embedding model handed to ``_get_embeddings``.

The change set that introduced this module also declares ``LANCEDB_URI``
(default ``"db://localhost:5432/lancedb"``), ``LANCEDB_API_KEY`` and
``LANCEDB_REGION`` (both default ``None``) in ``Settings`` and lists
``"lancedb"`` as an accepted ``VECTOR_STORE`` value; this module does not
read those three settings.
"""
import re
from typing import List, Optional

import lancedb
import pyarrow as pa

from application.core.settings import settings
from application.vectorstore.base import BaseVectorStore

# Column names accepted by ``filter_documents``. Anything else is rejected so
# that a caller-supplied key can never smuggle SQL into the generated predicate.
_IDENTIFIER_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)*")


class LanceDBVectorStore(BaseVectorStore):
    """Vector store backed by a LanceDB table.

    Each row has three columns: ``vector`` (fixed-size list of float32),
    ``text`` (string) and ``metadata`` (list of ``{"key", "value"}`` string
    pairs). The connection and the table handle are both created lazily.
    """

    def __init__(self, path: str = settings.LANCEDB_PATH,
                 table_name: str = settings.LANCEDB_TABLE_NAME,
                 embeddings_key: str = "embeddings"):
        """Store connection parameters; no I/O happens here.

        Args:
            path: Location passed to ``lancedb.connect``.
            table_name: Name of the table used for storing vectors.
            embeddings_key: Key forwarded to ``_get_embeddings``.
        """
        super().__init__()
        self.path = path
        self.table_name = table_name
        self.embeddings_key = embeddings_key
        self._lance_db = None  # connection, opened on first use
        self.docsearch = None  # table handle, opened/created on first use

    @property
    def lance_db(self):
        """Return the LanceDB connection, opening it on first access."""
        if self._lance_db is None:
            self._lance_db = lancedb.connect(self.path)
        return self._lance_db

    @property
    def table(self):
        """Return the table handle, or ``None`` if the table does not exist yet."""
        if self.docsearch is None and self.table_name in self.lance_db.table_names():
            self.docsearch = self.lance_db.open_table(self.table_name)
        return self.docsearch

    def _embeddings(self):
        """Return the embeddings object for the configured model.

        ``_get_embeddings`` is not defined in this class; presumably it is
        inherited from ``BaseVectorStore`` -- confirm against the base class.
        """
        return self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)

    def ensure_table_exists(self):
        """Create the table with the expected schema if it is missing."""
        # Compare against None explicitly: an existing-but-empty table must not
        # be mistaken for a missing one.
        if self.table is not None:
            return
        embeddings = self._embeddings()
        metadata_entry = pa.struct([
            pa.field("key", pa.string()),
            pa.field("value", pa.string()),
        ])
        schema = pa.schema([
            pa.field("vector", pa.list_(pa.float32(), list_size=embeddings.dimension)),
            pa.field("text", pa.string()),
            # add_texts() stores one {"key", "value"} struct per metadata
            # item, so the column has to be a *list* of structs.
            pa.field("metadata", pa.list_(metadata_entry)),
        ])
        self.docsearch = self.lance_db.create_table(self.table_name, schema=schema)

    def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None):
        """Embed ``texts`` and append them, with their metadata, to the table.

        Args:
            texts: Documents to embed and store.
            metadatas: Optional per-text metadata dicts. When omitted or empty,
                every text gets empty metadata. Values are stored as strings.

        Raises:
            ValueError: If ``metadatas`` is given but its length differs from
                ``texts`` (zip would otherwise silently drop rows).
        """
        if not texts:
            return  # nothing to embed or store
        if not metadatas:
            metadatas = [{} for _ in texts]
        elif len(metadatas) != len(texts):
            raise ValueError(
                f"metadatas length ({len(metadatas)}) does not match "
                f"texts length ({len(texts)})"
            )

        embeddings = self._embeddings().embed_documents(texts)
        rows = [
            {
                "vector": embedding,
                "text": text,
                "metadata": [
                    {"key": k, "value": str(v)} for k, v in (metadata or {}).items()
                ],
            }
            for embedding, text, metadata in zip(embeddings, texts, metadatas)
        ]
        self.ensure_table_exists()
        self.docsearch.add(rows)

    def search(self, query: str, k: int = 2, *args, **kwargs):
        """Return the ``k`` rows most similar to ``query``.

        Extra positional and keyword arguments are accepted and ignored.

        Returns:
            A list of ``(distance, text, metadata)`` tuples, taken from the
            ``_distance``, ``text`` and ``metadata`` fields of each result.
        """
        self.ensure_table_exists()
        query_embedding = self._embeddings().embed_query(query)
        results = self.docsearch.search(query_embedding).limit(k).to_list()
        return [
            (result["_distance"], result["text"], result["metadata"])
            for result in results
        ]

    def delete_index(self):
        """Drop the whole table, if it exists."""
        # ``is not None`` rather than truthiness, so an empty table is dropped too.
        if self.table is not None:
            self.lance_db.drop_table(self.table_name)
            self.docsearch = None  # forget the now-stale handle

    def assert_embedding_dimensions(self, embeddings):
        """Check that ``embeddings.dimension`` matches the table's vector width.

        Does nothing when the table does not exist yet.

        Raises:
            ValueError: If the two dimensions differ.
        """
        word_embedding_dimension = embeddings.dimension
        if self.table is None:
            return
        # The "vector" column is a fixed-size list; its width is ``list_size``
        # (``value_type`` is only the float32 element type).
        table_index_dimension = self.docsearch.schema.field("vector").type.list_size
        if word_embedding_dimension != table_index_dimension:
            raise ValueError(
                f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) "
                f"!= table index dimension ({table_index_dimension})"
            )

    @staticmethod
    def _sql_literal(value) -> str:
        """Render a Python value as a SQL literal, escaping single quotes."""
        if isinstance(value, bool):  # must precede the int check
            return "TRUE" if value else "FALSE"
        if isinstance(value, (int, float)):
            return repr(value)
        escaped = str(value).replace("'", "''")
        return f"'{escaped}'"

    @classmethod
    def _build_where_clause(cls, filter_condition: dict) -> Optional[str]:
        """Turn ``{column: value}`` into an AND-joined SQL equality predicate.

        Returns ``None`` for an empty dict. ``None`` values become ``IS NULL``.

        Raises:
            ValueError: If a key is not a plain (optionally dotted) identifier.
        """
        predicates = []
        for column, value in filter_condition.items():
            if not isinstance(column, str) or not _IDENTIFIER_RE.fullmatch(column):
                raise ValueError(f"Invalid filter column name: {column!r}")
            if value is None:
                predicates.append(f"{column} IS NULL")
            else:
                predicates.append(f"{column} = {cls._sql_literal(value)}")
        return " AND ".join(predicates) or None

    def filter_documents(self, filter_condition: dict) -> List[dict]:
        """Return the rows whose columns equal the values in ``filter_condition``.

        Args:
            filter_condition: Mapping of column name to required value; all
                entries must match. An empty mapping applies no filter.

        Raises:
            ValueError: If a column name is not a valid identifier.
        """
        self.ensure_table_exists()
        # A search without a query vector is a plain scan that can carry a
        # SQL ``where`` predicate.
        # NOTE(review): some LanceDB versions apply a default row limit to
        # query builders -- confirm whether an explicit .limit() is needed.
        query = self.docsearch.search()
        where_clause = self._build_where_clause(filter_condition or {})
        if where_clause is not None:
            query = query.where(where_clause)
        return query.to_list()