From 7e7551315140a20152ee9b802cb5ba78a778f044 Mon Sep 17 00:00:00 2001 From: akashAD98 Date: Thu, 12 Sep 2024 18:51:29 +0530 Subject: [PATCH 1/4] added support for lancedb as vectordb --- application/core/settings.py | 8 ++- application/vectorstore/lancedb.py | 93 ++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 application/vectorstore/lancedb.py diff --git a/application/core/settings.py b/application/core/settings.py index e6173be4..5659020b 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -18,7 +18,7 @@ class Settings(BaseSettings): DEFAULT_MAX_HISTORY: int = 150 MODEL_TOKEN_LIMITS: dict = {"gpt-3.5-turbo": 4096, "claude-2": 1e5} UPLOAD_FOLDER: str = "inputs" - VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" + VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb" RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"] # also brave_search API_URL: str = "http://localhost:7091" # backend url for celery worker @@ -67,6 +67,12 @@ class Settings(BaseSettings): MILVUS_URI: Optional[str] = "./milvus_local.db" # milvus lite version as default MILVUS_TOKEN: Optional[str] = "" + # LanceDB vectorstore config + LANCEDB_PATH: str = "/tmp/lancedb" # Path where LanceDB stores its local data + LANCEDB_URI: Optional[str] = "db://localhost:5432/lancedb" # URI for connecting to a LanceDB instance + LANCEDB_TABLE_NAME: Optional[str] = "gptcache" # Name of the table to use for storing vectors + LANCEDB_API_KEY: Optional[str] = None # API key for connecting to LanceDB cloud (if applicable) + LANCEDB_REGION: Optional[str] = None # Region for LanceDB cloud (if using cloud deployment) BRAVE_SEARCH_API_KEY: Optional[str] = None FLASK_DEBUG_MODE: bool = False diff --git a/application/vectorstore/lancedb.py b/application/vectorstore/lancedb.py new file mode 100644 index 00000000..caec57e9 --- /dev/null +++ 
b/application/vectorstore/lancedb.py @@ -0,0 +1,93 @@ +from typing import List, Optional +import pyarrow as pa +import lancedb +from application.vectorstore.base import BaseVectorStore +from application.core.settings import settings + +class LanceDBVectorStore(BaseVectorStore): + """Class for LanceDB Vector Store integration.""" + + def __init__(self, path: str = settings.LANCEDB_PATH, + table_name: str = settings.LANCEDB_TABLE_NAME, + embeddings_key: str = "embeddings"): + """Initialize the LanceDB vector store.""" + super().__init__() + self.path = path + self.table_name = table_name + self.embeddings_key = embeddings_key + self._lance_db = None # Updated to snake_case + self.docsearch = None + + @property + def lance_db(self): + """Lazy load the LanceDB connection.""" + if self._lance_db is None: + self._lance_db = lancedb.connect(self.path) + return self._lance_db + + @property + def table(self): + """Lazy load the LanceDB table.""" + if self.docsearch is None: + if self.table_name in self.lance_db.table_names(): + self.docsearch = self.lance_db.open_table(self.table_name) + else: + self.docsearch = None + return self.docsearch + + def ensure_table_exists(self): + """Ensure the table exists before performing operations.""" + if self.table is None: + embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key) + schema = pa.schema([ + pa.field("vector", pa.list_(pa.float32(), list_size=embeddings.dimension)), + pa.field("text", pa.string()), + pa.field("metadata", pa.struct([ + pa.field("key", pa.string()), + pa.field("value", pa.string()) + ])) + ]) + self.docsearch = self.lance_db.create_table(self.table_name, schema=schema) + + def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None): + """Add texts with metadata and their embeddings to the LanceDB table.""" + embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_documents(texts) + vectors = [] + for embedding, text, metadata in 
zip(embeddings, texts, metadatas or [{}] * len(texts)): + metadata_struct = [{"key": k, "value": str(v)} for k, v in metadata.items()] + vectors.append({ + "vector": embedding, + "text": text, + "metadata": metadata_struct + }) + self.ensure_table_exists() + self.docsearch.add(vectors) + + def search(self, query: str, k: int = 2, *args, **kwargs): + """Search LanceDB for the top k most similar vectors.""" + self.ensure_table_exists() + query_embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_query(query) + results = self.docsearch.search(query_embedding).limit(k).to_list() + return [(result["_distance"], result["text"], result["metadata"]) for result in results] + + def delete_index(self): + """Delete the entire LanceDB index (table).""" + if self.table: + self.lance_db.drop_table(self.table_name) + + def assert_embedding_dimensions(self, embeddings): + """Ensure that embedding dimensions match the table index dimensions.""" + word_embedding_dimension = embeddings.dimension + if self.table: + table_index_dimension = len(self.docsearch.schema["vector"].type.value_type) + if word_embedding_dimension != table_index_dimension: + raise ValueError( + f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) " + f"!= table index dimension ({table_index_dimension})" + ) + + def filter_documents(self, filter_condition: dict) -> List[dict]: + """Filter documents based on certain conditions.""" + self.ensure_table_exists() + filtered_data = self.docsearch.filter(filter_condition).to_list() + return filtered_data \ No newline at end of file From 66ebfef619fa8df6da2188c554d39b8bb6db648b Mon Sep 17 00:00:00 2001 From: akashAD98 Date: Thu, 12 Sep 2024 18:55:01 +0530 Subject: [PATCH 2/4] added lancedb to package requirements --- application/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/application/requirements.txt b/application/requirements.txt index d9e9edef..f1b9e3c8 100644 --- a/application/requirements.txt +++ 
b/application/requirements.txt @@ -18,6 +18,7 @@ langchain==0.2.16 langchain-community==0.2.16 langchain-core==0.2.38 langchain-openai==0.1.23 +lancedb==0.13.0 openapi3_parser==1.1.16 pandas==2.2.2 pydantic_settings==2.4.0 From 24383997ef185ba0c5726f429ae473d7838c622e Mon Sep 17 00:00:00 2001 From: akashmangoai Date: Fri, 11 Oct 2024 21:53:29 +0530 Subject: [PATCH 3/4] removed cloud-based parameters which are not needed --- application/core/settings.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/application/core/settings.py b/application/core/settings.py index 5659020b..c8fe9c4a 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -69,10 +69,7 @@ class Settings(BaseSettings): # LanceDB vectorstore config LANCEDB_PATH: str = "/tmp/lancedb" # Path where LanceDB stores its local data - LANCEDB_URI: Optional[str] = "db://localhost:5432/lancedb" # URI for connecting to a LanceDB instance - LANCEDB_TABLE_NAME: Optional[str] = "gptcache" # Name of the table to use for storing vectors - LANCEDB_API_KEY: Optional[str] = None # API key for connecting to LanceDB cloud (if applicable) - LANCEDB_REGION: Optional[str] = None # Region for LanceDB cloud (if using cloud deployment) + LANCEDB_TABLE_NAME: Optional[str] = "docsgpts" # Name of the table to use for storing vectors BRAVE_SEARCH_API_KEY: Optional[str] = None FLASK_DEBUG_MODE: bool = False From a9f6a06446be28408de07146cc27c0774448ac44 Mon Sep 17 00:00:00 2001 From: akashmangoai Date: Fri, 11 Oct 2024 22:58:18 +0530 Subject: [PATCH 4/4] lazy import & fixed other issue --- application/vectorstore/lancedb.py | 54 ++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/application/vectorstore/lancedb.py b/application/vectorstore/lancedb.py index caec57e9..25d62318 100644 --- a/application/vectorstore/lancedb.py +++ b/application/vectorstore/lancedb.py @@ -1,6 +1,5 @@ from typing import List, Optional -import pyarrow as pa -import lancedb 
+import importlib from application.vectorstore.base import BaseVectorStore from application.core.settings import settings @@ -8,21 +7,37 @@ class LanceDBVectorStore(BaseVectorStore): """Class for LanceDB Vector Store integration.""" def __init__(self, path: str = settings.LANCEDB_PATH, - table_name: str = settings.LANCEDB_TABLE_NAME, + table_name_prefix: str = settings.LANCEDB_TABLE_NAME, + source_id: str = None, embeddings_key: str = "embeddings"): """Initialize the LanceDB vector store.""" super().__init__() self.path = path - self.table_name = table_name + self.table_name = f"{table_name_prefix}_{source_id}" if source_id else table_name_prefix self.embeddings_key = embeddings_key - self._lance_db = None # Updated to snake_case + self._lance_db = None self.docsearch = None + self._pa = None # PyArrow (pa) will be lazy loaded + + @property + def pa(self): + """Lazy load pyarrow module.""" + if self._pa is None: + self._pa = importlib.import_module("pyarrow") + return self._pa + + @property + def lancedb(self): + """Lazy load lancedb module.""" + if not hasattr(self, "_lancedb_module"): + self._lancedb_module = importlib.import_module("lancedb") + return self._lancedb_module @property def lance_db(self): """Lazy load the LanceDB connection.""" if self._lance_db is None: - self._lance_db = lancedb.connect(self.path) + self._lance_db = self.lancedb.connect(self.path) return self._lance_db @property @@ -39,21 +54,23 @@ class LanceDBVectorStore(BaseVectorStore): """Ensure the table exists before performing operations.""" if self.table is None: embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key) - schema = pa.schema([ - pa.field("vector", pa.list_(pa.float32(), list_size=embeddings.dimension)), - pa.field("text", pa.string()), - pa.field("metadata", pa.struct([ - pa.field("key", pa.string()), - pa.field("value", pa.string()) + schema = self.pa.schema([ + self.pa.field("vector", self.pa.list_(self.pa.float32(), 
list_size=embeddings.dimension)), + self.pa.field("text", self.pa.string()), + self.pa.field("metadata", self.pa.struct([ + self.pa.field("key", self.pa.string()), + self.pa.field("value", self.pa.string()) ])) ]) self.docsearch = self.lance_db.create_table(self.table_name, schema=schema) - def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None): + def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None, source_id: str = None): """Add texts with metadata and their embeddings to the LanceDB table.""" embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_documents(texts) vectors = [] for embedding, text, metadata in zip(embeddings, texts, metadatas or [{}] * len(texts)): + if source_id: + metadata["source_id"] = source_id metadata_struct = [{"key": k, "value": str(v)} for k, v in metadata.items()] vectors.append({ "vector": embedding, @@ -89,5 +106,14 @@ class LanceDBVectorStore(BaseVectorStore): def filter_documents(self, filter_condition: dict) -> List[dict]: """Filter documents based on certain conditions.""" self.ensure_table_exists() - filtered_data = self.docsearch.filter(filter_condition).to_list() + + # Ensure source_id exists in the filter condition + if 'source_id' not in filter_condition: + raise ValueError("filter_condition must contain 'source_id'") + + source_id = filter_condition["source_id"] + + # Use LanceDB's native filtering if supported, otherwise filter manually + filtered_data = self.docsearch.filter(lambda x: x.metadata and x.metadata.get("source_id") == source_id).to_list() + return filtered_data \ No newline at end of file