From 7e7551315140a20152ee9b802cb5ba78a778f044 Mon Sep 17 00:00:00 2001
From: akashAD98 <aksdesai1998@gmail.com>
Date: Thu, 12 Sep 2024 18:51:29 +0530
Subject: [PATCH 1/4] added support for lancedb as vectordb

---
 application/core/settings.py       |  8 ++-
 application/vectorstore/lancedb.py | 93 ++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 application/vectorstore/lancedb.py

diff --git a/application/core/settings.py b/application/core/settings.py
index e6173be4..5659020b 100644
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -18,7 +18,7 @@ class Settings(BaseSettings):
     DEFAULT_MAX_HISTORY: int = 150
     MODEL_TOKEN_LIMITS: dict = {"gpt-3.5-turbo": 4096, "claude-2": 1e5}
     UPLOAD_FOLDER: str = "inputs"
-    VECTOR_STORE: str = "faiss"  # "faiss" or "elasticsearch" or "qdrant" or "milvus"
+    VECTOR_STORE: str = "faiss" #  "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb"
     RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"] # also brave_search
 
     API_URL: str = "http://localhost:7091"  # backend url for celery worker
@@ -67,6 +67,12 @@ class Settings(BaseSettings):
     MILVUS_URI: Optional[str] = "./milvus_local.db"   # milvus lite version as default
     MILVUS_TOKEN: Optional[str] = ""
 
+    # LanceDB vectorstore config
+    LANCEDB_PATH: str = "/tmp/lancedb"  # Path where LanceDB stores its local data
+    LANCEDB_URI: Optional[str] = "db://localhost:5432/lancedb"  # URI for connecting to a LanceDB instance
+    LANCEDB_TABLE_NAME: Optional[str] = "gptcache"  # Name of the table to use for storing vectors
+    LANCEDB_API_KEY: Optional[str] = None  # API key for connecting to LanceDB cloud (if applicable)
+    LANCEDB_REGION: Optional[str] = None  # Region for LanceDB cloud (if using cloud deployment)
     BRAVE_SEARCH_API_KEY: Optional[str] = None
 
     FLASK_DEBUG_MODE: bool = False
diff --git a/application/vectorstore/lancedb.py b/application/vectorstore/lancedb.py
new file mode 100644
index 00000000..caec57e9
--- /dev/null
+++ b/application/vectorstore/lancedb.py
@@ -0,0 +1,93 @@
+from typing import List, Optional
+import pyarrow as pa
+import lancedb
+from application.vectorstore.base import BaseVectorStore
+from application.core.settings import settings
+
+class LanceDBVectorStore(BaseVectorStore):
+    """Class for LanceDB Vector Store integration."""
+
+    def __init__(self, path: str = settings.LANCEDB_PATH,
+                 table_name: str = settings.LANCEDB_TABLE_NAME,
+                 embeddings_key: str = "embeddings"):
+        """Initialize the LanceDB vector store."""
+        super().__init__()
+        self.path = path
+        self.table_name = table_name
+        self.embeddings_key = embeddings_key
+        self._lance_db = None  # Updated to snake_case
+        self.docsearch = None
+
+    @property
+    def lance_db(self):
+        """Lazy load the LanceDB connection."""
+        if self._lance_db is None:
+            self._lance_db = lancedb.connect(self.path)
+        return self._lance_db
+
+    @property
+    def table(self):
+        """Lazy load the LanceDB table."""
+        if self.docsearch is None:
+            if self.table_name in self.lance_db.table_names():
+                self.docsearch = self.lance_db.open_table(self.table_name)
+            else:
+                self.docsearch = None
+        return self.docsearch
+
+    def ensure_table_exists(self):
+        """Ensure the table exists before performing operations."""
+        if self.table is None:
+            embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
+            schema = pa.schema([
+                pa.field("vector", pa.list_(pa.float32(), list_size=embeddings.dimension)),
+                pa.field("text", pa.string()),
+                pa.field("metadata", pa.struct([
+                    pa.field("key", pa.string()),
+                    pa.field("value", pa.string())
+                ]))
+            ])
+            self.docsearch = self.lance_db.create_table(self.table_name, schema=schema)
+
+    def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None):
+        """Add texts with metadata and their embeddings to the LanceDB table."""
+        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_documents(texts)
+        vectors = []
+        for embedding, text, metadata in zip(embeddings, texts, metadatas or [{}] * len(texts)):
+            metadata_struct = [{"key": k, "value": str(v)} for k, v in metadata.items()]
+            vectors.append({
+                "vector": embedding,
+                "text": text,
+                "metadata": metadata_struct
+            })
+        self.ensure_table_exists()
+        self.docsearch.add(vectors)
+
+    def search(self, query: str, k: int = 2, *args, **kwargs):
+        """Search LanceDB for the top k most similar vectors."""
+        self.ensure_table_exists()
+        query_embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_query(query)
+        results = self.docsearch.search(query_embedding).limit(k).to_list()
+        return [(result["_distance"], result["text"], result["metadata"]) for result in results]
+
+    def delete_index(self):
+        """Delete the entire LanceDB index (table)."""
+        if self.table:
+            self.lance_db.drop_table(self.table_name)
+
+    def assert_embedding_dimensions(self, embeddings):
+        """Ensure that embedding dimensions match the table index dimensions."""
+        word_embedding_dimension = embeddings.dimension
+        if self.table:
+            table_index_dimension = len(self.docsearch.schema["vector"].type.value_type)
+            if word_embedding_dimension != table_index_dimension:
+                raise ValueError(
+                    f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) "
+                    f"!= table index dimension ({table_index_dimension})"
+                )
+
+    def filter_documents(self, filter_condition: dict) -> List[dict]:
+        """Filter documents based on certain conditions."""
+        self.ensure_table_exists()
+        filtered_data = self.docsearch.filter(filter_condition).to_list()
+        return filtered_data
\ No newline at end of file

From 66ebfef619fa8df6da2188c554d39b8bb6db648b Mon Sep 17 00:00:00 2001
From: akashAD98 <aksdesai1998@gmail.com>
Date: Thu, 12 Sep 2024 18:55:01 +0530
Subject: [PATCH 2/4] added lancedb to package requirements

---
 application/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/application/requirements.txt b/application/requirements.txt
index d9e9edef..f1b9e3c8 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -18,6 +18,7 @@ langchain==0.2.16
 langchain-community==0.2.16
 langchain-core==0.2.38
 langchain-openai==0.1.23
+lancedb==0.13.0
 openapi3_parser==1.1.16
 pandas==2.2.2
 pydantic_settings==2.4.0

From 24383997ef185ba0c5726f429ae473d7838c622e Mon Sep 17 00:00:00 2001
From: akashmangoai <akashd@mangospring.com>
Date: Fri, 11 Oct 2024 21:53:29 +0530
Subject: [PATCH 3/4] removed cloud-based parameters that are not needed

---
 application/core/settings.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/application/core/settings.py b/application/core/settings.py
index 5659020b..c8fe9c4a 100644
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -69,10 +69,7 @@ class Settings(BaseSettings):
 
     # LanceDB vectorstore config
     LANCEDB_PATH: str = "/tmp/lancedb"  # Path where LanceDB stores its local data
-    LANCEDB_URI: Optional[str] = "db://localhost:5432/lancedb"  # URI for connecting to a LanceDB instance
-    LANCEDB_TABLE_NAME: Optional[str] = "gptcache"  # Name of the table to use for storing vectors
-    LANCEDB_API_KEY: Optional[str] = None  # API key for connecting to LanceDB cloud (if applicable)
-    LANCEDB_REGION: Optional[str] = None  # Region for LanceDB cloud (if using cloud deployment)
+    LANCEDB_TABLE_NAME: Optional[str] = "docsgpts"  # Name of the table to use for storing vectors
     BRAVE_SEARCH_API_KEY: Optional[str] = None
 
     FLASK_DEBUG_MODE: bool = False

From a9f6a06446be28408de07146cc27c0774448ac44 Mon Sep 17 00:00:00 2001
From: akashmangoai <akashd@mangospring.com>
Date: Fri, 11 Oct 2024 22:58:18 +0530
Subject: [PATCH 4/4] lazy import & fixed other issues

---
 application/vectorstore/lancedb.py | 54 ++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/application/vectorstore/lancedb.py b/application/vectorstore/lancedb.py
index caec57e9..25d62318 100644
--- a/application/vectorstore/lancedb.py
+++ b/application/vectorstore/lancedb.py
@@ -1,6 +1,5 @@
 from typing import List, Optional
-import pyarrow as pa
-import lancedb
+import importlib
 from application.vectorstore.base import BaseVectorStore
 from application.core.settings import settings
 
@@ -8,21 +7,37 @@ class LanceDBVectorStore(BaseVectorStore):
     """Class for LanceDB Vector Store integration."""
 
     def __init__(self, path: str = settings.LANCEDB_PATH,
-                 table_name: str = settings.LANCEDB_TABLE_NAME,
+                 table_name_prefix: str = settings.LANCEDB_TABLE_NAME,
+                 source_id: str = None,
                  embeddings_key: str = "embeddings"):
         """Initialize the LanceDB vector store."""
         super().__init__()
         self.path = path
-        self.table_name = table_name
+        self.table_name = f"{table_name_prefix}_{source_id}" if source_id else table_name_prefix
         self.embeddings_key = embeddings_key
-        self._lance_db = None  # Updated to snake_case
+        self._lance_db = None
         self.docsearch = None
+        self._pa = None  # PyArrow (pa) will be lazy loaded
+
+    @property
+    def pa(self):
+        """Lazy load pyarrow module."""
+        if self._pa is None:
+            self._pa = importlib.import_module("pyarrow")
+        return self._pa
+
+    @property
+    def lancedb(self):
+        """Lazy load lancedb module."""
+        if not hasattr(self, "_lancedb_module"):
+            self._lancedb_module = importlib.import_module("lancedb")
+        return self._lancedb_module
 
     @property
     def lance_db(self):
         """Lazy load the LanceDB connection."""
         if self._lance_db is None:
-            self._lance_db = lancedb.connect(self.path)
+            self._lance_db = self.lancedb.connect(self.path)
         return self._lance_db
 
     @property
@@ -39,21 +54,23 @@ class LanceDBVectorStore(BaseVectorStore):
         """Ensure the table exists before performing operations."""
         if self.table is None:
             embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
-            schema = pa.schema([
-                pa.field("vector", pa.list_(pa.float32(), list_size=embeddings.dimension)),
-                pa.field("text", pa.string()),
-                pa.field("metadata", pa.struct([
-                    pa.field("key", pa.string()),
-                    pa.field("value", pa.string())
+            schema = self.pa.schema([
+                self.pa.field("vector", self.pa.list_(self.pa.float32(), list_size=embeddings.dimension)),
+                self.pa.field("text", self.pa.string()),
+                self.pa.field("metadata", self.pa.struct([
+                    self.pa.field("key", self.pa.string()),
+                    self.pa.field("value", self.pa.string())
                 ]))
             ])
             self.docsearch = self.lance_db.create_table(self.table_name, schema=schema)
 
-    def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None):
+    def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None, source_id: str = None):
         """Add texts with metadata and their embeddings to the LanceDB table."""
         embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_documents(texts)
         vectors = []
         for embedding, text, metadata in zip(embeddings, texts, metadatas or [{}] * len(texts)):
+            if source_id:
+                metadata["source_id"] = source_id
             metadata_struct = [{"key": k, "value": str(v)} for k, v in metadata.items()]
             vectors.append({
                 "vector": embedding,
@@ -89,5 +106,14 @@ class LanceDBVectorStore(BaseVectorStore):
     def filter_documents(self, filter_condition: dict) -> List[dict]:
         """Filter documents based on certain conditions."""
         self.ensure_table_exists()
-        filtered_data = self.docsearch.filter(filter_condition).to_list()
+
+        # Ensure source_id exists in the filter condition
+        if 'source_id' not in filter_condition:
+            raise ValueError("filter_condition must contain 'source_id'")
+
+        source_id = filter_condition["source_id"]
+
+        # Keep only the rows whose metadata carries the requested source_id
+        filtered_data = self.docsearch.filter(lambda x: x.metadata and x.metadata.get("source_id") == source_id).to_list()
+
         return filtered_data
\ No newline at end of file