From 293b7b09a98f220d139610ac9c2ef73cd2b74607 Mon Sep 17 00:00:00 2001
From: Alex
Date: Fri, 5 Jan 2024 17:16:16 +0000
Subject: [PATCH 1/4] Add initial MongoDB vector store and extract shared Document class
---
application/vectorstore/document_class.py | 8 ++
application/vectorstore/elasticsearch.py | 10 +--
application/vectorstore/mongodb.py | 92 +++++++++++++++++++++++
3 files changed, 101 insertions(+), 9 deletions(-)
create mode 100644 application/vectorstore/document_class.py
create mode 100644 application/vectorstore/mongodb.py
diff --git a/application/vectorstore/document_class.py b/application/vectorstore/document_class.py
new file mode 100644
index 00000000..30d70a56
--- /dev/null
+++ b/application/vectorstore/document_class.py
@@ -0,0 +1,8 @@
+class Document(str):
+ """Class for storing a piece of text and associated metadata."""
+
+ def __new__(cls, page_content: str, metadata: dict):
+ instance = super().__new__(cls, page_content)
+ instance.page_content = page_content
+ instance.metadata = metadata
+ return instance
diff --git a/application/vectorstore/elasticsearch.py b/application/vectorstore/elasticsearch.py
index 734b3406..bb28d5ce 100644
--- a/application/vectorstore/elasticsearch.py
+++ b/application/vectorstore/elasticsearch.py
@@ -1,16 +1,8 @@
from application.vectorstore.base import BaseVectorStore
from application.core.settings import settings
+from application.vectorstore.document_class import Document
import elasticsearch
-class Document(str):
- """Class for storing a piece of text and associated metadata."""
-
- def __new__(cls, page_content: str, metadata: dict):
- instance = super().__new__(cls, page_content)
- instance.page_content = page_content
- instance.metadata = metadata
- return instance
-
diff --git a/application/vectorstore/mongodb.py b/application/vectorstore/mongodb.py
new file mode 100644
index 00000000..b0fd13c1
--- /dev/null
+++ b/application/vectorstore/mongodb.py
@@ -0,0 +1,92 @@
+from application.vectorstore.base import BaseVectorStore
+from application.core.settings import settings
+from application.vectorstore.document_class import Document
+
+class MongoDBVectorStore(BaseVectorStore):
+ def __init__(
+ self,
+ collection: str = "documents",
+ index_name: str = "default",
+ text_key: str = "text",
+ embedding_key: str = "embedding",
+ embedding_api_key: str = "embedding_api_key",
+ path: str = "",
+ ):
+ self._collection = collection
+ self._embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embedding_api_key)
+ self._index_name = index_name
+ self._text_key = text_key
+ self._embedding_key = embedding_key
+ self._mongo_uri = settings.MONGO_URI
+ self._path = path
+ # import pymongo
+ try:
+ import pymongo
+ except ImportError:
+ raise ImportError(
+ "Could not import pymongo python package. "
+ "Please install it with `pip install pymongo`."
+ )
+ self._client = pymongo.MongoClient(self._mongo_uri)
+
+ def search(self, question, k=2, *args, **kwargs):
+ query_vector = self._embeddings.embed_query(question)
+
+ pipeline = [
+ {
+ "$vectorSearch": {
+ "queryVector": query_vector,
+ "path": self._embedding_key,
+ "limit": k,
+ "index": self._index_name
+ }
+ }
+ ]
+
+ cursor = self._client._collection.aggregate(pipeline)
+
+ results = []
+ for doc in cursor:
+ text = doc[self._text_key]
+ metadata = doc
+ results.append(Document(text, metadata))
+
+ return results
+
+ def _insert_texts(self, texts, metadatas):
+ if not texts:
+ return []
+ embeddings = self._embedding.embed_documents(texts)
+ to_insert = [
+ {self._text_key: t, self._embedding_key: embedding, **m}
+ for t, m, embedding in zip(texts, metadatas, embeddings)
+ ]
+ # insert the documents in MongoDB Atlas
+ insert_result = self.client._collection.insert_many(to_insert)
+ return insert_result.inserted_ids
+
+ def add_texts(self,
+ texts,
+ metadatas = None,
+ ids = None,
+ refresh_indices = True,
+ create_index_if_not_exists = True,
+ bulk_kwargs = None,
+ **kwargs,):
+
+
+ batch_size = 100
+ _metadatas = metadatas or ({} for _ in texts)
+ texts_batch = []
+ metadatas_batch = []
+ result_ids = []
+ for i, (text, metadata) in enumerate(zip(texts, _metadatas)):
+ texts_batch.append(text)
+ metadatas_batch.append(metadata)
+ if (i + 1) % batch_size == 0:
+ result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
+ texts_batch = []
+ metadatas_batch = []
+ if texts_batch:
+ result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
+ return result_ids
\ No newline at end of file
From 4ff834de76abf2194b8520ff2a1bdf065b78dbf8 Mon Sep 17 00:00:00 2001
From: Alex
Date: Sat, 6 Jan 2024 17:59:01 +0000
Subject: [PATCH 2/4] Refactor MongoDBVectorStore and add delete_index method
---
application/vectorstore/mongodb.py | 64 +++++++++++++++++------
application/vectorstore/vector_creator.py | 4 +-
2 files changed, 52 insertions(+), 16 deletions(-)
diff --git a/application/vectorstore/mongodb.py b/application/vectorstore/mongodb.py
index b0fd13c1..337fc41f 100644
--- a/application/vectorstore/mongodb.py
+++ b/application/vectorstore/mongodb.py
@@ -5,21 +5,22 @@ from application.vectorstore.document_class import Document
class MongoDBVectorStore(BaseVectorStore):
def __init__(
self,
+ path: str = "",
+ embeddings_key: str = "embeddings",
collection: str = "documents",
- index_name: str = "default",
+ index_name: str = "vector_search_index",
text_key: str = "text",
embedding_key: str = "embedding",
- embedding_api_key: str = "embedding_api_key",
- path: str = "",
+ database: str = "docsgpt",
):
- self._collection = collection
- self._embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embedding_api_key)
self._index_name = index_name
self._text_key = text_key
self._embedding_key = embedding_key
+ self._embeddings_key = embeddings_key
self._mongo_uri = settings.MONGO_URI
- self._path = path
- # import pymongo
+ self._path = path.replace("application/indexes/", "").rstrip("/")
+ self._embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
+
try:
import pymongo
except ImportError:
@@ -27,30 +28,40 @@ class MongoDBVectorStore(BaseVectorStore):
"Could not import pymongo python package. "
"Please install it with `pip install pymongo`."
)
+
self._client = pymongo.MongoClient(self._mongo_uri)
+ self._database = self._client[database]
+ self._collection = self._database[collection]
+
def search(self, question, k=2, *args, **kwargs):
- query_vector = self._embeddings.embed_query(question)
-
+ query_vector = self._embedding.embed_query(question)
+
pipeline = [
{
"$vectorSearch": {
"queryVector": query_vector,
"path": self._embedding_key,
"limit": k,
- "index": self._index_name
+ "numCandidates": k * 10,
+ "index": self._index_name,
+ "filter": {
+ "store": {"$eq": self._path}
+ }
}
}
]
-
- cursor = self._client._collection.aggregate(pipeline)
+
+ cursor = self._collection.aggregate(pipeline)
results = []
for doc in cursor:
text = doc[self._text_key]
+ doc.pop("_id")
+ doc.pop(self._text_key)
+ doc.pop(self._embedding_key)
metadata = doc
results.append(Document(text, metadata))
-
return results
def _insert_texts(self, texts, metadatas):
@@ -62,7 +73,7 @@ class MongoDBVectorStore(BaseVectorStore):
for t, m, embedding in zip(texts, metadatas, embeddings)
]
# insert the documents in MongoDB Atlas
- insert_result = self.client._collection.insert_many(to_insert)
+ insert_result = self._collection.insert_many(to_insert)
return insert_result.inserted_ids
def add_texts(self,
@@ -75,6 +86,26 @@ class MongoDBVectorStore(BaseVectorStore):
**kwargs,):
+ #dims = self._embedding.client[1].word_embedding_dimension
+ # # check if index exists
+ # if create_index_if_not_exists:
+ # # check if index exists
+ # info = self._collection.index_information()
+ # if self._index_name not in info:
+ # index_mongo = {
+ # "fields": [{
+ # "type": "vector",
+ # "path": self._embedding_key,
+ # "numDimensions": dims,
+ # "similarity": "cosine",
+ # },
+ # {
+ # "type": "filter",
+ # "path": "store"
+ # }]
+ # }
+ # self._collection.create_index(self._index_name, index_mongo)
+
batch_size = 100
_metadatas = metadatas or ({} for _ in texts)
texts_batch = []
@@ -89,4 +120,7 @@ class MongoDBVectorStore(BaseVectorStore):
metadatas_batch = []
if texts_batch:
result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
- return result_ids
\ No newline at end of file
+ return result_ids
+
+ def delete_index(self, *args, **kwargs):
+ self._collection.delete_many({"store": self._path})
\ No newline at end of file
diff --git a/application/vectorstore/vector_creator.py b/application/vectorstore/vector_creator.py
index cbc491f5..68ae2813 100644
--- a/application/vectorstore/vector_creator.py
+++ b/application/vectorstore/vector_creator.py
@@ -1,11 +1,13 @@
from application.vectorstore.faiss import FaissStore
from application.vectorstore.elasticsearch import ElasticsearchStore
+from application.vectorstore.mongodb import MongoDBVectorStore
class VectorCreator:
vectorstores = {
'faiss': FaissStore,
- 'elasticsearch':ElasticsearchStore
+ 'elasticsearch':ElasticsearchStore,
+ 'mongodb': MongoDBVectorStore,
}
@classmethod
From 1fa12e56c63e68f4821487b7bd03303b2f048806 Mon Sep 17 00:00:00 2001
From: Alex
Date: Sat, 6 Jan 2024 18:04:50 +0000
Subject: [PATCH 3/4] Remove unused test cases in test_openai.py
---
tests/llm/test_openai.py | 20 --------------------
1 file changed, 20 deletions(-)
diff --git a/tests/llm/test_openai.py b/tests/llm/test_openai.py
index d1c63c63..445de223 100644
--- a/tests/llm/test_openai.py
+++ b/tests/llm/test_openai.py
@@ -10,23 +10,3 @@ class TestOpenAILLM(unittest.TestCase):
def test_init(self):
self.assertEqual(self.llm.api_key, self.api_key)
-
- @patch('application.llm.openai.openai.ChatCompletion.create')
- def test_gen(self, mock_create):
- model = "test_model"
- engine = "test_engine"
- messages = ["test_message"]
- response = {"choices": [{"message": {"content": "test_response"}}]}
- mock_create.return_value = response
- result = self.llm.gen(model, engine, messages)
- self.assertEqual(result, "test_response")
-
- @patch('application.llm.openai.openai.ChatCompletion.create')
- def test_gen_stream(self, mock_create):
- model = "test_model"
- engine = "test_engine"
- messages = ["test_message"]
- response = [{"choices": [{"delta": {"content": "test_response"}}]}]
- mock_create.return_value = response
- result = list(self.llm.gen_stream(model, engine, messages))
- self.assertEqual(result, ["test_response"])
From a3e6239e6e15db60246016e4a9de487fcb030126 Mon Sep 17 00:00:00 2001
From: Alex
Date: Sat, 6 Jan 2024 18:23:20 +0000
Subject: [PATCH 4/4] fix: remove unused unittest.mock.patch import from test_openai.py
---
tests/llm/test_openai.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/tests/llm/test_openai.py b/tests/llm/test_openai.py
index 445de223..8c713178 100644
--- a/tests/llm/test_openai.py
+++ b/tests/llm/test_openai.py
@@ -1,5 +1,4 @@
import unittest
-from unittest.mock import patch
from application.llm.openai import OpenAILLM
class TestOpenAILLM(unittest.TestCase):