From 293b7b09a98f220d139610ac9c2ef73cd2b74607 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 5 Jan 2024 17:16:16 +0000 Subject: [PATCH 1/4] init tests --- application/vectorstore/document_class.py | 8 ++ application/vectorstore/elasticsearch.py | 10 +-- application/vectorstore/mongodb.py | 92 +++++++++++++++++++++++ 3 files changed, 101 insertions(+), 9 deletions(-) create mode 100644 application/vectorstore/document_class.py create mode 100644 application/vectorstore/mongodb.py diff --git a/application/vectorstore/document_class.py b/application/vectorstore/document_class.py new file mode 100644 index 00000000..30d70a56 --- /dev/null +++ b/application/vectorstore/document_class.py @@ -0,0 +1,8 @@ +class Document(str): + """Class for storing a piece of text and associated metadata.""" + + def __new__(cls, page_content: str, metadata: dict): + instance = super().__new__(cls, page_content) + instance.page_content = page_content + instance.metadata = metadata + return instance diff --git a/application/vectorstore/elasticsearch.py b/application/vectorstore/elasticsearch.py index 734b3406..bb28d5ce 100644 --- a/application/vectorstore/elasticsearch.py +++ b/application/vectorstore/elasticsearch.py @@ -1,16 +1,8 @@ from application.vectorstore.base import BaseVectorStore from application.core.settings import settings +from application.vectorstore.document_class import Document import elasticsearch -class Document(str): - """Class for storing a piece of text and associated metadata.""" - - def __new__(cls, page_content: str, metadata: dict): - instance = super().__new__(cls, page_content) - instance.page_content = page_content - instance.metadata = metadata - return instance - diff --git a/application/vectorstore/mongodb.py b/application/vectorstore/mongodb.py new file mode 100644 index 00000000..b0fd13c1 --- /dev/null +++ b/application/vectorstore/mongodb.py @@ -0,0 +1,92 @@ +from application.vectorstore.base import BaseVectorStore +from application.core.settings import settings +from application.vectorstore.document_class import Document + +class MongoDBVectorStore(BaseVectorStore): + def __init__( + self, + collection: str = "documents", + index_name: str = "default", + text_key: str = "text", + embedding_key: str = "embedding", + embedding_api_key: str = "embedding_api_key", + path: str = "", + ): + self._collection = collection + self._embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embedding_api_key) + self._index_name = index_name + self._text_key = text_key + self._embedding_key = embedding_key + self._mongo_uri = settings.MONGO_URI + self._path = path + # import pymongo + try: + import pymongo + except ImportError: + raise ImportError( + "Could not import pymongo python package. " + "Please install it with `pip install pymongo`." + ) + self._client = pymongo.MongoClient(self._mongo_uri) + + def search(self, question, k=2, *args, **kwargs): + query_vector = self._embeddings.embed_query(question) + + pipeline = [ + { + "$vectorSearch": { + "queryVector": query_vector, + "path": self._embedding_key, + "limit": k, + "index": self._index_name + } + } + ] + + cursor = self._client._collection.aggregate(pipeline) + + results = [] + for doc in cursor: + text = doc[self._text_key] + metadata = doc + results.append(Document(text, metadata)) + + return results + + def _insert_texts(self, texts, metadatas): + if not texts: + return [] + embeddings = self._embedding.embed_documents(texts) + to_insert = [ + {self._text_key: t, self._embedding_key: embedding, **m} + for t, m, embedding in zip(texts, metadatas, embeddings) + ] + # insert the documents in MongoDB Atlas + insert_result = self.client._collection.insert_many(to_insert) + return insert_result.inserted_ids + + def add_texts(self, + texts, + metadatas = None, + ids = None, + refresh_indices = True, + create_index_if_not_exists = True, + bulk_kwargs = None, + **kwargs,): + + + batch_size = 100 + _metadatas = metadatas or ({} for _ in texts) + texts_batch = [] + metadatas_batch = [] + result_ids = [] + for i, (text, metadata) in enumerate(zip(texts, _metadatas)): + texts_batch.append(text) + metadatas_batch.append(metadata) + if (i + 1) % batch_size == 0: + result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) + texts_batch = [] + metadatas_batch = [] + if texts_batch: + result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) + return result_ids \ No newline at end of file From 4ff834de76abf2194b8520ff2a1bdf065b78dbf8 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 6 Jan 2024 17:59:01 +0000 Subject: [PATCH 2/4] Refactor MongoDBVectorStore and add delete_index method --- application/vectorstore/mongodb.py | 64 +++++++++++++++++------ application/vectorstore/vector_creator.py | 4 +- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/application/vectorstore/mongodb.py b/application/vectorstore/mongodb.py index b0fd13c1..337fc41f 100644 --- a/application/vectorstore/mongodb.py +++ b/application/vectorstore/mongodb.py @@ -5,21 +5,22 @@ from application.vectorstore.document_class import Document class MongoDBVectorStore(BaseVectorStore): def __init__( self, + path: str = "", + embeddings_key: str = "embeddings", collection: str = "documents", - index_name: str = "default", + index_name: str = "vector_search_index", text_key: str = "text", embedding_key: str = "embedding", - embedding_api_key: str = "embedding_api_key", - path: str = "", + database: str = "docsgpt", ): - self._collection = collection - self._embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embedding_api_key) self._index_name = index_name self._text_key = text_key self._embedding_key = embedding_key + self._embeddings_key = embeddings_key self._mongo_uri = settings.MONGO_URI - self._path = path - # import pymongo + self._path = path.replace("application/indexes/", "").rstrip("/") + self._embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) + try: import pymongo except ImportError: @@ -27,30 +28,40 @@ class MongoDBVectorStore(BaseVectorStore): "Could not import pymongo python package. " "Please install it with `pip install pymongo`." ) + self._client = pymongo.MongoClient(self._mongo_uri) + self._database = self._client[database] + self._collection = self._database[collection] + def search(self, question, k=2, *args, **kwargs): - query_vector = self._embeddings.embed_query(question) - + query_vector = self._embedding.embed_query(question) + pipeline = [ { "$vectorSearch": { "queryVector": query_vector, "path": self._embedding_key, "limit": k, - "index": self._index_name + "numCandidates": k * 10, + "index": self._index_name, + "filter": { + "store": {"$eq": self._path} + } } } ] - - cursor = self._client._collection.aggregate(pipeline) + + cursor = self._collection.aggregate(pipeline) results = [] for doc in cursor: text = doc[self._text_key] + doc.pop("_id") + doc.pop(self._text_key) + doc.pop(self._embedding_key) metadata = doc results.append(Document(text, metadata)) - return results def _insert_texts(self, texts, metadatas): @@ -62,7 +73,7 @@ class MongoDBVectorStore(BaseVectorStore): for t, m, embedding in zip(texts, metadatas, embeddings) ] # insert the documents in MongoDB Atlas - insert_result = self.client._collection.insert_many(to_insert) + insert_result = self._collection.insert_many(to_insert) return insert_result.inserted_ids def add_texts(self, @@ -75,6 +86,26 @@ class MongoDBVectorStore(BaseVectorStore): **kwargs,): + #dims = self._embedding.client[1].word_embedding_dimension + # # check if index exists + # if create_index_if_not_exists: + # # check if index exists + # info = self._collection.index_information() + # if self._index_name not in info: + # index_mongo = { + # "fields": [{ + # "type": "vector", + # "path": self._embedding_key, + # "numDimensions": dims, + # "similarity": "cosine", + # }, + # { + # "type": "filter", + # "path": "store" + # }] + # } + # self._collection.create_index(self._index_name, index_mongo) + batch_size = 100 _metadatas = metadatas or ({} for _ in texts) texts_batch = [] @@ -89,4 +120,7 @@ class MongoDBVectorStore(BaseVectorStore): metadatas_batch = [] if texts_batch: result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) - return result_ids \ No newline at end of file + return result_ids + + def delete_index(self, *args, **kwargs): + self._collection.delete_many({"store": self._path}) \ No newline at end of file diff --git a/application/vectorstore/vector_creator.py b/application/vectorstore/vector_creator.py index cbc491f5..68ae2813 100644 --- a/application/vectorstore/vector_creator.py +++ b/application/vectorstore/vector_creator.py @@ -1,11 +1,13 @@ from application.vectorstore.faiss import FaissStore from application.vectorstore.elasticsearch import ElasticsearchStore +from application.vectorstore.mongodb import MongoDBVectorStore class VectorCreator: vectorstores = { 'faiss': FaissStore, - 'elasticsearch':ElasticsearchStore + 'elasticsearch':ElasticsearchStore, + 'mongodb': MongoDBVectorStore, } @classmethod From 1fa12e56c63e68f4821487b7bd03303b2f048806 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 6 Jan 2024 18:04:50 +0000 Subject: [PATCH 3/4] Remove unused test cases in test_openai.py --- tests/llm/test_openai.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/tests/llm/test_openai.py b/tests/llm/test_openai.py index d1c63c63..445de223 100644 --- a/tests/llm/test_openai.py +++ b/tests/llm/test_openai.py @@ -10,23 +10,3 @@ class TestOpenAILLM(unittest.TestCase): def test_init(self): self.assertEqual(self.llm.api_key, self.api_key) - - @patch('application.llm.openai.openai.ChatCompletion.create') - def test_gen(self, mock_create): - model = "test_model" - engine = "test_engine" - messages = ["test_message"] - response = {"choices": [{"message": {"content": "test_response"}}]} - mock_create.return_value = response - result = self.llm.gen(model, engine, messages) - self.assertEqual(result, "test_response") - - @patch('application.llm.openai.openai.ChatCompletion.create') - def test_gen_stream(self, mock_create): - model = "test_model" - engine = "test_engine" - messages = ["test_message"] - response = [{"choices": [{"delta": {"content": "test_response"}}]}] - mock_create.return_value = response - result = list(self.llm.gen_stream(model, engine, messages)) - self.assertEqual(result, ["test_response"]) From a3e6239e6e15db60246016e4a9de487fcb030126 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 6 Jan 2024 18:23:20 +0000 Subject: [PATCH 4/4] fix: remove import --- tests/llm/test_openai.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/llm/test_openai.py b/tests/llm/test_openai.py index 445de223..8c713178 100644 --- a/tests/llm/test_openai.py +++ b/tests/llm/test_openai.py @@ -1,5 +1,4 @@ import unittest -from unittest.mock import patch from application.llm.openai import OpenAILLM class TestOpenAILLM(unittest.TestCase):