diff --git a/application/vectorstore/mongodb.py b/application/vectorstore/mongodb.py index b0fd13c1..337fc41f 100644 --- a/application/vectorstore/mongodb.py +++ b/application/vectorstore/mongodb.py @@ -5,21 +5,22 @@ from application.vectorstore.document_class import Document class MongoDBVectorStore(BaseVectorStore): def __init__( self, + path: str = "", + embeddings_key: str = "embeddings", collection: str = "documents", - index_name: str = "default", + index_name: str = "vector_search_index", text_key: str = "text", embedding_key: str = "embedding", - embedding_api_key: str = "embedding_api_key", - path: str = "", + database: str = "docsgpt", ): - self._collection = collection - self._embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embedding_api_key) self._index_name = index_name self._text_key = text_key self._embedding_key = embedding_key + self._embeddings_key = embeddings_key self._mongo_uri = settings.MONGO_URI - self._path = path - # import pymongo + self._path = path.replace("application/indexes/", "").rstrip("/") + self._embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) + try: import pymongo except ImportError: @@ -27,30 +28,40 @@ class MongoDBVectorStore(BaseVectorStore): "Could not import pymongo python package. " "Please install it with `pip install pymongo`." ) + self._client = pymongo.MongoClient(self._mongo_uri) + self._database = self._client[database] + self._collection = self._database[collection] + def search(self, question, k=2, *args, **kwargs): - query_vector = self._embeddings.embed_query(question) - + query_vector = self._embedding.embed_query(question) + pipeline = [ { "$vectorSearch": { "queryVector": query_vector, "path": self._embedding_key, "limit": k, - "index": self._index_name + "numCandidates": k * 10, + "index": self._index_name, + "filter": { + "store": {"$eq": self._path} + } } } ] - - cursor = self._client._collection.aggregate(pipeline) + + cursor = self._collection.aggregate(pipeline) results = [] for doc in cursor: text = doc[self._text_key] + doc.pop("_id") + doc.pop(self._text_key) + doc.pop(self._embedding_key) metadata = doc results.append(Document(text, metadata)) - return results def _insert_texts(self, texts, metadatas): @@ -62,7 +73,7 @@ class MongoDBVectorStore(BaseVectorStore): for t, m, embedding in zip(texts, metadatas, embeddings) ] # insert the documents in MongoDB Atlas - insert_result = self.client._collection.insert_many(to_insert) + insert_result = self._collection.insert_many(to_insert) return insert_result.inserted_ids def add_texts(self, @@ -75,6 +86,26 @@ class MongoDBVectorStore(BaseVectorStore): **kwargs,): + #dims = self._embedding.client[1].word_embedding_dimension + # # check if index exists + # if create_index_if_not_exists: + # # check if index exists + # info = self._collection.index_information() + # if self._index_name not in info: + # index_mongo = { + # "fields": [{ + # "type": "vector", + # "path": self._embedding_key, + # "numDimensions": dims, + # "similarity": "cosine", + # }, + # { + # "type": "filter", + # "path": "store" + # }] + # } + # self._collection.create_index(self._index_name, index_mongo) + batch_size = 100 _metadatas = metadatas or ({} for _ in texts) texts_batch = [] @@ -89,4 +120,7 @@ class MongoDBVectorStore(BaseVectorStore): metadatas_batch = [] if texts_batch: result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) - return result_ids \ No newline at end of file + return result_ids + + def delete_index(self, *args, **kwargs): + self._collection.delete_many({"store": self._path}) \ No newline at end of file diff --git a/application/vectorstore/vector_creator.py b/application/vectorstore/vector_creator.py index cbc491f5..68ae2813 100644 --- a/application/vectorstore/vector_creator.py +++ b/application/vectorstore/vector_creator.py @@ -1,11 +1,13 @@ from application.vectorstore.faiss import FaissStore from application.vectorstore.elasticsearch import ElasticsearchStore +from application.vectorstore.mongodb import MongoDBVectorStore class VectorCreator: vectorstores = { 'faiss': FaissStore, - 'elasticsearch':ElasticsearchStore + 'elasticsearch':ElasticsearchStore, + 'mongodb': MongoDBVectorStore, } @classmethod