From 1c4d7a6ad1f0dce36e7304e57172bf0587de79f5 Mon Sep 17 00:00:00 2001
From: Jacksonxhx
Date: Tue, 30 Jul 2024 17:44:27 +0800
Subject: [PATCH 1/3] integrated milvus db

---
Review notes (text in this area is ignored by `git am`):
- MilvusStore.__init__ in this revision uses the key "tpken" in the
  path-specific connection_args and reads settings.MILVUS_URL, while the
  settings.py hunk below defines MILVUS_URI. Both lines are removed again
  by PATCH 2/3, so they are left untouched here to keep the series
  self-consistent.

 application/core/settings.py              |  7 ++++-
 application/vectorstore/milvus.py         | 38 +++++++++++++++++++++++
 application/vectorstore/vector_creator.py |  2 ++
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 application/vectorstore/milvus.py

diff --git a/application/core/settings.py b/application/core/settings.py
index 6ae5475c..ff29bbb2 100644
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -18,7 +18,7 @@ class Settings(BaseSettings):
     DEFAULT_MAX_HISTORY: int = 150
     MODEL_TOKEN_LIMITS: dict = {"gpt-3.5-turbo": 4096, "claude-2": 1e5}
     UPLOAD_FOLDER: str = "inputs"
-    VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" or "qdrant"
+    VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus"
     RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"] # also brave_search
 
     API_URL: str = "http://localhost:7091" # backend url for celery worker
@@ -61,6 +61,11 @@ class Settings(BaseSettings):
     QDRANT_PATH: Optional[str] = None
     QDRANT_DISTANCE_FUNC: str = "Cosine"
 
+    # Milvus vectorstore config
+    MILVUS_COLLECTION_NAME: Optional[str] = "docsgpt"
+    MILVUS_URI: Optional[str] = "./milvus_local.db" # milvus lite version as default
+    MILVUS_TOKEN: Optional[str] = ""
+
     BRAVE_SEARCH_API_KEY: Optional[str] = None
 
     FLASK_DEBUG_MODE: bool = False
diff --git a/application/vectorstore/milvus.py b/application/vectorstore/milvus.py
new file mode 100644
index 00000000..0861f593
--- /dev/null
+++ b/application/vectorstore/milvus.py
@@ -0,0 +1,38 @@
+from typing import List, Optional
+from langchain_community.vectorstores.milvus import Milvus
+
+from application.core.settings import settings
+from application.vectorstore.base import BaseVectorStore
+
+
+class MilvusStore(BaseVectorStore):
+    def __init__(self, path: str = "", embeddings_key: str = "embeddings"):
+        super().__init__()
+        if path:
+            connection_args ={
+                "uri": path,
+                "tpken": settings.MILVUS_TOKEN,
+            }
+        else:
+            connection_args = {
+                "uri": settings.MILVUS_URL,
+                'token': settings.MILVUS_TOKEN,
+            }
+        self._docsearch = Milvus(
+            embedding_function=self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key),
+            collection_name=settings.COLLECTION_NAME,
+            connection_args=connection_args,
+            drop_old=True,
+        )
+
+    def search(self, question, k=2, *args, **kwargs):
+        return self._docsearch.similarity_search(query=question, k=k, *args, **kwargs)
+
+    def add_texts(self, texts: List[str], metadatas: Optional[List[dict]], *args, **kwargs):
+        return self._docsearch.add_texts(texts=texts, metadatas=metadatas, *args, **kwargs)
+
+    def save_local(self, *args, **kwargs):
+        pass
+
+    def delete_index(self, *args, **kwargs):
+        pass
diff --git a/application/vectorstore/vector_creator.py b/application/vectorstore/vector_creator.py
index 27b38645..259fa31f 100644
--- a/application/vectorstore/vector_creator.py
+++ b/application/vectorstore/vector_creator.py
@@ -1,5 +1,6 @@
 from application.vectorstore.faiss import FaissStore
 from application.vectorstore.elasticsearch import ElasticsearchStore
+from application.vectorstore.milvus import MilvusStore
 from application.vectorstore.mongodb import MongoDBVectorStore
 from application.vectorstore.qdrant import QdrantStore
 
@@ -10,6 +11,7 @@ class VectorCreator:
         "elasticsearch": ElasticsearchStore,
         "mongodb": MongoDBVectorStore,
         "qdrant": QdrantStore,
+        "milvus": MilvusStore,
     }
 
     @classmethod

From d232229abf65a0b25ccbb0c6068e376dc63c071c Mon Sep 17 00:00:00 2001
From: Alex
Date: Thu, 5 Sep 2024 23:41:51 +0100
Subject: [PATCH 2/3] feat Milvus integration

---
Review notes (text in this area is ignored by `git am`):
- MilvusStore.search: added the missing comma after the `filter=...`
  keyword argument. Without it the argument was evaluated as
  `{"path": self._path} * args` (dict times tuple), which raises
  TypeError on every call.
- Because of that one-character fix, the post-image blob id on the
  milvus.py `index` line below no longer matches the patched content.
- The line structure of this series was restored from a copy whose
  newlines and indentation had been collapsed; leading whitespace on
  context lines is reconstructed and should be checked against the tree
  (or applied with `git apply --ignore-whitespace`).

 application/requirements.txt      | 10 +++---
 application/vectorstore/base.py   | 52 ++++++++++++++++++-------------
 application/vectorstore/faiss.py  | 11 ++++---
 application/vectorstore/milvus.py | 29 +++++++++--------
 docker-compose-azure.yaml         |  2 --
 docker-compose-dev.yaml           |  2 --
 docker-compose-local.yaml         |  2 --
 docker-compose-mock.yaml          |  2 --
 docker-compose.yaml               |  2 --
 9 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/application/requirements.txt b/application/requirements.txt
index b072885d..95a16fce 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -8,13 +8,15 @@ EbookLib==0.18
 elasticsearch==8.12.0
 escodegen==1.0.11
 esprima==4.0.1
-faiss-cpu==1.7.4
+faiss-cpu==1.8.0.post1
 Flask==3.0.1
 gunicorn==22.0.0
 html2text==2020.1.16
 javalang==0.13.0
-langchain==0.1.4
-langchain-openai==0.0.5
+langchain==0.2.16
+langchain-community==0.2.16
+langchain-core==0.2.38
+langchain-openai==0.1.23
 openapi3_parser==1.1.16
 pandas==2.2.0
 pydantic_settings==2.1.0
@@ -26,7 +28,7 @@ redis==5.0.1
 Requests==2.32.0
 retry==0.9.2
 sentence-transformers
-tiktoken
+tiktoken==0.7.0
 torch
 tqdm==4.66.3
 transformers==4.36.2
diff --git a/application/vectorstore/base.py b/application/vectorstore/base.py
index 522ef4fa..9c76b89f 100644
--- a/application/vectorstore/base.py
+++ b/application/vectorstore/base.py
@@ -1,13 +1,30 @@
 from abc import ABC, abstractmethod
 import os
-from langchain_community.embeddings import (
-    HuggingFaceEmbeddings,
-    CohereEmbeddings,
-    HuggingFaceInstructEmbeddings,
-)
+from sentence_transformers import SentenceTransformer
 from langchain_openai import OpenAIEmbeddings
 from application.core.settings import settings
 
+class EmbeddingsWrapper:
+    def __init__(self, model_name, *args, **kwargs):
+        self.model = SentenceTransformer(model_name, config_kwargs={'allow_dangerous_deserialization': True}, *args, **kwargs)
+        self.dimension = self.model.get_sentence_embedding_dimension()
+
+    def embed_query(self, query: str):
+        return self.model.encode(query).tolist()
+
+    def embed_documents(self, documents: list):
+        return self.model.encode(documents).tolist()
+
+    def __call__(self, text):
+        if isinstance(text, str):
+            return self.embed_query(text)
+        elif isinstance(text, list):
+            return self.embed_documents(text)
+        else:
+            raise ValueError("Input must be a string or a list of strings")
+
+
+
 class EmbeddingsSingleton:
     _instances = {}
 
@@ -23,16 +40,15 @@ class EmbeddingsSingleton:
     def _create_instance(embeddings_name, *args, **kwargs):
         embeddings_factory = {
             "openai_text-embedding-ada-002": OpenAIEmbeddings,
-            "huggingface_sentence-transformers/all-mpnet-base-v2": HuggingFaceEmbeddings,
-            "huggingface_sentence-transformers-all-mpnet-base-v2": HuggingFaceEmbeddings,
-            "huggingface_hkunlp/instructor-large": HuggingFaceInstructEmbeddings,
-            "cohere_medium": CohereEmbeddings
+            "huggingface_sentence-transformers/all-mpnet-base-v2": lambda: EmbeddingsWrapper("sentence-transformers/all-mpnet-base-v2"),
+            "huggingface_sentence-transformers-all-mpnet-base-v2": lambda: EmbeddingsWrapper("sentence-transformers/all-mpnet-base-v2"),
+            "huggingface_hkunlp/instructor-large": lambda: EmbeddingsWrapper("hkunlp/instructor-large"),
         }
 
-        if embeddings_name not in embeddings_factory:
-            raise ValueError(f"Invalid embeddings_name: {embeddings_name}")
-
-        return embeddings_factory[embeddings_name](*args, **kwargs)
+        if embeddings_name in embeddings_factory:
+            return embeddings_factory[embeddings_name](*args, **kwargs)
+        else:
+            return EmbeddingsWrapper(embeddings_name, *args, **kwargs)
 
 class BaseVectorStore(ABC):
     def __init__(self):
@@ -58,22 +74,14 @@ class BaseVectorStore(ABC):
                     embeddings_name,
                     openai_api_key=embeddings_key
                 )
-        elif embeddings_name == "cohere_medium":
-            embedding_instance = EmbeddingsSingleton.get_instance(
-                embeddings_name,
-                cohere_api_key=embeddings_key
-            )
         elif embeddings_name == "huggingface_sentence-transformers/all-mpnet-base-v2":
             if os.path.exists("./model/all-mpnet-base-v2"):
                 embedding_instance = EmbeddingsSingleton.get_instance(
-                    embeddings_name,
-                    model_name="./model/all-mpnet-base-v2",
-                    model_kwargs={"device": "cpu"}
+                    embeddings_name="./model/all-mpnet-base-v2",
                 )
             else:
                 embedding_instance = EmbeddingsSingleton.get_instance(
                     embeddings_name,
-                    model_kwargs={"device": "cpu"}
                 )
         else:
             embedding_instance = EmbeddingsSingleton.get_instance(embeddings_name)
diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py
index 8e8f3b8e..46f6e8cb 100644
--- a/application/vectorstore/faiss.py
+++ b/application/vectorstore/faiss.py
@@ -14,7 +14,8 @@ class FaissStore(BaseVectorStore):
             )
         else:
             self.docsearch = FAISS.load_local(
-                self.path, embeddings
+                self.path, embeddings,
+                allow_dangerous_deserialization=True
             )
         self.assert_embedding_dimensions(embeddings)
 
@@ -37,10 +38,10 @@ class FaissStore(BaseVectorStore):
         """
         if settings.EMBEDDINGS_NAME == "huggingface_sentence-transformers/all-mpnet-base-v2":
             try:
-                word_embedding_dimension = embeddings.client[1].word_embedding_dimension
+                word_embedding_dimension = embeddings.dimension
             except AttributeError as e:
-                raise AttributeError("word_embedding_dimension not found in embeddings.client[1]") from e
+                raise AttributeError("'dimension' attribute not found in embeddings instance. Make sure the embeddings object is properly initialized.") from e
             docsearch_index_dimension = self.docsearch.index.d
             if word_embedding_dimension != docsearch_index_dimension:
-                raise ValueError(f"word_embedding_dimension ({word_embedding_dimension}) " +
-                                 f"!= docsearch_index_word_embedding_dimension ({docsearch_index_dimension})")
\ No newline at end of file
+                raise ValueError(f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) " +
+                                 f"!= docsearch index dimension ({docsearch_index_dimension})")
\ No newline at end of file
diff --git a/application/vectorstore/milvus.py b/application/vectorstore/milvus.py
index 0861f593..9871991e 100644
--- a/application/vectorstore/milvus.py
+++ b/application/vectorstore/milvus.py
@@ -1,5 +1,6 @@
 from typing import List, Optional
-from langchain_community.vectorstores.milvus import Milvus
+from uuid import uuid4
+
 
 from application.core.settings import settings
 from application.vectorstore.base import BaseVectorStore
@@ -8,28 +9,26 @@ from application.vectorstore.base import BaseVectorStore
 class MilvusStore(BaseVectorStore):
     def __init__(self, path: str = "", embeddings_key: str = "embeddings"):
         super().__init__()
-        if path:
-            connection_args ={
-                "uri": path,
-                "tpken": settings.MILVUS_TOKEN,
-            }
-        else:
-            connection_args = {
-                "uri": settings.MILVUS_URL,
-                'token': settings.MILVUS_TOKEN,
-            }
+        from langchain_milvus import Milvus
+
+        connection_args = {
+            "uri": settings.MILVUS_URI,
+            "token": settings.MILVUS_TOKEN,
+        }
         self._docsearch = Milvus(
             embedding_function=self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key),
-            collection_name=settings.COLLECTION_NAME,
+            collection_name=settings.MILVUS_COLLECTION_NAME,
             connection_args=connection_args,
-            drop_old=True,
         )
+        self._path = path
 
     def search(self, question, k=2, *args, **kwargs):
-        return self._docsearch.similarity_search(query=question, k=k, *args, **kwargs)
+        return self._docsearch.similarity_search(query=question, k=k, filter={"path": self._path}, *args, **kwargs)
 
     def add_texts(self, texts: List[str], metadatas: Optional[List[dict]], *args, **kwargs):
-        return self._docsearch.add_texts(texts=texts, metadatas=metadatas, *args, **kwargs)
+        ids = [str(uuid4()) for _ in range(len(texts))]
+
+        return self._docsearch.add_texts(texts=texts, metadatas=metadatas, ids=ids, *args, **kwargs)
 
     def save_local(self, *args, **kwargs):
         pass
diff --git a/docker-compose-azure.yaml b/docker-compose-azure.yaml
index 70a16808..601831e5 100644
--- a/docker-compose-azure.yaml
+++ b/docker-compose-azure.yaml
@@ -1,5 +1,3 @@
-version: "3.9"
-
 services:
   frontend:
     build: ./frontend
diff --git a/docker-compose-dev.yaml b/docker-compose-dev.yaml
index f68e4e07..8a3e75c4 100644
--- a/docker-compose-dev.yaml
+++ b/docker-compose-dev.yaml
@@ -1,5 +1,3 @@
-version: "3.9"
-
 services:
 
   redis:
diff --git a/docker-compose-local.yaml b/docker-compose-local.yaml
index 3aebe8b5..74bf0101 100644
--- a/docker-compose-local.yaml
+++ b/docker-compose-local.yaml
@@ -1,5 +1,3 @@
-version: "3.9"
-
 services:
   frontend:
     build: ./frontend
diff --git a/docker-compose-mock.yaml b/docker-compose-mock.yaml
index a5c7419b..b4a917c9 100644
--- a/docker-compose-mock.yaml
+++ b/docker-compose-mock.yaml
@@ -1,5 +1,3 @@
-version: "3.9"
-
 services:
   frontend:
     build: ./frontend
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 7008b53d..05c8c059 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -1,5 +1,3 @@
-version: "3.9"
-
 services:
   frontend:
     build: ./frontend

From a1d3592d086eac0654ee92459e7714aad2542b7a Mon Sep 17 00:00:00 2001
From: Alex
Date: Mon, 9 Sep 2024 21:06:30 +0100
Subject: [PATCH 3/3] fix: typo

---
Review notes (text in this area is ignored by `git am`):
- Only the `retievers` class attribute is renamed here. The local name
  `retiever_class` and the ValueError message still carry the old
  spelling; they are unchanged context lines in this hunk.

 application/retriever/retriever_creator.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/application/retriever/retriever_creator.py b/application/retriever/retriever_creator.py
index ad071401..07be373d 100644
--- a/application/retriever/retriever_creator.py
+++ b/application/retriever/retriever_creator.py
@@ -5,15 +5,16 @@ from application.retriever.brave_search import BraveRetSearch
 
 
 class RetrieverCreator:
-    retievers = {
+    retrievers = {
         'classic': ClassicRAG,
         'duckduck_search': DuckDuckSearch,
-        'brave_search': BraveRetSearch
+        'brave_search': BraveRetSearch,
+        'default': ClassicRAG
     }
 
     @classmethod
     def create_retriever(cls, type, *args, **kwargs):
-        retiever_class = cls.retievers.get(type.lower())
+        retiever_class = cls.retrievers.get(type.lower())
         if not retiever_class:
             raise ValueError(f"No retievers class found for type {type}")
         return retiever_class(*args, **kwargs)
\ No newline at end of file