From 783e7f6939da251d2f76bd592eaaeefa98db299b Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 29 Sep 2023 00:32:19 +0100 Subject: [PATCH] working es --- application/parser/open_ai_func.py | 7 +- application/requirements.txt | 1 + application/vectorstore/base.py | 2 +- application/vectorstore/elasticsearch.py | 199 +++++++++++++++++++++++ application/vectorstore/faiss.py | 5 +- frontend/package-lock.json | 52 +++--- 6 files changed, 235 insertions(+), 31 deletions(-) create mode 100644 application/vectorstore/elasticsearch.py diff --git a/application/parser/open_ai_func.py b/application/parser/open_ai_func.py index 969165d2..0489eb87 100644 --- a/application/parser/open_ai_func.py +++ b/application/parser/open_ai_func.py @@ -1,8 +1,7 @@ import os import tiktoken -from langchain.embeddings import OpenAIEmbeddings -from langchain.vectorstores import FAISS +from application.vectorstore.faiss import FaissStore from retry import retry @@ -33,11 +32,9 @@ def call_openai_api(docs, folder_name, task_status): os.makedirs(f"{folder_name}") from tqdm import tqdm - docs_test = [docs[0]] - docs.pop(0) c1 = 0 - store = FAISS.from_documents(docs_test, OpenAIEmbeddings(openai_api_key=os.getenv("EMBEDDINGS_KEY"))) + store = FaissStore(path=f"{folder_name}", embeddings_key=os.getenv("EMBEDDINGS_KEY")) # Uncomment for MPNet embeddings # model_name = "sentence-transformers/all-mpnet-base-v2" diff --git a/application/requirements.txt b/application/requirements.txt index d978cb41..68532aa1 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -22,6 +22,7 @@ decorator==5.1.1 dill==0.3.6 dnspython==2.3.0 ecdsa==0.18.0 +elasticsearch==8.9.0 entrypoints==0.4 faiss-cpu==1.7.3 filelock==3.9.0 diff --git a/application/vectorstore/base.py b/application/vectorstore/base.py index ad481744..18a3881b 100644 --- a/application/vectorstore/base.py +++ b/application/vectorstore/base.py @@ -19,7 +19,7 @@ class BaseVectorStore(ABC): def is_azure_configured(self): return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME - def _get_docsearch(self, embeddings_name, embeddings_key=None): + def _get_embeddings(self, embeddings_name, embeddings_key=None): embeddings_factory = { "openai_text-embedding-ada-002": OpenAIEmbeddings, "huggingface_sentence-transformers/all-mpnet-base-v2": HuggingFaceHubEmbeddings, diff --git a/application/vectorstore/elasticsearch.py b/application/vectorstore/elasticsearch.py new file mode 100644 index 00000000..b87f851a --- /dev/null +++ b/application/vectorstore/elasticsearch.py @@ -0,0 +1,199 @@ +from application.vectorstore.base import BaseVectorStore +from application.core.settings import settings +import elasticsearch +#from langchain.vectorstores.elasticsearch import ElasticsearchStore + + +class ElasticsearchStore(BaseVectorStore): + _es_connection = None # Class attribute to hold the Elasticsearch connection + + def __init__(self, path, embeddings_key, index_name="docsgpt"): + super().__init__() + self.path = path.replace("/app/application/indexes/", "") + self.embeddings_key = embeddings_key + self.index_name = index_name + + if ElasticsearchStore._es_connection is None: + connection_params = {} + connection_params["cloud_id"] = settings.ELASTIC_CLOUD_ID + connection_params["basic_auth"] = (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD) + ElasticsearchStore._es_connection = elasticsearch.Elasticsearch(**connection_params) + + self.docsearch = ElasticsearchStore._es_connection + + def connect_to_elasticsearch( + *, + es_url = None, + cloud_id = None, + api_key = None, + username = None, + password = None, + ): + try: + import elasticsearch + except ImportError: + raise ImportError( + "Could not import elasticsearch python package. " + "Please install it with `pip install elasticsearch`." + ) + + if es_url and cloud_id: + raise ValueError( + "Both es_url and cloud_id are defined. Please provide only one." + ) + + connection_params = {} + + if es_url: + connection_params["hosts"] = [es_url] + elif cloud_id: + connection_params["cloud_id"] = cloud_id + else: + raise ValueError("Please provide either elasticsearch_url or cloud_id.") + + if api_key: + connection_params["api_key"] = api_key + elif username and password: + connection_params["basic_auth"] = (username, password) + + es_client = elasticsearch.Elasticsearch( + **connection_params, + ) + try: + es_client.info() + except Exception as e: + raise e + + return es_client + + def search(self, question, k=2, index_name=settings.ELASTIC_INDEX, *args, **kwargs): + embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key) + vector = embeddings.embed_query(question) + knn = { + "filter": [{"match": {"metadata.filename.keyword": self.path}}], + "field": "vector", + "k": k, + "num_candidates": 100, + "query_vector": vector, + } + full_query = { + "knn": knn, + "query": { + "bool": { + "must": [ + { + "match": { + "text": { + "query": question, + } + } + } + ], + "filter": [{"match": {"metadata.filename.keyword": self.path}}], + } + }, + "rank": {"rrf": {}}, + } + resp = self.docsearch.search(index=index_name, query=full_query['query'], size=k, knn=full_query['knn']) + return resp + + def _create_index_if_not_exists( + self, index_name, dims_length + ): + + if self.client.indices.exists(index=index_name): + print(f"Index {index_name} already exists.") + + else: + self.strategy.before_index_setup( + client=self.client, + text_field=self.query_field, + vector_query_field=self.vector_query_field, + ) + + indexSettings = self.index( + dims_length=dims_length, + ) + self.client.indices.create(index=index_name, **indexSettings) + def index( + self, + dims_length, + ): + + + return { + "mappings": { + "properties": { + "vector": { + "type": "dense_vector", + "dims": dims_length, + "index": True, + "similarity": "cosine", + }, + } + } + } + + def add_texts( + self, + texts, + metadatas = None, + ids = None, + refresh_indices = True, + create_index_if_not_exists = True, + bulk_kwargs = None, + **kwargs, + ): + + from elasticsearch.helpers import BulkIndexError, bulk + + bulk_kwargs = bulk_kwargs or {} + import uuid + embeddings = [] + ids = ids or [str(uuid.uuid4()) for _ in texts] + requests = [] + embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key) + + vectors = embeddings.embed_documents(list(texts)) + + dims_length = len(vectors[0]) + + if create_index_if_not_exists: + self._create_index_if_not_exists( + index_name=self.index_name, dims_length=dims_length + ) + + for i, (text, vector) in enumerate(zip(texts, vectors)): + metadata = metadatas[i] if metadatas else {} + + requests.append( + { + "_op_type": "index", + "_index": self.index_name, + "text": text, + "vector": vector, + "metadata": metadata, + "_id": ids[i], + } + ) + + + if len(requests) > 0: + try: + success, failed = bulk( + self.client, + requests, + stats_only=True, + refresh=refresh_indices, + **bulk_kwargs, + ) + return ids + except BulkIndexError as e: + print(f"Error adding texts: {e}") + firstError = e.errors[0].get("index", {}).get("error", {}) + print(f"First error reason: {firstError.get('reason')}") + raise e + + else: + return [] + diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py index 9a562dce..d85b6084 100644 --- a/application/vectorstore/faiss.py +++ b/application/vectorstore/faiss.py @@ -8,8 +8,11 @@ class FaissStore(BaseVectorStore): super().__init__() self.path = path self.docsearch = FAISS.load_local( - self.path, self._get_docsearch(settings.EMBEDDINGS_NAME, settings.EMBEDDINGS_KEY) + self.path, self._get_embeddings(settings.EMBEDDINGS_NAME, settings.EMBEDDINGS_KEY) ) def search(self, *args, **kwargs): return self.docsearch.similarity_search(*args, **kwargs) + + def add_texts(self, *args, **kwargs): + return self.docsearch.add_texts(*args, **kwargs) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index ff8a21f6..415c483b 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1346,9 +1346,9 @@ } }, "node_modules/@typescript-eslint/eslint-plugin/node_modules/semver": { - "version": "7.3.8", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz", - "integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==", + "version": "7.5.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", + "integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==", "dev": true, "dependencies": { "lru-cache": "^6.0.0" @@ -1490,9 +1490,9 @@ } }, "node_modules/@typescript-eslint/typescript-estree/node_modules/semver": { - "version": "7.3.8", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz", - "integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==", + "version": "7.5.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", + "integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==", "dev": true, "dependencies": { "lru-cache": "^6.0.0" @@ -1549,9 +1549,9 @@ } }, "node_modules/@typescript-eslint/utils/node_modules/semver": { - "version": "7.3.8", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz", - "integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==", + "version": "7.5.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", + "integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==", "dev": true, "dependencies": { "lru-cache": "^6.0.0" @@ -1991,9 +1991,9 @@ } }, "node_modules/builtins/node_modules/semver": { - "version": "7.3.8", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz", - "integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==", + "version": "7.5.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", + "integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==", "dev": true, "dependencies": { "lru-cache": "^6.0.0" @@ -2055,9 +2055,9 @@ } }, "node_modules/caniuse-lite": { - "version": "1.0.30001450", - "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001450.tgz", - "integrity": "sha512-qMBmvmQmFXaSxexkjjfMvD5rnDL0+m+dUMZKoDYsGG8iZN29RuYh9eRoMvKsT6uMAWlyUUGDEQGJJYjzCIO9ew==", + "version": "1.0.30001541", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001541.tgz", + "integrity": "sha512-bLOsqxDgTqUBkzxbNlSBt8annkDpQB9NdzdTbO2ooJ+eC/IQcvDspDc058g84ejCelF7vHUx57KIOjEecOHXaw==", "dev": true, "funding": [ { @@ -2067,6 +2067,10 @@ { "type": "tidelift", "url": "https://tidelift.com/funding/github/npm/caniuse-lite" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" } ] }, @@ -2889,9 +2893,9 @@ } }, "node_modules/eslint-plugin-n/node_modules/semver": { - "version": "7.3.8", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz", - "integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==", + "version": "7.5.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", + "integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==", "dev": true, "dependencies": { "lru-cache": "^6.0.0" @@ -4478,9 +4482,9 @@ } }, "node_modules/lint-staged/node_modules/yaml": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.2.1.tgz", - "integrity": "sha512-e0WHiYql7+9wr4cWMx3TVQrNwejKaEe7/rHNmQmqRjazfOP5W8PB6Jpebb5o6fIapbz9o9+2ipcaTM2ZwDI6lw==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.3.2.tgz", + "integrity": "sha512-N/lyzTPaJasoDmfV7YTrYCI0G/3ivm/9wdG0aHuheKowWQwGTsK0Eoiw6utmzAnI6pkJa0DUVygvp3spqqEKXg==", "dev": true, "engines": { "node": ">= 14" @@ -6532,9 +6536,9 @@ } }, "node_modules/semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==", + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", "dev": true, "bin": { "semver": "bin/semver.js"