mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 08:33:20 +00:00
working es
This commit is contained in:
@@ -1,8 +1,7 @@
|
||||
import os
|
||||
|
||||
import tiktoken
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from langchain.vectorstores import FAISS
|
||||
from application.vectorstore.faiss import FaissStore
|
||||
from retry import retry
|
||||
|
||||
|
||||
@@ -33,11 +32,9 @@ def call_openai_api(docs, folder_name, task_status):
|
||||
os.makedirs(f"{folder_name}")
|
||||
|
||||
from tqdm import tqdm
|
||||
docs_test = [docs[0]]
|
||||
docs.pop(0)
|
||||
c1 = 0
|
||||
|
||||
store = FAISS.from_documents(docs_test, OpenAIEmbeddings(openai_api_key=os.getenv("EMBEDDINGS_KEY")))
|
||||
store = FaissStore(path=f"{folder_name}", embeddings_key=os.getenv("EMBEDDINGS_KEY"))
|
||||
|
||||
# Uncomment for MPNet embeddings
|
||||
# model_name = "sentence-transformers/all-mpnet-base-v2"
|
||||
|
||||
@@ -22,6 +22,7 @@ decorator==5.1.1
|
||||
dill==0.3.6
|
||||
dnspython==2.3.0
|
||||
ecdsa==0.18.0
|
||||
elasticsearch==8.9.0
|
||||
entrypoints==0.4
|
||||
faiss-cpu==1.7.3
|
||||
filelock==3.9.0
|
||||
|
||||
@@ -19,7 +19,7 @@ class BaseVectorStore(ABC):
|
||||
def is_azure_configured(self):
    """Report whether all three Azure OpenAI settings are populated.

    Mirrors an ``and`` chain over ``OPENAI_API_BASE``,
    ``OPENAI_API_VERSION`` and ``AZURE_DEPLOYMENT_NAME``: the first falsy
    setting is returned as-is, otherwise the deployment name is returned.
    The result is therefore truthy/falsy rather than a strict ``bool``.
    """
    api_base = settings.OPENAI_API_BASE
    if not api_base:
        return api_base

    api_version = settings.OPENAI_API_VERSION
    if not api_version:
        return api_version

    return settings.AZURE_DEPLOYMENT_NAME
|
||||
|
||||
def _get_docsearch(self, embeddings_name, embeddings_key=None):
|
||||
def _get_embeddings(self, embeddings_name, embeddings_key=None):
|
||||
embeddings_factory = {
|
||||
"openai_text-embedding-ada-002": OpenAIEmbeddings,
|
||||
"huggingface_sentence-transformers/all-mpnet-base-v2": HuggingFaceHubEmbeddings,
|
||||
|
||||
199
application/vectorstore/elasticsearch.py
Normal file
199
application/vectorstore/elasticsearch.py
Normal file
@@ -0,0 +1,199 @@
|
||||
from application.vectorstore.base import BaseVectorStore
|
||||
from application.core.settings import settings
|
||||
import elasticsearch
|
||||
#from langchain.vectorstores.elasticsearch import ElasticsearchStore
|
||||
|
||||
|
||||
class ElasticsearchStore(BaseVectorStore):
    """Vector store that keeps texts and their embeddings in Elasticsearch.

    A single Elasticsearch client is created on first instantiation and shared
    by every instance through the ``_es_connection`` class attribute. Each
    instance exposes that client as ``self.docsearch``.
    """

    _es_connection = None  # Class attribute to hold the Elasticsearch connection

    def __init__(self, path, embeddings_key, index_name="docsgpt"):
        """Bind the store to a source path and make sure a client exists.

        Args:
            path: Location of the indexed source. The
                ``/app/application/indexes/`` prefix is removed and the
                remainder is used as the ``metadata.filename`` filter value
                in ``search``.
            embeddings_key: API key handed to the embeddings backend.
            index_name: Elasticsearch index that ``add_texts`` writes to.
        """
        super().__init__()
        self.path = path.replace("/app/application/indexes/", "")
        self.embeddings_key = embeddings_key
        self.index_name = index_name

        # Create the shared client only once; later instances reuse it.
        if ElasticsearchStore._es_connection is None:
            connection_params = {
                "cloud_id": settings.ELASTIC_CLOUD_ID,
                "basic_auth": (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD),
            }
            ElasticsearchStore._es_connection = elasticsearch.Elasticsearch(**connection_params)

        self.docsearch = ElasticsearchStore._es_connection

    @staticmethod
    def connect_to_elasticsearch(
        *,
        es_url=None,
        cloud_id=None,
        api_key=None,
        username=None,
        password=None,
    ):
        """Create an Elasticsearch client and verify that it can connect.

        Declared as a static method because it takes no ``self``; without the
        decorator the function could only be called on the class, never on an
        instance.

        Args:
            es_url: URL of a self-managed cluster. Mutually exclusive with
                ``cloud_id``.
            cloud_id: Elastic Cloud deployment id. Mutually exclusive with
                ``es_url``.
            api_key: API key; takes precedence over ``username``/``password``.
            username: Basic-auth user name (used only together with
                ``password``).
            password: Basic-auth password.

        Returns:
            The connected ``elasticsearch.Elasticsearch`` client.

        Raises:
            ImportError: If the ``elasticsearch`` package is not installed.
            ValueError: If both or neither of ``es_url`` and ``cloud_id`` are
                given.
        """
        try:
            import elasticsearch
        except ImportError as err:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            ) from err

        if es_url and cloud_id:
            raise ValueError(
                "Both es_url and cloud_id are defined. Please provide only one."
            )

        connection_params = {}

        if es_url:
            connection_params["hosts"] = [es_url]
        elif cloud_id:
            connection_params["cloud_id"] = cloud_id
        else:
            raise ValueError("Please provide either es_url or cloud_id.")

        if api_key:
            connection_params["api_key"] = api_key
        elif username and password:
            connection_params["basic_auth"] = (username, password)

        es_client = elasticsearch.Elasticsearch(**connection_params)

        # Fail fast: any connection or authentication error propagates to the caller.
        es_client.info()

        return es_client

    def search(self, question, k=2, index_name=settings.ELASTIC_INDEX, *args, **kwargs):
        """Run a combined kNN and full-text query restricted to this store's source.

        Args:
            question: Natural-language query; it is embedded for the kNN part
                and matched against the ``text`` field for the lexical part.
            k: Number of nearest neighbours and number of hits requested.
            index_name: Index to query. NOTE(review): the default is read from
                ``settings.ELASTIC_INDEX`` at import time, while ``add_texts``
                writes to ``self.index_name`` - confirm both name the same
                index.
            *args: Accepted for interface compatibility; unused.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The raw response of the Elasticsearch ``search`` call.
        """
        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
        vector = embeddings.embed_query(question)

        knn = {
            "filter": [{"match": {"metadata.filename.keyword": self.path}}],
            "field": "vector",
            "k": k,
            # Elasticsearch rejects num_candidates smaller than k.
            "num_candidates": max(100, k),
            "query_vector": vector,
        }
        query = {
            "bool": {
                "must": [
                    {
                        "match": {
                            "text": {
                                "query": question,
                            }
                        }
                    }
                ],
                "filter": [{"match": {"metadata.filename.keyword": self.path}}],
            }
        }
        # No "rank" (RRF) clause is sent: the request only ever carried the
        # "query" and "knn" parts, and that behaviour is kept.
        return self.docsearch.search(index=index_name, query=query, size=k, knn=knn)

    def _create_index_if_not_exists(self, index_name, dims_length):
        """Create ``index_name`` with a dense-vector mapping unless it exists.

        Uses the shared client stored in ``self.docsearch``; this class never
        assigns a ``client`` or ``strategy`` attribute, so none is referenced.

        Args:
            index_name: Name of the index to check or create.
            dims_length: Dimensionality of the vectors stored in the index.
        """
        if self.docsearch.indices.exists(index=index_name):
            print(f"Index {index_name} already exists.")
            return

        index_settings = self.index(dims_length=dims_length)
        self.docsearch.indices.create(index=index_name, **index_settings)

    def index(self, dims_length):
        """Return the index body declaring the ``vector`` dense-vector field.

        Args:
            dims_length: Dimensionality of the stored vectors.

        Returns:
            A dict with a ``mappings`` section suitable for ``indices.create``.
        """
        return {
            "mappings": {
                "properties": {
                    "vector": {
                        "type": "dense_vector",
                        "dims": dims_length,
                        "index": True,
                        "similarity": "cosine",
                    },
                }
            }
        }

    def add_texts(
        self,
        texts,
        metadatas=None,
        ids=None,
        refresh_indices=True,
        create_index_if_not_exists=True,
        bulk_kwargs=None,
        **kwargs,
    ):
        """Embed ``texts`` and bulk-index them into ``self.index_name``.

        Args:
            texts: Iterable of strings to store. It is materialised once so a
                one-shot iterator is not exhausted before embedding.
            metadatas: Optional list of metadata dicts, positionally aligned
                with ``texts``.
            ids: Optional document ids; random UUID4 strings are generated
                when omitted.
            refresh_indices: Passed to the bulk helper as ``refresh``.
            create_index_if_not_exists: Create the index (sized from the first
                vector) before indexing when it does not exist yet.
            bulk_kwargs: Extra keyword arguments for the bulk helper.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The list of ids of the indexed documents; ``[]`` when ``texts`` is
            empty.

        Raises:
            BulkIndexError: Re-raised after printing the first failure reason.
        """
        import uuid

        from elasticsearch.helpers import BulkIndexError, bulk

        texts = list(texts)
        if not texts:
            # Nothing to embed or index; also avoids indexing into an empty vector list.
            return []

        bulk_kwargs = bulk_kwargs or {}
        ids = ids or [str(uuid.uuid4()) for _ in texts]

        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
        vectors = embeddings.embed_documents(texts)

        if create_index_if_not_exists:
            self._create_index_if_not_exists(
                index_name=self.index_name, dims_length=len(vectors[0])
            )

        requests = [
            {
                "_op_type": "index",
                "_index": self.index_name,
                "text": text,
                "vector": vector,
                "metadata": metadatas[i] if metadatas else {},
                "_id": ids[i],
            }
            for i, (text, vector) in enumerate(zip(texts, vectors))
        ]

        try:
            bulk(
                self.docsearch,
                requests,
                stats_only=True,
                refresh=refresh_indices,
                **bulk_kwargs,
            )
        except BulkIndexError as e:
            print(f"Error adding texts: {e}")
            firstError = e.errors[0].get("index", {}).get("error", {})
            print(f"First error reason: {firstError.get('reason')}")
            raise

        return ids
|
||||
|
||||
@@ -8,8 +8,11 @@ class FaissStore(BaseVectorStore):
|
||||
super().__init__()
|
||||
self.path = path
|
||||
self.docsearch = FAISS.load_local(
|
||||
self.path, self._get_docsearch(settings.EMBEDDINGS_NAME, settings.EMBEDDINGS_KEY)
|
||||
self.path, self._get_embeddings(settings.EMBEDDINGS_NAME, settings.EMBEDDINGS_KEY)
|
||||
)
|
||||
|
||||
def search(self, *args, **kwargs):
    """Forward every argument to ``similarity_search`` on ``self.docsearch``."""
    index = self.docsearch
    matches = index.similarity_search(*args, **kwargs)
    return matches
|
||||
|
||||
def add_texts(self, *args, **kwargs):
    """Forward every argument to ``add_texts`` on ``self.docsearch``."""
    index = self.docsearch
    outcome = index.add_texts(*args, **kwargs)
    return outcome
|
||||
|
||||
52
frontend/package-lock.json
generated
52
frontend/package-lock.json
generated
@@ -1346,9 +1346,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@typescript-eslint/eslint-plugin/node_modules/semver": {
|
||||
"version": "7.3.8",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz",
|
||||
"integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==",
|
||||
"version": "7.5.4",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
|
||||
"integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"lru-cache": "^6.0.0"
|
||||
@@ -1490,9 +1490,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@typescript-eslint/typescript-estree/node_modules/semver": {
|
||||
"version": "7.3.8",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz",
|
||||
"integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==",
|
||||
"version": "7.5.4",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
|
||||
"integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"lru-cache": "^6.0.0"
|
||||
@@ -1549,9 +1549,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@typescript-eslint/utils/node_modules/semver": {
|
||||
"version": "7.3.8",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz",
|
||||
"integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==",
|
||||
"version": "7.5.4",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
|
||||
"integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"lru-cache": "^6.0.0"
|
||||
@@ -1991,9 +1991,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/builtins/node_modules/semver": {
|
||||
"version": "7.3.8",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz",
|
||||
"integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==",
|
||||
"version": "7.5.4",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
|
||||
"integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"lru-cache": "^6.0.0"
|
||||
@@ -2055,9 +2055,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/caniuse-lite": {
|
||||
"version": "1.0.30001450",
|
||||
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001450.tgz",
|
||||
"integrity": "sha512-qMBmvmQmFXaSxexkjjfMvD5rnDL0+m+dUMZKoDYsGG8iZN29RuYh9eRoMvKsT6uMAWlyUUGDEQGJJYjzCIO9ew==",
|
||||
"version": "1.0.30001541",
|
||||
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001541.tgz",
|
||||
"integrity": "sha512-bLOsqxDgTqUBkzxbNlSBt8annkDpQB9NdzdTbO2ooJ+eC/IQcvDspDc058g84ejCelF7vHUx57KIOjEecOHXaw==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
{
|
||||
@@ -2067,6 +2067,10 @@
|
||||
{
|
||||
"type": "tidelift",
|
||||
"url": "https://tidelift.com/funding/github/npm/caniuse-lite"
|
||||
},
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/ai"
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -2889,9 +2893,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/eslint-plugin-n/node_modules/semver": {
|
||||
"version": "7.3.8",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz",
|
||||
"integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==",
|
||||
"version": "7.5.4",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
|
||||
"integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"lru-cache": "^6.0.0"
|
||||
@@ -4478,9 +4482,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/lint-staged/node_modules/yaml": {
|
||||
"version": "2.2.1",
|
||||
"resolved": "https://registry.npmjs.org/yaml/-/yaml-2.2.1.tgz",
|
||||
"integrity": "sha512-e0WHiYql7+9wr4cWMx3TVQrNwejKaEe7/rHNmQmqRjazfOP5W8PB6Jpebb5o6fIapbz9o9+2ipcaTM2ZwDI6lw==",
|
||||
"version": "2.3.2",
|
||||
"resolved": "https://registry.npmjs.org/yaml/-/yaml-2.3.2.tgz",
|
||||
"integrity": "sha512-N/lyzTPaJasoDmfV7YTrYCI0G/3ivm/9wdG0aHuheKowWQwGTsK0Eoiw6utmzAnI6pkJa0DUVygvp3spqqEKXg==",
|
||||
"dev": true,
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
@@ -6532,9 +6536,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/semver": {
|
||||
"version": "6.3.0",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
|
||||
"integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==",
|
||||
"version": "6.3.1",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
|
||||
"integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==",
|
||||
"dev": true,
|
||||
"bin": {
|
||||
"semver": "bin/semver.js"
|
||||
|
||||
Reference in New Issue
Block a user