feat: new vectors structure

This commit is contained in:
Alex
2024-09-08 16:59:51 +01:00
parent 0891ef6d0a
commit a353e69648
14 changed files with 85 additions and 99 deletions

View File

@@ -35,12 +35,12 @@ def upload_index_files():
return {"status": "no name"}
job_name = secure_filename(request.form["name"])
tokens = secure_filename(request.form["tokens"])
""""
ObjectId serves as a dir name in application/indexes,
and for indexing the vector metadata in the collection
"""
_id = ObjectId()
save_dir = os.path.join(current_dir, "indexes", str(_id))
retriever = secure_filename(request.form["retriever"])
id = secure_filename(request.form["id"])
type = secure_filename(request.form["type"])
remote_data = secure_filename(request.form["remote_data"]) if "remote_data" in request.form else None
save_dir = os.path.join(current_dir, "indexes", str(id))
if settings.VECTOR_STORE == "faiss":
if "file_faiss" not in request.files:
print("No file part")
@@ -63,15 +63,16 @@ def upload_index_files():
# create entry in vectors_collection
vectors_collection.insert_one(
{
"_id":_id,
"_id": ObjectId(id),
"user": user,
"name": job_name,
"language": job_name,
"location": save_dir,
"date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
"model": settings.EMBEDDINGS_NAME,
"type": "local",
"tokens": tokens
"type": type,
"tokens": tokens,
"retriever": retriever,
"remote_data": remote_data
}
)
return {"status": "ok"}

View File

@@ -237,15 +237,11 @@ def combined_json():
data = [
{
"name": "default",
"language": "default",
"version": "",
"description": "default",
"fullName": "default",
"date": "default",
"docLink": "default",
"model": settings.EMBEDDINGS_NAME,
"location": "remote",
"tokens": "",
"retriever": "classic",
}
]
# structure: id, name, date, docLink, model, location, tokens, retriever
@@ -255,35 +251,22 @@ def combined_json():
{
"id": str(index["_id"]),
"name": index["name"],
"language": index["language"],
"version": "",
"description": index["name"],
"fullName": index["name"],
"date": index["date"],
"docLink": index["location"],
"model": settings.EMBEDDINGS_NAME,
"location": "local",
"tokens": index["tokens"] if ("tokens" in index.keys()) else "",
"retriever": index["retriever"] if ("retriever" in index.keys()) else "classic",
}
)
if settings.VECTOR_STORE == "faiss":
data_remote = requests.get("https://d3dg1063dc54p9.cloudfront.net/combined.json").json()
for index in data_remote:
index["location"] = "remote"
data.append(index)
if "duckduck_search" in settings.RETRIEVERS_ENABLED:
data.append(
{
"name": "DuckDuckGo Search",
"language": "en",
"version": "",
"description": "duckduck_search",
"fullName": "DuckDuckGo Search",
"date": "duckduck_search",
"docLink": "duckduck_search",
"model": settings.EMBEDDINGS_NAME,
"location": "custom",
"tokens": "",
"retriever": "duckduck_search",
}
)
if "brave_search" in settings.RETRIEVERS_ENABLED:
@@ -291,14 +274,11 @@ def combined_json():
{
"name": "Brave Search",
"language": "en",
"version": "",
"description": "brave_search",
"fullName": "Brave Search",
"date": "brave_search",
"docLink": "brave_search",
"model": settings.EMBEDDINGS_NAME,
"location": "custom",
"tokens": "",
"retriever": "brave_search",
}
)

View File

@@ -21,7 +21,7 @@ class ClassicRAG(BaseRetriever):
user_api_key=None,
):
self.question = question
self.vectorstore = self._get_vectorstore(source=source)
self.vectorstore = source['active_docs'] if 'active_docs' in source else None
self.chat_history = chat_history
self.prompt = prompt
self.chunks = chunks
@@ -38,14 +38,6 @@ class ClassicRAG(BaseRetriever):
)
self.user_api_key = user_api_key
def _get_vectorstore(self, source):
if "active_docs" in source:
vectorstore = "indexes/"+source["active_docs"]
else:
vectorstore = ""
vectorstore = os.path.join("application", vectorstore)
return vectorstore
def _get_data(self):
if self.chunks == 0:
docs = []

View File

@@ -210,4 +210,3 @@ class ElasticsearchStore(BaseVectorStore):
def delete_index(self):
    """Delete every vector whose stored path matches this store's path.

    Issues a delete-by-query against the configured index, matching on
    the ``metadata.store.keyword`` field so only documents ingested for
    ``self.path`` are removed (the index itself is left in place).
    """
    self._es_connection.delete_by_query(index=self.index_name, query={"match": {
        "metadata.store.keyword": self.path}},)

View File

@@ -1,12 +1,22 @@
from langchain_community.vectorstores import FAISS
from application.vectorstore.base import BaseVectorStore
from application.core.settings import settings
import os
def get_vectorstore(path):
    """Map an index name to its on-disk directory under ``application/``.

    Args:
        path: Index directory name (e.g. a stringified ObjectId). Falsy
            values resolve to the bare ``application`` directory.

    Returns:
        Relative filesystem path to the vectorstore directory.
    """
    if path:
        # Build the path with os.path.join throughout so separators are
        # correct on every platform (the old code concatenated "indexes/"
        # by hand, which breaks on Windows).
        return os.path.join("application", "indexes", path)
    return "application"
class FaissStore(BaseVectorStore):
def __init__(self, path, embeddings_key, docs_init=None):
super().__init__()
self.path = path
self.path = get_vectorstore(path)
embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
if docs_init:
self.docsearch = FAISS.from_documents(

View File

@@ -6,6 +6,7 @@ import tiktoken
from urllib.parse import urljoin
import requests
from bson.objectid import ObjectId
from application.core.settings import settings
from application.parser.file.bulk import SimpleDirectoryReader
@@ -57,7 +58,7 @@ def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
# Define the main function for ingesting and processing documents.
def ingest_worker(self, directory, formats, name_job, filename, user):
def ingest_worker(self, directory, formats, name_job, filename, user, retriever="classic"):
"""
Ingest and process documents.
@@ -68,6 +69,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
name_job (str): Name of the job for this ingestion task.
filename (str): Name of the file to be ingested.
user (str): Identifier for the user initiating the ingestion.
retriever (str): Type of retriever to use for processing the documents.
Returns:
dict: Information about the completed ingestion task, including input parameters and a "limited" flag.
@@ -136,7 +138,8 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
# get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
# and send them to the server (provide user and name in form)
file_data = {"name": name_job, "user": user, "tokens": tokens}
id = ObjectId()
file_data = {"name": name_job, "user": user, "tokens": tokens, "retriever": retriever, "id": str(id), 'type': 'local'}
if settings.VECTOR_STORE == "faiss":
files = {
"file_faiss": open(full_path + "/index.faiss", "rb"),
@@ -160,7 +163,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
}
def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
def remote_worker(self, source_data, name_job, user, loader, directory="temp", retriever="classic"):
token_check = True
min_tokens = 150
max_tokens = 1250
@@ -180,12 +183,14 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
token_check=token_check,
)
# docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
call_openai_api(docs, full_path, self)
tokens = count_tokens_docs(docs)
call_openai_api(docs, full_path, self)
self.update_state(state="PROGRESS", meta={"current": 100})
# Proceed with uploading and cleaning as in the original function
file_data = {"name": name_job, "user": user, "tokens": tokens}
id = ObjectId()
file_data = {"name": name_job, "user": user, "tokens": tokens, "retriever": retriever,
"id": str(id), 'type': loader, 'remote_data': source_data}
if settings.VECTOR_STORE == "faiss":
files = {
"file_faiss": open(full_path + "/index.faiss", "rb"),