mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 16:43:16 +00:00
Merge pull request #1062 from ManishMadan2882/1059-migrating-database-to-new-model
Migrating database to new model
This commit is contained in:
@@ -21,7 +21,7 @@ logger = logging.getLogger(__name__)
|
||||
mongo = MongoClient(settings.MONGO_URI)
|
||||
db = mongo["docsgpt"]
|
||||
conversations_collection = db["conversations"]
|
||||
vectors_collection = db["vectors"]
|
||||
sources_collection = db["sources"]
|
||||
prompts_collection = db["prompts"]
|
||||
api_key_collection = db["api_keys"]
|
||||
answer = Blueprint("answer", __name__)
|
||||
@@ -77,40 +77,27 @@ def get_data_from_api_key(api_key):
|
||||
if data is None:
|
||||
raise Exception("Invalid API Key, please generate new key", 401)
|
||||
|
||||
if isinstance(data["source"], DBRef):
|
||||
source_id = db.dereference(data["source"])["_id"]
|
||||
data["source"] = get_source(source_id)
|
||||
if "retriever" not in data:
|
||||
data["retriever"] = None
|
||||
|
||||
if "source" in data and isinstance(data["source"], DBRef):
|
||||
source_doc = db.dereference(data["source"])
|
||||
data["source"] = str(source_doc["_id"])
|
||||
if "retriever" in source_doc:
|
||||
data["retriever"] = source_doc["retriever"]
|
||||
else:
|
||||
data["source"] = {}
|
||||
return data
|
||||
|
||||
|
||||
def get_source(active_doc):
|
||||
if ObjectId.is_valid(active_doc):
|
||||
doc = vectors_collection.find_one({"_id": ObjectId(active_doc)})
|
||||
if doc is None:
|
||||
raise Exception("Source document does not exist", 404)
|
||||
print("res", doc)
|
||||
source = {"active_docs": "/".join(doc["location"].split("/")[-2:])}
|
||||
else:
|
||||
source = {"active_docs": active_doc}
|
||||
return source
|
||||
def get_retriever(source_id: str):
    """Return the retriever name stored on a source document.

    Args:
        source_id: String form of the source's ``_id`` in
            ``sources_collection``.

    Returns:
        The value of the document's ``"retriever"`` field, or ``None`` when
        the document has no such field or ``source_id`` is not a valid
        ObjectId string. Callers combine the result with ``or`` to fall back
        to their own default retriever.

    Raises:
        Exception: ``("Source document does not exist", 404)`` when
            ``source_id`` is a valid ObjectId but no document matches it.
    """
    # ObjectId() raises on strings that are not valid ids; treat those as
    # "no retriever configured" so the caller's fallback applies.
    if not ObjectId.is_valid(source_id):
        return None
    doc = sources_collection.find_one({"_id": ObjectId(source_id)})
    if doc is None:
        raise Exception("Source document does not exist", 404)
    return doc.get("retriever")
|
||||
|
||||
|
||||
def get_vectorstore(data):
|
||||
if "active_docs" in data:
|
||||
if data["active_docs"].split("/")[0] == "default":
|
||||
vectorstore = ""
|
||||
elif data["active_docs"].split("/")[0] == "local":
|
||||
vectorstore = "indexes/" + data["active_docs"]
|
||||
else:
|
||||
vectorstore = "vectors/" + data["active_docs"]
|
||||
if data["active_docs"] == "default":
|
||||
vectorstore = ""
|
||||
else:
|
||||
vectorstore = ""
|
||||
vectorstore = os.path.join("application", vectorstore)
|
||||
return vectorstore
|
||||
|
||||
|
||||
def is_azure_configured():
    """Report whether all Azure OpenAI settings are populated.

    Returns the first falsy value among ``OPENAI_API_BASE``,
    ``OPENAI_API_VERSION`` and ``AZURE_DEPLOYMENT_NAME``, or the last of
    them when all are truthy (the result of a plain ``and`` chain, not a
    strict ``bool``).
    """
    return (
        settings.OPENAI_API_BASE
        and settings.OPENAI_API_VERSION
        and settings.AZURE_DEPLOYMENT_NAME
    )
|
||||
@@ -244,28 +231,34 @@ def stream():
|
||||
else:
|
||||
token_limit = settings.DEFAULT_MAX_HISTORY
|
||||
|
||||
# check if active_docs or api_key is set
|
||||
## retriever can be "brave_search, duckduck_search or classic"
|
||||
retriever_name = data["retriever"] if "retriever" in data else "classic"
|
||||
|
||||
# check if active_docs or api_key is set
|
||||
if "api_key" in data:
|
||||
data_key = get_data_from_api_key(data["api_key"])
|
||||
chunks = int(data_key["chunks"])
|
||||
prompt_id = data_key["prompt_id"]
|
||||
source = data_key["source"]
|
||||
source = {"active_docs": data_key["source"]}
|
||||
retriever_name = data_key["retriever"] or retriever_name
|
||||
user_api_key = data["api_key"]
|
||||
|
||||
elif "active_docs" in data:
|
||||
source = get_source(data["active_docs"])
|
||||
source = {"active_docs" : data["active_docs"]}
|
||||
retriever_name = get_retriever(data["active_docs"]) or retriever_name
|
||||
user_api_key = None
|
||||
|
||||
else:
|
||||
source = {}
|
||||
user_api_key = None
|
||||
|
||||
if source["active_docs"].split("/")[0] == "default" or source["active_docs"].split("/")[0] == "local":
|
||||
""" if source["active_docs"].split("/")[0] == "default" or source["active_docs"].split("/")[0] == "local":
|
||||
retriever_name = "classic"
|
||||
else:
|
||||
retriever_name = source["active_docs"]
|
||||
retriever_name = source["active_docs"] """
|
||||
|
||||
prompt = get_prompt(prompt_id)
|
||||
|
||||
|
||||
retriever = RetrieverCreator.create_retriever(
|
||||
retriever_name,
|
||||
question=question,
|
||||
@@ -341,6 +334,9 @@ def api_answer():
|
||||
else:
|
||||
token_limit = settings.DEFAULT_MAX_HISTORY
|
||||
|
||||
## retriever can be brave_search, duckduck_search or classic
|
||||
retriever_name = data["retriever"] if "retriever" in data else "classic"
|
||||
|
||||
# use try and except to check for exception
|
||||
try:
|
||||
# check if the vectorstore is set
|
||||
@@ -348,16 +344,16 @@ def api_answer():
|
||||
data_key = get_data_from_api_key(data["api_key"])
|
||||
chunks = int(data_key["chunks"])
|
||||
prompt_id = data_key["prompt_id"]
|
||||
source = data_key["source"]
|
||||
source = {"active_docs": data_key["source"]}
|
||||
retriever_name = data_key["retriever"] or retriever_name
|
||||
user_api_key = data["api_key"]
|
||||
else:
|
||||
source = get_source(data["active_docs"])
|
||||
elif "active_docs" in data:
|
||||
source = {"active_docs":data["active_docs"]}
|
||||
retriever_name = get_retriever(data["active_docs"]) or retriever_name
|
||||
user_api_key = None
|
||||
|
||||
if source["active_docs"].split("/")[0] == "default" or source["active_docs"].split("/")[0] == "local":
|
||||
retriever_name = "classic"
|
||||
else:
|
||||
retriever_name = source["active_docs"]
|
||||
source = {}
|
||||
user_api_key = None
|
||||
|
||||
prompt = get_prompt(prompt_id)
|
||||
|
||||
@@ -407,19 +403,19 @@ def api_search():
|
||||
if "api_key" in data:
|
||||
data_key = get_data_from_api_key(data["api_key"])
|
||||
chunks = int(data_key["chunks"])
|
||||
source = data_key["source"]
|
||||
source = {"active_docs":data_key["source"]}
|
||||
user_api_key = data_key["api_key"]
|
||||
elif "active_docs" in data:
|
||||
source = get_source(data["active_docs"])
|
||||
source = {"active_docs":data["active_docs"]}
|
||||
user_api_key = None
|
||||
else:
|
||||
source = {}
|
||||
user_api_key = None
|
||||
|
||||
if source["active_docs"].split("/")[0] == "default" or source["active_docs"].split("/")[0] == "local":
|
||||
retriever_name = "classic"
|
||||
if "retriever" in data:
|
||||
retriever_name = data["retriever"]
|
||||
else:
|
||||
retriever_name = source["active_docs"]
|
||||
retriever_name = "classic"
|
||||
if "token_limit" in data:
|
||||
token_limit = data["token_limit"]
|
||||
else:
|
||||
|
||||
@@ -3,13 +3,13 @@ import datetime
|
||||
from flask import Blueprint, request, send_from_directory
|
||||
from pymongo import MongoClient
|
||||
from werkzeug.utils import secure_filename
|
||||
|
||||
from bson.objectid import ObjectId
|
||||
|
||||
from application.core.settings import settings
|
||||
mongo = MongoClient(settings.MONGO_URI)
|
||||
db = mongo["docsgpt"]
|
||||
conversations_collection = db["conversations"]
|
||||
vectors_collection = db["vectors"]
|
||||
sources_collection = db["sources"]
|
||||
|
||||
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
@@ -35,7 +35,12 @@ def upload_index_files():
|
||||
return {"status": "no name"}
|
||||
job_name = secure_filename(request.form["name"])
|
||||
tokens = secure_filename(request.form["tokens"])
|
||||
save_dir = os.path.join(current_dir, "indexes", user, job_name)
|
||||
retriever = secure_filename(request.form["retriever"])
|
||||
id = secure_filename(request.form["id"])
|
||||
type = secure_filename(request.form["type"])
|
||||
remote_data = secure_filename(request.form["remote_data"]) if "remote_data" in request.form else None
|
||||
|
||||
save_dir = os.path.join(current_dir, "indexes", str(id))
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
if "file_faiss" not in request.files:
|
||||
print("No file part")
|
||||
@@ -55,17 +60,19 @@ def upload_index_files():
|
||||
os.makedirs(save_dir)
|
||||
file_faiss.save(os.path.join(save_dir, "index.faiss"))
|
||||
file_pkl.save(os.path.join(save_dir, "index.pkl"))
|
||||
# create entry in vectors_collection
|
||||
vectors_collection.insert_one(
|
||||
# create entry in sources_collection
|
||||
sources_collection.insert_one(
|
||||
{
|
||||
"_id": ObjectId(id),
|
||||
"user": user,
|
||||
"name": job_name,
|
||||
"language": job_name,
|
||||
"location": save_dir,
|
||||
"date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"type": "local",
|
||||
"tokens": tokens
|
||||
"type": type,
|
||||
"tokens": tokens,
|
||||
"retriever": retriever,
|
||||
"remote_data": remote_data
|
||||
}
|
||||
)
|
||||
return {"status": "ok"}
|
||||
@@ -2,8 +2,6 @@ import os
|
||||
import uuid
|
||||
import shutil
|
||||
from flask import Blueprint, request, jsonify
|
||||
from urllib.parse import urlparse
|
||||
import requests
|
||||
from pymongo import MongoClient
|
||||
from bson.objectid import ObjectId
|
||||
from bson.binary import Binary, UuidRepresentation
|
||||
@@ -17,7 +15,7 @@ from application.vectorstore.vector_creator import VectorCreator
|
||||
mongo = MongoClient(settings.MONGO_URI)
|
||||
db = mongo["docsgpt"]
|
||||
conversations_collection = db["conversations"]
|
||||
vectors_collection = db["vectors"]
|
||||
sources_collection = db["sources"]
|
||||
prompts_collection = db["prompts"]
|
||||
feedback_collection = db["feedback"]
|
||||
api_key_collection = db["api_keys"]
|
||||
@@ -25,9 +23,7 @@ shared_conversations_collections = db["shared_conversations"]
|
||||
|
||||
user = Blueprint("user", __name__)
|
||||
|
||||
current_dir = os.path.dirname(
|
||||
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
)
|
||||
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
|
||||
@user.route("/api/delete_conversation", methods=["POST"])
|
||||
@@ -57,9 +53,7 @@ def get_conversations():
|
||||
conversations = conversations_collection.find().sort("date", -1).limit(30)
|
||||
list_conversations = []
|
||||
for conversation in conversations:
|
||||
list_conversations.append(
|
||||
{"id": str(conversation["_id"]), "name": conversation["name"]}
|
||||
)
|
||||
list_conversations.append({"id": str(conversation["_id"]), "name": conversation["name"]})
|
||||
|
||||
# list_conversations = [{"id": "default", "name": "default"}, {"id": "jeff", "name": "jeff"}]
|
||||
|
||||
@@ -110,7 +104,7 @@ def delete_by_ids():
|
||||
return {"status": "error"}
|
||||
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
result = vectors_collection.delete_index(ids=ids)
|
||||
result = sources_collection.delete_index(ids=ids)
|
||||
if result:
|
||||
return {"status": "ok"}
|
||||
return {"status": "error"}
|
||||
@@ -120,28 +114,24 @@ def delete_by_ids():
|
||||
def delete_old():
    """Delete a stored source together with its index data.

    Reads ``source_id`` from the query string and looks the source up in
    ``sources_collection`` for user ``"local"``. For the faiss vector store
    the on-disk index directory is removed; for any other store the store's
    ``delete_index`` is called. The source document is deleted last.

    Returns:
        ``{"status": "ok"}`` on success, or ``({"status": "not found"}, 404)``
        when ``source_id`` is missing, malformed, or matches no document.
    """
    import shutil

    source_id = request.args.get("source_id")
    # ObjectId() raises on a missing or malformed id; report that as
    # "not found" rather than letting it surface as a server error.
    if not ObjectId.is_valid(source_id):
        return {"status": "not found"}, 404
    source_oid = ObjectId(source_id)

    doc = sources_collection.find_one({"_id": source_oid, "user": "local"})
    if doc is None:
        return {"status": "not found"}, 404

    if settings.VECTOR_STORE == "faiss":
        # Faiss indexes are saved under <current_dir>/indexes/<source id>,
        # so the "indexes" segment is required to hit the real directory.
        index_dir = os.path.join(current_dir, "indexes", str(doc["_id"]))
        try:
            shutil.rmtree(index_dir)
        except FileNotFoundError:
            # Nothing on disk for this source; still remove the DB entry.
            pass
    else:
        vectorstore = VectorCreator.create_vectorstore(
            settings.VECTOR_STORE, source_id=str(doc["_id"])
        )
        vectorstore.delete_index()

    sources_collection.delete_one({"_id": source_oid})

    return {"status": "ok"}
|
||||
|
||||
@@ -175,9 +165,7 @@ def upload_file():
|
||||
file.save(os.path.join(temp_dir, filename))
|
||||
|
||||
# Use shutil.make_archive to zip the temp directory
|
||||
zip_path = shutil.make_archive(
|
||||
base_name=os.path.join(save_dir, job_name), format="zip", root_dir=temp_dir
|
||||
)
|
||||
zip_path = shutil.make_archive(base_name=os.path.join(save_dir, job_name), format="zip", root_dir=temp_dir)
|
||||
final_filename = os.path.basename(zip_path)
|
||||
|
||||
# Clean up the temporary directory after zipping
|
||||
@@ -219,9 +207,7 @@ def upload_remote():
|
||||
source_data = request.form["data"]
|
||||
|
||||
if source_data:
|
||||
task = ingest_remote.delay(
|
||||
source_data=source_data, job_name=job_name, user=user, loader=source
|
||||
)
|
||||
task = ingest_remote.delay(source_data=source_data, job_name=job_name, user=user, loader=source)
|
||||
task_id = task.id
|
||||
return {"status": "ok", "task_id": task_id}
|
||||
else:
|
||||
@@ -248,55 +234,36 @@ def combined_json():
|
||||
data = [
|
||||
{
|
||||
"name": "default",
|
||||
"language": "default",
|
||||
"version": "",
|
||||
"description": "default",
|
||||
"fullName": "default",
|
||||
"date": "default",
|
||||
"docLink": "default",
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "remote",
|
||||
"tokens": "",
|
||||
"retriever": "classic",
|
||||
}
|
||||
]
|
||||
# structure: name, language, version, description, fullName, date, docLink
|
||||
# append data from vectors_collection in sorted order in descending order of date
|
||||
for index in vectors_collection.find({"user": user}).sort("date", -1):
|
||||
# append data from sources_collection in sorted order in descending order of date
|
||||
for index in sources_collection.find({"user": user}).sort("date", -1):
|
||||
data.append(
|
||||
{
|
||||
"id":str(index["_id"]),
|
||||
"id": str(index["_id"]),
|
||||
"name": index["name"],
|
||||
"language": index["language"],
|
||||
"version": "",
|
||||
"description": index["name"],
|
||||
"fullName": index["name"],
|
||||
"date": index["date"],
|
||||
"docLink": index["location"],
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "local",
|
||||
"tokens": index["tokens"] if ("tokens" in index.keys()) else "",
|
||||
"retriever": index["retriever"] if ("retriever" in index.keys()) else "classic",
|
||||
}
|
||||
)
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
data_remote = requests.get(
|
||||
"https://d3dg1063dc54p9.cloudfront.net/combined.json"
|
||||
).json()
|
||||
for index in data_remote:
|
||||
index["location"] = "remote"
|
||||
data.append(index)
|
||||
if "duckduck_search" in settings.RETRIEVERS_ENABLED:
|
||||
data.append(
|
||||
{
|
||||
"name": "DuckDuckGo Search",
|
||||
"language": "en",
|
||||
"version": "",
|
||||
"description": "duckduck_search",
|
||||
"fullName": "DuckDuckGo Search",
|
||||
"date": "duckduck_search",
|
||||
"docLink": "duckduck_search",
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "custom",
|
||||
"tokens": "",
|
||||
"retriever": "duckduck_search",
|
||||
}
|
||||
)
|
||||
if "brave_search" in settings.RETRIEVERS_ENABLED:
|
||||
@@ -304,14 +271,11 @@ def combined_json():
|
||||
{
|
||||
"name": "Brave Search",
|
||||
"language": "en",
|
||||
"version": "",
|
||||
"description": "brave_search",
|
||||
"fullName": "Brave Search",
|
||||
"date": "brave_search",
|
||||
"docLink": "brave_search",
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "custom",
|
||||
"tokens": "",
|
||||
"retriever": "brave_search",
|
||||
}
|
||||
)
|
||||
|
||||
@@ -320,39 +284,13 @@ def combined_json():
|
||||
|
||||
@user.route("/api/docs_check", methods=["POST"])
|
||||
def check_docs():
|
||||
# check if docs exist in a vectorstore folder
|
||||
data = request.get_json()
|
||||
# split docs on / and take first part
|
||||
if data["docs"].split("/")[0] == "local":
|
||||
return {"status": "exists"}
|
||||
|
||||
vectorstore = "vectors/" + secure_filename(data["docs"])
|
||||
base_path = "https://raw.githubusercontent.com/arc53/DocsHUB/main/"
|
||||
if os.path.exists(vectorstore) or data["docs"] == "default":
|
||||
return {"status": "exists"}
|
||||
else:
|
||||
file_url = urlparse(base_path + vectorstore + "index.faiss")
|
||||
|
||||
if (
|
||||
file_url.scheme in ["https"]
|
||||
and file_url.netloc == "raw.githubusercontent.com"
|
||||
and file_url.path.startswith("/arc53/DocsHUB/main/")
|
||||
):
|
||||
r = requests.get(file_url.geturl())
|
||||
if r.status_code != 200:
|
||||
return {"status": "null"}
|
||||
else:
|
||||
if not os.path.exists(vectorstore):
|
||||
os.makedirs(vectorstore)
|
||||
with open(vectorstore + "index.faiss", "wb") as f:
|
||||
f.write(r.content)
|
||||
|
||||
r = requests.get(base_path + vectorstore + "index.pkl")
|
||||
with open(vectorstore + "index.pkl", "wb") as f:
|
||||
f.write(r.content)
|
||||
else:
|
||||
return {"status": "null"}
|
||||
|
||||
return {"status": "loaded"}
|
||||
return {"status": "not found"}
|
||||
|
||||
|
||||
@user.route("/api/create_prompt", methods=["POST"])
|
||||
@@ -383,9 +321,7 @@ def get_prompts():
|
||||
list_prompts.append({"id": "creative", "name": "creative", "type": "public"})
|
||||
list_prompts.append({"id": "strict", "name": "strict", "type": "public"})
|
||||
for prompt in prompts:
|
||||
list_prompts.append(
|
||||
{"id": str(prompt["_id"]), "name": prompt["name"], "type": "private"}
|
||||
)
|
||||
list_prompts.append({"id": str(prompt["_id"]), "name": prompt["name"], "type": "private"})
|
||||
|
||||
return jsonify(list_prompts)
|
||||
|
||||
@@ -394,21 +330,15 @@ def get_prompts():
|
||||
def get_single_prompt():
|
||||
prompt_id = request.args.get("id")
|
||||
if prompt_id == "default":
|
||||
with open(
|
||||
os.path.join(current_dir, "prompts", "chat_combine_default.txt"), "r"
|
||||
) as f:
|
||||
with open(os.path.join(current_dir, "prompts", "chat_combine_default.txt"), "r") as f:
|
||||
chat_combine_template = f.read()
|
||||
return jsonify({"content": chat_combine_template})
|
||||
elif prompt_id == "creative":
|
||||
with open(
|
||||
os.path.join(current_dir, "prompts", "chat_combine_creative.txt"), "r"
|
||||
) as f:
|
||||
with open(os.path.join(current_dir, "prompts", "chat_combine_creative.txt"), "r") as f:
|
||||
chat_reduce_creative = f.read()
|
||||
return jsonify({"content": chat_reduce_creative})
|
||||
elif prompt_id == "strict":
|
||||
with open(
|
||||
os.path.join(current_dir, "prompts", "chat_combine_strict.txt"), "r"
|
||||
) as f:
|
||||
with open(os.path.join(current_dir, "prompts", "chat_combine_strict.txt"), "r") as f:
|
||||
chat_reduce_strict = f.read()
|
||||
return jsonify({"content": chat_reduce_strict})
|
||||
|
||||
@@ -437,9 +367,7 @@ def update_prompt_name():
|
||||
# check if name is null
|
||||
if name == "":
|
||||
return {"status": "error"}
|
||||
prompts_collection.update_one(
|
||||
{"_id": ObjectId(id)}, {"$set": {"name": name, "content": content}}
|
||||
)
|
||||
prompts_collection.update_one({"_id": ObjectId(id)}, {"$set": {"name": name, "content": content}})
|
||||
return {"status": "ok"}
|
||||
|
||||
|
||||
@@ -449,12 +377,23 @@ def get_api_keys():
|
||||
keys = api_key_collection.find({"user": user})
|
||||
list_keys = []
|
||||
for key in keys:
|
||||
if "source" in key and isinstance(key["source"],DBRef):
|
||||
source = db.dereference(key["source"])
|
||||
if source is None:
|
||||
continue
|
||||
else:
|
||||
source_name = source["name"]
|
||||
elif "retriever" in key:
|
||||
source_name = key["retriever"]
|
||||
else:
|
||||
continue
|
||||
|
||||
list_keys.append(
|
||||
{
|
||||
"id": str(key["_id"]),
|
||||
"name": key["name"],
|
||||
"key": key["key"][:4] + "..." + key["key"][-4:],
|
||||
"source": str(key["source"]),
|
||||
"source": source_name,
|
||||
"prompt_id": key["prompt_id"],
|
||||
"chunks": key["chunks"],
|
||||
}
|
||||
@@ -466,23 +405,22 @@ def get_api_keys():
|
||||
def create_api_key():
|
||||
data = request.get_json()
|
||||
name = data["name"]
|
||||
source = data["source"]
|
||||
prompt_id = data["prompt_id"]
|
||||
chunks = data["chunks"]
|
||||
key = str(uuid.uuid4())
|
||||
user = "local"
|
||||
if(ObjectId.is_valid(data["source"])):
|
||||
source = DBRef("vectors",ObjectId(data["source"]))
|
||||
resp = api_key_collection.insert_one(
|
||||
{
|
||||
"name": name,
|
||||
"key": key,
|
||||
"source": source,
|
||||
"user": user,
|
||||
"prompt_id": prompt_id,
|
||||
"chunks": chunks,
|
||||
}
|
||||
)
|
||||
new_api_key = {
|
||||
"name": name,
|
||||
"key": key,
|
||||
"user": user,
|
||||
"prompt_id": prompt_id,
|
||||
"chunks": chunks,
|
||||
}
|
||||
if "source" in data and ObjectId.is_valid(data["source"]):
|
||||
new_api_key["source"] = DBRef("sources", ObjectId(data["source"]))
|
||||
if "retriever" in data:
|
||||
new_api_key["retriever"] = data["retriever"]
|
||||
resp = api_key_collection.insert_one(new_api_key)
|
||||
new_id = str(resp.inserted_id)
|
||||
return {"id": new_id, "key": key}
|
||||
|
||||
@@ -509,36 +447,37 @@ def share_conversation():
|
||||
conversation_id = data["conversation_id"]
|
||||
isPromptable = request.args.get("isPromptable").lower() == "true"
|
||||
|
||||
conversation = conversations_collection.find_one(
|
||||
{"_id": ObjectId(conversation_id)}
|
||||
)
|
||||
conversation = conversations_collection.find_one({"_id": ObjectId(conversation_id)})
|
||||
if(conversation is None):
|
||||
raise Exception("Conversation does not exist")
|
||||
current_n_queries = len(conversation["queries"])
|
||||
|
||||
##generate binary representation of uuid
|
||||
explicit_binary = Binary.from_uuid(uuid.uuid4(), UuidRepresentation.STANDARD)
|
||||
|
||||
if isPromptable:
|
||||
source = "default" if "source" not in data else data["source"]
|
||||
prompt_id = "default" if "prompt_id" not in data else data["prompt_id"]
|
||||
chunks = "2" if "chunks" not in data else data["chunks"]
|
||||
|
||||
name = conversation["name"] + "(shared)"
|
||||
pre_existing_api_document = api_key_collection.find_one(
|
||||
{
|
||||
new_api_key_data = {
|
||||
"prompt_id": prompt_id,
|
||||
"chunks": chunks,
|
||||
"source": DBRef("vectors",ObjectId(source)) if ObjectId.is_valid(source) else source,
|
||||
"user": user,
|
||||
}
|
||||
if "source" in data and ObjectId.is_valid(data["source"]):
|
||||
new_api_key_data["source"] = DBRef("sources",ObjectId(data["source"]))
|
||||
elif "retriever" in data:
|
||||
new_api_key_data["retriever"] = data["retriever"]
|
||||
|
||||
pre_existing_api_document = api_key_collection.find_one(
|
||||
new_api_key_data
|
||||
)
|
||||
api_uuid = str(uuid.uuid4())
|
||||
if pre_existing_api_document:
|
||||
api_uuid = pre_existing_api_document["key"]
|
||||
pre_existing = shared_conversations_collections.find_one(
|
||||
{
|
||||
"conversation_id": DBRef(
|
||||
"conversations", ObjectId(conversation_id)
|
||||
),
|
||||
"conversation_id": DBRef("conversations", ObjectId(conversation_id)),
|
||||
"isPromptable": isPromptable,
|
||||
"first_n_queries": current_n_queries,
|
||||
"user": user,
|
||||
@@ -569,21 +508,18 @@ def share_conversation():
|
||||
"api_key": api_uuid,
|
||||
}
|
||||
)
|
||||
return jsonify(
|
||||
{"success": True, "identifier": str(explicit_binary.as_uuid())}
|
||||
)
|
||||
return jsonify({"success": True, "identifier": str(explicit_binary.as_uuid())})
|
||||
else:
|
||||
api_key_collection.insert_one(
|
||||
{
|
||||
"name": name,
|
||||
"key": api_uuid,
|
||||
"source": DBRef("vectors",ObjectId(source)) if ObjectId.is_valid(source) else source,
|
||||
"user": user,
|
||||
"prompt_id": prompt_id,
|
||||
"chunks": chunks,
|
||||
}
|
||||
)
|
||||
shared_conversations_collections.insert_one(
|
||||
|
||||
api_uuid = str(uuid.uuid4())
|
||||
new_api_key_data["key"] = api_uuid
|
||||
new_api_key_data["name"] = name
|
||||
if "source" in data and ObjectId.is_valid(data["source"]):
|
||||
new_api_key_data["source"] = DBRef("sources", ObjectId(data["source"]))
|
||||
if "retriever" in data:
|
||||
new_api_key_data["retriever"] = data["retriever"]
|
||||
api_key_collection.insert_one(new_api_key_data)
|
||||
shared_conversations_collections.insert_one(
|
||||
{
|
||||
"uuid": explicit_binary,
|
||||
"conversation_id": {
|
||||
@@ -595,12 +531,10 @@ def share_conversation():
|
||||
"user": user,
|
||||
"api_key": api_uuid,
|
||||
}
|
||||
)
|
||||
)
|
||||
## Identifier as route parameter in frontend
|
||||
return (
|
||||
jsonify(
|
||||
{"success": True, "identifier": str(explicit_binary.as_uuid())}
|
||||
),
|
||||
jsonify({"success": True, "identifier": str(explicit_binary.as_uuid())}),
|
||||
201,
|
||||
)
|
||||
|
||||
@@ -615,9 +549,7 @@ def share_conversation():
|
||||
)
|
||||
if pre_existing is not None:
|
||||
return (
|
||||
jsonify(
|
||||
{"success": True, "identifier": str(pre_existing["uuid"].as_uuid())}
|
||||
),
|
||||
jsonify({"success": True, "identifier": str(pre_existing["uuid"].as_uuid())}),
|
||||
200,
|
||||
)
|
||||
else:
|
||||
@@ -635,9 +567,7 @@ def share_conversation():
|
||||
)
|
||||
## Identifier as route parameter in frontend
|
||||
return (
|
||||
jsonify(
|
||||
{"success": True, "identifier": str(explicit_binary.as_uuid())}
|
||||
),
|
||||
jsonify({"success": True, "identifier": str(explicit_binary.as_uuid())}),
|
||||
201,
|
||||
)
|
||||
except Exception as err:
|
||||
@@ -649,16 +579,10 @@ def share_conversation():
|
||||
@user.route("/api/shared_conversation/<string:identifier>", methods=["GET"])
|
||||
def get_publicly_shared_conversations(identifier: str):
|
||||
try:
|
||||
query_uuid = Binary.from_uuid(
|
||||
uuid.UUID(identifier), UuidRepresentation.STANDARD
|
||||
)
|
||||
query_uuid = Binary.from_uuid(uuid.UUID(identifier), UuidRepresentation.STANDARD)
|
||||
shared = shared_conversations_collections.find_one({"uuid": query_uuid})
|
||||
conversation_queries = []
|
||||
if (
|
||||
shared
|
||||
and "conversation_id" in shared
|
||||
and isinstance(shared["conversation_id"], DBRef)
|
||||
):
|
||||
if shared and "conversation_id" in shared and isinstance(shared["conversation_id"], DBRef):
|
||||
# Resolve the DBRef
|
||||
conversation_ref = shared["conversation_id"]
|
||||
conversation = db.dereference(conversation_ref)
|
||||
@@ -672,9 +596,7 @@ def get_publicly_shared_conversations(identifier: str):
|
||||
),
|
||||
404,
|
||||
)
|
||||
conversation_queries = conversation["queries"][
|
||||
: (shared["first_n_queries"])
|
||||
]
|
||||
conversation_queries = conversation["queries"][: (shared["first_n_queries"])]
|
||||
for query in conversation_queries:
|
||||
query.pop("sources") ## avoid exposing sources
|
||||
else:
|
||||
|
||||
@@ -11,12 +11,14 @@ from retry import retry
|
||||
|
||||
|
||||
@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i, id):
    """Add one document to ``store``, tagging it with its source id.

    ``i`` is a document exposing ``page_content`` and a mutable ``metadata``
    mapping; ``str(id)`` is written to ``metadata["source_id"]`` before the
    text is added. The ``retry`` decorator re-invokes the call on failure.
    """
    # Tag the chunk so it can later be filtered by its source.
    i.metadata["source_id"] = str(id)
    texts = [i.page_content]
    metadatas = [i.metadata]
    store.add_texts(texts, metadatas=metadatas)
|
||||
|
||||
|
||||
def call_openai_api(docs, folder_name, task_status):
|
||||
def call_openai_api(docs, folder_name, id, task_status):
|
||||
# Function to create a vector store from the documents and save it to disk
|
||||
|
||||
if not os.path.exists(f"{folder_name}"):
|
||||
@@ -32,13 +34,13 @@ def call_openai_api(docs, folder_name, task_status):
|
||||
store = VectorCreator.create_vectorstore(
|
||||
settings.VECTOR_STORE,
|
||||
docs_init=docs_init,
|
||||
path=f"{folder_name}",
|
||||
source_id=f"{folder_name}",
|
||||
embeddings_key=os.getenv("EMBEDDINGS_KEY"),
|
||||
)
|
||||
else:
|
||||
store = VectorCreator.create_vectorstore(
|
||||
settings.VECTOR_STORE,
|
||||
path=f"{folder_name}",
|
||||
source_id=str(id),
|
||||
embeddings_key=os.getenv("EMBEDDINGS_KEY"),
|
||||
)
|
||||
# Uncomment for MPNet embeddings
|
||||
@@ -57,7 +59,7 @@ def call_openai_api(docs, folder_name, task_status):
|
||||
task_status.update_state(
|
||||
state="PROGRESS", meta={"current": int((c1 / s1) * 100)}
|
||||
)
|
||||
store_add_texts_with_retry(store, i)
|
||||
store_add_texts_with_retry(store, i, id)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
print("Error on ", i)
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import os
|
||||
from application.retriever.base import BaseRetriever
|
||||
from application.core.settings import settings
|
||||
from application.vectorstore.vector_creator import VectorCreator
|
||||
@@ -21,7 +20,7 @@ class ClassicRAG(BaseRetriever):
|
||||
user_api_key=None,
|
||||
):
|
||||
self.question = question
|
||||
self.vectorstore = self._get_vectorstore(source=source)
|
||||
self.vectorstore = source['active_docs'] if 'active_docs' in source else None
|
||||
self.chat_history = chat_history
|
||||
self.prompt = prompt
|
||||
self.chunks = chunks
|
||||
@@ -38,21 +37,6 @@ class ClassicRAG(BaseRetriever):
|
||||
)
|
||||
self.user_api_key = user_api_key
|
||||
|
||||
def _get_vectorstore(self, source):
|
||||
if "active_docs" in source:
|
||||
if source["active_docs"].split("/")[0] == "default":
|
||||
vectorstore = ""
|
||||
elif source["active_docs"].split("/")[0] == "local":
|
||||
vectorstore = "indexes/" + source["active_docs"]
|
||||
else:
|
||||
vectorstore = "vectors/" + source["active_docs"]
|
||||
if source["active_docs"] == "default":
|
||||
vectorstore = ""
|
||||
else:
|
||||
vectorstore = ""
|
||||
vectorstore = os.path.join("application", vectorstore)
|
||||
return vectorstore
|
||||
|
||||
def _get_data(self):
|
||||
if self.chunks == 0:
|
||||
docs = []
|
||||
|
||||
@@ -9,9 +9,9 @@ import elasticsearch
|
||||
class ElasticsearchStore(BaseVectorStore):
|
||||
_es_connection = None # Class attribute to hold the Elasticsearch connection
|
||||
|
||||
def __init__(self, path, embeddings_key, index_name=settings.ELASTIC_INDEX):
|
||||
def __init__(self, source_id, embeddings_key, index_name=settings.ELASTIC_INDEX):
|
||||
super().__init__()
|
||||
self.path = path.replace("application/indexes/", "").rstrip("/")
|
||||
self.source_id = source_id.replace("application/indexes/", "").rstrip("/")
|
||||
self.embeddings_key = embeddings_key
|
||||
self.index_name = index_name
|
||||
|
||||
@@ -81,7 +81,7 @@ class ElasticsearchStore(BaseVectorStore):
|
||||
embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
|
||||
vector = embeddings.embed_query(question)
|
||||
knn = {
|
||||
"filter": [{"match": {"metadata.store.keyword": self.path}}],
|
||||
"filter": [{"match": {"metadata.source_id.keyword": self.source_id}}],
|
||||
"field": "vector",
|
||||
"k": k,
|
||||
"num_candidates": 100,
|
||||
@@ -100,7 +100,7 @@ class ElasticsearchStore(BaseVectorStore):
|
||||
}
|
||||
}
|
||||
],
|
||||
"filter": [{"match": {"metadata.store.keyword": self.path}}],
|
||||
"filter": [{"match": {"metadata.source_id.keyword": self.source_id}}],
|
||||
}
|
||||
},
|
||||
"rank": {"rrf": {}},
|
||||
@@ -209,5 +209,4 @@ class ElasticsearchStore(BaseVectorStore):
|
||||
|
||||
def delete_index(self):
|
||||
self._es_connection.delete_by_query(index=self.index_name, query={"match": {
|
||||
"metadata.store.keyword": self.path}},)
|
||||
|
||||
"metadata.source_id.keyword": self.source_id}},)
|
||||
|
||||
@@ -1,12 +1,22 @@
|
||||
from langchain_community.vectorstores import FAISS
|
||||
from application.vectorstore.base import BaseVectorStore
|
||||
from application.core.settings import settings
|
||||
import os
|
||||
|
||||
def get_vectorstore(path):
    """Build the on-disk location used for a FAISS index.

    A truthy ``path`` maps to ``application/indexes/<path>``; an empty or
    ``None`` value maps to the bare ``application`` directory.
    """
    if not path:
        return os.path.join("application")
    return os.path.join("application", "indexes/" + path)
|
||||
|
||||
class FaissStore(BaseVectorStore):
|
||||
|
||||
def __init__(self, path, embeddings_key, docs_init=None):
|
||||
def __init__(self, source_id, embeddings_key, docs_init=None):
|
||||
super().__init__()
|
||||
self.path = path
|
||||
self.path = get_vectorstore(source_id)
|
||||
embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
|
||||
if docs_init:
|
||||
self.docsearch = FAISS.from_documents(
|
||||
|
||||
@@ -5,7 +5,7 @@ from application.vectorstore.document_class import Document
|
||||
class MongoDBVectorStore(BaseVectorStore):
|
||||
def __init__(
|
||||
self,
|
||||
path: str = "",
|
||||
source_id: str = "",
|
||||
embeddings_key: str = "embeddings",
|
||||
collection: str = "documents",
|
||||
index_name: str = "vector_search_index",
|
||||
@@ -18,7 +18,7 @@ class MongoDBVectorStore(BaseVectorStore):
|
||||
self._embedding_key = embedding_key
|
||||
self._embeddings_key = embeddings_key
|
||||
self._mongo_uri = settings.MONGO_URI
|
||||
self._path = path.replace("application/indexes/", "").rstrip("/")
|
||||
self._source_id = source_id.replace("application/indexes/", "").rstrip("/")
|
||||
self._embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
|
||||
|
||||
try:
|
||||
@@ -46,7 +46,7 @@ class MongoDBVectorStore(BaseVectorStore):
|
||||
"numCandidates": k * 10,
|
||||
"index": self._index_name,
|
||||
"filter": {
|
||||
"store": {"$eq": self._path}
|
||||
"source_id": {"$eq": self._source_id}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -123,4 +123,4 @@ class MongoDBVectorStore(BaseVectorStore):
|
||||
return result_ids
|
||||
|
||||
def delete_index(self, *args, **kwargs):
|
||||
self._collection.delete_many({"store": self._path})
|
||||
self._collection.delete_many({"source_id": self._source_id})
|
||||
@@ -5,12 +5,12 @@ from qdrant_client import models
|
||||
|
||||
|
||||
class QdrantStore(BaseVectorStore):
|
||||
def __init__(self, path: str = "", embeddings_key: str = "embeddings"):
|
||||
def __init__(self, source_id: str = "", embeddings_key: str = "embeddings"):
|
||||
self._filter = models.Filter(
|
||||
must=[
|
||||
models.FieldCondition(
|
||||
key="metadata.store",
|
||||
match=models.MatchValue(value=path.replace("application/indexes/", "").rstrip("/")),
|
||||
key="metadata.source_id",
|
||||
match=models.MatchValue(value=source_id.replace("application/indexes/", "").rstrip("/")),
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
@@ -6,6 +6,7 @@ import tiktoken
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
from bson.objectid import ObjectId
|
||||
|
||||
from application.core.settings import settings
|
||||
from application.parser.file.bulk import SimpleDirectoryReader
|
||||
@@ -14,10 +15,10 @@ from application.parser.open_ai_func import call_openai_api
|
||||
from application.parser.schema.base import Document
|
||||
from application.parser.token_func import group_split
|
||||
|
||||
|
||||
# Define a function to extract metadata from a given filename.
|
||||
def metadata_from_filename(title):
|
||||
store = "/".join(title.split("/")[1:3])
|
||||
return {"title": title, "store": store}
|
||||
return {"title": title}
|
||||
|
||||
|
||||
# Define a function to generate a random string of a given length.
|
||||
@@ -25,9 +26,7 @@ def generate_random_string(length):
|
||||
return "".join([string.ascii_letters[i % 52] for i in range(length)])
|
||||
|
||||
|
||||
current_dir = os.path.dirname(
|
||||
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
)
|
||||
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
|
||||
def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
|
||||
@@ -58,7 +57,7 @@ def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
|
||||
|
||||
|
||||
# Define the main function for ingesting and processing documents.
|
||||
def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
def ingest_worker(self, directory, formats, name_job, filename, user, retriever="classic"):
|
||||
"""
|
||||
Ingest and process documents.
|
||||
|
||||
@@ -69,6 +68,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
name_job (str): Name of the job for this ingestion task.
|
||||
filename (str): Name of the file to be ingested.
|
||||
user (str): Identifier for the user initiating the ingestion.
|
||||
retriever (str): Type of retriever to use for processing the documents.
|
||||
|
||||
Returns:
|
||||
dict: Information about the completed ingestion task, including input parameters and a "limited" flag.
|
||||
@@ -93,9 +93,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
print(full_path, file=sys.stderr)
|
||||
# check if API_URL env variable is set
|
||||
file_data = {"name": name_job, "file": filename, "user": user}
|
||||
response = requests.get(
|
||||
urljoin(settings.API_URL, "/api/download"), params=file_data
|
||||
)
|
||||
response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data)
|
||||
# check if file is in the response
|
||||
print(response, file=sys.stderr)
|
||||
file = response.content
|
||||
@@ -107,9 +105,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
|
||||
# check if file is .zip and extract it
|
||||
if filename.endswith(".zip"):
|
||||
extract_zip_recursive(
|
||||
os.path.join(full_path, filename), full_path, 0, recursion_depth
|
||||
)
|
||||
extract_zip_recursive(os.path.join(full_path, filename), full_path, 0, recursion_depth)
|
||||
|
||||
self.update_state(state="PROGRESS", meta={"current": 1})
|
||||
|
||||
@@ -130,8 +126,9 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
)
|
||||
|
||||
docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
|
||||
id = ObjectId()
|
||||
|
||||
call_openai_api(docs, full_path, self)
|
||||
call_openai_api(docs, full_path, id, self)
|
||||
tokens = count_tokens_docs(docs)
|
||||
self.update_state(state="PROGRESS", meta={"current": 100})
|
||||
|
||||
@@ -141,22 +138,15 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
|
||||
# get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
|
||||
# and send them to the server (provide user and name in form)
|
||||
file_data = {"name": name_job, "user": user, "tokens":tokens}
|
||||
file_data = {"name": name_job, "user": user, "tokens": tokens, "retriever": retriever, "id": str(id), 'type': 'local'}
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
files = {
|
||||
"file_faiss": open(full_path + "/index.faiss", "rb"),
|
||||
"file_pkl": open(full_path + "/index.pkl", "rb"),
|
||||
}
|
||||
response = requests.post(
|
||||
urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
|
||||
)
|
||||
response = requests.get(
|
||||
urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)
|
||||
)
|
||||
response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
|
||||
else:
|
||||
response = requests.post(
|
||||
urljoin(settings.API_URL, "/api/upload_index"), data=file_data
|
||||
)
|
||||
response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
|
||||
|
||||
# delete local
|
||||
shutil.rmtree(full_path)
|
||||
@@ -171,7 +161,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
}
|
||||
|
||||
|
||||
def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
|
||||
def remote_worker(self, source_data, name_job, user, loader, directory="temp", retriever="classic"):
|
||||
token_check = True
|
||||
min_tokens = 150
|
||||
max_tokens = 1250
|
||||
@@ -191,22 +181,21 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
|
||||
token_check=token_check,
|
||||
)
|
||||
# docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
|
||||
call_openai_api(docs, full_path, self)
|
||||
tokens = count_tokens_docs(docs)
|
||||
id = ObjectId()
|
||||
call_openai_api(docs, full_path, id, self)
|
||||
self.update_state(state="PROGRESS", meta={"current": 100})
|
||||
|
||||
# Proceed with uploading and cleaning as in the original function
|
||||
file_data = {"name": name_job, "user": user, "tokens":tokens}
|
||||
file_data = {"name": name_job, "user": user, "tokens": tokens, "retriever": retriever,
|
||||
"id": str(id), 'type': loader, 'remote_data': source_data}
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
files = {
|
||||
"file_faiss": open(full_path + "/index.faiss", "rb"),
|
||||
"file_pkl": open(full_path + "/index.pkl", "rb"),
|
||||
}
|
||||
|
||||
requests.post(
|
||||
urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
|
||||
)
|
||||
requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path))
|
||||
|
||||
requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
|
||||
else:
|
||||
requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
|
||||
|
||||
@@ -222,9 +211,7 @@ def count_tokens_docs(docs):
|
||||
for doc in docs:
|
||||
docs_content += doc.page_content
|
||||
|
||||
tokens, total_price = num_tokens_from_string(
|
||||
string=docs_content, encoding_name="cl100k_base"
|
||||
)
|
||||
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
|
||||
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
|
||||
return tokens
|
||||
|
||||
@@ -234,4 +221,4 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int:
|
||||
encoding = tiktoken.get_encoding(encoding_name)
|
||||
num_tokens = len(encoding.encode(string))
|
||||
total_price = (num_tokens / 1000) * 0.0004
|
||||
return num_tokens, total_price
|
||||
return num_tokens, total_price
|
||||
|
||||
@@ -23,9 +23,9 @@ import {
|
||||
import ConversationTile from './conversation/ConversationTile';
|
||||
import { useDarkTheme, useMediaQuery, useOutsideAlerter } from './hooks';
|
||||
import DeleteConvModal from './modals/DeleteConvModal';
|
||||
import { ActiveState } from './models/misc';
|
||||
import { ActiveState, Doc } from './models/misc';
|
||||
import APIKeyModal from './preferences/APIKeyModal';
|
||||
import { Doc, getConversations, getDocs } from './preferences/preferenceApi';
|
||||
import { getConversations, getDocs } from './preferences/preferenceApi';
|
||||
import {
|
||||
selectApiKeyStatus,
|
||||
selectConversationId,
|
||||
@@ -124,10 +124,8 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
|
||||
};
|
||||
|
||||
const handleDeleteClick = (doc: Doc) => {
|
||||
const docPath = `indexes/local/${doc.name}`;
|
||||
|
||||
userService
|
||||
.deletePath(docPath)
|
||||
.deletePath(doc.id ?? '')
|
||||
.then(() => {
|
||||
return getDocs();
|
||||
})
|
||||
|
||||
@@ -10,7 +10,7 @@ const endpoints = {
|
||||
DELETE_PROMPT: '/api/delete_prompt',
|
||||
UPDATE_PROMPT: '/api/update_prompt',
|
||||
SINGLE_PROMPT: (id: string) => `/api/get_single_prompt?id=${id}`,
|
||||
DELETE_PATH: (docPath: string) => `/api/delete_old?path=${docPath}`,
|
||||
DELETE_PATH: (docPath: string) => `/api/delete_old?source_id=${docPath}`,
|
||||
TASK_STATUS: (task_id: string) => `/api/task_status?task_id=${task_id}`,
|
||||
},
|
||||
CONVERSATION: {
|
||||
|
||||
@@ -26,6 +26,7 @@ function Dropdown({
|
||||
| string
|
||||
| { label: string; value: string }
|
||||
| { value: number; description: string }
|
||||
| { name: string; id: string; type: string }
|
||||
| null;
|
||||
onSelect:
|
||||
| ((value: string) => void)
|
||||
@@ -96,6 +97,10 @@ function Dropdown({
|
||||
? selectedValue.value + ` (${selectedValue.description})`
|
||||
: selectedValue.description
|
||||
}`
|
||||
: selectedValue &&
|
||||
'name' in selectedValue &&
|
||||
'id' in selectedValue
|
||||
? `${selectedValue.name}`
|
||||
: placeholder
|
||||
? placeholder
|
||||
: 'From URL'}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import React from 'react';
|
||||
import Trash from '../assets/trash.svg';
|
||||
import Arrow2 from '../assets/dropdown-arrow.svg';
|
||||
import { Doc } from '../preferences/preferenceApi';
|
||||
import { Doc } from '../models/misc';
|
||||
import { useDispatch } from 'react-redux';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
type Props = {
|
||||
@@ -63,9 +63,6 @@ function SourceDropdown({
|
||||
<p className="max-w-3/4 truncate whitespace-nowrap">
|
||||
{selectedDocs?.name || 'None'}
|
||||
</p>
|
||||
<p className="flex flex-col items-center justify-center">
|
||||
{selectedDocs?.version}
|
||||
</p>
|
||||
</div>
|
||||
</span>
|
||||
<img
|
||||
|
||||
@@ -1,32 +1,6 @@
|
||||
import conversationService from '../api/services/conversationService';
|
||||
import { Doc } from '../preferences/preferenceApi';
|
||||
import { Answer, FEEDBACK } from './conversationModels';
|
||||
|
||||
function getDocPath(selectedDocs: Doc | null): string {
|
||||
let docPath = 'default';
|
||||
if (selectedDocs) {
|
||||
let namePath = selectedDocs.name;
|
||||
if (selectedDocs.language === namePath) {
|
||||
namePath = '.project';
|
||||
}
|
||||
if (selectedDocs.location === 'local') {
|
||||
docPath = 'local' + '/' + selectedDocs.name + '/';
|
||||
} else if (selectedDocs.location === 'remote') {
|
||||
docPath =
|
||||
selectedDocs.language +
|
||||
'/' +
|
||||
namePath +
|
||||
'/' +
|
||||
selectedDocs.version +
|
||||
'/' +
|
||||
selectedDocs.model +
|
||||
'/';
|
||||
} else if (selectedDocs.location === 'custom') {
|
||||
docPath = selectedDocs.docLink;
|
||||
}
|
||||
}
|
||||
return docPath;
|
||||
}
|
||||
import { Doc } from '../models/misc';
|
||||
import { Answer, FEEDBACK, RetrievalPayload } from './conversationModels';
|
||||
|
||||
export function handleFetchAnswer(
|
||||
question: string,
|
||||
@@ -54,23 +28,22 @@ export function handleFetchAnswer(
|
||||
title: any;
|
||||
}
|
||||
> {
|
||||
const docPath = getDocPath(selectedDocs);
|
||||
history = history.map((item) => {
|
||||
return { prompt: item.prompt, response: item.response };
|
||||
});
|
||||
const payload: RetrievalPayload = {
|
||||
question: question,
|
||||
history: JSON.stringify(history),
|
||||
conversation_id: conversationId,
|
||||
prompt_id: promptId,
|
||||
chunks: chunks,
|
||||
token_limit: token_limit,
|
||||
};
|
||||
if (selectedDocs && 'id' in selectedDocs)
|
||||
payload.active_docs = selectedDocs.id as string;
|
||||
payload.retriever = selectedDocs?.retriever as string;
|
||||
return conversationService
|
||||
.answer(
|
||||
{
|
||||
question: question,
|
||||
history: history,
|
||||
active_docs: docPath,
|
||||
conversation_id: conversationId,
|
||||
prompt_id: promptId,
|
||||
chunks: chunks,
|
||||
token_limit: token_limit,
|
||||
},
|
||||
signal,
|
||||
)
|
||||
.answer(payload, signal)
|
||||
.then((response) => {
|
||||
if (response.ok) {
|
||||
return response.json();
|
||||
@@ -101,24 +74,24 @@ export function handleFetchAnswerSteaming(
|
||||
token_limit: number,
|
||||
onEvent: (event: MessageEvent) => void,
|
||||
): Promise<Answer> {
|
||||
const docPath = getDocPath(selectedDocs);
|
||||
history = history.map((item) => {
|
||||
return { prompt: item.prompt, response: item.response };
|
||||
});
|
||||
const payload: RetrievalPayload = {
|
||||
question: question,
|
||||
history: JSON.stringify(history),
|
||||
conversation_id: conversationId,
|
||||
prompt_id: promptId,
|
||||
chunks: chunks,
|
||||
token_limit: token_limit,
|
||||
};
|
||||
if (selectedDocs && 'id' in selectedDocs)
|
||||
payload.active_docs = selectedDocs.id as string;
|
||||
payload.retriever = selectedDocs?.retriever as string;
|
||||
|
||||
return new Promise<Answer>((resolve, reject) => {
|
||||
conversationService
|
||||
.answerStream(
|
||||
{
|
||||
question: question,
|
||||
active_docs: docPath,
|
||||
history: JSON.stringify(history),
|
||||
conversation_id: conversationId,
|
||||
prompt_id: promptId,
|
||||
chunks: chunks,
|
||||
token_limit: token_limit,
|
||||
},
|
||||
signal,
|
||||
)
|
||||
.answerStream(payload, signal)
|
||||
.then((response) => {
|
||||
if (!response.body) throw Error('No response body');
|
||||
|
||||
@@ -175,16 +148,21 @@ export function handleSearch(
|
||||
chunks: string,
|
||||
token_limit: number,
|
||||
) {
|
||||
const docPath = getDocPath(selectedDocs);
|
||||
history = history.map((item) => {
|
||||
return { prompt: item.prompt, response: item.response };
|
||||
});
|
||||
const payload: RetrievalPayload = {
|
||||
question: question,
|
||||
history: JSON.stringify(history),
|
||||
conversation_id: conversation_id,
|
||||
chunks: chunks,
|
||||
token_limit: token_limit,
|
||||
};
|
||||
if (selectedDocs && 'id' in selectedDocs)
|
||||
payload.active_docs = selectedDocs.id as string;
|
||||
payload.retriever = selectedDocs?.retriever as string;
|
||||
return conversationService
|
||||
.search({
|
||||
question: question,
|
||||
active_docs: docPath,
|
||||
conversation_id,
|
||||
history,
|
||||
chunks: chunks,
|
||||
token_limit: token_limit,
|
||||
})
|
||||
.search(payload)
|
||||
.then((response) => response.json())
|
||||
.then((data) => {
|
||||
return data;
|
||||
|
||||
@@ -31,3 +31,13 @@ export interface Query {
|
||||
conversationId?: string | null;
|
||||
title?: string | null;
|
||||
}
|
||||
export interface RetrievalPayload {
|
||||
question: string;
|
||||
active_docs?: string;
|
||||
retriever?: string;
|
||||
history: string;
|
||||
conversation_id: string | null;
|
||||
prompt_id?: string | null;
|
||||
chunks: string;
|
||||
token_limit: number;
|
||||
}
|
||||
|
||||
@@ -22,8 +22,9 @@ export default function CreateAPIKeyModal({
|
||||
|
||||
const [APIKeyName, setAPIKeyName] = React.useState<string>('');
|
||||
const [sourcePath, setSourcePath] = React.useState<{
|
||||
label: string;
|
||||
value: string;
|
||||
name: string;
|
||||
id: string;
|
||||
type: string;
|
||||
} | null>(null);
|
||||
const [prompt, setPrompt] = React.useState<{
|
||||
name: string;
|
||||
@@ -41,27 +42,17 @@ export default function CreateAPIKeyModal({
|
||||
? docs
|
||||
.filter((doc) => doc.model === embeddingsName)
|
||||
.map((doc: Doc) => {
|
||||
let namePath = doc.name;
|
||||
if (doc.language === namePath) {
|
||||
namePath = '.project';
|
||||
}
|
||||
let docPath = 'default';
|
||||
if (doc.location === 'local') {
|
||||
docPath = 'local' + '/' + doc.name + '/';
|
||||
} else if (doc.location === 'remote') {
|
||||
docPath =
|
||||
doc.language +
|
||||
'/' +
|
||||
namePath +
|
||||
'/' +
|
||||
doc.version +
|
||||
'/' +
|
||||
doc.model +
|
||||
'/';
|
||||
if ('id' in doc) {
|
||||
return {
|
||||
name: doc.name,
|
||||
id: doc.id as string,
|
||||
type: 'local',
|
||||
};
|
||||
}
|
||||
return {
|
||||
label: doc.name,
|
||||
value: docPath,
|
||||
name: doc.name,
|
||||
id: doc.id ?? 'default',
|
||||
type: doc.type ?? 'default',
|
||||
};
|
||||
})
|
||||
: [];
|
||||
@@ -107,9 +98,14 @@ export default function CreateAPIKeyModal({
|
||||
<Dropdown
|
||||
placeholder={t('modals.createAPIKey.sourceDoc')}
|
||||
selectedValue={sourcePath}
|
||||
onSelect={(selection: { label: string; value: string }) =>
|
||||
setSourcePath(selection)
|
||||
}
|
||||
onSelect={(selection: {
|
||||
name: string;
|
||||
id: string;
|
||||
type: string;
|
||||
}) => {
|
||||
setSourcePath(selection);
|
||||
console.log(selection);
|
||||
}}
|
||||
options={extractDocPaths()}
|
||||
size="w-full"
|
||||
rounded="xl"
|
||||
@@ -142,16 +138,22 @@ export default function CreateAPIKeyModal({
|
||||
</div>
|
||||
<button
|
||||
disabled={!sourcePath || APIKeyName.length === 0 || !prompt}
|
||||
onClick={() =>
|
||||
sourcePath &&
|
||||
prompt &&
|
||||
createAPIKey({
|
||||
name: APIKeyName,
|
||||
source: sourcePath.value,
|
||||
prompt_id: prompt.id,
|
||||
chunks: chunk,
|
||||
})
|
||||
}
|
||||
onClick={() => {
|
||||
if (sourcePath && prompt) {
|
||||
const payload: any = {
|
||||
name: APIKeyName,
|
||||
prompt_id: prompt.id,
|
||||
chunks: chunk,
|
||||
};
|
||||
if (sourcePath.type === 'default') {
|
||||
payload.retriever = sourcePath.id;
|
||||
}
|
||||
if (sourcePath.type === 'local') {
|
||||
payload.source = sourcePath.id;
|
||||
}
|
||||
createAPIKey(payload);
|
||||
}
|
||||
}}
|
||||
className="float-right mt-4 rounded-full bg-purple-30 px-5 py-2 text-sm text-white hover:bg-[#6F3FD1] disabled:opacity-50"
|
||||
>
|
||||
{t('modals.createAPIKey.create')}
|
||||
|
||||
@@ -46,27 +46,9 @@ export const ShareConversationModal = ({
|
||||
? docs
|
||||
.filter((doc) => doc.model === embeddingsName)
|
||||
.map((doc: Doc) => {
|
||||
let namePath = doc.name;
|
||||
if (doc.language === namePath) {
|
||||
namePath = '.project';
|
||||
}
|
||||
let docPath = 'default';
|
||||
if (doc.location === 'local') {
|
||||
docPath = 'local' + '/' + doc.name + '/';
|
||||
} else if (doc.location === 'remote') {
|
||||
docPath =
|
||||
doc.language +
|
||||
'/' +
|
||||
namePath +
|
||||
'/' +
|
||||
doc.version +
|
||||
'/' +
|
||||
doc.model +
|
||||
'/';
|
||||
}
|
||||
return {
|
||||
label: doc.name,
|
||||
value: docPath,
|
||||
value: doc.id ?? 'default',
|
||||
};
|
||||
})
|
||||
: [];
|
||||
|
||||
@@ -4,16 +4,13 @@ export type User = {
|
||||
avatar: string;
|
||||
};
|
||||
export type Doc = {
|
||||
location: string;
|
||||
id?: string;
|
||||
name: string;
|
||||
language: string;
|
||||
version: string;
|
||||
description: string;
|
||||
fullName: string;
|
||||
date: string;
|
||||
docLink: string;
|
||||
model: string;
|
||||
tokens?: string;
|
||||
type?: string;
|
||||
retriever?: string;
|
||||
};
|
||||
|
||||
export type PromptProps = {
|
||||
|
||||
@@ -8,7 +8,8 @@ import {
|
||||
selectSourceDocs,
|
||||
selectSelectedDocs,
|
||||
} from './preferenceSlice';
|
||||
import { getDocs, Doc } from './preferenceApi';
|
||||
import { Doc } from '../models/misc';
|
||||
import { getDocs } from './preferenceApi';
|
||||
|
||||
export default function APIKeyModal({
|
||||
modalState,
|
||||
@@ -75,9 +76,7 @@ export default function APIKeyModal({
|
||||
{!localSelectedDocs ? (
|
||||
<p className="py-3 text-gray-500">Select</p>
|
||||
) : (
|
||||
<p className="py-3">
|
||||
{localSelectedDocs.name} {localSelectedDocs.version}
|
||||
</p>
|
||||
<p className="py-3">{localSelectedDocs.name}</p>
|
||||
)}
|
||||
</div>
|
||||
{isDocsListOpen && (
|
||||
@@ -94,9 +93,7 @@ export default function APIKeyModal({
|
||||
}}
|
||||
className="h-10 w-full cursor-pointer border-x-2 border-b-2 hover:bg-gray-100"
|
||||
>
|
||||
<p className="ml-5 py-3">
|
||||
{doc.name} {doc.version}
|
||||
</p>
|
||||
<p className="ml-5 py-3">{doc.name}</p>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1,18 +1,6 @@
|
||||
import conversationService from '../api/services/conversationService';
|
||||
import userService from '../api/services/userService';
|
||||
|
||||
// not all properties in Doc are going to be present. Make some optional
|
||||
export type Doc = {
|
||||
location: string;
|
||||
name: string;
|
||||
language: string;
|
||||
version: string;
|
||||
description: string;
|
||||
fullName: string;
|
||||
date: string;
|
||||
docLink: string;
|
||||
model: string;
|
||||
};
|
||||
import { Doc } from '../models/misc';
|
||||
|
||||
//Fetches all JSON objects from the source. We only use the objects with the "model" property in SelectDocsModal.tsx. Hopefully can clean up the source file later.
|
||||
export async function getDocs(): Promise<Doc[] | null> {
|
||||
@@ -78,17 +66,10 @@ export function setLocalPrompt(prompt: string): void {
|
||||
|
||||
export function setLocalRecentDocs(doc: Doc): void {
|
||||
localStorage.setItem('DocsGPTRecentDocs', JSON.stringify(doc));
|
||||
let namePath = doc.name;
|
||||
if (doc.language === namePath) {
|
||||
namePath = '.project';
|
||||
}
|
||||
|
||||
let docPath = 'default';
|
||||
if (doc.location === 'local') {
|
||||
if (doc.type === 'local') {
|
||||
docPath = 'local' + '/' + doc.name + '/';
|
||||
} else if (doc.location === 'remote') {
|
||||
docPath =
|
||||
doc.language + '/' + namePath + '/' + doc.version + '/' + doc.model + '/';
|
||||
}
|
||||
userService
|
||||
.checkDocs({
|
||||
|
||||
@@ -4,9 +4,9 @@ import {
|
||||
createSlice,
|
||||
isAnyOf,
|
||||
} from '@reduxjs/toolkit';
|
||||
import { Doc, setLocalApiKey, setLocalRecentDocs } from './preferenceApi';
|
||||
import { setLocalApiKey, setLocalRecentDocs } from './preferenceApi';
|
||||
import { RootState } from '../store';
|
||||
import { ActiveState } from '../models/misc';
|
||||
import { ActiveState, Doc } from '../models/misc';
|
||||
|
||||
interface Preference {
|
||||
apiKey: string;
|
||||
@@ -25,15 +25,13 @@ const initialState: Preference = {
|
||||
chunks: '2',
|
||||
token_limit: 2000,
|
||||
selectedDocs: {
|
||||
id: 'default',
|
||||
name: 'default',
|
||||
language: 'default',
|
||||
location: 'default',
|
||||
version: 'default',
|
||||
description: 'default',
|
||||
fullName: 'default',
|
||||
type: 'remote',
|
||||
date: 'default',
|
||||
docLink: 'default',
|
||||
model: 'openai_text-embedding-ada-002',
|
||||
retriever: 'classic',
|
||||
} as Doc,
|
||||
sourceDocs: null,
|
||||
conversations: null,
|
||||
|
||||
@@ -48,7 +48,8 @@ export default function APIKeys() {
|
||||
|
||||
const handleCreateKey = (payload: {
|
||||
name: string;
|
||||
source: string;
|
||||
source?: string;
|
||||
retriever?: string;
|
||||
prompt_id: string;
|
||||
chunks: string;
|
||||
}) => {
|
||||
|
||||
@@ -61,12 +61,10 @@ const Documents: React.FC<DocumentsProps> = ({
|
||||
{document.tokens ? formatTokens(+document.tokens) : ''}
|
||||
</td>
|
||||
<td className="border-r border-t px-4 py-2">
|
||||
{document.location === 'remote'
|
||||
? 'Pre-loaded'
|
||||
: 'Private'}
|
||||
{document.type === 'remote' ? 'Pre-loaded' : 'Private'}
|
||||
</td>
|
||||
<td className="border-t px-4 py-2">
|
||||
{document.location !== 'remote' && (
|
||||
{document.type !== 'remote' && (
|
||||
<img
|
||||
src={Trash}
|
||||
alt="Delete"
|
||||
|
||||
@@ -6,7 +6,7 @@ import userService from '../api/services/userService';
|
||||
import ArrowLeft from '../assets/arrow-left.svg';
|
||||
import ArrowRight from '../assets/arrow-right.svg';
|
||||
import i18n from '../locale/i18n';
|
||||
import { Doc } from '../preferences/preferenceApi';
|
||||
import { Doc } from '../models/misc';
|
||||
import {
|
||||
selectSourceDocs,
|
||||
setSourceDocs,
|
||||
@@ -35,9 +35,8 @@ export default function Settings() {
|
||||
};
|
||||
|
||||
const handleDeleteClick = (index: number, doc: Doc) => {
|
||||
const docPath = 'indexes/' + 'local' + '/' + doc.name;
|
||||
userService
|
||||
.deletePath(docPath)
|
||||
.deletePath(doc.id ?? '')
|
||||
.then((response) => {
|
||||
if (response.ok && documents) {
|
||||
const updatedDocuments = [
|
||||
|
||||
@@ -26,15 +26,12 @@ const store = configureStore({
|
||||
conversations: null,
|
||||
sourceDocs: [
|
||||
{
|
||||
location: '',
|
||||
language: '',
|
||||
name: 'default',
|
||||
version: '',
|
||||
date: '',
|
||||
description: '',
|
||||
docLink: '',
|
||||
fullName: '',
|
||||
model: '1.0',
|
||||
type: 'remote',
|
||||
id: 'default',
|
||||
retriever: 'clasic',
|
||||
},
|
||||
],
|
||||
modalState: 'INACTIVE',
|
||||
|
||||
@@ -120,7 +120,7 @@ function Upload({
|
||||
dispatch(setSourceDocs(data));
|
||||
dispatch(
|
||||
setSelectedDocs(
|
||||
data?.find((d) => d.location.toLowerCase() === 'local'),
|
||||
data?.find((d) => d.type?.toLowerCase() === 'local'),
|
||||
),
|
||||
);
|
||||
});
|
||||
@@ -137,7 +137,7 @@ function Upload({
|
||||
dispatch(setSourceDocs(data));
|
||||
dispatch(
|
||||
setSelectedDocs(
|
||||
data?.find((d) => d.location.toLowerCase() === 'local'),
|
||||
data?.find((d) => d.type?.toLowerCase() === 'local'),
|
||||
),
|
||||
);
|
||||
});
|
||||
|
||||
55
scripts/migrate_to_v1_vectorstore.py
Normal file
55
scripts/migrate_to_v1_vectorstore.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import pymongo
|
||||
import os
|
||||
|
||||
def migrate_to_v1_vectorstore_mongo(mongo_uri="mongodb://localhost:27017/"):
    """Migrate the legacy ``vectors`` collection to the ``sources`` collection.

    Each document in ``docsgpt.vectors`` loses its ``location`` field and,
    when it has no ``retriever`` yet, gains ``retriever="classic"`` and
    ``remote_data=None``. All documents are then copied into
    ``docsgpt.sources`` (keeping their ``_id``) and ``vectors`` is dropped.

    Args:
        mongo_uri: Connection string of the MongoDB deployment to migrate.
    """
    # The context manager closes the client even if a step below raises.
    with pymongo.MongoClient(mongo_uri) as client:
        db = client["docsgpt"]
        vectors_collection = db["vectors"]
        sources_collection = db["sources"]

        for vector in vectors_collection.find():
            update = {}
            if "location" in vector:
                # "$set" cannot remove a field; deleting the key from the
                # fetched dict leaves it in the database, so use "$unset".
                update["$unset"] = {"location": ""}
            if "retriever" not in vector:
                update["$set"] = {"retriever": "classic", "remote_data": None}
            if update:
                vectors_collection.update_one({"_id": vector["_id"]}, update)

        # Move data from vectors_collection to sources_collection. The upsert
        # keeps a re-run after a partial failure from raising on documents
        # that were already copied.
        for vector in vectors_collection.find():
            sources_collection.replace_one(
                {"_id": vector["_id"]}, vector, upsert=True
            )

        vectors_collection.drop()
|
||||
|
||||
def migrate_faiss_to_v1_vectorstore():
    """Rename on-disk FAISS index directories to the id-based layout.

    For every document in ``docsgpt.vectors`` the directory
    ``./application/indexes/<user>/<name>`` is renamed to
    ``./application/indexes/<_id>``. A failed rename is printed and the
    loop continues with the next document.
    """
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    legacy_sources = client["docsgpt"]["vectors"]

    for record in legacy_sources.find():
        old_path = f"./application/indexes/{record['user']}/{record['name']}"
        new_path = f"./application/indexes/{record['_id']}"
        try:
            os.rename(old_path, new_path)
        except OSError as err:
            # Best effort: report the failure and keep migrating the rest.
            print(f"Error moving {old_path} to {new_path}: {err}")

    client.close()
|
||||
|
||||
def migrate_mongo_atlas_vector_to_v1_vectorstore(
    mongo_uri="mongodb+srv://<username>:<password>@<cluster>/<dbname>?retryWrites=true&w=majority",
):
    """Tag MongoDB Atlas vector documents with the id of their source.

    For every document in ``docsgpt.vectors``, all entries of
    ``docsgpt.documents`` whose ``store`` equals ``"<user>/<name>"`` get a
    ``source_id`` field holding that document's ``_id`` as a string.

    Args:
        mongo_uri: Connection string of the Atlas cluster. The default is a
            placeholder and must be replaced with real credentials.
    """
    # The context manager closes the client even if an update raises.
    with pymongo.MongoClient(mongo_uri) as client:
        db = client["docsgpt"]
        vectors_collection = db["vectors"]

        # mongodb atlas collection
        documents_collection = db["documents"]

        for vector in vectors_collection.find():
            documents_collection.update_many(
                {"store": f"{vector['user']}/{vector['name']}"},
                {"$set": {"source_id": str(vector["_id"])}},
            )
|
||||
|
||||
if __name__ == "__main__":
    # Order matters: the FAISS migration reads the "vectors" collection,
    # which the Mongo migration drops once it has copied the documents.
    migrate_faiss_to_v1_vectorstore()
    migrate_to_v1_vectorstore_mongo()
|
||||
Reference in New Issue
Block a user