Mirror of https://github.com/arc53/DocsGPT.git, synced 2025-12-01 09:33:14 +00:00
Merge branch 'main' into Jackson
@@ -4,14 +4,11 @@ FROM ubuntu:24.04 as builder
 ENV DEBIAN_FRONTEND=noninteractive

-RUN apt-get update && \
-    apt-get install -y software-properties-common
-
-RUN add-apt-repository ppa:deadsnakes/ppa
-
 # Install necessary packages and Python
 RUN apt-get update && \
+    apt-get install -y software-properties-common && \
+    add-apt-repository ppa:deadsnakes/ppa && \
+    apt-get update && \
-    apt-get install -y --no-install-recommends gcc curl wget unzip libc6-dev python3.11 python3.11-distutils python3.11-venv && \
-    apt-get clean && \
+    apt-get install -y --no-install-recommends gcc wget unzip libc6-dev python3.11 python3.11-distutils python3.11-venv && \
     rm -rf /var/lib/apt/lists/*

 # Verify Python installation and setup symlink
@@ -27,7 +24,7 @@ RUN wget https://d3dg1063dc54p9.cloudfront.net/models/embeddings/mpnet-base-v2.z
     rm mpnet-base-v2.zip

 # Install Rust
-RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
+RUN wget -q -O - https://sh.rustup.rs | sh -s -- -y

 # Clean up to reduce container size
 RUN apt-get remove --purge -y wget unzip && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*
@@ -50,12 +47,10 @@ RUN pip install --no-cache-dir --upgrade pip && \
 FROM ubuntu:24.04 as final

 RUN apt-get update && \
-    apt-get install -y software-properties-common
-
-RUN add-apt-repository ppa:deadsnakes/ppa
-
+    apt-get install -y software-properties-common && \
+    add-apt-repository ppa:deadsnakes/ppa && \
 # Install Python
-RUN apt-get update && apt-get install -y --no-install-recommends python3.11 && \
+    apt-get update && apt-get install -y --no-install-recommends python3.11 && \
     ln -s /usr/bin/python3.11 /usr/bin/python && \
     rm -rf /var/lib/apt/lists/*

@@ -1,7 +1,7 @@
 import asyncio
 import os
 import sys
-from flask import Blueprint, request, Response
+from flask import Blueprint, request, Response, current_app
 import json
 import datetime
 import logging
@@ -74,7 +74,7 @@ def run_async_chain(chain, question, chat_history):

 def get_data_from_api_key(api_key):
     data = api_key_collection.find_one({"key": api_key})

     # # Raise custom exception if the API key is not found
     if data is None:
         raise Exception("Invalid API Key, please generate new key", 401)
@@ -129,10 +129,10 @@ def save_conversation(conversation_id, question, response, source_log_docs, llm)
                     "content": "Summarise following conversation in no more than 3 "
                     "words, respond ONLY with the summary, use the same "
                     "language as the system \n\nUser: "
-                    +question
-                    +"\n\n"
-                    +"AI: "
-                    +response,
+                    + question
+                    + "\n\n"
+                    + "AI: "
+                    + response,
                 },
                 {
                     "role": "user",
@@ -172,7 +172,9 @@ def get_prompt(prompt_id):
     return prompt


-def complete_stream(question, retriever, conversation_id, user_api_key):
+def complete_stream(
+    question, retriever, conversation_id, user_api_key, isNoneDoc=False
+):
     try:
         response_full = ""
@@ -186,126 +188,142 @@ def complete_stream(question, retriever, conversation_id, user_api_key):
             elif "source" in line:
                 source_log_docs.append(line["source"])

+        if isNoneDoc:
+            for doc in source_log_docs:
+                doc["source"] = "None"
+
         llm = LLMCreator.create_llm(
             settings.LLM_NAME, api_key=settings.API_KEY, user_api_key=user_api_key
         )
-        if(user_api_key is None):
+        if user_api_key is None:
             conversation_id = save_conversation(
                 conversation_id, question, response_full, source_log_docs, llm
             )
             # send data.type = "end" to indicate that the stream has ended as json
             data = json.dumps({"type": "id", "id": str(conversation_id)})
             yield f"data: {data}\n\n"

         data = json.dumps({"type": "end"})
         yield f"data: {data}\n\n"
     except Exception as e:
         print("\033[91merr", str(e), file=sys.stderr)
-        data = json.dumps({"type": "error","error":"Please try again later. We apologize for any inconvenience.",
-                           "error_exception": str(e)})
+        data = json.dumps(
+            {
+                "type": "error",
+                "error": "Please try again later. We apologize for any inconvenience.",
+                "error_exception": str(e),
+            }
+        )
         yield f"data: {data}\n\n"
         return

@answer.route("/stream", methods=["POST"])
|
||||
def stream():
|
||||
try:
|
||||
data = request.get_json()
|
||||
# get parameter from url question
|
||||
question = data["question"]
|
||||
if "history" not in data:
|
||||
history = []
|
||||
else:
|
||||
history = data["history"]
|
||||
history = json.loads(history)
|
||||
if "conversation_id" not in data:
|
||||
conversation_id = None
|
||||
else:
|
||||
conversation_id = data["conversation_id"]
|
||||
if "prompt_id" in data:
|
||||
prompt_id = data["prompt_id"]
|
||||
else:
|
||||
prompt_id = "default"
|
||||
if "selectedDocs" in data and data["selectedDocs"] is None:
|
||||
chunks = 0
|
||||
elif "chunks" in data:
|
||||
chunks = int(data["chunks"])
|
||||
else:
|
||||
chunks = 2
|
||||
if "token_limit" in data:
|
||||
token_limit = data["token_limit"]
|
||||
else:
|
||||
token_limit = settings.DEFAULT_MAX_HISTORY
|
||||
try:
|
||||
data = request.get_json()
|
||||
question = data["question"]
|
||||
if "history" not in data:
|
||||
history = []
|
||||
else:
|
||||
history = data["history"]
|
||||
history = json.loads(history)
|
||||
if "conversation_id" not in data:
|
||||
conversation_id = None
|
||||
else:
|
||||
conversation_id = data["conversation_id"]
|
||||
if "prompt_id" in data:
|
||||
prompt_id = data["prompt_id"]
|
||||
else:
|
||||
prompt_id = "default"
|
||||
if "selectedDocs" in data and data["selectedDocs"] is None:
|
||||
chunks = 0
|
||||
elif "chunks" in data:
|
||||
chunks = int(data["chunks"])
|
||||
else:
|
||||
chunks = 2
|
||||
if "token_limit" in data:
|
||||
token_limit = data["token_limit"]
|
||||
else:
|
||||
token_limit = settings.DEFAULT_MAX_HISTORY
|
||||
|
||||
# check if active_docs or api_key is set
|
||||
# check if active_docs or api_key is set
|
||||
|
||||
if "api_key" in data:
|
||||
data_key = get_data_from_api_key(data["api_key"])
|
||||
chunks = int(data_key["chunks"])
|
||||
prompt_id = data_key["prompt_id"]
|
||||
source = {"active_docs": data_key["source"]}
|
||||
user_api_key = data["api_key"]
|
||||
elif "active_docs" in data:
|
||||
source = {"active_docs": data["active_docs"]}
|
||||
user_api_key = None
|
||||
else:
|
||||
source = {}
|
||||
user_api_key = None
|
||||
if "api_key" in data:
|
||||
data_key = get_data_from_api_key(data["api_key"])
|
||||
chunks = int(data_key["chunks"])
|
||||
prompt_id = data_key["prompt_id"]
|
||||
source = {"active_docs": data_key["source"]}
|
||||
user_api_key = data["api_key"]
|
||||
elif "active_docs" in data:
|
||||
source = {"active_docs": data["active_docs"]}
|
||||
user_api_key = None
|
||||
else:
|
||||
source = {}
|
||||
user_api_key = None
|
||||
|
||||
if (
|
||||
source["active_docs"].split("/")[0] == "default"
|
||||
or source["active_docs"].split("/")[0] == "local"
|
||||
):
|
||||
retriever_name = "classic"
|
||||
else:
|
||||
retriever_name = source["active_docs"]
|
||||
if source["active_docs"].split("/")[0] in ["default", "local"]:
|
||||
retriever_name = "classic"
|
||||
else:
|
||||
retriever_name = source["active_docs"]
|
||||
|
||||
prompt = get_prompt(prompt_id)
|
||||
current_app.logger.info(f"/stream - request_data: {data}, source: {source}",
|
||||
extra={"data": json.dumps({"request_data": data, "source": source})}
|
||||
)
|
||||
|
||||
retriever = RetrieverCreator.create_retriever(
|
||||
retriever_name,
|
||||
question=question,
|
||||
source=source,
|
||||
chat_history=history,
|
||||
prompt=prompt,
|
||||
chunks=chunks,
|
||||
token_limit=token_limit,
|
||||
gpt_model=gpt_model,
|
||||
user_api_key=user_api_key,
|
||||
)
|
||||
prompt = get_prompt(prompt_id)
|
||||
|
||||
return Response(
|
||||
complete_stream(
|
||||
retriever = RetrieverCreator.create_retriever(
|
||||
retriever_name,
|
||||
question=question,
|
||||
retriever=retriever,
|
||||
conversation_id=conversation_id,
|
||||
source=source,
|
||||
chat_history=history,
|
||||
prompt=prompt,
|
||||
chunks=chunks,
|
||||
token_limit=token_limit,
|
||||
gpt_model=gpt_model,
|
||||
user_api_key=user_api_key,
|
||||
),
|
||||
mimetype="text/event-stream",
|
||||
)
|
||||
|
||||
except ValueError:
|
||||
message = "Malformed request body"
|
||||
print("\033[91merr", str(message), file=sys.stderr)
|
||||
return Response(
|
||||
error_stream_generate(message),
|
||||
status=400,
|
||||
mimetype="text/event-stream",
|
||||
)
|
||||
except Exception as e:
|
||||
print("\033[91merr", str(e), file=sys.stderr)
|
||||
)
|
||||
|
||||
return Response(
|
||||
complete_stream(
|
||||
question=question,
|
||||
retriever=retriever,
|
||||
conversation_id=conversation_id,
|
||||
user_api_key=user_api_key,
|
||||
isNoneDoc=data.get("isNoneDoc"),
|
||||
),
|
||||
mimetype="text/event-stream",
|
||||
)
|
||||
|
||||
except ValueError:
|
||||
message = "Malformed request body"
|
||||
print("\033[91merr", str(message), file=sys.stderr)
|
||||
return Response(
|
||||
error_stream_generate(message),
|
||||
status=400,
|
||||
mimetype="text/event-stream",
|
||||
)
|
||||
except Exception as e:
|
||||
current_app.logger.error(f"/stream - error: {str(e)} - traceback: {traceback.format_exc()}",
|
||||
extra={"error": str(e), "traceback": traceback.format_exc()}
|
||||
)
|
||||
message = e.args[0]
|
||||
status_code = 400
|
||||
# # Custom exceptions with two arguments, index 1 as status code
|
||||
if(len(e.args) >= 2):
|
||||
if len(e.args) >= 2:
|
||||
status_code = e.args[1]
|
||||
return Response(
|
||||
error_stream_generate(message),
|
||||
status=status_code,
|
||||
mimetype="text/event-stream",
|
||||
)
|
||||
error_stream_generate(message),
|
||||
status=status_code,
|
||||
mimetype="text/event-stream",
|
||||
)
|
||||
|
||||
|
||||
def error_stream_generate(err_response):
|
||||
data = json.dumps({"type": "error", "error":err_response})
|
||||
yield f"data: {data}\n\n"
|
||||
data = json.dumps({"type": "error", "error": err_response})
|
||||
yield f"data: {data}\n\n"
|
||||
|
||||
|
||||
@answer.route("/api/answer", methods=["POST"])
|
||||
def api_answer():
|
||||
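The /stream handler above speaks plain server-sent events: each payload is JSON on a single "data: ..." line followed by a blank line, with a "type" of "id" (the saved conversation), "end", or "error", plus the answer/source chunks produced earlier in the generator. A minimal client sketch of that contract, assuming a locally running backend; the URL, port, and payload keys below are illustrative, not taken from the commit:

    import json
    import requests

    resp = requests.post(
        "http://localhost:7091/stream",  # assumed local address of the API
        json={"question": "What is DocsGPT?", "history": "[]", "active_docs": "default"},
        stream=True,
    )
    for raw in resp.iter_lines():
        if not raw.startswith(b"data: "):
            continue  # skip blank separators between events
        event = json.loads(raw[len(b"data: "):])
        if event.get("type") == "id":
            conversation_id = event["id"]  # sent once, just before "end"
        elif event.get("type") in ("end", "error"):
            break
        else:
            print(event)  # answer/source chunks; their exact shape sits outside this hunk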
@@ -333,7 +351,6 @@ def api_answer():
     else:
         token_limit = settings.DEFAULT_MAX_HISTORY

-    # use try and except to check for exception
     try:
         # check if the vectorstore is set
         if "api_key" in data:
@@ -346,16 +363,17 @@ def api_answer():
             source = data
             user_api_key = None

-        if (
-            source["active_docs"].split("/")[0] == "default"
-            or source["active_docs"].split("/")[0] == "local"
-        ):
+        if source["active_docs"].split("/")[0] in ["default", "local"]:
             retriever_name = "classic"
         else:
             retriever_name = source["active_docs"]

         prompt = get_prompt(prompt_id)

+        current_app.logger.info(
+            f"/api/answer - request_data: {data}, source: {source}",
+            extra={"data": json.dumps({"request_data": data, "source": source})},
+        )

         retriever = RetrieverCreator.create_retriever(
             retriever_name,
             question=question,
@@ -375,6 +393,10 @@ def api_answer():
             elif "answer" in line:
                 response_full += line["answer"]

+        if data.get("isNoneDoc"):
+            for doc in source_log_docs:
+                doc["source"] = "None"
+
         llm = LLMCreator.create_llm(
             settings.LLM_NAME, api_key=settings.API_KEY, user_api_key=user_api_key
         )
@@ -386,16 +408,15 @@ def api_answer():

         return result
     except Exception as e:
-        # print whole traceback
-        traceback.print_exc()
-        print(str(e))
+        current_app.logger.error(
+            f"/api/answer - error: {str(e)} - traceback: {traceback.format_exc()}",
+            extra={"error": str(e), "traceback": traceback.format_exc()},
+        )
         return bad_request(500, str(e))


 @answer.route("/api/search", methods=["POST"])
 def api_search():
     data = request.get_json()
     # get parameter from url question
     question = data["question"]
     if "chunks" in data:
         chunks = int(data["chunks"])
@@ -413,10 +434,7 @@ def api_search():
         source = {}
         user_api_key = None

-    if (
-        source["active_docs"].split("/")[0] == "default"
-        or source["active_docs"].split("/")[0] == "local"
-    ):
+    if source["active_docs"].split("/")[0] in ["default", "local"]:
         retriever_name = "classic"
     else:
         retriever_name = source["active_docs"]
@@ -424,6 +442,10 @@ def api_search():
         token_limit = data["token_limit"]
     else:
         token_limit = settings.DEFAULT_MAX_HISTORY

+    current_app.logger.info(
+        f"/api/answer - request_data: {data}, source: {source}",
+        extra={"data": json.dumps({"request_data": data, "source": source})},
+    )

     retriever = RetrieverCreator.create_retriever(
         retriever_name,
@@ -437,4 +459,9 @@ def api_search():
         user_api_key=user_api_key,
     )
     docs = retriever.search()

+    if data.get("isNoneDoc"):
+        for doc in docs:
+            doc["source"] = "None"
+
     return docs
@@ -44,7 +44,7 @@ def delete_conversation():
     return {"status": "ok"}


-@user.route("/api/delete_all_conversations", methods=["POST"])
+@user.route("/api/delete_all_conversations", methods=["GET"])
 def delete_all_conversations():
     user_id = "local"
     conversations_collection.delete_many({"user": user_id})
@@ -256,7 +256,7 @@ def combined_json():
             "docLink": "default",
             "model": settings.EMBEDDINGS_NAME,
             "location": "remote",
-            "tokens":""
+            "tokens": "",
         }
     ]
     # structure: name, language, version, description, fullName, date, docLink
@@ -273,7 +273,7 @@ def combined_json():
                 "docLink": index["location"],
                 "model": settings.EMBEDDINGS_NAME,
                 "location": "local",
-                "tokens" : index["tokens"] if ("tokens" in index.keys()) else ""
+                "tokens": index["tokens"] if ("tokens" in index.keys()) else "",
             }
         )
     if settings.VECTOR_STORE == "faiss":
@@ -295,7 +295,7 @@ def combined_json():
                 "docLink": "duckduck_search",
                 "model": settings.EMBEDDINGS_NAME,
                 "location": "custom",
-                "tokens":""
+                "tokens": "",
             }
         )
     if "brave_search" in settings.RETRIEVERS_ENABLED:
@@ -310,7 +310,7 @@ def combined_json():
                 "docLink": "brave_search",
                 "model": settings.EMBEDDINGS_NAME,
                 "location": "custom",
-                "tokens":""
+                "tokens": "",
             }
         )

@@ -496,138 +496,204 @@ def delete_api_key():
     return {"status": "ok"}


-#route to share conversation
+# route to share conversation
 ##isPromptable should be passed through queries
-@user.route("/api/share",methods=["POST"])
+@user.route("/api/share", methods=["POST"])
 def share_conversation():
     try:
         data = request.get_json()
         user = "local" if "user" not in data else data["user"]
         conversation_id = data["conversation_id"]
         isPromptable = request.args.get("isPromptable").lower() == "true"

-        conversation = conversations_collection.find_one({"_id": ObjectId(conversation_id)})
+        conversation = conversations_collection.find_one(
+            {"_id": ObjectId(conversation_id)}
+        )
         current_n_queries = len(conversation["queries"])

         ##generate binary representation of uuid
         explicit_binary = Binary.from_uuid(uuid.uuid4(), UuidRepresentation.STANDARD)

-        if(isPromptable):
+        if isPromptable:
             source = "default" if "source" not in data else data["source"]
             prompt_id = "default" if "prompt_id" not in data else data["prompt_id"]
             chunks = "2" if "chunks" not in data else data["chunks"]

-            name = conversation["name"]+"(shared)"
-            pre_existing_api_document = api_key_collection.find_one({
-                "prompt_id":prompt_id,
-                "chunks":chunks,
-                "source":source,
-                "user":user
-            })
+            name = conversation["name"] + "(shared)"
+            pre_existing_api_document = api_key_collection.find_one(
+                {
+                    "prompt_id": prompt_id,
+                    "chunks": chunks,
+                    "source": source,
+                    "user": user,
+                }
+            )
             api_uuid = str(uuid.uuid4())
-            if(pre_existing_api_document):
-                api_uuid = pre_existing_api_document["key"]
-                pre_existing = shared_conversations_collections.find_one({
-                    "conversation_id":DBRef("conversations",ObjectId(conversation_id)),
-                    "isPromptable":isPromptable,
-                    "first_n_queries":current_n_queries,
-                    "user":user,
-                    "api_key":api_uuid
-                })
-                if(pre_existing is not None):
-                    return jsonify({"success":True, "identifier":str(pre_existing["uuid"].as_uuid())}),200
-                else:
-                    shared_conversations_collections.insert_one({
-                        "uuid":explicit_binary,
-                        "conversation_id": {
-                            "$ref":"conversations",
-                            "$id":ObjectId(conversation_id)
-                        } ,
-                        "isPromptable":isPromptable,
-                        "first_n_queries":current_n_queries,
-                        "user":user,
-                        "api_key":api_uuid
-                    })
-                    return jsonify({"success":True,"identifier":str(explicit_binary.as_uuid())})
+            if pre_existing_api_document:
+                api_uuid = pre_existing_api_document["key"]
+                pre_existing = shared_conversations_collections.find_one(
+                    {
+                        "conversation_id": DBRef(
+                            "conversations", ObjectId(conversation_id)
+                        ),
+                        "isPromptable": isPromptable,
+                        "first_n_queries": current_n_queries,
+                        "user": user,
+                        "api_key": api_uuid,
+                    }
+                )
+                if pre_existing is not None:
+                    return (
+                        jsonify(
+                            {
+                                "success": True,
+                                "identifier": str(pre_existing["uuid"].as_uuid()),
+                            }
+                        ),
+                        200,
+                    )
+                else:
+                    shared_conversations_collections.insert_one(
+                        {
+                            "uuid": explicit_binary,
+                            "conversation_id": {
+                                "$ref": "conversations",
+                                "$id": ObjectId(conversation_id),
+                            },
+                            "isPromptable": isPromptable,
+                            "first_n_queries": current_n_queries,
+                            "user": user,
+                            "api_key": api_uuid,
+                        }
+                    )
+                    return jsonify(
+                        {"success": True, "identifier": str(explicit_binary.as_uuid())}
+                    )
             else:
-                shared_conversations_collections.insert_one({
-                    "uuid":explicit_binary,
-                    "conversation_id": {
-                        "$ref":"conversations",
-                        "$id":ObjectId(conversation_id)
-                    } ,
-                    "isPromptable":isPromptable,
-                    "first_n_queries":current_n_queries,
-                    "user":user,
-                    "api_key":api_uuid
-                })
-                ## Identifier as route parameter in frontend
-                return jsonify({"success":True, "identifier":str(explicit_binary.as_uuid())}),201
+                api_key_collection.insert_one(
+                    {
+                        "name": name,
+                        "key": api_uuid,
+                        "source": source,
+                        "user": user,
+                        "prompt_id": prompt_id,
+                        "chunks": chunks,
+                    }
+                )
+                shared_conversations_collections.insert_one(
+                    {
+                        "uuid": explicit_binary,
+                        "conversation_id": {
+                            "$ref": "conversations",
+                            "$id": ObjectId(conversation_id),
+                        },
+                        "isPromptable": isPromptable,
+                        "first_n_queries": current_n_queries,
+                        "user": user,
+                        "api_key": api_uuid,
+                    }
+                )
+                ## Identifier as route parameter in frontend
+                return (
+                    jsonify(
+                        {"success": True, "identifier": str(explicit_binary.as_uuid())}
+                    ),
+                    201,
+                )

         ##isPromptable = False
-        pre_existing = shared_conversations_collections.find_one({
-            "conversation_id":DBRef("conversations",ObjectId(conversation_id)),
-            "isPromptable":isPromptable,
-            "first_n_queries":current_n_queries,
-            "user":user
-        })
-        if(pre_existing is not None):
-            return jsonify({"success":True, "identifier":str(pre_existing["uuid"].as_uuid())}),200
-        else:
-            shared_conversations_collections.insert_one({
-                "uuid":explicit_binary,
-                "conversation_id": {
-                    "$ref":"conversations",
-                    "$id":ObjectId(conversation_id)
-                } ,
-                "isPromptable":isPromptable,
-                "first_n_queries":current_n_queries,
-                "user":user
-            })
-            ## Identifier as route parameter in frontend
-            return jsonify({"success":True, "identifier":str(explicit_binary.as_uuid())}),201
+        pre_existing = shared_conversations_collections.find_one(
+            {
+                "conversation_id": DBRef("conversations", ObjectId(conversation_id)),
+                "isPromptable": isPromptable,
+                "first_n_queries": current_n_queries,
+                "user": user,
+            }
+        )
+        if pre_existing is not None:
+            return (
+                jsonify(
+                    {"success": True, "identifier": str(pre_existing["uuid"].as_uuid())}
+                ),
+                200,
+            )
+        else:
+            shared_conversations_collections.insert_one(
+                {
+                    "uuid": explicit_binary,
+                    "conversation_id": {
+                        "$ref": "conversations",
+                        "$id": ObjectId(conversation_id),
+                    },
+                    "isPromptable": isPromptable,
+                    "first_n_queries": current_n_queries,
+                    "user": user,
+                }
+            )
+            ## Identifier as route parameter in frontend
+            return (
+                jsonify(
+                    {"success": True, "identifier": str(explicit_binary.as_uuid())}
+                ),
+                201,
+            )
     except Exception as err:
-        print (err)
-        return jsonify({"success":False,"error":str(err)}),400
+        print(err)
+        return jsonify({"success": False, "error": str(err)}), 400


-#route to get publicly shared conversations
-@user.route("/api/shared_conversation/<string:identifier>",methods=["GET"])
-def get_publicly_shared_conversations(identifier : str):
-    try:
-        query_uuid = Binary.from_uuid(uuid.UUID(identifier), UuidRepresentation.STANDARD)
-        shared = shared_conversations_collections.find_one({"uuid":query_uuid})
-        conversation_queries=[]
-        if shared and 'conversation_id' in shared and isinstance(shared['conversation_id'], DBRef):
-            # Resolve the DBRef
-            conversation_ref = shared['conversation_id']
-            conversation = db.dereference(conversation_ref)
-            if(conversation is None):
-                return jsonify({"sucess":False,"error":"might have broken url or the conversation does not exist"}),404
-            conversation_queries = conversation['queries'][:(shared["first_n_queries"])]
-            for query in conversation_queries:
-                query.pop("sources") ## avoid exposing sources
-        else:
-            return jsonify({"sucess":False,"error":"might have broken url or the conversation does not exist"}),404
+# route to get publicly shared conversations
+@user.route("/api/shared_conversation/<string:identifier>", methods=["GET"])
+def get_publicly_shared_conversations(identifier: str):
+    try:
+        query_uuid = Binary.from_uuid(
+            uuid.UUID(identifier), UuidRepresentation.STANDARD
+        )
+        shared = shared_conversations_collections.find_one({"uuid": query_uuid})
+        conversation_queries = []
+        if (
+            shared
+            and "conversation_id" in shared
+            and isinstance(shared["conversation_id"], DBRef)
+        ):
+            # Resolve the DBRef
+            conversation_ref = shared["conversation_id"]
+            conversation = db.dereference(conversation_ref)
+            if conversation is None:
+                return (
+                    jsonify(
+                        {
+                            "sucess": False,
+                            "error": "might have broken url or the conversation does not exist",
+                        }
+                    ),
+                    404,
+                )
+            conversation_queries = conversation["queries"][
+                : (shared["first_n_queries"])
+            ]
+            for query in conversation_queries:
+                query.pop("sources")  ## avoid exposing sources
+        else:
+            return (
+                jsonify(
+                    {
+                        "sucess": False,
+                        "error": "might have broken url or the conversation does not exist",
+                    }
+                ),
+                404,
+            )
         date = conversation["_id"].generation_time.isoformat()
         res = {
-            "success":True,
-            "queries":conversation_queries,
-            "title":conversation["name"],
-            "timestamp":date
+            "success": True,
+            "queries": conversation_queries,
+            "title": conversation["name"],
+            "timestamp": date,
         }
-        if(shared["isPromptable"] and "api_key" in shared):
+        if shared["isPromptable"] and "api_key" in shared:
            res["api_key"] = shared["api_key"]
         return jsonify(res), 200
     except Exception as err:
-        print (err)
-        return jsonify({"success":False,"error":str(err)}),400
+        print(err)
+        return jsonify({"success": False, "error": str(err)}), 400

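The share endpoints lean on one invariant: a UUID stored as BSON Binary round-trips losslessly, so its string form can serve as the public identifier in the shared-conversation URL. A self-contained sketch of that round-trip (standalone, no database required):

    import uuid
    from bson.binary import Binary, UuidRepresentation

    explicit_binary = Binary.from_uuid(uuid.uuid4(), UuidRepresentation.STANDARD)
    identifier = str(explicit_binary.as_uuid())  # safe to expose as a route parameter
    query_uuid = Binary.from_uuid(uuid.UUID(identifier), UuidRepresentation.STANDARD)
    assert query_uuid == explicit_binary  # the stored document can be found again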
@@ -6,12 +6,14 @@ from application.core.settings import settings
 from application.api.user.routes import user
 from application.api.answer.routes import answer
 from application.api.internal.routes import internal
+from application.core.logging_config import setup_logging

 if platform.system() == "Windows":
     import pathlib

     pathlib.PosixPath = pathlib.WindowsPath

 dotenv.load_dotenv()
+setup_logging()

 app = Flask(__name__)
 app.register_blueprint(user)

@@ -1,9 +1,15 @@
 from celery import Celery
 from application.core.settings import settings
+from celery.signals import setup_logging


 def make_celery(app_name=__name__):
     celery = Celery(app_name, broker=settings.CELERY_BROKER_URL, backend=settings.CELERY_RESULT_BACKEND)
     celery.conf.update(settings)
     return celery

+
+@setup_logging.connect
+def config_loggers(*args, **kwargs):
+    from application.core.logging_config import setup_logging
+
+    setup_logging()
+
 celery = make_celery()

application/core/logging_config.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+from logging.config import dictConfig
+
+
+def setup_logging():
+    dictConfig({
+        'version': 1,
+        'formatters': {
+            'default': {
+                'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
+            }
+        },
+        "handlers": {
+            "console": {
+                "class": "logging.StreamHandler",
+                "stream": "ext://sys.stdout",
+                "formatter": "default",
+            }
+        },
+        'root': {
+            'level': 'INFO',
+            'handlers': ['console'],
+        },
+    })
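The new logging_config module routes every record through a single stdout handler. Worth noting: the extra={...} dictionaries used in the routes attach attributes to the LogRecord, but the format string above never references them, so they only become visible to a custom formatter or a log shipper that reads record attributes. A short sketch, assuming the application package is importable:

    import logging
    from application.core.logging_config import setup_logging

    setup_logging()
    log = logging.getLogger(__name__)

    log.info("plain message")  # -> [timestamp] INFO in <module>: plain message
    # record.data is set on the LogRecord, but the 'default' format ignores it
    log.info("with context", extra={"data": '{"request_data": {}}'})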
@@ -29,6 +29,7 @@ class Settings(BaseSettings):
     OPENAI_API_VERSION: Optional[str] = None  # azure openai api version
     AZURE_DEPLOYMENT_NAME: Optional[str] = None  # azure deployment name for answering
     AZURE_EMBEDDINGS_DEPLOYMENT_NAME: Optional[str] = None  # azure deployment name for embeddings
+    OPENAI_BASE_URL: Optional[str] = None  # openai base url for open ai compatable models

     # elasticsearch
     ELASTIC_CLOUD_ID: Optional[str] = None  # cloud id for elasticsearch

@@ -2,25 +2,23 @@ from application.llm.base import BaseLLM
 from application.core.settings import settings


 class OpenAILLM(BaseLLM):

     def __init__(self, api_key=None, user_api_key=None, *args, **kwargs):
         global openai
         from openai import OpenAI

         super().__init__(*args, **kwargs)
-        self.client = OpenAI(
-            api_key=api_key,
-        )
+        if settings.OPENAI_BASE_URL:
+            self.client = OpenAI(
+                api_key=api_key,
+                base_url=settings.OPENAI_BASE_URL
+            )
+        else:
+            self.client = OpenAI(api_key=api_key)
         self.api_key = api_key
         self.user_api_key = user_api_key

-    def _get_openai(self):
-        # Import openai when needed
-        import openai
-
-        return openai
-
     def _raw_gen(
         self,
         baseself,
@@ -29,7 +27,7 @@ class OpenAILLM(BaseLLM):
         stream=False,
         engine=settings.AZURE_DEPLOYMENT_NAME,
         **kwargs
     ):
         response = self.client.chat.completions.create(
             model=model, messages=messages, stream=stream, **kwargs
         )
@@ -44,7 +42,7 @@ class OpenAILLM(BaseLLM):
         stream=True,
         engine=settings.AZURE_DEPLOYMENT_NAME,
         **kwargs
     ):
         response = self.client.chat.completions.create(
             model=model, messages=messages, stream=stream, **kwargs
         )
@@ -73,8 +71,3 @@ class AzureOpenAILLM(OpenAILLM):
             api_base=settings.OPENAI_API_BASE,
             deployment_name=settings.AZURE_DEPLOYMENT_NAME,
         )
-
-    def _get_openai(self):
-        openai = super()._get_openai()
-
-        return openai

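The OPENAI_BASE_URL branch makes the stock openai client usable against any OpenAI-compatible server. A hedged sketch of what that enables; the URL, key, and model name below are placeholders, not values from the repository:

    from openai import OpenAI

    client = OpenAI(
        api_key="placeholder-key",            # many local servers ignore the key
        base_url="http://localhost:8000/v1",  # any OpenAI-compatible endpoint
    )
    response = client.chat.completions.create(
        model="my-local-model",               # placeholder model name
        messages=[{"role": "user", "content": "Hello"}],
    )
    print(response.choices[0].message.content)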
@@ -3,7 +3,6 @@
 Contains parser for html files.

 """
-import re
 from pathlib import Path
 from typing import Dict, Union

@@ -18,66 +17,8 @@ class HTMLParser(BaseParser):
         return {}

     def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
         """Parse file.

         Returns:
             Union[str, List[str]]: a string or a List of strings.
         """
-        try:
-            from unstructured.partition.html import partition_html
-            from unstructured.staging.base import convert_to_isd
-            from unstructured.cleaners.core import clean
-        except ImportError:
-            raise ValueError("unstructured package is required to parse HTML files.")
-
-        # Using the unstructured library to convert the html to isd format
-        # isd sample : isd = [
-        #     {"text": "My Title", "type": "Title"},
-        #     {"text": "My Narrative", "type": "NarrativeText"}
-        # ]
-        with open(file, "r", encoding="utf-8") as fp:
-            elements = partition_html(file=fp)
-            isd = convert_to_isd(elements)
-
-        # Removing non ascii charactwers from isd_el['text']
-        for isd_el in isd:
-            isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
-
-        # Removing all the \n characters from isd_el['text'] using regex and replace with single space
-        # Removing all the extra spaces from isd_el['text'] using regex and replace with single space
-        for isd_el in isd:
-            isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
-            isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
-
-        # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
-        for isd_el in isd:
-            clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
-
-        # Creating a list of all the indexes of isd_el['type'] = 'Title'
-        title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
-
-        # Creating 'Chunks' - List of lists of strings
-        # each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title'
-        # Each Chunk can be thought of as an individual set of data, which can be sent to the model
-        # Where Each Title is grouped together with the data under it
-
-        Chunks = [[]]
-        final_chunks = list(list())
-
-        for i, isd_el in enumerate(isd):
-            if i in title_indexes:
-                Chunks.append([])
-            Chunks[-1].append(isd_el['text'])
-
-        # Removing all the chunks with sum of length of all the strings in the chunk < 25
-        # TODO: This value can be an user defined variable
-        for chunk in Chunks:
-            # sum of length of all the strings in the chunk
-            sum = 0
-            sum += len(str(chunk))
-            if sum < 25:
-                Chunks.remove(chunk)
-            else:
-                # appending all the approved chunks to final_chunks as a single string
-                final_chunks.append(" ".join([str(item) for item in chunk]))
-        return final_chunks
+        from langchain_community.document_loaders import BSHTMLLoader
+
+        loader = BSHTMLLoader(file)
+        data = loader.load()
+        return data

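The rewritten parser drops roughly sixty lines of hand-rolled chunking in favor of langchain's BSHTMLLoader, which returns Document objects rather than a list of strings. A quick sketch of the new return shape; the file name is hypothetical, and BSHTMLLoader needs beautifulsoup4 (present in the updated requirements.txt below):

    from langchain_community.document_loaders import BSHTMLLoader

    loader = BSHTMLLoader("example.html")  # hypothetical local file
    docs = loader.load()
    for doc in docs:
        # each Document carries the page text plus metadata such as 'title' and 'source'
        print(doc.metadata.get("title"), len(doc.page_content))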
@@ -5,7 +5,7 @@ from application.parser.remote.base import BaseRemote

 class CrawlerLoader(BaseRemote):
     def __init__(self, limit=10):
-        from langchain.document_loaders import WebBaseLoader
+        from langchain_community.document_loaders import WebBaseLoader
         self.loader = WebBaseLoader  # Initialize the document loader
         self.limit = limit  # Set the limit for the number of pages to scrape

@@ -5,7 +5,7 @@ from application.parser.remote.base import BaseRemote

 class SitemapLoader(BaseRemote):
     def __init__(self, limit=20):
-        from langchain.document_loaders import WebBaseLoader
+        from langchain_community.document_loaders import WebBaseLoader
         self.loader = WebBaseLoader
         self.limit = limit  # Adding limit to control the number of URLs to process

@@ -1,16 +1,17 @@
-anthropic==0.12.0
-boto3==1.34.6
+anthropic==0.34.0
+boto3==1.34.153
+beautifulsoup4==4.12.3
 celery==5.3.6
-dataclasses_json==0.6.3
+dataclasses_json==0.6.7
 docx2txt==0.8
-duckduckgo-search==5.3.0
+duckduckgo-search==6.2.6
 EbookLib==0.18
-elasticsearch==8.12.0
+elasticsearch==8.14.0
 escodegen==1.0.11
 esprima==4.0.1
-faiss-cpu==1.8.0.post1
 Flask==3.0.1
-gunicorn==22.0.0
+faiss-cpu==1.8.0.post1
+gunicorn==23.0.0
 html2text==2020.1.16
 javalang==0.13.0
 langchain==0.2.16
@@ -18,12 +19,12 @@ langchain-community==0.2.16
 langchain-core==0.2.38
 langchain-openai==0.1.23
 openapi3_parser==1.1.16
-pandas==2.2.0
-pydantic_settings==2.1.0
-pymongo==4.6.3
+pandas==2.2.2
+pydantic_settings==2.4.0
+pymongo==4.8.0
 PyPDF2==3.0.1
 python-dotenv==1.0.1
-qdrant-client==1.9.0
+qdrant-client==1.11.0
 redis==5.0.1
 Requests==2.32.0
 retry==0.9.2
@@ -31,6 +32,5 @@ sentence-transformers
 tiktoken==0.7.0
 torch
 tqdm==4.66.3
-transformers==4.36.2
-unstructured==0.12.2
+transformers==4.44.0
 Werkzeug==3.0.3

@@ -2,7 +2,7 @@ import json
 from application.retriever.base import BaseRetriever
 from application.core.settings import settings
 from application.llm.llm_creator import LLMCreator
-from application.utils import count_tokens
+from application.utils import num_tokens_from_string
 from langchain_community.tools import BraveSearch

@@ -78,7 +78,7 @@ class BraveRetSearch(BaseRetriever):
         self.chat_history.reverse()
         for i in self.chat_history:
             if "prompt" in i and "response" in i:
-                tokens_batch = count_tokens(i["prompt"]) + count_tokens(
+                tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string(
                     i["response"]
                 )
                 if tokens_current_history + tokens_batch < self.token_limit:

@@ -4,7 +4,7 @@ from application.core.settings import settings
 from application.vectorstore.vector_creator import VectorCreator
 from application.llm.llm_creator import LLMCreator

-from application.utils import count_tokens
+from application.utils import num_tokens_from_string


 class ClassicRAG(BaseRetriever):
@@ -98,7 +98,7 @@ class ClassicRAG(BaseRetriever):
         self.chat_history.reverse()
         for i in self.chat_history:
             if "prompt" in i and "response" in i:
-                tokens_batch = count_tokens(i["prompt"]) + count_tokens(
+                tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string(
                     i["response"]
                 )
                 if tokens_current_history + tokens_batch < self.token_limit:

@@ -1,7 +1,7 @@
 from application.retriever.base import BaseRetriever
 from application.core.settings import settings
 from application.llm.llm_creator import LLMCreator
-from application.utils import count_tokens
+from application.utils import num_tokens_from_string
 from langchain_community.tools import DuckDuckGoSearchResults
 from langchain_community.utilities import DuckDuckGoSearchAPIWrapper

@@ -95,7 +95,7 @@ class DuckDuckSearch(BaseRetriever):
         self.chat_history.reverse()
         for i in self.chat_history:
             if "prompt" in i and "response" in i:
-                tokens_batch = count_tokens(i["prompt"]) + count_tokens(
+                tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string(
                     i["response"]
                 )
                 if tokens_current_history + tokens_batch < self.token_limit:

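All three retrievers (brave_search, classic_rag, duckduck_search) now share the same history-trimming pattern around num_tokens_from_string. A simplified, self-contained sketch of the idea, not the repository's exact code: walk the history from newest to oldest and keep turns while they fit the token budget.

    from application.utils import num_tokens_from_string

    def trim_history(chat_history, token_limit):
        kept, used = [], 0
        for turn in reversed(chat_history):  # newest first
            if "prompt" in turn and "response" in turn:
                cost = num_tokens_from_string(turn["prompt"]) + num_tokens_from_string(turn["response"])
                if used + cost < token_limit:
                    used += cost
                    kept.append(turn)
        return list(reversed(kept))  # restore chronological order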
@@ -2,7 +2,7 @@ import sys
 from pymongo import MongoClient
 from datetime import datetime
 from application.core.settings import settings
-from application.utils import count_tokens
+from application.utils import num_tokens_from_string

 mongo = MongoClient(settings.MONGO_URI)
 db = mongo["docsgpt"]
@@ -24,9 +24,9 @@ def update_token_usage(user_api_key, token_usage):
 def gen_token_usage(func):
     def wrapper(self, model, messages, stream, **kwargs):
         for message in messages:
-            self.token_usage["prompt_tokens"] += count_tokens(message["content"])
+            self.token_usage["prompt_tokens"] += num_tokens_from_string(message["content"])
         result = func(self, model, messages, stream, **kwargs)
-        self.token_usage["generated_tokens"] += count_tokens(result)
+        self.token_usage["generated_tokens"] += num_tokens_from_string(result)
         update_token_usage(self.user_api_key, self.token_usage)
         return result

@@ -36,14 +36,14 @@ def gen_token_usage(func):
 def stream_token_usage(func):
     def wrapper(self, model, messages, stream, **kwargs):
         for message in messages:
-            self.token_usage["prompt_tokens"] += count_tokens(message["content"])
+            self.token_usage["prompt_tokens"] += num_tokens_from_string(message["content"])
         batch = []
         result = func(self, model, messages, stream, **kwargs)
         for r in result:
             batch.append(r)
             yield r
         for line in batch:
-            self.token_usage["generated_tokens"] += count_tokens(line)
+            self.token_usage["generated_tokens"] += num_tokens_from_string(line)
         update_token_usage(self.user_api_key, self.token_usage)

     return wrapper

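Both decorators count prompt tokens before delegating and generated tokens after, then persist the totals through update_token_usage. A sketch of how they attach to an LLM class; FakeLLM is invented for illustration, and actually running this would write usage to the app's MongoDB:

    from application.usage import gen_token_usage

    class FakeLLM:
        def __init__(self):
            self.token_usage = {"prompt_tokens": 0, "generated_tokens": 0}
            self.user_api_key = None

        @gen_token_usage
        def answer(self, model, messages, stream, **kwargs):
            return "a canned answer"

    llm = FakeLLM()
    llm.answer("some-model", [{"content": "hi"}], stream=False)
    print(llm.token_usage)  # both counters are now non-zero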
@@ -1,6 +1,22 @@
-from transformers import GPT2TokenizerFast
+import tiktoken

-tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
-tokenizer.model_max_length = 100000
-def count_tokens(string):
-    return len(tokenizer(string)['input_ids'])
+_encoding = None
+
+
+def get_encoding():
+    global _encoding
+    if _encoding is None:
+        _encoding = tiktoken.get_encoding("cl100k_base")
+    return _encoding
+
+
+def num_tokens_from_string(string: str) -> int:
+    encoding = get_encoding()
+    num_tokens = len(encoding.encode(string))
+    return num_tokens
+
+
+def count_tokens_docs(docs):
+    docs_content = ""
+    for doc in docs:
+        docs_content += doc.page_content
+
+    tokens = num_tokens_from_string(docs_content)
+    return tokens
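count_tokens previously pulled GPT2TokenizerFast from transformers; the replacement caches one tiktoken encoding at module level, so repeated calls reuse it instead of re-loading tokenizer files. Usage is a one-liner (the sample string is arbitrary):

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")  # the same encoding utils.py caches
    print(len(enc.encode("How many tokens is this sentence?")))  # token count of the string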
@@ -2,8 +2,8 @@ import os
 import shutil
 import string
 import zipfile
-import tiktoken
 from urllib.parse import urljoin
+import logging

 import requests

@@ -13,6 +13,8 @@ from application.parser.remote.remote_creator import RemoteCreator
 from application.parser.open_ai_func import call_openai_api
 from application.parser.schema.base import Document
 from application.parser.token_func import group_split
+from application.utils import count_tokens_docs
+

 # Define a function to extract metadata from a given filename.
 def metadata_from_filename(title):
@@ -41,7 +43,7 @@ def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
         max_depth (int): Maximum allowed depth of recursion to prevent infinite loops.
     """
     if current_depth > max_depth:
-        print(f"Reached maximum recursion depth of {max_depth}")
+        logging.warning(f"Reached maximum recursion depth of {max_depth}")
         return

     with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -88,16 +90,13 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     max_tokens = 1250
     recursion_depth = 2
     full_path = os.path.join(directory, user, name_job)
-    import sys
-
-    print(full_path, file=sys.stderr)
+    logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job})
     # check if API_URL env variable is set
     file_data = {"name": name_job, "file": filename, "user": user}
     response = requests.get(
         urljoin(settings.API_URL, "/api/download"), params=file_data
     )
-    # check if file is in the response
-    print(response, file=sys.stderr)
     file = response.content

     if not os.path.exists(full_path):
@@ -137,7 +136,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):

     if sample:
         for i in range(min(5, len(raw_docs))):
-            print(raw_docs[i].text)
+            logging.info(f"Sample document {i}: {raw_docs[i]}")

     # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
     # and send them to the server (provide user and name in form)
@@ -180,6 +179,7 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
     if not os.path.exists(full_path):
         os.makedirs(full_path)
     self.update_state(state="PROGRESS", meta={"current": 1})
+    logging.info(f"Remote job: {full_path}", extra={"user": user, "job": name_job, source_data: source_data})

     remote_loader = RemoteCreator.create_loader(loader)
     raw_docs = remote_loader.load_data(source_data)
@@ -212,26 +212,4 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):

     shutil.rmtree(full_path)

     return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}
-
-
-def count_tokens_docs(docs):
-    # Here we convert the docs list to a string and calculate the number of tokens the string represents.
-    # docs_content = (" ".join(docs))
-    docs_content = ""
-    for doc in docs:
-        docs_content += doc.page_content
-
-    tokens, total_price = num_tokens_from_string(
-        string=docs_content, encoding_name="cl100k_base"
-    )
-    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
-    return tokens
-
-
-def num_tokens_from_string(string: str, encoding_name: str) -> int:
-    # Function to convert string to tokens and estimate user cost.
-    encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(encoding.encode(string))
-    total_price = (num_tokens / 1000) * 0.0004
-    return num_tokens, total_price