mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 08:33:20 +00:00
fix: truncate long text fields to prevent overflow in logs and sources
This commit is contained in:
@@ -147,20 +147,25 @@ class BaseAnswerResource:
|
|||||||
# Log the interaction
|
# Log the interaction
|
||||||
|
|
||||||
retriever_params = retriever.get_params()
|
retriever_params = retriever.get_params()
|
||||||
self.user_logs_collection.insert_one(
|
log_entry = {
|
||||||
{
|
"action": "stream_answer",
|
||||||
"action": "stream_answer",
|
"level": "info",
|
||||||
"level": "info",
|
"user": decoded_token.get("sub"),
|
||||||
"user": decoded_token.get("sub"),
|
"api_key": user_api_key,
|
||||||
"api_key": user_api_key,
|
"question": question,
|
||||||
"question": question,
|
"response": response_full,
|
||||||
"response": response_full,
|
"sources": source_log_docs,
|
||||||
"sources": source_log_docs,
|
"retriever_params": retriever_params,
|
||||||
"retriever_params": retriever_params,
|
"attachments": attachment_ids,
|
||||||
"attachments": attachment_ids,
|
"timestamp": datetime.datetime.now(datetime.timezone.utc),
|
||||||
"timestamp": datetime.datetime.now(datetime.timezone.utc),
|
}
|
||||||
}
|
|
||||||
)
|
# clean up text fields to be no longer than 10000 characters
|
||||||
|
for key, value in log_entry.items():
|
||||||
|
if isinstance(value, str) and len(value) > 10000:
|
||||||
|
log_entry[key] = value[:10000]
|
||||||
|
|
||||||
|
self.user_logs_collection.insert_one(log_entry)
|
||||||
|
|
||||||
# End of stream
|
# End of stream
|
||||||
|
|
||||||
|
|||||||
@@ -66,6 +66,11 @@ class ConversationService:
|
|||||||
if not user_id:
|
if not user_id:
|
||||||
raise ValueError("User ID not found in token")
|
raise ValueError("User ID not found in token")
|
||||||
current_time = datetime.now(timezone.utc)
|
current_time = datetime.now(timezone.utc)
|
||||||
|
|
||||||
|
# clean up in sources array such that we save max 1k characters for text part
|
||||||
|
for source in sources:
|
||||||
|
if "text" in source and isinstance(source["text"], str):
|
||||||
|
source["text"] = source["text"][:1000]
|
||||||
|
|
||||||
if conversation_id is not None and index is not None:
|
if conversation_id is not None and index is not None:
|
||||||
# Update existing conversation with new query
|
# Update existing conversation with new query
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ attachments_collection = db["attachments"]
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
agents_collection.create_index(
|
agents_collection.create_index(
|
||||||
[("shared_publicly", 1)],
|
[("shared", 1)],
|
||||||
name="shared_index",
|
name="shared_index",
|
||||||
background=True,
|
background=True,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -136,6 +136,8 @@ def _log_to_mongodb(
|
|||||||
mongo = MongoDB.get_client()
|
mongo = MongoDB.get_client()
|
||||||
db = mongo[settings.MONGO_DB_NAME]
|
db = mongo[settings.MONGO_DB_NAME]
|
||||||
user_logs_collection = db["stack_logs"]
|
user_logs_collection = db["stack_logs"]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
log_entry = {
|
log_entry = {
|
||||||
"endpoint": endpoint,
|
"endpoint": endpoint,
|
||||||
@@ -147,6 +149,11 @@ def _log_to_mongodb(
|
|||||||
"stacks": stacks,
|
"stacks": stacks,
|
||||||
"timestamp": datetime.datetime.now(datetime.timezone.utc),
|
"timestamp": datetime.datetime.now(datetime.timezone.utc),
|
||||||
}
|
}
|
||||||
|
# clean up text fields to be no longer than 10000 characters
|
||||||
|
for key, value in log_entry.items():
|
||||||
|
if isinstance(value, str) and len(value) > 10000:
|
||||||
|
log_entry[key] = value[:10000]
|
||||||
|
|
||||||
user_logs_collection.insert_one(log_entry)
|
user_logs_collection.insert_one(log_entry)
|
||||||
logging.debug(f"Logged activity to MongoDB: {activity_id}")
|
logging.debug(f"Logged activity to MongoDB: {activity_id}")
|
||||||
|
|
||||||
|
|||||||
@@ -471,9 +471,13 @@ def attachment_worker(self, file_info, user):
|
|||||||
.load_data()[0]
|
.load_data()[0]
|
||||||
.text,
|
.text,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
token_count = num_tokens_from_string(content)
|
token_count = num_tokens_from_string(content)
|
||||||
|
if token_count > 100000:
|
||||||
|
content = content[:250000]
|
||||||
|
token_count = num_tokens_from_string(content)
|
||||||
|
|
||||||
self.update_state(
|
self.update_state(
|
||||||
state="PROGRESS", meta={"current": 80, "status": "Storing in database"}
|
state="PROGRESS", meta={"current": 80, "status": "Storing in database"}
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user