fix: truncate long text fields to prevent overflow in logs and sources

This commit is contained in:
Alex
2025-08-11 14:56:31 +01:00
parent 56831fbcf2
commit f94a093e8c
5 changed files with 38 additions and 17 deletions

View File

@@ -147,20 +147,25 @@ class BaseAnswerResource:
         # Log the interaction
         retriever_params = retriever.get_params()
-        self.user_logs_collection.insert_one(
-            {
-                "action": "stream_answer",
-                "level": "info",
-                "user": decoded_token.get("sub"),
-                "api_key": user_api_key,
-                "question": question,
-                "response": response_full,
-                "sources": source_log_docs,
-                "retriever_params": retriever_params,
-                "attachments": attachment_ids,
-                "timestamp": datetime.datetime.now(datetime.timezone.utc),
-            }
-        )
+        log_entry = {
+            "action": "stream_answer",
+            "level": "info",
+            "user": decoded_token.get("sub"),
+            "api_key": user_api_key,
+            "question": question,
+            "response": response_full,
+            "sources": source_log_docs,
+            "retriever_params": retriever_params,
+            "attachments": attachment_ids,
+            "timestamp": datetime.datetime.now(datetime.timezone.utc),
+        }
+
+        # clean up text fields to be no longer than 10000 characters
+        for key, value in log_entry.items():
+            if isinstance(value, str) and len(value) > 10000:
+                log_entry[key] = value[:10000]
+        self.user_logs_collection.insert_one(log_entry)
         # End of stream

View File

@@ -66,6 +66,11 @@ class ConversationService:
         if not user_id:
             raise ValueError("User ID not found in token")
         current_time = datetime.now(timezone.utc)
+        # clean up in sources array such that we save max 1k characters for text part
+        for source in sources:
+            if "text" in source and isinstance(source["text"], str):
+                source["text"] = source["text"][:1000]
         if conversation_id is not None and index is not None:
             # Update existing conversation with new query

View File

@@ -64,7 +64,7 @@ attachments_collection = db["attachments"]
 try:
     agents_collection.create_index(
-        [("shared_publicly", 1)],
+        [("shared", 1)],
         name="shared_index",
         background=True,
     )

View File

@@ -136,6 +136,8 @@ def _log_to_mongodb(
     mongo = MongoDB.get_client()
     db = mongo[settings.MONGO_DB_NAME]
     user_logs_collection = db["stack_logs"]

     log_entry = {
         "endpoint": endpoint,
@@ -147,6 +149,11 @@ def _log_to_mongodb(
         "stacks": stacks,
         "timestamp": datetime.datetime.now(datetime.timezone.utc),
     }
+    # clean up text fields to be no longer than 10000 characters
+    for key, value in log_entry.items():
+        if isinstance(value, str) and len(value) > 10000:
+            log_entry[key] = value[:10000]
     user_logs_collection.insert_one(log_entry)
     logging.debug(f"Logged activity to MongoDB: {activity_id}")

View File

@@ -471,9 +471,13 @@ def attachment_worker(self, file_info, user):
             .load_data()[0]
             .text,
         )
         token_count = num_tokens_from_string(content)
+        if token_count > 100000:
+            content = content[:250000]
+            token_count = num_tokens_from_string(content)
         self.update_state(
             state="PROGRESS", meta={"current": 80, "status": "Storing in database"}
         )