From f94a093e8cff88508d8f2f949ab51ae580b1be66 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 11 Aug 2025 14:56:31 +0100 Subject: [PATCH] fix: truncate long text fields to prevent overflow in logs and sources --- application/api/answer/routes/base.py | 33 +++++++++++-------- .../answer/services/conversation_service.py | 5 +++ application/api/user/routes.py | 2 +- application/logging.py | 7 ++++ application/worker.py | 8 +++-- 5 files changed, 38 insertions(+), 17 deletions(-) diff --git a/application/api/answer/routes/base.py b/application/api/answer/routes/base.py index 682da1f0..3f876081 100644 --- a/application/api/answer/routes/base.py +++ b/application/api/answer/routes/base.py @@ -147,20 +147,25 @@ class BaseAnswerResource: # Log the interaction retriever_params = retriever.get_params() - self.user_logs_collection.insert_one( - { - "action": "stream_answer", - "level": "info", - "user": decoded_token.get("sub"), - "api_key": user_api_key, - "question": question, - "response": response_full, - "sources": source_log_docs, - "retriever_params": retriever_params, - "attachments": attachment_ids, - "timestamp": datetime.datetime.now(datetime.timezone.utc), - } - ) + log_entry = { + "action": "stream_answer", + "level": "info", + "user": decoded_token.get("sub"), + "api_key": user_api_key, + "question": question, + "response": response_full, + "sources": source_log_docs, + "retriever_params": retriever_params, + "attachments": attachment_ids, + "timestamp": datetime.datetime.now(datetime.timezone.utc), + } + + # clean up text fields to be no longer than 10000 characters + for key, value in log_entry.items(): + if isinstance(value, str) and len(value) > 10000: + log_entry[key] = value[:10000] + + self.user_logs_collection.insert_one(log_entry) # End of stream diff --git a/application/api/answer/services/conversation_service.py b/application/api/answer/services/conversation_service.py index e35fcc40..3ea7a136 100644 --- a/application/api/answer/services/conversation_service.py +++ b/application/api/answer/services/conversation_service.py @@ -66,6 +66,11 @@ class ConversationService: if not user_id: raise ValueError("User ID not found in token") current_time = datetime.now(timezone.utc) + + # clean up in sources array such that we save max 1k characters for text part + for source in sources: + if "text" in source and isinstance(source["text"], str): + source["text"] = source["text"][:1000] if conversation_id is not None and index is not None: # Update existing conversation with new query diff --git a/application/api/user/routes.py b/application/api/user/routes.py index a6c0d55b..259da757 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -64,7 +64,7 @@ attachments_collection = db["attachments"] try: agents_collection.create_index( - [("shared_publicly", 1)], + [("shared", 1)], name="shared_index", background=True, ) diff --git a/application/logging.py b/application/logging.py index d48fb17e..2c5cde27 100644 --- a/application/logging.py +++ b/application/logging.py @@ -136,6 +136,8 @@ def _log_to_mongodb( mongo = MongoDB.get_client() db = mongo[settings.MONGO_DB_NAME] user_logs_collection = db["stack_logs"] + + log_entry = { "endpoint": endpoint, @@ -147,6 +149,11 @@ def _log_to_mongodb( "stacks": stacks, "timestamp": datetime.datetime.now(datetime.timezone.utc), } + # clean up text fields to be no longer than 10000 characters + for key, value in log_entry.items(): + if isinstance(value, str) and len(value) > 10000: + log_entry[key] = value[:10000] + user_logs_collection.insert_one(log_entry) logging.debug(f"Logged activity to MongoDB: {activity_id}") diff --git a/application/worker.py b/application/worker.py index 23f96bf5..71747005 100755 --- a/application/worker.py +++ b/application/worker.py @@ -471,9 +471,13 @@ def attachment_worker(self, file_info, user): .load_data()[0] .text, ) - + + token_count = num_tokens_from_string(content) - + if token_count > 100000: + content = content[:250000] + token_count = num_tokens_from_string(content) + self.update_state( state="PROGRESS", meta={"current": 80, "status": "Storing in database"} )