Merge branch 'main' of https://github.com/manishmadan2882/docsgpt

2026-01-20 05:50:58 +00:00 · 2025-08-12 18:05:11 +05:30
parent 5c615d6f2d d72558eb36
commit 3d1fe724e5
5 changed files with 38 additions and 17 deletions
--- a/application/api/answer/routes/base.py
+++ b/application/api/answer/routes/base.py
@@ -147,20 +147,25 @@ class BaseAnswerResource:
            # Log the interaction

            retriever_params = retriever.get_params()
-            self.user_logs_collection.insert_one(
-                {
-                    "action": "stream_answer",
-                    "level": "info",
-                    "user": decoded_token.get("sub"),
-                    "api_key": user_api_key,
-                    "question": question,
-                    "response": response_full,
-                    "sources": source_log_docs,
-                    "retriever_params": retriever_params,
-                    "attachments": attachment_ids,
-                    "timestamp": datetime.datetime.now(datetime.timezone.utc),
-                }
-            )
+            log_entry = {
+                "action": "stream_answer",
+                "level": "info",
+                "user": decoded_token.get("sub"),
+                "api_key": user_api_key,
+                "question": question,
+                "response": response_full,
+                "sources": source_log_docs,
+                "retriever_params": retriever_params,
+                "attachments": attachment_ids,
+                "timestamp": datetime.datetime.now(datetime.timezone.utc),
+            }
+            
+            # Truncate top-level string fields to at most 10000 characters (nested values are not touched)
+            for key, value in log_entry.items():
+                if isinstance(value, str) and len(value) > 10000:
+                    log_entry[key] = value[:10000]
+            
+            self.user_logs_collection.insert_one(log_entry)

            # End of stream

--- a/application/api/answer/services/conversation_service.py
+++ b/application/api/answer/services/conversation_service.py
@@ -66,6 +66,11 @@ class ConversationService:
        if not user_id:
            raise ValueError("User ID not found in token")
        current_time = datetime.now(timezone.utc)
+        
+        # Truncate each source's "text" field to at most 1000 characters (modifies the source dicts in place)
+        for source in sources:
+            if "text" in source and isinstance(source["text"], str):
+                source["text"] = source["text"][:1000]

        if conversation_id is not None and index is not None:
            # Update existing conversation with new query
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -65,7 +65,7 @@ attachments_collection = db["attachments"]

 try:
    agents_collection.create_index(
-        [("shared_publicly", 1)],
+        [("shared", 1)],
        name="shared_index",
        background=True,
    )
--- a/application/logging.py
+++ b/application/logging.py
@@ -136,6 +136,8 @@ def _log_to_mongodb(
        mongo = MongoDB.get_client()
        db = mongo[settings.MONGO_DB_NAME]
        user_logs_collection = db["stack_logs"]
+        
+

        log_entry = {
            "endpoint": endpoint,
@@ -147,6 +149,11 @@ def _log_to_mongodb(
            "stacks": stacks,
            "timestamp": datetime.datetime.now(datetime.timezone.utc),
        }
+        # Truncate top-level string fields to at most 10000 characters (nested values are not touched)
+        for key, value in log_entry.items():
+            if isinstance(value, str) and len(value) > 10000:
+                log_entry[key] = value[:10000]
+    
        user_logs_collection.insert_one(log_entry)
        logging.debug(f"Logged activity to MongoDB: {activity_id}")

--- a/application/worker.py
+++ b/application/worker.py
@@ -746,9 +746,13 @@ def attachment_worker(self, file_info, user):
            .load_data()[0]
            .text,
        )
-
+        
+        
        token_count = num_tokens_from_string(content)
-
+        if token_count > 100000:
+            content = content[:250000]
+            token_count = num_tokens_from_string(content)
+        
        self.update_state(
            state="PROGRESS", meta={"current": 80, "status": "Storing in database"}
        )