diff --git a/application/agents/base.py b/application/agents/base.py
index 44df7ee4..eace50b1 100644
--- a/application/agents/base.py
+++ b/application/agents/base.py
@@ -346,12 +346,81 @@ class BaseAgent(ABC):
             logger.error(f"Error checking context limit: {str(e)}", exc_info=True)
             return False

+    def _validate_context_size(self, messages: List[Dict]) -> None:
+        """
+        Pre-flight validation before calling LLM. Logs warnings but never raises errors.
+
+        Args:
+            messages: Messages to be sent to LLM
+        """
+        from application.core.model_utils import get_token_limit
+
+        current_tokens = self._calculate_current_context_tokens(messages)
+        self.current_token_count = current_tokens
+        context_limit = get_token_limit(self.model_id)
+
+        percentage = (current_tokens / context_limit) * 100
+
+        # Log based on usage level
+        if current_tokens >= context_limit:
+            logger.warning(
+                f"Context at limit: {current_tokens:,}/{context_limit:,} tokens "
+                f"({percentage:.1f}%). Model: {self.model_id}"
+            )
+        elif current_tokens >= int(context_limit * settings.COMPRESSION_THRESHOLD_PERCENTAGE):
+            logger.info(
+                f"Context approaching limit: {current_tokens:,}/{context_limit:,} tokens "
+                f"({percentage:.1f}%)"
+            )
+
+    def _truncate_text_middle(self, text: str, max_tokens: int) -> str:
+        """
+        Truncate text by removing content from the middle, preserving start and end.
+
+        Args:
+            text: Text to truncate
+            max_tokens: Maximum tokens allowed
+
+        Returns:
+            Truncated text with middle removed if needed
+        """
+        from application.utils import num_tokens_from_string
+
+        current_tokens = num_tokens_from_string(text)
+        if current_tokens <= max_tokens:
+            return text
+
+        # Estimate chars per token (roughly 4 chars per token for English)
+        chars_per_token = len(text) / current_tokens if current_tokens > 0 else 4
+        target_chars = int(max_tokens * chars_per_token * 0.95)  # 5% safety margin
+
+        if target_chars <= 0:
+            return ""
+
+        # Split: keep 40% from start, 40% from end, remove middle
+        start_chars = int(target_chars * 0.4)
+        end_chars = int(target_chars * 0.4)
+
+        truncation_marker = "\n\n[... content truncated to fit context limit ...]\n\n"
+
+        truncated = text[:start_chars] + truncation_marker + text[-end_chars:]
+
+        logger.info(
+            f"Truncated text from {current_tokens:,} to ~{max_tokens:,} tokens "
+            f"(removed middle section)"
+        )
+
+        return truncated
+
     def _build_messages(
         self,
         system_prompt: str,
         query: str,
     ) -> List[Dict]:
         """Build messages using pre-rendered system prompt"""
+        from application.core.model_utils import get_token_limit
+        from application.utils import num_tokens_from_string
+
         # Append compression summary to system prompt if present
         if self.compressed_summary:
             compression_context = (
@@ -363,9 +432,34 @@
             )
             system_prompt = system_prompt + compression_context

+        context_limit = get_token_limit(self.model_id)
+        system_tokens = num_tokens_from_string(system_prompt)
+
+        # Reserve 10% for response/tools
+        safety_buffer = int(context_limit * 0.1)
+        available_after_system = context_limit - system_tokens - safety_buffer
+
+        # Max tokens for query: 80% of available space (leave room for history)
+        max_query_tokens = int(available_after_system * 0.8)
+        query_tokens = num_tokens_from_string(query)
+
+        # Truncate query from middle if it exceeds 80% of available context
+        if query_tokens > max_query_tokens:
+            query = self._truncate_text_middle(query, max_query_tokens)
+            query_tokens = num_tokens_from_string(query)
+
+        # Calculate remaining budget for chat history
+        available_for_history = max(available_after_system - query_tokens, 0)
+
+        # Truncate chat history to fit within available budget
+        working_history = self._truncate_history_to_fit(
+            self.chat_history,
+            available_for_history,
+        )
+
         messages = [{"role": "system", "content": system_prompt}]

-        for i in self.chat_history:
+        for i in working_history:
             if "prompt" in i and "response" in i:
                 messages.append({"role": "user", "content": i["prompt"]})
                 messages.append({"role": "assistant", "content": i["response"]})
@@ -397,7 +491,65 @@
         messages.append({"role": "user", "content": query})
         return messages

+    def _truncate_history_to_fit(
+        self,
+        history: List[Dict],
+        max_tokens: int,
+    ) -> List[Dict]:
+        """
+        Truncate chat history to fit within token budget, keeping most recent messages.
+
+        Args:
+            history: Full chat history
+            max_tokens: Maximum tokens allowed for history
+
+        Returns:
+            Truncated history (most recent messages that fit)
+        """
+        from application.utils import num_tokens_from_string
+
+        if not history or max_tokens <= 0:
+            return []
+
+        truncated = []
+        current_tokens = 0
+
+        # Iterate from newest to oldest
+        for message in reversed(history):
+            message_tokens = 0
+
+            if "prompt" in message and "response" in message:
+                message_tokens += num_tokens_from_string(message["prompt"])
+                message_tokens += num_tokens_from_string(message["response"])
+
+            if "tool_calls" in message:
+                for tool_call in message["tool_calls"]:
+                    tool_str = (
+                        f"Tool: {tool_call.get('tool_name')} | "
+                        f"Action: {tool_call.get('action_name')} | "
+                        f"Args: {tool_call.get('arguments')} | "
+                        f"Response: {tool_call.get('result')}"
+                    )
+                    message_tokens += num_tokens_from_string(tool_str)
+
+            if current_tokens + message_tokens <= max_tokens:
+                current_tokens += message_tokens
+                truncated.insert(0, message)  # Maintain chronological order
+            else:
+                break
+
+        if len(truncated) < len(history):
+            logger.info(
+                f"Truncated chat history from {len(history)} to {len(truncated)} messages "
+                f"to fit within {max_tokens:,} token budget"
+            )
+
+        return truncated
+
     def _llm_gen(self, messages: List[Dict], log_context: Optional[LogContext] = None):
+        # Pre-flight context validation - logs a warning if the context is near or over the limit
+        self._validate_context_size(messages)
+
         gen_kwargs = {"model": self.model_id, "messages": messages}

         if (
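To make the token budgeting in `_build_messages` above concrete, here is a small, self-contained sketch of the arithmetic. The context limit and token counts are invented placeholder numbers, not values taken from the code or from any particular model:

```python
# Illustrative only: how a context window is split between system prompt,
# query, and chat history, mirroring the budgeting in _build_messages().
context_limit = 100_000                 # pretend get_token_limit() returned 100k
system_tokens = 4_000                   # tokens used by the rendered system prompt

safety_buffer = int(context_limit * 0.1)                                 # 10,000 reserved for response/tools
available_after_system = context_limit - system_tokens - safety_buffer   # 86,000

max_query_tokens = int(available_after_system * 0.8)                     # 68,800 cap on the query

query_tokens = 90_000                   # an oversized query...
if query_tokens > max_query_tokens:
    query_tokens = max_query_tokens     # ...would be middle-truncated down to the cap

available_for_history = max(available_after_system - query_tokens, 0)    # 17,200 left for history

print(available_after_system, max_query_tokens, available_for_history)   # 86000 68800 17200
```

An oversized query is cut by `_truncate_text_middle` (keeping roughly the first and last 40% of the target length), and whatever budget remains is handed to `_truncate_history_to_fit`, which walks the history from newest to oldest until the budget is spent.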
diff --git a/docs/pages/Guides/_meta.json b/docs/pages/Guides/_meta.json
index 065cd8d8..f5ea6c6d 100644
--- a/docs/pages/Guides/_meta.json
+++ b/docs/pages/Guides/_meta.json
@@ -21,6 +21,10 @@
     "title": "🏗️ Architecture",
     "href": "/Guides/Architecture"
   },
+  "compression": {
+    "title": "🗜️ Context Compression",
+    "href": "/Guides/compression"
+  },
   "Integrations": {
     "title": "🔗 Integrations"
   }
diff --git a/docs/pages/Guides/compression.md b/docs/pages/Guides/compression.md
new file mode 100644
index 00000000..95be9686
--- /dev/null
+++ b/docs/pages/Guides/compression.md
@@ -0,0 +1,37 @@
+# Context Compression
+
+DocsGPT implements a smart context compression system to manage long conversations effectively. This feature prevents conversations from hitting the LLM's context window limit while preserving critical information and continuity.
+
+## How It Works
+
+The compression system operates on a "summarize and truncate" principle:
+
+1. **Threshold Check**: Before each request, the system calculates the total token count of the conversation history.
+2. **Trigger**: If the token count exceeds a configured threshold (default: 80% of the model's context limit), compression is triggered.
+3. **Summarization**: An LLM (potentially a different, cheaper/faster one) processes the older part of the conversation—including previous summaries, user messages, agent responses, and tool outputs.
+4. **Context Replacement**: The system generates a comprehensive summary of the older history. For subsequent requests, the LLM receives this **Summary + Recent Messages** instead of the full raw history.
+
+### Key Features
+
+* **Recursive Summarization**: New summaries incorporate previous summaries, ensuring that information from the very beginning of a long chat is not lost.
+* **Tool Call Support**: The compression logic explicitly handles tool calls and their outputs (e.g., file readings, search results), summarizing their results so the agent retains knowledge of what it has already done.
+* **"Needle in a Haystack" Preservation**: The prompts are designed to identify and preserve specific, critical details (like passwords, keys, or specific user instructions) even when compressing large amounts of text.
+
+## Configuration
+
+You can configure the compression behavior in your `.env` file or `application/core/settings.py`:
+
+| Setting | Default | Description |
+| :--- | :--- | :--- |
+| `ENABLE_CONVERSATION_COMPRESSION` | `True` | Master switch to enable/disable the feature. |
+| `COMPRESSION_THRESHOLD_PERCENTAGE` | `0.8` | The fraction of the context window (0.0 to 1.0) that triggers compression. |
+| `COMPRESSION_MODEL_OVERRIDE` | `None` | (Optional) Specify a different model ID to use specifically for the summarization task (e.g., using `gpt-3.5-turbo` to compress for `gpt-4`). |
+| `COMPRESSION_MAX_HISTORY_POINTS` | `3` | The number of past compression points to keep in the database (older ones are discarded as they are incorporated into newer summaries). |
+
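For example, a minimal `.env` snippet overriding some of these defaults might look like the following. The variable names match the settings in the table; the values are purely illustrative, and the override model is an arbitrary example rather than a recommendation:

```bash
# Illustrative overrides - defaults are listed in the table above
ENABLE_CONVERSATION_COMPRESSION=True
COMPRESSION_THRESHOLD_PERCENTAGE=0.7    # compress once 70% of the context window is used
COMPRESSION_MODEL_OVERRIDE=gpt-4o-mini  # optional cheaper/faster model for summarization
COMPRESSION_MAX_HISTORY_POINTS=3
```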
+* **"Needle in a Haystack" Preservation**: The prompts are designed to identify and preserve specific, critical details (like passwords, keys, or specific user instructions) even when compressing large amounts of text. + +## Configuration + +You can configure the compression behavior in your `.env` file or `application/core/settings.py`: + +| Setting | Default | Description | +| :--- | :--- | :--- | +| `ENABLE_CONVERSATION_COMPRESSION` | `True` | Master switch to enable/disable the feature. | +| `COMPRESSION_THRESHOLD_PERCENTAGE` | `0.8` | The fraction of the context window (0.0 to 1.0) that triggers compression. | +| `COMPRESSION_MODEL_OVERRIDE` | `None` | (Optional) Specify a different model ID to use specifically for the summarization task (e.g., using `gpt-3.5-turbo` to compress for `gpt-4`). | +| `COMPRESSION_MAX_HISTORY_POINTS` | `3` | The number of past compression points to keep in the database (older ones are discarded as they are incorporated into newer summaries). | + +## Architecture + +The system is modularized into several components: + +* **`CompressionThresholdChecker`**: Calculates token usage and decides when to compress. +* **`CompressionService`**: Orchestrates the compression process, manages DB updates, and reconstructs the context (Summary + Recent Messages) for the LLM. +* **`CompressionPromptBuilder`**: Constructs the specific prompts used to instruct the LLM to summarize the conversation effectively.