From 4b504d1a6f230eec09c108fb63b55640e58adff8 Mon Sep 17 00:00:00 2001 From: GH05TCREW Date: Sat, 10 Jan 2026 18:35:40 -0700 Subject: [PATCH] feat(memory): improve summarization --- pentestagent/llm/memory.py | 105 ++++++++++++++++++++++++++++--------- 1 file changed, 80 insertions(+), 25 deletions(-) diff --git a/pentestagent/llm/memory.py b/pentestagent/llm/memory.py index 74d0b5f..cd2b2f8 100644 --- a/pentestagent/llm/memory.py +++ b/pentestagent/llm/memory.py @@ -2,22 +2,30 @@ from typing import Awaitable, Callable, List, Optional -SUMMARY_PROMPT = """Summarize this conversation history for a pentesting agent. Be terse. +SUMMARY_PROMPT = """Summarize the following conversation segment for a penetration testing agent. +The summary will be used to continue the security assessment, so preserve all critical operational details. -Focus on: -- Targets discovered (IPs, domains, hosts) -- Open ports and services found -- Credentials or secrets discovered -- Vulnerabilities identified -- What was attempted and failed (to avoid repeating) -- Current objective/progress +What to preserve: +- Discovered targets (IPs, domains, hostnames) and network topology +- Services, versions, and technologies identified (keep exact version strings) +- Open ports and running services with specific details +- Vulnerabilities found or suspected (CVEs, misconfigurations, weaknesses) +- Credentials, tokens, API keys, or authentication details discovered +- Attack vectors attempted and their outcomes (success or failure) +- System architecture and relationships between hosts +- Important error messages or behaviors that may indicate vulnerabilities +- Current testing strategy and next planned steps -Omit: verbose tool output, back-and-forth clarifications, redundant info. +Compression approach: +- Consolidate redundant or repetitive findings into single statements +- Reduce verbose tool output while maintaining key technical findings +- Keep technical precision: exact paths, URLs, parameters, version numbers +- Remove conversational back-and-forth but preserve decisions made -Conversation to summarize: +Conversation segment: {conversation} -Summary:""" +Provide a concise technical summary:""" class ConversationMemory: @@ -59,6 +67,16 @@ class ConversationMemory: self._encoder = None return self._encoder + def _count_tokens_with_litellm(self, text: str, model: str) -> Optional[int]: + """Try to count tokens using litellm for better accuracy.""" + try: + import litellm + + count = litellm.token_counter(model=model, text=text) + return int(count) + except Exception: + return None + @property def token_budget(self) -> int: """Available tokens for history.""" @@ -159,7 +177,7 @@ class ConversationMemory: self, messages: List[dict], llm_call: Callable[[str], Awaitable[str]] ) -> str: """ - Summarize a list of messages. + Summarize a list of messages using chunked approach for better granularity. Args: messages: Messages to summarize @@ -168,18 +186,41 @@ class ConversationMemory: Returns: Summary string """ - # Format messages for summarization - conversation_text = self._format_for_summary(messages) + if not messages: + return "[No messages to summarize]" - # Call LLM for summary - prompt = SUMMARY_PROMPT.format(conversation=conversation_text) + # Use chunked summarization for better context preservation + # Process in chunks of 8-12 messages for balance between detail and efficiency + chunk_size = 10 + summaries = [] - try: - summary = await llm_call(prompt) - return summary.strip() - except Exception as e: - # Fallback: simple truncation indicator - return f"[{len(messages)} earlier messages - summarization failed: {e}]" + for i in range(0, len(messages), chunk_size): + chunk = messages[i : i + chunk_size] + conversation_text = self._format_for_summary(chunk) + prompt = SUMMARY_PROMPT.format(conversation=conversation_text) + + try: + chunk_summary = await llm_call(prompt) + if chunk_summary and chunk_summary.strip(): + summaries.append(chunk_summary.strip()) + except Exception as e: + # Log failure but continue with other chunks + summaries.append( + f"[{len(chunk)} messages from segment {i // chunk_size + 1} - summary failed: {e}]" + ) + + # Combine chunk summaries + if not summaries: + return f"[{len(messages)} earlier messages - all summarization attempts failed]" + + # If we have multiple summaries, join them with context markers + if len(summaries) == 1: + return summaries[0] + else: + combined = "\n\n".join( + f"Segment {i + 1}: {summary}" for i, summary in enumerate(summaries) + ) + return combined def _format_for_summary(self, messages: List[dict]) -> str: """Format messages as text for summarization.""" @@ -188,9 +229,20 @@ class ConversationMemory: role = msg.get("role", "unknown") content = msg.get("content", "") - # Truncate very long messages for summarization input - if len(content) > 2000: - content = content[:2000] + "...[truncated]" + # Preserve more content for tool outputs (they contain findings) + # but still limit to avoid overwhelming the summarizer + max_length = 4000 if role == "tool" else 2000 + if len(content) > max_length: + # For tool outputs, try to preserve beginning and end + if role == "tool": + half = max_length // 2 + content = ( + content[:half] + + f"\n...[{len(content) - max_length} chars truncated]...\n" + + content[-half:] + ) + else: + content = content[:max_length] + "...[truncated]" if role == "user": lines.append(f"User: {content}") @@ -199,6 +251,9 @@ class ConversationMemory: elif role == "tool": tool_name = msg.get("name", "tool") lines.append(f"Tool ({tool_name}): {content}") + elif role == "system": + # Skip system messages in summarization input + continue return "\n\n".join(lines)