Handle hallucinated LLM tool calls and strip NUL characters before ingestion.

* application/agents/base.py: when the parser returns no tool id or action
  name, or the tool id is not a key of tools_dict, log the problem, yield a
  "tool_call" event with status "error", record the call in self.tool_calls,
  and return early instead of indexing tools_dict with a bad key.
* application/agents/tools/tool_action_parser.py: reject tool names that
  contain no underscore, warn when the trailing id is not numeric, and (OpenAI
  path only) treat malformed JSON arguments as a parse failure.
* application/parser/embedding_pipeline.py: remove NUL characters from
  page_content before calling store.add_texts.

Note: the index lines are carried over unchanged; the post-image blob hashes
for base.py and tool_action_parser.py will differ once this patch is applied.

diff --git a/application/agents/base.py b/application/agents/base.py
index 33c86f7a..32d860b8 100644
--- a/application/agents/base.py
+++ b/application/agents/base.py
@@ -1,9 +1,12 @@
+import logging
 import uuid
 from abc import ABC, abstractmethod
 from typing import Dict, Generator, List, Optional
 
 from bson.objectid import ObjectId
 
+logger = logging.getLogger(__name__)
+
 from application.agents.tools.tool_action_parser import ToolActionParser
 from application.agents.tools.tool_manager import ToolManager
 
@@ -139,6 +142,40 @@ class BaseAgent(ABC):
         tool_id, action_name, call_args = parser.parse_args(call)
 
         call_id = getattr(call, "id", None) or str(uuid.uuid4())
+
+        # Check if parsing failed
+        if tool_id is None or action_name is None:
+            error_message = f"Error: Failed to parse LLM tool call. Tool name: {getattr(call, 'name', 'unknown')}"
+            logger.error(error_message)
+
+            tool_call_data = {
+                "tool_name": "unknown",
+                "call_id": call_id,
+                "action_name": getattr(call, 'name', 'unknown'),
+                "arguments": call_args or {},
+                "result": f"Failed to parse tool call. Invalid tool name format: {getattr(call, 'name', 'unknown')}",
+            }
+            yield {"type": "tool_call", "data": {**tool_call_data, "status": "error"}}
+            self.tool_calls.append(tool_call_data)
+            return "Failed to parse tool call.", call_id
+
+        # Check if tool_id exists in available tools
+        if tool_id not in tools_dict:
+            error_message = f"Error: Tool ID '{tool_id}' extracted from LLM call not found in available tools_dict. Available IDs: {list(tools_dict.keys())}"
+            logger.error(error_message)
+
+            # Return error result
+            tool_call_data = {
+                "tool_name": "unknown",
+                "call_id": call_id,
+                "action_name": f"{action_name}_{tool_id}",
+                "arguments": call_args,
+                "result": f"Tool with ID {tool_id} not found. Available tools: {list(tools_dict.keys())}",
+            }
+            yield {"type": "tool_call", "data": {**tool_call_data, "status": "error"}}
+            self.tool_calls.append(tool_call_data)
+            return f"Tool with ID {tool_id} not found.", call_id
+
         tool_call_data = {
             "tool_name": tools_dict[tool_id]["name"],
             "call_id": call_id,
diff --git a/application/agents/tools/tool_action_parser.py b/application/agents/tools/tool_action_parser.py
index 0589ac88..ea544338 100644
--- a/application/agents/tools/tool_action_parser.py
+++ b/application/agents/tools/tool_action_parser.py
@@ -19,8 +19,20 @@ class ToolActionParser:
     def _parse_openai_llm(self, call):
         try:
             call_args = json.loads(call.arguments)
-            tool_id = call.name.split("_")[-1]
-            action_name = call.name.rsplit("_", 1)[0]
+            tool_parts = call.name.split("_")
+
+            # If the tool name doesn't contain an underscore, it's likely a hallucinated tool
+            if len(tool_parts) < 2:
+                logger.warning(f"Invalid tool name format: {call.name}. Expected format: action_name_tool_id")
+                return None, None, None
+
+            tool_id = tool_parts[-1]
+            action_name = "_".join(tool_parts[:-1])
+
+            # Validate that tool_id looks like a numerical ID
+            if not tool_id.isdigit():
+                logger.warning(f"Tool ID '{tool_id}' is not numerical. This might be a hallucinated tool call.")
+
-        except (AttributeError, TypeError) as e:
+        except (AttributeError, TypeError, ValueError) as e:  # ValueError: malformed JSON (json.JSONDecodeError)
             logger.error(f"Error parsing OpenAI LLM call: {e}")
             return None, None, None
@@ -29,8 +41,20 @@ class ToolActionParser:
     def _parse_google_llm(self, call):
         try:
             call_args = call.arguments
-            tool_id = call.name.split("_")[-1]
-            action_name = call.name.rsplit("_", 1)[0]
+            tool_parts = call.name.split("_")
+
+            # If the tool name doesn't contain an underscore, it's likely a hallucinated tool
+            if len(tool_parts) < 2:
+                logger.warning(f"Invalid tool name format: {call.name}. Expected format: action_name_tool_id")
+                return None, None, None
+
+            tool_id = tool_parts[-1]
+            action_name = "_".join(tool_parts[:-1])
+
+            # Validate that tool_id looks like a numerical ID
+            if not tool_id.isdigit():
+                logger.warning(f"Tool ID '{tool_id}' is not numerical. This might be a hallucinated tool call.")
+
         except (AttributeError, TypeError) as e:
             logger.error(f"Error parsing Google LLM call: {e}")
             return None, None, None
diff --git a/application/parser/embedding_pipeline.py b/application/parser/embedding_pipeline.py
index 38492c7c..7511f3df 100755
--- a/application/parser/embedding_pipeline.py
+++ b/application/parser/embedding_pipeline.py
@@ -6,6 +6,21 @@ from application.core.settings import settings
 from application.vectorstore.vector_creator import VectorCreator
 
 
+def sanitize_content(content: str) -> str:
+    """
+    Remove NUL characters that can cause vector store ingestion to fail.
+
+    Args:
+        content (str): Raw content that may contain NUL characters
+
+    Returns:
+        str: Sanitized content with NUL characters removed
+    """
+    if not content:
+        return content
+    return content.replace('\x00', '')
+
+
 @retry(tries=10, delay=60)
 def add_text_to_store_with_retry(store, doc, source_id):
     """
@@ -16,6 +31,9 @@ def add_text_to_store_with_retry(store, doc, source_id):
         source_id: Unique identifier for the source.
     """
     try:
+        # Sanitize content to remove NUL characters that cause ingestion failures
+        doc.page_content = sanitize_content(doc.page_content)
+
         doc.metadata["source_id"] = str(source_id)
         store.add_texts([doc.page_content], metadatas=[doc.metadata])
     except Exception as e: