feat: process PDFs synthetically if model does not support files natively (#2263)

* feat: process PDFs synthetically if model does not support files natively

* fix: small code optimisations
This commit is contained in:
Alex
2026-01-15 02:30:33 +02:00
committed by GitHub
parent 2c55c6cd9a
commit f61d112cea
13 changed files with 449 additions and 72 deletions

View File

@@ -1,7 +1,13 @@
import base64
import logging
from anthropic import AI_PROMPT, Anthropic, HUMAN_PROMPT
from application.core.settings import settings
from application.llm.base import BaseLLM
from application.storage.storage_creator import StorageCreator
logger = logging.getLogger(__name__)
class AnthropicLLM(BaseLLM):
@@ -20,6 +26,7 @@ class AnthropicLLM(BaseLLM):
self.HUMAN_PROMPT = HUMAN_PROMPT
self.AI_PROMPT = AI_PROMPT
self.storage = StorageCreator.get_storage()
def _raw_gen(
self,
@@ -70,3 +77,115 @@ class AnthropicLLM(BaseLLM):
finally:
if hasattr(stream_response, "close"):
stream_response.close()
def get_supported_attachment_types(self):
    """
    List the attachment MIME types Anthropic Claude accepts directly.

    Claude's vision API accepts common image formats but not PDFs;
    PDF support is emulated upstream by converting pages to images
    before they reach this provider.

    Returns:
        list: Supported MIME type strings.
    """
    image_formats = ("png", "jpeg", "jpg", "webp", "gif")
    return [f"image/{fmt}" for fmt in image_formats]
def prepare_messages_with_attachments(self, messages, attachments=None):
    """
    Process attachments for the Anthropic Claude API.

    Formats images using Claude's vision message format (base64 source
    blocks) and appends them to the last user message, creating one if
    none exists.

    Args:
        messages (list): List of message dictionaries.
        attachments (list): List of attachment dictionaries with content
            and metadata. Pre-converted images (from PDF-to-image
            conversion) carry base64 in a 'data' key; otherwise the image
            is read from storage via 'path'.

    Returns:
        list: Messages formatted with image content for the Claude API.
        The input `messages` list and its dicts are left unmodified.
    """
    if not attachments:
        return messages

    prepared_messages = list(messages)

    # Find the last user message to attach images to.
    user_message_index = None
    for i in range(len(prepared_messages) - 1, -1, -1):
        if prepared_messages[i].get("role") == "user":
            user_message_index = i
            break

    if user_message_index is None:
        prepared_messages.append({"role": "user", "content": []})
        user_message_index = len(prepared_messages) - 1
    else:
        # Bug fix: the previous shallow copy of the messages list still
        # mutated the caller's message dict (rebinding "content" and
        # appending to the caller's content list). Copy the dict we are
        # about to modify so the input stays untouched.
        prepared_messages[user_message_index] = dict(
            prepared_messages[user_message_index]
        )

    # Normalize content to Claude's list-of-blocks format without
    # aliasing the caller's list.
    content = prepared_messages[user_message_index].get("content")
    if isinstance(content, str):
        prepared_messages[user_message_index]["content"] = [
            {"type": "text", "text": content}
        ]
    elif isinstance(content, list):
        prepared_messages[user_message_index]["content"] = list(content)
    else:
        prepared_messages[user_message_index]["content"] = []

    for attachment in attachments:
        mime_type = attachment.get("mime_type")
        if mime_type and mime_type.startswith("image/"):
            try:
                # Pre-converted images (from PDF-to-image conversion)
                # already carry base64 under 'data'.
                if "data" in attachment:
                    base64_image = attachment["data"]
                else:
                    base64_image = self._get_base64_image(attachment)
                # Claude uses a specific source-block format for images.
                prepared_messages[user_message_index]["content"].append(
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": mime_type,
                            "data": base64_image,
                        },
                    }
                )
            except Exception as e:
                logger.error(
                    f"Error processing image attachment: {e}", exc_info=True
                )
                if "content" in attachment:
                    prepared_messages[user_message_index]["content"].append(
                        {
                            "type": "text",
                            "text": f"[Image could not be processed: {attachment.get('path', 'unknown')}]",
                        }
                    )
    return prepared_messages
def _get_base64_image(self, attachment):
"""
Convert an image file to base64 encoding.
Args:
attachment (dict): Attachment dictionary with path and metadata.
Returns:
str: Base64-encoded image data.
"""
file_path = attachment.get("path")
if not file_path:
raise ValueError("No file path provided in attachment")
try:
with self.storage.get_file(file_path) as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
except FileNotFoundError:
raise FileNotFoundError(f"File not found: {file_path}")

View File

@@ -105,6 +105,7 @@ class LLMHandler(ABC):
"""
Prepare messages with attachments and provider-specific formatting.
Args:
agent: The agent instance
messages: Original messages
@@ -118,11 +119,40 @@ class LLMHandler(ABC):
logger.info(f"Preparing messages with {len(attachments)} attachments")
supported_types = agent.llm.get_supported_attachment_types()
# Check if provider supports images but not PDF (synthetic PDF support)
supports_images = any(t.startswith("image/") for t in supported_types)
supports_pdf = "application/pdf" in supported_types
# Process attachments, converting PDFs to images if needed
processed_attachments = []
for attachment in attachments:
mime_type = attachment.get("mime_type")
# Synthetic PDF support: convert PDF to images if LLM supports images but not PDF
if mime_type == "application/pdf" and supports_images and not supports_pdf:
logger.info(
f"Converting PDF to images for synthetic PDF support: {attachment.get('path', 'unknown')}"
)
try:
converted_images = self._convert_pdf_to_images(attachment)
processed_attachments.extend(converted_images)
logger.info(
f"Converted PDF to {len(converted_images)} images"
)
except Exception as e:
logger.error(
f"Failed to convert PDF to images, falling back to text: {e}"
)
# Fall back to treating as unsupported (text extraction)
processed_attachments.append(attachment)
else:
processed_attachments.append(attachment)
supported_attachments = [
a for a in attachments if a.get("mime_type") in supported_types
a for a in processed_attachments if a.get("mime_type") in supported_types
]
unsupported_attachments = [
a for a in attachments if a.get("mime_type") not in supported_types
a for a in processed_attachments if a.get("mime_type") not in supported_types
]
# Process supported attachments with the LLM's custom method
@@ -145,6 +175,37 @@ class LLMHandler(ABC):
)
return messages
def _convert_pdf_to_images(
    self, attachment: Dict, max_pages: int = 20, dpi: int = 150
) -> List[Dict]:
    """
    Convert a PDF attachment to a list of image attachments.

    This enables synthetic PDF support for LLMs that support images but
    not PDFs: each page is rendered to an image sent in the PDF's place.

    Args:
        attachment: PDF attachment dictionary with 'path' and optional 'content'.
        max_pages: Maximum number of pages to render. Generalized from the
            previous hard-coded limit; the default preserves prior behavior.
        dpi: Render resolution in dots per inch; the default preserves
            prior behavior.

    Returns:
        List of image attachment dictionaries with 'data', 'mime_type',
        and 'page'.

    Raises:
        ValueError: If the attachment has no 'path'.
    """
    # Local imports keep the dependency optional at module-import time.
    from application.utils import convert_pdf_to_images
    from application.storage.storage_creator import StorageCreator

    file_path = attachment.get("path")
    if not file_path:
        raise ValueError("No file path provided in PDF attachment")

    storage = StorageCreator.get_storage()
    return convert_pdf_to_images(
        file_path=file_path,
        storage=storage,
        max_pages=max_pages,
        dpi=dpi,
    )
def _append_unsupported_attachments(
self, messages: List[Dict], attachments: List[Dict]
) -> List[Dict]:

View File

@@ -9,6 +9,7 @@ from application.llm.novita import NovitaLLM
from application.llm.openai import AzureOpenAILLM, OpenAILLM
from application.llm.premai import PremAILLM
from application.llm.sagemaker import SagemakerAPILLM
from application.llm.open_router import OpenRouterLLM
logger = logging.getLogger(__name__)
@@ -25,6 +26,7 @@ class LLMCreator:
"groq": GroqLLM,
"google": GoogleLLM,
"novita": NovitaLLM,
"openrouter": OpenRouterLLM,
}
@classmethod

View File

@@ -0,0 +1,15 @@
from application.core.settings import settings
from application.llm.openai import OpenAILLM
OPEN_ROUTER_BASE_URL = "https://openrouter.ai/api/v1"
class OpenRouterLLM(OpenAILLM):
    """OpenAI-compatible LLM client pointed at the OpenRouter API.

    Resolves the API key from the explicit argument, then the
    OPEN_ROUTER_API_KEY setting, then the generic API_KEY setting, and
    defaults the base URL to the OpenRouter endpoint.
    """

    def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
        resolved_key = api_key or settings.OPEN_ROUTER_API_KEY or settings.API_KEY
        resolved_url = base_url or OPEN_ROUTER_BASE_URL
        super().__init__(
            api_key=resolved_key,
            user_api_key=user_api_key,
            base_url=resolved_url,
            *args,
            **kwargs,
        )

View File

@@ -9,6 +9,57 @@ from application.llm.base import BaseLLM
from application.storage.storage_creator import StorageCreator
def _truncate_base64_for_logging(messages):
"""
Create a copy of messages with base64 data truncated for readable logging.
Args:
messages: List of message dicts
Returns:
Copy of messages with truncated base64 content
"""
import copy
def truncate_content(content):
if isinstance(content, str):
# Check if it looks like a data URL with base64
if content.startswith("data:") and ";base64," in content:
prefix_end = content.index(";base64,") + len(";base64,")
prefix = content[:prefix_end]
return f"{prefix}[BASE64_DATA_TRUNCATED, length={len(content) - prefix_end}]"
return content
elif isinstance(content, list):
return [truncate_item(item) for item in content]
elif isinstance(content, dict):
return {k: truncate_content(v) for k, v in content.items()}
return content
def truncate_item(item):
if isinstance(item, dict):
result = {}
for k, v in item.items():
if k == "url" and isinstance(v, str) and ";base64," in v:
prefix_end = v.index(";base64,") + len(";base64,")
prefix = v[:prefix_end]
result[k] = f"{prefix}[BASE64_DATA_TRUNCATED, length={len(v) - prefix_end}]"
elif k == "data" and isinstance(v, str) and len(v) > 100:
result[k] = f"[BASE64_DATA_TRUNCATED, length={len(v)}]"
else:
result[k] = truncate_content(v)
return result
return truncate_content(item)
truncated = []
for msg in messages:
msg_copy = copy.copy(msg)
if "content" in msg_copy:
msg_copy["content"] = truncate_content(msg_copy["content"])
truncated.append(msg_copy)
return truncated
class OpenAILLM(BaseLLM):
def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
@@ -44,12 +95,12 @@ class OpenAILLM(BaseLLM):
if isinstance(content, str):
cleaned_messages.append({"role": role, "content": content})
elif isinstance(content, list):
# Collect all content parts into a single message
content_parts = []
for item in content:
if "text" in item:
cleaned_messages.append(
{"role": role, "content": item["text"]}
)
elif "function_call" in item:
if "function_call" in item:
# Function calls need their own message
cleaned_args = self._remove_null_values(
item["function_call"]["args"]
)
@@ -69,6 +120,7 @@ class OpenAILLM(BaseLLM):
}
)
elif "function_response" in item:
# Function responses need their own message
cleaned_messages.append(
{
"role": "tool",
@@ -81,36 +133,20 @@ class OpenAILLM(BaseLLM):
}
)
elif isinstance(item, dict):
content_parts = []
if "text" in item:
content_parts.append(
{"type": "text", "text": item["text"]}
)
elif (
"type" in item
and item["type"] == "text"
and "text" in item
):
# Collect content parts (text, images, files) into a single message
if "type" in item and item["type"] == "text" and "text" in item:
content_parts.append(item)
elif (
"type" in item
and item["type"] == "file"
and "file" in item
):
elif "type" in item and item["type"] == "file" and "file" in item:
content_parts.append(item)
elif (
"type" in item
and item["type"] == "image_url"
and "image_url" in item
):
elif "type" in item and item["type"] == "image_url" and "image_url" in item:
content_parts.append(item)
cleaned_messages.append(
{"role": role, "content": content_parts}
)
else:
raise ValueError(
f"Unexpected content dictionary format: {item}"
)
elif "text" in item and "type" not in item:
# Legacy format: {"text": "..."} without type
content_parts.append({"type": "text", "text": item["text"]})
# Add the collected content parts as a single message
if content_parts:
cleaned_messages.append({"role": role, "content": content_parts})
else:
raise ValueError(f"Unexpected content type: {type(content)}")
return cleaned_messages
@@ -127,7 +163,7 @@ class OpenAILLM(BaseLLM):
**kwargs,
):
messages = self._clean_messages_openai(messages)
logging.info(f"Cleaned messages: {messages}")
logging.info(f"Cleaned messages: {_truncate_base64_for_logging(messages)}")
# Convert max_tokens to max_completion_tokens for newer models
if "max_tokens" in kwargs:
@@ -163,7 +199,7 @@ class OpenAILLM(BaseLLM):
**kwargs,
):
messages = self._clean_messages_openai(messages)
logging.info(f"Cleaned messages: {messages}")
logging.info(f"Cleaned messages: {_truncate_base64_for_logging(messages)}")
# Convert max_tokens to max_completion_tokens for newer models
if "max_tokens" in kwargs:
@@ -261,17 +297,14 @@ class OpenAILLM(BaseLLM):
"""
Return a list of MIME types supported by OpenAI for file uploads.
This reads from the model config to ensure consistency.
If no model config found, falls back to images only (safest default).
Returns:
list: List of supported MIME types
"""
return [
"application/pdf",
"image/png",
"image/jpeg",
"image/jpg",
"image/webp",
"image/gif",
]
from application.core.model_configs import OPENAI_ATTACHMENTS
return OPENAI_ATTACHMENTS
def prepare_messages_with_attachments(self, messages, attachments=None):
"""
@@ -308,10 +341,16 @@ class OpenAILLM(BaseLLM):
prepared_messages[user_message_index]["content"] = []
for attachment in attachments:
mime_type = attachment.get("mime_type")
logging.info(f"Processing attachment with mime_type: {mime_type}, has_data: {'data' in attachment}, has_path: {'path' in attachment}")
if mime_type and mime_type.startswith("image/"):
try:
base64_image = self._get_base64_image(attachment)
# Check if this is a pre-converted image (from PDF-to-image conversion)
if "data" in attachment:
base64_image = attachment["data"]
else:
base64_image = self._get_base64_image(attachment)
prepared_messages[user_message_index]["content"].append(
{
"type": "image_url",
@@ -320,6 +359,7 @@ class OpenAILLM(BaseLLM):
},
}
)
except Exception as e:
logging.error(
f"Error processing image attachment: {e}", exc_info=True
@@ -334,6 +374,7 @@ class OpenAILLM(BaseLLM):
# Handle PDFs using the file API
elif mime_type == "application/pdf":
logging.info(f"Attempting to upload PDF to OpenAI: {attachment.get('path', 'unknown')}")
try:
file_id = self._upload_file_to_openai(attachment)
prepared_messages[user_message_index]["content"].append(
@@ -348,6 +389,8 @@ class OpenAILLM(BaseLLM):
"text": f"File content:\n\n{attachment['content']}",
}
)
else:
logging.warning(f"Unsupported attachment type in OpenAI provider: {mime_type}")
return prepared_messages
def _get_base64_image(self, attachment):