feat: process PDFs synthetically if model does not support files natively (#2263)

* feat: process PDFs synthetically if model does not support files natively

* fix: small code optimisations
This commit is contained in:
Alex
2026-01-15 02:30:33 +02:00
committed by GitHub
parent 2c55c6cd9a
commit f61d112cea
13 changed files with 449 additions and 72 deletions

View File

@@ -1,7 +1,13 @@
import base64
import logging
from anthropic import AI_PROMPT, Anthropic, HUMAN_PROMPT
from application.core.settings import settings
from application.llm.base import BaseLLM
from application.storage.storage_creator import StorageCreator
logger = logging.getLogger(__name__)
class AnthropicLLM(BaseLLM):
@@ -20,6 +26,7 @@ class AnthropicLLM(BaseLLM):
self.HUMAN_PROMPT = HUMAN_PROMPT
self.AI_PROMPT = AI_PROMPT
self.storage = StorageCreator.get_storage()
def _raw_gen(
self,
@@ -70,3 +77,115 @@ class AnthropicLLM(BaseLLM):
finally:
if hasattr(stream_response, "close"):
stream_response.close()
def get_supported_attachment_types(self):
    """
    List the attachment MIME types Anthropic Claude accepts directly.

    Claude's vision API accepts common image formats but not PDFs;
    PDF support is emulated upstream by converting pages to images
    before they reach this provider.

    Returns:
        list: Supported MIME type strings.
    """
    image_formats = ("png", "jpeg", "jpg", "webp", "gif")
    return [f"image/{fmt}" for fmt in image_formats]
def prepare_messages_with_attachments(self, messages, attachments=None):
    """
    Process attachments for the Anthropic Claude API.

    Formats images using Claude's vision message format (base64 source
    blocks) and appends them to the last user message, creating one if
    none exists.

    Args:
        messages (list): List of message dictionaries.
        attachments (list): List of attachment dictionaries with content
            and metadata. Pre-converted images (from PDF-to-image
            conversion) carry base64 in a 'data' key; otherwise the image
            is read from storage via 'path'.

    Returns:
        list: Messages formatted with image content for the Claude API.
        The input `messages` list and its dicts are left unmodified.
    """
    if not attachments:
        return messages

    prepared_messages = list(messages)

    # Find the last user message to attach images to.
    user_message_index = None
    for i in range(len(prepared_messages) - 1, -1, -1):
        if prepared_messages[i].get("role") == "user":
            user_message_index = i
            break

    if user_message_index is None:
        prepared_messages.append({"role": "user", "content": []})
        user_message_index = len(prepared_messages) - 1
    else:
        # Bug fix: the previous shallow copy of the messages list still
        # mutated the caller's message dict (rebinding "content" and
        # appending to the caller's content list). Copy the dict we are
        # about to modify so the input stays untouched.
        prepared_messages[user_message_index] = dict(
            prepared_messages[user_message_index]
        )

    # Normalize content to Claude's list-of-blocks format without
    # aliasing the caller's list.
    content = prepared_messages[user_message_index].get("content")
    if isinstance(content, str):
        prepared_messages[user_message_index]["content"] = [
            {"type": "text", "text": content}
        ]
    elif isinstance(content, list):
        prepared_messages[user_message_index]["content"] = list(content)
    else:
        prepared_messages[user_message_index]["content"] = []

    for attachment in attachments:
        mime_type = attachment.get("mime_type")
        if mime_type and mime_type.startswith("image/"):
            try:
                # Pre-converted images (from PDF-to-image conversion)
                # already carry base64 under 'data'.
                if "data" in attachment:
                    base64_image = attachment["data"]
                else:
                    base64_image = self._get_base64_image(attachment)
                # Claude uses a specific source-block format for images.
                prepared_messages[user_message_index]["content"].append(
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": mime_type,
                            "data": base64_image,
                        },
                    }
                )
            except Exception as e:
                logger.error(
                    f"Error processing image attachment: {e}", exc_info=True
                )
                if "content" in attachment:
                    prepared_messages[user_message_index]["content"].append(
                        {
                            "type": "text",
                            "text": f"[Image could not be processed: {attachment.get('path', 'unknown')}]",
                        }
                    )
    return prepared_messages
def _get_base64_image(self, attachment):
"""
Convert an image file to base64 encoding.
Args:
attachment (dict): Attachment dictionary with path and metadata.
Returns:
str: Base64-encoded image data.
"""
file_path = attachment.get("path")
if not file_path:
raise ValueError("No file path provided in attachment")
try:
with self.storage.get_file(file_path) as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
except FileNotFoundError:
raise FileNotFoundError(f"File not found: {file_path}")

View File

@@ -105,6 +105,7 @@ class LLMHandler(ABC):
"""
Prepare messages with attachments and provider-specific formatting.
Args:
agent: The agent instance
messages: Original messages
@@ -118,11 +119,40 @@ class LLMHandler(ABC):
logger.info(f"Preparing messages with {len(attachments)} attachments")
supported_types = agent.llm.get_supported_attachment_types()
# Check if provider supports images but not PDF (synthetic PDF support)
supports_images = any(t.startswith("image/") for t in supported_types)
supports_pdf = "application/pdf" in supported_types
# Process attachments, converting PDFs to images if needed
processed_attachments = []
for attachment in attachments:
mime_type = attachment.get("mime_type")
# Synthetic PDF support: convert PDF to images if LLM supports images but not PDF
if mime_type == "application/pdf" and supports_images and not supports_pdf:
logger.info(
f"Converting PDF to images for synthetic PDF support: {attachment.get('path', 'unknown')}"
)
try:
converted_images = self._convert_pdf_to_images(attachment)
processed_attachments.extend(converted_images)
logger.info(
f"Converted PDF to {len(converted_images)} images"
)
except Exception as e:
logger.error(
f"Failed to convert PDF to images, falling back to text: {e}"
)
# Fall back to treating as unsupported (text extraction)
processed_attachments.append(attachment)
else:
processed_attachments.append(attachment)
supported_attachments = [
a for a in attachments if a.get("mime_type") in supported_types
a for a in processed_attachments if a.get("mime_type") in supported_types
]
unsupported_attachments = [
a for a in attachments if a.get("mime_type") not in supported_types
a for a in processed_attachments if a.get("mime_type") not in supported_types
]
# Process supported attachments with the LLM's custom method
@@ -145,6 +175,37 @@ class LLMHandler(ABC):
)
return messages
def _convert_pdf_to_images(
    self, attachment: Dict, max_pages: int = 20, dpi: int = 150
) -> List[Dict]:
    """
    Convert a PDF attachment to a list of image attachments.

    This enables synthetic PDF support for LLMs that support images but
    not PDFs: each page is rendered to an image sent in the PDF's place.

    Args:
        attachment: PDF attachment dictionary with 'path' and optional 'content'.
        max_pages: Maximum number of pages to render. Generalized from the
            previous hard-coded limit; the default preserves prior behavior.
        dpi: Render resolution in dots per inch; the default preserves
            prior behavior.

    Returns:
        List of image attachment dictionaries with 'data', 'mime_type',
        and 'page'.

    Raises:
        ValueError: If the attachment has no 'path'.
    """
    # Local imports keep the dependency optional at module-import time.
    from application.utils import convert_pdf_to_images
    from application.storage.storage_creator import StorageCreator

    file_path = attachment.get("path")
    if not file_path:
        raise ValueError("No file path provided in PDF attachment")

    storage = StorageCreator.get_storage()
    return convert_pdf_to_images(
        file_path=file_path,
        storage=storage,
        max_pages=max_pages,
        dpi=dpi,
    )
def _append_unsupported_attachments(
self, messages: List[Dict], attachments: List[Dict]
) -> List[Dict]:

View File

@@ -9,6 +9,7 @@ from application.llm.novita import NovitaLLM
from application.llm.openai import AzureOpenAILLM, OpenAILLM
from application.llm.premai import PremAILLM
from application.llm.sagemaker import SagemakerAPILLM
from application.llm.open_router import OpenRouterLLM
logger = logging.getLogger(__name__)
@@ -25,6 +26,7 @@ class LLMCreator:
"groq": GroqLLM,
"google": GoogleLLM,
"novita": NovitaLLM,
"openrouter": OpenRouterLLM,
}
@classmethod

View File

@@ -0,0 +1,15 @@
from application.core.settings import settings
from application.llm.openai import OpenAILLM
OPEN_ROUTER_BASE_URL = "https://openrouter.ai/api/v1"
class OpenRouterLLM(OpenAILLM):
    """OpenAI-compatible LLM client pointed at the OpenRouter API.

    Resolves the API key from the explicit argument, then the
    OPEN_ROUTER_API_KEY setting, then the generic API_KEY setting, and
    defaults the base URL to the OpenRouter endpoint.
    """

    def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
        resolved_key = api_key or settings.OPEN_ROUTER_API_KEY or settings.API_KEY
        resolved_url = base_url or OPEN_ROUTER_BASE_URL
        super().__init__(
            api_key=resolved_key,
            user_api_key=user_api_key,
            base_url=resolved_url,
            *args,
            **kwargs,
        )

View File

@@ -9,6 +9,57 @@ from application.llm.base import BaseLLM
from application.storage.storage_creator import StorageCreator
def _truncate_base64_for_logging(messages):
"""
Create a copy of messages with base64 data truncated for readable logging.
Args:
messages: List of message dicts
Returns:
Copy of messages with truncated base64 content
"""
import copy
def truncate_content(content):
if isinstance(content, str):
# Check if it looks like a data URL with base64
if content.startswith("data:") and ";base64," in content:
prefix_end = content.index(";base64,") + len(";base64,")
prefix = content[:prefix_end]
return f"{prefix}[BASE64_DATA_TRUNCATED, length={len(content) - prefix_end}]"
return content
elif isinstance(content, list):
return [truncate_item(item) for item in content]
elif isinstance(content, dict):
return {k: truncate_content(v) for k, v in content.items()}
return content
def truncate_item(item):
if isinstance(item, dict):
result = {}
for k, v in item.items():
if k == "url" and isinstance(v, str) and ";base64," in v:
prefix_end = v.index(";base64,") + len(";base64,")
prefix = v[:prefix_end]
result[k] = f"{prefix}[BASE64_DATA_TRUNCATED, length={len(v) - prefix_end}]"
elif k == "data" and isinstance(v, str) and len(v) > 100:
result[k] = f"[BASE64_DATA_TRUNCATED, length={len(v)}]"
else:
result[k] = truncate_content(v)
return result
return truncate_content(item)
truncated = []
for msg in messages:
msg_copy = copy.copy(msg)
if "content" in msg_copy:
msg_copy["content"] = truncate_content(msg_copy["content"])
truncated.append(msg_copy)
return truncated
class OpenAILLM(BaseLLM):
def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
@@ -44,12 +95,12 @@ class OpenAILLM(BaseLLM):
if isinstance(content, str):
cleaned_messages.append({"role": role, "content": content})
elif isinstance(content, list):
# Collect all content parts into a single message
content_parts = []
for item in content:
if "text" in item:
cleaned_messages.append(
{"role": role, "content": item["text"]}
)
elif "function_call" in item:
if "function_call" in item:
# Function calls need their own message
cleaned_args = self._remove_null_values(
item["function_call"]["args"]
)
@@ -69,6 +120,7 @@ class OpenAILLM(BaseLLM):
}
)
elif "function_response" in item:
# Function responses need their own message
cleaned_messages.append(
{
"role": "tool",
@@ -81,36 +133,20 @@ class OpenAILLM(BaseLLM):
}
)
elif isinstance(item, dict):
content_parts = []
if "text" in item:
content_parts.append(
{"type": "text", "text": item["text"]}
)
elif (
"type" in item
and item["type"] == "text"
and "text" in item
):
# Collect content parts (text, images, files) into a single message
if "type" in item and item["type"] == "text" and "text" in item:
content_parts.append(item)
elif (
"type" in item
and item["type"] == "file"
and "file" in item
):
elif "type" in item and item["type"] == "file" and "file" in item:
content_parts.append(item)
elif (
"type" in item
and item["type"] == "image_url"
and "image_url" in item
):
elif "type" in item and item["type"] == "image_url" and "image_url" in item:
content_parts.append(item)
cleaned_messages.append(
{"role": role, "content": content_parts}
)
else:
raise ValueError(
f"Unexpected content dictionary format: {item}"
)
elif "text" in item and "type" not in item:
# Legacy format: {"text": "..."} without type
content_parts.append({"type": "text", "text": item["text"]})
# Add the collected content parts as a single message
if content_parts:
cleaned_messages.append({"role": role, "content": content_parts})
else:
raise ValueError(f"Unexpected content type: {type(content)}")
return cleaned_messages
@@ -127,7 +163,7 @@ class OpenAILLM(BaseLLM):
**kwargs,
):
messages = self._clean_messages_openai(messages)
logging.info(f"Cleaned messages: {messages}")
logging.info(f"Cleaned messages: {_truncate_base64_for_logging(messages)}")
# Convert max_tokens to max_completion_tokens for newer models
if "max_tokens" in kwargs:
@@ -163,7 +199,7 @@ class OpenAILLM(BaseLLM):
**kwargs,
):
messages = self._clean_messages_openai(messages)
logging.info(f"Cleaned messages: {messages}")
logging.info(f"Cleaned messages: {_truncate_base64_for_logging(messages)}")
# Convert max_tokens to max_completion_tokens for newer models
if "max_tokens" in kwargs:
@@ -261,17 +297,14 @@ class OpenAILLM(BaseLLM):
"""
Return a list of MIME types supported by OpenAI for file uploads.
This reads from the model config to ensure consistency.
If no model config found, falls back to images only (safest default).
Returns:
list: List of supported MIME types
"""
return [
"application/pdf",
"image/png",
"image/jpeg",
"image/jpg",
"image/webp",
"image/gif",
]
from application.core.model_configs import OPENAI_ATTACHMENTS
return OPENAI_ATTACHMENTS
def prepare_messages_with_attachments(self, messages, attachments=None):
"""
@@ -308,10 +341,16 @@ class OpenAILLM(BaseLLM):
prepared_messages[user_message_index]["content"] = []
for attachment in attachments:
mime_type = attachment.get("mime_type")
logging.info(f"Processing attachment with mime_type: {mime_type}, has_data: {'data' in attachment}, has_path: {'path' in attachment}")
if mime_type and mime_type.startswith("image/"):
try:
base64_image = self._get_base64_image(attachment)
# Check if this is a pre-converted image (from PDF-to-image conversion)
if "data" in attachment:
base64_image = attachment["data"]
else:
base64_image = self._get_base64_image(attachment)
prepared_messages[user_message_index]["content"].append(
{
"type": "image_url",
@@ -320,6 +359,7 @@ class OpenAILLM(BaseLLM):
},
}
)
except Exception as e:
logging.error(
f"Error processing image attachment: {e}", exc_info=True
@@ -334,6 +374,7 @@ class OpenAILLM(BaseLLM):
# Handle PDFs using the file API
elif mime_type == "application/pdf":
logging.info(f"Attempting to upload PDF to OpenAI: {attachment.get('path', 'unknown')}")
try:
file_id = self._upload_file_to_openai(attachment)
prepared_messages[user_message_index]["content"].append(
@@ -348,6 +389,8 @@ class OpenAILLM(BaseLLM):
"text": f"File content:\n\n{attachment['content']}",
}
)
else:
logging.warning(f"Unsupported attachment type in OpenAI provider: {mime_type}")
return prepared_messages
def _get_base64_image(self, attachment):