mirror of
https://github.com/arc53/DocsGPT.git
synced 2026-04-28 21:10:27 +00:00
feat: process PDFs synthetically if model does not support files natively (#2263)
* feat: process PDFs synthetically if model does not support files natively * fix: small code optimisations
This commit is contained in:
@@ -1,7 +1,13 @@
|
||||
import base64
|
||||
import logging
|
||||
|
||||
from anthropic import AI_PROMPT, Anthropic, HUMAN_PROMPT
|
||||
|
||||
from application.core.settings import settings
|
||||
from application.llm.base import BaseLLM
|
||||
from application.storage.storage_creator import StorageCreator
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AnthropicLLM(BaseLLM):
|
||||
@@ -20,6 +26,7 @@ class AnthropicLLM(BaseLLM):
|
||||
|
||||
self.HUMAN_PROMPT = HUMAN_PROMPT
|
||||
self.AI_PROMPT = AI_PROMPT
|
||||
self.storage = StorageCreator.get_storage()
|
||||
|
||||
def _raw_gen(
|
||||
self,
|
||||
@@ -70,3 +77,115 @@ class AnthropicLLM(BaseLLM):
|
||||
finally:
|
||||
if hasattr(stream_response, "close"):
|
||||
stream_response.close()
|
||||
|
||||
def get_supported_attachment_types(self):
    """
    Report the attachment MIME types this provider accepts directly.

    Only image types are advertised; "application/pdf" is deliberately
    absent from the list, so callers that check for PDF support will see
    this provider as image-only.

    Returns:
        list: A new list of supported image MIME type strings.
    """
    image_mime_types = (
        "image/png",
        "image/jpeg",
        "image/jpg",
        "image/webp",
        "image/gif",
    )
    # Hand back a fresh list so callers may mutate it freely.
    return list(image_mime_types)
|
||||
|
||||
def prepare_messages_with_attachments(self, messages, attachments=None):
    """
    Attach image attachments to the last user message in Claude's image format.

    The input ``messages`` list and the message dictionaries inside it are
    never mutated: the list is copied and the single user message that
    receives the images is replaced by a modified copy.

    Args:
        messages (list): List of message dictionaries.
        attachments (list): List of attachment dictionaries. Only entries
            whose "mime_type" starts with "image/" are used. An entry may
            carry ready-made base64 under "data"; otherwise the image is
            loaded through ``self._get_base64_image`` (which reads "path").

    Returns:
        list: ``messages`` itself when there are no attachments, otherwise
        a new list in which the last user message (or a newly appended one
        if none exists) has list-form content including the image blocks.
    """
    if not attachments:
        return messages

    # Copy the outer list; the one message we modify is copied further down.
    prepared_messages = list(messages)

    # Find the last user message to attach images to.
    user_message_index = None
    for index in range(len(prepared_messages) - 1, -1, -1):
        if prepared_messages[index].get("role") == "user":
            user_message_index = index
            break

    if user_message_index is None:
        prepared_messages.append({"role": "user", "content": []})
        user_message_index = len(prepared_messages) - 1

    # Work on a copy of the target message so the caller's dict (and its
    # content list) stay untouched.
    user_message = dict(prepared_messages[user_message_index])
    existing_content = user_message.get("content")
    if isinstance(existing_content, str):
        content_parts = [{"type": "text", "text": existing_content}]
    elif isinstance(existing_content, list):
        content_parts = list(existing_content)
    else:
        content_parts = []
    user_message["content"] = content_parts
    prepared_messages[user_message_index] = user_message

    for attachment in attachments:
        mime_type = attachment.get("mime_type")
        if not (mime_type and mime_type.startswith("image/")):
            continue

        # "image/jpg" is a non-standard alias; the Anthropic image source
        # documents "image/jpeg", so normalize before sending.
        media_type = "image/jpeg" if mime_type == "image/jpg" else mime_type

        try:
            # Pre-converted images (e.g. from PDF-to-image conversion)
            # already carry base64 under the "data" key.
            if "data" in attachment:
                base64_image = attachment["data"]
            else:
                base64_image = self._get_base64_image(attachment)
        except Exception as e:
            # Best effort: a single unreadable image must not abort the
            # whole request, so log it and leave a placeholder note.
            logger.error(
                f"Error processing image attachment: {e}", exc_info=True
            )
            if "content" in attachment:
                content_parts.append(
                    {
                        "type": "text",
                        "text": f"[Image could not be processed: {attachment.get('path', 'unknown')}]",
                    }
                )
            continue

        # Claude uses a specific block format for base64 images.
        content_parts.append(
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": base64_image,
                },
            }
        )

    return prepared_messages
|
||||
|
||||
def _get_base64_image(self, attachment):
|
||||
"""
|
||||
Convert an image file to base64 encoding.
|
||||
|
||||
Args:
|
||||
attachment (dict): Attachment dictionary with path and metadata.
|
||||
|
||||
Returns:
|
||||
str: Base64-encoded image data.
|
||||
"""
|
||||
file_path = attachment.get("path")
|
||||
if not file_path:
|
||||
raise ValueError("No file path provided in attachment")
|
||||
try:
|
||||
with self.storage.get_file(file_path) as image_file:
|
||||
return base64.b64encode(image_file.read()).decode("utf-8")
|
||||
except FileNotFoundError:
|
||||
raise FileNotFoundError(f"File not found: {file_path}")
|
||||
|
||||
@@ -105,6 +105,7 @@ class LLMHandler(ABC):
|
||||
"""
|
||||
Prepare messages with attachments and provider-specific formatting.
|
||||
|
||||
|
||||
Args:
|
||||
agent: The agent instance
|
||||
messages: Original messages
|
||||
@@ -118,11 +119,40 @@ class LLMHandler(ABC):
|
||||
logger.info(f"Preparing messages with {len(attachments)} attachments")
|
||||
supported_types = agent.llm.get_supported_attachment_types()
|
||||
|
||||
# Check if provider supports images but not PDF (synthetic PDF support)
|
||||
supports_images = any(t.startswith("image/") for t in supported_types)
|
||||
supports_pdf = "application/pdf" in supported_types
|
||||
|
||||
# Process attachments, converting PDFs to images if needed
|
||||
processed_attachments = []
|
||||
for attachment in attachments:
|
||||
mime_type = attachment.get("mime_type")
|
||||
|
||||
# Synthetic PDF support: convert PDF to images if LLM supports images but not PDF
|
||||
if mime_type == "application/pdf" and supports_images and not supports_pdf:
|
||||
logger.info(
|
||||
f"Converting PDF to images for synthetic PDF support: {attachment.get('path', 'unknown')}"
|
||||
)
|
||||
try:
|
||||
converted_images = self._convert_pdf_to_images(attachment)
|
||||
processed_attachments.extend(converted_images)
|
||||
logger.info(
|
||||
f"Converted PDF to {len(converted_images)} images"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to convert PDF to images, falling back to text: {e}"
|
||||
)
|
||||
# Fall back to treating as unsupported (text extraction)
|
||||
processed_attachments.append(attachment)
|
||||
else:
|
||||
processed_attachments.append(attachment)
|
||||
|
||||
supported_attachments = [
|
||||
a for a in attachments if a.get("mime_type") in supported_types
|
||||
a for a in processed_attachments if a.get("mime_type") in supported_types
|
||||
]
|
||||
unsupported_attachments = [
|
||||
a for a in attachments if a.get("mime_type") not in supported_types
|
||||
a for a in processed_attachments if a.get("mime_type") not in supported_types
|
||||
]
|
||||
|
||||
# Process supported attachments with the LLM's custom method
|
||||
@@ -145,6 +175,37 @@ class LLMHandler(ABC):
|
||||
)
|
||||
return messages
|
||||
|
||||
def _convert_pdf_to_images(self, attachment: Dict) -> List[Dict]:
|
||||
"""
|
||||
Convert a PDF attachment to a list of image attachments.
|
||||
|
||||
This enables synthetic PDF support for LLMs that support images but not PDFs.
|
||||
|
||||
Args:
|
||||
attachment: PDF attachment dictionary with 'path' and optional 'content'
|
||||
|
||||
Returns:
|
||||
List of image attachment dictionaries with 'data', 'mime_type', and 'page'
|
||||
"""
|
||||
from application.utils import convert_pdf_to_images
|
||||
from application.storage.storage_creator import StorageCreator
|
||||
|
||||
file_path = attachment.get("path")
|
||||
if not file_path:
|
||||
raise ValueError("No file path provided in PDF attachment")
|
||||
|
||||
storage = StorageCreator.get_storage()
|
||||
|
||||
# Convert PDF to images
|
||||
images_data = convert_pdf_to_images(
|
||||
file_path=file_path,
|
||||
storage=storage,
|
||||
max_pages=20,
|
||||
dpi=150,
|
||||
)
|
||||
|
||||
return images_data
|
||||
|
||||
def _append_unsupported_attachments(
|
||||
self, messages: List[Dict], attachments: List[Dict]
|
||||
) -> List[Dict]:
|
||||
|
||||
@@ -9,6 +9,7 @@ from application.llm.novita import NovitaLLM
|
||||
from application.llm.openai import AzureOpenAILLM, OpenAILLM
|
||||
from application.llm.premai import PremAILLM
|
||||
from application.llm.sagemaker import SagemakerAPILLM
|
||||
from application.llm.open_router import OpenRouterLLM
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -25,6 +26,7 @@ class LLMCreator:
|
||||
"groq": GroqLLM,
|
||||
"google": GoogleLLM,
|
||||
"novita": NovitaLLM,
|
||||
"openrouter": OpenRouterLLM,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
|
||||
15
application/llm/open_router.py
Normal file
15
application/llm/open_router.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from application.core.settings import settings
|
||||
from application.llm.openai import OpenAILLM
|
||||
|
||||
OPEN_ROUTER_BASE_URL = "https://openrouter.ai/api/v1"
|
||||
|
||||
|
||||
class OpenRouterLLM(OpenAILLM):
    """OpenAILLM subclass whose defaults point at the OpenRouter endpoint."""

    def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
        """
        Fill in OpenRouter defaults and delegate to ``OpenAILLM.__init__``.

        Args:
            api_key: Explicit key; when falsy, ``settings.OPEN_ROUTER_API_KEY``
                is tried, then ``settings.API_KEY``.
            user_api_key: Forwarded unchanged to the parent initializer.
            base_url: Explicit endpoint; when falsy, ``OPEN_ROUTER_BASE_URL``
                is used.
            *args: Extra positional arguments forwarded to the parent.
            **kwargs: Extra keyword arguments forwarded to the parent.
        """
        resolved_api_key = (
            api_key or settings.OPEN_ROUTER_API_KEY or settings.API_KEY
        )
        resolved_base_url = base_url or OPEN_ROUTER_BASE_URL
        super().__init__(
            *args,
            api_key=resolved_api_key,
            user_api_key=user_api_key,
            base_url=resolved_base_url,
            **kwargs,
        )
|
||||
@@ -9,6 +9,57 @@ from application.llm.base import BaseLLM
|
||||
from application.storage.storage_creator import StorageCreator
|
||||
|
||||
|
||||
def _truncate_base64_for_logging(messages):
|
||||
"""
|
||||
Create a copy of messages with base64 data truncated for readable logging.
|
||||
|
||||
Args:
|
||||
messages: List of message dicts
|
||||
|
||||
Returns:
|
||||
Copy of messages with truncated base64 content
|
||||
"""
|
||||
import copy
|
||||
|
||||
def truncate_content(content):
|
||||
if isinstance(content, str):
|
||||
# Check if it looks like a data URL with base64
|
||||
if content.startswith("data:") and ";base64," in content:
|
||||
prefix_end = content.index(";base64,") + len(";base64,")
|
||||
prefix = content[:prefix_end]
|
||||
return f"{prefix}[BASE64_DATA_TRUNCATED, length={len(content) - prefix_end}]"
|
||||
return content
|
||||
elif isinstance(content, list):
|
||||
return [truncate_item(item) for item in content]
|
||||
elif isinstance(content, dict):
|
||||
return {k: truncate_content(v) for k, v in content.items()}
|
||||
return content
|
||||
|
||||
def truncate_item(item):
|
||||
if isinstance(item, dict):
|
||||
result = {}
|
||||
for k, v in item.items():
|
||||
if k == "url" and isinstance(v, str) and ";base64," in v:
|
||||
prefix_end = v.index(";base64,") + len(";base64,")
|
||||
prefix = v[:prefix_end]
|
||||
result[k] = f"{prefix}[BASE64_DATA_TRUNCATED, length={len(v) - prefix_end}]"
|
||||
elif k == "data" and isinstance(v, str) and len(v) > 100:
|
||||
result[k] = f"[BASE64_DATA_TRUNCATED, length={len(v)}]"
|
||||
else:
|
||||
result[k] = truncate_content(v)
|
||||
return result
|
||||
return truncate_content(item)
|
||||
|
||||
truncated = []
|
||||
for msg in messages:
|
||||
msg_copy = copy.copy(msg)
|
||||
if "content" in msg_copy:
|
||||
msg_copy["content"] = truncate_content(msg_copy["content"])
|
||||
truncated.append(msg_copy)
|
||||
|
||||
return truncated
|
||||
|
||||
|
||||
class OpenAILLM(BaseLLM):
|
||||
|
||||
def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
|
||||
@@ -44,12 +95,12 @@ class OpenAILLM(BaseLLM):
|
||||
if isinstance(content, str):
|
||||
cleaned_messages.append({"role": role, "content": content})
|
||||
elif isinstance(content, list):
|
||||
# Collect all content parts into a single message
|
||||
content_parts = []
|
||||
|
||||
for item in content:
|
||||
if "text" in item:
|
||||
cleaned_messages.append(
|
||||
{"role": role, "content": item["text"]}
|
||||
)
|
||||
elif "function_call" in item:
|
||||
if "function_call" in item:
|
||||
# Function calls need their own message
|
||||
cleaned_args = self._remove_null_values(
|
||||
item["function_call"]["args"]
|
||||
)
|
||||
@@ -69,6 +120,7 @@ class OpenAILLM(BaseLLM):
|
||||
}
|
||||
)
|
||||
elif "function_response" in item:
|
||||
# Function responses need their own message
|
||||
cleaned_messages.append(
|
||||
{
|
||||
"role": "tool",
|
||||
@@ -81,36 +133,20 @@ class OpenAILLM(BaseLLM):
|
||||
}
|
||||
)
|
||||
elif isinstance(item, dict):
|
||||
content_parts = []
|
||||
if "text" in item:
|
||||
content_parts.append(
|
||||
{"type": "text", "text": item["text"]}
|
||||
)
|
||||
elif (
|
||||
"type" in item
|
||||
and item["type"] == "text"
|
||||
and "text" in item
|
||||
):
|
||||
# Collect content parts (text, images, files) into a single message
|
||||
if "type" in item and item["type"] == "text" and "text" in item:
|
||||
content_parts.append(item)
|
||||
elif (
|
||||
"type" in item
|
||||
and item["type"] == "file"
|
||||
and "file" in item
|
||||
):
|
||||
elif "type" in item and item["type"] == "file" and "file" in item:
|
||||
content_parts.append(item)
|
||||
elif (
|
||||
"type" in item
|
||||
and item["type"] == "image_url"
|
||||
and "image_url" in item
|
||||
):
|
||||
elif "type" in item and item["type"] == "image_url" and "image_url" in item:
|
||||
content_parts.append(item)
|
||||
cleaned_messages.append(
|
||||
{"role": role, "content": content_parts}
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unexpected content dictionary format: {item}"
|
||||
)
|
||||
elif "text" in item and "type" not in item:
|
||||
# Legacy format: {"text": "..."} without type
|
||||
content_parts.append({"type": "text", "text": item["text"]})
|
||||
|
||||
# Add the collected content parts as a single message
|
||||
if content_parts:
|
||||
cleaned_messages.append({"role": role, "content": content_parts})
|
||||
else:
|
||||
raise ValueError(f"Unexpected content type: {type(content)}")
|
||||
return cleaned_messages
|
||||
@@ -127,7 +163,7 @@ class OpenAILLM(BaseLLM):
|
||||
**kwargs,
|
||||
):
|
||||
messages = self._clean_messages_openai(messages)
|
||||
logging.info(f"Cleaned messages: {messages}")
|
||||
logging.info(f"Cleaned messages: {_truncate_base64_for_logging(messages)}")
|
||||
|
||||
# Convert max_tokens to max_completion_tokens for newer models
|
||||
if "max_tokens" in kwargs:
|
||||
@@ -163,7 +199,7 @@ class OpenAILLM(BaseLLM):
|
||||
**kwargs,
|
||||
):
|
||||
messages = self._clean_messages_openai(messages)
|
||||
logging.info(f"Cleaned messages: {messages}")
|
||||
logging.info(f"Cleaned messages: {_truncate_base64_for_logging(messages)}")
|
||||
|
||||
# Convert max_tokens to max_completion_tokens for newer models
|
||||
if "max_tokens" in kwargs:
|
||||
@@ -261,17 +297,14 @@ class OpenAILLM(BaseLLM):
|
||||
"""
|
||||
Return a list of MIME types supported by OpenAI for file uploads.
|
||||
|
||||
This reads from the model config to ensure consistency.
|
||||
If no model config found, falls back to images only (safest default).
|
||||
|
||||
Returns:
|
||||
list: List of supported MIME types
|
||||
"""
|
||||
return [
|
||||
"application/pdf",
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/jpg",
|
||||
"image/webp",
|
||||
"image/gif",
|
||||
]
|
||||
from application.core.model_configs import OPENAI_ATTACHMENTS
|
||||
return OPENAI_ATTACHMENTS
|
||||
|
||||
def prepare_messages_with_attachments(self, messages, attachments=None):
|
||||
"""
|
||||
@@ -308,10 +341,16 @@ class OpenAILLM(BaseLLM):
|
||||
prepared_messages[user_message_index]["content"] = []
|
||||
for attachment in attachments:
|
||||
mime_type = attachment.get("mime_type")
|
||||
logging.info(f"Processing attachment with mime_type: {mime_type}, has_data: {'data' in attachment}, has_path: {'path' in attachment}")
|
||||
|
||||
if mime_type and mime_type.startswith("image/"):
|
||||
try:
|
||||
base64_image = self._get_base64_image(attachment)
|
||||
# Check if this is a pre-converted image (from PDF-to-image conversion)
|
||||
if "data" in attachment:
|
||||
base64_image = attachment["data"]
|
||||
else:
|
||||
base64_image = self._get_base64_image(attachment)
|
||||
|
||||
prepared_messages[user_message_index]["content"].append(
|
||||
{
|
||||
"type": "image_url",
|
||||
@@ -320,6 +359,7 @@ class OpenAILLM(BaseLLM):
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
f"Error processing image attachment: {e}", exc_info=True
|
||||
@@ -334,6 +374,7 @@ class OpenAILLM(BaseLLM):
|
||||
# Handle PDFs using the file API
|
||||
|
||||
elif mime_type == "application/pdf":
|
||||
logging.info(f"Attempting to upload PDF to OpenAI: {attachment.get('path', 'unknown')}")
|
||||
try:
|
||||
file_id = self._upload_file_to_openai(attachment)
|
||||
prepared_messages[user_message_index]["content"].append(
|
||||
@@ -348,6 +389,8 @@ class OpenAILLM(BaseLLM):
|
||||
"text": f"File content:\n\n{attachment['content']}",
|
||||
}
|
||||
)
|
||||
else:
|
||||
logging.warning(f"Unsupported attachment type in OpenAI provider: {mime_type}")
|
||||
return prepared_messages
|
||||
|
||||
def _get_base64_image(self, attachment):
|
||||
|
||||
Reference in New Issue
Block a user