feat: process PDFs synthetically if model does not support file natively (#2263)

* feat: process PDFs synthetically if model does not support file natively
* fix: small code optimisations
@@ -52,6 +52,7 @@ RUN apt-get update && \
     python3.12 \
     libgl1 \
     libglib2.0-0 \
+    poppler-utils \
     && \
     ln -s /usr/bin/python3.12 /usr/bin/python && \
     rm -rf /var/lib/apt/lists/*
@@ -8,8 +8,8 @@ from application.core.model_settings import (
     ModelProvider,
 )
 
-OPENAI_ATTACHMENTS = [
-    "application/pdf",
+# Base image attachment types supported by most vision-capable LLMs
+IMAGE_ATTACHMENTS = [
     "image/png",
     "image/jpeg",
     "image/jpg",
@@ -17,14 +17,15 @@ OPENAI_ATTACHMENTS = [
     "image/gif",
 ]
 
-GOOGLE_ATTACHMENTS = [
-    "application/pdf",
-    "image/png",
-    "image/jpeg",
-    "image/jpg",
-    "image/webp",
-    "image/gif",
-]
+# PDF excluded: most OpenAI-compatible endpoints don't support native PDF uploads.
+# When excluded, PDFs are synthetically processed by converting pages to images.
+OPENAI_ATTACHMENTS = IMAGE_ATTACHMENTS
+
+GOOGLE_ATTACHMENTS = ["application/pdf"] + IMAGE_ATTACHMENTS
+
+ANTHROPIC_ATTACHMENTS = IMAGE_ATTACHMENTS
+
+OPENROUTER_ATTACHMENTS = IMAGE_ATTACHMENTS
 
 
 OPENAI_MODELS = [
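As a hedged illustration of how these constants are meant to be consumed (needs_synthetic_pdf is a hypothetical helper, not part of this diff): a provider whose list lacks "application/pdf" triggers the PDF-to-image fallback introduced in the handler changes below.

    # Sketch, assuming only the constants defined in this hunk.
    IMAGE_ATTACHMENTS = ["image/png", "image/jpeg", "image/jpg", "image/webp", "image/gif"]
    OPENAI_ATTACHMENTS = IMAGE_ATTACHMENTS
    GOOGLE_ATTACHMENTS = ["application/pdf"] + IMAGE_ATTACHMENTS

    def needs_synthetic_pdf(supported_types):
        # True when the model can see images but cannot ingest PDFs natively
        supports_images = any(t.startswith("image/") for t in supported_types)
        return supports_images and "application/pdf" not in supported_types

    print(needs_synthetic_pdf(OPENAI_ATTACHMENTS))   # True: convert PDF pages to images
    print(needs_synthetic_pdf(GOOGLE_ATTACHMENTS))   # False: send the PDF natively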
@@ -63,6 +64,7 @@ ANTHROPIC_MODELS = [
         description="Latest Claude 3.5 Sonnet with enhanced capabilities",
         capabilities=ModelCapabilities(
             supports_tools=True,
+            supported_attachment_types=ANTHROPIC_ATTACHMENTS,
             context_window=200000,
         ),
     ),
@@ -73,6 +75,7 @@ ANTHROPIC_MODELS = [
         description="Balanced performance and capability",
         capabilities=ModelCapabilities(
             supports_tools=True,
+            supported_attachment_types=ANTHROPIC_ATTACHMENTS,
             context_window=200000,
         ),
     ),
@@ -83,6 +86,7 @@ ANTHROPIC_MODELS = [
         description="Most capable Claude model",
         capabilities=ModelCapabilities(
             supports_tools=True,
+            supported_attachment_types=ANTHROPIC_ATTACHMENTS,
             context_window=200000,
         ),
     ),
@@ -93,6 +97,7 @@ ANTHROPIC_MODELS = [
         description="Fastest Claude model",
         capabilities=ModelCapabilities(
             supports_tools=True,
+            supported_attachment_types=ANTHROPIC_ATTACHMENTS,
             context_window=200000,
         ),
     ),
@@ -151,28 +156,43 @@ GROQ_MODELS = [
         ),
     ),
     AvailableModel(
-        id="llama-3.1-8b-instant",
+        id="openai/gpt-oss-120b",
         provider=ModelProvider.GROQ,
-        display_name="Llama 3.1 8B",
-        description="Ultra-fast inference",
+        display_name="GPT-OSS 120B",
+        description="Open-source GPT model optimized for speed",
         capabilities=ModelCapabilities(
             supports_tools=True,
             context_window=128000,
         ),
     ),
+]
+
+
+OPENROUTER_MODELS = [
     AvailableModel(
-        id="mixtral-8x7b-32768",
-        provider=ModelProvider.GROQ,
-        display_name="Mixtral 8x7B",
-        description="High-speed inference with tools",
+        id="qwen/qwen3-coder:free",
+        provider=ModelProvider.OPENROUTER,
+        display_name="Qwen 3 Coder",
+        description="Latest Qwen model with high-speed inference",
         capabilities=ModelCapabilities(
             supports_tools=True,
-            context_window=32768,
+            context_window=128000,
+            supported_attachment_types=OPENROUTER_ATTACHMENTS
         ),
     ),
+    AvailableModel(
+        id="google/gemma-3-27b-it:free",
+        provider=ModelProvider.OPENROUTER,
+        display_name="Gemma 3 27B",
+        description="Latest Gemma model with high-speed inference",
+        capabilities=ModelCapabilities(
+            supports_tools=True,
+            context_window=128000,
+            supported_attachment_types=OPENROUTER_ATTACHMENTS
+        ),
+    ),
 ]
 
 
 AZURE_OPENAI_MODELS = [
     AvailableModel(
         id="azure-gpt-4",
@@ -8,6 +8,7 @@ logger = logging.getLogger(__name__)
 
 class ModelProvider(str, Enum):
     OPENAI = "openai"
+    OPENROUTER = "openrouter"
     AZURE_OPENAI = "azure_openai"
     ANTHROPIC = "anthropic"
     GROQ = "groq"
@@ -107,6 +108,10 @@ class ModelRegistry:
             settings.LLM_PROVIDER == "groq" and settings.API_KEY
         ):
             self._add_groq_models(settings)
+        if settings.OPEN_ROUTER_API_KEY or (
+            settings.LLM_PROVIDER == "openrouter" and settings.API_KEY
+        ):
+            self._add_openrouter_models(settings)
         if settings.HUGGINGFACE_API_KEY or (
             settings.LLM_PROVIDER == "huggingface" and settings.API_KEY
         ):
@@ -211,6 +216,21 @@ class ModelRegistry:
             return
         for model in GROQ_MODELS:
             self.models[model.id] = model
 
+    def _add_openrouter_models(self, settings):
+        from application.core.model_configs import OPENROUTER_MODELS
+
+        if settings.OPEN_ROUTER_API_KEY:
+            for model in OPENROUTER_MODELS:
+                self.models[model.id] = model
+            return
+        if settings.LLM_PROVIDER == "openrouter" and settings.LLM_NAME:
+            for model in OPENROUTER_MODELS:
+                if model.id == settings.LLM_NAME:
+                    self.models[model.id] = model
+                    return
+        for model in OPENROUTER_MODELS:
+            self.models[model.id] = model
+
     def _add_docsgpt_models(self, settings):
         model_id = "docsgpt-local"
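The key-resolution order above means a dedicated OPEN_ROUTER_API_KEY exposes the full catalog, while provider-level configuration narrows registration to the configured LLM_NAME when one matches. A standalone sketch of that precedence (select_models and its inputs are illustrative stand-ins, not the registry's API):

    # Sketch of the _add_openrouter_models precedence with plain stand-ins.
    OPENROUTER_MODELS = ["qwen/qwen3-coder:free", "google/gemma-3-27b-it:free"]

    def select_models(open_router_api_key, llm_provider, llm_name):
        if open_router_api_key:
            return list(OPENROUTER_MODELS)          # dedicated key: register everything
        if llm_provider == "openrouter" and llm_name:
            matched = [m for m in OPENROUTER_MODELS if m == llm_name]
            if matched:
                return matched                      # register only the configured model
        return list(OPENROUTER_MODELS)              # fallback: register everything

    assert select_models(None, "openrouter", "qwen/qwen3-coder:free") == ["qwen/qwen3-coder:free"]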
@@ -9,6 +9,7 @@ def get_api_key_for_provider(provider: str) -> Optional[str]:
 
     provider_key_map = {
         "openai": settings.OPENAI_API_KEY,
+        "openrouter": settings.OPEN_ROUTER_API_KEY,
         "anthropic": settings.ANTHROPIC_API_KEY,
         "google": settings.GOOGLE_API_KEY,
         "groq": settings.GROQ_API_KEY,
@@ -81,6 +81,7 @@ class Settings(BaseSettings):
     GOOGLE_API_KEY: Optional[str] = None
     GROQ_API_KEY: Optional[str] = None
     HUGGINGFACE_API_KEY: Optional[str] = None
+    OPEN_ROUTER_API_KEY: Optional[str] = None
 
     OPENAI_API_BASE: Optional[str] = None  # azure openai api base url
     OPENAI_API_VERSION: Optional[str] = None  # azure openai api version
@@ -1,7 +1,13 @@
+import base64
+import logging
+
 from anthropic import AI_PROMPT, Anthropic, HUMAN_PROMPT
 
 from application.core.settings import settings
 from application.llm.base import BaseLLM
+from application.storage.storage_creator import StorageCreator
+
+logger = logging.getLogger(__name__)
 
 
 class AnthropicLLM(BaseLLM):
@@ -20,6 +26,7 @@ class AnthropicLLM(BaseLLM):
 
         self.HUMAN_PROMPT = HUMAN_PROMPT
         self.AI_PROMPT = AI_PROMPT
+        self.storage = StorageCreator.get_storage()
 
     def _raw_gen(
         self,
@@ -70,3 +77,115 @@ class AnthropicLLM(BaseLLM):
         finally:
             if hasattr(stream_response, "close"):
                 stream_response.close()
+
+    def get_supported_attachment_types(self):
+        """
+        Return a list of MIME types supported by Anthropic Claude for file uploads.
+        Claude supports images but not PDFs natively.
+        PDFs are synthetically supported via PDF-to-image conversion in the handler.
+
+        Returns:
+            list: List of supported MIME types
+        """
+        return [
+            "image/png",
+            "image/jpeg",
+            "image/jpg",
+            "image/webp",
+            "image/gif",
+        ]
+
+    def prepare_messages_with_attachments(self, messages, attachments=None):
+        """
+        Process attachments for Anthropic Claude API.
+        Formats images using Claude's vision message format.
+
+        Args:
+            messages (list): List of message dictionaries.
+            attachments (list): List of attachment dictionaries with content and metadata.
+
+        Returns:
+            list: Messages formatted with image content for Claude API.
+        """
+        if not attachments:
+            return messages
+
+        prepared_messages = messages.copy()
+
+        # Find the last user message to attach images to
+        user_message_index = None
+        for i in range(len(prepared_messages) - 1, -1, -1):
+            if prepared_messages[i].get("role") == "user":
+                user_message_index = i
+                break
+
+        if user_message_index is None:
+            user_message = {"role": "user", "content": []}
+            prepared_messages.append(user_message)
+            user_message_index = len(prepared_messages) - 1
+
+        # Convert content to list format if it's a string
+        if isinstance(prepared_messages[user_message_index].get("content"), str):
+            text_content = prepared_messages[user_message_index]["content"]
+            prepared_messages[user_message_index]["content"] = [
+                {"type": "text", "text": text_content}
+            ]
+        elif not isinstance(prepared_messages[user_message_index].get("content"), list):
+            prepared_messages[user_message_index]["content"] = []
+
+        for attachment in attachments:
+            mime_type = attachment.get("mime_type")
+
+            if mime_type and mime_type.startswith("image/"):
+                try:
+                    # Check if this is a pre-converted image (from PDF-to-image conversion)
+                    # These have a 'data' key with base64 already
+                    if "data" in attachment:
+                        base64_image = attachment["data"]
+                    else:
+                        base64_image = self._get_base64_image(attachment)
+
+                    # Claude uses a specific format for images
+                    prepared_messages[user_message_index]["content"].append(
+                        {
+                            "type": "image",
+                            "source": {
+                                "type": "base64",
+                                "media_type": mime_type,
+                                "data": base64_image,
+                            },
+                        }
+                    )
+
+                except Exception as e:
+                    logger.error(
+                        f"Error processing image attachment: {e}", exc_info=True
+                    )
+                    if "content" in attachment:
+                        prepared_messages[user_message_index]["content"].append(
+                            {
+                                "type": "text",
+                                "text": f"[Image could not be processed: {attachment.get('path', 'unknown')}]",
+                            }
+                        )
+
+        return prepared_messages
+
+    def _get_base64_image(self, attachment):
+        """
+        Convert an image file to base64 encoding.
+
+        Args:
+            attachment (dict): Attachment dictionary with path and metadata.
+
+        Returns:
+            str: Base64-encoded image data.
+        """
+        file_path = attachment.get("path")
+        if not file_path:
+            raise ValueError("No file path provided in attachment")
+        try:
+            with self.storage.get_file(file_path) as image_file:
+                return base64.b64encode(image_file.read()).decode("utf-8")
+        except FileNotFoundError:
+            raise FileNotFoundError(f"File not found: {file_path}")
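For orientation, a pre-converted PDF page reaches this method as a dict that already carries base64 under 'data', so the storage read in _get_base64_image is skipped. A minimal sketch of the Claude-style message the loop builds (values abbreviated, prompt text illustrative):

    attachment = {"mime_type": "image/png", "data": "iVBORw0KGgo..."}  # pre-converted page
    message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Summarize the attached page."},
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": attachment["mime_type"],
                    "data": attachment["data"],
                },
            },
        ],
    }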
@@ -105,6 +105,7 @@ class LLMHandler(ABC):
         """
         Prepare messages with attachments and provider-specific formatting.
 
+
         Args:
             agent: The agent instance
             messages: Original messages
@@ -118,11 +119,40 @@ class LLMHandler(ABC):
         logger.info(f"Preparing messages with {len(attachments)} attachments")
         supported_types = agent.llm.get_supported_attachment_types()
 
+        # Check if provider supports images but not PDF (synthetic PDF support)
+        supports_images = any(t.startswith("image/") for t in supported_types)
+        supports_pdf = "application/pdf" in supported_types
+
+        # Process attachments, converting PDFs to images if needed
+        processed_attachments = []
+        for attachment in attachments:
+            mime_type = attachment.get("mime_type")
+
+            # Synthetic PDF support: convert PDF to images if LLM supports images but not PDF
+            if mime_type == "application/pdf" and supports_images and not supports_pdf:
+                logger.info(
+                    f"Converting PDF to images for synthetic PDF support: {attachment.get('path', 'unknown')}"
+                )
+                try:
+                    converted_images = self._convert_pdf_to_images(attachment)
+                    processed_attachments.extend(converted_images)
+                    logger.info(
+                        f"Converted PDF to {len(converted_images)} images"
+                    )
+                except Exception as e:
+                    logger.error(
+                        f"Failed to convert PDF to images, falling back to text: {e}"
+                    )
+                    # Fall back to treating as unsupported (text extraction)
+                    processed_attachments.append(attachment)
+            else:
+                processed_attachments.append(attachment)
+
         supported_attachments = [
-            a for a in attachments if a.get("mime_type") in supported_types
+            a for a in processed_attachments if a.get("mime_type") in supported_types
         ]
         unsupported_attachments = [
-            a for a in attachments if a.get("mime_type") not in supported_types
+            a for a in processed_attachments if a.get("mime_type") not in supported_types
         ]
 
         # Process supported attachments with the LLM's custom method
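Note that the partition above reuses the same mime_type test, so a PDF whose conversion failed stays "application/pdf" and lands in the unsupported bucket for text extraction. A small self-contained sketch of the split (attachment values are illustrative):

    supported_types = ["image/png", "image/jpeg", "image/jpg", "image/webp", "image/gif"]
    processed_attachments = [
        {"mime_type": "image/png", "data": "...", "page": 1},  # converted PDF page
        {"mime_type": "text/csv", "content": "a,b\n1,2"},      # no image form: text path
    ]
    supported = [a for a in processed_attachments if a.get("mime_type") in supported_types]
    unsupported = [a for a in processed_attachments if a.get("mime_type") not in supported_types]
    assert len(supported) == 1 and len(unsupported) == 1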
@@ -145,6 +175,37 @@ class LLMHandler(ABC):
             )
         return messages
 
+    def _convert_pdf_to_images(self, attachment: Dict) -> List[Dict]:
+        """
+        Convert a PDF attachment to a list of image attachments.
+
+        This enables synthetic PDF support for LLMs that support images but not PDFs.
+
+        Args:
+            attachment: PDF attachment dictionary with 'path' and optional 'content'
+
+        Returns:
+            List of image attachment dictionaries with 'data', 'mime_type', and 'page'
+        """
+        from application.utils import convert_pdf_to_images
+        from application.storage.storage_creator import StorageCreator
+
+        file_path = attachment.get("path")
+        if not file_path:
+            raise ValueError("No file path provided in PDF attachment")
+
+        storage = StorageCreator.get_storage()
+
+        # Convert PDF to images
+        images_data = convert_pdf_to_images(
+            file_path=file_path,
+            storage=storage,
+            max_pages=20,
+            dpi=150,
+        )
+
+        return images_data
+
     def _append_unsupported_attachments(
         self, messages: List[Dict], attachments: List[Dict]
     ) -> List[Dict]:
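Each converted page comes back shaped like an ordinary image attachment, which is why the caller can simply extend processed_attachments with the result. A hedged sketch of a direct call, mirroring the defaults above (the path is illustrative):

    images_data = convert_pdf_to_images(
        file_path="user_uploads/report.pdf",  # illustrative storage path
        storage=storage,                      # any object exposing get_file()
        max_pages=20,                         # caps context growth on long PDFs
        dpi=150,                              # legibility vs. payload-size trade-off
    )
    # -> [{"data": "<base64>", "mime_type": "image/png", "page": 1}, ...]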
@@ -9,6 +9,7 @@ from application.llm.novita import NovitaLLM
 from application.llm.openai import AzureOpenAILLM, OpenAILLM
 from application.llm.premai import PremAILLM
 from application.llm.sagemaker import SagemakerAPILLM
+from application.llm.open_router import OpenRouterLLM
 
 logger = logging.getLogger(__name__)
@@ -25,6 +26,7 @@ class LLMCreator:
         "groq": GroqLLM,
         "google": GoogleLLM,
         "novita": NovitaLLM,
+        "openrouter": OpenRouterLLM,
     }
 
     @classmethod
application/llm/open_router.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+from application.core.settings import settings
+from application.llm.openai import OpenAILLM
+
+OPEN_ROUTER_BASE_URL = "https://openrouter.ai/api/v1"
+
+
+class OpenRouterLLM(OpenAILLM):
+    def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
+        super().__init__(
+            api_key=api_key or settings.OPEN_ROUTER_API_KEY or settings.API_KEY,
+            user_api_key=user_api_key,
+            base_url=base_url or OPEN_ROUTER_BASE_URL,
+            *args,
+            **kwargs,
+        )
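Because the subclass only swaps the base URL and the key-resolution order, it behaves like any OpenAILLM pointed at OpenRouter's OpenAI-compatible endpoint. A brief usage sketch (the key is a placeholder):

    llm = OpenRouterLLM(api_key="sk-or-...")  # placeholder key
    # equivalent to OpenAILLM(api_key=..., base_url="https://openrouter.ai/api/v1")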
@@ -9,6 +9,57 @@ from application.llm.base import BaseLLM
 from application.storage.storage_creator import StorageCreator
 
 
+def _truncate_base64_for_logging(messages):
+    """
+    Create a copy of messages with base64 data truncated for readable logging.
+
+    Args:
+        messages: List of message dicts
+
+    Returns:
+        Copy of messages with truncated base64 content
+    """
+    import copy
+
+    def truncate_content(content):
+        if isinstance(content, str):
+            # Check if it looks like a data URL with base64
+            if content.startswith("data:") and ";base64," in content:
+                prefix_end = content.index(";base64,") + len(";base64,")
+                prefix = content[:prefix_end]
+                return f"{prefix}[BASE64_DATA_TRUNCATED, length={len(content) - prefix_end}]"
+            return content
+        elif isinstance(content, list):
+            return [truncate_item(item) for item in content]
+        elif isinstance(content, dict):
+            return {k: truncate_content(v) for k, v in content.items()}
+        return content
+
+    def truncate_item(item):
+        if isinstance(item, dict):
+            result = {}
+            for k, v in item.items():
+                if k == "url" and isinstance(v, str) and ";base64," in v:
+                    prefix_end = v.index(";base64,") + len(";base64,")
+                    prefix = v[:prefix_end]
+                    result[k] = f"{prefix}[BASE64_DATA_TRUNCATED, length={len(v) - prefix_end}]"
+                elif k == "data" and isinstance(v, str) and len(v) > 100:
+                    result[k] = f"[BASE64_DATA_TRUNCATED, length={len(v)}]"
+                else:
+                    result[k] = truncate_content(v)
+            return result
+        return truncate_content(item)
+
+    truncated = []
+    for msg in messages:
+        msg_copy = copy.copy(msg)
+        if "content" in msg_copy:
+            msg_copy["content"] = truncate_content(msg_copy["content"])
+        truncated.append(msg_copy)
+
+    return truncated
+
+
 class OpenAILLM(BaseLLM):
 
     def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
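A quick before/after of the helper on the two shapes it targets, data URLs under a 'url' key and raw base64 under a 'data' key (payloads shortened to repeated characters):

    messages = [{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "data:image/png;base64," + "A" * 5000}},
            {"type": "image", "data": "B" * 5000},
        ],
    }]
    safe = _truncate_base64_for_logging(messages)
    # url  -> "data:image/png;base64,[BASE64_DATA_TRUNCATED, length=5000]"
    # data -> "[BASE64_DATA_TRUNCATED, length=5000]"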
@@ -44,12 +95,12 @@ class OpenAILLM(BaseLLM):
             if isinstance(content, str):
                 cleaned_messages.append({"role": role, "content": content})
             elif isinstance(content, list):
+                # Collect all content parts into a single message
+                content_parts = []
+
                 for item in content:
-                    if "text" in item:
-                        cleaned_messages.append(
-                            {"role": role, "content": item["text"]}
-                        )
-                    if "function_call" in item:
+                    if "function_call" in item:
                         # Function calls need their own message
                         cleaned_args = self._remove_null_values(
                             item["function_call"]["args"]
                         )
@@ -69,6 +120,7 @@ class OpenAILLM(BaseLLM):
                         }
                     )
                 elif "function_response" in item:
+                    # Function responses need their own message
                     cleaned_messages.append(
                         {
                             "role": "tool",
@@ -81,36 +133,20 @@ class OpenAILLM(BaseLLM):
                         }
                     )
                 elif isinstance(item, dict):
-                    content_parts = []
-                    if "text" in item:
-                        content_parts.append(
-                            {"type": "text", "text": item["text"]}
-                        )
-                    elif (
-                        "type" in item
-                        and item["type"] == "text"
-                        and "text" in item
-                    ):
+                    # Collect content parts (text, images, files) into a single message
+                    if "type" in item and item["type"] == "text" and "text" in item:
                         content_parts.append(item)
-                    elif (
-                        "type" in item
-                        and item["type"] == "file"
-                        and "file" in item
-                    ):
+                    elif "type" in item and item["type"] == "file" and "file" in item:
                         content_parts.append(item)
-                    elif (
-                        "type" in item
-                        and item["type"] == "image_url"
-                        and "image_url" in item
-                    ):
+                    elif "type" in item and item["type"] == "image_url" and "image_url" in item:
                         content_parts.append(item)
-                        cleaned_messages.append(
-                            {"role": role, "content": content_parts}
-                        )
                     else:
                         raise ValueError(
                             f"Unexpected content dictionary format: {item}"
                         )
+                elif "text" in item and "type" not in item:
+                    # Legacy format: {"text": "..."} without type
+                    content_parts.append({"type": "text", "text": item["text"]})
+
+                # Add the collected content parts as a single message
+                if content_parts:
+                    cleaned_messages.append({"role": role, "content": content_parts})
             else:
                 raise ValueError(f"Unexpected content type: {type(content)}")
         return cleaned_messages
@@ -127,7 +163,7 @@ class OpenAILLM(BaseLLM):
         **kwargs,
     ):
         messages = self._clean_messages_openai(messages)
-        logging.info(f"Cleaned messages: {messages}")
+        logging.info(f"Cleaned messages: {_truncate_base64_for_logging(messages)}")
 
         # Convert max_tokens to max_completion_tokens for newer models
         if "max_tokens" in kwargs:
@@ -163,7 +199,7 @@ class OpenAILLM(BaseLLM):
         **kwargs,
     ):
         messages = self._clean_messages_openai(messages)
-        logging.info(f"Cleaned messages: {messages}")
+        logging.info(f"Cleaned messages: {_truncate_base64_for_logging(messages)}")
 
         # Convert max_tokens to max_completion_tokens for newer models
         if "max_tokens" in kwargs:
@@ -261,17 +297,14 @@ class OpenAILLM(BaseLLM):
         """
         Return a list of MIME types supported by OpenAI for file uploads.
 
+        This reads from the model config to ensure consistency.
+        If no model config is found, falls back to images only (safest default).
+
         Returns:
             list: List of supported MIME types
         """
-        return [
-            "application/pdf",
-            "image/png",
-            "image/jpeg",
-            "image/jpg",
-            "image/webp",
-            "image/gif",
-        ]
+        from application.core.model_configs import OPENAI_ATTACHMENTS
+
+        return OPENAI_ATTACHMENTS
 
     def prepare_messages_with_attachments(self, messages, attachments=None):
         """
@@ -308,10 +341,16 @@ class OpenAILLM(BaseLLM):
             prepared_messages[user_message_index]["content"] = []
         for attachment in attachments:
             mime_type = attachment.get("mime_type")
+            logging.info(f"Processing attachment with mime_type: {mime_type}, has_data: {'data' in attachment}, has_path: {'path' in attachment}")
 
             if mime_type and mime_type.startswith("image/"):
                 try:
-                    base64_image = self._get_base64_image(attachment)
+                    # Check if this is a pre-converted image (from PDF-to-image conversion)
+                    if "data" in attachment:
+                        base64_image = attachment["data"]
+                    else:
+                        base64_image = self._get_base64_image(attachment)
 
                     prepared_messages[user_message_index]["content"].append(
                         {
                             "type": "image_url",
@@ -320,6 +359,7 @@ class OpenAILLM(BaseLLM):
                         },
                     }
                 )
+
             except Exception as e:
                 logging.error(
                     f"Error processing image attachment: {e}", exc_info=True
@@ -334,6 +374,7 @@ class OpenAILLM(BaseLLM):
             # Handle PDFs using the file API
             elif mime_type == "application/pdf":
+                logging.info(f"Attempting to upload PDF to OpenAI: {attachment.get('path', 'unknown')}")
                 try:
                     file_id = self._upload_file_to_openai(attachment)
                     prepared_messages[user_message_index]["content"].append(
@@ -348,6 +389,8 @@ class OpenAILLM(BaseLLM):
                             "text": f"File content:\n\n{attachment['content']}",
                         }
                     )
+            else:
+                logging.warning(f"Unsupported attachment type in OpenAI provider: {mime_type}")
         return prepared_messages
 
     def _get_base64_image(self, attachment):
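For comparison with the Anthropic format earlier in this diff, the OpenAI path wraps the same base64 payload in an image_url data URL. A minimal sketch of the content part appended for a converted page (values abbreviated):

    attachment = {"mime_type": "image/png", "data": "iVBORw0KGgo...", "page": 1}
    content_part = {
        "type": "image_url",
        "image_url": {"url": f"data:{attachment['mime_type']};base64,{attachment['data']}"},
    }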
@@ -60,14 +60,14 @@ def get_default_file_extractor(
         ".rst": RstParser(),
         ".adoc": DoclingAsciiDocParser(),
         ".asciidoc": DoclingAsciiDocParser(),
-        # Images (with OCR)
-        ".png": DoclingImageParser(ocr_enabled=ocr_enabled),
-        ".jpg": DoclingImageParser(ocr_enabled=ocr_enabled),
-        ".jpeg": DoclingImageParser(ocr_enabled=ocr_enabled),
-        ".tiff": DoclingImageParser(ocr_enabled=ocr_enabled),
-        ".tif": DoclingImageParser(ocr_enabled=ocr_enabled),
-        ".bmp": DoclingImageParser(ocr_enabled=ocr_enabled),
-        ".webp": DoclingImageParser(ocr_enabled=ocr_enabled),
+        # Images (with OCR) - only use Docling when OCR is enabled
+        ".png": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
+        ".jpg": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
+        ".jpeg": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
+        ".tiff": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
+        ".tif": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
+        ".bmp": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
+        ".webp": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
         # Media/subtitles
         ".vtt": DoclingVTTParser(),
         # Specialized XML formats
@@ -56,6 +56,7 @@ packaging==24.2
 pandas==2.3.3
 openpyxl==3.1.5
 pathable==0.4.4
+pdf2image>=1.17.0
 pillow
 portalocker>=2.7.0,<3.0.0
 prance==25.4.8.0
@@ -1,7 +1,11 @@
+import base64
 import hashlib
+import io
+import logging
 import os
 import re
 import uuid
+from typing import List
 
 import tiktoken
 from flask import jsonify, make_response
@@ -11,6 +15,8 @@ from application.core.model_utils import get_token_limit
 
 from application.core.settings import settings
 
+logger = logging.getLogger(__name__)
+
 
 _encoding = None
@@ -215,6 +221,93 @@ def calculate_compression_threshold(
     return threshold
 
 
+def convert_pdf_to_images(
+    file_path: str,
+    storage=None,
+    max_pages: int = 20,
+    dpi: int = 150,
+    image_format: str = "PNG",
+) -> List[dict]:
+    """
+    Convert PDF pages to images for LLMs that support images but not PDFs.
+
+    This enables "synthetic PDF support" by converting each PDF page to an image
+    that can be sent to vision-capable LLMs like Claude.
+
+    Args:
+        file_path: Path to the PDF file (can be storage path)
+        storage: Optional storage instance for retrieving files
+        max_pages: Maximum number of pages to convert (default 20 to avoid context overflow)
+        dpi: Resolution for rendering (default 150 for balance of quality/size)
+        image_format: Output format (PNG recommended for quality)
+
+    Returns:
+        List of dicts with keys:
+            - 'data': base64-encoded image data
+            - 'mime_type': MIME type (e.g., 'image/png')
+            - 'page': Page number (1-indexed)
+
+    Raises:
+        ImportError: If pdf2image is not installed
+        FileNotFoundError: If file doesn't exist
+        Exception: If conversion fails
+    """
+    try:
+        from pdf2image import convert_from_path, convert_from_bytes
+    except ImportError:
+        raise ImportError(
+            "pdf2image is required for PDF-to-image conversion. "
+            "Install it with: pip install pdf2image\n"
+            "Also ensure poppler-utils is installed on your system."
+        )
+
+    images_data = []
+    mime_type = f"image/{image_format.lower()}"
+
+    try:
+        # Get PDF content either from storage or direct file path
+        if storage and hasattr(storage, "get_file"):
+            with storage.get_file(file_path) as pdf_file:
+                pdf_bytes = pdf_file.read()
+                pil_images = convert_from_bytes(
+                    pdf_bytes,
+                    dpi=dpi,
+                    fmt=image_format.lower(),
+                    first_page=1,
+                    last_page=max_pages,
+                )
+        else:
+            pil_images = convert_from_path(
+                file_path,
+                dpi=dpi,
+                fmt=image_format.lower(),
+                first_page=1,
+                last_page=max_pages,
+            )
+
+        for page_num, pil_image in enumerate(pil_images, start=1):
+            # Convert PIL image to base64
+            buffer = io.BytesIO()
+            pil_image.save(buffer, format=image_format)
+            buffer.seek(0)
+            base64_data = base64.b64encode(buffer.read()).decode("utf-8")
+
+            images_data.append({
+                "data": base64_data,
+                "mime_type": mime_type,
+                "page": page_num,
+            })
+
+        return images_data
+
+    except FileNotFoundError:
+        logger.error(f"PDF file not found: {file_path}")
+        raise
+    except Exception as e:
+        logger.error(f"Error converting PDF to images: {e}", exc_info=True)
+        raise
+
+
 def clean_text_for_tts(text: str) -> str:
     """
     clean text for Text-to-Speech processing.
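A self-contained smoke test for the conversion path above, assuming pdf2image and poppler-utils are installed as the Dockerfile hunk arranges: render a one-page PDF with Pillow, then convert it back with the same pdf2image parameters used here.

    import io
    from PIL import Image
    from pdf2image import convert_from_bytes

    buffer = io.BytesIO()
    Image.new("RGB", (200, 100), "white").save(buffer, format="PDF")
    pages = convert_from_bytes(buffer.getvalue(), dpi=150, fmt="png", first_page=1, last_page=20)
    assert len(pages) == 1 and pages[0].width > 0  # one rendered page comes back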