diff --git a/application/Dockerfile b/application/Dockerfile
index efdf6908..48d29e57 100644
--- a/application/Dockerfile
+++ b/application/Dockerfile
@@ -52,6 +52,7 @@ RUN apt-get update && \
     python3.12 \
     libgl1 \
     libglib2.0-0 \
+    poppler-utils \
     && \
     ln -s /usr/bin/python3.12 /usr/bin/python && \
     rm -rf /var/lib/apt/lists/*
diff --git a/application/core/model_configs.py b/application/core/model_configs.py
index a25c4440..841be925 100644
--- a/application/core/model_configs.py
+++ b/application/core/model_configs.py
@@ -8,8 +8,8 @@ from application.core.model_settings import (
     ModelProvider,
 )
 
-OPENAI_ATTACHMENTS = [
-    "application/pdf",
+# Base image attachment types supported by most vision-capable LLMs
+IMAGE_ATTACHMENTS = [
     "image/png",
     "image/jpeg",
     "image/jpg",
@@ -17,14 +17,15 @@ OPENAI_ATTACHMENTS = [
     "image/gif",
 ]
 
-GOOGLE_ATTACHMENTS = [
-    "application/pdf",
-    "image/png",
-    "image/jpeg",
-    "image/jpg",
-    "image/webp",
-    "image/gif",
-]
+# PDF excluded: most OpenAI-compatible endpoints don't support native PDF uploads.
+# When excluded, PDFs are synthetically processed by converting pages to images.
+OPENAI_ATTACHMENTS = IMAGE_ATTACHMENTS
+
+GOOGLE_ATTACHMENTS = ["application/pdf"] + IMAGE_ATTACHMENTS
+
+ANTHROPIC_ATTACHMENTS = IMAGE_ATTACHMENTS
+
+OPENROUTER_ATTACHMENTS = IMAGE_ATTACHMENTS
 
 
 OPENAI_MODELS = [
@@ -63,6 +64,7 @@ ANTHROPIC_MODELS = [
         description="Latest Claude 3.5 Sonnet with enhanced capabilities",
         capabilities=ModelCapabilities(
             supports_tools=True,
+            supported_attachment_types=ANTHROPIC_ATTACHMENTS,
             context_window=200000,
         ),
     ),
@@ -73,6 +75,7 @@ ANTHROPIC_MODELS = [
         description="Balanced performance and capability",
         capabilities=ModelCapabilities(
             supports_tools=True,
+            supported_attachment_types=ANTHROPIC_ATTACHMENTS,
             context_window=200000,
         ),
     ),
@@ -83,6 +86,7 @@ ANTHROPIC_MODELS = [
         description="Most capable Claude model",
         capabilities=ModelCapabilities(
             supports_tools=True,
+            supported_attachment_types=ANTHROPIC_ATTACHMENTS,
             context_window=200000,
         ),
     ),
@@ -93,6 +97,7 @@ ANTHROPIC_MODELS = [
         description="Fastest Claude model",
         capabilities=ModelCapabilities(
             supports_tools=True,
+            supported_attachment_types=ANTHROPIC_ATTACHMENTS,
             context_window=200000,
         ),
     ),
@@ -151,28 +156,43 @@ GROQ_MODELS = [
         ),
     ),
     AvailableModel(
-        id="llama-3.1-8b-instant",
+        id="openai/gpt-oss-120b",
         provider=ModelProvider.GROQ,
-        display_name="Llama 3.1 8B",
-        description="Ultra-fast inference",
+        display_name="GPT-OSS 120B",
+        description="Open-source GPT model optimized for speed",
         capabilities=ModelCapabilities(
             supports_tools=True,
             context_window=128000,
         ),
     ),
+]
+
+
+OPENROUTER_MODELS = [
     AvailableModel(
-        id="mixtral-8x7b-32768",
-        provider=ModelProvider.GROQ,
-        display_name="Mixtral 8x7B",
-        description="High-speed inference with tools",
+        id="qwen/qwen3-coder:free",
+        provider=ModelProvider.OPENROUTER,
+        display_name="Qwen 3 Coder",
+        description="Latest Qwen model with high-speed inference",
         capabilities=ModelCapabilities(
             supports_tools=True,
-            context_window=32768,
+            context_window=128000,
+            supported_attachment_types=OPENROUTER_ATTACHMENTS,
+        ),
+    ),
+    AvailableModel(
+        id="google/gemma-3-27b-it:free",
+        provider=ModelProvider.OPENROUTER,
+        display_name="Gemma 3 27B",
+        description="Latest Gemma model with high-speed inference",
+        capabilities=ModelCapabilities(
+            supports_tools=True,
+            context_window=128000,
+            supported_attachment_types=OPENROUTER_ATTACHMENTS,
         ),
     ),
 ]
 
-
 AZURE_OPENAI_MODELS = [
     AvailableModel(
         id="azure-gpt-4",
diff --git a/application/core/model_settings.py b/application/core/model_settings.py
index bc38239a..206589f6 100644
--- a/application/core/model_settings.py
+++ b/application/core/model_settings.py
@@ -8,6 +8,7 @@ logger = logging.getLogger(__name__)
 
 class ModelProvider(str, Enum):
     OPENAI = "openai"
+    OPENROUTER = "openrouter"
     AZURE_OPENAI = "azure_openai"
     ANTHROPIC = "anthropic"
     GROQ = "groq"
@@ -107,6 +108,10 @@ class ModelRegistry:
             settings.LLM_PROVIDER == "groq" and settings.API_KEY
         ):
             self._add_groq_models(settings)
+        if settings.OPEN_ROUTER_API_KEY or (
+            settings.LLM_PROVIDER == "openrouter" and settings.API_KEY
+        ):
+            self._add_openrouter_models(settings)
         if settings.HUGGINGFACE_API_KEY or (
             settings.LLM_PROVIDER == "huggingface" and settings.API_KEY
         ):
@@ -211,6 +216,21 @@ class ModelRegistry:
             return
         for model in GROQ_MODELS:
             self.models[model.id] = model
+
+    def _add_openrouter_models(self, settings):
+        from application.core.model_configs import OPENROUTER_MODELS
+
+        if settings.OPEN_ROUTER_API_KEY:
+            for model in OPENROUTER_MODELS:
+                self.models[model.id] = model
+            return
+        if settings.LLM_PROVIDER == "openrouter" and settings.LLM_NAME:
+            for model in OPENROUTER_MODELS:
+                if model.id == settings.LLM_NAME:
+                    self.models[model.id] = model
+            return
+        for model in OPENROUTER_MODELS:
+            self.models[model.id] = model
 
     def _add_docsgpt_models(self, settings):
         model_id = "docsgpt-local"
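
Note on the attachment lists above: every provider list is now derived from a single `IMAGE_ATTACHMENTS` base, and only Google keeps native `"application/pdf"`. A minimal sketch of the resulting behavior; the `supports` helper is hypothetical, added here only to illustrate the lookup:

```python
IMAGE_ATTACHMENTS = ["image/png", "image/jpeg", "image/jpg", "image/webp", "image/gif"]

# Mirrors the diff: only Google keeps native PDF support.
OPENAI_ATTACHMENTS = IMAGE_ATTACHMENTS
GOOGLE_ATTACHMENTS = ["application/pdf"] + IMAGE_ATTACHMENTS


def supports(mime_type: str, provider_types: list) -> bool:
    # Hypothetical helper; the real check happens in LLMHandler below.
    return mime_type in provider_types


assert not supports("application/pdf", OPENAI_ATTACHMENTS)  # triggers PDF-to-image path
assert supports("application/pdf", GOOGLE_ATTACHMENTS)      # sent natively
```

One caveat worth flagging: `OPENAI_ATTACHMENTS = IMAGE_ATTACHMENTS` aliases the same list object, so an in-place mutation of any one alias would leak into all providers; copies via `list(IMAGE_ATTACHMENTS)` would be safer.
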
diff --git a/application/core/model_utils.py b/application/core/model_utils.py
index f24dbf47..94dc8973 100644
--- a/application/core/model_utils.py
+++ b/application/core/model_utils.py
@@ -9,6 +9,7 @@ def get_api_key_for_provider(provider: str) -> Optional[str]:
 
     provider_key_map = {
         "openai": settings.OPENAI_API_KEY,
+        "openrouter": settings.OPEN_ROUTER_API_KEY,
         "anthropic": settings.ANTHROPIC_API_KEY,
         "google": settings.GOOGLE_API_KEY,
         "groq": settings.GROQ_API_KEY,
diff --git a/application/core/settings.py b/application/core/settings.py
index 1cc36b18..66018b6f 100644
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -81,6 +81,7 @@ class Settings(BaseSettings):
     GOOGLE_API_KEY: Optional[str] = None
     GROQ_API_KEY: Optional[str] = None
     HUGGINGFACE_API_KEY: Optional[str] = None
+    OPEN_ROUTER_API_KEY: Optional[str] = None
     OPENAI_API_BASE: Optional[str] = None  # azure openai api base url
     OPENAI_API_VERSION: Optional[str] = None  # azure openai api version
diff --git a/application/llm/anthropic.py b/application/llm/anthropic.py
index 4d26f925..e8b3ba2f 100644
--- a/application/llm/anthropic.py
+++ b/application/llm/anthropic.py
@@ -1,7 +1,13 @@
+import base64
+import logging
+
 from anthropic import AI_PROMPT, Anthropic, HUMAN_PROMPT
 
 from application.core.settings import settings
 from application.llm.base import BaseLLM
+from application.storage.storage_creator import StorageCreator
+
+logger = logging.getLogger(__name__)
 
 
 class AnthropicLLM(BaseLLM):
@@ -20,6 +26,7 @@ class AnthropicLLM(BaseLLM):
 
         self.HUMAN_PROMPT = HUMAN_PROMPT
         self.AI_PROMPT = AI_PROMPT
+        self.storage = StorageCreator.get_storage()
 
     def _raw_gen(
         self,
@@ -70,3 +77,115 @@ class AnthropicLLM(BaseLLM):
         finally:
             if hasattr(stream_response, "close"):
                 stream_response.close()
+
+    def get_supported_attachment_types(self):
+        """
+        Return a list of MIME types supported by Anthropic Claude for file uploads.
+        Claude supports images but not PDFs natively.
+        PDFs are synthetically supported via PDF-to-image conversion in the handler.
+
+        Returns:
+            list: List of supported MIME types
+        """
+        return [
+            "image/png",
+            "image/jpeg",
+            "image/jpg",
+            "image/webp",
+            "image/gif",
+        ]
+
+    def prepare_messages_with_attachments(self, messages, attachments=None):
+        """
+        Process attachments for Anthropic Claude API.
+        Formats images using Claude's vision message format.
+
+        Args:
+            messages (list): List of message dictionaries.
+            attachments (list): List of attachment dictionaries with content and metadata.
+
+        Returns:
+            list: Messages formatted with image content for Claude API.
+        """
+        if not attachments:
+            return messages
+
+        prepared_messages = messages.copy()
+
+        # Find the last user message to attach images to
+        user_message_index = None
+        for i in range(len(prepared_messages) - 1, -1, -1):
+            if prepared_messages[i].get("role") == "user":
+                user_message_index = i
+                break
+
+        if user_message_index is None:
+            user_message = {"role": "user", "content": []}
+            prepared_messages.append(user_message)
+            user_message_index = len(prepared_messages) - 1
+
+        # Convert content to list format if it's a string
+        if isinstance(prepared_messages[user_message_index].get("content"), str):
+            text_content = prepared_messages[user_message_index]["content"]
+            prepared_messages[user_message_index]["content"] = [
+                {"type": "text", "text": text_content}
+            ]
+        elif not isinstance(prepared_messages[user_message_index].get("content"), list):
+            prepared_messages[user_message_index]["content"] = []
+
+        for attachment in attachments:
+            mime_type = attachment.get("mime_type")
+
+            if mime_type and mime_type.startswith("image/"):
+                try:
+                    # Check if this is a pre-converted image (from PDF-to-image conversion)
+                    # These have 'data' key with base64 already
+                    if "data" in attachment:
+                        base64_image = attachment["data"]
+                    else:
+                        base64_image = self._get_base64_image(attachment)
+
+                    # Claude uses a specific format for images
+                    prepared_messages[user_message_index]["content"].append(
+                        {
+                            "type": "image",
+                            "source": {
+                                "type": "base64",
+                                "media_type": mime_type,
+                                "data": base64_image,
+                            },
+                        }
+                    )
+
+                except Exception as e:
+                    logger.error(
+                        f"Error processing image attachment: {e}", exc_info=True
+                    )
+                    if "content" in attachment:
+                        prepared_messages[user_message_index]["content"].append(
+                            {
+                                "type": "text",
+                                "text": f"[Image could not be processed: {attachment.get('path', 'unknown')}]",
+                            }
+                        )
+
+        return prepared_messages
+
+    def _get_base64_image(self, attachment):
+        """
+        Convert an image file to base64 encoding.
+
+        Args:
+            attachment (dict): Attachment dictionary with path and metadata.
+
+        Returns:
+            str: Base64-encoded image data.
+        """
+        file_path = attachment.get("path")
+        if not file_path:
+            raise ValueError("No file path provided in attachment")
+        try:
+            with self.storage.get_file(file_path) as image_file:
+                return base64.b64encode(image_file.read()).decode("utf-8")
+        except FileNotFoundError:
+            raise FileNotFoundError(f"File not found: {file_path}")
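
For quick reference, this is the message shape `prepare_messages_with_attachments` produces for a single PNG attachment. A sketch only; the text and base64 payload are placeholders, and the block layout mirrors the dict built in the method above:

```python
# Last user message after attachment processing, in Claude's vision format.
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What does this chart show?"},  # original user text
        {
            "type": "image",  # Claude's block type, unlike OpenAI's "image_url"
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": "<base64-encoded image bytes>",  # placeholder
            },
        },
    ],
}
```
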
diff --git a/application/llm/handlers/base.py b/application/llm/handlers/base.py
index dbc5a879..b673a604 100644
--- a/application/llm/handlers/base.py
+++ b/application/llm/handlers/base.py
@@ -105,6 +105,7 @@ class LLMHandler(ABC):
         """
         Prepare messages with attachments and provider-specific formatting.
+
         Args:
             agent: The agent instance
             messages: Original messages
@@ -118,11 +119,40 @@ class LLMHandler(ABC):
         logger.info(f"Preparing messages with {len(attachments)} attachments")
 
         supported_types = agent.llm.get_supported_attachment_types()
+
+        # Check if provider supports images but not PDF (synthetic PDF support)
+        supports_images = any(t.startswith("image/") for t in supported_types)
+        supports_pdf = "application/pdf" in supported_types
+
+        # Process attachments, converting PDFs to images if needed
+        processed_attachments = []
+        for attachment in attachments:
+            mime_type = attachment.get("mime_type")
+
+            # Synthetic PDF support: convert PDF to images if LLM supports images but not PDF
+            if mime_type == "application/pdf" and supports_images and not supports_pdf:
+                logger.info(
+                    f"Converting PDF to images for synthetic PDF support: {attachment.get('path', 'unknown')}"
+                )
+                try:
+                    converted_images = self._convert_pdf_to_images(attachment)
+                    processed_attachments.extend(converted_images)
+                    logger.info(
+                        f"Converted PDF to {len(converted_images)} images"
+                    )
+                except Exception as e:
+                    logger.error(
+                        f"Failed to convert PDF to images, falling back to text: {e}"
+                    )
+                    # Fall back to treating as unsupported (text extraction)
+                    processed_attachments.append(attachment)
+            else:
+                processed_attachments.append(attachment)
+
         supported_attachments = [
-            a for a in attachments if a.get("mime_type") in supported_types
+            a for a in processed_attachments if a.get("mime_type") in supported_types
         ]
         unsupported_attachments = [
-            a for a in attachments if a.get("mime_type") not in supported_types
+            a for a in processed_attachments if a.get("mime_type") not in supported_types
         ]
 
         # Process supported attachments with the LLM's custom method
@@ -145,6 +175,37 @@ class LLMHandler(ABC):
             )
         return messages
 
+    def _convert_pdf_to_images(self, attachment: Dict) -> List[Dict]:
+        """
+        Convert a PDF attachment to a list of image attachments.
+
+        This enables synthetic PDF support for LLMs that support images but not PDFs.
+
+        Args:
+            attachment: PDF attachment dictionary with 'path' and optional 'content'
+
+        Returns:
+            List of image attachment dictionaries with 'data', 'mime_type', and 'page'
+        """
+        from application.utils import convert_pdf_to_images
+        from application.storage.storage_creator import StorageCreator
+
+        file_path = attachment.get("path")
+        if not file_path:
+            raise ValueError("No file path provided in PDF attachment")
+
+        storage = StorageCreator.get_storage()
+
+        # Convert PDF to images
+        images_data = convert_pdf_to_images(
+            file_path=file_path,
+            storage=storage,
+            max_pages=20,
+            dpi=150,
+        )
+
+        return images_data
+
     def _append_unsupported_attachments(
         self, messages: List[Dict], attachments: List[Dict]
     ) -> List[Dict]:
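
The heart of this hunk is the gate that decides when a PDF is converted. Traced in isolation with Claude's supported types from `anthropic.py` above (the values shown follow directly from the diff, nothing new):

```python
supported_types = ["image/png", "image/jpeg", "image/jpg", "image/webp", "image/gif"]

supports_images = any(t.startswith("image/") for t in supported_types)  # True
supports_pdf = "application/pdf" in supported_types                     # False

# A PDF attachment therefore takes the conversion path; each page comes back as
# {"data": <base64>, "mime_type": "image/png", "page": n} and then passes the
# supported-types filter downstream as an ordinary image.
should_convert = supports_images and not supports_pdf
assert should_convert
```
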
diff --git a/application/llm/llm_creator.py b/application/llm/llm_creator.py
index ca39194c..96653831 100644
--- a/application/llm/llm_creator.py
+++ b/application/llm/llm_creator.py
@@ -9,6 +9,7 @@ from application.llm.novita import NovitaLLM
 from application.llm.openai import AzureOpenAILLM, OpenAILLM
 from application.llm.premai import PremAILLM
 from application.llm.sagemaker import SagemakerAPILLM
+from application.llm.open_router import OpenRouterLLM
 
 logger = logging.getLogger(__name__)
 
@@ -25,6 +26,7 @@ class LLMCreator:
         "groq": GroqLLM,
         "google": GoogleLLM,
         "novita": NovitaLLM,
+        "openrouter": OpenRouterLLM,
     }
 
     @classmethod
diff --git a/application/llm/open_router.py b/application/llm/open_router.py
new file mode 100644
index 00000000..39654572
--- /dev/null
+++ b/application/llm/open_router.py
@@ -0,0 +1,15 @@
+from application.core.settings import settings
+from application.llm.openai import OpenAILLM
+
+OPEN_ROUTER_BASE_URL = "https://openrouter.ai/api/v1"
+
+
+class OpenRouterLLM(OpenAILLM):
+    def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
+        super().__init__(
+            api_key=api_key or settings.OPEN_ROUTER_API_KEY or settings.API_KEY,
+            user_api_key=user_api_key,
+            base_url=base_url or OPEN_ROUTER_BASE_URL,
+            *args,
+            **kwargs,
+        )
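
Since `OpenRouterLLM` only overrides the constructor, everything else (chat calls, attachment prep, base64 handling) is inherited from `OpenAILLM`. A minimal usage sketch, assuming `OPEN_ROUTER_API_KEY` is configured in settings:

```python
from application.llm.open_router import OpenRouterLLM

# Key resolution order per __init__ above: explicit api_key, then
# settings.OPEN_ROUTER_API_KEY, then settings.API_KEY.
llm = OpenRouterLLM(user_api_key=None)
# Requests now go to https://openrouter.ai/api/v1 with OpenAI-compatible payloads.
```
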
diff --git a/application/llm/openai.py b/application/llm/openai.py
index e851f078..263b4b5a 100644
--- a/application/llm/openai.py
+++ b/application/llm/openai.py
@@ -9,6 +9,57 @@ from application.llm.base import BaseLLM
 from application.storage.storage_creator import StorageCreator
 
 
+def _truncate_base64_for_logging(messages):
+    """
+    Create a copy of messages with base64 data truncated for readable logging.
+
+    Args:
+        messages: List of message dicts
+
+    Returns:
+        Copy of messages with truncated base64 content
+    """
+    import copy
+
+    def truncate_content(content):
+        if isinstance(content, str):
+            # Check if it looks like a data URL with base64
+            if content.startswith("data:") and ";base64," in content:
+                prefix_end = content.index(";base64,") + len(";base64,")
+                prefix = content[:prefix_end]
+                return f"{prefix}[BASE64_DATA_TRUNCATED, length={len(content) - prefix_end}]"
+            return content
+        elif isinstance(content, list):
+            return [truncate_item(item) for item in content]
+        elif isinstance(content, dict):
+            return {k: truncate_content(v) for k, v in content.items()}
+        return content
+
+    def truncate_item(item):
+        if isinstance(item, dict):
+            result = {}
+            for k, v in item.items():
+                if k == "url" and isinstance(v, str) and ";base64," in v:
+                    prefix_end = v.index(";base64,") + len(";base64,")
+                    prefix = v[:prefix_end]
+                    result[k] = f"{prefix}[BASE64_DATA_TRUNCATED, length={len(v) - prefix_end}]"
+                elif k == "data" and isinstance(v, str) and len(v) > 100:
+                    result[k] = f"[BASE64_DATA_TRUNCATED, length={len(v)}]"
+                else:
+                    result[k] = truncate_content(v)
+            return result
+        return truncate_content(item)
+
+    truncated = []
+    for msg in messages:
+        msg_copy = copy.copy(msg)
+        if "content" in msg_copy:
+            msg_copy["content"] = truncate_content(msg_copy["content"])
+        truncated.append(msg_copy)
+
+    return truncated
+
+
 class OpenAILLM(BaseLLM):
 
     def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
@@ -44,12 +95,12 @@ class OpenAILLM(BaseLLM):
             if isinstance(content, str):
                 cleaned_messages.append({"role": role, "content": content})
             elif isinstance(content, list):
+                # Collect all content parts into a single message
+                content_parts = []
+
                 for item in content:
-                    if "text" in item:
-                        cleaned_messages.append(
-                            {"role": role, "content": item["text"]}
-                        )
-                    elif "function_call" in item:
+                    if "function_call" in item:
+                        # Function calls need their own message
                         cleaned_args = self._remove_null_values(
                             item["function_call"]["args"]
                         )
@@ -69,6 +120,7 @@ class OpenAILLM(BaseLLM):
                             }
                         )
                     elif "function_response" in item:
+                        # Function responses need their own message
                         cleaned_messages.append(
                             {
                                 "role": "tool",
@@ -81,36 +133,20 @@ class OpenAILLM(BaseLLM):
                             }
                         )
                     elif isinstance(item, dict):
-                        content_parts = []
-                        if "text" in item:
-                            content_parts.append(
-                                {"type": "text", "text": item["text"]}
-                            )
-                        elif (
-                            "type" in item
-                            and item["type"] == "text"
-                            and "text" in item
-                        ):
+                        # Collect content parts (text, images, files) into a single message
+                        if "type" in item and item["type"] == "text" and "text" in item:
                             content_parts.append(item)
-                        elif (
-                            "type" in item
-                            and item["type"] == "file"
-                            and "file" in item
-                        ):
+                        elif "type" in item and item["type"] == "file" and "file" in item:
                             content_parts.append(item)
-                        elif (
-                            "type" in item
-                            and item["type"] == "image_url"
-                            and "image_url" in item
-                        ):
+                        elif "type" in item and item["type"] == "image_url" and "image_url" in item:
                             content_parts.append(item)
-                        cleaned_messages.append(
-                            {"role": role, "content": content_parts}
-                        )
-                    else:
-                        raise ValueError(
-                            f"Unexpected content dictionary format: {item}"
-                        )
+                        elif "text" in item and "type" not in item:
+                            # Legacy format: {"text": "..."} without type
+                            content_parts.append({"type": "text", "text": item["text"]})
+
+                # Add the collected content parts as a single message
+                if content_parts:
+                    cleaned_messages.append({"role": role, "content": content_parts})
             else:
                 raise ValueError(f"Unexpected content type: {type(content)}")
         return cleaned_messages
@@ -127,7 +163,7 @@ class OpenAILLM(BaseLLM):
         **kwargs,
     ):
         messages = self._clean_messages_openai(messages)
-        logging.info(f"Cleaned messages: {messages}")
+        logging.info(f"Cleaned messages: {_truncate_base64_for_logging(messages)}")
 
         # Convert max_tokens to max_completion_tokens for newer models
         if "max_tokens" in kwargs:
@@ -163,7 +199,7 @@ class OpenAILLM(BaseLLM):
         **kwargs,
     ):
         messages = self._clean_messages_openai(messages)
-        logging.info(f"Cleaned messages: {_truncate_base64_for_logging(messages)}")
+        logging.info(f"Cleaned messages: {_truncate_base64_for_logging(messages)}")
 
         # Convert max_tokens to max_completion_tokens for newer models
         if "max_tokens" in kwargs:
@@ -261,17 +297,14 @@ class OpenAILLM(BaseLLM):
         """
         Return a list of MIME types supported by OpenAI for file uploads.
 
+        Reads from the shared model config so the provider's supported
+        types stay consistent with the model registry.
+
         Returns:
             list: List of supported MIME types
         """
-        return [
-            "application/pdf",
-            "image/png",
-            "image/jpeg",
-            "image/jpg",
-            "image/webp",
-            "image/gif",
-        ]
+        from application.core.model_configs import OPENAI_ATTACHMENTS
+        return OPENAI_ATTACHMENTS
 
     def prepare_messages_with_attachments(self, messages, attachments=None):
         """
@@ -308,10 +341,16 @@ class OpenAILLM(BaseLLM):
             prepared_messages[user_message_index]["content"] = []
 
         for attachment in attachments:
             mime_type = attachment.get("mime_type")
+            logging.info(f"Processing attachment with mime_type: {mime_type}, has_data: {'data' in attachment}, has_path: {'path' in attachment}")
 
             if mime_type and mime_type.startswith("image/"):
                 try:
-                    base64_image = self._get_base64_image(attachment)
+                    # Check if this is a pre-converted image (from PDF-to-image conversion)
+                    if "data" in attachment:
+                        base64_image = attachment["data"]
+                    else:
+                        base64_image = self._get_base64_image(attachment)
+
                     prepared_messages[user_message_index]["content"].append(
                         {
                             "type": "image_url",
@@ -320,6 +359,7 @@ class OpenAILLM(BaseLLM):
                             },
                         }
                     )
+
                 except Exception as e:
                     logging.error(
                         f"Error processing image attachment: {e}", exc_info=True
@@ -334,6 +374,7 @@ class OpenAILLM(BaseLLM):
 
             # Handle PDFs using the file API
             elif mime_type == "application/pdf":
+                logging.info(f"Attempting to upload PDF to OpenAI: {attachment.get('path', 'unknown')}")
                 try:
                     file_id = self._upload_file_to_openai(attachment)
                     prepared_messages[user_message_index]["content"].append(
@@ -348,6 +389,8 @@ class OpenAILLM(BaseLLM):
                             "text": f"File content:\n\n{attachment['content']}",
                         }
                     )
+            else:
+                logging.warning(f"Unsupported attachment type in OpenAI provider: {mime_type}")
 
         return prepared_messages
 
     def _get_base64_image(self, attachment):
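
To sanity-check the log-truncation helper: a data-URL image keeps its prefix while the payload collapses to a length marker, and the input is left untouched (shallow copy plus rebuilt content). Expected values below are derived by tracing the code above:

```python
from application.llm.openai import _truncate_base64_for_logging

messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this"},
        {"type": "image_url",
         "image_url": {"url": "data:image/png;base64," + "A" * 200000}},
    ],
}]

safe = _truncate_base64_for_logging(messages)
assert safe[0]["content"][1]["image_url"]["url"] == (
    "data:image/png;base64,[BASE64_DATA_TRUNCATED, length=200000]"
)
assert messages[0]["content"][1]["image_url"]["url"].endswith("A")  # original intact
```
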
diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py
index 64860c0c..bee0dd34 100644
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -60,14 +60,14 @@ def get_default_file_extractor(
         ".rst": RstParser(),
         ".adoc": DoclingAsciiDocParser(),
         ".asciidoc": DoclingAsciiDocParser(),
-        # Images (with OCR)
-        ".png": DoclingImageParser(ocr_enabled=ocr_enabled),
-        ".jpg": DoclingImageParser(ocr_enabled=ocr_enabled),
-        ".jpeg": DoclingImageParser(ocr_enabled=ocr_enabled),
-        ".tiff": DoclingImageParser(ocr_enabled=ocr_enabled),
-        ".tif": DoclingImageParser(ocr_enabled=ocr_enabled),
-        ".bmp": DoclingImageParser(ocr_enabled=ocr_enabled),
-        ".webp": DoclingImageParser(ocr_enabled=ocr_enabled),
+        # Images (with OCR) - only use Docling when OCR is enabled
+        ".png": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
+        ".jpg": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
+        ".jpeg": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
+        ".tiff": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
+        ".tif": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
+        ".bmp": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
+        ".webp": DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser(),
         # Media/subtitles
         ".vtt": DoclingVTTParser(),
         # Specialized XML formats
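
The seven dictionary entries repeat the same conditional; if the duplication bothers reviewers, it hoists cleanly into a local helper. Sketch only, not part of the PR, reusing the parser names imported in this module:

```python
def _image_parser(ocr_enabled: bool):
    # Docling (heavier, OCR-capable) only when OCR is requested;
    # the lightweight ImageParser otherwise.
    return DoclingImageParser(ocr_enabled=ocr_enabled) if ocr_enabled else ImageParser()

image_entries = {ext: _image_parser(ocr_enabled)
                 for ext in (".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp")}
```
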
diff --git a/application/requirements.txt b/application/requirements.txt
index 5a17a7c7..85d1e391 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -56,6 +56,7 @@ packaging==24.2
 pandas==2.3.3
 openpyxl==3.1.5
 pathable==0.4.4
+pdf2image>=1.17.0
 pillow
 portalocker>=2.7.0,<3.0.0
 prance==25.4.8.0
diff --git a/application/utils.py b/application/utils.py
index 35b61036..5a23517d 100644
--- a/application/utils.py
+++ b/application/utils.py
@@ -1,7 +1,11 @@
+import base64
 import hashlib
+import io
+import logging
 import os
 import re
 import uuid
+from typing import List
 
 import tiktoken
 from flask import jsonify, make_response
@@ -11,6 +15,8 @@ from application.core.model_utils import get_token_limit
 from application.core.settings import settings
 
+logger = logging.getLogger(__name__)
+
 _encoding = None
 
@@ -215,6 +221,93 @@ def calculate_compression_threshold(
     return threshold
 
 
+def convert_pdf_to_images(
+    file_path: str,
+    storage=None,
+    max_pages: int = 20,
+    dpi: int = 150,
+    image_format: str = "PNG",
+) -> List[dict]:
+    """
+    Convert PDF pages to images for LLMs that support images but not PDFs.
+
+    This enables "synthetic PDF support" by converting each PDF page to an image
+    that can be sent to vision-capable LLMs like Claude.
+
+    Args:
+        file_path: Path to the PDF file (can be storage path)
+        storage: Optional storage instance for retrieving files
+        max_pages: Maximum number of pages to convert (default 20 to avoid context overflow)
+        dpi: Resolution for rendering (default 150 for balance of quality/size)
+        image_format: Output format (PNG recommended for quality)
+
+    Returns:
+        List of dicts with keys:
+            - 'data': base64-encoded image data
+            - 'mime_type': MIME type (e.g., 'image/png')
+            - 'page': Page number (1-indexed)
+
+    Raises:
+        ImportError: If pdf2image is not installed
+        FileNotFoundError: If file doesn't exist
+        Exception: If conversion fails
+    """
+    try:
+        from pdf2image import convert_from_path, convert_from_bytes
+    except ImportError:
+        raise ImportError(
+            "pdf2image is required for PDF-to-image conversion. "
+            "Install it with: pip install pdf2image\n"
+            "Also ensure poppler-utils is installed on your system."
+        )
+
+    images_data = []
+    mime_type = f"image/{image_format.lower()}"
+
+    try:
+        # Get PDF content either from storage or direct file path
+        if storage and hasattr(storage, "get_file"):
+            with storage.get_file(file_path) as pdf_file:
+                pdf_bytes = pdf_file.read()
+            pil_images = convert_from_bytes(
+                pdf_bytes,
+                dpi=dpi,
+                fmt=image_format.lower(),
+                first_page=1,
+                last_page=max_pages,
+            )
+        else:
+            pil_images = convert_from_path(
+                file_path,
+                dpi=dpi,
+                fmt=image_format.lower(),
+                first_page=1,
+                last_page=max_pages,
+            )
+
+        for page_num, pil_image in enumerate(pil_images, start=1):
+            # Convert PIL image to base64
+            buffer = io.BytesIO()
+            pil_image.save(buffer, format=image_format)
+            buffer.seek(0)
+            base64_data = base64.b64encode(buffer.read()).decode("utf-8")
+
+            images_data.append({
+                "data": base64_data,
+                "mime_type": mime_type,
+                "page": page_num,
+            })
+
+        return images_data
+
+    except FileNotFoundError:
+        logger.error(f"PDF file not found: {file_path}")
+        raise
+    except Exception as e:
+        logger.error(f"Error converting PDF to images: {e}", exc_info=True)
+        raise
+
+
 def clean_text_for_tts(text: str) -> str:
     """
     clean text for Text-to-Speech processing.
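
A tail-end usage sketch for `convert_pdf_to_images`, matching the signature above. The path is hypothetical, the printed length is illustrative, and `poppler-utils` must be installed (hence the Dockerfile change at the top of this diff):

```python
from application.utils import convert_pdf_to_images

# Plain filesystem path, no storage backend.
pages = convert_pdf_to_images("/tmp/report.pdf", max_pages=5)
for page in pages:
    # e.g. "page 1: image/png, 84210 base64 chars"
    print(f"page {page['page']}: {page['mime_type']}, {len(page['data'])} base64 chars")
```
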