feat: process PDFs synthetically if the model does not support files natively (#2263)

* feat: process PDFs synthetically if the model does not support files natively

* fix: small code optimisations
This commit is contained in:
Alex
2026-01-15 02:30:33 +02:00
committed by GitHub
parent 2c55c6cd9a
commit f61d112cea
13 changed files with 449 additions and 72 deletions

View File

@@ -1,7 +1,11 @@
import base64
import hashlib
import io
import logging
import os
import re
import uuid
from typing import List
import tiktoken
from flask import jsonify, make_response
@@ -11,6 +15,8 @@ from application.core.model_utils import get_token_limit
from application.core.settings import settings
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# NOTE(review): presumably a lazily-initialised cache for the tiktoken encoding;
# the code that reads/writes it is outside this view - confirm.
_encoding = None
@@ -215,6 +221,93 @@ def calculate_compression_threshold(
return threshold
def convert_pdf_to_images(
    file_path: str,
    storage=None,
    max_pages: int = 20,
    dpi: int = 150,
    image_format: str = "PNG",
) -> List[dict]:
    """
    Convert PDF pages to base64-encoded images.

    This enables "synthetic PDF support": each PDF page is rendered to an
    image that can be sent to vision-capable LLMs that accept images but
    not PDF files.

    Args:
        file_path: Path to the PDF file (can be a storage path).
        storage: Optional storage instance. When it exposes ``get_file``,
            the PDF bytes are read through it; otherwise ``file_path`` is
            handed to pdf2image directly.
        max_pages: Number of the last page to render, counting from page 1
            (default 20 to avoid context overflow).
        dpi: Rendering resolution (default 150 for balance of quality/size).
        image_format: Output format name, case-insensitive (PNG recommended
            for quality). The common aliases "JPG" and "TIF" are accepted
            and treated as "JPEG" and "TIFF".

    Returns:
        List of dicts, one per rendered page, with keys:
            - 'data': base64-encoded image data
            - 'mime_type': MIME type (e.g., 'image/png')
            - 'page': Page number (1-indexed)

    Raises:
        ImportError: If pdf2image is not installed.
        FileNotFoundError: Logged and re-raised if raised during conversion.
        Exception: Any other conversion failure is logged and re-raised.
    """
    try:
        from pdf2image import convert_from_bytes, convert_from_path
    except ImportError as err:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "pdf2image is required for PDF-to-image conversion. "
            "Install it with: pip install pdf2image\n"
            "Also ensure poppler-utils is installed on your system."
        ) from err

    # Pillow registers its encoders as "JPEG"/"TIFF", and the registered MIME
    # types are image/jpeg and image/tiff. Normalise the common short aliases
    # so "JPG" neither fails in Image.save nor yields the bogus "image/jpg".
    format_aliases = {"JPG": "JPEG", "TIF": "TIFF"}
    requested_format = image_format.upper()
    pil_format = format_aliases.get(requested_format, requested_format)
    render_format = pil_format.lower()
    mime_type = f"image/{render_format}"

    try:
        # Get PDF content either from storage or direct file path
        if storage and hasattr(storage, "get_file"):
            # Read everything first so the storage handle is released
            # before the (comparatively slow) rendering starts.
            with storage.get_file(file_path) as pdf_file:
                pdf_bytes = pdf_file.read()
            pil_images = convert_from_bytes(
                pdf_bytes,
                dpi=dpi,
                fmt=render_format,
                first_page=1,
                last_page=max_pages,
            )
        else:
            pil_images = convert_from_path(
                file_path,
                dpi=dpi,
                fmt=render_format,
                first_page=1,
                last_page=max_pages,
            )

        images_data = []
        for page_num, pil_image in enumerate(pil_images, start=1):
            # Serialise the PIL image in memory, then base64-encode it.
            buffer = io.BytesIO()
            pil_image.save(buffer, format=pil_format)
            images_data.append(
                {
                    "data": base64.b64encode(buffer.getvalue()).decode("utf-8"),
                    "mime_type": mime_type,
                    "page": page_num,
                }
            )
        return images_data

    except FileNotFoundError:
        logger.error("PDF file not found: %s", file_path)
        raise
    except Exception as e:
        # logger.exception records the traceback, like exc_info=True did.
        logger.exception("Error converting PDF to images: %s", e)
        raise
def clean_text_for_tts(text: str) -> str:
"""
clean text for Text-to-Speech processing.