fix: avoid exploding options cache using lru and expose size parameter (#101)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-03-17 08:52:29 +01:00
committed by GitHub
parent 07c48edd5d
commit ea090288d3
3 changed files with 36 additions and 41 deletions

View File

@@ -20,8 +20,7 @@ from fastapi import (
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import RedirectResponse
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import DocumentStream
from docling_serve.datamodel.convert import ConvertDocumentsOptions
from docling_serve.datamodel.requests import (
@@ -37,7 +36,7 @@ from docling_serve.datamodel.responses import (
)
from docling_serve.docling_conversion import (
convert_documents,
converters,
get_converter,
get_pdf_pipeline_opts,
)
from docling_serve.engines import get_orchestrator
@@ -86,15 +85,8 @@ _log = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI):
# Converter with default options
pdf_format_option, options_hash = get_pdf_pipeline_opts(ConvertDocumentsOptions())
converters[options_hash] = DocumentConverter(
format_options={
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
}
)
converters[options_hash].initialize_pipeline(InputFormat.PDF)
pdf_format_option = get_pdf_pipeline_opts(ConvertDocumentsOptions())
get_converter(pdf_format_option)
orchestrator = get_orchestrator()
@@ -110,11 +102,6 @@ async def lifespan(app: FastAPI):
except asyncio.CancelledError:
_log.info("Queue processor cancelled.")
converters.clear()
# if WITH_UI:
# gradio_ui.close()
##################################
# App creation and configuration #

View File

@@ -2,6 +2,7 @@ import hashlib
import json
import logging
from collections.abc import Iterable, Iterator
from functools import lru_cache
from pathlib import Path
from typing import Any, Optional, Union
@@ -33,13 +34,9 @@ from docling_serve.settings import docling_serve_settings
_log = logging.getLogger(__name__)
# Document converters will be preloaded and stored in a dictionary
converters: dict[bytes, DocumentConverter] = {}
# Custom serializer for PdfFormatOption
# (model_dump_json does not work with some classes)
def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
data = pdf_format_option.model_dump()
# pipeline_options are not fully serialized by model_dump, dedicated pass
@@ -64,13 +61,36 @@ def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
)
# Serialize the dictionary to JSON with sorted keys to have consistent hashes
return json.dumps(data, sort_keys=True)
serialized_data = json.dumps(data, sort_keys=True)
options_hash = hashlib.sha1(serialized_data.encode()).digest()
return options_hash
# Cache of DocumentConverter objects
# _options_map is the side table for that cache: get_converter stores each
# PdfFormatOption here under its hash, and _get_converter_from_hash (which is
# memoized on the hash bytes only) reads it back to build the converter.
# NOTE(review): nothing in the code visible here removes entries from this
# dict, so it is not bounded by options_cache_size -- confirm this is intended.
_options_map: dict[bytes, PdfFormatOption] = {}
@lru_cache(maxsize=docling_serve_settings.options_cache_size)
def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
    """Build the DocumentConverter for the options registered under a hash.

    The result is memoized by ``lru_cache``; the number of converters kept is
    limited by ``docling_serve_settings.options_cache_size``.

    Args:
        options_hash: Key under which the PdfFormatOption was stored in
            ``_options_map``.

    Returns:
        A DocumentConverter that uses the same option for PDF and IMAGE inputs.

    Raises:
        KeyError: If ``options_hash`` is not present in ``_options_map``.
    """
    shared_option = _options_map[options_hash]
    # PDF and IMAGE inputs are both handled with the very same option object.
    per_format: dict[InputFormat, FormatOption] = {
        input_format: shared_option
        for input_format in (InputFormat.PDF, InputFormat.IMAGE)
    }
    return DocumentConverter(format_options=per_format)
def get_converter(pdf_format_option: PdfFormatOption) -> DocumentConverter:
    """Return a DocumentConverter for the given PDF format option.

    The option is hashed and registered in ``_options_map`` so that the
    memoized ``_get_converter_from_hash`` can look it up by that hash.

    Args:
        pdf_format_option: Options the converter should be built with.

    Returns:
        The DocumentConverter returned by ``_get_converter_from_hash`` for
        the option's hash.
    """
    cache_key = _hash_pdf_format_option(pdf_format_option)
    # Register before the lookup: on a cache miss the factory reads this entry.
    # NOTE(review): the entry is never removed here, so _options_map keeps one
    # PdfFormatOption per distinct hash ever requested -- confirm acceptable.
    _options_map[cache_key] = pdf_format_option
    return _get_converter_from_hash(cache_key)
# Computes the PDF pipeline options and returns the resulting PdfFormatOption
def get_pdf_pipeline_opts( # noqa: C901
request: ConvertDocumentsOptions,
) -> tuple[PdfFormatOption, bytes]:
) -> PdfFormatOption:
if request.ocr_engine == OcrEngine.EASYOCR:
try:
import easyocr # noqa: F401
@@ -172,11 +192,7 @@ def get_pdf_pipeline_opts( # noqa: C901
backend=backend,
)
serialized_data = _serialize_pdf_format_option(pdf_format_option)
options_hash = hashlib.sha1(serialized_data.encode()).digest()
return pdf_format_option, options_hash
return pdf_format_option
def convert_documents(
@@ -184,18 +200,9 @@ def convert_documents(
options: ConvertDocumentsOptions,
headers: Optional[dict[str, Any]] = None,
):
pdf_format_option, options_hash = get_pdf_pipeline_opts(options)
if options_hash not in converters:
format_options: dict[InputFormat, FormatOption] = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
}
converters[options_hash] = DocumentConverter(format_options=format_options)
_log.info(f"We now have {len(converters)} converters in memory.")
results: Iterator[ConversionResult] = converters[options_hash].convert_all(
pdf_format_option = get_pdf_pipeline_opts(options)
converter = get_converter(pdf_format_option)
results: Iterator[ConversionResult] = converter.convert_all(
sources,
headers=headers,
)

View File

@@ -30,6 +30,7 @@ class DoclingServeSettings(BaseSettings):
enable_ui: bool = False
artifacts_path: Optional[Path] = None
options_cache_size: int = 2
eng_kind: AsyncEngine = AsyncEngine.LOCAL
eng_loc_num_workers: int = 2