fix: avoid exploding options cache using lru and expose size parameter (#101)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-11-29 08:33:50 +00:00 · 2025-03-17 08:52:29 +01:00
parent 07c48edd5d
commit ea090288d3
3 changed files with 36 additions and 41 deletions
--- a/docling_serve/app.py
+++ b/docling_serve/app.py
@@ -20,8 +20,7 @@ from fastapi import (
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import RedirectResponse

-from docling.datamodel.base_models import DocumentStream, InputFormat
-from docling.document_converter import DocumentConverter
+from docling.datamodel.base_models import DocumentStream

 from docling_serve.datamodel.convert import ConvertDocumentsOptions
 from docling_serve.datamodel.requests import (
@@ -37,7 +36,7 @@ from docling_serve.datamodel.responses import (
 )
 from docling_serve.docling_conversion import (
    convert_documents,
-    converters,
+    get_converter,
    get_pdf_pipeline_opts,
 )
 from docling_serve.engines import get_orchestrator
@@ -86,15 +85,8 @@ _log = logging.getLogger(__name__)
@asynccontextmanager
 async def lifespan(app: FastAPI):
    # Converter with default options
-    pdf_format_option, options_hash = get_pdf_pipeline_opts(ConvertDocumentsOptions())
-    converters[options_hash] = DocumentConverter(
-        format_options={
-            InputFormat.PDF: pdf_format_option,
-            InputFormat.IMAGE: pdf_format_option,
-        }
-    )
-
-    converters[options_hash].initialize_pipeline(InputFormat.PDF)
+    pdf_format_option = get_pdf_pipeline_opts(ConvertDocumentsOptions())
+    get_converter(pdf_format_option)

    orchestrator = get_orchestrator()

@@ -110,11 +102,6 @@ async def lifespan(app: FastAPI):
    except asyncio.CancelledError:
        _log.info("Queue processor cancelled.")

-    converters.clear()
-
-    # if WITH_UI:
-    #     gradio_ui.close()
-

 ##################################
 # App creation and configuration #
--- a/docling_serve/docling_conversion.py
+++ b/docling_serve/docling_conversion.py
@@ -2,6 +2,7 @@ import hashlib
 import json
 import logging
 from collections.abc import Iterable, Iterator
+from functools import lru_cache
 from pathlib import Path
 from typing import Any, Optional, Union

@@ -33,13 +34,9 @@ from docling_serve.settings import docling_serve_settings
 _log = logging.getLogger(__name__)


-# Document converters will be preloaded and stored in a dictionary
-converters: dict[bytes, DocumentConverter] = {}
-
-
 # Custom serializer for PdfFormatOption
 # (model_dump_json does not work with some classes)
-def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
+def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
    data = pdf_format_option.model_dump()

    # pipeline_options are not fully serialized by model_dump, dedicated pass
@@ -64,13 +61,36 @@ def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
        )

    # Serialize the dictionary to JSON with sorted keys to have consistent hashes
-    return json.dumps(data, sort_keys=True)
+    serialized_data = json.dumps(data, sort_keys=True)
+    options_hash = hashlib.sha1(serialized_data.encode()).digest()
+    return options_hash
+
+
+# Lookup of PdfFormatOption by options hash (the DocumentConverter cache is the lru_cache below)
+_options_map: dict[bytes, PdfFormatOption] = {}
+
+
+@lru_cache(maxsize=docling_serve_settings.options_cache_size)
+def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
+    pdf_format_option = _options_map[options_hash]
+    format_options: dict[InputFormat, FormatOption] = {
+        InputFormat.PDF: pdf_format_option,
+        InputFormat.IMAGE: pdf_format_option,
+    }
+
+    return DocumentConverter(format_options=format_options)
+
+
+def get_converter(pdf_format_option: PdfFormatOption) -> DocumentConverter:
+    options_hash = _hash_pdf_format_option(pdf_format_option)
+    _options_map[options_hash] = pdf_format_option
+    return _get_converter_from_hash(options_hash)


 # Computes the PDF pipeline options and returns the PdfFormatOption and its hash
 def get_pdf_pipeline_opts(  # noqa: C901
    request: ConvertDocumentsOptions,
-) -> tuple[PdfFormatOption, bytes]:
+) -> PdfFormatOption:
    if request.ocr_engine == OcrEngine.EASYOCR:
        try:
            import easyocr  # noqa: F401
@@ -172,11 +192,7 @@ def get_pdf_pipeline_opts(  # noqa: C901
        backend=backend,
    )

-    serialized_data = _serialize_pdf_format_option(pdf_format_option)
-
-    options_hash = hashlib.sha1(serialized_data.encode()).digest()
-
-    return pdf_format_option, options_hash
+    return pdf_format_option


 def convert_documents(
@@ -184,18 +200,9 @@ def convert_documents(
    options: ConvertDocumentsOptions,
    headers: Optional[dict[str, Any]] = None,
 ):
-    pdf_format_option, options_hash = get_pdf_pipeline_opts(options)
-
-    if options_hash not in converters:
-        format_options: dict[InputFormat, FormatOption] = {
-            InputFormat.PDF: pdf_format_option,
-            InputFormat.IMAGE: pdf_format_option,
-        }
-
-        converters[options_hash] = DocumentConverter(format_options=format_options)
-        _log.info(f"We now have {len(converters)} converters in memory.")
-
-    results: Iterator[ConversionResult] = converters[options_hash].convert_all(
+    pdf_format_option = get_pdf_pipeline_opts(options)
+    converter = get_converter(pdf_format_option)
+    results: Iterator[ConversionResult] = converter.convert_all(
        sources,
        headers=headers,
    )
--- a/docling_serve/settings.py
+++ b/docling_serve/settings.py
@@ -30,6 +30,7 @@ class DoclingServeSettings(BaseSettings):

    enable_ui: bool = False
    artifacts_path: Optional[Path] = None
+    options_cache_size: int = 2

    eng_kind: AsyncEngine = AsyncEngine.LOCAL
    eng_loc_num_workers: int = 2