From ea090288d3eec4ea8fbdcd32a6a497a99c89189d Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Mon, 17 Mar 2025 08:52:29 +0100
Subject: [PATCH] fix: avoid exploding options cache using lru and expose size
 parameter (#101)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---
 docling_serve/app.py                | 21 +++--------
 docling_serve/docling_conversion.py | 55 ++++++++++++++++-------------
 docling_serve/settings.py           |  1 +
 3 files changed, 36 insertions(+), 41 deletions(-)

diff --git a/docling_serve/app.py b/docling_serve/app.py
index 9fb535e..35c967f 100644
--- a/docling_serve/app.py
+++ b/docling_serve/app.py
@@ -20,8 +20,7 @@ from fastapi import (
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import RedirectResponse
 
-from docling.datamodel.base_models import DocumentStream, InputFormat
-from docling.document_converter import DocumentConverter
+from docling.datamodel.base_models import DocumentStream
 
 from docling_serve.datamodel.convert import ConvertDocumentsOptions
 from docling_serve.datamodel.requests import (
@@ -37,7 +36,7 @@ from docling_serve.datamodel.responses import (
 )
 from docling_serve.docling_conversion import (
     convert_documents,
-    converters,
+    get_converter,
     get_pdf_pipeline_opts,
 )
 from docling_serve.engines import get_orchestrator
@@ -86,15 +85,8 @@ _log = logging.getLogger(__name__)
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     # Converter with default options
-    pdf_format_option, options_hash = get_pdf_pipeline_opts(ConvertDocumentsOptions())
-    converters[options_hash] = DocumentConverter(
-        format_options={
-            InputFormat.PDF: pdf_format_option,
-            InputFormat.IMAGE: pdf_format_option,
-        }
-    )
-
-    converters[options_hash].initialize_pipeline(InputFormat.PDF)
+    pdf_format_option = get_pdf_pipeline_opts(ConvertDocumentsOptions())
+    get_converter(pdf_format_option)
 
     orchestrator = get_orchestrator()
 
@@ -110,11 +102,6 @@ async def lifespan(app: FastAPI):
     except asyncio.CancelledError:
         _log.info("Queue processor cancelled.")
 
-    converters.clear()
-
-    # if WITH_UI:
-    #     gradio_ui.close()
-
 
 ##################################
 # App creation and configuration #
diff --git a/docling_serve/docling_conversion.py b/docling_serve/docling_conversion.py
index 6faaf19..dbf4f68 100644
--- a/docling_serve/docling_conversion.py
+++ b/docling_serve/docling_conversion.py
@@ -2,6 +2,7 @@ import hashlib
 import json
 import logging
 from collections.abc import Iterable, Iterator
+from functools import lru_cache
 from pathlib import Path
 from typing import Any, Optional, Union
 
@@ -33,13 +34,9 @@ from docling_serve.settings import docling_serve_settings
 _log = logging.getLogger(__name__)
 
 
-# Document converters will be preloaded and stored in a dictionary
-converters: dict[bytes, DocumentConverter] = {}
-
-
 # Custom serializer for PdfFormatOption
 # (model_dump_json does not work with some classes)
-def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
+def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
     data = pdf_format_option.model_dump()
 
     # pipeline_options are not fully serialized by model_dump, dedicated pass
@@ -64,13 +61,36 @@ def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
         )
 
     # Serialize the dictionary to JSON with sorted keys to have consistent hashes
-    return json.dumps(data, sort_keys=True)
+    serialized_data = json.dumps(data, sort_keys=True)
+    options_hash = hashlib.sha1(serialized_data.encode()).digest()
+    return options_hash
+
+
+# Options keyed by their hash; the LRU-cached factory below looks them up by hash
+_options_map: dict[bytes, PdfFormatOption] = {}
+
+
+@lru_cache(maxsize=docling_serve_settings.options_cache_size)
+def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
+    pdf_format_option = _options_map[options_hash]
+    format_options: dict[InputFormat, FormatOption] = {
+        InputFormat.PDF: pdf_format_option,
+        InputFormat.IMAGE: pdf_format_option,
+    }
+
+    return DocumentConverter(format_options=format_options)
+
+
+def get_converter(pdf_format_option: PdfFormatOption) -> DocumentConverter:
+    options_hash = _hash_pdf_format_option(pdf_format_option)
+    _options_map[options_hash] = pdf_format_option
+    return _get_converter_from_hash(options_hash)
 
 
 # Computes the PDF pipeline options and returns the PdfFormatOption and its hash
 def get_pdf_pipeline_opts(  # noqa: C901
     request: ConvertDocumentsOptions,
-) -> tuple[PdfFormatOption, bytes]:
+) -> PdfFormatOption:
     if request.ocr_engine == OcrEngine.EASYOCR:
         try:
             import easyocr  # noqa: F401
@@ -172,11 +192,7 @@ def get_pdf_pipeline_opts(  # noqa: C901
         backend=backend,
     )
 
-    serialized_data = _serialize_pdf_format_option(pdf_format_option)
-
-    options_hash = hashlib.sha1(serialized_data.encode()).digest()
-
-    return pdf_format_option, options_hash
+    return pdf_format_option
 
 
 def convert_documents(
@@ -184,18 +200,9 @@ def convert_documents(
     options: ConvertDocumentsOptions,
     headers: Optional[dict[str, Any]] = None,
 ):
-    pdf_format_option, options_hash = get_pdf_pipeline_opts(options)
-
-    if options_hash not in converters:
-        format_options: dict[InputFormat, FormatOption] = {
-            InputFormat.PDF: pdf_format_option,
-            InputFormat.IMAGE: pdf_format_option,
-        }
-
-        converters[options_hash] = DocumentConverter(format_options=format_options)
-        _log.info(f"We now have {len(converters)} converters in memory.")
-
-    results: Iterator[ConversionResult] = converters[options_hash].convert_all(
+    pdf_format_option = get_pdf_pipeline_opts(options)
+    converter = get_converter(pdf_format_option)
+    results: Iterator[ConversionResult] = converter.convert_all(
         sources,
         headers=headers,
     )
diff --git a/docling_serve/settings.py b/docling_serve/settings.py
index f9e630e..f493e3e 100644
--- a/docling_serve/settings.py
+++ b/docling_serve/settings.py
@@ -30,6 +30,7 @@ class DoclingServeSettings(BaseSettings):
 
     enable_ui: bool = False
     artifacts_path: Optional[Path] = None
+    options_cache_size: int = 2
 
     eng_kind: AsyncEngine = AsyncEngine.LOCAL
     eng_loc_num_workers: int = 2