From ea090288d3eec4ea8fbdcd32a6a497a99c89189d Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Mon, 17 Mar 2025 08:52:29 +0100 Subject: [PATCH] fix: avoid exploding options cache using lru and expose size parameter (#101) Signed-off-by: Michele Dolfi --- docling_serve/app.py | 21 +++-------- docling_serve/docling_conversion.py | 55 ++++++++++++++++------------- docling_serve/settings.py | 1 + 3 files changed, 36 insertions(+), 41 deletions(-) diff --git a/docling_serve/app.py b/docling_serve/app.py index 9fb535e..35c967f 100644 --- a/docling_serve/app.py +++ b/docling_serve/app.py @@ -20,8 +20,7 @@ from fastapi import ( from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import RedirectResponse -from docling.datamodel.base_models import DocumentStream, InputFormat -from docling.document_converter import DocumentConverter +from docling.datamodel.base_models import DocumentStream from docling_serve.datamodel.convert import ConvertDocumentsOptions from docling_serve.datamodel.requests import ( @@ -37,7 +36,7 @@ from docling_serve.datamodel.responses import ( ) from docling_serve.docling_conversion import ( convert_documents, - converters, + get_converter, get_pdf_pipeline_opts, ) from docling_serve.engines import get_orchestrator @@ -86,15 +85,8 @@ _log = logging.getLogger(__name__) @asynccontextmanager async def lifespan(app: FastAPI): # Converter with default options - pdf_format_option, options_hash = get_pdf_pipeline_opts(ConvertDocumentsOptions()) - converters[options_hash] = DocumentConverter( - format_options={ - InputFormat.PDF: pdf_format_option, - InputFormat.IMAGE: pdf_format_option, - } - ) - - converters[options_hash].initialize_pipeline(InputFormat.PDF) + pdf_format_option = get_pdf_pipeline_opts(ConvertDocumentsOptions()) + get_converter(pdf_format_option) orchestrator = get_orchestrator() @@ -110,11 +102,6 @@ async def lifespan(app: FastAPI): except asyncio.CancelledError: _log.info("Queue processor cancelled.") - converters.clear() - - # if WITH_UI: - # gradio_ui.close() - ################################## # App creation and configuration # diff --git a/docling_serve/docling_conversion.py b/docling_serve/docling_conversion.py index 6faaf19..dbf4f68 100644 --- a/docling_serve/docling_conversion.py +++ b/docling_serve/docling_conversion.py @@ -2,6 +2,7 @@ import hashlib import json import logging from collections.abc import Iterable, Iterator +from functools import lru_cache from pathlib import Path from typing import Any, Optional, Union @@ -33,13 +34,9 @@ from docling_serve.settings import docling_serve_settings _log = logging.getLogger(__name__) -# Document converters will be preloaded and stored in a dictionary -converters: dict[bytes, DocumentConverter] = {} - - # Custom serializer for PdfFormatOption # (model_dump_json does not work with some classes) -def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str: +def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes: data = pdf_format_option.model_dump() # pipeline_options are not fully serialized by model_dump, dedicated pass @@ -64,13 +61,36 @@ def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str: ) # Serialize the dictionary to JSON with sorted keys to have consistent hashes - return json.dumps(data, sort_keys=True) + serialized_data = json.dumps(data, sort_keys=True) + options_hash = hashlib.sha1(serialized_data.encode()).digest() + return options_hash + + +# Cache of DocumentConverter objects +_options_map: dict[bytes, PdfFormatOption] = {} + + +@lru_cache(maxsize=docling_serve_settings.options_cache_size) +def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter: + pdf_format_option = _options_map[options_hash] + format_options: dict[InputFormat, FormatOption] = { + InputFormat.PDF: pdf_format_option, + InputFormat.IMAGE: pdf_format_option, + } + + return DocumentConverter(format_options=format_options) + + +def get_converter(pdf_format_option: PdfFormatOption) -> DocumentConverter: + options_hash = _hash_pdf_format_option(pdf_format_option) + _options_map[options_hash] = pdf_format_option + return _get_converter_from_hash(options_hash) # Computes the PDF pipeline options and returns the PdfFormatOption and its hash def get_pdf_pipeline_opts( # noqa: C901 request: ConvertDocumentsOptions, -) -> tuple[PdfFormatOption, bytes]: +) -> PdfFormatOption: if request.ocr_engine == OcrEngine.EASYOCR: try: import easyocr # noqa: F401 @@ -172,11 +192,7 @@ def get_pdf_pipeline_opts( # noqa: C901 backend=backend, ) - serialized_data = _serialize_pdf_format_option(pdf_format_option) - - options_hash = hashlib.sha1(serialized_data.encode()).digest() - - return pdf_format_option, options_hash + return pdf_format_option def convert_documents( @@ -184,18 +200,9 @@ def convert_documents( options: ConvertDocumentsOptions, headers: Optional[dict[str, Any]] = None, ): - pdf_format_option, options_hash = get_pdf_pipeline_opts(options) - - if options_hash not in converters: - format_options: dict[InputFormat, FormatOption] = { - InputFormat.PDF: pdf_format_option, - InputFormat.IMAGE: pdf_format_option, - } - - converters[options_hash] = DocumentConverter(format_options=format_options) - _log.info(f"We now have {len(converters)} converters in memory.") - - results: Iterator[ConversionResult] = converters[options_hash].convert_all( + pdf_format_option = get_pdf_pipeline_opts(options) + converter = get_converter(pdf_format_option) + results: Iterator[ConversionResult] = converter.convert_all( sources, headers=headers, ) diff --git a/docling_serve/settings.py b/docling_serve/settings.py index f9e630e..f493e3e 100644 --- a/docling_serve/settings.py +++ b/docling_serve/settings.py @@ -30,6 +30,7 @@ class DoclingServeSettings(BaseSettings): enable_ui: bool = False artifacts_path: Optional[Path] = None + options_cache_size: int = 2 eng_kind: AsyncEngine = AsyncEngine.LOCAL eng_loc_num_workers: int = 2