mirror of
https://github.com/docling-project/docling-serve.git
synced 2025-11-29 08:33:50 +00:00
fix: avoid exploding options cache using lru and expose size parameter (#101)
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -20,8 +20,7 @@ from fastapi import (
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import RedirectResponse
|
||||
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.datamodel.base_models import DocumentStream
|
||||
|
||||
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
||||
from docling_serve.datamodel.requests import (
|
||||
@@ -37,7 +36,7 @@ from docling_serve.datamodel.responses import (
|
||||
)
|
||||
from docling_serve.docling_conversion import (
|
||||
convert_documents,
|
||||
converters,
|
||||
get_converter,
|
||||
get_pdf_pipeline_opts,
|
||||
)
|
||||
from docling_serve.engines import get_orchestrator
|
||||
@@ -86,15 +85,8 @@ _log = logging.getLogger(__name__)
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
# Converter with default options
|
||||
pdf_format_option, options_hash = get_pdf_pipeline_opts(ConvertDocumentsOptions())
|
||||
converters[options_hash] = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: pdf_format_option,
|
||||
InputFormat.IMAGE: pdf_format_option,
|
||||
}
|
||||
)
|
||||
|
||||
converters[options_hash].initialize_pipeline(InputFormat.PDF)
|
||||
pdf_format_option = get_pdf_pipeline_opts(ConvertDocumentsOptions())
|
||||
get_converter(pdf_format_option)
|
||||
|
||||
orchestrator = get_orchestrator()
|
||||
|
||||
@@ -110,11 +102,6 @@ async def lifespan(app: FastAPI):
|
||||
except asyncio.CancelledError:
|
||||
_log.info("Queue processor cancelled.")
|
||||
|
||||
converters.clear()
|
||||
|
||||
# if WITH_UI:
|
||||
# gradio_ui.close()
|
||||
|
||||
|
||||
##################################
|
||||
# App creation and configuration #
|
||||
|
||||
@@ -2,6 +2,7 @@ import hashlib
|
||||
import json
|
||||
import logging
|
||||
from collections.abc import Iterable, Iterator
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
@@ -33,13 +34,9 @@ from docling_serve.settings import docling_serve_settings
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Document converters will be preloaded and stored in a dictionary
|
||||
converters: dict[bytes, DocumentConverter] = {}
|
||||
|
||||
|
||||
# Custom serializer for PdfFormatOption
|
||||
# (model_dump_json does not work with some classes)
|
||||
def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
|
||||
def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
|
||||
data = pdf_format_option.model_dump()
|
||||
|
||||
# pipeline_options are not fully serialized by model_dump, dedicated pass
|
||||
@@ -64,13 +61,36 @@ def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
|
||||
)
|
||||
|
||||
# Serialize the dictionary to JSON with sorted keys to have consistent hashes
|
||||
return json.dumps(data, sort_keys=True)
|
||||
serialized_data = json.dumps(data, sort_keys=True)
|
||||
options_hash = hashlib.sha1(serialized_data.encode()).digest()
|
||||
return options_hash
|
||||
|
||||
|
||||
# Cache of DocumentConverter objects
|
||||
_options_map: dict[bytes, PdfFormatOption] = {}
|
||||
|
||||
|
||||
@lru_cache(maxsize=docling_serve_settings.options_cache_size)
|
||||
def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
|
||||
pdf_format_option = _options_map[options_hash]
|
||||
format_options: dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: pdf_format_option,
|
||||
InputFormat.IMAGE: pdf_format_option,
|
||||
}
|
||||
|
||||
return DocumentConverter(format_options=format_options)
|
||||
|
||||
|
||||
def get_converter(pdf_format_option: PdfFormatOption) -> DocumentConverter:
|
||||
options_hash = _hash_pdf_format_option(pdf_format_option)
|
||||
_options_map[options_hash] = pdf_format_option
|
||||
return _get_converter_from_hash(options_hash)
|
||||
|
||||
|
||||
# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
|
||||
def get_pdf_pipeline_opts( # noqa: C901
|
||||
request: ConvertDocumentsOptions,
|
||||
) -> tuple[PdfFormatOption, bytes]:
|
||||
) -> PdfFormatOption:
|
||||
if request.ocr_engine == OcrEngine.EASYOCR:
|
||||
try:
|
||||
import easyocr # noqa: F401
|
||||
@@ -172,11 +192,7 @@ def get_pdf_pipeline_opts( # noqa: C901
|
||||
backend=backend,
|
||||
)
|
||||
|
||||
serialized_data = _serialize_pdf_format_option(pdf_format_option)
|
||||
|
||||
options_hash = hashlib.sha1(serialized_data.encode()).digest()
|
||||
|
||||
return pdf_format_option, options_hash
|
||||
return pdf_format_option
|
||||
|
||||
|
||||
def convert_documents(
|
||||
@@ -184,18 +200,9 @@ def convert_documents(
|
||||
options: ConvertDocumentsOptions,
|
||||
headers: Optional[dict[str, Any]] = None,
|
||||
):
|
||||
pdf_format_option, options_hash = get_pdf_pipeline_opts(options)
|
||||
|
||||
if options_hash not in converters:
|
||||
format_options: dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: pdf_format_option,
|
||||
InputFormat.IMAGE: pdf_format_option,
|
||||
}
|
||||
|
||||
converters[options_hash] = DocumentConverter(format_options=format_options)
|
||||
_log.info(f"We now have {len(converters)} converters in memory.")
|
||||
|
||||
results: Iterator[ConversionResult] = converters[options_hash].convert_all(
|
||||
pdf_format_option = get_pdf_pipeline_opts(options)
|
||||
converter = get_converter(pdf_format_option)
|
||||
results: Iterator[ConversionResult] = converter.convert_all(
|
||||
sources,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
@@ -30,6 +30,7 @@ class DoclingServeSettings(BaseSettings):
|
||||
|
||||
enable_ui: bool = False
|
||||
artifacts_path: Optional[Path] = None
|
||||
options_cache_size: int = 2
|
||||
|
||||
eng_kind: AsyncEngine = AsyncEngine.LOCAL
|
||||
eng_loc_num_workers: int = 2
|
||||
|
||||
Reference in New Issue
Block a user