mirror of
https://github.com/docling-project/docling-serve.git
synced 2025-11-29 08:33:50 +00:00
fix: avoid exploding options cache using lru and expose size parameter (#101)
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -20,8 +20,7 @@ from fastapi import (
|
|||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from fastapi.responses import RedirectResponse
|
from fastapi.responses import RedirectResponse
|
||||||
|
|
||||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
from docling.datamodel.base_models import DocumentStream
|
||||||
from docling.document_converter import DocumentConverter
|
|
||||||
|
|
||||||
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
||||||
from docling_serve.datamodel.requests import (
|
from docling_serve.datamodel.requests import (
|
||||||
@@ -37,7 +36,7 @@ from docling_serve.datamodel.responses import (
|
|||||||
)
|
)
|
||||||
from docling_serve.docling_conversion import (
|
from docling_serve.docling_conversion import (
|
||||||
convert_documents,
|
convert_documents,
|
||||||
converters,
|
get_converter,
|
||||||
get_pdf_pipeline_opts,
|
get_pdf_pipeline_opts,
|
||||||
)
|
)
|
||||||
from docling_serve.engines import get_orchestrator
|
from docling_serve.engines import get_orchestrator
|
||||||
@@ -86,15 +85,8 @@ _log = logging.getLogger(__name__)
|
|||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app: FastAPI):
|
async def lifespan(app: FastAPI):
|
||||||
# Converter with default options
|
# Converter with default options
|
||||||
pdf_format_option, options_hash = get_pdf_pipeline_opts(ConvertDocumentsOptions())
|
pdf_format_option = get_pdf_pipeline_opts(ConvertDocumentsOptions())
|
||||||
converters[options_hash] = DocumentConverter(
|
get_converter(pdf_format_option)
|
||||||
format_options={
|
|
||||||
InputFormat.PDF: pdf_format_option,
|
|
||||||
InputFormat.IMAGE: pdf_format_option,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
converters[options_hash].initialize_pipeline(InputFormat.PDF)
|
|
||||||
|
|
||||||
orchestrator = get_orchestrator()
|
orchestrator = get_orchestrator()
|
||||||
|
|
||||||
@@ -110,11 +102,6 @@ async def lifespan(app: FastAPI):
|
|||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
_log.info("Queue processor cancelled.")
|
_log.info("Queue processor cancelled.")
|
||||||
|
|
||||||
converters.clear()
|
|
||||||
|
|
||||||
# if WITH_UI:
|
|
||||||
# gradio_ui.close()
|
|
||||||
|
|
||||||
|
|
||||||
##################################
|
##################################
|
||||||
# App creation and configuration #
|
# App creation and configuration #
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import hashlib
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from collections.abc import Iterable, Iterator
|
from collections.abc import Iterable, Iterator
|
||||||
|
from functools import lru_cache
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Optional, Union
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
@@ -33,13 +34,9 @@ from docling_serve.settings import docling_serve_settings
|
|||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Document converters will be preloaded and stored in a dictionary
|
|
||||||
converters: dict[bytes, DocumentConverter] = {}
|
|
||||||
|
|
||||||
|
|
||||||
# Custom serializer for PdfFormatOption
|
# Custom serializer for PdfFormatOption
|
||||||
# (model_dump_json does not work with some classes)
|
# (model_dump_json does not work with some classes)
|
||||||
def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
|
def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
|
||||||
data = pdf_format_option.model_dump()
|
data = pdf_format_option.model_dump()
|
||||||
|
|
||||||
# pipeline_options are not fully serialized by model_dump, dedicated pass
|
# pipeline_options are not fully serialized by model_dump, dedicated pass
|
||||||
@@ -64,13 +61,36 @@ def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Serialize the dictionary to JSON with sorted keys to have consistent hashes
|
# Serialize the dictionary to JSON with sorted keys to have consistent hashes
|
||||||
return json.dumps(data, sort_keys=True)
|
serialized_data = json.dumps(data, sort_keys=True)
|
||||||
|
options_hash = hashlib.sha1(serialized_data.encode()).digest()
|
||||||
|
return options_hash
|
||||||
|
|
||||||
|
|
||||||
|
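# Cache of DocumentConverter objects (LRU-bounded, keyed by the options hash).
# _options_map lets the cached factory look up the PdfFormatOption for a hash.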
|
||||||
|
_options_map: dict[bytes, PdfFormatOption] = {}
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=docling_serve_settings.options_cache_size)
|
||||||
|
def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
|
||||||
|
pdf_format_option = _options_map[options_hash]
|
||||||
|
format_options: dict[InputFormat, FormatOption] = {
|
||||||
|
InputFormat.PDF: pdf_format_option,
|
||||||
|
InputFormat.IMAGE: pdf_format_option,
|
||||||
|
}
|
||||||
|
|
||||||
|
return DocumentConverter(format_options=format_options)
|
||||||
|
|
||||||
|
|
||||||
|
def get_converter(pdf_format_option: PdfFormatOption) -> DocumentConverter:
|
||||||
|
options_hash = _hash_pdf_format_option(pdf_format_option)
|
||||||
|
_options_map[options_hash] = pdf_format_option
|
||||||
|
return _get_converter_from_hash(options_hash)
|
||||||
|
|
||||||
|
|
||||||
# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
|
# Computes the PDF pipeline options and returns the PdfFormatOption
|
||||||
def get_pdf_pipeline_opts( # noqa: C901
|
def get_pdf_pipeline_opts( # noqa: C901
|
||||||
request: ConvertDocumentsOptions,
|
request: ConvertDocumentsOptions,
|
||||||
) -> tuple[PdfFormatOption, bytes]:
|
) -> PdfFormatOption:
|
||||||
if request.ocr_engine == OcrEngine.EASYOCR:
|
if request.ocr_engine == OcrEngine.EASYOCR:
|
||||||
try:
|
try:
|
||||||
import easyocr # noqa: F401
|
import easyocr # noqa: F401
|
||||||
@@ -172,11 +192,7 @@ def get_pdf_pipeline_opts( # noqa: C901
|
|||||||
backend=backend,
|
backend=backend,
|
||||||
)
|
)
|
||||||
|
|
||||||
serialized_data = _serialize_pdf_format_option(pdf_format_option)
|
return pdf_format_option
|
||||||
|
|
||||||
options_hash = hashlib.sha1(serialized_data.encode()).digest()
|
|
||||||
|
|
||||||
return pdf_format_option, options_hash
|
|
||||||
|
|
||||||
|
|
||||||
def convert_documents(
|
def convert_documents(
|
||||||
@@ -184,18 +200,9 @@ def convert_documents(
|
|||||||
options: ConvertDocumentsOptions,
|
options: ConvertDocumentsOptions,
|
||||||
headers: Optional[dict[str, Any]] = None,
|
headers: Optional[dict[str, Any]] = None,
|
||||||
):
|
):
|
||||||
pdf_format_option, options_hash = get_pdf_pipeline_opts(options)
|
pdf_format_option = get_pdf_pipeline_opts(options)
|
||||||
|
converter = get_converter(pdf_format_option)
|
||||||
if options_hash not in converters:
|
results: Iterator[ConversionResult] = converter.convert_all(
|
||||||
format_options: dict[InputFormat, FormatOption] = {
|
|
||||||
InputFormat.PDF: pdf_format_option,
|
|
||||||
InputFormat.IMAGE: pdf_format_option,
|
|
||||||
}
|
|
||||||
|
|
||||||
converters[options_hash] = DocumentConverter(format_options=format_options)
|
|
||||||
_log.info(f"We now have {len(converters)} converters in memory.")
|
|
||||||
|
|
||||||
results: Iterator[ConversionResult] = converters[options_hash].convert_all(
|
|
||||||
sources,
|
sources,
|
||||||
headers=headers,
|
headers=headers,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ class DoclingServeSettings(BaseSettings):
|
|||||||
|
|
||||||
enable_ui: bool = False
|
enable_ui: bool = False
|
||||||
artifacts_path: Optional[Path] = None
|
artifacts_path: Optional[Path] = None
|
||||||
|
options_cache_size: int = 2
|
||||||
|
|
||||||
eng_kind: AsyncEngine = AsyncEngine.LOCAL
|
eng_kind: AsyncEngine = AsyncEngine.LOCAL
|
||||||
eng_loc_num_workers: int = 2
|
eng_loc_num_workers: int = 2
|
||||||
|
|||||||
Reference in New Issue
Block a user