docling-serve/docling_serve/docling_conversion.py

import hashlib
import json
import logging
import sys
from collections.abc import Iterable, Iterator
from functools import lru_cache
from pathlib import Path
from typing import Any, Optional, Union

from fastapi import HTTPException

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
    OcrOptions,
    PdfBackend,
    PdfPipeline,
    PdfPipelineOptions,
    PictureDescriptionApiOptions,
    PictureDescriptionVlmOptions,
    TableFormerMode,
    VlmPipelineOptions,
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
)
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
from docling_core.types.doc import ImageRefMode

from docling_serve.datamodel.convert import ConvertDocumentsOptions, ocr_factory
from docling_serve.helper_functions import _to_list_of_strings
from docling_serve.settings import docling_serve_settings

# Module-level logger, named after this module.
_log = logging.getLogger(__name__)


# Custom serializer for PdfFormatOption
# (model_dump_json does not work with some classes)
def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
    """Return a SHA-1 digest computed from the given PDF format option.

    The option is dumped to a dict, its ``pipeline_options``, ``pipeline_cls``
    and ``backend`` entries are rewritten into JSON-friendly values, and the
    key-sorted JSON text of that dict is hashed.

    Args:
        pdf_format_option: The format option to fingerprint.

    Returns:
        The raw (binary) SHA-1 digest of the serialized option.
    """
    payload = pdf_format_option.model_dump(serialize_as_any=True)

    # The generic dump does not fully serialize pipeline_options, so that
    # sub-model gets its own JSON-mode dump which replaces the entry.
    pipeline_options = pdf_format_option.pipeline_options
    if pipeline_options:
        payload["pipeline_options"] = pipeline_options.model_dump(
            serialize_as_any=True, mode="json"
        )

    # `pipeline_cls` and `backend` are replaced by their repr() strings
    # before JSON encoding.
    for field_name in ("pipeline_cls", "backend"):
        payload[field_name] = repr(payload[field_name])

    # Sorting the keys keeps the JSON text, and therefore the hash, consistent.
    canonical_json = json.dumps(payload, sort_keys=True)
    return hashlib.sha1(canonical_json.encode(), usedforsecurity=False).digest()


# Lookup table from an options hash to the PdfFormatOption it was computed
# from. The DocumentConverter objects themselves are cached by the lru_cache
# on _get_converter_from_hash; this map only lets that function, which
# receives nothing but the hash, find the options it has to build from.
# NOTE(review): get_converter adds entries here and nothing in this module
# removes them, so the map is not bounded by options_cache_size.
_options_map: dict[bytes, PdfFormatOption] = {}


@lru_cache(maxsize=docling_serve_settings.options_cache_size)
def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
    """Create the DocumentConverter for the options stored under a hash.

    Results are memoized by ``lru_cache`` (sized by
    ``docling_serve_settings.options_cache_size``), keyed on ``options_hash``.

    Args:
        options_hash: Key into ``_options_map`` identifying the format option.

    Returns:
        A DocumentConverter configured with that option for PDF and image
        inputs.

    Raises:
        KeyError: If ``options_hash`` has no entry in ``_options_map``.
    """
    registered_option = _options_map[options_hash]

    # PDF and image inputs are both handled with the same format option.
    per_format: dict[InputFormat, FormatOption] = {
        input_format: registered_option
        for input_format in (InputFormat.PDF, InputFormat.IMAGE)
    }
    return DocumentConverter(format_options=per_format)


def get_converter(pdf_format_option: PdfFormatOption) -> DocumentConverter:
    """Return a DocumentConverter for the given PDF format option.

    The option is hashed and stored in ``_options_map`` under that hash, then
    the converter is obtained from the hash-keyed, lru_cache'd factory.

    Args:
        pdf_format_option: The format option the converter must be built with.

    Returns:
        The DocumentConverter returned by ``_get_converter_from_hash``.
    """
    cache_key = _hash_pdf_format_option(pdf_format_option)
    # Register the option before the lookup: the factory only receives the
    # hash and reads the option back from _options_map when it has to build.
    _options_map[cache_key] = pdf_format_option
    converter = _get_converter_from_hash(cache_key)
    return converter


def _parse_standard_pdf_opts(
    request: ConvertDocumentsOptions, artifacts_path: Optional[Path]
) -> PdfPipelineOptions:
    """Translate a conversion request into standard PDF pipeline options.

    Args:
        request: The conversion options to translate.
        artifacts_path: Value forwarded as ``artifacts_path`` to
            ``PdfPipelineOptions``; may be None.

    Returns:
        The populated ``PdfPipelineOptions``.

    Raises:
        HTTPException: With status code 400 when creating the OCR options
            raises ``ImportError``. The original error is attached as the
            exception cause.
    """
    try:
        ocr_options: OcrOptions = ocr_factory.create_options(
            kind=request.ocr_engine.value,  # type: ignore
            force_full_page_ocr=request.force_ocr,
        )
    except ImportError as err:
        # Chain explicitly so the traceback shows the ImportError as the
        # direct cause rather than as an error raised while handling it.
        raise HTTPException(
            status_code=400,
            detail="The requested OCR engine"
            f" (ocr_engine={request.ocr_engine.value})"  # type: ignore
            " is not available on this system. Please choose another OCR engine "
            "or contact your system administrator.\n"
            f"{err}",
        ) from err

    # ocr_lang may arrive as a single string or as an already-split list.
    if request.ocr_lang is not None:
        if isinstance(request.ocr_lang, str):
            ocr_options.lang = _to_list_of_strings(request.ocr_lang)
        else:
            ocr_options.lang = request.ocr_lang

    pipeline_options = PdfPipelineOptions(
        artifacts_path=artifacts_path,
        enable_remote_services=docling_serve_settings.enable_remote_services,
        document_timeout=request.document_timeout,
        do_ocr=request.do_ocr,
        ocr_options=ocr_options,
        do_table_structure=request.do_table_structure,
        do_code_enrichment=request.do_code_enrichment,
        do_formula_enrichment=request.do_formula_enrichment,
        do_picture_classification=request.do_picture_classification,
        do_picture_description=request.do_picture_description,
    )
    pipeline_options.table_structure_options.mode = TableFormerMode(request.table_mode)

    # Page images are generated for every export mode except PLACEHOLDER;
    # picture images only for REFERENCED. A falsy images_scale (None or 0)
    # leaves the pipeline's own scale untouched.
    if request.image_export_mode != ImageRefMode.PLACEHOLDER:
        pipeline_options.generate_page_images = True
        if request.image_export_mode == ImageRefMode.REFERENCED:
            pipeline_options.generate_picture_images = True
        if request.images_scale:
            pipeline_options.images_scale = request.images_scale

    if request.picture_description_local is not None:
        pipeline_options.picture_description_options = (
            PictureDescriptionVlmOptions.model_validate(
                request.picture_description_local.model_dump()
            )
        )

    # Applied after the local options, so the API options win when the
    # request carries both.
    if request.picture_description_api is not None:
        pipeline_options.picture_description_options = (
            PictureDescriptionApiOptions.model_validate(
                request.picture_description_api.model_dump()
            )
        )

    # The area threshold is set on whichever description options are active,
    # including the ones PdfPipelineOptions was constructed with.
    pipeline_options.picture_description_options.picture_area_threshold = (
        request.picture_description_area_threshold
    )

    return pipeline_options


def _parse_backend(request: ConvertDocumentsOptions) -> type[PdfDocumentBackend]:
    """Return the PDF backend class selected by ``request.pdf_backend``.

    Args:
        request: The conversion options carrying the ``pdf_backend`` choice.

    Returns:
        The backend class (not an instance) matching the requested backend.

    Raises:
        RuntimeError: If ``request.pdf_backend`` matches none of the handled
            ``PdfBackend`` members.
    """
    selected = request.pdf_backend

    if selected == PdfBackend.DLPARSE_V1:
        return DoclingParseDocumentBackend
    if selected == PdfBackend.DLPARSE_V2:
        return DoclingParseV2DocumentBackend
    if selected == PdfBackend.DLPARSE_V4:
        return DoclingParseV4DocumentBackend
    if selected == PdfBackend.PYPDFIUM2:
        return PyPdfiumDocumentBackend

    raise RuntimeError(f"Unexpected PDF backend type {request.pdf_backend}")


def _parse_vlm_pdf_opts(
    request: ConvertDocumentsOptions, artifacts_path: Optional[Path]
) -> VlmPipelineOptions:
    """Build the options for the VLM pipeline.

    The SmolDocling conversion options are used by default. On macOS
    (``sys.platform == "darwin"``) the MLX variant is used instead when the
    ``mlx_vlm`` package can be imported; otherwise a warning is logged and the
    default stays in place.

    Args:
        request: The conversion options; only ``document_timeout`` is read.
        artifacts_path: Value forwarded as ``artifacts_path`` to
            ``VlmPipelineOptions``; may be None.

    Returns:
        The populated ``VlmPipelineOptions``.
    """
    vlm_pipeline_options = VlmPipelineOptions(
        artifacts_path=artifacts_path,
        document_timeout=request.document_timeout,
    )
    vlm_pipeline_options.vlm_options = smoldocling_vlm_conversion_options

    # Only macOS gets the MLX variant; everywhere else the default is final.
    if sys.platform != "darwin":
        return vlm_pipeline_options

    try:
        # Imported solely to find out whether the package is installed.
        import mlx_vlm  # noqa: F401

        vlm_pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
    except ImportError:
        _log.warning(
            "To run SmolDocling faster, please install mlx-vlm:\n"
            "pip install mlx-vlm"
        )

    return vlm_pipeline_options


# Computes the PDF pipeline options and returns the resulting PdfFormatOption
def get_pdf_pipeline_opts(
    request: ConvertDocumentsOptions,
) -> PdfFormatOption:
    """Build the ``PdfFormatOption`` for a conversion request.

    The artifacts directory is resolved from
    ``docling_serve_settings.artifacts_path`` and is only used when it names
    an existing directory; when it is unset, empty or not a directory, None
    is used instead. The pipeline options are then built according to
    ``request.pipeline``.

    Args:
        request: The conversion options to translate.

    Returns:
        A ``PdfFormatOption`` for either the standard or the VLM pipeline.

    Raises:
        NotImplementedError: If ``request.pipeline`` is neither
            ``PdfPipeline.STANDARD`` nor ``PdfPipeline.VLM``.
    """
    artifacts_path: Optional[Path] = None
    configured_path = docling_serve_settings.artifacts_path
    if configured_path is None:
        _log.info(
            "artifacts_path is unset. "
            "The system will download the model weights at runtime."
        )
    elif configured_path == Path(""):
        # pathlib normalizes an empty path to ".", so an empty setting has to
        # be detected by comparing against Path(""). Comparing
        # str(path.absolute()) with "" can never match, because absolute()
        # always yields a non-empty path rooted at the working directory;
        # an empty setting would then pass the is_dir() test below and the
        # working directory would be used as the artifacts directory.
        # NOTE(review): an explicit "." is indistinguishable from an empty
        # path here and is treated as empty too.
        _log.info(
            "artifacts_path is an empty path, model weights will be downloaded "
            "at runtime."
        )
    elif configured_path.is_dir():
        _log.info(
            "artifacts_path is set to a valid directory. "
            "No model weights will be downloaded at runtime."
        )
        artifacts_path = configured_path
    else:
        _log.warning(
            "artifacts_path is set to an invalid directory. "
            "The system will download the model weights at runtime."
        )

    pipeline_options: Union[PdfPipelineOptions, VlmPipelineOptions]
    if request.pipeline == PdfPipeline.STANDARD:
        pipeline_options = _parse_standard_pdf_opts(request, artifacts_path)
        backend = _parse_backend(request)
        pdf_format_option = PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=backend,
        )

    elif request.pipeline == PdfPipeline.VLM:
        # No explicit backend for the VLM pipeline: only the pipeline class
        # and its options are passed.
        pipeline_options = _parse_vlm_pdf_opts(request, artifacts_path)
        pdf_format_option = PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    else:
        raise NotImplementedError(
            f"The pipeline {request.pipeline} is not implemented."
        )

    return pdf_format_option


def convert_documents(
    sources: Iterable[Union[Path, str, DocumentStream]],
    options: ConvertDocumentsOptions,
    headers: Optional[dict[str, Any]] = None,
):
    """Convert the given sources with a converter configured from ``options``.

    Args:
        sources: Paths, strings or document streams handed to the converter's
            ``convert_all``.
        options: Conversion options; they determine the PDF format option
            (and thus the converter) and supply the page range.
        headers: Optional headers forwarded to ``convert_all``.

    Returns:
        The iterator of ``ConversionResult`` objects returned by
        ``convert_all``.
    """
    converter = get_converter(get_pdf_pipeline_opts(options))

    # The page range comes from the request; the file-size and page-count
    # limits come from the server settings.
    return converter.convert_all(
        sources,
        headers=headers,
        page_range=options.page_range,
        max_file_size=docling_serve_settings.max_file_size,
        max_num_pages=docling_serve_settings.max_num_pages,
    )