# Mirror of https://github.com/docling-project/docling-serve.git
# Synced 2025-11-29 08:33:50 +00:00
# 368 lines, 11 KiB, Python
# Define the input options for the API
|
|
from typing import Annotated, Any, Optional
|
|
|
|
from pydantic import AnyUrl, BaseModel, Field, model_validator
|
|
from typing_extensions import Self
|
|
|
|
from docling.datamodel.base_models import InputFormat, OutputFormat
|
|
from docling.datamodel.pipeline_options import (
|
|
EasyOcrOptions,
|
|
PdfBackend,
|
|
PdfPipeline,
|
|
TableFormerMode,
|
|
TableStructureOptions,
|
|
)
|
|
from docling.datamodel.settings import (
|
|
DEFAULT_PAGE_RANGE,
|
|
PageRange,
|
|
)
|
|
from docling.models.factories import get_ocr_factory
|
|
from docling_core.types.doc import ImageRefMode
|
|
|
|
from docling_serve.settings import docling_serve_settings
|
|
|
|
# Build the OCR factory once at import time. Whether third-party OCR
# plugins may be loaded is controlled by the server settings.
ocr_factory = get_ocr_factory(
    allow_external_plugins=docling_serve_settings.allow_external_plugins
)
# Enum of the OCR engine kinds known to the factory; used below as the
# type of the `ocr_engine` request option.
ocr_engines_enum = ocr_factory.get_enum()
|
|
|
|
|
|
class PictureDescriptionLocal(BaseModel):
    """Options for describing pictures with a locally hosted vision-language model.

    The model weights are referenced by a Hugging Face Hub repository id.
    """

    # Hugging Face Hub repository holding the vision-language model.
    repo_id: str = Field(
        description="Repository id from the Hugging Face Hub.",
        examples=[
            "HuggingFaceTB/SmolVLM-256M-Instruct",
            "ibm-granite/granite-vision-3.2-2b",
        ],
    )

    # Instruction sent to the model for each picture.
    prompt: str = Field(
        default="Describe this image in a few sentences.",
        description="Prompt used when calling the vision-language model.",
        examples=[
            "Describe this image in a few sentences.",
            "This is a figure from a document. Provide a detailed description of it.",
        ],
    )

    # Keyword arguments for the text-generation call; see the linked
    # transformers GenerationConfig reference for the accepted keys.
    generation_config: dict[str, Any] = Field(
        default={"max_new_tokens": 200, "do_sample": False},
        description="Config from https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig",
        examples=[{"max_new_tokens": 200, "do_sample": False}],
    )
|
|
|
|
|
|
class PictureDescriptionApi(BaseModel):
    """Options for describing pictures via a remote OpenAI-compatible API.

    Mutually exclusive with :class:`PictureDescriptionLocal` (enforced by the
    validator on the options model that carries both).
    """

    # Chat-completions endpoint to call for each picture.
    url: Annotated[
        AnyUrl,
        Field(
            description="Endpoint which accepts openai-api compatible requests.",
            examples=[
                AnyUrl(
                    "http://localhost:8000/v1/chat/completions"
                ),  # example of a local vllm api
                AnyUrl(
                    "http://localhost:11434/v1/chat/completions"
                ),  # example of ollama
            ],
        ),
    ]

    # Extra HTTP headers (e.g. Authorization) sent with every request.
    headers: Annotated[
        dict[str, str],
        Field(
            description="Headers used for calling the API endpoint. For example, it could include authentication headers."
        ),
    ] = {}

    # Extra payload parameters merged into the request body (model name, etc.).
    params: Annotated[
        dict[str, Any],
        Field(
            description="Model parameters.",
            examples=[
                {  # on vllm
                    "model": "HuggingFaceTB/SmolVLM-256M-Instruct",
                    "max_completion_tokens": 200,
                },
                {  # on vllm
                    "model": "ibm-granite/granite-vision-3.2-2b",
                    "max_completion_tokens": 200,
                },
                {  # on ollama
                    "model": "granite3.2-vision:2b"
                },
            ],
        ),
    ] = {}

    # Per-request timeout, in seconds.
    timeout: Annotated[float, Field(description="Timeout for the API request.")] = 20

    # Instruction sent to the model for each picture.
    # Fix: example previously read "This is a figures from a document." —
    # grammatical error, now consistent with PictureDescriptionLocal.
    prompt: Annotated[
        str,
        Field(
            description="Prompt used when calling the vision-language model.",
            examples=[
                "Describe this image in a few sentences.",
                "This is a figure from a document. Provide a detailed description of it.",
            ],
        ),
    ] = "Describe this image in a few sentences."
|
|
|
|
|
|
class ConvertDocumentsOptions(BaseModel):
    """User-facing options for a document conversion request.

    Each field carries its own OpenAPI description and examples via
    ``Field``; defaults are chosen so an empty options object performs a
    standard PDF-to-Markdown conversion.
    """

    # Input formats accepted for this request; defaults to every format
    # docling supports.
    from_formats: Annotated[
        list[InputFormat],
        Field(
            description=(
                "Input format(s) to convert from. String or list of strings. "
                f"Allowed values: {', '.join([v.value for v in InputFormat])}. "
                "Optional, defaults to all formats."
            ),
            examples=[[v.value for v in InputFormat]],
        ),
    ] = list(InputFormat)

    # Output formats to produce; defaults to Markdown only.
    to_formats: Annotated[
        list[OutputFormat],
        Field(
            description=(
                "Output format(s) to convert to. String or list of strings. "
                f"Allowed values: {', '.join([v.value for v in OutputFormat])}. "
                "Optional, defaults to Markdown."
            ),
            examples=[[OutputFormat.MARKDOWN]],
        ),
    ] = [OutputFormat.MARKDOWN]

    # How images are represented in the generated output documents.
    image_export_mode: Annotated[
        ImageRefMode,
        Field(
            description=(
                "Image export mode for the document (in case of JSON,"
                " Markdown or HTML). "
                f"Allowed values: {', '.join([v.value for v in ImageRefMode])}. "
                "Optional, defaults to Embedded."
            ),
            examples=[ImageRefMode.EMBEDDED.value],
            # pattern="embedded|placeholder|referenced",
        ),
    ] = ImageRefMode.EMBEDDED

    # Run OCR on bitmap content.
    do_ocr: Annotated[
        bool,
        Field(
            description=(
                "If enabled, the bitmap content will be processed using OCR. "
                "Boolean. Optional, defaults to true"
            ),
            # examples=[True],
        ),
    ] = True

    # Replace any text already present in the document with OCR output.
    force_ocr: Annotated[
        bool,
        Field(
            description=(
                "If enabled, replace existing text with OCR-generated "
                "text over content. Boolean. Optional, defaults to false."
            ),
            # examples=[False],
        ),
    ] = False

    # OCR engine choice; the enum is built at import time from the
    # factory above, hence the type: ignore on the dynamic type.
    ocr_engine: Annotated[  # type: ignore
        ocr_engines_enum,
        Field(
            description=(
                "The OCR engine to use. String. "
                f"Allowed values: {', '.join([v.value for v in ocr_engines_enum])}. "
                "Optional, defaults to easyocr."
            ),
            examples=[EasyOcrOptions.kind],
        ),
    ] = ocr_engines_enum(EasyOcrOptions.kind)  # type: ignore

    # OCR language hints; valid values depend on the selected engine.
    ocr_lang: Annotated[
        Optional[list[str]],
        Field(
            description=(
                "List of languages used by the OCR engine. "
                "Note that each OCR engine has "
                "different values for the language names. String or list of strings. "
                "Optional, defaults to empty."
            ),
            examples=[["fr", "de", "es", "en"]],
        ),
    ] = None

    # PDF parsing backend.
    pdf_backend: Annotated[
        PdfBackend,
        Field(
            description=(
                "The PDF backend to use. String. "
                f"Allowed values: {', '.join([v.value for v in PdfBackend])}. "
                f"Optional, defaults to {PdfBackend.DLPARSE_V4.value}."
            ),
            examples=[PdfBackend.DLPARSE_V4],
        ),
    ] = PdfBackend.DLPARSE_V4

    # TableFormer mode for table-structure recognition.
    table_mode: Annotated[
        TableFormerMode,
        Field(
            description=(
                "Mode to use for table structure, String. "
                f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. "
                "Optional, defaults to fast."
            ),
            examples=[TableStructureOptions().mode],
            # pattern="fast|accurate",
        ),
    ] = TableStructureOptions().mode

    # Processing pipeline selection for PDF/image inputs.
    pipeline: Annotated[
        PdfPipeline,
        Field(description="Choose the pipeline to process PDF or image files."),
    ] = PdfPipeline.STANDARD

    # Inclusive 1-based page range to convert.
    page_range: Annotated[
        PageRange,
        Field(
            description="Only convert a range of pages. The page number starts at 1.",
            examples=[(1, 4)],
        ),
    ] = DEFAULT_PAGE_RANGE

    # Per-document processing timeout, capped by the server-wide maximum.
    document_timeout: Annotated[
        float,
        Field(
            description="The timeout for processing each document, in seconds.",
            gt=0,
            le=docling_serve_settings.max_document_timeout,
        ),
    ] = docling_serve_settings.max_document_timeout

    # Stop the whole request on the first document error.
    abort_on_error: Annotated[
        bool,
        Field(
            description=(
                "Abort on error if enabled. Boolean. Optional, defaults to false."
            ),
            # examples=[False],
        ),
    ] = False

    # Force returning results as a zip archive.
    return_as_file: Annotated[
        bool,
        Field(
            description=(
                "Return the output as a zip file "
                "(will happen anyway if multiple files are generated). "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    # Extract table structure from the document.
    do_table_structure: Annotated[
        bool,
        Field(
            description=(
                "If enabled, the table structure will be extracted. "
                "Boolean. Optional, defaults to true."
            ),
            examples=[True],
        ),
    ] = True

    # Extract images from the document.
    include_images: Annotated[
        bool,
        Field(
            description=(
                "If enabled, images will be extracted from the document. "
                "Boolean. Optional, defaults to true."
            ),
            examples=[True],
        ),
    ] = True

    # Scale factor applied to extracted images.
    images_scale: Annotated[
        float,
        Field(
            description="Scale factor for images. Float. Optional, defaults to 2.0.",
            examples=[2.0],
        ),
    ] = 2.0

    # Enrichment: OCR of code blocks.
    do_code_enrichment: Annotated[
        bool,
        Field(
            description=(
                "If enabled, perform OCR code enrichment. "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    # Enrichment: formula OCR producing LaTeX.
    do_formula_enrichment: Annotated[
        bool,
        Field(
            description=(
                "If enabled, perform formula OCR, return LaTeX code. "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    # Enrichment: classify pictures.
    do_picture_classification: Annotated[
        bool,
        Field(
            description=(
                "If enabled, classify pictures in documents. "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    # Enrichment: describe pictures (see the two option objects below).
    do_picture_description: Annotated[
        bool,
        Field(
            description=(
                "If enabled, describe pictures in documents. "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    # Local-VLM picture description; mutually exclusive with the API variant
    # (enforced by the validator below).
    picture_description_local: Annotated[
        Optional[PictureDescriptionLocal],
        Field(
            description="Options for running a local vision-language model in the picture description. The parameters refer to a model hosted on Hugging Face. This parameter is mutually exclusive with picture_description_api."
        ),
    ] = None

    # Remote-API picture description; mutually exclusive with the local variant
    # (enforced by the validator below).
    picture_description_api: Annotated[
        Optional[PictureDescriptionApi],
        Field(
            description="API details for using a vision-language model in the picture description. This parameter is mutually exclusive with picture_description_local."
        ),
    ] = None

    @model_validator(mode="after")
    def picture_description_exclusivity(self) -> Self:
        """Reject requests that set both picture-description variants at once."""
        # Validate picture description options
        if (
            self.picture_description_local is not None
            and self.picture_description_api is not None
        ):
            raise ValueError(
                "The parameters picture_description_local and picture_description_api are mutually exclusive, only one of them can be set."
            )

        return self
|