# Mirror of https://github.com/docling-project/docling-serve.git
# Synced 2025-11-29 08:33:50 +00:00
# 368 lines, 11 KiB, Python
# Define the input options for the API
|
|
from typing import Annotated, Any, Optional
|
|
|
|
from pydantic import AnyUrl, BaseModel, Field, model_validator
|
|
from typing_extensions import Self
|
|
|
|
from docling.datamodel.base_models import InputFormat, OutputFormat
|
|
from docling.datamodel.pipeline_options import (
|
|
EasyOcrOptions,
|
|
PdfBackend,
|
|
PdfPipeline,
|
|
TableFormerMode,
|
|
TableStructureOptions,
|
|
)
|
|
from docling.datamodel.settings import (
|
|
DEFAULT_PAGE_RANGE,
|
|
PageRange,
|
|
)
|
|
from docling.models.factories import get_ocr_factory
|
|
from docling_core.types.doc import ImageRefMode
|
|
|
|
from docling_serve.settings import docling_serve_settings
|
|
|
|
# Build the OCR factory once at import time. Whether third-party OCR
# plugins may be loaded is controlled by the server settings.
ocr_factory = get_ocr_factory(
    allow_external_plugins=docling_serve_settings.allow_external_plugins
)
# Enum of the OCR engine kinds known to the factory; used below as the
# type of the `ocr_engine` request option.
ocr_engines_enum = ocr_factory.get_enum()
|
|
|
|
|
|
class PictureDescriptionLocal(BaseModel):
    """Options for describing pictures with a locally hosted vision-language model.

    The model weights are referenced by a Hugging Face Hub repository id.
    """

    # Hugging Face Hub repository holding the vision-language model.
    repo_id: str = Field(
        description="Repository id from the Hugging Face Hub.",
        examples=[
            "HuggingFaceTB/SmolVLM-256M-Instruct",
            "ibm-granite/granite-vision-3.2-2b",
        ],
    )

    # Instruction sent to the model for each picture.
    prompt: str = Field(
        default="Describe this image in a few sentences.",
        description="Prompt used when calling the vision-language model.",
        examples=[
            "Describe this image in a few sentences.",
            "This is a figure from a document. Provide a detailed description of it.",
        ],
    )

    # Keyword arguments for the text-generation call; see the linked
    # transformers GenerationConfig reference for the accepted keys.
    generation_config: dict[str, Any] = Field(
        default={"max_new_tokens": 200, "do_sample": False},
        description="Config from https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig",
        examples=[{"max_new_tokens": 200, "do_sample": False}],
    )
|
|
|
|
|
|
class PictureDescriptionApi(BaseModel):
    """Options for describing pictures via a remote OpenAI-compatible API.

    Mutually exclusive with :class:`PictureDescriptionLocal` (enforced by the
    validator on the options model that carries both).
    """

    # Chat-completions endpoint to call for each picture.
    url: Annotated[
        AnyUrl,
        Field(
            description="Endpoint which accepts openai-api compatible requests.",
            examples=[
                AnyUrl(
                    "http://localhost:8000/v1/chat/completions"
                ),  # example of a local vllm api
                AnyUrl(
                    "http://localhost:11434/v1/chat/completions"
                ),  # example of ollama
            ],
        ),
    ]

    # Extra HTTP headers (e.g. Authorization) sent with every request.
    headers: Annotated[
        dict[str, str],
        Field(
            description="Headers used for calling the API endpoint. For example, it could include authentication headers."
        ),
    ] = {}

    # Extra payload parameters merged into the request body (model name, etc.).
    params: Annotated[
        dict[str, Any],
        Field(
            description="Model parameters.",
            examples=[
                {  # on vllm
                    "model": "HuggingFaceTB/SmolVLM-256M-Instruct",
                    "max_completion_tokens": 200,
                },
                {  # on vllm
                    "model": "ibm-granite/granite-vision-3.2-2b",
                    "max_completion_tokens": 200,
                },
                {  # on ollama
                    "model": "granite3.2-vision:2b"
                },
            ],
        ),
    ] = {}

    # Per-request timeout, in seconds.
    timeout: Annotated[float, Field(description="Timeout for the API request.")] = 20

    # Instruction sent to the model for each picture.
    # Fix: example previously read "This is a figures from a document." —
    # grammatical error, now consistent with PictureDescriptionLocal.
    prompt: Annotated[
        str,
        Field(
            description="Prompt used when calling the vision-language model.",
            examples=[
                "Describe this image in a few sentences.",
                "This is a figure from a document. Provide a detailed description of it.",
            ],
        ),
    ] = "Describe this image in a few sentences."
|
|
|
|
|
|
class ConvertDocumentsOptions(BaseModel):
    """User-facing options for a document conversion request.

    Each field carries its own OpenAPI description and examples via
    ``Field``; defaults are chosen so an empty options object performs a
    standard PDF-to-Markdown conversion.
    """

    # Input formats accepted for this request; defaults to every format
    # docling supports.
    from_formats: Annotated[
        list[InputFormat],
        Field(
            description=(
                "Input format(s) to convert from. String or list of strings. "
                f"Allowed values: {', '.join([v.value for v in InputFormat])}. "
                "Optional, defaults to all formats."
            ),
            examples=[[v.value for v in InputFormat]],
        ),
    ] = list(InputFormat)

    # Output formats to produce; defaults to Markdown only.
    to_formats: Annotated[
        list[OutputFormat],
        Field(
            description=(
                "Output format(s) to convert to. String or list of strings. "
                f"Allowed values: {', '.join([v.value for v in OutputFormat])}. "
                "Optional, defaults to Markdown."
            ),
            examples=[[OutputFormat.MARKDOWN]],
        ),
    ] = [OutputFormat.MARKDOWN]

    # How images are represented in the generated output documents.
    image_export_mode: Annotated[
        ImageRefMode,
        Field(
            description=(
                "Image export mode for the document (in case of JSON,"
                " Markdown or HTML). "
                f"Allowed values: {', '.join([v.value for v in ImageRefMode])}. "
                "Optional, defaults to Embedded."
            ),
            examples=[ImageRefMode.EMBEDDED.value],
            # pattern="embedded|placeholder|referenced",
        ),
    ] = ImageRefMode.EMBEDDED

    # Run OCR on bitmap content.
    do_ocr: Annotated[
        bool,
        Field(
            description=(
                "If enabled, the bitmap content will be processed using OCR. "
                "Boolean. Optional, defaults to true"
            ),
            # examples=[True],
        ),
    ] = True

    # Replace any text already present in the document with OCR output.
    force_ocr: Annotated[
        bool,
        Field(
            description=(
                "If enabled, replace existing text with OCR-generated "
                "text over content. Boolean. Optional, defaults to false."
            ),
            # examples=[False],
        ),
    ] = False

    # OCR engine choice; the enum is built at import time from the
    # factory above, hence the type: ignore on the dynamic type.
    ocr_engine: Annotated[  # type: ignore
        ocr_engines_enum,
        Field(
            description=(
                "The OCR engine to use. String. "
                f"Allowed values: {', '.join([v.value for v in ocr_engines_enum])}. "
                "Optional, defaults to easyocr."
            ),
            examples=[EasyOcrOptions.kind],
        ),
    ] = ocr_engines_enum(EasyOcrOptions.kind)  # type: ignore

    # OCR language hints; valid values depend on the selected engine.
    ocr_lang: Annotated[
        Optional[list[str]],
        Field(
            description=(
                "List of languages used by the OCR engine. "
                "Note that each OCR engine has "
                "different values for the language names. String or list of strings. "
                "Optional, defaults to empty."
            ),
            examples=[["fr", "de", "es", "en"]],
        ),
    ] = None

    # PDF parsing backend.
    pdf_backend: Annotated[
        PdfBackend,
        Field(
            description=(
                "The PDF backend to use. String. "
                f"Allowed values: {', '.join([v.value for v in PdfBackend])}. "
                f"Optional, defaults to {PdfBackend.DLPARSE_V4.value}."
            ),
            examples=[PdfBackend.DLPARSE_V4],
        ),
    ] = PdfBackend.DLPARSE_V4

    # TableFormer mode for table-structure recognition.
    table_mode: Annotated[
        TableFormerMode,
        Field(
            description=(
                "Mode to use for table structure, String. "
                f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. "
                "Optional, defaults to fast."
            ),
            examples=[TableStructureOptions().mode],
            # pattern="fast|accurate",
        ),
    ] = TableStructureOptions().mode

    # Processing pipeline selection for PDF/image inputs.
    pipeline: Annotated[
        PdfPipeline,
        Field(description="Choose the pipeline to process PDF or image files."),
    ] = PdfPipeline.STANDARD

    # Inclusive 1-based page range to convert.
    page_range: Annotated[
        PageRange,
        Field(
            description="Only convert a range of pages. The page number starts at 1.",
            examples=[(1, 4)],
        ),
    ] = DEFAULT_PAGE_RANGE

    # Per-document processing timeout, capped by the server-wide maximum.
    document_timeout: Annotated[
        float,
        Field(
            description="The timeout for processing each document, in seconds.",
            gt=0,
            le=docling_serve_settings.max_document_timeout,
        ),
    ] = docling_serve_settings.max_document_timeout

    # Stop the whole request on the first document error.
    abort_on_error: Annotated[
        bool,
        Field(
            description=(
                "Abort on error if enabled. Boolean. Optional, defaults to false."
            ),
            # examples=[False],
        ),
    ] = False

    # Force returning results as a zip archive.
    return_as_file: Annotated[
        bool,
        Field(
            description=(
                "Return the output as a zip file "
                "(will happen anyway if multiple files are generated). "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    # Extract table structure from the document.
    do_table_structure: Annotated[
        bool,
        Field(
            description=(
                "If enabled, the table structure will be extracted. "
                "Boolean. Optional, defaults to true."
            ),
            examples=[True],
        ),
    ] = True

    # Extract images from the document.
    include_images: Annotated[
        bool,
        Field(
            description=(
                "If enabled, images will be extracted from the document. "
                "Boolean. Optional, defaults to true."
            ),
            examples=[True],
        ),
    ] = True

    # Scale factor applied to extracted images.
    images_scale: Annotated[
        float,
        Field(
            description="Scale factor for images. Float. Optional, defaults to 2.0.",
            examples=[2.0],
        ),
    ] = 2.0

    # Enrichment: OCR of code blocks.
    do_code_enrichment: Annotated[
        bool,
        Field(
            description=(
                "If enabled, perform OCR code enrichment. "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    # Enrichment: formula OCR producing LaTeX.
    do_formula_enrichment: Annotated[
        bool,
        Field(
            description=(
                "If enabled, perform formula OCR, return LaTeX code. "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    # Enrichment: classify pictures.
    do_picture_classification: Annotated[
        bool,
        Field(
            description=(
                "If enabled, classify pictures in documents. "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    # Enrichment: describe pictures (see the two option objects below).
    do_picture_description: Annotated[
        bool,
        Field(
            description=(
                "If enabled, describe pictures in documents. "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    # Local-VLM picture description; mutually exclusive with the API variant
    # (enforced by the validator below).
    picture_description_local: Annotated[
        Optional[PictureDescriptionLocal],
        Field(
            description="Options for running a local vision-language model in the picture description. The parameters refer to a model hosted on Hugging Face. This parameter is mutually exclusive with picture_description_api."
        ),
    ] = None

    # Remote-API picture description; mutually exclusive with the local variant
    # (enforced by the validator below).
    picture_description_api: Annotated[
        Optional[PictureDescriptionApi],
        Field(
            description="API details for using a vision-language model in the picture description. This parameter is mutually exclusive with picture_description_local."
        ),
    ] = None

    @model_validator(mode="after")
    def picture_description_exclusivity(self) -> Self:
        """Reject requests that set both picture-description variants at once."""
        # Validate picture description options
        if (
            self.picture_description_local is not None
            and self.picture_description_api is not None
        ):
            raise ValueError(
                "The parameters picture_description_local and picture_description_api are mutually exclusive, only one of them can be set."
            )

        return self
|