mirror of
https://github.com/docling-project/docling-serve.git
synced 2025-11-29 00:23:36 +00:00
feat: expose picture description options (#148)
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
@@ -1,7 +1,8 @@
|
||||
# Define the input options for the API
|
||||
from typing import Annotated, Optional
|
||||
from typing import Annotated, Any, Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from pydantic import AnyUrl, BaseModel, Field, model_validator
|
||||
from typing_extensions import Self
|
||||
|
||||
from docling.datamodel.base_models import InputFormat, OutputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
@@ -26,6 +27,89 @@ ocr_factory = get_ocr_factory(
|
||||
ocr_engines_enum = ocr_factory.get_enum()
|
||||
|
||||
|
||||
class PictureDescriptionLocal(BaseModel):
    # Request options for describing pictures with a vision-language model
    # identified by a Hugging Face Hub repository id.

    # Required: there is no default model, the caller must name one.
    repo_id: str = Field(
        description="Repository id from the Hugging Face Hub.",
        examples=[
            "HuggingFaceTB/SmolVLM-256M-Instruct",
            "ibm-granite/granite-vision-3.2-2b",
        ],
    )

    # Instruction text sent to the model together with each picture.
    prompt: str = Field(
        default="Describe this image in a few sentences.",
        description="Prompt used when calling the vision-language model.",
        examples=[
            "Describe this image in a few sentences.",
            "This is a figure from a document. Provide a detailed description of it.",
        ],
    )

    # Free-form generation settings; keys are not validated by this model.
    generation_config: dict[str, Any] = Field(
        default={"max_new_tokens": 200, "do_sample": False},
        description="Config from https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig",
        examples=[{"max_new_tokens": 200, "do_sample": False}],
    )
|
||||
|
||||
|
||||
class PictureDescriptionApi(BaseModel):
    # Request options for describing pictures through a remote endpoint that
    # accepts OpenAI-API-compatible chat-completion requests.

    # Required: full URL of the chat-completions endpoint.
    url: Annotated[
        AnyUrl,
        Field(
            description="Endpoint which accepts openai-api compatible requests.",
            examples=[
                AnyUrl(
                    "http://localhost:8000/v1/chat/completions"
                ),  # example of a local vllm api
                AnyUrl(
                    "http://localhost:11434/v1/chat/completions"
                ),  # example of ollama
            ],
        ),
    ]

    # Extra HTTP headers for the request (e.g. authentication); empty by default.
    headers: Annotated[
        dict[str, str],
        Field(
            description="Headers used for calling the API endpoint. For example, it could include authentication headers."
        ),
    ] = {}

    # Free-form model parameters; keys are not validated by this model.
    params: Annotated[
        dict[str, Any],
        Field(
            description="Model parameters.",
            examples=[
                {  # on vllm
                    "model": "HuggingFaceTB/SmolVLM-256M-Instruct",
                    "max_completion_tokens": 200,
                },
                {  # on vllm
                    "model": "ibm-granite/granite-vision-3.2-2b",
                    "max_completion_tokens": 200,
                },
                {  # on ollama
                    "model": "granite3.2-vision:2b"
                },
            ],
        ),
    ] = {}

    timeout: Annotated[float, Field(description="Timeout for the API request.")] = 20

    # Instruction text sent to the model together with each picture.
    # The second example previously read "a figures"; it now matches the
    # wording used by PictureDescriptionLocal.
    prompt: Annotated[
        str,
        Field(
            description="Prompt used when calling the vision-language model.",
            examples=[
                "Describe this image in a few sentences.",
                "This is a figure from a document. Provide a detailed description of it.",
            ],
        ),
    ] = "Describe this image in a few sentences."
|
||||
|
||||
|
||||
class ConvertDocumentsOptions(BaseModel):
|
||||
from_formats: Annotated[
|
||||
list[InputFormat],
|
||||
@@ -254,3 +338,30 @@ class ConvertDocumentsOptions(BaseModel):
|
||||
examples=[False],
|
||||
),
|
||||
] = False
|
||||
|
||||
picture_description_local: Annotated[
|
||||
Optional[PictureDescriptionLocal],
|
||||
Field(
|
||||
description="Options for running a local vision-language model in the picture description. The parameters refer to a model hosted on Hugging Face. This parameter is mutually exclusive with picture_description_api."
|
||||
),
|
||||
] = None
|
||||
|
||||
picture_description_api: Annotated[
|
||||
Optional[PictureDescriptionApi],
|
||||
Field(
|
||||
description="API details for using a vision-language model in the picture description. This parameter is mutually exclusive with picture_description_local."
|
||||
),
|
||||
] = None
|
||||
|
||||
@model_validator(mode="after")
def picture_description_exclusivity(self) -> Self:
    """Reject requests that set both picture description backends.

    Returns:
        The model instance unchanged when at most one of
        ``picture_description_local`` and ``picture_description_api`` is set.

    Raises:
        ValueError: If both options are set.
    """
    local_given = self.picture_description_local is not None
    api_given = self.picture_description_api is not None

    # Zero or one option set is valid, so return early.
    if not (local_given and api_given):
        return self

    raise ValueError(
        "The parameters picture_description_local and picture_description_api are mutually exclusive, only one of them can be set."
    )
|
||||
|
||||
@@ -21,6 +21,8 @@ from docling.datamodel.pipeline_options import (
|
||||
PdfBackend,
|
||||
PdfPipeline,
|
||||
PdfPipelineOptions,
|
||||
PictureDescriptionApiOptions,
|
||||
PictureDescriptionVlmOptions,
|
||||
TableFormerMode,
|
||||
VlmPipelineOptions,
|
||||
smoldocling_vlm_conversion_options,
|
||||
@@ -116,6 +118,7 @@ def _parse_standard_pdf_opts(
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
artifacts_path=artifacts_path,
|
||||
enable_remote_services=docling_serve_settings.enable_remote_services,
|
||||
document_timeout=request.document_timeout,
|
||||
do_ocr=request.do_ocr,
|
||||
ocr_options=ocr_options,
|
||||
@@ -132,6 +135,20 @@ def _parse_standard_pdf_opts(
|
||||
if request.images_scale:
|
||||
pipeline_options.images_scale = request.images_scale
|
||||
|
||||
if request.picture_description_local is not None:
|
||||
pipeline_options.picture_description_options = (
|
||||
PictureDescriptionVlmOptions.model_validate(
|
||||
request.picture_description_local.model_dump()
|
||||
)
|
||||
)
|
||||
|
||||
if request.picture_description_api is not None:
|
||||
pipeline_options.picture_description_options = (
|
||||
PictureDescriptionApiOptions.model_validate(
|
||||
request.picture_description_api.model_dump()
|
||||
)
|
||||
)
|
||||
|
||||
return pipeline_options
|
||||
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ class DoclingServeSettings(BaseSettings):
|
||||
artifacts_path: Optional[Path] = None
|
||||
static_path: Optional[Path] = None
|
||||
options_cache_size: int = 2
|
||||
enable_remote_services: bool = False
|
||||
allow_external_plugins: bool = False
|
||||
|
||||
max_document_timeout: float = 3_600 * 24 * 7 # 7 days
|
||||
|
||||
@@ -28,6 +28,16 @@ async def test_convert_url(async_client: httpx.AsyncClient):
|
||||
"ocr": True,
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
# "do_picture_description": True,
|
||||
# "picture_description_api": {
|
||||
# "url": "http://localhost:11434/v1/chat/completions",
|
||||
# "params": {
|
||||
# "model": "granite3.2-vision:2b",
|
||||
# }
|
||||
# },
|
||||
# "picture_description_local": {
|
||||
# "repo_id": "HuggingFaceTB/SmolVLM-256M-Instruct",
|
||||
# },
|
||||
},
|
||||
# "http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}],
|
||||
"file_sources": [{"base64_string": encoded_doc, "filename": doc_filename.name}],
|
||||
|
||||
Reference in New Issue
Block a user