fix: allow users to set the area threshold for picture descriptions (#165)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-12-03 10:33:21 +00:00 · 2025-04-30 12:37:24 +02:00
parent 919cf5c041
commit 509f4889f8
3 changed files with 13 additions and 0 deletions
--- a/docling_serve/datamodel/convert.py
+++ b/docling_serve/datamodel/convert.py
@@ -9,6 +9,7 @@ from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    PdfBackend,
    PdfPipeline,
+    PictureDescriptionBaseOptions,
    TableFormerMode,
    TableStructureOptions,
 )
@@ -339,6 +340,14 @@ class ConvertDocumentsOptions(BaseModel):
        ),
    ] = False

+    picture_description_area_threshold: Annotated[
+        float,
+        Field(
+            description="Minimum percentage of the area for a picture to be processed with the models.",
+            examples=[PictureDescriptionBaseOptions().picture_area_threshold],
+        ),
+    ] = PictureDescriptionBaseOptions().picture_area_threshold
+
    picture_description_local: Annotated[
        Optional[PictureDescriptionLocal],
        Field(
--- a/docling_serve/docling_conversion.py
+++ b/docling_serve/docling_conversion.py
@@ -150,6 +150,9 @@ def _parse_standard_pdf_opts(
                request.picture_description_api.model_dump()
            )
        )
+    pipeline_options.picture_description_options.picture_area_threshold = (
+        request.picture_description_area_threshold
+    )

    return pipeline_options

--- a/docs/usage.md
+++ b/docs/usage.md
@@ -23,6 +23,7 @@ On top of the source of file (see below), both endpoints support the same parame
 - `do_formula_enrichment` (bool): If enabled, perform formula OCR, return LaTeX code. Defaults to false.
 - `do_picture_classification` (bool): If enabled, classify pictures in documents. Defaults to false.
 - `do_picture_description` (bool): If enabled, describe pictures in documents. Defaults to false.
+- `picture_description_area_threshold` (float): Minimum fraction of the page area (a value between 0 and 1, e.g. 0.05 means 5%) that a picture must cover to be processed with the picture description models. Defaults to 0.05.
 - `picture_description_local` (dict): Options for running a local vision-language model in the picture description. The parameters refer to a model hosted on Hugging Face. This parameter is mutually exclusive with picture_description_api.
 - `picture_description_api` (dict): API details for using a vision-language model in the picture description. This parameter is mutually exclusive with picture_description_local.
 - `include_images` (bool): If enabled, images will be extracted from the document. Defaults to false.