mirror of
https://github.com/docling-project/docling-serve.git
synced 2025-12-03 10:33:21 +00:00
fix: allow users to set the area threshold for picture descriptions (#165)
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
@@ -9,6 +9,7 @@ from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
PdfBackend,
|
||||
PdfPipeline,
|
||||
PictureDescriptionBaseOptions,
|
||||
TableFormerMode,
|
||||
TableStructureOptions,
|
||||
)
|
||||
@@ -339,6 +340,14 @@ class ConvertDocumentsOptions(BaseModel):
|
||||
),
|
||||
] = False
|
||||
|
||||
picture_description_area_threshold: Annotated[
|
||||
float,
|
||||
Field(
|
||||
description="Minimum percentage of the area for a picture to be processed with the models.",
|
||||
examples=[PictureDescriptionBaseOptions().picture_area_threshold],
|
||||
),
|
||||
] = PictureDescriptionBaseOptions().picture_area_threshold
|
||||
|
||||
picture_description_local: Annotated[
|
||||
Optional[PictureDescriptionLocal],
|
||||
Field(
|
||||
|
||||
@@ -150,6 +150,9 @@ def _parse_standard_pdf_opts(
|
||||
request.picture_description_api.model_dump()
|
||||
)
|
||||
)
|
||||
pipeline_options.picture_description_options.picture_area_threshold = (
|
||||
request.picture_description_area_threshold
|
||||
)
|
||||
|
||||
return pipeline_options
|
||||
|
||||
|
||||
@@ -23,6 +23,7 @@ On top of the source of file (see below), both endpoints support the same parame
|
||||
- `do_formula_enrichment` (bool): If enabled, perform formula OCR, return LaTeX code. Defaults to false.
|
||||
- `do_picture_classification` (bool): If enabled, classify pictures in documents. Defaults to false.
|
||||
- `do_picture_description` (bool): If enabled, describe pictures in documents. Defaults to false.
|
||||
- `picture_description_area_threshold` (float): Minimum fraction of the page area (a value between 0 and 1, e.g. 0.05 = 5%) that a picture must cover to be processed with the models. Defaults to 0.05.
|
||||
- `picture_description_local` (dict): Options for running a local vision-language model in the picture description. The parameters refer to a model hosted on Hugging Face. This parameter is mutually exclusive with picture_description_api.
|
||||
- `picture_description_api` (dict): API details for using a vision-language model in the picture description. This parameter is mutually exclusive with picture_description_local.
|
||||
- `include_images` (bool): If enabled, images will be extracted from the document. Defaults to false.
|
||||
|
||||
Reference in New Issue
Block a user