feat: Add option for vlm pipeline (#143)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-04-22 14:46:33 +02:00
committed by GitHub
parent 6b3d281f02
commit ee89ee4dae
5 changed files with 1178 additions and 104 deletions

View File

@@ -7,6 +7,7 @@ from docling.datamodel.base_models import InputFormat, OutputFormat
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
PdfBackend,
PdfPipeline,
TableFormerMode,
TableStructureOptions,
)
@@ -136,6 +137,11 @@ class ConvertDocumentsOptions(BaseModel):
),
] = TableStructureOptions().mode
pipeline: Annotated[
PdfPipeline,
Field(description="Choose the pipeline to process PDF or image files."),
] = PdfPipeline.STANDARD
page_range: Annotated[
PageRange,
Field(

View File

@@ -1,6 +1,7 @@
import hashlib
import json
import logging
import sys
from collections.abc import Iterable, Iterator
from functools import lru_cache
from pathlib import Path
@@ -18,10 +19,15 @@ from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
OcrOptions,
PdfBackend,
PdfPipeline,
PdfPipelineOptions,
TableFormerMode,
VlmPipelineOptions,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
)
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
from docling_core.types.doc import ImageRefMode
from docling_serve.datamodel.convert import ConvertDocumentsOptions, ocr_factory
@@ -84,10 +90,9 @@ def get_converter(pdf_format_option: PdfFormatOption) -> DocumentConverter:
return _get_converter_from_hash(options_hash)
# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
def get_pdf_pipeline_opts(
request: ConvertDocumentsOptions,
) -> PdfFormatOption:
def _parse_standard_pdf_opts(
request: ConvertDocumentsOptions, artifacts_path: Optional[Path]
) -> PdfPipelineOptions:
try:
ocr_options: OcrOptions = ocr_factory.create_options(
kind=request.ocr_engine.value, # type: ignore
@@ -110,6 +115,7 @@ def get_pdf_pipeline_opts(
ocr_options.lang = request.ocr_lang
pipeline_options = PdfPipelineOptions(
artifacts_path=artifacts_path,
document_timeout=request.document_timeout,
do_ocr=request.do_ocr,
ocr_options=ocr_options,
@@ -119,7 +125,6 @@ def get_pdf_pipeline_opts(
do_picture_classification=request.do_picture_classification,
do_picture_description=request.do_picture_description,
)
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
pipeline_options.table_structure_options.mode = TableFormerMode(request.table_mode)
if request.image_export_mode != ImageRefMode.PLACEHOLDER:
@@ -127,6 +132,10 @@ def get_pdf_pipeline_opts(
if request.images_scale:
pipeline_options.images_scale = request.images_scale
return pipeline_options
def _parse_backend(request: ConvertDocumentsOptions) -> type[PdfDocumentBackend]:
if request.pdf_backend == PdfBackend.DLPARSE_V1:
backend: type[PdfDocumentBackend] = DoclingParseDocumentBackend
elif request.pdf_backend == PdfBackend.DLPARSE_V2:
@@ -138,35 +147,78 @@ def get_pdf_pipeline_opts(
else:
raise RuntimeError(f"Unexpected PDF backend type {request.pdf_backend}")
return backend
def _parse_vlm_pdf_opts(
request: ConvertDocumentsOptions, artifacts_path: Optional[Path]
) -> VlmPipelineOptions:
pipeline_options = VlmPipelineOptions(
artifacts_path=artifacts_path,
document_timeout=request.document_timeout,
)
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
if sys.platform == "darwin":
try:
import mlx_vlm # noqa: F401
pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
except ImportError:
_log.warning(
"To run SmolDocling faster, please install mlx-vlm:\n"
"pip install mlx-vlm"
)
return pipeline_options
# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
def get_pdf_pipeline_opts(
request: ConvertDocumentsOptions,
) -> PdfFormatOption:
artifacts_path: Optional[Path] = None
if docling_serve_settings.artifacts_path is not None:
if str(docling_serve_settings.artifacts_path.absolute()) == "":
_log.info(
"artifacts_path is an empty path, model weights will be dowloaded "
"at runtime."
)
pipeline_options.artifacts_path = None
artifacts_path = None
elif docling_serve_settings.artifacts_path.is_dir():
_log.info(
"artifacts_path is set to a valid directory. "
"No model weights will be downloaded at runtime."
)
pipeline_options.artifacts_path = docling_serve_settings.artifacts_path
artifacts_path = docling_serve_settings.artifacts_path
else:
_log.warning(
"artifacts_path is set to an invalid directory. "
"The system will download the model weights at runtime."
)
pipeline_options.artifacts_path = None
artifacts_path = None
else:
_log.info(
"artifacts_path is unset. "
"The system will download the model weights at runtime."
)
pdf_format_option = PdfFormatOption(
pipeline_options=pipeline_options,
backend=backend,
)
pipeline_options: Union[PdfPipelineOptions, VlmPipelineOptions]
if request.pipeline == PdfPipeline.STANDARD:
pipeline_options = _parse_standard_pdf_opts(request, artifacts_path)
backend = _parse_backend(request)
pdf_format_option = PdfFormatOption(
pipeline_options=pipeline_options,
backend=backend,
)
elif request.pipeline == PdfPipeline.VLM:
pipeline_options = _parse_vlm_pdf_opts(request, artifacts_path)
pdf_format_option = PdfFormatOption(
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
)
else:
raise NotImplementedError(
f"The pipeline {request.pipeline} is not implemented."
)
return pdf_format_option

View File

@@ -13,6 +13,7 @@ import httpx
from docling.datamodel.pipeline_options import (
PdfBackend,
PdfPipeline,
TableFormerMode,
TableStructureOptions,
)
@@ -274,6 +275,7 @@ def process_url(
input_sources,
to_formats,
image_export_mode,
pipeline,
ocr,
force_ocr,
ocr_engine,
@@ -292,6 +294,7 @@ def process_url(
"options": {
"to_formats": to_formats,
"image_export_mode": image_export_mode,
"pipeline": pipeline,
"ocr": ocr,
"force_ocr": force_ocr,
"ocr_engine": ocr_engine,
@@ -344,6 +347,7 @@ def process_file(
file,
to_formats,
image_export_mode,
pipeline,
ocr,
force_ocr,
ocr_engine,
@@ -367,6 +371,7 @@ def process_file(
"options": {
"to_formats": to_formats,
"image_export_mode": image_export_mode,
"pipeline": pipeline,
"ocr": ocr,
"force_ocr": force_ocr,
"ocr_engine": ocr_engine,
@@ -579,6 +584,13 @@ with gr.Blocks(
label="Image Export Mode",
value="embedded",
)
with gr.Row():
with gr.Column(scale=1, min_width=200):
pipeline = gr.Radio(
[(v.value.capitalize(), v.value) for v in PdfPipeline],
label="Pipeline type",
value=PdfPipeline.STANDARD.value,
)
with gr.Row():
with gr.Column(scale=1, min_width=200):
ocr = gr.Checkbox(label="Enable OCR", value=True)
@@ -712,6 +724,7 @@ with gr.Blocks(
url_input,
to_formats,
image_export_mode,
pipeline,
ocr,
force_ocr,
ocr_engine,
@@ -798,6 +811,7 @@ with gr.Blocks(
file_input,
to_formats,
image_export_mode,
pipeline,
ocr,
force_ocr,
ocr_engine,

View File

@@ -30,7 +30,8 @@ classifiers = [
]
requires-python = ">=3.10"
dependencies = [
"docling~=2.28",
"docling[vlm]~=2.28",
"mlx-vlm~=0.1.12; sys_platform == 'darwin' and platform_machine == 'arm64'",
"fastapi[standard]~=0.115",
"httpx~=0.28",
"pydantic~=2.10",
@@ -196,6 +197,7 @@ module = [
"tesserocr.*",
"rapidocr_onnxruntime.*",
"requests.*",
"mlx_vlm.*",
]
ignore_missing_imports = true

1182
uv.lock generated

File diff suppressed because it is too large Load Diff