feat: Add option for vlm pipeline (#143)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2026-04-23 18:12:56 +00:00 · 2025-04-22 14:46:33 +02:00
parent 6b3d281f02
commit ee89ee4dae
5 changed files with 1178 additions and 104 deletions
--- a/docling_serve/datamodel/convert.py
+++ b/docling_serve/datamodel/convert.py
@@ -7,6 +7,7 @@ from docling.datamodel.base_models import InputFormat, OutputFormat
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    PdfBackend,
+    PdfPipeline,
    TableFormerMode,
    TableStructureOptions,
 )
@@ -136,6 +137,11 @@ class ConvertDocumentsOptions(BaseModel):
        ),
    ] = TableStructureOptions().mode

+    pipeline: Annotated[
+        PdfPipeline,
+        Field(description="Choose the pipeline to process PDF or image files."),
+    ] = PdfPipeline.STANDARD
+
    page_range: Annotated[
        PageRange,
        Field(
--- a/docling_serve/docling_conversion.py
+++ b/docling_serve/docling_conversion.py
@@ -1,6 +1,7 @@
 import hashlib
 import json
 import logging
+import sys
 from collections.abc import Iterable, Iterator
 from functools import lru_cache
 from pathlib import Path
@@ -18,10 +19,15 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    OcrOptions,
    PdfBackend,
+    PdfPipeline,
    PdfPipelineOptions,
    TableFormerMode,
+    VlmPipelineOptions,
+    smoldocling_vlm_conversion_options,
+    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.pipeline.vlm_pipeline import VlmPipeline
 from docling_core.types.doc import ImageRefMode

 from docling_serve.datamodel.convert import ConvertDocumentsOptions, ocr_factory
@@ -84,10 +90,9 @@ def get_converter(pdf_format_option: PdfFormatOption) -> DocumentConverter:
    return _get_converter_from_hash(options_hash)


-# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
-def get_pdf_pipeline_opts(
-    request: ConvertDocumentsOptions,
-) -> PdfFormatOption:
+def _parse_standard_pdf_opts(
+    request: ConvertDocumentsOptions, artifacts_path: Optional[Path]
+) -> PdfPipelineOptions:
    try:
        ocr_options: OcrOptions = ocr_factory.create_options(
            kind=request.ocr_engine.value,  # type: ignore
@@ -110,6 +115,7 @@ def get_pdf_pipeline_opts(
            ocr_options.lang = request.ocr_lang

    pipeline_options = PdfPipelineOptions(
+        artifacts_path=artifacts_path,
        document_timeout=request.document_timeout,
        do_ocr=request.do_ocr,
        ocr_options=ocr_options,
@@ -119,7 +125,6 @@ def get_pdf_pipeline_opts(
        do_picture_classification=request.do_picture_classification,
        do_picture_description=request.do_picture_description,
    )
-    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
    pipeline_options.table_structure_options.mode = TableFormerMode(request.table_mode)

    if request.image_export_mode != ImageRefMode.PLACEHOLDER:
@@ -127,6 +132,10 @@ def get_pdf_pipeline_opts(
        if request.images_scale:
            pipeline_options.images_scale = request.images_scale

+    return pipeline_options
+
+
+def _parse_backend(request: ConvertDocumentsOptions) -> type[PdfDocumentBackend]:
    if request.pdf_backend == PdfBackend.DLPARSE_V1:
        backend: type[PdfDocumentBackend] = DoclingParseDocumentBackend
    elif request.pdf_backend == PdfBackend.DLPARSE_V2:
@@ -138,35 +147,78 @@ def get_pdf_pipeline_opts(
    else:
        raise RuntimeError(f"Unexpected PDF backend type {request.pdf_backend}")

+    return backend
+
+
+def _parse_vlm_pdf_opts(
+    request: ConvertDocumentsOptions, artifacts_path: Optional[Path]
+) -> VlmPipelineOptions:
+    pipeline_options = VlmPipelineOptions(
+        artifacts_path=artifacts_path,
+        document_timeout=request.document_timeout,
+    )
+    pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+    if sys.platform == "darwin":
+        try:
+            import mlx_vlm  # noqa: F401
+
+            pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
+        except ImportError:
+            _log.warning(
+                "To run SmolDocling faster, please install mlx-vlm:\n"
+                "pip install mlx-vlm"
+            )
+    return pipeline_options
+
+
+# Computes the PDF pipeline options and returns the PdfFormatOption
+def get_pdf_pipeline_opts(
+    request: ConvertDocumentsOptions,
+) -> PdfFormatOption:
+    artifacts_path: Optional[Path] = None
    if docling_serve_settings.artifacts_path is not None:
        if str(docling_serve_settings.artifacts_path.absolute()) == "":
            _log.info(
                "artifacts_path is an empty path, model weights will be dowloaded "
                "at runtime."
            )
-            pipeline_options.artifacts_path = None
+            artifacts_path = None
        elif docling_serve_settings.artifacts_path.is_dir():
            _log.info(
                "artifacts_path is set to a valid directory. "
                "No model weights will be downloaded at runtime."
            )
-            pipeline_options.artifacts_path = docling_serve_settings.artifacts_path
+            artifacts_path = docling_serve_settings.artifacts_path
        else:
            _log.warning(
                "artifacts_path is set to an invalid directory. "
                "The system will download the model weights at runtime."
            )
-            pipeline_options.artifacts_path = None
+            artifacts_path = None
    else:
        _log.info(
            "artifacts_path is unset. "
            "The system will download the model weights at runtime."
        )

-    pdf_format_option = PdfFormatOption(
-        pipeline_options=pipeline_options,
-        backend=backend,
-    )
+    pipeline_options: Union[PdfPipelineOptions, VlmPipelineOptions]
+    if request.pipeline == PdfPipeline.STANDARD:
+        pipeline_options = _parse_standard_pdf_opts(request, artifacts_path)
+        backend = _parse_backend(request)
+        pdf_format_option = PdfFormatOption(
+            pipeline_options=pipeline_options,
+            backend=backend,
+        )
+
+    elif request.pipeline == PdfPipeline.VLM:
+        pipeline_options = _parse_vlm_pdf_opts(request, artifacts_path)
+        pdf_format_option = PdfFormatOption(
+            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
+        )
+    else:
+        raise NotImplementedError(
+            f"The pipeline {request.pipeline} is not implemented."
+        )

    return pdf_format_option

--- a/docling_serve/gradio_ui.py
+++ b/docling_serve/gradio_ui.py
@@ -13,6 +13,7 @@ import httpx

 from docling.datamodel.pipeline_options import (
    PdfBackend,
+    PdfPipeline,
    TableFormerMode,
    TableStructureOptions,
 )
@@ -274,6 +275,7 @@ def process_url(
    input_sources,
    to_formats,
    image_export_mode,
+    pipeline,
    ocr,
    force_ocr,
    ocr_engine,
@@ -292,6 +294,7 @@ def process_url(
        "options": {
            "to_formats": to_formats,
            "image_export_mode": image_export_mode,
+            "pipeline": pipeline,
            "ocr": ocr,
            "force_ocr": force_ocr,
            "ocr_engine": ocr_engine,
@@ -344,6 +347,7 @@ def process_file(
    file,
    to_formats,
    image_export_mode,
+    pipeline,
    ocr,
    force_ocr,
    ocr_engine,
@@ -367,6 +371,7 @@ def process_file(
        "options": {
            "to_formats": to_formats,
            "image_export_mode": image_export_mode,
+            "pipeline": pipeline,
            "ocr": ocr,
            "force_ocr": force_ocr,
            "ocr_engine": ocr_engine,
@@ -579,6 +584,13 @@ with gr.Blocks(
                    label="Image Export Mode",
                    value="embedded",
                )
+        with gr.Row():
+            with gr.Column(scale=1, min_width=200):
+                pipeline = gr.Radio(
+                    [(v.value.capitalize(), v.value) for v in PdfPipeline],
+                    label="Pipeline type",
+                    value=PdfPipeline.STANDARD.value,
+                )
        with gr.Row():
            with gr.Column(scale=1, min_width=200):
                ocr = gr.Checkbox(label="Enable OCR", value=True)
@@ -712,6 +724,7 @@ with gr.Blocks(
            url_input,
            to_formats,
            image_export_mode,
+            pipeline,
            ocr,
            force_ocr,
            ocr_engine,
@@ -798,6 +811,7 @@ with gr.Blocks(
            file_input,
            to_formats,
            image_export_mode,
+            pipeline,
            ocr,
            force_ocr,
            ocr_engine,
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,8 @@ classifiers = [
 ]
 requires-python = ">=3.10"
 dependencies = [
-    "docling~=2.28",
+    "docling[vlm]~=2.28",
+    "mlx-vlm~=0.1.12; sys_platform == 'darwin' and platform_machine == 'arm64'",
    "fastapi[standard]~=0.115",
    "httpx~=0.28",
    "pydantic~=2.10",
@@ -196,6 +197,7 @@ module = [
    "tesserocr.*",
    "rapidocr_onnxruntime.*",
    "requests.*",
+    "mlx_vlm.*",
 ]
 ignore_missing_imports = true

--- a/uv.lock
+++ b/uv.lock