feat: upgrade endpoint to docling v2 (#13)

* upgrade endpoint to docling v2

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix Containerfile

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2026-03-21 16:40:06 +00:00 · 2024-12-19 11:41:44 +01:00
parent 3824aa6b2f
commit b00718bcc9
5 changed files with 2650 additions and 2008 deletions
--- a/4
+++ b/4
@@ -20,7 +20,7 @@ RUN if [ "$CPU_ONLY" = "true" ]; then \
 ENV HF_HOME=/tmp/
 ENV TORCH_HOME=/tmp/

-RUN poetry run python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
+RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'

 # On container environments, always set a thread budget to avoid undesired thread congestion.
 ENV OMP_NUM_THREADS=4
@@ -29,4 +29,4 @@ COPY ./docling_serve /docling-serve/docling_serve

 EXPOSE 5000

-CMD ["poetry", "run", "uvicorn", "--port", "5000", "docling_serve.app:app"]
+CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "docling_serve.app:app"]
--- a/docling_serve/app.py
+++ b/docling_serve/app.py
@@ -1,21 +1,55 @@
 import base64
+import hashlib
 from contextlib import asynccontextmanager
+from enum import Enum
 from io import BytesIO
-from pathlib import Path
-from typing import Any, Dict, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 import httpx
 from docling.datamodel.base_models import (
    ConversionStatus,
    DocumentStream,
-    PipelineOptions,
+    ErrorItem,
+    InputFormat,
 )
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    OcrOptions,
+    PdfPipelineOptions,
+    RapidOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.utils.profiling import ProfilingItem
+from docling_core.types.doc import DoclingDocument, ImageRefMode
+from docling_core.utils.file import resolve_remote_filename
+from fastapi import FastAPI, HTTPException, Response
+from pydantic import AnyHttpUrl, BaseModel

-from docling_serve.settings import Settings
+
+# TODO: import enum from Docling, once it is exposed
+class OcrEngine(str, Enum):
+    """OCR backends a conversion request can select through ``ocr_engine``."""
+    EASYOCR = "easyocr"
+    TESSERACT = "tesseract"
+    RAPIDOCR = "rapidocr"
+
+
+class ConvertOptions(BaseModel):
+    """Per-request options: which outputs to return and how the PDF pipeline is set up."""
+    # Include the DoclingDocument in the /convert response.
+    output_docling_document: bool = True
+    # Include a Markdown export in the /convert response.
+    output_markdown: bool = False
+    # Include an HTML export in the /convert response.
+    output_html: bool = False
+    # Forwarded to PdfPipelineOptions.do_ocr.
+    do_ocr: bool = True
+    # Selects which OCR options class the pipeline is built with.
+    ocr_engine: OcrEngine = OcrEngine.EASYOCR
+    # When not None, assigned to the OCR options' ``lang`` attribute.
+    ocr_lang: Optional[List[str]] = None
+    # Forwarded to the OCR options as force_full_page_ocr.
+    force_ocr: bool = False
+    # Forwarded to PdfPipelineOptions.do_table_structure.
+    do_table_structure: bool = True
+    # Sets generate_page_images / generate_picture_images, and makes exports use
+    # ImageRefMode.EMBEDDED (True) or ImageRefMode.PLACEHOLDER (False).
+    include_images: bool = True
+    # Forwarded to PdfPipelineOptions.images_scale.
+    images_scale: float = 2.0
+
+
+class DocumentConvertBase(BaseModel):
+    """Base model for conversion request bodies; holds the shared ``options`` field."""
+    # Defaults to a ConvertOptions instance built with all of its own defaults.
+    options: ConvertOptions = ConvertOptions()


 class HttpSource(BaseModel):
@@ -28,16 +62,30 @@ class FileSource(BaseModel):
    filename: str


-class ConvertDocumentHttpSourceRequest(BaseModel):
+class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
    http_source: HttpSource


-class ConvertDocumentFileSourceRequest(BaseModel):
+class ConvertDocumentFileSourceRequest(DocumentConvertBase):
    file_source: FileSource


+class DocumentResponse(BaseModel):
+    """Converted document in each supported output format; every field defaults to None."""
+    markdown: Optional[str] = None
+    docling_document: Optional[DoclingDocument] = None
+    html: Optional[str] = None
+
+
 class ConvertDocumentResponse(BaseModel):
-    content_md: str
+    document: DocumentResponse
+    status: ConversionStatus
+    errors: List[ErrorItem] = []
+    timings: Dict[str, ProfilingItem] = {}
+
+
+class ConvertDocumentErrorResponse(BaseModel):
+    """Error payload that currently carries only the conversion status."""
+    status: ConversionStatus
+    # The errors list is disabled for now; only the status is part of the model.
+    # errors: List[ErrorItem] = []


 ConvertDocumentRequest = Union[
@@ -45,20 +93,93 @@ ConvertDocumentRequest = Union[
 ]


-models = {}
+class MarkdownTextResponse(Response):
+    """Response subclass whose media type is ``text/markdown``."""
+    media_type = "text/markdown"
+
+
+class HealthCheckResponse(BaseModel):
+    """Body returned by the ``/health`` endpoint."""
+    # Constant "ok" unless a caller overrides it.
+    status: str = "ok"
+
+
+def _ocr_engine_unavailable(engine: OcrEngine) -> HTTPException:
+    """Build the HTTP 400 error reported when an OCR engine's package is missing."""
+    return HTTPException(
+        status_code=400,
+        detail="The requested OCR engine"
+        f" (ocr_engine={engine.value})"
+        " is not available on this system. Please choose another OCR engine "
+        "or contact your system administrator.",
+    )
+
+
+def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
+    """Translate request options into PDF pipeline options plus a hash of them.
+
+    Args:
+        options: The conversion options supplied with the request.
+
+    Returns:
+        A tuple ``(pipeline_options, options_hash)`` where ``options_hash`` is the
+        SHA-1 hex digest of the pipeline options' JSON dump. Callers in this module
+        use it as the key into the ``converters`` dict.
+
+    Raises:
+        HTTPException: status 400 when the package backing the selected OCR engine
+            cannot be imported.
+        RuntimeError: when ``options.ocr_engine`` is not a known ``OcrEngine`` member.
+    """
+    # Each branch imports the engine's package only to probe that it is installed;
+    # the imported name itself is not used (hence the F401 suppressions).
+    if options.ocr_engine == OcrEngine.EASYOCR:
+        try:
+            import easyocr  # noqa: F401
+        except ImportError as err:
+            raise _ocr_engine_unavailable(options.ocr_engine) from err
+        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=options.force_ocr)
+    elif options.ocr_engine == OcrEngine.TESSERACT:
+        try:
+            import tesserocr  # noqa: F401
+        except ImportError as err:
+            raise _ocr_engine_unavailable(options.ocr_engine) from err
+        ocr_options = TesseractOcrOptions(force_full_page_ocr=options.force_ocr)
+    elif options.ocr_engine == OcrEngine.RAPIDOCR:
+        try:
+            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
+        except ImportError as err:
+            raise _ocr_engine_unavailable(options.ocr_engine) from err
+        ocr_options = RapidOcrOptions(force_full_page_ocr=options.force_ocr)
+    else:
+        raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")
+
+    # Only override the language list when the request provides one.
+    if options.ocr_lang is not None:
+        ocr_options.lang = options.ocr_lang
+
+    pipeline_options = PdfPipelineOptions(
+        do_ocr=options.do_ocr,
+        ocr_options=ocr_options,
+        do_table_structure=options.do_table_structure,
+        generate_page_images=options.include_images,
+        generate_picture_images=options.include_images,
+        images_scale=options.images_scale,
+    )
+
+    # The digest identifies a pipeline configuration; it is a lookup key only.
+    options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()
+
+    return pipeline_options, options_hash
+
+
+converters: Dict[str, DocumentConverter] = {}


@asynccontextmanager
 async def lifespan(app: FastAPI):
-    # Converter
-    settings = Settings()
-    pipeline_options = PipelineOptions()
-    pipeline_options.do_ocr = settings.do_ocr
-    pipeline_options.do_table_structure = settings.do_table_structure
-    models["converter"] = DocumentConverter(pipeline_options=pipeline_options)
+    # settings = Settings()
+
+    # Converter with default options
+    pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
+    converters[options_hash] = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
+            InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
+        }
+    )
+
+    converters[options_hash].initialize_pipeline(InputFormat.PDF)
+
    yield

-    models.clear()
+    converters.clear()


 app = FastAPI(
@@ -67,10 +188,14 @@ app = FastAPI(
 )


-@app.post("/convert")
-def convert_pdf_document(
+@app.get("/health")
+def health() -> HealthCheckResponse:
+    """Return the default health-check payload (``status`` is ``"ok"``)."""
+    payload = HealthCheckResponse()
+    return payload
+
+
+def _convert_document(
    body: ConvertDocumentRequest,
-) -> ConvertDocumentResponse:
+) -> ConversionResult:

    filename: str
    buf: BytesIO
@@ -81,16 +206,74 @@ def convert_pdf_document(
    elif isinstance(body, ConvertDocumentHttpSourceRequest):
        http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
        buf = BytesIO(http_res.content)
-        filename = Path(
-            body.http_source.url
-        ).name  # TODO: use better way to detect filename, e.g. from Content-Disposition
+        filename = resolve_remote_filename(
+            http_url=AnyHttpUrl(body.http_source.url),
+            response_headers=dict(**http_res.headers),
+        )

-    docs_input = DocumentConversionInput.from_streams(
-        [DocumentStream(filename=filename, stream=buf)]
+    doc_input = DocumentStream(name=filename, stream=buf)
+
+    pipeline_options, options_hash = get_pdf_pipeline_opts(body.options)
+    if options_hash not in converters:
+        converters[options_hash] = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
+                InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
+            }
+        )
+
+    result: ConversionResult = converters[options_hash].convert(doc_input)
+
+    if result is None or result.status == ConversionStatus.SKIPPED:
+        raise HTTPException(status_code=400, detail=result.errors)
+
+    if result is None or result.status not in {
+        ConversionStatus.SUCCESS,
+    }:
+        raise HTTPException(
+            status_code=500, detail={"errors": result.errors, "status": result.status}
+        )
+
+    return result
+
+
+@app.post(
+    "/convert",
+)
+def convert_document(
+    body: ConvertDocumentRequest,
+) -> ConvertDocumentResponse:
+
+    result = _convert_document(body=body)
+
+    image_mode = (
+        ImageRefMode.EMBEDDED
+        if body.options.include_images
+        else ImageRefMode.PLACEHOLDER
    )
-    result: ConversionResult = next(models["converter"].convert(docs_input), None)
+    doc_resp = DocumentResponse()
+    if body.options.output_docling_document:
+        doc_resp.docling_document = result.document
+    if body.options.output_markdown:
+        doc_resp.markdown = result.document.export_to_markdown(image_mode=image_mode)
+    if body.options.output_html:
+        doc_resp.html = result.document.export_to_html(image_mode=image_mode)

-    if result is None or result.status != ConversionStatus.SUCCESS:
-        raise HTTPException(status_code=500, detail={"errors": result.errors})
+    return ConvertDocumentResponse(
+        document=doc_resp, status=result.status, timings=result.timings
+    )

-    return ConvertDocumentResponse(content_md=result.render_as_markdown())
+
+@app.post("/convert/markdown", response_class=MarkdownTextResponse)
+def convert_document_md(
+    body: ConvertDocumentRequest,
+) -> MarkdownTextResponse:
+    """Convert the requested document and return its Markdown export as the body.
+
+    Conversion failures surface as the HTTPException raised by
+    ``_convert_document``.
+    """
+    conversion = _convert_document(body=body)
+
+    # Image references are EMBEDDED only when the request asks for images.
+    if body.options.include_images:
+        image_mode = ImageRefMode.EMBEDDED
+    else:
+        image_mode = ImageRefMode.PLACEHOLDER
+
+    markdown = conversion.document.export_to_markdown(image_mode=image_mode)
+    return MarkdownTextResponse(markdown)
--- a/docling_serve/settings.py
+++ b/docling_serve/settings.py
@@ -2,7 +2,5 @@ from pydantic_settings import BaseSettings, SettingsConfigDict


 class Settings(BaseSettings):
-    do_ocr: bool = True
-    do_table_structure: bool = True

    model_config = SettingsConfigDict(env_prefix="DOCLING_")
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,12 +30,26 @@ classifiers = [
 ]

 [tool.poetry.dependencies]
-python = "^3.10"
-docling = "^1.11.0"
-fastapi = {version = "^0.110.2", extras = ["standard"]}
-uvicorn = "^0.30.6"
+python = "^3.9"
+docling = "^2.10.0"
+fastapi = {version = "^0.115.6", extras = ["standard"]}
+uvicorn = "^0.32.1"
 pydantic-settings = "^2.4.0"
-httpx = "^0.27.2"
+httpx = "^0.28.1"
+tesserocr = { version = "^2.7.1", optional = true }
+rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
+onnxruntime = [
+  # 1.19.2 is the last version with python3.9 support,
+  # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
+  { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
+  { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
+]
+
+
+[tool.poetry.extras]
+tesserocr = ["tesserocr"]
+rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
+

 [tool.poetry.group.pypi-torch]
 optional = false
@@ -63,6 +77,12 @@ torchvision = [
    {markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.12"', url="https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp312-cp312-linux_x86_64.whl"},
 ]

+[tool.poetry.group.constraints.dependencies]
+numpy = [
+    { version = "^2.1.0", markers = 'python_version >= "3.13"' },
+    { version = "^1.24.4", markers = 'python_version < "3.13"' },
+]
+
 [tool.poetry.group.dev.dependencies]
 black = "^24.8.0"
 isort = "^5.13.2"
@@ -93,8 +113,17 @@ remove-unused-variables = true
 expand-star-imports = true
 recursive = true

+[tool.mypy]
+pretty = true
+# strict = true
+no_implicit_optional = true
+plugins = "pydantic.mypy"
+python_version = "3.10"
+
 [[tool.mypy.overrides]]
 module = [
-    "docling.*",
+    "easyocr.*",
+    "tesserocr.*",
+    "rapidocr_onnxruntime.*",
 ]
 ignore_missing_imports = true