api v1alpha1 (#17)

* api v1alpha1 Signed-off-by: Guillaume Moutier <gmoutier@redhat.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use actual types in request models and refactor Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * make gradio optional and update README Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Run workflow jobs sequentially to avoid disk space outage (#19) Github Action runners are running out of the space while building both the images in parallel. This change will build the image sequentially and also clean up the cpu images before start building gpu image. Signed-off-by: Anil Vishnoi <vishnoianil@gmail.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Add github job to build image (and not publish) on PR creation (#20) Signed-off-by: Anil Vishnoi <vishnoianil@gmail.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add start_server script for local dev Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix 3.12-only syntax Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix more py3.10-11 compatibility Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rework output format and background tasks Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * specify return schemas for openapi Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add processing time and update README Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * lint markdown Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add MD033 to config Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use port 5000 Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use port 5001 as default Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * update deps Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * refactor input request Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * return docling document Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * update new payload in README Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add base64 example Signed-off-by: Michele 
Dolfi <dol@zurich.ibm.com> * wrap example in <details> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename /url in /source Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * move main execution to __main__ Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Guillaume Moutier <gmoutier@redhat.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Anil Vishnoi <vishnoianil@gmail.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Anil Vishnoi <vishnoianil@gmail.com>
2025-11-29 16:43:24 +00:00 · 2025-02-03 05:00:54 -05:00
parent ddf3144512
commit c6539c42de
25 changed files with 3642 additions and 1259 deletions
--- a/docling_serve/docling_conversion.py
+++ b/docling_serve/docling_conversion.py
@@ -0,0 +1,400 @@
+import base64
+import hashlib
+import json
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import (
+    Annotated,
+    Any,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+)
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import DocumentStream, InputFormat, OutputFormat
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    OcrEngine,
+    OcrOptions,
+    PdfBackend,
+    PdfPipelineOptions,
+    RapidOcrOptions,
+    TableFormerMode,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling_core.types.doc import ImageRefMode
+from fastapi import HTTPException
+from pydantic import BaseModel, Field
+
+from docling_serve.helper_functions import _to_list_of_strings
+
+_log = logging.getLogger(__name__)
+
+
+# Define the input options for the API
+#
+# Every field below carries its own OpenAPI description (and usually an
+# example); the default is always the value assigned after the annotation.
+# The class is documented with comments rather than a docstring because
+# pydantic copies a model's docstring into the generated schema description,
+# which would change the published API documentation.
+class ConvertDocumentsOptions(BaseModel):
+    # Accepted input formats; defaults to every member of InputFormat.
+    from_formats: Annotated[
+        List[InputFormat],
+        Field(
+            description=(
+                "Input format(s) to convert from. String or list of strings. "
+                f"Allowed values: {', '.join(v.value for v in InputFormat)}. "
+                "Optional, defaults to all formats."
+            ),
+            examples=[[v.value for v in InputFormat]],
+        ),
+    ] = list(InputFormat)
+
+    # Requested output formats; defaults to Markdown only.
+    to_formats: Annotated[
+        List[OutputFormat],
+        Field(
+            description=(
+                "Output format(s) to convert to. String or list of strings. "
+                f"Allowed values: {', '.join(v.value for v in OutputFormat)}. "
+                "Optional, defaults to Markdown."
+            ),
+            examples=[[OutputFormat.MARKDOWN]],
+        ),
+    ] = [OutputFormat.MARKDOWN]
+
+    # Any mode other than PLACEHOLDER makes get_pdf_pipeline_opts enable
+    # page image generation.
+    image_export_mode: Annotated[
+        ImageRefMode,
+        Field(
+            description=(
+                "Image export mode for the document (in case of JSON,"
+                " Markdown or HTML). "
+                f"Allowed values: {', '.join(v.value for v in ImageRefMode)}. "
+                "Optional, defaults to Embedded."
+            ),
+            examples=[ImageRefMode.EMBEDDED.value],
+            # pattern="embedded|placeholder|referenced",
+        ),
+    ] = ImageRefMode.EMBEDDED
+
+    # Forwarded to PdfPipelineOptions.do_ocr.
+    do_ocr: Annotated[
+        bool,
+        Field(
+            description=(
+                "If enabled, the bitmap content will be processed using OCR. "
+                "Boolean. Optional, defaults to true"
+            ),
+            # examples=[True],
+        ),
+    ] = True
+
+    # Forwarded as force_full_page_ocr to the selected OCR options class.
+    force_ocr: Annotated[
+        bool,
+        Field(
+            description=(
+                "If enabled, replace existing text with OCR-generated "
+                "text over content. Boolean. Optional, defaults to false."
+            ),
+            # examples=[False],
+        ),
+    ] = False
+
+    # TODO: use a restricted list based on what is installed on the system
+    # The three values named in the description are the ones handled by
+    # get_pdf_pipeline_opts; anything else raises RuntimeError there.
+    ocr_engine: Annotated[
+        OcrEngine,
+        Field(
+            description=(
+                "The OCR engine to use. String. "
+                "Allowed values: easyocr, tesseract, rapidocr. "
+                "Optional, defaults to easyocr."
+            ),
+            examples=[OcrEngine.EASYOCR],
+        ),
+    ] = OcrEngine.EASYOCR
+
+    # None (the default) leaves the language setting of the OCR options
+    # object untouched.
+    ocr_lang: Annotated[
+        Optional[List[str]],
+        Field(
+            description=(
+                "List of languages used by the OCR engine. "
+                "Note that each OCR engine has "
+                "different values for the language names. String or list of strings. "
+                "Optional, defaults to empty."
+            ),
+            examples=[["fr", "de", "es", "en"]],
+        ),
+    ] = None
+
+    # Selects the PDF backend class in get_pdf_pipeline_opts.
+    pdf_backend: Annotated[
+        PdfBackend,
+        Field(
+            description=(
+                "The PDF backend to use. String. "
+                f"Allowed values: {', '.join(v.value for v in PdfBackend)}. "
+                f"Optional, defaults to {PdfBackend.DLPARSE_V2.value}."
+            ),
+            examples=[PdfBackend.DLPARSE_V2],
+        ),
+    ] = PdfBackend.DLPARSE_V2
+
+    # The default is given only by the assignment, like every other field
+    # (it used to be repeated as a positional Field() argument as well).
+    table_mode: Annotated[
+        TableFormerMode,
+        Field(
+            description=(
+                "Mode to use for table structure, String. "
+                f"Allowed values: {', '.join(v.value for v in TableFormerMode)}. "
+                "Optional, defaults to fast."
+            ),
+            examples=[TableFormerMode.FAST],
+            # pattern="fast|accurate",
+        ),
+    ] = TableFormerMode.FAST
+
+    # NOTE(review): abort_on_error, return_as_file and include_images are not
+    # read anywhere in this module; presumably they are consumed by the
+    # callers of convert_documents - confirm.
+    abort_on_error: Annotated[
+        bool,
+        Field(
+            description=(
+                "Abort on error if enabled. Boolean. Optional, defaults to false."
+            ),
+            # examples=[False],
+        ),
+    ] = False
+
+    return_as_file: Annotated[
+        bool,
+        Field(
+            description=(
+                "Return the output as a zip file "
+                "(will happen anyway if multiple files are generated). "
+                "Boolean. Optional, defaults to false."
+            ),
+            examples=[False],
+        ),
+    ] = False
+
+    # Forwarded to PdfPipelineOptions.do_table_structure.
+    do_table_structure: Annotated[
+        bool,
+        Field(
+            description=(
+                "If enabled, the table structure will be extracted. "
+                "Boolean. Optional, defaults to true."
+            ),
+            examples=[True],
+        ),
+    ] = True
+
+    include_images: Annotated[
+        bool,
+        Field(
+            description=(
+                "If enabled, images will be extracted from the document. "
+                "Boolean. Optional, defaults to true."
+            ),
+            examples=[True],
+        ),
+    ] = True
+
+    # Only applied when page images are generated and the value is truthy.
+    images_scale: Annotated[
+        float,
+        Field(
+            description="Scale factor for images. Float. Optional, defaults to 2.0.",
+            examples=[2.0],
+        ),
+    ] = 2.0
+
+
+# Base model for the convert request payloads: it carries the conversion
+# options and falls back to a default ConvertDocumentsOptions() when the
+# caller does not provide any.
+class DocumentsConvertBase(BaseModel):
+    options: ConvertDocumentsOptions = ConvertDocumentsOptions()
+
+
+# One remote document to convert, identified by its URL.
+class HttpSource(BaseModel):
+    # Required: location of the document.
+    url: Annotated[
+        str,
+        Field(
+            description="HTTP url to process",
+            examples=["https://arxiv.org/pdf/2206.01062"],
+        ),
+    ]
+    # Optional extra request headers; defaults to an empty dict.
+    headers: Annotated[
+        Dict[str, Any],
+        Field(
+            description="Additional headers used to fetch the urls, "
+            "e.g. authorization, agent, etc"
+        ),
+    ] = {}
+
+
+# One document sent inline in the request body as base64 text, together
+# with the filename it should be processed under.
+class FileSource(BaseModel):
+    base64_string: Annotated[
+        str,
+        Field(
+            description=(
+                "Content of the file serialized in base64. "
+                "For example it can be obtained via "
+                "`base64 -w 0 /path/to/file/pdf-to-convert.pdf`."
+            ),
+        ),
+    ]
+    filename: Annotated[
+        str,
+        Field(
+            description="Filename of the uploaded document",
+            examples=["file.pdf"],
+        ),
+    ]
+
+    def to_document_stream(self) -> DocumentStream:
+        """Decode the base64 payload into an in-memory DocumentStream.
+
+        The returned stream is named after `filename`.
+        """
+        decoded = base64.b64decode(self.base64_string)
+        return DocumentStream(name=self.filename, stream=BytesIO(decoded))
+
+
+# Convert request whose documents are given as a list of HTTP sources
+# (the conversion options are inherited from DocumentsConvertBase).
+class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase):
+    http_sources: List[HttpSource]
+
+
+# Convert request whose documents are given inline as base64-encoded files
+# (the conversion options are inherited from DocumentsConvertBase).
+class ConvertDocumentFileSourcesRequest(DocumentsConvertBase):
+    file_sources: List[FileSource]
+
+
+# A convert request is either of the two payload shapes: inline files or
+# HTTP sources.
+ConvertDocumentsRequest = Union[
+    ConvertDocumentFileSourcesRequest, ConvertDocumentHttpSourcesRequest
+]
+
+
+# Cache of document converters, keyed by the hash of the serialized PDF format
+# options. Within this module it is filled lazily by convert_documents the
+# first time a given option set is requested, and entries are never removed.
+converters: Dict[str, DocumentConverter] = {}
+
+
+# Custom serializer for PdfFormatOption
+# (model_dump_json does not work with some classes)
+def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
+    """Return a JSON string describing `pdf_format_option`.
+
+    Keys are sorted so that equal options always produce the same string,
+    which makes the result usable as input for a hash.
+    """
+    payload = pdf_format_option.model_dump()
+
+    # model_dump leaves pipeline_options only partially serialized, so dump
+    # that sub-model on its own and overwrite the entry
+    pipeline_options = pdf_format_option.pipeline_options
+    if pipeline_options:
+        payload["pipeline_options"] = pipeline_options.model_dump()
+
+    # Replace the `pipeline_cls` and `backend` entries with their repr() strings
+    for class_key in ("pipeline_cls", "backend"):
+        payload[class_key] = repr(payload[class_key])
+
+    # Same treatment for `device` in `accelerator_options`, when both exist
+    if "device" in payload.get("accelerator_options", ()):
+        accelerator = payload["accelerator_options"]
+        accelerator["device"] = repr(accelerator["device"])
+
+    return json.dumps(payload, sort_keys=True)
+
+
+# Builds the HTTP 400 error raised when the selected OCR engine is not installed
+def _ocr_engine_unavailable(ocr_engine: OcrEngine) -> HTTPException:
+    return HTTPException(
+        status_code=400,
+        detail="The requested OCR engine"
+        f" (ocr_engine={ocr_engine.value})"
+        " is not available on this system. Please choose another OCR engine "
+        "or contact your system administrator.",
+    )
+
+
+# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
+def get_pdf_pipeline_opts(
+    request: ConvertDocumentsOptions,
+) -> Tuple[PdfFormatOption, str]:
+    """Translate the request options into a PdfFormatOption and its hash.
+
+    Args:
+        request: Conversion options received from the API caller.
+
+    Returns:
+        A tuple of the PdfFormatOption built from `request` and the SHA-1 hex
+        digest of its serialized form.
+
+    Raises:
+        HTTPException: With status 400 when the Python package of the
+            requested OCR engine cannot be imported.
+        RuntimeError: When `ocr_engine` or `pdf_backend` holds a value that
+            is not handled below.
+    """
+    # The engine packages are imported only as an availability probe, so that
+    # a missing optional dependency becomes a client-facing 400 error.
+    if request.ocr_engine == OcrEngine.EASYOCR:
+        try:
+            import easyocr  # noqa: F401
+        except ImportError as err:
+            raise _ocr_engine_unavailable(request.ocr_engine) from err
+        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=request.force_ocr)
+    elif request.ocr_engine == OcrEngine.TESSERACT:
+        try:
+            import tesserocr  # noqa: F401
+        except ImportError as err:
+            raise _ocr_engine_unavailable(request.ocr_engine) from err
+        ocr_options = TesseractOcrOptions(force_full_page_ocr=request.force_ocr)
+    elif request.ocr_engine == OcrEngine.RAPIDOCR:
+        try:
+            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
+        except ImportError as err:
+            raise _ocr_engine_unavailable(request.ocr_engine) from err
+        ocr_options = RapidOcrOptions(force_full_page_ocr=request.force_ocr)
+    else:
+        # NOTE(review): any other OcrEngine member passes request validation
+        # but ends up here - confirm whether that should be a 4xx instead.
+        raise RuntimeError(f"Unexpected OCR engine type {request.ocr_engine}")
+
+    if request.ocr_lang is not None:
+        # The model declares a list of strings; the str branch is kept as a
+        # defensive path for options objects built without validation.
+        if isinstance(request.ocr_lang, str):
+            ocr_options.lang = _to_list_of_strings(request.ocr_lang)
+        else:
+            ocr_options.lang = request.ocr_lang
+
+    pipeline_options = PdfPipelineOptions(
+        do_ocr=request.do_ocr,
+        ocr_options=ocr_options,
+        do_table_structure=request.do_table_structure,
+    )
+    # Cell matching is always enabled; it is not exposed as a request option
+    pipeline_options.table_structure_options.do_cell_matching = True
+    pipeline_options.table_structure_options.mode = TableFormerMode(request.table_mode)
+
+    # Page images are generated for every export mode except placeholder
+    if request.image_export_mode != ImageRefMode.PLACEHOLDER:
+        pipeline_options.generate_page_images = True
+        if request.images_scale:
+            pipeline_options.images_scale = request.images_scale
+
+    if request.pdf_backend == PdfBackend.DLPARSE_V1:
+        backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+    elif request.pdf_backend == PdfBackend.DLPARSE_V2:
+        backend = DoclingParseV2DocumentBackend
+    elif request.pdf_backend == PdfBackend.PYPDFIUM2:
+        backend = PyPdfiumDocumentBackend
+    else:
+        raise RuntimeError(f"Unexpected PDF backend type {request.pdf_backend}")
+
+    pdf_format_option = PdfFormatOption(
+        pipeline_options=pipeline_options,
+        backend=backend,
+    )
+
+    # The digest only identifies an option set (it is not a security feature)
+    serialized_data = _serialize_pdf_format_option(pdf_format_option)
+    options_hash = hashlib.sha1(serialized_data.encode()).hexdigest()
+
+    return pdf_format_option, options_hash
+
+
+def convert_documents(
+    sources: Iterable[Union[Path, str, DocumentStream]],
+    options: ConvertDocumentsOptions,
+    headers: Optional[Dict[str, Any]] = None,
+):
+    """Convert `sources` with a converter matching `options`.
+
+    One DocumentConverter is kept per distinct hash of the PDF format options
+    in the module-level `converters` dict: it is created on first use and
+    reused afterwards. Nothing here removes entries from that dict.
+
+    Returns whatever `convert_all` yields for `sources`, i.e. an iterator of
+    ConversionResult objects.
+    """
+    pdf_format_option, options_hash = get_pdf_pipeline_opts(options)
+
+    converter = converters.get(options_hash)
+    if converter is None:
+        # PDF and image inputs share the same format option
+        per_format: Dict[InputFormat, FormatOption] = {
+            fmt: pdf_format_option for fmt in (InputFormat.PDF, InputFormat.IMAGE)
+        }
+        converter = DocumentConverter(format_options=per_format)
+        converters[options_hash] = converter
+        _log.info(f"We now have {len(converters)} converters in memory.")
+
+    conversion_results: Iterator[ConversionResult] = converter.convert_all(
+        sources,
+        headers=headers,
+    )
+    return conversion_results