feat: upgrade endpoint to docling v2 (#13)

* upgrade endpoint to docling v2

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix Containerfile

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2024-12-19 11:41:44 +01:00
committed by GitHub
parent 3824aa6b2f
commit b00718bcc9
5 changed files with 2650 additions and 2008 deletions

View File

@@ -20,7 +20,7 @@ RUN if [ "$CPU_ONLY" = "true" ]; then \
ENV HF_HOME=/tmp/
ENV TORCH_HOME=/tmp/
RUN poetry run python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
# On container environments, always set a thread budget to avoid undesired thread congestion.
ENV OMP_NUM_THREADS=4
@@ -29,4 +29,4 @@ COPY ./docling_serve /docling-serve/docling_serve
EXPOSE 5000
CMD ["poetry", "run", "uvicorn", "--port", "5000", "docling_serve.app:app"]
CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "docling_serve.app:app"]

View File

@@ -1,21 +1,55 @@
import base64
import hashlib
from contextlib import asynccontextmanager
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, Union
from typing import Any, Dict, List, Optional, Tuple, Union
import httpx
from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
PipelineOptions,
ErrorItem,
InputFormat,
)
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrOptions,
PdfPipelineOptions,
RapidOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.profiling import ProfilingItem
from docling_core.types.doc import DoclingDocument, ImageRefMode
from docling_core.utils.file import resolve_remote_filename
from fastapi import FastAPI, HTTPException, Response
from pydantic import AnyHttpUrl, BaseModel
from docling_serve.settings import Settings
# TODO: import enum from Docling, once it is exposed
class OcrEngine(str, Enum):
    """OCR backends a client can select through ``ConvertOptions.ocr_engine``.

    The members are plain strings (the class mixes in ``str``), so the
    lowercase values below are what appears in request payloads.
    """

    EASYOCR = "easyocr"
    TESSERACT = "tesseract"
    RAPIDOCR = "rapidocr"
class ConvertOptions(BaseModel):
    """Per-request conversion settings.

    The output_* flags choose which representations the /convert endpoint
    fills in; the remaining fields are translated into PDF pipeline options.
    """

    # Which representations of the converted document to return.
    output_docling_document: bool = True
    output_markdown: bool = False
    output_html: bool = False
    # Forwarded as PdfPipelineOptions.do_ocr.
    do_ocr: bool = True
    # Selects which OCR options class is instantiated for the pipeline.
    ocr_engine: OcrEngine = OcrEngine.EASYOCR
    # When not None, overrides the ``lang`` attribute of the OCR options.
    ocr_lang: Optional[List[str]] = None
    # Forwarded to the OCR options as force_full_page_ocr.
    force_ocr: bool = False
    # Forwarded as PdfPipelineOptions.do_table_structure.
    do_table_structure: bool = True
    # Enables page and picture image generation; exports then use embedded
    # images instead of placeholders.
    include_images: bool = True
    # Forwarded as PdfPipelineOptions.images_scale.
    images_scale: float = 2.0
class DocumentConvertBase(BaseModel):
    """Common base of the convert request models; carries the options."""

    # Defaults to a ConvertOptions built with all of its field defaults.
    options: ConvertOptions = ConvertOptions()
class HttpSource(BaseModel):
@@ -28,16 +62,30 @@ class FileSource(BaseModel):
filename: str
class ConvertDocumentHttpSourceRequest(BaseModel):
class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
http_source: HttpSource
class ConvertDocumentFileSourceRequest(BaseModel):
class ConvertDocumentFileSourceRequest(DocumentConvertBase):
file_source: FileSource
class DocumentResponse(BaseModel):
    """Converted document in the requested representations.

    Every field defaults to None; the /convert handler sets only the ones
    enabled through the request's output_* options.
    """

    markdown: Optional[str] = None
    docling_document: Optional[DoclingDocument] = None
    html: Optional[str] = None
class ConvertDocumentResponse(BaseModel):
content_md: str
document: DocumentResponse
status: ConversionStatus
errors: List[ErrorItem] = []
timings: Dict[str, ProfilingItem] = {}
class ConvertDocumentErrorResponse(BaseModel):
    """Error payload model that currently carries only the conversion status."""

    status: ConversionStatus
    # NOTE(review): the error list is disabled below; confirm whether it
    # should be re-enabled to match ConvertDocumentResponse.errors.
    # errors: List[ErrorItem] = []
ConvertDocumentRequest = Union[
@@ -45,20 +93,93 @@ ConvertDocumentRequest = Union[
]
models = {}
class MarkdownTextResponse(Response):
    """Response subclass whose media type is ``text/markdown``."""

    media_type = "text/markdown"
class HealthCheckResponse(BaseModel):
    """Body returned by the /health endpoint."""

    # Defaults to "ok"; the health handler returns the model unmodified.
    status: str = "ok"
def _ocr_engine_unavailable(engine: OcrEngine) -> HTTPException:
    """Build the 400 error reported when *engine*'s package cannot be imported."""
    return HTTPException(
        status_code=400,
        detail="The requested OCR engine"
        f" (ocr_engine={engine.value})"
        " is not available on this system. Please choose another OCR engine "
        "or contact your system administrator.",
    )


def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
    """Translate request options into PDF pipeline options plus a cache key.

    Args:
        options: The per-request conversion options.

    Returns:
        A tuple ``(pipeline_options, options_hash)`` where ``options_hash`` is
        the SHA-1 hex digest of the JSON dump of ``pipeline_options``. Callers
        use it as the key under which a matching converter is cached.

    Raises:
        HTTPException: With status 400 when the Python package backing the
            requested OCR engine cannot be imported.
        RuntimeError: When ``options.ocr_engine`` is not a known engine.
    """
    engine = options.ocr_engine
    ocr_options: OcrOptions

    # Each branch probes for the engine's package first, so a missing optional
    # dependency is reported as a client error rather than failing later.
    # NOTE(review): the probe also runs when options.do_ocr is False; confirm
    # whether a disabled OCR step should still require the engine package.
    if engine == OcrEngine.EASYOCR:
        try:
            import easyocr  # noqa: F401
        except ImportError as exc:
            raise _ocr_engine_unavailable(engine) from exc
        ocr_options = EasyOcrOptions(force_full_page_ocr=options.force_ocr)
    elif engine == OcrEngine.TESSERACT:
        try:
            import tesserocr  # noqa: F401
        except ImportError as exc:
            raise _ocr_engine_unavailable(engine) from exc
        ocr_options = TesseractOcrOptions(force_full_page_ocr=options.force_ocr)
    elif engine == OcrEngine.RAPIDOCR:
        try:
            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
        except ImportError as exc:
            raise _ocr_engine_unavailable(engine) from exc
        ocr_options = RapidOcrOptions(force_full_page_ocr=options.force_ocr)
    else:
        raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")

    # Keep the engine's own default languages unless the request names some.
    if options.ocr_lang is not None:
        ocr_options.lang = options.ocr_lang

    pipeline_options = PdfPipelineOptions(
        do_ocr=options.do_ocr,
        ocr_options=ocr_options,
        do_table_structure=options.do_table_structure,
        generate_page_images=options.include_images,
        generate_picture_images=options.include_images,
        images_scale=options.images_scale,
    )

    # The digest identifies this exact option set, not a security boundary.
    options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()

    return pipeline_options, options_hash
converters: Dict[str, DocumentConverter] = {}
@asynccontextmanager
async def lifespan(app: FastAPI):
# Converter
settings = Settings()
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = settings.do_ocr
pipeline_options.do_table_structure = settings.do_table_structure
models["converter"] = DocumentConverter(pipeline_options=pipeline_options)
# settings = Settings()
# Converter with default options
pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
converters[options_hash] = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
}
)
converters[options_hash].initialize_pipeline(InputFormat.PDF)
yield
models.clear()
converters.clear()
app = FastAPI(
@@ -67,10 +188,14 @@ app = FastAPI(
)
@app.post("/convert")
def convert_pdf_document(
@app.get("/health")
def health() -> HealthCheckResponse:
    """Return a default-constructed health payload."""
    payload = HealthCheckResponse()
    return payload
def _convert_document(
body: ConvertDocumentRequest,
) -> ConvertDocumentResponse:
) -> ConversionResult:
filename: str
buf: BytesIO
@@ -81,16 +206,74 @@ def convert_pdf_document(
elif isinstance(body, ConvertDocumentHttpSourceRequest):
http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
buf = BytesIO(http_res.content)
filename = Path(
body.http_source.url
).name # TODO: use better way to detect filename, e.g. from Content-Disposition
filename = resolve_remote_filename(
http_url=AnyHttpUrl(body.http_source.url),
response_headers=dict(**http_res.headers),
)
docs_input = DocumentConversionInput.from_streams(
[DocumentStream(filename=filename, stream=buf)]
doc_input = DocumentStream(name=filename, stream=buf)
pipeline_options, options_hash = get_pdf_pipeline_opts(body.options)
if options_hash not in converters:
converters[options_hash] = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
}
)
result: ConversionResult = converters[options_hash].convert(doc_input)
if result is None or result.status == ConversionStatus.SKIPPED:
raise HTTPException(status_code=400, detail=result.errors)
if result is None or result.status not in {
ConversionStatus.SUCCESS,
}:
raise HTTPException(
status_code=500, detail={"errors": result.errors, "status": result.status}
)
return result
@app.post(
"/convert",
)
def convert_document(
body: ConvertDocumentRequest,
) -> ConvertDocumentResponse:
result = _convert_document(body=body)
image_mode = (
ImageRefMode.EMBEDDED
if body.options.include_images
else ImageRefMode.PLACEHOLDER
)
result: ConversionResult = next(models["converter"].convert(docs_input), None)
doc_resp = DocumentResponse()
if body.options.output_docling_document:
doc_resp.docling_document = result.document
if body.options.output_markdown:
doc_resp.markdown = result.document.export_to_markdown(image_mode=image_mode)
if body.options.output_html:
doc_resp.html = result.document.export_to_html(image_mode=image_mode)
if result is None or result.status != ConversionStatus.SUCCESS:
raise HTTPException(status_code=500, detail={"errors": result.errors})
return ConvertDocumentResponse(
document=doc_resp, status=result.status, timings=result.timings
)
return ConvertDocumentResponse(content_md=result.render_as_markdown())
@app.post("/convert/markdown", response_class=MarkdownTextResponse)
def convert_document_md(
    body: ConvertDocumentRequest,
) -> MarkdownTextResponse:
    """Convert the requested document and return it as a Markdown body.

    Images are embedded in the Markdown when ``body.options.include_images``
    is set; otherwise placeholders are emitted.
    """
    conversion = _convert_document(body=body)

    if body.options.include_images:
        image_mode = ImageRefMode.EMBEDDED
    else:
        image_mode = ImageRefMode.PLACEHOLDER

    markdown = conversion.document.export_to_markdown(image_mode=image_mode)
    return MarkdownTextResponse(markdown)

View File

@@ -2,7 +2,5 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
do_ocr: bool = True
do_table_structure: bool = True
model_config = SettingsConfigDict(env_prefix="DOCLING_")

4366
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -30,12 +30,26 @@ classifiers = [
]
[tool.poetry.dependencies]
python = "^3.10"
docling = "^1.11.0"
fastapi = {version = "^0.110.2", extras = ["standard"]}
uvicorn = "^0.30.6"
python = "^3.9"
docling = "^2.10.0"
fastapi = {version = "^0.115.6", extras = ["standard"]}
uvicorn = "^0.32.1"
pydantic-settings = "^2.4.0"
httpx = "^0.27.2"
httpx = "^0.28.1"
tesserocr = { version = "^2.7.1", optional = true }
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
onnxruntime = [
# 1.19.2 is the last version with python3.9 support,
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
]
[tool.poetry.extras]
tesserocr = ["tesserocr"]
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
[tool.poetry.group.pypi-torch]
optional = false
@@ -63,6 +77,12 @@ torchvision = [
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.12"', url="https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp312-cp312-linux_x86_64.whl"},
]
[tool.poetry.group.constraints.dependencies]
numpy = [
{ version = "^2.1.0", markers = 'python_version >= "3.13"' },
{ version = "^1.24.4", markers = 'python_version < "3.13"' },
]
[tool.poetry.group.dev.dependencies]
black = "^24.8.0"
isort = "^5.13.2"
@@ -93,8 +113,17 @@ remove-unused-variables = true
expand-star-imports = true
recursive = true
[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
plugins = "pydantic.mypy"
python_version = "3.10"
[[tool.mypy.overrides]]
module = [
"docling.*",
"easyocr.*",
"tesserocr.*",
"rapidocr_onnxruntime.*",
]
ignore_missing_imports = true