mirror of
https://github.com/docling-project/docling-serve.git
synced 2025-11-29 08:33:50 +00:00
feat: upgrade endpoint to docling v2 (#13)
* upgrade endpoint to docling v2 Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix Containerfile Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -20,7 +20,7 @@ RUN if [ "$CPU_ONLY" = "true" ]; then \
|
||||
ENV HF_HOME=/tmp/
|
||||
ENV TORCH_HOME=/tmp/
|
||||
|
||||
RUN poetry run python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
|
||||
RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
|
||||
|
||||
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
||||
ENV OMP_NUM_THREADS=4
|
||||
@@ -29,4 +29,4 @@ COPY ./docling_serve /docling-serve/docling_serve
|
||||
|
||||
EXPOSE 5000
|
||||
|
||||
CMD ["poetry", "run", "uvicorn", "--port", "5000", "docling_serve.app:app"]
|
||||
CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "docling_serve.app:app"]
|
||||
|
||||
@@ -1,21 +1,55 @@
|
||||
import base64
|
||||
import hashlib
|
||||
from contextlib import asynccontextmanager
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Union
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import httpx
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DocumentStream,
|
||||
PipelineOptions,
|
||||
ErrorItem,
|
||||
InputFormat,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
RapidOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.utils.profiling import ProfilingItem
|
||||
from docling_core.types.doc import DoclingDocument, ImageRefMode
|
||||
from docling_core.utils.file import resolve_remote_filename
|
||||
from fastapi import FastAPI, HTTPException, Response
|
||||
from pydantic import AnyHttpUrl, BaseModel
|
||||
|
||||
from docling_serve.settings import Settings
|
||||
|
||||
# TODO: import enum from Docling, once it is exposed
|
||||
class OcrEngine(str, Enum):
|
||||
EASYOCR = "easyocr"
|
||||
TESSERACT = "tesseract"
|
||||
RAPIDOCR = "rapidocr"
|
||||
|
||||
|
||||
class ConvertOptions(BaseModel):
|
||||
output_docling_document: bool = True
|
||||
output_markdown: bool = False
|
||||
output_html: bool = False
|
||||
do_ocr: bool = True
|
||||
ocr_engine: OcrEngine = OcrEngine.EASYOCR
|
||||
ocr_lang: Optional[List[str]] = None
|
||||
force_ocr: bool = False
|
||||
do_table_structure: bool = True
|
||||
include_images: bool = True
|
||||
images_scale: float = 2.0
|
||||
|
||||
|
||||
class DocumentConvertBase(BaseModel):
|
||||
options: ConvertOptions = ConvertOptions()
|
||||
|
||||
|
||||
class HttpSource(BaseModel):
|
||||
@@ -28,16 +62,30 @@ class FileSource(BaseModel):
|
||||
filename: str
|
||||
|
||||
|
||||
class ConvertDocumentHttpSourceRequest(BaseModel):
|
||||
class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
|
||||
http_source: HttpSource
|
||||
|
||||
|
||||
class ConvertDocumentFileSourceRequest(BaseModel):
|
||||
class ConvertDocumentFileSourceRequest(DocumentConvertBase):
|
||||
file_source: FileSource
|
||||
|
||||
|
||||
class DocumentResponse(BaseModel):
|
||||
markdown: Optional[str] = None
|
||||
docling_document: Optional[DoclingDocument] = None
|
||||
html: Optional[str] = None
|
||||
|
||||
|
||||
class ConvertDocumentResponse(BaseModel):
|
||||
content_md: str
|
||||
document: DocumentResponse
|
||||
status: ConversionStatus
|
||||
errors: List[ErrorItem] = []
|
||||
timings: Dict[str, ProfilingItem] = {}
|
||||
|
||||
|
||||
class ConvertDocumentErrorResponse(BaseModel):
|
||||
status: ConversionStatus
|
||||
# errors: List[ErrorItem] = []
|
||||
|
||||
|
||||
ConvertDocumentRequest = Union[
|
||||
@@ -45,20 +93,93 @@ ConvertDocumentRequest = Union[
|
||||
]
|
||||
|
||||
|
||||
models = {}
|
||||
class MarkdownTextResponse(Response):
|
||||
media_type = "text/markdown"
|
||||
|
||||
|
||||
class HealthCheckResponse(BaseModel):
|
||||
status: str = "ok"
|
||||
|
||||
|
||||
def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
|
||||
|
||||
if options.ocr_engine == OcrEngine.EASYOCR:
|
||||
try:
|
||||
import easyocr # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={options.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=options.force_ocr)
|
||||
elif options.ocr_engine == OcrEngine.TESSERACT:
|
||||
try:
|
||||
import tesserocr # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={options.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=options.force_ocr)
|
||||
elif options.ocr_engine == OcrEngine.RAPIDOCR:
|
||||
try:
|
||||
from rapidocr_onnxruntime import RapidOCR # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={options.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options = RapidOcrOptions(force_full_page_ocr=options.force_ocr)
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")
|
||||
|
||||
if options.ocr_lang is not None:
|
||||
ocr_options.lang = options.ocr_lang
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=options.do_ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=options.do_table_structure,
|
||||
generate_page_images=options.include_images,
|
||||
generate_picture_images=options.include_images,
|
||||
images_scale=options.images_scale,
|
||||
)
|
||||
|
||||
options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()
|
||||
|
||||
return pipeline_options, options_hash
|
||||
|
||||
|
||||
converters: Dict[str, DocumentConverter] = {}
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
# Converter
|
||||
settings = Settings()
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options.do_ocr = settings.do_ocr
|
||||
pipeline_options.do_table_structure = settings.do_table_structure
|
||||
models["converter"] = DocumentConverter(pipeline_options=pipeline_options)
|
||||
# settings = Settings()
|
||||
|
||||
# Converter with default options
|
||||
pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
|
||||
converters[options_hash] = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
}
|
||||
)
|
||||
|
||||
converters[options_hash].initialize_pipeline(InputFormat.PDF)
|
||||
|
||||
yield
|
||||
|
||||
models.clear()
|
||||
converters.clear()
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
@@ -67,10 +188,14 @@ app = FastAPI(
|
||||
)
|
||||
|
||||
|
||||
@app.post("/convert")
|
||||
def convert_pdf_document(
|
||||
@app.get("/health")
|
||||
def health() -> HealthCheckResponse:
|
||||
return HealthCheckResponse()
|
||||
|
||||
|
||||
def _convert_document(
|
||||
body: ConvertDocumentRequest,
|
||||
) -> ConvertDocumentResponse:
|
||||
) -> ConversionResult:
|
||||
|
||||
filename: str
|
||||
buf: BytesIO
|
||||
@@ -81,16 +206,74 @@ def convert_pdf_document(
|
||||
elif isinstance(body, ConvertDocumentHttpSourceRequest):
|
||||
http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
|
||||
buf = BytesIO(http_res.content)
|
||||
filename = Path(
|
||||
body.http_source.url
|
||||
).name # TODO: use better way to detect filename, e.g. from Content-Disposition
|
||||
filename = resolve_remote_filename(
|
||||
http_url=AnyHttpUrl(body.http_source.url),
|
||||
response_headers=dict(**http_res.headers),
|
||||
)
|
||||
|
||||
docs_input = DocumentConversionInput.from_streams(
|
||||
[DocumentStream(filename=filename, stream=buf)]
|
||||
doc_input = DocumentStream(name=filename, stream=buf)
|
||||
|
||||
pipeline_options, options_hash = get_pdf_pipeline_opts(body.options)
|
||||
if options_hash not in converters:
|
||||
converters[options_hash] = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
}
|
||||
)
|
||||
|
||||
result: ConversionResult = converters[options_hash].convert(doc_input)
|
||||
|
||||
if result is None or result.status == ConversionStatus.SKIPPED:
|
||||
raise HTTPException(status_code=400, detail=result.errors)
|
||||
|
||||
if result is None or result.status not in {
|
||||
ConversionStatus.SUCCESS,
|
||||
}:
|
||||
raise HTTPException(
|
||||
status_code=500, detail={"errors": result.errors, "status": result.status}
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@app.post(
|
||||
"/convert",
|
||||
)
|
||||
def convert_document(
|
||||
body: ConvertDocumentRequest,
|
||||
) -> ConvertDocumentResponse:
|
||||
|
||||
result = _convert_document(body=body)
|
||||
|
||||
image_mode = (
|
||||
ImageRefMode.EMBEDDED
|
||||
if body.options.include_images
|
||||
else ImageRefMode.PLACEHOLDER
|
||||
)
|
||||
result: ConversionResult = next(models["converter"].convert(docs_input), None)
|
||||
doc_resp = DocumentResponse()
|
||||
if body.options.output_docling_document:
|
||||
doc_resp.docling_document = result.document
|
||||
if body.options.output_markdown:
|
||||
doc_resp.markdown = result.document.export_to_markdown(image_mode=image_mode)
|
||||
if body.options.output_html:
|
||||
doc_resp.html = result.document.export_to_html(image_mode=image_mode)
|
||||
|
||||
if result is None or result.status != ConversionStatus.SUCCESS:
|
||||
raise HTTPException(status_code=500, detail={"errors": result.errors})
|
||||
return ConvertDocumentResponse(
|
||||
document=doc_resp, status=result.status, timings=result.timings
|
||||
)
|
||||
|
||||
return ConvertDocumentResponse(content_md=result.render_as_markdown())
|
||||
|
||||
@app.post("/convert/markdown", response_class=MarkdownTextResponse)
|
||||
def convert_document_md(
|
||||
body: ConvertDocumentRequest,
|
||||
) -> MarkdownTextResponse:
|
||||
result = _convert_document(body=body)
|
||||
image_mode = (
|
||||
ImageRefMode.EMBEDDED
|
||||
if body.options.include_images
|
||||
else ImageRefMode.PLACEHOLDER
|
||||
)
|
||||
return MarkdownTextResponse(
|
||||
result.document.export_to_markdown(image_mode=image_mode)
|
||||
)
|
||||
|
||||
@@ -2,7 +2,5 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
|
||||
do_ocr: bool = True
|
||||
do_table_structure: bool = True
|
||||
|
||||
model_config = SettingsConfigDict(env_prefix="DOCLING_")
|
||||
|
||||
4366
poetry.lock
generated
4366
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -30,12 +30,26 @@ classifiers = [
|
||||
]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.10"
|
||||
docling = "^1.11.0"
|
||||
fastapi = {version = "^0.110.2", extras = ["standard"]}
|
||||
uvicorn = "^0.30.6"
|
||||
python = "^3.9"
|
||||
docling = "^2.10.0"
|
||||
fastapi = {version = "^0.115.6", extras = ["standard"]}
|
||||
uvicorn = "^0.32.1"
|
||||
pydantic-settings = "^2.4.0"
|
||||
httpx = "^0.27.2"
|
||||
httpx = "^0.28.1"
|
||||
tesserocr = { version = "^2.7.1", optional = true }
|
||||
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
|
||||
onnxruntime = [
|
||||
# 1.19.2 is the last version with python3.9 support,
|
||||
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
|
||||
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
|
||||
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
|
||||
]
|
||||
|
||||
|
||||
[tool.poetry.extras]
|
||||
tesserocr = ["tesserocr"]
|
||||
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
|
||||
|
||||
|
||||
[tool.poetry.group.pypi-torch]
|
||||
optional = false
|
||||
@@ -63,6 +77,12 @@ torchvision = [
|
||||
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.12"', url="https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp312-cp312-linux_x86_64.whl"},
|
||||
]
|
||||
|
||||
[tool.poetry.group.constraints.dependencies]
|
||||
numpy = [
|
||||
{ version = "^2.1.0", markers = 'python_version >= "3.13"' },
|
||||
{ version = "^1.24.4", markers = 'python_version < "3.13"' },
|
||||
]
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = "^24.8.0"
|
||||
isort = "^5.13.2"
|
||||
@@ -93,8 +113,17 @@ remove-unused-variables = true
|
||||
expand-star-imports = true
|
||||
recursive = true
|
||||
|
||||
[tool.mypy]
|
||||
pretty = true
|
||||
# strict = true
|
||||
no_implicit_optional = true
|
||||
plugins = "pydantic.mypy"
|
||||
python_version = "3.10"
|
||||
|
||||
[[tool.mypy.overrides]]
|
||||
module = [
|
||||
"docling.*",
|
||||
"easyocr.*",
|
||||
"tesserocr.*",
|
||||
"rapidocr_onnxruntime.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
Reference in New Issue
Block a user