mirror of
https://github.com/docling-project/docling-serve.git
synced 2025-11-29 16:43:24 +00:00
Compare commits
20 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b6eece7ef0 | ||
|
|
f5af71e8f6 | ||
|
|
d95ea94087 | ||
|
|
5344505718 | ||
|
|
5edc624fbf | ||
|
|
45f0f3c8f9 | ||
|
|
0595d31d5b | ||
|
|
f6b5f0e063 | ||
|
|
8b22a39141 | ||
|
|
d4eac053f9 | ||
|
|
fa1c5f04f3 | ||
|
|
ba61af2359 | ||
|
|
6b6dd8a0d0 | ||
|
|
513ae0c119 | ||
|
|
bde040661f | ||
|
|
496f7ec26b | ||
|
|
9d6def0ec8 | ||
|
|
a4fed2d965 | ||
|
|
b0360d723b | ||
|
|
4adc0dfa79 |
205
.github/workflows/job-image.yml
vendored
205
.github/workflows/job-image.yml
vendored
@@ -108,6 +108,7 @@ jobs:
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: ${{ inputs.build_args }}
|
||||
pull: true
|
||||
##
|
||||
## This stage runs after the build, so it leverages all build cache
|
||||
##
|
||||
@@ -117,8 +118,8 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
push: false
|
||||
load: true # == '--output=type=docker'
|
||||
tags: ${{ steps.ghcr_meta.outputs.tags }}-test
|
||||
load: true
|
||||
tags: ${{ env.GHCR_REGISTRY }}/${{ inputs.ghcr_image_name }}:${{ github.sha }}-test
|
||||
labels: |
|
||||
org.opencontainers.image.title=docling-serve
|
||||
org.opencontainers.image.test=true
|
||||
@@ -133,7 +134,7 @@ jobs:
|
||||
run: |
|
||||
set -e
|
||||
|
||||
IMAGE_TAG="${{ steps.ghcr_meta.outputs.tags }}-test"
|
||||
IMAGE_TAG="${{ env.GHCR_REGISTRY }}/${{ inputs.ghcr_image_name }}:${{ github.sha }}-test"
|
||||
echo "Testing local image: $IMAGE_TAG"
|
||||
|
||||
# Remove existing container if any
|
||||
@@ -226,202 +227,8 @@ jobs:
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: ${{ inputs.build_args }}
|
||||
pull: true
|
||||
|
||||
- name: Remove Local Docker Images
|
||||
- name: Remove local Docker images
|
||||
run: |
|
||||
docker image prune -af
|
||||
##
|
||||
## Extra tests for released images
|
||||
##
|
||||
|
||||
# outputs:
|
||||
# image-tags: ${{ steps.ghcr_meta.outputs.tags }}
|
||||
# image-labels: ${{ steps.ghcr_meta.outputs.labels }}
|
||||
|
||||
# test-cpu-image:
|
||||
# needs:
|
||||
# - image
|
||||
# runs-on: ubuntu-latest
|
||||
# permissions:
|
||||
# contents: read
|
||||
# packages: read
|
||||
|
||||
# steps:
|
||||
# - name: Checkout code
|
||||
# uses: actions/checkout@v5
|
||||
|
||||
# - name: Test CPU images
|
||||
# run: |
|
||||
# set -e
|
||||
|
||||
# echo "Testing image: ${{ needs.image.outputs.image-tags }}"
|
||||
|
||||
# for tag in ${{ needs.image.outputs.image-tags }}; do
|
||||
# if echo "$tag" | grep -q -- '-cpu' && echo "$tag" | grep -qE ':[vV][0-9]+(\.[0-9]+){0,2}$'; then
|
||||
# echo "Testing CPU image: $tag"
|
||||
|
||||
# # Remove existing container if any
|
||||
# docker rm -f docling-serve-test-container 2>/dev/null || true
|
||||
|
||||
# echo "Pulling image..."
|
||||
# docker pull "$tag"
|
||||
|
||||
# echo "Waiting 5s after pull..."
|
||||
# sleep 5
|
||||
|
||||
# echo "Starting container..."
|
||||
# docker run -d -p 5001:5001 --name docling-serve-test-container "$tag"
|
||||
|
||||
# echo "Waiting 15s for container to boot..."
|
||||
# sleep 15
|
||||
|
||||
# echo "Checking service health..."
|
||||
# for i in {1..20}; do
|
||||
# health_response=$(curl -s http://localhost:5001/health || true)
|
||||
# echo "Health check response [$i]: $health_response"
|
||||
# if echo "$health_response" | grep -q '"status":"ok"'; then
|
||||
# echo "Service is healthy!"
|
||||
# echo "Sending test conversion request..."
|
||||
|
||||
# status_code=$(curl -s -o /dev/null -w "%{http_code}" -X POST 'http://localhost:5001/v1/convert/source' \
|
||||
# -H 'accept: application/json' \
|
||||
# -H 'Content-Type: application/json' \
|
||||
# -d '{
|
||||
# "options": {
|
||||
# "from_formats": ["pdf"],
|
||||
# "to_formats": ["md"]
|
||||
# },
|
||||
# "sources": [
|
||||
# {
|
||||
# "kind": "http",
|
||||
# "url": "https://arxiv.org/pdf/2501.17887"
|
||||
# }
|
||||
# ],
|
||||
# "target": {
|
||||
# "kind": "inbody"
|
||||
# }
|
||||
# }')
|
||||
|
||||
# echo "Conversion request returned status code: $status_code"
|
||||
|
||||
# if [ "$status_code" -ne 200 ]; then
|
||||
# echo "Conversion failed!"
|
||||
# docker logs docling-serve-test-container
|
||||
# docker rm -f docling-serve-test-container
|
||||
# exit 1
|
||||
# fi
|
||||
|
||||
# break
|
||||
# else
|
||||
# echo "Waiting for service... [$i/20]"
|
||||
# sleep 3
|
||||
# fi
|
||||
# done
|
||||
|
||||
# if ! echo "$health_response" | grep -q '"status":"ok"'; then
|
||||
# echo "Service did not become healthy in time."
|
||||
# docker logs docling-serve-test-container
|
||||
# docker rm -f docling-serve-test-container
|
||||
# exit 1
|
||||
# fi
|
||||
|
||||
# echo "Cleaning up test container..."
|
||||
# docker rm -f docling-serve-test-container
|
||||
# else
|
||||
# echo "Skipping non-released or non-CPU image: $tag"
|
||||
# fi
|
||||
# done
|
||||
|
||||
# test-cuda-image:
|
||||
# needs:
|
||||
# - image
|
||||
# runs-on: ubuntu-latest # >> placeholder for GPU runner << #
|
||||
# permissions:
|
||||
# contents: read
|
||||
# packages: read
|
||||
|
||||
# steps:
|
||||
# - name: Checkout code
|
||||
# uses: actions/checkout@v5
|
||||
|
||||
# - name: Test CUDA images
|
||||
# run: |
|
||||
# set -e
|
||||
|
||||
# echo "Testing image: ${{ needs.image.outputs.image-tags }}"
|
||||
|
||||
# for tag in ${{ needs.image.outputs.image-tags }}; do
|
||||
# if echo "$tag" | grep -qE -- '-cu[0-9]+' && echo "$tag" | grep -qE ':[vV][0-9]+(\.[0-9]+){0,2}$'; then
|
||||
# echo "Testing CUDA image: $tag"
|
||||
|
||||
# # Remove existing container if any
|
||||
# docker rm -f docling-serve-test-container 2>/dev/null || true
|
||||
|
||||
# echo "Pulling image..."
|
||||
# docker pull "$tag"
|
||||
|
||||
# echo "Waiting 5s after pull..."
|
||||
# sleep 5
|
||||
|
||||
# echo "Starting container..."
|
||||
# docker run -d -p 5001:5001 --gpus all --name docling-serve-test-container "$tag"
|
||||
|
||||
# echo "Waiting 15s for container to boot..."
|
||||
# sleep 15
|
||||
|
||||
# echo "Checking service health..."
|
||||
# for i in {1..25}; do
|
||||
# health_response=$(curl -s http://localhost:5001/health || true)
|
||||
# echo "Health check response [$i]: $health_response"
|
||||
# if echo "$health_response" | grep -q '"status":"ok"'; then
|
||||
# echo "Service is healthy!"
|
||||
# echo "Sending test conversion request..."
|
||||
|
||||
# status_code=$(curl -s -o /dev/null -w "%{http_code}" -X POST 'http://localhost:5001/v1/convert/source' \
|
||||
# -H 'accept: application/json' \
|
||||
# -H 'Content-Type: application/json' \
|
||||
# -d '{
|
||||
# "options": {
|
||||
# "from_formats": ["pdf"],
|
||||
# "to_formats": ["md"]
|
||||
# },
|
||||
# "sources": [
|
||||
# {
|
||||
# "kind": "http",
|
||||
# "url": "https://arxiv.org/pdf/2501.17887"
|
||||
# }
|
||||
# ],
|
||||
# "target": {
|
||||
# "kind": "inbody"
|
||||
# }
|
||||
# }')
|
||||
|
||||
# echo "Conversion request returned status code: $status_code"
|
||||
|
||||
# if [ "$status_code" -ne 200 ]; then
|
||||
# echo "Conversion failed!"
|
||||
# docker logs docling-serve-test-container
|
||||
# docker rm -f docling-serve-test-container
|
||||
# exit 1
|
||||
# fi
|
||||
|
||||
# break
|
||||
# else
|
||||
# echo "Waiting for service... [$i/25]"
|
||||
# sleep 3
|
||||
# fi
|
||||
# done
|
||||
|
||||
# if ! echo "$health_response" | grep -q '"status":"ok"'; then
|
||||
# echo "Service did not become healthy in time."
|
||||
# docker logs docling-serve-test-container
|
||||
# docker rm -f docling-serve-test-container
|
||||
# exit 1
|
||||
# fi
|
||||
|
||||
# echo "Cleaning up test container..."
|
||||
# docker rm -f docling-serve-test-container
|
||||
# else
|
||||
# echo "Skipping non-released or non-CUDA image: $tag"
|
||||
# fi
|
||||
# done
|
||||
|
||||
@@ -34,6 +34,6 @@ repos:
|
||||
files: \.md$
|
||||
- repo: https://github.com/astral-sh/uv-pre-commit
|
||||
# uv version, https://github.com/astral-sh/uv-pre-commit/releases
|
||||
rev: 0.8.3
|
||||
rev: 0.8.19
|
||||
hooks:
|
||||
- id: uv-lock
|
||||
|
||||
94
CHANGELOG.md
94
CHANGELOG.md
@@ -1,3 +1,97 @@
|
||||
## [v1.7.0](https://github.com/docling-project/docling-serve/releases/tag/v1.7.0) - 2025-10-17
|
||||
|
||||
### Feature
|
||||
|
||||
* **UI:** Add auto and ocrmac options in demo UI ([#408](https://github.com/docling-project/docling-serve/issues/408)) ([`f5af71e`](https://github.com/docling-project/docling-serve/commit/f5af71e8f6de00d7dd702471a3eea2e94d882410))
|
||||
* Docling with auto-ocr ([#403](https://github.com/docling-project/docling-serve/issues/403)) ([`d95ea94`](https://github.com/docling-project/docling-serve/commit/d95ea940870af0d8df689061baa50f6026efce28))
|
||||
|
||||
### Fix
|
||||
|
||||
* Run docling ui behind a reverse proxy using a context path ([#396](https://github.com/docling-project/docling-serve/issues/396)) ([`5344505`](https://github.com/docling-project/docling-serve/commit/53445057184aa731ee7456b33b70bc0ecf82f2a6))
|
||||
|
||||
### Docling libraries included in this release:
|
||||
- docling 2.57.0
|
||||
- docling-core 2.48.4
|
||||
- docling-ibm-models 3.9.1
|
||||
- docling-jobkit 1.6.0
|
||||
- docling-mcp 1.3.2
|
||||
- docling-parse 4.5.0
|
||||
- docling-serve 1.7.0
|
||||
|
||||
## [v1.6.0](https://github.com/docling-project/docling-serve/releases/tag/v1.6.0) - 2025-10-03
|
||||
|
||||
### Feature
|
||||
|
||||
* Pin new version of jobkit with granite-docling and connectors ([#391](https://github.com/docling-project/docling-serve/issues/391)) ([`0595d31`](https://github.com/docling-project/docling-serve/commit/0595d31d5b357553426215ca6771796a47e41324))
|
||||
|
||||
### Fix
|
||||
|
||||
* Update locked dependencies ([#392](https://github.com/docling-project/docling-serve/issues/392)) ([`45f0f3c`](https://github.com/docling-project/docling-serve/commit/45f0f3c8f95d418ac30e3744d27d02a63f9e4490))
|
||||
* **UI:** Allow both lowercase and uppercase extensions ([#386](https://github.com/docling-project/docling-serve/issues/386)) ([`8b22a39`](https://github.com/docling-project/docling-serve/commit/8b22a391418d22c1a4d706f880341f28702057b5))
|
||||
* Correctly raise HTTPException for Gateway Timeout ([#382](https://github.com/docling-project/docling-serve/issues/382)) ([`d4eac05`](https://github.com/docling-project/docling-serve/commit/d4eac053f9ce0a60f9070127335bdd56e193d7fa))
|
||||
* Pinning of higher version of dependencies to fix potential security issues ([#363](https://github.com/docling-project/docling-serve/issues/363)) ([`ba61af2`](https://github.com/docling-project/docling-serve/commit/ba61af23591eff200481aa2e532cf7d0701f0ea4))
|
||||
|
||||
### Documentation
|
||||
|
||||
* Fix docs for websocket breaking condition ([#390](https://github.com/docling-project/docling-serve/issues/390)) ([`f6b5f0e`](https://github.com/docling-project/docling-serve/commit/f6b5f0e06354d2db7d03d274b114499e3407dccf))
|
||||
|
||||
### Docling libraries included in this release:
|
||||
- docling 2.55.1
|
||||
- docling-core 2.48.4
|
||||
- docling-ibm-models 3.9.1
|
||||
- docling-jobkit 1.6.0
|
||||
- docling-mcp 1.3.2
|
||||
- docling-parse 4.5.0
|
||||
- docling-serve 1.6.0
|
||||
|
||||
## [v1.5.1](https://github.com/docling-project/docling-serve/releases/tag/v1.5.1) - 2025-09-17
|
||||
|
||||
### Fix
|
||||
|
||||
* Remove old dependencies, fixes in docling-parse and more minor dependencies upgrade ([#362](https://github.com/docling-project/docling-serve/issues/362)) ([`513ae0c`](https://github.com/docling-project/docling-serve/commit/513ae0c119b66d3b17cf9a5d371a0f7971f43be7))
|
||||
* Updates rapidocr deps ([#361](https://github.com/docling-project/docling-serve/issues/361)) ([`bde0406`](https://github.com/docling-project/docling-serve/commit/bde040661fb65c67699326cd6281c0e6232e26f2))
|
||||
|
||||
### Docling libraries included in this release:
|
||||
- docling 2.52.0
|
||||
- docling-core 2.48.1
|
||||
- docling-ibm-models 3.9.1
|
||||
- docling-jobkit 1.5.0
|
||||
- docling-mcp 1.2.0
|
||||
- docling-parse 4.5.0
|
||||
- docling-serve 1.5.1
|
||||
|
||||
## [v1.5.0](https://github.com/docling-project/docling-serve/releases/tag/v1.5.0) - 2025-09-09
|
||||
|
||||
### Feature
|
||||
|
||||
* Add chunking endpoints ([#353](https://github.com/docling-project/docling-serve/issues/353)) ([`9d6def0`](https://github.com/docling-project/docling-serve/commit/9d6def0ec8b1804ad31aa71defa17658d73d29a1))
|
||||
|
||||
### Docling libraries included in this release:
|
||||
- docling 2.46.0
|
||||
- docling 2.51.0
|
||||
- docling-core 2.47.0
|
||||
- docling-ibm-models 3.9.1
|
||||
- docling-jobkit 1.5.0
|
||||
- docling-mcp 1.2.0
|
||||
- docling-parse 4.4.0
|
||||
- docling-serve 1.5.0
|
||||
|
||||
## [v1.4.1](https://github.com/docling-project/docling-serve/releases/tag/v1.4.1) - 2025-09-08
|
||||
|
||||
### Fix
|
||||
|
||||
* Trigger fix after ci fixes ([#355](https://github.com/docling-project/docling-serve/issues/355)) ([`b0360d7`](https://github.com/docling-project/docling-serve/commit/b0360d723bff202dcf44a25a3173ec1995945fc2))
|
||||
|
||||
### Docling libraries included in this release:
|
||||
- docling 2.46.0
|
||||
- docling 2.51.0
|
||||
- docling-core 2.47.0
|
||||
- docling-ibm-models 3.9.1
|
||||
- docling-jobkit 1.4.1
|
||||
- docling-mcp 1.2.0
|
||||
- docling-parse 4.4.0
|
||||
- docling-serve 1.4.1
|
||||
|
||||
## [v1.4.0](https://github.com/docling-project/docling-serve/releases/tag/v1.4.0) - 2025-09-05
|
||||
|
||||
### Feature
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s
|
||||
|
||||
ARG UV_VERSION=0.8.3
|
||||
ARG UV_IMAGE=ghcr.io/astral-sh/uv:0.8.19
|
||||
|
||||
ARG UV_SYNC_EXTRA_ARGS=""
|
||||
|
||||
@@ -25,7 +25,7 @@ RUN /usr/bin/fix-permissions /opt/app-root/src/.cache
|
||||
|
||||
ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
||||
|
||||
FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv_stage
|
||||
FROM ${UV_IMAGE} AS uv_stage
|
||||
|
||||
###################################################################################################
|
||||
# Docling layer #
|
||||
@@ -58,7 +58,7 @@ RUN --mount=from=uv_stage,source=/uv,target=/bin/uv \
|
||||
uv sync ${UV_SYNC_ARGS} ${UV_SYNC_EXTRA_ARGS} --no-extra flash-attn && \
|
||||
FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE uv sync ${UV_SYNC_ARGS} ${UV_SYNC_EXTRA_ARGS} --no-build-isolation-package=flash-attn
|
||||
|
||||
ARG MODELS_LIST="layout tableformer picture_classifier easyocr"
|
||||
ARG MODELS_LIST="layout tableformer picture_classifier rapidocr easyocr"
|
||||
|
||||
RUN echo "Downloading models..." && \
|
||||
HF_HUB_DOWNLOAD_TIMEOUT="90" \
|
||||
|
||||
@@ -35,12 +35,17 @@ from docling_jobkit.datamodel.callback import (
|
||||
ProgressCallbackRequest,
|
||||
ProgressCallbackResponse,
|
||||
)
|
||||
from docling_jobkit.datamodel.chunking import (
|
||||
BaseChunkerOptions,
|
||||
ChunkingExportOptions,
|
||||
HierarchicalChunkerOptions,
|
||||
HybridChunkerOptions,
|
||||
)
|
||||
from docling_jobkit.datamodel.http_inputs import FileSource, HttpSource
|
||||
from docling_jobkit.datamodel.s3_coords import S3Coordinates
|
||||
from docling_jobkit.datamodel.task import Task, TaskSource
|
||||
from docling_jobkit.datamodel.task import Task, TaskSource, TaskType
|
||||
from docling_jobkit.datamodel.task_targets import (
|
||||
InBodyTarget,
|
||||
TaskTarget,
|
||||
ZipTarget,
|
||||
)
|
||||
from docling_jobkit.orchestrators.base_orchestrator import (
|
||||
@@ -54,11 +59,15 @@ from docling_serve.datamodel.convert import ConvertDocumentsRequestOptions
|
||||
from docling_serve.datamodel.requests import (
|
||||
ConvertDocumentsRequest,
|
||||
FileSourceRequest,
|
||||
GenericChunkDocumentsRequest,
|
||||
HttpSourceRequest,
|
||||
S3SourceRequest,
|
||||
TargetName,
|
||||
TargetRequest,
|
||||
make_request_model,
|
||||
)
|
||||
from docling_serve.datamodel.responses import (
|
||||
ChunkDocumentResponse,
|
||||
ClearResponse,
|
||||
ConvertDocumentResponse,
|
||||
HealthCheckResponse,
|
||||
@@ -185,16 +194,25 @@ def create_app(): # noqa: C901
|
||||
import gradio as gr
|
||||
|
||||
from docling_serve.gradio_ui import ui as gradio_ui
|
||||
from docling_serve.settings import uvicorn_settings
|
||||
|
||||
tmp_output_dir = get_scratch() / "gradio"
|
||||
tmp_output_dir.mkdir(exist_ok=True, parents=True)
|
||||
gradio_ui.gradio_output_dir = tmp_output_dir
|
||||
|
||||
# Build the root_path for Gradio, accounting for UVICORN_ROOT_PATH
|
||||
gradio_root_path = (
|
||||
f"{uvicorn_settings.root_path}/ui"
|
||||
if uvicorn_settings.root_path
|
||||
else "/ui"
|
||||
)
|
||||
|
||||
app = gr.mount_gradio_app(
|
||||
app,
|
||||
gradio_ui,
|
||||
path="/ui",
|
||||
allowed_paths=["./logo.png", tmp_output_dir],
|
||||
root_path="/ui",
|
||||
root_path=gradio_root_path,
|
||||
)
|
||||
except ImportError:
|
||||
_log.warning(
|
||||
@@ -249,10 +267,11 @@ def create_app(): # noqa: C901
|
||||
########################
|
||||
|
||||
async def _enque_source(
|
||||
orchestrator: BaseOrchestrator, conversion_request: ConvertDocumentsRequest
|
||||
orchestrator: BaseOrchestrator,
|
||||
request: ConvertDocumentsRequest | GenericChunkDocumentsRequest,
|
||||
) -> Task:
|
||||
sources: list[TaskSource] = []
|
||||
for s in conversion_request.sources:
|
||||
for s in request.sources:
|
||||
if isinstance(s, FileSourceRequest):
|
||||
sources.append(FileSource.model_validate(s))
|
||||
elif isinstance(s, HttpSourceRequest):
|
||||
@@ -260,18 +279,41 @@ def create_app(): # noqa: C901
|
||||
elif isinstance(s, S3SourceRequest):
|
||||
sources.append(S3Coordinates.model_validate(s))
|
||||
|
||||
convert_options: ConvertDocumentsRequestOptions
|
||||
chunking_options: BaseChunkerOptions | None = None
|
||||
chunking_export_options = ChunkingExportOptions()
|
||||
task_type: TaskType
|
||||
if isinstance(request, ConvertDocumentsRequest):
|
||||
task_type = TaskType.CONVERT
|
||||
convert_options = request.options
|
||||
elif isinstance(request, GenericChunkDocumentsRequest):
|
||||
task_type = TaskType.CHUNK
|
||||
convert_options = request.convert_options
|
||||
chunking_options = request.chunking_options
|
||||
chunking_export_options.include_converted_doc = (
|
||||
request.include_converted_doc
|
||||
)
|
||||
else:
|
||||
raise RuntimeError("Uknown request type.")
|
||||
|
||||
task = await orchestrator.enqueue(
|
||||
task_type=task_type,
|
||||
sources=sources,
|
||||
options=conversion_request.options,
|
||||
target=conversion_request.target,
|
||||
convert_options=convert_options,
|
||||
chunking_options=chunking_options,
|
||||
chunking_export_options=chunking_export_options,
|
||||
target=request.target,
|
||||
)
|
||||
return task
|
||||
|
||||
async def _enque_file(
|
||||
orchestrator: BaseOrchestrator,
|
||||
files: list[UploadFile],
|
||||
options: ConvertDocumentsRequestOptions,
|
||||
target: TaskTarget,
|
||||
task_type: TaskType,
|
||||
convert_options: ConvertDocumentsRequestOptions,
|
||||
chunking_options: BaseChunkerOptions | None,
|
||||
chunking_export_options: ChunkingExportOptions | None,
|
||||
target: TargetRequest,
|
||||
) -> Task:
|
||||
_log.info(f"Received {len(files)} files for processing.")
|
||||
|
||||
@@ -284,7 +326,12 @@ def create_app(): # noqa: C901
|
||||
file_sources.append(DocumentStream(name=name, stream=buf))
|
||||
|
||||
task = await orchestrator.enqueue(
|
||||
sources=file_sources, options=options, target=target
|
||||
task_type=task_type,
|
||||
sources=file_sources,
|
||||
convert_options=convert_options,
|
||||
chunking_options=chunking_options,
|
||||
chunking_export_options=chunking_export_options,
|
||||
target=target,
|
||||
)
|
||||
return task
|
||||
|
||||
@@ -381,7 +428,7 @@ def create_app(): # noqa: C901
|
||||
response = RedirectResponse(url=logo_url)
|
||||
return response
|
||||
|
||||
@app.get("/health")
|
||||
@app.get("/health", tags=["health"])
|
||||
def health() -> HealthCheckResponse:
|
||||
return HealthCheckResponse()
|
||||
|
||||
@@ -393,6 +440,7 @@ def create_app(): # noqa: C901
|
||||
# Convert a document from URL(s)
|
||||
@app.post(
|
||||
"/v1/convert/source",
|
||||
tags=["convert"],
|
||||
response_model=ConvertDocumentResponse | PresignedUrlConvertDocumentResponse,
|
||||
responses={
|
||||
200: {
|
||||
@@ -408,7 +456,7 @@ def create_app(): # noqa: C901
|
||||
conversion_request: ConvertDocumentsRequest,
|
||||
):
|
||||
task = await _enque_source(
|
||||
orchestrator=orchestrator, conversion_request=conversion_request
|
||||
orchestrator=orchestrator, request=conversion_request
|
||||
)
|
||||
completed = await _wait_task_complete(
|
||||
orchestrator=orchestrator, task_id=task.task_id
|
||||
@@ -416,7 +464,7 @@ def create_app(): # noqa: C901
|
||||
|
||||
if not completed:
|
||||
# TODO: abort task!
|
||||
return HTTPException(
|
||||
raise HTTPException(
|
||||
status_code=504,
|
||||
detail=f"Conversion is taking too long. The maximum wait time is configure as DOCLING_SERVE_MAX_SYNC_WAIT={docling_serve_settings.max_sync_wait}.",
|
||||
)
|
||||
@@ -438,6 +486,7 @@ def create_app(): # noqa: C901
|
||||
# Convert a document from file(s)
|
||||
@app.post(
|
||||
"/v1/convert/file",
|
||||
tags=["convert"],
|
||||
response_model=ConvertDocumentResponse | PresignedUrlConvertDocumentResponse,
|
||||
responses={
|
||||
200: {
|
||||
@@ -457,7 +506,13 @@ def create_app(): # noqa: C901
|
||||
):
|
||||
target = InBodyTarget() if target_type == TargetName.INBODY else ZipTarget()
|
||||
task = await _enque_file(
|
||||
orchestrator=orchestrator, files=files, options=options, target=target
|
||||
task_type=TaskType.CONVERT,
|
||||
orchestrator=orchestrator,
|
||||
files=files,
|
||||
convert_options=options,
|
||||
chunking_options=None,
|
||||
chunking_export_options=None,
|
||||
target=target,
|
||||
)
|
||||
completed = await _wait_task_complete(
|
||||
orchestrator=orchestrator, task_id=task.task_id
|
||||
@@ -465,7 +520,7 @@ def create_app(): # noqa: C901
|
||||
|
||||
if not completed:
|
||||
# TODO: abort task!
|
||||
return HTTPException(
|
||||
raise HTTPException(
|
||||
status_code=504,
|
||||
detail=f"Conversion is taking too long. The maximum wait time is configure as DOCLING_SERVE_MAX_SYNC_WAIT={docling_serve_settings.max_sync_wait}.",
|
||||
)
|
||||
@@ -487,6 +542,7 @@ def create_app(): # noqa: C901
|
||||
# Convert a document from URL(s) using the async api
|
||||
@app.post(
|
||||
"/v1/convert/source/async",
|
||||
tags=["convert"],
|
||||
response_model=TaskStatusResponse,
|
||||
)
|
||||
async def process_url_async(
|
||||
@@ -495,13 +551,14 @@ def create_app(): # noqa: C901
|
||||
conversion_request: ConvertDocumentsRequest,
|
||||
):
|
||||
task = await _enque_source(
|
||||
orchestrator=orchestrator, conversion_request=conversion_request
|
||||
orchestrator=orchestrator, request=conversion_request
|
||||
)
|
||||
task_queue_position = await orchestrator.get_queue_position(
|
||||
task_id=task.task_id
|
||||
)
|
||||
return TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_type=task.task_type,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
task_meta=task.processing_meta,
|
||||
@@ -510,6 +567,7 @@ def create_app(): # noqa: C901
|
||||
# Convert a document from file(s) using the async api
|
||||
@app.post(
|
||||
"/v1/convert/file/async",
|
||||
tags=["convert"],
|
||||
response_model=TaskStatusResponse,
|
||||
)
|
||||
async def process_file_async(
|
||||
@@ -524,21 +582,249 @@ def create_app(): # noqa: C901
|
||||
):
|
||||
target = InBodyTarget() if target_type == TargetName.INBODY else ZipTarget()
|
||||
task = await _enque_file(
|
||||
orchestrator=orchestrator, files=files, options=options, target=target
|
||||
task_type=TaskType.CONVERT,
|
||||
orchestrator=orchestrator,
|
||||
files=files,
|
||||
convert_options=options,
|
||||
chunking_options=None,
|
||||
chunking_export_options=None,
|
||||
target=target,
|
||||
)
|
||||
task_queue_position = await orchestrator.get_queue_position(
|
||||
task_id=task.task_id
|
||||
)
|
||||
return TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_type=task.task_type,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
task_meta=task.processing_meta,
|
||||
)
|
||||
|
||||
# Chunking endpoints
|
||||
for display_name, path_name, opt_cls in (
|
||||
("HybridChunker", "hybrid", HybridChunkerOptions),
|
||||
("HierarchicalChunker", "hierarchical", HierarchicalChunkerOptions),
|
||||
):
|
||||
req_cls = make_request_model(opt_cls)
|
||||
|
||||
@app.post(
|
||||
f"/v1/chunk/{path_name}/source/async",
|
||||
name=f"Chunk sources with {display_name} as async task",
|
||||
tags=["chunk"],
|
||||
response_model=TaskStatusResponse,
|
||||
)
|
||||
async def chunk_source_async(
|
||||
background_tasks: BackgroundTasks,
|
||||
auth: Annotated[AuthenticationResult, Depends(require_auth)],
|
||||
orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
|
||||
request: req_cls,
|
||||
):
|
||||
task = await _enque_source(orchestrator=orchestrator, request=request)
|
||||
task_queue_position = await orchestrator.get_queue_position(
|
||||
task_id=task.task_id
|
||||
)
|
||||
return TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_type=task.task_type,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
task_meta=task.processing_meta,
|
||||
)
|
||||
|
||||
@app.post(
|
||||
f"/v1/chunk/{path_name}/file/async",
|
||||
name=f"Chunk files with {display_name} as async task",
|
||||
tags=["chunk"],
|
||||
response_model=TaskStatusResponse,
|
||||
)
|
||||
async def chunk_file_async(
|
||||
background_tasks: BackgroundTasks,
|
||||
auth: Annotated[AuthenticationResult, Depends(require_auth)],
|
||||
orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
|
||||
files: list[UploadFile],
|
||||
convert_options: Annotated[
|
||||
ConvertDocumentsRequestOptions,
|
||||
FormDepends(
|
||||
ConvertDocumentsRequestOptions,
|
||||
prefix="convert_",
|
||||
excluded_fields=[
|
||||
"to_formats",
|
||||
],
|
||||
),
|
||||
],
|
||||
chunking_options: Annotated[
|
||||
opt_cls,
|
||||
FormDepends(
|
||||
HybridChunkerOptions,
|
||||
prefix="chunking_",
|
||||
excluded_fields=["chunker"],
|
||||
),
|
||||
],
|
||||
include_converted_doc: Annotated[
|
||||
bool,
|
||||
Form(
|
||||
description="If true, the output will include both the chunks and the converted document."
|
||||
),
|
||||
] = False,
|
||||
target_type: Annotated[
|
||||
TargetName,
|
||||
Form(description="Specification for the type of output target."),
|
||||
] = TargetName.INBODY,
|
||||
):
|
||||
target = InBodyTarget() if target_type == TargetName.INBODY else ZipTarget()
|
||||
task = await _enque_file(
|
||||
task_type=TaskType.CHUNK,
|
||||
orchestrator=orchestrator,
|
||||
files=files,
|
||||
convert_options=convert_options,
|
||||
chunking_options=chunking_options,
|
||||
chunking_export_options=ChunkingExportOptions(
|
||||
include_converted_doc=include_converted_doc
|
||||
),
|
||||
target=target,
|
||||
)
|
||||
task_queue_position = await orchestrator.get_queue_position(
|
||||
task_id=task.task_id
|
||||
)
|
||||
return TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_type=task.task_type,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
task_meta=task.processing_meta,
|
||||
)
|
||||
|
||||
@app.post(
|
||||
f"/v1/chunk/{path_name}/source",
|
||||
name=f"Chunk sources with {display_name}",
|
||||
tags=["chunk"],
|
||||
response_model=ChunkDocumentResponse,
|
||||
responses={
|
||||
200: {
|
||||
"content": {"application/zip": {}},
|
||||
# "description": "Return the JSON item or an image.",
|
||||
}
|
||||
},
|
||||
)
|
||||
async def chunk_source(
|
||||
background_tasks: BackgroundTasks,
|
||||
auth: Annotated[AuthenticationResult, Depends(require_auth)],
|
||||
orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
|
||||
request: req_cls,
|
||||
):
|
||||
task = await _enque_source(orchestrator=orchestrator, request=request)
|
||||
completed = await _wait_task_complete(
|
||||
orchestrator=orchestrator, task_id=task.task_id
|
||||
)
|
||||
|
||||
if not completed:
|
||||
# TODO: abort task!
|
||||
raise HTTPException(
|
||||
status_code=504,
|
||||
detail=f"Conversion is taking too long. The maximum wait time is configure as DOCLING_SERVE_MAX_SYNC_WAIT={docling_serve_settings.max_sync_wait}.",
|
||||
)
|
||||
|
||||
task_result = await orchestrator.task_result(task_id=task.task_id)
|
||||
if task_result is None:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail="Task result not found. Please wait for a completion status.",
|
||||
)
|
||||
response = await prepare_response(
|
||||
task_id=task.task_id,
|
||||
task_result=task_result,
|
||||
orchestrator=orchestrator,
|
||||
background_tasks=background_tasks,
|
||||
)
|
||||
return response
|
||||
|
||||
@app.post(
|
||||
f"/v1/chunk/{path_name}/file",
|
||||
name=f"Chunk files with {display_name}",
|
||||
tags=["chunk"],
|
||||
response_model=ChunkDocumentResponse,
|
||||
responses={
|
||||
200: {
|
||||
"content": {"application/zip": {}},
|
||||
}
|
||||
},
|
||||
)
|
||||
async def chunk_file(
|
||||
background_tasks: BackgroundTasks,
|
||||
auth: Annotated[AuthenticationResult, Depends(require_auth)],
|
||||
orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
|
||||
files: list[UploadFile],
|
||||
convert_options: Annotated[
|
||||
ConvertDocumentsRequestOptions,
|
||||
FormDepends(
|
||||
ConvertDocumentsRequestOptions,
|
||||
prefix="convert_",
|
||||
excluded_fields=[
|
||||
"to_formats",
|
||||
],
|
||||
),
|
||||
],
|
||||
chunking_options: Annotated[
|
||||
opt_cls,
|
||||
FormDepends(
|
||||
HybridChunkerOptions,
|
||||
prefix="chunking_",
|
||||
excluded_fields=["chunker"],
|
||||
),
|
||||
],
|
||||
include_converted_doc: Annotated[
|
||||
bool,
|
||||
Form(
|
||||
description="If true, the output will include both the chunks and the converted document."
|
||||
),
|
||||
] = False,
|
||||
target_type: Annotated[
|
||||
TargetName,
|
||||
Form(description="Specification for the type of output target."),
|
||||
] = TargetName.INBODY,
|
||||
):
|
||||
target = InBodyTarget() if target_type == TargetName.INBODY else ZipTarget()
|
||||
task = await _enque_file(
|
||||
task_type=TaskType.CHUNK,
|
||||
orchestrator=orchestrator,
|
||||
files=files,
|
||||
convert_options=convert_options,
|
||||
chunking_options=chunking_options,
|
||||
chunking_export_options=ChunkingExportOptions(
|
||||
include_converted_doc=include_converted_doc
|
||||
),
|
||||
target=target,
|
||||
)
|
||||
completed = await _wait_task_complete(
|
||||
orchestrator=orchestrator, task_id=task.task_id
|
||||
)
|
||||
|
||||
if not completed:
|
||||
# TODO: abort task!
|
||||
raise HTTPException(
|
||||
status_code=504,
|
||||
detail=f"Conversion is taking too long. The maximum wait time is configure as DOCLING_SERVE_MAX_SYNC_WAIT={docling_serve_settings.max_sync_wait}.",
|
||||
)
|
||||
|
||||
task_result = await orchestrator.task_result(task_id=task.task_id)
|
||||
if task_result is None:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail="Task result not found. Please wait for a completion status.",
|
||||
)
|
||||
response = await prepare_response(
|
||||
task_id=task.task_id,
|
||||
task_result=task_result,
|
||||
orchestrator=orchestrator,
|
||||
background_tasks=background_tasks,
|
||||
)
|
||||
return response
|
||||
|
||||
# Task status poll
|
||||
@app.get(
|
||||
"/v1/status/poll/{task_id}",
|
||||
tags=["tasks"],
|
||||
response_model=TaskStatusResponse,
|
||||
)
|
||||
async def task_status_poll(
|
||||
@@ -557,6 +843,7 @@ def create_app(): # noqa: C901
|
||||
raise HTTPException(status_code=404, detail="Task not found.")
|
||||
return TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_type=task.task_type,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
task_meta=task.processing_meta,
|
||||
@@ -600,6 +887,7 @@ def create_app(): # noqa: C901
|
||||
task_queue_position = await orchestrator.get_queue_position(task_id=task_id)
|
||||
task_response = TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_type=task.task_type,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
task_meta=task.processing_meta,
|
||||
@@ -615,6 +903,7 @@ def create_app(): # noqa: C901
|
||||
)
|
||||
task_response = TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_type=task.task_type,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
task_meta=task.processing_meta,
|
||||
@@ -637,7 +926,10 @@ def create_app(): # noqa: C901
|
||||
# Task result
|
||||
@app.get(
|
||||
"/v1/result/{task_id}",
|
||||
response_model=ConvertDocumentResponse | PresignedUrlConvertDocumentResponse,
|
||||
tags=["tasks"],
|
||||
response_model=ConvertDocumentResponse
|
||||
| PresignedUrlConvertDocumentResponse
|
||||
| ChunkDocumentResponse,
|
||||
responses={
|
||||
200: {
|
||||
"content": {"application/zip": {}},
|
||||
@@ -670,6 +962,8 @@ def create_app(): # noqa: C901
|
||||
# Update task progress
|
||||
@app.post(
|
||||
"/v1/callback/task/progress",
|
||||
tags=["internal"],
|
||||
include_in_schema=False,
|
||||
response_model=ProgressCallbackResponse,
|
||||
)
|
||||
async def callback_task_progress(
|
||||
@@ -692,6 +986,7 @@ def create_app(): # noqa: C901
|
||||
# Offload models
|
||||
@app.get(
|
||||
"/v1/clear/converters",
|
||||
tags=["clear"],
|
||||
response_model=ClearResponse,
|
||||
)
|
||||
async def clear_converters(
|
||||
@@ -704,6 +999,7 @@ def create_app(): # noqa: C901
|
||||
# Clean results
|
||||
@app.get(
|
||||
"/v1/clear/results",
|
||||
tags=["clear"],
|
||||
response_model=ClearResponse,
|
||||
)
|
||||
async def clear_results(
|
||||
|
||||
@@ -1,16 +1,20 @@
|
||||
import enum
|
||||
from typing import Annotated, Literal
|
||||
from functools import cache
|
||||
from typing import Annotated, Generic, Literal
|
||||
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
from pydantic_core import PydanticCustomError
|
||||
from typing_extensions import Self
|
||||
from typing_extensions import Self, TypeVar
|
||||
|
||||
from docling_jobkit.datamodel.chunking import (
|
||||
BaseChunkerOptions,
|
||||
)
|
||||
from docling_jobkit.datamodel.http_inputs import FileSource, HttpSource
|
||||
from docling_jobkit.datamodel.s3_coords import S3Coordinates
|
||||
from docling_jobkit.datamodel.task_targets import (
|
||||
InBodyTarget,
|
||||
PutTarget,
|
||||
S3Target,
|
||||
TaskTarget,
|
||||
ZipTarget,
|
||||
)
|
||||
|
||||
@@ -43,12 +47,17 @@ SourceRequestItem = Annotated[
|
||||
FileSourceRequest | HttpSourceRequest | S3SourceRequest, Field(discriminator="kind")
|
||||
]
|
||||
|
||||
TargetRequest = Annotated[
|
||||
InBodyTarget | ZipTarget | S3Target | PutTarget,
|
||||
Field(discriminator="kind"),
|
||||
]
|
||||
|
||||
|
||||
## Complete Source request
|
||||
class ConvertDocumentsRequest(BaseModel):
|
||||
options: ConvertDocumentsRequestOptions = ConvertDocumentsRequestOptions()
|
||||
sources: list[SourceRequestItem]
|
||||
target: TaskTarget = InBodyTarget()
|
||||
target: TargetRequest = InBodyTarget()
|
||||
|
||||
@model_validator(mode="after")
|
||||
def validate_s3_source_and_target(self) -> Self:
|
||||
@@ -70,3 +79,52 @@ class ConvertDocumentsRequest(BaseModel):
|
||||
"error target", 'target kind "s3" requires source kind "s3"'
|
||||
)
|
||||
return self
|
||||
|
||||
|
||||
## Source chunking requests
|
||||
|
||||
|
||||
# Fields shared by every chunking request, independent of which chunker is used.
# The chunker-specific `chunking_options` field is not declared on this class.
# NOTE: documented with `#` comments on purpose; pydantic copies a class
# docstring into the generated JSON schema description.
class BaseChunkDocumentsRequest(BaseModel):
    # Options for the conversion step; defaults to a default-constructed
    # ConvertDocumentsRequestOptions.
    convert_options: Annotated[
        ConvertDocumentsRequestOptions, Field(description="Conversion options.")
    ] = ConvertDocumentsRequestOptions()
    # Input documents. No default is given, so callers must always supply this.
    sources: Annotated[
        list[SourceRequestItem],
        Field(description="List of input document sources to process."),
    ]
    # Off by default: set to true to get the converted document back alongside
    # the chunks.
    include_converted_doc: Annotated[
        bool,
        Field(
            description="If true, the output will include both the chunks and the converted document."
        ),
    ] = False
    # Where the output goes; defaults to returning it in the response body
    # (InBodyTarget).
    target: Annotated[
        TargetRequest, Field(description="Specification for the type of output target.")
    ] = InBodyTarget()
|
||||
|
||||
|
||||
ChunkingOptT = TypeVar("ChunkingOptT", bound=BaseChunkerOptions)
|
||||
|
||||
|
||||
# Generic chunking request: the shared fields of BaseChunkDocumentsRequest plus a
# `chunking_options` field whose type is the ChunkingOptT parameter (bound to
# BaseChunkerOptions). No default is declared for the field on this class.
class GenericChunkDocumentsRequest(BaseChunkDocumentsRequest, Generic[ChunkingOptT]):
    chunking_options: ChunkingOptT
|
||||
|
||||
|
||||
@cache
def make_request_model(
    opt_type: type[ChunkingOptT],
) -> type[GenericChunkDocumentsRequest[ChunkingOptT]]:
    """
    Build the concrete request model for one chunker options class.

    The returned class is a subclass of GenericChunkDocumentsRequest[opt_type]
    named "<opt_type.__name__>DocumentsRequest". Its chunking_options field is
    re-declared with opt_type as the annotation and opt_type as the default
    factory, so the field can be omitted by callers.

    Results are memoized with functools.cache, so calling this repeatedly with
    the same opt_type returns the very same class object.
    """
    model_name = f"{opt_type.__name__}DocumentsRequest"
    parent = GenericChunkDocumentsRequest[opt_type]  # type: ignore[valid-type]
    namespace = {
        "__annotations__": {"chunking_options": opt_type},
        "chunking_options": Field(
            default_factory=opt_type, description="Options specific to the chunker."
        ),
    }
    return type(model_name, (parent,), namespace)
|
||||
|
||||
@@ -5,8 +5,12 @@ from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.document import ConversionStatus, ErrorItem
|
||||
from docling.utils.profiling import ProfilingItem
|
||||
from docling_jobkit.datamodel.result import ExportDocumentResponse
|
||||
from docling_jobkit.datamodel.task_meta import TaskProcessingMeta
|
||||
from docling_jobkit.datamodel.result import (
|
||||
ChunkedDocumentResultItem,
|
||||
ExportDocumentResponse,
|
||||
ExportResult,
|
||||
)
|
||||
from docling_jobkit.datamodel.task_meta import TaskProcessingMeta, TaskType
|
||||
|
||||
|
||||
# Status
|
||||
@@ -37,8 +41,15 @@ class ConvertDocumentErrorResponse(BaseModel):
|
||||
status: ConversionStatus
|
||||
|
||||
|
||||
# Response body returned for chunking tasks.
# NOTE: documented with `#` comments on purpose; pydantic copies a class
# docstring into the generated JSON schema description.
class ChunkDocumentResponse(BaseModel):
    # Chunk items produced for the request.
    chunks: list[ChunkedDocumentResultItem]
    # Export results for the processed documents.
    documents: list[ExportResult]
    # Processing time reported for the task; unit is not visible here
    # (presumably seconds — TODO confirm).
    processing_time: float
|
||||
|
||||
|
||||
class TaskStatusResponse(BaseModel):
|
||||
task_id: str
|
||||
task_type: TaskType
|
||||
task_status: str
|
||||
task_position: Optional[int] = None
|
||||
task_meta: Optional[TaskProcessingMeta] = None
|
||||
|
||||
@@ -4,6 +4,7 @@ import itertools
|
||||
import json
|
||||
import logging
|
||||
import ssl
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
@@ -224,13 +225,17 @@ def auto_set_return_as_file(
|
||||
|
||||
def change_ocr_lang(ocr_engine):
|
||||
if ocr_engine == "easyocr":
|
||||
return "en,fr,de,es"
|
||||
return gr.update(visible=True, value="en,fr,de,es")
|
||||
elif ocr_engine == "tesseract_cli":
|
||||
return "eng,fra,deu,spa"
|
||||
return gr.update(visible=True, value="eng,fra,deu,spa")
|
||||
elif ocr_engine == "tesseract":
|
||||
return "eng,fra,deu,spa"
|
||||
return gr.update(visible=True, value="eng,fra,deu,spa")
|
||||
elif ocr_engine == "rapidocr":
|
||||
return "english,chinese"
|
||||
return gr.update(visible=True, value="english,chinese")
|
||||
elif ocr_engine == "ocrmac":
|
||||
return gr.update(visible=True, value="fr-FR,de-DE,es-ES,en-US")
|
||||
|
||||
return gr.update(visible=False, value="")
|
||||
|
||||
|
||||
def wait_task_finish(auth: str, task_id: str, return_as_file: bool):
|
||||
@@ -570,14 +575,17 @@ with gr.Blocks(
|
||||
with gr.Tab("Convert File"):
|
||||
with gr.Row():
|
||||
with gr.Column(scale=4):
|
||||
raw_exts = itertools.chain.from_iterable(FormatToExtensions.values())
|
||||
file_input = gr.File(
|
||||
elem_id="file_input_zone",
|
||||
label="Upload File",
|
||||
file_types=[
|
||||
f".{v}"
|
||||
for v in itertools.chain.from_iterable(
|
||||
FormatToExtensions.values()
|
||||
)
|
||||
f".{v.lower()}"
|
||||
for v in raw_exts # lowercase
|
||||
]
|
||||
+ [
|
||||
f".{v.upper()}"
|
||||
for v in raw_exts # uppercase
|
||||
],
|
||||
file_count="multiple",
|
||||
scale=4,
|
||||
@@ -633,18 +641,25 @@ with gr.Blocks(
|
||||
ocr = gr.Checkbox(label="Enable OCR", value=True)
|
||||
force_ocr = gr.Checkbox(label="Force OCR", value=False)
|
||||
with gr.Column(scale=1):
|
||||
engines_list = [
|
||||
("Auto", "auto"),
|
||||
("EasyOCR", "easyocr"),
|
||||
("Tesseract", "tesseract"),
|
||||
("RapidOCR", "rapidocr"),
|
||||
]
|
||||
if sys.platform == "darwin":
|
||||
engines_list.append(("OCRMac", "ocrmac"))
|
||||
|
||||
ocr_engine = gr.Radio(
|
||||
[
|
||||
("EasyOCR", "easyocr"),
|
||||
("Tesseract", "tesseract"),
|
||||
("RapidOCR", "rapidocr"),
|
||||
],
|
||||
engines_list,
|
||||
label="OCR Engine",
|
||||
value="easyocr",
|
||||
value="auto",
|
||||
)
|
||||
with gr.Column(scale=1, min_width=200):
|
||||
ocr_lang = gr.Textbox(
|
||||
label="OCR Language (beware of the format)", value="en,fr,de,es"
|
||||
label="OCR Language (beware of the format)",
|
||||
value="en,fr,de,es",
|
||||
visible=False,
|
||||
)
|
||||
ocr_engine.change(change_ocr_lang, inputs=[ocr_engine], outputs=[ocr_lang])
|
||||
with gr.Row():
|
||||
|
||||
@@ -29,10 +29,15 @@ def is_pydantic_model(type_):
|
||||
|
||||
# Adapted from
|
||||
# https://github.com/fastapi/fastapi/discussions/8971#discussioncomment-7892972
|
||||
def FormDepends(cls: type[BaseModel]):
|
||||
def FormDepends(
|
||||
cls: type[BaseModel], prefix: str = "", excluded_fields: list[str] = []
|
||||
):
|
||||
new_parameters = []
|
||||
|
||||
for field_name, model_field in cls.model_fields.items():
|
||||
if field_name in excluded_fields:
|
||||
continue
|
||||
|
||||
annotation = model_field.annotation
|
||||
description = model_field.description
|
||||
default = (
|
||||
@@ -63,7 +68,7 @@ def FormDepends(cls: type[BaseModel]):
|
||||
|
||||
new_parameters.append(
|
||||
inspect.Parameter(
|
||||
name=field_name,
|
||||
name=f"{prefix}{field_name}",
|
||||
kind=inspect.Parameter.POSITIONAL_ONLY,
|
||||
default=default,
|
||||
annotation=annotation,
|
||||
@@ -71,19 +76,23 @@ def FormDepends(cls: type[BaseModel]):
|
||||
)
|
||||
|
||||
async def as_form_func(**data):
|
||||
newdata = {}
|
||||
for field_name, model_field in cls.model_fields.items():
|
||||
value = data.get(field_name)
|
||||
if field_name in excluded_fields:
|
||||
continue
|
||||
value = data.get(f"{prefix}{field_name}")
|
||||
newdata[field_name] = value
|
||||
annotation = model_field.annotation
|
||||
|
||||
# Parse nested models from JSON string
|
||||
if value is not None and is_pydantic_model(annotation):
|
||||
try:
|
||||
validator = TypeAdapter(annotation)
|
||||
data[field_name] = validator.validate_json(value)
|
||||
newdata[field_name] = validator.validate_json(value)
|
||||
except Exception as e:
|
||||
raise ValueError(f"Invalid JSON for field '{field_name}': {e}")
|
||||
|
||||
return cls(**data)
|
||||
return cls(**newdata)
|
||||
|
||||
sig = inspect.signature(as_form_func)
|
||||
sig = sig.replace(parameters=new_parameters)
|
||||
|
||||
@@ -4,7 +4,8 @@ import logging
|
||||
from fastapi import BackgroundTasks, Response
|
||||
|
||||
from docling_jobkit.datamodel.result import (
|
||||
ConvertDocumentResult,
|
||||
ChunkedDocumentResult,
|
||||
DoclingTaskResult,
|
||||
ExportResult,
|
||||
RemoteTargetResult,
|
||||
ZipArchiveResult,
|
||||
@@ -14,6 +15,7 @@ from docling_jobkit.orchestrators.base_orchestrator import (
|
||||
)
|
||||
|
||||
from docling_serve.datamodel.responses import (
|
||||
ChunkDocumentResponse,
|
||||
ConvertDocumentResponse,
|
||||
PresignedUrlConvertDocumentResponse,
|
||||
)
|
||||
@@ -24,11 +26,16 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
async def prepare_response(
|
||||
task_id: str,
|
||||
task_result: ConvertDocumentResult,
|
||||
task_result: DoclingTaskResult,
|
||||
orchestrator: BaseOrchestrator,
|
||||
background_tasks: BackgroundTasks,
|
||||
):
|
||||
response: Response | ConvertDocumentResponse | PresignedUrlConvertDocumentResponse
|
||||
response: (
|
||||
Response
|
||||
| ConvertDocumentResponse
|
||||
| PresignedUrlConvertDocumentResponse
|
||||
| ChunkDocumentResponse
|
||||
)
|
||||
if isinstance(task_result.result, ExportResult):
|
||||
response = ConvertDocumentResponse(
|
||||
document=task_result.result.content,
|
||||
@@ -52,6 +59,12 @@ async def prepare_response(
|
||||
num_succeeded=task_result.num_succeeded,
|
||||
num_failed=task_result.num_failed,
|
||||
)
|
||||
elif isinstance(task_result.result, ChunkedDocumentResult):
|
||||
response = ChunkDocumentResponse(
|
||||
chunks=task_result.result.chunks,
|
||||
documents=task_result.result.documents,
|
||||
processing_time=task_result.processing_time,
|
||||
)
|
||||
else:
|
||||
raise ValueError("Unknown result type")
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@ class WebsocketNotifier(BaseNotifier):
|
||||
task_queue_position = await self.orchestrator.get_queue_position(task_id)
|
||||
msg = TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_type=task.task_type,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
task_meta=task.processing_meta,
|
||||
|
||||
@@ -433,7 +433,7 @@ with connect(uri) as websocket:
|
||||
payload = json.loads(message)
|
||||
if payload["message"] == "error":
|
||||
break
|
||||
if payload["message"] == "error" and payload["task"]["task_status"] in ("success", "failure"):
|
||||
if payload["message"] == "update" and payload["task"]["task_status"] in ("success", "failure"):
|
||||
break
|
||||
except:
|
||||
break
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "docling-serve"
|
||||
version = "1.4.0" # DO NOT EDIT, updated automatically
|
||||
version = "1.7.0" # DO NOT EDIT, updated automatically
|
||||
description = "Running Docling as a service"
|
||||
license = {text = "MIT"}
|
||||
authors = [
|
||||
@@ -35,7 +35,7 @@ requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"docling~=2.38",
|
||||
"docling-core>=2.45.0",
|
||||
"docling-jobkit[kfp,rq,vlm]>=1.4.0,<2.0.0",
|
||||
"docling-jobkit[kfp,rq,vlm]>=1.6.0,<2.0.0",
|
||||
"fastapi[standard]~=0.115",
|
||||
"httpx~=0.28",
|
||||
"pydantic~=2.10",
|
||||
@@ -50,15 +50,17 @@ dependencies = [
|
||||
|
||||
[project.optional-dependencies]
|
||||
ui = [
|
||||
"gradio~=5.9",
|
||||
"pydantic<2.11.0", # fix compatibility between gradio and new pydantic 2.11
|
||||
"gradio>=5.23.2,<6.0.0",
|
||||
]
|
||||
tesserocr = [
|
||||
"tesserocr~=2.7"
|
||||
]
|
||||
easyocr = [
|
||||
"easyocr>=1.7",
|
||||
]
|
||||
rapidocr = [
|
||||
"rapidocr-onnxruntime~=1.4; python_version<'3.13'",
|
||||
"onnxruntime~=1.7",
|
||||
"rapidocr (>=3.3,<4.0.0) ; python_version < '3.14'",
|
||||
"onnxruntime (>=1.7.0,<2.0.0)",
|
||||
]
|
||||
flash-attn = [
|
||||
"flash-attn~=2.8.2; sys_platform == 'linux' and platform_machine == 'x86_64'"
|
||||
@@ -87,10 +89,10 @@ cpu = [
|
||||
"torchvision>=0.22.1",
|
||||
]
|
||||
|
||||
cu124 = [
|
||||
"torch>=2.6.0",
|
||||
"torchvision>=0.21.0",
|
||||
]
|
||||
# cu124 = [
|
||||
# "torch>=2.6.0",
|
||||
# "torchvision>=0.21.0",
|
||||
# ]
|
||||
|
||||
cu126 = [
|
||||
"torch>=2.7.1",
|
||||
@@ -115,7 +117,7 @@ conflicts = [
|
||||
[
|
||||
{ group = "pypi" },
|
||||
{ group = "cpu" },
|
||||
{ group = "cu124" },
|
||||
# { group = "cu124" },
|
||||
{ group = "cu126" },
|
||||
{ group = "cu128" },
|
||||
{ group = "rocm" },
|
||||
@@ -123,14 +125,15 @@ conflicts = [
|
||||
]
|
||||
environments = ["sys_platform != 'darwin' or platform_machine != 'x86_64'"]
|
||||
override-dependencies = [
|
||||
"urllib3~=2.0"
|
||||
"urllib3~=2.0",
|
||||
"xgrammar>=0.1.24"
|
||||
]
|
||||
|
||||
[tool.uv.sources]
|
||||
torch = [
|
||||
{ index = "pytorch-pypi", group = "pypi" },
|
||||
{ index = "pytorch-cpu", group = "cpu" },
|
||||
{ index = "pytorch-cu124", group = "cu124", marker = "sys_platform == 'linux'" },
|
||||
# { index = "pytorch-cu124", group = "cu124", marker = "sys_platform == 'linux'" },
|
||||
{ index = "pytorch-cu126", group = "cu126", marker = "sys_platform == 'linux'" },
|
||||
{ index = "pytorch-cu128", group = "cu128", marker = "sys_platform == 'linux'" },
|
||||
{ index = "pytorch-rocm", group = "rocm", marker = "sys_platform == 'linux'" },
|
||||
@@ -139,7 +142,7 @@ torch = [
|
||||
torchvision = [
|
||||
{ index = "pytorch-pypi", group = "pypi" },
|
||||
{ index = "pytorch-cpu", group = "cpu" },
|
||||
{ index = "pytorch-cu124", group = "cu124", marker = "sys_platform == 'linux'" },
|
||||
# { index = "pytorch-cu124", group = "cu124", marker = "sys_platform == 'linux'" },
|
||||
{ index = "pytorch-cu126", group = "cu126", marker = "sys_platform == 'linux'" },
|
||||
{ index = "pytorch-cu128", group = "cu128", marker = "sys_platform == 'linux'" },
|
||||
{ index = "pytorch-rocm", group = "rocm", marker = "sys_platform == 'linux'" },
|
||||
@@ -162,10 +165,10 @@ name = "pytorch-cpu"
|
||||
url = "https://download.pytorch.org/whl/cpu"
|
||||
explicit = true
|
||||
|
||||
[[tool.uv.index]]
|
||||
name = "pytorch-cu124"
|
||||
url = "https://download.pytorch.org/whl/cu124"
|
||||
explicit = true
|
||||
# [[tool.uv.index]]
|
||||
# name = "pytorch-cu124"
|
||||
# url = "https://download.pytorch.org/whl/cu124"
|
||||
# explicit = true
|
||||
|
||||
[[tool.uv.index]]
|
||||
name = "pytorch-cu126"
|
||||
@@ -279,6 +282,7 @@ module = [
|
||||
"kfp.*",
|
||||
"kfp_server_api.*",
|
||||
"mlx_vlm.*",
|
||||
"mlx.*",
|
||||
"scalar_fastapi.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
@@ -62,3 +62,60 @@ async def test_convert_url(async_client):
|
||||
time.sleep(2)
|
||||
|
||||
assert task["task_status"] == "success"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("include_converted_doc", [False, True])
async def test_chunk_url(async_client, include_converted_doc: bool):
    """Submit an async hybrid-chunking task for a URL, poll it, and check the result.

    The task is posted to the async chunk endpoint, its status is polled until it
    reaches "success" or "failure", and the result is then fetched and validated.
    With include_converted_doc=True the converted DoclingDocument must be present
    in the result; otherwise its json_content must be None.

    Requires a docling-serve instance listening on localhost:5001.
    """
    # Imported locally so this test does not depend on a module-level import.
    import asyncio

    example_docs = [
        "https://arxiv.org/pdf/2311.18481",
    ]

    base_url = "http://localhost:5001/v1"
    payload = {
        "sources": [{"kind": "http", "url": random.choice(example_docs)}],
        "include_converted_doc": include_converted_doc,
    }

    response = await async_client.post(
        f"{base_url}/chunk/hybrid/source/async", json=payload
    )
    assert response.status_code == 200, "Response should be 200 OK"

    task = response.json()

    print(json.dumps(task, indent=2))

    while task["task_status"] not in ("success", "failure"):
        response = await async_client.get(f"{base_url}/status/poll/{task['task_id']}")
        assert response.status_code == 200, "Response should be 200 OK"
        task = response.json()
        print(f"{task['task_status']=}")
        print(f"{task['task_position']=}")

        # Yield to the event loop between polls; time.sleep() would block it.
        await asyncio.sleep(2)

    assert task["task_status"] == "success"

    result_resp = await async_client.get(f"{base_url}/result/{task['task_id']}")
    assert result_resp.status_code == 200, "Response should be 200 OK"
    result = result_resp.json()
    print("Got result.")

    assert "chunks" in result
    assert len(result["chunks"]) > 0

    assert "documents" in result
    assert len(result["documents"]) > 0
    assert result["documents"][0]["status"] == "success"

    if include_converted_doc:
        assert result["documents"][0]["content"]["json_content"] is not None
        assert (
            result["documents"][0]["content"]["json_content"]["schema_name"]
            == "DoclingDocument"
        )
    else:
        assert result["documents"][0]["content"]["json_content"] is None
|
||||
|
||||
Reference in New Issue
Block a user