chore: bump version to 1.4.0 [skip ci]

feat(docling): performance improvements in parsing, new layout model, fixes in html processing (#352)
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-11-29 08:33:50 +00:00 · 2025-09-05 17:57:08 +00:00 · 2025-09-05 16:21:29 +02:00 · 2025-09-04 10:42:11 +02:00 · 2025-09-04 09:17:19 +02:00 · 2025-09-03 15:42:55 +02:00
81 changed files with 8287 additions and 4682 deletions
--- a/.github/scripts/release.sh
+++ b/.github/scripts/release.sh
@@ -3,32 +3,68 @@
 set -e  # trigger failure on error - do not remove!
 set -x  # display command on output

+## debug
+# TARGET_VERSION="1.2.x"
+
 if [ -z "${TARGET_VERSION}" ]; then
    >&2 echo "No TARGET_VERSION specified"
    exit 1
 fi
 CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}"

-# update package version
+# Update package version
 uvx --from=toml-cli toml set --toml-path=pyproject.toml project.version "${TARGET_VERSION}"
 uv lock --upgrade-package docling-serve

-# collect release notes
+# Extract all docling packages and versions from uv.lock
+DOCVERSIONS=$(uvx --with toml python3 - <<'PY'
+import toml
+data = toml.load("uv.lock")
+for pkg in data.get("package", []):
+    if pkg["name"].startswith("docling"):
+        print(f"{pkg['name']} {pkg['version']}")
+PY
+)
+
+# Format docling versions list without trailing newline
+DOCLING_VERSIONS="### Docling libraries included in this release:"
+while IFS= read -r line; do
+  DOCLING_VERSIONS+="
+- $line"
+done <<< "$DOCVERSIONS"
+
+# Collect release notes
 REL_NOTES=$(mktemp)
 uv run --no-sync semantic-release changelog --unreleased >> "${REL_NOTES}"

-# update changelog
+# Strip trailing blank lines from release notes and append docling versions
+{
+  sed -e :a -e '/^\n*$/{$d;N;};/\n$/ba' "${REL_NOTES}"
+  printf "\n"
+  printf "%s" "${DOCLING_VERSIONS}"
+  printf "\n"
+} > "${REL_NOTES}.tmp" && mv "${REL_NOTES}.tmp" "${REL_NOTES}"
+
+# Update changelog
 TMP_CHGLOG=$(mktemp)
 TARGET_TAG_NAME="v${TARGET_VERSION}"
 RELEASE_URL="$(gh repo view --json url -q ".url")/releases/tag/${TARGET_TAG_NAME}"
-printf "## [${TARGET_TAG_NAME}](${RELEASE_URL}) - $(date -Idate)\n\n" >> "${TMP_CHGLOG}"
-cat "${REL_NOTES}" >> "${TMP_CHGLOG}"
-if [ -f "${CHGLOG_FILE}" ]; then
-    printf "\n" | cat - "${CHGLOG_FILE}" >> "${TMP_CHGLOG}"
-fi
+## debug
+#RELEASE_URL="myrepo/releases/tag/${TARGET_TAG_NAME}"
+
+# Strip leading blank lines from the existing changelog, if there is one, so appending does not stack blank lines
+EXISTING_CL=$(if [ -f "${CHGLOG_FILE}" ]; then sed '/./,$!d' "${CHGLOG_FILE}"; fi)
+
+{
+  printf "## [%s](%s) - %s\n\n" "${TARGET_TAG_NAME}" "${RELEASE_URL}" "$(date -Idate)"  # values passed as arguments, never as the format string
+  cat "${REL_NOTES}"
+  printf "\n"
+  printf "%s\n" "${EXISTING_CL}"
+} >> "${TMP_CHGLOG}"
+
 mv "${TMP_CHGLOG}" "${CHGLOG_FILE}"

-# push changes
+# Push changes
 git config --global user.name 'github-actions[bot]'
 git config --global user.email 'github-actions[bot]@users.noreply.github.com'
 git add pyproject.toml uv.lock "${CHGLOG_FILE}"
@@ -36,5 +72,5 @@ COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]"
 git commit -m "${COMMIT_MSG}"
 git push origin main

-# create GitHub release (incl. Git tag)
+# Create GitHub release (incl. Git tag)
 gh release create "${TARGET_TAG_NAME}" -F "${REL_NOTES}"
--- a/.github/styles/config/vocabularies/Docling/accept.txt
+++ b/.github/styles/config/vocabularies/Docling/accept.txt
@@ -0,0 +1,39 @@
+[Dd]ocling
+precommit
+asgi
+async
+(?i)urls
+uvicorn
+[Ww]ebserver
+RQ
+(?i)url
+keyfile
+[Ww]ebsocket(s?)
+[Kk]ubernetes
+UI
+(?i)vllm
+APIs
+[Ss]ubprocesses
+(?i)api
+Kubeflow
+(?i)Jobkit
+(?i)cpu
+(?i)PyTorch
+(?i)CUDA
+(?i)NVIDIA
+(?i)ROCm
+(?i)env
+Gradio
+Podman
+bool
+Ollama
+inbody
+LGTMs
+Dolfi
+Lysak
+Nikos
+Nassar
+Panos
+Vagenas
+Staar
+Livathinos
--- a/.github/vale.ini
+++ b/.github/vale.ini
@@ -0,0 +1,11 @@
+StylesPath = styles
+MinAlertLevel = suggestion
+; Packages = write-good, proselint
+
+Vocab = Docling
+
+[*.md]
+BasedOnStyles = Vale
+
+[CHANGELOG.md]
+BasedOnStyles = 
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -13,7 +13,7 @@ jobs:
  actionlint:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
      - name: Download actionlint
        id: get_actionlint
        run: bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash)
--- a/.github/workflows/cd.yml
+++ b/.github/workflows/cd.yml
@@ -11,11 +11,11 @@ jobs:
    outputs:
      TARGET_TAG_V: ${{ steps.version_check.outputs.TRGT_VERSION }}
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
        with:
          fetch-depth: 0  # for fetching tags, required for semantic-release
      - name: Install uv and set the python version
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
      - name: Install dependencies
@@ -40,12 +40,12 @@ jobs:
        with:
          app-id: ${{ vars.CI_APP_ID }}
          private-key: ${{ secrets.CI_PRIVATE_KEY }}
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
        with:
          token: ${{ steps.app-token.outputs.token }}
          fetch-depth: 0  # for fetching tags, required for semantic-release
      - name: Install uv and set the python version
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
      - name: Install dependencies
--- a/.github/workflows/ci-images-dryrun.yml
+++ b/.github/workflows/ci-images-dryrun.yml
@@ -15,16 +15,28 @@ jobs:
        spec:
          - name: docling-project/docling-serve
            build_args: |
-              UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra cpu
+              UV_SYNC_EXTRA_ARGS=--no-extra flash-attn
            platforms: linux/amd64, linux/arm64
          - name: docling-project/docling-serve-cpu
            build_args: |
-              UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra flash-attn
+              UV_SYNC_EXTRA_ARGS=--no-group pypi --group cpu --no-extra flash-attn
            platforms: linux/amd64, linux/arm64
-          - name: docling-project/docling-serve-cu124
+          # - name: docling-project/docling-serve-cu124
+          #   build_args: |
+          #     UV_SYNC_EXTRA_ARGS=--no-group pypi --group cu124
+          #   platforms: linux/amd64
+          - name: docling-project/docling-serve-cu126
            build_args: |
-              UV_SYNC_EXTRA_ARGS=--no-extra cpu
+              UV_SYNC_EXTRA_ARGS=--no-group pypi --group cu126
            platforms: linux/amd64
+          - name: docling-project/docling-serve-cu128
+            build_args: |
+              UV_SYNC_EXTRA_ARGS=--no-group pypi --group cu128
+            platforms: linux/amd64
+          # - name: docling-project/docling-serve-rocm
+          #   build_args: |
+          #     UV_SYNC_EXTRA_ARGS=--no-group pypi --group rocm --no-extra flash-attn
+          #   platforms: linux/amd64

    permissions:
      packages: write
--- a/.github/workflows/images.yml
+++ b/.github/workflows/images.yml
@@ -19,17 +19,28 @@ jobs:
        spec:
          - name: docling-project/docling-serve
            build_args: |
-              UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra cpu
+              UV_SYNC_EXTRA_ARGS=--no-extra flash-attn
            platforms: linux/amd64, linux/arm64
          - name: docling-project/docling-serve-cpu
            build_args: |
-              UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra flash-attn
+              UV_SYNC_EXTRA_ARGS=--no-group pypi --group cpu --no-extra flash-attn
            platforms: linux/amd64, linux/arm64
-          - name: docling-project/docling-serve-cu124
+          # - name: docling-project/docling-serve-cu124
+          #   build_args: |
+          #     UV_SYNC_EXTRA_ARGS=--no-group pypi --group cu124
+          #   platforms: linux/amd64
+          - name: docling-project/docling-serve-cu126
            build_args: |
-              UV_SYNC_EXTRA_ARGS=--no-extra cpu
+              UV_SYNC_EXTRA_ARGS=--no-group pypi --group cu126
            platforms: linux/amd64
-
+          - name: docling-project/docling-serve-cu128
+            build_args: |
+              UV_SYNC_EXTRA_ARGS=--no-group pypi --group cu128
+            platforms: linux/amd64
+          # - name: docling-project/docling-serve-rocm
+          #   build_args: |
+          #     UV_SYNC_EXTRA_ARGS=--no-group pypi --group rocm --no-extra flash-attn
+          #   platforms: linux/amd64
    permissions:
      packages: write
      contents: read
--- a/.github/workflows/job-build.yml
+++ b/.github/workflows/job-build.yml
@@ -10,14 +10,14 @@ jobs:
      matrix:
        python-version: ['3.12']
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
      - name: Install uv and set the python version
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v6
        with:
          python-version: ${{ matrix.python-version }}
          enable-cache: true
      - name: Install dependencies
-        run: uv sync --all-extras --no-extra cu124 --no-extra flash-attn
+        run: uv sync --all-extras --no-extra flash-attn
      - name: Build package
        run: uv build
      - name: Check content of wheel
--- a/.github/workflows/job-checks.yml
+++ b/.github/workflows/job-checks.yml
@@ -10,9 +10,9 @@ jobs:
      matrix:
        python-version: ['3.12']
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
      - name: Install uv and set the python version
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v6
        with:
          python-version: ${{ matrix.python-version }}
          enable-cache: true
@@ -25,10 +25,10 @@ jobs:
          key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}

      - name: Install dependencies
-        run: uv sync --frozen --all-extras --no-extra cu124 --no-extra flash-attn
+        run: uv sync --frozen --all-extras --no-extra flash-attn

      - name: Run styling check
-        run: pre-commit run --all-files
+        run: uv run pre-commit run --all-files

  build-package:
    uses: ./.github/workflows/job-build.yml
@@ -47,21 +47,22 @@ jobs:
          name: python-package-distributions
          path: dist/
      - name: Install uv and set the python version
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v6
        with:
          python-version: ${{ matrix.python-version }}
          enable-cache: true
+      - name: Create virtual environment
+        run: uv venv
      - name: Install package
        run: uv pip install dist/*.whl
      - name: Create the server
-        run: python -c 'from docling_serve.app import create_app; create_app()'
-
-  markdown-lint:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - name: markdownlint-cli2-action
-        uses: DavidAnson/markdownlint-cli2-action@v16
-        with:
-          globs: "**/*.md"
+        run: .venv/bin/python -c 'from docling_serve.app import create_app; create_app()'

+  # markdown-lint:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - uses: actions/checkout@v5
+  #     - name: markdownlint-cli2-action
+  #       uses: DavidAnson/markdownlint-cli2-action@v16
+  #       with:
+  #         globs: "**/*.md"
--- a/.github/workflows/job-image.yml
+++ b/.github/workflows/job-image.yml
@@ -53,7 +53,7 @@ jobs:
            df -h

      - name: Check out the repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5

      - name: Log in to the GHCR container image registry
        if: ${{ inputs.publish }}
@@ -88,19 +88,114 @@ jobs:
        with:
          images: ${{ env.GHCR_REGISTRY }}/${{ inputs.ghcr_image_name }}

+      # # Local test
+      # - name: Set metadata outputs for local testing ## comment out Free up space, Log in to cr, Cache Docker, Extract metadata, and quay blocks and run act
+      #   id: ghcr_meta
+      #   run: |
+      #     echo "tags=ghcr.io/docling-project/docling-serve:pr-123" >> $GITHUB_OUTPUT
+      #     echo "labels=org.opencontainers.image.source=https://github.com/docling-project/docling-serve" >> $GITHUB_OUTPUT
+
      - name: Build and push image to ghcr.io
        id: ghcr_push
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: .
-          push: ${{ inputs.publish }}
+          push: ${{ inputs.publish }} # set 'false' for local test
          tags: ${{ steps.ghcr_meta.outputs.tags }}
          labels: ${{ steps.ghcr_meta.outputs.labels }}
-          platforms: ${{ inputs.platforms}}
+          platforms: ${{ inputs.platforms }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
          file: Containerfile
          build-args: ${{ inputs.build_args }}
+      ##
+      ## This stage runs after the build, so it leverages all build cache
+      ## 
+      - name: Export built image for testing
+        id: ghcr_export_built_image
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: false
+          load: true # == '--output=type=docker'
+          tags: ${{ steps.ghcr_meta.outputs.tags }}-test
+          labels: |
+            org.opencontainers.image.title=docling-serve
+            org.opencontainers.image.test=true
+          platforms: linux/amd64 # when 'load' is true, we can't use a list ${{ inputs.platforms }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          file: Containerfile
+          build-args: ${{ inputs.build_args }}
+
+      - name: Test image
+        if: steps.ghcr_export_built_image.outcome == 'success'
+        run: |
+          set -e
+
+          IMAGE_TAG="${{ steps.ghcr_meta.outputs.tags }}-test"
+          echo "Testing local image: $IMAGE_TAG"
+
+          # Remove existing container if any
+          docker rm -f docling-serve-test-container 2>/dev/null || true
+
+          echo "Starting container..."
+          docker run -d -p 5001:5001 --name docling-serve-test-container "$IMAGE_TAG"
+
+          echo "Waiting 15s for container to boot..."
+          sleep 15
+
+          # Health check
+          echo "Checking service health..."
+          for i in {1..20}; do
+            HEALTH_RESPONSE=$(curl -s http://localhost:5001/health || true)
+            echo "Health check response [$i]: $HEALTH_RESPONSE"
+
+            if echo "$HEALTH_RESPONSE" | grep -q '"status":"ok"'; then
+              echo "Service is healthy!"
+
+              # Install pytest and dependencies
+              echo "Installing pytest and dependencies..."
+              pip install uv
+              uv venv --allow-existing
+              source .venv/bin/activate
+              uv sync --all-extras --no-extra flash-attn
+
+              # Run pytest tests
+              echo "Running tests..."
+              # Test import
+              python -c 'from docling_serve.app import create_app; create_app()'
+
+              # Run pytest and check result directly
+              if ! pytest -sv -k "test_convert_url" tests/test_1-url-async.py \
+                --disable-warnings; then
+                echo "Tests failed!"
+                docker logs docling-serve-test-container
+                docker rm -f docling-serve-test-container
+                exit 1
+              fi
+
+              echo "Tests passed successfully!"
+              break
+            else
+              echo "Waiting for service... [$i/20]"
+              sleep 3
+            fi
+          done
+
+          # Final health check if service didn't pass earlier
+          if ! echo "$HEALTH_RESPONSE" | grep -q '"status":"ok"'; then
+            echo "Service did not become healthy in time."
+            docker logs docling-serve-test-container
+            docker rm -f docling-serve-test-container
+            exit 1
+          fi
+
+          # Cleanup
+          echo "Cleaning up test container..."
+          docker rm -f docling-serve-test-container
+          echo "Cleaning up test image..."
+          docker rmi "$IMAGE_TAG"

      - name: Generate artifact attestation
        if: ${{ inputs.publish }}
@@ -120,7 +215,7 @@ jobs:
      - name: Build and push image to quay.io
        if: ${{ inputs.publish }}
        # id: push-serve-cpu-quay
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: .
          push: ${{ inputs.publish }}
@@ -131,11 +226,202 @@ jobs:
          cache-to: type=gha,mode=max
          file: Containerfile
          build-args: ${{ inputs.build_args }}
-      
-      # - name: Inspect the image details
-      #   run: |
-      #     echo "${{ steps.ghcr_push.outputs.metadata }}"

      - name: Remove Local Docker Images
        run: |
          docker image prune -af
+##
+## Extra tests for released images
+##
+
+    # outputs:
+    #   image-tags: ${{ steps.ghcr_meta.outputs.tags }}
+    #   image-labels: ${{ steps.ghcr_meta.outputs.labels }}
+
+  # test-cpu-image:
+  #   needs:
+  #     - image
+  #   runs-on: ubuntu-latest
+  #   permissions:
+  #     contents: read
+  #     packages: read
+
+  #   steps:
+  #     - name: Checkout code
+  #       uses: actions/checkout@v5
+
+  #     - name: Test CPU images
+  #       run: |
+  #         set -e
+
+  #         echo "Testing image: ${{ needs.image.outputs.image-tags }}"
+
+  #         for tag in ${{ needs.image.outputs.image-tags }}; do
+  #           if echo "$tag" | grep -q -- '-cpu' && echo "$tag" | grep -qE ':[vV][0-9]+(\.[0-9]+){0,2}$'; then
+  #             echo "Testing CPU image: $tag"
+
+  #             # Remove existing container if any
+  #             docker rm -f docling-serve-test-container 2>/dev/null || true
+
+  #             echo "Pulling image..."
+  #             docker pull "$tag"
+
+  #             echo "Waiting 5s after pull..."
+  #             sleep 5
+
+  #             echo "Starting container..."
+  #             docker run -d -p 5001:5001 --name docling-serve-test-container "$tag"
+
+  #             echo "Waiting 15s for container to boot..."
+  #             sleep 15
+
+  #             echo "Checking service health..."
+  #             for i in {1..20}; do
+  #               health_response=$(curl -s http://localhost:5001/health || true)
+  #               echo "Health check response [$i]: $health_response"
+  #               if echo "$health_response" | grep -q '"status":"ok"'; then
+  #                 echo "Service is healthy!"
+  #                 echo "Sending test conversion request..."
+
+  #                 status_code=$(curl -s -o /dev/null -w "%{http_code}" -X POST 'http://localhost:5001/v1/convert/source' \
+  #                   -H 'accept: application/json' \
+  #                   -H 'Content-Type: application/json' \
+  #                   -d '{
+  #                     "options": {
+  #                       "from_formats": ["pdf"],
+  #                       "to_formats": ["md"]
+  #                     },
+  #                     "sources": [
+  #                       {
+  #                         "kind": "http",
+  #                         "url": "https://arxiv.org/pdf/2501.17887"
+  #                       }
+  #                     ],
+  #                     "target": {
+  #                       "kind": "inbody"
+  #                     }
+  #                   }')
+
+  #                 echo "Conversion request returned status code: $status_code"
+
+  #                 if [ "$status_code" -ne 200 ]; then
+  #                   echo "Conversion failed!"
+  #                   docker logs docling-serve-test-container
+  #                   docker rm -f docling-serve-test-container
+  #                   exit 1
+  #                 fi
+
+  #                 break
+  #               else
+  #                 echo "Waiting for service... [$i/20]"
+  #                 sleep 3
+  #               fi
+  #             done
+
+  #             if ! echo "$health_response" | grep -q '"status":"ok"'; then
+  #               echo "Service did not become healthy in time."
+  #               docker logs docling-serve-test-container
+  #               docker rm -f docling-serve-test-container
+  #               exit 1
+  #             fi
+
+  #             echo "Cleaning up test container..."
+  #             docker rm -f docling-serve-test-container
+  #           else
+  #             echo "Skipping non-released or non-CPU image: $tag"
+  #           fi
+  #         done
+
+  # test-cuda-image:
+  #   needs:
+  #     - image
+  #   runs-on: ubuntu-latest # >> placeholder for GPU runner << #
+  #   permissions:
+  #     contents: read
+  #     packages: read
+
+  #   steps:
+  #     - name: Checkout code
+  #       uses: actions/checkout@v5
+
+  #     - name: Test CUDA images
+  #       run: |
+  #         set -e
+
+  #         echo "Testing image: ${{ needs.image.outputs.image-tags }}"
+
+  #         for tag in ${{ needs.image.outputs.image-tags }}; do
+  #           if echo "$tag" | grep -qE -- '-cu[0-9]+' && echo "$tag" | grep -qE ':[vV][0-9]+(\.[0-9]+){0,2}$'; then
+  #             echo "Testing CUDA image: $tag"
+
+  #             # Remove existing container if any
+  #             docker rm -f docling-serve-test-container 2>/dev/null || true
+
+  #             echo "Pulling image..."
+  #             docker pull "$tag"
+
+  #             echo "Waiting 5s after pull..."
+  #             sleep 5
+
+  #             echo "Starting container..."
+  #             docker run -d -p 5001:5001 --gpus all --name docling-serve-test-container "$tag"
+
+  #             echo "Waiting 15s for container to boot..."
+  #             sleep 15
+
+  #             echo "Checking service health..."
+  #             for i in {1..25}; do
+  #               health_response=$(curl -s http://localhost:5001/health || true)
+  #               echo "Health check response [$i]: $health_response"
+  #               if echo "$health_response" | grep -q '"status":"ok"'; then
+  #                 echo "Service is healthy!"
+  #                 echo "Sending test conversion request..."
+
+  #                 status_code=$(curl -s -o /dev/null -w "%{http_code}" -X POST 'http://localhost:5001/v1/convert/source' \
+  #                   -H 'accept: application/json' \
+  #                   -H 'Content-Type: application/json' \
+  #                   -d '{
+  #                     "options": {
+  #                       "from_formats": ["pdf"],
+  #                       "to_formats": ["md"]
+  #                     },
+  #                     "sources": [
+  #                       {
+  #                         "kind": "http",
+  #                         "url": "https://arxiv.org/pdf/2501.17887"
+  #                       }
+  #                     ],
+  #                     "target": {
+  #                       "kind": "inbody"
+  #                     }
+  #                   }')
+
+  #                 echo "Conversion request returned status code: $status_code"
+
+  #                 if [ "$status_code" -ne 200 ]; then
+  #                   echo "Conversion failed!"
+  #                   docker logs docling-serve-test-container
+  #                   docker rm -f docling-serve-test-container
+  #                   exit 1
+  #                 fi
+
+  #                 break
+  #               else
+  #                 echo "Waiting for service... [$i/25]"
+  #                 sleep 3
+  #               fi
+  #             done
+
+  #             if ! echo "$health_response" | grep -q '"status":"ok"'; then
+  #               echo "Service did not become healthy in time."
+  #               docker logs docling-serve-test-container
+  #               docker rm -f docling-serve-test-container
+  #               exit 1
+  #             fi
+
+  #             echo "Cleaning up test container..."
+  #             docker rm -f docling-serve-test-container
+  #           else
+  #             echo "Skipping non-released or non-CUDA image: $tag"
+  #           fi
+  #         done
--- a/.gitignore
+++ b/.gitignore
@@ -445,4 +445,7 @@ pip-selfcheck.json
 .action-lint
 .markdown-lint

-cookies.txt
+cookies.txt
+
+# Examples
+/examples/splitted_pdf/*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,12 +7,12 @@ repos:
      - id: ruff-format
        name: "Ruff formatter"
        args: [--config=pyproject.toml]
-        files: '^(docling_serve|tests).*\.(py|ipynb)$'
+        files: '^(docling_serve|tests|examples).*\.(py|ipynb)$'
      # Run the Ruff linter.
      - id: ruff
        name: "Ruff linter"
        args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
-        files: '^(docling_serve|tests).*\.(py|ipynb)$'
+        files: '^(docling_serve|tests|examples).*\.(py|ipynb)$'
  - repo: local
    hooks:
      - id: system
@@ -21,8 +21,19 @@ repos:
        pass_filenames: false
        language: system
        files: '\.py$'
+  - repo: https://github.com/errata-ai/vale
+    rev: v3.12.0  # Use latest stable version
+    hooks:
+      - id: vale
+        name: vale sync
+        pass_filenames: false
+        args: [sync, "--config=.github/vale.ini"]
+      - id: vale
+        name: Spell and Style Check with Vale
+        args: ["--config=.github/vale.ini"]
+        files: \.md$
  - repo: https://github.com/astral-sh/uv-pre-commit
-    # uv version.
-    rev: 0.6.1
+    # uv version, https://github.com/astral-sh/uv-pre-commit/releases
+    rev: 0.8.3
    hooks:
      - id: uv-lock
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,137 @@
+## [v1.4.0](https://github.com/docling-project/docling-serve/releases/tag/v1.4.0) - 2025-09-05
+
+### Feature
+
+* **docling:** Performance improvements in parsing, new layout model, fixes in html processing ([#352](https://github.com/docling-project/docling-serve/issues/352)) ([`d64a2a9`](https://github.com/docling-project/docling-serve/commit/d64a2a974a276c7ae3b105c448fd79f77a653d20))
+
+### Fix
+
+* Upgrade to latest docling version with fixes ([#335](https://github.com/docling-project/docling-serve/issues/335)) ([`e544947`](https://github.com/docling-project/docling-serve/commit/e5449472b2a3e71796f41c8a58c251d8229305c1))
+
+### Documentation
+
+* Add split processing example ([#303](https://github.com/docling-project/docling-serve/issues/303)) ([`0d4545a`](https://github.com/docling-project/docling-serve/commit/0d4545a65a5a941fc1fdefda57e39cfb1ea106ab))
+* Document DOCLING_NUM_THREADS environment variable ([#341](https://github.com/docling-project/docling-serve/issues/341)) ([`27fdd7b`](https://github.com/docling-project/docling-serve/commit/27fdd7b85ab18b3eece428366f46dc5cf0995e38))
+* Fix parameters typo ([#333](https://github.com/docling-project/docling-serve/issues/333)) ([`81f0a8d`](https://github.com/docling-project/docling-serve/commit/81f0a8ddf80a532042d550ae4568f891458b45e7))
+* Describe how to use Docling MCP ([#332](https://github.com/docling-project/docling-serve/issues/332)) ([`a69cc86`](https://github.com/docling-project/docling-serve/commit/a69cc867f5a3fb76648803ca866d65cc3a75c6b8))
+
+### Docling libraries included in this release:
+- docling 2.46.0
+- docling 2.51.0
+- docling-core 2.47.0
+- docling-ibm-models 3.9.1
+- docling-jobkit 1.4.1
+- docling-mcp 1.2.0
+- docling-parse 4.4.0
+- docling-serve 1.4.0
+
+## [v1.3.1](https://github.com/docling-project/docling-serve/releases/tag/v1.3.1) - 2025-08-21
+
+### Fix
+
+* Configuration and performance fixes via upgrade of packages ([#328](https://github.com/docling-project/docling-serve/issues/328)) ([`f02dbc0`](https://github.com/docling-project/docling-serve/commit/f02dbc01449fe1caf3fb4a73c0a5f4adf8265faf))
+
+### Documentation
+
+* Fix parameter in api key docs ([#323](https://github.com/docling-project/docling-serve/issues/323)) ([`37fe022`](https://github.com/docling-project/docling-serve/commit/37fe02277b3e2358eced28e15b4360e7c82d3b43))
+
+## [v1.3.0](https://github.com/docling-project/docling-serve/releases/tag/v1.3.0) - 2025-08-14
+
+### Feature
+
+* Add configuration option for apikey security ([#322](https://github.com/docling-project/docling-serve/issues/322)) ([`9a64410`](https://github.com/docling-project/docling-serve/commit/9a644105523d312431993ded8dd88e064550a5db))
+* Add RQ engine ([#315](https://github.com/docling-project/docling-serve/issues/315)) ([`885f319`](https://github.com/docling-project/docling-serve/commit/885f319d3a3488a4090869560447437a4104f14e))
+
+### Documentation
+
+* Example of docling-serve deployment in the RQ engine mode ([#321](https://github.com/docling-project/docling-serve/issues/321)) ([`71edf41`](https://github.com/docling-project/docling-serve/commit/71edf4184960d8664ef9da20617e2d0f91793d36))
+* Handling models in docling-serve ([#319](https://github.com/docling-project/docling-serve/issues/319)) ([`6e9aa8c`](https://github.com/docling-project/docling-serve/commit/6e9aa8c759220458281c7fe4c87443ac41023eee))
+* Add Gradio cache usage ([#312](https://github.com/docling-project/docling-serve/issues/312)) ([`d584895`](https://github.com/docling-project/docling-serve/commit/d584895e1108d71a0f45deadcd3c669eb0a58133))
+
+## [v1.2.2](https://github.com/docling-project/docling-serve/releases/tag/v1.2.2) - 2025-08-13
+
+### Fix
+
+* Update of transformers module to 4.55.1 ([#316](https://github.com/docling-project/docling-serve/issues/316)) ([`7692eb2`](https://github.com/docling-project/docling-serve/commit/7692eb26006fd4deaa021180c99e23a1b65de506))
+
+## [v1.2.1](https://github.com/docling-project/docling-serve/releases/tag/v1.2.1) - 2025-08-13
+
+### Fix
+
+* Handling of vlm model options and update deps ([#314](https://github.com/docling-project/docling-serve/issues/314)) ([`8b470cb`](https://github.com/docling-project/docling-serve/commit/8b470cba8ef500c271eb84c8368c8a1a1a5a6d6a))
+* Add missing response type in sync endpoints ([#309](https://github.com/docling-project/docling-serve/issues/309)) ([`8048f45`](https://github.com/docling-project/docling-serve/commit/8048f4589a91de2b2b391ab33a326efd1b29f25b))
+
+### Documentation
+
+* Update readme to use v1 ([#306](https://github.com/docling-project/docling-serve/issues/306)) ([`b3058e9`](https://github.com/docling-project/docling-serve/commit/b3058e91e0c56e27110eb50f22cbdd89640bf398))
+* Update deployment examples to use v1 API ([#308](https://github.com/docling-project/docling-serve/issues/308)) ([`63da9ee`](https://github.com/docling-project/docling-serve/commit/63da9eedebae3ad31d04e65635e573194e413793))
+* Fix typo in v1 migration instructions ([#307](https://github.com/docling-project/docling-serve/issues/307)) ([`b15dc25`](https://github.com/docling-project/docling-serve/commit/b15dc2529f78d68a475e5221c37408c3f77d8588))
+
+## [v1.2.0](https://github.com/docling-project/docling-serve/releases/tag/v1.2.0) - 2025-08-07
+
+### Feature
+
+* Workers without shared models and convert params ([#304](https://github.com/docling-project/docling-serve/issues/304)) ([`db3fdb5`](https://github.com/docling-project/docling-serve/commit/db3fdb5bc1a0ae250afd420d737abc4071a7546c))
+* Add rocm image build support and fix cuda ([#292](https://github.com/docling-project/docling-serve/issues/292)) ([`fd1b987`](https://github.com/docling-project/docling-serve/commit/fd1b987e8dc174f1a6013c003dde33e9acbae39a))
+
+## [v1.1.0](https://github.com/docling-project/docling-serve/releases/tag/v1.1.0) - 2025-07-30
+
+### Feature
+
+* Add docling-mcp in the distribution ([#290](https://github.com/docling-project/docling-serve/issues/290)) ([`ecb1874`](https://github.com/docling-project/docling-serve/commit/ecb1874a507bef83d102e0e031e49fed34298637))
+* Add 3.0 openapi endpoint ([#287](https://github.com/docling-project/docling-serve/issues/287)) ([`ec594d8`](https://github.com/docling-project/docling-serve/commit/ec594d84fe36df23e7d010a2fcf769856c43600b))
+* Add new source and target ([#270](https://github.com/docling-project/docling-serve/issues/270)) ([`3771c1b`](https://github.com/docling-project/docling-serve/commit/3771c1b55403bd51966d07d8f760d5c4fbcc1760))
+
+### Fix
+
+* Referenced paths relative to zip root ([#289](https://github.com/docling-project/docling-serve/issues/289)) ([`1333f71`](https://github.com/docling-project/docling-serve/commit/1333f71c9c6495342b2169d574e921f828446f15))
+
+## [v1.0.1](https://github.com/docling-project/docling-serve/releases/tag/v1.0.1) - 2025-07-21
+
+### Fix
+
+* Docling update v2.42.0 ([#277](https://github.com/docling-project/docling-serve/issues/277)) ([`8706706`](https://github.com/docling-project/docling-serve/commit/8706706e8797b0a06ec4baa7cf87988311be68b6))
+
+### Documentation
+
+* Typo in README ([#276](https://github.com/docling-project/docling-serve/issues/276)) ([`766adb2`](https://github.com/docling-project/docling-serve/commit/766adb248113c7bd5144d14b3c82929a2ad29f8e))
+
+## [v1.0.0](https://github.com/docling-project/docling-serve/releases/tag/v1.0.0) - 2025-07-14
+
+### Feature
+
+* V1 api with list of sources and target ([#249](https://github.com/docling-project/docling-serve/issues/249)) ([`56e328b`](https://github.com/docling-project/docling-serve/commit/56e328baf76b4bb0476fc6ca820b52034e4f97bf))
+* Use orchestrators from jobkit ([#248](https://github.com/docling-project/docling-serve/issues/248)) ([`daa924a`](https://github.com/docling-project/docling-serve/commit/daa924a77e56d063ef17347dfd8a838872a70529))
+
+### Breaking
+
+* v1 api with list of sources and target ([#249](https://github.com/docling-project/docling-serve/issues/249)) ([`56e328b`](https://github.com/docling-project/docling-serve/commit/56e328baf76b4bb0476fc6ca820b52034e4f97bf))
+* use orchestrators from jobkit ([#248](https://github.com/docling-project/docling-serve/issues/248)) ([`daa924a`](https://github.com/docling-project/docling-serve/commit/daa924a77e56d063ef17347dfd8a838872a70529))
+
+## [v0.16.1](https://github.com/docling-project/docling-serve/releases/tag/v0.16.1) - 2025-07-07
+
+### Fix
+
+* Upgrade deps, including docling v2.40.0 with locks in models init ([#264](https://github.com/docling-project/docling-serve/issues/264)) ([`bfde1a0`](https://github.com/docling-project/docling-serve/commit/bfde1a0991c2da53b72c4f131ff74fa10f6340de))
+* Missing tesseract osd ([#263](https://github.com/docling-project/docling-serve/issues/263)) ([`eb3892e`](https://github.com/docling-project/docling-serve/commit/eb3892ee141eb2c941d580b095d8a266f2d2610c))
+* Properly load models at boot ([#244](https://github.com/docling-project/docling-serve/issues/244)) ([`149a8cb`](https://github.com/docling-project/docling-serve/commit/149a8cb1c0a16c1e0b7d17f40b88b4d6e8f0109d))
+
+### Documentation
+
+* Fix typo ([#259](https://github.com/docling-project/docling-serve/issues/259)) ([`93b8471`](https://github.com/docling-project/docling-serve/commit/93b84712b2c6d180908a197847b52b217a7ff05f))
+* Change the doc example ([#258](https://github.com/docling-project/docling-serve/issues/258)) ([`c45b937`](https://github.com/docling-project/docling-serve/commit/c45b93706466a073ab4a5c75aa8a267110873e26))
+* Update typo ([#247](https://github.com/docling-project/docling-serve/issues/247)) ([`50e431f`](https://github.com/docling-project/docling-serve/commit/50e431f30fbffa33f43727417fe746d20cbb9d6b))
+
+## [v0.16.0](https://github.com/docling-project/docling-serve/releases/tag/v0.16.0) - 2025-06-25
+
+### Feature
+
+* Package updates and more cuda images ([#229](https://github.com/docling-project/docling-serve/issues/229)) ([`30aca92`](https://github.com/docling-project/docling-serve/commit/30aca92298ab0d86bb4debcfcacb2dd8b9040a27))
+
+### Documentation
+
+* Update example resources and improve README ([#231](https://github.com/docling-project/docling-serve/issues/231)) ([`80755a7`](https://github.com/docling-project/docling-serve/commit/80755a7d5955f7d0c53df8e558fdd852dd1f5b75))
+
 ## [v0.15.0](https://github.com/docling-project/docling-serve/releases/tag/v0.15.0) - 2025-06-17

 ### Feature
--- a/20
+++ b/20
@@ -1,13 +1,17 @@
 ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s

-FROM ${BASE_IMAGE}
+ARG UV_VERSION=0.8.3

-USER 0
+ARG UV_SYNC_EXTRA_ARGS=""
+
+FROM ${BASE_IMAGE} AS docling-base

 ###################################################################################################
 # OS Layer                                                                                        #
 ###################################################################################################

+USER 0
+
 RUN --mount=type=bind,source=os-packages.txt,target=/tmp/os-packages.txt \
    dnf -y install --best --nodocs --setopt=install_weak_deps=False dnf-plugins-core && \
    dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save && \
@@ -21,16 +25,19 @@ RUN /usr/bin/fix-permissions /opt/app-root/src/.cache

 ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/

+FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv_stage
+
 ###################################################################################################
 # Docling layer                                                                                   #
 ###################################################################################################

+FROM docling-base
+
 USER 1001

 WORKDIR /opt/app-root/src

 ENV \
-    # On container environments, always set a thread budget to avoid undesired thread congestion.
    OMP_NUM_THREADS=4 \
    LANG=en_US.UTF-8 \
    LC_ALL=en_US.UTF-8 \
@@ -40,9 +47,9 @@ ENV \
    UV_PROJECT_ENVIRONMENT=/opt/app-root \
    DOCLING_SERVE_ARTIFACTS_PATH=/opt/app-root/src/.cache/docling/models

-ARG UV_SYNC_EXTRA_ARGS=""
+ARG UV_SYNC_EXTRA_ARGS

-RUN --mount=from=ghcr.io/astral-sh/uv:0.6.1,source=/uv,target=/bin/uv \
+RUN --mount=from=uv_stage,source=/uv,target=/bin/uv \
    --mount=type=cache,target=/opt/app-root/src/.cache/uv,uid=1001 \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
@@ -61,7 +68,8 @@ RUN echo "Downloading models..." && \
    chmod -R g=u ${DOCLING_SERVE_ARTIFACTS_PATH}

 COPY --chown=1001:0 ./docling_serve ./docling_serve
-RUN --mount=from=ghcr.io/astral-sh/uv:0.6.1,source=/uv,target=/bin/uv \
+
+RUN --mount=from=uv_stage,source=/uv,target=/bin/uv \
    --mount=type=cache,target=/opt/app-root/src/.cache/uv,uid=1001 \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
--- a/MAINTAINERS.md
+++ b/MAINTAINERS.md
@@ -1,11 +1,11 @@
 # MAINTAINERS

- Christoph Auer - [@cau-git](https://github.com/cau-git)
- Michele Dolfi - [@dolfim-ibm](https://github.com/dolfim-ibm)
- Maxim Lysak - [@maxmnemonic](https://github.com/maxmnemonic)
- Nikos Livathinos - [@nikos-livathinos](https://github.com/nikos-livathinos)
- Ahmed Nassar - [@nassarofficial](https://github.com/nassarofficial)
- Panos Vagenas - [@vagenas](https://github.com/vagenas)
- Peter Staar - [@PeterStaar-IBM](https://github.com/PeterStaar-IBM)
+- Christoph Auer - [`@cau-git`](https://github.com/cau-git)
+- Michele Dolfi - [`@dolfim-ibm`](https://github.com/dolfim-ibm)
+- Maxim Lysak - [`@maxmnemonic`](https://github.com/maxmnemonic)
+- Nikos Livathinos - [`@nikos-livathinos`](https://github.com/nikos-livathinos)
+- Ahmed Nassar - [`@nassarofficial`](https://github.com/nassarofficial)
+- Panos Vagenas - [`@vagenas`](https://github.com/vagenas)
+- Peter Staar - [`@PeterStaar-IBM`](https://github.com/PeterStaar-IBM)

 Maintainers can be contacted at [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).
--- a/62
+++ b/62
@@ -26,26 +26,47 @@ md-lint-file:
 	$(CMD_PREFIX) touch .markdown-lint

 .PHONY: docling-serve-image
-docling-serve-image: Containerfile
+docling-serve-image: Containerfile ## Build docling-serve container image
 	$(ECHO_PREFIX) printf "  %-12s Containerfile\n" "[docling-serve]"
-	$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra cpu" -f Containerfile -t ghcr.io/docling-project/docling-serve:$(TAG) .
+	$(CMD_PREFIX) docker build --load -f Containerfile -t ghcr.io/docling-project/docling-serve:$(TAG) .
 	$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve:$(TAG) ghcr.io/docling-project/docling-serve:$(BRANCH_TAG)
 	$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve:$(TAG) quay.io/docling-project/docling-serve:$(BRANCH_TAG)

 .PHONY: docling-serve-cpu-image
 docling-serve-cpu-image: Containerfile ## Build docling-serve "cpu only" container image
 	$(ECHO_PREFIX) printf "  %-12s Containerfile\n" "[docling-serve CPU]"
-	$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra flash-attn" -f Containerfile -t ghcr.io/docling-project/docling-serve-cpu:$(TAG) .
+	$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-group pypi --group cpu --no-extra flash-attn" -f Containerfile -t ghcr.io/docling-project/docling-serve-cpu:$(TAG) .
 	$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cpu:$(TAG) ghcr.io/docling-project/docling-serve-cpu:$(BRANCH_TAG)
 	$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cpu:$(TAG) quay.io/docling-project/docling-serve-cpu:$(BRANCH_TAG)

 .PHONY: docling-serve-cu124-image
-docling-serve-cu124-image: Containerfile ## Build docling-serve container image with GPU support
+docling-serve-cu124-image: Containerfile ## Build docling-serve container image with CUDA 12.4 support
 	$(ECHO_PREFIX) printf "  %-12s Containerfile\n" "[docling-serve with Cuda 12.4]"
-	$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cpu" -f Containerfile --platform linux/amd64 -t ghcr.io/docling-project/docling-serve-cu124:$(TAG) .
+	$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-group pypi --group cu124" -f Containerfile --platform linux/amd64 -t ghcr.io/docling-project/docling-serve-cu124:$(TAG) .
 	$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cu124:$(TAG) ghcr.io/docling-project/docling-serve-cu124:$(BRANCH_TAG)
 	$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cu124:$(TAG) quay.io/docling-project/docling-serve-cu124:$(BRANCH_TAG)

+# Builds Containerfile for linux/amd64, passing the build arg
+# UV_SYNC_EXTRA_ARGS="--no-group pypi --group cu126". The image is tagged as
+# ghcr.io/docling-project/docling-serve-cu126:$(TAG) and then re-tagged with
+# $(BRANCH_TAG) for both ghcr.io and quay.io.
+.PHONY: docling-serve-cu126-image
+docling-serve-cu126-image: Containerfile ## Build docling-serve container image with CUDA 12.6 support
+	$(ECHO_PREFIX) printf "  %-12s Containerfile\n" "[docling-serve with Cuda 12.6]"
+	$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-group pypi --group cu126" -f Containerfile --platform linux/amd64 -t ghcr.io/docling-project/docling-serve-cu126:$(TAG) .
+	$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cu126:$(TAG) ghcr.io/docling-project/docling-serve-cu126:$(BRANCH_TAG)
+	$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cu126:$(TAG) quay.io/docling-project/docling-serve-cu126:$(BRANCH_TAG)
+
+# Builds Containerfile for linux/amd64, passing the build arg
+# UV_SYNC_EXTRA_ARGS="--no-group pypi --group cu128". The image is tagged as
+# ghcr.io/docling-project/docling-serve-cu128:$(TAG) and then re-tagged with
+# $(BRANCH_TAG) for both ghcr.io and quay.io.
+.PHONY: docling-serve-cu128-image
+docling-serve-cu128-image: Containerfile ## Build docling-serve container image with CUDA 12.8 support
+	$(ECHO_PREFIX) printf "  %-12s Containerfile\n" "[docling-serve with Cuda 12.8]"
+	$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-group pypi --group cu128" -f Containerfile --platform linux/amd64 -t ghcr.io/docling-project/docling-serve-cu128:$(TAG) .
+	$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cu128:$(TAG) ghcr.io/docling-project/docling-serve-cu128:$(BRANCH_TAG)
+	$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cu128:$(TAG) quay.io/docling-project/docling-serve-cu128:$(BRANCH_TAG)
+
+# Builds Containerfile for linux/amd64, passing the build arg
+# UV_SYNC_EXTRA_ARGS="--no-group pypi --group rocm --no-extra flash-attn".
+# The image is tagged as ghcr.io/docling-project/docling-serve-rocm:$(TAG) and
+# then re-tagged with $(BRANCH_TAG) for both ghcr.io and quay.io.
+.PHONY: docling-serve-rocm-image
+docling-serve-rocm-image: Containerfile ## Build docling-serve container image with ROCm support
+	$(ECHO_PREFIX) printf "  %-12s Containerfile\n" "[docling-serve with ROCm 6.3]"
+	$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-group pypi --group rocm --no-extra flash-attn" -f Containerfile --platform linux/amd64 -t ghcr.io/docling-project/docling-serve-rocm:$(TAG) .
+	$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-rocm:$(TAG) ghcr.io/docling-project/docling-serve-rocm:$(BRANCH_TAG)
+	$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-rocm:$(TAG) quay.io/docling-project/docling-serve-rocm:$(BRANCH_TAG)
+
 .PHONY: action-lint
 action-lint: .action-lint ##      Lint GitHub Action workflows
 .action-lint: $(shell find .github -type f) | action-lint-file
@@ -87,9 +108,30 @@ run-docling-cpu: ## Run the docling-serve container with CPU support and assign
 	$(ECHO_PREFIX) printf "  %-12s Running docling-serve container with CPU support on port 5001...\n" "[RUN CPU]"
 	$(CMD_PREFIX) docker run -it --name docling-serve-cpu -p 5001:5001 ghcr.io/docling-project/docling-serve-cpu:main

-.PHONY: run-docling-gpu
-run-docling-gpu: ## Run the docling-serve container with GPU support and assign a container name
+.PHONY: run-docling-cu124
+run-docling-cu124: ## Run the docling-serve container with GPU support and assign a container name
 	$(ECHO_PREFIX) printf "  %-12s Removing existing container if it exists...\n" "[CLEANUP]"
-	$(CMD_PREFIX) docker rm -f docling-serve-gpu 2>/dev/null || true
-	$(ECHO_PREFIX) printf "  %-12s Running docling-serve container with GPU support on port 5001...\n" "[RUN GPU]"
-	$(CMD_PREFIX) docker run -it --name docling-serve-gpu -p 5001:5001 ghcr.io/docling-project/docling-serve:main
+	$(CMD_PREFIX) docker rm -f docling-serve-cu124 2>/dev/null || true
+	$(ECHO_PREFIX) printf "  %-12s Running docling-serve container with GPU support on port 5001...\n" "[RUN CUDA 12.4]"
+	$(CMD_PREFIX) docker run -it --name docling-serve-cu124 -p 5001:5001 ghcr.io/docling-project/docling-serve-cu124:main
+
+# Force-removes any container named docling-serve-cu126 (errors are ignored),
+# then starts ghcr.io/docling-project/docling-serve-cu126:main interactively
+# with port 5001 published.
+# NOTE(review): the docker run line passes no --gpus/--device option; confirm
+# how the GPU is meant to be exposed to the container.
+.PHONY: run-docling-cu126
+run-docling-cu126: ## Run the docling-serve container with GPU support and assign a container name
+	$(ECHO_PREFIX) printf "  %-12s Removing existing container if it exists...\n" "[CLEANUP]"
+	$(CMD_PREFIX) docker rm -f docling-serve-cu126 2>/dev/null || true
+	$(ECHO_PREFIX) printf "  %-12s Running docling-serve container with GPU support on port 5001...\n" "[RUN CUDA 12.6]"
+	$(CMD_PREFIX) docker run -it --name docling-serve-cu126 -p 5001:5001 ghcr.io/docling-project/docling-serve-cu126:main
+
+# Force-removes any container named docling-serve-cu128 (errors are ignored),
+# then starts ghcr.io/docling-project/docling-serve-cu128:main interactively
+# with port 5001 published.
+# NOTE(review): the docker run line passes no --gpus/--device option; confirm
+# how the GPU is meant to be exposed to the container.
+.PHONY: run-docling-cu128
+run-docling-cu128: ## Run the docling-serve container with GPU support and assign a container name
+	$(ECHO_PREFIX) printf "  %-12s Removing existing container if it exists...\n" "[CLEANUP]"
+	$(CMD_PREFIX) docker rm -f docling-serve-cu128 2>/dev/null || true
+	$(ECHO_PREFIX) printf "  %-12s Running docling-serve container with GPU support on port 5001...\n" "[RUN CUDA 12.8]"
+	$(CMD_PREFIX) docker run -it --name docling-serve-cu128 -p 5001:5001 ghcr.io/docling-project/docling-serve-cu128:main
+
+# Force-removes any container named docling-serve-rocm (errors are ignored),
+# then starts ghcr.io/docling-project/docling-serve-rocm:main interactively
+# with port 5001 published.
+# NOTE(review): the docker run line passes no --gpus/--device option; confirm
+# how the GPU is meant to be exposed to the container.
+.PHONY: run-docling-rocm
+run-docling-rocm: ## Run the docling-serve container with GPU support and assign a container name
+	$(ECHO_PREFIX) printf "  %-12s Removing existing container if it exists...\n" "[CLEANUP]"
+	$(CMD_PREFIX) docker rm -f docling-serve-rocm 2>/dev/null || true
+	$(ECHO_PREFIX) printf "  %-12s Running docling-serve container with GPU support on port 5001...\n" "[RUN ROCm 6.3]"
+	$(CMD_PREFIX) docker run -it --name docling-serve-rocm -p 5001:5001 ghcr.io/docling-project/docling-serve-rocm:main
--- a/README.md
+++ b/README.md
@@ -8,69 +8,85 @@

 Running [Docling](https://github.com/docling-project/docling) as an API service.

+📚 [Docling Serve documentation](./docs/README.md)
+
+- Learn how to [configure the webserver](./docs/configuration.md)
+- Get to know all [runtime options](./docs/usage.md) of the API
+- Explore useful [deployment examples](./docs/deployment.md)
+- And more
+
+> [!NOTE]
+> **Migration to the `v1` API.** Docling Serve now has a stable v1 API. Read more on the [migration to v1](./docs/v1_migration.md).
+
 ## Getting started

 Install the `docling-serve` package and run the server.

 ```bash
 # Using the python package
-pip install "docling-serve"
-docling-serve run
+pip install "docling-serve[ui]"
+docling-serve run --enable-ui

 # Using container images, e.g. with Podman
-podman run -p 5001:5001 quay.io/docling-project/docling-serve
+podman run -p 5001:5001 -e DOCLING_SERVE_ENABLE_UI=1 quay.io/docling-project/docling-serve
 ```

 The server is available at

 - API <http://127.0.0.1:5001>
 - API documentation <http://127.0.0.1:5001/docs>
-  ![swagger.png](img/swagger.png)
+- UI playground <http://127.0.0.1:5001/ui>
+
+![API documentation](img/fastapi-ui.png)

 Try it out with a simple conversion:

 ```bash
 curl -X 'POST' \
-  'http://localhost:5001/v1alpha/convert/source' \
+  'http://localhost:5001/v1/convert/source' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
-    "http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}]
+    "sources": [{"kind": "http", "url": "https://arxiv.org/pdf/2501.17887"}]
  }'
 ```

-### Container images
+### Container Images

-Available container images:
+The following container images are available for running **Docling Serve** with different hardware and PyTorch configurations:

-| Name | Description | Arch | Size |
-| -----|-------------|------|------|
-| [`ghcr.io/docling-project/docling-serve`](https://github.com/docling-project/docling-serve/pkgs/container/docling-serve) <br /> [`quay.io/docling-project/docling-serve`](https://quay.io/repository/docling-project/docling-serve) | Simple image for Docling Serve, installing all packages from the official pypi.org index. | `linux/amd64`, `linux/arm64` | 3.6 GB |
-| [`ghcr.io/docling-project/docling-serve-cpu`](https://github.com/docling-project/docling-serve/pkgs/container/docling-serve-cpu) <br /> [`quay.io/docling-project/docling-serve-cpu`](https://quay.io/repository/docling-project/docling-serve-cpu) | Cpu-only image which installs `torch` from the pytorch cpu index. | `linux/amd64`, `linux/arm64` | 3.6 GB |
-| [`ghcr.io/docling-project/docling-serve-cu124`](https://github.com/docling-project/docling-serve/pkgs/container/docling-serve-cu124) <br /> [`quay.io/docling-project/docling-serve-cu124`](https://quay.io/repository/docling-project/docling-serve-cu124) | Cuda 12.4 image which installs `torch` from the pytorch cu124 index. | `linux/amd64` | 8.7 GB |
+#### 📦 Distributed Images
+
+| Image | Description | Architectures | Size |
+|-------|-------------|----------------|------|
+| [`ghcr.io/docling-project/docling-serve`](https://github.com/docling-project/docling-serve/pkgs/container/docling-serve) <br> [`quay.io/docling-project/docling-serve`](https://quay.io/repository/docling-project/docling-serve) | Base image with all packages installed from the official PyPI index. | `linux/amd64`, `linux/arm64` | 4.4 GB (arm64) <br> 8.7 GB (amd64) |
+| [`ghcr.io/docling-project/docling-serve-cpu`](https://github.com/docling-project/docling-serve/pkgs/container/docling-serve-cpu) <br> [`quay.io/docling-project/docling-serve-cpu`](https://quay.io/repository/docling-project/docling-serve-cpu) | CPU-only variant, using `torch` from the PyTorch CPU index. | `linux/amd64`, `linux/arm64` | 4.4 GB |
+| [`ghcr.io/docling-project/docling-serve-cu126`](https://github.com/docling-project/docling-serve/pkgs/container/docling-serve-cu126) <br> [`quay.io/docling-project/docling-serve-cu126`](https://quay.io/repository/docling-project/docling-serve-cu126) | CUDA 12.6 build with `torch` from the cu126 index. | `linux/amd64` | 10.0 GB |
+| [`ghcr.io/docling-project/docling-serve-cu128`](https://github.com/docling-project/docling-serve/pkgs/container/docling-serve-cu128) <br> [`quay.io/docling-project/docling-serve-cu128`](https://quay.io/repository/docling-project/docling-serve-cu128) | CUDA 12.8 build with `torch` from the cu128 index. | `linux/amd64` | 11.4 GB |
+
+#### 🚫 Not Distributed
+
+An image for AMD ROCm 6.3 (`docling-serve-rocm`) is supported but **not published** due to its large size.
+
+To build it locally:
+
+```bash
+git clone --branch main git@github.com:docling-project/docling-serve.git
+cd docling-serve/
+make docling-serve-rocm-image
+```
+
+For deployment using Docker Compose, see [docs/deployment.md](docs/deployment.md).

 Coming soon: `docling-serve-slim` images will reduce the size by skipping the model weights download.

 ### Demonstration UI

-```bash
-# Install the Python package with the extra dependencies
-pip install "docling-serve[ui]"
-docling-serve run --enable-ui
-
-# Run the container image with the extra env parameters
-podman run -p 5001:5001 -e DOCLING_SERVE_ENABLE_UI=true quay.io/docling-project/docling-serve
-```
-
 An easy to use UI is available at the `/ui` endpoint.

-![ui-input.png](img/ui-input.png)
+![Input controllers in the UI](img/ui-input.png)

-![ui-output.png](img/ui-output.png)
-
-## Documentation and advance usages
-
-Visit the [Docling Serve documentation](./docs/README.md) for learning how to [configure the webserver](./docs/configuration.md), use all the [runtime options](./docs/usage.md) of the API and [deployment examples](./docs/deployment.md), pre-load model weights into a persistent volume [model weights on persistent volume](./docs/pre-loading-models.md)
+![Output visualization in the UI](img/ui-output.png)

 ## Get help and support

--- a/docling_serve/main.py
+++ b/docling_serve/main.py
@@ -11,6 +11,7 @@ import uvicorn
 from rich.console import Console

 from docling_serve.settings import docling_serve_settings, uvicorn_settings
+from docling_serve.storage import get_scratch

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -30,6 +31,7 @@ logger = logging.getLogger(__name__)
 def version_callback(value: bool) -> None:
    if value:
        docling_serve_version = importlib.metadata.version("docling_serve")
+        docling_jobkit_version = importlib.metadata.version("docling-jobkit")
        docling_version = importlib.metadata.version("docling")
        docling_core_version = importlib.metadata.version("docling-core")
        docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
@@ -38,6 +40,7 @@ def version_callback(value: bool) -> None:
        py_impl_version = sys.implementation.cache_tag
        py_lang_version = platform.python_version()
        console.print(f"Docling Serve version: {docling_serve_version}")
+        console.print(f"Docling Jobkit version: {docling_jobkit_version}")
        console.print(f"Docling version: {docling_version}")
        console.print(f"Docling Core version: {docling_core_version}")
        console.print(f"Docling IBM Models version: {docling_ibm_models_version}")
@@ -359,6 +362,37 @@ def run(
    )


+@app.command()
+def rq_worker() -> Any:
+    """
+    Run the [bold]Docling JobKit[/bold] RQ worker.
+    """
+    # Imported inside the command body, so these jobkit modules are loaded only
+    # when this command is actually invoked.
+    from docling_jobkit.convert.manager import DoclingConverterManagerConfig
+    from docling_jobkit.orchestrators.rq.orchestrator import RQOrchestratorConfig
+    from docling_jobkit.orchestrators.rq.worker import run_worker
+
+    settings = docling_serve_settings
+
+    run_worker(
+        # Redis URL, result prefix and channel name, plus the scratch directory.
+        rq_config=RQOrchestratorConfig(
+            redis_url=settings.eng_rq_redis_url,
+            results_prefix=settings.eng_rq_results_prefix,
+            sub_channel=settings.eng_rq_sub_channel,
+            scratch_dir=get_scratch(),
+        ),
+        # Converter manager options, copied one-to-one from the server settings.
+        cm_config=DoclingConverterManagerConfig(
+            artifacts_path=settings.artifacts_path,
+            options_cache_size=settings.options_cache_size,
+            enable_remote_services=settings.enable_remote_services,
+            allow_external_plugins=settings.allow_external_plugins,
+            max_num_pages=settings.max_num_pages,
+            max_file_size=settings.max_file_size,
+        ),
+    )
+
+
 def main() -> None:
    app()

--- a/docling_serve/app.py
+++ b/docling_serve/app.py
@@ -1,4 +1,5 @@
 import asyncio
+import copy
 import importlib.metadata
 import logging
 import shutil
@@ -11,11 +12,13 @@ from fastapi import (
    BackgroundTasks,
    Depends,
    FastAPI,
+    Form,
    HTTPException,
    Query,
    UploadFile,
    WebSocket,
    WebSocketDisconnect,
+    status,
 )
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.openapi.docs import (
@@ -23,41 +26,53 @@ from fastapi.openapi.docs import (
    get_swagger_ui_html,
    get_swagger_ui_oauth2_redirect_html,
 )
-from fastapi.responses import RedirectResponse
+from fastapi.responses import JSONResponse, RedirectResponse
 from fastapi.staticfiles import StaticFiles
 from scalar_fastapi import get_scalar_api_reference

 from docling.datamodel.base_models import DocumentStream
-
-from docling_serve.datamodel.callback import (
+from docling_jobkit.datamodel.callback import (
    ProgressCallbackRequest,
    ProgressCallbackResponse,
 )
-from docling_serve.datamodel.convert import ConvertDocumentsOptions
+from docling_jobkit.datamodel.http_inputs import FileSource, HttpSource
+from docling_jobkit.datamodel.s3_coords import S3Coordinates
+from docling_jobkit.datamodel.task import Task, TaskSource
+from docling_jobkit.datamodel.task_targets import (
+    InBodyTarget,
+    TaskTarget,
+    ZipTarget,
+)
+from docling_jobkit.orchestrators.base_orchestrator import (
+    BaseOrchestrator,
+    ProgressInvalid,
+    TaskNotFoundError,
+)
+
+from docling_serve.auth import APIKeyAuth, AuthenticationResult
+from docling_serve.datamodel.convert import ConvertDocumentsRequestOptions
 from docling_serve.datamodel.requests import (
-    ConvertDocumentFileSourcesRequest,
-    ConvertDocumentHttpSourcesRequest,
    ConvertDocumentsRequest,
+    FileSourceRequest,
+    HttpSourceRequest,
+    S3SourceRequest,
+    TargetName,
 )
 from docling_serve.datamodel.responses import (
    ClearResponse,
    ConvertDocumentResponse,
    HealthCheckResponse,
    MessageKind,
+    PresignedUrlConvertDocumentResponse,
    TaskStatusResponse,
    WebsocketMessage,
 )
-from docling_serve.datamodel.task import Task, TaskSource
-from docling_serve.docling_conversion import _get_converter_from_hash
-from docling_serve.engines.async_orchestrator import (
-    BaseAsyncOrchestrator,
-    ProgressInvalid,
-)
-from docling_serve.engines.async_orchestrator_factory import get_async_orchestrator
-from docling_serve.engines.base_orchestrator import TaskNotFoundError
 from docling_serve.helper_functions import FormDepends
+from docling_serve.orchestrator_factory import get_async_orchestrator
+from docling_serve.response_preparation import prepare_response
 from docling_serve.settings import docling_serve_settings
 from docling_serve.storage import get_scratch
+from docling_serve.websocket_notifier import WebsocketNotifier


 # Set up custom logging as we'll be intermixes with FastAPI/Uvicorn's logging
@@ -95,11 +110,15 @@ _log = logging.getLogger(__name__)
 # Context manager to initialize and clean up the lifespan of the FastAPI app
@asynccontextmanager
 async def lifespan(app: FastAPI):
-    orchestrator = get_async_orchestrator()
    scratch_dir = get_scratch()

+    orchestrator = get_async_orchestrator()
+    notifier = WebsocketNotifier(orchestrator)
+    orchestrator.bind_notifier(notifier)
+
    # Warm up processing cache
-    await orchestrator.warm_up_caches()
+    if docling_serve_settings.load_models_at_boot:
+        await orchestrator.warm_up_caches()

    # Start the background queue processor
    queue_task = asyncio.create_task(orchestrator.process_queue())
@@ -139,6 +158,7 @@ def create_app():  # noqa: C901
        offline_docs_assets = True
        _log.info("Found static assets.")

+    require_auth = APIKeyAuth(docling_serve_settings.api_key)
    app = FastAPI(
        title="Docling Serve",
        docs_url=None if offline_docs_assets else "/swagger",
@@ -229,23 +249,29 @@ def create_app():  # noqa: C901
    ########################

    async def _enque_source(
-        orchestrator: BaseAsyncOrchestrator, conversion_request: ConvertDocumentsRequest
+        orchestrator: BaseOrchestrator, conversion_request: ConvertDocumentsRequest
    ) -> Task:
        sources: list[TaskSource] = []
-        if isinstance(conversion_request, ConvertDocumentFileSourcesRequest):
-            sources.extend(conversion_request.file_sources)
-        if isinstance(conversion_request, ConvertDocumentHttpSourcesRequest):
-            sources.extend(conversion_request.http_sources)
+        for s in conversion_request.sources:
+            if isinstance(s, FileSourceRequest):
+                sources.append(FileSource.model_validate(s))
+            elif isinstance(s, HttpSourceRequest):
+                sources.append(HttpSource.model_validate(s))
+            elif isinstance(s, S3SourceRequest):
+                sources.append(S3Coordinates.model_validate(s))

        task = await orchestrator.enqueue(
-            sources=sources, options=conversion_request.options
+            sources=sources,
+            options=conversion_request.options,
+            target=conversion_request.target,
        )
        return task

    async def _enque_file(
-        orchestrator: BaseAsyncOrchestrator,
+        orchestrator: BaseOrchestrator,
        files: list[UploadFile],
-        options: ConvertDocumentsOptions,
+        options: ConvertDocumentsRequestOptions,
+        target: TaskTarget,
    ) -> Task:
        _log.info(f"Received {len(files)} files for processing.")

@@ -257,12 +283,12 @@ def create_app():  # noqa: C901
            name = file.filename if file.filename else f"file{suffix}.pdf"
            file_sources.append(DocumentStream(name=name, stream=buf))

-        task = await orchestrator.enqueue(sources=file_sources, options=options)
+        task = await orchestrator.enqueue(
+            sources=file_sources, options=options, target=target
+        )
        return task

-    async def _wait_task_complete(
-        orchestrator: BaseAsyncOrchestrator, task_id: str
-    ) -> bool:
+    async def _wait_task_complete(orchestrator: BaseOrchestrator, task_id: str) -> bool:
        start_time = time.monotonic()
        while True:
            task = await orchestrator.task_status(task_id=task_id)
@@ -273,10 +299,79 @@ def create_app():  # noqa: C901
            if elapsed_time > docling_serve_settings.max_sync_wait:
                return False

+    ##########################################
+    # Downgrade openapi 3.1 to 3.0.x helpers #
+    ##########################################
+
+    def ensure_array_items(schema):
+        """Give an array schema an ``items`` entry that carries a ``type``.
+
+        Mutates ``schema`` in place. Schemas whose ``type`` is not ``"array"``
+        are left untouched. Missing or ``None`` items become
+        ``{"type": "string"}``; a dict of items without a ``type`` key gets
+        ``"type": "string"`` added.
+        """
+        if "type" not in schema or schema["type"] != "array":
+            return
+
+        items = schema["items"] if "items" in schema else None
+        if items is None:
+            schema["items"] = {"type": "string"}
+            return
+
+        # NOTE(review): this also stamps "type": "string" onto items that only
+        # hold a $ref or an anyOf/oneOf list; confirm that is intended.
+        if isinstance(items, dict) and "type" not in items:
+            items["type"] = "string"
+
+    def handle_discriminators(schema):
+        """Add the discriminator property to ``required`` when it is declared.
+
+        Mutates ``schema`` in place. Nothing happens unless the schema has a
+        ``discriminator`` with a ``propertyName`` and that name is also a key of
+        ``properties``. ``required`` is created when absent and never gets a
+        duplicate entry.
+        """
+        if "discriminator" not in schema:
+            return
+        if "propertyName" not in schema["discriminator"]:
+            return
+
+        name = schema["discriminator"]["propertyName"]
+        if "properties" not in schema or name not in schema["properties"]:
+            return
+
+        required = schema.setdefault("required", [])
+        if name not in required:
+            required.append(name)
+
+    def handle_properties(schema):
+        """Add ``kind`` to ``required`` when the schema declares that property.
+
+        Mutates ``schema`` in place. Schemas without a ``kind`` entry under
+        ``properties`` are left untouched. ``required`` is created when absent
+        and never gets a duplicate entry.
+        """
+        if "properties" not in schema or "kind" not in schema["properties"]:
+            return
+
+        required = schema.setdefault("required", [])
+        if "kind" not in required:
+            required.append("kind")
+
+    # Downgrade openapi 3.1 to 3.0.x
+    def downgrade_openapi31_to_30(spec):
+        """Build an OpenAPI 3.0.x-friendly copy of a 3.1 ``spec``.
+
+        The caller's document is never modified: every edit is made on a deep
+        copy. This matters because the visible caller passes in the object
+        returned by ``app.openapi()``, which must not pick up the ``required``
+        entries added here.
+
+        Args:
+            spec: OpenAPI document as nested dicts and lists.
+
+        Returns:
+            A new document in which the ``const``, ``examples`` and
+            ``prefixItems`` keys are dropped at every level, discriminator
+            properties are listed in ``required``, array schemas carry typed
+            ``items``, and ``kind`` is marked required on the schemas under
+            ``components.schemas``. The ``openapi`` version field is not
+            changed here.
+        """
+
+        def strip_unsupported(obj):
+            if isinstance(obj, dict):
+                # Rebuild the dict without the 3.1-only keys, recursing into
+                # every remaining value.
+                obj = {
+                    k: strip_unsupported(v)
+                    for k, v in obj.items()
+                    if k not in ("const", "examples", "prefixItems")
+                }
+
+                handle_discriminators(obj)
+                ensure_array_items(obj)
+
+                # Check for oneOf and anyOf to handle nested schemas
+                for key in ["oneOf", "anyOf"]:
+                    if key in obj:
+                        for sub in obj[key]:
+                            handle_discriminators(sub)
+                            ensure_array_items(sub)
+
+                return obj
+            elif isinstance(obj, list):
+                return [strip_unsupported(i) for i in obj]
+            return obj
+
+        # Copy first: handle_properties edits schemas in place, so it must only
+        # ever see the copy, never the caller's document.
+        downgraded = copy.deepcopy(spec)
+        if "components" in downgraded and "schemas" in downgraded["components"]:
+            for schema in downgraded["components"]["schemas"].values():
+                handle_properties(schema)
+
+        return strip_unsupported(downgraded)
+
    #############################
    # API Endpoints definitions #
    #############################

+    @app.get("/openapi-3.0.json")
+    def openapi_30():
+        # Serve the application's schema after running it through the 3.1 -> 3.0
+        # downgrade helper, with the version field set to 3.0.3.
+        spec_30 = downgrade_openapi31_to_30(app.openapi())
+        spec_30["openapi"] = "3.0.3"
+        return JSONResponse(spec_30)
+
    # Favicon
    @app.get("/favicon.ico", include_in_schema=False)
    async def favicon():
@@ -297,8 +392,8 @@ def create_app():  # noqa: C901

    # Convert a document from URL(s)
    @app.post(
-        "/v1alpha/convert/source",
-        response_model=ConvertDocumentResponse,
+        "/v1/convert/source",
+        response_model=ConvertDocumentResponse | PresignedUrlConvertDocumentResponse,
        responses={
            200: {
                "content": {"application/zip": {}},
@@ -308,37 +403,42 @@ def create_app():  # noqa: C901
    )
    async def process_url(
        background_tasks: BackgroundTasks,
-        orchestrator: Annotated[BaseAsyncOrchestrator, Depends(get_async_orchestrator)],
+        auth: Annotated[AuthenticationResult, Depends(require_auth)],
+        orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
        conversion_request: ConvertDocumentsRequest,
    ):
+        # Synchronous conversion: enqueue the sources, wait, then build the reply.
        task = await _enque_source(
            orchestrator=orchestrator, conversion_request=conversion_request
        )
-        success = await _wait_task_complete(
+        completed = await _wait_task_complete(
            orchestrator=orchestrator, task_id=task.task_id
        )

-        if not success:
-            # TODO: abort task!
-            return HTTPException(
+        if not completed:
+            # TODO: abort task! Raised (not returned) so the 504 reaches the client.
+            raise HTTPException(
                status_code=504,
                detail=f"Conversion is taking too long. The maximum wait time is configure as DOCLING_SERVE_MAX_SYNC_WAIT={docling_serve_settings.max_sync_wait}.",
            )

-        result = await orchestrator.task_result(
-            task_id=task.task_id, background_tasks=background_tasks
-        )
-        if result is None:
+        task_result = await orchestrator.task_result(task_id=task.task_id)
+        if task_result is None:
            raise HTTPException(
                status_code=404,
                detail="Task result not found. Please wait for a completion status.",
            )
-        return result
+        return await prepare_response(
+            task_id=task.task_id,
+            task_result=task_result,
+            orchestrator=orchestrator,
+            background_tasks=background_tasks,
+        )

    # Convert a document from file(s)
    @app.post(
-        "/v1alpha/convert/file",
-        response_model=ConvertDocumentResponse,
+        "/v1/convert/file",
+        response_model=ConvertDocumentResponse | PresignedUrlConvertDocumentResponse,
        responses={
            200: {
                "content": {"application/zip": {}},
@@ -347,43 +447,51 @@ def create_app():  # noqa: C901
    )
    async def process_file(
        background_tasks: BackgroundTasks,
-        orchestrator: Annotated[BaseAsyncOrchestrator, Depends(get_async_orchestrator)],
+        auth: Annotated[AuthenticationResult, Depends(require_auth)],
+        orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
        files: list[UploadFile],
        options: Annotated[
-            ConvertDocumentsOptions, FormDepends(ConvertDocumentsOptions)
+            ConvertDocumentsRequestOptions, FormDepends(ConvertDocumentsRequestOptions)
        ],
+        target_type: Annotated[TargetName, Form()] = TargetName.INBODY,
    ):
+        # The target_type form field selects an in-body or a zip target.
+        target = InBodyTarget() if target_type == TargetName.INBODY else ZipTarget()
        task = await _enque_file(
-            orchestrator=orchestrator, files=files, options=options
+            orchestrator=orchestrator, files=files, options=options, target=target
        )
-        success = await _wait_task_complete(
+        completed = await _wait_task_complete(
            orchestrator=orchestrator, task_id=task.task_id
        )

-        if not success:
-            # TODO: abort task!
-            return HTTPException(
+        if not completed:
+            # TODO: abort task! Raised (not returned) so the 504 reaches the client.
+            raise HTTPException(
                status_code=504,
                detail=f"Conversion is taking too long. The maximum wait time is configure as DOCLING_SERVE_MAX_SYNC_WAIT={docling_serve_settings.max_sync_wait}.",
            )

-        result = await orchestrator.task_result(
-            task_id=task.task_id, background_tasks=background_tasks
-        )
-        if result is None:
+        task_result = await orchestrator.task_result(task_id=task.task_id)
+        if task_result is None:
            raise HTTPException(
                status_code=404,
                detail="Task result not found. Please wait for a completion status.",
            )
-        return result
+        return await prepare_response(
+            task_id=task.task_id,
+            task_result=task_result,
+            orchestrator=orchestrator,
+            background_tasks=background_tasks,
+        )

    # Convert a document from URL(s) using the async api
    @app.post(
-        "/v1alpha/convert/source/async",
+        "/v1/convert/source/async",
        response_model=TaskStatusResponse,
    )
    async def process_url_async(
-        orchestrator: Annotated[BaseAsyncOrchestrator, Depends(get_async_orchestrator)],
+        auth: Annotated[AuthenticationResult, Depends(require_auth)],
+        orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
        conversion_request: ConvertDocumentsRequest,
    ):
        task = await _enque_source(
@@ -401,19 +509,22 @@ def create_app():  # noqa: C901

    # Convert a document from file(s) using the async api
    @app.post(
-        "/v1alpha/convert/file/async",
+        "/v1/convert/file/async",
        response_model=TaskStatusResponse,
    )
    async def process_file_async(
-        orchestrator: Annotated[BaseAsyncOrchestrator, Depends(get_async_orchestrator)],
+        auth: Annotated[AuthenticationResult, Depends(require_auth)],
+        orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
        background_tasks: BackgroundTasks,
        files: list[UploadFile],
        options: Annotated[
-            ConvertDocumentsOptions, FormDepends(ConvertDocumentsOptions)
+            ConvertDocumentsRequestOptions, FormDepends(ConvertDocumentsRequestOptions)
        ],
+        target_type: Annotated[TargetName, Form()] = TargetName.INBODY,
    ):
+        target = InBodyTarget() if target_type == TargetName.INBODY else ZipTarget()
        task = await _enque_file(
-            orchestrator=orchestrator, files=files, options=options
+            orchestrator=orchestrator, files=files, options=options, target=target
        )
        task_queue_position = await orchestrator.get_queue_position(
            task_id=task.task_id
@@ -427,14 +538,16 @@ def create_app():  # noqa: C901

    # Task status poll
    @app.get(
-        "/v1alpha/status/poll/{task_id}",
+        "/v1/status/poll/{task_id}",
        response_model=TaskStatusResponse,
    )
    async def task_status_poll(
-        orchestrator: Annotated[BaseAsyncOrchestrator, Depends(get_async_orchestrator)],
+        auth: Annotated[AuthenticationResult, Depends(require_auth)],
+        orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
        task_id: str,
        wait: Annotated[
-            float, Query(help="Number of seconds to wait for a completed status.")
+            float,
+            Query(description="Number of seconds to wait for a completed status."),
        ] = 0.0,
    ):
        try:
@@ -451,13 +564,22 @@ def create_app():  # noqa: C901

    # Task status websocket
    @app.websocket(
-        "/v1alpha/status/ws/{task_id}",
+        "/v1/status/ws/{task_id}",
    )
    async def task_status_ws(
        websocket: WebSocket,
-        orchestrator: Annotated[BaseAsyncOrchestrator, Depends(get_async_orchestrator)],
+        orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
        task_id: str,
+        api_key: Annotated[str, Query()] = "",
    ):
+        if docling_serve_settings.api_key:
+            if api_key != docling_serve_settings.api_key:
+                raise HTTPException(
+                    status_code=status.HTTP_401_UNAUTHORIZED,
+                    detail="Api key is required as ?api_key=SECRET.",
+                )
+
+        assert isinstance(orchestrator.notifier, WebsocketNotifier)
        await websocket.accept()

        if task_id not in orchestrator.tasks:
@@ -472,7 +594,7 @@ def create_app():  # noqa: C901
        task = orchestrator.tasks[task_id]

        # Track active WebSocket connections for this job
-        orchestrator.task_subscribers[task_id].add(websocket)
+        orchestrator.notifier.task_subscribers[task_id].add(websocket)

        try:
            task_queue_position = await orchestrator.get_queue_position(task_id=task_id)
@@ -510,12 +632,12 @@ def create_app():  # noqa: C901
            _log.info(f"WebSocket disconnected for job {task_id}")

        finally:
-            orchestrator.task_subscribers[task_id].remove(websocket)
+            orchestrator.notifier.task_subscribers[task_id].remove(websocket)

    # Task result
    @app.get(
-        "/v1alpha/result/{task_id}",
-        response_model=ConvertDocumentResponse,
+        "/v1/result/{task_id}",
+        response_model=ConvertDocumentResponse | PresignedUrlConvertDocumentResponse,
        responses={
            200: {
                "content": {"application/zip": {}},
@@ -523,27 +645,36 @@ def create_app():  # noqa: C901
        },
    )
    async def task_result(
-        orchestrator: Annotated[BaseAsyncOrchestrator, Depends(get_async_orchestrator)],
+        auth: Annotated[AuthenticationResult, Depends(require_auth)],
+        orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
        background_tasks: BackgroundTasks,
        task_id: str,
    ):
-        result = await orchestrator.task_result(
-            task_id=task_id, background_tasks=background_tasks
-        )
-        if result is None:
-            raise HTTPException(
-                status_code=404,
-                detail="Task result not found. Please wait for a completion status.",
+        try:
+            task_result = await orchestrator.task_result(task_id=task_id)
+            if task_result is None:
+                raise HTTPException(
+                    status_code=404,
+                    detail="Task result not found. Please wait for a completion status.",
+                )
+            response = await prepare_response(
+                task_id=task_id,
+                task_result=task_result,
+                orchestrator=orchestrator,
+                background_tasks=background_tasks,
            )
-        return result
+            return response
+        except TaskNotFoundError:
+            raise HTTPException(status_code=404, detail="Task not found.")

    # Update task progress
    @app.post(
-        "/v1alpha/callback/task/progress",
+        "/v1/callback/task/progress",
        response_model=ProgressCallbackResponse,
    )
    async def callback_task_progress(
-        orchestrator: Annotated[BaseAsyncOrchestrator, Depends(get_async_orchestrator)],
+        auth: Annotated[AuthenticationResult, Depends(require_auth)],
+        orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
        request: ProgressCallbackRequest,
    ):
        try:
@@ -560,20 +691,24 @@ def create_app():  # noqa: C901

    # Offload models
    @app.get(
-        "/v1alpha/clear/converters",
+        "/v1/clear/converters",
        response_model=ClearResponse,
    )
-    async def clear_converters():
-        _get_converter_from_hash.cache_clear()
+    async def clear_converters(
+        auth: Annotated[AuthenticationResult, Depends(require_auth)],
+        orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
+    ):
+        await orchestrator.clear_converters()
        return ClearResponse()

    # Clean results
    @app.get(
-        "/v1alpha/clear/results",
+        "/v1/clear/results",
        response_model=ClearResponse,
    )
    async def clear_results(
-        orchestrator: Annotated[BaseAsyncOrchestrator, Depends(get_async_orchestrator)],
+        auth: Annotated[AuthenticationResult, Depends(require_auth)],
+        orchestrator: Annotated[BaseOrchestrator, Depends(get_async_orchestrator)],
        older_then: float = 3600,
    ):
        await orchestrator.clear_results(older_than=older_then)
--- a/docling_serve/auth.py
+++ b/docling_serve/auth.py
@@ -0,0 +1,56 @@
+from typing import Any
+
+from fastapi import HTTPException, Request, status
+from fastapi.security import APIKeyHeader
+from pydantic import BaseModel
+
+
+class AuthenticationResult(BaseModel):
+    valid: bool  # whether the API key check passed
+    errors: list[str] = []  # messages explaining a failed check
+    detail: Any | None = None  # APIKeyAuth stores the stripped header value here
+
+
+class APIKeyAuth(APIKeyHeader):
+    """FastAPI dependency which validates the API key sent in a request header."""
+
+    def __init__(
+        self,
+        api_key: str,
+        header_name: str = "X-Api-Key",
+        fail_on_unauthorized: bool = True,  # NOTE(review): accepted but never read
+    ) -> None:
+        self.api_key = api_key
+        self.header_name = header_name
+        # auto_error=False so that a missing header reaches us as None.
+        super().__init__(name=self.header_name, auto_error=False)
+
+    async def _validate_api_key(self, header_api_key: str | None):
+        """Check the header value (None when absent) against the configured key."""
+        import hmac
+
+        if header_api_key is None:
+            return AuthenticationResult(
+                valid=False, errors=[f"Missing header {self.header_name}."]
+            )
+
+        header_api_key = header_api_key.strip()
+        # An empty configured key accepts anything; otherwise compare the encoded
+        # values in constant time so that timing does not reveal the key.
+        if self.api_key == "" or hmac.compare_digest(
+            header_api_key.encode(), self.api_key.encode()
+        ):
+            return AuthenticationResult(valid=True, detail=header_api_key)
+        return AuthenticationResult(
+            valid=False, errors=["The provided API Key is invalid."]
+        )
+
+    async def __call__(self, request: Request) -> AuthenticationResult:  # type: ignore
+        """Return the check result, raising 401 when a configured key is not met."""
+        result = await self._validate_api_key(await super().__call__(request=request))
+        if self.api_key and not result.valid:
+            # The reasons are stored in `errors`; `detail` is None on failure.
+            raise HTTPException(
+                status_code=status.HTTP_401_UNAUTHORIZED, detail=result.errors
+            )
+        return result
--- a/docling_serve/datamodel/callback.py
+++ b/docling_serve/datamodel/callback.py
@@ -1,50 +0,0 @@
-import enum
-from typing import Annotated, Literal
-
-from pydantic import BaseModel, Field
-
-
-class ProgressKind(str, enum.Enum):
-    SET_NUM_DOCS = "set_num_docs"
-    UPDATE_PROCESSED = "update_processed"
-
-
-class BaseProgress(BaseModel):
-    kind: ProgressKind
-
-
-class ProgressSetNumDocs(BaseProgress):
-    kind: Literal[ProgressKind.SET_NUM_DOCS] = ProgressKind.SET_NUM_DOCS
-
-    num_docs: int
-
-
-class SucceededDocsItem(BaseModel):
-    source: str
-
-
-class FailedDocsItem(BaseModel):
-    source: str
-    error: str
-
-
-class ProgressUpdateProcessed(BaseProgress):
-    kind: Literal[ProgressKind.UPDATE_PROCESSED] = ProgressKind.UPDATE_PROCESSED
-
-    num_processed: int
-    num_succeeded: int
-    num_failed: int
-
-    docs_succeeded: list[SucceededDocsItem]
-    docs_failed: list[FailedDocsItem]
-
-
-class ProgressCallbackRequest(BaseModel):
-    task_id: str
-    progress: Annotated[
-        ProgressSetNumDocs | ProgressUpdateProcessed, Field(discriminator="kind")
-    ]
-
-
-class ProgressCallbackResponse(BaseModel):
-    status: Literal["ack"] = "ack"
--- a/docling_serve/datamodel/convert.py
+++ b/docling_serve/datamodel/convert.py
@@ -1,24 +1,13 @@
 # Define the input options for the API
-from typing import Annotated, Any, Optional
+from typing import Annotated

-from pydantic import AnyUrl, BaseModel, Field, model_validator
-from typing_extensions import Self
+from pydantic import Field

-from docling.datamodel.base_models import InputFormat, OutputFormat
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
-    PdfBackend,
-    PdfPipeline,
-    PictureDescriptionBaseOptions,
-    TableFormerMode,
-    TableStructureOptions,
-)
-from docling.datamodel.settings import (
-    DEFAULT_PAGE_RANGE,
-    PageRange,
 )
 from docling.models.factories import get_ocr_factory
-from docling_core.types.doc import ImageRefMode
+from docling_jobkit.datamodel.convert import ConvertDocumentsOptions

 from docling_serve.settings import docling_serve_settings

@@ -28,154 +17,7 @@ ocr_factory = get_ocr_factory(
 ocr_engines_enum = ocr_factory.get_enum()


-class PictureDescriptionLocal(BaseModel):
-    repo_id: Annotated[
-        str,
-        Field(
-            description="Repository id from the Hugging Face Hub.",
-            examples=[
-                "HuggingFaceTB/SmolVLM-256M-Instruct",
-                "ibm-granite/granite-vision-3.2-2b",
-            ],
-        ),
-    ]
-    prompt: Annotated[
-        str,
-        Field(
-            description="Prompt used when calling the vision-language model.",
-            examples=[
-                "Describe this image in a few sentences.",
-                "This is a figure from a document. Provide a detailed description of it.",
-            ],
-        ),
-    ] = "Describe this image in a few sentences."
-    generation_config: Annotated[
-        dict[str, Any],
-        Field(
-            description="Config from https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig",
-            examples=[{"max_new_tokens": 200, "do_sample": False}],
-        ),
-    ] = {"max_new_tokens": 200, "do_sample": False}
-
-
-class PictureDescriptionApi(BaseModel):
-    url: Annotated[
-        AnyUrl,
-        Field(
-            description="Endpoint which accepts openai-api compatible requests.",
-            examples=[
-                AnyUrl(
-                    "http://localhost:8000/v1/chat/completions"
-                ),  # example of a local vllm api
-                AnyUrl(
-                    "http://localhost:11434/v1/chat/completions"
-                ),  # example of ollama
-            ],
-        ),
-    ]
-    headers: Annotated[
-        dict[str, str],
-        Field(
-            description="Headers used for calling the API endpoint. For example, it could include authentication headers."
-        ),
-    ] = {}
-    params: Annotated[
-        dict[str, Any],
-        Field(
-            description="Model parameters.",
-            examples=[
-                {  # on vllm
-                    "model": "HuggingFaceTB/SmolVLM-256M-Instruct",
-                    "max_completion_tokens": 200,
-                },
-                {  # on vllm
-                    "model": "ibm-granite/granite-vision-3.2-2b",
-                    "max_completion_tokens": 200,
-                },
-                {  # on ollama
-                    "model": "granite3.2-vision:2b"
-                },
-            ],
-        ),
-    ] = {}
-    timeout: Annotated[float, Field(description="Timeout for the API request.")] = 20
-    prompt: Annotated[
-        str,
-        Field(
-            description="Prompt used when calling the vision-language model.",
-            examples=[
-                "Describe this image in a few sentences.",
-                "This is a figures from a document. Provide a detailed description of it.",
-            ],
-        ),
-    ] = "Describe this image in a few sentences."
-
-
-class ConvertDocumentsOptions(BaseModel):
-    from_formats: Annotated[
-        list[InputFormat],
-        Field(
-            description=(
-                "Input format(s) to convert from. String or list of strings. "
-                f"Allowed values: {', '.join([v.value for v in InputFormat])}. "
-                "Optional, defaults to all formats."
-            ),
-            examples=[[v.value for v in InputFormat]],
-        ),
-    ] = list(InputFormat)
-
-    to_formats: Annotated[
-        list[OutputFormat],
-        Field(
-            description=(
-                "Output format(s) to convert to. String or list of strings. "
-                f"Allowed values: {', '.join([v.value for v in OutputFormat])}. "
-                "Optional, defaults to Markdown."
-            ),
-            examples=[
-                [OutputFormat.MARKDOWN],
-                [OutputFormat.MARKDOWN, OutputFormat.JSON],
-                [v.value for v in OutputFormat],
-            ],
-        ),
-    ] = [OutputFormat.MARKDOWN]
-
-    image_export_mode: Annotated[
-        ImageRefMode,
-        Field(
-            description=(
-                "Image export mode for the document (in case of JSON,"
-                " Markdown or HTML). "
-                f"Allowed values: {', '.join([v.value for v in ImageRefMode])}. "
-                "Optional, defaults to Embedded."
-            ),
-            examples=[ImageRefMode.EMBEDDED.value],
-            # pattern="embedded|placeholder|referenced",
-        ),
-    ] = ImageRefMode.EMBEDDED
-
-    do_ocr: Annotated[
-        bool,
-        Field(
-            description=(
-                "If enabled, the bitmap content will be processed using OCR. "
-                "Boolean. Optional, defaults to true"
-            ),
-            # examples=[True],
-        ),
-    ] = True
-
-    force_ocr: Annotated[
-        bool,
-        Field(
-            description=(
-                "If enabled, replace existing text with OCR-generated "
-                "text over content. Boolean. Optional, defaults to false."
-            ),
-            # examples=[False],
-        ),
-    ] = False
-
+class ConvertDocumentsRequestOptions(ConvertDocumentsOptions):
    ocr_engine: Annotated[  # type: ignore
        ocr_engines_enum,
        Field(
@@ -188,57 +30,6 @@ class ConvertDocumentsOptions(BaseModel):
        ),
    ] = ocr_engines_enum(EasyOcrOptions.kind)  # type: ignore

-    ocr_lang: Annotated[
-        Optional[list[str]],
-        Field(
-            description=(
-                "List of languages used by the OCR engine. "
-                "Note that each OCR engine has "
-                "different values for the language names. String or list of strings. "
-                "Optional, defaults to empty."
-            ),
-            examples=[["fr", "de", "es", "en"]],
-        ),
-    ] = None
-
-    pdf_backend: Annotated[
-        PdfBackend,
-        Field(
-            description=(
-                "The PDF backend to use. String. "
-                f"Allowed values: {', '.join([v.value for v in PdfBackend])}. "
-                f"Optional, defaults to {PdfBackend.DLPARSE_V4.value}."
-            ),
-            examples=[PdfBackend.DLPARSE_V4],
-        ),
-    ] = PdfBackend.DLPARSE_V4
-
-    table_mode: Annotated[
-        TableFormerMode,
-        Field(
-            description=(
-                "Mode to use for table structure, String. "
-                f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. "
-                "Optional, defaults to fast."
-            ),
-            examples=[TableStructureOptions().mode],
-            # pattern="fast|accurate",
-        ),
-    ] = TableStructureOptions().mode
-
-    pipeline: Annotated[
-        PdfPipeline,
-        Field(description="Choose the pipeline to process PDF or image files."),
-    ] = PdfPipeline.STANDARD
-
-    page_range: Annotated[
-        PageRange,
-        Field(
-            description="Only convert a range of pages. The page number starts at 1.",
-            examples=[DEFAULT_PAGE_RANGE, (1, 4)],
-        ),
-    ] = DEFAULT_PAGE_RANGE
-
    document_timeout: Annotated[
        float,
        Field(
@@ -247,152 +38,3 @@ class ConvertDocumentsOptions(BaseModel):
            le=docling_serve_settings.max_document_timeout,
        ),
    ] = docling_serve_settings.max_document_timeout
-
-    abort_on_error: Annotated[
-        bool,
-        Field(
-            description=(
-                "Abort on error if enabled. Boolean. Optional, defaults to false."
-            ),
-            # examples=[False],
-        ),
-    ] = False
-
-    return_as_file: Annotated[
-        bool,
-        Field(
-            description=(
-                "Return the output as a zip file "
-                "(will happen anyway if multiple files are generated). "
-                "Boolean. Optional, defaults to false."
-            ),
-            examples=[False],
-        ),
-    ] = False
-
-    do_table_structure: Annotated[
-        bool,
-        Field(
-            description=(
-                "If enabled, the table structure will be extracted. "
-                "Boolean. Optional, defaults to true."
-            ),
-            examples=[True],
-        ),
-    ] = True
-
-    include_images: Annotated[
-        bool,
-        Field(
-            description=(
-                "If enabled, images will be extracted from the document. "
-                "Boolean. Optional, defaults to true."
-            ),
-            examples=[True],
-        ),
-    ] = True
-
-    images_scale: Annotated[
-        float,
-        Field(
-            description="Scale factor for images. Float. Optional, defaults to 2.0.",
-            examples=[2.0],
-        ),
-    ] = 2.0
-
-    md_page_break_placeholder: Annotated[
-        str,
-        Field(
-            description="Add this placeholder betweek pages in the markdown output.",
-            examples=["<!-- page-break -->", ""],
-        ),
-    ] = ""
-
-    do_code_enrichment: Annotated[
-        bool,
-        Field(
-            description=(
-                "If enabled, perform OCR code enrichment. "
-                "Boolean. Optional, defaults to false."
-            ),
-            examples=[False],
-        ),
-    ] = False
-
-    do_formula_enrichment: Annotated[
-        bool,
-        Field(
-            description=(
-                "If enabled, perform formula OCR, return LaTeX code. "
-                "Boolean. Optional, defaults to false."
-            ),
-            examples=[False],
-        ),
-    ] = False
-
-    do_picture_classification: Annotated[
-        bool,
-        Field(
-            description=(
-                "If enabled, classify pictures in documents. "
-                "Boolean. Optional, defaults to false."
-            ),
-            examples=[False],
-        ),
-    ] = False
-
-    do_picture_description: Annotated[
-        bool,
-        Field(
-            description=(
-                "If enabled, describe pictures in documents. "
-                "Boolean. Optional, defaults to false."
-            ),
-            examples=[False],
-        ),
-    ] = False
-
-    picture_description_area_threshold: Annotated[
-        float,
-        Field(
-            description="Minimum percentage of the area for a picture to be processed with the models.",
-            examples=[PictureDescriptionBaseOptions().picture_area_threshold],
-        ),
-    ] = PictureDescriptionBaseOptions().picture_area_threshold
-
-    picture_description_local: Annotated[
-        Optional[PictureDescriptionLocal],
-        Field(
-            description="Options for running a local vision-language model in the picture description. The parameters refer to a model hosted on Hugging Face. This parameter is mutually exclusive with picture_description_api.",
-            examples=[
-                PictureDescriptionLocal(repo_id="ibm-granite/granite-vision-3.2-2b"),
-                PictureDescriptionLocal(repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"),
-            ],
-        ),
-    ] = None
-
-    picture_description_api: Annotated[
-        Optional[PictureDescriptionApi],
-        Field(
-            description="API details for using a vision-language model in the picture description. This parameter is mutually exclusive with picture_description_local.",
-            examples=[
-                PictureDescriptionApi(
-                    url="http://localhost:11434/v1/chat/completions",
-                    params={"model": "granite3.2-vision:2b"},
-                )
-            ],
-        ),
-    ] = None
-
-    @model_validator(mode="after")
-    def picture_description_exclusivity(self) -> Self:
-        # Validate picture description options
-        if (
-            self.picture_description_local is not None
-            and self.picture_description_api is not None
-        ):
-            raise ValueError(
-                "The parameters picture_description_local and picture_description_api are mutually exclusive, only one of them can be set."
-            )
-
-        return self
--- a/docling_serve/datamodel/engines.py
+++ b/docling_serve/datamodel/engines.py
@@ -1,13 +0,0 @@
-import enum
-
-
-class TaskStatus(str, enum.Enum):
-    SUCCESS = "success"
-    PENDING = "pending"
-    STARTED = "started"
-    FAILURE = "failure"
-
-
-class AsyncEngine(str, enum.Enum):
-    LOCAL = "local"
-    KFP = "kfp"
--- a/docling_serve/datamodel/kfp.py
+++ b/docling_serve/datamodel/kfp.py
@@ -1,7 +0,0 @@
-from pydantic import AnyUrl, BaseModel
-
-
-class CallbackSpec(BaseModel):
-    url: AnyUrl
-    headers: dict[str, str] = {}
-    ca_cert: str = ""
--- a/docling_serve/datamodel/requests.py
+++ b/docling_serve/datamodel/requests.py
@@ -1,62 +1,72 @@
-import base64
-from io import BytesIO
-from typing import Annotated, Any, Union
+import enum
+from typing import Annotated, Literal

-from pydantic import AnyHttpUrl, BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
+from pydantic_core import PydanticCustomError
+from typing_extensions import Self

-from docling.datamodel.base_models import DocumentStream
+from docling_jobkit.datamodel.http_inputs import FileSource, HttpSource
+from docling_jobkit.datamodel.s3_coords import S3Coordinates
+from docling_jobkit.datamodel.task_targets import (
+    InBodyTarget,
+    S3Target,
+    TaskTarget,
+    ZipTarget,
+)

-from docling_serve.datamodel.convert import ConvertDocumentsOptions
+from docling_serve.datamodel.convert import ConvertDocumentsRequestOptions
+from docling_serve.settings import AsyncEngine, docling_serve_settings
+
+## Sources


-class DocumentsConvertBase(BaseModel):
-    options: ConvertDocumentsOptions = ConvertDocumentsOptions()
+class FileSourceRequest(FileSource):
+    kind: Literal["file"] = "file"


-class HttpSource(BaseModel):
-    url: Annotated[
-        AnyHttpUrl,
-        Field(
-            description="HTTP url to process",
-            examples=["https://arxiv.org/pdf/2206.01062"],
-        ),
-    ]
-    headers: Annotated[
-        dict[str, Any],
-        Field(
-            description="Additional headers used to fetch the urls, "
-            "e.g. authorization, agent, etc"
-        ),
-    ] = {}
+class HttpSourceRequest(HttpSource):
+    kind: Literal["http"] = "http"


-class FileSource(BaseModel):
-    base64_string: Annotated[
-        str,
-        Field(
-            description="Content of the file serialized in base64. "
-            "For example it can be obtained via "
-            "`base64 -w 0 /path/to/file/pdf-to-convert.pdf`."
-        ),
-    ]
-    filename: Annotated[
-        str,
-        Field(description="Filename of the uploaded document", examples=["file.pdf"]),
-    ]
-
-    def to_document_stream(self) -> DocumentStream:
-        buf = BytesIO(base64.b64decode(self.base64_string))
-        return DocumentStream(stream=buf, name=self.filename)
+class S3SourceRequest(S3Coordinates):
+    kind: Literal["s3"] = "s3"


-class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase):
-    http_sources: list[HttpSource]
+## Multipart targets
+class TargetName(str, enum.Enum):
+    INBODY = InBodyTarget().kind
+    ZIP = ZipTarget().kind


-class ConvertDocumentFileSourcesRequest(DocumentsConvertBase):
-    file_sources: list[FileSource]
-
-
-ConvertDocumentsRequest = Union[
-    ConvertDocumentFileSourcesRequest, ConvertDocumentHttpSourcesRequest
+## Aliases
+SourceRequestItem = Annotated[
+    FileSourceRequest | HttpSourceRequest | S3SourceRequest, Field(discriminator="kind")
 ]
+
+
+## Complete Source request
+class ConvertDocumentsRequest(BaseModel):
+    options: ConvertDocumentsRequestOptions = ConvertDocumentsRequestOptions()
+    sources: list[SourceRequestItem]
+    target: TaskTarget = InBodyTarget()
+
+    @model_validator(mode="after")
+    def validate_s3_source_and_target(self) -> Self:
+        """Check that S3 sources and an S3 target are only used together."""
+        has_s3_source = any(isinstance(s, S3SourceRequest) for s in self.sources)
+        if has_s3_source:
+            # S3 input needs the KFP engine, and its output must also go to S3.
+            if docling_serve_settings.eng_kind != AsyncEngine.KFP:
+                raise PydanticCustomError(
+                    "error source", 'source kind "s3" requires engine kind "KFP"'
+                )
+            if self.target.kind != "s3":
+                raise PydanticCustomError(
+                    "error source", 'source kind "s3" requires target kind "s3"'
+                )
+        elif isinstance(self.target, S3Target):
+            # An S3 target without any S3 source is rejected.
+            raise PydanticCustomError(
+                "error target", 'target kind "s3" requires source kind "s3"'
+            )
+        return self
--- a/docling_serve/datamodel/responses.py
+++ b/docling_serve/datamodel/responses.py
@@ -5,9 +5,8 @@ from pydantic import BaseModel

 from docling.datamodel.document import ConversionStatus, ErrorItem
 from docling.utils.profiling import ProfilingItem
-from docling_core.types.doc import DoclingDocument
-
-from docling_serve.datamodel.task_meta import TaskProcessingMeta
+from docling_jobkit.datamodel.result import ExportDocumentResponse
+from docling_jobkit.datamodel.task_meta import TaskProcessingMeta


 # Status
@@ -19,23 +18,21 @@ class ClearResponse(BaseModel):
    status: str = "ok"


-class DocumentResponse(BaseModel):
-    filename: str
-    md_content: Optional[str] = None
-    json_content: Optional[DoclingDocument] = None
-    html_content: Optional[str] = None
-    text_content: Optional[str] = None
-    doctags_content: Optional[str] = None
-
-
 class ConvertDocumentResponse(BaseModel):
-    document: DocumentResponse
+    document: ExportDocumentResponse
    status: ConversionStatus
    errors: list[ErrorItem] = []
    processing_time: float
    timings: dict[str, ProfilingItem] = {}


+class PresignedUrlConvertDocumentResponse(BaseModel):
+    processing_time: float
+    num_converted: int
+    num_succeeded: int
+    num_failed: int
+
+
 class ConvertDocumentErrorResponse(BaseModel):
    status: ConversionStatus

--- a/docling_serve/datamodel/task.py
+++ b/docling_serve/datamodel/task.py
@@ -1,55 +0,0 @@
-import datetime
-from functools import partial
-from pathlib import Path
-from typing import Optional, Union
-
-from fastapi.responses import FileResponse
-from pydantic import BaseModel, ConfigDict, Field
-
-from docling.datamodel.base_models import DocumentStream
-
-from docling_serve.datamodel.convert import ConvertDocumentsOptions
-from docling_serve.datamodel.engines import TaskStatus
-from docling_serve.datamodel.requests import FileSource, HttpSource
-from docling_serve.datamodel.responses import ConvertDocumentResponse
-from docling_serve.datamodel.task_meta import TaskProcessingMeta
-
-TaskSource = Union[HttpSource, FileSource, DocumentStream]
-
-
-class Task(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    task_id: str
-    task_status: TaskStatus = TaskStatus.PENDING
-    sources: list[TaskSource] = []
-    options: Optional[ConvertDocumentsOptions]
-    result: Optional[Union[ConvertDocumentResponse, FileResponse]] = None
-    scratch_dir: Optional[Path] = None
-    processing_meta: Optional[TaskProcessingMeta] = None
-    created_at: datetime.datetime = Field(
-        default_factory=partial(datetime.datetime.now, datetime.timezone.utc)
-    )
-    started_at: Optional[datetime.datetime] = None
-    finished_at: Optional[datetime.datetime] = None
-    last_update_at: datetime.datetime = Field(
-        default_factory=partial(datetime.datetime.now, datetime.timezone.utc)
-    )
-
-    def set_status(self, status: TaskStatus):
-        now = datetime.datetime.now(datetime.timezone.utc)
-        if status == TaskStatus.STARTED and self.started_at is None:
-            self.started_at = now
-        if (
-            status in [TaskStatus.SUCCESS, TaskStatus.FAILURE]
-            and self.finished_at is None
-        ):
-            self.finished_at = now
-
-        self.last_update_at = now
-        self.task_status = status
-
-    def is_completed(self) -> bool:
-        if self.task_status in [TaskStatus.SUCCESS, TaskStatus.FAILURE]:
-            return True
-        return False
--- a/docling_serve/datamodel/task_meta.py
+++ b/docling_serve/datamodel/task_meta.py
@@ -1,8 +0,0 @@
-from pydantic import BaseModel
-
-
-class TaskProcessingMeta(BaseModel):
-    num_docs: int
-    num_processed: int = 0
-    num_succeeded: int = 0
-    num_failed: int = 0
--- a/docling_serve/docling_conversion.py
+++ b/docling_serve/docling_conversion.py
@@ -1,256 +0,0 @@
-import hashlib
-import json
-import logging
-import sys
-from collections.abc import Iterable, Iterator
-from functools import lru_cache
-from pathlib import Path
-from typing import Any, Optional, Union
-
-from fastapi import HTTPException
-
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
-from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import DocumentStream, InputFormat
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
-    OcrOptions,
-    PdfBackend,
-    PdfPipeline,
-    PdfPipelineOptions,
-    PictureDescriptionApiOptions,
-    PictureDescriptionVlmOptions,
-    TableFormerMode,
-    VlmPipelineOptions,
-    smoldocling_vlm_conversion_options,
-    smoldocling_vlm_mlx_conversion_options,
-)
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
-from docling.pipeline.vlm_pipeline import VlmPipeline
-from docling_core.types.doc import ImageRefMode
-
-from docling_serve.datamodel.convert import ConvertDocumentsOptions, ocr_factory
-from docling_serve.helper_functions import _to_list_of_strings
-from docling_serve.settings import docling_serve_settings
-
-_log = logging.getLogger(__name__)
-
-
-# Custom serializer for PdfFormatOption
-# (model_dump_json does not work with some classes)
-def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
-    data = pdf_format_option.model_dump(serialize_as_any=True)
-
-    # pipeline_options are not fully serialized by model_dump, dedicated pass
-    if pdf_format_option.pipeline_options:
-        data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump(
-            serialize_as_any=True, mode="json"
-        )
-
-    # Replace `pipeline_cls` with a string representation
-    data["pipeline_cls"] = repr(data["pipeline_cls"])
-
-    # Replace `backend` with a string representation
-    data["backend"] = repr(data["backend"])
-
-    # Serialize the dictionary to JSON with sorted keys to have consistent hashes
-    serialized_data = json.dumps(data, sort_keys=True)
-    options_hash = hashlib.sha1(
-        serialized_data.encode(), usedforsecurity=False
-    ).digest()
-    return options_hash
-
-
-# Cache of DocumentConverter objects
-_options_map: dict[bytes, PdfFormatOption] = {}
-
-
-@lru_cache(maxsize=docling_serve_settings.options_cache_size)
-def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
-    pdf_format_option = _options_map[options_hash]
-    format_options: dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: pdf_format_option,
-        InputFormat.IMAGE: pdf_format_option,
-    }
-
-    return DocumentConverter(format_options=format_options)
-
-
-def get_converter(pdf_format_option: PdfFormatOption) -> DocumentConverter:
-    options_hash = _hash_pdf_format_option(pdf_format_option)
-    _options_map[options_hash] = pdf_format_option
-    return _get_converter_from_hash(options_hash)
-
-
-def _parse_standard_pdf_opts(
-    request: ConvertDocumentsOptions, artifacts_path: Optional[Path]
-) -> PdfPipelineOptions:
-    try:
-        ocr_options: OcrOptions = ocr_factory.create_options(
-            kind=request.ocr_engine.value,  # type: ignore
-            force_full_page_ocr=request.force_ocr,
-        )
-    except ImportError as err:
-        raise HTTPException(
-            status_code=400,
-            detail="The requested OCR engine"
-            f" (ocr_engine={request.ocr_engine.value})"  # type: ignore
-            " is not available on this system. Please choose another OCR engine "
-            "or contact your system administrator.\n"
-            f"{err}",
-        )
-
-    if request.ocr_lang is not None:
-        if isinstance(request.ocr_lang, str):
-            ocr_options.lang = _to_list_of_strings(request.ocr_lang)
-        else:
-            ocr_options.lang = request.ocr_lang
-
-    pipeline_options = PdfPipelineOptions(
-        artifacts_path=artifacts_path,
-        enable_remote_services=docling_serve_settings.enable_remote_services,
-        document_timeout=request.document_timeout,
-        do_ocr=request.do_ocr,
-        ocr_options=ocr_options,
-        do_table_structure=request.do_table_structure,
-        do_code_enrichment=request.do_code_enrichment,
-        do_formula_enrichment=request.do_formula_enrichment,
-        do_picture_classification=request.do_picture_classification,
-        do_picture_description=request.do_picture_description,
-    )
-    pipeline_options.table_structure_options.mode = TableFormerMode(request.table_mode)
-
-    if request.image_export_mode != ImageRefMode.PLACEHOLDER:
-        pipeline_options.generate_page_images = True
-        if request.image_export_mode == ImageRefMode.REFERENCED:
-            pipeline_options.generate_picture_images = True
-        if request.images_scale:
-            pipeline_options.images_scale = request.images_scale
-
-    if request.picture_description_local is not None:
-        pipeline_options.picture_description_options = (
-            PictureDescriptionVlmOptions.model_validate(
-                request.picture_description_local.model_dump()
-            )
-        )
-
-    if request.picture_description_api is not None:
-        pipeline_options.picture_description_options = (
-            PictureDescriptionApiOptions.model_validate(
-                request.picture_description_api.model_dump()
-            )
-        )
-    pipeline_options.picture_description_options.picture_area_threshold = (
-        request.picture_description_area_threshold
-    )
-
-    return pipeline_options
-
-
-def _parse_backend(request: ConvertDocumentsOptions) -> type[PdfDocumentBackend]:
-    if request.pdf_backend == PdfBackend.DLPARSE_V1:
-        backend: type[PdfDocumentBackend] = DoclingParseDocumentBackend
-    elif request.pdf_backend == PdfBackend.DLPARSE_V2:
-        backend = DoclingParseV2DocumentBackend
-    elif request.pdf_backend == PdfBackend.DLPARSE_V4:
-        backend = DoclingParseV4DocumentBackend
-    elif request.pdf_backend == PdfBackend.PYPDFIUM2:
-        backend = PyPdfiumDocumentBackend
-    else:
-        raise RuntimeError(f"Unexpected PDF backend type {request.pdf_backend}")
-
-    return backend
-
-
-def _parse_vlm_pdf_opts(
-    request: ConvertDocumentsOptions, artifacts_path: Optional[Path]
-) -> VlmPipelineOptions:
-    pipeline_options = VlmPipelineOptions(
-        artifacts_path=artifacts_path,
-        document_timeout=request.document_timeout,
-    )
-    pipeline_options.vlm_options = smoldocling_vlm_conversion_options
-    if sys.platform == "darwin":
-        try:
-            import mlx_vlm  # noqa: F401
-
-            pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
-        except ImportError:
-            _log.warning(
-                "To run SmolDocling faster, please install mlx-vlm:\n"
-                "pip install mlx-vlm"
-            )
-    return pipeline_options
-
-
-# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
-def get_pdf_pipeline_opts(
-    request: ConvertDocumentsOptions,
-) -> PdfFormatOption:
-    artifacts_path: Optional[Path] = None
-    if docling_serve_settings.artifacts_path is not None:
-        if str(docling_serve_settings.artifacts_path.absolute()) == "":
-            _log.info(
-                "artifacts_path is an empty path, model weights will be downloaded "
-                "at runtime."
-            )
-            artifacts_path = None
-        elif docling_serve_settings.artifacts_path.is_dir():
-            _log.info(
-                "artifacts_path is set to a valid directory. "
-                "No model weights will be downloaded at runtime."
-            )
-            artifacts_path = docling_serve_settings.artifacts_path
-        else:
-            _log.warning(
-                "artifacts_path is set to an invalid directory. "
-                "The system will download the model weights at runtime."
-            )
-            artifacts_path = None
-    else:
-        _log.info(
-            "artifacts_path is unset. "
-            "The system will download the model weights at runtime."
-        )
-
-    pipeline_options: Union[PdfPipelineOptions, VlmPipelineOptions]
-    if request.pipeline == PdfPipeline.STANDARD:
-        pipeline_options = _parse_standard_pdf_opts(request, artifacts_path)
-        backend = _parse_backend(request)
-        pdf_format_option = PdfFormatOption(
-            pipeline_options=pipeline_options,
-            backend=backend,
-        )
-
-    elif request.pipeline == PdfPipeline.VLM:
-        pipeline_options = _parse_vlm_pdf_opts(request, artifacts_path)
-        pdf_format_option = PdfFormatOption(
-            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
-        )
-    else:
-        raise NotImplementedError(
-            f"The pipeline {request.pipeline} is not implemented."
-        )
-
-    return pdf_format_option
-
-
-def convert_documents(
-    sources: Iterable[Union[Path, str, DocumentStream]],
-    options: ConvertDocumentsOptions,
-    headers: Optional[dict[str, Any]] = None,
-):
-    pdf_format_option = get_pdf_pipeline_opts(options)
-    converter = get_converter(pdf_format_option)
-    results: Iterator[ConversionResult] = converter.convert_all(
-        sources,
-        headers=headers,
-        page_range=options.page_range,
-        max_file_size=docling_serve_settings.max_file_size,
-        max_num_pages=docling_serve_settings.max_num_pages,
-    )
-
-    return results
--- a/docling_serve/engines/__init__.py
+++ b/docling_serve/engines/__init__.py
--- a/docling_serve/engines/async_kfp/__init__.py
+++ b/docling_serve/engines/async_kfp/__init__.py
--- a/docling_serve/engines/async_kfp/kfp_pipeline.py
+++ b/docling_serve/engines/async_kfp/kfp_pipeline.py
@@ -1,137 +0,0 @@
-# ruff: noqa: E402, UP006, UP035
-
-from typing import Any, Dict, List
-
-from kfp import dsl
-
-PYTHON_BASE_IMAGE = "python:3.12"
-
-
-@dsl.component(
-    base_image=PYTHON_BASE_IMAGE,
-    packages_to_install=[
-        "pydantic",
-        "docling-serve @ git+https://github.com/docling-project/docling-serve@feat-kfp-engine",
-    ],
-    pip_index_urls=["https://download.pytorch.org/whl/cpu", "https://pypi.org/simple"],
-)
-def generate_chunks(
-    run_name: str,
-    request: Dict[str, Any],
-    batch_size: int,
-    callbacks: List[Dict[str, Any]],
-) -> List[List[Dict[str, Any]]]:
-    from pydantic import TypeAdapter
-
-    from docling_serve.datamodel.callback import (
-        ProgressCallbackRequest,
-        ProgressSetNumDocs,
-    )
-    from docling_serve.datamodel.kfp import CallbackSpec
-    from docling_serve.engines.async_kfp.notify import notify_callbacks
-
-    CallbacksListType = TypeAdapter(list[CallbackSpec])
-
-    sources = request["http_sources"]
-    splits = [sources[i : i + batch_size] for i in range(0, len(sources), batch_size)]
-
-    total = sum(len(chunk) for chunk in splits)
-    payload = ProgressCallbackRequest(
-        task_id=run_name, progress=ProgressSetNumDocs(num_docs=total)
-    )
-    notify_callbacks(
-        payload=payload,
-        callbacks=CallbacksListType.validate_python(callbacks),
-    )
-
-    return splits
-
-
-@dsl.component(
-    base_image=PYTHON_BASE_IMAGE,
-    packages_to_install=[
-        "pydantic",
-        "docling-serve @ git+https://github.com/docling-project/docling-serve@feat-kfp-engine",
-    ],
-    pip_index_urls=["https://download.pytorch.org/whl/cpu", "https://pypi.org/simple"],
-)
-def convert_batch(
-    run_name: str,
-    data_splits: List[Dict[str, Any]],
-    request: Dict[str, Any],
-    callbacks: List[Dict[str, Any]],
-    output_path: dsl.OutputPath("Directory"),  # type: ignore
-):
-    from pathlib import Path
-
-    from pydantic import AnyUrl, TypeAdapter
-
-    from docling_serve.datamodel.callback import (
-        FailedDocsItem,
-        ProgressCallbackRequest,
-        ProgressUpdateProcessed,
-        SucceededDocsItem,
-    )
-    from docling_serve.datamodel.convert import ConvertDocumentsOptions
-    from docling_serve.datamodel.kfp import CallbackSpec
-    from docling_serve.datamodel.requests import HttpSource
-    from docling_serve.engines.async_kfp.notify import notify_callbacks
-
-    CallbacksListType = TypeAdapter(list[CallbackSpec])
-
-    convert_options = ConvertDocumentsOptions.model_validate(request["options"])
-    print(convert_options)
-
-    output_dir = Path(output_path)
-    output_dir.mkdir(exist_ok=True, parents=True)
-    docs_succeeded: list[SucceededDocsItem] = []
-    docs_failed: list[FailedDocsItem] = []
-    for source_dict in data_splits:
-        source = HttpSource.model_validate(source_dict)
-        filename = Path(str(AnyUrl(source.url).path)).name
-        output_filename = output_dir / filename
-        print(f"Writing {output_filename}")
-        with output_filename.open("w") as f:
-            f.write(source.model_dump_json())
-        docs_succeeded.append(SucceededDocsItem(source=source.url))
-
-    payload = ProgressCallbackRequest(
-        task_id=run_name,
-        progress=ProgressUpdateProcessed(
-            num_failed=len(docs_failed),
-            num_processed=len(docs_succeeded) + len(docs_failed),
-            num_succeeded=len(docs_succeeded),
-            docs_succeeded=docs_succeeded,
-            docs_failed=docs_failed,
-        ),
-    )
-
-    print(payload)
-    notify_callbacks(
-        payload=payload,
-        callbacks=CallbacksListType.validate_python(callbacks),
-    )
-
-
-@dsl.pipeline()
-def process(
-    batch_size: int,
-    request: Dict[str, Any],
-    callbacks: List[Dict[str, Any]] = [],
-    run_name: str = "",
-):
-    chunks_task = generate_chunks(
-        run_name=run_name,
-        request=request,
-        batch_size=batch_size,
-        callbacks=callbacks,
-    )
-    chunks_task.set_caching_options(False)
-
-    with dsl.ParallelFor(chunks_task.output, parallelism=4) as data_splits:
-        convert_batch(
-            run_name=run_name,
-            data_splits=data_splits,
-            request=request,
-            callbacks=callbacks,
-        )
--- a/docling_serve/engines/async_kfp/notify.py
+++ b/docling_serve/engines/async_kfp/notify.py
@@ -1,32 +0,0 @@
-import ssl
-
-import certifi
-import httpx
-
-from docling_serve.datamodel.callback import ProgressCallbackRequest
-from docling_serve.datamodel.kfp import CallbackSpec
-
-
-def notify_callbacks(
-    payload: ProgressCallbackRequest,
-    callbacks: list[CallbackSpec],
-):
-    if len(callbacks) == 0:
-        return
-
-    for callback in callbacks:
-        # https://www.python-httpx.org/advanced/ssl/#configuring-client-instances
-        if callback.ca_cert:
-            ctx = ssl.create_default_context(cadata=callback.ca_cert)
-        else:
-            ctx = ssl.create_default_context(cafile=certifi.where())
-
-        try:
-            httpx.post(
-                str(callback.url),
-                headers=callback.headers,
-                json=payload.model_dump(mode="json"),
-                verify=ctx,
-            )
-        except httpx.HTTPError as err:
-            print(f"Error notifying callback {callback.url}: {err}")
--- a/docling_serve/engines/async_kfp/orchestrator.py
+++ b/docling_serve/engines/async_kfp/orchestrator.py
@@ -1,235 +0,0 @@
-import datetime
-import json
-import logging
-import uuid
-from pathlib import Path
-from typing import Optional
-
-from kfp_server_api.models import V2beta1RuntimeState
-from pydantic import BaseModel, TypeAdapter
-from pydantic_settings import SettingsConfigDict
-
-from docling_serve.datamodel.callback import (
-    ProgressCallbackRequest,
-    ProgressSetNumDocs,
-    ProgressUpdateProcessed,
-)
-from docling_serve.datamodel.convert import ConvertDocumentsOptions
-from docling_serve.datamodel.engines import TaskStatus
-from docling_serve.datamodel.kfp import CallbackSpec
-from docling_serve.datamodel.requests import HttpSource
-from docling_serve.datamodel.task import Task, TaskSource
-from docling_serve.datamodel.task_meta import TaskProcessingMeta
-from docling_serve.engines.async_kfp.kfp_pipeline import process
-from docling_serve.engines.async_orchestrator import (
-    BaseAsyncOrchestrator,
-    ProgressInvalid,
-)
-from docling_serve.settings import docling_serve_settings
-
-_log = logging.getLogger(__name__)
-
-
-class _RunItem(BaseModel):
-    model_config = SettingsConfigDict(arbitrary_types_allowed=True)
-
-    run_id: str
-    state: str
-    created_at: datetime.datetime
-    scheduled_at: datetime.datetime
-    finished_at: datetime.datetime
-
-
-class AsyncKfpOrchestrator(BaseAsyncOrchestrator):
-    def __init__(self):
-        super().__init__()
-        import kfp
-
-        kfp_endpoint = docling_serve_settings.eng_kfp_endpoint
-        if kfp_endpoint is None:
-            raise ValueError("KFP endpoint is required when using the KFP engine.")
-
-        kube_sa_token_path = Path("/run/secrets/kubernetes.io/serviceaccount/token")
-        kube_sa_ca_cert_path = Path(
-            "/run/secrets/kubernetes.io/serviceaccount/service-ca.crt"
-        )
-
-        ssl_ca_cert = docling_serve_settings.eng_kfp_ca_cert_path
-        token = docling_serve_settings.eng_kfp_token
-        if (
-            ssl_ca_cert is None
-            and ".svc" in kfp_endpoint.host
-            and kube_sa_ca_cert_path.exists()
-        ):
-            ssl_ca_cert = str(kube_sa_ca_cert_path)
-        if token is None and kube_sa_token_path.exists():
-            token = kube_sa_token_path.read_text()
-
-        self._client = kfp.Client(
-            host=str(kfp_endpoint),
-            existing_token=token,
-            ssl_ca_cert=ssl_ca_cert,
-            # verify_ssl=False,
-        )
-
-    async def enqueue(
-        self, sources: list[TaskSource], options: ConvertDocumentsOptions
-    ) -> Task:
-        callbacks = []
-        if docling_serve_settings.eng_kfp_self_callback_endpoint is not None:
-            headers = {}
-            if docling_serve_settings.eng_kfp_self_callback_token_path is not None:
-                token = (
-                    docling_serve_settings.eng_kfp_self_callback_token_path.read_text()
-                )
-                headers["Authorization"] = f"Bearer {token}"
-            ca_cert = ""
-            if docling_serve_settings.eng_kfp_self_callback_ca_cert_path is not None:
-                ca_cert = docling_serve_settings.eng_kfp_self_callback_ca_cert_path.read_text()
-            callbacks.append(
-                CallbackSpec(
-                    url=docling_serve_settings.eng_kfp_self_callback_endpoint,
-                    headers=headers,
-                    ca_cert=ca_cert,
-                )
-            )
-
-        CallbacksType = TypeAdapter(list[CallbackSpec])
-        SourcesListType = TypeAdapter(list[HttpSource])
-        http_sources = [s for s in sources if isinstance(s, HttpSource)]
-        # hack: since the current kfp backend is not resolving the job_id placeholder,
-        # we set the run_name and pass it as argument to the job itself.
-        run_name = f"docling-job-{uuid.uuid4()}"
-        kfp_run = self._client.create_run_from_pipeline_func(
-            process,
-            arguments={
-                "batch_size": 10,
-                "sources": SourcesListType.dump_python(http_sources, mode="json"),
-                "options": options.model_dump(mode="json"),
-                "callbacks": CallbacksType.dump_python(callbacks, mode="json"),
-                "run_name": run_name,
-            },
-            run_name=run_name,
-        )
-        task_id = kfp_run.run_id
-
-        task = Task(task_id=task_id, sources=sources, options=options)
-        await self.init_task_tracking(task)
-        return task
-
-    async def _update_task_from_run(self, task_id: str, wait: float = 0.0):
-        run_info = self._client.get_run(run_id=task_id)
-        task = await self.get_raw_task(task_id=task_id)
-        # RUNTIME_STATE_UNSPECIFIED = "RUNTIME_STATE_UNSPECIFIED"
-        # PENDING = "PENDING"
-        # RUNNING = "RUNNING"
-        # SUCCEEDED = "SUCCEEDED"
-        # SKIPPED = "SKIPPED"
-        # FAILED = "FAILED"
-        # CANCELING = "CANCELING"
-        # CANCELED = "CANCELED"
-        # PAUSED = "PAUSED"
-        if run_info.state == V2beta1RuntimeState.SUCCEEDED:
-            task.set_status(TaskStatus.SUCCESS)
-        elif run_info.state == V2beta1RuntimeState.PENDING:
-            task.set_status(TaskStatus.PENDING)
-        elif run_info.state == V2beta1RuntimeState.RUNNING:
-            task.set_status(TaskStatus.STARTED)
-        else:
-            task.set_status(TaskStatus.FAILURE)
-
-    async def task_status(self, task_id: str, wait: float = 0.0) -> Task:
-        await self._update_task_from_run(task_id=task_id, wait=wait)
-        return await self.get_raw_task(task_id=task_id)
-
-    async def _get_pending(self) -> list[_RunItem]:
-        runs: list[_RunItem] = []
-        next_page: Optional[str] = None
-        while True:
-            res = self._client.list_runs(
-                page_token=next_page,
-                page_size=20,
-                filter=json.dumps(
-                    {
-                        "predicates": [
-                            {
-                                "operation": "EQUALS",
-                                "key": "state",
-                                "stringValue": "PENDING",
-                            }
-                        ]
-                    }
-                ),
-            )
-            if res.runs is not None:
-                for run in res.runs:
-                    runs.append(
-                        _RunItem(
-                            run_id=run.run_id,
-                            state=run.state,
-                            created_at=run.created_at,
-                            scheduled_at=run.scheduled_at,
-                            finished_at=run.finished_at,
-                        )
-                    )
-            if res.next_page_token is None:
-                break
-            next_page = res.next_page_token
-        return runs
-
-    async def queue_size(self) -> int:
-        runs = await self._get_pending()
-        return len(runs)
-
-    async def get_queue_position(self, task_id: str) -> Optional[int]:
-        runs = await self._get_pending()
-        for pos, run in enumerate(runs, start=1):
-            if run.run_id == task_id:
-                return pos
-        return None
-
-    async def process_queue(self):
-        return
-
-    async def warm_up_caches(self):
-        return
-
-    async def _get_run_id(self, run_name: str) -> str:
-        res = self._client.list_runs(
-            filter=json.dumps(
-                {
-                    "predicates": [
-                        {
-                            "operation": "EQUALS",
-                            "key": "name",
-                            "stringValue": run_name,
-                        }
-                    ]
-                }
-            ),
-        )
-        if res.runs is not None and len(res.runs) > 0:
-            return res.runs[0].run_id
-        raise RuntimeError(f"Run with {run_name=} not found.")
-
-    async def receive_task_progress(self, request: ProgressCallbackRequest):
-        task_id = await self._get_run_id(run_name=request.task_id)
-        progress = request.progress
-        task = await self.get_raw_task(task_id=task_id)
-
-        if isinstance(progress, ProgressSetNumDocs):
-            task.processing_meta = TaskProcessingMeta(num_docs=progress.num_docs)
-            task.task_status = TaskStatus.STARTED
-
-        elif isinstance(progress, ProgressUpdateProcessed):
-            if task.processing_meta is None:
-                raise ProgressInvalid(
-                    "UpdateProcessed was called before setting the expected number of documents."
-                )
-            task.processing_meta.num_processed += progress.num_processed
-            task.processing_meta.num_succeeded += progress.num_succeeded
-            task.processing_meta.num_failed += progress.num_failed
-            task.task_status = TaskStatus.STARTED
-
-        # TODO: could be moved to BackgroundTask
-        await self.notify_task_subscribers(task_id=task_id)
--- a/docling_serve/engines/async_local/__init__.py
+++ b/docling_serve/engines/async_local/__init__.py
--- a/docling_serve/engines/async_local/orchestrator.py
+++ b/docling_serve/engines/async_local/orchestrator.py
@@ -1,57 +0,0 @@
-import asyncio
-import logging
-import uuid
-from typing import Optional
-
-from docling_serve.datamodel.convert import ConvertDocumentsOptions
-from docling_serve.datamodel.task import Task, TaskSource
-from docling_serve.docling_conversion import get_converter, get_pdf_pipeline_opts
-from docling_serve.engines.async_local.worker import AsyncLocalWorker
-from docling_serve.engines.async_orchestrator import BaseAsyncOrchestrator
-from docling_serve.settings import docling_serve_settings
-
-_log = logging.getLogger(__name__)
-
-
-class AsyncLocalOrchestrator(BaseAsyncOrchestrator):
-    def __init__(self):
-        super().__init__()
-        self.task_queue = asyncio.Queue()
-        self.queue_list: list[str] = []
-
-    async def enqueue(
-        self, sources: list[TaskSource], options: ConvertDocumentsOptions
-    ) -> Task:
-        task_id = str(uuid.uuid4())
-        task = Task(task_id=task_id, sources=sources, options=options)
-        await self.init_task_tracking(task)
-
-        self.queue_list.append(task_id)
-        await self.task_queue.put(task_id)
-        return task
-
-    async def queue_size(self) -> int:
-        return self.task_queue.qsize()
-
-    async def get_queue_position(self, task_id: str) -> Optional[int]:
-        return (
-            self.queue_list.index(task_id) + 1 if task_id in self.queue_list else None
-        )
-
-    async def process_queue(self):
-        # Create a pool of workers
-        workers = []
-        for i in range(docling_serve_settings.eng_loc_num_workers):
-            _log.debug(f"Starting worker {i}")
-            w = AsyncLocalWorker(i, self)
-            worker_task = asyncio.create_task(w.loop())
-            workers.append(worker_task)
-
-        # Wait for all workers to complete (they won't, as they run indefinitely)
-        await asyncio.gather(*workers)
-        _log.debug("All workers completed.")
-
-    async def warm_up_caches(self):
-        # Converter with default options
-        pdf_format_option = get_pdf_pipeline_opts(ConvertDocumentsOptions())
-        get_converter(pdf_format_option)
--- a/docling_serve/engines/async_local/worker.py
+++ b/docling_serve/engines/async_local/worker.py
@@ -1,124 +0,0 @@
-import asyncio
-import logging
-import shutil
-import time
-from typing import TYPE_CHECKING, Any, Optional, Union
-
-from fastapi.responses import FileResponse
-
-from docling.datamodel.base_models import DocumentStream
-
-from docling_serve.datamodel.engines import TaskStatus
-from docling_serve.datamodel.requests import FileSource, HttpSource
-from docling_serve.docling_conversion import convert_documents
-from docling_serve.response_preparation import process_results
-from docling_serve.storage import get_scratch
-
-if TYPE_CHECKING:
-    from docling_serve.engines.async_local.orchestrator import AsyncLocalOrchestrator
-
-_log = logging.getLogger(__name__)
-
-
-class AsyncLocalWorker:
-    def __init__(self, worker_id: int, orchestrator: "AsyncLocalOrchestrator"):
-        self.worker_id = worker_id
-        self.orchestrator = orchestrator
-
-    async def loop(self):
-        _log.debug(f"Starting loop for worker {self.worker_id}")
-        while True:
-            task_id: str = await self.orchestrator.task_queue.get()
-            self.orchestrator.queue_list.remove(task_id)
-
-            if task_id not in self.orchestrator.tasks:
-                raise RuntimeError(f"Task {task_id} not found.")
-            task = self.orchestrator.tasks[task_id]
-
-            try:
-                task.set_status(TaskStatus.STARTED)
-                _log.info(f"Worker {self.worker_id} processing task {task_id}")
-
-                # Notify clients about task updates
-                await self.orchestrator.notify_task_subscribers(task_id)
-
-                # Notify clients about queue updates
-                await self.orchestrator.notify_queue_positions()
-
-                # Define a callback function to send progress updates to the client.
-                # TODO: send partial updates, e.g. when a document in the batch is done
-                def run_conversion():
-                    convert_sources: list[Union[str, DocumentStream]] = []
-                    headers: Optional[dict[str, Any]] = None
-                    for source in task.sources:
-                        if isinstance(source, DocumentStream):
-                            convert_sources.append(source)
-                        elif isinstance(source, FileSource):
-                            convert_sources.append(source.to_document_stream())
-                        elif isinstance(source, HttpSource):
-                            convert_sources.append(str(source.url))
-                            if headers is None and source.headers:
-                                headers = source.headers
-
-                    # Note: results are only an iterator->lazy evaluation
-                    results = convert_documents(
-                        sources=convert_sources,
-                        options=task.options,
-                        headers=headers,
-                    )
-
-                    # The real processing will happen here
-                    work_dir = get_scratch() / task_id
-                    response = process_results(
-                        conversion_options=task.options,
-                        conv_results=results,
-                        work_dir=work_dir,
-                    )
-
-                    if work_dir.exists():
-                        task.scratch_dir = work_dir
-                        if not isinstance(response, FileResponse):
-                            _log.warning(
-                                f"Task {task_id=} produced content in {work_dir=} but the response is not a file."
-                            )
-                            shutil.rmtree(work_dir, ignore_errors=True)
-
-                    return response
-
-                start_time = time.monotonic()
-
-                # Run the prediction in a thread to avoid blocking the event loop.
-                # Get the current event loop
-                # loop = asyncio.get_event_loop()
-                # future = asyncio.run_coroutine_threadsafe(
-                #     run_conversion(),
-                #     loop=loop
-                # )
-                # response = future.result()
-
-                # Run in a thread
-                response = await asyncio.to_thread(
-                    run_conversion,
-                )
-                processing_time = time.monotonic() - start_time
-
-                task.result = response
-                task.sources = []
-                task.options = None
-
-                task.set_status(TaskStatus.SUCCESS)
-                _log.info(
-                    f"Worker {self.worker_id} completed job {task_id} "
-                    f"in {processing_time:.2f} seconds"
-                )
-
-            except Exception as e:
-                _log.error(
-                    f"Worker {self.worker_id} failed to process job {task_id}: {e}"
-                )
-                task.set_status(TaskStatus.FAILURE)
-
-            finally:
-                await self.orchestrator.notify_task_subscribers(task_id)
-                self.orchestrator.task_queue.task_done()
-                _log.debug(f"Worker {self.worker_id} completely done with {task_id}")
--- a/docling_serve/engines/async_orchestrator.py
+++ b/docling_serve/engines/async_orchestrator.py
@@ -1,127 +0,0 @@
-import asyncio
-import datetime
-import logging
-import shutil
-from typing import Union
-
-from fastapi import BackgroundTasks, WebSocket
-from fastapi.responses import FileResponse
-
-from docling_serve.datamodel.callback import ProgressCallbackRequest
-from docling_serve.datamodel.engines import TaskStatus
-from docling_serve.datamodel.responses import (
-    ConvertDocumentResponse,
-    MessageKind,
-    TaskStatusResponse,
-    WebsocketMessage,
-)
-from docling_serve.datamodel.task import Task
-from docling_serve.engines.base_orchestrator import (
-    BaseOrchestrator,
-    OrchestratorError,
-    TaskNotFoundError,
-)
-from docling_serve.settings import docling_serve_settings
-
-_log = logging.getLogger(__name__)
-
-
-class ProgressInvalid(OrchestratorError):
-    pass
-
-
-class BaseAsyncOrchestrator(BaseOrchestrator):
-    def __init__(self):
-        self.tasks: dict[str, Task] = {}
-        self.task_subscribers: dict[str, set[WebSocket]] = {}
-
-    async def init_task_tracking(self, task: Task):
-        task_id = task.task_id
-        self.tasks[task.task_id] = task
-        self.task_subscribers[task_id] = set()
-
-    async def get_raw_task(self, task_id: str) -> Task:
-        if task_id not in self.tasks:
-            raise TaskNotFoundError()
-        return self.tasks[task_id]
-
-    async def task_status(self, task_id: str, wait: float = 0.0) -> Task:
-        return await self.get_raw_task(task_id=task_id)
-
-    async def task_result(
-        self, task_id: str, background_tasks: BackgroundTasks
-    ) -> Union[ConvertDocumentResponse, FileResponse, None]:
-        try:
-            task = await self.get_raw_task(task_id=task_id)
-            if task.is_completed() and docling_serve_settings.single_use_results:
-                if task.scratch_dir is not None:
-                    background_tasks.add_task(
-                        shutil.rmtree, task.scratch_dir, ignore_errors=True
-                    )
-
-                async def _remove_task_impl():
-                    await asyncio.sleep(docling_serve_settings.result_removal_delay)
-                    await self.delete_task(task_id=task.task_id)
-
-                async def _remove_task():
-                    asyncio.create_task(_remove_task_impl())  # noqa: RUF006
-
-                background_tasks.add_task(_remove_task)
-
-            return task.result
-        except TaskNotFoundError:
-            return None
-
-    async def delete_task(self, task_id: str):
-        _log.info(f"Deleting {task_id=}")
-        if task_id in self.task_subscribers:
-            for websocket in self.task_subscribers[task_id]:
-                await websocket.close()
-
-            del self.task_subscribers[task_id]
-
-        if task_id in self.tasks:
-            del self.tasks[task_id]
-
-    async def clear_results(self, older_than: float = 0.0):
-        cutoff_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(
-            seconds=older_than
-        )
-
-        tasks_to_delete = [
-            task_id
-            for task_id, task in self.tasks.items()
-            if task.finished_at is not None and task.finished_at < cutoff_time
-        ]
-        for task_id in tasks_to_delete:
-            await self.delete_task(task_id=task_id)
-
-    async def notify_task_subscribers(self, task_id: str):
-        if task_id not in self.task_subscribers:
-            raise RuntimeError(f"Task {task_id} does not have a subscribers list.")
-
-        task = await self.get_raw_task(task_id=task_id)
-        task_queue_position = await self.get_queue_position(task_id)
-        msg = TaskStatusResponse(
-            task_id=task.task_id,
-            task_status=task.task_status,
-            task_position=task_queue_position,
-            task_meta=task.processing_meta,
-        )
-        for websocket in self.task_subscribers[task_id]:
-            await websocket.send_text(
-                WebsocketMessage(message=MessageKind.UPDATE, task=msg).model_dump_json()
-            )
-            if task.is_completed():
-                await websocket.close()
-
-    async def notify_queue_positions(self):
-        for task_id in self.task_subscribers.keys():
-            # notify only pending tasks
-            if self.tasks[task_id].task_status != TaskStatus.PENDING:
-                continue
-
-            await self.notify_task_subscribers(task_id)
-
-    async def receive_task_progress(self, request: ProgressCallbackRequest):
-        raise NotImplementedError()
--- a/docling_serve/engines/async_orchestrator_factory.py
+++ b/docling_serve/engines/async_orchestrator_factory.py
@@ -1,21 +0,0 @@
-from functools import lru_cache
-
-from docling_serve.datamodel.engines import AsyncEngine
-from docling_serve.engines.async_orchestrator import BaseAsyncOrchestrator
-from docling_serve.settings import docling_serve_settings
-
-
-@lru_cache
-def get_async_orchestrator() -> BaseAsyncOrchestrator:
-    if docling_serve_settings.eng_kind == AsyncEngine.LOCAL:
-        from docling_serve.engines.async_local.orchestrator import (
-            AsyncLocalOrchestrator,
-        )
-
-        return AsyncLocalOrchestrator()
-    elif docling_serve_settings.eng_kind == AsyncEngine.KFP:
-        from docling_serve.engines.async_kfp.orchestrator import AsyncKfpOrchestrator
-
-        return AsyncKfpOrchestrator()
-
-    raise RuntimeError(f"Engine {docling_serve_settings.eng_kind} not recognized.")
--- a/docling_serve/engines/base_orchestrator.py
+++ b/docling_serve/engines/base_orchestrator.py
@@ -1,55 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import Optional, Union
-
-from fastapi import BackgroundTasks
-from fastapi.responses import FileResponse
-
-from docling_serve.datamodel.convert import ConvertDocumentsOptions
-from docling_serve.datamodel.responses import ConvertDocumentResponse
-from docling_serve.datamodel.task import Task, TaskSource
-
-
-class OrchestratorError(Exception):
-    pass
-
-
-class TaskNotFoundError(OrchestratorError):
-    pass
-
-
-class BaseOrchestrator(ABC):
-    @abstractmethod
-    async def enqueue(
-        self, sources: list[TaskSource], options: ConvertDocumentsOptions
-    ) -> Task:
-        pass
-
-    @abstractmethod
-    async def queue_size(self) -> int:
-        pass
-
-    @abstractmethod
-    async def get_queue_position(self, task_id: str) -> Optional[int]:
-        pass
-
-    @abstractmethod
-    async def task_status(self, task_id: str, wait: float = 0.0) -> Task:
-        pass
-
-    @abstractmethod
-    async def task_result(
-        self, task_id: str, background_tasks: BackgroundTasks
-    ) -> Union[ConvertDocumentResponse, FileResponse, None]:
-        pass
-
-    @abstractmethod
-    async def clear_results(self, older_than: float = 0.0):
-        pass
-
-    @abstractmethod
-    async def process_queue(self):
-        pass
-
-    @abstractmethod
-    async def warm_up_caches(self):
-        pass
--- a/docling_serve/engines/block_local/init.py
+++ b/docling_serve/engines/block_local/init.py
--- a/docling_serve/gradio_ui.py
+++ b/docling_serve/gradio_ui.py
@@ -16,7 +16,7 @@ import httpx
 from docling.datamodel.base_models import FormatToExtensions
 from docling.datamodel.pipeline_options import (
    PdfBackend,
-    PdfPipeline,
+    ProcessingPipeline,
    TableFormerMode,
    TableStructureOptions,
 )
@@ -233,15 +233,21 @@ def change_ocr_lang(ocr_engine):
        return "english,chinese"


-def wait_task_finish(task_id: str, return_as_file: bool):
+def wait_task_finish(auth: str, task_id: str, return_as_file: bool):
    conversion_sucess = False
    task_finished = False
    task_status = ""
+
+    headers = {}
+    if docling_serve_settings.api_key:
+        headers["X-Api-Key"] = str(auth)
+
    ssl_ctx = get_ssl_context()
    while not task_finished:
        try:
            response = httpx.get(
-                f"{get_api_endpoint()}/v1alpha/status/poll/{task_id}?wait=5",
+                f"{get_api_endpoint()}/v1/status/poll/{task_id}?wait=5",
+                headers=headers,
                verify=ssl_ctx,
                timeout=15,
            )
@@ -264,7 +270,8 @@ def wait_task_finish(task_id: str, return_as_file: bool):
    if conversion_sucess:
        try:
            response = httpx.get(
-                f"{get_api_endpoint()}/v1alpha/result/{task_id}",
+                f"{get_api_endpoint()}/v1/result/{task_id}",
+                headers=headers,
                timeout=15,
                verify=ssl_ctx,
            )
@@ -279,6 +286,7 @@ def wait_task_finish(task_id: str, return_as_file: bool):


 def process_url(
+    auth,
    input_sources,
    to_formats,
    image_export_mode,
@@ -296,8 +304,11 @@ def process_url(
    do_picture_classification,
    do_picture_description,
 ):
+    target = {"kind": "zip" if return_as_file else "inbody"}
    parameters = {
-        "http_sources": [{"url": source} for source in input_sources.split(",")],
+        "sources": [
+            {"kind": "http", "url": source} for source in input_sources.split(",")
+        ],
        "options": {
            "to_formats": to_formats,
            "image_export_mode": image_export_mode,
@@ -309,25 +320,32 @@ def process_url(
            "pdf_backend": pdf_backend,
            "table_mode": table_mode,
            "abort_on_error": abort_on_error,
-            "return_as_file": return_as_file,
            "do_code_enrichment": do_code_enrichment,
            "do_formula_enrichment": do_formula_enrichment,
            "do_picture_classification": do_picture_classification,
            "do_picture_description": do_picture_description,
        },
+        "target": target,
    }
    if (
-        not parameters["http_sources"]
-        or len(parameters["http_sources"]) == 0
-        or parameters["http_sources"][0]["url"] == ""
+        not parameters["sources"]
+        or len(parameters["sources"]) == 0
+        or parameters["sources"][0]["url"] == ""
    ):
        logger.error("No input sources provided.")
        raise gr.Error("No input sources provided.", print_exception=False)
+
+    headers = {}
+    if docling_serve_settings.api_key:
+        headers["X-Api-Key"] = str(auth)
+    # Log only whether a key is attached; the header value itself is a secret.
+    logger.debug("API key header attached: %s", bool(headers))
    try:
        ssl_ctx = get_ssl_context()
        response = httpx.post(
-            f"{get_api_endpoint()}/v1alpha/convert/source/async",
+            f"{get_api_endpoint()}/v1/convert/source/async",
            json=parameters,
+            headers=headers,
            verify=ssl_ctx,
            timeout=60,
        )
@@ -351,6 +369,7 @@ def file_to_base64(file):


 def process_file(
+    auth,
    files,
    to_formats,
    image_export_mode,
@@ -372,11 +391,13 @@ def process_file(
        logger.error("No files provided.")
        raise gr.Error("No files provided.", print_exception=False)
    files_data = [
-        {"base64_string": file_to_base64(file), "filename": file.name} for file in files
+        {"kind": "file", "base64_string": file_to_base64(file), "filename": file.name}
+        for file in files
    ]
+    target = {"kind": "zip" if return_as_file else "inbody"}

    parameters = {
-        "file_sources": files_data,
+        "sources": files_data,
        "options": {
            "to_formats": to_formats,
            "image_export_mode": image_export_mode,
@@ -394,13 +415,19 @@ def process_file(
            "do_picture_classification": do_picture_classification,
            "do_picture_description": do_picture_description,
        },
+        "target": target,
    }

+    headers = {}
+    if docling_serve_settings.api_key:
+        headers["X-Api-Key"] = str(auth)
+
    try:
        ssl_ctx = get_ssl_context()
        response = httpx.post(
-            f"{get_api_endpoint()}/v1alpha/convert/source/async",
+            f"{get_api_endpoint()}/v1/convert/source/async",
            json=parameters,
+            headers=headers,
            verify=ssl_ctx,
            timeout=60,
        )
@@ -474,7 +501,7 @@ with gr.Blocks(
    css=css,
    theme=theme,
    title="Docling Serve",
-    delete_cache=(3600, 3600),  # Delete all files older than 1 hour every hour
+    delete_cache=(3600, 36000),  # Delete all files older than 10 hours every hour
 ) as ui:
    # Constants stored in states to be able to pass them as inputs to functions
    processing_text = gr.State("Processing your document(s), please wait...")
@@ -559,6 +586,15 @@ with gr.Blocks(
                file_process_btn = gr.Button("Process File", scale=1)
                file_reset_btn = gr.Button("Reset", scale=1)

+    # Auth
+    with gr.Row(visible=bool(docling_serve_settings.api_key)):
+        with gr.Column():
+            auth = gr.Textbox(
+                label="Authentication",
+                placeholder="API Key",
+                type="password",
+            )
+
    # Options
    with gr.Accordion("Options") as options:
        with gr.Row():
@@ -584,12 +620,13 @@ with gr.Blocks(
                    label="Image Export Mode",
                    value="embedded",
                )
+
        with gr.Row():
            with gr.Column(scale=1, min_width=200):
                pipeline = gr.Radio(
-                    [(v.value.capitalize(), v.value) for v in PdfPipeline],
+                    [(v.value.capitalize(), v.value) for v in ProcessingPipeline],
                    label="Pipeline type",
-                    value=PdfPipeline.STANDARD.value,
+                    value=ProcessingPipeline.STANDARD.value,
                )
        with gr.Row():
            with gr.Column(scale=1, min_width=200):
@@ -718,6 +755,7 @@ with gr.Blocks(
    ).then(
        process_url,
        inputs=[
+            auth,
            url_input,
            to_formats,
            image_export_mode,
@@ -744,7 +782,7 @@ with gr.Blocks(
        outputs=[content_output, file_output],
    ).then(
        wait_task_finish,
-        inputs=[task_id_rendered, return_as_file],
+        inputs=[auth, task_id_rendered, return_as_file],
        outputs=[
            output_markdown,
            output_markdown_rendered,
@@ -805,6 +843,7 @@ with gr.Blocks(
    ).then(
        process_file,
        inputs=[
+            auth,
            file_input,
            to_formats,
            image_export_mode,
@@ -831,7 +870,7 @@ with gr.Blocks(
        outputs=[content_output, file_output],
    ).then(
        wait_task_finish,
-        inputs=[task_id_rendered, return_as_file],
+        inputs=[auth, task_id_rendered, return_as_file],
        outputs=[
            output_markdown,
            output_markdown_rendered,
--- a/docling_serve/orchestrator_factory.py
+++ b/docling_serve/orchestrator_factory.py
@@ -0,0 +1,69 @@
+from functools import lru_cache
+
+from docling_jobkit.orchestrators.base_orchestrator import BaseOrchestrator
+
+from docling_serve.settings import AsyncEngine, docling_serve_settings
+from docling_serve.storage import get_scratch
+
+
+@lru_cache
+def get_async_orchestrator() -> BaseOrchestrator:
+    if docling_serve_settings.eng_kind == AsyncEngine.LOCAL:
+        from docling_jobkit.convert.manager import (
+            DoclingConverterManager,
+            DoclingConverterManagerConfig,
+        )
+        from docling_jobkit.orchestrators.local.orchestrator import (
+            LocalOrchestrator,
+            LocalOrchestratorConfig,
+        )
+
+        local_config = LocalOrchestratorConfig(
+            num_workers=docling_serve_settings.eng_loc_num_workers,
+            shared_models=docling_serve_settings.eng_loc_share_models,
+            scratch_dir=get_scratch(),
+        )
+
+        cm_config = DoclingConverterManagerConfig(
+            artifacts_path=docling_serve_settings.artifacts_path,
+            options_cache_size=docling_serve_settings.options_cache_size,
+            enable_remote_services=docling_serve_settings.enable_remote_services,
+            allow_external_plugins=docling_serve_settings.allow_external_plugins,
+            max_num_pages=docling_serve_settings.max_num_pages,
+            max_file_size=docling_serve_settings.max_file_size,
+        )
+        cm = DoclingConverterManager(config=cm_config)
+
+        return LocalOrchestrator(config=local_config, converter_manager=cm)
+    elif docling_serve_settings.eng_kind == AsyncEngine.RQ:
+        from docling_jobkit.orchestrators.rq.orchestrator import (
+            RQOrchestrator,
+            RQOrchestratorConfig,
+        )
+
+        rq_config = RQOrchestratorConfig(
+            redis_url=docling_serve_settings.eng_rq_redis_url,
+            results_prefix=docling_serve_settings.eng_rq_results_prefix,
+            sub_channel=docling_serve_settings.eng_rq_sub_channel,
+            scratch_dir=get_scratch(),
+        )
+
+        return RQOrchestrator(config=rq_config)
+    elif docling_serve_settings.eng_kind == AsyncEngine.KFP:
+        from docling_jobkit.orchestrators.kfp.orchestrator import (
+            KfpOrchestrator,
+            KfpOrchestratorConfig,
+        )
+
+        kfp_config = KfpOrchestratorConfig(
+            endpoint=docling_serve_settings.eng_kfp_endpoint,
+            token=docling_serve_settings.eng_kfp_token,
+            ca_cert_path=docling_serve_settings.eng_kfp_ca_cert_path,
+            self_callback_endpoint=docling_serve_settings.eng_kfp_self_callback_endpoint,
+            self_callback_token_path=docling_serve_settings.eng_kfp_self_callback_token_path,
+            self_callback_ca_cert_path=docling_serve_settings.eng_kfp_self_callback_ca_cert_path,
+        )
+
+        return KfpOrchestrator(config=kfp_config)
+
+    raise RuntimeError(f"Engine {docling_serve_settings.eng_kind} not recognized.")
--- a/docling_serve/response_preparation.py
+++ b/docling_serve/response_preparation.py
@@ -1,232 +1,69 @@
+import asyncio
 import logging
-import os
-import shutil
-import time
-from collections.abc import Iterable
-from pathlib import Path
-from typing import Union

-from fastapi import HTTPException
-from fastapi.responses import FileResponse
+from fastapi import BackgroundTasks, Response

-from docling.datamodel.base_models import OutputFormat
-from docling.datamodel.document import ConversionResult, ConversionStatus
-from docling_core.types.doc import ImageRefMode
+from docling_jobkit.datamodel.result import (
+    ConvertDocumentResult,
+    ExportResult,
+    RemoteTargetResult,
+    ZipArchiveResult,
+)
+from docling_jobkit.orchestrators.base_orchestrator import (
+    BaseOrchestrator,
+)

-from docling_serve.datamodel.convert import ConvertDocumentsOptions
-from docling_serve.datamodel.responses import ConvertDocumentResponse, DocumentResponse
+from docling_serve.datamodel.responses import (
+    ConvertDocumentResponse,
+    PresignedUrlConvertDocumentResponse,
+)
+from docling_serve.settings import docling_serve_settings

 _log = logging.getLogger(__name__)


-def _export_document_as_content(
-    conv_res: ConversionResult,
-    export_json: bool,
-    export_html: bool,
-    export_md: bool,
-    export_txt: bool,
-    export_doctags: bool,
-    image_mode: ImageRefMode,
-    md_page_break_placeholder: str,
+async def prepare_response(
+    task_id: str,
+    task_result: ConvertDocumentResult,
+    orchestrator: BaseOrchestrator,
+    background_tasks: BackgroundTasks,
 ):
-    document = DocumentResponse(filename=conv_res.input.file.name)
-
-    if conv_res.status == ConversionStatus.SUCCESS:
-        new_doc = conv_res.document._make_copy_with_refmode(Path(), image_mode)
-
-        # Create the different formats
-        if export_json:
-            document.json_content = new_doc
-        if export_html:
-            document.html_content = new_doc.export_to_html(image_mode=image_mode)
-        if export_txt:
-            document.text_content = new_doc.export_to_markdown(
-                strict_text=True,
-                image_mode=image_mode,
-            )
-        if export_md:
-            document.md_content = new_doc.export_to_markdown(
-                image_mode=image_mode,
-                page_break_placeholder=md_page_break_placeholder or None,
-            )
-        if export_doctags:
-            document.doctags_content = new_doc.export_to_doctags()
-    elif conv_res.status == ConversionStatus.SKIPPED:
-        raise HTTPException(status_code=400, detail=conv_res.errors)
-    else:
-        raise HTTPException(status_code=500, detail=conv_res.errors)
-
-    return document
-
-
-def _export_documents_as_files(
-    conv_results: Iterable[ConversionResult],
-    output_dir: Path,
-    export_json: bool,
-    export_html: bool,
-    export_md: bool,
-    export_txt: bool,
-    export_doctags: bool,
-    image_export_mode: ImageRefMode,
-    md_page_break_placeholder: str,
-):
-    success_count = 0
-    failure_count = 0
-
-    for conv_res in conv_results:
-        if conv_res.status == ConversionStatus.SUCCESS:
-            success_count += 1
-            doc_filename = conv_res.input.file.stem
-
-            # Export JSON format:
-            if export_json:
-                fname = output_dir / f"{doc_filename}.json"
-                _log.info(f"writing JSON output to {fname}")
-                conv_res.document.save_as_json(
-                    filename=fname, image_mode=image_export_mode
-                )
-
-            # Export HTML format:
-            if export_html:
-                fname = output_dir / f"{doc_filename}.html"
-                _log.info(f"writing HTML output to {fname}")
-                conv_res.document.save_as_html(
-                    filename=fname, image_mode=image_export_mode
-                )
-
-            # Export Text format:
-            if export_txt:
-                fname = output_dir / f"{doc_filename}.txt"
-                _log.info(f"writing TXT output to {fname}")
-                conv_res.document.save_as_markdown(
-                    filename=fname,
-                    strict_text=True,
-                    image_mode=ImageRefMode.PLACEHOLDER,
-                )
-
-            # Export Markdown format:
-            if export_md:
-                fname = output_dir / f"{doc_filename}.md"
-                _log.info(f"writing Markdown output to {fname}")
-                conv_res.document.save_as_markdown(
-                    filename=fname,
-                    image_mode=image_export_mode,
-                    page_break_placeholder=md_page_break_placeholder or None,
-                )
-
-            # Export Document Tags format:
-            if export_doctags:
-                fname = output_dir / f"{doc_filename}.doctags"
-                _log.info(f"writing Doc Tags output to {fname}")
-                conv_res.document.save_as_document_tokens(filename=fname)
-
-        else:
-            _log.warning(f"Document {conv_res.input.file} failed to convert.")
-            failure_count += 1
-
-    _log.info(
-        f"Processed {success_count + failure_count} docs, "
-        f"of which {failure_count} failed"
-    )
-
-
-def process_results(
-    conversion_options: ConvertDocumentsOptions,
-    conv_results: Iterable[ConversionResult],
-    work_dir: Path,
-) -> Union[ConvertDocumentResponse, FileResponse]:
-    # Let's start by processing the documents
-    try:
-        start_time = time.monotonic()
-
-        # Convert the iterator to a list to count the number of results and get timings
-        # As it's an iterator (lazy evaluation), it will also start the conversion
-        conv_results = list(conv_results)
-
-        processing_time = time.monotonic() - start_time
-
-        _log.info(
-            f"Processed {len(conv_results)} docs in {processing_time:.2f} seconds."
-        )
-
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-
-    if len(conv_results) == 0:
-        raise HTTPException(
-            status_code=500, detail="No documents were generated by Docling."
-        )
-
-    # We have some results, let's prepare the response
-    response: Union[FileResponse, ConvertDocumentResponse]
-
-    # Booleans to know what to export
-    export_json = OutputFormat.JSON in conversion_options.to_formats
-    export_html = OutputFormat.HTML in conversion_options.to_formats
-    export_md = OutputFormat.MARKDOWN in conversion_options.to_formats
-    export_txt = OutputFormat.TEXT in conversion_options.to_formats
-    export_doctags = OutputFormat.DOCTAGS in conversion_options.to_formats
-
-    # Only 1 document was processed, and we are not returning it as a file
-    if len(conv_results) == 1 and not conversion_options.return_as_file:
-        conv_res = conv_results[0]
-        document = _export_document_as_content(
-            conv_res,
-            export_json=export_json,
-            export_html=export_html,
-            export_md=export_md,
-            export_txt=export_txt,
-            export_doctags=export_doctags,
-            image_mode=conversion_options.image_export_mode,
-            md_page_break_placeholder=conversion_options.md_page_break_placeholder,
-        )
-
+    response: Response | ConvertDocumentResponse | PresignedUrlConvertDocumentResponse
+    if isinstance(task_result.result, ExportResult):
        response = ConvertDocumentResponse(
-            document=document,
-            status=conv_res.status,
-            processing_time=processing_time,
-            timings=conv_res.timings,
+            document=task_result.result.content,
+            status=task_result.result.status,
+            processing_time=task_result.processing_time,
+            timings=task_result.result.timings,
+            errors=task_result.result.errors,
+        )
+    elif isinstance(task_result.result, ZipArchiveResult):
+        response = Response(
+            content=task_result.result.content,
+            media_type="application/zip",
+            headers={
+                "Content-Disposition": 'attachment; filename="converted_docs.zip"'
+            },
+        )
+    elif isinstance(task_result.result, RemoteTargetResult):
+        response = PresignedUrlConvertDocumentResponse(
+            processing_time=task_result.processing_time,
+            num_converted=task_result.num_converted,
+            num_succeeded=task_result.num_succeeded,
+            num_failed=task_result.num_failed,
        )
-
-    # Multiple documents were processed, or we are forced returning as a file
    else:
-        # Temporary directory to store the outputs
-        output_dir = work_dir / "output"
-        output_dir.mkdir(parents=True, exist_ok=True)
+        raise ValueError("Unknown result type")

-        # Worker pid to use in archive identification as we may have multiple workers
-        os.getpid()
+    if docling_serve_settings.single_use_results:

-        # Export the documents
-        _export_documents_as_files(
-            conv_results=conv_results,
-            output_dir=output_dir,
-            export_json=export_json,
-            export_html=export_html,
-            export_md=export_md,
-            export_txt=export_txt,
-            export_doctags=export_doctags,
-            image_export_mode=conversion_options.image_export_mode,
-            md_page_break_placeholder=conversion_options.md_page_break_placeholder,
-        )
+        async def _remove_task_impl():
+            await asyncio.sleep(docling_serve_settings.result_removal_delay)
+            await orchestrator.delete_task(task_id=task_id)

-        files = os.listdir(output_dir)
-        if len(files) == 0:
-            raise HTTPException(status_code=500, detail="No documents were exported.")
+        async def _remove_task():
+            asyncio.create_task(_remove_task_impl())  # noqa: RUF006

-        file_path = work_dir / "converted_docs.zip"
-        shutil.make_archive(
-            base_name=str(file_path.with_suffix("")),
-            format="zip",
-            root_dir=output_dir,
-        )
-
-        # Other cleanups after the response is sent
-        # Output directory
-        # background_tasks.add_task(shutil.rmtree, work_dir, ignore_errors=True)
-
-        response = FileResponse(
-            file_path, filename=file_path.name, media_type="application/zip"
-        )
+        background_tasks.add_task(_remove_task)

    return response
--- a/docling_serve/settings.py
+++ b/docling_serve/settings.py
@@ -1,3 +1,4 @@
+import enum
 import sys
 from pathlib import Path
 from typing import Optional, Union
@@ -6,8 +7,6 @@ from pydantic import AnyUrl, model_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from typing_extensions import Self

-from docling_serve.datamodel.engines import AsyncEngine
-

 class UvicornSettings(BaseSettings):
    model_config = SettingsConfigDict(
@@ -26,6 +25,12 @@ class UvicornSettings(BaseSettings):
    workers: Union[int, None] = None


+class AsyncEngine(str, enum.Enum):  # compute engines selectable via `eng_kind`
+    LOCAL = "local"  # local worker threads
+    KFP = "kfp"  # Kubeflow Pipelines (experimental)
+    RQ = "rq"  # RQ workers, requires a Redis url
+
+
 class DoclingServeSettings(BaseSettings):
    model_config = SettingsConfigDict(
        env_prefix="DOCLING_SERVE_",
@@ -41,10 +46,13 @@ class DoclingServeSettings(BaseSettings):
    scratch_path: Optional[Path] = None
    single_use_results: bool = True
    result_removal_delay: float = 300  # 5 minutes
+    load_models_at_boot: bool = True
    options_cache_size: int = 2
    enable_remote_services: bool = False
    allow_external_plugins: bool = False

+    api_key: str = ""
+
    max_document_timeout: float = 3_600 * 24 * 7  # 7 days
    max_num_pages: int = sys.maxsize
    max_file_size: int = sys.maxsize
@@ -58,6 +66,11 @@ class DoclingServeSettings(BaseSettings):
    eng_kind: AsyncEngine = AsyncEngine.LOCAL
    # Local engine
    eng_loc_num_workers: int = 2
+    eng_loc_share_models: bool = False
+    # RQ engine
+    eng_rq_redis_url: str = ""
+    eng_rq_results_prefix: str = "docling:results"
+    eng_rq_sub_channel: str = "docling:updates"
    # KFP engine
    eng_kfp_endpoint: Optional[AnyUrl] = None
    eng_kfp_token: Optional[str] = None
@@ -81,6 +94,10 @@ class DoclingServeSettings(BaseSettings):
                    "KFP is not yet working. To enable the development version, you must set DOCLING_SERVE_ENG_KFP_EXPERIMENTAL=true."
                )

+        if self.eng_kind == AsyncEngine.RQ:
+            if not self.eng_rq_redis_url:
+                raise ValueError("RQ Redis url is required when using the RQ engine.")
+
        return self


--- a/docling_serve/websocket_notifier.py
+++ b/docling_serve/websocket_notifier.py
@@ -0,0 +1,78 @@
+from fastapi import WebSocket
+
+from docling_jobkit.datamodel.task_meta import TaskStatus
+from docling_jobkit.orchestrators.base_notifier import BaseNotifier
+from docling_jobkit.orchestrators.base_orchestrator import BaseOrchestrator
+
+from docling_serve.datamodel.responses import (
+    MessageKind,
+    TaskStatusResponse,
+    WebsocketMessage,
+)
+
+
+class WebsocketNotifier(BaseNotifier):
+    """Sends task status updates to the websockets subscribed to each task."""
+
+    def __init__(self, orchestrator: BaseOrchestrator):
+        super().__init__(orchestrator)
+        # Maps a task id to the websockets receiving the updates of that task.
+        self.task_subscribers: dict[str, set[WebSocket]] = {}
+
+    async def add_task(self, task_id: str):
+        """Register `task_id` with an empty set of subscribers."""
+        self.task_subscribers[task_id] = set()
+
+    async def remove_task(self, task_id: str):
+        """Close every websocket subscribed to `task_id` and drop its entry."""
+        if task_id in self.task_subscribers:
+            # Iterate over a snapshot: `close()` yields to the event loop and
+            # the set may be modified by other coroutines in the meantime.
+            for websocket in list(self.task_subscribers[task_id]):
+                await websocket.close()
+
+            # The entry may already be gone if a concurrent call removed it.
+            self.task_subscribers.pop(task_id, None)
+
+    async def notify_task_subscribers(self, task_id: str):
+        """Send the current status of `task_id` to all its subscribers.
+
+        The websockets are closed after the update when the task is completed.
+
+        Raises:
+            RuntimeError: if `task_id` has no subscribers list.
+        """
+        if task_id not in self.task_subscribers:
+            raise RuntimeError(f"Task {task_id} does not have a subscribers list.")
+
+        task = await self.orchestrator.get_raw_task(task_id=task_id)
+        task_queue_position = await self.orchestrator.get_queue_position(task_id)
+        msg = TaskStatusResponse(
+            task_id=task.task_id,
+            task_status=task.task_status,
+            task_position=task_queue_position,
+            task_meta=task.processing_meta,
+        )
+        # The payload is the same for every subscriber: serialize it only once.
+        payload = WebsocketMessage(
+            message=MessageKind.UPDATE, task=msg
+        ).model_dump_json()
+        # Snapshot the subscribers: the awaits above and below yield to the
+        # event loop, so the set (or the whole entry) may change meanwhile.
+        for websocket in list(self.task_subscribers.get(task_id, ())):
+            await websocket.send_text(payload)
+            if task.is_completed():
+                await websocket.close()
+
+    async def notify_queue_positions(self):
+        """Send a status update to the subscribers of every pending task."""
+        # Snapshot the ids: tasks may be added or removed while awaiting.
+        for task_id in list(self.task_subscribers):
+            if task_id not in self.task_subscribers:
+                continue  # removed while an earlier notification was awaited
+
+            # notify only pending tasks
+            if self.orchestrator.tasks[task_id].task_status != TaskStatus.PENDING:
+                continue
+
+            await self.notify_task_subscribers(task_id)
--- a/docs/README.md
+++ b/docs/README.md
@@ -3,6 +3,9 @@
 This documentation pages explore the webserver configurations, runtime options, deployment examples as well as development best practices.

 - [Configuration](./configuration.md)
- [Advance usage](./usage.md)
+- [Handling models](./models.md)
+- [Usage](./usage.md)
 - [Deployment](./deployment.md)
+- [MCP](./mcp.md)
 - [Development](./development.md)
+- [`v1` migration](./v1_migration.md)
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -7,7 +7,7 @@ server and the actual app-specific configurations.

 > [!WARNING]
 > When the server is running with `reload` or with multiple `workers`, uvicorn
-> will spawn multiple subprocessed. This invalides all the values configured
+> will spawn multiple subprocesses. This invalidates all the values configured
 > via the CLI command line options. Please use environment variables in this
 > type of deployments.

@@ -36,7 +36,7 @@ THe following table describes the options to configure the Docling Serve app.
 | CLI option | ENV | Default | Description |
 | -----------|-----|---------|-------------|
 | `--artifacts-path` | `DOCLING_SERVE_ARTIFACTS_PATH` | unset | If set to a valid directory, the model weights will be loaded from this path |
-|  | `DOCLING_SERVE_STATIC_PATH` | unset | If set to a valid directory, the static assets for the docs and ui will be loaded from this path |
+|  | `DOCLING_SERVE_STATIC_PATH` | unset | If set to a valid directory, the static assets for the docs and UI will be loaded from this path |
 |  | `DOCLING_SERVE_SCRATCH_PATH` |  | If set, this directory will be used as scratch workspace, e.g. storing the results before they get requested. If unset, a temporary created is created for this purpose. |
 | `--enable-ui` | `DOCLING_SERVE_ENABLE_UI` | `false` | Enable the demonstrator UI. |
 |  | `DOCLING_SERVE_ENABLE_REMOTE_SERVICES` | `false` | Allow pipeline components making remote connections. For example, this is needed when using a vision-language model via APIs. |
@@ -44,14 +44,17 @@ THe following table describes the options to configure the Docling Serve app.
 |  | `DOCLING_SERVE_SINGLE_USE_RESULTS` | `true` | If true, results can be accessed only once. If false, the results accumulate in the scratch directory. |
 |  | `DOCLING_SERVE_RESULT_REMOVAL_DELAY` | `300` | When `DOCLING_SERVE_SINGLE_USE_RESULTS` is active, this is the delay before results are removed from the task registry. |
 |  | `DOCLING_SERVE_MAX_DOCUMENT_TIMEOUT` | `604800` (7 days) | The maximum time for processing a document. |
+|  | `DOCLING_NUM_THREADS` | `4` | Number of concurrent threads for processing a document. |
 |  | `DOCLING_SERVE_MAX_NUM_PAGES` |  | The maximum number of pages for a document to be processed. |
 |  | `DOCLING_SERVE_MAX_FILE_SIZE` |  | The maximum file size for a document to be processed. |
 |  | `DOCLING_SERVE_MAX_SYNC_WAIT` | `120` | Max number of seconds a synchronous endpoint is waiting for the task completion. |
+|  | `DOCLING_SERVE_LOAD_MODELS_AT_BOOT` | `True` | If enabled, the models for the default options will be loaded at boot. |
 |  | `DOCLING_SERVE_OPTIONS_CACHE_SIZE` | `2` | How many DocumentConveter objects (including their loaded models) to keep in the cache. |
 |  | `DOCLING_SERVE_CORS_ORIGINS` | `["*"]` | A list of origins that should be permitted to make cross-origin requests. |
 |  | `DOCLING_SERVE_CORS_METHODS` | `["*"]` | A list of HTTP methods that should be allowed for cross-origin requests. |
 |  | `DOCLING_SERVE_CORS_HEADERS` | `["*"]` | A list of HTTP request headers that should be supported for cross-origin requests. |
-|  | `DOCLING_SERVE_ENG_KIND` | `local` | The compute engine to use for the async tasks. Possible values are `local` and `kfp`. See below for more configurations of the engines. |
+|  | `DOCLING_SERVE_API_KEY` | | If specified, all the API requests must contain the header `X-Api-Key` with this value. |
+|  | `DOCLING_SERVE_ENG_KIND` | `local` | The compute engine to use for the async tasks. Possible values are `local`, `rq` and `kfp`. See below for more configurations of the engines. |

 ### Compute engine

@@ -60,11 +63,22 @@ The selected compute engine will be running all the async jobs.

 #### Local engine

-The following table describes the options to configure the Docling Serve KFP engine.
+The following table describes the options to configure the Docling Serve local engine.

 | ENV | Default | Description |
 |-----|---------|-------------|
 | `DOCLING_SERVE_ENG_LOC_NUM_WORKERS` | 2 | Number of workers/threads processing the incoming tasks. |
+| `DOCLING_SERVE_ENG_LOC_SHARE_MODELS` | False | If true, each process will share the same models among all thread workers. Otherwise, one instance of the models is allocated for each worker thread. |
+
+#### RQ engine
+
+The following table describes the options to configure the Docling Serve RQ engine.
+
+| ENV | Default | Description |
+|-----|---------|-------------|
+| `DOCLING_SERVE_ENG_RQ_REDIS_URL` | (required) | The Redis connection URL, e.g. `redis://localhost:6373/` |
+| `DOCLING_SERVE_ENG_RQ_RESULTS_PREFIX` | `docling:results` | The prefix used for storing the results in Redis. |
+| `DOCLING_SERVE_ENG_RQ_SUB_CHANNEL` | `docling:updates` | The channel key name used for communicating updates between the workers and the orchestrator. |

 #### KFP engine

@@ -75,6 +89,13 @@ The following table describes the options to configure the Docling Serve KFP eng
 | `DOCLING_SERVE_ENG_KFP_ENDPOINT` |  | Must be set to the Kubeflow Pipeline endpoint. When using the in-cluster deployment, make sure to use the cluster endpoint, e.g. `https://NAME.NAMESPACE.svc.cluster.local:8888`  |
 | `DOCLING_SERVE_ENG_KFP_TOKEN` |  | The authentication token for KFP. For in-cluster deployment, the app will load automatically the token of the ServiceAccount. |
 | `DOCLING_SERVE_ENG_KFP_CA_CERT_PATH` |  | Path to the CA certificates for the KFP endpoint. For in-cluster deployment, the app will load automatically the internal CA. |
-| `DOCLING_SERVE_ENG_KFP_SELF_CALLBACK_ENDPOINT` |  | If set, it enables internal callbacks providing status update of the KFP job. Usually something like `https://NAME.NAMESPACE.svc.cluster.local:5001/v1alpha/callback/task/progress`. |
+| `DOCLING_SERVE_ENG_KFP_SELF_CALLBACK_ENDPOINT` |  | If set, it enables internal callbacks providing status update of the KFP job. Usually something like `https://NAME.NAMESPACE.svc.cluster.local:5001/v1/callback/task/progress`. |
 | `DOCLING_SERVE_ENG_KFP_SELF_CALLBACK_TOKEN_PATH` |  | The token used for authenticating the progress callback. For cluster-internal workloads, use `/run/secrets/kubernetes.io/serviceaccount/token`. |
 | `DOCLING_SERVE_ENG_KFP_SELF_CALLBACK_CA_CERT_PATH` |  | The CA certificate for the progress callback. For cluster-inetrnal workloads, use `/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt`. |
+
+#### Gradio UI
+
+When using the Gradio UI with the option to output the conversion as a file, Gradio uses a cache to prevent files from being overwritten ([more info here](https://www.gradio.app/guides/file-access#the-gradio-cache)). The cache is cleaned every hour, removing files older than 10 hours. If files older than 10 hours need to remain available for download from the UI, there are two options:
+
+- Increase the maximum age of the files to keep [here](https://github.com/docling-project/docling-serve/blob/main/docling_serve/gradio_ui.py#L483) to the desired value;
+- Or manage the clean-up manually by pointing the Gradio temporary directory to the same absolute path as `DOCLING_SERVE_SCRATCH_PATH`. This is done by setting the environment variable `GRADIO_TEMP_DIR`, either on the command line with `export GRADIO_TEMP_DIR="<same_path_as_scratch>"` or in the `Dockerfile` with `ENV GRADIO_TEMP_DIR="<same_path_as_scratch>"`. After this, set the cache clean-up to `None` [here](https://github.com/docling-project/docling-serve/blob/main/docling_serve/gradio_ui.py#L483). Cleaning up `DOCLING_SERVE_SCRATCH_PATH` will then also clean the Gradio temporary directory. (If you use this option, remember to remove the environment variable `GRADIO_TEMP_DIR` when reverting these changes; otherwise files may not be available for download).
--- a/docs/deploy-examples/compose-amd.yaml
+++ b/docs/deploy-examples/compose-amd.yaml
@@ -0,0 +1,21 @@
+# AMD ROCm deployment
+
+services:
+  docling-serve:
+    image: ghcr.io/docling-project/docling-serve-rocm:main
+    container_name: docling-serve
+    ports:
+      - "5001:5001"
+    environment:
+      DOCLING_SERVE_ENABLE_UI: "true"
+      ROCR_VISIBLE_DEVICES: "0" # https://rocm.docs.amd.com/en/latest/conceptual/gpu-isolation.html#rocr-visible-devices
+      ## This section is for compatibility with older cards
+      # HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+      # HSA_ENABLE_SDMA: "0"
+    devices:
+      - /dev/kfd:/dev/kfd
+      - /dev/dri:/dev/dri
+    group_add:
+      - 44    # video group GID from host
+      - 992   # render group GID from host
+    restart: always
--- a/docs/deploy-examples/compose-gpu.yaml
+++ b/docs/deploy-examples/compose-gpu.yaml
@@ -1,15 +0,0 @@
-services:
-  docling:
-    image: ghcr.io/docling-project/docling-serve-cu124
-    container_name: docling-serve
-    ports:
-      - 5001:5001
-    environment:
-      - DOCLING_SERVE_ENABLE_UI=true
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            count: all # nvidia-smi 
-            capabilities: [gpu]
--- a/docs/deploy-examples/compose-nvidia.yaml
+++ b/docs/deploy-examples/compose-nvidia.yaml
@@ -0,0 +1,20 @@
+# NVIDIA CUDA deployment
+
+services:
+  docling-serve:
+    image: ghcr.io/docling-project/docling-serve-cu126:main
+    container_name: docling-serve
+    ports:
+      - "5001:5001"
+    environment:
+      DOCLING_SERVE_ENABLE_UI: "true"
+      NVIDIA_VISIBLE_DEVICES: "all" # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
+    # deploy:  # This section is for compatibility with Swarm
+    #   resources:
+    #     reservations:
+    #       devices:
+    #         - driver: nvidia
+    #           count: all
+    #           capabilities: [gpu]
+    runtime: nvidia
+    restart: always
--- a/docs/deploy-examples/docling-model-cache-deployment.yaml
+++ b/docs/deploy-examples/docling-model-cache-deployment.yaml
@@ -22,8 +22,8 @@ spec:
        - name: api
          resources:
            limits:
-              cpu: 500m
-              memory: 2Gi
+              cpu: 2
+              memory: 4Gi
            requests:
              cpu: 250m
              memory: 1Gi
--- a/docs/deploy-examples/docling-serve-oauth.yaml
+++ b/docs/deploy-examples/docling-serve-oauth.yaml
@@ -85,7 +85,7 @@ spec:
          resources:
            limits:
              cpu: 2000m
-              memory: 2Gi
+              memory: 4Gi
            requests:
              cpu: 800m
              memory: 1Gi
--- a/docs/deploy-examples/docling-serve-replicas-w-sticky-sessions.yaml
+++ b/docs/deploy-examples/docling-serve-replicas-w-sticky-sessions.yaml
@@ -60,8 +60,8 @@ spec:
        - name: api
          resources:
            limits:
-              cpu: 500m
-              memory: 2Gi
+              cpu: 1
+              memory: 4Gi
            requests:
              cpu: 250m
              memory: 1Gi
--- a/docs/deploy-examples/docling-serve-rq-workers.yaml
+++ b/docs/deploy-examples/docling-serve-rq-workers.yaml
@@ -0,0 +1,192 @@
+# This example deployment configures Docling Serve with a Service and RQ workers
+
+# Create following secret
+# kubectl create secret generic docling-serve-rq-secrets --from-literal=REDIS_PASSWORD=myredispassword --from-literal=RQ_REDIS_URL=redis://:myredispassword@docling-serve-redis-service:6373/
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: docling-serve
+  labels:
+    app: docling-serve
+    component: docling-serve-api
+spec:
+  ports:
+  - name: http
+    port: 5001
+    targetPort: http
+  selector:
+    app: docling-serve
+    component: docling-serve-api
+---
+kind: Deployment
+apiVersion: apps/v1
+metadata:
+  name: docling-serve
+  labels:
+    app: docling-serve
+    component: docling-serve-api
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: docling-serve
+      component: docling-serve-api
+  template:
+    metadata:
+      labels:
+        app: docling-serve
+        component: docling-serve-api
+    spec:
+      restartPolicy: Always
+      containers:
+        - name: api
+          resources:
+            limits:
+              cpu: 1
+              memory: 8Gi
+            requests:
+              cpu: 250m
+              memory: 1Gi
+          env:
+            - name: DOCLING_SERVE_ENABLE_UI
+              value: 'true'
+            - name: DOCLING_SERVE_ENG_KIND
+              value: 'rq'
+            - name: DOCLING_SERVE_ENG_RQ_REDIS_URL
+              valueFrom:
+                secretKeyRef:
+                  name: docling-serve-rq-secrets
+                  key: RQ_REDIS_URL
+          ports:
+            - name: http
+              containerPort: 5001
+              protocol: TCP
+          imagePullPolicy: Always
+          image: 'ghcr.io/docling-project/docling-serve-cpu'
+---
+kind: Deployment
+apiVersion: apps/v1
+metadata:
+  name: docling-serve-rq-workers
+  labels:
+    app: docling-serve-rq-workers
+    component: docling-serve-rq-worker
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      app: docling-serve-rq-workers
+      component: docling-serve-rq-worker
+  template:
+    metadata:
+      labels:
+        app: docling-serve-rq-workers
+        component: docling-serve-rq-worker
+    spec:
+      restartPolicy: Always
+      containers:
+        - name: worker
+          resources:
+            limits:
+              cpu: 1
+              memory: 4Gi
+            requests:
+              cpu: 250m
+              memory: 1Gi
+          env:
+            - name: DOCLING_SERVE_ENG_KIND
+              value: 'rq'
+            - name: DOCLING_SERVE_ENG_RQ_REDIS_URL
+              valueFrom:
+                secretKeyRef:
+                  name: docling-serve-rq-secrets
+                  key: RQ_REDIS_URL
+          ports:
+            - name: http
+              containerPort: 5001
+              protocol: TCP
+          imagePullPolicy: Always
+          image: 'ghcr.io/docling-project/docling-serve-cpu'
+          command: ["docling-serve"]
+          args: ["rq-worker"]
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: docling-serve-redis
+  labels:
+    app: docling-serve-redis
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: docling-serve-redis
+  template:
+    metadata:
+      labels:
+        app: docling-serve-redis
+    spec:
+      restartPolicy: Always
+      terminationGracePeriodSeconds: 30
+      containers:
+        - name: redis
+          resources:
+            limits:
+              cpu: 1
+              memory: 1Gi
+            requests:
+              cpu: 250m
+              memory: 100Mi
+          image: redis:latest
+          command: ["redis-server"]
+          args:
+            - "--port"
+            - "6373"
+            - "--dir"
+            - "/mnt/redis/data"
+            - "--appendonly"
+            - "yes"
+            - "--requirepass"
+            - "$(REDIS_PASSWORD)"
+          ports:
+            - containerPort: 6373
+          env:
+            - name: REDIS_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: docling-serve-rq-secrets
+                  key: REDIS_PASSWORD
+          volumeMounts:
+            - name: redis-data
+              mountPath: /mnt/redis/data
+          securityContext:
+            fsGroup: 1004
+            runAsNonRoot: true
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+            seccompProfile:
+              type: RuntimeDefault
+      volumes:
+        - name: redis-data
+          emptyDir:
+            medium: Memory
+            sizeLimit: 2Gi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: docling-serve-redis-service
+  labels:
+      app: docling-serve-redis
+spec:
+  type: NodePort
+  ports:
+    - name: redis-service
+      protocol: TCP
+      port: 6373
+      targetPort: 6373
+  selector:
+    app: docling-serve-redis
--- a/docs/deploy-examples/docling-serve-simple.yaml
+++ b/docs/deploy-examples/docling-serve-simple.yaml
@@ -40,8 +40,8 @@ spec:
        - name: api
          resources:
            limits:
-              cpu: 500m
-              memory: 2Gi
+              cpu: 1
+              memory: 4Gi
              nvidia.com/gpu: 1  # Limit to one GPU
            requests:
              cpu: 250m
--- a/docs/deployment.md
+++ b/docs/deployment.md
@@ -4,16 +4,17 @@ This document provides deployment examples for running the application in differ

 Choose the deployment option that best fits your setup.

- **[Local GPU](#local-gpu)**: For deploying the application locally on a machine with a NVIDIA GPU (using Docker Compose).
+- **[Local GPU NVIDIA](#local-gpu-nvidia)**: For deploying the application locally on a machine with a supported NVIDIA GPU (using Docker Compose).
+- **[Local GPU AMD](#local-gpu-amd)**: For deploying the application locally on a machine with a supported AMD GPU (using Docker Compose).
 - **[OpenShift](#openshift)**: For deploying the application on an OpenShift cluster, designed for cloud-native environments.

 ---

-## Local GPU
+## Local GPU NVIDIA

 ### Docker compose

-Manifest example: [compose-gpu.yaml](./deploy-examples/compose-gpu.yaml)
+Manifest example: [compose-nvidia.yaml](./deploy-examples/compose-nvidia.yaml)

 This deployment has the following features:

@@ -22,7 +23,7 @@ This deployment has the following features:
 Install the app with:

 ```sh
-docker compose -f docs/deploy-examples/compose-gpu.yaml up -d
+docker compose -f docs/deploy-examples/compose-nvidia.yaml up -d
 ```

 For using the API:
@@ -30,11 +31,11 @@ For using the API:
 ```sh
 # Make a test query
 curl -X 'POST' \
-  "localhost:5001/v1alpha/convert/source/async" \
+  "localhost:5001/v1/convert/source/async" \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
-    "http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}]
+    "sources": [{"kind": "http", "url": "https://arxiv.org/pdf/2501.17887"}]
  }'
 ```

@@ -56,7 +57,7 @@ Docs:
 <details>
 <summary><b>Steps</b></summary>

-1. Check driver version and which GPU you want to use (0/1/2/3.. and update [compose-gpu.yaml](./deploy-examples/compose-gpu.yaml) file or use `count: all`)
+1. Check driver version and which GPU you want to use 0/1/2/n (and update [compose-nvidia.yaml](./deploy-examples/compose-nvidia.yaml) file or use `count: all`)

    ```sh
    nvidia-smi
@@ -117,7 +118,75 @@ Docs:
 5. Run the container:

    ```sh
-    docker compose -f docs/deploy-examples/compose-gpu.yaml up -d
+    docker compose -f docs/deploy-examples/compose-nvidia.yaml up -d
+    ```
+
+</details>
+
+## Local GPU AMD
+
+### Docker compose
+
+Manifest example: [compose-amd.yaml](./deploy-examples/compose-amd.yaml)
+
+This deployment has the following features:
+
+- AMD ROCm enabled
+
+Install the app with:
+
+```sh
+docker compose -f docs/deploy-examples/compose-amd.yaml up -d
+```
+
+For using the API:
+
+```sh
+# Make a test query
+curl -X 'POST' \
+  "localhost:5001/v1/convert/source/async" \
+  -H "accept: application/json" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "sources": [{"kind": "http", "url": "https://arxiv.org/pdf/2501.17887"}]
+  }'
+```
+
+<details>
+<summary><b>Requirements</b></summary>
+
+- debian/ubuntu/rhel/fedora/opensuse
+- docker
+- AMDGPU driver >=6.3
+- AMD ROCm >=6.3
+
+Docs:
+
+- [AMD ROCm installation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html)
+
+</details>
+
+<details>
+<summary><b>Steps</b></summary>
+
+1. Check driver version and which GPU you want to use 0/1/2/n (and update [compose-amd.yaml](./deploy-examples/compose-amd.yaml) file)
+
+    ```sh
+    rocm-smi --showdriverversion
+    rocminfo | grep -i "ROCm version"
+    ```
+
+2. Find both video group GID and render group GID from host (and update [compose-amd.yaml](./deploy-examples/compose-amd.yaml) file)
+
+    ```sh
+    getent group video
+    getent group render
+    ```
+
+3. Build the image locally (and update [compose-amd.yaml](./deploy-examples/compose-amd.yaml) file)
+
+    ```sh
+    make docling-serve-rocm-image
    ```

 </details>
@@ -148,14 +217,39 @@ oc port-forward svc/docling-serve 5001:5001

 # Make a test query
 curl -X 'POST' \
-  "localhost:5001/v1alpha/convert/source/async" \
+  "localhost:5001/v1/convert/source/async" \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
-    "http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}]
+    "sources": [{"kind": "http", "url": "https://arxiv.org/pdf/2501.17887"}]
  }'
 ```

+### Multiple workers with RQ
+
+Manifest example: [`docling-serve-rq-workers.yaml`](./deploy-examples/docling-serve-rq-workers.yaml)
+
+This deployment example has the following features:
+
+- Deployment configuration
+- Service configuration
+- Redis deployment
+- Multiple (2 by default) worker Pods
+
+Install the app with:
+
+- create k8s secret:
+
+```sh
+kubectl create secret generic docling-serve-rq-secrets --from-literal=REDIS_PASSWORD=myredispassword --from-literal=RQ_REDIS_URL=redis://:myredispassword@docling-serve-redis-service:6373/
+```
+
+- apply deployment manifest:
+
+```sh
+oc apply -f docs/deploy-examples/docling-serve-rq-workers.yaml
+```
+
 ### Secure deployment with `oauth-proxy`

 Manifest example: [docling-serve-oauth.yaml](./deploy-examples/docling-serve-oauth.yaml)
@@ -184,12 +278,12 @@ OCP_AUTH_TOKEN=$(oc whoami --show-token)

 # Make a test query
 curl -X 'POST' \
-  "${DOCLING_ROUTE}/v1alpha/convert/source/async" \
+  "${DOCLING_ROUTE}/v1/convert/source/async" \
  -H "Authorization: Bearer ${OCP_AUTH_TOKEN}" \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
-    "http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}]
+    "sources": [{"kind": "http", "url": "https://arxiv.org/pdf/2501.17887"}]
  }'
 ```

@@ -218,11 +312,11 @@ DOCLING_ROUTE="https://$(oc get routes $DOCLING_NAME --template={{.spec.host}})"

 # Make a test query, store the cookie and taskid
 task_id=$(curl -s -X 'POST' \
-    "${DOCLING_ROUTE}/v1alpha/convert/source/async" \
+    "${DOCLING_ROUTE}/v1/convert/source/async" \
    -H "accept: application/json" \
    -H "Content-Type: application/json" \
    -d '{
-    "http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}]
+      "sources": [{"kind": "http", "url": "https://arxiv.org/pdf/2501.17887"}]
    }' \
    -c cookies.txt | grep -oP '"task_id":"\K[^"]+')
 ```
@@ -230,7 +324,7 @@ task_id=$(curl -s -X 'POST' \
 ```sh
 # Grab the taskid and cookie to check the task status
 curl -v -X 'GET' \
-  "${DOCLING_ROUTE}/v1alpha/status/poll/$task_id?wait=0" \
+  "${DOCLING_ROUTE}/v1/status/poll/$task_id?wait=0" \
  -H "accept: application/json" \
  -b "cookies.txt"
 ```
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -0,0 +1,22 @@
+# Examples
+
+## Split processing
+
+The provided split-processing example demonstrates how to split a PDF into chunks of pages and send them for conversion. At the end, it concatenates all split pages into a single conversion `JSON`.
+
+At the beginning of the file there are variables to be used (and modified), such as:
+| Variable | Description |
+| ---------|-------------|
+| `path_to_pdf`| Path to PDF file to be split |
+| `pages_per_file`| The number of pages per chunk to split PDF |
+| `base_url`| Base url of the `docling-serve` host |
+| `out_dir`| The output folder of each conversion `JSON` of split PDF and the final concatenated `JSON` |
+
+The example follows the following logic:
+- Get the number of pages of the `PDF`
+- Based on the number of chunks of pages, send each chunk to conversion using `page_range` parameter
+- Wait for all conversions to finish
+- Get all conversion results
+- Save each conversion `JSON` result into a `JSON` file
+- Concatenate all `JSONs` into a single `JSON` using `docling` concatenate method
+- Save concatenated `JSON` into a `JSON` file
--- a/docs/mcp.md
+++ b/docs/mcp.md
@@ -0,0 +1,39 @@
+# Docling MCP in Docling Serve
+
+The `docling-serve` container image includes all MCP (Model Context Protocol) features starting from version v1.1.0. To leverage these features, you simply need to use a different entrypoint—no custom image builds or additional installations are required. The image provides the `docling-mcp-server` executable, which enables MCP functionality out of the box as of version v1.1.0 ([changelog](https://github.com/docling-project/docling-serve/blob/624f65d41b734e8b39ff267bc8bf6e766c376d6d/CHANGELOG.md)).
+
+Read more on [Docling MCP](https://github.com/docling-project/docling-mcp) in its dedicated repository.
+
+## Launching the MCP Service
+
+By default, the container runs `docling-serve run` and exposes port 5001. To start the MCP service, override the entrypoint and specify your desired port mapping. For example:
+
+```sh
+podman run -p 8000:8000 quay.io/docling-project/docling-serve -- docling-mcp-server --transport streamable-http --port 8000 --host 0.0.0.0
+```
+
+This command starts the MCP server on port 8000, accessible at `http://localhost:8000/mcp`. Adjust the port and host as needed. Key arguments for `docling-mcp-server` include `--transport streamable-http` (HTTP transport for client connections), `--port <PORT>`, and `--host <HOST>` (use `0.0.0.0` to accept connections from any interface).
+
+## Configuring MCP Clients
+
+Most MCP-compatible clients, such as LM Studio and Claude Desktop, allow you to specify custom MCP server endpoints. The standard configuration uses a JSON block to define available MCP servers. For example, to connect to the Docling MCP server running on port 8000:
+
+```json
+{
+  "mcpServers": {
+    "docling": {
+      "url": "http://localhost:8000/mcp"
+    }
+  }
+}
+```
+
+Insert this configuration in your client's settings where MCP servers are defined. Update the URL if you use a different port.
+
+### LM Studio and Claude Desktop
+
+Both LM Studio and Claude Desktop support MCP endpoints via configuration files or UI settings. Paste the above JSON block into the appropriate configuration section. For Claude Desktop, add the MCP server in the "Custom Model" or "MCP Server" section. For LM Studio, refer to its documentation for the location of the MCP server configuration.
+
+### Other MCP Clients
+
+Other clients, such as Continue Coding Assistant, also support custom MCP endpoints. Use the same configuration pattern: provide the MCP server URL ending with `/mcp` and ensure the port matches your container setup. See the [Docling MCP docs](https://github.com/docling-project/docling-mcp/tree/main/docs/integrations) for more details.
--- a/docs/models.md
+++ b/docs/models.md
@@ -0,0 +1,175 @@
+# Handling Models in Docling Serve
+
+When enabling steps in Docling Serve that require extra models (such as picture classification, picture description, table detection, code recognition, formula extraction, or vision-language modules), you must ensure those models are available in the runtime environment. The standard container image includes only the default models. Any additional models must be downloaded and made available before use. If required models are missing, Docling Serve will raise runtime errors rather than downloading them automatically. This default is intentional: it guarantees that the system does not call external services.
+
+## Model Storage Location
+
+Docling Serve loads models from the directory specified by the `DOCLING_SERVE_ARTIFACTS_PATH` environment variable. This path must be consistent across model download and runtime. When running with multiple workers or reload enabled, you must use the environment variable rather than the CLI argument for configuration [[source]](./configuration.md).
+
+## Approaches for Making Extra Models Available
+
+There are several ways to ensure required models are present:
+
+### 1. Disable Local Models (Trigger Auto-Download)
+
+You can configure the container to download all models at startup by clearing the artifacts path:
+
+```sh
+podman run -d -p 5001:5001 --name docling-serve \
+  -e DOCLING_SERVE_ARTIFACTS_PATH="" \
+  -e DOCLING_SERVE_ENABLE_UI=true \
+  quay.io/docling-project/docling-serve
+```
+
+This approach is simple for local development but not recommended for production, as it increases startup time and depends on network availability.
+
+### 2. Build a Custom Image with Pre-Downloaded Models
+
+You can create a new image that includes the required models:
+
+```Dockerfile
+FROM quay.io/docling-project/docling-serve
+RUN docling-tools models download smolvlm
+```
+
+This method is suitable for production, as it ensures all models are present in the image and avoids runtime downloads.
+
+### 3. Update the Entrypoint to Download Models Before Startup
+
+You can override the entrypoint to download models before starting the service:
+
+```sh
+# Only the last command uses `exec`: the shell has to survive the download
+# step so that it can start the service afterwards.
+podman run -p 5001:5001 -e DOCLING_SERVE_ENABLE_UI=true \
+  quay.io/docling-project/docling-serve \
+  -- sh -c 'docling-tools models download smolvlm && exec docling-serve run'
+```
+
+This is useful for environments where you want to keep the base image unchanged but still automate model preparation.
+
+### 4. Mount a Volume with Pre-Downloaded Models
+
+Download models locally and mount them into the container:
+
+```sh
+# Download the models locally
+docling-tools models download --all -o models
+
+# Start the container with the local models folder
+podman run -p 5001:5001 \
+  -v $(pwd)/models:/opt/app-root/src/models \
+  -e DOCLING_SERVE_ARTIFACTS_PATH="/opt/app-root/src/models" \
+  -e DOCLING_SERVE_ENABLE_UI=true \
+  quay.io/docling-project/docling-serve
+```
+
+This approach is robust for both local and production deployments, especially when using persistent storage.
+
+## Kubernetes/Cluster Deployments
+
+For Kubernetes or OpenShift clusters, the recommended approach is to use a PersistentVolumeClaim (PVC) for model storage, a Kubernetes Job to download models, and mount the volume into the deployment. This ensures models persist across pod restarts and scale-out scenarios.
+
+### Example: PersistentVolumeClaim
+
+```yaml
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: docling-model-cache-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  volumeMode: Filesystem
+  resources:
+    requests:
+      storage: 10Gi
+```
+
+If you don't want to use the default storage class, set a custom storage class as follows:
+
+```yaml
+spec:
+    ...
+    storageClassName: <Storage Class Name>
+```
+
+Manifest example: [docling-model-cache-pvc.yaml](./deploy-examples/docling-model-cache-pvc.yaml)
+
+### Example: Model Download Job
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: docling-model-cache-load
+spec:
+  template:
+    spec:
+      containers:
+        - name: loader
+          image: ghcr.io/docling-project/docling-serve-cpu:main
+          command:
+            - docling-tools
+            - models
+            - download
+            - '--output-dir=/modelcache'
+            - 'layout'
+            - 'tableformer'
+            - 'code_formula'
+            - 'picture_classifier'
+            - 'smolvlm'
+            - 'granite_vision'
+            - 'easyocr'
+          volumeMounts:
+            - name: docling-model-cache
+              mountPath: /modelcache
+      volumes:
+        - name: docling-model-cache
+          persistentVolumeClaim:
+            claimName: docling-model-cache-pvc
+      restartPolicy: Never
+```
+
+The job will mount the previously created persistent volume and execute a command similar to how we would load models locally:
+`docling-tools models download --output-dir <MOUNT-PATH> [LIST_OF_MODELS]`
+
+In the manifest, we specify the desired models individually, or we can use the `--all` parameter to download all models.
+
+Manifest example: [docling-model-cache-job.yaml](./deploy-examples/docling-model-cache-job.yaml)
+
+### Example: Deployment with Mounted Volume
+
+```yaml
+spec:
+  template:
+    spec:
+      containers:
+        - name: api
+          env:
+            - name: DOCLING_SERVE_ARTIFACTS_PATH
+              value: '/modelcache'
+          volumeMounts:
+            - name: docling-model-cache
+              mountPath: /modelcache
+      volumes:
+        - name: docling-model-cache
+          persistentVolumeClaim:
+            claimName: docling-model-cache-pvc
+```
+
+The value of `DOCLING_SERVE_ARTIFACTS_PATH` must match the mount path where models are stored.
+
+Now, when docling-serve is executing tasks, the underlying docling installation will load model weights from the mounted volume.
+
+Manifest example: [docling-model-cache-deployment.yaml](./deploy-examples/docling-model-cache-deployment.yaml)
+
+## Local Docker Execution
+
+For local Docker or Podman execution, you can use any of the approaches above. Mounting a local directory with pre-downloaded models is the most reliable for repeated runs and avoids network dependencies.
+
+## Troubleshooting and Best Practices
+
+- If a required model is missing from the artifacts path, Docling Serve will raise a runtime error.
+- Always ensure the value of `DOCLING_SERVE_ARTIFACTS_PATH` matches the directory where models are stored and mounted.
+- For production and cluster environments, prefer persistent storage and pre-loading models via a dedicated job.
+
+For more details and YAML manifest examples, see the [deployment documentation](./deployment.md).
--- a/docs/pre-loading-models.md
+++ b/docs/pre-loading-models.md
@@ -1,103 +0,0 @@
-# Pre-loading models for docling
-
-This document provides examples for pre-loading docling models to a persistent volume and re-using it for docling-serve deployments.
-
-1. We need to create a persistent volume that will store models weights:
-
-    ```yaml
-    apiVersion: v1
-    kind: PersistentVolumeClaim
-    metadata:
-      name: docling-model-cache-pvc
-    spec:
-      accessModes:
-        - ReadWriteOnce
-      volumeMode: Filesystem
-      resources:
-        requests:
-          storage: 10Gi
-    ```
-
-    If you don't want to use default storage class, set your custom storage class with following:
-
-    ```yaml
-    spec:
-      ...
-      storageClassName: <Storage Class Name>
-    ```
-
-    Manifest example: [docling-model-cache-pvc.yaml](./deploy-examples/docling-model-cache-pvc.yaml)
-
-2. In order to load model weights, we can use docling-toolkit to download them, as this is a one time operation we can use kubernetes job for this:
-
-    ```yaml
-    apiVersion: batch/v1
-    kind: Job
-    metadata:
-      name: docling-model-cache-load
-    spec:
-      selector: {}
-      template:
-        metadata:
-          name: docling-model-load
-        spec:
-          containers:
-            - name: loader
-              image: ghcr.io/docling-project/docling-serve-cpu:main
-              command:
-                - docling-tools
-                - models
-                - download
-                - '--output-dir=/modelcache'
-                - 'layout'
-                - 'tableformer'
-                - 'code_formula'
-                - 'picture_classifier'
-                - 'smolvlm'
-                - 'granite_vision'
-                - 'easyocr'
-              volumeMounts:
-                - name: docling-model-cache
-                  mountPath: /modelcache
-          volumes:
-            - name: docling-model-cache
-              persistentVolumeClaim:
-                claimName: docling-model-cache-pvc
-          restartPolicy: Never
-    ```
-
-    The job will mount previously created persistent volume and execute command similar to how we would load models locally:
-    `docling-tools models download --output-dir <MOUNT-PATH> [LIST_OF_MODELS]`
-
-    In manifest, we specify desired models individually, or we can use `--all` parameter to download all models.
-
-    Manifest example: [docling-model-cache-job.yaml](./deploy-examples/docling-model-cache-job.yaml)
-
-3. Now we can mount volume in the docling-serve deployment and set env `DOCLING_SERVE_ARTIFACTS_PATH` to point to it.
-    Following additions to deploymeny should be made:
-
-    ```yaml
-    spec:
-      template:
-        spec:
-          containers:
-            - name: api
-              env:
-              ...
-                - name: DOCLING_SERVE_ARTIFACTS_PATH
-                  value: '/modelcache'
-              volumeMounts:
-                - name: docling-model-cache
-                  mountPath: /modelcache
-          ...
-          volumes:
-            - name: docling-model-cache
-              persistentVolumeClaim:
-                claimName: docling-model-cache-pvc
-    ```
-
-    Make sure that value of `DOCLING_SERVE_ARTIFACTS_PATH` is the same as where models were downloaded and where volume is mounted.
-
-    Now when docling-serve is executing tasks, the underlying docling installation will load model weights from mouted volume.
-
-    Manifest example: [docling-model-cache-deployment.yaml](./deploy-examples/docling-model-cache-deployment.yaml)
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -9,7 +9,7 @@ On top of the source of file (see below), both endpoints support the same parame
 - `from_formats` (List[str]): Input format(s) to convert from. Allowed values: `docx`, `pptx`, `html`, `image`, `pdf`, `asciidoc`, `md`. Defaults to all formats.
 - `to_formats` (List[str]): Output format(s) to convert to. Allowed values: `md`, `json`, `html`, `text`, `doctags`. Defaults to `md`.
 - `pipeline` (str). The choice of which pipeline to use. Allowed values are `standard` and `vlm`. Defaults to `standard`.
- `page_range` (tuple). If speficied, only convert a range of pages. The page number starts at 1.
+- `page_range` (tuple). If specified, only convert a range of pages. The page number starts at 1.
 - `do_ocr` (bool): If enabled, the bitmap content will be processed using OCR. Defaults to `True`.
 - `image_export_mode`: Image export mode for the document (only in case of JSON, Markdown or HTML). Allowed values: embedded, placeholder, referenced. Optional, defaults to `embedded`.
 - `force_ocr` (bool): If enabled, replace any existing text with OCR-generated text over the full content. Defaults to `False`.
@@ -18,24 +18,27 @@ On top of the source of file (see below), both endpoints support the same parame
 - `pdf_backend` (str): PDF backend to use. Allowed values: `pypdfium2`, `dlparse_v1`, `dlparse_v2`, `dlparse_v4`. Defaults to `dlparse_v4`.
 - `table_mode` (str): Table mode to use. Allowed values: `fast`, `accurate`. Defaults to `fast`.
 - `abort_on_error` (bool): If enabled, abort on error. Defaults to false.
- `return_as_file` (boo): If enabled, return the output as a file. Defaults to false.
- `md_page_break_placeholder` (str): Add this placeholder betweek pages in the markdown output.
+- `md_page_break_placeholder` (str): Add this placeholder between pages in the markdown output.
 - `do_table_structure` (bool): If enabled, the table structure will be extracted. Defaults to true.
 - `do_code_enrichment` (bool): If enabled, perform OCR code enrichment. Defaults to false.
 - `do_formula_enrichment` (bool): If enabled, perform formula OCR, return LaTeX code. Defaults to false.
 - `do_picture_classification` (bool): If enabled, classify pictures in documents. Defaults to false.
 - `do_picture_description` (bool): If enabled, describe pictures in documents. Defaults to false.
 - `picture_description_area_threshold` (float): Minimum percentage of the area for a picture to be processed with the models. Defaults to 0.05.
- `picture_description_local` (dict): Options for running a local vision-language model in the picture description. The parameters refer to a model hosted on Hugging Face. This parameter is mutually exclusive with picture_description_api.
- `picture_description_api` (dict): API details for using a vision-language model in the picture description. This parameter is mutually exclusive with picture_description_local.
+- `picture_description_local` (dict): Options for running a local vision-language model in the picture description. The parameters refer to a model hosted on Hugging Face. This parameter is mutually exclusive with `picture_description_api`.
+- `picture_description_api` (dict): API details for using a vision-language model in the picture description. This parameter is mutually exclusive with `picture_description_local`.
 - `include_images` (bool): If enabled, images will be extracted from the document. Defaults to false.
 - `images_scale` (float): Scale factor for images. Defaults to 2.0.

+### Authentication
+
+When authentication is activated (see the parameter `DOCLING_SERVE_API_KEY` in [configuration.md](./configuration.md)), all the API requests **must** provide the header `X-Api-Key` with the correct secret key.
+
 ## Convert endpoints

 ### Source endpoint

-The endpoint is `/v1alpha/convert/source`, listening for POST requests of JSON payloads.
+The endpoint is `/v1/convert/source`, listening for POST requests of JSON payloads.

 On top of the above parameters, you must send the URL(s) of the document you want process with either the `http_sources` or `file_sources` fields.
 The first is fetching URL(s) (optionally using with extra headers), the second allows to provide documents as base64-encoded strings.
@@ -66,7 +69,6 @@ Simple payload example:
    "pdf_backend": "dlparse_v2",
    "table_mode": "fast",
    "abort_on_error": false,
-    "return_as_file": false,
  },
  "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
 }
@@ -80,7 +82,7 @@ Simple payload example:

 ```sh
 curl -X 'POST' \
-  'http://localhost:5001/v1alpha/convert/source' \
+  'http://localhost:5001/v1/convert/source' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
@@ -109,7 +111,6 @@ curl -X 'POST' \
    "pdf_backend": "dlparse_v2",
    "table_mode": "fast",
    "abort_on_error": false,
-    "return_as_file": false,
    "do_table_structure": true,
    "include_images": true,
    "images_scale": 2
@@ -127,7 +128,7 @@ curl -X 'POST' \
 import httpx

 async_client = httpx.AsyncClient(timeout=60.0)
-url = "http://localhost:5001/v1alpha/convert/source"
+url = "http://localhost:5001/v1/convert/source"
 payload = {
  "options": {
    "from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
@@ -140,7 +141,6 @@ payload = {
    "pdf_backend": "dlparse_v2",
    "table_mode": "fast",
    "abort_on_error": False,
-    "return_as_file": False,
  },
  "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
 }
@@ -179,7 +179,7 @@ cat <<EOF > /tmp/request_body.json
 EOF

 # 3. POST the request to the docling service
-curl -X POST "localhost:5001/v1alpha/convert/source" \
+curl -X POST "localhost:5001/v1/convert/source" \
     -H "Content-Type: application/json" \
     -d @/tmp/request_body.json
 ```
@@ -188,14 +188,14 @@ curl -X POST "localhost:5001/v1alpha/convert/source" \

 ### File endpoint

-The endpoint is: `/v1alpha/convert/file`, listening for POST requests of Form payloads (necessary as the files are sent as multipart/form data). You can send one or multiple files.
+The endpoint is: `/v1/convert/file`, listening for POST requests of Form payloads (necessary as the files are sent as multipart/form data). You can send one or multiple files.

 <details>
 <summary>CURL example:</summary>

 ```sh
 curl -X 'POST' \
-  'http://127.0.0.1:5001/v1alpha/convert/file' \
+  'http://127.0.0.1:5001/v1/convert/file' \
  -H 'accept: application/json' \
  -H 'Content-Type: multipart/form-data' \
  -F 'ocr_engine=easyocr' \
@@ -211,7 +211,6 @@ curl -X 'POST' \
  -F 'abort_on_error=false' \
  -F 'to_formats=md' \
  -F 'to_formats=text' \
-  -F 'return_as_file=false' \
  -F 'do_ocr=true'
 ```

@@ -224,7 +223,7 @@ curl -X 'POST' \
 import httpx

 async_client = httpx.AsyncClient(timeout=60.0)
-url = "http://localhost:5001/v1alpha/convert/file"
+url = "http://localhost:5001/v1/convert/file"
 parameters = {
 "from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
 "to_formats": ["md", "json", "html", "text", "doctags"],
@@ -236,7 +235,6 @@ parameters = {
 "pdf_backend": "dlparse_v2",
 "table_mode": "fast",
 "abort_on_error": False,
-"return_as_file": False
 }

 current_dir = os.path.dirname(__file__)
@@ -288,33 +286,42 @@ The api option is specified with:

 Example URLs are:

- `http://localhost:8000/v1/chat/completions` for the local vllm api, with example `params`:
+- `http://localhost:8000/v1/chat/completions` for the local vllm api, with example `picture_description_api`:
  - the `HuggingFaceTB/SmolVLM-256M-Instruct` model

    ```json
    {
+      "url": "http://localhost:8000/v1/chat/completions",
+      "params": {
        "model": "HuggingFaceTB/SmolVLM-256M-Instruct",
        "max_completion_tokens": 200,
+      }
    }
    ```
-  
+
  - the `ibm-granite/granite-vision-3.2-2b` model

    ```json
    {
+      "url": "http://localhost:8000/v1/chat/completions",
+      "params": {
        "model": "ibm-granite/granite-vision-3.2-2b",
        "max_completion_tokens": 200,
+      }
    }
    ```

- `http://localhost:11434/v1/chat/completions` for the local ollama api, with example `params`:
+- `http://localhost:11434/v1/chat/completions` for the local Ollama api, with example `picture_description_api`:
  - the `granite3.2-vision:2b` model

    ```json
    {
+      "url": "http://localhost:11434/v1/chat/completions",
+      "params": {
        "model": "granite3.2-vision:2b"
+      }
    }
-    ```  
+    ```

 Note that when using `picture_description_api`, the server must be launched with `DOCLING_SERVE_ENABLE_REMOTE_SERVICES=true`.

@@ -345,19 +352,19 @@ The response can be a JSON Document or a File.
  `processing_time` is the Docling processing time in seconds, and `timings` (when enabled in the backend) provides the detailed
  timing of all the internal Docling components.

- If you set the parameter `return_as_file` to True, the response will be a zip file.
- If multiple files are generated (multiple inputs, or one input but multiple outputs with `return_as_file` True), the response will be a zip file.
+- If you set the parameter `target` to the zip mode, the response will be a zip file.
+- If multiple files are generated (multiple inputs, or one input but multiple outputs with the zip target mode), the response will be a zip file.

 ## Asynchronous API

-Both `/v1alpha/convert/source` and `/v1alpha/convert/file` endpoints are available as asynchronous variants.
+Both `/v1/convert/source` and `/v1/convert/file` endpoints are available as asynchronous variants.
 The advantage of the asynchronous endpoints is the possible to interrupt the connection, check for the progress update and fetch the result.
-This approach is more resilient against network stabilities and allows the client application logic to easily interleave conversion with other tasks.
+This approach is more resilient against network instabilities and allows the client application logic to easily interleave conversion with other tasks.

 Launch an asynchronous conversion with:

- `POST /v1alpha/convert/source/async` when providing the input as sources.
- `POST /v1alpha/convert/file/async` when providing the input as multipart-form files.
+- `POST /v1/convert/source/async` when providing the input as sources.
+- `POST /v1/convert/file/async` when providing the input as multipart-form files.

 The response format is a task detail:

@@ -374,7 +381,7 @@ The response format is a task detail:

 For checking the progress of the conversion task and wait for its completion, use the endpoint:

- `GET /v1alpha/status/poll/{task_id}`
+- `GET /v1/status/poll/{task_id}`

 <details>
 <summary>Example waiting loop:</summary>
@@ -399,9 +406,9 @@ while task["task_status"] not in ("success", "failure"):
 ### Subscribe with websockets

 Using websocket you can get the client application being notified about updates of the conversion task.
-To start the websocker connection, use the endpoint:
+To start the websocket connection, use the endpoint:

- `/v1alpha/status/ws/{task_id}`
+- `/v1/status/ws/{task_id}`

 Websocket messages are JSON object with the following structure:

@@ -414,12 +421,12 @@ Websocket messages are JSON object with the following structure:
 ```

 <details>
-<summary>Example websocker usage:</summary>
+<summary>Example websocket usage:</summary>

 ```python
 from websockets.sync.client import connect

-uri = f"ws://{base_url}/v1alpha/status/ws/{task['task_id']}"
+uri = f"ws://{base_url}/v1/status/ws/{task['task_id']}"
 with connect(uri) as websocket:
    for message in websocket:
        try:
@@ -438,4 +445,4 @@ with connect(uri) as websocket:

 When the task is completed, the result can be fetched with the endpoint:

- `GET /v1alpha/result/{task_id}`
+- `GET /v1/result/{task_id}`
--- a/docs/v1_migration.md
+++ b/docs/v1_migration.md
@@ -0,0 +1,80 @@
+# Migration to the `v1` API
+
+Docling Serve has moved from the initial prototype `v1alpha` API to the stable `v1` API.
+This page provides simple instructions to upgrade your application to the new API.
+
+## API changes
+
+The breaking changes introduced in the `v1` release of Docling Serve are designed to provide a stable schema which
+allows the project to provide new capabilities, such as new types of input sources and targets, as well as the definition of callbacks for event-driven applications.
+
+### Endpoint names
+
+All endpoints are renamed from `/v1alpha/` to `/v1/`.
+
+### Sources
+
+When using the `/v1/convert/source` endpoint, input documents have to be specified with the `sources: []` argument, which is replacing the usage of `file_sources` and `http_sources`.
+
+Old version:
+
+```jsonc
+{
+    "options": {},  // conversion options
+    "file_sources": [  // input documents provided as base64-encoded strings
+        {"base64_string": "abc123...", "filename": "file.pdf"}
+    ],
+    "http_sources": [  // input documents provided as http urls
+        {"url": "https://..."}
+    ]
+}
+```
+
+New version:
+
+```jsonc
+{
+    "options": {},  // conversion options
+    "sources": [
+        // input document provided as base64-encoded string
+        {"kind": "file", "base64_string": "abc123...", "filename": "file.pdf"},
+        // input document provided as http urls
+        {"kind": "http", "url": "https://..."},
+    ]
+}
+```
+
+### Targets
+
+To switch between output formats, e.g. from the JSON in-body response to the zip archive response, users have to specify the `target` argument, which replaces the usage of `options.return_as_file`.
+
+Old version:
+
+```jsonc
+{
+    "options": {
+        "return_as_file": true  // <-- to be removed
+    },
+    // ...
+}
+```
+
+New version:
+
+```jsonc
+{
+    "options": {},
+    "target": {"kind": "zip"},  // <-- add this
+    // ...
+}
+```
+
+## Continue with the old API
+
+If you are not able to apply the changes above to your application, please consider pinning the previous `v0.x` container images, e.g.
+
+```sh
+podman run -p 5001:5001 -e DOCLING_SERVE_ENABLE_UI=1 quay.io/docling-project/docling-serve:v0.16.1
+```
+
+_Note that the old prototype API will not be supported in new `v1.x` versions._
--- a/examples/split_processing.py
+++ b/examples/split_processing.py
@@ -0,0 +1,124 @@
+import json
+import time
+from pathlib import Path
+
+import httpx
+from pydantic import BaseModel
+from pypdf import PdfReader
+
+from docling_core.types.doc.document import DoclingDocument
+
+# Variables to use
+path_to_pdf = Path("./tests/2206.01062v1.pdf")
+pages_per_file = 4
+base_url = "http://localhost:5001/v1"
+out_dir = Path("examples/splitted_pdf/")
+
+
+class ConvertedSplittedPdf(BaseModel):
+    """State of one asynchronous conversion task submitted for a page range of the PDF."""
+
+    # Identifier of the task, as returned by `post_file`.
+    task_id: str
+    # Updated from the return value of `check_task_status` while polling.
+    conversion_finished: bool = False
+    # JSON body returned by `get_task_result`; None until it has been fetched.
+    result: dict | None = None
+
+
+def get_task_result(task_id: str):
+    """Fetch the result of the given task and return the decoded JSON body."""
+    result_url = f"{base_url}/result/{task_id}"
+    reply = httpx.get(result_url, timeout=15)
+    return reply.json()
+
+
+def check_task_status(task_id: str):
+    """Poll a task once and report whether its conversion has finished.
+
+    Args:
+        task_id: Identifier of the task to poll.
+
+    Returns:
+        True if the reported task status is "success", False otherwise.
+
+    Raises:
+        RuntimeError: If the reported task status is "failure" or "revoked".
+    """
+    response = httpx.get(f"{base_url}/status/poll/{task_id}", timeout=15)
+    task_status = response.json()["task_status"]
+
+    if task_status == "success":
+        # Finished: return right away instead of sleeping first.
+        return True
+
+    if task_status in ("failure", "revoked"):
+        raise RuntimeError("A conversion failed")
+
+    # Not finished yet: pause before the caller polls again.
+    time.sleep(5)
+
+    return False
+
+
+def post_file(file_path: Path, start_page: int, end_page: int):
+    """Submit one page range of a PDF for asynchronous conversion.
+
+    Args:
+        file_path: Path of the PDF to upload.
+        start_page: First page of the range sent as `page_range`.
+        end_page: Last page of the range sent as `page_range`.
+
+    Returns:
+        The `task_id` field of the JSON response.
+
+    Raises:
+        httpx.HTTPStatusError: If the server answers with an error status.
+    """
+    payload = {
+        "to_formats": ["json"],
+        "image_export_mode": "placeholder",
+        # The documented conversion option is `do_ocr`, not `ocr`.
+        "do_ocr": False,
+        "abort_on_error": False,
+        "page_range": [start_page, end_page],
+    }
+
+    # Open the PDF in a context manager so the handle is closed after the upload.
+    with file_path.open("rb") as pdf_file:
+        files = {
+            "files": (file_path.name, pdf_file, "application/pdf"),
+        }
+        response = httpx.post(
+            f"{base_url}/convert/file/async",
+            files=files,
+            data=payload,
+            timeout=15,
+        )
+
+    # Surface the HTTP error itself rather than a KeyError on a body without `task_id`.
+    response.raise_for_status()
+    task = response.json()
+
+    return task["task_id"]
+
+
+def main():
+    """Convert the PDF in page-range chunks and concatenate the results.
+
+    Submits one asynchronous task per chunk of `pages_per_file` pages, polls
+    until every task has finished, saves each partial document as JSON in
+    `out_dir`, and finally writes the concatenated document to
+    `concatenated.json` in the same directory.
+    """
+    splitted_pdfs: list[ConvertedSplittedPdf] = []
+
+    # The reader is only needed for the page count; close the file before uploading.
+    with open(path_to_pdf, "rb") as input_pdf_file:
+        total_pages = len(PdfReader(input_pdf_file).pages)
+
+    for start_page in range(0, total_pages, pages_per_file):
+        # `start_page` is a zero-based offset; the request uses page numbers starting at 1.
+        task_id = post_file(
+            path_to_pdf, start_page + 1, min(start_page + pages_per_file, total_pages)
+        )
+        splitted_pdfs.append(ConvertedSplittedPdf(task_id=task_id))
+
+    # Keep polling until no task is left unfinished.
+    while not all(part.conversion_finished for part in splitted_pdfs):
+        for part in splitted_pdfs:
+            if not part.conversion_finished:
+                print("checking conversion status...")
+                part.conversion_finished = check_task_status(part.task_id)
+
+    for part in splitted_pdfs:
+        part.result = get_task_result(part.task_id)
+
+    # Make sure the output directory exists before writing into it.
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    files = []
+    for i, part in enumerate(splitted_pdfs):
+        json_content = json.dumps(
+            part.result.get("document").get("json_content"), indent=2
+        )
+        doc = DoclingDocument.model_validate_json(json_content)
+        part_file = f"{out_dir}/splited_json_{i}.json"
+        doc.save_as_json(filename=part_file)
+        files.append(part_file)
+
+    docs = [DoclingDocument.load_from_json(filename=f) for f in files]
+    concate_doc = DoclingDocument.concatenate(docs=docs)
+
+    exp_json_file = Path(f"{out_dir}/concatenated.json")
+    concate_doc.save_as_json(exp_json_file)
+
+    print("Finished")
+
+
+if __name__ == "__main__":
+    main()
--- a/img/fastapi-ui.png
+++ b/img/fastapi-ui.png
--- a/img/swagger.png
+++ b/img/swagger.png
--- a/os-packages.txt
+++ b/os-packages.txt
@@ -1,6 +1,7 @@
 tesseract
 tesseract-devel
 tesseract-langpack-eng
+tesseract-osd
 leptonica-devel
 libglvnd-glx
 glib2
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "docling-serve"
-version = "0.15.0"  # DO NOT EDIT, updated automatically
+version = "1.4.0"  # DO NOT EDIT, updated automatically
 description = "Running Docling as a service"
 license = {text = "MIT"}
 authors = [
@@ -8,7 +8,6 @@ authors = [
    {name="Guillaume Moutier", email="gmoutier@redhat.com"},
    {name="Anil Vishnoi", email="avishnoi@redhat.com"},
    {name="Panos Vagenas", email="pva@zurich.ibm.com"},
-    {name="Panos Vagenas", email="pva@zurich.ibm.com"},
    {name="Christoph Auer", email="cau@zurich.ibm.com"},
    {name="Peter Staar", email="taa@zurich.ibm.com"},
 ]
@@ -23,19 +22,22 @@ readme = "README.md"
 classifiers = [
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
-    # "Development Status :: 5 - Production/Stable",
+    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Typing :: Typed",
-    "Programming Language :: Python :: 3"
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
 ]
 requires-python = ">=3.10"
 dependencies = [
-    "docling[vlm]~=2.28",
-    "docling-core>=2.32.0",
-    "mlx-vlm~=0.1.12; sys_platform == 'darwin' and platform_machine == 'arm64'",
+    "docling~=2.38",
+    "docling-core>=2.45.0",
+    "docling-jobkit[kfp,rq,vlm]>=1.4.0,<2.0.0",
    "fastapi[standard]~=0.115",
    "httpx~=0.28",
-    "kfp[kubernetes]>=2.10.0",
    "pydantic~=2.10",
    "pydantic-settings~=2.4",
    "python-multipart>=0.0.14,<0.1.0",
@@ -43,6 +45,7 @@ dependencies = [
    "uvicorn[standard]>=0.29.0,<1.0.0",
    "websockets~=14.0",
    "scalar-fastapi>=1.0.3",
+    "docling-mcp>=1.0.0",
 ]

 [project.optional-dependencies]
@@ -57,16 +60,8 @@ rapidocr = [
    "rapidocr-onnxruntime~=1.4; python_version<'3.13'",
    "onnxruntime~=1.7",
 ]
-cpu = [
-  "torch>=2.6.0",
-  "torchvision>=0.21.0",
-]
-cu124 = [
-  "torch>=2.6.0",
-  "torchvision>=0.21.0",
-]
 flash-attn = [
-  "flash-attn~=2.7.0; sys_platform == 'linux' and platform_machine == 'x86_64'"
+  "flash-attn~=2.8.2; sys_platform == 'linux' and platform_machine == 'x86_64'"
 ]

 [dependency-groups]
@@ -74,6 +69,7 @@ dev = [
    "asgi-lifespan~=2.0",
    "mypy~=1.11",
    "pre-commit-uv~=4.1",
+    "pypdf>=6.0.0",
    "pytest~=8.3",
    "pytest-asyncio~=0.24",
    "pytest-check~=2.4",
@@ -81,17 +77,50 @@ dev = [
    "ruff>=0.9.6",
 ]

+pypi = [
+  "torch>=2.7.1",
+  "torchvision>=0.22.1",
+]
+
+cpu = [
+  "torch>=2.7.1",
+  "torchvision>=0.22.1",
+]
+
+cu124 = [
+  "torch>=2.6.0",
+  "torchvision>=0.21.0",
+]
+
+cu126 = [
+  "torch>=2.7.1",
+  "torchvision>=0.22.1",
+]
+
+cu128 = [
+  "torch>=2.7.1",
+  "torchvision>=0.22.1",
+]
+
+rocm = [
+  "torch>=2.7.1",
+  "torchvision>=0.22.1",
+  "pytorch-triton-rocm>=3.3.1 ; sys_platform == 'linux' and platform_machine == 'x86_64'",
+]
+
 [tool.uv]
 package = true
+default-groups = ["dev", "pypi"]
 conflicts = [
  [
-    { extra = "cpu" },
-    { extra = "cu124" },
+    { group = "pypi" },
+    { group = "cpu" },
+    { group = "cu124" },
+    { group = "cu126" },
+    { group = "cu128" },
+    { group = "rocm" },
  ],
-  [
-    { extra = "cpu" },
-    { extra = "flash-attn" },
-  ],]
+]
 environments = ["sys_platform != 'darwin' or platform_machine != 'x86_64'"]
 override-dependencies = [
  "urllib3~=2.0"
@@ -99,14 +128,35 @@ override-dependencies = [

 [tool.uv.sources]
 torch = [
-  { index = "pytorch-cpu", extra = "cpu" },
-  { index = "pytorch-cu124", extra = "cu124" },
+  { index = "pytorch-pypi", group = "pypi" },
+  { index = "pytorch-cpu", group = "cpu" },
+  { index = "pytorch-cu124", group = "cu124", marker = "sys_platform == 'linux'" },
+  { index = "pytorch-cu126", group = "cu126", marker = "sys_platform == 'linux'" },
+  { index = "pytorch-cu128", group = "cu128", marker = "sys_platform == 'linux'" },
+  { index = "pytorch-rocm", group = "rocm", marker = "sys_platform == 'linux'" },
 ]
+
 torchvision = [
-  { index = "pytorch-cpu", extra = "cpu" },
-  { index = "pytorch-cu124", extra = "cu124" },
+  { index = "pytorch-pypi", group = "pypi" },
+  { index = "pytorch-cpu", group = "cpu" },
+  { index = "pytorch-cu124", group = "cu124", marker = "sys_platform == 'linux'" },
+  { index = "pytorch-cu126", group = "cu126", marker = "sys_platform == 'linux'" },
+  { index = "pytorch-cu128", group = "cu128", marker = "sys_platform == 'linux'" },
+  { index = "pytorch-rocm", group = "rocm", marker = "sys_platform == 'linux'" },
 ]

+pytorch-triton-rocm = [
+  { index = "pytorch-rocm", marker = "sys_platform == 'linux'" },
+]
+
+# docling-jobkit = { git = "https://github.com/docling-project/docling-jobkit/", rev = "main" }
+# docling-jobkit = { path = "../docling-jobkit", editable = true }
+
+[[tool.uv.index]]
+name = "pytorch-pypi"
+url = "https://pypi.org/simple"
+explicit = true
+
 [[tool.uv.index]]
 name = "pytorch-cpu"
 url = "https://download.pytorch.org/whl/cpu"
@@ -117,6 +167,21 @@ name = "pytorch-cu124"
 url = "https://download.pytorch.org/whl/cu124"
 explicit = true

+[[tool.uv.index]]
+name = "pytorch-cu126"
+url = "https://download.pytorch.org/whl/cu126"
+explicit = true
+
+[[tool.uv.index]]
+name = "pytorch-cu128"
+url = "https://download.pytorch.org/whl/cu128"
+explicit = true
+
+[[tool.uv.index]]
+name = "pytorch-rocm"
+url = "https://download.pytorch.org/whl/rocm6.3"
+explicit = true
+
 [tool.setuptools.packages.find]
 include = ["docling_serve*"]
 namespaces = true
@@ -185,7 +250,7 @@ ignore = [
 max-complexity = 15

 [tool.ruff.lint.isort.sections]
-"docling" = ["docling", "docling_core"]
+"docling" = ["docling", "docling_core", "docling_jobkit"]

 [tool.ruff.lint.isort]
 combine-as-imports = true
--- a/tests/test_1-file-all-outputs.py
+++ b/tests/test_1-file-all-outputs.py
@@ -6,17 +6,22 @@ import pytest
 import pytest_asyncio
 from pytest_check import check

+from docling_serve.settings import docling_serve_settings
+

@pytest_asyncio.fixture
 async def async_client():
-    async with httpx.AsyncClient(timeout=60.0) as client:
+    api_key = docling_serve_settings.api_key
+    # Attach the API key to every request only when one is configured.
+    headers = {"X-Api-Key": api_key} if api_key else {}
+    async with httpx.AsyncClient(timeout=60.0, headers=headers) as client:
        yield client


@pytest.mark.asyncio
 async def test_convert_file(async_client):
    """Test convert single file to all outputs"""
-    url = "http://localhost:5001/v1alpha/convert/file"
+    url = "http://localhost:5001/v1/convert/file"
    options = {
        "from_formats": [
            "docx",
@@ -37,7 +42,6 @@ async def test_convert_file(async_client):
        "pdf_backend": "dlparse_v2",
        "table_mode": "fast",
        "abort_on_error": False,
-        "return_as_file": False,
    }

    current_dir = os.path.dirname(__file__)
--- a/tests/test_1-file-async.py
+++ b/tests/test_1-file-async.py
@@ -6,10 +6,15 @@ import httpx
 import pytest
 import pytest_asyncio

+from docling_serve.settings import docling_serve_settings
+

@pytest_asyncio.fixture
 async def async_client():
-    async with httpx.AsyncClient(timeout=60.0) as client:
+    api_key = docling_serve_settings.api_key
+    # Attach the API key to every request only when one is configured.
+    headers = {"X-Api-Key": api_key} if api_key else {}
+    async with httpx.AsyncClient(timeout=60.0, headers=headers) as client:
        yield client


@@ -17,13 +22,12 @@ async def async_client():
 async def test_convert_url(async_client):
    """Test convert URL to all outputs"""

-    base_url = "http://localhost:5001/v1alpha"
+    base_url = "http://localhost:5001/v1"
    payload = {
        "to_formats": ["md", "json", "html"],
        "image_export_mode": "placeholder",
        "ocr": False,
        "abort_on_error": False,
-        "return_as_file": False,
    }

    file_path = Path(__file__).parent / "2206.01062v1.pdf"
--- a/tests/test_1-url-all-outputs.py
+++ b/tests/test_1-url-all-outputs.py
@@ -5,17 +5,22 @@ import pytest
 import pytest_asyncio
 from pytest_check import check

+from docling_serve.settings import docling_serve_settings
+

@pytest_asyncio.fixture
 async def async_client():
-    async with httpx.AsyncClient(timeout=60.0) as client:
+    api_key = docling_serve_settings.api_key
+    # Attach the API key to every request only when one is configured.
+    headers = {"X-Api-Key": api_key} if api_key else {}
+    async with httpx.AsyncClient(timeout=60.0, headers=headers) as client:
        yield client


@pytest.mark.asyncio
 async def test_convert_url(async_client):
    """Test convert URL to all outputs"""
-    url = "http://localhost:5001/v1alpha/convert/source"
+    url = "http://localhost:5001/v1/convert/source"
    payload = {
        "options": {
            "from_formats": [
@@ -37,9 +42,8 @@ async def test_convert_url(async_client):
            "pdf_backend": "dlparse_v2",
            "table_mode": "fast",
            "abort_on_error": False,
-            "return_as_file": False,
        },
-        "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}],
+        "sources": [{"kind": "http", "url": "https://arxiv.org/pdf/2206.01062"}],
    }
    print(json.dumps(payload, indent=2))

--- a/tests/test_1-url-async-ws.py
+++ b/tests/test_1-url-async-ws.py
@@ -6,28 +6,35 @@ import pytest
 import pytest_asyncio
 from websockets.sync.client import connect

+from docling_serve.settings import docling_serve_settings
+

@pytest_asyncio.fixture
 async def async_client():
-    async with httpx.AsyncClient(timeout=60.0) as client:
+    api_key = docling_serve_settings.api_key
+    # Attach the API key to every request only when one is configured.
+    headers = {"X-Api-Key": api_key} if api_key else {}
+    async with httpx.AsyncClient(timeout=60.0, headers=headers) as client:
        yield client


@pytest.mark.asyncio
 async def test_convert_url(async_client: httpx.AsyncClient):
    """Test convert URL to all outputs"""
+    headers = {}
+    if docling_serve_settings.api_key:
+        headers["X-Api-Key"] = docling_serve_settings.api_key

    doc_filename = Path("tests/2408.09869v5.pdf")
    encoded_doc = base64.b64encode(doc_filename.read_bytes()).decode()

-    base_url = "http://localhost:5001/v1alpha"
+    base_url = "http://localhost:5001/v1"
    payload = {
        "options": {
            "to_formats": ["md", "json"],
            "image_export_mode": "placeholder",
            "ocr": True,
            "abort_on_error": False,
-            "return_as_file": False,
            # "do_picture_description": True,
            # "picture_description_api": {
            #     "url": "http://localhost:11434/v1/chat/completions",
@@ -39,8 +46,14 @@ async def test_convert_url(async_client: httpx.AsyncClient):
            #     "repo_id": "HuggingFaceTB/SmolVLM-256M-Instruct",
            # },
        },
-        # "http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}],
-        "file_sources": [{"base64_string": encoded_doc, "filename": doc_filename.name}],
+        # "sources": [{"kind": "http", "url": "https://arxiv.org/pdf/2501.17887"}],
+        "sources": [
+            {
+                "kind": "file",
+                "base64_string": encoded_doc,
+                "filename": doc_filename.name,
+            }
+        ],
    }
    # print(json.dumps(payload, indent=2))

@@ -52,7 +65,7 @@ async def test_convert_url(async_client: httpx.AsyncClient):

    task = response.json()

-    uri = f"ws://localhost:5001/v1alpha/status/ws/{task['task_id']}"
+    uri = f"ws://localhost:5001/v1/status/ws/{task['task_id']}?api_key={docling_serve_settings.api_key}"
    with connect(uri) as websocket:
        for message in websocket:
            print(message)
--- a/tests/test_1-url-async.py
+++ b/tests/test_1-url-async.py
@@ -6,10 +6,15 @@ import httpx
 import pytest
 import pytest_asyncio

+from docling_serve.settings import docling_serve_settings
+

@pytest_asyncio.fixture
 async def async_client():
-    async with httpx.AsyncClient(timeout=60.0) as client:
+    api_key = docling_serve_settings.api_key
+    # Attach the API key to every request only when one is configured.
+    headers = {"X-Api-Key": api_key} if api_key else {}
+    async with httpx.AsyncClient(timeout=60.0, headers=headers) as client:
        yield client


@@ -25,16 +30,15 @@ async def test_convert_url(async_client):
        "https://arxiv.org/pdf/2311.18481",
    ]

-    base_url = "http://localhost:5001/v1alpha"
+    base_url = "http://localhost:5001/v1"
    payload = {
        "options": {
            "to_formats": ["md", "json"],
            "image_export_mode": "placeholder",
            "ocr": True,
            "abort_on_error": False,
-            "return_as_file": False,
        },
-        "http_sources": [{"url": random.choice(example_docs)}],
+        "sources": [{"kind": "http", "url": random.choice(example_docs)}],
    }
    print(json.dumps(payload, indent=2))

--- a/tests/test_2-files-all-outputs.py
+++ b/tests/test_2-files-all-outputs.py
@@ -5,17 +5,22 @@ import pytest
 import pytest_asyncio
 from pytest_check import check

+from docling_serve.settings import docling_serve_settings
+

@pytest_asyncio.fixture
 async def async_client():
-    async with httpx.AsyncClient(timeout=60.0) as client:
+    api_key = docling_serve_settings.api_key
+    # Attach the API key to every request only when one is configured.
+    headers = {"X-Api-Key": api_key} if api_key else {}
+    async with httpx.AsyncClient(timeout=60.0, headers=headers) as client:
        yield client


@pytest.mark.asyncio
 async def test_convert_file(async_client):
    """Test convert single file to all outputs"""
-    url = "http://localhost:5001/v1alpha/convert/file"
+    url = "http://localhost:5001/v1/convert/file"
    options = {
        "from_formats": [
            "docx",
@@ -36,7 +41,6 @@ async def test_convert_file(async_client):
        "pdf_backend": "dlparse_v2",
        "table_mode": "fast",
        "abort_on_error": False,
-        "return_as_file": False,
    }

    current_dir = os.path.dirname(__file__)
--- a/tests/test_2-urls-all-outputs.py
+++ b/tests/test_2-urls-all-outputs.py
@@ -3,17 +3,22 @@ import pytest
 import pytest_asyncio
 from pytest_check import check

+from docling_serve.settings import docling_serve_settings
+

@pytest_asyncio.fixture
 async def async_client():
-    async with httpx.AsyncClient(timeout=60.0) as client:
+    api_key = docling_serve_settings.api_key
+    # Attach the API key to every request only when one is configured.
+    headers = {"X-Api-Key": api_key} if api_key else {}
+    async with httpx.AsyncClient(timeout=60.0, headers=headers) as client:
        yield client


@pytest.mark.asyncio
 async def test_convert_url(async_client):
    """Test convert URL to all outputs"""
-    url = "http://localhost:5001/v1alpha/convert/source"
+    url = "http://localhost:5001/v1/convert/source"
    payload = {
        "options": {
            "from_formats": [
@@ -35,12 +40,12 @@ async def test_convert_url(async_client):
            "pdf_backend": "dlparse_v2",
            "table_mode": "fast",
            "abort_on_error": False,
-            "return_as_file": False,
        },
-        "http_sources": [
-            {"url": "https://arxiv.org/pdf/2206.01062"},
-            {"url": "https://arxiv.org/pdf/2408.09869"},
+        "sources": [
+            {"kind": "http", "url": "https://arxiv.org/pdf/2206.01062"},
+            {"kind": "http", "url": "https://arxiv.org/pdf/2408.09869"},
        ],
+        "target": {"kind": "zip"},
    }

    response = await async_client.post(url, json=payload)
--- a/tests/test_2-urls-async-all-outputs.py
+++ b/tests/test_2-urls-async-all-outputs.py
@@ -6,17 +6,22 @@ import pytest
 import pytest_asyncio
 from pytest_check import check

+from docling_serve.settings import docling_serve_settings
+

@pytest_asyncio.fixture
 async def async_client():
-    async with httpx.AsyncClient(timeout=60.0) as client:
+    api_key = docling_serve_settings.api_key
+    # Attach the API key to every request only when one is configured.
+    headers = {"X-Api-Key": api_key} if api_key else {}
+    async with httpx.AsyncClient(timeout=60.0, headers=headers) as client:
        yield client


@pytest.mark.asyncio
 async def test_convert_url(async_client):
    """Test convert URL to all outputs"""
-    base_url = "http://localhost:5001/v1alpha"
+    base_url = "http://localhost:5001/v1"
    payload = {
        "options": {
            "from_formats": [
@@ -38,12 +43,12 @@ async def test_convert_url(async_client):
            "pdf_backend": "dlparse_v2",
            "table_mode": "fast",
            "abort_on_error": False,
-            "return_as_file": False,
        },
-        "http_sources": [
-            {"url": "https://arxiv.org/pdf/2206.01062"},
-            {"url": "https://arxiv.org/pdf/2408.09869"},
+        "sources": [
+            {"kind": "http", "url": "https://arxiv.org/pdf/2206.01062"},
+            {"kind": "http", "url": "https://arxiv.org/pdf/2408.09869"},
        ],
+        "target": {"kind": "zip"},
    }

    response = await async_client.post(f"{base_url}/convert/source/async", json=payload)
--- a/tests/test_fastapi_endpoints.py
+++ b/tests/test_fastapi_endpoints.py
@@ -1,6 +1,8 @@
 import asyncio
+import io
 import json
 import os
+import zipfile

 import pytest
 import pytest_asyncio
@@ -8,7 +10,10 @@ from asgi_lifespan import LifespanManager
 from httpx import ASGITransport, AsyncClient
 from pytest_check import check

+from docling_core.types.doc import DoclingDocument, PictureItem
+
 from docling_serve.app import create_app
+from docling_serve.settings import docling_serve_settings


@pytest.fixture(scope="session")
@@ -16,6 +21,14 @@ def event_loop():
    return asyncio.get_event_loop()


+@pytest.fixture(scope="session")
+def auth_headers():
+    """Headers for authenticated requests; empty when no API key is configured."""
+    api_key = docling_serve_settings.api_key
+    # A falsy key (unset or empty) means no X-Api-Key header is sent at all.
+    return {"X-Api-Key": api_key} if api_key else {}
+
+
@pytest_asyncio.fixture(scope="session")
 async def app():
    app = create_app()
@@ -42,10 +55,10 @@ async def test_health(client: AsyncClient):


@pytest.mark.asyncio
-async def test_convert_file(client: AsyncClient):
+async def test_convert_file(client: AsyncClient, auth_headers: dict):
    """Test convert single file to all outputs"""

-    endpoint = "/v1alpha/convert/file"
+    endpoint = "/v1/convert/file"
    options = {
        "from_formats": [
            "docx",
@@ -66,7 +79,6 @@ async def test_convert_file(client: AsyncClient):
        "pdf_backend": "dlparse_v2",
        "table_mode": "fast",
        "abort_on_error": False,
-        "return_as_file": False,
    }

    current_dir = os.path.dirname(__file__)
@@ -76,7 +88,9 @@ async def test_convert_file(client: AsyncClient):
        "files": ("2206.01062v1.pdf", open(file_path, "rb"), "application/pdf"),
    }

-    response = await client.post(endpoint, files=files, data=options)
+    response = await client.post(
+        endpoint, files=files, data=options, headers=auth_headers
+    )
    assert response.status_code == 200, "Response should be 200 OK"

    data = response.json()
@@ -154,3 +168,39 @@ async def test_convert_file(client: AsyncClient):
            data["document"]["doctags_content"],
            msg=f"DocTags document should contain '<doctag><page_header>'. Received: {safe_slice(data['document']['doctags_content'])}",
        )
+
+
+@pytest.mark.asyncio
+async def test_referenced_artifacts(client: AsyncClient, auth_headers: dict):
+    """Test that paths in the zip file are relative to the zip file root."""
+
+    endpoint = "/v1/convert/file"
+    options = {
+        "to_formats": ["json"],
+        "image_export_mode": "referenced",
+        "target_type": "zip",
+        "ocr": False,
+    }
+
+    file_path = os.path.join(os.path.dirname(__file__), "2206.01062v1.pdf")
+
+    # Keep the upload inside a context manager so the PDF handle is closed
+    # even when the request raises.
+    with open(file_path, "rb") as pdf_file:
+        files = {"files": ("2206.01062v1.pdf", pdf_file, "application/pdf")}
+        response = await client.post(
+            endpoint, files=files, data=options, headers=auth_headers
+        )
+    assert response.status_code == 200, "Response should be 200 OK"
+
+    # Every picture URI in each JSON document must name a member of the zip.
+    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
+        namelist = zip_file.namelist()
+        for file in namelist:
+            if file.endswith(".json"):
+                doc = DoclingDocument.model_validate(json.loads(zip_file.read(file)))
+                for item, _level in doc.iterate_items():
+                    if isinstance(item, PictureItem):
+                        assert item.image is not None
+                        print(f"{item.image.uri=}")
+                        assert str(item.image.uri) in namelist
--- a/tests/test_file_opts.py
+++ b/tests/test_file_opts.py
@@ -11,6 +11,7 @@ from docling_core.types import DoclingDocument
 from docling_core.types.doc.document import PictureDescriptionData

 from docling_serve.app import create_app
+from docling_serve.settings import docling_serve_settings


@pytest.fixture(scope="session")
@@ -18,6 +19,14 @@ def event_loop():
    return asyncio.get_event_loop()


+@pytest.fixture(scope="session")
+def auth_headers():
+    """Headers for authenticated requests; empty when no API key is configured."""
+    api_key = docling_serve_settings.api_key
+    # A falsy key (unset or empty) means no X-Api-Key header is sent at all.
+    return {"X-Api-Key": api_key} if api_key else {}
+
+
@pytest_asyncio.fixture(scope="session")
 async def app():
    app = create_app()
@@ -37,10 +46,10 @@ async def client(app):


@pytest.mark.asyncio
-async def test_convert_file(client: AsyncClient):
+async def test_convert_file(client: AsyncClient, auth_headers: dict):
    """Test convert single file to all outputs"""

-    endpoint = "/v1alpha/convert/file"
+    endpoint = "/v1/convert/file"
    options = {
        "to_formats": ["md", "json"],
        "image_export_mode": "placeholder",
@@ -63,7 +72,9 @@ async def test_convert_file(client: AsyncClient):
        "files": ("2206.01062v1.pdf", open(file_path, "rb"), "application/pdf"),
    }

-    response = await client.post(endpoint, files=files, data=options)
+    response = await client.post(
+        endpoint, files=files, data=options, headers=auth_headers
+    )
    assert response.status_code == 200, "Response should be 200 OK"

    data = response.json()
--- a/tests/test_options_serialization.py
+++ b/tests/test_options_serialization.py
@@ -1,54 +0,0 @@
-from docling_serve.datamodel.convert import (
-    ConvertDocumentsOptions,
-    PictureDescriptionApi,
-)
-from docling_serve.docling_conversion import (
-    _hash_pdf_format_option,
-    get_pdf_pipeline_opts,
-)
-
-
-def test_options_cache_key():
-    hashes = set()
-
-    opts = ConvertDocumentsOptions()
-    pipeline_opts = get_pdf_pipeline_opts(opts)
-    hash = _hash_pdf_format_option(pipeline_opts)
-    assert hash not in hashes
-    hashes.add(hash)
-
-    opts.do_picture_description = True
-    pipeline_opts = get_pdf_pipeline_opts(opts)
-    hash = _hash_pdf_format_option(pipeline_opts)
-    # pprint(pipeline_opts.pipeline_options.model_dump(serialize_as_any=True))
-    assert hash not in hashes
-    hashes.add(hash)
-
-    opts.picture_description_api = PictureDescriptionApi(
-        url="http://localhost",
-        params={"model": "mymodel"},
-        prompt="Hello 1",
-    )
-    pipeline_opts = get_pdf_pipeline_opts(opts)
-    hash = _hash_pdf_format_option(pipeline_opts)
-    # pprint(pipeline_opts.pipeline_options.model_dump(serialize_as_any=True))
-    assert hash not in hashes
-    hashes.add(hash)
-
-    opts.picture_description_api = PictureDescriptionApi(
-        url="http://localhost",
-        params={"model": "your-model"},
-        prompt="Hello 1",
-    )
-    pipeline_opts = get_pdf_pipeline_opts(opts)
-    hash = _hash_pdf_format_option(pipeline_opts)
-    # pprint(pipeline_opts.pipeline_options.model_dump(serialize_as_any=True))
-    assert hash not in hashes
-    hashes.add(hash)
-
-    opts.picture_description_api.prompt = "World"
-    pipeline_opts = get_pdf_pipeline_opts(opts)
-    hash = _hash_pdf_format_option(pipeline_opts)
-    # pprint(pipeline_opts.pipeline_options.model_dump(serialize_as_any=True))
-    assert hash not in hashes
-    hashes.add(hash)
--- a/tests/test_results_clear.py
+++ b/tests/test_results_clear.py
@@ -17,6 +17,14 @@ def event_loop():
    return asyncio.get_event_loop()


+@pytest.fixture(scope="session")
+def auth_headers():
+    """Headers for authenticated requests; empty when no API key is configured."""
+    api_key = docling_serve_settings.api_key
+    # A falsy key (unset or empty) means no X-Api-Key header is sent at all.
+    return {"X-Api-Key": api_key} if api_key else {}
+
+
@pytest_asyncio.fixture(scope="session")
 async def app():
    app = create_app()
@@ -35,7 +43,7 @@ async def client(app):
        yield client


-async def convert_file(client: AsyncClient):
+async def convert_file(client: AsyncClient, auth_headers: dict):
    doc_filename = Path("tests/2408.09869v5.pdf")
    encoded_doc = base64.b64encode(doc_filename.read_bytes()).decode()

@@ -43,10 +51,18 @@ async def convert_file(client: AsyncClient):
        "options": {
            "to_formats": ["json"],
        },
-        "file_sources": [{"base64_string": encoded_doc, "filename": doc_filename.name}],
+        "sources": [
+            {
+                "kind": "file",
+                "base64_string": encoded_doc,
+                "filename": doc_filename.name,
+            }
+        ],
    }

-    response = await client.post("/v1alpha/convert/source/async", json=payload)
+    response = await client.post(
+        "/v1/convert/source/async", json=payload, headers=auth_headers
+    )
    assert response.status_code == 200, "Response should be 200 OK"

    task = response.json()
@@ -54,7 +70,9 @@ async def convert_file(client: AsyncClient):
    print(json.dumps(task, indent=2))

    while task["task_status"] not in ("success", "failure"):
-        response = await client.get(f"/v1alpha/status/poll/{task['task_id']}")
+        response = await client.get(
+            f"/v1/status/poll/{task['task_id']}", headers=auth_headers
+        )
        assert response.status_code == 200, "Response should be 200 OK"
        task = response.json()
        print(f"{task['task_status']=}")
@@ -68,52 +86,62 @@ async def convert_file(client: AsyncClient):


@pytest.mark.asyncio
-async def test_clear_results(client: AsyncClient):
+async def test_clear_results(client: AsyncClient, auth_headers: dict):
    """Test removal of task."""

    # Set long delay deletion
    docling_serve_settings.result_removal_delay = 100

    # Convert and wait for completion
-    task = await convert_file(client)
+    task = await convert_file(client, auth_headers=auth_headers)

    # Get result once
-    result_response = await client.get(f"/v1alpha/result/{task['task_id']}")
+    result_response = await client.get(
+        f"/v1/result/{task['task_id']}", headers=auth_headers
+    )
    assert result_response.status_code == 200, "Response should be 200 OK"
    print("Result 1 ok.")
    result = result_response.json()
    assert result["document"]["json_content"]["schema_name"] == "DoclingDocument"

    # Get result twice
-    result_response = await client.get(f"/v1alpha/result/{task['task_id']}")
+    result_response = await client.get(
+        f"/v1/result/{task['task_id']}", headers=auth_headers
+    )
    assert result_response.status_code == 200, "Response should be 200 OK"
    print("Result 2 ok.")
    result = result_response.json()
    assert result["document"]["json_content"]["schema_name"] == "DoclingDocument"

    # Clear
-    clear_response = await client.get("/v1alpha/clear/results?older_then=0")
+    clear_response = await client.get(
+        "/v1/clear/results?older_then=0", headers=auth_headers
+    )
    assert clear_response.status_code == 200, "Response should be 200 OK"
    print("Clear ok.")

    # Get deleted result
-    result_response = await client.get(f"/v1alpha/result/{task['task_id']}")
+    result_response = await client.get(
+        f"/v1/result/{task['task_id']}", headers=auth_headers
+    )
    assert result_response.status_code == 404, "Response should be removed"
    print("Result was no longer found.")


@pytest.mark.asyncio
-async def test_delay_remove(client: AsyncClient):
+async def test_delay_remove(client: AsyncClient, auth_headers: dict):
    """Test automatic removal of task with delay."""

    # Set short delay deletion
    docling_serve_settings.result_removal_delay = 5

    # Convert and wait for completion
-    task = await convert_file(client)
+    task = await convert_file(client, auth_headers=auth_headers)

    # Get result once
-    result_response = await client.get(f"/v1alpha/result/{task['task_id']}")
+    result_response = await client.get(
+        f"/v1/result/{task['task_id']}", headers=auth_headers
+    )
    assert result_response.status_code == 200, "Response should be 200 OK"
    print("Result ok.")
    result = result_response.json()
@@ -123,5 +151,7 @@ async def test_delay_remove(client: AsyncClient):
    await asyncio.sleep(10)

    # Get deleted result
-    result_response = await client.get(f"/v1alpha/result/{task['task_id']}")
+    result_response = await client.get(
+        f"/v1/result/{task['task_id']}", headers=auth_headers
+    )
    assert result_response.status_code == 404, "Response should be removed"
--- a/uv.lock
+++ b/uv.lock
Author	SHA1	Message	Date
github-actions[bot]	40c7f1bcd3	chore: bump version to 1.4.0 [skip ci]	2025-09-05 17:57:08 +00:00
Michele Dolfi	d64a2a974a	feat(docling): perfomance improvements in parsing, new layout model, fixes in html processing (#352 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-09-05 16:21:29 +02:00
Tiago Santana	0d4545a65a	docs: add split processing example (#303 ) Signed-off-by: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>	2025-09-04 10:42:11 +02:00
Rui Dias Gomes	fe98338239	ci: fix runner disk space issue (#350 ) Signed-off-by: Rui Dias Gomes <66125272+rmdg88@users.noreply.github.com>	2025-09-04 09:17:19 +02:00
Michele Dolfi	b844ce737e	ci: remove mdlint (#348 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-09-03 15:42:55 +02:00
Antonio Pisano	27fdd7b85a	docs: document DOCLING_NUM_THREADS environment variable (#341 ) Signed-off-by: Antonio Pisano <antonio.pisano@wu.ac.at> Co-authored-by: Antonio Pisano <antonio.pisano@wu.ac.at>	2025-09-03 11:00:28 +02:00
Rui Dias Gomes	1df62adf01	ci: workflow improvements (#310 ) Signed-off-by: rmdg88 <rmdg88@gmail.com> Signed-off-by: Rui Dias Gomes <66125272+rmdg88@users.noreply.github.com>	2025-09-03 10:06:30 +02:00
Michele Dolfi	e5449472b2	fix: upgrade to latest docling version with fixes (#335 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-08-25 10:55:43 +02:00
Michele Dolfi	81f0a8ddf8	docs: fix parameters typo (#333 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-08-22 14:59:12 +02:00
Michele Dolfi	a69cc867f5	docs: Describe how to use Docling MCP (#332 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-08-22 14:56:08 +02:00
github-actions[bot]	624f65d41b	chore: bump version to 1.3.1 [skip ci]	2025-08-21 07:01:51 +00:00
Michele Dolfi	f02dbc0144	fix: configuration and performance fixes via upgrade of packages (#328 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-08-20 20:40:52 +02:00
Michele Dolfi	37fe02277b	docs: fix parameter in api key docs (#323 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-08-15 11:00:05 +02:00
github-actions[bot]	783ada0580	chore: bump version to 1.3.0 [skip ci]	2025-08-14 14:26:57 +00:00
VIktor Kuropiantnyk	71edf41849	docs: example of docling-serve deployment in the RQ engine mode (#321 ) Signed-off-by: Viktor Kuropiatnyk <vku@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>	2025-08-14 16:10:39 +02:00
Michele Dolfi	9a64410552	feat: Add configuration option for apikey security (#322 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-08-14 15:25:53 +02:00
Michele Dolfi	6e9aa8c759	docs: handling models in docling-serve (#319 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-08-14 09:12:04 +02:00
Michele Dolfi	885f319d3a	feat: Add RQ engine (#315 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-08-14 08:48:31 +02:00
Tiago Santana	d584895e11	docs: add Gradio cache usage (#312 ) Signed-off-by: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com>	2025-08-13 16:49:54 +02:00
github-actions[bot]	d26e6637d8	chore: bump version to 1.2.2 [skip ci]	2025-08-13 14:48:17 +00:00
VIktor Kuropiantnyk	7692eb2600	fix: update of transformers module to 4.55.1 (#316 ) Signed-off-by: Viktor Kuropiatnyk <vku@zurich.ibm.com>	2025-08-13 16:07:52 +02:00
github-actions[bot]	3bd7828570	chore: bump version to 1.2.1 [skip ci]	2025-08-13 07:37:55 +00:00
Michele Dolfi	8b470cba8e	fix: handling of vlm model options and update deps (#314 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-08-13 09:32:21 +02:00
Tiago Santana	8048f4589a	fix: add missing response type in sync endpoints (#309 ) Signed-off-by: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com>	2025-08-08 12:32:19 +02:00
Thomas Vitale	b3058e91e0	docs: Update readme to use v1 (#306 ) Signed-off-by: Thomas Vitale <ThomasVitale@users.noreply.github.com>	2025-08-08 09:02:29 +02:00
Thomas Vitale	63da9eedeb	docs: Update deployment examples to use v1 API (#308 ) Signed-off-by: Thomas Vitale <ThomasVitale@users.noreply.github.com>	2025-08-08 08:47:59 +02:00
Thomas Vitale	b15dc2529f	docs: Fix typo in v1 migration instructions (#307 ) Signed-off-by: Thomas Vitale <ThomasVitale@users.noreply.github.com>	2025-08-08 08:44:09 +02:00
github-actions[bot]	4c7207be00	chore: bump version to 1.2.0 [skip ci]	2025-08-07 09:20:10 +00:00
Michele Dolfi	db3fdb5bc1	feat: workers without shared models and convert params (#304 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-08-07 11:16:06 +02:00
Rui Dias Gomes	fd1b987e8d	feat: add rocm image build support and fix cuda (#292 ) Signed-off-by: rmdg88 <rmdg88@gmail.com> Signed-off-by: Rui-Dias-Gomes <rui.dias.gomes@ibm.com> Co-authored-by: Rui-Dias-Gomes <rui.dias.gomes@ibm.com>	2025-07-31 14:22:42 +02:00
github-actions[bot]	ce15e0302b	chore: bump version to 1.1.0 [skip ci]	2025-07-30 15:53:01 +00:00
Michele Dolfi	ecb1874a50	feat: Add docling-mcp in the distribution (#290 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-07-30 15:39:11 +02:00
Michele Dolfi	1333f71c9c	fix: referenced paths relative to zip root (#289 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-07-30 14:49:26 +02:00
Tiago Santana	ec594d84fe	feat: add 3.0 openapi endpoint (#287 ) Signed-off-by: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com>	2025-07-30 14:08:59 +02:00
Tiago Santana	3771c1b554	feat: add new source and target (#270 ) Signed-off-by: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com>	2025-07-29 14:44:49 +02:00
github-actions[bot]	24db461b14	chore: bump version to 1.0.1 [skip ci]	2025-07-21 07:34:14 +00:00
Michele Dolfi	8706706e87	fix: docling update v2.42.0 (#277 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-07-21 08:47:40 +02:00
Michele Dolfi	766adb2481	docs: typo in README (#276 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-07-18 14:37:54 +02:00
Michele Dolfi	8222cf8955	ci: add spellchecker with custom vocabulary and fix typos (#268 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-07-15 14:17:35 +02:00
github-actions[bot]	b922824e5b	chore: bump version to 1.0.0 [skip ci]	2025-07-14 11:25:06 +00:00
Michele Dolfi	56e328baf7	feat!: v1 api with list of sources and target (#249 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-07-14 13:19:49 +02:00
Michele Dolfi	daa924a77e	feat!: use orchestrators from jobkit (#248 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-07-10 15:47:22 +02:00
Eugene	e63197e89e	chore: bump uv to 0.7.19 in container (#266 ) Signed-off-by: Eugene <fogaprod@gmail.com>	2025-07-10 15:10:21 +02:00
github-actions[bot]	767ce0982b	chore: bump version to 0.16.1 [skip ci]	2025-07-07 16:17:50 +00:00
Michele Dolfi	bfde1a0991	fix: upgrade deps including, docling v2.40.0 with locks in models init (#264 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-07-07 17:13:45 +02:00
VIktor Kuropiantnyk	eb3892ee14	fix: missing tesseract osd (#263 ) Signed-off-by: Viktor Kuropiatnyk <vku@zurich.ibm.com>	2025-07-07 16:36:43 +02:00
tassadarliu	93b84712b2	docs: fix typo (#259 ) Signed-off-by: tassadarliu <rhapsodyn@gmail.com>	2025-07-07 08:47:34 +02:00
Yishen Miao	c45b937064	docs: change the doc example (#258 ) Signed-off-by: Yishen Miao <mys721tx@gmail.com>	2025-07-07 08:47:21 +02:00
Francisco Arceo	50e431f30f	docs: Update typo (#247 ) Signed-off-by: Francisco Arceo <arceofrancisco@gmail.com>	2025-06-27 16:58:37 +02:00
Michele Dolfi	149a8cb1c0	fix: properly load models at boot (#244 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-06-27 12:20:38 +02:00
github-actions[bot]	5f9c20a985	chore: bump version to 0.16.0 [skip ci]	2025-06-25 09:52:08 +00:00
Michele Dolfi	80755a7d59	docs: Update example resources and improve README (#231 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-06-25 07:56:14 +02:00
Michele Dolfi	30aca92298	feat: package updates and more cuda images (#229 ) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>	2025-06-24 16:59:05 +02:00