mirror of
https://github.com/docling-project/docling-serve.git
synced 2025-11-29 08:33:50 +00:00
Compare commits
50 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ff75bab21b | ||
|
|
7a0fabae07 | ||
|
|
9ffe49a359 | ||
|
|
68772bb6f0 | ||
|
|
20ec87a63a | ||
|
|
e30f458923 | ||
|
|
03e405638f | ||
|
|
fd8e40a008 | ||
|
|
422c402bab | ||
|
|
ea090288d3 | ||
|
|
07c48edd5d | ||
|
|
a212547d28 | ||
|
|
c76daac70c | ||
|
|
7994b19b9f | ||
|
|
ec57b528ed | ||
|
|
b92c5d8899 | ||
|
|
3c9825df30 | ||
|
|
8dd0e216fd | ||
|
|
d406802f9d | ||
|
|
a92ad48b28 | ||
|
|
da2b26099d | ||
|
|
98b46eda50 | ||
|
|
7e75919ae8 | ||
|
|
c95db36438 | ||
|
|
82f8900197 | ||
|
|
ed851c95fe | ||
|
|
05df0735d3 | ||
|
|
cad1053e36 | ||
|
|
7e6d9cdef3 | ||
|
|
343b985287 | ||
|
|
c430d9b1a1 | ||
|
|
63141f1cc7 | ||
|
|
d5557fad9f | ||
|
|
36967f7f61 | ||
|
|
3b54d9b6ef | ||
|
|
4877248368 | ||
|
|
ec33a61faa | ||
|
|
663e03303a | ||
|
|
c64a450bf9 | ||
|
|
ae3b4906f1 | ||
|
|
7a351fcdea | ||
|
|
1615f977a2 | ||
|
|
1bf487b18e | ||
|
|
be7e4162af | ||
|
|
de42baf8dc | ||
|
|
4da28565a7 | ||
|
|
2a78142b96 | ||
|
|
d0e8578931 | ||
|
|
c6539c42de | ||
|
|
ddf3144512 |
40
.dockerignore
Normal file
40
.dockerignore
Normal file
@@ -0,0 +1,40 @@
|
||||
# Ignore Python cache files
|
||||
__pycache__/
|
||||
**/__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
|
||||
# Ignore virtual environments
|
||||
env/
|
||||
venv/
|
||||
|
||||
# Ignore development artifacts
|
||||
*.log
|
||||
*.db
|
||||
*.sqlite3
|
||||
|
||||
# Ignore configuration and sensitive files
|
||||
**/.env
|
||||
*.env
|
||||
*.ini
|
||||
*.cfg
|
||||
|
||||
# Ignore IDE and editor settings
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
# Ignore Git files
|
||||
.git/
|
||||
.gitignore
|
||||
|
||||
# Ignore Docker files themselves (optional if not needed in the image)
|
||||
.dockerignore
|
||||
Dockerfile*
|
||||
|
||||
# Ignore build artifacts (if applicable)
|
||||
build/
|
||||
dist/
|
||||
*.egg-info
|
||||
3
.env.example
Normal file
3
.env.example
Normal file
@@ -0,0 +1,3 @@
|
||||
TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
||||
UVICORN_WORKERS=2
|
||||
UVICORN_RELOAD=True
|
||||
7
.flake8
7
.flake8
@@ -1,7 +0,0 @@
|
||||
[flake8]
|
||||
max-line-length = 88
|
||||
exclude = test/*
|
||||
max-complexity = 18
|
||||
docstring-convention = google
|
||||
ignore = W503,E203
|
||||
classmethod-decorators = classmethod,validator
|
||||
12
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
12
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
<!-- Thank you for contributing to Docling! -->
|
||||
|
||||
<!-- STEPS TO FOLLOW:
|
||||
1. Add a description of the changes (frequently the same as the commit description)
|
||||
2. Enter the issue number next to "Resolves #" below (if there is no tracking issue resolved, **remove that section**)
|
||||
3. Make sure the PR title follows the **Commit Message Formatting**: https://www.conventionalcommits.org/en/v1.0.0/#summary.
|
||||
-->
|
||||
|
||||
<!-- Uncomment this section with the issue number if an issue is being resolved
|
||||
**Issue resolved by this Pull Request:**
|
||||
Resolves #
|
||||
--->
|
||||
23
.github/SECURITY.md
vendored
Normal file
23
.github/SECURITY.md
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
# Security and Disclosure Information Policy for the Docling Project
|
||||
|
||||
The Docling team and community take security bugs seriously. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions.
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
If you think you've identified a security issue in an Docling project repository, please DO NOT report the issue publicly via the GitHub issue tracker, etc.
|
||||
|
||||
Instead, send an email with as many details as possible to [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com). This is a private mailing list for the maintainers team.
|
||||
|
||||
Please do not create a public issue.
|
||||
|
||||
## Security Vulnerability Response
|
||||
|
||||
Each report is acknowledged and analyzed by the core maintainers within 3 working days.
|
||||
|
||||
Any vulnerability information shared with core maintainers stays within the Docling project and will not be disseminated to other projects unless it is necessary to get the issue fixed.
|
||||
|
||||
After the initial reply to your report, the security team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance.
|
||||
|
||||
## Security Alerts
|
||||
|
||||
We will send announcements of security vulnerabilities and steps to remediate on the [Docling announcements](https://github.com/docling-project/docling/discussions/categories/announcements).
|
||||
19
.github/actions/setup-poetry/action.yml
vendored
19
.github/actions/setup-poetry/action.yml
vendored
@@ -1,19 +0,0 @@
|
||||
name: 'Set up Poetry and install'
|
||||
description: 'Set up a specific version of Poetry and install dependencies using caching.'
|
||||
inputs:
|
||||
python-version:
|
||||
description: "Version range or exact version of Python or PyPy to use, using SemVer's version range syntax."
|
||||
default: '3.11'
|
||||
runs:
|
||||
using: 'composite'
|
||||
steps:
|
||||
- name: Install poetry
|
||||
run: pipx install poetry==1.8.3
|
||||
shell: bash
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ inputs.python-version }}
|
||||
cache: 'poetry'
|
||||
- name: Install dependencies
|
||||
run: poetry install --all-extras
|
||||
shell: bash
|
||||
9
.github/mergify.yml
vendored
Normal file
9
.github/mergify.yml
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
merge_protections:
|
||||
- name: Enforce conventional commit
|
||||
description: Make sure that we follow https://www.conventionalcommits.org/en/v1.0.0/
|
||||
if:
|
||||
- base = main
|
||||
success_conditions:
|
||||
- "title ~=
|
||||
^(fix|feat|docs|style|refactor|perf|test|build|ci|chore|revert)(?:\\(.+\
|
||||
\\))?(!)?:"
|
||||
40
.github/scripts/release.sh
vendored
Executable file
40
.github/scripts/release.sh
vendored
Executable file
@@ -0,0 +1,40 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e # trigger failure on error - do not remove!
|
||||
set -x # display command on output
|
||||
|
||||
if [ -z "${TARGET_VERSION}" ]; then
|
||||
>&2 echo "No TARGET_VERSION specified"
|
||||
exit 1
|
||||
fi
|
||||
CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}"
|
||||
|
||||
# update package version
|
||||
uvx --from=toml-cli toml set --toml-path=pyproject.toml project.version "${TARGET_VERSION}"
|
||||
uv lock --upgrade-package docling-serve
|
||||
|
||||
# collect release notes
|
||||
REL_NOTES=$(mktemp)
|
||||
uv run --no-sync semantic-release changelog --unreleased >> "${REL_NOTES}"
|
||||
|
||||
# update changelog
|
||||
TMP_CHGLOG=$(mktemp)
|
||||
TARGET_TAG_NAME="v${TARGET_VERSION}"
|
||||
RELEASE_URL="$(gh repo view --json url -q ".url")/releases/tag/${TARGET_TAG_NAME}"
|
||||
printf "## [${TARGET_TAG_NAME}](${RELEASE_URL}) - $(date -Idate)\n\n" >> "${TMP_CHGLOG}"
|
||||
cat "${REL_NOTES}" >> "${TMP_CHGLOG}"
|
||||
if [ -f "${CHGLOG_FILE}" ]; then
|
||||
printf "\n" | cat - "${CHGLOG_FILE}" >> "${TMP_CHGLOG}"
|
||||
fi
|
||||
mv "${TMP_CHGLOG}" "${CHGLOG_FILE}"
|
||||
|
||||
# push changes
|
||||
git config --global user.name 'github-actions[bot]'
|
||||
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
|
||||
git add pyproject.toml uv.lock "${CHGLOG_FILE}"
|
||||
COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]"
|
||||
git commit -m "${COMMIT_MSG}"
|
||||
git push origin main
|
||||
|
||||
# create GitHub release (incl. Git tag)
|
||||
gh release create "${TARGET_TAG_NAME}" -F "${REL_NOTES}"
|
||||
59
.github/workflows/cd.yml
vendored
Normal file
59
.github/workflows/cd.yml
vendored
Normal file
@@ -0,0 +1,59 @@
|
||||
name: "Run CD"
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
code-checks:
|
||||
uses: ./.github/workflows/job-checks.yml
|
||||
pre-release-check:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
TARGET_TAG_V: ${{ steps.version_check.outputs.TRGT_VERSION }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0 # for fetching tags, required for semantic-release
|
||||
- name: Install uv and set the python version
|
||||
uses: astral-sh/setup-uv@v5
|
||||
with:
|
||||
enable-cache: true
|
||||
- name: Install dependencies
|
||||
run: uv sync --only-dev
|
||||
- name: Check version of potential release
|
||||
id: version_check
|
||||
run: |
|
||||
TRGT_VERSION=$(uv run --no-sync semantic-release print-version)
|
||||
echo "TRGT_VERSION=${TRGT_VERSION}" >> "$GITHUB_OUTPUT"
|
||||
echo "${TRGT_VERSION}"
|
||||
- name: Check notes of potential release
|
||||
run: uv run --no-sync semantic-release changelog --unreleased
|
||||
release:
|
||||
needs: [code-checks, pre-release-check]
|
||||
if: needs.pre-release-check.outputs.TARGET_TAG_V != ''
|
||||
environment: auto-release
|
||||
runs-on: ubuntu-latest
|
||||
concurrency: release
|
||||
steps:
|
||||
- uses: actions/create-github-app-token@v1
|
||||
id: app-token
|
||||
with:
|
||||
app-id: ${{ vars.CI_APP_ID }}
|
||||
private-key: ${{ secrets.CI_PRIVATE_KEY }}
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
token: ${{ steps.app-token.outputs.token }}
|
||||
fetch-depth: 0 # for fetching tags, required for semantic-release
|
||||
- name: Install uv and set the python version
|
||||
uses: astral-sh/setup-uv@v5
|
||||
with:
|
||||
enable-cache: true
|
||||
- name: Install dependencies
|
||||
run: uv sync --only-dev
|
||||
- name: Run release script
|
||||
env:
|
||||
GH_TOKEN: ${{ steps.app-token.outputs.token }}
|
||||
TARGET_VERSION: ${{ needs.pre-release-check.outputs.TARGET_TAG_V }}
|
||||
CHGLOG_FILE: CHANGELOG.md
|
||||
run: ./.github/scripts/release.sh
|
||||
shell: bash
|
||||
34
.github/workflows/checks.yml
vendored
34
.github/workflows/checks.yml
vendored
@@ -1,34 +0,0 @@
|
||||
name: Run linter checks
|
||||
on:
|
||||
push:
|
||||
branches: ["main"]
|
||||
pull_request:
|
||||
branches: ["main"]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
py-lint:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ['3.11']
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: ./.github/actions/setup-poetry
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Run styling check
|
||||
run: poetry run pre-commit run --all-files
|
||||
|
||||
markdown-lint:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: markdownlint-cli2-action
|
||||
uses: DavidAnson/markdownlint-cli2-action@v16
|
||||
with:
|
||||
globs: "**/*.md"
|
||||
|
||||
41
.github/workflows/ci-images-dryrun.yml
vendored
Normal file
41
.github/workflows/ci-images-dryrun.yml
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
name: Dry run docling-serve image building
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build_image:
|
||||
name: Build ${{ matrix.spec.name }} container image
|
||||
strategy:
|
||||
matrix:
|
||||
spec:
|
||||
- name: docling-project/docling-serve
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra cpu
|
||||
platforms: linux/amd64, linux/arm64
|
||||
- name: docling-project/docling-serve-cpu
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124
|
||||
platforms: linux/amd64, linux/arm64
|
||||
- name: docling-project/docling-serve-cu124
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cpu
|
||||
platforms: linux/amd64
|
||||
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
|
||||
uses: ./.github/workflows/job-image.yml
|
||||
with:
|
||||
publish: false
|
||||
build_args: ${{ matrix.spec.build_args }}
|
||||
ghcr_image_name: ${{ matrix.spec.name }}
|
||||
quay_image_name: ""
|
||||
platforms: ${{ matrix.spec.platforms }}
|
||||
25
.github/workflows/ci.yml
vendored
Normal file
25
.github/workflows/ci.yml
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
name: "Run CI"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: ["main"]
|
||||
pull_request:
|
||||
branches: ["main"]
|
||||
|
||||
jobs:
|
||||
code-checks:
|
||||
# if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling-serve' && github.event.pull_request.head.repo.full_name != 'docling-project/docling-serve') }}
|
||||
uses: ./.github/workflows/job-checks.yml
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
|
||||
build-images:
|
||||
uses: ./.github/workflows/ci-images-dryrun.yml
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
105
.github/workflows/images-dryrun.yml
vendored
105
.github/workflows/images-dryrun.yml
vendored
@@ -1,105 +0,0 @@
|
||||
name: Dry run docling-serve image building
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: ["main"]
|
||||
|
||||
env:
|
||||
GHCR_REGISTRY: ghcr.io
|
||||
GHCR_DOCLING_SERVE_CPU_IMAGE_NAME: ds4sd/docling-serve-cpu
|
||||
GHCR_DOCLING_SERVE_GPU_IMAGE_NAME: ds4sd/docling-serve
|
||||
|
||||
jobs:
|
||||
build_cpu_image:
|
||||
name: Build docling-serve "CPU only" container image
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve (CPU only) ghcr image
|
||||
id: ghcr_serve_cpu_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_DOCLING_SERVE_CPU_IMAGE_NAME }}
|
||||
|
||||
- name: Build docling-serve-cpu image
|
||||
id: build-serve-cpu-ghcr
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: false
|
||||
tags: ${{ steps.ghcr_serve_cpu_meta.outputs.tags }}
|
||||
labels: ${{ steps.ghcr_serve_cpu_meta.outputs.labels }}
|
||||
platforms: linux/amd64, linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: |
|
||||
--build-arg CPU_ONLY=true
|
||||
|
||||
- name: Remove Local Docker Images
|
||||
run: |
|
||||
docker image prune -af
|
||||
|
||||
build_gpu_image:
|
||||
name: Build docling-serve (with GPU support) container image
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve (GPU) ghcr image
|
||||
id: ghcr_serve_gpu_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_DOCLING_SERVE_GPU_IMAGE_NAME }}
|
||||
|
||||
- name: Build docling-serve (GPU) image
|
||||
id: build-serve-gpu-ghcr
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: false
|
||||
tags: ${{ steps.ghcr_serve_gpu_meta.outputs.tags }}
|
||||
labels: ${{ steps.ghcr_serve_gpu_meta.outputs.labels }}
|
||||
platforms: linux/amd64,linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: |
|
||||
--build-arg CPU_ONLY=false
|
||||
213
.github/workflows/images.yml
vendored
213
.github/workflows/images.yml
vendored
@@ -4,193 +4,44 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
tags:
|
||||
- 'v*'
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
env:
|
||||
GHCR_REGISTRY: ghcr.io
|
||||
GHCR_DOCLING_SERVE_CPU_IMAGE_NAME: ds4sd/docling-serve-cpu
|
||||
GHCR_DOCLING_SERVE_GPU_IMAGE_NAME: ds4sd/docling-serve
|
||||
QUAY_REGISTRY: quay.io
|
||||
QUAY_DOCLING_SERVE_CPU_IMAGE_NAME: ds4sd/docling-serve-cpu
|
||||
QUAY_DOCLING_SERVE_GPU_IMAGE_NAME: ds4sd/docling-serve
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build_and_publish_cpu_images:
|
||||
name: Push docling-serve "CPU only" container image to GHCR and QUAY
|
||||
runs-on: ubuntu-latest
|
||||
environment: registry-creds
|
||||
build_and_publish_images:
|
||||
name: Build and push ${{ matrix.spec.name }} container image to GHCR and QUAY
|
||||
strategy:
|
||||
matrix:
|
||||
spec:
|
||||
- name: docling-project/docling-serve
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra cpu
|
||||
platforms: linux/amd64, linux/arm64
|
||||
- name: docling-project/docling-serve-cpu
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124
|
||||
platforms: linux/amd64, linux/arm64
|
||||
- name: docling-project/docling-serve-cu124
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cpu
|
||||
platforms: linux/amd64
|
||||
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
secrets: inherit
|
||||
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Log in to the GHCR container image registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.GHCR_REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Log in to the Quay container image registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.QUAY_REGISTRY }}
|
||||
username: ${{ secrets.QUAY_USERNAME }}
|
||||
password: ${{ secrets.QUAY_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve (CPU only) ghcr image
|
||||
id: ghcr_serve_cpu_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_DOCLING_SERVE_CPU_IMAGE_NAME }}
|
||||
|
||||
- name: Build and push docling-serve-cpu image to ghcr.io
|
||||
id: push-serve-cpu-ghcr
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: ${{ steps.ghcr_serve_cpu_meta.outputs.tags }}
|
||||
labels: ${{ steps.ghcr_serve_cpu_meta.outputs.labels }}
|
||||
platforms: linux/amd64, linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: |
|
||||
--build-arg CPU_ONLY=true
|
||||
|
||||
- name: Generate artifact attestation
|
||||
uses: actions/attest-build-provenance@v1
|
||||
with:
|
||||
subject-name: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_DOCLING_SERVE_CPU_IMAGE_NAME}}
|
||||
subject-digest: ${{ steps.push-serve-cpu-ghcr.outputs.digest }}
|
||||
push-to-registry: true
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve (CPU only) quay image
|
||||
id: quay_serve_cpu_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.QUAY_REGISTRY }}/${{ env.QUAY_DOCLING_SERVE_CPU_IMAGE_NAME }}
|
||||
|
||||
- name: Build and push docling-serve-cpu image to quay.io
|
||||
id: push-serve-cpu-quay
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: ${{ steps.quay_serve_cpu_meta.outputs.tags }}
|
||||
labels: ${{ steps.quay_serve_cpu_meta.outputs.labels }}
|
||||
platforms: linux/amd64, linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: |
|
||||
--build-arg CPU_ONLY=true
|
||||
- name: Remove Local Docker Images
|
||||
run: |
|
||||
docker image prune -af
|
||||
|
||||
build_and_publish_gpu_images:
|
||||
name: Push docling-serve (with GPU support) container image to GHCR and QUAY
|
||||
runs-on: ubuntu-latest
|
||||
environment: registry-creds
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Log in to the GHCR container image registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.GHCR_REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Log in to the Quay container image registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.QUAY_REGISTRY }}
|
||||
username: ${{ secrets.QUAY_USERNAME }}
|
||||
password: ${{ secrets.QUAY_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve (GPU) ghcr image
|
||||
id: ghcr_serve_gpu_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_DOCLING_SERVE_GPU_IMAGE_NAME }}
|
||||
|
||||
- name: Build and push docling-serve (GPU) image to ghcr.io
|
||||
id: push-serve-gpu-ghcr
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: ${{ steps.ghcr_serve_gpu_meta.outputs.tags }}
|
||||
labels: ${{ steps.ghcr_serve_gpu_meta.outputs.labels }}
|
||||
platforms: linux/amd64,linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: |
|
||||
--build-arg CPU_ONLY=false
|
||||
|
||||
- name: Generate artifact attestation
|
||||
uses: actions/attest-build-provenance@v1
|
||||
with:
|
||||
subject-name: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_DOCLING_SERVE_GPU_IMAGE_NAME}}
|
||||
subject-digest: ${{ steps.push-serve-gpu-ghcr.outputs.digest }}
|
||||
push-to-registry: true
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve (GPU) quay image
|
||||
id: quay_serve_gpu_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.QUAY_REGISTRY }}/${{ env.QUAY_DOCLING_SERVE_GPU_IMAGE_NAME }}
|
||||
|
||||
- name: Build and push docling-serve (GPU) image to quay.io
|
||||
id: push-serve-gpu-quay
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: ${{ steps.quay_serve_gpu_meta.outputs.tags }}
|
||||
labels: ${{ steps.quay_serve_gpu_meta.outputs.labels }}
|
||||
platforms: linux/amd64,linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: |
|
||||
--build-arg CPU_ONLY=false
|
||||
uses: ./.github/workflows/job-image.yml
|
||||
with:
|
||||
publish: true
|
||||
environment: registry-creds
|
||||
build_args: ${{ matrix.spec.build_args }}
|
||||
ghcr_image_name: ${{ matrix.spec.name }}
|
||||
quay_image_name: ${{ matrix.spec.name }}
|
||||
platforms: ${{ matrix.spec.platforms }}
|
||||
|
||||
29
.github/workflows/job-build.yml
vendored
Normal file
29
.github/workflows/job-build.yml
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
name: Run checks
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
|
||||
jobs:
|
||||
build-package:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ['3.12']
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install uv and set the python version
|
||||
uses: astral-sh/setup-uv@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
enable-cache: true
|
||||
- name: Install dependencies
|
||||
run: uv sync --all-extras --no-extra cu124
|
||||
- name: Build package
|
||||
run: uv build
|
||||
- name: Check content of wheel
|
||||
run: unzip -l dist/*.whl
|
||||
- name: Store the distribution packages
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: python-package-distributions
|
||||
path: dist/
|
||||
67
.github/workflows/job-checks.yml
vendored
Normal file
67
.github/workflows/job-checks.yml
vendored
Normal file
@@ -0,0 +1,67 @@
|
||||
name: Run checks
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
|
||||
jobs:
|
||||
py-lint:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ['3.12']
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install uv and set the python version
|
||||
uses: astral-sh/setup-uv@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
enable-cache: true
|
||||
|
||||
- name: pre-commit cache key
|
||||
run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> "$GITHUB_ENV"
|
||||
- uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pre-commit
|
||||
key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: uv sync --frozen --all-extras --no-extra cu124
|
||||
|
||||
- name: Run styling check
|
||||
run: pre-commit run --all-files
|
||||
|
||||
build-package:
|
||||
uses: ./.github/workflows/job-build.yml
|
||||
|
||||
test-package:
|
||||
needs:
|
||||
- build-package
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ['3.12']
|
||||
steps:
|
||||
- name: Download all the dists
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: python-package-distributions
|
||||
path: dist/
|
||||
- name: Install uv and set the python version
|
||||
uses: astral-sh/setup-uv@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
enable-cache: true
|
||||
- name: Install package
|
||||
run: uv pip install dist/*.whl
|
||||
- name: Create the server
|
||||
run: python -c 'from docling_serve.app import create_app; create_app()'
|
||||
|
||||
markdown-lint:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: markdownlint-cli2-action
|
||||
uses: DavidAnson/markdownlint-cli2-action@v16
|
||||
with:
|
||||
globs: "**/*.md"
|
||||
|
||||
141
.github/workflows/job-image.yml
vendored
Normal file
141
.github/workflows/job-image.yml
vendored
Normal file
@@ -0,0 +1,141 @@
|
||||
name: Build docling-serve container image
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
build_args:
|
||||
type: string
|
||||
description: "Extra build arguments for the build."
|
||||
default: ""
|
||||
ghcr_image_name:
|
||||
type: string
|
||||
description: "Name of the image for GHCR."
|
||||
quay_image_name:
|
||||
type: string
|
||||
description: "Name of the image Quay."
|
||||
platforms:
|
||||
type: string
|
||||
description: "Platform argument for building images."
|
||||
default: linux/amd64, linux/arm64
|
||||
publish:
|
||||
type: boolean
|
||||
description: "If true, the images will be published."
|
||||
default: false
|
||||
environment:
|
||||
type: string
|
||||
description: "GH Action environment"
|
||||
default: ""
|
||||
|
||||
env:
|
||||
GHCR_REGISTRY: ghcr.io
|
||||
QUAY_REGISTRY: quay.io
|
||||
|
||||
jobs:
|
||||
image:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
environment: ${{ inputs.environment }}
|
||||
|
||||
steps:
|
||||
- name: Free up space in github runner
|
||||
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
|
||||
run: |
|
||||
df -h
|
||||
sudo rm -rf "/usr/local/share/boost"
|
||||
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
||||
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup
|
||||
# shellcheck disable=SC2046
|
||||
sudo docker rmi "$(docker image ls -aq)" >/dev/null 2>&1 || true
|
||||
df -h
|
||||
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Log in to the GHCR container image registry
|
||||
if: ${{ inputs.publish }}
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.GHCR_REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Log in to the Quay container image registry
|
||||
if: ${{ inputs.publish }}
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.QUAY_REGISTRY }}
|
||||
username: ${{ secrets.QUAY_USERNAME }}
|
||||
password: ${{ secrets.QUAY_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve ghcr image
|
||||
id: ghcr_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.GHCR_REGISTRY }}/${{ inputs.ghcr_image_name }}
|
||||
|
||||
- name: Build and push image to ghcr.io
|
||||
id: ghcr_push
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: ${{ inputs.publish }}
|
||||
tags: ${{ steps.ghcr_meta.outputs.tags }}
|
||||
labels: ${{ steps.ghcr_meta.outputs.labels }}
|
||||
platforms: ${{ inputs.platforms}}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: ${{ inputs.build_args }}
|
||||
|
||||
- name: Generate artifact attestation
|
||||
if: ${{ inputs.publish }}
|
||||
uses: actions/attest-build-provenance@v1
|
||||
with:
|
||||
subject-name: ${{ env.GHCR_REGISTRY }}/${{ inputs.ghcr_image_name }}
|
||||
subject-digest: ${{ steps.ghcr_push.outputs.digest }}
|
||||
push-to-registry: true
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve quay image
|
||||
if: ${{ inputs.publish }}
|
||||
id: quay_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.QUAY_REGISTRY }}/${{ inputs.quay_image_name }}
|
||||
|
||||
- name: Build and push image to quay.io
|
||||
if: ${{ inputs.publish }}
|
||||
# id: push-serve-cpu-quay
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: ${{ inputs.publish }}
|
||||
tags: ${{ steps.quay_meta.outputs.tags }}
|
||||
labels: ${{ steps.quay_meta.outputs.labels }}
|
||||
platforms: ${{ inputs.platforms}}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: ${{ inputs.build_args }}
|
||||
|
||||
# - name: Inspect the image details
|
||||
# run: |
|
||||
# echo "${{ steps.ghcr_push.outputs.metadata }}"
|
||||
|
||||
- name: Remove Local Docker Images
|
||||
run: |
|
||||
docker image prune -af
|
||||
34
.github/workflows/pypi.yml
vendored
Normal file
34
.github/workflows/pypi.yml
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
name: "Build and publish package"
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
|
||||
build-package:
|
||||
uses: ./.github/workflows/job-build.yml
|
||||
|
||||
build-and-publish:
|
||||
needs:
|
||||
- build-package
|
||||
runs-on: ubuntu-latest
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/docling-serve # Replace <package-name> with your PyPI project name
|
||||
permissions:
|
||||
id-token: write # IMPORTANT: mandatory for trusted publishing
|
||||
steps:
|
||||
- name: Download all the dists
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: python-package-distributions
|
||||
path: dist/
|
||||
- name: Publish distribution 📦 to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
# currently not working with reusable workflows
|
||||
attestations: false
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,5 +1,7 @@
|
||||
model_artifacts/
|
||||
scratch/
|
||||
.md-lint
|
||||
actionlint
|
||||
|
||||
# Created by https://www.toptal.com/developers/gitignore/api/python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
|
||||
# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
|
||||
|
||||
@@ -2,5 +2,9 @@ config:
|
||||
line-length: false
|
||||
no-emphasis-as-header: false
|
||||
first-line-heading: false
|
||||
MD033:
|
||||
allowed_elements: ["details", "summary", "br", "a", "p", "img"]
|
||||
MD024:
|
||||
siblings_only: true
|
||||
globs:
|
||||
- "**/*.md"
|
||||
|
||||
@@ -1,41 +1,28 @@
|
||||
fail_fast: true
|
||||
repos:
|
||||
- repo: local
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.9.6
|
||||
hooks:
|
||||
- id: system
|
||||
name: Black
|
||||
entry: poetry run black docling_serve tests
|
||||
pass_filenames: false
|
||||
language: system
|
||||
files: '\.py$'
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: system
|
||||
name: isort
|
||||
entry: poetry run isort docling_serve tests
|
||||
pass_filenames: false
|
||||
language: system
|
||||
files: '\.py$'
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: system
|
||||
name: flake8
|
||||
entry: poetry run flake8 docling_serve
|
||||
pass_filenames: false
|
||||
language: system
|
||||
files: '\.py$'
|
||||
# Run the Ruff formatter.
|
||||
- id: ruff-format
|
||||
name: "Ruff formatter"
|
||||
args: [--config=pyproject.toml]
|
||||
files: '^(docling_serve|tests).*\.(py|ipynb)$'
|
||||
# Run the Ruff linter.
|
||||
- id: ruff
|
||||
name: "Ruff linter"
|
||||
args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
|
||||
files: '^(docling_serve|tests).*\.(py|ipynb)$'
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: system
|
||||
name: MyPy
|
||||
entry: poetry run mypy docling_serve
|
||||
entry: uv run --no-sync mypy docling_serve
|
||||
pass_filenames: false
|
||||
language: system
|
||||
files: '\.py$'
|
||||
- repo: local
|
||||
- repo: https://github.com/astral-sh/uv-pre-commit
|
||||
# uv version.
|
||||
rev: 0.6.1
|
||||
hooks:
|
||||
- id: system
|
||||
name: Poetry check
|
||||
entry: poetry check --lock
|
||||
pass_filenames: false
|
||||
language: system
|
||||
- id: uv-lock
|
||||
|
||||
1
.python-version
Normal file
1
.python-version
Normal file
@@ -0,0 +1 @@
|
||||
3.12
|
||||
66
CHANGELOG.md
Normal file
66
CHANGELOG.md
Normal file
@@ -0,0 +1,66 @@
|
||||
## [v0.7.0](https://github.com/docling-project/docling-serve/releases/tag/v0.7.0) - 2025-03-31
|
||||
|
||||
### Feature
|
||||
|
||||
* Expose TLS settings and example deploy with oauth-proxy ([#112](https://github.com/docling-project/docling-serve/issues/112)) ([`7a0faba`](https://github.com/docling-project/docling-serve/commit/7a0fabae07020c2659dbb22c3b0359909051a74c))
|
||||
* Offline static files ([#109](https://github.com/docling-project/docling-serve/issues/109)) ([`68772bb`](https://github.com/docling-project/docling-serve/commit/68772bb6f0a87b71094a08ff851f5754c6ca6163))
|
||||
* Update to Docling 2.28 ([#106](https://github.com/docling-project/docling-serve/issues/106)) ([`20ec87a`](https://github.com/docling-project/docling-serve/commit/20ec87a63a99145bc0ad7931549af8a0c30db641))
|
||||
|
||||
### Fix
|
||||
|
||||
* Move ARGs to prevent cache invalidation ([#104](https://github.com/docling-project/docling-serve/issues/104)) ([`e30f458`](https://github.com/docling-project/docling-serve/commit/e30f458923d34c169db7d5a5c296848716e8cac4))
|
||||
|
||||
## [v0.6.0](https://github.com/docling-project/docling-serve/releases/tag/v0.6.0) - 2025-03-17
|
||||
|
||||
### Feature
|
||||
|
||||
* Expose options for new features ([#92](https://github.com/docling-project/docling-serve/issues/92)) ([`ec57b52`](https://github.com/docling-project/docling-serve/commit/ec57b528ed3f8e7b9604ff4cdf06da3d52c714dd))
|
||||
|
||||
### Fix
|
||||
|
||||
* Allow changes in CORS settings ([#100](https://github.com/docling-project/docling-serve/issues/100)) ([`422c402`](https://github.com/docling-project/docling-serve/commit/422c402bab7f05e46274ede11f234a19a62e093e))
|
||||
* Avoid exploding options cache using lru and expose size parameter ([#101](https://github.com/docling-project/docling-serve/issues/101)) ([`ea09028`](https://github.com/docling-project/docling-serve/commit/ea090288d3eec4ea8fbdcd32a6a497a99c89189d))
|
||||
* Increase timeout_keep_alive and allow parameter changes ([#98](https://github.com/docling-project/docling-serve/issues/98)) ([`07c48ed`](https://github.com/docling-project/docling-serve/commit/07c48edd5d9437219d9623e3d05bc5166c5bb85a))
|
||||
* Add warning when using incompatible parameters ([#99](https://github.com/docling-project/docling-serve/issues/99)) ([`a212547`](https://github.com/docling-project/docling-serve/commit/a212547d28d6588c65e52000dc7bc04f3f77e69e))
|
||||
* **ui:** Use --port parameter and avoid failing when image is not found ([#97](https://github.com/docling-project/docling-serve/issues/97)) ([`c76daac`](https://github.com/docling-project/docling-serve/commit/c76daac70c87da412f791666881e48b74688b060))
|
||||
|
||||
### Documentation
|
||||
|
||||
* Simplify README and move details to docs ([#102](https://github.com/docling-project/docling-serve/issues/102)) ([`fd8e40a`](https://github.com/docling-project/docling-serve/commit/fd8e40a00849771263d9b75b9a56f6caeccb8517))
|
||||
|
||||
## [v0.5.1](https://github.com/docling-project/docling-serve/releases/tag/v0.5.1) - 2025-03-10
|
||||
|
||||
### Fix
|
||||
|
||||
* Submodules in wheels ([#85](https://github.com/docling-project/docling-serve/issues/85)) ([`a92ad48`](https://github.com/docling-project/docling-serve/commit/a92ad48b287bfcb134011dc0fc3f91ee04e067ee))
|
||||
|
||||
## [v0.5.0](https://github.com/docling-project/docling-serve/releases/tag/v0.5.0) - 2025-03-07
|
||||
|
||||
### Feature
|
||||
|
||||
* Async api ([#60](https://github.com/docling-project/docling-serve/issues/60)) ([`82f8900`](https://github.com/docling-project/docling-serve/commit/82f890019745859699c1b01f9ccfb64cb7e37906))
|
||||
* Display version in fastapi docs ([#78](https://github.com/docling-project/docling-serve/issues/78)) ([`ed851c9`](https://github.com/docling-project/docling-serve/commit/ed851c95fee5f59305ddc3dcd5c09efce618470b))
|
||||
|
||||
### Fix
|
||||
|
||||
* Remove uv from image, merge ARG and ENV declarations ([#57](https://github.com/docling-project/docling-serve/issues/57)) ([`c95db36`](https://github.com/docling-project/docling-serve/commit/c95db3643807a4dfb96d93c8e10d6eb486c49a30))
|
||||
* **docs:** Remove comma in convert/source curl example ([#73](https://github.com/docling-project/docling-serve/issues/73)) ([`05df073`](https://github.com/docling-project/docling-serve/commit/05df0735d35a589bdc2a11fcdd764a10f700cb6f))
|
||||
|
||||
## [v0.4.0](https://github.com/docling-project/docling-serve/releases/tag/v0.4.0) - 2025-02-26
|
||||
|
||||
### Feature
|
||||
|
||||
* New container images ([#68](https://github.com/docling-project/docling-serve/issues/68)) ([`7e6d9cd`](https://github.com/docling-project/docling-serve/commit/7e6d9cdef398df70a5b4d626aeb523c428c10d56))
|
||||
* Render DoclingDocument with npm docling-components in the example UI ([#65](https://github.com/docling-project/docling-serve/issues/65)) ([`c430d9b`](https://github.com/docling-project/docling-serve/commit/c430d9b1a162ab29104d86ebaa1ac5a5488b1f09))
|
||||
|
||||
## [v0.3.0](https://github.com/docling-project/docling-serve/releases/tag/v0.3.0) - 2025-02-19
|
||||
|
||||
### Feature
|
||||
|
||||
* Add new docling-serve cli ([#50](https://github.com/docling-project/docling-serve/issues/50)) ([`ec33a61`](https://github.com/docling-project/docling-serve/commit/ec33a61faa7846b9b7998fbf557ebe39a3b800f6))
|
||||
|
||||
### Fix
|
||||
|
||||
* Set DOCLING_SERVE_ARTIFACTS_PATH in images ([#53](https://github.com/docling-project/docling-serve/issues/53)) ([`4877248`](https://github.com/docling-project/docling-serve/commit/487724836896576ca4f98e84abf15fd1c383bec8))
|
||||
* Set root UI path when behind proxy ([#38](https://github.com/docling-project/docling-serve/issues/38)) ([`c64a450`](https://github.com/docling-project/docling-serve/commit/c64a450bf9ba9947ab180e92bef2763ff710b210))
|
||||
* Support python 3.13 and docling updates and switch to uv ([#48](https://github.com/docling-project/docling-serve/issues/48)) ([`ae3b490`](https://github.com/docling-project/docling-serve/commit/ae3b4906f1c0829b1331ea491f3518741cabff71))
|
||||
@@ -3,13 +3,13 @@
|
||||
Our project welcomes external contributions. If you have an itch, please feel
|
||||
free to scratch it.
|
||||
|
||||
To contribute code or documentation, please submit a [pull request](https://github.com/DS4SD/docling-serve/pulls).
|
||||
To contribute code or documentation, please submit a [pull request](https://github.com/docling-project/docling-serve/pulls).
|
||||
|
||||
A good way to familiarize yourself with the codebase and contribution process is
|
||||
to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/DS4SD/docling-serve/issues).
|
||||
to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/docling-project/docling-serve/issues).
|
||||
Before embarking on a more ambitious contribution, please quickly [get in touch](#communication) with us.
|
||||
|
||||
For general questions or support requests, please refer to the [discussion section](https://github.com/DS4SD/docling-serve/discussions).
|
||||
For general questions or support requests, please refer to the [discussion section](https://github.com/docling-project/docling-serve/discussions).
|
||||
|
||||
**Note: We appreciate your effort, and want to avoid a situation where a contribution
|
||||
requires extensive rework (by you or by us), sits in backlog for a long time, or
|
||||
@@ -17,14 +17,14 @@ cannot be accepted at all!**
|
||||
|
||||
### Proposing new features
|
||||
|
||||
If you would like to implement a new feature, please [raise an issue](https://github.com/DS4SD/docling-serve/issues)
|
||||
If you would like to implement a new feature, please [raise an issue](https://github.com/docling-project/docling-serve/issues)
|
||||
before sending a pull request so the feature can be discussed. This is to avoid
|
||||
you wasting your valuable time working on a feature that the project developers
|
||||
are not interested in accepting into the code base.
|
||||
|
||||
### Fixing bugs
|
||||
|
||||
If you would like to fix a bug, please [raise an issue](https://github.com/DS4SD/docling-serve/issues) before sending a
|
||||
If you would like to fix a bug, please [raise an issue](https://github.com/docling-project/docling-serve/issues) before sending a
|
||||
pull request so it can be tracked.
|
||||
|
||||
### Merge approval
|
||||
@@ -73,7 +73,7 @@ git commit -s
|
||||
|
||||
## Communication
|
||||
|
||||
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling-serve/discussions).
|
||||
Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling-serve/discussions).
|
||||
|
||||
## Developing
|
||||
|
||||
@@ -142,8 +142,7 @@ poetry add NAME
|
||||
|
||||
We use the following tools to enforce code style:
|
||||
|
||||
- iSort, to sort imports
|
||||
- Black, to format code
|
||||
- ruff, to sort imports and format code
|
||||
|
||||
We run a series of checks on the code base on every commit, using `pre-commit`. To install the hooks, run:
|
||||
|
||||
@@ -157,4 +156,4 @@ To run the checks on-demand, run:
|
||||
pre-commit run --all-files
|
||||
```
|
||||
|
||||
Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by their Hooks. In these cases, `git add` the modified files and `git commit` again.
|
||||
Note: Formatting checks like `ruff` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by their Hooks. In these cases, `git add` the modified files and `git commit` again.
|
||||
|
||||
@@ -1,32 +1,67 @@
|
||||
FROM python:3.11-slim-bookworm
|
||||
ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s
|
||||
|
||||
ARG CPU_ONLY=false
|
||||
WORKDIR /docling-serve
|
||||
FROM ${BASE_IMAGE}
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
|
||||
&& apt-get clean
|
||||
USER 0
|
||||
|
||||
RUN pip install --no-cache-dir poetry
|
||||
###################################################################################################
|
||||
# OS Layer #
|
||||
###################################################################################################
|
||||
|
||||
COPY pyproject.toml poetry.lock README.md /docling-serve/
|
||||
RUN --mount=type=bind,source=os-packages.txt,target=/tmp/os-packages.txt \
|
||||
dnf -y install --best --nodocs --setopt=install_weak_deps=False dnf-plugins-core && \
|
||||
dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save && \
|
||||
dnf config-manager --enable crb && \
|
||||
dnf -y update && \
|
||||
dnf install -y $(cat /tmp/os-packages.txt) && \
|
||||
dnf -y clean all && \
|
||||
rm -rf /var/cache/dnf
|
||||
|
||||
RUN if [ "$CPU_ONLY" = "true" ]; then \
|
||||
poetry install --no-root --with cpu; \
|
||||
else \
|
||||
poetry install --no-root; \
|
||||
fi
|
||||
ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
||||
|
||||
ENV HF_HOME=/tmp/
|
||||
ENV TORCH_HOME=/tmp/
|
||||
###################################################################################################
|
||||
# Docling layer #
|
||||
###################################################################################################
|
||||
|
||||
RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
|
||||
USER 1001
|
||||
|
||||
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
||||
ENV OMP_NUM_THREADS=4
|
||||
WORKDIR /opt/app-root/src
|
||||
|
||||
COPY ./docling_serve /docling-serve/docling_serve
|
||||
ENV \
|
||||
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
||||
OMP_NUM_THREADS=4 \
|
||||
LANG=en_US.UTF-8 \
|
||||
LC_ALL=en_US.UTF-8 \
|
||||
PYTHONIOENCODING=utf-8 \
|
||||
UV_COMPILE_BYTECODE=1 \
|
||||
UV_LINK_MODE=copy \
|
||||
UV_PROJECT_ENVIRONMENT=/opt/app-root \
|
||||
DOCLING_SERVE_ARTIFACTS_PATH=/opt/app-root/src/.cache/docling/models
|
||||
|
||||
ARG UV_SYNC_EXTRA_ARGS=""
|
||||
|
||||
RUN --mount=from=ghcr.io/astral-sh/uv:0.6.1,source=/uv,target=/bin/uv \
|
||||
--mount=type=cache,target=/opt/app-root/src/.cache/uv,uid=1001 \
|
||||
--mount=type=bind,source=uv.lock,target=uv.lock \
|
||||
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
|
||||
uv sync --frozen --no-install-project --no-dev --all-extras ${UV_SYNC_EXTRA_ARGS}
|
||||
|
||||
ARG MODELS_LIST="layout tableformer picture_classifier easyocr"
|
||||
|
||||
RUN echo "Downloading models..." && \
|
||||
HF_HUB_DOWNLOAD_TIMEOUT="90" \
|
||||
HF_HUB_ETAG_TIMEOUT="90" \
|
||||
docling-tools models download -o "${DOCLING_SERVE_ARTIFACTS_PATH}" ${MODELS_LIST} && \
|
||||
chown -R 1001:0 /opt/app-root/src/.cache && \
|
||||
chmod -R g=u /opt/app-root/src/.cache
|
||||
|
||||
COPY --chown=1001:0 ./docling_serve ./docling_serve
|
||||
RUN --mount=from=ghcr.io/astral-sh/uv:0.6.1,source=/uv,target=/bin/uv \
|
||||
--mount=type=cache,target=/opt/app-root/src/.cache/uv,uid=1001 \
|
||||
--mount=type=bind,source=uv.lock,target=uv.lock \
|
||||
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
|
||||
uv sync --frozen --no-dev --all-extras ${UV_SYNC_EXTRA_ARGS}
|
||||
|
||||
EXPOSE 5001
|
||||
|
||||
CMD ["poetry", "run", "uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
|
||||
CMD ["docling-serve", "run"]
|
||||
|
||||
57
Makefile
57
Makefile
@@ -24,19 +24,26 @@ action-lint-file:
|
||||
md-lint-file:
|
||||
$(CMD_PREFIX) touch .markdown-lint
|
||||
|
||||
.PHONY: docling-serve-cpu-image
|
||||
docling-serve-cpu-image: Containerfile ## Build docling-serve "cpu only" continaer image
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve CPU ONLY]"
|
||||
$(CMD_PREFIX) docker build --build-arg CPU_ONLY=true -f Containerfile --platform linux/amd64 -t ghcr.io/ds4sd/docling-serve-cpu:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cpu:$(TAG) ghcr.io/ds4sd/docling-serve-cpu:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cpu:$(TAG) quay.io/ds4sd/docling-serve-cpu:main
|
||||
.PHONY: docling-serve-image
|
||||
docling-serve-image: Containerfile
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve]"
|
||||
$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra cpu" -f Containerfile -t ghcr.io/docling-project/docling-serve:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve:$(TAG) ghcr.io/docling-project/docling-serve:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve:$(TAG) quay.io/docling-project/docling-serve:main
|
||||
|
||||
.PHONY: docling-serve-gpu-image
|
||||
docling-serve-gpu-image: Containerfile ## Build docling-serve continaer image with GPU support
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve with GPU]"
|
||||
$(CMD_PREFIX) docker build --build-arg CPU_ONLY=false -f Containerfile --platform linux/amd64 -t ghcr.io/ds4sd/docling-serve:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve:$(TAG) ghcr.io/ds4sd/docling-serve:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve:$(TAG) quay.io/ds4sd/docling-serve:main
|
||||
.PHONY: docling-serve-cpu-image
|
||||
docling-serve-cpu-image: Containerfile ## Build docling-serve "cpu only" container image
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve CPU]"
|
||||
$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cu124" -f Containerfile -t ghcr.io/docling-project/docling-serve-cpu:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cpu:$(TAG) ghcr.io/docling-project/docling-serve-cpu:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cpu:$(TAG) quay.io/docling-project/docling-serve-cpu:main
|
||||
|
||||
.PHONY: docling-serve-cu124-image
|
||||
docling-serve-cu124-image: Containerfile ## Build docling-serve container image with GPU support
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve with Cuda 12.4]"
|
||||
$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cpu" -f Containerfile --platform linux/amd64 -t ghcr.io/docling-project/docling-serve-cu124:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cu124:$(TAG) ghcr.io/docling-project/docling-serve-cu124:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cu124:$(TAG) quay.io/docling-project/docling-serve-cu124:main
|
||||
|
||||
.PHONY: action-lint
|
||||
action-lint: .action-lint ## Lint GitHub Action workflows
|
||||
@@ -59,17 +66,29 @@ action-lint: .action-lint ## Lint GitHub Action workflows
|
||||
md-lint: .md-lint ## Lint markdown files
|
||||
.md-lint: $(wildcard */**/*.md) | md-lint-file
|
||||
$(ECHO_PREFIX) printf " %-12s ./...\n" "[MD LINT]"
|
||||
$(CMD_PREFIX) docker run --rm -v $$(pwd):/workdir davidanson/markdownlint-cli2:v0.14.0 "**/*.md"
|
||||
$(CMD_PREFIX) docker run --rm -v $$(pwd):/workdir davidanson/markdownlint-cli2:v0.16.0 "**/*.md" "#.venv"
|
||||
$(CMD_PREFIX) touch $@
|
||||
|
||||
|
||||
.PHONY: py-Lint
|
||||
py-lint: ## Lint Python files
|
||||
$(ECHO_PREFIX) printf " %-12s ./...\n" "[PY LINT]"
|
||||
$(CMD_PREFIX) if ! which poetry $(PIPE_DEV_NULL) ; then \
|
||||
echo "Please install poetry." ; \
|
||||
echo "pip install poetry" ; \
|
||||
$(CMD_PREFIX) if ! which uv $(PIPE_DEV_NULL) ; then \
|
||||
echo "Please install uv." ; \
|
||||
exit 1 ; \
|
||||
fi
|
||||
$(CMD_PREFIX) poetry install --all-extras
|
||||
$(CMD_PREFIX) poetry run pre-commit run --all-files
|
||||
$(CMD_PREFIX) uv sync --extra ui
|
||||
$(CMD_PREFIX) uv run pre-commit run --all-files
|
||||
|
||||
.PHONY: run-docling-cpu
|
||||
run-docling-cpu: ## Run the docling-serve container with CPU support and assign a container name
|
||||
$(ECHO_PREFIX) printf " %-12s Removing existing container if it exists...\n" "[CLEANUP]"
|
||||
$(CMD_PREFIX) docker rm -f docling-serve-cpu 2>/dev/null || true
|
||||
$(ECHO_PREFIX) printf " %-12s Running docling-serve container with CPU support on port 5001...\n" "[RUN CPU]"
|
||||
$(CMD_PREFIX) docker run -it --name docling-serve-cpu -p 5001:5001 ghcr.io/docling-project/docling-serve-cpu:main
|
||||
|
||||
.PHONY: run-docling-gpu
|
||||
run-docling-gpu: ## Run the docling-serve container with GPU support and assign a container name
|
||||
$(ECHO_PREFIX) printf " %-12s Removing existing container if it exists...\n" "[CLEANUP]"
|
||||
$(CMD_PREFIX) docker rm -f docling-serve-gpu 2>/dev/null || true
|
||||
$(ECHO_PREFIX) printf " %-12s Running docling-serve container with GPU support on port 5001...\n" "[RUN GPU]"
|
||||
$(CMD_PREFIX) docker run -it --name docling-serve-gpu -p 5001:5001 ghcr.io/docling-project/docling-serve:main
|
||||
|
||||
118
README.md
118
README.md
@@ -1,56 +1,106 @@
|
||||
<p align="center">
|
||||
<a href="https://github.com/docling-project/docling-serve">
|
||||
<img loading="lazy" alt="Docling" src="https://github.com/docling-project/docling-serve/raw/main/docs/assets/docling-serve-pic.png" width="30%"/>
|
||||
</a>
|
||||
</p>
|
||||
|
||||
# Docling Serve
|
||||
|
||||
Running [Docling](https://github.com/DS4SD/docling) as an API service.
|
||||
Running [Docling](https://github.com/docling-project/docling) as an API service.
|
||||
|
||||
> [!NOTE]
|
||||
> This is an unstable draft implementation which will quickly evolve.
|
||||
## Getting started
|
||||
|
||||
## Development
|
||||
Install the `docling-serve` package and run the server.
|
||||
|
||||
Install the dependencies
|
||||
```bash
|
||||
# Using the python package
|
||||
pip install "docling-serve"
|
||||
docling-serve run
|
||||
|
||||
```sh
|
||||
# Install poetry if not already available
|
||||
curl -sSL https://install.python-poetry.org | python3 -
|
||||
|
||||
# Install dependencies
|
||||
poetry install
|
||||
|
||||
# Run the server
|
||||
poetry run uvicorn docling_serve.app:app --reload
|
||||
# Using container images, e.g. with Podman
|
||||
podman run -p 5001:5001 quay.io/docling-project/docling-serve
|
||||
```
|
||||
|
||||
Example payload (http source):
|
||||
The server is available at
|
||||
|
||||
```sh
|
||||
- API <http://127.0.0.1:5001>
|
||||
- API documentation <http://127.0.0.1:5001/docs>
|
||||

|
||||
|
||||
Try it out with a simple conversion:
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/convert' \
|
||||
'http://localhost:5001/v1alpha/convert/source' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"http_source": {
|
||||
"url": "https://arxiv.org/pdf/2206.01062"
|
||||
}
|
||||
}'
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}]
|
||||
}'
|
||||
```
|
||||
|
||||
### Cuda GPU Support
|
||||
### Container images
|
||||
|
||||
For GPU support try the following:
|
||||
Available container images:
|
||||
|
||||
```sh
|
||||
# Create a virtual env
|
||||
python3 -m venv venv
|
||||
| Name | Description | Arch | Size |
|
||||
| -----|-------------|------|------|
|
||||
| [`ghcr.io/docling-project/docling-serve`](https://github.com/docling-project/docling-serve/pkgs/container/docling-serve) <br /> [`quay.io/docling-project/docling-serve`](https://quay.io/repository/docling-project/docling-serve) | Simple image for Docling Serve, installing all packages from the official pypi.org index. | `linux/amd64`, `linux/arm64` | 3.6 GB |
|
||||
| [`ghcr.io/docling-project/docling-serve-cpu`](https://github.com/docling-project/docling-serve/pkgs/container/docling-serve-cpu) <br /> [`quay.io/docling-project/docling-serve-cpu`](https://quay.io/repository/docling-project/docling-serve-cpu) | Cpu-only image which installs `torch` from the pytorch cpu index. | `linux/amd64`, `linux/arm64` | 3.6 GB |
|
||||
| [`ghcr.io/docling-project/docling-serve-cu124`](https://github.com/docling-project/docling-serve/pkgs/container/docling-serve-cu124) <br /> [`quay.io/docling-project/docling-serve-cu124`](https://quay.io/repository/docling-project/docling-serve-cu124) | Cuda 12.4 image which installs `torch` from the pytorch cu124 index. | `linux/amd64` | 8.7 GB |
|
||||
|
||||
# Activate the venv
|
||||
source venv/bin/active
|
||||
Coming soon: `docling-serve-slim` images will reduce the size by skipping the model weights download.
|
||||
|
||||
# Install torch with the special index
|
||||
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
|
||||
### Demonstration UI
|
||||
|
||||
# Install the package
|
||||
pip install -e .
|
||||
```bash
|
||||
# Install the Python package with the extra dependencies
|
||||
pip install "docling-serve[ui]"
|
||||
docling-serve run --enable-ui
|
||||
|
||||
# Run the server
|
||||
poetry run uvicorn docling_serve.app:app --reload
|
||||
# Run the container image with the extra env parameters
|
||||
podman run -p 5001:5001 -e DOCLING_SERVE_ENABLE_UI=true quay.io/docling-project/docling-serve
|
||||
```
|
||||
|
||||
An easy to use UI is available at the `/ui` endpoint.
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
## Documentation and advance usages
|
||||
|
||||
Visit the [Docling Serve documentation](./docs/README.md) for learning how to [configure the webserver](./docs/configuration.md), use all the [runtime options](./docs/usage.md) of the API and [deployment examples](./docs/deployment.md).
|
||||
|
||||
## Get help and support
|
||||
|
||||
Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
|
||||
|
||||
## Contributing
|
||||
|
||||
Please read [Contributing to Docling Serve](https://github.com/docling-project/docling-serve/blob/main/CONTRIBUTING.md) for details.
|
||||
|
||||
## References
|
||||
|
||||
If you use Docling in your projects, please consider citing the following:
|
||||
|
||||
```bib
|
||||
@techreport{Docling,
|
||||
author = {Docling Contributors},
|
||||
month = {1},
|
||||
title = {Docling: An Efficient Open-Source Toolkit for AI-driven Document Conversion},
|
||||
url = {https://arxiv.org/abs/2501.17887},
|
||||
eprint = {2501.17887},
|
||||
doi = {10.48550/arXiv.2501.17887},
|
||||
version = {2.0.0},
|
||||
year = {2025}
|
||||
}
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
The Docling Serve codebase is under MIT license.
|
||||
|
||||
## IBM ❤️ Open Source AI
|
||||
|
||||
Docling has been brought to you by IBM.
|
||||
|
||||
366
docling_serve/__main__.py
Normal file
366
docling_serve/__main__.py
Normal file
@@ -0,0 +1,366 @@
|
||||
import importlib.metadata
|
||||
import logging
|
||||
import platform
|
||||
import sys
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Any, Optional, Union
|
||||
|
||||
import typer
|
||||
import uvicorn
|
||||
from rich.console import Console
|
||||
|
||||
from docling_serve.settings import docling_serve_settings, uvicorn_settings
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||
|
||||
|
||||
err_console = Console(stderr=True)
|
||||
console = Console()
|
||||
|
||||
app = typer.Typer(
|
||||
no_args_is_help=True,
|
||||
rich_markup_mode="rich",
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def version_callback(value: bool) -> None:
|
||||
if value:
|
||||
docling_serve_version = importlib.metadata.version("docling_serve")
|
||||
docling_version = importlib.metadata.version("docling")
|
||||
docling_core_version = importlib.metadata.version("docling-core")
|
||||
docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
|
||||
docling_parse_version = importlib.metadata.version("docling-parse")
|
||||
platform_str = platform.platform()
|
||||
py_impl_version = sys.implementation.cache_tag
|
||||
py_lang_version = platform.python_version()
|
||||
console.print(f"Docling Serve version: {docling_serve_version}")
|
||||
console.print(f"Docling version: {docling_version}")
|
||||
console.print(f"Docling Core version: {docling_core_version}")
|
||||
console.print(f"Docling IBM Models version: {docling_ibm_models_version}")
|
||||
console.print(f"Docling Parse version: {docling_parse_version}")
|
||||
console.print(f"Python: {py_impl_version} ({py_lang_version})")
|
||||
console.print(f"Platform: {platform_str}")
|
||||
raise typer.Exit()
|
||||
|
||||
|
||||
@app.callback()
|
||||
def callback(
|
||||
version: Annotated[
|
||||
Union[bool, None],
|
||||
typer.Option(help="Show the version and exit.", callback=version_callback),
|
||||
] = None,
|
||||
verbose: Annotated[
|
||||
int,
|
||||
typer.Option(
|
||||
"--verbose",
|
||||
"-v",
|
||||
count=True,
|
||||
help="Set the verbosity level. -v for info logging, -vv for debug logging.",
|
||||
),
|
||||
] = 0,
|
||||
) -> None:
|
||||
if verbose == 0:
|
||||
logging.basicConfig(level=logging.WARNING)
|
||||
elif verbose == 1:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
elif verbose == 2:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
|
||||
def _run(
|
||||
*,
|
||||
command: str,
|
||||
# Docling serve parameters
|
||||
artifacts_path: Path | None,
|
||||
enable_ui: bool,
|
||||
) -> None:
|
||||
server_type = "development" if command == "dev" else "production"
|
||||
|
||||
console.print(f"Starting {server_type} server 🚀")
|
||||
|
||||
run_subprocess = (
|
||||
uvicorn_settings.workers is not None and uvicorn_settings.workers > 1
|
||||
) or uvicorn_settings.reload
|
||||
|
||||
run_ssl = (
|
||||
uvicorn_settings.ssl_certfile is not None
|
||||
and uvicorn_settings.ssl_keyfile is not None
|
||||
)
|
||||
|
||||
if run_subprocess and docling_serve_settings.artifacts_path != artifacts_path:
|
||||
err_console.print(
|
||||
"\n[yellow]:warning: The server will run with reload or multiple workers. \n"
|
||||
"The argument [bold]--artifacts-path[/bold] will be ignored, please set the value \n"
|
||||
"using the environment variable [bold]DOCLING_SERVE_ARTIFACTS_PATH[/bold].[/yellow]"
|
||||
)
|
||||
|
||||
if run_subprocess and docling_serve_settings.enable_ui != enable_ui:
|
||||
err_console.print(
|
||||
"\n[yellow]:warning: The server will run with reload or multiple workers. \n"
|
||||
"The argument [bold]--enable-ui[/bold] will be ignored, please set the value \n"
|
||||
"using the environment variable [bold]DOCLING_SERVE_ENABLE_UI[/bold].[/yellow]"
|
||||
)
|
||||
|
||||
# Propagate the settings to the app settings
|
||||
docling_serve_settings.artifacts_path = artifacts_path
|
||||
docling_serve_settings.enable_ui = enable_ui
|
||||
|
||||
# Print documentation
|
||||
protocol = "https" if run_ssl else "http"
|
||||
url = f"{protocol}://{uvicorn_settings.host}:{uvicorn_settings.port}"
|
||||
url_docs = f"{url}/docs"
|
||||
url_ui = f"{url}/ui"
|
||||
|
||||
console.print("")
|
||||
console.print(f"Server started at [link={url}]{url}[/]")
|
||||
console.print(f"Documentation at [link={url_docs}]{url_docs}[/]")
|
||||
if docling_serve_settings.enable_ui:
|
||||
console.print(f"UI at [link={url_ui}]{url_ui}[/]")
|
||||
|
||||
if command == "dev":
|
||||
console.print("")
|
||||
console.print(
|
||||
"Running in development mode, for production use: "
|
||||
"[bold]docling-serve run[/]",
|
||||
)
|
||||
|
||||
console.print("")
|
||||
console.print("Logs:")
|
||||
|
||||
# Launch the server
|
||||
uvicorn.run(
|
||||
app="docling_serve.app:create_app",
|
||||
factory=True,
|
||||
host=uvicorn_settings.host,
|
||||
port=uvicorn_settings.port,
|
||||
reload=uvicorn_settings.reload,
|
||||
workers=uvicorn_settings.workers,
|
||||
root_path=uvicorn_settings.root_path,
|
||||
proxy_headers=uvicorn_settings.proxy_headers,
|
||||
timeout_keep_alive=uvicorn_settings.timeout_keep_alive,
|
||||
ssl_certfile=uvicorn_settings.ssl_certfile,
|
||||
ssl_keyfile=uvicorn_settings.ssl_keyfile,
|
||||
ssl_keyfile_password=uvicorn_settings.ssl_keyfile_password,
|
||||
)
|
||||
|
||||
|
||||
@app.command()
|
||||
def dev(
|
||||
*,
|
||||
# uvicorn options
|
||||
host: Annotated[
|
||||
str,
|
||||
typer.Option(
|
||||
help=(
|
||||
"The host to serve on. For local development in localhost "
|
||||
"use [blue]127.0.0.1[/blue]. To enable public access, "
|
||||
"e.g. in a container, use all the IP addresses "
|
||||
"available with [blue]0.0.0.0[/blue]."
|
||||
)
|
||||
),
|
||||
] = "127.0.0.1",
|
||||
port: Annotated[
|
||||
int,
|
||||
typer.Option(help="The port to serve on."),
|
||||
] = uvicorn_settings.port,
|
||||
reload: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
help=(
|
||||
"Enable auto-reload of the server when (code) files change. "
|
||||
"This is [bold]resource intensive[/bold], "
|
||||
"use it only during development."
|
||||
)
|
||||
),
|
||||
] = True,
|
||||
root_path: Annotated[
|
||||
str,
|
||||
typer.Option(
|
||||
help=(
|
||||
"The root path is used to tell your app that it is being served "
|
||||
"to the outside world with some [bold]path prefix[/bold] "
|
||||
"set up in some termination proxy or similar."
|
||||
)
|
||||
),
|
||||
] = uvicorn_settings.root_path,
|
||||
proxy_headers: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
help=(
|
||||
"Enable/Disable X-Forwarded-Proto, X-Forwarded-For, "
|
||||
"X-Forwarded-Port to populate remote address info."
|
||||
)
|
||||
),
|
||||
] = uvicorn_settings.proxy_headers,
|
||||
timeout_keep_alive: Annotated[
|
||||
int, typer.Option(help="Timeout for the server response.")
|
||||
] = uvicorn_settings.timeout_keep_alive,
|
||||
ssl_certfile: Annotated[
|
||||
Optional[Path], typer.Option(help="SSL certificate file")
|
||||
] = uvicorn_settings.ssl_certfile,
|
||||
ssl_keyfile: Annotated[
|
||||
Optional[Path], typer.Option(help="SSL key file")
|
||||
] = uvicorn_settings.ssl_keyfile,
|
||||
ssl_keyfile_password: Annotated[
|
||||
Optional[str], typer.Option(help="SSL keyfile password")
|
||||
] = uvicorn_settings.ssl_keyfile_password,
|
||||
# docling options
|
||||
artifacts_path: Annotated[
|
||||
Optional[Path],
|
||||
typer.Option(
|
||||
help=(
|
||||
"If set to a valid directory, "
|
||||
"the model weights will be loaded from this path."
|
||||
)
|
||||
),
|
||||
] = docling_serve_settings.artifacts_path,
|
||||
enable_ui: Annotated[bool, typer.Option(help="Enable the development UI.")] = True,
|
||||
) -> Any:
|
||||
"""
|
||||
Run a [bold]Docling Serve[/bold] app in [yellow]development[/yellow] mode. 🧪
|
||||
|
||||
This is equivalent to [bold]docling-serve run[/bold] but with [bold]reload[/bold]
|
||||
enabled and listening on the [blue]127.0.0.1[/blue] address.
|
||||
|
||||
Options can be set also with the corresponding ENV variable, with the exception
|
||||
of --enable-ui, --host and --reload.
|
||||
"""
|
||||
|
||||
uvicorn_settings.host = host
|
||||
uvicorn_settings.port = port
|
||||
uvicorn_settings.reload = reload
|
||||
uvicorn_settings.root_path = root_path
|
||||
uvicorn_settings.proxy_headers = proxy_headers
|
||||
uvicorn_settings.timeout_keep_alive = timeout_keep_alive
|
||||
uvicorn_settings.ssl_certfile = ssl_certfile
|
||||
uvicorn_settings.ssl_keyfile = ssl_keyfile
|
||||
uvicorn_settings.ssl_keyfile_password = ssl_keyfile_password
|
||||
|
||||
_run(
|
||||
command="dev",
|
||||
artifacts_path=artifacts_path,
|
||||
enable_ui=enable_ui,
|
||||
)
|
||||
|
||||
|
||||
@app.command()
|
||||
def run(
|
||||
*,
|
||||
host: Annotated[
|
||||
str,
|
||||
typer.Option(
|
||||
help=(
|
||||
"The host to serve on. For local development in localhost "
|
||||
"use [blue]127.0.0.1[/blue]. To enable public access, "
|
||||
"e.g. in a container, use all the IP addresses "
|
||||
"available with [blue]0.0.0.0[/blue]."
|
||||
)
|
||||
),
|
||||
] = uvicorn_settings.host,
|
||||
port: Annotated[
|
||||
int,
|
||||
typer.Option(help="The port to serve on."),
|
||||
] = uvicorn_settings.port,
|
||||
reload: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
help=(
|
||||
"Enable auto-reload of the server when (code) files change. "
|
||||
"This is [bold]resource intensive[/bold], "
|
||||
"use it only during development."
|
||||
)
|
||||
),
|
||||
] = uvicorn_settings.reload,
|
||||
workers: Annotated[
|
||||
Union[int, None],
|
||||
typer.Option(
|
||||
help=(
|
||||
"Use multiple worker processes. "
|
||||
"Mutually exclusive with the --reload flag."
|
||||
)
|
||||
),
|
||||
] = uvicorn_settings.workers,
|
||||
root_path: Annotated[
|
||||
str,
|
||||
typer.Option(
|
||||
help=(
|
||||
"The root path is used to tell your app that it is being served "
|
||||
"to the outside world with some [bold]path prefix[/bold] "
|
||||
"set up in some termination proxy or similar."
|
||||
)
|
||||
),
|
||||
] = uvicorn_settings.root_path,
|
||||
proxy_headers: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
help=(
|
||||
"Enable/Disable X-Forwarded-Proto, X-Forwarded-For, "
|
||||
"X-Forwarded-Port to populate remote address info."
|
||||
)
|
||||
),
|
||||
] = uvicorn_settings.proxy_headers,
|
||||
timeout_keep_alive: Annotated[
|
||||
int, typer.Option(help="Timeout for the server response.")
|
||||
] = uvicorn_settings.timeout_keep_alive,
|
||||
ssl_certfile: Annotated[
|
||||
Optional[Path], typer.Option(help="SSL certificate file")
|
||||
] = uvicorn_settings.ssl_certfile,
|
||||
ssl_keyfile: Annotated[
|
||||
Optional[Path], typer.Option(help="SSL key file")
|
||||
] = uvicorn_settings.ssl_keyfile,
|
||||
ssl_keyfile_password: Annotated[
|
||||
Optional[str], typer.Option(help="SSL keyfile password")
|
||||
] = uvicorn_settings.ssl_keyfile_password,
|
||||
# docling options
|
||||
artifacts_path: Annotated[
|
||||
Optional[Path],
|
||||
typer.Option(
|
||||
help=(
|
||||
"If set to a valid directory, "
|
||||
"the model weights will be loaded from this path."
|
||||
)
|
||||
),
|
||||
] = docling_serve_settings.artifacts_path,
|
||||
enable_ui: Annotated[
|
||||
bool, typer.Option(help="Enable the development UI.")
|
||||
] = docling_serve_settings.enable_ui,
|
||||
) -> Any:
|
||||
"""
|
||||
Run a [bold]Docling Serve[/bold] app in [green]production[/green] mode. 🚀
|
||||
|
||||
This is equivalent to [bold]docling-serve dev[/bold] but with [bold]reload[/bold]
|
||||
disabled and listening on the [blue]0.0.0.0[/blue] address.
|
||||
|
||||
Options can be set also with the corresponding ENV variable, e.g. UVICORN_PORT
|
||||
or DOCLING_SERVE_ENABLE_UI.
|
||||
"""
|
||||
|
||||
uvicorn_settings.host = host
|
||||
uvicorn_settings.port = port
|
||||
uvicorn_settings.reload = reload
|
||||
uvicorn_settings.workers = workers
|
||||
uvicorn_settings.root_path = root_path
|
||||
uvicorn_settings.proxy_headers = proxy_headers
|
||||
uvicorn_settings.timeout_keep_alive = timeout_keep_alive
|
||||
uvicorn_settings.ssl_certfile = ssl_certfile
|
||||
uvicorn_settings.ssl_keyfile = ssl_keyfile
|
||||
uvicorn_settings.ssl_keyfile_password = ssl_keyfile_password
|
||||
|
||||
_run(
|
||||
command="run",
|
||||
artifacts_path=artifacts_path,
|
||||
enable_ui=enable_ui,
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
app()
|
||||
|
||||
|
||||
# Launch the CLI when calling python -m docling_serve
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,279 +1,430 @@
|
||||
import base64
|
||||
import hashlib
|
||||
import asyncio
|
||||
import importlib.metadata
|
||||
import logging
|
||||
import tempfile
|
||||
from contextlib import asynccontextmanager
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Any, Optional, Union
|
||||
|
||||
import httpx
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DocumentStream,
|
||||
ErrorItem,
|
||||
InputFormat,
|
||||
from fastapi import (
|
||||
BackgroundTasks,
|
||||
Depends,
|
||||
FastAPI,
|
||||
HTTPException,
|
||||
Query,
|
||||
UploadFile,
|
||||
WebSocket,
|
||||
WebSocketDisconnect,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
RapidOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.openapi.docs import (
|
||||
get_redoc_html,
|
||||
get_swagger_ui_html,
|
||||
get_swagger_ui_oauth2_redirect_html,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.utils.profiling import ProfilingItem
|
||||
from docling_core.types.doc import DoclingDocument, ImageRefMode
|
||||
from docling_core.utils.file import resolve_remote_filename
|
||||
from fastapi import FastAPI, HTTPException, Response
|
||||
from pydantic import AnyHttpUrl, BaseModel
|
||||
from fastapi.responses import RedirectResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
from docling.datamodel.base_models import DocumentStream
|
||||
|
||||
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
||||
from docling_serve.datamodel.requests import (
|
||||
ConvertDocumentFileSourcesRequest,
|
||||
ConvertDocumentsRequest,
|
||||
)
|
||||
from docling_serve.datamodel.responses import (
|
||||
ConvertDocumentResponse,
|
||||
HealthCheckResponse,
|
||||
MessageKind,
|
||||
TaskStatusResponse,
|
||||
WebsocketMessage,
|
||||
)
|
||||
from docling_serve.docling_conversion import (
|
||||
convert_documents,
|
||||
get_converter,
|
||||
get_pdf_pipeline_opts,
|
||||
)
|
||||
from docling_serve.engines import get_orchestrator
|
||||
from docling_serve.engines.async_local.orchestrator import (
|
||||
AsyncLocalOrchestrator,
|
||||
TaskNotFoundError,
|
||||
)
|
||||
from docling_serve.helper_functions import FormDepends
|
||||
from docling_serve.response_preparation import process_results
|
||||
from docling_serve.settings import docling_serve_settings
|
||||
|
||||
|
||||
# TODO: import enum from Docling, once it is exposed
|
||||
class OcrEngine(str, Enum):
|
||||
EASYOCR = "easyocr"
|
||||
TESSERACT = "tesseract"
|
||||
RAPIDOCR = "rapidocr"
|
||||
# Set up custom logging as we'll be intermixes with FastAPI/Uvicorn's logging
|
||||
class ColoredLogFormatter(logging.Formatter):
|
||||
COLOR_CODES = {
|
||||
logging.DEBUG: "\033[94m", # Blue
|
||||
logging.INFO: "\033[92m", # Green
|
||||
logging.WARNING: "\033[93m", # Yellow
|
||||
logging.ERROR: "\033[91m", # Red
|
||||
logging.CRITICAL: "\033[95m", # Magenta
|
||||
}
|
||||
RESET_CODE = "\033[0m"
|
||||
|
||||
def format(self, record):
|
||||
color = self.COLOR_CODES.get(record.levelno, "")
|
||||
record.levelname = f"{color}{record.levelname}{self.RESET_CODE}"
|
||||
return super().format(record)
|
||||
|
||||
|
||||
class ConvertOptions(BaseModel):
|
||||
output_docling_document: bool = True
|
||||
output_markdown: bool = False
|
||||
output_html: bool = False
|
||||
do_ocr: bool = True
|
||||
ocr_engine: OcrEngine = OcrEngine.EASYOCR
|
||||
ocr_lang: Optional[List[str]] = None
|
||||
force_ocr: bool = False
|
||||
do_table_structure: bool = True
|
||||
include_images: bool = True
|
||||
images_scale: float = 2.0
|
||||
|
||||
|
||||
class DocumentConvertBase(BaseModel):
|
||||
options: ConvertOptions = ConvertOptions()
|
||||
|
||||
|
||||
class HttpSource(BaseModel):
|
||||
url: str
|
||||
headers: Dict[str, Any] = {}
|
||||
|
||||
|
||||
class FileSource(BaseModel):
|
||||
base64_string: str
|
||||
filename: str
|
||||
|
||||
|
||||
class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
|
||||
http_source: HttpSource
|
||||
|
||||
|
||||
class ConvertDocumentFileSourceRequest(DocumentConvertBase):
|
||||
file_source: FileSource
|
||||
|
||||
|
||||
class DocumentResponse(BaseModel):
|
||||
markdown: Optional[str] = None
|
||||
docling_document: Optional[DoclingDocument] = None
|
||||
html: Optional[str] = None
|
||||
|
||||
|
||||
class ConvertDocumentResponse(BaseModel):
|
||||
document: DocumentResponse
|
||||
status: ConversionStatus
|
||||
errors: List[ErrorItem] = []
|
||||
timings: Dict[str, ProfilingItem] = {}
|
||||
|
||||
|
||||
class ConvertDocumentErrorResponse(BaseModel):
|
||||
status: ConversionStatus
|
||||
# errors: List[ErrorItem] = []
|
||||
|
||||
|
||||
ConvertDocumentRequest = Union[
|
||||
ConvertDocumentFileSourceRequest, ConvertDocumentHttpSourceRequest
|
||||
]
|
||||
|
||||
|
||||
class MarkdownTextResponse(Response):
|
||||
media_type = "text/markdown"
|
||||
|
||||
|
||||
class HealthCheckResponse(BaseModel):
|
||||
status: str = "ok"
|
||||
|
||||
|
||||
def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
|
||||
|
||||
if options.ocr_engine == OcrEngine.EASYOCR:
|
||||
try:
|
||||
import easyocr # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={options.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=options.force_ocr)
|
||||
elif options.ocr_engine == OcrEngine.TESSERACT:
|
||||
try:
|
||||
import tesserocr # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={options.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=options.force_ocr)
|
||||
elif options.ocr_engine == OcrEngine.RAPIDOCR:
|
||||
try:
|
||||
from rapidocr_onnxruntime import RapidOCR # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={options.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options = RapidOcrOptions(force_full_page_ocr=options.force_ocr)
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")
|
||||
|
||||
if options.ocr_lang is not None:
|
||||
ocr_options.lang = options.ocr_lang
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=options.do_ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=options.do_table_structure,
|
||||
generate_page_images=options.include_images,
|
||||
generate_picture_images=options.include_images,
|
||||
images_scale=options.images_scale,
|
||||
)
|
||||
|
||||
options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()
|
||||
|
||||
return pipeline_options, options_hash
|
||||
|
||||
|
||||
converters: Dict[str, DocumentConverter] = {}
|
||||
logging.basicConfig(
|
||||
level=logging.INFO, # Set the logging level
|
||||
format="%(levelname)s:\t%(asctime)s - %(name)s - %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
|
||||
# Override the formatter with the custom ColoredLogFormatter
|
||||
root_logger = logging.getLogger() # Get the root logger
|
||||
for handler in root_logger.handlers: # Iterate through existing handlers
|
||||
if handler.formatter:
|
||||
handler.setFormatter(ColoredLogFormatter(handler.formatter._fmt))
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Context manager to initialize and clean up the lifespan of the FastAPI app
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
# settings = Settings()
|
||||
|
||||
# Converter with default options
|
||||
pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
|
||||
converters[options_hash] = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
}
|
||||
)
|
||||
pdf_format_option = get_pdf_pipeline_opts(ConvertDocumentsOptions())
|
||||
get_converter(pdf_format_option)
|
||||
|
||||
converters[options_hash].initialize_pipeline(InputFormat.PDF)
|
||||
orchestrator = get_orchestrator()
|
||||
|
||||
# Start the background queue processor
|
||||
queue_task = asyncio.create_task(orchestrator.process_queue())
|
||||
|
||||
yield
|
||||
|
||||
converters.clear()
|
||||
# Cancel the background queue processor on shutdown
|
||||
queue_task.cancel()
|
||||
try:
|
||||
await queue_task
|
||||
except asyncio.CancelledError:
|
||||
_log.info("Queue processor cancelled.")
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="Docling Serve",
|
||||
lifespan=lifespan,
|
||||
)
|
||||
##################################
|
||||
# App creation and configuration #
|
||||
##################################
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
def health() -> HealthCheckResponse:
|
||||
return HealthCheckResponse()
|
||||
def create_app(): # noqa: C901
|
||||
try:
|
||||
version = importlib.metadata.version("docling_serve")
|
||||
except importlib.metadata.PackageNotFoundError:
|
||||
_log.warning("Unable to get docling_serve version, falling back to 0.0.0")
|
||||
|
||||
version = "0.0.0"
|
||||
|
||||
def _convert_document(
|
||||
body: ConvertDocumentRequest,
|
||||
) -> ConversionResult:
|
||||
offline_docs_assets = False
|
||||
if (
|
||||
docling_serve_settings.static_path is not None
|
||||
and (docling_serve_settings.static_path).is_dir()
|
||||
):
|
||||
offline_docs_assets = True
|
||||
_log.info("Found static assets.")
|
||||
|
||||
filename: str
|
||||
buf: BytesIO
|
||||
app = FastAPI(
|
||||
title="Docling Serve",
|
||||
docs_url=None if offline_docs_assets else "/docs",
|
||||
redoc_url=None if offline_docs_assets else "/redocs",
|
||||
lifespan=lifespan,
|
||||
version=version,
|
||||
)
|
||||
|
||||
if isinstance(body, ConvertDocumentFileSourceRequest):
|
||||
buf = BytesIO(base64.b64decode(body.file_source.base64_string))
|
||||
filename = body.file_source.filename
|
||||
elif isinstance(body, ConvertDocumentHttpSourceRequest):
|
||||
http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
|
||||
buf = BytesIO(http_res.content)
|
||||
filename = resolve_remote_filename(
|
||||
http_url=AnyHttpUrl(body.http_source.url),
|
||||
response_headers=dict(**http_res.headers),
|
||||
origins = docling_serve_settings.cors_origins
|
||||
methods = docling_serve_settings.cors_methods
|
||||
headers = docling_serve_settings.cors_headers
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=origins,
|
||||
allow_credentials=True,
|
||||
allow_methods=methods,
|
||||
allow_headers=headers,
|
||||
)
|
||||
|
||||
# Mount the Gradio app
|
||||
if docling_serve_settings.enable_ui:
|
||||
try:
|
||||
import gradio as gr
|
||||
|
||||
from docling_serve.gradio_ui import ui as gradio_ui
|
||||
|
||||
tmp_output_dir = Path(tempfile.mkdtemp())
|
||||
gradio_ui.gradio_output_dir = tmp_output_dir
|
||||
app = gr.mount_gradio_app(
|
||||
app,
|
||||
gradio_ui,
|
||||
path="/ui",
|
||||
allowed_paths=["./logo.png", tmp_output_dir],
|
||||
root_path="/ui",
|
||||
)
|
||||
except ImportError:
|
||||
_log.warning(
|
||||
"Docling Serve enable_ui is activated, but gradio is not installed. "
|
||||
"Install it with `pip install docling-serve[ui]` "
|
||||
"or `pip install gradio`"
|
||||
)
|
||||
|
||||
#############################
|
||||
# Offline assets definition #
|
||||
#############################
|
||||
if offline_docs_assets:
|
||||
app.mount(
|
||||
"/static",
|
||||
StaticFiles(directory=docling_serve_settings.static_path),
|
||||
name="static",
|
||||
)
|
||||
|
||||
doc_input = DocumentStream(name=filename, stream=buf)
|
||||
@app.get("/docs", include_in_schema=False)
|
||||
async def custom_swagger_ui_html():
|
||||
return get_swagger_ui_html(
|
||||
openapi_url=app.openapi_url,
|
||||
title=app.title + " - Swagger UI",
|
||||
oauth2_redirect_url=app.swagger_ui_oauth2_redirect_url,
|
||||
swagger_js_url="/static/swagger-ui-bundle.js",
|
||||
swagger_css_url="/static/swagger-ui.css",
|
||||
)
|
||||
|
||||
pipeline_options, options_hash = get_pdf_pipeline_opts(body.options)
|
||||
if options_hash not in converters:
|
||||
converters[options_hash] = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
@app.get(app.swagger_ui_oauth2_redirect_url, include_in_schema=False)
|
||||
async def swagger_ui_redirect():
|
||||
return get_swagger_ui_oauth2_redirect_html()
|
||||
|
||||
@app.get("/redoc", include_in_schema=False)
|
||||
async def redoc_html():
|
||||
return get_redoc_html(
|
||||
openapi_url=app.openapi_url,
|
||||
title=app.title + " - ReDoc",
|
||||
redoc_js_url="/static/redoc.standalone.js",
|
||||
)
|
||||
|
||||
#############################
|
||||
# API Endpoints definitions #
|
||||
#############################
|
||||
|
||||
# Favicon
|
||||
@app.get("/favicon.ico", include_in_schema=False)
|
||||
async def favicon():
|
||||
logo_url = "https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"
|
||||
if offline_docs_assets:
|
||||
logo_url = "/static/logo.svg"
|
||||
response = RedirectResponse(url=logo_url)
|
||||
return response
|
||||
|
||||
@app.get("/health")
|
||||
def health() -> HealthCheckResponse:
|
||||
return HealthCheckResponse()
|
||||
|
||||
# API readiness compatibility for OpenShift AI Workbench
|
||||
@app.get("/api", include_in_schema=False)
|
||||
def api_check() -> HealthCheckResponse:
|
||||
return HealthCheckResponse()
|
||||
|
||||
# Convert a document from URL(s)
|
||||
@app.post(
|
||||
"/v1alpha/convert/source",
|
||||
response_model=ConvertDocumentResponse,
|
||||
responses={
|
||||
200: {
|
||||
"content": {"application/zip": {}},
|
||||
# "description": "Return the JSON item or an image.",
|
||||
}
|
||||
},
|
||||
)
|
||||
def process_url(
|
||||
background_tasks: BackgroundTasks, conversion_request: ConvertDocumentsRequest
|
||||
):
|
||||
sources: list[Union[str, DocumentStream]] = []
|
||||
headers: Optional[dict[str, Any]] = None
|
||||
if isinstance(conversion_request, ConvertDocumentFileSourcesRequest):
|
||||
for file_source in conversion_request.file_sources:
|
||||
sources.append(file_source.to_document_stream())
|
||||
else:
|
||||
for http_source in conversion_request.http_sources:
|
||||
sources.append(http_source.url)
|
||||
if headers is None and http_source.headers:
|
||||
headers = http_source.headers
|
||||
|
||||
# Note: results are only an iterator->lazy evaluation
|
||||
results = convert_documents(
|
||||
sources=sources, options=conversion_request.options, headers=headers
|
||||
)
|
||||
|
||||
result: ConversionResult = converters[options_hash].convert(doc_input)
|
||||
|
||||
if result is None or result.status == ConversionStatus.SKIPPED:
|
||||
raise HTTPException(status_code=400, detail=result.errors)
|
||||
|
||||
if result is None or result.status not in {
|
||||
ConversionStatus.SUCCESS,
|
||||
}:
|
||||
raise HTTPException(
|
||||
status_code=500, detail={"errors": result.errors, "status": result.status}
|
||||
# The real processing will happen here
|
||||
response = process_results(
|
||||
background_tasks=background_tasks,
|
||||
conversion_options=conversion_request.options,
|
||||
conv_results=results,
|
||||
)
|
||||
|
||||
return result
|
||||
return response
|
||||
|
||||
|
||||
@app.post(
|
||||
"/convert",
|
||||
)
|
||||
def convert_document(
|
||||
body: ConvertDocumentRequest,
|
||||
) -> ConvertDocumentResponse:
|
||||
|
||||
result = _convert_document(body=body)
|
||||
|
||||
image_mode = (
|
||||
ImageRefMode.EMBEDDED
|
||||
if body.options.include_images
|
||||
else ImageRefMode.PLACEHOLDER
|
||||
# Convert a document from file(s)
|
||||
@app.post(
|
||||
"/v1alpha/convert/file",
|
||||
response_model=ConvertDocumentResponse,
|
||||
responses={
|
||||
200: {
|
||||
"content": {"application/zip": {}},
|
||||
}
|
||||
},
|
||||
)
|
||||
doc_resp = DocumentResponse()
|
||||
if body.options.output_docling_document:
|
||||
doc_resp.docling_document = result.document
|
||||
if body.options.output_markdown:
|
||||
doc_resp.markdown = result.document.export_to_markdown(image_mode=image_mode)
|
||||
if body.options.output_html:
|
||||
doc_resp.html = result.document.export_to_html(image_mode=image_mode)
|
||||
async def process_file(
|
||||
background_tasks: BackgroundTasks,
|
||||
files: list[UploadFile],
|
||||
options: Annotated[
|
||||
ConvertDocumentsOptions, FormDepends(ConvertDocumentsOptions)
|
||||
],
|
||||
):
|
||||
_log.info(f"Received {len(files)} files for processing.")
|
||||
|
||||
return ConvertDocumentResponse(
|
||||
document=doc_resp, status=result.status, timings=result.timings
|
||||
)
|
||||
# Load the uploaded files to Docling DocumentStream
|
||||
file_sources = []
|
||||
for file in files:
|
||||
buf = BytesIO(file.file.read())
|
||||
name = file.filename if file.filename else "file.pdf"
|
||||
file_sources.append(DocumentStream(name=name, stream=buf))
|
||||
|
||||
results = convert_documents(sources=file_sources, options=options)
|
||||
|
||||
@app.post("/convert/markdown", response_class=MarkdownTextResponse)
|
||||
def convert_document_md(
|
||||
body: ConvertDocumentRequest,
|
||||
) -> MarkdownTextResponse:
|
||||
result = _convert_document(body=body)
|
||||
image_mode = (
|
||||
ImageRefMode.EMBEDDED
|
||||
if body.options.include_images
|
||||
else ImageRefMode.PLACEHOLDER
|
||||
response = process_results(
|
||||
background_tasks=background_tasks,
|
||||
conversion_options=options,
|
||||
conv_results=results,
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
# Convert a document from URL(s) using the async api
|
||||
@app.post(
|
||||
"/v1alpha/convert/source/async",
|
||||
response_model=TaskStatusResponse,
|
||||
)
|
||||
return MarkdownTextResponse(
|
||||
result.document.export_to_markdown(image_mode=image_mode)
|
||||
async def process_url_async(
|
||||
orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)],
|
||||
conversion_request: ConvertDocumentsRequest,
|
||||
):
|
||||
task = await orchestrator.enqueue(request=conversion_request)
|
||||
task_queue_position = await orchestrator.get_queue_position(
|
||||
task_id=task.task_id
|
||||
)
|
||||
return TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
)
|
||||
|
||||
# Task status poll
|
||||
@app.get(
|
||||
"/v1alpha/status/poll/{task_id}",
|
||||
response_model=TaskStatusResponse,
|
||||
)
|
||||
async def task_status_poll(
|
||||
orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)],
|
||||
task_id: str,
|
||||
wait: Annotated[
|
||||
float, Query(help="Number of seconds to wait for a completed status.")
|
||||
] = 0.0,
|
||||
):
|
||||
try:
|
||||
task = await orchestrator.task_status(task_id=task_id, wait=wait)
|
||||
task_queue_position = await orchestrator.get_queue_position(task_id=task_id)
|
||||
except TaskNotFoundError:
|
||||
raise HTTPException(status_code=404, detail="Task not found.")
|
||||
return TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
)
|
||||
|
||||
# Task status websocket
|
||||
@app.websocket(
|
||||
"/v1alpha/status/ws/{task_id}",
|
||||
)
|
||||
async def task_status_ws(
|
||||
websocket: WebSocket,
|
||||
orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)],
|
||||
task_id: str,
|
||||
):
|
||||
await websocket.accept()
|
||||
|
||||
if task_id not in orchestrator.tasks:
|
||||
await websocket.send_text(
|
||||
WebsocketMessage(
|
||||
message=MessageKind.ERROR, error="Task not found."
|
||||
).model_dump_json()
|
||||
)
|
||||
await websocket.close()
|
||||
return
|
||||
|
||||
task = orchestrator.tasks[task_id]
|
||||
|
||||
# Track active WebSocket connections for this job
|
||||
orchestrator.task_subscribers[task_id].add(websocket)
|
||||
|
||||
try:
|
||||
task_queue_position = await orchestrator.get_queue_position(task_id=task_id)
|
||||
task_response = TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
)
|
||||
await websocket.send_text(
|
||||
WebsocketMessage(
|
||||
message=MessageKind.CONNECTION, task=task_response
|
||||
).model_dump_json()
|
||||
)
|
||||
while True:
|
||||
task_queue_position = await orchestrator.get_queue_position(
|
||||
task_id=task_id
|
||||
)
|
||||
task_response = TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
)
|
||||
await websocket.send_text(
|
||||
WebsocketMessage(
|
||||
message=MessageKind.UPDATE, task=task_response
|
||||
).model_dump_json()
|
||||
)
|
||||
# each client message will be interpreted as a request for update
|
||||
msg = await websocket.receive_text()
|
||||
_log.debug(f"Received message: {msg}")
|
||||
|
||||
except WebSocketDisconnect:
|
||||
_log.info(f"WebSocket disconnected for job {task_id}")
|
||||
|
||||
finally:
|
||||
orchestrator.task_subscribers[task_id].remove(websocket)
|
||||
|
||||
# Task result
|
||||
@app.get(
|
||||
"/v1alpha/result/{task_id}",
|
||||
response_model=ConvertDocumentResponse,
|
||||
responses={
|
||||
200: {
|
||||
"content": {"application/zip": {}},
|
||||
}
|
||||
},
|
||||
)
|
||||
async def task_result(
|
||||
orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)],
|
||||
task_id: str,
|
||||
):
|
||||
result = await orchestrator.task_result(task_id=task_id)
|
||||
if result is None:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail="Task result not found. Please wait for a completion status.",
|
||||
)
|
||||
return result
|
||||
|
||||
return app
|
||||
|
||||
0
docling_serve/datamodel/__init__.py
Normal file
0
docling_serve/datamodel/__init__.py
Normal file
229
docling_serve/datamodel/convert.py
Normal file
229
docling_serve/datamodel/convert.py
Normal file
@@ -0,0 +1,229 @@
|
||||
# Define the input options for the API
|
||||
from typing import Annotated, Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from docling.datamodel.base_models import InputFormat, OutputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
PdfBackend,
|
||||
TableFormerMode,
|
||||
)
|
||||
from docling.models.factories import get_ocr_factory
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
|
||||
from docling_serve.settings import docling_serve_settings
|
||||
|
||||
ocr_factory = get_ocr_factory(
|
||||
allow_external_plugins=docling_serve_settings.allow_external_plugins
|
||||
)
|
||||
ocr_engines_enum = ocr_factory.get_enum()
|
||||
|
||||
|
||||
class ConvertDocumentsOptions(BaseModel):
|
||||
from_formats: Annotated[
|
||||
list[InputFormat],
|
||||
Field(
|
||||
description=(
|
||||
"Input format(s) to convert from. String or list of strings. "
|
||||
f"Allowed values: {', '.join([v.value for v in InputFormat])}. "
|
||||
"Optional, defaults to all formats."
|
||||
),
|
||||
examples=[[v.value for v in InputFormat]],
|
||||
),
|
||||
] = list(InputFormat)
|
||||
|
||||
to_formats: Annotated[
|
||||
list[OutputFormat],
|
||||
Field(
|
||||
description=(
|
||||
"Output format(s) to convert to. String or list of strings. "
|
||||
f"Allowed values: {', '.join([v.value for v in OutputFormat])}. "
|
||||
"Optional, defaults to Markdown."
|
||||
),
|
||||
examples=[[OutputFormat.MARKDOWN]],
|
||||
),
|
||||
] = [OutputFormat.MARKDOWN]
|
||||
|
||||
image_export_mode: Annotated[
|
||||
ImageRefMode,
|
||||
Field(
|
||||
description=(
|
||||
"Image export mode for the document (in case of JSON,"
|
||||
" Markdown or HTML). "
|
||||
f"Allowed values: {', '.join([v.value for v in ImageRefMode])}. "
|
||||
"Optional, defaults to Embedded."
|
||||
),
|
||||
examples=[ImageRefMode.EMBEDDED.value],
|
||||
# pattern="embedded|placeholder|referenced",
|
||||
),
|
||||
] = ImageRefMode.EMBEDDED
|
||||
|
||||
do_ocr: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"If enabled, the bitmap content will be processed using OCR. "
|
||||
"Boolean. Optional, defaults to true"
|
||||
),
|
||||
# examples=[True],
|
||||
),
|
||||
] = True
|
||||
|
||||
force_ocr: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"If enabled, replace existing text with OCR-generated "
|
||||
"text over content. Boolean. Optional, defaults to false."
|
||||
),
|
||||
# examples=[False],
|
||||
),
|
||||
] = False
|
||||
|
||||
ocr_engine: Annotated[ # type: ignore
|
||||
ocr_engines_enum,
|
||||
Field(
|
||||
description=(
|
||||
"The OCR engine to use. String. "
|
||||
f"Allowed values: {', '.join([v.value for v in ocr_engines_enum])}. "
|
||||
"Optional, defaults to easyocr."
|
||||
),
|
||||
examples=[EasyOcrOptions.kind],
|
||||
),
|
||||
] = ocr_engines_enum(EasyOcrOptions.kind) # type: ignore
|
||||
|
||||
ocr_lang: Annotated[
|
||||
Optional[list[str]],
|
||||
Field(
|
||||
description=(
|
||||
"List of languages used by the OCR engine. "
|
||||
"Note that each OCR engine has "
|
||||
"different values for the language names. String or list of strings. "
|
||||
"Optional, defaults to empty."
|
||||
),
|
||||
examples=[["fr", "de", "es", "en"]],
|
||||
),
|
||||
] = None
|
||||
|
||||
pdf_backend: Annotated[
|
||||
PdfBackend,
|
||||
Field(
|
||||
description=(
|
||||
"The PDF backend to use. String. "
|
||||
f"Allowed values: {', '.join([v.value for v in PdfBackend])}. "
|
||||
f"Optional, defaults to {PdfBackend.DLPARSE_V4.value}."
|
||||
),
|
||||
examples=[PdfBackend.DLPARSE_V4],
|
||||
),
|
||||
] = PdfBackend.DLPARSE_V4
|
||||
|
||||
table_mode: Annotated[
|
||||
TableFormerMode,
|
||||
Field(
|
||||
TableFormerMode.FAST,
|
||||
description=(
|
||||
"Mode to use for table structure, String. "
|
||||
f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. "
|
||||
"Optional, defaults to fast."
|
||||
),
|
||||
examples=[TableFormerMode.FAST],
|
||||
# pattern="fast|accurate",
|
||||
),
|
||||
] = TableFormerMode.FAST
|
||||
|
||||
abort_on_error: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"Abort on error if enabled. Boolean. Optional, defaults to false."
|
||||
),
|
||||
# examples=[False],
|
||||
),
|
||||
] = False
|
||||
|
||||
return_as_file: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"Return the output as a zip file "
|
||||
"(will happen anyway if multiple files are generated). "
|
||||
"Boolean. Optional, defaults to false."
|
||||
),
|
||||
examples=[False],
|
||||
),
|
||||
] = False
|
||||
|
||||
do_table_structure: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"If enabled, the table structure will be extracted. "
|
||||
"Boolean. Optional, defaults to true."
|
||||
),
|
||||
examples=[True],
|
||||
),
|
||||
] = True
|
||||
|
||||
include_images: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"If enabled, images will be extracted from the document. "
|
||||
"Boolean. Optional, defaults to true."
|
||||
),
|
||||
examples=[True],
|
||||
),
|
||||
] = True
|
||||
|
||||
images_scale: Annotated[
|
||||
float,
|
||||
Field(
|
||||
description="Scale factor for images. Float. Optional, defaults to 2.0.",
|
||||
examples=[2.0],
|
||||
),
|
||||
] = 2.0
|
||||
|
||||
do_code_enrichment: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"If enabled, perform OCR code enrichment. "
|
||||
"Boolean. Optional, defaults to false."
|
||||
),
|
||||
examples=[False],
|
||||
),
|
||||
] = False
|
||||
|
||||
do_formula_enrichment: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"If enabled, perform formula OCR, return Latex code. "
|
||||
"Boolean. Optional, defaults to false."
|
||||
),
|
||||
examples=[False],
|
||||
),
|
||||
] = False
|
||||
|
||||
do_picture_classification: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"If enabled, classify pictures in documents. "
|
||||
"Boolean. Optional, defaults to false."
|
||||
),
|
||||
examples=[False],
|
||||
),
|
||||
] = False
|
||||
|
||||
do_picture_description: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"If enabled, describe pictures in documents. "
|
||||
"Boolean. Optional, defaults to false."
|
||||
),
|
||||
examples=[False],
|
||||
),
|
||||
] = False
|
||||
12
docling_serve/datamodel/engines.py
Normal file
12
docling_serve/datamodel/engines.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import enum
|
||||
|
||||
|
||||
class TaskStatus(str, enum.Enum):
|
||||
SUCCESS = "success"
|
||||
PENDING = "pending"
|
||||
STARTED = "started"
|
||||
FAILURE = "failure"
|
||||
|
||||
|
||||
class AsyncEngine(str, enum.Enum):
|
||||
LOCAL = "local"
|
||||
62
docling_serve/datamodel/requests.py
Normal file
62
docling_serve/datamodel/requests.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import base64
|
||||
from io import BytesIO
|
||||
from typing import Annotated, Any, Union
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from docling.datamodel.base_models import DocumentStream
|
||||
|
||||
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
||||
|
||||
|
||||
class DocumentsConvertBase(BaseModel):
|
||||
options: ConvertDocumentsOptions = ConvertDocumentsOptions()
|
||||
|
||||
|
||||
class HttpSource(BaseModel):
|
||||
url: Annotated[
|
||||
str,
|
||||
Field(
|
||||
description="HTTP url to process",
|
||||
examples=["https://arxiv.org/pdf/2206.01062"],
|
||||
),
|
||||
]
|
||||
headers: Annotated[
|
||||
dict[str, Any],
|
||||
Field(
|
||||
description="Additional headers used to fetch the urls, "
|
||||
"e.g. authorization, agent, etc"
|
||||
),
|
||||
] = {}
|
||||
|
||||
|
||||
class FileSource(BaseModel):
|
||||
base64_string: Annotated[
|
||||
str,
|
||||
Field(
|
||||
description="Content of the file serialized in base64. "
|
||||
"For example it can be obtained via "
|
||||
"`base64 -w 0 /path/to/file/pdf-to-convert.pdf`."
|
||||
),
|
||||
]
|
||||
filename: Annotated[
|
||||
str,
|
||||
Field(description="Filename of the uploaded document", examples=["file.pdf"]),
|
||||
]
|
||||
|
||||
def to_document_stream(self) -> DocumentStream:
|
||||
buf = BytesIO(base64.b64decode(self.base64_string))
|
||||
return DocumentStream(stream=buf, name=self.filename)
|
||||
|
||||
|
||||
class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase):
|
||||
http_sources: list[HttpSource]
|
||||
|
||||
|
||||
class ConvertDocumentFileSourcesRequest(DocumentsConvertBase):
|
||||
file_sources: list[FileSource]
|
||||
|
||||
|
||||
ConvertDocumentsRequest = Union[
|
||||
ConvertDocumentFileSourcesRequest, ConvertDocumentHttpSourcesRequest
|
||||
]
|
||||
52
docling_serve/datamodel/responses.py
Normal file
52
docling_serve/datamodel/responses.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import enum
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.document import ConversionStatus, ErrorItem
|
||||
from docling.utils.profiling import ProfilingItem
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
|
||||
|
||||
# Status
|
||||
class HealthCheckResponse(BaseModel):
|
||||
status: str = "ok"
|
||||
|
||||
|
||||
class DocumentResponse(BaseModel):
|
||||
filename: str
|
||||
md_content: Optional[str] = None
|
||||
json_content: Optional[DoclingDocument] = None
|
||||
html_content: Optional[str] = None
|
||||
text_content: Optional[str] = None
|
||||
doctags_content: Optional[str] = None
|
||||
|
||||
|
||||
class ConvertDocumentResponse(BaseModel):
|
||||
document: DocumentResponse
|
||||
status: ConversionStatus
|
||||
errors: list[ErrorItem] = []
|
||||
processing_time: float
|
||||
timings: dict[str, ProfilingItem] = {}
|
||||
|
||||
|
||||
class ConvertDocumentErrorResponse(BaseModel):
|
||||
status: ConversionStatus
|
||||
|
||||
|
||||
class TaskStatusResponse(BaseModel):
|
||||
task_id: str
|
||||
task_status: str
|
||||
task_position: Optional[int] = None
|
||||
|
||||
|
||||
class MessageKind(str, enum.Enum):
|
||||
CONNECTION = "connection"
|
||||
UPDATE = "update"
|
||||
ERROR = "error"
|
||||
|
||||
|
||||
class WebsocketMessage(BaseModel):
|
||||
message: MessageKind
|
||||
task: Optional[TaskStatusResponse] = None
|
||||
error: Optional[str] = None
|
||||
19
docling_serve/datamodel/task.py
Normal file
19
docling_serve/datamodel/task.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling_serve.datamodel.engines import TaskStatus
|
||||
from docling_serve.datamodel.requests import ConvertDocumentsRequest
|
||||
from docling_serve.datamodel.responses import ConvertDocumentResponse
|
||||
|
||||
|
||||
class Task(BaseModel):
|
||||
task_id: str
|
||||
task_status: TaskStatus = TaskStatus.PENDING
|
||||
request: Optional[ConvertDocumentsRequest]
|
||||
result: Optional[ConvertDocumentResponse] = None
|
||||
|
||||
def is_completed(self) -> bool:
|
||||
if self.task_status in [TaskStatus.SUCCESS, TaskStatus.FAILURE]:
|
||||
return True
|
||||
return False
|
||||
185
docling_serve/docling_conversion.py
Normal file
185
docling_serve/docling_conversion.py
Normal file
@@ -0,0 +1,185 @@
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
from collections.abc import Iterable, Iterator
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from fastapi import HTTPException
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
OcrOptions,
|
||||
PdfBackend,
|
||||
PdfPipelineOptions,
|
||||
TableFormerMode,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
|
||||
from docling_serve.datamodel.convert import ConvertDocumentsOptions, ocr_factory
|
||||
from docling_serve.helper_functions import _to_list_of_strings
|
||||
from docling_serve.settings import docling_serve_settings
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Custom serializer for PdfFormatOption
|
||||
# (model_dump_json does not work with some classes)
|
||||
def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
|
||||
data = pdf_format_option.model_dump()
|
||||
|
||||
# pipeline_options are not fully serialized by model_dump, dedicated pass
|
||||
if pdf_format_option.pipeline_options:
|
||||
data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump()
|
||||
|
||||
# Replace `artifacts_path` with a string representation
|
||||
data["pipeline_options"]["artifacts_path"] = repr(
|
||||
data["pipeline_options"]["artifacts_path"]
|
||||
)
|
||||
|
||||
# Replace `pipeline_cls` with a string representation
|
||||
data["pipeline_cls"] = repr(data["pipeline_cls"])
|
||||
|
||||
# Replace `backend` with a string representation
|
||||
data["backend"] = repr(data["backend"])
|
||||
|
||||
# Handle `device` in `accelerator_options`
|
||||
if "accelerator_options" in data and "device" in data["accelerator_options"]:
|
||||
data["accelerator_options"]["device"] = repr(
|
||||
data["accelerator_options"]["device"]
|
||||
)
|
||||
|
||||
# Serialize the dictionary to JSON with sorted keys to have consistent hashes
|
||||
serialized_data = json.dumps(data, sort_keys=True)
|
||||
options_hash = hashlib.sha1(serialized_data.encode()).digest()
|
||||
return options_hash
|
||||
|
||||
|
||||
# Cache of DocumentConverter objects
|
||||
_options_map: dict[bytes, PdfFormatOption] = {}
|
||||
|
||||
|
||||
@lru_cache(maxsize=docling_serve_settings.options_cache_size)
|
||||
def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
|
||||
pdf_format_option = _options_map[options_hash]
|
||||
format_options: dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: pdf_format_option,
|
||||
InputFormat.IMAGE: pdf_format_option,
|
||||
}
|
||||
|
||||
return DocumentConverter(format_options=format_options)
|
||||
|
||||
|
||||
def get_converter(pdf_format_option: PdfFormatOption) -> DocumentConverter:
|
||||
options_hash = _hash_pdf_format_option(pdf_format_option)
|
||||
_options_map[options_hash] = pdf_format_option
|
||||
return _get_converter_from_hash(options_hash)
|
||||
|
||||
|
||||
# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
|
||||
def get_pdf_pipeline_opts(
|
||||
request: ConvertDocumentsOptions,
|
||||
) -> PdfFormatOption:
|
||||
try:
|
||||
ocr_options: OcrOptions = ocr_factory.create_options(
|
||||
kind=request.ocr_engine.value, # type: ignore
|
||||
force_full_page_ocr=request.force_ocr,
|
||||
)
|
||||
except ImportError as err:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={request.ocr_engine.value})" # type: ignore
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.\n"
|
||||
f"{err}",
|
||||
)
|
||||
|
||||
if request.ocr_lang is not None:
|
||||
if isinstance(request.ocr_lang, str):
|
||||
ocr_options.lang = _to_list_of_strings(request.ocr_lang)
|
||||
else:
|
||||
ocr_options.lang = request.ocr_lang
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=request.do_ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=request.do_table_structure,
|
||||
do_code_enrichment=request.do_code_enrichment,
|
||||
do_formula_enrichment=request.do_formula_enrichment,
|
||||
do_picture_classification=request.do_picture_classification,
|
||||
do_picture_description=request.do_picture_description,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
||||
pipeline_options.table_structure_options.mode = TableFormerMode(request.table_mode)
|
||||
|
||||
if request.image_export_mode != ImageRefMode.PLACEHOLDER:
|
||||
pipeline_options.generate_page_images = True
|
||||
if request.images_scale:
|
||||
pipeline_options.images_scale = request.images_scale
|
||||
|
||||
if request.pdf_backend == PdfBackend.DLPARSE_V1:
|
||||
backend: type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
||||
elif request.pdf_backend == PdfBackend.DLPARSE_V2:
|
||||
backend = DoclingParseV2DocumentBackend
|
||||
elif request.pdf_backend == PdfBackend.DLPARSE_V4:
|
||||
backend = DoclingParseV4DocumentBackend
|
||||
elif request.pdf_backend == PdfBackend.PYPDFIUM2:
|
||||
backend = PyPdfiumDocumentBackend
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected PDF backend type {request.pdf_backend}")
|
||||
|
||||
if docling_serve_settings.artifacts_path is not None:
|
||||
if str(docling_serve_settings.artifacts_path.absolute()) == "":
|
||||
_log.info(
|
||||
"artifacts_path is an empty path, model weights will be dowloaded "
|
||||
"at runtime."
|
||||
)
|
||||
pipeline_options.artifacts_path = None
|
||||
elif docling_serve_settings.artifacts_path.is_dir():
|
||||
_log.info(
|
||||
"artifacts_path is set to a valid directory. "
|
||||
"No model weights will be downloaded at runtime."
|
||||
)
|
||||
pipeline_options.artifacts_path = docling_serve_settings.artifacts_path
|
||||
else:
|
||||
_log.warning(
|
||||
"artifacts_path is set to an invalid directory. "
|
||||
"The system will download the model weights at runtime."
|
||||
)
|
||||
pipeline_options.artifacts_path = None
|
||||
else:
|
||||
_log.info(
|
||||
"artifacts_path is unset. "
|
||||
"The system will download the model weights at runtime."
|
||||
)
|
||||
|
||||
pdf_format_option = PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend,
|
||||
)
|
||||
|
||||
return pdf_format_option
|
||||
|
||||
|
||||
def convert_documents(
|
||||
sources: Iterable[Union[Path, str, DocumentStream]],
|
||||
options: ConvertDocumentsOptions,
|
||||
headers: Optional[dict[str, Any]] = None,
|
||||
):
|
||||
pdf_format_option = get_pdf_pipeline_opts(options)
|
||||
converter = get_converter(pdf_format_option)
|
||||
results: Iterator[ConversionResult] = converter.convert_all(
|
||||
sources,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
return results
|
||||
8
docling_serve/engines/__init__.py
Normal file
8
docling_serve/engines/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from functools import lru_cache
|
||||
|
||||
from docling_serve.engines.async_local.orchestrator import AsyncLocalOrchestrator
|
||||
|
||||
|
||||
@lru_cache
|
||||
def get_orchestrator() -> AsyncLocalOrchestrator:
|
||||
return AsyncLocalOrchestrator()
|
||||
0
docling_serve/engines/async_local/__init__.py
Normal file
0
docling_serve/engines/async_local/__init__.py
Normal file
102
docling_serve/engines/async_local/orchestrator.py
Normal file
102
docling_serve/engines/async_local/orchestrator.py
Normal file
@@ -0,0 +1,102 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import WebSocket
|
||||
|
||||
from docling_serve.datamodel.engines import TaskStatus
|
||||
from docling_serve.datamodel.requests import ConvertDocumentsRequest
|
||||
from docling_serve.datamodel.responses import (
|
||||
MessageKind,
|
||||
TaskStatusResponse,
|
||||
WebsocketMessage,
|
||||
)
|
||||
from docling_serve.datamodel.task import Task
|
||||
from docling_serve.engines.async_local.worker import AsyncLocalWorker
|
||||
from docling_serve.engines.base_orchestrator import BaseOrchestrator
|
||||
from docling_serve.settings import docling_serve_settings
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OrchestratorError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class TaskNotFoundError(OrchestratorError):
|
||||
pass
|
||||
|
||||
|
||||
class AsyncLocalOrchestrator(BaseOrchestrator):
|
||||
def __init__(self):
|
||||
self.task_queue = asyncio.Queue()
|
||||
self.tasks: dict[str, Task] = {}
|
||||
self.queue_list: list[str] = []
|
||||
self.task_subscribers: dict[str, set[WebSocket]] = {}
|
||||
|
||||
async def enqueue(self, request: ConvertDocumentsRequest) -> Task:
|
||||
task_id = str(uuid.uuid4())
|
||||
task = Task(task_id=task_id, request=request)
|
||||
self.tasks[task_id] = task
|
||||
self.queue_list.append(task_id)
|
||||
self.task_subscribers[task_id] = set()
|
||||
await self.task_queue.put(task_id)
|
||||
return task
|
||||
|
||||
async def queue_size(self) -> int:
|
||||
return self.task_queue.qsize()
|
||||
|
||||
async def get_queue_position(self, task_id: str) -> Optional[int]:
|
||||
return (
|
||||
self.queue_list.index(task_id) + 1 if task_id in self.queue_list else None
|
||||
)
|
||||
|
||||
async def task_status(self, task_id: str, wait: float = 0.0) -> Task:
|
||||
if task_id not in self.tasks:
|
||||
raise TaskNotFoundError()
|
||||
return self.tasks[task_id]
|
||||
|
||||
async def task_result(self, task_id: str):
|
||||
if task_id not in self.tasks:
|
||||
raise TaskNotFoundError()
|
||||
return self.tasks[task_id].result
|
||||
|
||||
async def process_queue(self):
|
||||
# Create a pool of workers
|
||||
workers = []
|
||||
for i in range(docling_serve_settings.eng_loc_num_workers):
|
||||
_log.debug(f"Starting worker {i}")
|
||||
w = AsyncLocalWorker(i, self)
|
||||
worker_task = asyncio.create_task(w.loop())
|
||||
workers.append(worker_task)
|
||||
|
||||
# Wait for all workers to complete (they won't, as they run indefinitely)
|
||||
await asyncio.gather(*workers)
|
||||
_log.debug("All workers completed.")
|
||||
|
||||
async def notify_task_subscribers(self, task_id: str):
|
||||
if task_id not in self.task_subscribers:
|
||||
raise RuntimeError(f"Task {task_id} does not have a subscribers list.")
|
||||
|
||||
task = self.tasks[task_id]
|
||||
task_queue_position = await self.get_queue_position(task_id)
|
||||
msg = TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
)
|
||||
for websocket in self.task_subscribers[task_id]:
|
||||
await websocket.send_text(
|
||||
WebsocketMessage(message=MessageKind.UPDATE, task=msg).model_dump_json()
|
||||
)
|
||||
if task.is_completed():
|
||||
await websocket.close()
|
||||
|
||||
async def notify_queue_positions(self):
|
||||
for task_id in self.task_subscribers.keys():
|
||||
# notify only pending tasks
|
||||
if self.tasks[task_id].task_status != TaskStatus.PENDING:
|
||||
continue
|
||||
|
||||
await self.notify_task_subscribers(task_id)
|
||||
116
docling_serve/engines/async_local/worker.py
Normal file
116
docling_serve/engines/async_local/worker.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
from fastapi import BackgroundTasks
|
||||
|
||||
from docling.datamodel.base_models import DocumentStream
|
||||
|
||||
from docling_serve.datamodel.engines import TaskStatus
|
||||
from docling_serve.datamodel.requests import ConvertDocumentFileSourcesRequest
|
||||
from docling_serve.datamodel.responses import ConvertDocumentResponse
|
||||
from docling_serve.docling_conversion import convert_documents
|
||||
from docling_serve.response_preparation import process_results
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling_serve.engines.async_local.orchestrator import AsyncLocalOrchestrator
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AsyncLocalWorker:
|
||||
def __init__(self, worker_id: int, orchestrator: "AsyncLocalOrchestrator"):
|
||||
self.worker_id = worker_id
|
||||
self.orchestrator = orchestrator
|
||||
|
||||
async def loop(self):
|
||||
_log.debug(f"Starting loop for worker {self.worker_id}")
|
||||
while True:
|
||||
task_id: str = await self.orchestrator.task_queue.get()
|
||||
self.orchestrator.queue_list.remove(task_id)
|
||||
|
||||
if task_id not in self.orchestrator.tasks:
|
||||
raise RuntimeError(f"Task {task_id} not found.")
|
||||
task = self.orchestrator.tasks[task_id]
|
||||
|
||||
try:
|
||||
task.task_status = TaskStatus.STARTED
|
||||
_log.info(f"Worker {self.worker_id} processing task {task_id}")
|
||||
|
||||
# Notify clients about task updates
|
||||
await self.orchestrator.notify_task_subscribers(task_id)
|
||||
|
||||
# Notify clients about queue updates
|
||||
await self.orchestrator.notify_queue_positions()
|
||||
|
||||
# Get the current event loop
|
||||
asyncio.get_event_loop()
|
||||
|
||||
# Define a callback function to send progress updates to the client.
|
||||
# TODO: send partial updates, e.g. when a document in the batch is done
|
||||
def run_conversion():
|
||||
sources: list[Union[str, DocumentStream]] = []
|
||||
headers: Optional[dict[str, Any]] = None
|
||||
if isinstance(task.request, ConvertDocumentFileSourcesRequest):
|
||||
for file_source in task.request.file_sources:
|
||||
sources.append(file_source.to_document_stream())
|
||||
else:
|
||||
for http_source in task.request.http_sources:
|
||||
sources.append(http_source.url)
|
||||
if headers is None and http_source.headers:
|
||||
headers = http_source.headers
|
||||
|
||||
# Note: results are only an iterator->lazy evaluation
|
||||
results = convert_documents(
|
||||
sources=sources,
|
||||
options=task.request.options,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
# The real processing will happen here
|
||||
response = process_results(
|
||||
background_tasks=BackgroundTasks(),
|
||||
conversion_options=task.request.options,
|
||||
conv_results=results,
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
# Run the prediction in a thread to avoid blocking the event loop.
|
||||
start_time = time.monotonic()
|
||||
# future = asyncio.run_coroutine_threadsafe(
|
||||
# run_conversion(),
|
||||
# loop=loop
|
||||
# )
|
||||
# response = future.result()
|
||||
|
||||
response = await asyncio.to_thread(
|
||||
run_conversion,
|
||||
)
|
||||
processing_time = time.monotonic() - start_time
|
||||
|
||||
if not isinstance(response, ConvertDocumentResponse):
|
||||
_log.error(
|
||||
f"Worker {self.worker_id} got un-processable "
|
||||
"result for {task_id}: {type(response)}"
|
||||
)
|
||||
task.result = response
|
||||
task.request = None
|
||||
|
||||
task.task_status = TaskStatus.SUCCESS
|
||||
_log.info(
|
||||
f"Worker {self.worker_id} completed job {task_id} "
|
||||
f"in {processing_time:.2f} seconds"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
_log.error(
|
||||
f"Worker {self.worker_id} failed to process job {task_id}: {e}"
|
||||
)
|
||||
task.task_status = TaskStatus.FAILURE
|
||||
|
||||
finally:
|
||||
await self.orchestrator.notify_task_subscribers(task_id)
|
||||
self.orchestrator.task_queue.task_done()
|
||||
_log.debug(f"Worker {self.worker_id} completely done with {task_id}")
|
||||
21
docling_serve/engines/base_orchestrator.py
Normal file
21
docling_serve/engines/base_orchestrator.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from docling_serve.datamodel.task import Task
|
||||
|
||||
|
||||
class BaseOrchestrator(ABC):
|
||||
@abstractmethod
|
||||
async def enqueue(self, task) -> Task:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def queue_size(self) -> int:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def task_status(self, task_id: str) -> Task:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def task_result(self, task_id: str):
|
||||
pass
|
||||
0
docling_serve/engines/block_local/__init__.py
Normal file
0
docling_serve/engines/block_local/__init__.py
Normal file
721
docling_serve/gradio_ui.py
Normal file
721
docling_serve/gradio_ui.py
Normal file
@@ -0,0 +1,721 @@
|
||||
import importlib
|
||||
import json
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import gradio as gr
|
||||
import requests
|
||||
|
||||
from docling_serve.helper_functions import _to_list_of_strings
|
||||
from docling_serve.settings import docling_serve_settings, uvicorn_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
############################
|
||||
# Path of static artifacts #
|
||||
############################
|
||||
|
||||
logo_path = "https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"
|
||||
js_components_url = "https://unpkg.com/@docling/docling-components@0.0.3"
|
||||
if (
|
||||
docling_serve_settings.static_path is not None
|
||||
and docling_serve_settings.static_path.is_dir()
|
||||
):
|
||||
logo_path = str(docling_serve_settings.static_path / "logo.svg")
|
||||
js_components_url = "/static/docling-components.js"
|
||||
|
||||
|
||||
##############################
|
||||
# Head JS for web components #
|
||||
##############################
|
||||
head = f"""
|
||||
<script src="{js_components_url}" type="module"></script>
|
||||
"""
|
||||
|
||||
#################
|
||||
# CSS and theme #
|
||||
#################
|
||||
|
||||
css = """
|
||||
#logo {
|
||||
border-style: none;
|
||||
background: none;
|
||||
box-shadow: none;
|
||||
min-width: 80px;
|
||||
}
|
||||
#dark_mode_column {
|
||||
display: flex;
|
||||
align-content: flex-end;
|
||||
}
|
||||
#title {
|
||||
text-align: left;
|
||||
display:block;
|
||||
height: auto;
|
||||
padding-top: 5px;
|
||||
line-height: 0;
|
||||
}
|
||||
.title-text h1 > p, .title-text p {
|
||||
margin-top: 0px !important;
|
||||
margin-bottom: 2px !important;
|
||||
}
|
||||
#custom-container {
|
||||
border: 0.909091px solid;
|
||||
padding: 10px;
|
||||
border-radius: 4px;
|
||||
}
|
||||
#custom-container h4 {
|
||||
font-size: 14px;
|
||||
}
|
||||
#file_input_zone {
|
||||
height: 140px;
|
||||
}
|
||||
|
||||
docling-img::part(pages) {
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
docling-img::part(page) {
|
||||
box-shadow: 0 0.5rem 1rem 0 rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
"""
|
||||
|
||||
theme = gr.themes.Default(
|
||||
text_size="md",
|
||||
spacing_size="md",
|
||||
font=[
|
||||
gr.themes.GoogleFont("Red Hat Display"),
|
||||
"ui-sans-serif",
|
||||
"system-ui",
|
||||
"sans-serif",
|
||||
],
|
||||
font_mono=[
|
||||
gr.themes.GoogleFont("Red Hat Mono"),
|
||||
"ui-monospace",
|
||||
"Consolas",
|
||||
"monospace",
|
||||
],
|
||||
)
|
||||
|
||||
#############
|
||||
# Variables #
|
||||
#############
|
||||
|
||||
gradio_output_dir = None # Will be set by FastAPI when mounted
|
||||
file_output_path = None # Will be set when a new file is generated
|
||||
|
||||
#############
|
||||
# Functions #
|
||||
#############
|
||||
|
||||
|
||||
def health_check():
|
||||
response = requests.get(f"http://localhost:{uvicorn_settings.port}/health")
|
||||
if response.status_code == 200:
|
||||
return "Healthy"
|
||||
return "Unhealthy"
|
||||
|
||||
|
||||
def set_options_visibility(x):
|
||||
return gr.Accordion("Options", open=x)
|
||||
|
||||
|
||||
def set_outputs_visibility_direct(x, y):
|
||||
content = gr.Row(visible=x)
|
||||
file = gr.Row(visible=y)
|
||||
return content, file
|
||||
|
||||
|
||||
def set_outputs_visibility_process(x):
|
||||
content = gr.Row(visible=not x)
|
||||
file = gr.Row(visible=x)
|
||||
return content, file
|
||||
|
||||
|
||||
def set_download_button_label(label_text: gr.State):
|
||||
return gr.DownloadButton(label=str(label_text), scale=1)
|
||||
|
||||
|
||||
def clear_outputs():
|
||||
markdown_content = ""
|
||||
json_content = ""
|
||||
json_rendered_content = ""
|
||||
html_content = ""
|
||||
text_content = ""
|
||||
doctags_content = ""
|
||||
|
||||
return (
|
||||
markdown_content,
|
||||
markdown_content,
|
||||
json_content,
|
||||
json_rendered_content,
|
||||
html_content,
|
||||
html_content,
|
||||
text_content,
|
||||
doctags_content,
|
||||
)
|
||||
|
||||
|
||||
def clear_url_input():
|
||||
return ""
|
||||
|
||||
|
||||
def clear_file_input():
|
||||
return None
|
||||
|
||||
|
||||
def auto_set_return_as_file(url_input, file_input, image_export_mode):
|
||||
# If more than one input source is provided, return as file
|
||||
if (
|
||||
(len(url_input.split(",")) > 1)
|
||||
or (file_input and len(file_input) > 1)
|
||||
or (image_export_mode == "referenced")
|
||||
):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
def change_ocr_lang(ocr_engine):
|
||||
if ocr_engine == "easyocr":
|
||||
return "en,fr,de,es"
|
||||
elif ocr_engine == "tesseract_cli":
|
||||
return "eng,fra,deu,spa"
|
||||
elif ocr_engine == "tesseract":
|
||||
return "eng,fra,deu,spa"
|
||||
elif ocr_engine == "rapidocr":
|
||||
return "english,chinese"
|
||||
|
||||
|
||||
def process_url(
|
||||
input_sources,
|
||||
to_formats,
|
||||
image_export_mode,
|
||||
ocr,
|
||||
force_ocr,
|
||||
ocr_engine,
|
||||
ocr_lang,
|
||||
pdf_backend,
|
||||
table_mode,
|
||||
abort_on_error,
|
||||
return_as_file,
|
||||
do_code_enrichment,
|
||||
do_formula_enrichment,
|
||||
do_picture_classification,
|
||||
do_picture_description,
|
||||
):
|
||||
parameters = {
|
||||
"http_sources": [{"url": source} for source in input_sources.split(",")],
|
||||
"options": {
|
||||
"to_formats": to_formats,
|
||||
"image_export_mode": image_export_mode,
|
||||
"ocr": ocr,
|
||||
"force_ocr": force_ocr,
|
||||
"ocr_engine": ocr_engine,
|
||||
"ocr_lang": _to_list_of_strings(ocr_lang),
|
||||
"pdf_backend": pdf_backend,
|
||||
"table_mode": table_mode,
|
||||
"abort_on_error": abort_on_error,
|
||||
"return_as_file": return_as_file,
|
||||
"do_code_enrichment": do_code_enrichment,
|
||||
"do_formula_enrichment": do_formula_enrichment,
|
||||
"do_picture_classification": do_picture_classification,
|
||||
"do_picture_description": do_picture_description,
|
||||
},
|
||||
}
|
||||
if (
|
||||
not parameters["http_sources"]
|
||||
or len(parameters["http_sources"]) == 0
|
||||
or parameters["http_sources"][0]["url"] == ""
|
||||
):
|
||||
logger.error("No input sources provided.")
|
||||
raise gr.Error("No input sources provided.", print_exception=False)
|
||||
try:
|
||||
response = requests.post(
|
||||
f"http://localhost:{uvicorn_settings.port}/v1alpha/convert/source",
|
||||
json=parameters,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing URL: {e}")
|
||||
raise gr.Error(f"Error processing URL: {e}", print_exception=False)
|
||||
if response.status_code != 200:
|
||||
data = response.json()
|
||||
error_message = data.get("detail", "An unknown error occurred.")
|
||||
logger.error(f"Error processing file: {error_message}")
|
||||
raise gr.Error(f"Error processing file: {error_message}", print_exception=False)
|
||||
output = response_to_output(response, return_as_file)
|
||||
return output
|
||||
|
||||
|
||||
def process_file(
|
||||
files,
|
||||
to_formats,
|
||||
image_export_mode,
|
||||
ocr,
|
||||
force_ocr,
|
||||
ocr_engine,
|
||||
ocr_lang,
|
||||
pdf_backend,
|
||||
table_mode,
|
||||
abort_on_error,
|
||||
return_as_file,
|
||||
do_code_enrichment,
|
||||
do_formula_enrichment,
|
||||
do_picture_classification,
|
||||
do_picture_description,
|
||||
):
|
||||
if not files or len(files) == 0 or files[0] == "":
|
||||
logger.error("No files provided.")
|
||||
raise gr.Error("No files provided.", print_exception=False)
|
||||
files_data = [("files", (file.name, open(file.name, "rb"))) for file in files]
|
||||
|
||||
parameters = {
|
||||
"to_formats": to_formats,
|
||||
"image_export_mode": image_export_mode,
|
||||
"ocr": str(ocr).lower(),
|
||||
"force_ocr": str(force_ocr).lower(),
|
||||
"ocr_engine": ocr_engine,
|
||||
"ocr_lang": _to_list_of_strings(ocr_lang),
|
||||
"pdf_backend": pdf_backend,
|
||||
"table_mode": table_mode,
|
||||
"abort_on_error": str(abort_on_error).lower(),
|
||||
"return_as_file": str(return_as_file).lower(),
|
||||
"do_code_enrichment": str(do_code_enrichment).lower(),
|
||||
"do_formula_enrichment": str(do_formula_enrichment).lower(),
|
||||
"do_picture_classification": str(do_picture_classification).lower(),
|
||||
"do_picture_description": str(do_picture_description).lower(),
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(
|
||||
f"http://localhost:{uvicorn_settings.port}/v1alpha/convert/file",
|
||||
files=files_data,
|
||||
data=parameters,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing file(s): {e}")
|
||||
raise gr.Error(f"Error processing file(s): {e}", print_exception=False)
|
||||
if response.status_code != 200:
|
||||
data = response.json()
|
||||
error_message = data.get("detail", "An unknown error occurred.")
|
||||
logger.error(f"Error processing file: {error_message}")
|
||||
raise gr.Error(f"Error processing file: {error_message}", print_exception=False)
|
||||
output = response_to_output(response, return_as_file)
|
||||
return output
|
||||
|
||||
|
||||
def response_to_output(response, return_as_file):
|
||||
markdown_content = ""
|
||||
json_content = ""
|
||||
json_rendered_content = ""
|
||||
html_content = ""
|
||||
text_content = ""
|
||||
doctags_content = ""
|
||||
download_button = gr.DownloadButton(visible=False, label="Download Output", scale=1)
|
||||
if return_as_file:
|
||||
filename = (
|
||||
response.headers.get("Content-Disposition").split("filename=")[1].strip('"')
|
||||
)
|
||||
tmp_output_dir = Path(tempfile.mkdtemp(dir=gradio_output_dir, prefix="ui_"))
|
||||
file_output_path = f"{tmp_output_dir}/{filename}"
|
||||
# logger.info(f"Saving file to: {file_output_path}")
|
||||
with open(file_output_path, "wb") as f:
|
||||
f.write(response.content)
|
||||
download_button = gr.DownloadButton(
|
||||
visible=True, label=f"Download {filename}", scale=1, value=file_output_path
|
||||
)
|
||||
else:
|
||||
full_content = response.json()
|
||||
markdown_content = full_content.get("document").get("md_content")
|
||||
json_content = json.dumps(
|
||||
full_content.get("document").get("json_content"), indent=2
|
||||
)
|
||||
# Embed document JSON and trigger load at client via an image.
|
||||
json_rendered_content = f"""
|
||||
<docling-img id="dclimg" pagenumbers tooltip="parsed"></docling-img>
|
||||
<script id="dcljson" type="application/json" onload="document.getElementById('dclimg').src = JSON.parse(document.getElementById('dcljson').textContent);">{json_content}</script>
|
||||
<img src onerror="document.getElementById('dclimg').src = JSON.parse(document.getElementById('dcljson').textContent);" />
|
||||
"""
|
||||
html_content = full_content.get("document").get("html_content")
|
||||
text_content = full_content.get("document").get("text_content")
|
||||
doctags_content = full_content.get("document").get("doctags_content")
|
||||
return (
|
||||
markdown_content,
|
||||
markdown_content,
|
||||
json_content,
|
||||
json_rendered_content,
|
||||
html_content,
|
||||
html_content,
|
||||
text_content,
|
||||
doctags_content,
|
||||
download_button,
|
||||
)
|
||||
|
||||
|
||||
############
|
||||
# UI Setup #
|
||||
############
|
||||
|
||||
with gr.Blocks(
|
||||
head=head,
|
||||
css=css,
|
||||
theme=theme,
|
||||
title="Docling Serve",
|
||||
delete_cache=(3600, 3600), # Delete all files older than 1 hour every hour
|
||||
) as ui:
|
||||
# Constants stored in states to be able to pass them as inputs to functions
|
||||
processing_text = gr.State("Processing your document(s), please wait...")
|
||||
true_bool = gr.State(True)
|
||||
false_bool = gr.State(False)
|
||||
|
||||
# Banner
|
||||
with gr.Row(elem_id="check_health"):
|
||||
# Logo
|
||||
with gr.Column(scale=1, min_width=90):
|
||||
try:
|
||||
gr.Image(
|
||||
logo_path,
|
||||
height=80,
|
||||
width=80,
|
||||
show_download_button=False,
|
||||
show_label=False,
|
||||
show_fullscreen_button=False,
|
||||
container=False,
|
||||
elem_id="logo",
|
||||
scale=0,
|
||||
)
|
||||
except Exception:
|
||||
logger.warning("Logo not found.")
|
||||
|
||||
# Title
|
||||
with gr.Column(scale=1, min_width=200):
|
||||
gr.Markdown(
|
||||
f"# Docling Serve \n(docling version: "
|
||||
f"{importlib.metadata.version('docling')})",
|
||||
elem_id="title",
|
||||
elem_classes=["title-text"],
|
||||
)
|
||||
# Dark mode button
|
||||
with gr.Column(scale=16, elem_id="dark_mode_column"):
|
||||
dark_mode_btn = gr.Button("Dark/Light Mode", scale=0)
|
||||
dark_mode_btn.click(
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
js="""() => {
|
||||
if (document.querySelectorAll('.dark').length) {
|
||||
document.querySelectorAll('.dark').forEach(
|
||||
el => el.classList.remove('dark')
|
||||
);
|
||||
} else {
|
||||
document.querySelector('body').classList.add('dark');
|
||||
}
|
||||
}""",
|
||||
show_api=False,
|
||||
)
|
||||
|
||||
# URL Processing Tab
|
||||
with gr.Tab("Convert URL(s)"):
|
||||
with gr.Row():
|
||||
with gr.Column(scale=4):
|
||||
url_input = gr.Textbox(
|
||||
label="Input Sources (comma-separated URLs)",
|
||||
placeholder="https://arxiv.org/pdf/2206.01062",
|
||||
)
|
||||
with gr.Column(scale=1):
|
||||
url_process_btn = gr.Button("Process URL(s)", scale=1)
|
||||
url_reset_btn = gr.Button("Reset", scale=1)
|
||||
|
||||
# File Processing Tab
|
||||
with gr.Tab("Convert File(s)"):
|
||||
with gr.Row():
|
||||
with gr.Column(scale=4):
|
||||
file_input = gr.File(
|
||||
elem_id="file_input_zone",
|
||||
label="Upload Files",
|
||||
file_types=[
|
||||
".pdf",
|
||||
".docx",
|
||||
".pptx",
|
||||
".html",
|
||||
".xlsx",
|
||||
".asciidoc",
|
||||
".txt",
|
||||
".md",
|
||||
".jpg",
|
||||
".jpeg",
|
||||
".png",
|
||||
".gif",
|
||||
],
|
||||
file_count="multiple",
|
||||
scale=4,
|
||||
)
|
||||
with gr.Column(scale=1):
|
||||
file_process_btn = gr.Button("Process File(s)", scale=1)
|
||||
file_reset_btn = gr.Button("Reset", scale=1)
|
||||
|
||||
# Options
|
||||
with gr.Accordion("Options") as options:
|
||||
with gr.Row():
|
||||
with gr.Column(scale=1):
|
||||
to_formats = gr.CheckboxGroup(
|
||||
[
|
||||
("Markdown", "md"),
|
||||
("Docling (JSON)", "json"),
|
||||
("HTML", "html"),
|
||||
("Plain Text", "text"),
|
||||
("Doc Tags", "doctags"),
|
||||
],
|
||||
label="To Formats",
|
||||
value=["md"],
|
||||
)
|
||||
with gr.Column(scale=1):
|
||||
image_export_mode = gr.Radio(
|
||||
[
|
||||
("Embedded", "embedded"),
|
||||
("Placeholder", "placeholder"),
|
||||
("Referenced", "referenced"),
|
||||
],
|
||||
label="Image Export Mode",
|
||||
value="embedded",
|
||||
)
|
||||
with gr.Row():
|
||||
with gr.Column(scale=1, min_width=200):
|
||||
ocr = gr.Checkbox(label="Enable OCR", value=True)
|
||||
force_ocr = gr.Checkbox(label="Force OCR", value=False)
|
||||
with gr.Column(scale=1):
|
||||
ocr_engine = gr.Radio(
|
||||
[
|
||||
("EasyOCR", "easyocr"),
|
||||
("Tesseract", "tesseract"),
|
||||
("RapidOCR", "rapidocr"),
|
||||
],
|
||||
label="OCR Engine",
|
||||
value="easyocr",
|
||||
)
|
||||
with gr.Column(scale=1, min_width=200):
|
||||
ocr_lang = gr.Textbox(
|
||||
label="OCR Language (beware of the format)", value="en,fr,de,es"
|
||||
)
|
||||
ocr_engine.change(change_ocr_lang, inputs=[ocr_engine], outputs=[ocr_lang])
|
||||
with gr.Row():
|
||||
with gr.Column(scale=2):
|
||||
pdf_backend = gr.Radio(
|
||||
["pypdfium2", "dlparse_v1", "dlparse_v2"],
|
||||
label="PDF Backend",
|
||||
value="dlparse_v2",
|
||||
)
|
||||
with gr.Column(scale=2):
|
||||
table_mode = gr.Radio(
|
||||
["fast", "accurate"], label="Table Mode", value="fast"
|
||||
)
|
||||
with gr.Column(scale=1):
|
||||
abort_on_error = gr.Checkbox(label="Abort on Error", value=False)
|
||||
return_as_file = gr.Checkbox(label="Return as File", value=False)
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
do_code_enrichment = gr.Checkbox(
|
||||
label="Enable code enrichment", value=False
|
||||
)
|
||||
do_formula_enrichment = gr.Checkbox(
|
||||
label="Enable formula enrichment", value=False
|
||||
)
|
||||
with gr.Column():
|
||||
do_picture_classification = gr.Checkbox(
|
||||
label="Enable picture classification", value=False
|
||||
)
|
||||
do_picture_description = gr.Checkbox(
|
||||
label="Enable picture description", value=False
|
||||
)
|
||||
|
||||
# Document output
|
||||
with gr.Row(visible=False) as content_output:
|
||||
with gr.Tab("Markdown"):
|
||||
output_markdown = gr.Code(
|
||||
language="markdown", wrap_lines=True, show_label=False
|
||||
)
|
||||
with gr.Tab("Markdown-Rendered"):
|
||||
output_markdown_rendered = gr.Markdown(label="Response")
|
||||
with gr.Tab("Docling (JSON)"):
|
||||
output_json = gr.Code(language="json", wrap_lines=True, show_label=False)
|
||||
with gr.Tab("Docling-Rendered"):
|
||||
output_json_rendered = gr.HTML()
|
||||
with gr.Tab("HTML"):
|
||||
output_html = gr.Code(language="html", wrap_lines=True, show_label=False)
|
||||
with gr.Tab("HTML-Rendered"):
|
||||
output_html_rendered = gr.HTML(label="Response")
|
||||
with gr.Tab("Text"):
|
||||
output_text = gr.Code(wrap_lines=True, show_label=False)
|
||||
with gr.Tab("DocTags"):
|
||||
output_doctags = gr.Code(wrap_lines=True, show_label=False)
|
||||
|
||||
# File download output
|
||||
with gr.Row(visible=False) as file_output:
|
||||
download_file_btn = gr.DownloadButton(label="Placeholder", scale=1)
|
||||
|
||||
##############
|
||||
# UI Actions #
|
||||
##############
|
||||
|
||||
# Handle Return as File
|
||||
url_input.change(
|
||||
auto_set_return_as_file,
|
||||
inputs=[url_input, file_input, image_export_mode],
|
||||
outputs=[return_as_file],
|
||||
)
|
||||
file_input.change(
|
||||
auto_set_return_as_file,
|
||||
inputs=[url_input, file_input, image_export_mode],
|
||||
outputs=[return_as_file],
|
||||
)
|
||||
image_export_mode.change(
|
||||
auto_set_return_as_file,
|
||||
inputs=[url_input, file_input, image_export_mode],
|
||||
outputs=[return_as_file],
|
||||
)
|
||||
|
||||
# URL processing
|
||||
url_process_btn.click(
|
||||
set_options_visibility, inputs=[false_bool], outputs=[options]
|
||||
).then(
|
||||
set_download_button_label, inputs=[processing_text], outputs=[download_file_btn]
|
||||
).then(
|
||||
set_outputs_visibility_process,
|
||||
inputs=[return_as_file],
|
||||
outputs=[content_output, file_output],
|
||||
).then(
|
||||
clear_outputs,
|
||||
inputs=None,
|
||||
outputs=[
|
||||
output_markdown,
|
||||
output_markdown_rendered,
|
||||
output_json,
|
||||
output_json_rendered,
|
||||
output_html,
|
||||
output_html_rendered,
|
||||
output_text,
|
||||
output_doctags,
|
||||
],
|
||||
).then(
|
||||
process_url,
|
||||
inputs=[
|
||||
url_input,
|
||||
to_formats,
|
||||
image_export_mode,
|
||||
ocr,
|
||||
force_ocr,
|
||||
ocr_engine,
|
||||
ocr_lang,
|
||||
pdf_backend,
|
||||
table_mode,
|
||||
abort_on_error,
|
||||
return_as_file,
|
||||
do_code_enrichment,
|
||||
do_formula_enrichment,
|
||||
do_picture_classification,
|
||||
do_picture_description,
|
||||
],
|
||||
outputs=[
|
||||
output_markdown,
|
||||
output_markdown_rendered,
|
||||
output_json,
|
||||
output_json_rendered,
|
||||
output_html,
|
||||
output_html_rendered,
|
||||
output_text,
|
||||
output_doctags,
|
||||
download_file_btn,
|
||||
],
|
||||
)
|
||||
|
||||
url_reset_btn.click(
|
||||
clear_outputs,
|
||||
inputs=None,
|
||||
outputs=[
|
||||
output_markdown,
|
||||
output_markdown_rendered,
|
||||
output_json,
|
||||
output_json_rendered,
|
||||
output_html,
|
||||
output_html_rendered,
|
||||
output_text,
|
||||
output_doctags,
|
||||
],
|
||||
).then(set_options_visibility, inputs=[true_bool], outputs=[options]).then(
|
||||
set_outputs_visibility_direct,
|
||||
inputs=[false_bool, false_bool],
|
||||
outputs=[content_output, file_output],
|
||||
).then(clear_url_input, inputs=None, outputs=[url_input])
|
||||
|
||||
# File processing
|
||||
file_process_btn.click(
|
||||
set_options_visibility, inputs=[false_bool], outputs=[options]
|
||||
).then(
|
||||
set_download_button_label, inputs=[processing_text], outputs=[download_file_btn]
|
||||
).then(
|
||||
set_outputs_visibility_process,
|
||||
inputs=[return_as_file],
|
||||
outputs=[content_output, file_output],
|
||||
).then(
|
||||
clear_outputs,
|
||||
inputs=None,
|
||||
outputs=[
|
||||
output_markdown,
|
||||
output_markdown_rendered,
|
||||
output_json,
|
||||
output_json_rendered,
|
||||
output_html,
|
||||
output_html_rendered,
|
||||
output_text,
|
||||
output_doctags,
|
||||
],
|
||||
).then(
|
||||
process_file,
|
||||
inputs=[
|
||||
file_input,
|
||||
to_formats,
|
||||
image_export_mode,
|
||||
ocr,
|
||||
force_ocr,
|
||||
ocr_engine,
|
||||
ocr_lang,
|
||||
pdf_backend,
|
||||
table_mode,
|
||||
abort_on_error,
|
||||
return_as_file,
|
||||
do_code_enrichment,
|
||||
do_formula_enrichment,
|
||||
do_picture_classification,
|
||||
do_picture_description,
|
||||
],
|
||||
outputs=[
|
||||
output_markdown,
|
||||
output_markdown_rendered,
|
||||
output_json,
|
||||
output_json_rendered,
|
||||
output_html,
|
||||
output_html_rendered,
|
||||
output_text,
|
||||
output_doctags,
|
||||
download_file_btn,
|
||||
],
|
||||
)
|
||||
|
||||
file_reset_btn.click(
|
||||
clear_outputs,
|
||||
inputs=None,
|
||||
outputs=[
|
||||
output_markdown,
|
||||
output_markdown_rendered,
|
||||
output_json,
|
||||
output_json_rendered,
|
||||
output_html,
|
||||
output_html_rendered,
|
||||
output_text,
|
||||
output_doctags,
|
||||
],
|
||||
).then(set_options_visibility, inputs=[true_bool], outputs=[options]).then(
|
||||
set_outputs_visibility_direct,
|
||||
inputs=[false_bool, false_bool],
|
||||
outputs=[content_output, file_output],
|
||||
).then(clear_file_input, inputs=None, outputs=[file_input])
|
||||
62
docling_serve/helper_functions.py
Normal file
62
docling_serve/helper_functions.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import inspect
|
||||
import re
|
||||
from typing import Union
|
||||
|
||||
from fastapi import Depends, Form
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
# Adapted from
|
||||
# https://github.com/fastapi/fastapi/discussions/8971#discussioncomment-7892972
|
||||
def FormDepends(cls: type[BaseModel]):
|
||||
new_parameters = []
|
||||
|
||||
for field_name, model_field in cls.model_fields.items():
|
||||
new_parameters.append(
|
||||
inspect.Parameter(
|
||||
name=field_name,
|
||||
kind=inspect.Parameter.POSITIONAL_ONLY,
|
||||
default=(
|
||||
Form(...)
|
||||
if model_field.is_required()
|
||||
else Form(model_field.default)
|
||||
),
|
||||
annotation=model_field.annotation,
|
||||
)
|
||||
)
|
||||
|
||||
async def as_form_func(**data):
|
||||
return cls(**data)
|
||||
|
||||
sig = inspect.signature(as_form_func)
|
||||
sig = sig.replace(parameters=new_parameters)
|
||||
as_form_func.__signature__ = sig # type: ignore
|
||||
return Depends(as_form_func)
|
||||
|
||||
|
||||
def _to_list_of_strings(input_value: Union[str, list[str]]) -> list[str]:
|
||||
def split_and_strip(value: str) -> list[str]:
|
||||
if re.search(r"[;,]", value):
|
||||
return [item.strip() for item in re.split(r"[;,]", value)]
|
||||
else:
|
||||
return [value.strip()]
|
||||
|
||||
if isinstance(input_value, str):
|
||||
return split_and_strip(input_value)
|
||||
elif isinstance(input_value, list):
|
||||
result = []
|
||||
for item in input_value:
|
||||
result.extend(split_and_strip(str(item)))
|
||||
return result
|
||||
else:
|
||||
raise ValueError("Invalid input: must be a string or a list of strings.")
|
||||
|
||||
|
||||
# Helper functions to parse inputs coming as Form objects
|
||||
def _str_to_bool(value: Union[str, bool]) -> bool:
|
||||
if isinstance(value, bool):
|
||||
return value # Already a boolean, return as-is
|
||||
if isinstance(value, str):
|
||||
value = value.strip().lower() # Normalize input
|
||||
return value in ("true", "1", "yes")
|
||||
return False # Default to False if none of the above matches
|
||||
225
docling_serve/response_preparation.py
Normal file
225
docling_serve/response_preparation.py
Normal file
@@ -0,0 +1,225 @@
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
from fastapi import BackgroundTasks, HTTPException
|
||||
from fastapi.responses import FileResponse
|
||||
|
||||
from docling.datamodel.base_models import OutputFormat
|
||||
from docling.datamodel.document import ConversionResult, ConversionStatus
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
|
||||
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
||||
from docling_serve.datamodel.responses import ConvertDocumentResponse, DocumentResponse
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _export_document_as_content(
|
||||
conv_res: ConversionResult,
|
||||
export_json: bool,
|
||||
export_html: bool,
|
||||
export_md: bool,
|
||||
export_txt: bool,
|
||||
export_doctags: bool,
|
||||
image_mode: ImageRefMode,
|
||||
):
|
||||
document = DocumentResponse(filename=conv_res.input.file.name)
|
||||
|
||||
if conv_res.status == ConversionStatus.SUCCESS:
|
||||
new_doc = conv_res.document._make_copy_with_refmode(Path(), image_mode)
|
||||
|
||||
# Create the different formats
|
||||
if export_json:
|
||||
document.json_content = new_doc
|
||||
if export_html:
|
||||
document.html_content = new_doc.export_to_html(image_mode=image_mode)
|
||||
if export_txt:
|
||||
document.text_content = new_doc.export_to_markdown(
|
||||
strict_text=True, image_mode=image_mode
|
||||
)
|
||||
if export_md:
|
||||
document.md_content = new_doc.export_to_markdown(image_mode=image_mode)
|
||||
if export_doctags:
|
||||
document.doctags_content = new_doc.export_to_document_tokens()
|
||||
elif conv_res.status == ConversionStatus.SKIPPED:
|
||||
raise HTTPException(status_code=400, detail=conv_res.errors)
|
||||
else:
|
||||
raise HTTPException(status_code=500, detail=conv_res.errors)
|
||||
|
||||
return document
|
||||
|
||||
|
||||
def _export_documents_as_files(
|
||||
conv_results: Iterable[ConversionResult],
|
||||
output_dir: Path,
|
||||
export_json: bool,
|
||||
export_html: bool,
|
||||
export_md: bool,
|
||||
export_txt: bool,
|
||||
export_doctags: bool,
|
||||
image_export_mode: ImageRefMode,
|
||||
):
|
||||
success_count = 0
|
||||
failure_count = 0
|
||||
|
||||
for conv_res in conv_results:
|
||||
if conv_res.status == ConversionStatus.SUCCESS:
|
||||
success_count += 1
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
# Export JSON format:
|
||||
if export_json:
|
||||
fname = output_dir / f"{doc_filename}.json"
|
||||
_log.info(f"writing JSON output to {fname}")
|
||||
conv_res.document.save_as_json(
|
||||
filename=fname, image_mode=image_export_mode
|
||||
)
|
||||
|
||||
# Export HTML format:
|
||||
if export_html:
|
||||
fname = output_dir / f"{doc_filename}.html"
|
||||
_log.info(f"writing HTML output to {fname}")
|
||||
conv_res.document.save_as_html(
|
||||
filename=fname, image_mode=image_export_mode
|
||||
)
|
||||
|
||||
# Export Text format:
|
||||
if export_txt:
|
||||
fname = output_dir / f"{doc_filename}.txt"
|
||||
_log.info(f"writing TXT output to {fname}")
|
||||
conv_res.document.save_as_markdown(
|
||||
filename=fname,
|
||||
strict_text=True,
|
||||
image_mode=ImageRefMode.PLACEHOLDER,
|
||||
)
|
||||
|
||||
# Export Markdown format:
|
||||
if export_md:
|
||||
fname = output_dir / f"{doc_filename}.md"
|
||||
_log.info(f"writing Markdown output to {fname}")
|
||||
conv_res.document.save_as_markdown(
|
||||
filename=fname, image_mode=image_export_mode
|
||||
)
|
||||
|
||||
# Export Document Tags format:
|
||||
if export_doctags:
|
||||
fname = output_dir / f"{doc_filename}.doctags"
|
||||
_log.info(f"writing Doc Tags output to {fname}")
|
||||
conv_res.document.save_as_document_tokens(filename=fname)
|
||||
|
||||
else:
|
||||
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
||||
failure_count += 1
|
||||
|
||||
_log.info(
|
||||
f"Processed {success_count + failure_count} docs, "
|
||||
f"of which {failure_count} failed"
|
||||
)
|
||||
|
||||
|
||||
def process_results(
|
||||
background_tasks: BackgroundTasks,
|
||||
conversion_options: ConvertDocumentsOptions,
|
||||
conv_results: Iterable[ConversionResult],
|
||||
) -> Union[ConvertDocumentResponse, FileResponse]:
|
||||
# Let's start by processing the documents
|
||||
try:
|
||||
start_time = time.monotonic()
|
||||
|
||||
# Convert the iterator to a list to count the number of results and get timings
|
||||
# As it's an iterator (lazy evaluation), it will also start the conversion
|
||||
conv_results = list(conv_results)
|
||||
|
||||
processing_time = time.monotonic() - start_time
|
||||
|
||||
_log.info(
|
||||
f"Processed {len(conv_results)} docs in {processing_time:.2f} seconds."
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
if len(conv_results) == 0:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="No documents were generated by Docling."
|
||||
)
|
||||
|
||||
# We have some results, let's prepare the response
|
||||
response: Union[FileResponse, ConvertDocumentResponse]
|
||||
|
||||
# Booleans to know what to export
|
||||
export_json = OutputFormat.JSON in conversion_options.to_formats
|
||||
export_html = OutputFormat.HTML in conversion_options.to_formats
|
||||
export_md = OutputFormat.MARKDOWN in conversion_options.to_formats
|
||||
export_txt = OutputFormat.TEXT in conversion_options.to_formats
|
||||
export_doctags = OutputFormat.DOCTAGS in conversion_options.to_formats
|
||||
|
||||
# Only 1 document was processed, and we are not returning it as a file
|
||||
if len(conv_results) == 1 and not conversion_options.return_as_file:
|
||||
conv_res = conv_results[0]
|
||||
document = _export_document_as_content(
|
||||
conv_res,
|
||||
export_json=export_json,
|
||||
export_html=export_html,
|
||||
export_md=export_md,
|
||||
export_txt=export_txt,
|
||||
export_doctags=export_doctags,
|
||||
image_mode=conversion_options.image_export_mode,
|
||||
)
|
||||
|
||||
response = ConvertDocumentResponse(
|
||||
document=document,
|
||||
status=conv_res.status,
|
||||
processing_time=processing_time,
|
||||
timings=conv_res.timings,
|
||||
)
|
||||
|
||||
# Multiple documents were processed, or we are forced returning as a file
|
||||
else:
|
||||
# Temporary directory to store the outputs
|
||||
work_dir = Path(tempfile.mkdtemp(prefix="docling_"))
|
||||
output_dir = work_dir / "output"
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Worker pid to use in archive identification as we may have multiple workers
|
||||
os.getpid()
|
||||
|
||||
# Export the documents
|
||||
_export_documents_as_files(
|
||||
conv_results=conv_results,
|
||||
output_dir=output_dir,
|
||||
export_json=export_json,
|
||||
export_html=export_html,
|
||||
export_md=export_md,
|
||||
export_txt=export_txt,
|
||||
export_doctags=export_doctags,
|
||||
image_export_mode=conversion_options.image_export_mode,
|
||||
)
|
||||
|
||||
files = os.listdir(output_dir)
|
||||
|
||||
if len(files) == 0:
|
||||
raise HTTPException(status_code=500, detail="No documents were exported.")
|
||||
|
||||
file_path = work_dir / "converted_docs.zip"
|
||||
shutil.make_archive(
|
||||
base_name=str(file_path.with_suffix("")),
|
||||
format="zip",
|
||||
root_dir=output_dir,
|
||||
)
|
||||
|
||||
# Other cleanups after the response is sent
|
||||
# Output directory
|
||||
background_tasks.add_task(shutil.rmtree, work_dir, ignore_errors=True)
|
||||
|
||||
response = FileResponse(
|
||||
file_path, filename=file_path.name, media_type="application/zip"
|
||||
)
|
||||
|
||||
return response
|
||||
@@ -1,6 +1,49 @@
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
from docling_serve.datamodel.engines import AsyncEngine
|
||||
|
||||
class Settings(BaseSettings):
|
||||
|
||||
model_config = SettingsConfigDict(env_prefix="DOCLING_")
|
||||
class UvicornSettings(BaseSettings):
|
||||
model_config = SettingsConfigDict(
|
||||
env_prefix="UVICORN_", env_file=".env", extra="allow"
|
||||
)
|
||||
|
||||
host: str = "0.0.0.0"
|
||||
port: int = 5001
|
||||
reload: bool = False
|
||||
root_path: str = ""
|
||||
proxy_headers: bool = True
|
||||
timeout_keep_alive: int = 60
|
||||
ssl_certfile: Optional[Path] = None
|
||||
ssl_keyfile: Optional[Path] = None
|
||||
ssl_keyfile_password: Optional[str] = None
|
||||
workers: Union[int, None] = None
|
||||
|
||||
|
||||
class DoclingServeSettings(BaseSettings):
|
||||
model_config = SettingsConfigDict(
|
||||
env_prefix="DOCLING_SERVE_",
|
||||
env_file=".env",
|
||||
env_parse_none_str="",
|
||||
extra="allow",
|
||||
)
|
||||
|
||||
enable_ui: bool = False
|
||||
artifacts_path: Optional[Path] = None
|
||||
static_path: Optional[Path] = None
|
||||
options_cache_size: int = 2
|
||||
allow_external_plugins: bool = False
|
||||
|
||||
cors_origins: list[str] = ["*"]
|
||||
cors_methods: list[str] = ["*"]
|
||||
cors_headers: list[str] = ["*"]
|
||||
|
||||
eng_kind: AsyncEngine = AsyncEngine.LOCAL
|
||||
eng_loc_num_workers: int = 2
|
||||
|
||||
|
||||
uvicorn_settings = UvicornSettings()
|
||||
docling_serve_settings = DoclingServeSettings()
|
||||
|
||||
8
docs/README.md
Normal file
8
docs/README.md
Normal file
@@ -0,0 +1,8 @@
|
||||
# Dolcing Serve documentation
|
||||
|
||||
This documentation pages explore the webserver configurations, runtime options, deployment examples as well as development best practices.
|
||||
|
||||
- [Configuration](./configuration.md)
|
||||
- [Advance usage](./usage.md)
|
||||
- [Deployment](./deployment.md)
|
||||
- [Development](./development.md)
|
||||
BIN
docs/assets/docling-serve-pic.png
Normal file
BIN
docs/assets/docling-serve-pic.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 504 KiB |
44
docs/configuration.md
Normal file
44
docs/configuration.md
Normal file
@@ -0,0 +1,44 @@
|
||||
# Configuration
|
||||
|
||||
The `docling-serve` executable allows to configure the server via command line
|
||||
options as well as environment variables.
|
||||
Configurations are divided between the settings used for the `uvicorn` asgi
|
||||
server and the actual app-specific configurations.
|
||||
|
||||
> [!WARNING]
|
||||
> When the server is running with `reload` or with multiple `workers`, uvicorn
|
||||
> will spawn multiple subprocessed. This invalides all the values configured
|
||||
> via the CLI command line options. Please use environment variables in this
|
||||
> type of deployments.
|
||||
|
||||
## Webserver configuration
|
||||
|
||||
The following table shows the options which are propagated directly to the
|
||||
`uvicorn` webserver runtime.
|
||||
|
||||
| CLI option | ENV | Default | Description |
|
||||
| -----------|-----|---------|-------------|
|
||||
| `--host` | `UVICORN_HOST` | `0.0.0.0` for `run`, `localhost` for `dev` | THe host to serve on. |
|
||||
| `--port` | `UVICORN_PORT` | `5001` | The port to serve on. |
|
||||
| `--reload` | `UVICORN_RELOAD` | `false` for `run`, `true` for `dev` | Enable auto-reload of the server when (code) files change. |
|
||||
| `--workers` | `UVICORN_WORKERS` | `1` | Use multiple worker processes. |
|
||||
| `--root-path` | `UVICORN_ROOT_PATH` | `""` | The root path is used to tell your app that it is being served to the outside world with some |
|
||||
| `--proxy-headers` | `UVICORN_PROXY_HEADERS` | `true` | Enable/Disable X-Forwarded-Proto, X-Forwarded-For, X-Forwarded-Port to populate remote address info. |
|
||||
| `--timeout-keep-alive` | `UVICORN_TIMEOUT_KEEP_ALIVE` | `60` | Timeout for the server response. |
|
||||
| `--ssl-certfile` | `UVICORN_SSL_CERTFILE` | | SSL certificate file. |
|
||||
| `--ssl-keyfile` | `UVICORN_SSL_KEYFILE` | | SSL key file. |
|
||||
| `--ssl-keyfile-password` | `UVICORN_SSL_KEYFILE_PASSWORD` | | SSL keyfile password. |
|
||||
|
||||
## Docling Serve configuration
|
||||
|
||||
THe following table describes the options to configure the Docling Serve app.
|
||||
|
||||
| CLI option | ENV | Default | Description |
|
||||
| -----------|-----|---------|-------------|
|
||||
| `--artifacts-path` | `DOCLING_SERVE_ARTIFACTS_PATH` | unset | If set to a valid directory, the model weights will be loaded from this path |
|
||||
| | `DOCLING_SERVE_STATIC_PATH` | unset | If set to a valid directory, the static assets for the docs and ui will be loaded from this path |
|
||||
| `--enable-ui` | `DOCLING_SERVE_ENABLE_UI` | `false` | Enable the demonstrator UI. |
|
||||
| | `DOCLING_SERVE_OPTIONS_CACHE_SIZE` | `2` | How many DocumentConveter objects (including their loaded models) to keep in the cache. |
|
||||
| | `DOCLING_SERVE_CORS_ORIGINS` | `["*"]` | A list of origins that should be permitted to make cross-origin requests. |
|
||||
| | `DOCLING_SERVE_CORS_METHODS` | `["*"]` | A list of HTTP methods that should be allowed for cross-origin requests. |
|
||||
| | `DOCLING_SERVE_CORS_HEADERS` | `["*"]` | A list of HTTP request headers that should be supported for cross-origin requests. |
|
||||
209
docs/deploy-examples/docling-serve-oauth.yaml
Normal file
209
docs/deploy-examples/docling-serve-oauth.yaml
Normal file
@@ -0,0 +1,209 @@
|
||||
# This example deployment configures Docling Serve with a OAuth-Proxy sidecar and TLS termination
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: docling-serve
|
||||
labels:
|
||||
app: docling-serve
|
||||
annotations:
|
||||
serviceaccounts.openshift.io/oauth-redirectreference.primary: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"docling-serve"}}'
|
||||
---
|
||||
kind: Role
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
metadata:
|
||||
name: docling-serve-oauth
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
rules:
|
||||
- verbs:
|
||||
- create
|
||||
apiGroups:
|
||||
- authorization.k8s.io
|
||||
resources:
|
||||
- subjectaccessreviews
|
||||
- verbs:
|
||||
- create
|
||||
apiGroups:
|
||||
- authentication.k8s.io
|
||||
resources:
|
||||
- tokenreviews
|
||||
---
|
||||
kind: RoleBinding
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
metadata:
|
||||
name: docling-serve-oauth
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: docling-serve
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: docling-serve-oauth
|
||||
---
|
||||
apiVersion: route.openshift.io/v1
|
||||
kind: Route
|
||||
metadata:
|
||||
name: docling-serve
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
spec:
|
||||
to:
|
||||
kind: Service
|
||||
name: docling-serve
|
||||
port:
|
||||
targetPort: oauth
|
||||
tls:
|
||||
termination: Reencrypt
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: docling-serve
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
annotations:
|
||||
service.alpha.openshift.io/serving-cert-secret-name: docling-serve-tls
|
||||
spec:
|
||||
ports:
|
||||
- name: oauth
|
||||
port: 8443
|
||||
targetPort: oauth
|
||||
- name: http
|
||||
port: 5001
|
||||
targetPort: http
|
||||
selector:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
---
|
||||
kind: Deployment
|
||||
apiVersion: apps/v1
|
||||
metadata:
|
||||
name: docling-serve
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
spec:
|
||||
restartPolicy: Always
|
||||
serviceAccountName: docling-serve
|
||||
containers:
|
||||
- name: api
|
||||
resources:
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 2Gi
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 1Gi
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: http
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 10
|
||||
timeoutSeconds: 2
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: http
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 3
|
||||
timeoutSeconds: 2
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
env:
|
||||
- name: DOCLING_SERVE_ENABLE_UI
|
||||
value: 'true'
|
||||
- name: UVICORN_SSL_CERTFILE
|
||||
value: '/etc/tls/private/tls.crt'
|
||||
- name: UVICORN_SSL_KEYFILE
|
||||
value: '/etc/tls/private/tls.key'
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 5001
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: proxy-tls
|
||||
mountPath: /etc/tls/private
|
||||
imagePullPolicy: Always
|
||||
image: 'ghcr.io/docling-project/docling-serve:dev-ssl'
|
||||
- name: oauth-proxy
|
||||
resources:
|
||||
limits:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /oauth/healthz
|
||||
port: oauth
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 5
|
||||
timeoutSeconds: 1
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /oauth/healthz
|
||||
port: oauth
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 30
|
||||
timeoutSeconds: 1
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
ports:
|
||||
- name: oauth
|
||||
containerPort: 8443
|
||||
protocol: TCP
|
||||
imagePullPolicy: IfNotPresent
|
||||
volumeMounts:
|
||||
- name: proxy-tls
|
||||
mountPath: /etc/tls/private
|
||||
env:
|
||||
- name: NAMESPACE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.namespace
|
||||
image: 'registry.redhat.io/openshift4/ose-oauth-proxy:v4.13'
|
||||
args:
|
||||
- '--https-address=:8443'
|
||||
- '--provider=openshift'
|
||||
- '--openshift-service-account=docling-serve'
|
||||
- '--upstream=https://docling-serve.$(NAMESPACE).svc.cluster.local:5001'
|
||||
- '--upstream-ca=/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt'
|
||||
- '--tls-cert=/etc/tls/private/tls.crt'
|
||||
- '--tls-key=/etc/tls/private/tls.key'
|
||||
- '--cookie-secret=SECRET'
|
||||
- '--openshift-delegate-urls={"/": {"group":"route.openshift.io","resource":"routes","verb":"get","name":"docling-serve","namespace":"$(NAMESPACE)"}}'
|
||||
- '--openshift-sar={"namespace":"$(NAMESPACE)","resource":"routes","resourceName":"docling-serve","verb":"get","resourceAPIGroup":"route.openshift.io"}'
|
||||
- '--skip-auth-regex=''(^/health|^/docs)'''
|
||||
volumes:
|
||||
- name: proxy-tls
|
||||
secret:
|
||||
secretName: docling-serve-tls
|
||||
defaultMode: 420
|
||||
40
docs/deployment.md
Normal file
40
docs/deployment.md
Normal file
@@ -0,0 +1,40 @@
|
||||
# Deployment
|
||||
|
||||
## OpenShift
|
||||
|
||||
### Secure deployment with `oauth-proxy`
|
||||
|
||||
Manifest example: [docling-serve-oauth.yaml](./deploy-examples/docling-serve-oauth.yaml)
|
||||
|
||||
This deployment has the following features:
|
||||
|
||||
- TLS encryption between all components (using the cluster-internal CA authority).
|
||||
- Authentication via a secure `oauth-proxy` sidecar.
|
||||
- Expose the service using a secure OpenShift `Route`
|
||||
|
||||
Install the app with:
|
||||
|
||||
```sh
|
||||
kubectl apply -f docs/deploy-examples/docling-serve-oauth.yaml
|
||||
```
|
||||
|
||||
For using the API:
|
||||
|
||||
```sh
|
||||
# Retrieve the endpoint
|
||||
DOCLING_NAME=docling-serve
|
||||
DOCLING_ROUTE="https://$(oc get routes ${DOCLING_NAME} --template={{.spec.host}})"
|
||||
|
||||
# Retrieve the authentication token
|
||||
OCP_AUTH_TOKEN=$(oc whoami --show-token)
|
||||
|
||||
# Make a test query
|
||||
curl -X 'POST' \
|
||||
"${DOCLING_ROUTE}/v1alpha/convert/source/async" \
|
||||
-H "Authorization: Bearer ${OCP_AUTH_TOKEN}" \
|
||||
-H "accept: application/json" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}]
|
||||
}'
|
||||
```
|
||||
57
docs/development.md
Normal file
57
docs/development.md
Normal file
@@ -0,0 +1,57 @@
|
||||
# Development
|
||||
|
||||
## Install dependencies
|
||||
|
||||
### CPU only
|
||||
|
||||
```sh
|
||||
# Install uv if not already available
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
# Install dependencies
|
||||
uv sync --extra cpu
|
||||
```
|
||||
|
||||
### Cuda GPU
|
||||
|
||||
For GPU support use the following command:
|
||||
|
||||
```sh
|
||||
# Install dependencies
|
||||
uv sync
|
||||
```
|
||||
|
||||
### Gradio UI and different OCR backends
|
||||
|
||||
`/ui` endpoint using `gradio` and different OCR backends can be enabled via package extras:
|
||||
|
||||
```sh
|
||||
# Enable ui and rapidocr
|
||||
uv sync --extra ui --extra rapidocr
|
||||
```
|
||||
|
||||
```sh
|
||||
# Enable tesserocr
|
||||
uv sync --extra tesserocr
|
||||
```
|
||||
|
||||
See `[project.optional-dependencies]` section in `pyproject.toml` for full list of options and runtime options with `uv run docling-serve --help`.
|
||||
|
||||
### Run the server
|
||||
|
||||
The `docling-serve` executable is a convenient script for launching the webserver both in
|
||||
development and production mode.
|
||||
|
||||
```sh
|
||||
# Run the server in development mode
|
||||
# - reload is enabled by default
|
||||
# - listening on the 127.0.0.1 address
|
||||
# - ui is enabled by default
|
||||
docling-serve dev
|
||||
|
||||
# Run the server in production mode
|
||||
# - reload is disabled by default
|
||||
# - listening on the 0.0.0.0 address
|
||||
# - ui is disabled by default
|
||||
docling-serve run
|
||||
```
|
||||
279
docs/usage.md
Normal file
279
docs/usage.md
Normal file
@@ -0,0 +1,279 @@
|
||||
# Usage
|
||||
|
||||
The API provides two endpoints: one for urls, one for files. This is necessary to send files directly in binary format instead of base64-encoded strings.
|
||||
|
||||
## Common parameters
|
||||
|
||||
On top of the source of file (see below), both endpoints support the same parameters, which are almost the same as the Docling CLI.
|
||||
|
||||
- `from_format` (List[str]): Input format(s) to convert from. Allowed values: `docx`, `pptx`, `html`, `image`, `pdf`, `asciidoc`, `md`. Defaults to all formats.
|
||||
- `to_formats` (List[str]): Output format(s) to convert to. Allowed values: `md`, `json`, `html`, `text`, `doctags`. Defaults to `md`.
|
||||
- `do_ocr` (bool): If enabled, the bitmap content will be processed using OCR. Defaults to `True`.
|
||||
- `image_export_mode`: Image export mode for the document (only in case of JSON, Markdown or HTML). Allowed values: embedded, placeholder, referenced. Optional, defaults to `embedded`.
|
||||
- `force_ocr` (bool): If enabled, replace any existing text with OCR-generated text over the full content. Defaults to `False`.
|
||||
- `ocr_engine` (str): OCR engine to use. Allowed values: `easyocr`, `tesseract_cli`, `tesseract`, `rapidocr`, `ocrmac`. Defaults to `easyocr`.
|
||||
- `ocr_lang` (List[str]): List of languages used by the OCR engine. Note that each OCR engine has different values for the language names. Defaults to empty.
|
||||
- `pdf_backend` (str): PDF backend to use. Allowed values: `pypdfium2`, `dlparse_v1`, `dlparse_v2`. Defaults to `dlparse_v2`.
|
||||
- `table_mode` (str): Table mode to use. Allowed values: `fast`, `accurate`. Defaults to `fast`.
|
||||
- `abort_on_error` (bool): If enabled, abort on error. Defaults to false.
|
||||
- `return_as_file` (boo): If enabled, return the output as a file. Defaults to false.
|
||||
- `do_table_structure` (bool): If enabled, the table structure will be extracted. Defaults to true.
|
||||
- `include_images` (bool): If enabled, images will be extracted from the document. Defaults to true.
|
||||
- `images_scale` (float): Scale factor for images. Defaults to 2.0.
|
||||
|
||||
## Convert endpoints
|
||||
|
||||
### Source endpoint
|
||||
|
||||
The endpoint is `/v1alpha/convert/source`, listening for POST requests of JSON payloads.
|
||||
|
||||
On top of the above parameters, you must send the URL(s) of the document you want process with either the `http_sources` or `file_sources` fields.
|
||||
The first is fetching URL(s) (optionally using with extra headers), the second allows to provide documents as base64-encoded strings.
|
||||
No `options` is required, they can be partially or completely omitted.
|
||||
|
||||
Simple payload example:
|
||||
|
||||
```json
|
||||
{
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
```
|
||||
|
||||
<details>
|
||||
|
||||
<summary>Complete payload example:</summary>
|
||||
|
||||
```json
|
||||
{
|
||||
"options": {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": true,
|
||||
"force_ocr": false,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": false,
|
||||
"return_as_file": false,
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
|
||||
<summary>CURL example:</summary>
|
||||
|
||||
```sh
|
||||
curl -X 'POST' \
|
||||
'http://localhost:5001/v1alpha/convert/source' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"options": {
|
||||
"from_formats": [
|
||||
"docx",
|
||||
"pptx",
|
||||
"html",
|
||||
"image",
|
||||
"pdf",
|
||||
"asciidoc",
|
||||
"md",
|
||||
"xlsx"
|
||||
],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": true,
|
||||
"force_ocr": false,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": [
|
||||
"fr",
|
||||
"de",
|
||||
"es",
|
||||
"en"
|
||||
],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": false,
|
||||
"return_as_file": false,
|
||||
"do_table_structure": true,
|
||||
"include_images": true,
|
||||
"images_scale": 2
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}'
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>Python example:</summary>
|
||||
|
||||
```python
|
||||
import httpx
|
||||
|
||||
async_client = httpx.AsyncClient(timeout=60.0)
|
||||
url = "http://localhost:5001/v1alpha/convert/source"
|
||||
payload = {
|
||||
"options": {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": "en",
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
|
||||
response = await async_client_client.post(url, json=payload)
|
||||
|
||||
data = response.json()
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### File as base64
|
||||
|
||||
The `file_sources` argument in the endpoint allows to send files as base64-encoded strings.
|
||||
When your PDF or other file type is too large, encoding it and passing it inline to curl
|
||||
can lead to an “Argument list too long” error on some systems. To avoid this, we write
|
||||
the JSON request body to a file and have curl read from that file.
|
||||
|
||||
<details>
|
||||
<summary>CURL steps:</summary>
|
||||
|
||||
```sh
|
||||
# 1. Base64-encode the file
|
||||
B64_DATA=$(base64 -w 0 /path/to/file/pdf-to-convert.pdf)
|
||||
|
||||
# 2. Build the JSON with your options
|
||||
cat <<EOF > /tmp/request_body.json
|
||||
{
|
||||
"options": {
|
||||
},
|
||||
"file_sources": [{
|
||||
"base64_string": "${B64_DATA}",
|
||||
"filename": "pdf-to-convert.pdf"
|
||||
}]
|
||||
}
|
||||
EOF
|
||||
|
||||
# 3. POST the request to the docling service
|
||||
curl -X POST "localhost:5001/v1alpha/convert/source" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d @/tmp/request_body.json
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### File endpoint
|
||||
|
||||
The endpoint is: `/v1alpha/convert/file`, listening for POST requests of Form payloads (necessary as the files are sent as multipart/form data). You can send one or multiple files.
|
||||
|
||||
<details>
|
||||
<summary>CURL example:</summary>
|
||||
|
||||
```sh
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:5001/v1alpha/convert/file' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: multipart/form-data' \
|
||||
-F 'ocr_engine=easyocr' \
|
||||
-F 'pdf_backend=dlparse_v2' \
|
||||
-F 'from_formats=pdf' \
|
||||
-F 'from_formats=docx' \
|
||||
-F 'force_ocr=false' \
|
||||
-F 'image_export_mode=embedded' \
|
||||
-F 'ocr_lang=en' \
|
||||
-F 'ocr_lang=pl' \
|
||||
-F 'table_mode=fast' \
|
||||
-F 'files=@2206.01062v1.pdf;type=application/pdf' \
|
||||
-F 'abort_on_error=false' \
|
||||
-F 'to_formats=md' \
|
||||
-F 'to_formats=text' \
|
||||
-F 'return_as_file=false' \
|
||||
-F 'do_ocr=true'
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>Python example:</summary>
|
||||
|
||||
```python
|
||||
import httpx
|
||||
|
||||
async_client = httpx.AsyncClient(timeout=60.0)
|
||||
url = "http://localhost:5001/v1alpha/convert/file"
|
||||
parameters = {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False
|
||||
}
|
||||
|
||||
current_dir = os.path.dirname(__file__)
|
||||
file_path = os.path.join(current_dir, '2206.01062v1.pdf')
|
||||
|
||||
files = {
|
||||
'files': ('2206.01062v1.pdf', open(file_path, 'rb'), 'application/pdf'),
|
||||
}
|
||||
|
||||
response = await async_client.post(url, files=files, data={"parameters": json.dumps(parameters)})
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
data = response.json()
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## Response format
|
||||
|
||||
The response can be a JSON Document or a File.
|
||||
|
||||
- If you process only one file, the response will be a JSON document with the following format:
|
||||
|
||||
```jsonc
|
||||
{
|
||||
"document": {
|
||||
"md_content": "",
|
||||
"json_content": {},
|
||||
"html_content": "",
|
||||
"text_content": "",
|
||||
"doctags_content": ""
|
||||
},
|
||||
"status": "<success|partial_success|skipped|failure>",
|
||||
"processing_time": 0.0,
|
||||
"timings": {},
|
||||
"errors": []
|
||||
}
|
||||
```
|
||||
|
||||
Depending on the value you set in `output_formats`, the different items will be populated with their respective results or empty.
|
||||
|
||||
`processing_time` is the Docling processing time in seconds, and `timings` (when enabled in the backend) provides the detailed
|
||||
timing of all the internal Docling components.
|
||||
|
||||
- If you set the parameter `return_as_file` to True, the response will be a zip file.
|
||||
- If multiple files are generated (multiple inputs, or one input but multiple outputs with `return_as_file` True), the response will be a zip file.
|
||||
|
||||
## Asynchronous API
|
||||
|
||||
TBA
|
||||
BIN
img/swagger.png
Normal file
BIN
img/swagger.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 24 KiB |
BIN
img/ui-input.png
Normal file
BIN
img/ui-input.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 64 KiB |
BIN
img/ui-output.png
Normal file
BIN
img/ui-output.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 124 KiB |
6
os-packages.txt
Normal file
6
os-packages.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
tesseract
|
||||
tesseract-devel
|
||||
tesseract-langpack-eng
|
||||
leptonica-devel
|
||||
libglvnd-glx
|
||||
glib2
|
||||
4430
poetry.lock
generated
4430
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
257
pyproject.toml
257
pyproject.toml
@@ -1,25 +1,25 @@
|
||||
[tool.poetry]
|
||||
[project]
|
||||
name = "docling-serve"
|
||||
version = "0.1.0"
|
||||
version = "0.7.0" # DO NOT EDIT, updated automatically
|
||||
description = "Running Docling as a service"
|
||||
license = "MIT"
|
||||
license = {text = "MIT"}
|
||||
authors = [
|
||||
"Michele Dolfi <dol@zurich.ibm.com>",
|
||||
"Christoph Auer <cau@zurich.ibm.com>",
|
||||
"Panos Vagenas <pva@zurich.ibm.com>",
|
||||
"Cesar Berrospi Ramis <ceb@zurich.ibm.com>",
|
||||
"Peter Staar <taa@zurich.ibm.com>",
|
||||
{name="Michele Dolfi", email="dol@zurich.ibm.com"},
|
||||
{name="Guillaume Moutier", email="gmoutier@redhat.com"},
|
||||
{name="Anil Vishnoi", email="avishnoi@redhat.com"},
|
||||
{name="Panos Vagenas", email="pva@zurich.ibm.com"},
|
||||
{name="Panos Vagenas", email="pva@zurich.ibm.com"},
|
||||
{name="Christoph Auer", email="cau@zurich.ibm.com"},
|
||||
{name="Peter Staar", email="taa@zurich.ibm.com"},
|
||||
]
|
||||
maintainers = [
|
||||
"Peter Staar <taa@zurich.ibm.com>",
|
||||
"Christoph Auer <cau@zurich.ibm.com>",
|
||||
"Michele Dolfi <dol@zurich.ibm.com>",
|
||||
"Cesar Berrospi Ramis <ceb@zurich.ibm.com>",
|
||||
"Panos Vagenas <pva@zurich.ibm.com>",
|
||||
{name="Michele Dolfi", email="dol@zurich.ibm.com"},
|
||||
{name="Anil Vishnoi", email="avishnoi@redhat.com"},
|
||||
{name="Panos Vagenas", email="pva@zurich.ibm.com"},
|
||||
{name="Christoph Auer", email="cau@zurich.ibm.com"},
|
||||
{name="Peter Staar", email="taa@zurich.ibm.com"},
|
||||
]
|
||||
readme = "README.md"
|
||||
repository = "https://github.com/DS4SD/docling-serve"
|
||||
homepage = "https://github.com/DS4SD/docling-serve"
|
||||
classifiers = [
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
@@ -28,90 +28,159 @@ classifiers = [
|
||||
"Typing :: Typed",
|
||||
"Programming Language :: Python :: 3"
|
||||
]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.9"
|
||||
docling = "^2.10.0"
|
||||
fastapi = {version = "^0.115.6", extras = ["standard"]}
|
||||
uvicorn = "^0.32.1"
|
||||
pydantic-settings = "^2.4.0"
|
||||
httpx = "^0.28.1"
|
||||
tesserocr = { version = "^2.7.1", optional = true }
|
||||
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
|
||||
onnxruntime = [
|
||||
# 1.19.2 is the last version with python3.9 support,
|
||||
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
|
||||
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
|
||||
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"docling~=2.28",
|
||||
"fastapi[standard]~=0.115",
|
||||
"httpx~=0.28",
|
||||
"pydantic~=2.10",
|
||||
"pydantic-settings~=2.4",
|
||||
"python-multipart>=0.0.14,<0.1.0",
|
||||
"typer~=0.12",
|
||||
"uvicorn[standard]>=0.29.0,<1.0.0",
|
||||
"websockets~=14.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
ui = [
|
||||
"gradio~=5.9"
|
||||
]
|
||||
tesserocr = [
|
||||
"tesserocr~=2.7"
|
||||
]
|
||||
rapidocr = [
|
||||
"rapidocr-onnxruntime~=1.4; python_version<'3.13'",
|
||||
"onnxruntime~=1.7",
|
||||
]
|
||||
cpu = [
|
||||
"torch>=2.6.0",
|
||||
"torchvision>=0.21.0",
|
||||
]
|
||||
cu124 = [
|
||||
"torch>=2.6.0",
|
||||
"torchvision>=0.21.0",
|
||||
]
|
||||
|
||||
[tool.poetry.extras]
|
||||
tesserocr = ["tesserocr"]
|
||||
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"mypy~=1.11",
|
||||
"pre-commit-uv~=4.1",
|
||||
"pytest~=8.3",
|
||||
"pytest-asyncio~=0.24",
|
||||
"pytest-check~=2.4",
|
||||
"python-semantic-release~=7.32",
|
||||
"ruff>=0.9.6",
|
||||
]
|
||||
|
||||
[tool.uv]
|
||||
package = true
|
||||
conflicts = [
|
||||
[
|
||||
{ extra = "cpu" },
|
||||
{ extra = "cu124" },
|
||||
],
|
||||
]
|
||||
|
||||
[tool.poetry.group.pypi-torch]
|
||||
optional = false
|
||||
|
||||
[tool.poetry.group.pypi-torch.dependencies]
|
||||
[tool.uv.sources]
|
||||
torch = [
|
||||
{version = "!=2.4.1+cpu" },
|
||||
{ index = "pytorch-cpu", extra = "cpu" },
|
||||
{ index = "pytorch-cu124", extra = "cu124" },
|
||||
]
|
||||
torchvision = [
|
||||
{version = "!=0.19.1+cpu" },
|
||||
{ index = "pytorch-cpu", extra = "cpu" },
|
||||
{ index = "pytorch-cu124", extra = "cu124" },
|
||||
]
|
||||
|
||||
[tool.poetry.group.cpu]
|
||||
optional = true
|
||||
[[tool.uv.index]]
|
||||
name = "pytorch-cpu"
|
||||
url = "https://download.pytorch.org/whl/cpu"
|
||||
explicit = true
|
||||
|
||||
[tool.poetry.group.cpu.dependencies]
|
||||
torch = [
|
||||
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.10"', url="https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp310-cp310-linux_x86_64.whl"},
|
||||
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.11"', url="https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp311-cp311-linux_x86_64.whl"},
|
||||
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.12"', url="https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp312-cp312-linux_x86_64.whl"},
|
||||
]
|
||||
torchvision = [
|
||||
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.10"', url="https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp310-cp310-linux_x86_64.whl"},
|
||||
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.11"', url="https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp311-cp311-linux_x86_64.whl"},
|
||||
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.12"', url="https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp312-cp312-linux_x86_64.whl"},
|
||||
]
|
||||
[[tool.uv.index]]
|
||||
name = "pytorch-cu124"
|
||||
url = "https://download.pytorch.org/whl/cu124"
|
||||
explicit = true
|
||||
|
||||
[tool.poetry.group.constraints.dependencies]
|
||||
numpy = [
|
||||
{ version = "^2.1.0", markers = 'python_version >= "3.13"' },
|
||||
{ version = "^1.24.4", markers = 'python_version < "3.13"' },
|
||||
]
|
||||
[tool.setuptools.packages.find]
|
||||
include = ["docling_serve*"]
|
||||
namespaces = true
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = "^24.8.0"
|
||||
isort = "^5.13.2"
|
||||
pre-commit = "^3.8.0"
|
||||
autoflake = "^2.3.1"
|
||||
flake8 = "^7.1.1"
|
||||
pytest = "^8.3.2"
|
||||
mypy = "^1.11.2"
|
||||
[project.scripts]
|
||||
docling-serve = "docling_serve.__main__:main"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/docling-project/docling-serve"
|
||||
# Documentation = "https://ds4sd.github.io/docling"
|
||||
Repository = "https://github.com/docling-project/docling-serve"
|
||||
Issues = "https://github.com/docling-project/docling-serve/issues"
|
||||
Changelog = "https://github.com/docling-project/docling-serve/blob/main/CHANGELOG.md"
|
||||
|
||||
[tool.black]
|
||||
[tool.ruff]
|
||||
target-version = "py310"
|
||||
line-length = 88
|
||||
target-version = ["py310"]
|
||||
include = '\.pyi?$'
|
||||
respect-gitignore = true
|
||||
|
||||
[tool.isort]
|
||||
profile = "black"
|
||||
line_length = 88
|
||||
py_version=311
|
||||
# extend-exclude = [
|
||||
# "tests",
|
||||
# ]
|
||||
|
||||
[tool.autoflake]
|
||||
in-place = true
|
||||
remove-all-unused-imports = true
|
||||
remove-unused-variables = true
|
||||
expand-star-imports = true
|
||||
recursive = true
|
||||
[tool.ruff.format]
|
||||
skip-magic-trailing-comma = false
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
# "B", # flake8-bugbear
|
||||
"C", # flake8-comprehensions
|
||||
"C9", # mccabe
|
||||
# "D", # flake8-docstrings
|
||||
"E", # pycodestyle errors (default)
|
||||
"F", # pyflakes (default)
|
||||
"I", # isort
|
||||
"PD", # pandas-vet
|
||||
"PIE", # pie
|
||||
# "PTH", # pathlib
|
||||
"Q", # flake8-quotes
|
||||
# "RET", # return
|
||||
"RUF", # Enable all ruff-specific checks
|
||||
# "SIM", # simplify
|
||||
"S307", # eval
|
||||
# "T20", # (disallow print statements) keep debugging statements out of the codebase
|
||||
"W", # pycodestyle warnings
|
||||
"ASYNC", # async
|
||||
"UP", # pyupgrade
|
||||
]
|
||||
|
||||
ignore = [
|
||||
"E501", # Line too long, handled by ruff formatter
|
||||
"D107", # "Missing docstring in __init__",
|
||||
"F811", # "redefinition of the same function"
|
||||
"PL", # Pylint
|
||||
"RUF012", # Mutable Class Attributes
|
||||
"UP007", # Option and Union
|
||||
]
|
||||
|
||||
#extend-select = []
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"__init__.py" = ["E402", "F401"]
|
||||
"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests
|
||||
|
||||
[tool.ruff.lint.mccabe]
|
||||
max-complexity = 15
|
||||
|
||||
[tool.ruff.lint.isort.sections]
|
||||
"docling" = ["docling", "docling_core"]
|
||||
|
||||
[tool.ruff.lint.isort]
|
||||
combine-as-imports = true
|
||||
section-order = [
|
||||
"future",
|
||||
"standard-library",
|
||||
"third-party",
|
||||
"docling",
|
||||
"first-party",
|
||||
"local-folder",
|
||||
]
|
||||
|
||||
[tool.mypy]
|
||||
pretty = true
|
||||
@@ -125,5 +194,31 @@ module = [
|
||||
"easyocr.*",
|
||||
"tesserocr.*",
|
||||
"rapidocr_onnxruntime.*",
|
||||
"requests.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
asyncio_mode = "auto"
|
||||
asyncio_default_fixture_loop_scope = "function"
|
||||
minversion = "8.2"
|
||||
testpaths = [
|
||||
"tests",
|
||||
]
|
||||
addopts = "-rA --color=yes --tb=short --maxfail=5"
|
||||
markers = [
|
||||
"asyncio",
|
||||
]
|
||||
|
||||
[tool.semantic_release]
|
||||
# for default values check:
|
||||
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
|
||||
|
||||
version_source = "tag_only"
|
||||
branch = "main"
|
||||
|
||||
# configure types which should trigger minor and patch version bumps respectively
|
||||
# (note that they must be a subset of the configured allowed types):
|
||||
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
|
||||
parser_angular_minor_types = "feat"
|
||||
parser_angular_patch_types = "fix,perf"
|
||||
|
||||
BIN
tests/2206.01062v1.pdf
Normal file
BIN
tests/2206.01062v1.pdf
Normal file
Binary file not shown.
BIN
tests/2408.09869v5.pdf
Normal file
BIN
tests/2408.09869v5.pdf
Normal file
Binary file not shown.
129
tests/test_1-file-all-outputs.py
Normal file
129
tests/test_1-file-all-outputs.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from pytest_check import check
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def async_client():
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
yield client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_convert_file(async_client):
|
||||
"""Test convert single file to all outputs"""
|
||||
url = "http://localhost:5001/v1alpha/convert/file"
|
||||
options = {
|
||||
"from_formats": [
|
||||
"docx",
|
||||
"pptx",
|
||||
"html",
|
||||
"image",
|
||||
"pdf",
|
||||
"asciidoc",
|
||||
"md",
|
||||
"xlsx",
|
||||
],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
}
|
||||
|
||||
current_dir = os.path.dirname(__file__)
|
||||
file_path = os.path.join(current_dir, "2206.01062v1.pdf")
|
||||
|
||||
files = {
|
||||
"files": ("2206.01062v1.pdf", open(file_path, "rb"), "application/pdf"),
|
||||
}
|
||||
|
||||
response = await async_client.post(
|
||||
url, files=files, data={"options": json.dumps(options)}
|
||||
)
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
data = response.json()
|
||||
|
||||
# Response content checks
|
||||
# Helper function to safely slice strings
|
||||
def safe_slice(value, length=100):
|
||||
if isinstance(value, str):
|
||||
return value[:length]
|
||||
return str(value) # Convert non-string values to string for debug purposes
|
||||
|
||||
# Document check
|
||||
check.is_in(
|
||||
"document",
|
||||
data,
|
||||
msg=f"Response should contain 'document' key. Received keys: {list(data.keys())}",
|
||||
)
|
||||
# MD check
|
||||
check.is_in(
|
||||
"md_content",
|
||||
data.get("document", {}),
|
||||
msg=f"Response should contain 'md_content' key. Received keys: {list(data.get('document', {}).keys())}",
|
||||
)
|
||||
if data.get("document", {}).get("md_content") is not None:
|
||||
check.is_in(
|
||||
"## DocLayNet: ",
|
||||
data["document"]["md_content"],
|
||||
msg=f"Markdown document should contain 'DocLayNet: '. Received: {safe_slice(data['document']['md_content'])}",
|
||||
)
|
||||
# JSON check
|
||||
check.is_in(
|
||||
"json_content",
|
||||
data.get("document", {}),
|
||||
msg=f"Response should contain 'json_content' key. Received keys: {list(data.get('document', {}).keys())}",
|
||||
)
|
||||
if data.get("document", {}).get("json_content") is not None:
|
||||
check.is_in(
|
||||
'{"schema_name": "DoclingDocument"',
|
||||
json.dumps(data["document"]["json_content"]),
|
||||
msg=f'JSON document should contain \'{{\\n "schema_name": "DoclingDocument\'". Received: {safe_slice(data["document"]["json_content"])}',
|
||||
)
|
||||
# HTML check
|
||||
check.is_in(
|
||||
"html_content",
|
||||
data.get("document", {}),
|
||||
msg=f"Response should contain 'html_content' key. Received keys: {list(data.get('document', {}).keys())}",
|
||||
)
|
||||
if data.get("document", {}).get("html_content") is not None:
|
||||
check.is_in(
|
||||
'<!DOCTYPE html>\n<html lang="en">\n<head>',
|
||||
data["document"]["html_content"],
|
||||
msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en'>. Received: {safe_slice(data['document']['html_content'])}",
|
||||
)
|
||||
# Text check
|
||||
check.is_in(
|
||||
"text_content",
|
||||
data.get("document", {}),
|
||||
msg=f"Response should contain 'text_content' key. Received keys: {list(data.get('document', {}).keys())}",
|
||||
)
|
||||
if data.get("document", {}).get("text_content") is not None:
|
||||
check.is_in(
|
||||
"DocLayNet: A Large Human-Annotated Dataset",
|
||||
data["document"]["text_content"],
|
||||
msg=f"Text document should contain 'DocLayNet: A Large Human-Annotated Dataset'. Received: {safe_slice(data['document']['text_content'])}",
|
||||
)
|
||||
# DocTags check
|
||||
check.is_in(
|
||||
"doctags_content",
|
||||
data.get("document", {}),
|
||||
msg=f"Response should contain 'doctags_content' key. Received keys: {list(data.get('document', {}).keys())}",
|
||||
)
|
||||
if data.get("document", {}).get("doctags_content") is not None:
|
||||
check.is_in(
|
||||
"<document>\n<section_header_level_1><location>",
|
||||
data["document"]["doctags_content"],
|
||||
msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(data['document']['doctags_content'])}",
|
||||
)
|
||||
123
tests/test_1-url-all-outputs.py
Normal file
123
tests/test_1-url-all-outputs.py
Normal file
@@ -0,0 +1,123 @@
|
||||
import json
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from pytest_check import check
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def async_client():
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
yield client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_convert_url(async_client):
|
||||
"""Test convert URL to all outputs"""
|
||||
url = "http://localhost:5001/v1alpha/convert/source"
|
||||
payload = {
|
||||
"options": {
|
||||
"from_formats": [
|
||||
"docx",
|
||||
"pptx",
|
||||
"html",
|
||||
"image",
|
||||
"pdf",
|
||||
"asciidoc",
|
||||
"md",
|
||||
"xlsx",
|
||||
],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}],
|
||||
}
|
||||
print(json.dumps(payload, indent=2))
|
||||
|
||||
response = await async_client.post(url, json=payload)
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
data = response.json()
|
||||
|
||||
# Response content checks
|
||||
# Helper function to safely slice strings
|
||||
def safe_slice(value, length=100):
|
||||
if isinstance(value, str):
|
||||
return value[:length]
|
||||
return str(value) # Convert non-string values to string for debug purposes
|
||||
|
||||
# Document check
|
||||
check.is_in(
|
||||
"document",
|
||||
data,
|
||||
msg=f"Response should contain 'document' key. Received keys: {list(data.keys())}",
|
||||
)
|
||||
# MD check
|
||||
check.is_in(
|
||||
"md_content",
|
||||
data.get("document", {}),
|
||||
msg=f"Response should contain 'md_content' key. Received keys: {list(data.get('document', {}).keys())}",
|
||||
)
|
||||
if data.get("document", {}).get("md_content") is not None:
|
||||
check.is_in(
|
||||
"## DocLayNet: ",
|
||||
data["document"]["md_content"],
|
||||
msg=f"Markdown document should contain 'DocLayNet: '. Received: {safe_slice(data['document']['md_content'])}",
|
||||
)
|
||||
# JSON check
|
||||
check.is_in(
|
||||
"json_content",
|
||||
data.get("document", {}),
|
||||
msg=f"Response should contain 'json_content' key. Received keys: {list(data.get('document', {}).keys())}",
|
||||
)
|
||||
if data.get("document", {}).get("json_content") is not None:
|
||||
check.is_in(
|
||||
'{"schema_name": "DoclingDocument"',
|
||||
json.dumps(data["document"]["json_content"]),
|
||||
msg=f'JSON document should contain \'{{\\n "schema_name": "DoclingDocument\'". Received: {safe_slice(data["document"]["json_content"])}',
|
||||
)
|
||||
# HTML check
|
||||
check.is_in(
|
||||
"html_content",
|
||||
data.get("document", {}),
|
||||
msg=f"Response should contain 'html_content' key. Received keys: {list(data.get('document', {}).keys())}",
|
||||
)
|
||||
if data.get("document", {}).get("html_content") is not None:
|
||||
check.is_in(
|
||||
'<!DOCTYPE html>\n<html lang="en">\n<head>',
|
||||
data["document"]["html_content"],
|
||||
msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en'>. Received: {safe_slice(data['document']['html_content'])}",
|
||||
)
|
||||
# Text check
|
||||
check.is_in(
|
||||
"text_content",
|
||||
data.get("document", {}),
|
||||
msg=f"Response should contain 'text_content' key. Received keys: {list(data.get('document', {}).keys())}",
|
||||
)
|
||||
if data.get("document", {}).get("text_content") is not None:
|
||||
check.is_in(
|
||||
"DocLayNet: A Large Human-Annotated Dataset",
|
||||
data["document"]["text_content"],
|
||||
msg=f"Text document should contain 'DocLayNet: A Large Human-Annotated Dataset'. Received: {safe_slice(data['document']['text_content'])}",
|
||||
)
|
||||
# DocTags check
|
||||
check.is_in(
|
||||
"doctags_content",
|
||||
data.get("document", {}),
|
||||
msg=f"Response should contain 'doctags_content' key. Received keys: {list(data.get('document', {}).keys())}",
|
||||
)
|
||||
if data.get("document", {}).get("doctags_content") is not None:
|
||||
check.is_in(
|
||||
"<document>\n<section_header_level_1><location>",
|
||||
data["document"]["doctags_content"],
|
||||
msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(data['document']['doctags_content'])}",
|
||||
)
|
||||
48
tests/test_1-url-async-ws.py
Normal file
48
tests/test_1-url-async-ws.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import base64
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from websockets.sync.client import connect
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def async_client():
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
yield client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_convert_url(async_client: httpx.AsyncClient):
|
||||
"""Test convert URL to all outputs"""
|
||||
|
||||
doc_filename = Path("tests/2408.09869v5.pdf")
|
||||
encoded_doc = base64.b64encode(doc_filename.read_bytes()).decode()
|
||||
|
||||
base_url = "http://localhost:5001/v1alpha"
|
||||
payload = {
|
||||
"options": {
|
||||
"to_formats": ["md", "json"],
|
||||
"image_export_mode": "placeholder",
|
||||
"ocr": True,
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
},
|
||||
# "http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}],
|
||||
"file_sources": [{"base64_string": encoded_doc, "filename": doc_filename.name}],
|
||||
}
|
||||
# print(json.dumps(payload, indent=2))
|
||||
|
||||
for n in range(5):
|
||||
response = await async_client.post(
|
||||
f"{base_url}/convert/source/async", json=payload
|
||||
)
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
task = response.json()
|
||||
|
||||
uri = f"ws://localhost:5001/v1alpha/status/ws/{task['task_id']}"
|
||||
with connect(uri) as websocket:
|
||||
for message in websocket:
|
||||
print(message)
|
||||
60
tests/test_1-url-async.py
Normal file
60
tests/test_1-url-async.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def async_client():
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
yield client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_convert_url(async_client):
|
||||
"""Test convert URL to all outputs"""
|
||||
|
||||
example_docs = [
|
||||
"https://arxiv.org/pdf/2411.19710",
|
||||
"https://arxiv.org/pdf/2501.17887",
|
||||
"https://www.nature.com/articles/s41467-024-50779-y.pdf",
|
||||
"https://arxiv.org/pdf/2306.12802",
|
||||
"https://arxiv.org/pdf/2311.18481",
|
||||
]
|
||||
|
||||
base_url = "http://localhost:5001/v1alpha"
|
||||
payload = {
|
||||
"options": {
|
||||
"to_formats": ["md", "json"],
|
||||
"image_export_mode": "placeholder",
|
||||
"ocr": True,
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
},
|
||||
"http_sources": [{"url": random.choice(example_docs)}],
|
||||
}
|
||||
print(json.dumps(payload, indent=2))
|
||||
|
||||
for n in range(5):
|
||||
response = await async_client.post(
|
||||
f"{base_url}/convert/source/async", json=payload
|
||||
)
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
task = response.json()
|
||||
|
||||
print(json.dumps(task, indent=2))
|
||||
|
||||
while task["task_status"] not in ("success", "failure"):
|
||||
response = await async_client.get(f"{base_url}/status/poll/{task['task_id']}")
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
task = response.json()
|
||||
print(f"{task['task_status']=}")
|
||||
print(f"{task['task_position']=}")
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
assert task["task_status"] == "success"
|
||||
74
tests/test_2-files-all-outputs.py
Normal file
74
tests/test_2-files-all-outputs.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from pytest_check import check
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def async_client():
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
yield client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_convert_file(async_client):
|
||||
"""Test convert single file to all outputs"""
|
||||
url = "http://localhost:5001/v1alpha/convert/file"
|
||||
options = {
|
||||
"from_formats": [
|
||||
"docx",
|
||||
"pptx",
|
||||
"html",
|
||||
"image",
|
||||
"pdf",
|
||||
"asciidoc",
|
||||
"md",
|
||||
"xlsx",
|
||||
],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
}
|
||||
|
||||
current_dir = os.path.dirname(__file__)
|
||||
file_path = os.path.join(current_dir, "2206.01062v1.pdf")
|
||||
|
||||
files = [
|
||||
("files", ("2206.01062v1.pdf", open(file_path, "rb"), "application/pdf")),
|
||||
("files", ("2408.09869v5.pdf", open(file_path, "rb"), "application/pdf")),
|
||||
]
|
||||
|
||||
response = await async_client.post(
|
||||
url, files=files, data={"options": json.dumps(options)}
|
||||
)
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
# Check for zip file attachment
|
||||
content_disposition = response.headers.get("content-disposition")
|
||||
|
||||
with check:
|
||||
assert content_disposition is not None, (
|
||||
"Content-Disposition header should be present"
|
||||
)
|
||||
with check:
|
||||
assert "attachment" in content_disposition, "Response should be an attachment"
|
||||
with check:
|
||||
assert 'filename="converted_docs.zip"' in content_disposition, (
|
||||
"Attachment filename should be 'converted_docs.zip'"
|
||||
)
|
||||
|
||||
content_type = response.headers.get("content-type")
|
||||
with check:
|
||||
assert content_type == "application/zip", (
|
||||
"Content-Type should be 'application/zip'"
|
||||
)
|
||||
67
tests/test_2-urls-all-outputs.py
Normal file
67
tests/test_2-urls-all-outputs.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from pytest_check import check
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def async_client():
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
yield client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_convert_url(async_client):
|
||||
"""Test convert URL to all outputs"""
|
||||
url = "http://localhost:5001/v1alpha/convert/source"
|
||||
payload = {
|
||||
"options": {
|
||||
"from_formats": [
|
||||
"docx",
|
||||
"pptx",
|
||||
"html",
|
||||
"image",
|
||||
"pdf",
|
||||
"asciidoc",
|
||||
"md",
|
||||
"xlsx",
|
||||
],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
},
|
||||
"http_sources": [
|
||||
{"url": "https://arxiv.org/pdf/2206.01062"},
|
||||
{"url": "https://arxiv.org/pdf/2408.09869"},
|
||||
],
|
||||
}
|
||||
|
||||
response = await async_client.post(url, json=payload)
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
# Check for zip file attachment
|
||||
content_disposition = response.headers.get("content-disposition")
|
||||
|
||||
with check:
|
||||
assert content_disposition is not None, (
|
||||
"Content-Disposition header should be present"
|
||||
)
|
||||
with check:
|
||||
assert "attachment" in content_disposition, "Response should be an attachment"
|
||||
with check:
|
||||
assert 'filename="converted_docs.zip"' in content_disposition, (
|
||||
"Attachment filename should be 'converted_docs.zip'"
|
||||
)
|
||||
|
||||
content_type = response.headers.get("content-type")
|
||||
with check:
|
||||
assert content_type == "application/zip", (
|
||||
"Content-Type should be 'application/zip'"
|
||||
)
|
||||
Reference in New Issue
Block a user