mirror of
https://github.com/docling-project/docling-serve.git
synced 2025-11-29 16:43:24 +00:00
Compare commits
23 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cad1053e36 | ||
|
|
7e6d9cdef3 | ||
|
|
343b985287 | ||
|
|
c430d9b1a1 | ||
|
|
63141f1cc7 | ||
|
|
d5557fad9f | ||
|
|
36967f7f61 | ||
|
|
3b54d9b6ef | ||
|
|
4877248368 | ||
|
|
ec33a61faa | ||
|
|
663e03303a | ||
|
|
c64a450bf9 | ||
|
|
ae3b4906f1 | ||
|
|
7a351fcdea | ||
|
|
1615f977a2 | ||
|
|
1bf487b18e | ||
|
|
be7e4162af | ||
|
|
de42baf8dc | ||
|
|
4da28565a7 | ||
|
|
2a78142b96 | ||
|
|
d0e8578931 | ||
|
|
c6539c42de | ||
|
|
ddf3144512 |
40
.dockerignore
Normal file
40
.dockerignore
Normal file
@@ -0,0 +1,40 @@
|
||||
# Ignore Python cache files
|
||||
__pycache__/
|
||||
**/__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
|
||||
# Ignore virtual environments
|
||||
env/
|
||||
venv/
|
||||
|
||||
# Ignore development artifacts
|
||||
*.log
|
||||
*.db
|
||||
*.sqlite3
|
||||
|
||||
# Ignore configuration and sensitive files
|
||||
**/.env
|
||||
*.env
|
||||
*.ini
|
||||
*.cfg
|
||||
|
||||
# Ignore IDE and editor settings
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
# Ignore Git files
|
||||
.git/
|
||||
.gitignore
|
||||
|
||||
# Ignore Docker files themselves (optional if not needed in the image)
|
||||
.dockerignore
|
||||
Dockerfile*
|
||||
|
||||
# Ignore build artifacts (if applicable)
|
||||
build/
|
||||
dist/
|
||||
*.egg-info
|
||||
12
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
12
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
<!-- Thank you for contributing to Docling! -->
|
||||
|
||||
<!-- STEPS TO FOLLOW:
|
||||
1. Add a description of the changes (frequently the same as the commit description)
|
||||
2. Enter the issue number next to "Resolves #" below (if there is no tracking issue resolved, **remove that section**)
|
||||
3. Make sure the PR title follows the **Commit Message Formatting**: https://www.conventionalcommits.org/en/v1.0.0/#summary.
|
||||
-->
|
||||
|
||||
<!-- Uncomment this section with the issue number if an issue is being resolved
|
||||
**Issue resolved by this Pull Request:**
|
||||
Resolves #
|
||||
--->
|
||||
23
.github/SECURITY.md
vendored
Normal file
23
.github/SECURITY.md
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
# Security and Disclosure Information Policy for the Docling Project
|
||||
|
||||
The Docling team and community take security bugs seriously. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions.
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
If you think you've identified a security issue in a Docling project repository, please DO NOT report the issue publicly via the GitHub issue tracker, etc.
|
||||
|
||||
Instead, send an email with as many details as possible to [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com). This is a private mailing list for the maintainers team.
|
||||
|
||||
Please do not create a public issue.
|
||||
|
||||
## Security Vulnerability Response
|
||||
|
||||
Each report is acknowledged and analyzed by the core maintainers within 3 working days.
|
||||
|
||||
Any vulnerability information shared with core maintainers stays within the Docling project and will not be disseminated to other projects unless it is necessary to get the issue fixed.
|
||||
|
||||
After the initial reply to your report, the security team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance.
|
||||
|
||||
## Security Alerts
|
||||
|
||||
We will send announcements of security vulnerabilities and the steps to remediate them on the [Docling announcements](https://github.com/DS4SD/docling/discussions/categories/announcements) page.
|
||||
19
.github/actions/setup-poetry/action.yml
vendored
19
.github/actions/setup-poetry/action.yml
vendored
@@ -1,19 +0,0 @@
|
||||
name: 'Set up Poetry and install'
|
||||
description: 'Set up a specific version of Poetry and install dependencies using caching.'
|
||||
inputs:
|
||||
python-version:
|
||||
description: "Version range or exact version of Python or PyPy to use, using SemVer's version range syntax."
|
||||
default: '3.11'
|
||||
runs:
|
||||
using: 'composite'
|
||||
steps:
|
||||
- name: Install poetry
|
||||
run: pipx install poetry==1.8.3
|
||||
shell: bash
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ inputs.python-version }}
|
||||
cache: 'poetry'
|
||||
- name: Install dependencies
|
||||
run: poetry install --all-extras
|
||||
shell: bash
|
||||
9
.github/mergify.yml
vendored
Normal file
9
.github/mergify.yml
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
merge_protections:
|
||||
- name: Enforce conventional commit
|
||||
description: Make sure that we follow https://www.conventionalcommits.org/en/v1.0.0/
|
||||
if:
|
||||
- base = main
|
||||
success_conditions:
|
||||
- "title ~=
|
||||
^(fix|feat|docs|style|refactor|perf|test|build|ci|chore|revert)(?:\\(.+\
|
||||
\\))?(!)?:"
|
||||
39
.github/scripts/release.sh
vendored
Executable file
39
.github/scripts/release.sh
vendored
Executable file
@@ -0,0 +1,39 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e # trigger failure on error - do not remove!
|
||||
set -x # display command on output
|
||||
|
||||
if [ -z "${TARGET_VERSION}" ]; then
|
||||
>&2 echo "No TARGET_VERSION specified"
|
||||
exit 1
|
||||
fi
|
||||
CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}"
|
||||
|
||||
# update package version
|
||||
uvx --from=toml-cli toml set --toml-path=pyproject.toml project.version "${TARGET_VERSION}"
|
||||
|
||||
# collect release notes
|
||||
REL_NOTES=$(mktemp)
|
||||
uv run --no-sync semantic-release changelog --unreleased >> "${REL_NOTES}"
|
||||
|
||||
# update changelog
|
||||
TMP_CHGLOG=$(mktemp)
|
||||
TARGET_TAG_NAME="v${TARGET_VERSION}"
|
||||
RELEASE_URL="$(gh repo view --json url -q ".url")/releases/tag/${TARGET_TAG_NAME}"
|
||||
printf "## [${TARGET_TAG_NAME}](${RELEASE_URL}) - $(date -Idate)\n\n" >> "${TMP_CHGLOG}"
|
||||
cat "${REL_NOTES}" >> "${TMP_CHGLOG}"
|
||||
if [ -f "${CHGLOG_FILE}" ]; then
|
||||
printf "\n" | cat - "${CHGLOG_FILE}" >> "${TMP_CHGLOG}"
|
||||
fi
|
||||
mv "${TMP_CHGLOG}" "${CHGLOG_FILE}"
|
||||
|
||||
# push changes
|
||||
git config --global user.name 'github-actions[bot]'
|
||||
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
|
||||
git add pyproject.toml "${CHGLOG_FILE}"
|
||||
COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]"
|
||||
git commit -m "${COMMIT_MSG}"
|
||||
git push origin main
|
||||
|
||||
# create GitHub release (incl. Git tag)
|
||||
gh release create "${TARGET_TAG_NAME}" -F "${REL_NOTES}"
|
||||
59
.github/workflows/cd.yml
vendored
Normal file
59
.github/workflows/cd.yml
vendored
Normal file
@@ -0,0 +1,59 @@
|
||||
name: "Run CD"
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
code-checks:
|
||||
uses: ./.github/workflows/job-checks.yml
|
||||
pre-release-check:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
TARGET_TAG_V: ${{ steps.version_check.outputs.TRGT_VERSION }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0 # for fetching tags, required for semantic-release
|
||||
- name: Install uv and set the python version
|
||||
uses: astral-sh/setup-uv@v5
|
||||
with:
|
||||
enable-cache: true
|
||||
- name: Install dependencies
|
||||
run: uv sync --only-dev
|
||||
- name: Check version of potential release
|
||||
id: version_check
|
||||
run: |
|
||||
TRGT_VERSION=$(uv run --no-sync semantic-release print-version)
|
||||
echo "TRGT_VERSION=${TRGT_VERSION}" >> "$GITHUB_OUTPUT"
|
||||
echo "${TRGT_VERSION}"
|
||||
- name: Check notes of potential release
|
||||
run: uv run --no-sync semantic-release changelog --unreleased
|
||||
release:
|
||||
needs: [code-checks, pre-release-check]
|
||||
if: needs.pre-release-check.outputs.TARGET_TAG_V != ''
|
||||
environment: auto-release
|
||||
runs-on: ubuntu-latest
|
||||
concurrency: release
|
||||
steps:
|
||||
- uses: actions/create-github-app-token@v1
|
||||
id: app-token
|
||||
with:
|
||||
app-id: ${{ vars.CI_APP_ID }}
|
||||
private-key: ${{ secrets.CI_PRIVATE_KEY }}
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
token: ${{ steps.app-token.outputs.token }}
|
||||
fetch-depth: 0 # for fetching tags, required for semantic-release
|
||||
- name: Install uv and set the python version
|
||||
uses: astral-sh/setup-uv@v5
|
||||
with:
|
||||
enable-cache: true
|
||||
- name: Install dependencies
|
||||
run: uv sync --only-dev
|
||||
- name: Run release script
|
||||
env:
|
||||
GH_TOKEN: ${{ steps.app-token.outputs.token }}
|
||||
TARGET_VERSION: ${{ needs.pre-release-check.outputs.TARGET_TAG_V }}
|
||||
CHGLOG_FILE: CHANGELOG.md
|
||||
run: ./.github/scripts/release.sh
|
||||
shell: bash
|
||||
41
.github/workflows/ci-images-dryrun.yml
vendored
Normal file
41
.github/workflows/ci-images-dryrun.yml
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
name: Dry run docling-serve image building
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build_image:
|
||||
name: Build ${{ matrix.spec.name }} container image
|
||||
strategy:
|
||||
matrix:
|
||||
spec:
|
||||
- name: ds4sd/docling-serve
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra cpu
|
||||
platforms: linux/amd64, linux/arm64
|
||||
- name: ds4sd/docling-serve-cpu
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124
|
||||
platforms: linux/amd64, linux/arm64
|
||||
- name: ds4sd/docling-serve-cu124
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cpu
|
||||
platforms: linux/amd64
|
||||
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
|
||||
uses: ./.github/workflows/job-image.yml
|
||||
with:
|
||||
publish: false
|
||||
build_args: ${{ matrix.spec.build_args }}
|
||||
ghcr_image_name: ${{ matrix.spec.name }}
|
||||
quay_image_name: ""
|
||||
platforms: ${{ matrix.spec.platforms }}
|
||||
25
.github/workflows/ci.yml
vendored
Normal file
25
.github/workflows/ci.yml
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
name: "Run CI"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: ["main"]
|
||||
pull_request:
|
||||
branches: ["main"]
|
||||
|
||||
jobs:
|
||||
code-checks:
|
||||
# if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'DS4SD/docling-serve' && github.event.pull_request.head.repo.full_name != 'ds4sd/docling-serve') }}
|
||||
uses: ./.github/workflows/job-checks.yml
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
|
||||
build-images:
|
||||
uses: ./.github/workflows/ci-images-dryrun.yml
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
105
.github/workflows/images-dryrun.yml
vendored
105
.github/workflows/images-dryrun.yml
vendored
@@ -1,105 +0,0 @@
|
||||
name: Dry run docling-serve image building
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: ["main"]
|
||||
|
||||
env:
|
||||
GHCR_REGISTRY: ghcr.io
|
||||
GHCR_DOCLING_SERVE_CPU_IMAGE_NAME: ds4sd/docling-serve-cpu
|
||||
GHCR_DOCLING_SERVE_GPU_IMAGE_NAME: ds4sd/docling-serve
|
||||
|
||||
jobs:
|
||||
build_cpu_image:
|
||||
name: Build docling-serve "CPU only" container image
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve (CPU only) ghcr image
|
||||
id: ghcr_serve_cpu_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_DOCLING_SERVE_CPU_IMAGE_NAME }}
|
||||
|
||||
- name: Build docling-serve-cpu image
|
||||
id: build-serve-cpu-ghcr
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: false
|
||||
tags: ${{ steps.ghcr_serve_cpu_meta.outputs.tags }}
|
||||
labels: ${{ steps.ghcr_serve_cpu_meta.outputs.labels }}
|
||||
platforms: linux/amd64, linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: |
|
||||
--build-arg CPU_ONLY=true
|
||||
|
||||
- name: Remove Local Docker Images
|
||||
run: |
|
||||
docker image prune -af
|
||||
|
||||
build_gpu_image:
|
||||
name: Build docling-serve (with GPU support) container image
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve (GPU) ghcr image
|
||||
id: ghcr_serve_gpu_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_DOCLING_SERVE_GPU_IMAGE_NAME }}
|
||||
|
||||
- name: Build docling-serve (GPU) image
|
||||
id: build-serve-gpu-ghcr
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: false
|
||||
tags: ${{ steps.ghcr_serve_gpu_meta.outputs.tags }}
|
||||
labels: ${{ steps.ghcr_serve_gpu_meta.outputs.labels }}
|
||||
platforms: linux/amd64,linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: |
|
||||
--build-arg CPU_ONLY=false
|
||||
213
.github/workflows/images.yml
vendored
213
.github/workflows/images.yml
vendored
@@ -4,193 +4,44 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
tags:
|
||||
- 'v*'
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
env:
|
||||
GHCR_REGISTRY: ghcr.io
|
||||
GHCR_DOCLING_SERVE_CPU_IMAGE_NAME: ds4sd/docling-serve-cpu
|
||||
GHCR_DOCLING_SERVE_GPU_IMAGE_NAME: ds4sd/docling-serve
|
||||
QUAY_REGISTRY: quay.io
|
||||
QUAY_DOCLING_SERVE_CPU_IMAGE_NAME: ds4sd/docling-serve-cpu
|
||||
QUAY_DOCLING_SERVE_GPU_IMAGE_NAME: ds4sd/docling-serve
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build_and_publish_cpu_images:
|
||||
name: Push docling-serve "CPU only" container image to GHCR and QUAY
|
||||
runs-on: ubuntu-latest
|
||||
environment: registry-creds
|
||||
build_and_publish_images:
|
||||
name: Build and push ${{ matrix.spec.name }} container image to GHCR and QUAY
|
||||
strategy:
|
||||
matrix:
|
||||
spec:
|
||||
- name: ds4sd/docling-serve
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra cpu
|
||||
platforms: linux/amd64, linux/arm64
|
||||
- name: ds4sd/docling-serve-cpu
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124
|
||||
platforms: linux/amd64, linux/arm64
|
||||
- name: ds4sd/docling-serve-cu124
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cpu
|
||||
platforms: linux/amd64
|
||||
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
secrets: inherit
|
||||
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Log in to the GHCR container image registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.GHCR_REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Log in to the Quay container image registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.QUAY_REGISTRY }}
|
||||
username: ${{ secrets.QUAY_USERNAME }}
|
||||
password: ${{ secrets.QUAY_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve (CPU only) ghcr image
|
||||
id: ghcr_serve_cpu_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_DOCLING_SERVE_CPU_IMAGE_NAME }}
|
||||
|
||||
- name: Build and push docling-serve-cpu image to ghcr.io
|
||||
id: push-serve-cpu-ghcr
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: ${{ steps.ghcr_serve_cpu_meta.outputs.tags }}
|
||||
labels: ${{ steps.ghcr_serve_cpu_meta.outputs.labels }}
|
||||
platforms: linux/amd64, linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: |
|
||||
--build-arg CPU_ONLY=true
|
||||
|
||||
- name: Generate artifact attestation
|
||||
uses: actions/attest-build-provenance@v1
|
||||
with:
|
||||
subject-name: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_DOCLING_SERVE_CPU_IMAGE_NAME}}
|
||||
subject-digest: ${{ steps.push-serve-cpu-ghcr.outputs.digest }}
|
||||
push-to-registry: true
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve (CPU only) quay image
|
||||
id: quay_serve_cpu_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.QUAY_REGISTRY }}/${{ env.QUAY_DOCLING_SERVE_CPU_IMAGE_NAME }}
|
||||
|
||||
- name: Build and push docling-serve-cpu image to quay.io
|
||||
id: push-serve-cpu-quay
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: ${{ steps.quay_serve_cpu_meta.outputs.tags }}
|
||||
labels: ${{ steps.quay_serve_cpu_meta.outputs.labels }}
|
||||
platforms: linux/amd64, linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: |
|
||||
--build-arg CPU_ONLY=true
|
||||
- name: Remove Local Docker Images
|
||||
run: |
|
||||
docker image prune -af
|
||||
|
||||
build_and_publish_gpu_images:
|
||||
name: Push docling-serve (with GPU support) container image to GHCR and QUAY
|
||||
runs-on: ubuntu-latest
|
||||
environment: registry-creds
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Log in to the GHCR container image registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.GHCR_REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Log in to the Quay container image registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.QUAY_REGISTRY }}
|
||||
username: ${{ secrets.QUAY_USERNAME }}
|
||||
password: ${{ secrets.QUAY_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve (GPU) ghcr image
|
||||
id: ghcr_serve_gpu_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_DOCLING_SERVE_GPU_IMAGE_NAME }}
|
||||
|
||||
- name: Build and push docling-serve (GPU) image to ghcr.io
|
||||
id: push-serve-gpu-ghcr
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: ${{ steps.ghcr_serve_gpu_meta.outputs.tags }}
|
||||
labels: ${{ steps.ghcr_serve_gpu_meta.outputs.labels }}
|
||||
platforms: linux/amd64,linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: |
|
||||
--build-arg CPU_ONLY=false
|
||||
|
||||
- name: Generate artifact attestation
|
||||
uses: actions/attest-build-provenance@v1
|
||||
with:
|
||||
subject-name: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_DOCLING_SERVE_GPU_IMAGE_NAME}}
|
||||
subject-digest: ${{ steps.push-serve-gpu-ghcr.outputs.digest }}
|
||||
push-to-registry: true
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve (GPU) quay image
|
||||
id: quay_serve_gpu_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.QUAY_REGISTRY }}/${{ env.QUAY_DOCLING_SERVE_GPU_IMAGE_NAME }}
|
||||
|
||||
- name: Build and push docling-serve (GPU) image to quay.io
|
||||
id: push-serve-gpu-quay
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: ${{ steps.quay_serve_gpu_meta.outputs.tags }}
|
||||
labels: ${{ steps.quay_serve_gpu_meta.outputs.labels }}
|
||||
platforms: linux/amd64,linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: |
|
||||
--build-arg CPU_ONLY=false
|
||||
uses: ./.github/workflows/job-image.yml
|
||||
with:
|
||||
publish: true
|
||||
environment: registry-creds
|
||||
build_args: ${{ matrix.spec.build_args }}
|
||||
ghcr_image_name: ${{ matrix.spec.name }}
|
||||
quay_image_name: ${{ matrix.spec.name }}
|
||||
platforms: ${{ matrix.spec.platforms }}
|
||||
|
||||
@@ -1,27 +1,25 @@
|
||||
name: Run linter checks
|
||||
on:
|
||||
push:
|
||||
branches: ["main"]
|
||||
pull_request:
|
||||
branches: ["main"]
|
||||
name: Run checks
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
on:
|
||||
workflow_call:
|
||||
|
||||
jobs:
|
||||
py-lint:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ['3.11']
|
||||
python-version: ['3.12']
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: ./.github/actions/setup-poetry
|
||||
- name: Install uv and set the python version
|
||||
uses: astral-sh/setup-uv@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
enable-cache: true
|
||||
- name: Install dependencies
|
||||
run: uv sync --all-extras --no-extra cu124
|
||||
- name: Run styling check
|
||||
run: poetry run pre-commit run --all-files
|
||||
run: uv run --no-sync pre-commit run --all-files
|
||||
|
||||
markdown-lint:
|
||||
runs-on: ubuntu-latest
|
||||
141
.github/workflows/job-image.yml
vendored
Normal file
141
.github/workflows/job-image.yml
vendored
Normal file
@@ -0,0 +1,141 @@
|
||||
name: Build docling-serve container image
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
build_args:
|
||||
type: string
|
||||
description: "Extra build arguments for the build."
|
||||
default: ""
|
||||
ghcr_image_name:
|
||||
type: string
|
||||
description: "Name of the image for GHCR."
|
||||
quay_image_name:
|
||||
type: string
|
||||
description: "Name of the image for Quay."
|
||||
platforms:
|
||||
type: string
|
||||
description: "Platform argument for building images."
|
||||
default: linux/amd64, linux/arm64
|
||||
publish:
|
||||
type: boolean
|
||||
description: "If true, the images will be published."
|
||||
default: false
|
||||
environment:
|
||||
type: string
|
||||
description: "GH Action environment"
|
||||
default: ""
|
||||
|
||||
env:
|
||||
GHCR_REGISTRY: ghcr.io
|
||||
QUAY_REGISTRY: quay.io
|
||||
|
||||
jobs:
|
||||
image:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
environment: ${{ inputs.environment }}
|
||||
|
||||
steps:
|
||||
- name: Free up space in github runner
|
||||
# Free space as indicated here: https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
|
||||
run: |
|
||||
df -h
|
||||
sudo rm -rf "/usr/local/share/boost"
|
||||
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
||||
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup
|
||||
# shellcheck disable=SC2046
|
||||
sudo docker rmi "$(docker image ls -aq)" >/dev/null 2>&1 || true
|
||||
df -h
|
||||
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Log in to the GHCR container image registry
|
||||
if: ${{ inputs.publish }}
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.GHCR_REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Log in to the Quay container image registry
|
||||
if: ${{ inputs.publish }}
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.QUAY_REGISTRY }}
|
||||
username: ${{ secrets.QUAY_USERNAME }}
|
||||
password: ${{ secrets.QUAY_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-buildx-
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve ghcr image
|
||||
id: ghcr_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.GHCR_REGISTRY }}/${{ inputs.ghcr_image_name }}
|
||||
|
||||
- name: Build and push image to ghcr.io
|
||||
id: ghcr_push
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: ${{ inputs.publish }}
|
||||
tags: ${{ steps.ghcr_meta.outputs.tags }}
|
||||
labels: ${{ steps.ghcr_meta.outputs.labels }}
|
||||
platforms: ${{ inputs.platforms}}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: ${{ inputs.build_args }}
|
||||
|
||||
- name: Generate artifact attestation
|
||||
if: ${{ inputs.publish }}
|
||||
uses: actions/attest-build-provenance@v1
|
||||
with:
|
||||
subject-name: ${{ env.GHCR_REGISTRY }}/${{ inputs.ghcr_image_name }}
|
||||
subject-digest: ${{ steps.ghcr_push.outputs.digest }}
|
||||
push-to-registry: true
|
||||
|
||||
- name: Extract metadata (tags, labels) for docling-serve quay image
|
||||
if: ${{ inputs.publish }}
|
||||
id: quay_meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.QUAY_REGISTRY }}/${{ inputs.quay_image_name }}
|
||||
|
||||
- name: Build and push image to quay.io
|
||||
if: ${{ inputs.publish }}
|
||||
# id: push-serve-cpu-quay
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: ${{ inputs.publish }}
|
||||
tags: ${{ steps.quay_meta.outputs.tags }}
|
||||
labels: ${{ steps.quay_meta.outputs.labels }}
|
||||
platforms: ${{ inputs.platforms}}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: ${{ inputs.build_args }}
|
||||
|
||||
# - name: Inspect the image details
|
||||
# run: |
|
||||
# echo "${{ steps.ghcr_push.outputs.metadata }}"
|
||||
|
||||
- name: Remove Local Docker Images
|
||||
run: |
|
||||
docker image prune -af
|
||||
32
.github/workflows/pypi.yml
vendored
Normal file
32
.github/workflows/pypi.yml
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
name: "Build and publish package"
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build-and-publish:
|
||||
runs-on: ubuntu-latest
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/docling-serve # PyPI project page for the docling-serve package
|
||||
permissions:
|
||||
id-token: write # IMPORTANT: mandatory for trusted publishing
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install uv and set the python version
|
||||
uses: astral-sh/setup-uv@v5
|
||||
with:
|
||||
enable-cache: true
|
||||
- name: Install dependencies
|
||||
run: uv sync --all-extras --no-extra cu124
|
||||
- name: Build
|
||||
run: uv build
|
||||
- name: Publish distribution 📦 to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
# currently not working with reusable workflows
|
||||
attestations: false
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,5 +1,7 @@
|
||||
model_artifacts/
|
||||
scratch/
|
||||
.md-lint
|
||||
actionlint
|
||||
|
||||
# Created by https://www.toptal.com/developers/gitignore/api/python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
|
||||
# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
|
||||
|
||||
@@ -2,5 +2,7 @@ config:
|
||||
line-length: false
|
||||
no-emphasis-as-header: false
|
||||
first-line-heading: false
|
||||
MD033:
|
||||
allowed_elements: ["details", "summary"]
|
||||
globs:
|
||||
- "**/*.md"
|
||||
|
||||
@@ -1,41 +1,24 @@
|
||||
fail_fast: true
|
||||
repos:
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: system
|
||||
name: Black
|
||||
entry: poetry run black docling_serve tests
|
||||
pass_filenames: false
|
||||
language: system
|
||||
files: '\.py$'
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: system
|
||||
name: isort
|
||||
entry: poetry run isort docling_serve tests
|
||||
pass_filenames: false
|
||||
language: system
|
||||
files: '\.py$'
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: system
|
||||
name: flake8
|
||||
entry: poetry run flake8 docling_serve
|
||||
pass_filenames: false
|
||||
language: system
|
||||
files: '\.py$'
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: system
|
||||
name: MyPy
|
||||
entry: poetry run mypy docling_serve
|
||||
entry: uv run --no-sync mypy docling_serve
|
||||
pass_filenames: false
|
||||
language: system
|
||||
files: '\.py$'
|
||||
- repo: local
|
||||
- repo: https://github.com/astral-sh/uv-pre-commit
|
||||
# uv version.
|
||||
rev: 0.6.1
|
||||
hooks:
|
||||
- id: system
|
||||
name: Poetry check
|
||||
entry: poetry check --lock
|
||||
pass_filenames: false
|
||||
language: system
|
||||
- id: uv-lock
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.9.6
|
||||
hooks:
|
||||
# Run the Ruff linter.
|
||||
- id: ruff
|
||||
args: [--exit-non-zero-on-fix, --config=pyproject.toml]
|
||||
# Run the Ruff formatter.
|
||||
# - id: ruff-format
|
||||
# args: [--config=pyproject.toml]
|
||||
|
||||
1
.python-version
Normal file
1
.python-version
Normal file
@@ -0,0 +1 @@
|
||||
3.12
|
||||
18
CHANGELOG.md
Normal file
18
CHANGELOG.md
Normal file
@@ -0,0 +1,18 @@
|
||||
## [v0.4.0](https://github.com/DS4SD/docling-serve/releases/tag/v0.4.0) - 2025-02-26
|
||||
|
||||
### Feature
|
||||
|
||||
* New container images ([#68](https://github.com/DS4SD/docling-serve/issues/68)) ([`7e6d9cd`](https://github.com/DS4SD/docling-serve/commit/7e6d9cdef398df70a5b4d626aeb523c428c10d56))
|
||||
* Render DoclingDocument with npm docling-components in the example UI ([#65](https://github.com/DS4SD/docling-serve/issues/65)) ([`c430d9b`](https://github.com/DS4SD/docling-serve/commit/c430d9b1a162ab29104d86ebaa1ac5a5488b1f09))
|
||||
|
||||
## [v0.3.0](https://github.com/DS4SD/docling-serve/releases/tag/v0.3.0) - 2025-02-19
|
||||
|
||||
### Feature
|
||||
|
||||
* Add new docling-serve cli ([#50](https://github.com/DS4SD/docling-serve/issues/50)) ([`ec33a61`](https://github.com/DS4SD/docling-serve/commit/ec33a61faa7846b9b7998fbf557ebe39a3b800f6))
|
||||
|
||||
### Fix
|
||||
|
||||
* Set DOCLING_SERVE_ARTIFACTS_PATH in images ([#53](https://github.com/DS4SD/docling-serve/issues/53)) ([`4877248`](https://github.com/DS4SD/docling-serve/commit/487724836896576ca4f98e84abf15fd1c383bec8))
|
||||
* Set root UI path when behind proxy ([#38](https://github.com/DS4SD/docling-serve/issues/38)) ([`c64a450`](https://github.com/DS4SD/docling-serve/commit/c64a450bf9ba9947ab180e92bef2763ff710b210))
|
||||
* Support python 3.13 and docling updates and switch to uv ([#48](https://github.com/DS4SD/docling-serve/issues/48)) ([`ae3b490`](https://github.com/DS4SD/docling-serve/commit/ae3b4906f1c0829b1331ea491f3518741cabff71))
|
||||
@@ -1,32 +1,62 @@
|
||||
FROM python:3.11-slim-bookworm
|
||||
ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s
|
||||
|
||||
ARG CPU_ONLY=false
|
||||
WORKDIR /docling-serve
|
||||
FROM ${BASE_IMAGE}
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
|
||||
&& apt-get clean
|
||||
ARG MODELS_LIST="layout tableformer picture_classifier easyocr"
|
||||
ARG UV_SYNC_EXTRA_ARGS=""
|
||||
|
||||
RUN pip install --no-cache-dir poetry
|
||||
USER 0
|
||||
|
||||
COPY pyproject.toml poetry.lock README.md /docling-serve/
|
||||
###################################################################################################
|
||||
# OS Layer #
|
||||
###################################################################################################
|
||||
|
||||
RUN if [ "$CPU_ONLY" = "true" ]; then \
|
||||
poetry install --no-root --with cpu; \
|
||||
else \
|
||||
poetry install --no-root; \
|
||||
fi
|
||||
RUN --mount=type=bind,source=os-packages.txt,target=/tmp/os-packages.txt \
|
||||
dnf -y install --best --nodocs --setopt=install_weak_deps=False dnf-plugins-core && \
|
||||
dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save && \
|
||||
dnf config-manager --enable crb && \
|
||||
dnf -y update && \
|
||||
dnf install -y $(cat /tmp/os-packages.txt) && \
|
||||
dnf -y clean all && \
|
||||
rm -rf /var/cache/dnf
|
||||
|
||||
ENV HF_HOME=/tmp/
|
||||
ENV TORCH_HOME=/tmp/
|
||||
ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
||||
|
||||
RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
|
||||
COPY --from=ghcr.io/astral-sh/uv:0.6.1 /uv /uvx /bin/
|
||||
|
||||
###################################################################################################
|
||||
# Docling layer #
|
||||
###################################################################################################
|
||||
|
||||
USER 1001
|
||||
|
||||
WORKDIR /opt/app-root/src
|
||||
|
||||
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
||||
ENV OMP_NUM_THREADS=4
|
||||
|
||||
COPY ./docling_serve /docling-serve/docling_serve
|
||||
ENV LANG=en_US.UTF-8
|
||||
ENV LC_ALL=en_US.UTF-8
|
||||
ENV PYTHONIOENCODING=utf-8
|
||||
ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy
|
||||
ENV UV_PROJECT_ENVIRONMENT=/opt/app-root
|
||||
|
||||
ENV DOCLING_SERVE_ARTIFACTS_PATH=/opt/app-root/src/.cache/docling/models
|
||||
|
||||
COPY --chown=1001:0 pyproject.toml uv.lock README.md ./
|
||||
|
||||
RUN --mount=type=cache,target=/opt/app-root/src/.cache/uv,uid=1001 \
|
||||
uv sync --frozen --no-install-project --no-dev --all-extras ${UV_SYNC_EXTRA_ARGS} # --no-extra ${NO_EXTRA}
|
||||
|
||||
RUN echo "Downloading models..." && \
|
||||
docling-tools models download -o "${DOCLING_SERVE_ARTIFACTS_PATH}" ${MODELS_LIST} && \
|
||||
chown -R 1001:0 /opt/app-root/src/.cache && \
|
||||
chmod -R g=u /opt/app-root/src/.cache
|
||||
|
||||
COPY --chown=1001:0 --chmod=664 ./docling_serve ./docling_serve
|
||||
RUN --mount=type=cache,target=/opt/app-root/src/.cache/uv,uid=1001 \
|
||||
uv sync --frozen --no-dev --all-extras ${UV_SYNC_EXTRA_ARGS} # --no-extra ${NO_EXTRA}
|
||||
|
||||
EXPOSE 5001
|
||||
|
||||
CMD ["poetry", "run", "uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
|
||||
CMD ["docling-serve", "run"]
|
||||
|
||||
49
Makefile
49
Makefile
@@ -24,19 +24,26 @@ action-lint-file:
|
||||
md-lint-file:
|
||||
$(CMD_PREFIX) touch .markdown-lint
|
||||
|
||||
.PHONY: docling-serve-image
|
||||
docling-serve-image: Containerfile
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve]"
|
||||
$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra cpu" -f Containerfile -t ghcr.io/ds4sd/docling-serve:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve:$(TAG) ghcr.io/ds4sd/docling-serve:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve:$(TAG) quay.io/ds4sd/docling-serve:main
|
||||
|
||||
.PHONY: docling-serve-cpu-image
|
||||
docling-serve-cpu-image: Containerfile ## Build docling-serve "cpu only" continaer image
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve CPU ONLY]"
|
||||
$(CMD_PREFIX) docker build --build-arg CPU_ONLY=true -f Containerfile --platform linux/amd64 -t ghcr.io/ds4sd/docling-serve-cpu:$(TAG) .
|
||||
docling-serve-cpu-image: Containerfile ## Build docling-serve "cpu only" container image
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve CPU]"
|
||||
$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cu124" -f Containerfile -t ghcr.io/ds4sd/docling-serve-cpu:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cpu:$(TAG) ghcr.io/ds4sd/docling-serve-cpu:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cpu:$(TAG) quay.io/ds4sd/docling-serve-cpu:main
|
||||
|
||||
.PHONY: docling-serve-gpu-image
|
||||
docling-serve-gpu-image: Containerfile ## Build docling-serve continaer image with GPU support
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve with GPU]"
|
||||
$(CMD_PREFIX) docker build --build-arg CPU_ONLY=false -f Containerfile --platform linux/amd64 -t ghcr.io/ds4sd/docling-serve:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve:$(TAG) ghcr.io/ds4sd/docling-serve:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve:$(TAG) quay.io/ds4sd/docling-serve:main
|
||||
.PHONY: docling-serve-cu124-image
|
||||
docling-serve-cu124-image: Containerfile ## Build docling-serve container image with GPU support
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve with Cuda 12.4]"
|
||||
$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cpu" -f Containerfile --platform linux/amd64 -t ghcr.io/ds4sd/docling-serve-cu124:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cu124:$(TAG) ghcr.io/ds4sd/docling-serve-cu124:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cu124:$(TAG) quay.io/ds4sd/docling-serve-cu124:main
|
||||
|
||||
.PHONY: action-lint
|
||||
action-lint: .action-lint ## Lint GitHub Action workflows
|
||||
@@ -62,14 +69,26 @@ md-lint: .md-lint ## Lint markdown files
|
||||
$(CMD_PREFIX) docker run --rm -v $$(pwd):/workdir davidanson/markdownlint-cli2:v0.14.0 "**/*.md"
|
||||
$(CMD_PREFIX) touch $@
|
||||
|
||||
|
||||
# Target names are case-sensitive: declare the real `py-lint` target as phony.
.PHONY: py-lint
|
||||
py-lint: ## Lint Python files
|
||||
$(ECHO_PREFIX) printf " %-12s ./...\n" "[PY LINT]"
|
||||
$(CMD_PREFIX) if ! which poetry $(PIPE_DEV_NULL) ; then \
|
||||
echo "Please install poetry." ; \
|
||||
echo "pip install poetry" ; \
|
||||
$(CMD_PREFIX) if ! which uv $(PIPE_DEV_NULL) ; then \
|
||||
echo "Please install uv." ; \
|
||||
exit 1 ; \
|
||||
fi
|
||||
$(CMD_PREFIX) poetry install --all-extras
|
||||
$(CMD_PREFIX) poetry run pre-commit run --all-files
|
||||
$(CMD_PREFIX) uv sync --extra ui
|
||||
$(CMD_PREFIX) uv run pre-commit run --all-files
|
||||
|
||||
.PHONY: run-docling-cpu
|
||||
run-docling-cpu: ## Run the docling-serve container with CPU support and assign a container name
|
||||
$(ECHO_PREFIX) printf " %-12s Removing existing container if it exists...\n" "[CLEANUP]"
|
||||
$(CMD_PREFIX) docker rm -f docling-serve-cpu 2>/dev/null || true
|
||||
$(ECHO_PREFIX) printf " %-12s Running docling-serve container with CPU support on port 5001...\n" "[RUN CPU]"
|
||||
$(CMD_PREFIX) docker run -it --name docling-serve-cpu -p 5001:5001 ghcr.io/ds4sd/docling-serve-cpu:main
|
||||
|
||||
.PHONY: run-docling-gpu
|
||||
run-docling-gpu: ## Run the docling-serve container with GPU support and assign a container name
|
||||
$(ECHO_PREFIX) printf " %-12s Removing existing container if it exists...\n" "[CLEANUP]"
|
||||
$(CMD_PREFIX) docker rm -f docling-serve-gpu 2>/dev/null || true
|
||||
$(ECHO_PREFIX) printf " %-12s Running docling-serve container with GPU support on port 5001...\n" "[RUN GPU]"
|
||||
$(CMD_PREFIX) docker run -it --name docling-serve-gpu -p 5001:5001 ghcr.io/ds4sd/docling-serve:main
|
||||
|
||||
457
README.md
457
README.md
@@ -2,55 +2,452 @@
|
||||
|
||||
Running [Docling](https://github.com/DS4SD/docling) as an API service.
|
||||
|
||||
> [!NOTE]
|
||||
> This is an unstable draft implementation which will quickly evolve.
|
||||
## Usage
|
||||
|
||||
## Development
|
||||
The API provides two endpoints: one for urls, one for files. This is necessary to send files directly in binary format instead of base64-encoded strings.
|
||||
|
||||
Install the dependencies
|
||||
### Common parameters
|
||||
|
||||
```sh
|
||||
# Install poetry if not already available
|
||||
curl -sSL https://install.python-poetry.org | python3 -
|
||||
In addition to the source of the file(s) (see below), both endpoints support the same parameters, which are almost the same as those of the Docling CLI.
|
||||
|
||||
# Install dependencies
|
||||
poetry install
|
||||
- `from_formats` (List[str]): Input format(s) to convert from. Allowed values: `docx`, `pptx`, `html`, `image`, `pdf`, `asciidoc`, `md`. Defaults to all formats.
|
||||
- `to_formats` (List[str]): Output format(s) to convert to. Allowed values: `md`, `json`, `html`, `text`, `doctags`. Defaults to `md`.
|
||||
- `do_ocr` (bool): If enabled, the bitmap content will be processed using OCR. Defaults to `True`.
|
||||
- `image_export_mode`: Image export mode for the document (only in case of JSON, Markdown or HTML). Allowed values: embedded, placeholder, referenced. Optional, defaults to `embedded`.
|
||||
- `force_ocr` (bool): If enabled, replace any existing text with OCR-generated text over the full content. Defaults to `False`.
|
||||
- `ocr_engine` (str): OCR engine to use. Allowed values: `easyocr`, `tesseract_cli`, `tesseract`, `rapidocr`, `ocrmac`. Defaults to `easyocr`.
|
||||
- `ocr_lang` (List[str]): List of languages used by the OCR engine. Note that each OCR engine has different values for the language names. Defaults to empty.
|
||||
- `pdf_backend` (str): PDF backend to use. Allowed values: `pypdfium2`, `dlparse_v1`, `dlparse_v2`. Defaults to `dlparse_v2`.
|
||||
- `table_mode` (str): Table mode to use. Allowed values: `fast`, `accurate`. Defaults to `fast`.
|
||||
- `abort_on_error` (bool): If enabled, abort on error. Defaults to false.
|
||||
- `return_as_file` (bool): If enabled, return the output as a file. Defaults to false.
|
||||
- `do_table_structure` (bool): If enabled, the table structure will be extracted. Defaults to true.
|
||||
- `include_images` (bool): If enabled, images will be extracted from the document. Defaults to true.
|
||||
- `images_scale` (float): Scale factor for images. Defaults to 2.0.
|
||||
|
||||
# Run the server
|
||||
poetry run uvicorn docling_serve.app:app --reload
|
||||
### URL endpoint
|
||||
|
||||
The endpoint is `/v1alpha/convert/source`, listening for POST requests of JSON payloads.
|
||||
|
||||
In addition to the above parameters, you must provide the document(s) you want to process using either the `http_sources` or the `file_sources` field.
|
||||
The first fetches the document(s) from URL(s) (optionally with extra headers); the second allows providing documents as base64-encoded strings.
|
||||
None of the `options` are required; they can be partially or completely omitted.
|
||||
|
||||
Simple payload example:
|
||||
|
||||
```json
|
||||
{
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
```
|
||||
|
||||
Example payload (http source):
|
||||
<details>
|
||||
|
||||
<summary>Complete payload example:</summary>
|
||||
|
||||
```json
|
||||
{
|
||||
"options": {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": true,
|
||||
"force_ocr": false,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": false,
|
||||
"return_as_file": false
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
|
||||
<summary>CURL example:</summary>
|
||||
|
||||
```sh
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/convert' \
|
||||
'http://localhost:5001/v1alpha/convert/source' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"http_source": {
|
||||
"url": "https://arxiv.org/pdf/2206.01062"
|
||||
}
|
||||
"options": {
|
||||
"from_formats": [
|
||||
"docx",
|
||||
"pptx",
|
||||
"html",
|
||||
"image",
|
||||
"pdf",
|
||||
"asciidoc",
|
||||
"md",
|
||||
"xlsx"
|
||||
],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": true,
|
||||
"force_ocr": false,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": [
|
||||
"fr",
|
||||
"de",
|
||||
"es",
|
||||
"en"
|
||||
],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": false,
|
||||
"return_as_file": false,
|
||||
"do_table_structure": true,
|
||||
"include_images": true,
|
||||
"images_scale": 2
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}'
|
||||
```
|
||||
|
||||
### Cuda GPU Support
|
||||
</details>
|
||||
|
||||
For GPU support try the following:
|
||||
<details>
|
||||
<summary>Python example:</summary>
|
||||
|
||||
```python
|
||||
import httpx
|
||||
|
||||
async_client = httpx.AsyncClient(timeout=60.0)
|
||||
url = "http://localhost:5001/v1alpha/convert/source"
|
||||
payload = {
|
||||
"options": {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
|
||||
response = await async_client.post(url, json=payload)
|
||||
|
||||
data = response.json()
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### File as base64
|
||||
|
||||
The `file_sources` argument of the endpoint allows sending files as base64-encoded strings.
|
||||
When your PDF or other file type is too large, encoding it and passing it inline to curl
|
||||
can lead to an “Argument list too long” error on some systems. To avoid this, we write
|
||||
the JSON request body to a file and have curl read from that file.
|
||||
|
||||
<details>
|
||||
<summary>CURL steps:</summary>
|
||||
|
||||
```sh
|
||||
# Create a virtual env
|
||||
python3 -m venv venv
|
||||
# 1. Base64-encode the file
|
||||
B64_DATA=$(base64 -w 0 /path/to/file/pdf-to-convert.pdf)
|
||||
|
||||
# Activate the venv
|
||||
source venv/bin/active
|
||||
# 2. Build the JSON with your options
|
||||
cat <<EOF > /tmp/request_body.json
|
||||
{
|
||||
"options": {
|
||||
},
|
||||
"file_sources": [{
|
||||
"base64_string": "${B64_DATA}",
|
||||
"filename": "pdf-to-convert.pdf"
|
||||
}]
|
||||
}
|
||||
EOF
|
||||
|
||||
# Install torch with the special index
|
||||
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
|
||||
|
||||
# Install the package
|
||||
pip install -e .
|
||||
|
||||
# Run the server
|
||||
poetry run uvicorn docling_serve.app:app --reload
|
||||
# 3. POST the request to the docling service
|
||||
curl -X POST "localhost:5001/v1alpha/convert/source" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d @/tmp/request_body.json
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### File endpoint
|
||||
|
||||
The endpoint is: `/v1alpha/convert/file`, listening for POST requests of Form payloads (necessary as the files are sent as multipart/form data). You can send one or multiple files.
|
||||
|
||||
<details>
|
||||
<summary>CURL example:</summary>
|
||||
|
||||
```sh
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:5001/v1alpha/convert/file' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: multipart/form-data' \
|
||||
-F 'ocr_engine=easyocr' \
|
||||
-F 'pdf_backend=dlparse_v2' \
|
||||
-F 'from_formats=pdf' \
|
||||
-F 'from_formats=docx' \
|
||||
-F 'force_ocr=false' \
|
||||
-F 'image_export_mode=embedded' \
|
||||
-F 'ocr_lang=en' \
|
||||
-F 'ocr_lang=pl' \
|
||||
-F 'table_mode=fast' \
|
||||
-F 'files=@2206.01062v1.pdf;type=application/pdf' \
|
||||
-F 'abort_on_error=false' \
|
||||
-F 'to_formats=md' \
|
||||
-F 'to_formats=text' \
|
||||
-F 'return_as_file=false' \
|
||||
-F 'do_ocr=true'
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>Python example:</summary>
|
||||
|
||||
```python
|
||||
import json
import os

import httpx
|
||||
|
||||
async_client = httpx.AsyncClient(timeout=60.0)
|
||||
url = "http://localhost:5001/v1alpha/convert/file"
|
||||
parameters = {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False
|
||||
}
|
||||
|
||||
current_dir = os.path.dirname(__file__)
|
||||
file_path = os.path.join(current_dir, '2206.01062v1.pdf')
|
||||
|
||||
files = {
|
||||
'files': ('2206.01062v1.pdf', open(file_path, 'rb'), 'application/pdf'),
|
||||
}
|
||||
|
||||
response = await async_client.post(url, files=files, data={"parameters": json.dumps(parameters)})
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
data = response.json()
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### Response format
|
||||
|
||||
The response can be a JSON Document or a File.
|
||||
|
||||
- If you process only one file, the response will be a JSON document with the following format:
|
||||
|
||||
```jsonc
|
||||
{
|
||||
"document": {
|
||||
"md_content": "",
|
||||
"json_content": {},
|
||||
"html_content": "",
|
||||
"text_content": "",
|
||||
"doctags_content": ""
|
||||
},
|
||||
"status": "<success|partial_success|skipped|failure>",
|
||||
"processing_time": 0.0,
|
||||
"timings": {},
|
||||
"errors": []
|
||||
}
|
||||
```
|
||||
|
||||
Depending on the value you set in `to_formats`, the different items will be populated with their respective results or left empty.
|
||||
|
||||
`processing_time` is the Docling processing time in seconds, and `timings` (when enabled in the backend) provides the detailed
|
||||
timing of all the internal Docling components.
|
||||
|
||||
- If you set the parameter `return_as_file` to True, the response will be a zip file.
|
||||
- If multiple files are generated (multiple inputs, or one input but multiple outputs with `return_as_file` True), the response will be a zip file.
|
||||
|
||||
## Run docling-serve
|
||||
|
||||
Clone the repository and run the following from within the cloned directory root.
|
||||
|
||||
```bash
|
||||
python -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install "docling-serve[ui]"
|
||||
docling-serve run --enable-ui
|
||||
```
|
||||
|
||||
## Helpers
|
||||
|
||||
- A full Swagger UI is available at the `/docs` endpoint.
|
||||
|
||||

|
||||
|
||||
- An easy to use UI is available at the `/ui` endpoint.
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
## Development
|
||||
|
||||
### CPU only
|
||||
|
||||
```sh
|
||||
# Install uv if not already available
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
# Install dependencies
|
||||
uv sync --extra cpu
|
||||
```
|
||||
|
||||
### Cuda GPU
|
||||
|
||||
For GPU support use the following command:
|
||||
|
||||
```sh
|
||||
# Install dependencies
|
||||
uv sync
|
||||
```
|
||||
|
||||
### Gradio UI and different OCR backends
|
||||
|
||||
`/ui` endpoint using `gradio` and different OCR backends can be enabled via package extras:
|
||||
|
||||
```sh
|
||||
# Enable ui and rapidocr
|
||||
uv sync --extra ui --extra rapidocr
|
||||
```
|
||||
|
||||
```sh
|
||||
# Enable tesserocr
|
||||
uv sync --extra tesserocr
|
||||
```
|
||||
|
||||
See `[project.optional-dependencies]` section in `pyproject.toml` for full list of options and runtime options with `uv run docling-serve --help`.
|
||||
|
||||
### Run the server
|
||||
|
||||
The `docling-serve` executable is a convenient script for launching the webserver both in
|
||||
development and production mode.
|
||||
|
||||
```sh
|
||||
# Run the server in development mode
|
||||
# - reload is enabled by default
|
||||
# - listening on the 127.0.0.1 address
|
||||
# - ui is enabled by default
|
||||
docling-serve dev
|
||||
|
||||
# Run the server in production mode
|
||||
# - reload is disabled by default
|
||||
# - listening on the 0.0.0.0 address
|
||||
# - ui is disabled by default
|
||||
docling-serve run
|
||||
```
|
||||
|
||||
### Options
|
||||
|
||||
The `docling-serve` executable is controlled with both command line
|
||||
options and environment variables.
|
||||
|
||||
<details>
|
||||
<summary>`docling-serve` help message</summary>
|
||||
|
||||
```sh
|
||||
$ docling-serve dev --help
|
||||
|
||||
Usage: docling-serve dev [OPTIONS]
|
||||
|
||||
Run a Docling Serve app in development mode. 🧪
|
||||
This is equivalent to docling-serve run but with reload
|
||||
enabled and listening on the 127.0.0.1 address.
|
||||
|
||||
Options can be set also with the corresponding ENV variable, with the exception
|
||||
of --enable-ui, --host and --reload.
|
||||
|
||||
╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ --host TEXT The host to serve on. For local development in localhost │
|
||||
│ use 127.0.0.1. To enable public access, e.g. in a │
|
||||
│ container, use all the IP addresses available with │
|
||||
│ 0.0.0.0. │
|
||||
│ [default: 127.0.0.1] │
|
||||
│ --port INTEGER The port to serve on. [default: 5001] │
|
||||
│ --reload --no-reload Enable auto-reload of the server when (code) files │
|
||||
│ change. This is resource intensive, use it only during │
|
||||
│ development. │
|
||||
│ [default: reload] │
|
||||
│ --root-path TEXT The root path is used to tell your app that it is being │
|
||||
│ served to the outside world with some path prefix set up │
|
||||
│ in some termination proxy or similar. │
|
||||
│ --proxy-headers --no-proxy-headers Enable/Disable X-Forwarded-Proto, X-Forwarded-For, │
|
||||
│ X-Forwarded-Port to populate remote address info. │
|
||||
│ [default: proxy-headers] │
|
||||
│ --artifacts-path PATH If set to a valid directory, the model weights will be │
|
||||
│ loaded from this path. │
|
||||
│ [default: None] │
|
||||
│ --enable-ui --no-enable-ui Enable the development UI. [default: enable-ui] │
|
||||
│ --help Show this message and exit. │
|
||||
╰────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### Environment variables
|
||||
|
||||
The environment variables controlling the `uvicorn` execution can be specified with the `UVICORN_` prefix:
|
||||
|
||||
- `UVICORN_WORKERS`: Number of workers to use.
|
||||
- `UVICORN_RELOAD`: If `True`, this will enable auto-reload when you modify files, useful for development.
|
||||
|
||||
The environment variables controlling specifics of the Docling Serve app can be specified with the
|
||||
`DOCLING_SERVE_` prefix:
|
||||
|
||||
- `DOCLING_SERVE_ARTIFACTS_PATH`: if set Docling will use only the local weights of models, for example `/opt/app-root/src/.cache/docling/models`.
|
||||
- `DOCLING_SERVE_ENABLE_UI`: If `True`, the Gradio UI will be available at `/ui`.
|
||||
|
||||
Others:
|
||||
|
||||
- `TESSDATA_PREFIX`: Tesseract data location, example `/usr/share/tesseract/tessdata/`.
|
||||
|
||||
## Get help and support
|
||||
|
||||
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
|
||||
|
||||
## Contributing
|
||||
|
||||
Please read [Contributing to Docling Serve](https://github.com/DS4SD/docling-serve/blob/main/CONTRIBUTING.md) for details.
|
||||
|
||||
## References
|
||||
|
||||
If you use Docling in your projects, please consider citing the following:
|
||||
|
||||
```bib
|
||||
@techreport{Docling,
|
||||
author = {Deep Search Team},
|
||||
month = {8},
|
||||
title = {Docling Technical Report},
|
||||
url = {https://arxiv.org/abs/2408.09869},
|
||||
eprint = {2408.09869},
|
||||
doi = {10.48550/arXiv.2408.09869},
|
||||
version = {1.0.0},
|
||||
year = {2024}
|
||||
}
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
The Docling Serve codebase is under MIT license.
|
||||
|
||||
## IBM ❤️ Open Source AI
|
||||
|
||||
Docling has been brought to you by IBM.
|
||||
|
||||
3
docling_serve/.env.example
Normal file
3
docling_serve/.env.example
Normal file
@@ -0,0 +1,3 @@
|
||||
TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
||||
UVICORN_WORKERS=2
|
||||
UVICORN_RELOAD=True
|
||||
302
docling_serve/__main__.py
Normal file
302
docling_serve/__main__.py
Normal file
@@ -0,0 +1,302 @@
|
||||
import importlib
|
||||
import logging
|
||||
import platform
|
||||
import sys
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Any, Optional, Union
|
||||
|
||||
import typer
|
||||
import uvicorn
|
||||
from rich.console import Console
|
||||
|
||||
from docling_serve.settings import docling_serve_settings, uvicorn_settings
|
||||
|
||||
# Silence noisy third-party warnings before the server machinery is loaded.
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

# Two rich consoles: one bound to stderr, one to stdout.
err_console = Console(stderr=True)
console = Console()

# Typer application object; shows the help text when invoked without arguments.
app = typer.Typer(no_args_is_help=True, rich_markup_mode="rich")

# Module-level logger.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def version_callback(value: bool) -> None:
    """Print version information and exit when the ``--version`` flag is set.

    Reports the installed versions of Docling Serve, Docling, Docling Core,
    Docling IBM Models and Docling Parse, followed by the Python
    implementation/version and the platform string.

    Args:
        value: Truthy when ``--version`` was passed; otherwise nothing happens.

    Raises:
        typer.Exit: Always raised after printing, to stop further CLI processing.
    """
    if not value:
        return

    # A bare ``import importlib`` is not guaranteed to load the ``metadata``
    # submodule, so import it explicitly before using it.
    import importlib.metadata

    docling_serve_version = importlib.metadata.version("docling_serve")
    docling_version = importlib.metadata.version("docling")
    docling_core_version = importlib.metadata.version("docling-core")
    docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
    docling_parse_version = importlib.metadata.version("docling-parse")
    platform_str = platform.platform()
    py_impl_version = sys.implementation.cache_tag
    py_lang_version = platform.python_version()

    console.print(f"Docling Serve version: {docling_serve_version}")
    console.print(f"Docling version: {docling_version}")
    console.print(f"Docling Core version: {docling_core_version}")
    console.print(f"Docling IBM Models version: {docling_ibm_models_version}")
    console.print(f"Docling Parse version: {docling_parse_version}")
    console.print(f"Python: {py_impl_version} ({py_lang_version})")
    console.print(f"Platform: {platform_str}")
    raise typer.Exit()
|
||||
|
||||
|
||||
# Root CLI callback: handles the global --version and -v/--verbose options.
# Documented with comments rather than a docstring, since Typer would surface a
# docstring as the application's help text.
@app.callback()
def callback(
    version: Annotated[
        Union[bool, None],
        typer.Option(
            "--version", help="Show the version and exit.", callback=version_callback
        ),
    ] = None,
    verbose: Annotated[
        int,
        typer.Option(
            "--verbose",
            "-v",
            count=True,
            help="Set the verbosity level. -v for info logging, -vv for debug logging.",
        ),
    ] = 0,
) -> None:
    # Map the number of -v flags onto a logging level. Any count of two or more
    # selects DEBUG, so that e.g. -vvv is never less verbose than -vv.
    if verbose >= 2:
        level = logging.DEBUG
    elif verbose == 1:
        level = logging.INFO
    else:
        level = logging.WARNING
    logging.basicConfig(level=level)
|
||||
|
||||
|
||||
def _run(
    *,
    command: str,
) -> None:
    """Print a startup banner and start the uvicorn server.

    Args:
        command: Name of the invoking CLI command; ``"dev"`` selects the
            development banner, anything else is reported as production.
    """
    dev_mode = command == "dev"
    server_type = "development" if dev_mode else "production"
    console.print(f"Starting {server_type} server 🚀")

    # Addresses shown to the user, derived from the uvicorn settings.
    base_url = f"http://{uvicorn_settings.host}:{uvicorn_settings.port}"
    docs_url = f"{base_url}/docs"
    ui_url = f"{base_url}/ui"

    console.print("")
    console.print(f"Server started at [link={base_url}]{base_url}[/]")
    console.print(f"Documentation at [link={docs_url}]{docs_url}[/]")
    if docling_serve_settings.enable_ui:
        console.print(f"UI at [link={ui_url}]{ui_url}[/]")

    if dev_mode:
        console.print("")
        console.print(
            "Running in development mode, for production use: [bold]docling-serve run[/]",
        )

    console.print("")
    console.print("Logs:")

    # The app is created through the factory function referenced by import string.
    server_options = {
        "host": uvicorn_settings.host,
        "port": uvicorn_settings.port,
        "reload": uvicorn_settings.reload,
        "workers": uvicorn_settings.workers,
        "root_path": uvicorn_settings.root_path,
        "proxy_headers": uvicorn_settings.proxy_headers,
    }
    uvicorn.run(app="docling_serve.app:create_app", factory=True, **server_options)
|
||||
|
||||
|
||||
# `dev` command: copies the CLI options onto the shared settings objects and
# starts the server through `_run(command="dev")`.
# - `host`, `reload` and `enable_ui` use literal defaults here ("127.0.0.1",
#   True, True) instead of the values held by the settings objects.
# - `workers` is not exposed by this command, so `uvicorn_settings.workers`
#   keeps whatever value it already had.
# NOTE(review): the docstring below is presumably rendered by Typer as the
# command help (it contains rich markup), so treat it as user-facing text.
@app.command()
def dev(
    *,
    # uvicorn options
    host: Annotated[
        str,
        typer.Option(
            help=(
                "The host to serve on. For local development in localhost "
                "use [blue]127.0.0.1[/blue]. To enable public access, "
                "e.g. in a container, use all the IP addresses "
                "available with [blue]0.0.0.0[/blue]."
            )
        ),
    ] = "127.0.0.1",
    port: Annotated[
        int,
        typer.Option(help="The port to serve on."),
    ] = uvicorn_settings.port,
    reload: Annotated[
        bool,
        typer.Option(
            help=(
                "Enable auto-reload of the server when (code) files change. "
                "This is [bold]resource intensive[/bold], "
                "use it only during development."
            )
        ),
    ] = True,
    root_path: Annotated[
        str,
        typer.Option(
            help=(
                "The root path is used to tell your app that it is being served "
                "to the outside world with some [bold]path prefix[/bold] "
                "set up in some termination proxy or similar."
            )
        ),
    ] = uvicorn_settings.root_path,
    proxy_headers: Annotated[
        bool,
        typer.Option(
            help=(
                "Enable/Disable X-Forwarded-Proto, X-Forwarded-For, "
                "X-Forwarded-Port to populate remote address info."
            )
        ),
    ] = uvicorn_settings.proxy_headers,
    # docling options
    artifacts_path: Annotated[
        Optional[Path],
        typer.Option(
            help=(
                "If set to a valid directory, "
                "the model weights will be loaded from this path."
            )
        ),
    ] = docling_serve_settings.artifacts_path,
    enable_ui: Annotated[bool, typer.Option(help="Enable the development UI.")] = True,
) -> Any:
    """
    Run a [bold]Docling Serve[/bold] app in [yellow]development[/yellow] mode. 🧪

    This is equivalent to [bold]docling-serve run[/bold] but with [bold]reload[/bold]
    enabled and listening on the [blue]127.0.0.1[/blue] address.

    Options can be set also with the corresponding ENV variable, with the exception
    of --enable-ui, --host and --reload.
    """

    # Write the resolved options back to the module-level settings objects;
    # `_run` reads the uvicorn parameters from there.
    uvicorn_settings.host = host
    uvicorn_settings.port = port
    uvicorn_settings.reload = reload
    uvicorn_settings.root_path = root_path
    uvicorn_settings.proxy_headers = proxy_headers

    docling_serve_settings.artifacts_path = artifacts_path
    docling_serve_settings.enable_ui = enable_ui

    _run(
        command="dev",
    )
|
||||
|
||||
|
||||
# `run` command: copies the CLI options onto the shared settings objects and
# starts the server through `_run(command="run")`.
# Every option defaults to the current value of the matching attribute on
# `uvicorn_settings` / `docling_serve_settings`.
# NOTE(review): the docstring below is presumably rendered by Typer as the
# command help (it contains rich markup), so treat it as user-facing text.
@app.command()
def run(
    *,
    host: Annotated[
        str,
        typer.Option(
            help=(
                "The host to serve on. For local development in localhost "
                "use [blue]127.0.0.1[/blue]. To enable public access, "
                "e.g. in a container, use all the IP addresses "
                "available with [blue]0.0.0.0[/blue]."
            )
        ),
    ] = uvicorn_settings.host,
    port: Annotated[
        int,
        typer.Option(help="The port to serve on."),
    ] = uvicorn_settings.port,
    reload: Annotated[
        bool,
        typer.Option(
            help=(
                "Enable auto-reload of the server when (code) files change. "
                "This is [bold]resource intensive[/bold], "
                "use it only during development."
            )
        ),
    ] = uvicorn_settings.reload,
    workers: Annotated[
        Union[int, None],
        typer.Option(
            help=(
                "Use multiple worker processes. "
                "Mutually exclusive with the --reload flag."
            )
        ),
    ] = uvicorn_settings.workers,
    root_path: Annotated[
        str,
        typer.Option(
            help=(
                "The root path is used to tell your app that it is being served "
                "to the outside world with some [bold]path prefix[/bold] "
                "set up in some termination proxy or similar."
            )
        ),
    ] = uvicorn_settings.root_path,
    proxy_headers: Annotated[
        bool,
        typer.Option(
            help=(
                "Enable/Disable X-Forwarded-Proto, X-Forwarded-For, "
                "X-Forwarded-Port to populate remote address info."
            )
        ),
    ] = uvicorn_settings.proxy_headers,
    # docling options
    artifacts_path: Annotated[
        Optional[Path],
        typer.Option(
            help=(
                "If set to a valid directory, "
                "the model weights will be loaded from this path."
            )
        ),
    ] = docling_serve_settings.artifacts_path,
    enable_ui: Annotated[
        bool, typer.Option(help="Enable the development UI.")
    ] = docling_serve_settings.enable_ui,
) -> Any:
    """
    Run a [bold]Docling Serve[/bold] app in [green]production[/green] mode. 🚀

    This is equivalent to [bold]docling-serve dev[/bold] but with [bold]reload[/bold]
    disabled and listening on the [blue]0.0.0.0[/blue] address.

    Options can be set also with the corresponding ENV variable, e.g. UVICORN_PORT
    or DOCLING_SERVE_ENABLE_UI.
    """

    # Write the resolved options back to the module-level settings objects;
    # `_run` reads the uvicorn parameters from there.
    uvicorn_settings.host = host
    uvicorn_settings.port = port
    uvicorn_settings.reload = reload
    uvicorn_settings.workers = workers
    uvicorn_settings.root_path = root_path
    uvicorn_settings.proxy_headers = proxy_headers

    docling_serve_settings.artifacts_path = artifacts_path
    docling_serve_settings.enable_ui = enable_ui

    _run(
        command="run",
    )
|
||||
|
||||
|
||||
def main() -> None:
    """Run the Typer application object `app`."""
    app()
|
||||
|
||||
|
||||
# Launch the CLI when the package is executed with `python -m docling_serve`.
if __name__ == "__main__":

    main()
|
||||
@@ -1,177 +1,72 @@
|
||||
import base64
|
||||
import hashlib
|
||||
import logging
|
||||
import tempfile
|
||||
from contextlib import asynccontextmanager
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Any, Dict, List, Optional, Union
|
||||
|
||||
import httpx
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DocumentStream,
|
||||
ErrorItem,
|
||||
InputFormat,
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.document_converter import DocumentConverter
|
||||
from fastapi import BackgroundTasks, FastAPI, UploadFile
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import RedirectResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling_serve.docling_conversion import (
|
||||
ConvertDocumentFileSourcesRequest,
|
||||
ConvertDocumentsOptions,
|
||||
ConvertDocumentsRequest,
|
||||
convert_documents,
|
||||
converters,
|
||||
get_pdf_pipeline_opts,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
RapidOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
from docling_serve.helper_functions import FormDepends
|
||||
from docling_serve.response_preparation import ConvertDocumentResponse, process_results
|
||||
from docling_serve.settings import docling_serve_settings
|
||||
|
||||
|
||||
# Set up custom logging, as our output will be intermixed with FastAPI/Uvicorn's logging
|
||||
class ColoredLogFormatter(logging.Formatter):
    """Formatter that wraps the record's level name in an ANSI colour code.

    The colour is chosen from ``COLOR_CODES`` by ``record.levelno``; levels
    without an entry get no colour prefix (the reset code is still appended).
    """

    COLOR_CODES = {
        logging.DEBUG: "\033[94m",  # Blue
        logging.INFO: "\033[92m",  # Green
        logging.WARNING: "\033[93m",  # Yellow
        logging.ERROR: "\033[91m",  # Red
        logging.CRITICAL: "\033[95m",  # Magenta
    }
    RESET_CODE = "\033[0m"

    def format(self, record):
        """Return the formatted record with a coloured ``levelname``.

        The record object is shared by every handler that processes it, so the
        original ``levelname`` is restored afterwards. Otherwise the escape
        codes would leak into other handlers and pile up each time the same
        record is formatted again.
        """
        color = self.COLOR_CODES.get(record.levelno, "")
        plain_levelname = record.levelname
        record.levelname = f"{color}{plain_levelname}{self.RESET_CODE}"
        try:
            return super().format(record)
        finally:
            record.levelname = plain_levelname
|
||||
|
||||
|
||||
# Configure the root logger when this module is imported. The format string
# set here is later reused when the handlers' formatters are replaced with
# ColoredLogFormatter.
logging.basicConfig(
    level=logging.INFO,  # Set the logging level
    format="%(levelname)s:\t%(asctime)s - %(name)s - %(message)s",
    datefmt="%H:%M:%S",
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.utils.profiling import ProfilingItem
|
||||
from docling_core.types.doc import DoclingDocument, ImageRefMode
|
||||
from docling_core.utils.file import resolve_remote_filename
|
||||
from fastapi import FastAPI, HTTPException, Response
|
||||
from pydantic import AnyHttpUrl, BaseModel
|
||||
|
||||
|
||||
# TODO: import enum from Docling, once it is exposed
|
||||
class OcrEngine(str, Enum):
|
||||
EASYOCR = "easyocr"
|
||||
TESSERACT = "tesseract"
|
||||
RAPIDOCR = "rapidocr"
|
||||
|
||||
|
||||
class ConvertOptions(BaseModel):
|
||||
output_docling_document: bool = True
|
||||
output_markdown: bool = False
|
||||
output_html: bool = False
|
||||
do_ocr: bool = True
|
||||
ocr_engine: OcrEngine = OcrEngine.EASYOCR
|
||||
ocr_lang: Optional[List[str]] = None
|
||||
force_ocr: bool = False
|
||||
do_table_structure: bool = True
|
||||
include_images: bool = True
|
||||
images_scale: float = 2.0
|
||||
|
||||
|
||||
class DocumentConvertBase(BaseModel):
|
||||
options: ConvertOptions = ConvertOptions()
|
||||
|
||||
|
||||
class HttpSource(BaseModel):
|
||||
url: str
|
||||
headers: Dict[str, Any] = {}
|
||||
|
||||
|
||||
class FileSource(BaseModel):
|
||||
base64_string: str
|
||||
filename: str
|
||||
|
||||
|
||||
class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
|
||||
http_source: HttpSource
|
||||
|
||||
|
||||
class ConvertDocumentFileSourceRequest(DocumentConvertBase):
|
||||
file_source: FileSource
|
||||
|
||||
|
||||
class DocumentResponse(BaseModel):
|
||||
markdown: Optional[str] = None
|
||||
docling_document: Optional[DoclingDocument] = None
|
||||
html: Optional[str] = None
|
||||
|
||||
|
||||
class ConvertDocumentResponse(BaseModel):
|
||||
document: DocumentResponse
|
||||
status: ConversionStatus
|
||||
errors: List[ErrorItem] = []
|
||||
timings: Dict[str, ProfilingItem] = {}
|
||||
|
||||
|
||||
class ConvertDocumentErrorResponse(BaseModel):
|
||||
status: ConversionStatus
|
||||
# errors: List[ErrorItem] = []
|
||||
|
||||
|
||||
ConvertDocumentRequest = Union[
|
||||
ConvertDocumentFileSourceRequest, ConvertDocumentHttpSourceRequest
|
||||
]
|
||||
|
||||
|
||||
class MarkdownTextResponse(Response):
|
||||
media_type = "text/markdown"
|
||||
|
||||
|
||||
class HealthCheckResponse(BaseModel):
|
||||
status: str = "ok"
|
||||
|
||||
|
||||
def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
|
||||
|
||||
if options.ocr_engine == OcrEngine.EASYOCR:
|
||||
try:
|
||||
import easyocr # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={options.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=options.force_ocr)
|
||||
elif options.ocr_engine == OcrEngine.TESSERACT:
|
||||
try:
|
||||
import tesserocr # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={options.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=options.force_ocr)
|
||||
elif options.ocr_engine == OcrEngine.RAPIDOCR:
|
||||
try:
|
||||
from rapidocr_onnxruntime import RapidOCR # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={options.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options = RapidOcrOptions(force_full_page_ocr=options.force_ocr)
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")
|
||||
|
||||
if options.ocr_lang is not None:
|
||||
ocr_options.lang = options.ocr_lang
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=options.do_ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=options.do_table_structure,
|
||||
generate_page_images=options.include_images,
|
||||
generate_picture_images=options.include_images,
|
||||
images_scale=options.images_scale,
|
||||
)
|
||||
|
||||
options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()
|
||||
|
||||
return pipeline_options, options_hash
|
||||
|
||||
|
||||
converters: Dict[str, DocumentConverter] = {}
|
||||
|
||||
# Override the formatter with the custom ColoredLogFormatter
root_logger = logging.getLogger()  # Get the root logger
for handler in root_logger.handlers:  # Iterate through existing handlers
    if handler.formatter:
        # Rebuild the formatter from the existing one. `_fmt` is a private
        # attribute of logging.Formatter (there is no public accessor for the
        # format string). `datefmt` is carried over as well, otherwise the
        # configured date format would be replaced by the logging default.
        handler.setFormatter(
            ColoredLogFormatter(handler.formatter._fmt, handler.formatter.datefmt)
        )

_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Context manager to initialize and clean up the lifespan of the FastAPI app
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
# settings = Settings()
|
||||
|
||||
# Converter with default options
|
||||
pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
|
||||
pdf_format_option, options_hash = get_pdf_pipeline_opts(ConvertDocumentsOptions())
|
||||
converters[options_hash] = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
InputFormat.PDF: pdf_format_option,
|
||||
InputFormat.IMAGE: pdf_format_option,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -180,100 +75,156 @@ async def lifespan(app: FastAPI):
|
||||
yield
|
||||
|
||||
converters.clear()
|
||||
# if WITH_UI:
|
||||
# gradio_ui.close()
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="Docling Serve",
|
||||
lifespan=lifespan,
|
||||
)
|
||||
##################################
|
||||
# App creation and configuration #
|
||||
##################################
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
def health() -> HealthCheckResponse:
|
||||
return HealthCheckResponse()
|
||||
def create_app():
|
||||
app = FastAPI(
|
||||
title="Docling Serve",
|
||||
lifespan=lifespan,
|
||||
)
|
||||
|
||||
origins = ["*"]
|
||||
methods = ["*"]
|
||||
headers = ["*"]
|
||||
|
||||
def _convert_document(
|
||||
body: ConvertDocumentRequest,
|
||||
) -> ConversionResult:
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=origins,
|
||||
allow_credentials=True,
|
||||
allow_methods=methods,
|
||||
allow_headers=headers,
|
||||
)
|
||||
|
||||
filename: str
|
||||
buf: BytesIO
|
||||
# Mount the Gradio app
|
||||
if docling_serve_settings.enable_ui:
|
||||
|
||||
if isinstance(body, ConvertDocumentFileSourceRequest):
|
||||
buf = BytesIO(base64.b64decode(body.file_source.base64_string))
|
||||
filename = body.file_source.filename
|
||||
elif isinstance(body, ConvertDocumentHttpSourceRequest):
|
||||
http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
|
||||
buf = BytesIO(http_res.content)
|
||||
filename = resolve_remote_filename(
|
||||
http_url=AnyHttpUrl(body.http_source.url),
|
||||
response_headers=dict(**http_res.headers),
|
||||
try:
|
||||
import gradio as gr
|
||||
|
||||
from docling_serve.gradio_ui import ui as gradio_ui
|
||||
|
||||
tmp_output_dir = Path(tempfile.mkdtemp())
|
||||
gradio_ui.gradio_output_dir = tmp_output_dir
|
||||
app = gr.mount_gradio_app(
|
||||
app,
|
||||
gradio_ui,
|
||||
path="/ui",
|
||||
allowed_paths=["./logo.png", tmp_output_dir],
|
||||
root_path="/ui",
|
||||
)
|
||||
except ImportError:
|
||||
_log.warning(
|
||||
"Docling Serve enable_ui is activated, but gradio is not installed. "
|
||||
"Install it with `pip install docling-serve[ui]` "
|
||||
"or `pip install gradio`"
|
||||
)
|
||||
|
||||
#############################
|
||||
# API Endpoints definitions #
|
||||
#############################
|
||||
|
||||
# Favicon
|
||||
@app.get("/favicon.ico", include_in_schema=False)
|
||||
async def favicon():
|
||||
response = RedirectResponse(
|
||||
url="https://ds4sd.github.io/docling/assets/logo.png"
|
||||
)
|
||||
return response
|
||||
|
||||
doc_input = DocumentStream(name=filename, stream=buf)
|
||||
# Status
|
||||
class HealthCheckResponse(BaseModel):
|
||||
status: str = "ok"
|
||||
|
||||
pipeline_options, options_hash = get_pdf_pipeline_opts(body.options)
|
||||
if options_hash not in converters:
|
||||
converters[options_hash] = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
@app.get("/health")
|
||||
def health() -> HealthCheckResponse:
|
||||
return HealthCheckResponse()
|
||||
|
||||
# API readiness compatibility for OpenShift AI Workbench
|
||||
@app.get("/api", include_in_schema=False)
|
||||
def api_check() -> HealthCheckResponse:
|
||||
return HealthCheckResponse()
|
||||
|
||||
# Convert a document from URL(s)
|
||||
@app.post(
|
||||
"/v1alpha/convert/source",
|
||||
response_model=ConvertDocumentResponse,
|
||||
responses={
|
||||
200: {
|
||||
"content": {"application/zip": {}},
|
||||
# "description": "Return the JSON item or an image.",
|
||||
}
|
||||
},
|
||||
)
|
||||
def process_url(
|
||||
background_tasks: BackgroundTasks, conversion_request: ConvertDocumentsRequest
|
||||
):
|
||||
sources: List[Union[str, DocumentStream]] = []
|
||||
headers: Optional[Dict[str, Any]] = None
|
||||
if isinstance(conversion_request, ConvertDocumentFileSourcesRequest):
|
||||
for file_source in conversion_request.file_sources:
|
||||
sources.append(file_source.to_document_stream())
|
||||
else:
|
||||
for http_source in conversion_request.http_sources:
|
||||
sources.append(http_source.url)
|
||||
if headers is None and http_source.headers:
|
||||
headers = http_source.headers
|
||||
|
||||
# Note: results are only an iterator->lazy evaluation
|
||||
results = convert_documents(
|
||||
sources=sources, options=conversion_request.options, headers=headers
|
||||
)
|
||||
|
||||
result: ConversionResult = converters[options_hash].convert(doc_input)
|
||||
|
||||
if result is None or result.status == ConversionStatus.SKIPPED:
|
||||
raise HTTPException(status_code=400, detail=result.errors)
|
||||
|
||||
if result is None or result.status not in {
|
||||
ConversionStatus.SUCCESS,
|
||||
}:
|
||||
raise HTTPException(
|
||||
status_code=500, detail={"errors": result.errors, "status": result.status}
|
||||
# The real processing will happen here
|
||||
response = process_results(
|
||||
background_tasks=background_tasks,
|
||||
conversion_options=conversion_request.options,
|
||||
conv_results=results,
|
||||
)
|
||||
|
||||
return result
|
||||
return response
|
||||
|
||||
|
||||
@app.post(
|
||||
"/convert",
|
||||
)
|
||||
def convert_document(
|
||||
body: ConvertDocumentRequest,
|
||||
) -> ConvertDocumentResponse:
|
||||
|
||||
result = _convert_document(body=body)
|
||||
|
||||
image_mode = (
|
||||
ImageRefMode.EMBEDDED
|
||||
if body.options.include_images
|
||||
else ImageRefMode.PLACEHOLDER
|
||||
# Convert a document from file(s)
|
||||
@app.post(
|
||||
"/v1alpha/convert/file",
|
||||
response_model=ConvertDocumentResponse,
|
||||
responses={
|
||||
200: {
|
||||
"content": {"application/zip": {}},
|
||||
}
|
||||
},
|
||||
)
|
||||
doc_resp = DocumentResponse()
|
||||
if body.options.output_docling_document:
|
||||
doc_resp.docling_document = result.document
|
||||
if body.options.output_markdown:
|
||||
doc_resp.markdown = result.document.export_to_markdown(image_mode=image_mode)
|
||||
if body.options.output_html:
|
||||
doc_resp.html = result.document.export_to_html(image_mode=image_mode)
|
||||
async def process_file(
|
||||
background_tasks: BackgroundTasks,
|
||||
files: List[UploadFile],
|
||||
options: Annotated[
|
||||
ConvertDocumentsOptions, FormDepends(ConvertDocumentsOptions)
|
||||
],
|
||||
):
|
||||
|
||||
return ConvertDocumentResponse(
|
||||
document=doc_resp, status=result.status, timings=result.timings
|
||||
)
|
||||
_log.info(f"Received {len(files)} files for processing.")
|
||||
|
||||
# Load the uploaded files to Docling DocumentStream
|
||||
file_sources = []
|
||||
for file in files:
|
||||
buf = BytesIO(file.file.read())
|
||||
name = file.filename if file.filename else "file.pdf"
|
||||
file_sources.append(DocumentStream(name=name, stream=buf))
|
||||
|
||||
@app.post("/convert/markdown", response_class=MarkdownTextResponse)
|
||||
def convert_document_md(
|
||||
body: ConvertDocumentRequest,
|
||||
) -> MarkdownTextResponse:
|
||||
result = _convert_document(body=body)
|
||||
image_mode = (
|
||||
ImageRefMode.EMBEDDED
|
||||
if body.options.include_images
|
||||
else ImageRefMode.PLACEHOLDER
|
||||
)
|
||||
return MarkdownTextResponse(
|
||||
result.document.export_to_markdown(image_mode=image_mode)
|
||||
)
|
||||
results = convert_documents(sources=file_sources, options=options)
|
||||
|
||||
response = process_results(
|
||||
background_tasks=background_tasks,
|
||||
conversion_options=options,
|
||||
conv_results=results,
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
return app
|
||||
|
||||
430
docling_serve/docling_conversion.py
Normal file
430
docling_serve/docling_conversion.py
Normal file
@@ -0,0 +1,430 @@
|
||||
import base64
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import (
|
||||
Annotated,
|
||||
Any,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Type,
|
||||
Union,
|
||||
)
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat, OutputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrEngine,
|
||||
OcrOptions,
|
||||
PdfBackend,
|
||||
PdfPipelineOptions,
|
||||
RapidOcrOptions,
|
||||
TableFormerMode,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
from fastapi import HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from docling_serve.helper_functions import _to_list_of_strings
|
||||
from docling_serve.settings import docling_serve_settings
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Define the input options for the API.
# Each field carries its description and examples in `Field(...)` metadata and
# its default as a plain assignment. Documented with comments rather than a
# class docstring, because pydantic would publish a docstring as the model
# description in the generated schema.
class ConvertDocumentsOptions(BaseModel):
    from_formats: Annotated[
        List[InputFormat],
        Field(
            description=(
                "Input format(s) to convert from. String or list of strings. "
                f"Allowed values: {', '.join([v.value for v in InputFormat])}. "
                "Optional, defaults to all formats."
            ),
            examples=[[v.value for v in InputFormat]],
        ),
    ] = list(InputFormat)

    to_formats: Annotated[
        List[OutputFormat],
        Field(
            description=(
                "Output format(s) to convert to. String or list of strings. "
                f"Allowed values: {', '.join([v.value for v in OutputFormat])}. "
                "Optional, defaults to Markdown."
            ),
            examples=[[OutputFormat.MARKDOWN]],
        ),
    ] = [OutputFormat.MARKDOWN]

    image_export_mode: Annotated[
        ImageRefMode,
        Field(
            description=(
                "Image export mode for the document (in case of JSON,"
                " Markdown or HTML). "
                f"Allowed values: {', '.join([v.value for v in ImageRefMode])}. "
                "Optional, defaults to Embedded."
            ),
            examples=[ImageRefMode.EMBEDDED.value],
            # pattern="embedded|placeholder|referenced",
        ),
    ] = ImageRefMode.EMBEDDED

    do_ocr: Annotated[
        bool,
        Field(
            description=(
                "If enabled, the bitmap content will be processed using OCR. "
                "Boolean. Optional, defaults to true"
            ),
            # examples=[True],
        ),
    ] = True

    force_ocr: Annotated[
        bool,
        Field(
            description=(
                "If enabled, replace existing text with OCR-generated "
                "text over content. Boolean. Optional, defaults to false."
            ),
            # examples=[False],
        ),
    ] = False

    # TODO: use a restricted list based on what is installed on the system
    ocr_engine: Annotated[
        OcrEngine,
        Field(
            description=(
                "The OCR engine to use. String. "
                "Allowed values: easyocr, tesseract, rapidocr. "
                "Optional, defaults to easyocr."
            ),
            examples=[OcrEngine.EASYOCR],
        ),
    ] = OcrEngine.EASYOCR

    ocr_lang: Annotated[
        Optional[List[str]],
        Field(
            description=(
                "List of languages used by the OCR engine. "
                "Note that each OCR engine has "
                "different values for the language names. String or list of strings. "
                "Optional, defaults to empty."
            ),
            examples=[["fr", "de", "es", "en"]],
        ),
    ] = None

    pdf_backend: Annotated[
        PdfBackend,
        Field(
            description=(
                "The PDF backend to use. String. "
                f"Allowed values: {', '.join([v.value for v in PdfBackend])}. "
                f"Optional, defaults to {PdfBackend.DLPARSE_V2.value}."
            ),
            examples=[PdfBackend.DLPARSE_V2],
        ),
    ] = PdfBackend.DLPARSE_V2

    # The default is given once, by assignment, like every other field
    # (it used to be repeated as the first positional argument of Field).
    table_mode: Annotated[
        TableFormerMode,
        Field(
            description=(
                "Mode to use for table structure, String. "
                f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. "
                "Optional, defaults to fast."
            ),
            examples=[TableFormerMode.FAST],
            # pattern="fast|accurate",
        ),
    ] = TableFormerMode.FAST

    abort_on_error: Annotated[
        bool,
        Field(
            description=(
                "Abort on error if enabled. Boolean. Optional, defaults to false."
            ),
            # examples=[False],
        ),
    ] = False

    return_as_file: Annotated[
        bool,
        Field(
            description=(
                "Return the output as a zip file "
                "(will happen anyway if multiple files are generated). "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    do_table_structure: Annotated[
        bool,
        Field(
            description=(
                "If enabled, the table structure will be extracted. "
                "Boolean. Optional, defaults to true."
            ),
            examples=[True],
        ),
    ] = True

    include_images: Annotated[
        bool,
        Field(
            description=(
                "If enabled, images will be extracted from the document. "
                "Boolean. Optional, defaults to true."
            ),
            examples=[True],
        ),
    ] = True

    images_scale: Annotated[
        float,
        Field(
            description="Scale factor for images. Float. Optional, defaults to 2.0.",
            examples=[2.0],
        ),
    ] = 2.0
|
||||
|
||||
|
||||
# Common base of the request models: carries the conversion options, which
# default to a ConvertDocumentsOptions built with all of its defaults.
class DocumentsConvertBase(BaseModel):
    options: ConvertDocumentsOptions = ConvertDocumentsOptions()
|
||||
|
||||
|
||||
# One remote document to convert: the URL plus optional request headers.
class HttpSource(BaseModel):
    # Required: there is no default value.
    url: Annotated[
        str,
        Field(
            description="HTTP url to process",
            examples=["https://arxiv.org/pdf/2206.01062"],
        ),
    ]
    # Optional: defaults to an empty mapping.
    headers: Annotated[
        Dict[str, Any],
        Field(
            description="Additional headers used to fetch the urls, "
            "e.g. authorization, agent, etc"
        ),
    ] = {}
|
||||
|
||||
|
||||
# One inline document to convert: base64-encoded content plus its filename.
class FileSource(BaseModel):
    base64_string: Annotated[
        str,
        Field(
            description="Content of the file serialized in base64. "
            "For example it can be obtained via "
            "`base64 -w 0 /path/to/file/pdf-to-convert.pdf`."
        ),
    ]
    filename: Annotated[
        str,
        Field(description="Filename of the uploaded document", examples=["file.pdf"]),
    ]

    def to_document_stream(self) -> DocumentStream:
        """Decode ``base64_string`` and wrap the bytes in a ``DocumentStream``.

        The stream is an in-memory ``BytesIO`` and is named after ``filename``.
        """
        buf = BytesIO(base64.b64decode(self.base64_string))
        return DocumentStream(stream=buf, name=self.filename)
|
||||
|
||||
|
||||
# Request body listing remote documents; `options` is inherited from
# DocumentsConvertBase.
class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase):
    http_sources: List[HttpSource]
|
||||
|
||||
|
||||
# Request body listing base64-encoded documents; `options` is inherited from
# DocumentsConvertBase.
class ConvertDocumentFileSourcesRequest(DocumentsConvertBase):
    file_sources: List[FileSource]
|
||||
|
||||
|
||||
# A conversion request is either the file-sources variant or the
# http-sources variant.
ConvertDocumentsRequest = Union[
    ConvertDocumentFileSourcesRequest, ConvertDocumentHttpSourcesRequest
]
|
||||
|
||||
|
||||
# Document converters will be preloaded and stored in a dictionary.
# Keys are annotated as bytes, matching the hash type in the return
# annotation of get_pdf_pipeline_opts.
converters: Dict[bytes, DocumentConverter] = {}
|
||||
|
||||
|
||||
# Custom serializer for PdfFormatOption
|
||||
# (model_dump_json does not work with some classes)
|
||||
def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
|
||||
data = pdf_format_option.model_dump()
|
||||
|
||||
# pipeline_options are not fully serialized by model_dump, dedicated pass
|
||||
if pdf_format_option.pipeline_options:
|
||||
data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump()
|
||||
|
||||
# Replace `artifacts_path` with a string representation
|
||||
data["pipeline_options"]["artifacts_path"] = repr(
|
||||
data["pipeline_options"]["artifacts_path"]
|
||||
)
|
||||
|
||||
# Replace `pipeline_cls` with a string representation
|
||||
data["pipeline_cls"] = repr(data["pipeline_cls"])
|
||||
|
||||
# Replace `backend` with a string representation
|
||||
data["backend"] = repr(data["backend"])
|
||||
|
||||
# Handle `device` in `accelerator_options`
|
||||
if "accelerator_options" in data and "device" in data["accelerator_options"]:
|
||||
data["accelerator_options"]["device"] = repr(
|
||||
data["accelerator_options"]["device"]
|
||||
)
|
||||
|
||||
# Serialize the dictionary to JSON with sorted keys to have consistent hashes
|
||||
return json.dumps(data, sort_keys=True)
|
||||
|
||||
|
||||
def _ocr_engine_unavailable(ocr_engine: OcrEngine) -> HTTPException:
    """Build the HTTP 400 error raised when the requested OCR engine cannot be imported."""
    return HTTPException(
        status_code=400,
        detail="The requested OCR engine"
        f" (ocr_engine={ocr_engine.value})"
        " is not available on this system. Please choose another OCR engine "
        "or contact your system administrator.",
    )


# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
def get_pdf_pipeline_opts(  # noqa: C901
    request: ConvertDocumentsOptions,
) -> Tuple[PdfFormatOption, bytes]:
    """Translate the request options into a ``PdfFormatOption`` and its hash.

    Args:
        request: Conversion options received from the client.

    Returns:
        A tuple ``(pdf_format_option, options_hash)`` where ``options_hash`` is
        the SHA-1 digest of the serialized format option. ``convert_documents``
        uses it as the key of its converter cache.

    Raises:
        HTTPException: 400 when the package backing the requested OCR engine
            cannot be imported.
        RuntimeError: When the OCR engine or the PDF backend is not one of the
            values handled here.
    """
    # The imports below only probe whether the optional OCR package is installed.
    if request.ocr_engine == OcrEngine.EASYOCR:
        try:
            import easyocr  # noqa: F401
        except ImportError as exc:
            raise _ocr_engine_unavailable(request.ocr_engine) from exc
        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=request.force_ocr)
    elif request.ocr_engine == OcrEngine.TESSERACT:
        try:
            import tesserocr  # noqa: F401
        except ImportError as exc:
            raise _ocr_engine_unavailable(request.ocr_engine) from exc
        ocr_options = TesseractOcrOptions(force_full_page_ocr=request.force_ocr)
    elif request.ocr_engine == OcrEngine.RAPIDOCR:
        try:
            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
        except ImportError as exc:
            raise _ocr_engine_unavailable(request.ocr_engine) from exc
        ocr_options = RapidOcrOptions(force_full_page_ocr=request.force_ocr)
    else:
        raise RuntimeError(f"Unexpected OCR engine type {request.ocr_engine}")

    if request.ocr_lang is not None:
        if isinstance(request.ocr_lang, str):
            ocr_options.lang = _to_list_of_strings(request.ocr_lang)
        else:
            ocr_options.lang = request.ocr_lang

    pipeline_options = PdfPipelineOptions(
        do_ocr=request.do_ocr,
        ocr_options=ocr_options,
        do_table_structure=request.do_table_structure,
    )
    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
    pipeline_options.table_structure_options.mode = TableFormerMode(request.table_mode)

    # Page images are only generated when the images will actually be exported.
    if request.image_export_mode != ImageRefMode.PLACEHOLDER:
        pipeline_options.generate_page_images = True
        if request.images_scale:
            pipeline_options.images_scale = request.images_scale

    if request.pdf_backend == PdfBackend.DLPARSE_V1:
        backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
    elif request.pdf_backend == PdfBackend.DLPARSE_V2:
        backend = DoclingParseV2DocumentBackend
    elif request.pdf_backend == PdfBackend.PYPDFIUM2:
        backend = PyPdfiumDocumentBackend
    else:
        raise RuntimeError(f"Unexpected PDF backend type {request.pdf_backend}")

    if docling_serve_settings.artifacts_path is not None:
        # NOTE(review): an absolute path is never rendered as an empty string,
        # so this branch looks unreachable - confirm the intended way to
        # detect an "empty" artifacts_path setting.
        if str(docling_serve_settings.artifacts_path.absolute()) == "":
            _log.info(
                "artifacts_path is an empty path, model weights will be downloaded "
                "at runtime."
            )
            pipeline_options.artifacts_path = None
        elif docling_serve_settings.artifacts_path.is_dir():
            _log.info(
                "artifacts_path is set to a valid directory. "
                "No model weights will be downloaded at runtime."
            )
            pipeline_options.artifacts_path = docling_serve_settings.artifacts_path
        else:
            _log.warning(
                "artifacts_path is set to an invalid directory. "
                "The system will download the model weights at runtime."
            )
            pipeline_options.artifacts_path = None
    else:
        _log.info(
            "artifacts_path is unset. "
            "The system will download the model weights at runtime."
        )

    pdf_format_option = PdfFormatOption(
        pipeline_options=pipeline_options,
        backend=backend,
    )

    serialized_data = _serialize_pdf_format_option(pdf_format_option)

    options_hash = hashlib.sha1(serialized_data.encode()).digest()

    return pdf_format_option, options_hash
|
||||
|
||||
|
||||
def convert_documents(
    sources: Iterable[Union[Path, str, DocumentStream]],
    options: ConvertDocumentsOptions,
    headers: Optional[Dict[str, Any]] = None,
):
    """Convert ``sources`` with a converter matching ``options``.

    Converters are kept in the module-level ``converters`` mapping, keyed by
    the hash of the PDF pipeline options, so that requests with identical
    options reuse the same ``DocumentConverter``.

    Args:
        sources: Paths, URLs or in-memory streams to convert.
        options: Conversion options of the request.
        headers: Optional headers forwarded to ``convert_all``.

    Returns:
        The iterator of conversion results produced by ``convert_all``.
    """
    pdf_format_option, options_hash = get_pdf_pipeline_opts(options)

    if options_hash not in converters:
        # PDFs and images share the same pipeline configuration.
        converters[options_hash] = DocumentConverter(
            format_options={
                InputFormat.PDF: pdf_format_option,
                InputFormat.IMAGE: pdf_format_option,
            }
        )
        _log.info(f"We now have {len(converters)} converters in memory.")

    converter = converters[options_hash]
    return converter.convert_all(sources, headers=headers)
|
||||
669
docling_serve/gradio_ui.py
Normal file
669
docling_serve/gradio_ui.py
Normal file
@@ -0,0 +1,669 @@
|
||||
import importlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import gradio as gr
|
||||
import requests
|
||||
|
||||
from docling_serve.helper_functions import _to_list_of_strings
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

##############################
# Head JS for web components #
##############################
# Passed as ``head=`` to ``gr.Blocks``; loads the docling-components module
# (version 0.0.3) from unpkg.
head = """
    <script src="https://unpkg.com/@docling/docling-components@0.0.3" type="module"></script>
"""
|
||||
|
||||
#################
|
||||
# CSS and theme #
|
||||
#################
|
||||
|
||||
css = """
|
||||
#logo {
|
||||
border-style: none;
|
||||
background: none;
|
||||
box-shadow: none;
|
||||
min-width: 80px;
|
||||
}
|
||||
#dark_mode_column {
|
||||
display: flex;
|
||||
align-content: flex-end;
|
||||
}
|
||||
#title {
|
||||
text-align: left;
|
||||
display:block;
|
||||
height: auto;
|
||||
padding-top: 5px;
|
||||
line-height: 0;
|
||||
}
|
||||
.title-text h1 > p, .title-text p {
|
||||
margin-top: 0px !important;
|
||||
margin-bottom: 2px !important;
|
||||
}
|
||||
#custom-container {
|
||||
border: 0.909091px solid;
|
||||
padding: 10px;
|
||||
border-radius: 4px;
|
||||
}
|
||||
#custom-container h4 {
|
||||
font-size: 14px;
|
||||
}
|
||||
#file_input_zone {
|
||||
height: 140px;
|
||||
}
|
||||
|
||||
docling-img::part(pages) {
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
docling-img::part(page) {
|
||||
box-shadow: 0 0.5rem 1rem 0 rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
"""
|
||||
|
||||
# Gradio theme passed as ``theme=`` to ``gr.Blocks``: default theme with
# medium text/spacing and Red Hat fonts, followed by generic fallbacks.
theme = gr.themes.Default(
    text_size="md",
    spacing_size="md",
    font=[
        gr.themes.GoogleFont("Red Hat Display"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
    font_mono=[
        gr.themes.GoogleFont("Red Hat Mono"),
        "ui-monospace",
        "Consolas",
        "monospace",
    ],
)
|
||||
|
||||
#############
|
||||
# Variables #
|
||||
#############
|
||||
|
||||
# Directory in which downloadable outputs are created; read by
# ``response_to_output`` as the parent of its temporary directories.
gradio_output_dir = None  # Will be set by FastAPI when mounted
# NOTE(review): ``response_to_output`` assigns a local variable of the same
# name, so this module-level value does not appear to be updated from there.
file_output_path = None  # Will be set when a new file is generated
|
||||
|
||||
#############
|
||||
# Functions #
|
||||
#############
|
||||
|
||||
|
||||
def health_check():
    """Query the local ``/health`` endpoint and report the service state.

    The port is read from the ``PORT`` environment variable (default 5001).

    Returns:
        ``"Healthy"`` when the endpoint answers with HTTP 200, ``"Unhealthy"``
        for any other status code or when the request fails or times out.
    """
    try:
        response = requests.get(
            f"http://localhost:{int(os.getenv('PORT', '5001'))}/health",
            # Without a timeout an unresponsive server would block forever.
            timeout=10,
        )
    except requests.RequestException:
        # An unreachable server is reported as unhealthy instead of raising.
        return "Unhealthy"
    if response.status_code == 200:
        return "Healthy"
    return "Unhealthy"
|
||||
|
||||
|
||||
def set_options_visibility(x):
    """Return an "Options" accordion update whose ``open`` state is ``x``."""
    options_accordion = gr.Accordion("Options", open=x)
    return options_accordion
|
||||
|
||||
|
||||
def set_outputs_visibility_direct(x, y):
    """Return row updates for the content output (visible=x) and file output (visible=y)."""
    return gr.Row(visible=x), gr.Row(visible=y)
|
||||
|
||||
|
||||
def set_outputs_visibility_process(x):
    """Show exactly one output row: the file row when ``x`` is truthy, else the content row."""
    show_file = x
    return gr.Row(visible=not show_file), gr.Row(visible=show_file)
|
||||
|
||||
|
||||
def set_download_button_label(label_text: gr.State):
    """Return a download-button update labelled with ``str(label_text)``."""
    button_label = str(label_text)
    return gr.DownloadButton(label=button_label, scale=1)
|
||||
|
||||
|
||||
def clear_outputs():
    """Return empty values for the eight content output components.

    The positions are: markdown, rendered markdown, JSON, rendered JSON,
    HTML, rendered HTML, text and doctags.
    """
    return ("",) * 8
|
||||
|
||||
|
||||
def clear_url_input():
    """Return the empty string used to reset the URL textbox."""
    empty_url = ""
    return empty_url
|
||||
|
||||
|
||||
def clear_file_input():
    """Return ``None``, the value used to reset the file upload component."""
    no_files = None
    return no_files
|
||||
|
||||
|
||||
def auto_set_return_as_file(url_input, file_input, image_export_mode):
    """Decide whether the result has to be returned as a file.

    Returns ``True`` when the URL field holds more than one comma-separated
    entry, when more than one file is uploaded, or when images are exported in
    ``"referenced"`` mode; ``False`` otherwise.
    """
    several_urls = len(url_input.split(",")) > 1
    several_files = bool(file_input) and len(file_input) > 1
    referenced_images = image_export_mode == "referenced"
    return several_urls or several_files or referenced_images
|
||||
|
||||
|
||||
def change_ocr_lang(ocr_engine):
    """Return the default language list, in the selected OCR engine's format.

    Returns ``None`` when ``ocr_engine`` is not one of the known engines.
    """
    default_languages = (
        ("easyocr", "en,fr,de,es"),
        ("tesseract_cli", "eng,fra,deu,spa"),
        ("tesseract", "eng,fra,deu,spa"),
        ("rapidocr", "english,chinese"),
    )
    for engine, languages in default_languages:
        if ocr_engine == engine:
            return languages
    return None
|
||||
|
||||
|
||||
def process_url(
    input_sources,
    to_formats,
    image_export_mode,
    ocr,
    force_ocr,
    ocr_engine,
    ocr_lang,
    pdf_backend,
    table_mode,
    abort_on_error,
    return_as_file,
):
    """Send the URL(s) to the local ``/v1alpha/convert/source`` endpoint.

    Args:
        input_sources: Comma-separated URLs typed by the user.
        to_formats, image_export_mode, ocr, force_ocr, ocr_engine, ocr_lang,
        pdf_backend, table_mode, abort_on_error, return_as_file: Conversion
            options forwarded in the ``options`` object of the request.

    Returns:
        The tuple produced by ``response_to_output``.

    Raises:
        gr.Error: When no URL is provided, when the request fails, or when the
            server answers with a status code other than 200.
    """
    # Tolerate whitespace around the commas and ignore empty entries
    # (e.g. a trailing comma) instead of sending blank URLs to the server.
    urls = [url.strip() for url in (input_sources or "").split(",")]
    urls = [url for url in urls if url]
    if not urls:
        logger.error("No input sources provided.")
        raise gr.Error("No input sources provided.", print_exception=False)

    parameters = {
        "http_sources": [{"url": url} for url in urls],
        "options": {
            "to_formats": to_formats,
            "image_export_mode": image_export_mode,
            "ocr": ocr,
            "force_ocr": force_ocr,
            "ocr_engine": ocr_engine,
            "ocr_lang": _to_list_of_strings(ocr_lang),
            "pdf_backend": pdf_backend,
            "table_mode": table_mode,
            "abort_on_error": abort_on_error,
            "return_as_file": return_as_file,
        },
    }

    try:
        response = requests.post(
            f"http://localhost:{int(os.getenv('PORT', '5001'))}/v1alpha/convert/source",
            json=parameters,
        )
    except Exception as e:
        logger.error(f"Error processing URL: {e}")
        raise gr.Error(f"Error processing URL: {e}", print_exception=False) from e

    if response.status_code != 200:
        # The error body is not guaranteed to be a JSON object.
        try:
            data = response.json()
        except ValueError:
            data = None
        if isinstance(data, dict):
            error_message = data.get("detail", "An unknown error occurred.")
        else:
            error_message = response.text or "An unknown error occurred."
        logger.error(f"Error processing URL: {error_message}")
        raise gr.Error(f"Error processing URL: {error_message}", print_exception=False)

    return response_to_output(response, return_as_file)
|
||||
|
||||
|
||||
def process_file(
    files,
    to_formats,
    image_export_mode,
    ocr,
    force_ocr,
    ocr_engine,
    ocr_lang,
    pdf_backend,
    table_mode,
    abort_on_error,
    return_as_file,
):
    """Upload the file(s) to the local ``/v1alpha/convert/file`` endpoint.

    Args:
        files: Uploaded files; each item exposes its path as ``.name``.
        to_formats, image_export_mode, ocr, force_ocr, ocr_engine, ocr_lang,
        pdf_backend, table_mode, abort_on_error, return_as_file: Conversion
            options sent as form fields (booleans as ``"true"``/``"false"``).

    Returns:
        The tuple produced by ``response_to_output``.

    Raises:
        gr.Error: When no file is provided, when the request fails, or when the
            server answers with a status code other than 200.
    """
    from contextlib import ExitStack

    if not files or files[0] == "":
        logger.error("No files provided.")
        raise gr.Error("No files provided.", print_exception=False)

    parameters = {
        "to_formats": to_formats,
        "image_export_mode": image_export_mode,
        "ocr": str(ocr).lower(),
        "force_ocr": str(force_ocr).lower(),
        "ocr_engine": ocr_engine,
        "ocr_lang": _to_list_of_strings(ocr_lang),
        "pdf_backend": pdf_backend,
        "table_mode": table_mode,
        "abort_on_error": str(abort_on_error).lower(),
        "return_as_file": str(return_as_file).lower(),
    }

    # The ExitStack closes every opened upload once the request has completed,
    # whether it succeeded or not.
    with ExitStack() as stack:
        files_data = [
            ("files", (file.name, stack.enter_context(open(file.name, "rb"))))
            for file in files
        ]
        try:
            response = requests.post(
                f"http://localhost:{int(os.getenv('PORT', '5001'))}/v1alpha/convert/file",
                files=files_data,
                data=parameters,
            )
        except Exception as e:
            logger.error(f"Error processing file(s): {e}")
            raise gr.Error(
                f"Error processing file(s): {e}", print_exception=False
            ) from e

    if response.status_code != 200:
        # The error body is not guaranteed to be a JSON object.
        try:
            data = response.json()
        except ValueError:
            data = None
        if isinstance(data, dict):
            error_message = data.get("detail", "An unknown error occurred.")
        else:
            error_message = response.text or "An unknown error occurred."
        logger.error(f"Error processing file: {error_message}")
        raise gr.Error(f"Error processing file: {error_message}", print_exception=False)

    return response_to_output(response, return_as_file)
|
||||
|
||||
|
||||
def response_to_output(response, return_as_file):
    """Map a successful conversion response onto the UI output components.

    Args:
        response: HTTP response of the conversion endpoint.
        return_as_file: Whether the response body is a file to download
            instead of a JSON document.

    Returns:
        A 9-tuple: markdown, markdown (rendered tab), JSON, rendered JSON
        HTML snippet, HTML, HTML (rendered tab), text, doctags and the
        download button update.

    Raises:
        gr.Error: When a file was requested but the response carries no file
            name in its ``Content-Disposition`` header.
    """
    markdown_content = ""
    json_content = ""
    json_rendered_content = ""
    html_content = ""
    text_content = ""
    doctags_content = ""
    download_button = gr.DownloadButton(visible=False, label="Download Output", scale=1)

    if return_as_file:
        content_disposition = response.headers.get("Content-Disposition") or ""
        raw_filename = content_disposition.partition("filename=")[2]
        # Keep only the final path component so that a header value cannot
        # point outside of the temporary output directory.
        filename = Path(raw_filename.split(";")[0].strip().strip('"')).name
        if not filename:
            logger.error("The response does not provide a file name.")
            raise gr.Error(
                "The response does not provide a file name.", print_exception=False
            )
        tmp_output_dir = Path(tempfile.mkdtemp(dir=gradio_output_dir, prefix="ui_"))
        output_path = tmp_output_dir / filename
        output_path.write_bytes(response.content)
        download_button = gr.DownloadButton(
            visible=True, label=f"Download {filename}", scale=1, value=str(output_path)
        )
    else:
        document = response.json().get("document") or {}
        markdown_content = document.get("md_content")
        json_content = json.dumps(document.get("json_content"), indent=2)
        # "<" can only occur inside JSON strings; escaping it keeps document
        # text such as "</script>" from terminating the script element below.
        embedded_json = json_content.replace("<", "\\u003c")
        # Embed document JSON and trigger load at client via an image.
        json_rendered_content = f"""
            <docling-img id="dclimg" pagenumbers tooltip="parsed"></docling-img>
            <script id="dcljson" type="application/json" onload="document.getElementById('dclimg').src = JSON.parse(document.getElementById('dcljson').textContent);">{embedded_json}</script>
            <img src onerror="document.getElementById('dclimg').src = JSON.parse(document.getElementById('dcljson').textContent);" />
            """
        html_content = document.get("html_content")
        text_content = document.get("text_content")
        doctags_content = document.get("doctags_content")

    return (
        markdown_content,
        markdown_content,
        json_content,
        json_rendered_content,
        html_content,
        html_content,
        text_content,
        doctags_content,
        download_button,
    )
|
||||
|
||||
|
||||
############
|
||||
# UI Setup #
|
||||
############
|
||||
|
||||
with gr.Blocks(
    head=head,
    css=css,
    theme=theme,
    title="Docling Serve",
    delete_cache=(3600, 3600),  # Delete all files older than 1 hour every hour
) as ui:

    # Constants stored in states to be able to pass them as inputs to functions
    processing_text = gr.State("Processing your document(s), please wait...")
    true_bool = gr.State(True)
    false_bool = gr.State(False)

    # Banner
    with gr.Row(elem_id="check_health"):
        # Logo
        with gr.Column(scale=1, min_width=90):
            gr.Image(
                "https://ds4sd.github.io/docling/assets/logo.png",
                height=80,
                width=80,
                show_download_button=False,
                show_label=False,
                show_fullscreen_button=False,
                container=False,
                elem_id="logo",
                scale=0,
            )
        # Title
        with gr.Column(scale=1, min_width=200):
            docling_version = importlib.metadata.version("docling")
            gr.Markdown(
                f"# Docling Serve \n(docling version: {docling_version})",
                elem_id="title",
                elem_classes=["title-text"],
            )
        # Dark mode button
        with gr.Column(scale=16, elem_id="dark_mode_column"):
            dark_mode_btn = gr.Button("Dark/Light Mode", scale=0)
            # Pure client-side toggle: no Python callback, inputs or outputs.
            dark_mode_btn.click(
                None,
                None,
                None,
                js="""() => {
                    if (document.querySelectorAll('.dark').length) {
                        document.querySelectorAll('.dark').forEach(
                            el => el.classList.remove('dark')
                        );
                    } else {
                        document.querySelector('body').classList.add('dark');
                    }
                }""",
                show_api=False,
            )

    # URL Processing Tab
    with gr.Tab("Convert URL(s)"):
        with gr.Row():
            with gr.Column(scale=4):
                url_input = gr.Textbox(
                    label="Input Sources (comma-separated URLs)",
                    placeholder="https://arxiv.org/pdf/2206.01062",
                )
            with gr.Column(scale=1):
                url_process_btn = gr.Button("Process URL(s)", scale=1)
                url_reset_btn = gr.Button("Reset", scale=1)

    # File Processing Tab
    with gr.Tab("Convert File(s)"):
        with gr.Row():
            with gr.Column(scale=4):
                file_input = gr.File(
                    elem_id="file_input_zone",
                    label="Upload Files",
                    file_types=[
                        ".pdf",
                        ".docx",
                        ".pptx",
                        ".html",
                        ".xlsx",
                        ".asciidoc",
                        ".txt",
                        ".md",
                        ".jpg",
                        ".jpeg",
                        ".png",
                        ".gif",
                    ],
                    file_count="multiple",
                    scale=4,
                )
            with gr.Column(scale=1):
                file_process_btn = gr.Button("Process File(s)", scale=1)
                file_reset_btn = gr.Button("Reset", scale=1)

    # Options
    with gr.Accordion("Options") as options:
        with gr.Row():
            with gr.Column(scale=1):
                to_formats = gr.CheckboxGroup(
                    [
                        ("Markdown", "md"),
                        ("Docling (JSON)", "json"),
                        ("HTML", "html"),
                        ("Plain Text", "text"),
                        ("Doc Tags", "doctags"),
                    ],
                    label="To Formats",
                    value=["md"],
                )
            with gr.Column(scale=1):
                image_export_mode = gr.Radio(
                    [
                        ("Embedded", "embedded"),
                        ("Placeholder", "placeholder"),
                        ("Referenced", "referenced"),
                    ],
                    label="Image Export Mode",
                    value="embedded",
                )
        with gr.Row():
            with gr.Column(scale=1, min_width=200):
                ocr = gr.Checkbox(label="Enable OCR", value=True)
                force_ocr = gr.Checkbox(label="Force OCR", value=False)
            with gr.Column(scale=1):
                ocr_engine = gr.Radio(
                    [
                        ("EasyOCR", "easyocr"),
                        ("Tesseract", "tesseract"),
                        ("RapidOCR", "rapidocr"),
                    ],
                    label="OCR Engine",
                    value="easyocr",
                )
            with gr.Column(scale=1, min_width=200):
                ocr_lang = gr.Textbox(
                    label="OCR Language (beware of the format)", value="en,fr,de,es"
                )
            ocr_engine.change(change_ocr_lang, inputs=[ocr_engine], outputs=[ocr_lang])
        with gr.Row():
            with gr.Column(scale=2):
                pdf_backend = gr.Radio(
                    ["pypdfium2", "dlparse_v1", "dlparse_v2"],
                    label="PDF Backend",
                    value="dlparse_v2",
                )
            with gr.Column(scale=2):
                table_mode = gr.Radio(
                    ["fast", "accurate"], label="Table Mode", value="fast"
                )
            with gr.Column(scale=1):
                abort_on_error = gr.Checkbox(label="Abort on Error", value=False)
                return_as_file = gr.Checkbox(label="Return as File", value=False)

    # Document output
    with gr.Row(visible=False) as content_output:
        with gr.Tab("Markdown"):
            output_markdown = gr.Code(
                language="markdown", wrap_lines=True, show_label=False
            )
        with gr.Tab("Markdown-Rendered"):
            output_markdown_rendered = gr.Markdown(label="Response")
        with gr.Tab("Docling (JSON)"):
            output_json = gr.Code(language="json", wrap_lines=True, show_label=False)
        with gr.Tab("Docling-Rendered"):
            output_json_rendered = gr.HTML()
        with gr.Tab("HTML"):
            output_html = gr.Code(language="html", wrap_lines=True, show_label=False)
        with gr.Tab("HTML-Rendered"):
            output_html_rendered = gr.HTML(label="Response")
        with gr.Tab("Text"):
            output_text = gr.Code(wrap_lines=True, show_label=False)
        with gr.Tab("DocTags"):
            output_doctags = gr.Code(wrap_lines=True, show_label=False)

    # File download output
    with gr.Row(visible=False) as file_output:
        download_file_btn = gr.DownloadButton(label="Placeholder", scale=1)

    ##############
    # UI Actions #
    ##############

    # Components shared by several event chains, in the order expected by
    # clear_outputs / response_to_output and by process_url / process_file.
    content_components = [
        output_markdown,
        output_markdown_rendered,
        output_json,
        output_json_rendered,
        output_html,
        output_html_rendered,
        output_text,
        output_doctags,
    ]
    conversion_option_components = [
        to_formats,
        image_export_mode,
        ocr,
        force_ocr,
        ocr_engine,
        ocr_lang,
        pdf_backend,
        table_mode,
        abort_on_error,
        return_as_file,
    ]

    def _wire_process_button(button, handler, source_component):
        # Collapse the options, show the relevant output row, clear previous
        # results and finally run the conversion handler.
        button.click(
            set_options_visibility, inputs=[false_bool], outputs=[options]
        ).then(
            set_download_button_label,
            inputs=[processing_text],
            outputs=[download_file_btn],
        ).then(
            set_outputs_visibility_process,
            inputs=[return_as_file],
            outputs=[content_output, file_output],
        ).then(
            clear_outputs,
            inputs=None,
            outputs=list(content_components),
        ).then(
            handler,
            inputs=[source_component, *conversion_option_components],
            outputs=[*content_components, download_file_btn],
        )

    def _wire_reset_button(button, clear_source, source_component):
        # Clear the results, reopen the options, hide both output rows and
        # reset the input component.
        button.click(
            clear_outputs,
            inputs=None,
            outputs=list(content_components),
        ).then(
            set_options_visibility, inputs=[true_bool], outputs=[options]
        ).then(
            set_outputs_visibility_direct,
            inputs=[false_bool, false_bool],
            outputs=[content_output, file_output],
        ).then(
            clear_source, inputs=None, outputs=[source_component]
        )

    # Handle Return as File
    for trigger in (url_input, file_input, image_export_mode):
        trigger.change(
            auto_set_return_as_file,
            inputs=[url_input, file_input, image_export_mode],
            outputs=[return_as_file],
        )

    # URL processing
    _wire_process_button(url_process_btn, process_url, url_input)
    _wire_reset_button(url_reset_btn, clear_url_input, url_input)

    # File processing
    _wire_process_button(file_process_btn, process_file, file_input)
    _wire_reset_button(file_reset_btn, clear_file_input, file_input)
|
||||
62
docling_serve/helper_functions.py
Normal file
62
docling_serve/helper_functions.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import inspect
|
||||
import re
|
||||
from typing import List, Type, Union
|
||||
|
||||
from fastapi import Depends, Form
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
# Adapted from
|
||||
# https://github.com/fastapi/fastapi/discussions/8971#discussioncomment-7892972
|
||||
def FormDepends(cls: Type[BaseModel]):
    """Build a dependency that fills the model ``cls`` from form fields.

    Each model field becomes a ``Form`` parameter of the generated function
    (required fields use ``Form(...)``, the others use the field default), so
    the model can be received as form data.

    Args:
        cls: Pydantic model class to instantiate from the form values.

    Returns:
        A ``Depends`` wrapping the generated coroutine function.
    """

    def _as_form_parameter(field_name, model_field):
        # Required fields have no default; optional ones keep the model default.
        if model_field.is_required():
            form_default = Form(...)
        else:
            form_default = Form(model_field.default)
        return inspect.Parameter(
            name=field_name,
            kind=inspect.Parameter.POSITIONAL_ONLY,
            default=form_default,
            annotation=model_field.annotation,
        )

    async def as_form_func(**data):
        return cls(**data)

    form_parameters = [
        _as_form_parameter(field_name, model_field)
        for field_name, model_field in cls.model_fields.items()
    ]

    # Replace the generic **data signature by the explicit form parameters.
    form_signature = inspect.signature(as_form_func).replace(
        parameters=form_parameters
    )
    as_form_func.__signature__ = form_signature  # type: ignore
    return Depends(as_form_func)
|
||||
|
||||
|
||||
def _to_list_of_strings(input_value: Union[str, List[str]]) -> List[str]:
|
||||
def split_and_strip(value: str) -> List[str]:
|
||||
if re.search(r"[;,]", value):
|
||||
return [item.strip() for item in re.split(r"[;,]", value)]
|
||||
else:
|
||||
return [value.strip()]
|
||||
|
||||
if isinstance(input_value, str):
|
||||
return split_and_strip(input_value)
|
||||
elif isinstance(input_value, list):
|
||||
result = []
|
||||
for item in input_value:
|
||||
result.extend(split_and_strip(str(item)))
|
||||
return result
|
||||
else:
|
||||
raise ValueError("Invalid input: must be a string or a list of strings.")
|
||||
|
||||
|
||||
# Helper functions to parse inputs coming as Form objects
|
||||
def _str_to_bool(value: Union[str, bool]) -> bool:
|
||||
if isinstance(value, bool):
|
||||
return value # Already a boolean, return as-is
|
||||
if isinstance(value, str):
|
||||
value = value.strip().lower() # Normalize input
|
||||
return value in ("true", "1", "yes")
|
||||
return False # Default to False if none of the above matches
|
||||
248
docling_serve/response_preparation.py
Normal file
248
docling_serve/response_preparation.py
Normal file
@@ -0,0 +1,248 @@
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Union
|
||||
|
||||
from docling.datamodel.base_models import OutputFormat
|
||||
from docling.datamodel.document import ConversionResult, ConversionStatus, ErrorItem
|
||||
from docling.utils.profiling import ProfilingItem
|
||||
from docling_core.types.doc import DoclingDocument, ImageRefMode
|
||||
from fastapi import BackgroundTasks, HTTPException
|
||||
from fastapi.responses import FileResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling_serve.docling_conversion import ConvertDocumentsOptions
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentResponse(BaseModel):
    """Exported contents of one converted document.

    Only ``filename`` is required; every export field defaults to ``None``
    and is filled by ``_export_document_as_content`` when the matching output
    format was requested.
    """

    # Name of the input file.
    filename: str
    # Markdown export.
    md_content: Optional[str] = None
    # The document itself, serialized as JSON in the response.
    json_content: Optional[DoclingDocument] = None
    # HTML export.
    html_content: Optional[str] = None
    # Markdown export produced with strict_text=True.
    text_content: Optional[str] = None
    # Document tokens export.
    doctags_content: Optional[str] = None
|
||||
|
||||
|
||||
class ConvertDocumentResponse(BaseModel):
    """Response model for a conversion returned as content rather than as a file."""

    # Exported contents of the converted document.
    document: DocumentResponse
    # Conversion status.
    status: ConversionStatus
    # Conversion errors, empty by default.
    errors: List[ErrorItem] = []
    # Processing time (a float; presumably seconds - TODO confirm in process_results).
    processing_time: float
    # Profiling items keyed by name, empty by default.
    timings: Dict[str, ProfilingItem] = {}
|
||||
|
||||
|
||||
class ConvertDocumentErrorResponse(BaseModel):
    """Response model carrying only a conversion status."""

    # Conversion status.
    status: ConversionStatus
|
||||
|
||||
|
||||
def _export_document_as_content(
    conv_res: ConversionResult,
    export_json: bool,
    export_html: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
    image_mode: ImageRefMode,
):
    """Export a single conversion result into a ``DocumentResponse``.

    Args:
        conv_res: Conversion result to export.
        export_json, export_html, export_md, export_txt, export_doctags:
            Which formats to include in the response.
        image_mode: Image reference mode applied to the exported document.

    Returns:
        The ``DocumentResponse`` holding the requested exports.

    Raises:
        HTTPException: 400 when the conversion was skipped, 500 for any other
            non-successful status; the detail carries the conversion errors.
    """
    document = DocumentResponse(filename=conv_res.input.file.name)

    # Anything but a successful conversion is reported as an HTTP error.
    if conv_res.status == ConversionStatus.SKIPPED:
        raise HTTPException(status_code=400, detail=conv_res.errors)
    if conv_res.status != ConversionStatus.SUCCESS:
        raise HTTPException(status_code=500, detail=conv_res.errors)

    exported_doc = conv_res.document._make_copy_with_refmode(Path(), image_mode)

    # Create the different formats
    if export_json:
        document.json_content = exported_doc
    if export_html:
        document.html_content = exported_doc.export_to_html(image_mode=image_mode)
    if export_txt:
        document.text_content = exported_doc.export_to_markdown(
            strict_text=True, image_mode=image_mode
        )
    if export_md:
        document.md_content = exported_doc.export_to_markdown(image_mode=image_mode)
    if export_doctags:
        document.doctags_content = exported_doc.export_to_document_tokens()

    return document
|
||||
|
||||
|
||||
def _export_documents_as_files(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
    export_json: bool,
    export_html: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
    image_export_mode: ImageRefMode,
):
    """Write every successful conversion result into ``output_dir``.

    One file per requested format is written for each document, named after
    the stem of the input file. Failed conversions are logged and counted but
    do not raise.

    Args:
        conv_results: Conversion results to export.
        output_dir: Directory receiving the exported files.
        export_json, export_html, export_md, export_txt, export_doctags:
            Which formats to write.
        image_export_mode: Image reference mode for the JSON, HTML and
            Markdown exports (the text export always uses placeholders).
    """
    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.warning(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        success_count += 1
        doc_filename = conv_res.input.file.stem
        converted_doc = conv_res.document

        # Export JSON format:
        if export_json:
            fname = output_dir / f"{doc_filename}.json"
            _log.info(f"writing JSON output to {fname}")
            converted_doc.save_as_json(filename=fname, image_mode=image_export_mode)

        # Export HTML format:
        if export_html:
            fname = output_dir / f"{doc_filename}.html"
            _log.info(f"writing HTML output to {fname}")
            converted_doc.save_as_html(filename=fname, image_mode=image_export_mode)

        # Export Text format:
        if export_txt:
            fname = output_dir / f"{doc_filename}.txt"
            _log.info(f"writing TXT output to {fname}")
            converted_doc.save_as_markdown(
                filename=fname,
                strict_text=True,
                image_mode=ImageRefMode.PLACEHOLDER,
            )

        # Export Markdown format:
        if export_md:
            fname = output_dir / f"{doc_filename}.md"
            _log.info(f"writing Markdown output to {fname}")
            converted_doc.save_as_markdown(
                filename=fname, image_mode=image_export_mode
            )

        # Export Document Tags format:
        if export_doctags:
            fname = output_dir / f"{doc_filename}.doctags"
            _log.info(f"writing Doc Tags output to {fname}")
            converted_doc.save_as_document_tokens(filename=fname)

    _log.info(
        f"Processed {success_count + failure_count} docs, "
        f"of which {failure_count} failed"
    )
|
||||
|
||||
|
||||
def process_results(
    background_tasks: BackgroundTasks,
    conversion_options: ConvertDocumentsOptions,
    conv_results: Iterable[ConversionResult],
) -> Union[ConvertDocumentResponse, FileResponse]:
    """Run the conversion and package its results into an HTTP response.

    Args:
        background_tasks: Task queue used to delete the temporary work
            directory once a file response has been sent.
        conversion_options: Request options; ``to_formats``,
            ``return_as_file`` and ``image_export_mode`` are read here.
        conv_results: Lazy iterable of conversion results. Consuming it is
            what actually performs the conversion.

    Returns:
        A ``ConvertDocumentResponse`` when exactly one document was converted
        and ``return_as_file`` is false; otherwise a ``FileResponse`` serving
        a zip archive named ``converted_docs.zip``.

    Raises:
        HTTPException: With status 500 when the conversion raises, when it
            yields no results, or when no output file was exported.
    """
    # Materialise the iterator: this both triggers the (lazy) conversion and
    # lets us count the results and time the whole run.
    try:
        start_time = time.monotonic()
        results = list(conv_results)
        processing_time = time.monotonic() - start_time

        _log.info(
            f"Processed {len(results)} docs in {processing_time:.2f} seconds."
        )
    except Exception as e:
        # Chain the cause so the original traceback is kept in the logs.
        raise HTTPException(status_code=500, detail=str(e)) from e

    if not results:
        raise HTTPException(
            status_code=500, detail="No documents were generated by Docling."
        )

    # Booleans to know what to export
    requested_formats = conversion_options.to_formats
    export_json = OutputFormat.JSON in requested_formats
    export_html = OutputFormat.HTML in requested_formats
    export_md = OutputFormat.MARKDOWN in requested_formats
    export_txt = OutputFormat.TEXT in requested_formats
    export_doctags = OutputFormat.DOCTAGS in requested_formats

    # Only 1 document was processed, and we are not returning it as a file
    if len(results) == 1 and not conversion_options.return_as_file:
        conv_res = results[0]
        document = _export_document_as_content(
            conv_res,
            export_json=export_json,
            export_html=export_html,
            export_md=export_md,
            export_txt=export_txt,
            export_doctags=export_doctags,
            image_mode=conversion_options.image_export_mode,
        )
        return ConvertDocumentResponse(
            document=document,
            status=conv_res.status,
            processing_time=processing_time,
            timings=conv_res.timings,
        )

    # Multiple documents were processed, or we are forced returning as a file.
    # Temporary directory to store the outputs and the archive.
    work_dir = Path(tempfile.mkdtemp(prefix="docling_"))
    try:
        output_dir = work_dir / "output"
        output_dir.mkdir(parents=True, exist_ok=True)

        # Export the documents
        _export_documents_as_files(
            conv_results=results,
            output_dir=output_dir,
            export_json=export_json,
            export_html=export_html,
            export_md=export_md,
            export_txt=export_txt,
            export_doctags=export_doctags,
            image_export_mode=conversion_options.image_export_mode,
        )

        if not os.listdir(output_dir):
            raise HTTPException(status_code=500, detail="No documents were exported.")

        file_path = work_dir / "converted_docs.zip"
        shutil.make_archive(
            base_name=str(file_path.with_suffix("")),
            format="zip",
            root_dir=output_dir,
        )
    except Exception:
        # The deferred cleanup below is only registered on success, so a
        # failure here must remove the work directory itself or it leaks.
        shutil.rmtree(work_dir, ignore_errors=True)
        raise

    # The archive must outlive this function: delete the work directory only
    # after the response has been sent.
    background_tasks.add_task(shutil.rmtree, work_dir, ignore_errors=True)

    return FileResponse(
        file_path, filename=file_path.name, media_type="application/zip"
    )
|
||||
@@ -1,6 +1,33 @@
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
|
||||
class UvicornSettings(BaseSettings):
    # Options for the uvicorn server. Values are loaded from environment
    # variables carrying the UVICORN_ prefix and from a local .env file;
    # unknown keys are accepted (extra="allow").
    #
    # NOTE(review): the rendered diff also showed a second
    # `model_config = SettingsConfigDict(env_prefix="DOCLING_")` here, which
    # would have overridden the UVICORN_ configuration. It looks like the
    # removed line of the old `Settings` class and is dropped - confirm.
    model_config = SettingsConfigDict(
        env_prefix="UVICORN_", env_file=".env", extra="allow"
    )

    host: str = "0.0.0.0"
    port: int = 5001
    reload: bool = False
    root_path: str = ""
    proxy_headers: bool = True
    # Optional[...] matches the annotation style used by DoclingServeSettings.
    workers: Optional[int] = None
|
||||
|
||||
|
||||
class DoclingServeSettings(BaseSettings):
    # Application-level options for docling-serve itself, configured with the
    # DOCLING_SERVE_ prefix and a local .env file.
    model_config = SettingsConfigDict(
        extra="allow",
        env_file=".env",
        env_prefix="DOCLING_SERVE_",
        # presumably lets an empty value stand for None - confirm against
        # the pydantic-settings documentation
        env_parse_none_str="",
    )

    # Off by default.
    enable_ui: bool = False
    # No artifacts path unless one is configured.
    artifacts_path: Optional[Path] = None
|
||||
|
||||
|
||||
# Module-level settings instances, built once when this module is imported.
uvicorn_settings = UvicornSettings()
docling_serve_settings = DoclingServeSettings()
|
||||
|
||||
BIN
img/swagger.png
Normal file
BIN
img/swagger.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 24 KiB |
BIN
img/ui-input.png
Normal file
BIN
img/ui-input.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 64 KiB |
BIN
img/ui-output.png
Normal file
BIN
img/ui-output.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 124 KiB |
8
os-packages.txt
Normal file
8
os-packages.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
tesseract
|
||||
tesseract-devel
|
||||
tesseract-langpack-eng
|
||||
leptonica-devel
|
||||
libglvnd-glx
|
||||
glib2
|
||||
wget
|
||||
git
|
||||
4430
poetry.lock
generated
4430
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
247
pyproject.toml
247
pyproject.toml
@@ -1,25 +1,25 @@
|
||||
[tool.poetry]
|
||||
[project]
|
||||
name = "docling-serve"
|
||||
version = "0.1.0"
|
||||
version = "0.4.0" # DO NOT EDIT, updated automatically
|
||||
description = "Running Docling as a service"
|
||||
license = "MIT"
|
||||
license = {text = "MIT"}
|
||||
authors = [
|
||||
"Michele Dolfi <dol@zurich.ibm.com>",
|
||||
"Christoph Auer <cau@zurich.ibm.com>",
|
||||
"Panos Vagenas <pva@zurich.ibm.com>",
|
||||
"Cesar Berrospi Ramis <ceb@zurich.ibm.com>",
|
||||
"Peter Staar <taa@zurich.ibm.com>",
|
||||
{name="Michele Dolfi", email="dol@zurich.ibm.com"},
|
||||
{name="Guillaume Moutier", email="gmoutier@redhat.com"},
|
||||
{name="Anil Vishnoi", email="avishnoi@redhat.com"},
|
||||
{name="Panos Vagenas", email="pva@zurich.ibm.com"},
|
||||
{name="Panos Vagenas", email="pva@zurich.ibm.com"},
|
||||
{name="Christoph Auer", email="cau@zurich.ibm.com"},
|
||||
{name="Peter Staar", email="taa@zurich.ibm.com"},
|
||||
]
|
||||
maintainers = [
|
||||
"Peter Staar <taa@zurich.ibm.com>",
|
||||
"Christoph Auer <cau@zurich.ibm.com>",
|
||||
"Michele Dolfi <dol@zurich.ibm.com>",
|
||||
"Cesar Berrospi Ramis <ceb@zurich.ibm.com>",
|
||||
"Panos Vagenas <pva@zurich.ibm.com>",
|
||||
{name="Michele Dolfi", email="dol@zurich.ibm.com"},
|
||||
{name="Anil Vishnoi", email="avishnoi@redhat.com"},
|
||||
{name="Panos Vagenas", email="pva@zurich.ibm.com"},
|
||||
{name="Christoph Auer", email="cau@zurich.ibm.com"},
|
||||
{name="Peter Staar", email="taa@zurich.ibm.com"},
|
||||
]
|
||||
readme = "README.md"
|
||||
repository = "https://github.com/DS4SD/docling-serve"
|
||||
homepage = "https://github.com/DS4SD/docling-serve"
|
||||
classifiers = [
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
@@ -28,90 +28,145 @@ classifiers = [
|
||||
"Typing :: Typed",
|
||||
"Programming Language :: Python :: 3"
|
||||
]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.9"
|
||||
docling = "^2.10.0"
|
||||
fastapi = {version = "^0.115.6", extras = ["standard"]}
|
||||
uvicorn = "^0.32.1"
|
||||
pydantic-settings = "^2.4.0"
|
||||
httpx = "^0.28.1"
|
||||
tesserocr = { version = "^2.7.1", optional = true }
|
||||
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
|
||||
onnxruntime = [
|
||||
# 1.19.2 is the last version with python3.9 support,
|
||||
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
|
||||
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
|
||||
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"docling~=2.23",
|
||||
"fastapi[standard]~=0.115",
|
||||
"httpx~=0.28",
|
||||
"pydantic~=2.10",
|
||||
"pydantic-settings~=2.4",
|
||||
"python-multipart>=0.0.14,<0.1.0",
|
||||
"typer~=0.12",
|
||||
"uvicorn[standard]>=0.29.0,<1.0.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
ui = [
|
||||
"gradio~=5.9"
|
||||
]
|
||||
tesserocr = [
|
||||
"tesserocr~=2.7"
|
||||
]
|
||||
rapidocr = [
|
||||
"rapidocr-onnxruntime~=1.4; python_version<'3.13'",
|
||||
"onnxruntime~=1.7",
|
||||
]
|
||||
cpu = [
|
||||
"torch>=2.6.0",
|
||||
"torchvision>=0.21.0",
|
||||
]
|
||||
cu124 = [
|
||||
"torch>=2.6.0",
|
||||
"torchvision>=0.21.0",
|
||||
]
|
||||
|
||||
[tool.poetry.extras]
|
||||
tesserocr = ["tesserocr"]
|
||||
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"mypy~=1.11",
|
||||
"pre-commit~=3.8",
|
||||
"pytest~=8.3",
|
||||
"pytest-asyncio~=0.24",
|
||||
"pytest-check~=2.4",
|
||||
"python-semantic-release~=7.32",
|
||||
"ruff>=0.9.6",
|
||||
]
|
||||
|
||||
[tool.uv]
|
||||
package = true
|
||||
conflicts = [
|
||||
[
|
||||
{ extra = "cpu" },
|
||||
{ extra = "cu124" },
|
||||
],
|
||||
]
|
||||
|
||||
[tool.poetry.group.pypi-torch]
|
||||
optional = false
|
||||
|
||||
[tool.poetry.group.pypi-torch.dependencies]
|
||||
[tool.uv.sources]
|
||||
torch = [
|
||||
{version = "!=2.4.1+cpu" },
|
||||
{ index = "pytorch-cpu", extra = "cpu" },
|
||||
{ index = "pytorch-cu124", extra = "cu124" },
|
||||
]
|
||||
torchvision = [
|
||||
{version = "!=0.19.1+cpu" },
|
||||
{ index = "pytorch-cpu", extra = "cpu" },
|
||||
{ index = "pytorch-cu124", extra = "cu124" },
|
||||
]
|
||||
|
||||
[tool.poetry.group.cpu]
|
||||
optional = true
|
||||
[[tool.uv.index]]
|
||||
name = "pytorch-cpu"
|
||||
url = "https://download.pytorch.org/whl/cpu"
|
||||
explicit = true
|
||||
|
||||
[tool.poetry.group.cpu.dependencies]
|
||||
torch = [
|
||||
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.10"', url="https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp310-cp310-linux_x86_64.whl"},
|
||||
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.11"', url="https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp311-cp311-linux_x86_64.whl"},
|
||||
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.12"', url="https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp312-cp312-linux_x86_64.whl"},
|
||||
]
|
||||
torchvision = [
|
||||
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.10"', url="https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp310-cp310-linux_x86_64.whl"},
|
||||
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.11"', url="https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp311-cp311-linux_x86_64.whl"},
|
||||
{markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.12"', url="https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp312-cp312-linux_x86_64.whl"},
|
||||
]
|
||||
[[tool.uv.index]]
|
||||
name = "pytorch-cu124"
|
||||
url = "https://download.pytorch.org/whl/cu124"
|
||||
explicit = true
|
||||
|
||||
[tool.poetry.group.constraints.dependencies]
|
||||
numpy = [
|
||||
{ version = "^2.1.0", markers = 'python_version >= "3.13"' },
|
||||
{ version = "^1.24.4", markers = 'python_version < "3.13"' },
|
||||
]
|
||||
[tool.setuptools.packages.find]
|
||||
include = ["docling_serve"]
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = "^24.8.0"
|
||||
isort = "^5.13.2"
|
||||
pre-commit = "^3.8.0"
|
||||
autoflake = "^2.3.1"
|
||||
flake8 = "^7.1.1"
|
||||
pytest = "^8.3.2"
|
||||
mypy = "^1.11.2"
|
||||
[project.scripts]
|
||||
docling-serve = "docling_serve.__main__:main"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/DS4SD/docling-serve"
|
||||
# Documentation = "https://ds4sd.github.io/docling"
|
||||
Repository = "https://github.com/DS4SD/docling-serve"
|
||||
Issues = "https://github.com/DS4SD/docling-serve/issues"
|
||||
Changelog = "https://github.com/DS4SD/docling-serve/blob/main/CHANGELOG.md"
|
||||
|
||||
[tool.black]
|
||||
[tool.ruff]
|
||||
target-version = "py310"
|
||||
line-length = 88
|
||||
target-version = ["py310"]
|
||||
include = '\.pyi?$'
|
||||
respect-gitignore = true
|
||||
|
||||
[tool.isort]
|
||||
profile = "black"
|
||||
line_length = 88
|
||||
py_version=311
|
||||
# extend-exclude = [
|
||||
# "tests",
|
||||
# ]
|
||||
|
||||
[tool.autoflake]
|
||||
in-place = true
|
||||
remove-all-unused-imports = true
|
||||
remove-unused-variables = true
|
||||
expand-star-imports = true
|
||||
recursive = true
|
||||
[tool.ruff.format]
|
||||
skip-magic-trailing-comma = false
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
# "B", # flake8-bugbear
|
||||
"C", # flake8-comprehensions
|
||||
"C9", # mccabe
|
||||
# "D", # flake8-docstrings
|
||||
"E", # pycodestyle errors (default)
|
||||
"F", # pyflakes (default)
|
||||
"I", # isort
|
||||
"PD", # pandas-vet
|
||||
"PIE", # pie
|
||||
# "PTH", # pathlib
|
||||
"Q", # flake8-quotes
|
||||
# "RET", # return
|
||||
"RUF", # Enable all ruff-specific checks
|
||||
# "SIM", # simplify
|
||||
"S307", # eval
|
||||
# "T20", # (disallow print statements) keep debugging statements out of the codebase
|
||||
"W", # pycodestyle warnings
|
||||
"ASYNC" # async
|
||||
]
|
||||
|
||||
ignore = [
|
||||
"E501", # Line too long, handled by ruff formatter
|
||||
"D107", # "Missing docstring in __init__",
|
||||
"F811", # "redefinition of the same function"
|
||||
"PL", # Pylint
|
||||
"RUF012", # Mutable Class Attributes
|
||||
]
|
||||
|
||||
#extend-select = []
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"__init__.py" = ["E402", "F401"]
|
||||
"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests
|
||||
|
||||
[tool.ruff.lint.mccabe]
|
||||
max-complexity = 15
|
||||
|
||||
[tool.ruff.lint.isort]
|
||||
combine-as-imports = true
|
||||
known-third-party = ["docling", "docling_core"]
|
||||
|
||||
[tool.mypy]
|
||||
pretty = true
|
||||
@@ -125,5 +180,35 @@ module = [
|
||||
"easyocr.*",
|
||||
"tesserocr.*",
|
||||
"rapidocr_onnxruntime.*",
|
||||
"docling_conversion.*",
|
||||
"gradio_ui.*",
|
||||
"response_preparation.*",
|
||||
"helper_functions.*",
|
||||
"requests.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
asyncio_mode = "auto"
|
||||
asyncio_default_fixture_loop_scope = "function"
|
||||
minversion = "8.2"
|
||||
testpaths = [
|
||||
"tests",
|
||||
]
|
||||
addopts = "-rA --color=yes --tb=short --maxfail=5"
|
||||
markers = [
|
||||
"asyncio",
|
||||
]
|
||||
|
||||
[tool.semantic_release]
|
||||
# for default values check:
|
||||
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
|
||||
|
||||
version_source = "tag_only"
|
||||
branch = "main"
|
||||
|
||||
# configure types which should trigger minor and patch version bumps respectively
|
||||
# (note that they must be a subset of the configured allowed types):
|
||||
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
|
||||
parser_angular_minor_types = "feat"
|
||||
parser_angular_patch_types = "fix,perf"
|
||||
|
||||
BIN
tests/2206.01062v1.pdf
Normal file
BIN
tests/2206.01062v1.pdf
Normal file
Binary file not shown.
BIN
tests/2408.09869v5.pdf
Normal file
BIN
tests/2408.09869v5.pdf
Normal file
Binary file not shown.
129
tests/test_1-file-all-outputs.py
Normal file
129
tests/test_1-file-all-outputs.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from pytest_check import check
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def async_client():
    """Yield an ``httpx.AsyncClient`` with a 60 s timeout; closed on teardown."""
    async with httpx.AsyncClient(timeout=60.0) as http_client:
        yield http_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_convert_file(async_client):
    """Test convert single file to all outputs"""
    url = "http://localhost:5001/v1alpha/convert/file"
    options = {
        "from_formats": [
            "docx",
            "pptx",
            "html",
            "image",
            "pdf",
            "asciidoc",
            "md",
            "xlsx",
        ],
        "to_formats": ["md", "json", "html", "text", "doctags"],
        "image_export_mode": "placeholder",
        "ocr": True,
        "force_ocr": False,
        "ocr_engine": "easyocr",
        "ocr_lang": ["en"],
        "pdf_backend": "dlparse_v2",
        "table_mode": "fast",
        "abort_on_error": False,
        "return_as_file": False,
    }

    current_dir = os.path.dirname(__file__)
    file_path = os.path.join(current_dir, "2206.01062v1.pdf")

    # Hold the upload in a context manager so the handle is closed even when
    # the request raises.
    with open(file_path, "rb") as pdf_file:
        files = {
            "files": ("2206.01062v1.pdf", pdf_file, "application/pdf"),
        }
        response = await async_client.post(
            url, files=files, data={"options": json.dumps(options)}
        )
    assert response.status_code == 200, "Response should be 200 OK"

    data = response.json()

    # Response content checks
    # Helper function to safely slice values for failure messages
    def safe_slice(value, length=100):
        if isinstance(value, str):
            return value[:length]
        # Truncate non-string values too, so a failing check does not dump
        # the whole JSON document into the report.
        return str(value)[:length]

    document = data.get("document", {})
    document_keys = list(document.keys())

    # Document check
    check.is_in(
        "document",
        data,
        msg=f"Response should contain 'document' key. Received keys: {list(data.keys())}",
    )
    # MD check
    check.is_in(
        "md_content",
        document,
        msg=f"Response should contain 'md_content' key. Received keys: {document_keys}",
    )
    if document.get("md_content") is not None:
        check.is_in(
            "## DocLayNet: ",
            document["md_content"],
            msg=f"Markdown document should contain 'DocLayNet: '. Received: {safe_slice(document['md_content'])}",
        )
    # JSON check
    check.is_in(
        "json_content",
        document,
        msg=f"Response should contain 'json_content' key. Received keys: {document_keys}",
    )
    if document.get("json_content") is not None:
        check.is_in(
            '{"schema_name": "DoclingDocument"',
            json.dumps(document["json_content"]),
            msg=f"JSON document should contain '{{\\n \"schema_name\": \"DoclingDocument'\". Received: {safe_slice(document['json_content'])}",
        )
    # HTML check
    check.is_in(
        "html_content",
        document,
        msg=f"Response should contain 'html_content' key. Received keys: {document_keys}",
    )
    if document.get("html_content") is not None:
        check.is_in(
            '<!DOCTYPE html>\n<html lang="en">\n<head>',
            document["html_content"],
            msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en'>. Received: {safe_slice(document['html_content'])}",
        )
    # Text check
    check.is_in(
        "text_content",
        document,
        msg=f"Response should contain 'text_content' key. Received keys: {document_keys}",
    )
    if document.get("text_content") is not None:
        check.is_in(
            "DocLayNet: A Large Human-Annotated Dataset",
            document["text_content"],
            msg=f"Text document should contain 'DocLayNet: A Large Human-Annotated Dataset'. Received: {safe_slice(document['text_content'])}",
        )
    # DocTags check
    check.is_in(
        "doctags_content",
        document,
        msg=f"Response should contain 'doctags_content' key. Received keys: {document_keys}",
    )
    if document.get("doctags_content") is not None:
        check.is_in(
            "<document>\n<section_header_level_1><location>",
            document["doctags_content"],
            msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(document['doctags_content'])}",
        )
|
||||
123
tests/test_1-url-all-outputs.py
Normal file
123
tests/test_1-url-all-outputs.py
Normal file
@@ -0,0 +1,123 @@
|
||||
import json
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from pytest_check import check
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def async_client():
    """Yield an ``httpx.AsyncClient`` with a 60 s timeout; closed on teardown."""
    async with httpx.AsyncClient(timeout=60.0) as http_client:
        yield http_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_convert_url(async_client):
    """Test convert URL to all outputs"""
    url = "http://localhost:5001/v1alpha/convert/source"
    payload = {
        "options": {
            "from_formats": [
                "docx",
                "pptx",
                "html",
                "image",
                "pdf",
                "asciidoc",
                "md",
                "xlsx",
            ],
            "to_formats": ["md", "json", "html", "text", "doctags"],
            "image_export_mode": "placeholder",
            "ocr": True,
            "force_ocr": False,
            "ocr_engine": "easyocr",
            "ocr_lang": ["en"],
            "pdf_backend": "dlparse_v2",
            "table_mode": "fast",
            "abort_on_error": False,
            "return_as_file": False,
        },
        "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}],
    }
    print(json.dumps(payload, indent=2))

    response = await async_client.post(url, json=payload)
    assert response.status_code == 200, "Response should be 200 OK"

    data = response.json()

    # Response content checks
    # Helper function to safely slice values for failure messages
    def safe_slice(value, length=100):
        if isinstance(value, str):
            return value[:length]
        # Truncate non-string values too, so a failing check does not dump
        # the whole JSON document into the report.
        return str(value)[:length]

    document = data.get("document", {})
    document_keys = list(document.keys())

    # Document check
    check.is_in(
        "document",
        data,
        msg=f"Response should contain 'document' key. Received keys: {list(data.keys())}",
    )
    # MD check
    check.is_in(
        "md_content",
        document,
        msg=f"Response should contain 'md_content' key. Received keys: {document_keys}",
    )
    if document.get("md_content") is not None:
        check.is_in(
            "## DocLayNet: ",
            document["md_content"],
            msg=f"Markdown document should contain 'DocLayNet: '. Received: {safe_slice(document['md_content'])}",
        )
    # JSON check
    check.is_in(
        "json_content",
        document,
        msg=f"Response should contain 'json_content' key. Received keys: {document_keys}",
    )
    if document.get("json_content") is not None:
        check.is_in(
            '{"schema_name": "DoclingDocument"',
            json.dumps(document["json_content"]),
            msg=f"JSON document should contain '{{\\n \"schema_name\": \"DoclingDocument'\". Received: {safe_slice(document['json_content'])}",
        )
    # HTML check
    check.is_in(
        "html_content",
        document,
        msg=f"Response should contain 'html_content' key. Received keys: {document_keys}",
    )
    if document.get("html_content") is not None:
        check.is_in(
            '<!DOCTYPE html>\n<html lang="en">\n<head>',
            document["html_content"],
            msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en'>. Received: {safe_slice(document['html_content'])}",
        )
    # Text check
    check.is_in(
        "text_content",
        document,
        msg=f"Response should contain 'text_content' key. Received keys: {document_keys}",
    )
    if document.get("text_content") is not None:
        check.is_in(
            "DocLayNet: A Large Human-Annotated Dataset",
            document["text_content"],
            msg=f"Text document should contain 'DocLayNet: A Large Human-Annotated Dataset'. Received: {safe_slice(document['text_content'])}",
        )
    # DocTags check
    check.is_in(
        "doctags_content",
        document,
        msg=f"Response should contain 'doctags_content' key. Received keys: {document_keys}",
    )
    if document.get("doctags_content") is not None:
        check.is_in(
            "<document>\n<section_header_level_1><location>",
            document["doctags_content"],
            msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(document['doctags_content'])}",
        )
|
||||
74
tests/test_2-files-all-outputs.py
Normal file
74
tests/test_2-files-all-outputs.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from pytest_check import check
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def async_client():
    """Yield an ``httpx.AsyncClient`` with a 60 s timeout; closed on teardown."""
    async with httpx.AsyncClient(timeout=60.0) as http_client:
        yield http_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_convert_file(async_client):
    """Test convert two files to all outputs, returned as a zip archive"""
    url = "http://localhost:5001/v1alpha/convert/file"
    options = {
        "from_formats": [
            "docx",
            "pptx",
            "html",
            "image",
            "pdf",
            "asciidoc",
            "md",
            "xlsx",
        ],
        "to_formats": ["md", "json", "html", "text", "doctags"],
        "image_export_mode": "placeholder",
        "ocr": True,
        "force_ocr": False,
        "ocr_engine": "easyocr",
        "ocr_lang": ["en"],
        "pdf_backend": "dlparse_v2",
        "table_mode": "fast",
        "abort_on_error": False,
        "return_as_file": False,
    }

    current_dir = os.path.dirname(__file__)
    first_path = os.path.join(current_dir, "2206.01062v1.pdf")
    # Each upload must send its own document; previously the second entry
    # re-sent the first PDF under the second file's name.
    second_path = os.path.join(current_dir, "2408.09869v5.pdf")

    # Context managers guarantee both handles are closed after the request.
    with open(first_path, "rb") as first_pdf, open(second_path, "rb") as second_pdf:
        files = [
            ("files", ("2206.01062v1.pdf", first_pdf, "application/pdf")),
            ("files", ("2408.09869v5.pdf", second_pdf, "application/pdf")),
        ]
        response = await async_client.post(
            url, files=files, data={"options": json.dumps(options)}
        )
    assert response.status_code == 200, "Response should be 200 OK"

    # Check for zip file attachment
    content_disposition = response.headers.get("content-disposition")
    # `in None` raises TypeError, which the soft `check` context does not
    # absorb; fall back to an empty string so every check still reports.
    disposition_text = content_disposition or ""

    with check:
        assert (
            content_disposition is not None
        ), "Content-Disposition header should be present"
    with check:
        assert "attachment" in disposition_text, "Response should be an attachment"
    with check:
        assert (
            'filename="converted_docs.zip"' in disposition_text
        ), "Attachment filename should be 'converted_docs.zip'"

    content_type = response.headers.get("content-type")
    with check:
        assert (
            content_type == "application/zip"
        ), "Content-Type should be 'application/zip'"
|
||||
67
tests/test_2-urls-all-outputs.py
Normal file
67
tests/test_2-urls-all-outputs.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from pytest_check import check
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def async_client():
    """Yield an ``httpx.AsyncClient`` with a 60 s timeout; closed on teardown."""
    async with httpx.AsyncClient(timeout=60.0) as http_client:
        yield http_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_convert_url(async_client):
    """Test convert two URLs to all outputs, returned as a zip archive"""
    url = "http://localhost:5001/v1alpha/convert/source"
    payload = {
        "options": {
            "from_formats": [
                "docx",
                "pptx",
                "html",
                "image",
                "pdf",
                "asciidoc",
                "md",
                "xlsx",
            ],
            "to_formats": ["md", "json", "html", "text", "doctags"],
            "image_export_mode": "placeholder",
            "ocr": True,
            "force_ocr": False,
            "ocr_engine": "easyocr",
            "ocr_lang": ["en"],
            "pdf_backend": "dlparse_v2",
            "table_mode": "fast",
            "abort_on_error": False,
            "return_as_file": False,
        },
        "http_sources": [
            {"url": "https://arxiv.org/pdf/2206.01062"},
            {"url": "https://arxiv.org/pdf/2408.09869"},
        ],
    }

    response = await async_client.post(url, json=payload)
    assert response.status_code == 200, "Response should be 200 OK"

    # Check for zip file attachment
    content_disposition = response.headers.get("content-disposition")
    # `in None` raises TypeError, which the soft `check` context does not
    # absorb; fall back to an empty string so every check still reports.
    disposition_text = content_disposition or ""

    with check:
        assert (
            content_disposition is not None
        ), "Content-Disposition header should be present"
    with check:
        assert "attachment" in disposition_text, "Response should be an attachment"
    with check:
        assert (
            'filename="converted_docs.zip"' in disposition_text
        ), "Attachment filename should be 'converted_docs.zip'"

    content_type = response.headers.get("content-type")
    with check:
        assert (
            content_type == "application/zip"
        ), "Content-Type should be 'application/zip'"
|
||||
Reference in New Issue
Block a user