mirror of
https://github.com/docling-project/docling-serve.git
synced 2025-11-29 16:43:24 +00:00
Compare commits
32 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ff75bab21b | ||
|
|
7a0fabae07 | ||
|
|
9ffe49a359 | ||
|
|
68772bb6f0 | ||
|
|
20ec87a63a | ||
|
|
e30f458923 | ||
|
|
03e405638f | ||
|
|
fd8e40a008 | ||
|
|
422c402bab | ||
|
|
ea090288d3 | ||
|
|
07c48edd5d | ||
|
|
a212547d28 | ||
|
|
c76daac70c | ||
|
|
7994b19b9f | ||
|
|
ec57b528ed | ||
|
|
b92c5d8899 | ||
|
|
3c9825df30 | ||
|
|
8dd0e216fd | ||
|
|
d406802f9d | ||
|
|
a92ad48b28 | ||
|
|
da2b26099d | ||
|
|
98b46eda50 | ||
|
|
7e75919ae8 | ||
|
|
c95db36438 | ||
|
|
82f8900197 | ||
|
|
ed851c95fe | ||
|
|
05df0735d3 | ||
|
|
cad1053e36 | ||
|
|
7e6d9cdef3 | ||
|
|
343b985287 | ||
|
|
c430d9b1a1 | ||
|
|
63141f1cc7 |
7
.flake8
7
.flake8
@@ -1,7 +0,0 @@
|
||||
[flake8]
|
||||
max-line-length = 88
|
||||
exclude = test/*
|
||||
max-complexity = 18
|
||||
docstring-convention = google
|
||||
ignore = W503,E203
|
||||
classmethod-decorators = classmethod,validator
|
||||
2
.github/SECURITY.md
vendored
2
.github/SECURITY.md
vendored
@@ -20,4 +20,4 @@ After the initial reply to your report, the security team will keep you informed
|
||||
|
||||
## Security Alerts
|
||||
|
||||
We will send announcements of security vulnerabilities and steps to remediate on the [Docling announcements](https://github.com/DS4SD/docling/discussions/categories/announcements).
|
||||
We will send announcements of security vulnerabilities and steps to remediate on the [Docling announcements](https://github.com/docling-project/docling/discussions/categories/announcements).
|
||||
|
||||
3
.github/scripts/release.sh
vendored
3
.github/scripts/release.sh
vendored
@@ -11,6 +11,7 @@ CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}"
|
||||
|
||||
# update package version
|
||||
uvx --from=toml-cli toml set --toml-path=pyproject.toml project.version "${TARGET_VERSION}"
|
||||
uv lock --upgrade-package docling-serve
|
||||
|
||||
# collect release notes
|
||||
REL_NOTES=$(mktemp)
|
||||
@@ -30,7 +31,7 @@ mv "${TMP_CHGLOG}" "${CHGLOG_FILE}"
|
||||
# push changes
|
||||
git config --global user.name 'github-actions[bot]'
|
||||
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
|
||||
git add pyproject.toml "${CHGLOG_FILE}"
|
||||
git add pyproject.toml uv.lock "${CHGLOG_FILE}"
|
||||
COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]"
|
||||
git commit -m "${COMMIT_MSG}"
|
||||
git push origin main
|
||||
|
||||
44
.github/workflows/ci-images-dryrun.yml
vendored
44
.github/workflows/ci-images-dryrun.yml
vendored
@@ -8,8 +8,24 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build_cpu_image:
|
||||
name: Build docling-serve "CPU only" container image
|
||||
build_image:
|
||||
name: Build ${{ matrix.spec.name }} container image
|
||||
strategy:
|
||||
matrix:
|
||||
spec:
|
||||
- name: docling-project/docling-serve
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra cpu
|
||||
platforms: linux/amd64, linux/arm64
|
||||
- name: docling-project/docling-serve-cpu
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124
|
||||
platforms: linux/amd64, linux/arm64
|
||||
- name: docling-project/docling-serve-cu124
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cpu
|
||||
platforms: linux/amd64
|
||||
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
@@ -19,25 +35,7 @@ jobs:
|
||||
uses: ./.github/workflows/job-image.yml
|
||||
with:
|
||||
publish: false
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124
|
||||
ghcr_image_name: ds4sd/docling-serve-cpu
|
||||
quay_image_name: ""
|
||||
|
||||
|
||||
build_gpu_image:
|
||||
name: Build docling-serve (with GPU support) container image
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
|
||||
uses: ./.github/workflows/job-image.yml
|
||||
with:
|
||||
publish: false
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cpu
|
||||
platforms: linux/amd64
|
||||
ghcr_image_name: ds4sd/docling-serve
|
||||
build_args: ${{ matrix.spec.build_args }}
|
||||
ghcr_image_name: ${{ matrix.spec.name }}
|
||||
quay_image_name: ""
|
||||
platforms: ${{ matrix.spec.platforms }}
|
||||
|
||||
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
@@ -8,7 +8,7 @@ on:
|
||||
|
||||
jobs:
|
||||
code-checks:
|
||||
# if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'DS4SD/docling-serve' && github.event.pull_request.head.repo.full_name != 'ds4sd/docling-serve') }}
|
||||
# if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling-serve' && github.event.pull_request.head.repo.full_name != 'docling-project/docling-serve') }}
|
||||
uses: ./.github/workflows/job-checks.yml
|
||||
permissions:
|
||||
packages: write
|
||||
|
||||
61
.github/workflows/images.yml
vendored
61
.github/workflows/images.yml
vendored
@@ -4,24 +4,32 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
tags:
|
||||
- 'v*'
|
||||
|
||||
# env:
|
||||
# GHCR_REGISTRY: ghcr.io
|
||||
# # GHCR_DOCLING_SERVE_CPU_IMAGE_NAME: ds4sd/docling-serve-cpu
|
||||
# # GHCR_DOCLING_SERVE_GPU_IMAGE_NAME: ds4sd/docling-serve
|
||||
# QUAY_REGISTRY: quay.io
|
||||
# # QUAY_DOCLING_SERVE_CPU_IMAGE_NAME: ds4sd/docling-serve-cpu
|
||||
# # QUAY_DOCLING_SERVE_GPU_IMAGE_NAME: ds4sd/docling-serve
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build_and_publish_cpu_images:
|
||||
name: Push docling-serve "CPU only" container image to GHCR and QUAY
|
||||
build_and_publish_images:
|
||||
name: Build and push ${{ matrix.spec.name }} container image to GHCR and QUAY
|
||||
strategy:
|
||||
matrix:
|
||||
spec:
|
||||
- name: docling-project/docling-serve
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra cpu
|
||||
platforms: linux/amd64, linux/arm64
|
||||
- name: docling-project/docling-serve-cpu
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124
|
||||
platforms: linux/amd64, linux/arm64
|
||||
- name: docling-project/docling-serve-cu124
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cpu
|
||||
platforms: linux/amd64
|
||||
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
@@ -33,28 +41,7 @@ jobs:
|
||||
with:
|
||||
publish: true
|
||||
environment: registry-creds
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cu124
|
||||
ghcr_image_name: ds4sd/docling-serve-cpu
|
||||
quay_image_name: ds4sd/docling-serve-cpu
|
||||
|
||||
|
||||
build_and_publish_gpu_images:
|
||||
name: Push docling-serve (with GPU support) container image to GHCR and QUAY
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
attestations: write
|
||||
id-token: write
|
||||
secrets: inherit
|
||||
|
||||
uses: ./.github/workflows/job-image.yml
|
||||
with:
|
||||
publish: true
|
||||
environment: registry-creds
|
||||
build_args: |
|
||||
UV_SYNC_EXTRA_ARGS=--no-extra cpu
|
||||
platforms: linux/amd64
|
||||
ghcr_image_name: ds4sd/docling-serve
|
||||
quay_image_name: ds4sd/docling-serve
|
||||
|
||||
build_args: ${{ matrix.spec.build_args }}
|
||||
ghcr_image_name: ${{ matrix.spec.name }}
|
||||
quay_image_name: ${{ matrix.spec.name }}
|
||||
platforms: ${{ matrix.spec.platforms }}
|
||||
|
||||
29
.github/workflows/job-build.yml
vendored
Normal file
29
.github/workflows/job-build.yml
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
name: Run checks
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
|
||||
jobs:
|
||||
build-package:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ['3.12']
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install uv and set the python version
|
||||
uses: astral-sh/setup-uv@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
enable-cache: true
|
||||
- name: Install dependencies
|
||||
run: uv sync --all-extras --no-extra cu124
|
||||
- name: Build package
|
||||
run: uv build
|
||||
- name: Check content of wheel
|
||||
run: unzip -l dist/*.whl
|
||||
- name: Store the distribution packages
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: python-package-distributions
|
||||
path: dist/
|
||||
39
.github/workflows/job-checks.yml
vendored
39
.github/workflows/job-checks.yml
vendored
@@ -16,10 +16,45 @@ jobs:
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
enable-cache: true
|
||||
|
||||
- name: pre-commit cache key
|
||||
run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> "$GITHUB_ENV"
|
||||
- uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pre-commit
|
||||
key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: uv sync --all-extras --no-extra cu124
|
||||
run: uv sync --frozen --all-extras --no-extra cu124
|
||||
|
||||
- name: Run styling check
|
||||
run: uv run --no-sync pre-commit run --all-files
|
||||
run: pre-commit run --all-files
|
||||
|
||||
build-package:
|
||||
uses: ./.github/workflows/job-build.yml
|
||||
|
||||
test-package:
|
||||
needs:
|
||||
- build-package
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ['3.12']
|
||||
steps:
|
||||
- name: Download all the dists
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: python-package-distributions
|
||||
path: dist/
|
||||
- name: Install uv and set the python version
|
||||
uses: astral-sh/setup-uv@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
enable-cache: true
|
||||
- name: Install package
|
||||
run: uv pip install dist/*.whl
|
||||
- name: Create the server
|
||||
run: python -c 'from docling_serve.app import create_app; create_app()'
|
||||
|
||||
markdown-lint:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
8
.github/workflows/job-image.yml
vendored
8
.github/workflows/job-image.yml
vendored
@@ -28,11 +28,7 @@ on:
|
||||
|
||||
env:
|
||||
GHCR_REGISTRY: ghcr.io
|
||||
# GHCR_DOCLING_SERVE_CPU_IMAGE_NAME: ds4sd/docling-serve-cpu
|
||||
# GHCR_DOCLING_SERVE_GPU_IMAGE_NAME: ds4sd/docling-serve
|
||||
QUAY_REGISTRY: quay.io
|
||||
# QUAY_DOCLING_SERVE_CPU_IMAGE_NAME: ds4sd/docling-serve-cpu
|
||||
# QUAY_DOCLING_SERVE_GPU_IMAGE_NAME: ds4sd/docling-serve
|
||||
|
||||
jobs:
|
||||
image:
|
||||
@@ -135,6 +131,10 @@ jobs:
|
||||
cache-to: type=gha,mode=max
|
||||
file: Containerfile
|
||||
build-args: ${{ inputs.build_args }}
|
||||
|
||||
# - name: Inspect the image details
|
||||
# run: |
|
||||
# echo "${{ steps.ghcr_push.outputs.metadata }}"
|
||||
|
||||
- name: Remove Local Docker Images
|
||||
run: |
|
||||
|
||||
18
.github/workflows/pypi.yml
vendored
18
.github/workflows/pypi.yml
vendored
@@ -8,7 +8,13 @@ permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
|
||||
build-package:
|
||||
uses: ./.github/workflows/job-build.yml
|
||||
|
||||
build-and-publish:
|
||||
needs:
|
||||
- build-package
|
||||
runs-on: ubuntu-latest
|
||||
environment:
|
||||
name: pypi
|
||||
@@ -16,15 +22,11 @@ jobs:
|
||||
permissions:
|
||||
id-token: write # IMPORTANT: mandatory for trusted publishing
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install uv and set the python version
|
||||
uses: astral-sh/setup-uv@v5
|
||||
- name: Download all the dists
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
enable-cache: true
|
||||
- name: Install dependencies
|
||||
run: uv sync --all-extras --no-extra cu124
|
||||
- name: Build
|
||||
run: uv build
|
||||
name: python-package-distributions
|
||||
path: dist/
|
||||
- name: Publish distribution 📦 to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
|
||||
@@ -3,6 +3,8 @@ config:
|
||||
no-emphasis-as-header: false
|
||||
first-line-heading: false
|
||||
MD033:
|
||||
allowed_elements: ["details", "summary"]
|
||||
allowed_elements: ["details", "summary", "br", "a", "p", "img"]
|
||||
MD024:
|
||||
siblings_only: true
|
||||
globs:
|
||||
- "**/*.md"
|
||||
|
||||
@@ -1,5 +1,18 @@
|
||||
fail_fast: true
|
||||
repos:
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.9.6
|
||||
hooks:
|
||||
# Run the Ruff formatter.
|
||||
- id: ruff-format
|
||||
name: "Ruff formatter"
|
||||
args: [--config=pyproject.toml]
|
||||
files: '^(docling_serve|tests).*\.(py|ipynb)$'
|
||||
# Run the Ruff linter.
|
||||
- id: ruff
|
||||
name: "Ruff linter"
|
||||
args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
|
||||
files: '^(docling_serve|tests).*\.(py|ipynb)$'
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: system
|
||||
@@ -13,12 +26,3 @@ repos:
|
||||
rev: 0.6.1
|
||||
hooks:
|
||||
- id: uv-lock
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.9.6
|
||||
hooks:
|
||||
# Run the Ruff linter.
|
||||
- id: ruff
|
||||
args: [--exit-non-zero-on-fix, --config=pyproject.toml]
|
||||
# Run the Ruff formatter.
|
||||
# - id: ruff-format
|
||||
# args: [--config=pyproject.toml]
|
||||
|
||||
65
CHANGELOG.md
65
CHANGELOG.md
@@ -1,11 +1,66 @@
|
||||
## [v0.3.0](https://github.com/DS4SD/docling-serve/releases/tag/v0.3.0) - 2025-02-19
|
||||
## [v0.7.0](https://github.com/docling-project/docling-serve/releases/tag/v0.7.0) - 2025-03-31
|
||||
|
||||
### Feature
|
||||
|
||||
* Add new docling-serve cli ([#50](https://github.com/DS4SD/docling-serve/issues/50)) ([`ec33a61`](https://github.com/DS4SD/docling-serve/commit/ec33a61faa7846b9b7998fbf557ebe39a3b800f6))
|
||||
* Expose TLS settings and example deploy with oauth-proxy ([#112](https://github.com/docling-project/docling-serve/issues/112)) ([`7a0faba`](https://github.com/docling-project/docling-serve/commit/7a0fabae07020c2659dbb22c3b0359909051a74c))
|
||||
* Offline static files ([#109](https://github.com/docling-project/docling-serve/issues/109)) ([`68772bb`](https://github.com/docling-project/docling-serve/commit/68772bb6f0a87b71094a08ff851f5754c6ca6163))
|
||||
* Update to Docling 2.28 ([#106](https://github.com/docling-project/docling-serve/issues/106)) ([`20ec87a`](https://github.com/docling-project/docling-serve/commit/20ec87a63a99145bc0ad7931549af8a0c30db641))
|
||||
|
||||
### Fix
|
||||
|
||||
* Set DOCLING_SERVE_ARTIFACTS_PATH in images ([#53](https://github.com/DS4SD/docling-serve/issues/53)) ([`4877248`](https://github.com/DS4SD/docling-serve/commit/487724836896576ca4f98e84abf15fd1c383bec8))
|
||||
* Set root UI path when behind proxy ([#38](https://github.com/DS4SD/docling-serve/issues/38)) ([`c64a450`](https://github.com/DS4SD/docling-serve/commit/c64a450bf9ba9947ab180e92bef2763ff710b210))
|
||||
* Support python 3.13 and docling updates and switch to uv ([#48](https://github.com/DS4SD/docling-serve/issues/48)) ([`ae3b490`](https://github.com/DS4SD/docling-serve/commit/ae3b4906f1c0829b1331ea491f3518741cabff71))
|
||||
* Move ARGs to prevent cache invalidation ([#104](https://github.com/docling-project/docling-serve/issues/104)) ([`e30f458`](https://github.com/docling-project/docling-serve/commit/e30f458923d34c169db7d5a5c296848716e8cac4))
|
||||
|
||||
## [v0.6.0](https://github.com/docling-project/docling-serve/releases/tag/v0.6.0) - 2025-03-17
|
||||
|
||||
### Feature
|
||||
|
||||
* Expose options for new features ([#92](https://github.com/docling-project/docling-serve/issues/92)) ([`ec57b52`](https://github.com/docling-project/docling-serve/commit/ec57b528ed3f8e7b9604ff4cdf06da3d52c714dd))
|
||||
|
||||
### Fix
|
||||
|
||||
* Allow changes in CORS settings ([#100](https://github.com/docling-project/docling-serve/issues/100)) ([`422c402`](https://github.com/docling-project/docling-serve/commit/422c402bab7f05e46274ede11f234a19a62e093e))
|
||||
* Avoid exploding options cache using lru and expose size parameter ([#101](https://github.com/docling-project/docling-serve/issues/101)) ([`ea09028`](https://github.com/docling-project/docling-serve/commit/ea090288d3eec4ea8fbdcd32a6a497a99c89189d))
|
||||
* Increase timeout_keep_alive and allow parameter changes ([#98](https://github.com/docling-project/docling-serve/issues/98)) ([`07c48ed`](https://github.com/docling-project/docling-serve/commit/07c48edd5d9437219d9623e3d05bc5166c5bb85a))
|
||||
* Add warning when using incompatible parameters ([#99](https://github.com/docling-project/docling-serve/issues/99)) ([`a212547`](https://github.com/docling-project/docling-serve/commit/a212547d28d6588c65e52000dc7bc04f3f77e69e))
|
||||
* **ui:** Use --port parameter and avoid failing when image is not found ([#97](https://github.com/docling-project/docling-serve/issues/97)) ([`c76daac`](https://github.com/docling-project/docling-serve/commit/c76daac70c87da412f791666881e48b74688b060))
|
||||
|
||||
### Documentation
|
||||
|
||||
* Simplify README and move details to docs ([#102](https://github.com/docling-project/docling-serve/issues/102)) ([`fd8e40a`](https://github.com/docling-project/docling-serve/commit/fd8e40a00849771263d9b75b9a56f6caeccb8517))
|
||||
|
||||
## [v0.5.1](https://github.com/docling-project/docling-serve/releases/tag/v0.5.1) - 2025-03-10
|
||||
|
||||
### Fix
|
||||
|
||||
* Submodules in wheels ([#85](https://github.com/docling-project/docling-serve/issues/85)) ([`a92ad48`](https://github.com/docling-project/docling-serve/commit/a92ad48b287bfcb134011dc0fc3f91ee04e067ee))
|
||||
|
||||
## [v0.5.0](https://github.com/docling-project/docling-serve/releases/tag/v0.5.0) - 2025-03-07
|
||||
|
||||
### Feature
|
||||
|
||||
* Async api ([#60](https://github.com/docling-project/docling-serve/issues/60)) ([`82f8900`](https://github.com/docling-project/docling-serve/commit/82f890019745859699c1b01f9ccfb64cb7e37906))
|
||||
* Display version in fastapi docs ([#78](https://github.com/docling-project/docling-serve/issues/78)) ([`ed851c9`](https://github.com/docling-project/docling-serve/commit/ed851c95fee5f59305ddc3dcd5c09efce618470b))
|
||||
|
||||
### Fix
|
||||
|
||||
* Remove uv from image, merge ARG and ENV declarations ([#57](https://github.com/docling-project/docling-serve/issues/57)) ([`c95db36`](https://github.com/docling-project/docling-serve/commit/c95db3643807a4dfb96d93c8e10d6eb486c49a30))
|
||||
* **docs:** Remove comma in convert/source curl example ([#73](https://github.com/docling-project/docling-serve/issues/73)) ([`05df073`](https://github.com/docling-project/docling-serve/commit/05df0735d35a589bdc2a11fcdd764a10f700cb6f))
|
||||
|
||||
## [v0.4.0](https://github.com/docling-project/docling-serve/releases/tag/v0.4.0) - 2025-02-26
|
||||
|
||||
### Feature
|
||||
|
||||
* New container images ([#68](https://github.com/docling-project/docling-serve/issues/68)) ([`7e6d9cd`](https://github.com/docling-project/docling-serve/commit/7e6d9cdef398df70a5b4d626aeb523c428c10d56))
|
||||
* Render DoclingDocument with npm docling-components in the example UI ([#65](https://github.com/docling-project/docling-serve/issues/65)) ([`c430d9b`](https://github.com/docling-project/docling-serve/commit/c430d9b1a162ab29104d86ebaa1ac5a5488b1f09))
|
||||
|
||||
## [v0.3.0](https://github.com/docling-project/docling-serve/releases/tag/v0.3.0) - 2025-02-19
|
||||
|
||||
### Feature
|
||||
|
||||
* Add new docling-serve cli ([#50](https://github.com/docling-project/docling-serve/issues/50)) ([`ec33a61`](https://github.com/docling-project/docling-serve/commit/ec33a61faa7846b9b7998fbf557ebe39a3b800f6))
|
||||
|
||||
### Fix
|
||||
|
||||
* Set DOCLING_SERVE_ARTIFACTS_PATH in images ([#53](https://github.com/docling-project/docling-serve/issues/53)) ([`4877248`](https://github.com/docling-project/docling-serve/commit/487724836896576ca4f98e84abf15fd1c383bec8))
|
||||
* Set root UI path when behind proxy ([#38](https://github.com/docling-project/docling-serve/issues/38)) ([`c64a450`](https://github.com/docling-project/docling-serve/commit/c64a450bf9ba9947ab180e92bef2763ff710b210))
|
||||
* Support python 3.13 and docling updates and switch to uv ([#48](https://github.com/docling-project/docling-serve/issues/48)) ([`ae3b490`](https://github.com/docling-project/docling-serve/commit/ae3b4906f1c0829b1331ea491f3518741cabff71))
|
||||
|
||||
@@ -3,13 +3,13 @@
|
||||
Our project welcomes external contributions. If you have an itch, please feel
|
||||
free to scratch it.
|
||||
|
||||
To contribute code or documentation, please submit a [pull request](https://github.com/DS4SD/docling-serve/pulls).
|
||||
To contribute code or documentation, please submit a [pull request](https://github.com/docling-project/docling-serve/pulls).
|
||||
|
||||
A good way to familiarize yourself with the codebase and contribution process is
|
||||
to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/DS4SD/docling-serve/issues).
|
||||
to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/docling-project/docling-serve/issues).
|
||||
Before embarking on a more ambitious contribution, please quickly [get in touch](#communication) with us.
|
||||
|
||||
For general questions or support requests, please refer to the [discussion section](https://github.com/DS4SD/docling-serve/discussions).
|
||||
For general questions or support requests, please refer to the [discussion section](https://github.com/docling-project/docling-serve/discussions).
|
||||
|
||||
**Note: We appreciate your effort, and want to avoid a situation where a contribution
|
||||
requires extensive rework (by you or by us), sits in backlog for a long time, or
|
||||
@@ -17,14 +17,14 @@ cannot be accepted at all!**
|
||||
|
||||
### Proposing new features
|
||||
|
||||
If you would like to implement a new feature, please [raise an issue](https://github.com/DS4SD/docling-serve/issues)
|
||||
If you would like to implement a new feature, please [raise an issue](https://github.com/docling-project/docling-serve/issues)
|
||||
before sending a pull request so the feature can be discussed. This is to avoid
|
||||
you wasting your valuable time working on a feature that the project developers
|
||||
are not interested in accepting into the code base.
|
||||
|
||||
### Fixing bugs
|
||||
|
||||
If you would like to fix a bug, please [raise an issue](https://github.com/DS4SD/docling-serve/issues) before sending a
|
||||
If you would like to fix a bug, please [raise an issue](https://github.com/docling-project/docling-serve/issues) before sending a
|
||||
pull request so it can be tracked.
|
||||
|
||||
### Merge approval
|
||||
@@ -73,7 +73,7 @@ git commit -s
|
||||
|
||||
## Communication
|
||||
|
||||
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling-serve/discussions).
|
||||
Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling-serve/discussions).
|
||||
|
||||
## Developing
|
||||
|
||||
@@ -142,8 +142,7 @@ poetry add NAME
|
||||
|
||||
We use the following tools to enforce code style:
|
||||
|
||||
- iSort, to sort imports
|
||||
- Black, to format code
|
||||
- ruff, to sort imports and format code
|
||||
|
||||
We run a series of checks on the code base on every commit, using `pre-commit`. To install the hooks, run:
|
||||
|
||||
@@ -157,4 +156,4 @@ To run the checks on-demand, run:
|
||||
pre-commit run --all-files
|
||||
```
|
||||
|
||||
Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by their Hooks. In these cases, `git add` the modified files and `git commit` again.
|
||||
Note: Formatting checks like `ruff` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by their Hooks. In these cases, `git add` the modified files and `git commit` again.
|
||||
|
||||
@@ -2,9 +2,6 @@ ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s
|
||||
|
||||
FROM ${BASE_IMAGE}
|
||||
|
||||
ARG MODELS_LIST="layout tableformer picture_classifier easyocr"
|
||||
ARG UV_SYNC_EXTRA_ARGS=""
|
||||
|
||||
USER 0
|
||||
|
||||
###################################################################################################
|
||||
@@ -22,8 +19,6 @@ RUN --mount=type=bind,source=os-packages.txt,target=/tmp/os-packages.txt \
|
||||
|
||||
ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
||||
|
||||
COPY --from=ghcr.io/astral-sh/uv:0.6.1 /uv /uvx /bin/
|
||||
|
||||
###################################################################################################
|
||||
# Docling layer #
|
||||
###################################################################################################
|
||||
@@ -32,30 +27,40 @@ USER 1001
|
||||
|
||||
WORKDIR /opt/app-root/src
|
||||
|
||||
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
||||
ENV OMP_NUM_THREADS=4
|
||||
ENV \
|
||||
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
||||
OMP_NUM_THREADS=4 \
|
||||
LANG=en_US.UTF-8 \
|
||||
LC_ALL=en_US.UTF-8 \
|
||||
PYTHONIOENCODING=utf-8 \
|
||||
UV_COMPILE_BYTECODE=1 \
|
||||
UV_LINK_MODE=copy \
|
||||
UV_PROJECT_ENVIRONMENT=/opt/app-root \
|
||||
DOCLING_SERVE_ARTIFACTS_PATH=/opt/app-root/src/.cache/docling/models
|
||||
|
||||
ENV LANG=en_US.UTF-8
|
||||
ENV LC_ALL=en_US.UTF-8
|
||||
ENV PYTHONIOENCODING=utf-8
|
||||
ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy
|
||||
ENV UV_PROJECT_ENVIRONMENT=/opt/app-root
|
||||
ARG UV_SYNC_EXTRA_ARGS=""
|
||||
|
||||
ENV DOCLING_SERVE_ARTIFACTS_PATH=/opt/app-root/src/.cache/docling/models
|
||||
RUN --mount=from=ghcr.io/astral-sh/uv:0.6.1,source=/uv,target=/bin/uv \
|
||||
--mount=type=cache,target=/opt/app-root/src/.cache/uv,uid=1001 \
|
||||
--mount=type=bind,source=uv.lock,target=uv.lock \
|
||||
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
|
||||
uv sync --frozen --no-install-project --no-dev --all-extras ${UV_SYNC_EXTRA_ARGS}
|
||||
|
||||
COPY --chown=1001:0 pyproject.toml uv.lock README.md ./
|
||||
|
||||
RUN --mount=type=cache,target=/opt/app-root/src/.cache/uv,uid=1001 \
|
||||
uv sync --frozen --no-install-project --no-dev --all-extras ${UV_SYNC_EXTRA_ARGS} # --no-extra ${NO_EXTRA}
|
||||
ARG MODELS_LIST="layout tableformer picture_classifier easyocr"
|
||||
|
||||
RUN echo "Downloading models..." && \
|
||||
HF_HUB_DOWNLOAD_TIMEOUT="90" \
|
||||
HF_HUB_ETAG_TIMEOUT="90" \
|
||||
docling-tools models download -o "${DOCLING_SERVE_ARTIFACTS_PATH}" ${MODELS_LIST} && \
|
||||
chown -R 1001:0 /opt/app-root/src/.cache && \
|
||||
chmod -R g=u /opt/app-root/src/.cache
|
||||
|
||||
COPY --chown=1001:0 --chmod=664 ./docling_serve ./docling_serve
|
||||
RUN --mount=type=cache,target=/opt/app-root/src/.cache/uv,uid=1001 \
|
||||
uv sync --frozen --no-dev --all-extras ${UV_SYNC_EXTRA_ARGS} # --no-extra ${NO_EXTRA}
|
||||
COPY --chown=1001:0 ./docling_serve ./docling_serve
|
||||
RUN --mount=from=ghcr.io/astral-sh/uv:0.6.1,source=/uv,target=/bin/uv \
|
||||
--mount=type=cache,target=/opt/app-root/src/.cache/uv,uid=1001 \
|
||||
--mount=type=bind,source=uv.lock,target=uv.lock \
|
||||
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
|
||||
uv sync --frozen --no-dev --all-extras ${UV_SYNC_EXTRA_ARGS}
|
||||
|
||||
EXPOSE 5001
|
||||
|
||||
|
||||
31
Makefile
31
Makefile
@@ -24,19 +24,26 @@ action-lint-file:
|
||||
md-lint-file:
|
||||
$(CMD_PREFIX) touch .markdown-lint
|
||||
|
||||
.PHONY: docling-serve-image
|
||||
docling-serve-image: Containerfile
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve]"
|
||||
$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cu124 --no-extra cpu" -f Containerfile -t ghcr.io/docling-project/docling-serve:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve:$(TAG) ghcr.io/docling-project/docling-serve:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve:$(TAG) quay.io/docling-project/docling-serve:main
|
||||
|
||||
.PHONY: docling-serve-cpu-image
|
||||
docling-serve-cpu-image: Containerfile ## Build docling-serve "cpu only" container image
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve CPU]"
|
||||
$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cu124" -f Containerfile -t ghcr.io/ds4sd/docling-serve-cpu:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cpu:$(TAG) ghcr.io/ds4sd/docling-serve-cpu:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cpu:$(TAG) quay.io/ds4sd/docling-serve-cpu:main
|
||||
$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cu124" -f Containerfile -t ghcr.io/docling-project/docling-serve-cpu:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cpu:$(TAG) ghcr.io/docling-project/docling-serve-cpu:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cpu:$(TAG) quay.io/docling-project/docling-serve-cpu:main
|
||||
|
||||
.PHONY: docling-serve-gpu-image
|
||||
docling-serve-gpu-image: Containerfile ## Build docling-serve container image with GPU support
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve with GPU]"
|
||||
$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cpu" -f Containerfile --platform linux/amd64 -t ghcr.io/ds4sd/docling-serve:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve:$(TAG) ghcr.io/ds4sd/docling-serve:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve:$(TAG) quay.io/ds4sd/docling-serve:main
|
||||
.PHONY: docling-serve-cu124-image
|
||||
docling-serve-cu124-image: Containerfile ## Build docling-serve container image with GPU support
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve with Cuda 12.4]"
|
||||
$(CMD_PREFIX) docker build --load --build-arg "UV_SYNC_EXTRA_ARGS=--no-extra cpu" -f Containerfile --platform linux/amd64 -t ghcr.io/docling-project/docling-serve-cu124:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cu124:$(TAG) ghcr.io/docling-project/docling-serve-cu124:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/docling-project/docling-serve-cu124:$(TAG) quay.io/docling-project/docling-serve-cu124:main
|
||||
|
||||
.PHONY: action-lint
|
||||
action-lint: .action-lint ## Lint GitHub Action workflows
|
||||
@@ -59,7 +66,7 @@ action-lint: .action-lint ## Lint GitHub Action workflows
|
||||
md-lint: .md-lint ## Lint markdown files
|
||||
.md-lint: $(wildcard */**/*.md) | md-lint-file
|
||||
$(ECHO_PREFIX) printf " %-12s ./...\n" "[MD LINT]"
|
||||
$(CMD_PREFIX) docker run --rm -v $$(pwd):/workdir davidanson/markdownlint-cli2:v0.14.0 "**/*.md"
|
||||
$(CMD_PREFIX) docker run --rm -v $$(pwd):/workdir davidanson/markdownlint-cli2:v0.16.0 "**/*.md" "#.venv"
|
||||
$(CMD_PREFIX) touch $@
|
||||
|
||||
.PHONY: py-Lint
|
||||
@@ -77,11 +84,11 @@ run-docling-cpu: ## Run the docling-serve container with CPU support and assign
|
||||
$(ECHO_PREFIX) printf " %-12s Removing existing container if it exists...\n" "[CLEANUP]"
|
||||
$(CMD_PREFIX) docker rm -f docling-serve-cpu 2>/dev/null || true
|
||||
$(ECHO_PREFIX) printf " %-12s Running docling-serve container with CPU support on port 5001...\n" "[RUN CPU]"
|
||||
$(CMD_PREFIX) docker run -it --name docling-serve-cpu -p 5001:5001 ghcr.io/ds4sd/docling-serve-cpu:main
|
||||
$(CMD_PREFIX) docker run -it --name docling-serve-cpu -p 5001:5001 ghcr.io/docling-project/docling-serve-cpu:main
|
||||
|
||||
.PHONY: run-docling-gpu
|
||||
run-docling-gpu: ## Run the docling-serve container with GPU support and assign a container name
|
||||
$(ECHO_PREFIX) printf " %-12s Removing existing container if it exists...\n" "[CLEANUP]"
|
||||
$(CMD_PREFIX) docker rm -f docling-serve-gpu 2>/dev/null || true
|
||||
$(ECHO_PREFIX) printf " %-12s Running docling-serve container with GPU support on port 5001...\n" "[RUN GPU]"
|
||||
$(CMD_PREFIX) docker run -it --name docling-serve-gpu -p 5001:5001 ghcr.io/ds4sd/docling-serve:main
|
||||
$(CMD_PREFIX) docker run -it --name docling-serve-gpu -p 5001:5001 ghcr.io/docling-project/docling-serve:main
|
||||
|
||||
440
README.md
440
README.md
@@ -1,420 +1,84 @@
|
||||
<p align="center">
|
||||
<a href="https://github.com/docling-project/docling-serve">
|
||||
<img loading="lazy" alt="Docling" src="https://github.com/docling-project/docling-serve/raw/main/docs/assets/docling-serve-pic.png" width="30%"/>
|
||||
</a>
|
||||
</p>
|
||||
|
||||
# Docling Serve
|
||||
|
||||
Running [Docling](https://github.com/DS4SD/docling) as an API service.
|
||||
Running [Docling](https://github.com/docling-project/docling) as an API service.
|
||||
|
||||
## Usage
|
||||
## Getting started
|
||||
|
||||
The API provides two endpoints: one for urls, one for files. This is necessary to send files directly in binary format instead of base64-encoded strings.
|
||||
Install the `docling-serve` package and run the server.
|
||||
|
||||
### Common parameters
|
||||
```bash
|
||||
# Using the python package
|
||||
pip install "docling-serve"
|
||||
docling-serve run
|
||||
|
||||
In addition to the file source (see below), both endpoints support the same parameters, which are almost the same as those of the Docling CLI.
|
||||
|
||||
- `from_formats` (List[str]): Input format(s) to convert from. Allowed values: `docx`, `pptx`, `html`, `image`, `pdf`, `asciidoc`, `md`. Defaults to all formats.
|
||||
- `to_formats` (List[str]): Output format(s) to convert to. Allowed values: `md`, `json`, `html`, `text`, `doctags`. Defaults to `md`.
|
||||
- `do_ocr` (bool): If enabled, the bitmap content will be processed using OCR. Defaults to `True`.
|
||||
- `image_export_mode`: Image export mode for the document (only in case of JSON, Markdown or HTML). Allowed values: embedded, placeholder, referenced. Optional, defaults to `embedded`.
|
||||
- `force_ocr` (bool): If enabled, replace any existing text with OCR-generated text over the full content. Defaults to `False`.
|
||||
- `ocr_engine` (str): OCR engine to use. Allowed values: `easyocr`, `tesseract_cli`, `tesseract`, `rapidocr`, `ocrmac`. Defaults to `easyocr`.
|
||||
- `ocr_lang` (List[str]): List of languages used by the OCR engine. Note that each OCR engine has different values for the language names. Defaults to empty.
|
||||
- `pdf_backend` (str): PDF backend to use. Allowed values: `pypdfium2`, `dlparse_v1`, `dlparse_v2`. Defaults to `dlparse_v2`.
|
||||
- `table_mode` (str): Table mode to use. Allowed values: `fast`, `accurate`. Defaults to `fast`.
|
||||
- `abort_on_error` (bool): If enabled, abort on error. Defaults to false.
|
||||
- `return_as_file` (bool): If enabled, return the output as a file. Defaults to false.
|
||||
- `do_table_structure` (bool): If enabled, the table structure will be extracted. Defaults to true.
|
||||
- `include_images` (bool): If enabled, images will be extracted from the document. Defaults to true.
|
||||
- `images_scale` (float): Scale factor for images. Defaults to 2.0.
|
||||
|
||||
### URL endpoint
|
||||
|
||||
The endpoint is `/v1alpha/convert/source`, listening for POST requests of JSON payloads.
|
||||
|
||||
In addition to the above parameters, you must send the URL(s) of the document you want to process with either the `http_sources` or `file_sources` fields.
|
||||
The first fetches the document(s) from URL(s) (optionally with extra headers); the second allows providing documents as base64-encoded strings.
|
||||
The `options` are not required; they can be partially or completely omitted.
|
||||
|
||||
Simple payload example:
|
||||
|
||||
```json
|
||||
{
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
# Using container images, e.g. with Podman
|
||||
podman run -p 5001:5001 quay.io/docling-project/docling-serve
|
||||
```
|
||||
|
||||
<details>
|
||||
The server is available at
|
||||
|
||||
<summary>Complete payload example:</summary>
|
||||
- API <http://127.0.0.1:5001>
|
||||
- API documentation <http://127.0.0.1:5001/docs>
|
||||

|
||||
|
||||
```json
|
||||
{
|
||||
"options": {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": true,
|
||||
"force_ocr": false,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": false,
|
||||
"return_as_file": false
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
```
|
||||
Try it out with a simple conversion:
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
|
||||
<summary>CURL example:</summary>
|
||||
|
||||
```sh
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://localhost:5001/v1alpha/convert/source' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"options": {
|
||||
"from_formats": [
|
||||
"docx",
|
||||
"pptx",
|
||||
"html",
|
||||
"image",
|
||||
"pdf",
|
||||
"asciidoc",
|
||||
"md",
|
||||
"xlsx"
|
||||
],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": true,
|
||||
"force_ocr": false,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": [
|
||||
"fr",
|
||||
"de",
|
||||
"es",
|
||||
"en"
|
||||
],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": false,
|
||||
"return_as_file": false,
|
||||
"do_table_structure": true,
|
||||
"include_images": true,
|
||||
"images_scale": 2
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}'
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}]
|
||||
}'
|
||||
```
|
||||
|
||||
</details>
|
||||
### Container images
|
||||
|
||||
<details>
|
||||
<summary>Python example:</summary>
|
||||
Available container images:
|
||||
|
||||
```python
|
||||
import httpx
|
||||
| Name | Description | Arch | Size |
|
||||
| -----|-------------|------|------|
|
||||
| [`ghcr.io/docling-project/docling-serve`](https://github.com/docling-project/docling-serve/pkgs/container/docling-serve) <br /> [`quay.io/docling-project/docling-serve`](https://quay.io/repository/docling-project/docling-serve) | Simple image for Docling Serve, installing all packages from the official pypi.org index. | `linux/amd64`, `linux/arm64` | 3.6 GB |
|
||||
| [`ghcr.io/docling-project/docling-serve-cpu`](https://github.com/docling-project/docling-serve/pkgs/container/docling-serve-cpu) <br /> [`quay.io/docling-project/docling-serve-cpu`](https://quay.io/repository/docling-project/docling-serve-cpu) | CPU-only image which installs `torch` from the pytorch cpu index. | `linux/amd64`, `linux/arm64` | 3.6 GB |
|
||||
| [`ghcr.io/docling-project/docling-serve-cu124`](https://github.com/docling-project/docling-serve/pkgs/container/docling-serve-cu124) <br /> [`quay.io/docling-project/docling-serve-cu124`](https://quay.io/repository/docling-project/docling-serve-cu124) | CUDA 12.4 image which installs `torch` from the pytorch cu124 index. | `linux/amd64` | 8.7 GB |
|
||||
|
||||
async_client = httpx.AsyncClient(timeout=60.0)
|
||||
url = "http://localhost:5001/v1alpha/convert/source"
|
||||
payload = {
|
||||
"options": {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
Coming soon: `docling-serve-slim` images will reduce the size by skipping the model weights download.
|
||||
|
||||
response = await async_client.post(url, json=payload)
|
||||
### Demonstration UI
|
||||
|
||||
data = response.json()
|
||||
```bash
|
||||
# Install the Python package with the extra dependencies
|
||||
pip install "docling-serve[ui]"
|
||||
docling-serve run --enable-ui
|
||||
|
||||
# Run the container image with the extra env parameters
|
||||
podman run -p 5001:5001 -e DOCLING_SERVE_ENABLE_UI=true quay.io/docling-project/docling-serve
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### File as base64
|
||||
|
||||
The `file_sources` argument in the endpoint allows sending files as base64-encoded strings.
|
||||
When your PDF or other file type is too large, encoding it and passing it inline to curl
|
||||
can lead to an “Argument list too long” error on some systems. To avoid this, we write
|
||||
the JSON request body to a file and have curl read from that file.
|
||||
|
||||
<details>
|
||||
<summary>CURL steps:</summary>
|
||||
|
||||
```sh
|
||||
# 1. Base64-encode the file
|
||||
B64_DATA=$(base64 -w 0 /path/to/file/pdf-to-convert.pdf)
|
||||
|
||||
# 2. Build the JSON with your options
|
||||
cat <<EOF > /tmp/request_body.json
|
||||
{
|
||||
"options": {
|
||||
},
|
||||
"file_sources": [{
|
||||
"base64_string": "${B64_DATA}",
|
||||
"filename": "pdf-to-convert.pdf"
|
||||
}]
|
||||
}
|
||||
EOF
|
||||
|
||||
# 3. POST the request to the docling service
|
||||
curl -X POST "localhost:5001/v1alpha/convert/source" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d @/tmp/request_body.json
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### File endpoint
|
||||
|
||||
The endpoint is: `/v1alpha/convert/file`, listening for POST requests of Form payloads (necessary as the files are sent as multipart/form data). You can send one or multiple files.
|
||||
|
||||
<details>
|
||||
<summary>CURL example:</summary>
|
||||
|
||||
```sh
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:5001/v1alpha/convert/file' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: multipart/form-data' \
|
||||
-F 'ocr_engine=easyocr' \
|
||||
-F 'pdf_backend=dlparse_v2' \
|
||||
-F 'from_formats=pdf' \
|
||||
-F 'from_formats=docx' \
|
||||
-F 'force_ocr=false' \
|
||||
-F 'image_export_mode=embedded' \
|
||||
-F 'ocr_lang=en' \
|
||||
-F 'ocr_lang=pl' \
|
||||
-F 'table_mode=fast' \
|
||||
-F 'files=@2206.01062v1.pdf;type=application/pdf' \
|
||||
-F 'abort_on_error=false' \
|
||||
-F 'to_formats=md' \
|
||||
-F 'to_formats=text' \
|
||||
-F 'return_as_file=false' \
|
||||
-F 'do_ocr=true'
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>Python example:</summary>
|
||||
|
||||
```python
|
||||
import httpx
|
||||
|
||||
async_client = httpx.AsyncClient(timeout=60.0)
|
||||
url = "http://localhost:5001/v1alpha/convert/file"
|
||||
parameters = {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False
|
||||
}
|
||||
|
||||
current_dir = os.path.dirname(__file__)
|
||||
file_path = os.path.join(current_dir, '2206.01062v1.pdf')
|
||||
|
||||
files = {
|
||||
'files': ('2206.01062v1.pdf', open(file_path, 'rb'), 'application/pdf'),
|
||||
}
|
||||
|
||||
response = await async_client.post(url, files=files, data={"parameters": json.dumps(parameters)})
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
data = response.json()
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### Response format
|
||||
|
||||
The response can be a JSON Document or a File.
|
||||
|
||||
- If you process only one file, the response will be a JSON document with the following format:
|
||||
|
||||
```jsonc
|
||||
{
|
||||
"document": {
|
||||
"md_content": "",
|
||||
"json_content": {},
|
||||
"html_content": "",
|
||||
"text_content": "",
|
||||
"doctags_content": ""
|
||||
},
|
||||
"status": "<success|partial_success|skipped|failure>",
|
||||
"processing_time": 0.0,
|
||||
"timings": {},
|
||||
"errors": []
|
||||
}
|
||||
```
|
||||
|
||||
Depending on the value you set in `to_formats`, the different items will be populated with their respective results or empty.
|
||||
|
||||
`processing_time` is the Docling processing time in seconds, and `timings` (when enabled in the backend) provides the detailed
|
||||
timing of all the internal Docling components.
|
||||
|
||||
- If you set the parameter `return_as_file` to True, the response will be a zip file.
|
||||
- If multiple files are generated (multiple inputs, or one input but multiple outputs with `return_as_file` True), the response will be a zip file.
|
||||
|
||||
## Helpers
|
||||
|
||||
- A full Swagger UI is available at the `/docs` endpoint.
|
||||
|
||||

|
||||
|
||||
- An easy to use UI is available at the `/ui` endpoint.
|
||||
An easy to use UI is available at the `/ui` endpoint.
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
## Development
|
||||
## Documentation and advanced usage
|
||||
|
||||
### CPU only
|
||||
|
||||
```sh
|
||||
# Install uv if not already available
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
# Install dependencies
|
||||
uv sync --extra cpu
|
||||
```
|
||||
|
||||
### Cuda GPU
|
||||
|
||||
For GPU support use the following command:
|
||||
|
||||
```sh
|
||||
# Install dependencies
|
||||
uv sync
|
||||
```
|
||||
|
||||
### Gradio UI and different OCR backends
|
||||
|
||||
`/ui` endpoint using `gradio` and different OCR backends can be enabled via package extras:
|
||||
|
||||
```sh
|
||||
# Enable ui and rapidocr
|
||||
uv sync --extra ui --extra rapidocr
|
||||
```
|
||||
|
||||
```sh
|
||||
# Enable tesserocr
|
||||
uv sync --extra tesserocr
|
||||
```
|
||||
|
||||
See `[project.optional-dependencies]` section in `pyproject.toml` for full list of options.
|
||||
|
||||
### Run the server
|
||||
|
||||
The `docling-serve` executable is a convenient script for launching the webserver both in
|
||||
development and production mode.
|
||||
|
||||
```sh
|
||||
# Run the server in development mode
|
||||
# - reload is enabled by default
|
||||
# - listening on the 127.0.0.1 address
|
||||
# - ui is enabled by default
|
||||
docling-serve dev
|
||||
|
||||
# Run the server in production mode
|
||||
# - reload is disabled by default
|
||||
# - listening on the 0.0.0.0 address
|
||||
# - ui is disabled by default
|
||||
docling-serve run
|
||||
```
|
||||
|
||||
### Options
|
||||
|
||||
The `docling-serve` executable is controlled with both command line
|
||||
options and environment variables.
|
||||
|
||||
<details>
|
||||
<summary>`docling-serve` help message</summary>
|
||||
|
||||
```sh
|
||||
$ docling-serve dev --help
|
||||
|
||||
Usage: docling-serve dev [OPTIONS]
|
||||
|
||||
Run a Docling Serve app in development mode. 🧪
|
||||
This is equivalent to docling-serve run but with reload
|
||||
enabled and listening on the 127.0.0.1 address.
|
||||
|
||||
Options can be set also with the corresponding ENV variable, with the exception
|
||||
of --enable-ui, --host and --reload.
|
||||
|
||||
╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ --host TEXT The host to serve on. For local development in localhost │
|
||||
│ use 127.0.0.1. To enable public access, e.g. in a │
|
||||
│ container, use all the IP addresses available with │
|
||||
│ 0.0.0.0. │
|
||||
│ [default: 127.0.0.1] │
|
||||
│ --port INTEGER The port to serve on. [default: 5001] │
|
||||
│ --reload --no-reload Enable auto-reload of the server when (code) files │
|
||||
│ change. This is resource intensive, use it only during │
|
||||
│ development. │
|
||||
│ [default: reload] │
|
||||
│ --root-path TEXT The root path is used to tell your app that it is being │
|
||||
│ served to the outside world with some path prefix set up │
|
||||
│ in some termination proxy or similar. │
|
||||
│ --proxy-headers --no-proxy-headers Enable/Disable X-Forwarded-Proto, X-Forwarded-For, │
|
||||
│ X-Forwarded-Port to populate remote address info. │
|
||||
│ [default: proxy-headers] │
|
||||
│ --artifacts-path PATH If set to a valid directory, the model weights will be │
|
||||
│ loaded from this path. │
|
||||
│ [default: None] │
|
||||
│ --enable-ui --no-enable-ui Enable the development UI. [default: enable-ui] │
|
||||
│ --help Show this message and exit. │
|
||||
╰────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### Environment variables
|
||||
|
||||
The environment variables controlling the `uvicorn` execution can be specified with the `UVICORN_` prefix:
|
||||
|
||||
- `UVICORN_WORKERS`: Number of workers to use.
|
||||
- `UVICORN_RELOAD`: If `True`, this will enable auto-reload when you modify files, useful for development.
|
||||
|
||||
The environment variables controlling specifics of the Docling Serve app can be specified with the
|
||||
`DOCLING_SERVE_` prefix:
|
||||
|
||||
- `DOCLING_SERVE_ARTIFACTS_PATH`: if set Docling will use only the local weights of models, for example `/opt/app-root/src/.cache/docling/models`.
|
||||
- `DOCLING_SERVE_ENABLE_UI`: If `True`, the Gradio UI will be available at `/ui`.
|
||||
|
||||
Others:
|
||||
|
||||
- `TESSDATA_PREFIX`: Tesseract data location, example `/usr/share/tesseract/tessdata/`.
|
||||
Visit the [Docling Serve documentation](./docs/README.md) to learn how to [configure the webserver](./docs/configuration.md), use all the [runtime options](./docs/usage.md) of the API, and explore the [deployment examples](./docs/deployment.md).
|
||||
|
||||
## Get help and support
|
||||
|
||||
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
|
||||
Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
|
||||
|
||||
## Contributing
|
||||
|
||||
Please read [Contributing to Docling Serve](https://github.com/DS4SD/docling-serve/blob/main/CONTRIBUTING.md) for details.
|
||||
Please read [Contributing to Docling Serve](https://github.com/docling-project/docling-serve/blob/main/CONTRIBUTING.md) for details.
|
||||
|
||||
## References
|
||||
|
||||
@@ -422,14 +86,14 @@ If you use Docling in your projects, please consider citing the following:
|
||||
|
||||
```bib
|
||||
@techreport{Docling,
|
||||
author = {Deep Search Team},
|
||||
month = {8},
|
||||
title = {Docling Technical Report},
|
||||
url = {https://arxiv.org/abs/2408.09869},
|
||||
eprint = {2408.09869},
|
||||
doi = {10.48550/arXiv.2408.09869},
|
||||
version = {1.0.0},
|
||||
year = {2024}
|
||||
author = {Docling Contributors},
|
||||
month = {1},
|
||||
title = {Docling: An Efficient Open-Source Toolkit for AI-driven Document Conversion},
|
||||
url = {https://arxiv.org/abs/2501.17887},
|
||||
eprint = {2501.17887},
|
||||
doi = {10.48550/arXiv.2501.17887},
|
||||
version = {2.0.0},
|
||||
year = {2025}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import importlib
|
||||
import importlib.metadata
|
||||
import logging
|
||||
import platform
|
||||
import sys
|
||||
@@ -51,9 +51,7 @@ def version_callback(value: bool) -> None:
|
||||
def callback(
|
||||
version: Annotated[
|
||||
Union[bool, None],
|
||||
typer.Option(
|
||||
"--version", help="Show the version and exit.", callback=version_callback
|
||||
),
|
||||
typer.Option(help="Show the version and exit.", callback=version_callback),
|
||||
] = None,
|
||||
verbose: Annotated[
|
||||
int,
|
||||
@@ -76,12 +74,44 @@ def callback(
|
||||
def _run(
|
||||
*,
|
||||
command: str,
|
||||
# Docling serve parameters
|
||||
artifacts_path: Path | None,
|
||||
enable_ui: bool,
|
||||
) -> None:
|
||||
server_type = "development" if command == "dev" else "production"
|
||||
|
||||
console.print(f"Starting {server_type} server 🚀")
|
||||
|
||||
url = f"http://{uvicorn_settings.host}:{uvicorn_settings.port}"
|
||||
run_subprocess = (
|
||||
uvicorn_settings.workers is not None and uvicorn_settings.workers > 1
|
||||
) or uvicorn_settings.reload
|
||||
|
||||
run_ssl = (
|
||||
uvicorn_settings.ssl_certfile is not None
|
||||
and uvicorn_settings.ssl_keyfile is not None
|
||||
)
|
||||
|
||||
if run_subprocess and docling_serve_settings.artifacts_path != artifacts_path:
|
||||
err_console.print(
|
||||
"\n[yellow]:warning: The server will run with reload or multiple workers. \n"
|
||||
"The argument [bold]--artifacts-path[/bold] will be ignored, please set the value \n"
|
||||
"using the environment variable [bold]DOCLING_SERVE_ARTIFACTS_PATH[/bold].[/yellow]"
|
||||
)
|
||||
|
||||
if run_subprocess and docling_serve_settings.enable_ui != enable_ui:
|
||||
err_console.print(
|
||||
"\n[yellow]:warning: The server will run with reload or multiple workers. \n"
|
||||
"The argument [bold]--enable-ui[/bold] will be ignored, please set the value \n"
|
||||
"using the environment variable [bold]DOCLING_SERVE_ENABLE_UI[/bold].[/yellow]"
|
||||
)
|
||||
|
||||
# Propagate the settings to the app settings
|
||||
docling_serve_settings.artifacts_path = artifacts_path
|
||||
docling_serve_settings.enable_ui = enable_ui
|
||||
|
||||
# Print documentation
|
||||
protocol = "https" if run_ssl else "http"
|
||||
url = f"{protocol}://{uvicorn_settings.host}:{uvicorn_settings.port}"
|
||||
url_docs = f"{url}/docs"
|
||||
url_ui = f"{url}/ui"
|
||||
|
||||
@@ -101,6 +131,7 @@ def _run(
|
||||
console.print("")
|
||||
console.print("Logs:")
|
||||
|
||||
# Launch the server
|
||||
uvicorn.run(
|
||||
app="docling_serve.app:create_app",
|
||||
factory=True,
|
||||
@@ -110,6 +141,10 @@ def _run(
|
||||
workers=uvicorn_settings.workers,
|
||||
root_path=uvicorn_settings.root_path,
|
||||
proxy_headers=uvicorn_settings.proxy_headers,
|
||||
timeout_keep_alive=uvicorn_settings.timeout_keep_alive,
|
||||
ssl_certfile=uvicorn_settings.ssl_certfile,
|
||||
ssl_keyfile=uvicorn_settings.ssl_keyfile,
|
||||
ssl_keyfile_password=uvicorn_settings.ssl_keyfile_password,
|
||||
)
|
||||
|
||||
|
||||
@@ -161,6 +196,18 @@ def dev(
|
||||
)
|
||||
),
|
||||
] = uvicorn_settings.proxy_headers,
|
||||
timeout_keep_alive: Annotated[
|
||||
int, typer.Option(help="Timeout for the server response.")
|
||||
] = uvicorn_settings.timeout_keep_alive,
|
||||
ssl_certfile: Annotated[
|
||||
Optional[Path], typer.Option(help="SSL certificate file")
|
||||
] = uvicorn_settings.ssl_certfile,
|
||||
ssl_keyfile: Annotated[
|
||||
Optional[Path], typer.Option(help="SSL key file")
|
||||
] = uvicorn_settings.ssl_keyfile,
|
||||
ssl_keyfile_password: Annotated[
|
||||
Optional[str], typer.Option(help="SSL keyfile password")
|
||||
] = uvicorn_settings.ssl_keyfile_password,
|
||||
# docling options
|
||||
artifacts_path: Annotated[
|
||||
Optional[Path],
|
||||
@@ -188,12 +235,15 @@ def dev(
|
||||
uvicorn_settings.reload = reload
|
||||
uvicorn_settings.root_path = root_path
|
||||
uvicorn_settings.proxy_headers = proxy_headers
|
||||
|
||||
docling_serve_settings.artifacts_path = artifacts_path
|
||||
docling_serve_settings.enable_ui = enable_ui
|
||||
uvicorn_settings.timeout_keep_alive = timeout_keep_alive
|
||||
uvicorn_settings.ssl_certfile = ssl_certfile
|
||||
uvicorn_settings.ssl_keyfile = ssl_keyfile
|
||||
uvicorn_settings.ssl_keyfile_password = ssl_keyfile_password
|
||||
|
||||
_run(
|
||||
command="dev",
|
||||
artifacts_path=artifacts_path,
|
||||
enable_ui=enable_ui,
|
||||
)
|
||||
|
||||
|
||||
@@ -253,6 +303,18 @@ def run(
|
||||
)
|
||||
),
|
||||
] = uvicorn_settings.proxy_headers,
|
||||
timeout_keep_alive: Annotated[
|
||||
int, typer.Option(help="Timeout for the server response.")
|
||||
] = uvicorn_settings.timeout_keep_alive,
|
||||
ssl_certfile: Annotated[
|
||||
Optional[Path], typer.Option(help="SSL certificate file")
|
||||
] = uvicorn_settings.ssl_certfile,
|
||||
ssl_keyfile: Annotated[
|
||||
Optional[Path], typer.Option(help="SSL key file")
|
||||
] = uvicorn_settings.ssl_keyfile,
|
||||
ssl_keyfile_password: Annotated[
|
||||
Optional[str], typer.Option(help="SSL keyfile password")
|
||||
] = uvicorn_settings.ssl_keyfile_password,
|
||||
# docling options
|
||||
artifacts_path: Annotated[
|
||||
Optional[Path],
|
||||
@@ -283,12 +345,15 @@ def run(
|
||||
uvicorn_settings.workers = workers
|
||||
uvicorn_settings.root_path = root_path
|
||||
uvicorn_settings.proxy_headers = proxy_headers
|
||||
|
||||
docling_serve_settings.artifacts_path = artifacts_path
|
||||
docling_serve_settings.enable_ui = enable_ui
|
||||
uvicorn_settings.timeout_keep_alive = timeout_keep_alive
|
||||
uvicorn_settings.ssl_certfile = ssl_certfile
|
||||
uvicorn_settings.ssl_keyfile = ssl_keyfile
|
||||
uvicorn_settings.ssl_keyfile_password = ssl_keyfile_password
|
||||
|
||||
_run(
|
||||
command="run",
|
||||
artifacts_path=artifacts_path,
|
||||
enable_ui=enable_ui,
|
||||
)
|
||||
|
||||
|
||||
@@ -298,5 +363,4 @@ def main() -> None:
|
||||
|
||||
# Launch the CLI when calling python -m docling_serve
|
||||
if __name__ == "__main__":
|
||||
|
||||
main()
|
||||
|
||||
@@ -1,27 +1,57 @@
|
||||
import asyncio
|
||||
import importlib.metadata
|
||||
import logging
|
||||
import tempfile
|
||||
from contextlib import asynccontextmanager
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Any, Dict, List, Optional, Union
|
||||
from typing import Annotated, Any, Optional, Union
|
||||
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.document_converter import DocumentConverter
|
||||
from fastapi import BackgroundTasks, FastAPI, UploadFile
|
||||
from fastapi import (
|
||||
BackgroundTasks,
|
||||
Depends,
|
||||
FastAPI,
|
||||
HTTPException,
|
||||
Query,
|
||||
UploadFile,
|
||||
WebSocket,
|
||||
WebSocketDisconnect,
|
||||
)
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.openapi.docs import (
|
||||
get_redoc_html,
|
||||
get_swagger_ui_html,
|
||||
get_swagger_ui_oauth2_redirect_html,
|
||||
)
|
||||
from fastapi.responses import RedirectResponse
|
||||
from pydantic import BaseModel
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
from docling_serve.docling_conversion import (
|
||||
from docling.datamodel.base_models import DocumentStream
|
||||
|
||||
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
||||
from docling_serve.datamodel.requests import (
|
||||
ConvertDocumentFileSourcesRequest,
|
||||
ConvertDocumentsOptions,
|
||||
ConvertDocumentsRequest,
|
||||
)
|
||||
from docling_serve.datamodel.responses import (
|
||||
ConvertDocumentResponse,
|
||||
HealthCheckResponse,
|
||||
MessageKind,
|
||||
TaskStatusResponse,
|
||||
WebsocketMessage,
|
||||
)
|
||||
from docling_serve.docling_conversion import (
|
||||
convert_documents,
|
||||
converters,
|
||||
get_converter,
|
||||
get_pdf_pipeline_opts,
|
||||
)
|
||||
from docling_serve.engines import get_orchestrator
|
||||
from docling_serve.engines.async_local.orchestrator import (
|
||||
AsyncLocalOrchestrator,
|
||||
TaskNotFoundError,
|
||||
)
|
||||
from docling_serve.helper_functions import FormDepends
|
||||
from docling_serve.response_preparation import ConvertDocumentResponse, process_results
|
||||
from docling_serve.response_preparation import process_results
|
||||
from docling_serve.settings import docling_serve_settings
|
||||
|
||||
|
||||
@@ -60,23 +90,23 @@ _log = logging.getLogger(__name__)
|
||||
# Context manager to initialize and clean up the lifespan of the FastAPI app
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
|
||||
# Converter with default options
|
||||
pdf_format_option, options_hash = get_pdf_pipeline_opts(ConvertDocumentsOptions())
|
||||
converters[options_hash] = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: pdf_format_option,
|
||||
InputFormat.IMAGE: pdf_format_option,
|
||||
}
|
||||
)
|
||||
pdf_format_option = get_pdf_pipeline_opts(ConvertDocumentsOptions())
|
||||
get_converter(pdf_format_option)
|
||||
|
||||
converters[options_hash].initialize_pipeline(InputFormat.PDF)
|
||||
orchestrator = get_orchestrator()
|
||||
|
||||
# Start the background queue processor
|
||||
queue_task = asyncio.create_task(orchestrator.process_queue())
|
||||
|
||||
yield
|
||||
|
||||
converters.clear()
|
||||
# if WITH_UI:
|
||||
# gradio_ui.close()
|
||||
# Cancel the background queue processor on shutdown
|
||||
queue_task.cancel()
|
||||
try:
|
||||
await queue_task
|
||||
except asyncio.CancelledError:
|
||||
_log.info("Queue processor cancelled.")
|
||||
|
||||
|
||||
##################################
|
||||
@@ -84,15 +114,33 @@ async def lifespan(app: FastAPI):
|
||||
##################################
|
||||
|
||||
|
||||
def create_app():
|
||||
def create_app(): # noqa: C901
|
||||
try:
|
||||
version = importlib.metadata.version("docling_serve")
|
||||
except importlib.metadata.PackageNotFoundError:
|
||||
_log.warning("Unable to get docling_serve version, falling back to 0.0.0")
|
||||
|
||||
version = "0.0.0"
|
||||
|
||||
offline_docs_assets = False
|
||||
if (
|
||||
docling_serve_settings.static_path is not None
|
||||
and (docling_serve_settings.static_path).is_dir()
|
||||
):
|
||||
offline_docs_assets = True
|
||||
_log.info("Found static assets.")
|
||||
|
||||
app = FastAPI(
|
||||
title="Docling Serve",
|
||||
docs_url=None if offline_docs_assets else "/docs",
|
||||
redoc_url=None if offline_docs_assets else "/redocs",
|
||||
lifespan=lifespan,
|
||||
version=version,
|
||||
)
|
||||
|
||||
origins = ["*"]
|
||||
methods = ["*"]
|
||||
headers = ["*"]
|
||||
origins = docling_serve_settings.cors_origins
|
||||
methods = docling_serve_settings.cors_methods
|
||||
headers = docling_serve_settings.cors_headers
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
@@ -104,7 +152,6 @@ def create_app():
|
||||
|
||||
# Mount the Gradio app
|
||||
if docling_serve_settings.enable_ui:
|
||||
|
||||
try:
|
||||
import gradio as gr
|
||||
|
||||
@@ -126,6 +173,38 @@ def create_app():
|
||||
"or `pip install gradio`"
|
||||
)
|
||||
|
||||
#############################
|
||||
# Offline assets definition #
|
||||
#############################
|
||||
if offline_docs_assets:
|
||||
app.mount(
|
||||
"/static",
|
||||
StaticFiles(directory=docling_serve_settings.static_path),
|
||||
name="static",
|
||||
)
|
||||
|
||||
@app.get("/docs", include_in_schema=False)
|
||||
async def custom_swagger_ui_html():
|
||||
return get_swagger_ui_html(
|
||||
openapi_url=app.openapi_url,
|
||||
title=app.title + " - Swagger UI",
|
||||
oauth2_redirect_url=app.swagger_ui_oauth2_redirect_url,
|
||||
swagger_js_url="/static/swagger-ui-bundle.js",
|
||||
swagger_css_url="/static/swagger-ui.css",
|
||||
)
|
||||
|
||||
@app.get(app.swagger_ui_oauth2_redirect_url, include_in_schema=False)
|
||||
async def swagger_ui_redirect():
|
||||
return get_swagger_ui_oauth2_redirect_html()
|
||||
|
||||
@app.get("/redoc", include_in_schema=False)
|
||||
async def redoc_html():
|
||||
return get_redoc_html(
|
||||
openapi_url=app.openapi_url,
|
||||
title=app.title + " - ReDoc",
|
||||
redoc_js_url="/static/redoc.standalone.js",
|
||||
)
|
||||
|
||||
#############################
|
||||
# API Endpoints definitions #
|
||||
#############################
|
||||
@@ -133,15 +212,12 @@ def create_app():
|
||||
# Favicon
|
||||
@app.get("/favicon.ico", include_in_schema=False)
|
||||
async def favicon():
|
||||
response = RedirectResponse(
|
||||
url="https://ds4sd.github.io/docling/assets/logo.png"
|
||||
)
|
||||
logo_url = "https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"
|
||||
if offline_docs_assets:
|
||||
logo_url = "/static/logo.svg"
|
||||
response = RedirectResponse(url=logo_url)
|
||||
return response
|
||||
|
||||
# Status
|
||||
class HealthCheckResponse(BaseModel):
|
||||
status: str = "ok"
|
||||
|
||||
@app.get("/health")
|
||||
def health() -> HealthCheckResponse:
|
||||
return HealthCheckResponse()
|
||||
@@ -165,8 +241,8 @@ def create_app():
|
||||
def process_url(
|
||||
background_tasks: BackgroundTasks, conversion_request: ConvertDocumentsRequest
|
||||
):
|
||||
sources: List[Union[str, DocumentStream]] = []
|
||||
headers: Optional[Dict[str, Any]] = None
|
||||
sources: list[Union[str, DocumentStream]] = []
|
||||
headers: Optional[dict[str, Any]] = None
|
||||
if isinstance(conversion_request, ConvertDocumentFileSourcesRequest):
|
||||
for file_source in conversion_request.file_sources:
|
||||
sources.append(file_source.to_document_stream())
|
||||
@@ -202,12 +278,11 @@ def create_app():
|
||||
)
|
||||
async def process_file(
|
||||
background_tasks: BackgroundTasks,
|
||||
files: List[UploadFile],
|
||||
files: list[UploadFile],
|
||||
options: Annotated[
|
||||
ConvertDocumentsOptions, FormDepends(ConvertDocumentsOptions)
|
||||
],
|
||||
):
|
||||
|
||||
_log.info(f"Received {len(files)} files for processing.")
|
||||
|
||||
# Load the uploaded files to Docling DocumentStream
|
||||
@@ -227,4 +302,129 @@ def create_app():
|
||||
|
||||
return response
|
||||
|
||||
# Convert a document from URL(s) using the async api
|
||||
@app.post(
|
||||
"/v1alpha/convert/source/async",
|
||||
response_model=TaskStatusResponse,
|
||||
)
|
||||
async def process_url_async(
|
||||
orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)],
|
||||
conversion_request: ConvertDocumentsRequest,
|
||||
):
|
||||
task = await orchestrator.enqueue(request=conversion_request)
|
||||
task_queue_position = await orchestrator.get_queue_position(
|
||||
task_id=task.task_id
|
||||
)
|
||||
return TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
)
|
||||
|
||||
# Task status poll
|
||||
@app.get(
|
||||
"/v1alpha/status/poll/{task_id}",
|
||||
response_model=TaskStatusResponse,
|
||||
)
|
||||
async def task_status_poll(
|
||||
orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)],
|
||||
task_id: str,
|
||||
wait: Annotated[
|
||||
float, Query(help="Number of seconds to wait for a completed status.")
|
||||
] = 0.0,
|
||||
):
|
||||
try:
|
||||
task = await orchestrator.task_status(task_id=task_id, wait=wait)
|
||||
task_queue_position = await orchestrator.get_queue_position(task_id=task_id)
|
||||
except TaskNotFoundError:
|
||||
raise HTTPException(status_code=404, detail="Task not found.")
|
||||
return TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
)
|
||||
|
||||
# Task status websocket
|
||||
@app.websocket(
|
||||
"/v1alpha/status/ws/{task_id}",
|
||||
)
|
||||
async def task_status_ws(
|
||||
websocket: WebSocket,
|
||||
orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)],
|
||||
task_id: str,
|
||||
):
|
||||
await websocket.accept()
|
||||
|
||||
if task_id not in orchestrator.tasks:
|
||||
await websocket.send_text(
|
||||
WebsocketMessage(
|
||||
message=MessageKind.ERROR, error="Task not found."
|
||||
).model_dump_json()
|
||||
)
|
||||
await websocket.close()
|
||||
return
|
||||
|
||||
task = orchestrator.tasks[task_id]
|
||||
|
||||
# Track active WebSocket connections for this job
|
||||
orchestrator.task_subscribers[task_id].add(websocket)
|
||||
|
||||
try:
|
||||
task_queue_position = await orchestrator.get_queue_position(task_id=task_id)
|
||||
task_response = TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
)
|
||||
await websocket.send_text(
|
||||
WebsocketMessage(
|
||||
message=MessageKind.CONNECTION, task=task_response
|
||||
).model_dump_json()
|
||||
)
|
||||
while True:
|
||||
task_queue_position = await orchestrator.get_queue_position(
|
||||
task_id=task_id
|
||||
)
|
||||
task_response = TaskStatusResponse(
|
||||
task_id=task.task_id,
|
||||
task_status=task.task_status,
|
||||
task_position=task_queue_position,
|
||||
)
|
||||
await websocket.send_text(
|
||||
WebsocketMessage(
|
||||
message=MessageKind.UPDATE, task=task_response
|
||||
).model_dump_json()
|
||||
)
|
||||
# each client message will be interpreted as a request for update
|
||||
msg = await websocket.receive_text()
|
||||
_log.debug(f"Received message: {msg}")
|
||||
|
||||
except WebSocketDisconnect:
|
||||
_log.info(f"WebSocket disconnected for job {task_id}")
|
||||
|
||||
finally:
|
||||
orchestrator.task_subscribers[task_id].remove(websocket)
|
||||
|
||||
# Task result
|
||||
@app.get(
|
||||
"/v1alpha/result/{task_id}",
|
||||
response_model=ConvertDocumentResponse,
|
||||
responses={
|
||||
200: {
|
||||
"content": {"application/zip": {}},
|
||||
}
|
||||
},
|
||||
)
|
||||
async def task_result(
|
||||
orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)],
|
||||
task_id: str,
|
||||
):
|
||||
result = await orchestrator.task_result(task_id=task_id)
|
||||
if result is None:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail="Task result not found. Please wait for a completion status.",
|
||||
)
|
||||
return result
|
||||
|
||||
return app
|
||||
|
||||
0
docling_serve/datamodel/__init__.py
Normal file
0
docling_serve/datamodel/__init__.py
Normal file
229
docling_serve/datamodel/convert.py
Normal file
229
docling_serve/datamodel/convert.py
Normal file
@@ -0,0 +1,229 @@
|
||||
# Define the input options for the API
|
||||
from typing import Annotated, Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from docling.datamodel.base_models import InputFormat, OutputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
PdfBackend,
|
||||
TableFormerMode,
|
||||
)
|
||||
from docling.models.factories import get_ocr_factory
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
|
||||
from docling_serve.settings import docling_serve_settings
|
||||
|
||||
# Build the OCR factory once at import time; whether external plugins are
# allowed comes from the service settings.
ocr_factory = get_ocr_factory(
    allow_external_plugins=docling_serve_settings.allow_external_plugins,
)

# Enum of the OCR engine kinds exposed by the factory; used below as the type
# of the `ocr_engine` option.
ocr_engines_enum = ocr_factory.get_enum()
|
||||
|
||||
|
||||
class ConvertDocumentsOptions(BaseModel):
    """Options a client can set on a conversion request.

    Every field carries a default, so an empty options object is valid.
    Field descriptions and examples are surfaced through the generated
    API schema.
    """

    # Input formats accepted for conversion (defaults to every InputFormat).
    from_formats: Annotated[
        list[InputFormat],
        Field(
            description=(
                "Input format(s) to convert from. String or list of strings. "
                f"Allowed values: {', '.join([v.value for v in InputFormat])}. "
                "Optional, defaults to all formats."
            ),
            examples=[[v.value for v in InputFormat]],
        ),
    ] = list(InputFormat)

    # Output formats to produce.
    to_formats: Annotated[
        list[OutputFormat],
        Field(
            description=(
                "Output format(s) to convert to. String or list of strings. "
                f"Allowed values: {', '.join([v.value for v in OutputFormat])}. "
                "Optional, defaults to Markdown."
            ),
            examples=[[OutputFormat.MARKDOWN]],
        ),
    ] = [OutputFormat.MARKDOWN]

    # How images are represented in the exported document.
    image_export_mode: Annotated[
        ImageRefMode,
        Field(
            description=(
                "Image export mode for the document (in case of JSON,"
                " Markdown or HTML). "
                f"Allowed values: {', '.join([v.value for v in ImageRefMode])}. "
                "Optional, defaults to Embedded."
            ),
            examples=[ImageRefMode.EMBEDDED.value],
            # pattern="embedded|placeholder|referenced",
        ),
    ] = ImageRefMode.EMBEDDED

    # Toggle OCR processing of bitmap content.
    do_ocr: Annotated[
        bool,
        Field(
            description=(
                "If enabled, the bitmap content will be processed using OCR. "
                "Boolean. Optional, defaults to true"
            ),
            # examples=[True],
        ),
    ] = True

    # Replace existing text with OCR output.
    force_ocr: Annotated[
        bool,
        Field(
            description=(
                "If enabled, replace existing text with OCR-generated "
                "text over content. Boolean. Optional, defaults to false."
            ),
            # examples=[False],
        ),
    ] = False

    # The enum is built at import time from the OCR factory, hence the
    # `type: ignore` markers for static checkers.
    ocr_engine: Annotated[  # type: ignore
        ocr_engines_enum,
        Field(
            description=(
                "The OCR engine to use. String. "
                f"Allowed values: {', '.join([v.value for v in ocr_engines_enum])}. "
                "Optional, defaults to easyocr."
            ),
            examples=[EasyOcrOptions.kind],
        ),
    ] = ocr_engines_enum(EasyOcrOptions.kind)  # type: ignore

    # Languages handed to the OCR engine; None means no explicit list.
    ocr_lang: Annotated[
        Optional[list[str]],
        Field(
            description=(
                "List of languages used by the OCR engine. "
                "Note that each OCR engine has "
                "different values for the language names. String or list of strings. "
                "Optional, defaults to empty."
            ),
            examples=[["fr", "de", "es", "en"]],
        ),
    ] = None

    # PDF parsing backend.
    pdf_backend: Annotated[
        PdfBackend,
        Field(
            description=(
                "The PDF backend to use. String. "
                f"Allowed values: {', '.join([v.value for v in PdfBackend])}. "
                f"Optional, defaults to {PdfBackend.DLPARSE_V4.value}."
            ),
            examples=[PdfBackend.DLPARSE_V4],
        ),
    ] = PdfBackend.DLPARSE_V4

    # Table structure mode. The default is given once, by assignment, like
    # every other field (no duplicate default inside Field()).
    table_mode: Annotated[
        TableFormerMode,
        Field(
            description=(
                "Mode to use for table structure, String. "
                f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. "
                "Optional, defaults to fast."
            ),
            examples=[TableFormerMode.FAST],
            # pattern="fast|accurate",
        ),
    ] = TableFormerMode.FAST

    abort_on_error: Annotated[
        bool,
        Field(
            description=(
                "Abort on error if enabled. Boolean. Optional, defaults to false."
            ),
            # examples=[False],
        ),
    ] = False

    return_as_file: Annotated[
        bool,
        Field(
            description=(
                "Return the output as a zip file "
                "(will happen anyway if multiple files are generated). "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    do_table_structure: Annotated[
        bool,
        Field(
            description=(
                "If enabled, the table structure will be extracted. "
                "Boolean. Optional, defaults to true."
            ),
            examples=[True],
        ),
    ] = True

    include_images: Annotated[
        bool,
        Field(
            description=(
                "If enabled, images will be extracted from the document. "
                "Boolean. Optional, defaults to true."
            ),
            examples=[True],
        ),
    ] = True

    images_scale: Annotated[
        float,
        Field(
            description="Scale factor for images. Float. Optional, defaults to 2.0.",
            examples=[2.0],
        ),
    ] = 2.0

    do_code_enrichment: Annotated[
        bool,
        Field(
            description=(
                "If enabled, perform OCR code enrichment. "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    do_formula_enrichment: Annotated[
        bool,
        Field(
            description=(
                "If enabled, perform formula OCR, return Latex code. "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    do_picture_classification: Annotated[
        bool,
        Field(
            description=(
                "If enabled, classify pictures in documents. "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    do_picture_description: Annotated[
        bool,
        Field(
            description=(
                "If enabled, describe pictures in documents. "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False
|
||||
12
docling_serve/datamodel/engines.py
Normal file
12
docling_serve/datamodel/engines.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import enum
|
||||
|
||||
|
||||
class TaskStatus(str, enum.Enum):
    """Status values a task can take; members are also plain strings."""

    SUCCESS = "success"
    PENDING = "pending"
    STARTED = "started"
    FAILURE = "failure"
|
||||
|
||||
|
||||
class AsyncEngine(str, enum.Enum):
    """Identifiers of the available async engines; only `local` is defined."""

    LOCAL = "local"
|
||||
62
docling_serve/datamodel/requests.py
Normal file
62
docling_serve/datamodel/requests.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import base64
|
||||
from io import BytesIO
|
||||
from typing import Annotated, Any, Union
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from docling.datamodel.base_models import DocumentStream
|
||||
|
||||
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
||||
|
||||
|
||||
class DocumentsConvertBase(BaseModel):
    """Common base of the conversion requests: carries the conversion options."""

    # Falls back to an all-defaults options object when the client sends none.
    options: ConvertDocumentsOptions = ConvertDocumentsOptions()
|
||||
|
||||
|
||||
class HttpSource(BaseModel):
    """A document to fetch over HTTP, with optional extra request headers."""

    # Required: location of the document.
    url: Annotated[
        str,
        Field(
            description="HTTP url to process",
            examples=["https://arxiv.org/pdf/2206.01062"],
        ),
    ]

    # Optional: headers sent along when fetching `url`.
    headers: Annotated[
        dict[str, Any],
        Field(
            description=(
                "Additional headers used to fetch the urls, "
                "e.g. authorization, agent, etc"
            ),
        ),
    ] = {}
|
||||
|
||||
|
||||
class FileSource(BaseModel):
    """A document supplied inline as a base64 string together with its filename."""

    # Required: base64-encoded file content.
    base64_string: Annotated[
        str,
        Field(
            description=(
                "Content of the file serialized in base64. "
                "For example it can be obtained via "
                "`base64 -w 0 /path/to/file/pdf-to-convert.pdf`."
            ),
        ),
    ]

    # Required: name given to the resulting document stream.
    filename: Annotated[
        str,
        Field(description="Filename of the uploaded document", examples=["file.pdf"]),
    ]

    def to_document_stream(self) -> DocumentStream:
        """Decode the base64 payload and wrap it in a DocumentStream named `filename`."""
        decoded = base64.b64decode(self.base64_string)
        return DocumentStream(stream=BytesIO(decoded), name=self.filename)
|
||||
|
||||
|
||||
class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase):
    """Conversion request whose documents are given as HTTP sources."""

    # Required list of HttpSource entries.
    http_sources: list[HttpSource]
|
||||
|
||||
|
||||
class ConvertDocumentFileSourcesRequest(DocumentsConvertBase):
    """Conversion request whose documents are given inline as base64 file sources."""

    # Required list of FileSource entries.
    file_sources: list[FileSource]
|
||||
|
||||
|
||||
# A conversion request is either the file-sources or the HTTP-sources variant.
ConvertDocumentsRequest = Union[
    ConvertDocumentFileSourcesRequest,
    ConvertDocumentHttpSourcesRequest,
]
|
||||
52
docling_serve/datamodel/responses.py
Normal file
52
docling_serve/datamodel/responses.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import enum
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.document import ConversionStatus, ErrorItem
|
||||
from docling.utils.profiling import ProfilingItem
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
|
||||
|
||||
# Status
|
||||
class HealthCheckResponse(BaseModel):
    """Health-check payload; `status` defaults to "ok"."""

    status: str = "ok"
|
||||
|
||||
|
||||
class DocumentResponse(BaseModel):
    """Converted content of one document.

    Only `filename` is required; each `*_content` field is optional and
    defaults to None.
    """

    filename: str
    md_content: Optional[str] = None
    json_content: Optional[DoclingDocument] = None
    html_content: Optional[str] = None
    text_content: Optional[str] = None
    doctags_content: Optional[str] = None
|
||||
|
||||
|
||||
class ConvertDocumentResponse(BaseModel):
    """Result of a conversion: the document plus status, errors and timing data."""

    document: DocumentResponse
    status: ConversionStatus
    # Empty unless the conversion reported errors.
    errors: list[ErrorItem] = []
    processing_time: float
    # Profiling items keyed by name; empty by default.
    timings: dict[str, ProfilingItem] = {}
|
||||
|
||||
|
||||
class ConvertDocumentErrorResponse(BaseModel):
    """Error payload that carries only the conversion status."""

    status: ConversionStatus
|
||||
|
||||
|
||||
class TaskStatusResponse(BaseModel):
    """Status report for an async task: id, status, and optional position."""

    task_id: str
    task_status: str
    # None when no position is supplied.
    task_position: Optional[int] = None
|
||||
|
||||
|
||||
class MessageKind(str, enum.Enum):
    """Kinds of websocket message; members are also plain strings."""

    CONNECTION = "connection"
    UPDATE = "update"
    ERROR = "error"
|
||||
|
||||
|
||||
class WebsocketMessage(BaseModel):
    """Envelope for websocket messages: a kind plus an optional task or error."""

    message: MessageKind
    # Both payload fields are optional and default to None.
    task: Optional[TaskStatusResponse] = None
    error: Optional[str] = None
|
||||
19
docling_serve/datamodel/task.py
Normal file
19
docling_serve/datamodel/task.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling_serve.datamodel.engines import TaskStatus
|
||||
from docling_serve.datamodel.requests import ConvertDocumentsRequest
|
||||
from docling_serve.datamodel.responses import ConvertDocumentResponse
|
||||
|
||||
|
||||
class Task(BaseModel):
    """A conversion task: its id, status, originating request and result."""

    task_id: str
    # New tasks start as PENDING.
    task_status: TaskStatus = TaskStatus.PENDING
    # No default: must be passed explicitly, although None is accepted.
    request: Optional[ConvertDocumentsRequest]
    # None until a result is attached.
    result: Optional[ConvertDocumentResponse] = None

    def is_completed(self) -> bool:
        """Return True when the status is SUCCESS or FAILURE, False otherwise."""
        return self.task_status in (TaskStatus.SUCCESS, TaskStatus.FAILURE)
|
||||
@@ -1,276 +1,39 @@
|
||||
import base64
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from collections.abc import Iterable, Iterator
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import (
|
||||
Annotated,
|
||||
Any,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Type,
|
||||
Union,
|
||||
)
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from fastapi import HTTPException
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat, OutputFormat
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrEngine,
|
||||
OcrOptions,
|
||||
PdfBackend,
|
||||
PdfPipelineOptions,
|
||||
RapidOcrOptions,
|
||||
TableFormerMode,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
from fastapi import HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from docling_serve.datamodel.convert import ConvertDocumentsOptions, ocr_factory
|
||||
from docling_serve.helper_functions import _to_list_of_strings
|
||||
from docling_serve.settings import docling_serve_settings
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Define the input options for the API
|
||||
class ConvertDocumentsOptions(BaseModel):
|
||||
from_formats: Annotated[
|
||||
List[InputFormat],
|
||||
Field(
|
||||
description=(
|
||||
"Input format(s) to convert from. String or list of strings. "
|
||||
f"Allowed values: {', '.join([v.value for v in InputFormat])}. "
|
||||
"Optional, defaults to all formats."
|
||||
),
|
||||
examples=[[v.value for v in InputFormat]],
|
||||
),
|
||||
] = list(InputFormat)
|
||||
|
||||
to_formats: Annotated[
|
||||
List[OutputFormat],
|
||||
Field(
|
||||
description=(
|
||||
"Output format(s) to convert to. String or list of strings. "
|
||||
f"Allowed values: {', '.join([v.value for v in OutputFormat])}. "
|
||||
"Optional, defaults to Markdown."
|
||||
),
|
||||
examples=[[OutputFormat.MARKDOWN]],
|
||||
),
|
||||
] = [OutputFormat.MARKDOWN]
|
||||
|
||||
image_export_mode: Annotated[
|
||||
ImageRefMode,
|
||||
Field(
|
||||
description=(
|
||||
"Image export mode for the document (in case of JSON,"
|
||||
" Markdown or HTML). "
|
||||
f"Allowed values: {', '.join([v.value for v in ImageRefMode])}. "
|
||||
"Optional, defaults to Embedded."
|
||||
),
|
||||
examples=[ImageRefMode.EMBEDDED.value],
|
||||
# pattern="embedded|placeholder|referenced",
|
||||
),
|
||||
] = ImageRefMode.EMBEDDED
|
||||
|
||||
do_ocr: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"If enabled, the bitmap content will be processed using OCR. "
|
||||
"Boolean. Optional, defaults to true"
|
||||
),
|
||||
# examples=[True],
|
||||
),
|
||||
] = True
|
||||
|
||||
force_ocr: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"If enabled, replace existing text with OCR-generated "
|
||||
"text over content. Boolean. Optional, defaults to false."
|
||||
),
|
||||
# examples=[False],
|
||||
),
|
||||
] = False
|
||||
|
||||
# TODO: use a restricted list based on what is installed on the system
|
||||
ocr_engine: Annotated[
|
||||
OcrEngine,
|
||||
Field(
|
||||
description=(
|
||||
"The OCR engine to use. String. "
|
||||
"Allowed values: easyocr, tesseract, rapidocr. "
|
||||
"Optional, defaults to easyocr."
|
||||
),
|
||||
examples=[OcrEngine.EASYOCR],
|
||||
),
|
||||
] = OcrEngine.EASYOCR
|
||||
|
||||
ocr_lang: Annotated[
|
||||
Optional[List[str]],
|
||||
Field(
|
||||
description=(
|
||||
"List of languages used by the OCR engine. "
|
||||
"Note that each OCR engine has "
|
||||
"different values for the language names. String or list of strings. "
|
||||
"Optional, defaults to empty."
|
||||
),
|
||||
examples=[["fr", "de", "es", "en"]],
|
||||
),
|
||||
] = None
|
||||
|
||||
pdf_backend: Annotated[
|
||||
PdfBackend,
|
||||
Field(
|
||||
description=(
|
||||
"The PDF backend to use. String. "
|
||||
f"Allowed values: {', '.join([v.value for v in PdfBackend])}. "
|
||||
f"Optional, defaults to {PdfBackend.DLPARSE_V2.value}."
|
||||
),
|
||||
examples=[PdfBackend.DLPARSE_V2],
|
||||
),
|
||||
] = PdfBackend.DLPARSE_V2
|
||||
|
||||
table_mode: Annotated[
|
||||
TableFormerMode,
|
||||
Field(
|
||||
TableFormerMode.FAST,
|
||||
description=(
|
||||
"Mode to use for table structure, String. "
|
||||
f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. "
|
||||
"Optional, defaults to fast."
|
||||
),
|
||||
examples=[TableFormerMode.FAST],
|
||||
# pattern="fast|accurate",
|
||||
),
|
||||
] = TableFormerMode.FAST
|
||||
|
||||
abort_on_error: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"Abort on error if enabled. Boolean. Optional, defaults to false."
|
||||
),
|
||||
# examples=[False],
|
||||
),
|
||||
] = False
|
||||
|
||||
return_as_file: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"Return the output as a zip file "
|
||||
"(will happen anyway if multiple files are generated). "
|
||||
"Boolean. Optional, defaults to false."
|
||||
),
|
||||
examples=[False],
|
||||
),
|
||||
] = False
|
||||
|
||||
do_table_structure: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"If enabled, the table structure will be extracted. "
|
||||
"Boolean. Optional, defaults to true."
|
||||
),
|
||||
examples=[True],
|
||||
),
|
||||
] = True
|
||||
|
||||
include_images: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
description=(
|
||||
"If enabled, images will be extracted from the document. "
|
||||
"Boolean. Optional, defaults to true."
|
||||
),
|
||||
examples=[True],
|
||||
),
|
||||
] = True
|
||||
|
||||
images_scale: Annotated[
|
||||
float,
|
||||
Field(
|
||||
description="Scale factor for images. Float. Optional, defaults to 2.0.",
|
||||
examples=[2.0],
|
||||
),
|
||||
] = 2.0
|
||||
|
||||
|
||||
class DocumentsConvertBase(BaseModel):
|
||||
options: ConvertDocumentsOptions = ConvertDocumentsOptions()
|
||||
|
||||
|
||||
class HttpSource(BaseModel):
|
||||
url: Annotated[
|
||||
str,
|
||||
Field(
|
||||
description="HTTP url to process",
|
||||
examples=["https://arxiv.org/pdf/2206.01062"],
|
||||
),
|
||||
]
|
||||
headers: Annotated[
|
||||
Dict[str, Any],
|
||||
Field(
|
||||
description="Additional headers used to fetch the urls, "
|
||||
"e.g. authorization, agent, etc"
|
||||
),
|
||||
] = {}
|
||||
|
||||
|
||||
class FileSource(BaseModel):
|
||||
base64_string: Annotated[
|
||||
str,
|
||||
Field(
|
||||
description="Content of the file serialized in base64. "
|
||||
"For example it can be obtained via "
|
||||
"`base64 -w 0 /path/to/file/pdf-to-convert.pdf`."
|
||||
),
|
||||
]
|
||||
filename: Annotated[
|
||||
str,
|
||||
Field(description="Filename of the uploaded document", examples=["file.pdf"]),
|
||||
]
|
||||
|
||||
def to_document_stream(self) -> DocumentStream:
|
||||
buf = BytesIO(base64.b64decode(self.base64_string))
|
||||
return DocumentStream(stream=buf, name=self.filename)
|
||||
|
||||
|
||||
class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase):
|
||||
http_sources: List[HttpSource]
|
||||
|
||||
|
||||
class ConvertDocumentFileSourcesRequest(DocumentsConvertBase):
|
||||
file_sources: List[FileSource]
|
||||
|
||||
|
||||
ConvertDocumentsRequest = Union[
|
||||
ConvertDocumentFileSourcesRequest, ConvertDocumentHttpSourcesRequest
|
||||
]
|
||||
|
||||
|
||||
# Document converters will be preloaded and stored in a dictionary
|
||||
converters: Dict[bytes, DocumentConverter] = {}
|
||||
|
||||
|
||||
# Custom serializer for PdfFormatOption
|
||||
# (model_dump_json does not work with some classes)
|
||||
def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
|
||||
def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
|
||||
data = pdf_format_option.model_dump()
|
||||
|
||||
# pipeline_options are not fully serialized by model_dump, dedicated pass
|
||||
@@ -295,51 +58,50 @@ def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
|
||||
)
|
||||
|
||||
# Serialize the dictionary to JSON with sorted keys to have consistent hashes
|
||||
return json.dumps(data, sort_keys=True)
|
||||
serialized_data = json.dumps(data, sort_keys=True)
|
||||
options_hash = hashlib.sha1(serialized_data.encode()).digest()
|
||||
return options_hash
|
||||
|
||||
|
||||
# Cache of DocumentConverter objects
|
||||
_options_map: dict[bytes, PdfFormatOption] = {}
|
||||
|
||||
|
||||
@lru_cache(maxsize=docling_serve_settings.options_cache_size)
def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
    """Create the DocumentConverter for the options registered under `options_hash`.

    Results are memoized by `lru_cache`, keyed on the hash. The options
    themselves are read from `_options_map`, so the hash must already be
    registered there (a missing key raises KeyError).
    """
    registered_option = _options_map[options_hash]

    # The same option object serves both PDF and image inputs.
    per_format: dict[InputFormat, FormatOption] = dict.fromkeys(
        (InputFormat.PDF, InputFormat.IMAGE), registered_option
    )
    return DocumentConverter(format_options=per_format)
|
||||
|
||||
|
||||
def get_converter(pdf_format_option: PdfFormatOption) -> DocumentConverter:
    """Return the DocumentConverter matching `pdf_format_option`.

    The option is hashed and stored in `_options_map` under that hash,
    then the hash-keyed cached builder is called.
    """
    cache_key = _hash_pdf_format_option(pdf_format_option)
    # Register before the lookup: the cached builder reads the option by hash.
    _options_map[cache_key] = pdf_format_option
    converter = _get_converter_from_hash(cache_key)
    return converter
|
||||
|
||||
|
||||
# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
|
||||
def get_pdf_pipeline_opts( # noqa: C901
|
||||
def get_pdf_pipeline_opts(
|
||||
request: ConvertDocumentsOptions,
|
||||
) -> Tuple[PdfFormatOption, bytes]:
|
||||
if request.ocr_engine == OcrEngine.EASYOCR:
|
||||
try:
|
||||
import easyocr # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={request.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=request.force_ocr)
|
||||
elif request.ocr_engine == OcrEngine.TESSERACT:
|
||||
try:
|
||||
import tesserocr # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={request.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=request.force_ocr)
|
||||
elif request.ocr_engine == OcrEngine.RAPIDOCR:
|
||||
try:
|
||||
from rapidocr_onnxruntime import RapidOCR # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={request.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options = RapidOcrOptions(force_full_page_ocr=request.force_ocr)
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected OCR engine type {request.ocr_engine}")
|
||||
) -> PdfFormatOption:
|
||||
try:
|
||||
ocr_options: OcrOptions = ocr_factory.create_options(
|
||||
kind=request.ocr_engine.value, # type: ignore
|
||||
force_full_page_ocr=request.force_ocr,
|
||||
)
|
||||
except ImportError as err:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={request.ocr_engine.value})" # type: ignore
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.\n"
|
||||
f"{err}",
|
||||
)
|
||||
|
||||
if request.ocr_lang is not None:
|
||||
if isinstance(request.ocr_lang, str):
|
||||
@@ -351,6 +113,10 @@ def get_pdf_pipeline_opts( # noqa: C901
|
||||
do_ocr=request.do_ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=request.do_table_structure,
|
||||
do_code_enrichment=request.do_code_enrichment,
|
||||
do_formula_enrichment=request.do_formula_enrichment,
|
||||
do_picture_classification=request.do_picture_classification,
|
||||
do_picture_description=request.do_picture_description,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
||||
pipeline_options.table_structure_options.mode = TableFormerMode(request.table_mode)
|
||||
@@ -361,9 +127,11 @@ def get_pdf_pipeline_opts( # noqa: C901
|
||||
pipeline_options.images_scale = request.images_scale
|
||||
|
||||
if request.pdf_backend == PdfBackend.DLPARSE_V1:
|
||||
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
||||
backend: type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
||||
elif request.pdf_backend == PdfBackend.DLPARSE_V2:
|
||||
backend = DoclingParseV2DocumentBackend
|
||||
elif request.pdf_backend == PdfBackend.DLPARSE_V4:
|
||||
backend = DoclingParseV4DocumentBackend
|
||||
elif request.pdf_backend == PdfBackend.PYPDFIUM2:
|
||||
backend = PyPdfiumDocumentBackend
|
||||
else:
|
||||
@@ -399,30 +167,17 @@ def get_pdf_pipeline_opts( # noqa: C901
|
||||
backend=backend,
|
||||
)
|
||||
|
||||
serialized_data = _serialize_pdf_format_option(pdf_format_option)
|
||||
|
||||
options_hash = hashlib.sha1(serialized_data.encode()).digest()
|
||||
|
||||
return pdf_format_option, options_hash
|
||||
return pdf_format_option
|
||||
|
||||
|
||||
def convert_documents(
|
||||
sources: Iterable[Union[Path, str, DocumentStream]],
|
||||
options: ConvertDocumentsOptions,
|
||||
headers: Optional[Dict[str, Any]] = None,
|
||||
headers: Optional[dict[str, Any]] = None,
|
||||
):
|
||||
pdf_format_option, options_hash = get_pdf_pipeline_opts(options)
|
||||
|
||||
if options_hash not in converters:
|
||||
format_options: Dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: pdf_format_option,
|
||||
InputFormat.IMAGE: pdf_format_option,
|
||||
}
|
||||
|
||||
converters[options_hash] = DocumentConverter(format_options=format_options)
|
||||
_log.info(f"We now have {len(converters)} converters in memory.")
|
||||
|
||||
results: Iterator[ConversionResult] = converters[options_hash].convert_all(
|
||||
pdf_format_option = get_pdf_pipeline_opts(options)
|
||||
converter = get_converter(pdf_format_option)
|
||||
results: Iterator[ConversionResult] = converter.convert_all(
|
||||
sources,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
8
docling_serve/engines/__init__.py
Normal file
8
docling_serve/engines/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from functools import lru_cache
|
||||
|
||||
from docling_serve.engines.async_local.orchestrator import AsyncLocalOrchestrator
|
||||
|
||||
|
||||
@lru_cache
def get_orchestrator() -> AsyncLocalOrchestrator:
    """Return the shared orchestrator, creating it on the first call.

    ``lru_cache`` memoizes this zero-argument function, so repeated calls
    hand back the same ``AsyncLocalOrchestrator`` instance.
    """
    orchestrator = AsyncLocalOrchestrator()
    return orchestrator
|
||||
0
docling_serve/engines/async_local/__init__.py
Normal file
0
docling_serve/engines/async_local/__init__.py
Normal file
102
docling_serve/engines/async_local/orchestrator.py
Normal file
102
docling_serve/engines/async_local/orchestrator.py
Normal file
@@ -0,0 +1,102 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import WebSocket
|
||||
|
||||
from docling_serve.datamodel.engines import TaskStatus
|
||||
from docling_serve.datamodel.requests import ConvertDocumentsRequest
|
||||
from docling_serve.datamodel.responses import (
|
||||
MessageKind,
|
||||
TaskStatusResponse,
|
||||
WebsocketMessage,
|
||||
)
|
||||
from docling_serve.datamodel.task import Task
|
||||
from docling_serve.engines.async_local.worker import AsyncLocalWorker
|
||||
from docling_serve.engines.base_orchestrator import BaseOrchestrator
|
||||
from docling_serve.settings import docling_serve_settings
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OrchestratorError(Exception):
    """Base exception type for errors raised by the orchestrator module."""
|
||||
|
||||
|
||||
class TaskNotFoundError(OrchestratorError):
    """Raised when a requested task id is not in the orchestrator's task table."""
|
||||
|
||||
|
||||
class AsyncLocalOrchestrator(BaseOrchestrator):
    """In-process orchestrator built around an ``asyncio.Queue`` of task ids.

    Instance state:

    * ``task_queue``: queue of task ids consumed by the workers.
    * ``tasks``: task id -> ``Task`` for the tasks enqueued so far.
    * ``queue_list``: task ids still waiting, in enqueue order; workers
      remove an id when they pick the task up. Used for queue positions.
    * ``task_subscribers``: task id -> websockets to notify on updates.
    """

    def __init__(self):
        self.task_queue = asyncio.Queue()
        self.tasks: dict[str, Task] = {}
        self.queue_list: list[str] = []
        self.task_subscribers: dict[str, set[WebSocket]] = {}

    async def enqueue(self, request: ConvertDocumentsRequest) -> Task:
        """Create a task for ``request``, register it and queue its id.

        Returns:
            The newly created ``Task`` (identified by a fresh UUID4 string).
        """
        task_id = str(uuid.uuid4())
        task = Task(task_id=task_id, request=request)
        self.tasks[task_id] = task
        self.queue_list.append(task_id)
        self.task_subscribers[task_id] = set()
        await self.task_queue.put(task_id)
        return task

    async def queue_size(self) -> int:
        """Return the number of task ids currently sitting in the queue."""
        return self.task_queue.qsize()

    async def get_queue_position(self, task_id: str) -> Optional[int]:
        """Return the 1-based position of ``task_id`` among waiting tasks.

        Returns:
            The position, or ``None`` when the task is not waiting (never
            enqueued, or already picked up by a worker).
        """
        return (
            self.queue_list.index(task_id) + 1 if task_id in self.queue_list else None
        )

    async def task_status(self, task_id: str, wait: float = 0.0) -> Task:
        """Return the ``Task`` registered under ``task_id``.

        Args:
            task_id: Identifier returned by ``enqueue``.
            wait: Accepted for interface compatibility; not used here.

        Raises:
            TaskNotFoundError: If ``task_id`` is unknown.
        """
        if task_id not in self.tasks:
            raise TaskNotFoundError()
        return self.tasks[task_id]

    async def task_result(self, task_id: str):
        """Return the ``result`` attribute of the task ``task_id``.

        Raises:
            TaskNotFoundError: If ``task_id`` is unknown.
        """
        if task_id not in self.tasks:
            raise TaskNotFoundError()
        return self.tasks[task_id].result

    async def process_queue(self):
        """Start the configured number of workers and wait on them."""
        # Create a pool of workers
        workers = []
        for i in range(docling_serve_settings.eng_loc_num_workers):
            _log.debug(f"Starting worker {i}")
            w = AsyncLocalWorker(i, self)
            worker_task = asyncio.create_task(w.loop())
            workers.append(worker_task)

        # Wait for all workers to complete (they won't, as they run indefinitely)
        await asyncio.gather(*workers)
        _log.debug("All workers completed.")

    async def notify_task_subscribers(self, task_id: str):
        """Send the current status of ``task_id`` to its websocket subscribers.

        Each subscriber receives an ``UPDATE`` message; when the task is
        completed the websocket is closed right after the message is sent.

        Raises:
            RuntimeError: If no subscriber set exists for ``task_id``.
        """
        if task_id not in self.task_subscribers:
            raise RuntimeError(f"Task {task_id} does not have a subscribers list.")

        task = self.tasks[task_id]
        task_queue_position = await self.get_queue_position(task_id)
        msg = TaskStatusResponse(
            task_id=task.task_id,
            task_status=task.task_status,
            task_position=task_queue_position,
        )
        # Iterate over a snapshot: the awaits below yield to the event loop,
        # and other coroutines may add or remove subscribers meanwhile, which
        # would otherwise raise "Set changed size during iteration".
        for websocket in list(self.task_subscribers[task_id]):
            await websocket.send_text(
                WebsocketMessage(message=MessageKind.UPDATE, task=msg).model_dump_json()
            )
            if task.is_completed():
                await websocket.close()

    async def notify_queue_positions(self):
        """Push a status update to the subscribers of every pending task."""
        # Snapshot the keys: ``enqueue`` inserts into ``task_subscribers`` and
        # can run while this coroutine is suspended in an await, which would
        # otherwise raise "dictionary changed size during iteration".
        for task_id in list(self.task_subscribers):
            # notify only pending tasks
            if self.tasks[task_id].task_status != TaskStatus.PENDING:
                continue

            await self.notify_task_subscribers(task_id)
|
||||
116
docling_serve/engines/async_local/worker.py
Normal file
116
docling_serve/engines/async_local/worker.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
from fastapi import BackgroundTasks
|
||||
|
||||
from docling.datamodel.base_models import DocumentStream
|
||||
|
||||
from docling_serve.datamodel.engines import TaskStatus
|
||||
from docling_serve.datamodel.requests import ConvertDocumentFileSourcesRequest
|
||||
from docling_serve.datamodel.responses import ConvertDocumentResponse
|
||||
from docling_serve.docling_conversion import convert_documents
|
||||
from docling_serve.response_preparation import process_results
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling_serve.engines.async_local.orchestrator import AsyncLocalOrchestrator
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AsyncLocalWorker:
    """Worker that consumes task ids from an orchestrator queue and converts them."""

    def __init__(self, worker_id: int, orchestrator: "AsyncLocalOrchestrator"):
        # worker_id is only used to label log messages.
        self.worker_id = worker_id
        self.orchestrator = orchestrator

    async def loop(self):
        """Process queued tasks forever.

        For each task id taken from ``orchestrator.task_queue`` the worker:
        removes it from ``queue_list``, marks the task ``STARTED``, notifies
        subscribers, runs the conversion in a thread, stores the response on
        the task and marks it ``SUCCESS`` (or ``FAILURE`` on any exception).
        Subscribers are notified again and ``task_done()`` is called in all
        cases.

        Raises:
            RuntimeError: If a dequeued task id has no entry in
                ``orchestrator.tasks``.
        """
        _log.debug(f"Starting loop for worker {self.worker_id}")
        while True:
            task_id: str = await self.orchestrator.task_queue.get()
            self.orchestrator.queue_list.remove(task_id)

            if task_id not in self.orchestrator.tasks:
                raise RuntimeError(f"Task {task_id} not found.")
            task = self.orchestrator.tasks[task_id]

            try:
                task.task_status = TaskStatus.STARTED
                _log.info(f"Worker {self.worker_id} processing task {task_id}")

                # Notify clients about task updates
                await self.orchestrator.notify_task_subscribers(task_id)

                # Notify clients about queue updates
                await self.orchestrator.notify_queue_positions()

                # Blocking conversion job, executed in a worker thread below.
                # TODO: send partial updates, e.g. when a document in the batch is done
                def run_conversion():
                    sources: list[Union[str, DocumentStream]] = []
                    headers: Optional[dict[str, Any]] = None
                    if isinstance(task.request, ConvertDocumentFileSourcesRequest):
                        for file_source in task.request.file_sources:
                            sources.append(file_source.to_document_stream())
                    else:
                        for http_source in task.request.http_sources:
                            sources.append(http_source.url)
                            # Only the first source with non-empty headers
                            # contributes the headers used for the batch.
                            if headers is None and http_source.headers:
                                headers = http_source.headers

                    # Note: results are only an iterator->lazy evaluation
                    results = convert_documents(
                        sources=sources,
                        options=task.request.options,
                        headers=headers,
                    )

                    # The real processing will happen here
                    response = process_results(
                        background_tasks=BackgroundTasks(),
                        conversion_options=task.request.options,
                        conv_results=results,
                    )

                    return response

                # Run the conversion in a thread to avoid blocking the event loop.
                start_time = time.monotonic()
                response = await asyncio.to_thread(
                    run_conversion,
                )
                processing_time = time.monotonic() - start_time

                if not isinstance(response, ConvertDocumentResponse):
                    # Both literals must be f-strings, otherwise the task id
                    # and response type are logged as raw placeholders.
                    _log.error(
                        f"Worker {self.worker_id} got un-processable "
                        f"result for {task_id}: {type(response)}"
                    )
                task.result = response
                # Drop the reference to the request once the result is stored.
                task.request = None

                task.task_status = TaskStatus.SUCCESS
                _log.info(
                    f"Worker {self.worker_id} completed job {task_id} "
                    f"in {processing_time:.2f} seconds"
                )

            except Exception as e:
                _log.error(
                    f"Worker {self.worker_id} failed to process job {task_id}: {e}"
                )
                task.task_status = TaskStatus.FAILURE

            finally:
                await self.orchestrator.notify_task_subscribers(task_id)
                self.orchestrator.task_queue.task_done()
                _log.debug(f"Worker {self.worker_id} completely done with {task_id}")
|
||||
21
docling_serve/engines/base_orchestrator.py
Normal file
21
docling_serve/engines/base_orchestrator.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from docling_serve.datamodel.task import Task
|
||||
|
||||
|
||||
class BaseOrchestrator(ABC):
    """Abstract interface that every orchestrator engine must implement."""

    @abstractmethod
    async def enqueue(self, task) -> Task:
        """Accept ``task`` for processing and return a ``Task``."""

    @abstractmethod
    async def queue_size(self) -> int:
        """Return the current size of the queue as an integer."""

    @abstractmethod
    async def task_status(self, task_id: str) -> Task:
        """Return the ``Task`` identified by ``task_id``."""

    @abstractmethod
    async def task_result(self, task_id: str):
        """Return the result associated with ``task_id``."""
|
||||
0
docling_serve/engines/block_local/__init__.py
Normal file
0
docling_serve/engines/block_local/__init__.py
Normal file
@@ -1,7 +1,6 @@
|
||||
import importlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
@@ -9,9 +8,31 @@ import gradio as gr
|
||||
import requests
|
||||
|
||||
from docling_serve.helper_functions import _to_list_of_strings
|
||||
from docling_serve.settings import docling_serve_settings, uvicorn_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
############################
|
||||
# Path of static artifacts #
|
||||
############################
|
||||
|
||||
logo_path = "https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"
|
||||
js_components_url = "https://unpkg.com/@docling/docling-components@0.0.3"
|
||||
if (
|
||||
docling_serve_settings.static_path is not None
|
||||
and docling_serve_settings.static_path.is_dir()
|
||||
):
|
||||
logo_path = str(docling_serve_settings.static_path / "logo.svg")
|
||||
js_components_url = "/static/docling-components.js"
|
||||
|
||||
|
||||
##############################
|
||||
# Head JS for web components #
|
||||
##############################
|
||||
head = f"""
|
||||
<script src="{js_components_url}" type="module"></script>
|
||||
"""
|
||||
|
||||
#################
|
||||
# CSS and theme #
|
||||
#################
|
||||
@@ -49,6 +70,14 @@ css = """
|
||||
#file_input_zone {
|
||||
height: 140px;
|
||||
}
|
||||
|
||||
docling-img::part(pages) {
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
docling-img::part(page) {
|
||||
box-shadow: 0 0.5rem 1rem 0 rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
"""
|
||||
|
||||
theme = gr.themes.Default(
|
||||
@@ -81,7 +110,7 @@ file_output_path = None # Will be set when a new file is generated
|
||||
|
||||
|
||||
def health_check():
|
||||
response = requests.get(f"http://localhost:{int(os.getenv('PORT', '5001'))}/health")
|
||||
response = requests.get(f"http://localhost:{uvicorn_settings.port}/health")
|
||||
if response.status_code == 200:
|
||||
return "Healthy"
|
||||
return "Unhealthy"
|
||||
@@ -110,6 +139,7 @@ def set_download_button_label(label_text: gr.State):
|
||||
def clear_outputs():
|
||||
markdown_content = ""
|
||||
json_content = ""
|
||||
json_rendered_content = ""
|
||||
html_content = ""
|
||||
text_content = ""
|
||||
doctags_content = ""
|
||||
@@ -118,6 +148,7 @@ def clear_outputs():
|
||||
markdown_content,
|
||||
markdown_content,
|
||||
json_content,
|
||||
json_rendered_content,
|
||||
html_content,
|
||||
html_content,
|
||||
text_content,
|
||||
@@ -168,6 +199,10 @@ def process_url(
|
||||
table_mode,
|
||||
abort_on_error,
|
||||
return_as_file,
|
||||
do_code_enrichment,
|
||||
do_formula_enrichment,
|
||||
do_picture_classification,
|
||||
do_picture_description,
|
||||
):
|
||||
parameters = {
|
||||
"http_sources": [{"url": source} for source in input_sources.split(",")],
|
||||
@@ -182,6 +217,10 @@ def process_url(
|
||||
"table_mode": table_mode,
|
||||
"abort_on_error": abort_on_error,
|
||||
"return_as_file": return_as_file,
|
||||
"do_code_enrichment": do_code_enrichment,
|
||||
"do_formula_enrichment": do_formula_enrichment,
|
||||
"do_picture_classification": do_picture_classification,
|
||||
"do_picture_description": do_picture_description,
|
||||
},
|
||||
}
|
||||
if (
|
||||
@@ -193,7 +232,7 @@ def process_url(
|
||||
raise gr.Error("No input sources provided.", print_exception=False)
|
||||
try:
|
||||
response = requests.post(
|
||||
f"http://localhost:{int(os.getenv('PORT', '5001'))}/v1alpha/convert/source",
|
||||
f"http://localhost:{uvicorn_settings.port}/v1alpha/convert/source",
|
||||
json=parameters,
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -220,6 +259,10 @@ def process_file(
|
||||
table_mode,
|
||||
abort_on_error,
|
||||
return_as_file,
|
||||
do_code_enrichment,
|
||||
do_formula_enrichment,
|
||||
do_picture_classification,
|
||||
do_picture_description,
|
||||
):
|
||||
if not files or len(files) == 0 or files[0] == "":
|
||||
logger.error("No files provided.")
|
||||
@@ -237,11 +280,15 @@ def process_file(
|
||||
"table_mode": table_mode,
|
||||
"abort_on_error": str(abort_on_error).lower(),
|
||||
"return_as_file": str(return_as_file).lower(),
|
||||
"do_code_enrichment": str(do_code_enrichment).lower(),
|
||||
"do_formula_enrichment": str(do_formula_enrichment).lower(),
|
||||
"do_picture_classification": str(do_picture_classification).lower(),
|
||||
"do_picture_description": str(do_picture_description).lower(),
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(
|
||||
f"http://localhost:{int(os.getenv('PORT', '5001'))}/v1alpha/convert/file",
|
||||
f"http://localhost:{uvicorn_settings.port}/v1alpha/convert/file",
|
||||
files=files_data,
|
||||
data=parameters,
|
||||
)
|
||||
@@ -260,6 +307,7 @@ def process_file(
|
||||
def response_to_output(response, return_as_file):
|
||||
markdown_content = ""
|
||||
json_content = ""
|
||||
json_rendered_content = ""
|
||||
html_content = ""
|
||||
text_content = ""
|
||||
doctags_content = ""
|
||||
@@ -282,6 +330,12 @@ def response_to_output(response, return_as_file):
|
||||
json_content = json.dumps(
|
||||
full_content.get("document").get("json_content"), indent=2
|
||||
)
|
||||
# Embed document JSON and trigger load at client via an image.
|
||||
json_rendered_content = f"""
|
||||
<docling-img id="dclimg" pagenumbers tooltip="parsed"></docling-img>
|
||||
<script id="dcljson" type="application/json" onload="document.getElementById('dclimg').src = JSON.parse(document.getElementById('dcljson').textContent);">{json_content}</script>
|
||||
<img src onerror="document.getElementById('dclimg').src = JSON.parse(document.getElementById('dcljson').textContent);" />
|
||||
"""
|
||||
html_content = full_content.get("document").get("html_content")
|
||||
text_content = full_content.get("document").get("text_content")
|
||||
doctags_content = full_content.get("document").get("doctags_content")
|
||||
@@ -289,6 +343,7 @@ def response_to_output(response, return_as_file):
|
||||
markdown_content,
|
||||
markdown_content,
|
||||
json_content,
|
||||
json_rendered_content,
|
||||
html_content,
|
||||
html_content,
|
||||
text_content,
|
||||
@@ -302,12 +357,12 @@ def response_to_output(response, return_as_file):
|
||||
############
|
||||
|
||||
with gr.Blocks(
|
||||
head=head,
|
||||
css=css,
|
||||
theme=theme,
|
||||
title="Docling Serve",
|
||||
delete_cache=(3600, 3600), # Delete all files older than 1 hour every hour
|
||||
) as ui:
|
||||
|
||||
# Constants stored in states to be able to pass them as inputs to functions
|
||||
processing_text = gr.State("Processing your document(s), please wait...")
|
||||
true_bool = gr.State(True)
|
||||
@@ -317,17 +372,21 @@ with gr.Blocks(
|
||||
with gr.Row(elem_id="check_health"):
|
||||
# Logo
|
||||
with gr.Column(scale=1, min_width=90):
|
||||
gr.Image(
|
||||
"https://ds4sd.github.io/docling/assets/logo.png",
|
||||
height=80,
|
||||
width=80,
|
||||
show_download_button=False,
|
||||
show_label=False,
|
||||
show_fullscreen_button=False,
|
||||
container=False,
|
||||
elem_id="logo",
|
||||
scale=0,
|
||||
)
|
||||
try:
|
||||
gr.Image(
|
||||
logo_path,
|
||||
height=80,
|
||||
width=80,
|
||||
show_download_button=False,
|
||||
show_label=False,
|
||||
show_fullscreen_button=False,
|
||||
container=False,
|
||||
elem_id="logo",
|
||||
scale=0,
|
||||
)
|
||||
except Exception:
|
||||
logger.warning("Logo not found.")
|
||||
|
||||
# Title
|
||||
with gr.Column(scale=1, min_width=200):
|
||||
gr.Markdown(
|
||||
@@ -453,6 +512,21 @@ with gr.Blocks(
|
||||
with gr.Column(scale=1):
|
||||
abort_on_error = gr.Checkbox(label="Abort on Error", value=False)
|
||||
return_as_file = gr.Checkbox(label="Return as File", value=False)
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
do_code_enrichment = gr.Checkbox(
|
||||
label="Enable code enrichment", value=False
|
||||
)
|
||||
do_formula_enrichment = gr.Checkbox(
|
||||
label="Enable formula enrichment", value=False
|
||||
)
|
||||
with gr.Column():
|
||||
do_picture_classification = gr.Checkbox(
|
||||
label="Enable picture classification", value=False
|
||||
)
|
||||
do_picture_description = gr.Checkbox(
|
||||
label="Enable picture description", value=False
|
||||
)
|
||||
|
||||
# Document output
|
||||
with gr.Row(visible=False) as content_output:
|
||||
@@ -464,6 +538,8 @@ with gr.Blocks(
|
||||
output_markdown_rendered = gr.Markdown(label="Response")
|
||||
with gr.Tab("Docling (JSON)"):
|
||||
output_json = gr.Code(language="json", wrap_lines=True, show_label=False)
|
||||
with gr.Tab("Docling-Rendered"):
|
||||
output_json_rendered = gr.HTML()
|
||||
with gr.Tab("HTML"):
|
||||
output_html = gr.Code(language="html", wrap_lines=True, show_label=False)
|
||||
with gr.Tab("HTML-Rendered"):
|
||||
@@ -514,6 +590,7 @@ with gr.Blocks(
|
||||
output_markdown,
|
||||
output_markdown_rendered,
|
||||
output_json,
|
||||
output_json_rendered,
|
||||
output_html,
|
||||
output_html_rendered,
|
||||
output_text,
|
||||
@@ -533,11 +610,16 @@ with gr.Blocks(
|
||||
table_mode,
|
||||
abort_on_error,
|
||||
return_as_file,
|
||||
do_code_enrichment,
|
||||
do_formula_enrichment,
|
||||
do_picture_classification,
|
||||
do_picture_description,
|
||||
],
|
||||
outputs=[
|
||||
output_markdown,
|
||||
output_markdown_rendered,
|
||||
output_json,
|
||||
output_json_rendered,
|
||||
output_html,
|
||||
output_html_rendered,
|
||||
output_text,
|
||||
@@ -553,6 +635,7 @@ with gr.Blocks(
|
||||
output_markdown,
|
||||
output_markdown_rendered,
|
||||
output_json,
|
||||
output_json_rendered,
|
||||
output_html,
|
||||
output_html_rendered,
|
||||
output_text,
|
||||
@@ -562,9 +645,7 @@ with gr.Blocks(
|
||||
set_outputs_visibility_direct,
|
||||
inputs=[false_bool, false_bool],
|
||||
outputs=[content_output, file_output],
|
||||
).then(
|
||||
clear_url_input, inputs=None, outputs=[url_input]
|
||||
)
|
||||
).then(clear_url_input, inputs=None, outputs=[url_input])
|
||||
|
||||
# File processing
|
||||
file_process_btn.click(
|
||||
@@ -582,6 +663,7 @@ with gr.Blocks(
|
||||
output_markdown,
|
||||
output_markdown_rendered,
|
||||
output_json,
|
||||
output_json_rendered,
|
||||
output_html,
|
||||
output_html_rendered,
|
||||
output_text,
|
||||
@@ -601,11 +683,16 @@ with gr.Blocks(
|
||||
table_mode,
|
||||
abort_on_error,
|
||||
return_as_file,
|
||||
do_code_enrichment,
|
||||
do_formula_enrichment,
|
||||
do_picture_classification,
|
||||
do_picture_description,
|
||||
],
|
||||
outputs=[
|
||||
output_markdown,
|
||||
output_markdown_rendered,
|
||||
output_json,
|
||||
output_json_rendered,
|
||||
output_html,
|
||||
output_html_rendered,
|
||||
output_text,
|
||||
@@ -621,6 +708,7 @@ with gr.Blocks(
|
||||
output_markdown,
|
||||
output_markdown_rendered,
|
||||
output_json,
|
||||
output_json_rendered,
|
||||
output_html,
|
||||
output_html_rendered,
|
||||
output_text,
|
||||
@@ -630,6 +718,4 @@ with gr.Blocks(
|
||||
set_outputs_visibility_direct,
|
||||
inputs=[false_bool, false_bool],
|
||||
outputs=[content_output, file_output],
|
||||
).then(
|
||||
clear_file_input, inputs=None, outputs=[file_input]
|
||||
)
|
||||
).then(clear_file_input, inputs=None, outputs=[file_input])
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import inspect
|
||||
import re
|
||||
from typing import List, Type, Union
|
||||
from typing import Union
|
||||
|
||||
from fastapi import Depends, Form
|
||||
from pydantic import BaseModel
|
||||
@@ -8,7 +8,7 @@ from pydantic import BaseModel
|
||||
|
||||
# Adapted from
|
||||
# https://github.com/fastapi/fastapi/discussions/8971#discussioncomment-7892972
|
||||
def FormDepends(cls: Type[BaseModel]):
|
||||
def FormDepends(cls: type[BaseModel]):
|
||||
new_parameters = []
|
||||
|
||||
for field_name, model_field in cls.model_fields.items():
|
||||
@@ -34,8 +34,8 @@ def FormDepends(cls: Type[BaseModel]):
|
||||
return Depends(as_form_func)
|
||||
|
||||
|
||||
def _to_list_of_strings(input_value: Union[str, List[str]]) -> List[str]:
|
||||
def split_and_strip(value: str) -> List[str]:
|
||||
def _to_list_of_strings(input_value: Union[str, list[str]]) -> list[str]:
|
||||
def split_and_strip(value: str) -> list[str]:
|
||||
if re.search(r"[;,]", value):
|
||||
return [item.strip() for item in re.split(r"[;,]", value)]
|
||||
else:
|
||||
|
||||
@@ -3,43 +3,23 @@ import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Union
|
||||
from typing import Union
|
||||
|
||||
from docling.datamodel.base_models import OutputFormat
|
||||
from docling.datamodel.document import ConversionResult, ConversionStatus, ErrorItem
|
||||
from docling.utils.profiling import ProfilingItem
|
||||
from docling_core.types.doc import DoclingDocument, ImageRefMode
|
||||
from fastapi import BackgroundTasks, HTTPException
|
||||
from fastapi.responses import FileResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling_serve.docling_conversion import ConvertDocumentsOptions
|
||||
from docling.datamodel.base_models import OutputFormat
|
||||
from docling.datamodel.document import ConversionResult, ConversionStatus
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
|
||||
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
||||
from docling_serve.datamodel.responses import ConvertDocumentResponse, DocumentResponse
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentResponse(BaseModel):
|
||||
filename: str
|
||||
md_content: Optional[str] = None
|
||||
json_content: Optional[DoclingDocument] = None
|
||||
html_content: Optional[str] = None
|
||||
text_content: Optional[str] = None
|
||||
doctags_content: Optional[str] = None
|
||||
|
||||
|
||||
class ConvertDocumentResponse(BaseModel):
|
||||
document: DocumentResponse
|
||||
status: ConversionStatus
|
||||
errors: List[ErrorItem] = []
|
||||
processing_time: float
|
||||
timings: Dict[str, ProfilingItem] = {}
|
||||
|
||||
|
||||
class ConvertDocumentErrorResponse(BaseModel):
|
||||
status: ConversionStatus
|
||||
|
||||
|
||||
def _export_document_as_content(
|
||||
conv_res: ConversionResult,
|
||||
export_json: bool,
|
||||
@@ -49,7 +29,6 @@ def _export_document_as_content(
|
||||
export_doctags: bool,
|
||||
image_mode: ImageRefMode,
|
||||
):
|
||||
|
||||
document = DocumentResponse(filename=conv_res.input.file.name)
|
||||
|
||||
if conv_res.status == ConversionStatus.SUCCESS:
|
||||
@@ -86,7 +65,6 @@ def _export_documents_as_files(
|
||||
export_doctags: bool,
|
||||
image_export_mode: ImageRefMode,
|
||||
):
|
||||
|
||||
success_count = 0
|
||||
failure_count = 0
|
||||
|
||||
@@ -150,7 +128,6 @@ def process_results(
|
||||
conversion_options: ConvertDocumentsOptions,
|
||||
conv_results: Iterable[ConversionResult],
|
||||
) -> Union[ConvertDocumentResponse, FileResponse]:
|
||||
|
||||
# Let's start by processing the documents
|
||||
try:
|
||||
start_time = time.monotonic()
|
||||
|
||||
@@ -3,6 +3,8 @@ from typing import Optional, Union
|
||||
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
from docling_serve.datamodel.engines import AsyncEngine
|
||||
|
||||
|
||||
class UvicornSettings(BaseSettings):
|
||||
model_config = SettingsConfigDict(
|
||||
@@ -14,6 +16,10 @@ class UvicornSettings(BaseSettings):
|
||||
reload: bool = False
|
||||
root_path: str = ""
|
||||
proxy_headers: bool = True
|
||||
timeout_keep_alive: int = 60
|
||||
ssl_certfile: Optional[Path] = None
|
||||
ssl_keyfile: Optional[Path] = None
|
||||
ssl_keyfile_password: Optional[str] = None
|
||||
workers: Union[int, None] = None
|
||||
|
||||
|
||||
@@ -27,6 +33,16 @@ class DoclingServeSettings(BaseSettings):
|
||||
|
||||
enable_ui: bool = False
|
||||
artifacts_path: Optional[Path] = None
|
||||
static_path: Optional[Path] = None
|
||||
options_cache_size: int = 2
|
||||
allow_external_plugins: bool = False
|
||||
|
||||
cors_origins: list[str] = ["*"]
|
||||
cors_methods: list[str] = ["*"]
|
||||
cors_headers: list[str] = ["*"]
|
||||
|
||||
eng_kind: AsyncEngine = AsyncEngine.LOCAL
|
||||
eng_loc_num_workers: int = 2
|
||||
|
||||
|
||||
uvicorn_settings = UvicornSettings()
|
||||
|
||||
8
docs/README.md
Normal file
8
docs/README.md
Normal file
@@ -0,0 +1,8 @@
|
||||
# Docling Serve documentation
|
||||
|
||||
These documentation pages cover the webserver configuration, runtime options, and deployment examples, as well as development best practices.
|
||||
|
||||
- [Configuration](./configuration.md)
|
||||
- [Advanced usage](./usage.md)
|
||||
- [Deployment](./deployment.md)
|
||||
- [Development](./development.md)
|
||||
BIN
docs/assets/docling-serve-pic.png
Normal file
BIN
docs/assets/docling-serve-pic.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 504 KiB |
44
docs/configuration.md
Normal file
44
docs/configuration.md
Normal file
@@ -0,0 +1,44 @@
|
||||
# Configuration
|
||||
|
||||
The `docling-serve` executable can be configured via command line
|
||||
options as well as environment variables.
|
||||
Configurations are divided between the settings used for the `uvicorn` ASGI
|
||||
server and the actual app-specific configurations.
|
||||
|
||||
> [!WARNING]
|
||||
> When the server is running with `reload` or with multiple `workers`, uvicorn
|
||||
> will spawn multiple subprocesses. This invalidates all the values configured
|
||||
> via the CLI command line options. Please use environment variables in this
|
||||
> type of deployment.
|
||||
|
||||
## Webserver configuration
|
||||
|
||||
The following table shows the options which are propagated directly to the
|
||||
`uvicorn` webserver runtime.
|
||||
|
||||
| CLI option | ENV | Default | Description |
|
||||
| -----------|-----|---------|-------------|
|
||||
| `--host` | `UVICORN_HOST` | `0.0.0.0` for `run`, `localhost` for `dev` | The host to serve on. |
|
||||
| `--port` | `UVICORN_PORT` | `5001` | The port to serve on. |
|
||||
| `--reload` | `UVICORN_RELOAD` | `false` for `run`, `true` for `dev` | Enable auto-reload of the server when (code) files change. |
|
||||
| `--workers` | `UVICORN_WORKERS` | `1` | Use multiple worker processes. |
|
||||
| `--root-path` | `UVICORN_ROOT_PATH` | `""` | The root path is used to tell your app that it is being served to the outside world with some path prefix set up in some termination proxy or similar. |
|
||||
| `--proxy-headers` | `UVICORN_PROXY_HEADERS` | `true` | Enable/Disable X-Forwarded-Proto, X-Forwarded-For, X-Forwarded-Port to populate remote address info. |
|
||||
| `--timeout-keep-alive` | `UVICORN_TIMEOUT_KEEP_ALIVE` | `60` | Seconds to wait for new data before closing an idle Keep-Alive connection. |
|
||||
| `--ssl-certfile` | `UVICORN_SSL_CERTFILE` | | SSL certificate file. |
|
||||
| `--ssl-keyfile` | `UVICORN_SSL_KEYFILE` | | SSL key file. |
|
||||
| `--ssl-keyfile-password` | `UVICORN_SSL_KEYFILE_PASSWORD` | | SSL keyfile password. |
|
||||
|
||||
## Docling Serve configuration
|
||||
|
||||
The following table describes the options to configure the Docling Serve app.
|
||||
|
||||
| CLI option | ENV | Default | Description |
|
||||
| -----------|-----|---------|-------------|
|
||||
| `--artifacts-path` | `DOCLING_SERVE_ARTIFACTS_PATH` | unset | If set to a valid directory, the model weights will be loaded from this path |
|
||||
| | `DOCLING_SERVE_STATIC_PATH` | unset | If set to a valid directory, the static assets for the docs and ui will be loaded from this path |
|
||||
| `--enable-ui` | `DOCLING_SERVE_ENABLE_UI` | `false` | Enable the demonstrator UI. |
|
||||
| | `DOCLING_SERVE_OPTIONS_CACHE_SIZE` | `2` | How many DocumentConverter objects (including their loaded models) to keep in the cache. |
|
||||
| | `DOCLING_SERVE_CORS_ORIGINS` | `["*"]` | A list of origins that should be permitted to make cross-origin requests. |
|
||||
| | `DOCLING_SERVE_CORS_METHODS` | `["*"]` | A list of HTTP methods that should be allowed for cross-origin requests. |
|
||||
| | `DOCLING_SERVE_CORS_HEADERS` | `["*"]` | A list of HTTP request headers that should be supported for cross-origin requests. |
|
||||
209
docs/deploy-examples/docling-serve-oauth.yaml
Normal file
209
docs/deploy-examples/docling-serve-oauth.yaml
Normal file
@@ -0,0 +1,209 @@
|
||||
# This example deployment configures Docling Serve with an OAuth-Proxy sidecar and TLS termination
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: docling-serve
|
||||
labels:
|
||||
app: docling-serve
|
||||
annotations:
|
||||
serviceaccounts.openshift.io/oauth-redirectreference.primary: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"docling-serve"}}'
|
||||
---
|
||||
kind: Role
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
metadata:
|
||||
name: docling-serve-oauth
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
rules:
|
||||
- verbs:
|
||||
- create
|
||||
apiGroups:
|
||||
- authorization.k8s.io
|
||||
resources:
|
||||
- subjectaccessreviews
|
||||
- verbs:
|
||||
- create
|
||||
apiGroups:
|
||||
- authentication.k8s.io
|
||||
resources:
|
||||
- tokenreviews
|
||||
---
|
||||
kind: RoleBinding
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
metadata:
|
||||
name: docling-serve-oauth
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: docling-serve
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: docling-serve-oauth
|
||||
---
|
||||
apiVersion: route.openshift.io/v1
|
||||
kind: Route
|
||||
metadata:
|
||||
name: docling-serve
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
spec:
|
||||
to:
|
||||
kind: Service
|
||||
name: docling-serve
|
||||
port:
|
||||
targetPort: oauth
|
||||
tls:
|
||||
termination: Reencrypt
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: docling-serve
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
annotations:
|
||||
service.alpha.openshift.io/serving-cert-secret-name: docling-serve-tls
|
||||
spec:
|
||||
ports:
|
||||
- name: oauth
|
||||
port: 8443
|
||||
targetPort: oauth
|
||||
- name: http
|
||||
port: 5001
|
||||
targetPort: http
|
||||
selector:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
---
|
||||
kind: Deployment
|
||||
apiVersion: apps/v1
|
||||
metadata:
|
||||
name: docling-serve
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
spec:
|
||||
restartPolicy: Always
|
||||
serviceAccountName: docling-serve
|
||||
containers:
|
||||
- name: api
|
||||
resources:
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 2Gi
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 1Gi
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: http
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 10
|
||||
timeoutSeconds: 2
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: http
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 3
|
||||
timeoutSeconds: 2
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
env:
|
||||
- name: DOCLING_SERVE_ENABLE_UI
|
||||
value: 'true'
|
||||
- name: UVICORN_SSL_CERTFILE
|
||||
value: '/etc/tls/private/tls.crt'
|
||||
- name: UVICORN_SSL_KEYFILE
|
||||
value: '/etc/tls/private/tls.key'
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 5001
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: proxy-tls
|
||||
mountPath: /etc/tls/private
|
||||
imagePullPolicy: Always
|
||||
image: 'ghcr.io/docling-project/docling-serve:dev-ssl'
|
||||
- name: oauth-proxy
|
||||
resources:
|
||||
limits:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /oauth/healthz
|
||||
port: oauth
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 5
|
||||
timeoutSeconds: 1
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /oauth/healthz
|
||||
port: oauth
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 30
|
||||
timeoutSeconds: 1
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
ports:
|
||||
- name: oauth
|
||||
containerPort: 8443
|
||||
protocol: TCP
|
||||
imagePullPolicy: IfNotPresent
|
||||
volumeMounts:
|
||||
- name: proxy-tls
|
||||
mountPath: /etc/tls/private
|
||||
env:
|
||||
- name: NAMESPACE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.namespace
|
||||
image: 'registry.redhat.io/openshift4/ose-oauth-proxy:v4.13'
|
||||
args:
|
||||
- '--https-address=:8443'
|
||||
- '--provider=openshift'
|
||||
- '--openshift-service-account=docling-serve'
|
||||
- '--upstream=https://docling-serve.$(NAMESPACE).svc.cluster.local:5001'
|
||||
- '--upstream-ca=/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt'
|
||||
- '--tls-cert=/etc/tls/private/tls.crt'
|
||||
- '--tls-key=/etc/tls/private/tls.key'
|
||||
- '--cookie-secret=SECRET'
|
||||
- '--openshift-delegate-urls={"/": {"group":"route.openshift.io","resource":"routes","verb":"get","name":"docling-serve","namespace":"$(NAMESPACE)"}}'
|
||||
- '--openshift-sar={"namespace":"$(NAMESPACE)","resource":"routes","resourceName":"docling-serve","verb":"get","resourceAPIGroup":"route.openshift.io"}'
|
||||
- '--skip-auth-regex=''(^/health|^/docs)'''
|
||||
volumes:
|
||||
- name: proxy-tls
|
||||
secret:
|
||||
secretName: docling-serve-tls
|
||||
defaultMode: 420
|
||||
40
docs/deployment.md
Normal file
40
docs/deployment.md
Normal file
@@ -0,0 +1,40 @@
|
||||
# Deployment
|
||||
|
||||
## OpenShift
|
||||
|
||||
### Secure deployment with `oauth-proxy`
|
||||
|
||||
Manifest example: [docling-serve-oauth.yaml](./deploy-examples/docling-serve-oauth.yaml)
|
||||
|
||||
This deployment has the following features:
|
||||
|
||||
- TLS encryption between all components (using the cluster-internal certificate authority).
|
||||
- Authentication via a secure `oauth-proxy` sidecar.
|
||||
- Exposure of the service through a secure OpenShift `Route`.
|
||||
|
||||
Install the app with:
|
||||
|
||||
```sh
|
||||
kubectl apply -f docs/deploy-examples/docling-serve-oauth.yaml
|
||||
```
|
||||
|
||||
For using the API:
|
||||
|
||||
```sh
|
||||
# Retrieve the endpoint
|
||||
DOCLING_NAME=docling-serve
|
||||
DOCLING_ROUTE="https://$(oc get routes ${DOCLING_NAME} --template={{.spec.host}})"
|
||||
|
||||
# Retrieve the authentication token
|
||||
OCP_AUTH_TOKEN=$(oc whoami --show-token)
|
||||
|
||||
# Make a test query
|
||||
curl -X 'POST' \
|
||||
"${DOCLING_ROUTE}/v1alpha/convert/source/async" \
|
||||
-H "Authorization: Bearer ${OCP_AUTH_TOKEN}" \
|
||||
-H "accept: application/json" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}]
|
||||
}'
|
||||
```
|
||||
57
docs/development.md
Normal file
57
docs/development.md
Normal file
@@ -0,0 +1,57 @@
|
||||
# Development
|
||||
|
||||
## Install dependencies
|
||||
|
||||
### CPU only
|
||||
|
||||
```sh
|
||||
# Install uv if not already available
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
# Install dependencies
|
||||
uv sync --extra cpu
|
||||
```
|
||||
|
||||
### Cuda GPU
|
||||
|
||||
For GPU support use the following command:
|
||||
|
||||
```sh
|
||||
# Install dependencies
|
||||
uv sync
|
||||
```
|
||||
|
||||
### Gradio UI and different OCR backends
|
||||
|
||||
`/ui` endpoint using `gradio` and different OCR backends can be enabled via package extras:
|
||||
|
||||
```sh
|
||||
# Enable ui and rapidocr
|
||||
uv sync --extra ui --extra rapidocr
|
||||
```
|
||||
|
||||
```sh
|
||||
# Enable tesserocr
|
||||
uv sync --extra tesserocr
|
||||
```
|
||||
|
||||
See the `[project.optional-dependencies]` section in `pyproject.toml` for the full list of extras, and run `uv run docling-serve --help` to see the available runtime options.
|
||||
|
||||
### Run the server
|
||||
|
||||
The `docling-serve` executable is a convenient script for launching the webserver both in
|
||||
development and production mode.
|
||||
|
||||
```sh
|
||||
# Run the server in development mode
|
||||
# - reload is enabled by default
|
||||
# - listening on the 127.0.0.1 address
|
||||
# - ui is enabled by default
|
||||
docling-serve dev
|
||||
|
||||
# Run the server in production mode
|
||||
# - reload is disabled by default
|
||||
# - listening on the 0.0.0.0 address
|
||||
# - ui is disabled by default
|
||||
docling-serve run
|
||||
```
|
||||
279
docs/usage.md
Normal file
279
docs/usage.md
Normal file
@@ -0,0 +1,279 @@
|
||||
# Usage
|
||||
|
||||
The API provides two endpoints: one for urls, one for files. This is necessary to send files directly in binary format instead of base64-encoded strings.
|
||||
|
||||
## Common parameters
|
||||
|
||||
In addition to the document source (see below), both endpoints support the same parameters, which closely mirror those of the Docling CLI.
|
||||
|
||||
- `from_formats` (List[str]): Input format(s) to convert from. Allowed values: `docx`, `pptx`, `html`, `image`, `pdf`, `asciidoc`, `md`. Defaults to all formats.
|
||||
- `to_formats` (List[str]): Output format(s) to convert to. Allowed values: `md`, `json`, `html`, `text`, `doctags`. Defaults to `md`.
|
||||
- `do_ocr` (bool): If enabled, the bitmap content will be processed using OCR. Defaults to `True`.
|
||||
- `image_export_mode`: Image export mode for the document (only in case of JSON, Markdown or HTML). Allowed values: embedded, placeholder, referenced. Optional, defaults to `embedded`.
|
||||
- `force_ocr` (bool): If enabled, replace any existing text with OCR-generated text over the full content. Defaults to `False`.
|
||||
- `ocr_engine` (str): OCR engine to use. Allowed values: `easyocr`, `tesseract_cli`, `tesseract`, `rapidocr`, `ocrmac`. Defaults to `easyocr`.
|
||||
- `ocr_lang` (List[str]): List of languages used by the OCR engine. Note that each OCR engine has different values for the language names. Defaults to empty.
|
||||
- `pdf_backend` (str): PDF backend to use. Allowed values: `pypdfium2`, `dlparse_v1`, `dlparse_v2`. Defaults to `dlparse_v2`.
|
||||
- `table_mode` (str): Table mode to use. Allowed values: `fast`, `accurate`. Defaults to `fast`.
|
||||
- `abort_on_error` (bool): If enabled, abort on error. Defaults to false.
|
||||
- `return_as_file` (bool): If enabled, return the output as a file. Defaults to false.
|
||||
- `do_table_structure` (bool): If enabled, the table structure will be extracted. Defaults to true.
|
||||
- `include_images` (bool): If enabled, images will be extracted from the document. Defaults to true.
|
||||
- `images_scale` (float): Scale factor for images. Defaults to 2.0.
|
||||
|
||||
## Convert endpoints
|
||||
|
||||
### Source endpoint
|
||||
|
||||
The endpoint is `/v1alpha/convert/source`, listening for POST requests of JSON payloads.
|
||||
|
||||
On top of the above parameters, you must send the document(s) you want to process using either the `http_sources` or the `file_sources` field.
|
||||
The first fetches the document(s) from the given URL(s) (optionally with extra headers); the second lets you provide documents as base64-encoded strings.
|
||||
The `options` are not required; they can be partially or completely omitted.
|
||||
|
||||
Simple payload example:
|
||||
|
||||
```json
|
||||
{
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
```
|
||||
|
||||
<details>
|
||||
|
||||
<summary>Complete payload example:</summary>
|
||||
|
||||
```json
|
||||
{
|
||||
"options": {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": true,
|
||||
"force_ocr": false,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": false,
|
||||
"return_as_file": false
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
|
||||
<summary>CURL example:</summary>
|
||||
|
||||
```sh
|
||||
curl -X 'POST' \
|
||||
'http://localhost:5001/v1alpha/convert/source' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"options": {
|
||||
"from_formats": [
|
||||
"docx",
|
||||
"pptx",
|
||||
"html",
|
||||
"image",
|
||||
"pdf",
|
||||
"asciidoc",
|
||||
"md",
|
||||
"xlsx"
|
||||
],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": true,
|
||||
"force_ocr": false,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": [
|
||||
"fr",
|
||||
"de",
|
||||
"es",
|
||||
"en"
|
||||
],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": false,
|
||||
"return_as_file": false,
|
||||
"do_table_structure": true,
|
||||
"include_images": true,
|
||||
"images_scale": 2
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}'
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>Python example:</summary>
|
||||
|
||||
```python
|
||||
import httpx
|
||||
|
||||
async_client = httpx.AsyncClient(timeout=60.0)
|
||||
url = "http://localhost:5001/v1alpha/convert/source"
|
||||
payload = {
|
||||
"options": {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
|
||||
response = await async_client.post(url, json=payload)
|
||||
|
||||
data = response.json()
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### File as base64
|
||||
|
||||
The `file_sources` argument of the endpoint lets you send files as base64-encoded strings.
|
||||
When your PDF or other file type is too large, encoding it and passing it inline to curl
|
||||
can lead to an “Argument list too long” error on some systems. To avoid this, we write
|
||||
the JSON request body to a file and have curl read from that file.
|
||||
|
||||
<details>
|
||||
<summary>CURL steps:</summary>
|
||||
|
||||
```sh
|
||||
# 1. Base64-encode the file
|
||||
B64_DATA=$(base64 -w 0 /path/to/file/pdf-to-convert.pdf)
|
||||
|
||||
# 2. Build the JSON with your options
|
||||
cat <<EOF > /tmp/request_body.json
|
||||
{
|
||||
"options": {
|
||||
},
|
||||
"file_sources": [{
|
||||
"base64_string": "${B64_DATA}",
|
||||
"filename": "pdf-to-convert.pdf"
|
||||
}]
|
||||
}
|
||||
EOF
|
||||
|
||||
# 3. POST the request to the docling service
|
||||
curl -X POST "localhost:5001/v1alpha/convert/source" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d @/tmp/request_body.json
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### File endpoint
|
||||
|
||||
The endpoint is: `/v1alpha/convert/file`, listening for POST requests of Form payloads (necessary as the files are sent as multipart/form data). You can send one or multiple files.
|
||||
|
||||
<details>
|
||||
<summary>CURL example:</summary>
|
||||
|
||||
```sh
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:5001/v1alpha/convert/file' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: multipart/form-data' \
|
||||
-F 'ocr_engine=easyocr' \
|
||||
-F 'pdf_backend=dlparse_v2' \
|
||||
-F 'from_formats=pdf' \
|
||||
-F 'from_formats=docx' \
|
||||
-F 'force_ocr=false' \
|
||||
-F 'image_export_mode=embedded' \
|
||||
-F 'ocr_lang=en' \
|
||||
-F 'ocr_lang=pl' \
|
||||
-F 'table_mode=fast' \
|
||||
-F 'files=@2206.01062v1.pdf;type=application/pdf' \
|
||||
-F 'abort_on_error=false' \
|
||||
-F 'to_formats=md' \
|
||||
-F 'to_formats=text' \
|
||||
-F 'return_as_file=false' \
|
||||
-F 'do_ocr=true'
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>Python example:</summary>
|
||||
|
||||
```python
|
||||
import httpx
|
||||
|
||||
async_client = httpx.AsyncClient(timeout=60.0)
|
||||
url = "http://localhost:5001/v1alpha/convert/file"
|
||||
parameters = {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False
|
||||
}
|
||||
|
||||
current_dir = os.path.dirname(__file__)
|
||||
file_path = os.path.join(current_dir, '2206.01062v1.pdf')
|
||||
|
||||
files = {
|
||||
'files': ('2206.01062v1.pdf', open(file_path, 'rb'), 'application/pdf'),
|
||||
}
|
||||
|
||||
response = await async_client.post(url, files=files, data={"parameters": json.dumps(parameters)})
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
data = response.json()
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## Response format
|
||||
|
||||
The response can be a JSON Document or a File.
|
||||
|
||||
- If you process only one file, the response will be a JSON document with the following format:
|
||||
|
||||
```jsonc
|
||||
{
|
||||
"document": {
|
||||
"md_content": "",
|
||||
"json_content": {},
|
||||
"html_content": "",
|
||||
"text_content": "",
|
||||
"doctags_content": ""
|
||||
},
|
||||
"status": "<success|partial_success|skipped|failure>",
|
||||
"processing_time": 0.0,
|
||||
"timings": {},
|
||||
"errors": []
|
||||
}
|
||||
```
|
||||
|
||||
Depending on the value you set in `to_formats`, the corresponding items will be populated with their respective results; the others are left empty.
|
||||
|
||||
`processing_time` is the Docling processing time in seconds, and `timings` (when enabled in the backend) provides the detailed
|
||||
timing of all the internal Docling components.
|
||||
|
||||
- If you set the parameter `return_as_file` to True, the response will be a zip file.
|
||||
- If multiple files are generated (multiple inputs, or one input but multiple outputs with `return_as_file` True), the response will be a zip file.
|
||||
|
||||
## Asynchronous API
|
||||
|
||||
TBA
|
||||
@@ -4,5 +4,3 @@ tesseract-langpack-eng
|
||||
leptonica-devel
|
||||
libglvnd-glx
|
||||
glib2
|
||||
wget
|
||||
git
|
||||
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "docling-serve"
|
||||
version = "0.3.0" # DO NOT EDIT, updated automatically
|
||||
version = "0.7.0" # DO NOT EDIT, updated automatically
|
||||
description = "Running Docling as a service"
|
||||
license = {text = "MIT"}
|
||||
authors = [
|
||||
@@ -30,7 +30,7 @@ classifiers = [
|
||||
]
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"docling~=2.23",
|
||||
"docling~=2.28",
|
||||
"fastapi[standard]~=0.115",
|
||||
"httpx~=0.28",
|
||||
"pydantic~=2.10",
|
||||
@@ -38,6 +38,7 @@ dependencies = [
|
||||
"python-multipart>=0.0.14,<0.1.0",
|
||||
"typer~=0.12",
|
||||
"uvicorn[standard]>=0.29.0,<1.0.0",
|
||||
"websockets~=14.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
@@ -63,7 +64,7 @@ cu124 = [
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"mypy~=1.11",
|
||||
"pre-commit~=3.8",
|
||||
"pre-commit-uv~=4.1",
|
||||
"pytest~=8.3",
|
||||
"pytest-asyncio~=0.24",
|
||||
"pytest-check~=2.4",
|
||||
@@ -101,17 +102,18 @@ url = "https://download.pytorch.org/whl/cu124"
|
||||
explicit = true
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
include = ["docling_serve"]
|
||||
include = ["docling_serve*"]
|
||||
namespaces = true
|
||||
|
||||
[project.scripts]
|
||||
docling-serve = "docling_serve.__main__:main"
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/DS4SD/docling-serve"
|
||||
Homepage = "https://github.com/docling-project/docling-serve"
|
||||
# Documentation = "https://ds4sd.github.io/docling"
|
||||
Repository = "https://github.com/DS4SD/docling-serve"
|
||||
Issues = "https://github.com/DS4SD/docling-serve/issues"
|
||||
Changelog = "https://github.com/DS4SD/docling-serve/blob/main/CHANGELOG.md"
|
||||
Repository = "https://github.com/docling-project/docling-serve"
|
||||
Issues = "https://github.com/docling-project/docling-serve/issues"
|
||||
Changelog = "https://github.com/docling-project/docling-serve/blob/main/CHANGELOG.md"
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py310"
|
||||
@@ -144,7 +146,8 @@ select = [
|
||||
"S307", # eval
|
||||
# "T20", # (disallow print statements) keep debugging statements out of the codebase
|
||||
"W", # pycodestyle warnings
|
||||
"ASYNC" # async
|
||||
"ASYNC", # async
|
||||
"UP", # pyupgrade
|
||||
]
|
||||
|
||||
ignore = [
|
||||
@@ -153,6 +156,7 @@ ignore = [
|
||||
"F811", # "redefinition of the same function"
|
||||
"PL", # Pylint
|
||||
"RUF012", # Mutable Class Attributes
|
||||
"UP007", # Option and Union
|
||||
]
|
||||
|
||||
#extend-select = []
|
||||
@@ -164,9 +168,19 @@ ignore = [
|
||||
[tool.ruff.lint.mccabe]
|
||||
max-complexity = 15
|
||||
|
||||
[tool.ruff.lint.isort.sections]
|
||||
"docling" = ["docling", "docling_core"]
|
||||
|
||||
[tool.ruff.lint.isort]
|
||||
combine-as-imports = true
|
||||
known-third-party = ["docling", "docling_core"]
|
||||
section-order = [
|
||||
"future",
|
||||
"standard-library",
|
||||
"third-party",
|
||||
"docling",
|
||||
"first-party",
|
||||
"local-folder",
|
||||
]
|
||||
|
||||
[tool.mypy]
|
||||
pretty = true
|
||||
@@ -180,10 +194,6 @@ module = [
|
||||
"easyocr.*",
|
||||
"tesserocr.*",
|
||||
"rapidocr_onnxruntime.*",
|
||||
"docling_conversion.*",
|
||||
"gradio_ui.*",
|
||||
"response_preparation.*",
|
||||
"helper_functions.*",
|
||||
"requests.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
@@ -89,7 +89,7 @@ async def test_convert_file(async_client):
|
||||
check.is_in(
|
||||
'{"schema_name": "DoclingDocument"',
|
||||
json.dumps(data["document"]["json_content"]),
|
||||
msg=f"JSON document should contain '{{\\n \"schema_name\": \"DoclingDocument'\". Received: {safe_slice(data['document']['json_content'])}",
|
||||
msg=f'JSON document should contain \'{{\\n "schema_name": "DoclingDocument\'". Received: {safe_slice(data["document"]["json_content"])}',
|
||||
)
|
||||
# HTML check
|
||||
check.is_in(
|
||||
|
||||
@@ -83,7 +83,7 @@ async def test_convert_url(async_client):
|
||||
check.is_in(
|
||||
'{"schema_name": "DoclingDocument"',
|
||||
json.dumps(data["document"]["json_content"]),
|
||||
msg=f"JSON document should contain '{{\\n \"schema_name\": \"DoclingDocument'\". Received: {safe_slice(data['document']['json_content'])}",
|
||||
msg=f'JSON document should contain \'{{\\n "schema_name": "DoclingDocument\'". Received: {safe_slice(data["document"]["json_content"])}',
|
||||
)
|
||||
# HTML check
|
||||
check.is_in(
|
||||
|
||||
48
tests/test_1-url-async-ws.py
Normal file
48
tests/test_1-url-async-ws.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import base64
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from websockets.sync.client import connect
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def async_client():
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
yield client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_convert_url(async_client: httpx.AsyncClient):
|
||||
"""Test convert URL to all outputs"""
|
||||
|
||||
doc_filename = Path("tests/2408.09869v5.pdf")
|
||||
encoded_doc = base64.b64encode(doc_filename.read_bytes()).decode()
|
||||
|
||||
base_url = "http://localhost:5001/v1alpha"
|
||||
payload = {
|
||||
"options": {
|
||||
"to_formats": ["md", "json"],
|
||||
"image_export_mode": "placeholder",
|
||||
"ocr": True,
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
},
|
||||
# "http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}],
|
||||
"file_sources": [{"base64_string": encoded_doc, "filename": doc_filename.name}],
|
||||
}
|
||||
# print(json.dumps(payload, indent=2))
|
||||
|
||||
for n in range(5):
|
||||
response = await async_client.post(
|
||||
f"{base_url}/convert/source/async", json=payload
|
||||
)
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
task = response.json()
|
||||
|
||||
uri = f"ws://localhost:5001/v1alpha/status/ws/{task['task_id']}"
|
||||
with connect(uri) as websocket:
|
||||
for message in websocket:
|
||||
print(message)
|
||||
60
tests/test_1-url-async.py
Normal file
60
tests/test_1-url-async.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def async_client():
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
yield client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_convert_url(async_client):
|
||||
"""Test convert URL to all outputs"""
|
||||
|
||||
example_docs = [
|
||||
"https://arxiv.org/pdf/2411.19710",
|
||||
"https://arxiv.org/pdf/2501.17887",
|
||||
"https://www.nature.com/articles/s41467-024-50779-y.pdf",
|
||||
"https://arxiv.org/pdf/2306.12802",
|
||||
"https://arxiv.org/pdf/2311.18481",
|
||||
]
|
||||
|
||||
base_url = "http://localhost:5001/v1alpha"
|
||||
payload = {
|
||||
"options": {
|
||||
"to_formats": ["md", "json"],
|
||||
"image_export_mode": "placeholder",
|
||||
"ocr": True,
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
},
|
||||
"http_sources": [{"url": random.choice(example_docs)}],
|
||||
}
|
||||
print(json.dumps(payload, indent=2))
|
||||
|
||||
for n in range(5):
|
||||
response = await async_client.post(
|
||||
f"{base_url}/convert/source/async", json=payload
|
||||
)
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
task = response.json()
|
||||
|
||||
print(json.dumps(task, indent=2))
|
||||
|
||||
while task["task_status"] not in ("success", "failure"):
|
||||
response = await async_client.get(f"{base_url}/status/poll/{task['task_id']}")
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
task = response.json()
|
||||
print(f"{task['task_status']=}")
|
||||
print(f"{task['task_position']=}")
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
assert task["task_status"] == "success"
|
||||
@@ -57,18 +57,18 @@ async def test_convert_file(async_client):
|
||||
content_disposition = response.headers.get("content-disposition")
|
||||
|
||||
with check:
|
||||
assert (
|
||||
content_disposition is not None
|
||||
), "Content-Disposition header should be present"
|
||||
assert content_disposition is not None, (
|
||||
"Content-Disposition header should be present"
|
||||
)
|
||||
with check:
|
||||
assert "attachment" in content_disposition, "Response should be an attachment"
|
||||
with check:
|
||||
assert (
|
||||
'filename="converted_docs.zip"' in content_disposition
|
||||
), "Attachment filename should be 'converted_docs.zip'"
|
||||
assert 'filename="converted_docs.zip"' in content_disposition, (
|
||||
"Attachment filename should be 'converted_docs.zip'"
|
||||
)
|
||||
|
||||
content_type = response.headers.get("content-type")
|
||||
with check:
|
||||
assert (
|
||||
content_type == "application/zip"
|
||||
), "Content-Type should be 'application/zip'"
|
||||
assert content_type == "application/zip", (
|
||||
"Content-Type should be 'application/zip'"
|
||||
)
|
||||
|
||||
@@ -50,18 +50,18 @@ async def test_convert_url(async_client):
|
||||
content_disposition = response.headers.get("content-disposition")
|
||||
|
||||
with check:
|
||||
assert (
|
||||
content_disposition is not None
|
||||
), "Content-Disposition header should be present"
|
||||
assert content_disposition is not None, (
|
||||
"Content-Disposition header should be present"
|
||||
)
|
||||
with check:
|
||||
assert "attachment" in content_disposition, "Response should be an attachment"
|
||||
with check:
|
||||
assert (
|
||||
'filename="converted_docs.zip"' in content_disposition
|
||||
), "Attachment filename should be 'converted_docs.zip'"
|
||||
assert 'filename="converted_docs.zip"' in content_disposition, (
|
||||
"Attachment filename should be 'converted_docs.zip'"
|
||||
)
|
||||
|
||||
content_type = response.headers.get("content-type")
|
||||
with check:
|
||||
assert (
|
||||
content_type == "application/zip"
|
||||
), "Content-Type should be 'application/zip'"
|
||||
assert content_type == "application/zip", (
|
||||
"Content-Type should be 'application/zip'"
|
||||
)
|
||||
|
||||
159
uv.lock
generated
159
uv.lock
generated
@@ -349,38 +349,6 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/2e/38/3fd83c4690dc7d753a442a284b3826ea5e5c380a411443c66421cd823898/cryptography-44.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7", size = 3134657 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deepsearch-glm"
|
||||
version = "1.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pywin32", marker = "sys_platform == 'win32' or (extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124')" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/73/d5/a907234e57f5c4f6480c9ddbc3cdacc47f727c768e502be3d361719fac4e/deepsearch_glm-1.0.0.tar.gz", hash = "sha256:e8dce88ac519a693c260f28bd3c4ec409811e65ade84fb508f6c6e37ca065e62", size = 2401014 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/40/65/4b2013784d5ed8d3664a2efa61f15600c8bf090766b0363c036d78aca550/deepsearch_glm-1.0.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:94792b57df7a1c4ba8b47ebd8f36ea0a090d4f27a4fba39bd7b166b6b537260a", size = 6303790 },
|
||||
{ url = "https://files.pythonhosted.org/packages/45/2a/1e95260a712948a21b74dcb239032d9e612f7e1a273657008655749f4115/deepsearch_glm-1.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ff46e352e96a2f56ce7ae4fdf04b271ee841c29ff159b1dec0e5ecaaadba8d4d", size = 5945851 },
|
||||
{ url = "https://files.pythonhosted.org/packages/9e/1a/5c37a98f27644fd02bc447df651e8d5ce484cd6ce7cb178218625b4de5bc/deepsearch_glm-1.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d77d3d94d49641888aa15f3ad23e81158e791aa9d9608dd8168dc71788e56f3", size = 7431282 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e8/e2/56b5e7ae3ccc4d8ee758427c8c9a403c985e250a468c53538c269897bef2/deepsearch_glm-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:143de0fd111a570be12935d8799a2715fe1775d4dc4e256337860b429cee5d36", size = 7759571 },
|
||||
{ url = "https://files.pythonhosted.org/packages/61/f4/e39a5090a2bf0d641449918865566ad5adabef156993a922bdbf4a3ebb60/deepsearch_glm-1.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:9f2872dd573cd2206ce7f9e2e6016c38b66d9ecbd983283ff5e8c6023813c311", size = 7904646 },
|
||||
{ url = "https://files.pythonhosted.org/packages/41/f7/8e8dd9738554f97522b59b0a6d7680ccf2d527bd3471ec4aa4e52acf552a/deepsearch_glm-1.0.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:e64d94ff5209f0a11e8c75c6b28b033ef27b95a22c2fbcbd945e7fe8cc421545", size = 6309301 },
|
||||
{ url = "https://files.pythonhosted.org/packages/17/37/4d8514d8ef851e44513a71f675a7ebb373f109aece38e324c7d444ced20c/deepsearch_glm-1.0.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:a5702205677b768b51f881d15d933370f6ef3c826dfac3b9aa0b904d2e6c495a", size = 5951522 },
|
||||
{ url = "https://files.pythonhosted.org/packages/0c/c6/3680318e66df278fa7f0811dc862d6cb3c328ce168b4f36736eb77120b6d/deepsearch_glm-1.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0417a2ae998e1709f03458cfb9adb55423bb1328224eb055300796baa757879f", size = 7434315 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c3/cd/9ffb616d347d568f868f47585b3261c16e277aa7b37740e8720eee71c539/deepsearch_glm-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f0e1efe9af0d28e9b473fe599246deb3a0be7c3d546a478da284747144d086a", size = 7761264 },
|
||||
{ url = "https://files.pythonhosted.org/packages/3d/d3/e5ebdda9cee8a1c846e6a960a0e5b97624aff2f248c2bc89ae490b9a1342/deepsearch_glm-1.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:807faf13eb0deea55a1951d479a85d5e20de0ff8b2e0b57b2f7939552759a426", size = 7908603 },
|
||||
{ url = "https://files.pythonhosted.org/packages/60/ca/6adbadc979910b11594cd0242f1991942c22528eead431d47de064ac2860/deepsearch_glm-1.0.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:56d9575df9eceb8c2ae33e3d15e133924cc195714c3d268599b6f8414c1f6bb8", size = 6308715 },
|
||||
{ url = "https://files.pythonhosted.org/packages/20/7c/bf1e9c458705c7143c6630cb6847554ad694d25dc6f1f038512b9c86160a/deepsearch_glm-1.0.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:51f5c6522f60ba73eb12eeb7217bd98d871ba7c078337a4059d05878d8baf2d6", size = 5949609 },
|
||||
{ url = "https://files.pythonhosted.org/packages/21/b1/eb0cd0db50d05f2d7a510a77960e85e6caee727eb3d931ed0ec067917813/deepsearch_glm-1.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6211eaf497ad7cfcb68f80f9b5387940be0204fe149a9fc03988a95145f410a", size = 7433929 },
|
||||
{ url = "https://files.pythonhosted.org/packages/3a/7e/2b7db77ff02fe9eec41f3605fcd72e3eb4e6b48561b344d432b417a75cfe/deepsearch_glm-1.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b003bf457fce61ea4de79e2d7d0228a1ae349f677eb6570e745f79d4429804f", size = 7760438 },
|
||||
{ url = "https://files.pythonhosted.org/packages/ab/97/ffb2bb5d2432c7b0e9f3a3e6b5873fbcd6e19e82b620393bfb8e01bdecb1/deepsearch_glm-1.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9d61f66048e6ab60fe9f84c823fd593bf8517755833bd9efb59156d77a2b42d0", size = 7907583 },
|
||||
{ url = "https://files.pythonhosted.org/packages/38/06/08c5fd0e1144c2c8d76d06da1545a9cf589278a37f8b9e6235b5b416eb52/deepsearch_glm-1.0.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:7d558e8b365c27ee665d0589165fd074fb252c73715f9cc6aeb4304a63683f37", size = 6308867 },
|
||||
{ url = "https://files.pythonhosted.org/packages/ba/fb/f5f9787876b67ce83d5afa4903901be9f8071530bc0706dc2228afc0b6c0/deepsearch_glm-1.0.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:3199093a9472e5756214b9b6563f827c19c001c7dd8ae00e03eed1140c12930d", size = 5949719 },
|
||||
{ url = "https://files.pythonhosted.org/packages/83/0f/42b5a4aa798acbc6309d748435b006c489e58102b6cb2278e7b8f0194743/deepsearch_glm-1.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f18d1ee68a0479592e0c714e6cbf9e2d0fa8edd692d580da64431c84cbef5c2", size = 7434981 },
|
||||
{ url = "https://files.pythonhosted.org/packages/17/6a/c2c4eaa4470b78dde6c03f055cbb09f3f7f15b8a6ff38f5bea5180339e6f/deepsearch_glm-1.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:62c1c0ea0a544219da15c017632f9e0be116ecdc335b865c6c5760429557fe23", size = 7760773 },
|
||||
{ url = "https://files.pythonhosted.org/packages/01/0a/7c3cf75bad38a8d6ff3842b78b3263dd81ad4eaf1d859f4b8e1ab465cad5/deepsearch_glm-1.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:962f393dcec2204de1a5cb0f635c65258bde2424ad2d4e0f5df770139c3958de", size = 7908766 },
|
||||
{ url = "https://files.pythonhosted.org/packages/1f/cd/e6507d924aa69e9647f917ed671e2d62e19e41d4f120a15fcbb583661667/deepsearch_glm-1.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e2315cc4ffe7032dada294a0cd72a47dbc6c0121fd07d4b5719f9a9e9519d091", size = 14644989 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dill"
|
||||
version = "0.3.9"
|
||||
@@ -410,12 +378,11 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "docling"
|
||||
version = "2.23.0"
|
||||
version = "2.28.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "beautifulsoup4" },
|
||||
{ name = "certifi" },
|
||||
{ name = "deepsearch-glm" },
|
||||
{ name = "docling-core", extra = ["chunking"] },
|
||||
{ name = "docling-ibm-models" },
|
||||
{ name = "docling-parse" },
|
||||
@@ -427,8 +394,10 @@ dependencies = [
|
||||
{ name = "openpyxl" },
|
||||
{ name = "pandas" },
|
||||
{ name = "pillow" },
|
||||
{ name = "pluggy" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "pydantic-settings" },
|
||||
{ name = "pylatexenc" },
|
||||
{ name = "pypdfium2" },
|
||||
{ name = "python-docx" },
|
||||
{ name = "python-pptx" },
|
||||
@@ -438,14 +407,14 @@ dependencies = [
|
||||
{ name = "tqdm" },
|
||||
{ name = "typer" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/94/45/807f81abeaa1a018998525a98d577508e6c0eba4173e6ec4beeee77f0644/docling-2.23.0.tar.gz", hash = "sha256:7ffde3366b01e2f1c0e47574700501a3b8667082cf3a185efe7e103b8473ee43", size = 106065 }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/96/8a/4b342316cf6f1bd56f30da0595eaf7b2eb8899297653e97eb6c6e01b6cc4/docling-2.28.0.tar.gz", hash = "sha256:26fe35a161039b3b33939358918e25f96e902121690e1791f3e324c6332d2f2f", size = 121741 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/9f/7f/79985fbb4f115f87688b2a8d18ddc14d45fcccd8989a02104ddc8cf9b02c/docling-2.23.0-py3-none-any.whl", hash = "sha256:bd3d05bf48fc842e502af9f26153c40e2fcc1df1945ec72ab8c4d5dd1f3b6528", size = 137161 },
|
||||
{ url = "https://files.pythonhosted.org/packages/d3/2c/c14d03c1631dbe366cb539cf7f3bef0df676594bfb244ae709a25b248ff2/docling-2.28.0-py3-none-any.whl", hash = "sha256:5a2b87788c4c969016f38c8f288599c942b4b36687fff43e7e49ef2419b038d1", size = 159615 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "docling-core"
|
||||
version = "2.19.1"
|
||||
version = "2.23.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "jsonref" },
|
||||
@@ -459,9 +428,9 @@ dependencies = [
|
||||
{ name = "typer" },
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a3/e5/ddc7f15e28165929cff022f12755023cd29f0273b5008cc4651191a38bc1/docling_core-2.19.1.tar.gz", hash = "sha256:e2769b816c669cdf27024dd3b219d3ecaf2161691dd5e8e5e8ce439557ea0928", size = 75441 }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/89/e7/dae1d9e2be58fc42a60680ec463dba7df87fb84219e67724054ad69e66e2/docling_core-2.23.3.tar.gz", hash = "sha256:a64ce41e0881c06962a2b3ec80e0665f84de0809dedf1bf84f3a14b75dd665c4", size = 92751 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/e2/ed/59d4aab03446fa037de532225096162cb967f90d97c163741fd824f76b98/docling_core-2.19.1-py3-none-any.whl", hash = "sha256:ca7bd4dacd75611c5ea4f205192b71a8f22205e615eff1a16aac7082644d3b2e", size = 95587 },
|
||||
{ url = "https://files.pythonhosted.org/packages/3f/1c/c0c9e90de6b1d697d274be7e4c1c6c52f8e169e367e9b2fbf6bc1b6aad3b/docling_core-2.23.3-py3-none-any.whl", hash = "sha256:a2166ffc41f8fdf6fdb99b33da6c7146eccf6382712ea92e95772604fb5af5e5", size = 115889 },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
@@ -472,14 +441,16 @@ chunking = [
|
||||
|
||||
[[package]]
|
||||
name = "docling-ibm-models"
|
||||
version = "3.3.2"
|
||||
version = "3.4.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "docling-core" },
|
||||
{ name = "huggingface-hub" },
|
||||
{ name = "jsonlines" },
|
||||
{ name = "numpy" },
|
||||
{ name = "opencv-python-headless" },
|
||||
{ name = "pillow" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "safetensors", extra = ["torch"] },
|
||||
{ name = "torch", version = "2.6.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'x86_64' and sys_platform == 'darwin' and extra == 'extra-13-docling-serve-cpu') or (platform_machine == 'x86_64' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (sys_platform != 'darwin' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124')" },
|
||||
{ name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (extra != 'extra-13-docling-serve-cpu' and extra != 'extra-13-docling-serve-cu124')" },
|
||||
@@ -492,14 +463,14 @@ dependencies = [
|
||||
{ name = "tqdm" },
|
||||
{ name = "transformers" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/6c/e5/7ff58b9481beb43e5b93084b784fe686be553a947c961d09cda557630dd0/docling_ibm_models-3.3.2.tar.gz", hash = "sha256:f6ed59dfb3f98a71ccdd003c13c9a868e3003c22bd5adc554197da7eec227cde", size = 66096 }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/eb/a5/88d5b7c970d5e10a06062fe9e9de3cde6acdefcc1f85854f689a82863c2a/docling_ibm_models-3.4.1.tar.gz", hash = "sha256:093b4dff2ea284a4953c3aa009e29945208b8d389b94fb14940a03a93f673e96", size = 69794 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/84/8a/1539d9b951a761e141eeabfed7fcfa99d7ae88aa3073711d42e8df3e8d1a/docling_ibm_models-3.3.2-py3-none-any.whl", hash = "sha256:9f82a2ef73c6cd8d729ab2fcc4223079ccb8b6eec0bf0643c56e55352b97b5cb", size = 76659 },
|
||||
{ url = "https://files.pythonhosted.org/packages/af/8f/0f2b823fa09d06deacbdfc6d5d7809d462ddc508f43146960083d113c4c6/docling_ibm_models-3.4.1-py3-none-any.whl", hash = "sha256:c3582c99dddfa3f0eafcf80cf1267fd8efa39c4a74cc7a88f9dd49684fac2986", size = 80886 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "docling-parse"
|
||||
version = "3.4.0"
|
||||
version = "4.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "docling-core" },
|
||||
@@ -508,34 +479,34 @@ dependencies = [
|
||||
{ name = "pywin32", marker = "sys_platform == 'win32' or (extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124')" },
|
||||
{ name = "tabulate" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/b9/86/f927c8455c985f10aedf1e5f28afdf89fce61c8e927046c2127a09777fa5/docling_parse-3.4.0.tar.gz", hash = "sha256:36cdd17bcc4a833b5c9af9ae3dc461ed18a975c1b084ccfd19a9d9cde4f66e14", size = 36234965 }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/1c/49/c722b719882442f909d254c33ed8d0ff87e72bee6dc50e0dc3ba9a9a7519/docling_parse-4.0.0.tar.gz", hash = "sha256:5be0ba4e0098524f116743e6b709f29fe273e441e61923c3a262e054643c5ee6", size = 36249833 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/51/ad/52d9ace2d46c2a5a31ea77ab38857a447a224f7b2878f6042d17b06c6bc9/docling_parse-3.4.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:96e95e63ab722dfe5340fcb04d0e07bd1c0a0ba2f62e93c91ac26dda0a312a44", size = 14711344 },
|
||||
{ url = "https://files.pythonhosted.org/packages/0e/01/3bd99e200e63d9c238d4abbd3dd982ec347fc2ee7e2e91e8bdb0ee72dc17/docling_parse-3.4.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:f9e14a7a0b92526d4dfd3f390f3d7e075f59d14d6b8a0a564fbc26299e56cd47", size = 14588249 },
|
||||
{ url = "https://files.pythonhosted.org/packages/89/15/f41568765d908ad2cb5dff32d42044cb5a03753744d679dd7d9f5162fcb4/docling_parse-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdef1d51291e841e5b6a32689a39a9f35986389f863b415eaa1790b29d021101", size = 15030528 },
|
||||
{ url = "https://files.pythonhosted.org/packages/48/9c/35fd6f6ab719553920c85c4fc0246f60c4a2f7a533d7ecd394f8c3a37083/docling_parse-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68652610d6c34adc684dbaa77b5d596b25d004912a78e85ec4ae57910bf7086f", size = 15101143 },
|
||||
{ url = "https://files.pythonhosted.org/packages/6d/cb/dd9ba1862162ac437137920d834d6a2256f5d5c9ea0775d710b854c0ec54/docling_parse-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:daad07fe93f306d8e2378acb24ef2fa68535ccdb960a1b99d6b36ab8c299fef1", size = 15893428 },
|
||||
{ url = "https://files.pythonhosted.org/packages/d1/ac/c136192d1784ee8fab3c6830593e3a87bf1016509ddd7a2764eac05ba771/docling_parse-3.4.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:6f30c5fd3c04bd3d1a7d06baeae2e5c3adbebc284071a9a52b0150bcd4917a3d", size = 14712548 },
|
||||
{ url = "https://files.pythonhosted.org/packages/f1/99/d538dcf7ae680758a7a7d02bd81f8006e65a6d3e3d025e6e6080156e7d39/docling_parse-3.4.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:2c3664e4c8980dc44e0d026b1b01fbc94f0dac9adf7be835071d4a761977c36d", size = 14590167 },
|
||||
{ url = "https://files.pythonhosted.org/packages/cd/ce/1de7ae0ff12ba4d42521b94966519f1002188e167e7381a8cc8d91c70020/docling_parse-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3febf7515453d18df03c275356db2bb5b0618ba9fc033aba05d58318a9846b1a", size = 15031706 },
|
||||
{ url = "https://files.pythonhosted.org/packages/79/3f/637dffc7f6dd801f5c75c4966a1214fb861d6c8a5a9bc20a6df059c94e4b/docling_parse-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75aeb038bb7f6400ecde99cf6c4ef35867c528ac21676071a822ed72d0653149", size = 15102430 },
|
||||
{ url = "https://files.pythonhosted.org/packages/9b/e7/947e71491bf3d6fbe4447153abd795f557dc3d8a85231517da8979bf1d2c/docling_parse-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:8d20e3584022542448c21ed0ac868b2457ae35211cea63ed20142e375549e633", size = 15894464 },
|
||||
{ url = "https://files.pythonhosted.org/packages/7b/3a/08bd1f4812c111bd2445efaf966ca9ae25f201ac9f4acee7698764ff21a6/docling_parse-3.4.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:ddfe2bd730ed08363f25954a0480da021e6e6bdb175276643cc2913a6bbd98e2", size = 14713125 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e6/aa/5aaf003f1c9828e62356306ae100f78cf9014a5910f11e9cb0de6beec79a/docling_parse-3.4.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:faf8ba9eaab8c17ea72516be5d440f754fcca27f37488dcf126a0f3ac3a63058", size = 14589373 },
|
||||
{ url = "https://files.pythonhosted.org/packages/af/e5/6dfc59a2aa1adedd43775b48a573e61722e3370d7e435c2fede2f11cdedd/docling_parse-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9eb5e7e50b3057690d0d4fa651363cafd7735bb952378dd8a4ca6c7d359507db", size = 15030339 },
|
||||
{ url = "https://files.pythonhosted.org/packages/24/08/40e4cf6d1e795b6e713d761331ee5bc1f3bb908ea5e2897f1e57fb220493/docling_parse-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:452334b387e2c699f69acf37a4ea4ae7097d062a2dd1980c573b73051c031158", size = 15101855 },
|
||||
{ url = "https://files.pythonhosted.org/packages/7c/f4/e5f336bee750f149eb8d85e880569a67cf826aedc3b1f182f47863746a38/docling_parse-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1ba00147ccb0a1dc10cdf58645e67f4ee895c6920bc583bc6f25d27cd562bfed", size = 15894431 },
|
||||
{ url = "https://files.pythonhosted.org/packages/8b/bb/8442795663aa32259b4789002a25966a0a46129bb5f7877c4efaa7cfde24/docling_parse-3.4.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:2b22a33a2d2f3616a7ac0f4b2f2ba6099f8a5dc6fa328be0f17c9c506455d7c1", size = 14713233 },
|
||||
{ url = "https://files.pythonhosted.org/packages/55/1c/d8ccd619ec3105bc8b1c933540f2344e3adb8b73f7bf65e3d8b6867e258d/docling_parse-3.4.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:0dd2440a94d555f98b702e88bfe7cc5a585d9191f4ea93884b02e286e7af3a06", size = 14589505 },
|
||||
{ url = "https://files.pythonhosted.org/packages/0a/6d/c1f798eb3cf942fa34e9d9cbd896f0ad2cb457ce49ff73f53a11ba16cf4e/docling_parse-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f5828744a0e33136e09e8c61ca0b2c0ead8f76595f2e0955beaac16adce51f5", size = 15030504 },
|
||||
{ url = "https://files.pythonhosted.org/packages/20/a5/9f024aaf9ae30ab2e362b753f43962a097709befa726a1362b0c29740db9/docling_parse-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26fff6e36809d17ff855532f985df3738ada8d86a9fc746049ea6e6524d5e0a2", size = 15102307 },
|
||||
{ url = "https://files.pythonhosted.org/packages/59/1f/ebb8f766ed0b9aa10643f71e03cca422bca4eef2df539f35b0dfe0e66dda/docling_parse-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:13fc442f64171280db98dc4507274ffa0a65bac94eecbcc60c3cbf41f433b556", size = 15894198 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e2/24/e81e2b523984f6e25f5e5a5c117df3d5971d3e83c517d6f8371bf73f4a92/docling_parse-3.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:930f5a5d78404de573c0ba302d313b6647f1e86714766e5a1cdc09af014ca111", size = 17696437 },
|
||||
{ url = "https://files.pythonhosted.org/packages/ad/6f/95aa63b6a90f3856098c1279df2d4a8765e4918e3b8992dbc8d3afd34a34/docling_parse-4.0.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:6de7fa8ec4919f604c9a02a3fa8ca0e13a3a8e3c0652adc41848616b737925d9", size = 14705811 },
|
||||
{ url = "https://files.pythonhosted.org/packages/95/88/3f1c565f9ccced7dbac2eeabc770a94b5a878396b82845fde1aa009ba1d7/docling_parse-4.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:82704280ab086a84a30d9ec9def6cd96b733aefc6973546b2101d09eed7a958e", size = 14582832 },
|
||||
{ url = "https://files.pythonhosted.org/packages/2e/bc/12cc8fa558223e8cf13c255a8ced9e0a7f891fe0d4befeb91f00d87b5e12/docling_parse-4.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f51ec645978d75e7cf232fa7c571ebf164a5bdf418588c663f9b3c062df6ba72", size = 15031715 },
|
||||
{ url = "https://files.pythonhosted.org/packages/46/ca/8cf0c729a41ec527efdfc044164c553b38d1091c29f4217f32f627dce3c6/docling_parse-4.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d5da855f35303f9229198891da550e3c1e1f4025e52ab8c0303d345669ff46f", size = 15102096 },
|
||||
{ url = "https://files.pythonhosted.org/packages/f4/cd/72e934f791fdb5dff7782d5847590644c8c2853020540c5550bbbde9ef58/docling_parse-4.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:ba36cb329aadb306cc25901305d49fe6d2ed9e93e9dc993b4baf13fcc90a98e1", size = 15887834 },
|
||||
{ url = "https://files.pythonhosted.org/packages/0f/26/8d7d860f7801b291f0ea3bd00c19ae975955670b0b6ffb4e79a71fc7d810/docling_parse-4.0.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:9b7afbf09945b4d9e3ddb9c24a13d7b9f987cf32d5c9d68532ceb63fb26697df", size = 14707010 },
|
||||
{ url = "https://files.pythonhosted.org/packages/17/41/55195ee0026553d806ed0f4ce191f31905f3e8b89498ad18702bf806dd69/docling_parse-4.0.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:6daaec89c5045e968785a225b9b5a42b36dfe6b5a4437995e2d34e1595e2c162", size = 14584599 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e1/e2/22b152b41a43ea96da8df0646d1a4109e99fa752e671aed2dbfad2f4938d/docling_parse-4.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e638ef2ad36e9e4a8ef881073696467e6699bf206e5a416de4abaaf531b0e1", size = 15032938 },
|
||||
{ url = "https://files.pythonhosted.org/packages/be/38/e8d1509ee97de600c9948d98f7f3065788d3ed0443571eb37660eda3f784/docling_parse-4.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87246eb0d259202a7f093336f17235cb1fffb67e82b41dbc0e88f9c05b08014e", size = 15103407 },
|
||||
{ url = "https://files.pythonhosted.org/packages/83/8b/aa17424464e01cb03c16f8ac5fd215866204427aa6fcf2d6f400a7c70ea8/docling_parse-4.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:0ae44b913b010994c3e36869e5fc9dad252a7dc7434225790928075c8b5a7f6c", size = 15888873 },
|
||||
{ url = "https://files.pythonhosted.org/packages/b1/9e/59f33d23052393ccd80566b48dcce391dcaa0bb14cf6cbd0234d1f878f34/docling_parse-4.0.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:ed6d8ac29c1014ed7a126d782b6bc963c9a9c09f41224fa90f9a8b45bf3191f9", size = 14707578 },
|
||||
{ url = "https://files.pythonhosted.org/packages/88/a3/ff04496290f242cb6b6679c79dde5f2dd37ba8332c60234787caacc154de/docling_parse-4.0.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:4a2dd46cee8e54f3aa511dbf552ef5f9f422944c54de73888ee55b2c4a6e10b9", size = 14583898 },
|
||||
{ url = "https://files.pythonhosted.org/packages/fc/45/461992f6d866ecd19f6b00004d6ced7063561733cabc040e3079ed44b730/docling_parse-4.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722fbd63f7f28e8a49fa2cd92d1571290f6c5295b86c7406b7c20a6c6e8b3975", size = 15031532 },
|
||||
{ url = "https://files.pythonhosted.org/packages/6f/08/b87ef326fa7b97b91476d9e79c241fba55b3825a7d128e3cc7ee328e37c6/docling_parse-4.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc155767b51a23f5bfd5abaabaf8c4a57777aa0277c813e13b9f6c43532964bd", size = 15102813 },
|
||||
{ url = "https://files.pythonhosted.org/packages/0c/7c/540a6cec0e06826d978ed363c7b3e042c8226ffb61c92a1bb70f649405d8/docling_parse-4.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:e45ab31fffe4ae571bd2ecc9e0a9d5665a1486463396924160add84828d2a7e7", size = 15888837 },
|
||||
{ url = "https://files.pythonhosted.org/packages/ed/e3/902a70c90037fedde03d4656bbd0ae9870cbab9738ccae62139f67722d71/docling_parse-4.0.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:d93fd3cec032e5b7f6385f7a021e228c52eb381f28fc037224708aeaad487d8b", size = 14707694 },
|
||||
{ url = "https://files.pythonhosted.org/packages/0f/86/348563fb71079ec79b38961cbc9cdafbe18e14a24c727e96c11d011f39be/docling_parse-4.0.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:d9f64847cd7e9a7a34a3d5a14f0827022ed3b7f50f39d5126ef003c55d574ba3", size = 14584061 },
|
||||
{ url = "https://files.pythonhosted.org/packages/39/f1/dcf8a7530ae4966f22c1078a683e92a5e174fb2eff9d5ce78e95151bbf9a/docling_parse-4.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6ac283f08680dfde568b5629ab94830cab32795d74086553e755460b6879901", size = 15031728 },
|
||||
{ url = "https://files.pythonhosted.org/packages/1e/76/bdf33b0c3a555405d150f1a29cacb2c1df2875b4f70e62e4432e06adfa8c/docling_parse-4.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97eca28220dc5075099e01f2cb7a3e9005b9951dee0ca0eb743e298be7284279", size = 15103250 },
|
||||
{ url = "https://files.pythonhosted.org/packages/8d/db/c40db2c555860a86da67453cca1aa842d434a8f28693e4c96ef5f85936c0/docling_parse-4.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:6019288cfe25a97993c2aab453386fc3e366d7761637e682b25915ba2c856cc4", size = 15888608 },
|
||||
{ url = "https://files.pythonhosted.org/packages/4c/8d/b6b5a557d75e4313d82ab508e4bf0c645805bef2474c6e0e1164661b8cc0/docling_parse-4.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:30c0c1b33c0a0aeb6897537f7d8fa09ed5a26f05685b18a2d27c73a789343679", size = 17690840 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "docling-serve"
|
||||
version = "0.3.0"
|
||||
version = "0.7.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "docling" },
|
||||
@@ -546,6 +517,7 @@ dependencies = [
|
||||
{ name = "python-multipart" },
|
||||
{ name = "typer" },
|
||||
{ name = "uvicorn", extra = ["standard"] },
|
||||
{ name = "websockets" },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
@@ -573,7 +545,7 @@ ui = [
|
||||
[package.dev-dependencies]
|
||||
dev = [
|
||||
{ name = "mypy" },
|
||||
{ name = "pre-commit" },
|
||||
{ name = "pre-commit-uv" },
|
||||
{ name = "pytest" },
|
||||
{ name = "pytest-asyncio" },
|
||||
{ name = "pytest-check" },
|
||||
@@ -583,7 +555,7 @@ dev = [
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "docling", specifier = "~=2.23" },
|
||||
{ name = "docling", specifier = "~=2.28" },
|
||||
{ name = "fastapi", extras = ["standard"], specifier = "~=0.115" },
|
||||
{ name = "gradio", marker = "extra == 'ui'", specifier = "~=5.9" },
|
||||
{ name = "httpx", specifier = "~=0.28" },
|
||||
@@ -599,13 +571,14 @@ requires-dist = [
|
||||
{ name = "torchvision", marker = "extra == 'cu124'", specifier = ">=0.21.0", index = "https://download.pytorch.org/whl/cu124", conflict = { package = "docling-serve", extra = "cu124" } },
|
||||
{ name = "typer", specifier = "~=0.12" },
|
||||
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.29.0,<1.0.0" },
|
||||
{ name = "websockets", specifier = "~=14.0" },
|
||||
]
|
||||
provides-extras = ["ui", "tesserocr", "rapidocr", "cpu", "cu124"]
|
||||
|
||||
[package.metadata.requires-dev]
|
||||
dev = [
|
||||
{ name = "mypy", specifier = "~=1.11" },
|
||||
{ name = "pre-commit", specifier = "~=3.8" },
|
||||
{ name = "pre-commit-uv", specifier = "~=4.1" },
|
||||
{ name = "pytest", specifier = "~=8.3" },
|
||||
{ name = "pytest-asyncio", specifier = "~=0.24" },
|
||||
{ name = "pytest-check", specifier = "~=2.4" },
|
||||
@@ -1974,6 +1947,19 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl", hash = "sha256:9a90a53bf82fdd8778d58085faf8d83df56e40dfe18f45b19446e26bf1b3a63f", size = 204643 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pre-commit-uv"
|
||||
version = "4.1.4"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pre-commit" },
|
||||
{ name = "uv" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/b1/6c/c3c1d01698c8abb0b546defc0304971fa7fb2ba84ad35587b9dad095d73f/pre_commit_uv-4.1.4.tar.gz", hash = "sha256:3db606a79b226127b27dbbd8381b78c0e30de3ac775a8492c576a68e9250535c", size = 6493 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f1/70/1b65f9118ef64f6ffe5d57a67170bbff25d4f4a3d1cb78e8ed3392e16114/pre_commit_uv-4.1.4-py3-none-any.whl", hash = "sha256:7f01fb494fa1caa5097d20a38f71df7cea0209197b2564699cef9b3f3aa9d135", size = 5578 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf"
|
||||
version = "5.29.3"
|
||||
@@ -2149,6 +2135,12 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pylatexenc"
|
||||
version = "2.10"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/5d/ab/34ec41718af73c00119d0351b7a2531d2ebddb51833a36448fc7b862be60/pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3", size = 162597 }
|
||||
|
||||
[[package]]
|
||||
name = "pypdfium2"
|
||||
version = "4.30.1"
|
||||
@@ -3473,6 +3465,10 @@ dependencies = [
|
||||
{ name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (extra != 'extra-13-docling-serve-cpu' and extra != 'extra-13-docling-serve-cu124')" },
|
||||
]
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/a9/20/72eb0b5b08fa293f20fc41c374e37cf899f0033076f0144d2cdc48f9faee/torchvision-0.21.0-1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:5568c5a1ff1b2ec33127b629403adb530fab81378d9018ca4ed6508293f76e2b", size = 2327643 },
|
||||
{ url = "https://files.pythonhosted.org/packages/4e/3d/b7241abfa3e6651c6e00796f5de2bd1ce4d500bf5159bcbfeea47e711b93/torchvision-0.21.0-1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ff96666b94a55e802ea6796cabe788541719e6f4905fc59c380fed3517b6a64d", size = 2329320 },
|
||||
{ url = "https://files.pythonhosted.org/packages/52/5b/76ca113a853b19c7b1da761f8a72cb6429b3bd0bf932537d8df4657f47c3/torchvision-0.21.0-1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:ffa2a16499508fe6798323e455f312c7c55f2a88901c9a7c0fb1efa86cf7e327", size = 2329878 },
|
||||
{ url = "https://files.pythonhosted.org/packages/4e/fe/5e193353706dab96fe73ae100d5a633ff635ce310e0d92f3bc2958d075b1/torchvision-0.21.0-1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:7e9e9afa150e40cd2a8f0701c43cb82a8d724f512896455c0918b987f94b84a4", size = 2280711 },
|
||||
{ url = "https://files.pythonhosted.org/packages/8e/0d/143bd264876fad17c82096b6c2d433f1ac9b29cdc69ee45023096976ee3d/torchvision-0.21.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:044ea420b8c6c3162a234cada8e2025b9076fa82504758cd11ec5d0f8cd9fa37", size = 1784140 },
|
||||
{ url = "https://files.pythonhosted.org/packages/5e/44/32e2d2d174391374d5ff3c4691b802e8efda9ae27ab9062eca2255b006af/torchvision-0.21.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:b0c0b264b89ab572888244f2e0bad5b7eaf5b696068fc0b93e96f7c3c198953f", size = 7237187 },
|
||||
{ url = "https://files.pythonhosted.org/packages/0e/6b/4fca9373eda42c1b04096758306b7bd55f7d8f78ba273446490855a0f25d/torchvision-0.21.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:54815e0a56dde95cc6ec952577f67e0dc151eadd928e8d9f6a7f821d69a4a734", size = 14699067 },
|
||||
@@ -3666,6 +3662,31 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "uv"
|
||||
version = "0.6.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/6d/05/118e10d91981b85f47b27d089782a6598a9584ff607bffb8e2f6be1f1245/uv-0.6.2.tar.gz", hash = "sha256:d696a4f3d4a3ac1b305255e8814ae3a147ea3428a977bb3b4335a339941799bc", size = 3066291 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/a1/cf/9c3c9a427c7ecc37be238c4433188614b3d342191c0299c632f512d493ff/uv-0.6.2-py3-none-linux_armv6l.whl", hash = "sha256:d501ae16fb33969b12a64ac7b9c49d672b8c3964026c5dcaee3b1dcd50a6a22c", size = 15513992 },
|
||||
{ url = "https://files.pythonhosted.org/packages/86/01/1e1f88826d92d11f2232f96eef190574a4edb470546a141bba652cd37240/uv-0.6.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:2c13ca920d87dc00721a86ac3d19667cff5435b369d21e3d6df76b373d8fa8df", size = 15659547 },
|
||||
{ url = "https://files.pythonhosted.org/packages/ee/40/59e9c03431d4c82420e081f92719e5784db8f1c92a25b2abdfe6ac645b7e/uv-0.6.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:f24e119d338bae32b5a604585b7b518036fba556e2c2d9dbd2d7cf1411213b57", size = 14589044 },
|
||||
{ url = "https://files.pythonhosted.org/packages/11/8b/5d9f9f4e3969d6a2c9ce9a0b4a85ecb8ca89bf5c00e9ec097cf472abb2a2/uv-0.6.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:1db90b728a173926e2018b89df776a373b1e50520466f61e0dbf05f9a64a6db5", size = 15034328 },
|
||||
{ url = "https://files.pythonhosted.org/packages/f3/ba/f31fd6af8f70b21d9e0b7cca0241a8f10e03d24862f49f93fbc5ff1e4fce/uv-0.6.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d23fb9cd41aecb31845e884d0bfde243e04e763abeab3532138321b4ebe7437c", size = 15275180 },
|
||||
{ url = "https://files.pythonhosted.org/packages/aa/3b/358cfea4265a0966fafa7934ed0f9f1fb031d7ebbe8a15e02a308afff6ad/uv-0.6.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:df0a1d95fd1539c05de434259fafcee0b6852900d4178e94b3b6b6b06438b60c", size = 15969503 },
|
||||
{ url = "https://files.pythonhosted.org/packages/57/f5/840d8fb46c1cf723e1b7168832de52e58d86764aa625c2100b35a27261af/uv-0.6.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:f2f0dc9a0564b31d4efdee317c176a23bbe7e61aec6d281a331ba6ae32f828ff", size = 16950563 },
|
||||
{ url = "https://files.pythonhosted.org/packages/f6/37/75c5ff09db56c34f0f5d3d55dd4188e52d09219ef76bfe176dae58ed5f4a/uv-0.6.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:326aff8c4fb8153e2384e79904c27b1c9d4c3a5879b53a6fbc2da3283fda321d", size = 16631562 },
|
||||
{ url = "https://files.pythonhosted.org/packages/9d/5f/91bfae5ecf9f6c5f4754aa794159acc77245a53233a966865ae4974e5cdf/uv-0.6.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8763f310a473f46c0226f5e08a876bd34de121ac370cc7294a5397a13a18d8a", size = 20994598 },
|
||||
{ url = "https://files.pythonhosted.org/packages/8d/39/17f77b4b5f1a1e579d9ce94859aada9418c9ebcaa227b54b10648218bafa/uv-0.6.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e2e421947ef889e6c8913992c560d611826464eabc78f8f702a5eff824aabc7", size = 16367280 },
|
||||
{ url = "https://files.pythonhosted.org/packages/a7/6b/fbd9794e1344b299e02993322f44b500f4d66ecdb83860e2fcf35d8cac2c/uv-0.6.2-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:7dd26dabd918e5648ecf94fb7c0787db954237e34ea3bdd944b98d007b44c3a5", size = 15317824 },
|
||||
{ url = "https://files.pythonhosted.org/packages/51/a0/9249a55365c2f9781243a7f35c3a01864b19aa9a62b1fc50b7231793346e/uv-0.6.2-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:f3719da2e59403783eab634a6238b90051fc65379e02c10b9ca1b32b26d35f77", size = 15228644 },
|
||||
{ url = "https://files.pythonhosted.org/packages/27/76/790b3d9c0b9ecd9ab6c1b7e904c36d470685c70d0b21a134b026452e0fcc/uv-0.6.2-py3-none-musllinux_1_1_i686.whl", hash = "sha256:b435687e5c26a64858ea842fbb4b35ced8e8741a99d1b75d0c0143462e956db9", size = 15608612 },
|
||||
{ url = "https://files.pythonhosted.org/packages/05/b6/79961374b2318461b4dfc0e565d63281bf788fea93fc81b2d1738847aec2/uv-0.6.2-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:0f1e8e15c92607862e72e0467a31947af7b9aef93924072e9b4d5dcb5633d374", size = 16480962 },
|
||||
{ url = "https://files.pythonhosted.org/packages/68/20/df7788bde9d114c501cd8ebb60235be07ff0fb0dc26fa1e7e99ada251d73/uv-0.6.2-py3-none-win32.whl", hash = "sha256:52b7452f4c523b9875de53ba73df87acd1cdea36640281d0d80c8074eda42f16", size = 15717804 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e1/0a/fc966f859b6252050c71e1afcdce116c8ef3513f8b423bb3ca05fb13485d/uv-0.6.2-py3-none-win_amd64.whl", hash = "sha256:5337cdb6ecc604d0cf36fe6799dd0479111b606009e6c29685d213c74eb40373", size = 17017798 },
|
||||
{ url = "https://files.pythonhosted.org/packages/03/82/4318c4874c8dd59a0386e2bf0f4d09fc5bb4900349238828153235d387eb/uv-0.6.2-py3-none-win_arm64.whl", hash = "sha256:27ecb8f6ef796220062f31a12e2dc5dc7a14704aa1df0da2dfa3530346c7e3cc", size = 15923484 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "uvicorn"
|
||||
version = "0.34.0"
|
||||
|
||||
Reference in New Issue
Block a user