mirror of
https://github.com/docling-project/docling-serve.git
synced 2025-11-30 17:13:19 +00:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
be7e4162af | ||
|
|
de42baf8dc | ||
|
|
4da28565a7 | ||
|
|
2a78142b96 | ||
|
|
d0e8578931 | ||
|
|
c6539c42de | ||
|
|
ddf3144512 |
40
.dockerignore
Normal file
40
.dockerignore
Normal file
@@ -0,0 +1,40 @@
|
||||
# Ignore Python cache files
|
||||
__pycache__/
|
||||
**/__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
|
||||
# Ignore virtual environments
|
||||
env/
|
||||
venv/
|
||||
|
||||
# Ignore development artifacts
|
||||
*.log
|
||||
*.db
|
||||
*.sqlite3
|
||||
|
||||
# Ignore configuration and sensitive files
|
||||
**/.env
|
||||
*.env
|
||||
*.ini
|
||||
*.cfg
|
||||
|
||||
# Ignore IDE and editor settings
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
# Ignore Git files
|
||||
.git/
|
||||
.gitignore
|
||||
|
||||
# Ignore Docker files themselves (optional if not needed in the image)
|
||||
.dockerignore
|
||||
Dockerfile*
|
||||
|
||||
# Ignore build artifacts (if applicable)
|
||||
build/
|
||||
dist/
|
||||
*.egg-info
|
||||
@@ -2,5 +2,7 @@ config:
|
||||
line-length: false
|
||||
no-emphasis-as-header: false
|
||||
first-line-heading: false
|
||||
MD033:
|
||||
allowed_elements: ["details", "summary"]
|
||||
globs:
|
||||
- "**/*.md"
|
||||
|
||||
@@ -16,6 +16,14 @@ repos:
|
||||
pass_filenames: false
|
||||
language: system
|
||||
files: '\.py$'
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: autoflake
|
||||
name: autoflake
|
||||
entry: poetry run autoflake docling_serve tests
|
||||
pass_filenames: false
|
||||
language: system
|
||||
files: '\.py$'
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: system
|
||||
|
||||
@@ -1,32 +1,61 @@
|
||||
FROM python:3.11-slim-bookworm
|
||||
ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s
|
||||
|
||||
FROM ${BASE_IMAGE}
|
||||
|
||||
ARG CPU_ONLY=false
|
||||
WORKDIR /docling-serve
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
|
||||
&& apt-get clean
|
||||
USER 0
|
||||
|
||||
RUN pip install --no-cache-dir poetry
|
||||
###################################################################################################
|
||||
# OS Layer #
|
||||
###################################################################################################
|
||||
|
||||
COPY pyproject.toml poetry.lock README.md /docling-serve/
|
||||
RUN --mount=type=bind,source=os-packages.txt,target=/tmp/os-packages.txt \
|
||||
dnf -y install --best --nodocs --setopt=install_weak_deps=False dnf-plugins-core && \
|
||||
dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save && \
|
||||
dnf config-manager --enable crb && \
|
||||
dnf -y update && \
|
||||
dnf install -y $(cat /tmp/os-packages.txt) && \
|
||||
dnf -y clean all && \
|
||||
rm -rf /var/cache/dnf
|
||||
|
||||
RUN if [ "$CPU_ONLY" = "true" ]; then \
|
||||
poetry install --no-root --with cpu; \
|
||||
else \
|
||||
poetry install --no-root; \
|
||||
fi
|
||||
ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
||||
|
||||
ENV HF_HOME=/tmp/
|
||||
ENV TORCH_HOME=/tmp/
|
||||
###################################################################################################
|
||||
# Docling layer #
|
||||
###################################################################################################
|
||||
|
||||
RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
|
||||
USER 1001
|
||||
|
||||
WORKDIR /opt/app-root/src
|
||||
|
||||
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
||||
ENV OMP_NUM_THREADS=4
|
||||
|
||||
COPY ./docling_serve /docling-serve/docling_serve
|
||||
ENV LANG=en_US.UTF-8
|
||||
ENV LC_ALL=en_US.UTF-8
|
||||
ENV PYTHONIOENCODING=utf-8
|
||||
|
||||
ENV WITH_UI=True
|
||||
|
||||
COPY --chown=1001:0 pyproject.toml poetry.lock models_download.py README.md ./
|
||||
|
||||
RUN pip install --no-cache-dir poetry && \
|
||||
# We already are in a virtual environment, so we don't need to create a new one, only activate it.
|
||||
poetry config virtualenvs.create false && \
|
||||
source /opt/app-root/bin/activate && \
|
||||
if [ "$CPU_ONLY" = "true" ]; then \
|
||||
poetry install --no-root --no-cache --no-interaction --all-extras --with cpu --without dev; \
|
||||
else \
|
||||
poetry install --no-root --no-cache --no-interaction --all-extras --without dev; \
|
||||
fi && \
|
||||
echo "Downloading models..." && \
|
||||
python models_download.py && \
|
||||
chown -R 1001:0 /opt/app-root/src && \
|
||||
chmod -R g=u /opt/app-root/src
|
||||
|
||||
COPY --chown=1001:0 --chmod=664 ./docling_serve ./docling_serve
|
||||
|
||||
EXPOSE 5001
|
||||
|
||||
CMD ["poetry", "run", "uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
|
||||
CMD ["python", "-m", "docling_serve"]
|
||||
|
||||
19
Makefile
19
Makefile
@@ -25,14 +25,14 @@ md-lint-file:
|
||||
$(CMD_PREFIX) touch .markdown-lint
|
||||
|
||||
.PHONY: docling-serve-cpu-image
|
||||
docling-serve-cpu-image: Containerfile ## Build docling-serve "cpu only" continaer image
|
||||
docling-serve-cpu-image: Containerfile ## Build docling-serve "cpu only" container image
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve CPU ONLY]"
|
||||
$(CMD_PREFIX) docker build --build-arg CPU_ONLY=true -f Containerfile --platform linux/amd64 -t ghcr.io/ds4sd/docling-serve-cpu:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cpu:$(TAG) ghcr.io/ds4sd/docling-serve-cpu:main
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cpu:$(TAG) quay.io/ds4sd/docling-serve-cpu:main
|
||||
|
||||
.PHONY: docling-serve-gpu-image
|
||||
docling-serve-gpu-image: Containerfile ## Build docling-serve continaer image with GPU support
|
||||
docling-serve-gpu-image: Containerfile ## Build docling-serve container image with GPU support
|
||||
$(ECHO_PREFIX) printf " %-12s Containerfile\n" "[docling-serve with GPU]"
|
||||
$(CMD_PREFIX) docker build --build-arg CPU_ONLY=false -f Containerfile --platform linux/amd64 -t ghcr.io/ds4sd/docling-serve:$(TAG) .
|
||||
$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve:$(TAG) ghcr.io/ds4sd/docling-serve:main
|
||||
@@ -62,7 +62,6 @@ md-lint: .md-lint ## Lint markdown files
|
||||
$(CMD_PREFIX) docker run --rm -v $$(pwd):/workdir davidanson/markdownlint-cli2:v0.14.0 "**/*.md"
|
||||
$(CMD_PREFIX) touch $@
|
||||
|
||||
|
||||
.PHONY: py-Lint
|
||||
py-lint: ## Lint Python files
|
||||
$(ECHO_PREFIX) printf " %-12s ./...\n" "[PY LINT]"
|
||||
@@ -73,3 +72,17 @@ py-lint: ## Lint Python files
|
||||
fi
|
||||
$(CMD_PREFIX) poetry install --all-extras
|
||||
$(CMD_PREFIX) poetry run pre-commit run --all-files
|
||||
|
||||
.PHONY: run-docling-cpu
|
||||
run-docling-cpu: ## Run the docling-serve container with CPU support and assign a container name
|
||||
$(ECHO_PREFIX) printf " %-12s Removing existing container if it exists...\n" "[CLEANUP]"
|
||||
$(CMD_PREFIX) docker rm -f docling-serve-cpu 2>/dev/null || true
|
||||
$(ECHO_PREFIX) printf " %-12s Running docling-serve container with CPU support on port 5001...\n" "[RUN CPU]"
|
||||
$(CMD_PREFIX) docker run -it --name docling-serve-cpu -p 5001:5001 ghcr.io/ds4sd/docling-serve-cpu:main
|
||||
|
||||
.PHONY: run-docling-gpu
|
||||
run-docling-gpu: ## Run the docling-serve container with GPU support and assign a container name
|
||||
$(ECHO_PREFIX) printf " %-12s Removing existing container if it exists...\n" "[CLEANUP]"
|
||||
$(CMD_PREFIX) docker rm -f docling-serve-gpu 2>/dev/null || true
|
||||
$(ECHO_PREFIX) printf " %-12s Running docling-serve container with GPU support on port 5001...\n" "[RUN GPU]"
|
||||
$(CMD_PREFIX) docker run -it --name docling-serve-gpu -p 5001:5001 ghcr.io/ds4sd/docling-serve:main
|
||||
|
||||
380
README.md
380
README.md
@@ -2,55 +2,363 @@
|
||||
|
||||
Running [Docling](https://github.com/DS4SD/docling) as an API service.
|
||||
|
||||
> [!NOTE]
|
||||
> This is an unstable draft implementation which will quickly evolve.
|
||||
## Usage
|
||||
|
||||
The API provides two endpoints: one for URLs, one for files. This is necessary to send files directly in binary format instead of base64-encoded strings.
|
||||
|
||||
### Common parameters
|
||||
|
||||
In addition to the source of the file (see below), both endpoints support the same parameters, which are almost the same as those of the Docling CLI.
|
||||
|
||||
- `from_formats` (List[str]): Input format(s) to convert from. Allowed values: `docx`, `pptx`, `html`, `image`, `pdf`, `asciidoc`, `md`. Defaults to all formats.
|
||||
- `to_formats` (List[str]): Output format(s) to convert to. Allowed values: `md`, `json`, `html`, `text`, `doctags`. Defaults to `md`.
|
||||
- `do_ocr` (bool): If enabled, the bitmap content will be processed using OCR. Defaults to `True`.
|
||||
- `image_export_mode`: Image export mode for the document (only in case of JSON, Markdown or HTML). Allowed values: embedded, placeholder, referenced. Optional, defaults to `embedded`.
|
||||
- `force_ocr` (bool): If enabled, replace any existing text with OCR-generated text over the full content. Defaults to `False`.
|
||||
- `ocr_engine` (str): OCR engine to use. Allowed values: `easyocr`, `tesseract_cli`, `tesseract`, `rapidocr`, `ocrmac`. Defaults to `easyocr`.
|
||||
- `ocr_lang` (List[str]): List of languages used by the OCR engine. Note that each OCR engine has different values for the language names. Defaults to empty.
|
||||
- `pdf_backend` (str): PDF backend to use. Allowed values: `pypdfium2`, `dlparse_v1`, `dlparse_v2`. Defaults to `dlparse_v2`.
|
||||
- `table_mode` (str): Table mode to use. Allowed values: `fast`, `accurate`. Defaults to `fast`.
|
||||
- `abort_on_error` (bool): If enabled, abort on error. Defaults to false.
|
||||
- `return_as_file` (bool): If enabled, return the output as a file. Defaults to false.
|
||||
- `do_table_structure` (bool): If enabled, the table structure will be extracted. Defaults to true.
|
||||
- `include_images` (bool): If enabled, images will be extracted from the document. Defaults to true.
|
||||
- `images_scale` (float): Scale factor for images. Defaults to 2.0.
|
||||
|
||||
### URL endpoint
|
||||
|
||||
The endpoint is `/v1alpha/convert/source`, listening for POST requests of JSON payloads.
|
||||
|
||||
In addition to the above parameters, you must send the URL(s) of the document you want to process with either the `http_sources` or `file_sources` fields.
|
||||
The first fetches the document(s) from the given URL(s) (optionally with extra headers); the second allows providing documents as base64-encoded strings.
|
||||
The `options` field is not required; its entries can be partially or completely omitted.
|
||||
|
||||
Simple payload example:
|
||||
|
||||
```json
|
||||
{
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
```
|
||||
|
||||
<details>
|
||||
|
||||
<summary>Complete payload example:</summary>
|
||||
|
||||
```json
|
||||
{
|
||||
"options": {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": true,
|
||||
"force_ocr": false,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": false,
|
||||
"return_as_file": false
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
|
||||
<summary>CURL example:</summary>
|
||||
|
||||
```sh
|
||||
curl -X 'POST' \
|
||||
'http://localhost:5001/v1alpha/convert/source' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"options": {
|
||||
"from_formats": [
|
||||
"docx",
|
||||
"pptx",
|
||||
"html",
|
||||
"image",
|
||||
"pdf",
|
||||
"asciidoc",
|
||||
"md",
|
||||
"xlsx"
|
||||
],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": true,
|
||||
"force_ocr": false,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": [
|
||||
"fr",
|
||||
"de",
|
||||
"es",
|
||||
"en"
|
||||
],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": false,
|
||||
"return_as_file": false,
|
||||
"do_table_structure": true,
|
||||
"include_images": true,
|
||||
"images_scale": 2
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}'
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>Python example:</summary>
|
||||
|
||||
```python
|
||||
import httpx
|
||||
|
||||
async_client = httpx.AsyncClient(timeout=60.0)
|
||||
url = "http://localhost:5001/v1alpha/convert/source"
|
||||
payload = {
|
||||
"options": {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": "en",
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
},
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
|
||||
}
|
||||
|
||||
response = await async_client.post(url, json=payload)
|
||||
|
||||
data = response.json()
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### File as base64
|
||||
|
||||
The `file_sources` argument in the endpoint allows sending files as base64-encoded strings.
|
||||
When your PDF or other file type is too large, encoding it and passing it inline to curl
|
||||
can lead to an “Argument list too long” error on some systems. To avoid this, we write
|
||||
the JSON request body to a file and have curl read from that file.
|
||||
|
||||
<details>
|
||||
<summary>CURL steps:</summary>
|
||||
|
||||
```sh
|
||||
# 1. Base64-encode the file
|
||||
B64_DATA=$(base64 -w 0 /path/to/file/pdf-to-convert.pdf)
|
||||
|
||||
# 2. Build the JSON with your options
|
||||
cat <<EOF > /tmp/request_body.json
|
||||
{
|
||||
"options": {
|
||||
},
|
||||
"file_sources": [{
|
||||
"base64_string": "${B64_DATA}",
|
||||
"filename": "pdf-to-convert.pdf"
|
||||
}]
|
||||
}
|
||||
EOF
|
||||
|
||||
# 3. POST the request to the docling service
|
||||
curl -X POST "localhost:5001/v1alpha/convert/source" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d @/tmp/request_body.json
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### File endpoint
|
||||
|
||||
The endpoint is: `/v1alpha/convert/file`, listening for POST requests of Form payloads (necessary as the files are sent as multipart/form data). You can send one or multiple files.
|
||||
|
||||
<details>
|
||||
<summary>CURL example:</summary>
|
||||
|
||||
```sh
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:5001/v1alpha/convert/file' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: multipart/form-data' \
|
||||
-F 'ocr_engine=easyocr' \
|
||||
-F 'pdf_backend=dlparse_v2' \
|
||||
-F 'from_formats=pdf' \
|
||||
-F 'from_formats=docx' \
|
||||
-F 'force_ocr=false' \
|
||||
-F 'image_export_mode=embedded' \
|
||||
-F 'ocr_lang=en' \
|
||||
-F 'ocr_lang=pl' \
|
||||
-F 'table_mode=fast' \
|
||||
-F 'files=@2206.01062v1.pdf;type=application/pdf' \
|
||||
-F 'abort_on_error=false' \
|
||||
-F 'to_formats=md' \
|
||||
-F 'to_formats=text' \
|
||||
-F 'return_as_file=false' \
|
||||
-F 'do_ocr=true'
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>Python example:</summary>
|
||||
|
||||
```python
|
||||
import httpx
|
||||
|
||||
async_client = httpx.AsyncClient(timeout=60.0)
|
||||
url = "http://localhost:5001/v1alpha/convert/file"
|
||||
parameters = {
|
||||
"from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
|
||||
"to_formats": ["md", "json", "html", "text", "doctags"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": ["en"],
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False
|
||||
}
|
||||
|
||||
current_dir = os.path.dirname(__file__)
|
||||
file_path = os.path.join(current_dir, '2206.01062v1.pdf')
|
||||
|
||||
files = {
|
||||
'files': ('2206.01062v1.pdf', open(file_path, 'rb'), 'application/pdf'),
|
||||
}
|
||||
|
||||
response = await async_client.post(url, files=files, data={"parameters": json.dumps(parameters)})
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
data = response.json()
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### Response format
|
||||
|
||||
The response can be a JSON Document or a File.
|
||||
|
||||
- If you process only one file, the response will be a JSON document with the following format:
|
||||
|
||||
```jsonc
|
||||
{
|
||||
"document": {
|
||||
"md_content": "",
|
||||
"json_content": {},
|
||||
"html_content": "",
|
||||
"text_content": "",
|
||||
"doctags_content": ""
|
||||
},
|
||||
"status": "<success|partial_success|skipped|failure>",
|
||||
"processing_time": 0.0,
|
||||
"timings": {},
|
||||
"errors": []
|
||||
}
|
||||
```
|
||||
|
||||
Depending on the value you set in `to_formats`, the different items will be populated with their respective results or empty.
|
||||
|
||||
`processing_time` is the Docling processing time in seconds, and `timings` (when enabled in the backend) provides the detailed
|
||||
timing of all the internal Docling components.
|
||||
|
||||
- If you set the parameter `return_as_file` to True, the response will be a zip file.
|
||||
- If multiple files are generated (multiple inputs, or one input but multiple outputs with `return_as_file` True), the response will be a zip file.
|
||||
|
||||
## Helpers
|
||||
|
||||
- A full Swagger UI is available at the `/docs` endpoint.
|
||||
|
||||

|
||||
|
||||
- An easy-to-use UI is available at the `/ui` endpoint.
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
## Development
|
||||
|
||||
Install the dependencies
|
||||
### CPU only
|
||||
|
||||
```sh
|
||||
# Install poetry if not already available
|
||||
curl -sSL https://install.python-poetry.org | python3 -
|
||||
|
||||
# Install dependencies
|
||||
poetry install --with cpu
|
||||
```
|
||||
|
||||
### Cuda GPU
|
||||
|
||||
For GPU support use the following command:
|
||||
|
||||
```sh
|
||||
# Install dependencies
|
||||
poetry install
|
||||
|
||||
# Run the server
|
||||
poetry run uvicorn docling_serve.app:app --reload
|
||||
```
|
||||
|
||||
Example payload (http source):
|
||||
### Run the server
|
||||
|
||||
The [start_server.sh](./start_server.sh) executable is a convenient script for launching the local webserver.
|
||||
|
||||
```sh
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/convert' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"http_source": {
|
||||
"url": "https://arxiv.org/pdf/2206.01062"
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
### Cuda GPU Support
|
||||
|
||||
For GPU support try the following:
|
||||
|
||||
```sh
|
||||
# Create a virtual env
|
||||
python3 -m venv venv
|
||||
|
||||
# Activate the venv
|
||||
source venv/bin/active
|
||||
|
||||
# Install torch with the special index
|
||||
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
|
||||
|
||||
# Install the package
|
||||
pip install -e .
|
||||
|
||||
# Run the server
|
||||
poetry run uvicorn docling_serve.app:app --reload
|
||||
bash start_server.sh
|
||||
|
||||
# Run the server with live reload
|
||||
RELOAD=true bash start_server.sh
|
||||
```
|
||||
|
||||
### Environment variables
|
||||
|
||||
The following variables are available:
|
||||
|
||||
`TESSDATA_PREFIX`: Tesseract data location, example `/usr/share/tesseract/tessdata/`.
|
||||
`UVICORN_WORKERS`: Number of workers to use.
|
||||
`RELOAD`: If `True`, this will enable auto-reload when you modify files, useful for development.
|
||||
`WITH_UI`: If `True`, the Gradio UI will be available at `/ui`.
|
||||
|
||||
## Get help and support
|
||||
|
||||
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
|
||||
|
||||
## Contributing
|
||||
|
||||
Please read [Contributing to Docling Serve](https://github.com/DS4SD/docling-serve/blob/main/CONTRIBUTING.md) for details.
|
||||
|
||||
## References
|
||||
|
||||
If you use Docling in your projects, please consider citing the following:
|
||||
|
||||
```bib
|
||||
@techreport{Docling,
|
||||
author = {Deep Search Team},
|
||||
month = {8},
|
||||
title = {Docling Technical Report},
|
||||
url = {https://arxiv.org/abs/2408.09869},
|
||||
eprint = {2408.09869},
|
||||
doi = {10.48550/arXiv.2408.09869},
|
||||
version = {1.0.0},
|
||||
year = {2024}
|
||||
}
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
The Docling Serve codebase is under MIT license.
|
||||
|
||||
## IBM ❤️ Open Source AI
|
||||
|
||||
Docling has been brought to you by IBM.
|
||||
|
||||
3
docling_serve/.env.example
Normal file
3
docling_serve/.env.example
Normal file
@@ -0,0 +1,3 @@
|
||||
TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
||||
UVICORN_WORKERS=2
|
||||
RELOAD=True
|
||||
20
docling_serve/__main__.py
Normal file
20
docling_serve/__main__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import os
|
||||
|
||||
from docling_serve.app import app
|
||||
from docling_serve.helper_functions import _str_to_bool
|
||||
|
||||
# Launch the FastAPI server
|
||||
if __name__ == "__main__":
|
||||
from uvicorn import run
|
||||
|
||||
port = int(os.getenv("PORT", "5001"))
|
||||
workers = int(os.getenv("UVICORN_WORKERS", "1"))
|
||||
reload = _str_to_bool(os.getenv("RELOAD", "False"))
|
||||
run(
|
||||
app,
|
||||
host="0.0.0.0",
|
||||
port=port,
|
||||
workers=workers,
|
||||
timeout_keep_alive=600,
|
||||
reload=reload,
|
||||
)
|
||||
@@ -1,177 +1,83 @@
|
||||
import base64
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from contextlib import asynccontextmanager
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Any, Dict, List, Optional, Union
|
||||
|
||||
import httpx
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DocumentStream,
|
||||
ErrorItem,
|
||||
InputFormat,
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.document_converter import DocumentConverter
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import BackgroundTasks, FastAPI, UploadFile
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import RedirectResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling_serve.docling_conversion import (
|
||||
ConvertDocumentFileSourcesRequest,
|
||||
ConvertDocumentsOptions,
|
||||
ConvertDocumentsRequest,
|
||||
convert_documents,
|
||||
converters,
|
||||
get_pdf_pipeline_opts,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
RapidOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
from docling_serve.helper_functions import FormDepends, _str_to_bool
|
||||
from docling_serve.response_preparation import ConvertDocumentResponse, process_results
|
||||
|
||||
# Load local env vars if present
|
||||
load_dotenv()
|
||||
|
||||
WITH_UI = _str_to_bool(os.getenv("WITH_UI", "False"))
|
||||
if WITH_UI:
|
||||
import gradio as gr
|
||||
|
||||
from docling_serve.gradio_ui import ui as gradio_ui
|
||||
|
||||
|
||||
# Set up custom logging as we'll be intermixed with FastAPI/Uvicorn's logging
|
||||
class ColoredLogFormatter(logging.Formatter):
|
||||
COLOR_CODES = {
|
||||
logging.DEBUG: "\033[94m", # Blue
|
||||
logging.INFO: "\033[92m", # Green
|
||||
logging.WARNING: "\033[93m", # Yellow
|
||||
logging.ERROR: "\033[91m", # Red
|
||||
logging.CRITICAL: "\033[95m", # Magenta
|
||||
}
|
||||
RESET_CODE = "\033[0m"
|
||||
|
||||
def format(self, record):
|
||||
color = self.COLOR_CODES.get(record.levelno, "")
|
||||
record.levelname = f"{color}{record.levelname}{self.RESET_CODE}"
|
||||
return super().format(record)
|
||||
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO, # Set the logging level
|
||||
format="%(levelname)s:\t%(asctime)s - %(name)s - %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.utils.profiling import ProfilingItem
|
||||
from docling_core.types.doc import DoclingDocument, ImageRefMode
|
||||
from docling_core.utils.file import resolve_remote_filename
|
||||
from fastapi import FastAPI, HTTPException, Response
|
||||
from pydantic import AnyHttpUrl, BaseModel
|
||||
|
||||
|
||||
# TODO: import enum from Docling, once it is exposed
|
||||
class OcrEngine(str, Enum):
|
||||
EASYOCR = "easyocr"
|
||||
TESSERACT = "tesseract"
|
||||
RAPIDOCR = "rapidocr"
|
||||
|
||||
|
||||
class ConvertOptions(BaseModel):
|
||||
output_docling_document: bool = True
|
||||
output_markdown: bool = False
|
||||
output_html: bool = False
|
||||
do_ocr: bool = True
|
||||
ocr_engine: OcrEngine = OcrEngine.EASYOCR
|
||||
ocr_lang: Optional[List[str]] = None
|
||||
force_ocr: bool = False
|
||||
do_table_structure: bool = True
|
||||
include_images: bool = True
|
||||
images_scale: float = 2.0
|
||||
|
||||
|
||||
class DocumentConvertBase(BaseModel):
|
||||
options: ConvertOptions = ConvertOptions()
|
||||
|
||||
|
||||
class HttpSource(BaseModel):
|
||||
url: str
|
||||
headers: Dict[str, Any] = {}
|
||||
|
||||
|
||||
class FileSource(BaseModel):
|
||||
base64_string: str
|
||||
filename: str
|
||||
|
||||
|
||||
class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
|
||||
http_source: HttpSource
|
||||
|
||||
|
||||
class ConvertDocumentFileSourceRequest(DocumentConvertBase):
|
||||
file_source: FileSource
|
||||
|
||||
|
||||
class DocumentResponse(BaseModel):
|
||||
markdown: Optional[str] = None
|
||||
docling_document: Optional[DoclingDocument] = None
|
||||
html: Optional[str] = None
|
||||
|
||||
|
||||
class ConvertDocumentResponse(BaseModel):
|
||||
document: DocumentResponse
|
||||
status: ConversionStatus
|
||||
errors: List[ErrorItem] = []
|
||||
timings: Dict[str, ProfilingItem] = {}
|
||||
|
||||
|
||||
class ConvertDocumentErrorResponse(BaseModel):
|
||||
status: ConversionStatus
|
||||
# errors: List[ErrorItem] = []
|
||||
|
||||
|
||||
ConvertDocumentRequest = Union[
|
||||
ConvertDocumentFileSourceRequest, ConvertDocumentHttpSourceRequest
|
||||
]
|
||||
|
||||
|
||||
class MarkdownTextResponse(Response):
|
||||
media_type = "text/markdown"
|
||||
|
||||
|
||||
class HealthCheckResponse(BaseModel):
|
||||
status: str = "ok"
|
||||
|
||||
|
||||
def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
|
||||
|
||||
if options.ocr_engine == OcrEngine.EASYOCR:
|
||||
try:
|
||||
import easyocr # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={options.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=options.force_ocr)
|
||||
elif options.ocr_engine == OcrEngine.TESSERACT:
|
||||
try:
|
||||
import tesserocr # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={options.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=options.force_ocr)
|
||||
elif options.ocr_engine == OcrEngine.RAPIDOCR:
|
||||
try:
|
||||
from rapidocr_onnxruntime import RapidOCR # noqa: F401
|
||||
except ImportError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="The requested OCR engine"
|
||||
f" (ocr_engine={options.ocr_engine.value})"
|
||||
" is not available on this system. Please choose another OCR engine "
|
||||
"or contact your system administrator.",
|
||||
)
|
||||
ocr_options = RapidOcrOptions(force_full_page_ocr=options.force_ocr)
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")
|
||||
|
||||
if options.ocr_lang is not None:
|
||||
ocr_options.lang = options.ocr_lang
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=options.do_ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=options.do_table_structure,
|
||||
generate_page_images=options.include_images,
|
||||
generate_picture_images=options.include_images,
|
||||
images_scale=options.images_scale,
|
||||
)
|
||||
|
||||
options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()
|
||||
|
||||
return pipeline_options, options_hash
|
||||
|
||||
|
||||
converters: Dict[str, DocumentConverter] = {}
|
||||
|
||||
# Override the formatter with the custom ColoredLogFormatter
|
||||
root_logger = logging.getLogger() # Get the root logger
|
||||
for handler in root_logger.handlers: # Iterate through existing handlers
|
||||
if handler.formatter:
|
||||
handler.setFormatter(ColoredLogFormatter(handler.formatter._fmt))
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Context manager to initialize and clean up the lifespan of the FastAPI app
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
# settings = Settings()
|
||||
|
||||
# Converter with default options
|
||||
pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
|
||||
pdf_format_option, options_hash = get_pdf_pipeline_opts(ConvertDocumentsOptions())
|
||||
converters[options_hash] = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
|
||||
InputFormat.PDF: pdf_format_option,
|
||||
InputFormat.IMAGE: pdf_format_option,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -180,100 +86,139 @@ async def lifespan(app: FastAPI):
|
||||
yield
|
||||
|
||||
converters.clear()
|
||||
if WITH_UI:
|
||||
gradio_ui.close()
|
||||
|
||||
|
||||
##################################
|
||||
# App creation and configuration #
|
||||
##################################
|
||||
|
||||
app = FastAPI(
|
||||
title="Docling Serve",
|
||||
lifespan=lifespan,
|
||||
)
|
||||
|
||||
origins = ["*"]
|
||||
methods = ["*"]
|
||||
headers = ["*"]
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=origins,
|
||||
allow_credentials=True,
|
||||
allow_methods=methods,
|
||||
allow_headers=headers,
|
||||
)
|
||||
|
||||
# Mount the Gradio app
|
||||
if WITH_UI:
|
||||
tmp_output_dir = Path(tempfile.mkdtemp())
|
||||
gradio_ui.gradio_output_dir = tmp_output_dir
|
||||
app = gr.mount_gradio_app(
|
||||
app, gradio_ui, path="/ui", allowed_paths=["./logo.png", tmp_output_dir]
|
||||
)
|
||||
|
||||
|
||||
#############################
|
||||
# API Endpoints definitions #
|
||||
#############################
|
||||
|
||||
|
||||
# Favicon
|
||||
@app.get("/favicon.ico", include_in_schema=False)
async def favicon():
    """Redirect favicon requests to the Docling logo image."""
    return RedirectResponse(url="https://ds4sd.github.io/docling/assets/logo.png")
|
||||
|
||||
|
||||
# Status
|
||||
class HealthCheckResponse(BaseModel):
    """Response body returned by the health/readiness endpoints."""

    # Defaults to "ok"; the endpoints in this file return the default instance.
    status: str = "ok"
|
||||
|
||||
|
||||
@app.get("/health")
def health() -> HealthCheckResponse:
    """Return the default health payload (``status == "ok"``)."""
    return HealthCheckResponse()
|
||||
|
||||
|
||||
def _convert_document(
    body: ConvertDocumentRequest,
) -> ConversionResult:
    """Convert the single document described by *body*.

    The document bytes come either from the base64 payload of a file-source
    request or from an HTTP GET on the URL of an http-source request. A
    converter is created (and cached in ``converters``) per distinct options
    hash.

    Raises:
        HTTPException: 400 when the conversion was skipped; 500 when the
            converter returned nothing or finished with a non-success status.
    """
    filename: str
    buf: BytesIO

    if isinstance(body, ConvertDocumentFileSourceRequest):
        buf = BytesIO(base64.b64decode(body.file_source.base64_string))
        filename = body.file_source.filename
    elif isinstance(body, ConvertDocumentHttpSourceRequest):
        # NOTE(review): the HTTP status of the fetch is not checked here;
        # an error page would be handed to the converter as-is.
        http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
        buf = BytesIO(http_res.content)
        filename = resolve_remote_filename(
            http_url=AnyHttpUrl(body.http_source.url),
            response_headers=dict(**http_res.headers),
        )

    doc_input = DocumentStream(name=filename, stream=buf)

    pipeline_options, options_hash = get_pdf_pipeline_opts(body.options)
    if options_hash not in converters:
        converters[options_hash] = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
                InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
            }
        )

    result: ConversionResult = converters[options_hash].convert(doc_input)

    # Handle a missing result first: the status checks below dereference
    # `result`, so a None here used to surface as an AttributeError.
    if result is None:
        raise HTTPException(
            status_code=500, detail="The conversion did not produce a result."
        )

    if result.status == ConversionStatus.SKIPPED:
        raise HTTPException(status_code=400, detail=result.errors)

    if result.status != ConversionStatus.SUCCESS:
        raise HTTPException(
            status_code=500, detail={"errors": result.errors, "status": result.status}
        )

    return result
|
||||
# API readiness compatibility for OpenShift AI Workbench
|
||||
@app.get("/api", include_in_schema=False)
def api_check() -> HealthCheckResponse:
    """Return the same default payload as ``/health``; hidden from the OpenAPI schema."""
    return HealthCheckResponse()
|
||||
|
||||
|
||||
# Convert a document from URL(s)
|
||||
@app.post(
|
||||
"/convert",
|
||||
"/v1alpha/convert/source",
|
||||
response_model=ConvertDocumentResponse,
|
||||
responses={
|
||||
200: {
|
||||
"content": {"application/zip": {}},
|
||||
# "description": "Return the JSON item or an image.",
|
||||
}
|
||||
},
|
||||
)
|
||||
def convert_document(
|
||||
body: ConvertDocumentRequest,
|
||||
) -> ConvertDocumentResponse:
|
||||
def process_url(
|
||||
background_tasks: BackgroundTasks, conversion_request: ConvertDocumentsRequest
|
||||
):
|
||||
sources: List[Union[str, DocumentStream]] = []
|
||||
headers: Optional[Dict[str, Any]] = None
|
||||
if isinstance(conversion_request, ConvertDocumentFileSourcesRequest):
|
||||
for file_source in conversion_request.file_sources:
|
||||
sources.append(file_source.to_document_stream())
|
||||
else:
|
||||
for http_source in conversion_request.http_sources:
|
||||
sources.append(http_source.url)
|
||||
if headers is None and http_source.headers:
|
||||
headers = http_source.headers
|
||||
|
||||
result = _convert_document(body=body)
|
||||
|
||||
image_mode = (
|
||||
ImageRefMode.EMBEDDED
|
||||
if body.options.include_images
|
||||
else ImageRefMode.PLACEHOLDER
|
||||
)
|
||||
doc_resp = DocumentResponse()
|
||||
if body.options.output_docling_document:
|
||||
doc_resp.docling_document = result.document
|
||||
if body.options.output_markdown:
|
||||
doc_resp.markdown = result.document.export_to_markdown(image_mode=image_mode)
|
||||
if body.options.output_html:
|
||||
doc_resp.html = result.document.export_to_html(image_mode=image_mode)
|
||||
|
||||
return ConvertDocumentResponse(
|
||||
document=doc_resp, status=result.status, timings=result.timings
|
||||
# Note: results are only an iterator->lazy evaluation
|
||||
results = convert_documents(
|
||||
sources=sources, options=conversion_request.options, headers=headers
|
||||
)
|
||||
|
||||
# The real processing will happen here
|
||||
response = process_results(
|
||||
background_tasks=background_tasks,
|
||||
conversion_options=conversion_request.options,
|
||||
conv_results=results,
|
||||
)
|
||||
|
||||
@app.post("/convert/markdown", response_class=MarkdownTextResponse)
|
||||
def convert_document_md(
|
||||
body: ConvertDocumentRequest,
|
||||
) -> MarkdownTextResponse:
|
||||
result = _convert_document(body=body)
|
||||
image_mode = (
|
||||
ImageRefMode.EMBEDDED
|
||||
if body.options.include_images
|
||||
else ImageRefMode.PLACEHOLDER
|
||||
)
|
||||
return MarkdownTextResponse(
|
||||
result.document.export_to_markdown(image_mode=image_mode)
|
||||
return response
|
||||
|
||||
|
||||
# Convert a document from file(s)
|
||||
@app.post(
    "/v1alpha/convert/file",
    response_model=ConvertDocumentResponse,
    responses={
        200: {
            "content": {"application/zip": {}},
        }
    },
)
async def process_file(
    background_tasks: BackgroundTasks,
    files: List[UploadFile],
    options: Annotated[ConvertDocumentsOptions, FormDepends(ConvertDocumentsOptions)],
):
    """Convert the uploaded file(s) using the options submitted with the form.

    Each upload is read fully into memory and wrapped in a ``DocumentStream``;
    uploads without a filename are named ``file.pdf``.
    """
    _log.info(f"Received {len(files)} files for processing.")

    # Load the uploaded files to Docling DocumentStream
    file_sources = [
        DocumentStream(
            name=upload.filename if upload.filename else "file.pdf",
            stream=BytesIO(upload.file.read()),
        )
        for upload in files
    ]

    conv_results = convert_documents(sources=file_sources, options=options)

    return process_results(
        background_tasks=background_tasks,
        conversion_options=options,
        conv_results=conv_results,
    )
|
||||
|
||||
400
docling_serve/docling_conversion.py
Normal file
400
docling_serve/docling_conversion.py
Normal file
@@ -0,0 +1,400 @@
|
||||
import base64
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import (
|
||||
Annotated,
|
||||
Any,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Type,
|
||||
Union,
|
||||
)
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat, OutputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrEngine,
|
||||
OcrOptions,
|
||||
PdfBackend,
|
||||
PdfPipelineOptions,
|
||||
RapidOcrOptions,
|
||||
TableFormerMode,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
from fastapi import HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from docling_serve.helper_functions import _to_list_of_strings
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Define the input options for the API
|
||||
class ConvertDocumentsOptions(BaseModel):
    """Options a client can send along with a conversion request.

    Every field is optional and documents its own default in its description.
    ``get_pdf_pipeline_opts`` in this module turns an instance into the PDF
    pipeline configuration and its cache hash.
    """

    from_formats: Annotated[
        List[InputFormat],
        Field(
            description=(
                "Input format(s) to convert from. String or list of strings. "
                f"Allowed values: {', '.join([v.value for v in InputFormat])}. "
                "Optional, defaults to all formats."
            ),
            examples=[[v.value for v in InputFormat]],
        ),
    ] = [v for v in InputFormat]

    to_formats: Annotated[
        List[OutputFormat],
        Field(
            description=(
                "Output format(s) to convert to. String or list of strings. "
                f"Allowed values: {', '.join([v.value for v in OutputFormat])}. "
                "Optional, defaults to Markdown."
            ),
            examples=[[OutputFormat.MARKDOWN]],
        ),
    ] = [OutputFormat.MARKDOWN]

    image_export_mode: Annotated[
        ImageRefMode,
        Field(
            description=(
                "Image export mode for the document (in case of JSON,"
                " Markdown or HTML). "
                f"Allowed values: {', '.join([v.value for v in ImageRefMode])}. "
                "Optional, defaults to Embedded."
            ),
            examples=[ImageRefMode.EMBEDDED.value],
            # pattern="embedded|placeholder|referenced",
        ),
    ] = ImageRefMode.EMBEDDED

    do_ocr: Annotated[
        bool,
        Field(
            description=(
                "If enabled, the bitmap content will be processed using OCR. "
                "Boolean. Optional, defaults to true"
            ),
            # examples=[True],
        ),
    ] = True

    force_ocr: Annotated[
        bool,
        Field(
            description=(
                "If enabled, replace existing text with OCR-generated "
                "text over content. Boolean. Optional, defaults to false."
            ),
            # examples=[False],
        ),
    ] = False

    # TODO: use a restricted list based on what is installed on the system
    ocr_engine: Annotated[
        OcrEngine,
        Field(
            description=(
                "The OCR engine to use. String. "
                "Allowed values: easyocr, tesseract, rapidocr. "
                "Optional, defaults to easyocr."
            ),
            examples=[OcrEngine.EASYOCR],
        ),
    ] = OcrEngine.EASYOCR

    ocr_lang: Annotated[
        Optional[List[str]],
        Field(
            description=(
                "List of languages used by the OCR engine. "
                "Note that each OCR engine has "
                "different values for the language names. String or list of strings. "
                "Optional, defaults to empty."
            ),
            examples=[["fr", "de", "es", "en"]],
        ),
    ] = None

    pdf_backend: Annotated[
        PdfBackend,
        Field(
            description=(
                "The PDF backend to use. String. "
                f"Allowed values: {', '.join([v.value for v in PdfBackend])}. "
                f"Optional, defaults to {PdfBackend.DLPARSE_V2.value}."
            ),
            examples=[PdfBackend.DLPARSE_V2],
        ),
    ] = PdfBackend.DLPARSE_V2

    # The default is given once, by assignment, like every other field; it was
    # previously also passed positionally to Field().
    table_mode: Annotated[
        TableFormerMode,
        Field(
            description=(
                "Mode to use for table structure, String. "
                f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. "
                "Optional, defaults to fast."
            ),
            examples=[TableFormerMode.FAST],
            # pattern="fast|accurate",
        ),
    ] = TableFormerMode.FAST

    abort_on_error: Annotated[
        bool,
        Field(
            description=(
                "Abort on error if enabled. " "Boolean. Optional, defaults to false."
            ),
            # examples=[False],
        ),
    ] = False

    return_as_file: Annotated[
        bool,
        Field(
            description=(
                "Return the output as a zip file "
                "(will happen anyway if multiple files are generated). "
                "Boolean. Optional, defaults to false."
            ),
            examples=[False],
        ),
    ] = False

    do_table_structure: Annotated[
        bool,
        Field(
            description=(
                "If enabled, the table structure will be extracted. "
                "Boolean. Optional, defaults to true."
            ),
            examples=[True],
        ),
    ] = True

    include_images: Annotated[
        bool,
        Field(
            description=(
                "If enabled, images will be extracted from the document. "
                "Boolean. Optional, defaults to true."
            ),
            examples=[True],
        ),
    ] = True

    images_scale: Annotated[
        float,
        Field(
            description="Scale factor for images. Float. Optional, defaults to 2.0.",
            examples=[2.0],
        ),
    ] = 2.0
|
||||
|
||||
|
||||
class DocumentsConvertBase(BaseModel):
    """Common base of the conversion request bodies: carries the options."""

    # Falls back to an all-defaults options object when the client sends none.
    options: ConvertDocumentsOptions = ConvertDocumentsOptions()
|
||||
|
||||
|
||||
class HttpSource(BaseModel):
    """A document referenced by URL, with optional request headers."""

    # Required: the URL of the document to fetch.
    url: Annotated[
        str,
        Field(
            description="HTTP url to process",
            examples=["https://arxiv.org/pdf/2206.01062"],
        ),
    ]
    # Optional: extra headers for the fetch; empty by default.
    headers: Annotated[
        Dict[str, Any],
        Field(
            description="Additional headers used to fetch the urls, "
            "e.g. authorization, agent, etc"
        ),
    ] = {}
|
||||
|
||||
|
||||
class FileSource(BaseModel):
    """A document sent inline as base64 text together with its filename."""

    base64_string: Annotated[
        str,
        Field(
            description="Content of the file serialized in base64. "
            "For example it can be obtained via "
            "`base64 -w 0 /path/to/file/pdf-to-convert.pdf`."
        ),
    ]
    filename: Annotated[
        str,
        Field(description="Filename of the uploaded document", examples=["file.pdf"]),
    ]

    def to_document_stream(self) -> DocumentStream:
        """Decode the base64 payload into an in-memory ``DocumentStream``."""
        raw_bytes = base64.b64decode(self.base64_string)
        return DocumentStream(stream=BytesIO(raw_bytes), name=self.filename)
|
||||
|
||||
|
||||
class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase):
    """Conversion request whose documents are given as a list of URLs."""

    http_sources: List[HttpSource]
|
||||
|
||||
|
||||
class ConvertDocumentFileSourcesRequest(DocumentsConvertBase):
    """Conversion request whose documents are sent inline as base64 files."""

    file_sources: List[FileSource]
|
||||
|
||||
|
||||
ConvertDocumentsRequest = Union[
|
||||
ConvertDocumentFileSourcesRequest, ConvertDocumentHttpSourcesRequest
|
||||
]
|
||||
|
||||
|
||||
# Document converters will be preloaded and stored in a dictionary
|
||||
converters: Dict[str, DocumentConverter] = {}
|
||||
|
||||
|
||||
# Custom serializer for PdfFormatOption
|
||||
# (model_dump_json does not work with some classes)
|
||||
def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
    """Return a deterministic JSON string describing *pdf_format_option*.

    The string is what gets hashed to key the converter cache, so values that
    are not JSON-serializable are replaced by their ``repr`` and keys are
    sorted.
    """
    payload = pdf_format_option.model_dump()

    # pipeline_options are not fully serialized by model_dump, dedicated pass
    pipeline_opts = pdf_format_option.pipeline_options
    if pipeline_opts:
        payload["pipeline_options"] = pipeline_opts.model_dump()

    # Class objects cannot be dumped to JSON; store their repr instead.
    for class_key in ("pipeline_cls", "backend"):
        payload[class_key] = repr(payload[class_key])

    # NOTE(review): this only inspects a top-level "accelerator_options" key;
    # if those options are nested under "pipeline_options" the branch never
    # applies — confirm against the docling models.
    if "accelerator_options" in payload:
        accelerator = payload["accelerator_options"]
        if "device" in accelerator:
            accelerator["device"] = repr(accelerator["device"])

    # Sorted keys keep the hash stable across runs.
    return json.dumps(payload, sort_keys=True)
|
||||
|
||||
|
||||
# Verifies that the Python package backing an OCR engine can be imported
def _ensure_ocr_engine_available(
    ocr_engine: OcrEngine, module_name: str, required_attr: Optional[str] = None
) -> None:
    """Raise HTTP 400 when the package needed by *ocr_engine* is not importable.

    Args:
        ocr_engine: The engine requested by the client (used in the message).
        module_name: Module that must be importable for the engine to work.
        required_attr: Optional name that must exist in that module.

    Raises:
        HTTPException: 400 if the module (or the required name) is missing.
    """
    import importlib

    try:
        module = importlib.import_module(module_name)
        if required_attr is not None and not hasattr(module, required_attr):
            raise ImportError(f"cannot import name {required_attr!r} from {module_name!r}")
    except ImportError as err:
        raise HTTPException(
            status_code=400,
            detail="The requested OCR engine"
            f" (ocr_engine={ocr_engine.value})"
            " is not available on this system. Please choose another OCR engine "
            "or contact your system administrator.",
        ) from err


# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
def get_pdf_pipeline_opts(
    request: ConvertDocumentsOptions,
) -> Tuple[PdfFormatOption, str]:
    """Build the PDF format option for *request* and a hash identifying it.

    Args:
        request: The conversion options sent by the client.

    Returns:
        A tuple ``(pdf_format_option, options_hash)`` where ``options_hash`` is
        the SHA-1 hex digest of the serialized format option; callers use it
        as the key of the converter cache.

    Raises:
        HTTPException: 400 if the requested OCR engine is not installed.
        RuntimeError: If the OCR engine or PDF backend value is not handled.
    """
    # Pick the OCR options class, checking first that the engine is installed.
    if request.ocr_engine == OcrEngine.EASYOCR:
        _ensure_ocr_engine_available(request.ocr_engine, "easyocr")
        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=request.force_ocr)
    elif request.ocr_engine == OcrEngine.TESSERACT:
        _ensure_ocr_engine_available(request.ocr_engine, "tesserocr")
        ocr_options = TesseractOcrOptions(force_full_page_ocr=request.force_ocr)
    elif request.ocr_engine == OcrEngine.RAPIDOCR:
        _ensure_ocr_engine_available(
            request.ocr_engine, "rapidocr_onnxruntime", "RapidOCR"
        )
        ocr_options = RapidOcrOptions(force_full_page_ocr=request.force_ocr)
    else:
        raise RuntimeError(f"Unexpected OCR engine type {request.ocr_engine}")

    # Only override the engine's default languages when the client sent some.
    if request.ocr_lang is not None:
        if isinstance(request.ocr_lang, str):
            ocr_options.lang = _to_list_of_strings(request.ocr_lang)
        else:
            ocr_options.lang = request.ocr_lang

    pipeline_options = PdfPipelineOptions(
        do_ocr=request.do_ocr,
        ocr_options=ocr_options,
        do_table_structure=request.do_table_structure,
    )
    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
    pipeline_options.table_structure_options.mode = TableFormerMode(request.table_mode)

    # Page images are only generated when the export will actually carry images.
    if request.image_export_mode != ImageRefMode.PLACEHOLDER:
        pipeline_options.generate_page_images = True
        if request.images_scale:
            pipeline_options.images_scale = request.images_scale

    if request.pdf_backend == PdfBackend.DLPARSE_V1:
        backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
    elif request.pdf_backend == PdfBackend.DLPARSE_V2:
        backend = DoclingParseV2DocumentBackend
    elif request.pdf_backend == PdfBackend.PYPDFIUM2:
        backend = PyPdfiumDocumentBackend
    else:
        raise RuntimeError(f"Unexpected PDF backend type {request.pdf_backend}")

    pdf_format_option = PdfFormatOption(
        pipeline_options=pipeline_options,
        backend=backend,
    )

    serialized_data = _serialize_pdf_format_option(pdf_format_option)

    options_hash = hashlib.sha1(serialized_data.encode()).hexdigest()

    return pdf_format_option, options_hash
|
||||
|
||||
|
||||
def convert_documents(
    sources: Iterable[Union[Path, str, DocumentStream]],
    options: ConvertDocumentsOptions,
    headers: Optional[Dict[str, Any]] = None,
):
    """Convert *sources* with the converter matching *options*.

    A ``DocumentConverter`` is built the first time an options hash is seen and
    then reused from the module-level ``converters`` cache. The value returned
    is whatever ``convert_all`` returns (annotated here as an iterator of
    ``ConversionResult``).
    """
    pdf_format_option, options_hash = get_pdf_pipeline_opts(options)

    converter = converters.get(options_hash)
    if converter is None:
        # The same PDF pipeline configuration serves both PDF and image inputs.
        format_options: Dict[InputFormat, FormatOption] = {
            InputFormat.PDF: pdf_format_option,
            InputFormat.IMAGE: pdf_format_option,
        }
        converter = DocumentConverter(format_options=format_options)
        converters[options_hash] = converter
        _log.info(f"We now have {len(converters)} converters in memory.")

    results: Iterator[ConversionResult] = converter.convert_all(
        sources,
        headers=headers,
    )
    return results
|
||||
635
docling_serve/gradio_ui.py
Normal file
635
docling_serve/gradio_ui.py
Normal file
@@ -0,0 +1,635 @@
|
||||
import importlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import gradio as gr
|
||||
import requests
|
||||
|
||||
from docling_serve.helper_functions import _to_list_of_strings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
#################
|
||||
# CSS and theme #
|
||||
#################
|
||||
|
||||
css = """
|
||||
#logo {
|
||||
border-style: none;
|
||||
background: none;
|
||||
box-shadow: none;
|
||||
min-width: 80px;
|
||||
}
|
||||
#dark_mode_column {
|
||||
display: flex;
|
||||
align-content: flex-end;
|
||||
}
|
||||
#title {
|
||||
text-align: left;
|
||||
display:block;
|
||||
height: auto;
|
||||
padding-top: 5px;
|
||||
line-height: 0;
|
||||
}
|
||||
.title-text h1 > p, .title-text p {
|
||||
margin-top: 0px !important;
|
||||
margin-bottom: 2px !important;
|
||||
}
|
||||
#custom-container {
|
||||
border: 0.909091px solid;
|
||||
padding: 10px;
|
||||
border-radius: 4px;
|
||||
}
|
||||
#custom-container h4 {
|
||||
font-size: 14px;
|
||||
}
|
||||
#file_input_zone {
|
||||
height: 140px;
|
||||
}
|
||||
"""
|
||||
|
||||
theme = gr.themes.Default(
|
||||
text_size="md",
|
||||
spacing_size="md",
|
||||
font=[
|
||||
gr.themes.GoogleFont("Red Hat Display"),
|
||||
"ui-sans-serif",
|
||||
"system-ui",
|
||||
"sans-serif",
|
||||
],
|
||||
font_mono=[
|
||||
gr.themes.GoogleFont("Red Hat Mono"),
|
||||
"ui-monospace",
|
||||
"Consolas",
|
||||
"monospace",
|
||||
],
|
||||
)
|
||||
|
||||
#############
|
||||
# Variables #
|
||||
#############
|
||||
|
||||
gradio_output_dir = None # Will be set by FastAPI when mounted
|
||||
file_output_path = None # Will be set when a new file is generated
|
||||
|
||||
#############
|
||||
# Functions #
|
||||
#############
|
||||
|
||||
|
||||
def health_check():
    """Query the local ``/health`` endpoint and report "Healthy" or "Unhealthy".

    The port comes from the ``PORT`` environment variable (default 5001).
    """
    port = int(os.getenv('PORT', '5001'))
    response = requests.get(f"http://localhost:{port}/health")
    return "Healthy" if response.status_code == 200 else "Unhealthy"
|
||||
|
||||
|
||||
def set_options_visibility(x):
    """Return an "Options" accordion whose open state is *x*."""
    return gr.Accordion("Options", open=x)
|
||||
|
||||
|
||||
def set_outputs_visibility_direct(x, y):
    """Return (content row, file row) with visibility *x* and *y* respectively."""
    return gr.Row(visible=x), gr.Row(visible=y)
|
||||
|
||||
|
||||
def set_outputs_visibility_process(x):
    """Return (content row, file row): the file row is visible iff *x* is true."""
    content_row = gr.Row(visible=not x)
    file_row = gr.Row(visible=x)
    return content_row, file_row
|
||||
|
||||
|
||||
def set_download_button_label(label_text: gr.State):
    """Return a download button labelled with ``str(label_text)``."""
    # NOTE(review): annotated as gr.State, but presumably receives the state's
    # value at call time — confirm against the event wiring.
    return gr.DownloadButton(label=str(label_text), scale=1)
|
||||
|
||||
|
||||
def clear_outputs():
    """Return seven empty strings, one per output component to reset.

    Order: markdown (twice), JSON, HTML (twice), text, doctags.
    """
    blank = ""
    return (blank, blank, blank, blank, blank, blank, blank)
|
||||
|
||||
|
||||
def clear_url_input():
    """Return the empty string used to reset the URL input."""
    return ""
|
||||
|
||||
|
||||
def clear_file_input():
    """Return ``None``, used to reset the file input."""
    return None
|
||||
|
||||
|
||||
def auto_set_return_as_file(url_input, file_input, image_export_mode):
    """Decide whether the output has to be returned as a file.

    True when several comma-separated URLs are given, when more than one file
    is selected, or when the image export mode is "referenced".
    """
    # If more than one input source is provided, return as file
    if len(url_input.split(",")) > 1:
        return True
    if file_input and len(file_input) > 1:
        return True
    return bool(image_export_mode == "referenced")
|
||||
|
||||
|
||||
def change_ocr_lang(ocr_engine):
    """Return the default comma-separated language list for *ocr_engine*.

    Returns ``None`` for an engine name that is not listed here.
    """
    if ocr_engine == "easyocr":
        return "en,fr,de,es"
    # Both Tesseract flavours share the same language codes.
    if ocr_engine in ("tesseract_cli", "tesseract"):
        return "eng,fra,deu,spa"
    if ocr_engine == "rapidocr":
        return "english,chinese"
    return None
|
||||
|
||||
|
||||
def process_url(
    input_sources,
    to_formats,
    image_export_mode,
    ocr,
    force_ocr,
    ocr_engine,
    ocr_lang,
    pdf_backend,
    table_mode,
    abort_on_error,
    return_as_file,
):
    """Send the comma-separated URLs in *input_sources* to the convert API.

    Builds the JSON body for ``/v1alpha/convert/source`` on the local server
    (port from ``PORT``, default 5001) and maps the response to the UI outputs
    via ``response_to_output``.

    Raises:
        gr.Error: If no URL was provided, the request could not be sent, or
            the server answered with a non-200 status.
    """
    parameters = {
        "http_sources": [{"url": source} for source in input_sources.split(",")],
        "options": {
            "to_formats": to_formats,
            "image_export_mode": image_export_mode,
            # The options model names this field `do_ocr`; the former key
            # `ocr` matched no field, so the checkbox had no effect.
            "do_ocr": ocr,
            "force_ocr": force_ocr,
            "ocr_engine": ocr_engine,
            "ocr_lang": _to_list_of_strings(ocr_lang),
            "pdf_backend": pdf_backend,
            "table_mode": table_mode,
            "abort_on_error": abort_on_error,
            "return_as_file": return_as_file,
        },
    }
    # "".split(",") yields [""], so an empty textbox shows up as one empty URL.
    if (
        not parameters["http_sources"]
        or len(parameters["http_sources"]) == 0
        or parameters["http_sources"][0]["url"] == ""
    ):
        logger.error("No input sources provided.")
        raise gr.Error("No input sources provided.", print_exception=False)
    try:
        response = requests.post(
            f"http://localhost:{int(os.getenv('PORT', '5001'))}/v1alpha/convert/source",
            json=parameters,
        )
    except Exception as e:
        logger.error(f"Error processing URL: {e}")
        raise gr.Error(f"Error processing URL: {e}", print_exception=False)
    if response.status_code != 200:
        data = response.json()
        error_message = data.get("detail", "An unknown error occurred.")
        logger.error(f"Error processing file: {error_message}")
        raise gr.Error(f"Error processing file: {error_message}", print_exception=False)
    output = response_to_output(response, return_as_file)
    return output
|
||||
|
||||
|
||||
def process_file(
    files,
    to_formats,
    image_export_mode,
    ocr,
    force_ocr,
    ocr_engine,
    ocr_lang,
    pdf_backend,
    table_mode,
    abort_on_error,
    return_as_file,
):
    """Upload *files* to the convert API as a multipart form.

    Posts to ``/v1alpha/convert/file`` on the local server (port from
    ``PORT``, default 5001) and maps the response to the UI outputs via
    ``response_to_output``. Booleans are sent as lowercase strings.

    Raises:
        gr.Error: If no file was provided, the request could not be sent, or
            the server answered with a non-200 status.
    """
    from contextlib import ExitStack

    if not files or len(files) == 0 or files[0] == "":
        logger.error("No files provided.")
        raise gr.Error("No files provided.", print_exception=False)

    parameters = {
        "to_formats": to_formats,
        "image_export_mode": image_export_mode,
        # Field is named `do_ocr` in the options model; `ocr` matched nothing.
        # NOTE(review): assumes the form field names mirror the model — confirm.
        "do_ocr": str(ocr).lower(),
        "force_ocr": str(force_ocr).lower(),
        "ocr_engine": ocr_engine,
        "ocr_lang": _to_list_of_strings(ocr_lang),
        "pdf_backend": pdf_backend,
        "table_mode": table_mode,
        "abort_on_error": str(abort_on_error).lower(),
        "return_as_file": str(return_as_file).lower(),
    }

    # The ExitStack closes every opened upload once the request has been sent;
    # previously the file handles were never closed.
    with ExitStack() as stack:
        files_data = [
            ("files", (file.name, stack.enter_context(open(file.name, "rb"))))
            for file in files
        ]
        try:
            response = requests.post(
                f"http://localhost:{int(os.getenv('PORT', '5001'))}/v1alpha/convert/file",
                files=files_data,
                data=parameters,
            )
        except Exception as e:
            logger.error(f"Error processing file(s): {e}")
            raise gr.Error(f"Error processing file(s): {e}", print_exception=False)

    if response.status_code != 200:
        data = response.json()
        error_message = data.get("detail", "An unknown error occurred.")
        logger.error(f"Error processing file: {error_message}")
        raise gr.Error(f"Error processing file: {error_message}", print_exception=False)
    output = response_to_output(response, return_as_file)
    return output
|
||||
|
||||
|
||||
def response_to_output(response, return_as_file):
    """Turn a conversion HTTP response into the values for the UI outputs.

    Args:
        response: HTTP response returned by the conversion endpoint.
        return_as_file: When truthy, the response body is saved to a fresh
            temporary directory under ``gradio_output_dir`` and offered through
            a visible download button; otherwise the body is parsed as JSON
            and the fields of its ``document`` entry are returned.

    Returns:
        An 8-tuple: markdown (twice, for the raw and rendered views), JSON
        text, HTML (twice, raw and rendered), plain text, doctags, and the
        ``gr.DownloadButton``.
    """
    markdown_content = ""
    json_content = ""
    html_content = ""
    text_content = ""
    doctags_content = ""
    download_button = gr.DownloadButton(visible=False, label="Download Output", scale=1)

    if return_as_file:
        # A missing header must not crash on ``None.split``.
        disposition = response.headers.get("Content-Disposition") or ""
        _, found, tail = disposition.partition("filename=")
        # Keep only the final path component so a header value containing
        # directory parts cannot write outside the temporary directory.
        filename = Path(tail.strip('"')).name if found else ""
        if not filename:
            # Fallback name when the header carries no usable filename.
            filename = "converted_docs.zip"

        tmp_output_dir = Path(tempfile.mkdtemp(dir=gradio_output_dir, prefix="ui_"))
        file_output_path = f"{tmp_output_dir}/{filename}"
        with open(file_output_path, "wb") as f:
            f.write(response.content)
        download_button = gr.DownloadButton(
            visible=True, label=f"Download {filename}", scale=1, value=file_output_path
        )
    else:
        document = response.json().get("document")
        markdown_content = document.get("md_content")
        json_content = json.dumps(document.get("json_content"), indent=2)
        html_content = document.get("html_content")
        text_content = document.get("text_content")
        doctags_content = document.get("doctags_content")

    return (
        markdown_content,
        markdown_content,
        json_content,
        html_content,
        html_content,
        text_content,
        doctags_content,
        download_button,
    )
|
||||
|
||||
|
||||
############
|
||||
# UI Setup #
|
||||
############
|
||||
|
||||
with gr.Blocks(
    css=css,
    theme=theme,
    title="Docling Serve",
    delete_cache=(3600, 3600),  # Delete all files older than 1 hour every hour
) as ui:
    # NOTE(review): the nesting of the layout below was reconstructed from a
    # source that had lost its indentation; confirm against the rendered UI.

    # Constants kept in State components so they can be used as event inputs.
    processing_text = gr.State("Processing your document(s), please wait...")
    true_bool = gr.State(True)
    false_bool = gr.State(False)

    # ---- Banner: logo, title and dark/light toggle ----
    with gr.Row(elem_id="check_health"):
        with gr.Column(scale=1, min_width=90):
            gr.Image(
                "https://ds4sd.github.io/docling/assets/logo.png",
                height=80,
                width=80,
                show_download_button=False,
                show_label=False,
                show_fullscreen_button=False,
                container=False,
                elem_id="logo",
                scale=0,
            )
        with gr.Column(scale=1, min_width=200):
            gr.Markdown(
                f"# Docling Serve \n(docling version: "
                f"{importlib.metadata.version('docling')})",
                elem_id="title",
                elem_classes=["title-text"],
            )
        with gr.Column(scale=16, elem_id="dark_mode_column"):
            dark_mode_btn = gr.Button("Dark/Light Mode", scale=0)
            # Pure client-side handler: toggles the 'dark' CSS class.
            dark_mode_btn.click(
                None,
                None,
                None,
                js="""() => {
                    if (document.querySelectorAll('.dark').length) {
                        document.querySelectorAll('.dark').forEach(
                            el => el.classList.remove('dark')
                        );
                    } else {
                        document.querySelector('body').classList.add('dark');
                    }
                }""",
                show_api=False,
            )

    # ---- Input tab: URLs ----
    with gr.Tab("Convert URL(s)"):
        with gr.Row():
            with gr.Column(scale=4):
                url_input = gr.Textbox(
                    label="Input Sources (comma-separated URLs)",
                    placeholder="https://arxiv.org/pdf/2206.01062",
                )
            with gr.Column(scale=1):
                url_process_btn = gr.Button("Process URL(s)", scale=1)
                url_reset_btn = gr.Button("Reset", scale=1)

    # ---- Input tab: file uploads ----
    with gr.Tab("Convert File(s)"):
        with gr.Row():
            with gr.Column(scale=4):
                file_input = gr.File(
                    elem_id="file_input_zone",
                    label="Upload Files",
                    file_types=[
                        ".pdf",
                        ".docx",
                        ".pptx",
                        ".html",
                        ".xlsx",
                        ".asciidoc",
                        ".txt",
                        ".md",
                        ".jpg",
                        ".jpeg",
                        ".png",
                        ".gif",
                    ],
                    file_count="multiple",
                    scale=4,
                )
            with gr.Column(scale=1):
                file_process_btn = gr.Button("Process File(s)", scale=1)
                file_reset_btn = gr.Button("Reset", scale=1)

    # ---- Conversion options ----
    with gr.Accordion("Options") as options:
        with gr.Row():
            with gr.Column(scale=1):
                to_formats = gr.CheckboxGroup(
                    [
                        ("Markdown", "md"),
                        ("Docling (JSON)", "json"),
                        ("HTML", "html"),
                        ("Plain Text", "text"),
                        ("Doc Tags", "doctags"),
                    ],
                    label="To Formats",
                    value=["md"],
                )
            with gr.Column(scale=1):
                image_export_mode = gr.Radio(
                    [
                        ("Embedded", "embedded"),
                        ("Placeholder", "placeholder"),
                        ("Referenced", "referenced"),
                    ],
                    label="Image Export Mode",
                    value="embedded",
                )
        with gr.Row():
            with gr.Column(scale=1, min_width=200):
                ocr = gr.Checkbox(label="Enable OCR", value=True)
                force_ocr = gr.Checkbox(label="Force OCR", value=False)
            with gr.Column(scale=1):
                ocr_engine = gr.Radio(
                    [
                        ("EasyOCR", "easyocr"),
                        ("Tesseract", "tesseract"),
                        ("RapidOCR", "rapidocr"),
                    ],
                    label="OCR Engine",
                    value="easyocr",
                )
            with gr.Column(scale=1, min_width=200):
                ocr_lang = gr.Textbox(
                    label="OCR Language (beware of the format)", value="en,fr,de,es"
                )
            ocr_engine.change(change_ocr_lang, inputs=[ocr_engine], outputs=[ocr_lang])
        with gr.Row():
            with gr.Column(scale=2):
                pdf_backend = gr.Radio(
                    ["pypdfium2", "dlparse_v1", "dlparse_v2"],
                    label="PDF Backend",
                    value="dlparse_v2",
                )
            with gr.Column(scale=2):
                table_mode = gr.Radio(
                    ["fast", "accurate"], label="Table Mode", value="fast"
                )
            with gr.Column(scale=1):
                abort_on_error = gr.Checkbox(label="Abort on Error", value=False)
                return_as_file = gr.Checkbox(label="Return as File", value=False)

    # ---- Outputs: one tab per representation ----
    with gr.Row(visible=False) as content_output:
        with gr.Tab("Markdown"):
            output_markdown = gr.Code(
                language="markdown", wrap_lines=True, show_label=False
            )
        with gr.Tab("Markdown-Rendered"):
            output_markdown_rendered = gr.Markdown(label="Response")
        with gr.Tab("Docling (JSON)"):
            output_json = gr.Code(language="json", wrap_lines=True, show_label=False)
        with gr.Tab("HTML"):
            output_html = gr.Code(language="html", wrap_lines=True, show_label=False)
        with gr.Tab("HTML-Rendered"):
            output_html_rendered = gr.HTML(label="Response")
        with gr.Tab("Text"):
            output_text = gr.Code(wrap_lines=True, show_label=False)
        with gr.Tab("DocTags"):
            output_doctags = gr.Code(wrap_lines=True, show_label=False)

    # ---- Output: file download ----
    with gr.Row(visible=False) as file_output:
        download_file_btn = gr.DownloadButton(label="Placeholder", scale=1)

    ##############
    # UI Actions #
    ##############

    # Component groups shared by the event chains below.
    content_components = [
        output_markdown,
        output_markdown_rendered,
        output_json,
        output_html,
        output_html_rendered,
        output_text,
        output_doctags,
    ]
    conversion_inputs = [
        to_formats,
        image_export_mode,
        ocr,
        force_ocr,
        ocr_engine,
        ocr_lang,
        pdf_backend,
        table_mode,
        abort_on_error,
        return_as_file,
    ]

    # Handle Return as File: re-evaluated whenever any of these three change.
    for watched in (url_input, file_input, image_export_mode):
        watched.change(
            auto_set_return_as_file,
            inputs=[url_input, file_input, image_export_mode],
            outputs=[return_as_file],
        )

    def _wire_processing(button, handler, source):
        # Click chain: options visibility, download label, outputs visibility,
        # clear previous content, then run the conversion handler.
        button.click(
            set_options_visibility, inputs=[false_bool], outputs=[options]
        ).then(
            set_download_button_label,
            inputs=[processing_text],
            outputs=[download_file_btn],
        ).then(
            set_outputs_visibility_process,
            inputs=[return_as_file],
            outputs=[content_output, file_output],
        ).then(
            clear_outputs,
            inputs=None,
            outputs=list(content_components),
        ).then(
            handler,
            inputs=[source, *conversion_inputs],
            outputs=[*content_components, download_file_btn],
        )

    def _wire_reset(button, clear_input, target):
        # Click chain: clear content, options visibility, outputs visibility,
        # then clear the input component.
        button.click(
            clear_outputs,
            inputs=None,
            outputs=list(content_components),
        ).then(
            set_options_visibility, inputs=[true_bool], outputs=[options]
        ).then(
            set_outputs_visibility_direct,
            inputs=[false_bool, false_bool],
            outputs=[content_output, file_output],
        ).then(
            clear_input, inputs=None, outputs=[target]
        )

    # URL processing
    _wire_processing(url_process_btn, process_url, url_input)
    _wire_reset(url_reset_btn, clear_url_input, url_input)

    # File processing
    _wire_processing(file_process_btn, process_file, file_input)
    _wire_reset(file_reset_btn, clear_file_input, file_input)
|
||||
62
docling_serve/helper_functions.py
Normal file
62
docling_serve/helper_functions.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import inspect
|
||||
import re
|
||||
from typing import List, Type, Union
|
||||
|
||||
from fastapi import Depends, Form
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
# Adapted from
|
||||
# https://github.com/fastapi/fastapi/discussions/8971#discussioncomment-7892972
|
||||
def FormDepends(cls: Type[BaseModel]):
    """Build a FastAPI dependency that reads ``cls``'s fields as form fields.

    One form parameter is synthesised per model field: ``Form(...)`` when the
    field is required, otherwise ``Form(<field default>)``. The resulting
    signature is attached to an async factory that instantiates ``cls`` from
    the received values.

    Args:
        cls: The pydantic model class to populate from form data.

    Returns:
        ``Depends(...)`` wrapping the generated factory.
    """

    def _form_default(model_field):
        # Required fields get the Ellipsis marker, optional ones their default.
        if model_field.is_required():
            return Form(...)
        return Form(model_field.default)

    form_parameters = [
        inspect.Parameter(
            name=field_name,
            kind=inspect.Parameter.POSITIONAL_ONLY,
            default=_form_default(model_field),
            annotation=model_field.annotation,
        )
        for field_name, model_field in cls.model_fields.items()
    ]

    async def as_form_func(**data):
        return cls(**data)

    # Replace the generic ``**data`` signature with the per-field one.
    as_form_func.__signature__ = inspect.signature(as_form_func).replace(  # type: ignore
        parameters=form_parameters
    )
    return Depends(as_form_func)
|
||||
|
||||
|
||||
def _to_list_of_strings(input_value: Union[str, List[str]]) -> List[str]:
|
||||
def split_and_strip(value: str) -> List[str]:
|
||||
if re.search(r"[;,]", value):
|
||||
return [item.strip() for item in re.split(r"[;,]", value)]
|
||||
else:
|
||||
return [value.strip()]
|
||||
|
||||
if isinstance(input_value, str):
|
||||
return split_and_strip(input_value)
|
||||
elif isinstance(input_value, list):
|
||||
result = []
|
||||
for item in input_value:
|
||||
result.extend(split_and_strip(str(item)))
|
||||
return result
|
||||
else:
|
||||
raise ValueError("Invalid input: must be a string or a list of strings.")
|
||||
|
||||
|
||||
# Helper functions to parse inputs coming as Form objects
|
||||
def _str_to_bool(value: Union[str, bool]) -> bool:
|
||||
if isinstance(value, bool):
|
||||
return value # Already a boolean, return as-is
|
||||
if isinstance(value, str):
|
||||
value = value.strip().lower() # Normalize input
|
||||
return value in ("true", "1", "yes")
|
||||
return False # Default to False if none of the above matches
|
||||
248
docling_serve/response_preparation.py
Normal file
248
docling_serve/response_preparation.py
Normal file
@@ -0,0 +1,248 @@
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Union
|
||||
|
||||
from docling.datamodel.base_models import OutputFormat
|
||||
from docling.datamodel.document import ConversionResult, ConversionStatus, ErrorItem
|
||||
from docling.utils.profiling import ProfilingItem
|
||||
from docling_core.types.doc import DoclingDocument, ImageRefMode
|
||||
from fastapi import BackgroundTasks, HTTPException
|
||||
from fastapi.responses import FileResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling_serve.docling_conversion import ConvertDocumentsOptions
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentResponse(BaseModel):
    # Exported representations of one converted document. Every content field
    # defaults to None and is only filled for the formats that were requested.

    # Name of the input file the document was converted from.
    filename: str
    # Markdown export.
    md_content: Optional[str] = None
    # The Docling document itself, serialised as JSON.
    json_content: Optional[DoclingDocument] = None
    # HTML export.
    html_content: Optional[str] = None
    # Plain-text export (markdown export with strict_text=True).
    text_content: Optional[str] = None
    # Document-tokens (DocTags) export.
    doctags_content: Optional[str] = None
|
||||
|
||||
|
||||
class ConvertDocumentResponse(BaseModel):
    # JSON body returned when a single document is converted and not
    # returned as a file.

    # Exported content of the converted document.
    document: DocumentResponse
    # Conversion status reported for the document.
    status: ConversionStatus
    # Conversion errors; empty by default.
    errors: List[ErrorItem] = []
    # Elapsed time of the conversion in seconds (monotonic clock).
    processing_time: float
    # Profiling items keyed by name, taken from the conversion result.
    timings: Dict[str, ProfilingItem] = {}
|
||||
|
||||
|
||||
class ConvertDocumentErrorResponse(BaseModel):
    # Response model that carries only a conversion status.
    status: ConversionStatus
|
||||
|
||||
|
||||
def _export_document_as_content(
    conv_res: ConversionResult,
    export_json: bool,
    export_html: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
    image_mode: ImageRefMode,
):
    """Export one conversion result into an in-memory ``DocumentResponse``.

    Args:
        conv_res: The conversion result to export.
        export_json, export_html, export_md, export_txt, export_doctags:
            Flags selecting which representations are produced.
        image_mode: Image reference mode applied to the exported content.

    Returns:
        A ``DocumentResponse`` carrying the requested representations.

    Raises:
        HTTPException: 400 when the conversion was skipped, 500 for any other
            non-successful status; ``detail`` carries the conversion errors.
    """
    document = DocumentResponse(filename=conv_res.input.file.name)
    status = conv_res.status

    # Guard clauses: anything but a successful conversion is an HTTP error.
    if status == ConversionStatus.SKIPPED:
        raise HTTPException(status_code=400, detail=conv_res.errors)
    if status != ConversionStatus.SUCCESS:
        raise HTTPException(status_code=500, detail=conv_res.errors)

    new_doc = conv_res.document._make_copy_with_refmode(Path(), image_mode)

    # Create the different formats
    if export_json:
        document.json_content = new_doc
    if export_html:
        document.html_content = new_doc.export_to_html(image_mode=image_mode)
    if export_txt:
        document.text_content = new_doc.export_to_markdown(
            strict_text=True, image_mode=image_mode
        )
    if export_md:
        document.md_content = new_doc.export_to_markdown(image_mode=image_mode)
    if export_doctags:
        document.doctags_content = new_doc.export_to_document_tokens()

    return document
|
||||
|
||||
|
||||
def _export_documents_as_files(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
    export_json: bool,
    export_html: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
    image_export_mode: ImageRefMode,
):
    """Write the requested representations of each result into ``output_dir``.

    Files are named after the stem of the input file with the extension
    ``.json``, ``.html``, ``.txt``, ``.md`` or ``.doctags``. Results whose
    status is not SUCCESS are logged as failures and skipped. A summary line
    is logged at the end.
    """
    total_count = 0
    failure_count = 0

    for conv_res in conv_results:
        total_count += 1

        if conv_res.status != ConversionStatus.SUCCESS:
            _log.warning(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        doc_filename = conv_res.input.file.stem

        if export_json:
            fname = output_dir / f"{doc_filename}.json"
            _log.info(f"writing JSON output to {fname}")
            conv_res.document.save_as_json(filename=fname, image_mode=image_export_mode)

        if export_html:
            fname = output_dir / f"{doc_filename}.html"
            _log.info(f"writing HTML output to {fname}")
            conv_res.document.save_as_html(filename=fname, image_mode=image_export_mode)

        if export_txt:
            fname = output_dir / f"{doc_filename}.txt"
            _log.info(f"writing TXT output to {fname}")
            # The text export always uses placeholders for images.
            conv_res.document.save_as_markdown(
                filename=fname,
                strict_text=True,
                image_mode=ImageRefMode.PLACEHOLDER,
            )

        if export_md:
            fname = output_dir / f"{doc_filename}.md"
            _log.info(f"writing Markdown output to {fname}")
            conv_res.document.save_as_markdown(
                filename=fname, image_mode=image_export_mode
            )

        if export_doctags:
            fname = output_dir / f"{doc_filename}.doctags"
            _log.info(f"writing Doc Tags output to {fname}")
            conv_res.document.save_as_document_tokens(filename=fname)

    _log.info(f"Processed {total_count} docs, of which {failure_count} failed")
|
||||
|
||||
|
||||
def process_results(
    background_tasks: BackgroundTasks,
    conversion_options: ConvertDocumentsOptions,
    conv_results: Iterable[ConversionResult],
) -> Union[ConvertDocumentResponse, FileResponse]:
    """Run the conversion and build the HTTP response.

    ``conv_results`` is consumed here; materialising it is what triggers the
    conversion, so the elapsed time is measured around that step.

    Args:
        background_tasks: Receives the task that deletes the temporary work
            directory after a file response has been sent.
        conversion_options: Requested output formats, image export mode and
            the ``return_as_file`` flag.
        conv_results: Lazy iterable of conversion results.

    Returns:
        A ``ConvertDocumentResponse`` when exactly one document was converted
        and a file was not requested; otherwise a ``FileResponse`` serving a
        zip archive of the exported files.

    Raises:
        HTTPException: 500 when the conversion raises, yields no results, or
            no file is exported; other statuses may come from
            ``_export_document_as_content``.
    """
    # Let's start by processing the documents
    try:
        start_time = time.monotonic()

        # Materialising the lazy iterable starts the conversion and lets us
        # count the results.
        results: List[ConversionResult] = list(conv_results)

        processing_time = time.monotonic() - start_time
        _log.info(f"Processed {len(results)} docs in {processing_time:.2f} seconds.")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    if not results:
        raise HTTPException(
            status_code=500, detail="No documents were generated by Docling."
        )

    # We have some results, let's prepare the response
    response: Union[FileResponse, ConvertDocumentResponse]

    # Booleans to know what to export
    export_json = OutputFormat.JSON in conversion_options.to_formats
    export_html = OutputFormat.HTML in conversion_options.to_formats
    export_md = OutputFormat.MARKDOWN in conversion_options.to_formats
    export_txt = OutputFormat.TEXT in conversion_options.to_formats
    export_doctags = OutputFormat.DOCTAGS in conversion_options.to_formats

    # Only 1 document was processed, and we are not returning it as a file
    if len(results) == 1 and not conversion_options.return_as_file:
        conv_res = results[0]
        document = _export_document_as_content(
            conv_res,
            export_json=export_json,
            export_html=export_html,
            export_md=export_md,
            export_txt=export_txt,
            export_doctags=export_doctags,
            image_mode=conversion_options.image_export_mode,
        )
        response = ConvertDocumentResponse(
            document=document,
            status=conv_res.status,
            processing_time=processing_time,
            timings=conv_res.timings,
        )

    # Multiple documents were processed, or we are forced returning as a file
    else:
        # Temporary directory to store the outputs
        work_dir = Path(tempfile.mkdtemp(prefix="docling_"))
        try:
            output_dir = work_dir / "output"
            output_dir.mkdir(parents=True, exist_ok=True)

            # Export the documents
            _export_documents_as_files(
                conv_results=results,
                output_dir=output_dir,
                export_json=export_json,
                export_html=export_html,
                export_md=export_md,
                export_txt=export_txt,
                export_doctags=export_doctags,
                image_export_mode=conversion_options.image_export_mode,
            )

            if not os.listdir(output_dir):
                raise HTTPException(
                    status_code=500, detail="No documents were exported."
                )

            file_path = work_dir / "converted_docs.zip"
            shutil.make_archive(
                base_name=str(file_path.with_suffix("")),
                format="zip",
                root_dir=output_dir,
            )
        except Exception:
            # Nothing will be served from work_dir on failure, and the cleanup
            # task below has not been registered yet, so remove it now.
            shutil.rmtree(work_dir, ignore_errors=True)
            raise

        # Remove the work directory once the response has been sent.
        background_tasks.add_task(shutil.rmtree, work_dir, ignore_errors=True)

        response = FileResponse(
            file_path, filename=file_path.name, media_type="application/zip"
        )

    return response
|
||||
BIN
img/swagger.png
Normal file
BIN
img/swagger.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 24 KiB |
BIN
img/ui-input.png
Normal file
BIN
img/ui-input.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 64 KiB |
BIN
img/ui-output.png
Normal file
BIN
img/ui-output.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 124 KiB |
36
models_download.py
Normal file
36
models_download.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import os
|
||||
import zipfile
|
||||
|
||||
import requests
|
||||
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
|
||||
# Download Docling models
StandardPdfPipeline.download_models_hf(force=True)
load_pretrained_nlp_models(verbose=True)

# Download EasyOCR models
urls = [
    "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip",
    "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip",
]

local_zip_paths = [
    "/opt/app-root/src/latin_g2.zip",
    "/opt/app-root/src/craft_mlt_25k.zip",
]

extract_path = "/opt/app-root/src/.EasyOCR/model/"

for url, local_zip_path in zip(urls, local_zip_paths):
    # Download the file. Fail loudly on an HTTP error instead of saving an
    # error page and trying to unzip it; the timeout keeps a stalled
    # connection from hanging forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(local_zip_path, "wb") as file:
        file.write(response.content)

    try:
        # Unzip the file
        with zipfile.ZipFile(local_zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_path)
    finally:
        # Clean up the zip file, even when extraction fails
        os.remove(local_zip_path)
|
||||
8
os-packages.txt
Normal file
8
os-packages.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
tesseract
|
||||
tesseract-devel
|
||||
tesseract-langpack-eng
|
||||
leptonica-devel
|
||||
libglvnd-glx
|
||||
glib2
|
||||
wget
|
||||
git
|
||||
2231
poetry.lock
generated
2231
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "docling-serve"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
description = "Running Docling as a service"
|
||||
license = "MIT"
|
||||
authors = [
|
||||
@@ -30,11 +30,14 @@ classifiers = [
|
||||
]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.9"
|
||||
docling = "^2.10.0"
|
||||
python = ">=3.10,<3.13" # 3.10 needed for Gradio, and no torchvision build for 3.13 yet
|
||||
docling = "^2.14.0"
|
||||
fastapi = {version = "^0.115.6", extras = ["standard"]}
|
||||
uvicorn = "^0.32.1"
|
||||
gradio = { version = "^5.9.1", optional = true }
|
||||
uvicorn = "~0.29.0"
|
||||
pydantic = "^2.10.3"
|
||||
pydantic-settings = "^2.4.0"
|
||||
python-multipart = "^0.0.19"
|
||||
httpx = "^0.28.1"
|
||||
tesserocr = { version = "^2.7.1", optional = true }
|
||||
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
|
||||
@@ -47,6 +50,7 @@ onnxruntime = [
|
||||
|
||||
|
||||
[tool.poetry.extras]
|
||||
ui = ["gradio"]
|
||||
tesserocr = ["tesserocr"]
|
||||
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
|
||||
|
||||
@@ -89,7 +93,9 @@ isort = "^5.13.2"
|
||||
pre-commit = "^3.8.0"
|
||||
autoflake = "^2.3.1"
|
||||
flake8 = "^7.1.1"
|
||||
pytest = "^8.3.2"
|
||||
pytest = "^8.3.4"
|
||||
pytest-asyncio = "^0.24.0"
|
||||
pytest-check = "^2.4.1"
|
||||
mypy = "^1.11.2"
|
||||
|
||||
[build-system]
|
||||
@@ -125,5 +131,22 @@ module = [
|
||||
"easyocr.*",
|
||||
"tesserocr.*",
|
||||
"rapidocr_onnxruntime.*",
|
||||
"docling_conversion.*",
|
||||
"gradio_ui.*",
|
||||
"response_preparation.*",
|
||||
"helper_functions.*",
|
||||
"requests.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
asyncio_mode = "auto"
|
||||
asyncio_default_fixture_loop_scope = "function"
|
||||
minversion = "8.2"
|
||||
testpaths = [
|
||||
"tests",
|
||||
]
|
||||
addopts = "-rA --color=yes --tb=short --maxfail=5"
|
||||
markers = [
|
||||
"asyncio",
|
||||
]
|
||||
|
||||
30
start_server.sh
Executable file
30
start_server.sh
Executable file
@@ -0,0 +1,30 @@
|
||||
#!/bin/bash
# Launch the docling-serve FastAPI app with uvicorn through poetry.
# Every setting below can be overridden from the environment.
set -Eeuo pipefail

# Network settings
export PORT="${PORT:-5001}"
export HOST="${HOST:-"0.0.0.0"}"

# Performance settings
UVICORN_WORKERS="${UVICORN_WORKERS:-1}"

# Development settings
export WITH_UI="${WITH_UI:-"true"}"
export RELOAD="${RELOAD:-"false"}"

# --------------------------------------
# Process env settings

# An array keeps each extra argument a separate word without relying on
# unquoted word splitting.
EXTRA_ARGS=()
if [ "$RELOAD" == "true" ]; then
    EXTRA_ARGS+=(--reload)
fi

# Launch
# The ${arr[@]+...} form expands to nothing for an empty array, which avoids
# an "unbound variable" error under `set -u` on older bash versions.
exec poetry run uvicorn \
    docling_serve.app:app \
    --host="${HOST}" \
    --port="${PORT}" \
    --timeout-keep-alive=600 \
    ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"} \
    --workers="${UVICORN_WORKERS}"
|
||||
BIN
tests/2206.01062v1.pdf
Normal file
BIN
tests/2206.01062v1.pdf
Normal file
Binary file not shown.
BIN
tests/2408.09869v5.pdf
Normal file
BIN
tests/2408.09869v5.pdf
Normal file
Binary file not shown.
129
tests/test_1-file-all-outputs.py
Normal file
129
tests/test_1-file-all-outputs.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from pytest_check import check
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def async_client():
    """Yield an ``httpx.AsyncClient`` with a 60-second timeout.

    The client is created in an ``async with`` block, so it is closed when the
    test using the fixture finishes.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        yield client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_convert_file(async_client):
    """Test convert single file to all outputs"""
    url = "http://localhost:5001/v1alpha/convert/file"
    options = {
        "from_formats": [
            "docx",
            "pptx",
            "html",
            "image",
            "pdf",
            "asciidoc",
            "md",
            "xlsx",
        ],
        "to_formats": ["md", "json", "html", "text", "doctags"],
        "image_export_mode": "placeholder",
        "ocr": True,
        "force_ocr": False,
        "ocr_engine": "easyocr",
        "ocr_lang": ["en"],
        "pdf_backend": "dlparse_v2",
        "table_mode": "fast",
        "abort_on_error": False,
        "return_as_file": False,
    }

    current_dir = os.path.dirname(__file__)
    file_path = os.path.join(current_dir, "2206.01062v1.pdf")

    # Keep the PDF open only for the duration of the upload so the handle is
    # always closed, even when the request fails.
    with open(file_path, "rb") as pdf_file:
        files = {
            "files": ("2206.01062v1.pdf", pdf_file, "application/pdf"),
        }
        response = await async_client.post(
            url, files=files, data={"options": json.dumps(options)}
        )
    assert response.status_code == 200, "Response should be 200 OK"

    data = response.json()

    # Response content checks
    # Helper function to safely slice strings
    def safe_slice(value, length=100):
        if isinstance(value, str):
            return value[:length]
        return str(value)  # Convert non-string values to string for debug purposes

    # Document check
    check.is_in(
        "document",
        data,
        msg=f"Response should contain 'document' key. Received keys: {list(data.keys())}",
    )
    # MD check
    check.is_in(
        "md_content",
        data.get("document", {}),
        msg=f"Response should contain 'md_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("md_content") is not None:
        check.is_in(
            "## DocLayNet: ",
            data["document"]["md_content"],
            msg=f"Markdown document should contain 'DocLayNet: '. Received: {safe_slice(data['document']['md_content'])}",
        )
    # JSON check
    check.is_in(
        "json_content",
        data.get("document", {}),
        msg=f"Response should contain 'json_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("json_content") is not None:
        check.is_in(
            '{"schema_name": "DoclingDocument"',
            json.dumps(data["document"]["json_content"]),
            msg=f"JSON document should contain '{{\\n \"schema_name\": \"DoclingDocument'\". Received: {safe_slice(data['document']['json_content'])}",
        )
    # HTML check
    check.is_in(
        "html_content",
        data.get("document", {}),
        msg=f"Response should contain 'html_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("html_content") is not None:
        check.is_in(
            '<!DOCTYPE html>\n<html lang="en">\n<head>',
            data["document"]["html_content"],
            msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en'>. Received: {safe_slice(data['document']['html_content'])}",
        )
    # Text check
    check.is_in(
        "text_content",
        data.get("document", {}),
        msg=f"Response should contain 'text_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("text_content") is not None:
        check.is_in(
            "DocLayNet: A Large Human-Annotated Dataset",
            data["document"]["text_content"],
            msg=f"Text document should contain 'DocLayNet: A Large Human-Annotated Dataset'. Received: {safe_slice(data['document']['text_content'])}",
        )
    # DocTags check
    check.is_in(
        "doctags_content",
        data.get("document", {}),
        msg=f"Response should contain 'doctags_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("doctags_content") is not None:
        check.is_in(
            "<document>\n<section_header_level_1><location>",
            data["document"]["doctags_content"],
            msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(data['document']['doctags_content'])}",
        )
|
||||
123
tests/test_1-url-all-outputs.py
Normal file
123
tests/test_1-url-all-outputs.py
Normal file
@@ -0,0 +1,123 @@
|
||||
import json
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from pytest_check import check
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def async_client():
    """Yield an ``httpx.AsyncClient`` with a 60 second timeout.

    The client is closed once the test using the fixture has finished.
    """
    client = httpx.AsyncClient(timeout=60.0)
    try:
        yield client
    finally:
        # Release the connection pool even when the test body failed.
        await client.aclose()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_convert_url(async_client):
    """Convert one remote PDF to every output format and check each result.

    Posts a single arXiv PDF URL to the local ``/v1alpha/convert/source``
    endpoint requesting the md, json, html, text and doctags outputs, then
    verifies that every ``*_content`` field is present in the returned
    ``document`` and contains an expected fragment.  Content checks are soft
    (``pytest_check``), so one failing format does not hide the others.
    """
    url = "http://localhost:5001/v1alpha/convert/source"
    payload = {
        "options": {
            "from_formats": [
                "docx",
                "pptx",
                "html",
                "image",
                "pdf",
                "asciidoc",
                "md",
                "xlsx",
            ],
            "to_formats": ["md", "json", "html", "text", "doctags"],
            "image_export_mode": "placeholder",
            "ocr": True,
            "force_ocr": False,
            "ocr_engine": "easyocr",
            "ocr_lang": ["en"],
            "pdf_backend": "dlparse_v2",
            "table_mode": "fast",
            "abort_on_error": False,
            "return_as_file": False,
        },
        "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}],
    }
    print(json.dumps(payload, indent=2))

    response = await async_client.post(url, json=payload)
    assert response.status_code == 200, "Response should be 200 OK"

    data = response.json()

    def safe_slice(value, length=100):
        """Return at most ``length`` characters of ``value`` for a message.

        Non-string values are converted with ``str`` first and truncated as
        well, so a large parsed JSON document cannot flood the test report.
        """
        text = value if isinstance(value, str) else str(value)
        return text[:length]

    # Document check
    check.is_in(
        "document",
        data,
        msg=f"Response should contain 'document' key. Received keys: {list(data.keys())}",
    )
    document = data.get("document", {})

    # (response key, label used in messages, fragment the content must contain)
    expected_fragments = (
        ("md_content", "Markdown", "## DocLayNet: "),
        ("json_content", "JSON", '{"schema_name": "DoclingDocument"'),
        ("html_content", "HTML", '<!DOCTYPE html>\n<html lang="en">\n<head>'),
        ("text_content", "Text", "DocLayNet: A Large Human-Annotated Dataset"),
        (
            "doctags_content",
            "DocTags",
            "<document>\n<section_header_level_1><location>",
        ),
    )

    for key, label, fragment in expected_fragments:
        check.is_in(
            key,
            document,
            msg=f"Response should contain '{key}' key. Received keys: {list(document.keys())}",
        )
        content = document.get(key)
        if content is None:
            # Missing/empty output was already reported by the key check.
            continue
        # json_content is searched in its serialized form; every other
        # output is searched as returned.
        haystack = json.dumps(content) if key == "json_content" else content
        check.is_in(
            fragment,
            haystack,
            # repr() keeps the message in sync with the fragment actually
            # being searched for, including its newlines and quotes.
            msg=f"{label} document should contain {fragment!r}. Received: {safe_slice(haystack)}",
        )
|
||||
74
tests/test_2-files-all-outputs.py
Normal file
74
tests/test_2-files-all-outputs.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from pytest_check import check
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def async_client():
    """Provide an ``httpx.AsyncClient`` (60 s timeout) to a test.

    The client is closed after the test that requested it completes.
    """
    client = httpx.AsyncClient(timeout=60.0)
    try:
        yield client
    finally:
        # Always close the client, including when the test raised.
        await client.aclose()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_convert_file(async_client):
    """Upload two PDF files for conversion and expect a zip attachment.

    Sends two multipart ``files`` entries to the local
    ``/v1alpha/convert/file`` endpoint with all output formats requested,
    then checks that the response is an ``application/zip`` attachment named
    ``converted_docs.zip``.
    """
    url = "http://localhost:5001/v1alpha/convert/file"
    options = {
        "from_formats": [
            "docx",
            "pptx",
            "html",
            "image",
            "pdf",
            "asciidoc",
            "md",
            "xlsx",
        ],
        "to_formats": ["md", "json", "html", "text", "doctags"],
        "image_export_mode": "placeholder",
        "ocr": True,
        "force_ocr": False,
        "ocr_engine": "easyocr",
        "ocr_lang": ["en"],
        "pdf_backend": "dlparse_v2",
        "table_mode": "fast",
        "abort_on_error": False,
        "return_as_file": False,
    }

    current_dir = os.path.dirname(__file__)
    file_path = os.path.join(current_dir, "2206.01062v1.pdf")

    # Both uploads read the same PDF from disk under two different upload
    # names.  NOTE(review): presumably intentional so only one fixture file
    # is needed -- confirm.
    # The handles are opened in a ``with`` block so they are closed even if
    # the request raises.
    with open(file_path, "rb") as first_pdf, open(file_path, "rb") as second_pdf:
        files = [
            ("files", ("2206.01062v1.pdf", first_pdf, "application/pdf")),
            ("files", ("2408.09869v5.pdf", second_pdf, "application/pdf")),
        ]
        response = await async_client.post(
            url, files=files, data={"options": json.dumps(options)}
        )
    assert response.status_code == 200, "Response should be 200 OK"

    # Check for zip file attachment
    content_disposition = response.headers.get("content-disposition")

    with check:
        assert (
            content_disposition is not None
        ), "Content-Disposition header should be present"

    # Substring tests on None raise TypeError; fall back to "" so a missing
    # header is reported as ordinary assertion failures below.
    disposition = content_disposition or ""
    with check:
        assert "attachment" in disposition, "Response should be an attachment"
    with check:
        assert (
            'filename="converted_docs.zip"' in disposition
        ), "Attachment filename should be 'converted_docs.zip'"

    content_type = response.headers.get("content-type")
    with check:
        assert (
            content_type == "application/zip"
        ), "Content-Type should be 'application/zip'"
|
||||
67
tests/test_2-urls-all-outputs.py
Normal file
67
tests/test_2-urls-all-outputs.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from pytest_check import check
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def async_client():
    """Hand the requesting test an ``httpx.AsyncClient`` with a 60 s timeout.

    The client is closed when the test is done with it.
    """
    client = httpx.AsyncClient(timeout=60.0)
    try:
        yield client
    finally:
        # Close the client whether or not the test succeeded.
        await client.aclose()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_convert_url(async_client):
    """Convert two remote PDFs in one request and expect a zip attachment.

    Posts two arXiv PDF URLs to the local ``/v1alpha/convert/source``
    endpoint with all output formats requested, then checks that the
    response is an ``application/zip`` attachment named
    ``converted_docs.zip``.
    """
    url = "http://localhost:5001/v1alpha/convert/source"
    payload = {
        "options": {
            "from_formats": [
                "docx",
                "pptx",
                "html",
                "image",
                "pdf",
                "asciidoc",
                "md",
                "xlsx",
            ],
            "to_formats": ["md", "json", "html", "text", "doctags"],
            "image_export_mode": "placeholder",
            "ocr": True,
            "force_ocr": False,
            "ocr_engine": "easyocr",
            "ocr_lang": ["en"],
            "pdf_backend": "dlparse_v2",
            "table_mode": "fast",
            "abort_on_error": False,
            "return_as_file": False,
        },
        "http_sources": [
            {"url": "https://arxiv.org/pdf/2206.01062"},
            {"url": "https://arxiv.org/pdf/2408.09869"},
        ],
    }

    response = await async_client.post(url, json=payload)
    assert response.status_code == 200, "Response should be 200 OK"

    # Check for zip file attachment
    content_disposition = response.headers.get("content-disposition")

    with check:
        assert (
            content_disposition is not None
        ), "Content-Disposition header should be present"

    # Substring tests on None raise TypeError; fall back to "" so a missing
    # header is reported as ordinary assertion failures below.
    disposition = content_disposition or ""
    with check:
        assert "attachment" in disposition, "Response should be an attachment"
    with check:
        assert (
            'filename="converted_docs.zip"' in disposition
        ), "Attachment filename should be 'converted_docs.zip'"

    content_type = response.headers.get("content-type")
    with check:
        assert (
            content_type == "application/zip"
        ), "Content-Type should be 'application/zip'"
|
||||
Reference in New Issue
Block a user