mirror of
https://github.com/docling-project/docling-serve.git
synced 2025-11-29 16:43:24 +00:00
Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
717fb3a8d8 | ||
|
|
873d05aefe | ||
|
|
196c5ce42a | ||
|
|
b5c5f47892 | ||
|
|
d5455b7f66 | ||
|
|
7a682494d6 | ||
|
|
524f6a8997 | ||
|
|
9ccf8e3b5e | ||
|
|
ffea34732b | ||
|
|
b299af002b | ||
|
|
c4c41f16df | ||
|
|
7066f3520a | ||
|
|
6a8190c315 |
2
.github/dco.yml
vendored
Normal file
2
.github/dco.yml
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
allowRemediationCommits:
|
||||
individual: true
|
||||
192
.github/workflows/dco-advisor.yml
vendored
Normal file
192
.github/workflows/dco-advisor.yml
vendored
Normal file
@@ -0,0 +1,192 @@
|
||||
name: DCO Advisor Bot
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, reopened, synchronize]
|
||||
|
||||
permissions:
|
||||
pull-requests: write
|
||||
issues: write
|
||||
|
||||
jobs:
|
||||
dco_advisor:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Handle DCO check result
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
script: |
|
||||
const pr = context.payload.pull_request || context.payload.check_run?.pull_requests?.[0];
|
||||
if (!pr) return;
|
||||
|
||||
const prNumber = pr.number;
|
||||
const baseRef = pr.base.ref;
|
||||
const headSha =
|
||||
context.payload.check_run?.head_sha ||
|
||||
pr.head?.sha;
|
||||
const username = pr.user.login;
|
||||
|
||||
console.log("HEAD SHA:", headSha);
|
||||
|
||||
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));
|
||||
|
||||
// Poll until DCO check has a conclusion (max 6 attempts, 30s)
|
||||
let dcoCheck = null;
|
||||
for (let attempt = 0; attempt < 6; attempt++) {
|
||||
const { data: checks } = await github.rest.checks.listForRef({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
ref: headSha
|
||||
});
|
||||
|
||||
|
||||
console.log("All check runs:");
|
||||
checks.check_runs.forEach(run => {
|
||||
console.log(`- ${run.name} (${run.status}/${run.conclusion}) @ ${run.head_sha}`);
|
||||
});
|
||||
|
||||
dcoCheck = checks.check_runs.find(run =>
|
||||
run.name.toLowerCase().includes("dco") &&
|
||||
!run.name.toLowerCase().includes("dco_advisor") &&
|
||||
run.head_sha === headSha
|
||||
);
|
||||
|
||||
|
||||
if (dcoCheck?.conclusion) break;
|
||||
console.log(`Waiting for DCO check... (${attempt + 1})`);
|
||||
await sleep(5000); // wait 5 seconds
|
||||
}
|
||||
|
||||
if (!dcoCheck || !dcoCheck.conclusion) {
|
||||
console.log("DCO check did not complete in time.");
|
||||
return;
|
||||
}
|
||||
|
||||
const isFailure = ["failure", "action_required"].includes(dcoCheck.conclusion);
|
||||
console.log(`DCO check conclusion for ${headSha}: ${dcoCheck.conclusion} (treated as ${isFailure ? "failure" : "success"})`);
|
||||
|
||||
// Parse DCO output for commit SHAs and author
|
||||
let badCommits = [];
|
||||
let authorName = "";
|
||||
let authorEmail = "";
|
||||
let moreInfo = `More info: [DCO check report](${dcoCheck?.html_url})`;
|
||||
|
||||
if (isFailure) {
|
||||
const { data: commits } = await github.rest.pulls.listCommits({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
pull_number: prNumber,
|
||||
});
|
||||
|
||||
for (const commit of commits) {
|
||||
const commitMessage = commit.commit.message;
|
||||
const signoffMatch = commitMessage.match(/^Signed-off-by:\s+.+<.+>$/m);
|
||||
if (!signoffMatch) {
|
||||
console.log(`Bad commit found ${commit.sha}`)
|
||||
badCommits.push({
|
||||
sha: commit.sha,
|
||||
authorName: commit.commit.author.name,
|
||||
authorEmail: commit.commit.author.email,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If multiple authors are present, you could adapt the message accordingly
|
||||
// For now, we'll just use the first one
|
||||
if (badCommits.length > 0) {
|
||||
authorName = badCommits[0].authorName;
|
||||
authorEmail = badCommits[0].authorEmail;
|
||||
}
|
||||
|
||||
// Generate remediation commit message if needed
|
||||
let remediationSnippet = "";
|
||||
if (badCommits.length && authorEmail) {
|
||||
remediationSnippet = `git commit --allow-empty -s -m "DCO Remediation Commit for ${authorName} <${authorEmail}>\n\n` +
|
||||
badCommits.map(c => `I, ${c.authorName} <${c.authorEmail}>, hereby add my Signed-off-by to this commit: ${c.sha}`).join('\n') +
|
||||
`"`;
|
||||
} else {
|
||||
remediationSnippet = "# Unable to auto-generate remediation message. Please check the DCO check details.";
|
||||
}
|
||||
|
||||
// Build comment
|
||||
const commentHeader = '<!-- dco-advice-bot -->';
|
||||
let body = "";
|
||||
|
||||
if (isFailure) {
|
||||
body = [
|
||||
commentHeader,
|
||||
'❌ **DCO Check Failed**',
|
||||
'',
|
||||
`Hi @${username}, your pull request has failed the Developer Certificate of Origin (DCO) check.`,
|
||||
'',
|
||||
'This repository supports **remediation commits**, so you can fix this without rewriting history — but you must follow the required message format.',
|
||||
'',
|
||||
'---',
|
||||
'',
|
||||
'### 🛠 Quick Fix: Add a remediation commit',
|
||||
'Run this command:',
|
||||
'',
|
||||
'```bash',
|
||||
remediationSnippet,
|
||||
'git push',
|
||||
'```',
|
||||
'',
|
||||
'---',
|
||||
'',
|
||||
'<details>',
|
||||
'<summary>🔧 Advanced: Sign off each commit directly</summary>',
|
||||
'',
|
||||
'**For the latest commit:**',
|
||||
'```bash',
|
||||
'git commit --amend --signoff',
|
||||
'git push --force-with-lease',
|
||||
'```',
|
||||
'',
|
||||
'**For multiple commits:**',
|
||||
'```bash',
|
||||
`git rebase --signoff origin/${baseRef}`,
|
||||
'git push --force-with-lease',
|
||||
'```',
|
||||
'',
|
||||
'</details>',
|
||||
'',
|
||||
moreInfo
|
||||
].join('\n');
|
||||
} else {
|
||||
body = [
|
||||
commentHeader,
|
||||
'✅ **DCO Check Passed**',
|
||||
'',
|
||||
`Thanks @${username}, all your commits are properly signed off. 🎉`
|
||||
].join('\n');
|
||||
}
|
||||
|
||||
// Get existing comments on the PR
|
||||
const { data: comments } = await github.rest.issues.listComments({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: prNumber
|
||||
});
|
||||
|
||||
// Look for a previous bot comment
|
||||
const existingComment = comments.find(c =>
|
||||
c.body.includes("<!-- dco-advice-bot -->")
|
||||
);
|
||||
|
||||
if (existingComment) {
|
||||
await github.rest.issues.updateComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
comment_id: existingComment.id,
|
||||
body: body
|
||||
});
|
||||
} else {
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: prNumber,
|
||||
body: body
|
||||
});
|
||||
}
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -444,3 +444,5 @@ pip-selfcheck.json
|
||||
# Makefile
|
||||
.action-lint
|
||||
.markdown-lint
|
||||
|
||||
cookies.txt
|
||||
40
CHANGELOG.md
40
CHANGELOG.md
@@ -1,3 +1,43 @@
|
||||
## [v0.15.0](https://github.com/docling-project/docling-serve/releases/tag/v0.15.0) - 2025-06-17
|
||||
|
||||
### Feature
|
||||
|
||||
* Use redocs and scalar as api docs ([#228](https://github.com/docling-project/docling-serve/issues/228)) ([`873d05a`](https://github.com/docling-project/docling-serve/commit/873d05aefe141c63b9c1cf53b23b4fa8c96de05d))
|
||||
|
||||
### Fix
|
||||
|
||||
* "tesserocr" instead of "tesseract_cli" in usage docs ([#223](https://github.com/docling-project/docling-serve/issues/223)) ([`196c5ce`](https://github.com/docling-project/docling-serve/commit/196c5ce42a04d77234a4212c3d9b9772d2c2073e))
|
||||
|
||||
## [v0.14.0](https://github.com/docling-project/docling-serve/releases/tag/v0.14.0) - 2025-06-17
|
||||
|
||||
### Feature
|
||||
|
||||
* Read supported file extensions from docling ([#214](https://github.com/docling-project/docling-serve/issues/214)) ([`524f6a8`](https://github.com/docling-project/docling-serve/commit/524f6a8997b86d2f869ca491ec8fb40585b42ca4))
|
||||
|
||||
### Fix
|
||||
|
||||
* Typo in Headline ([#220](https://github.com/docling-project/docling-serve/issues/220)) ([`d5455b7`](https://github.com/docling-project/docling-serve/commit/d5455b7f66de39ea1f8b8927b5968d2baa23ca88))
|
||||
|
||||
## [v0.13.0](https://github.com/docling-project/docling-serve/releases/tag/v0.13.0) - 2025-06-04
|
||||
|
||||
### Feature
|
||||
|
||||
* Upgrade docling to 2.36 ([#212](https://github.com/docling-project/docling-serve/issues/212)) ([`ffea347`](https://github.com/docling-project/docling-serve/commit/ffea34732b24fdd438fabd6df02d3d9ce66b4534))
|
||||
|
||||
## [v0.12.0](https://github.com/docling-project/docling-serve/releases/tag/v0.12.0) - 2025-06-03
|
||||
|
||||
### Feature
|
||||
|
||||
* Export annotations in markdown and html (Docling upgrade) ([#202](https://github.com/docling-project/docling-serve/issues/202)) ([`c4c41f1`](https://github.com/docling-project/docling-serve/commit/c4c41f16dff83c5d2a0b8a4c625b5de19b36b7c5))
|
||||
|
||||
### Fix
|
||||
|
||||
* Processing complex params in multipart-form ([#210](https://github.com/docling-project/docling-serve/issues/210)) ([`7066f35`](https://github.com/docling-project/docling-serve/commit/7066f3520a88c07df1c80a0cc6c4339eaac4d6a7))
|
||||
|
||||
### Documentation
|
||||
|
||||
* Add openshift replicasets examples ([#209](https://github.com/docling-project/docling-serve/issues/209)) ([`6a8190c`](https://github.com/docling-project/docling-serve/commit/6a8190c315792bd1e0e2b0af310656baaa5551e5))
|
||||
|
||||
## [v0.11.0](https://github.com/docling-project/docling-serve/releases/tag/v0.11.0) - 2025-05-23
|
||||
|
||||
### Feature
|
||||
|
||||
@@ -113,11 +113,13 @@ def _run(
|
||||
protocol = "https" if run_ssl else "http"
|
||||
url = f"{protocol}://{uvicorn_settings.host}:{uvicorn_settings.port}"
|
||||
url_docs = f"{url}/docs"
|
||||
url_scalar = f"{url}/scalar"
|
||||
url_ui = f"{url}/ui"
|
||||
|
||||
console.print("")
|
||||
console.print(f"Server started at [link={url}]{url}[/]")
|
||||
console.print(f"Documentation at [link={url_docs}]{url_docs}[/]")
|
||||
console.print(f"Scalar docs at [link={url_docs}]{url_scalar}[/]")
|
||||
if docling_serve_settings.enable_ui:
|
||||
console.print(f"UI at [link={url_ui}]{url_ui}[/]")
|
||||
|
||||
|
||||
@@ -25,6 +25,7 @@ from fastapi.openapi.docs import (
|
||||
)
|
||||
from fastapi.responses import RedirectResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from scalar_fastapi import get_scalar_api_reference
|
||||
|
||||
from docling.datamodel.base_models import DocumentStream
|
||||
|
||||
@@ -140,8 +141,8 @@ def create_app(): # noqa: C901
|
||||
|
||||
app = FastAPI(
|
||||
title="Docling Serve",
|
||||
docs_url=None if offline_docs_assets else "/docs",
|
||||
redoc_url=None if offline_docs_assets else "/redocs",
|
||||
docs_url=None if offline_docs_assets else "/swagger",
|
||||
redoc_url=None if offline_docs_assets else "/docs",
|
||||
lifespan=lifespan,
|
||||
version=version,
|
||||
)
|
||||
@@ -192,7 +193,7 @@ def create_app(): # noqa: C901
|
||||
name="static",
|
||||
)
|
||||
|
||||
@app.get("/docs", include_in_schema=False)
|
||||
@app.get("/swagger", include_in_schema=False)
|
||||
async def custom_swagger_ui_html():
|
||||
return get_swagger_ui_html(
|
||||
openapi_url=app.openapi_url,
|
||||
@@ -206,7 +207,7 @@ def create_app(): # noqa: C901
|
||||
async def swagger_ui_redirect():
|
||||
return get_swagger_ui_oauth2_redirect_html()
|
||||
|
||||
@app.get("/redoc", include_in_schema=False)
|
||||
@app.get("/docs", include_in_schema=False)
|
||||
async def redoc_html():
|
||||
return get_redoc_html(
|
||||
openapi_url=app.openapi_url,
|
||||
@@ -214,6 +215,15 @@ def create_app(): # noqa: C901
|
||||
redoc_js_url="/static/redoc.standalone.js",
|
||||
)
|
||||
|
||||
@app.get("/scalar", include_in_schema=False)
|
||||
async def scalar_html():
|
||||
return get_scalar_api_reference(
|
||||
openapi_url=app.openapi_url,
|
||||
title=app.title,
|
||||
scalar_favicon_url="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg",
|
||||
# hide_client_button=True, # not yet released but in main
|
||||
)
|
||||
|
||||
########################
|
||||
# Async / Sync helpers #
|
||||
########################
|
||||
|
||||
@@ -132,7 +132,11 @@ class ConvertDocumentsOptions(BaseModel):
|
||||
f"Allowed values: {', '.join([v.value for v in OutputFormat])}. "
|
||||
"Optional, defaults to Markdown."
|
||||
),
|
||||
examples=[[OutputFormat.MARKDOWN]],
|
||||
examples=[
|
||||
[OutputFormat.MARKDOWN],
|
||||
[OutputFormat.MARKDOWN, OutputFormat.JSON],
|
||||
[v.value for v in OutputFormat],
|
||||
],
|
||||
),
|
||||
] = [OutputFormat.MARKDOWN]
|
||||
|
||||
@@ -231,7 +235,7 @@ class ConvertDocumentsOptions(BaseModel):
|
||||
PageRange,
|
||||
Field(
|
||||
description="Only convert a range of pages. The page number starts at 1.",
|
||||
examples=[(1, 4)],
|
||||
examples=[DEFAULT_PAGE_RANGE, (1, 4)],
|
||||
),
|
||||
] = DEFAULT_PAGE_RANGE
|
||||
|
||||
@@ -359,14 +363,24 @@ class ConvertDocumentsOptions(BaseModel):
|
||||
picture_description_local: Annotated[
|
||||
Optional[PictureDescriptionLocal],
|
||||
Field(
|
||||
description="Options for running a local vision-language model in the picture description. The parameters refer to a model hosted on Hugging Face. This parameter is mutually exclusive with picture_description_api."
|
||||
description="Options for running a local vision-language model in the picture description. The parameters refer to a model hosted on Hugging Face. This parameter is mutually exclusive with picture_description_api.",
|
||||
examples=[
|
||||
PictureDescriptionLocal(repo_id="ibm-granite/granite-vision-3.2-2b"),
|
||||
PictureDescriptionLocal(repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"),
|
||||
],
|
||||
),
|
||||
] = None
|
||||
|
||||
picture_description_api: Annotated[
|
||||
Optional[PictureDescriptionApi],
|
||||
Field(
|
||||
description="API details for using a vision-language model in the picture description. This parameter is mutually exclusive with picture_description_local."
|
||||
description="API details for using a vision-language model in the picture description. This parameter is mutually exclusive with picture_description_local.",
|
||||
examples=[
|
||||
PictureDescriptionApi(
|
||||
url="http://localhost:11434/v1/chat/completions",
|
||||
params={"model": "granite3.2-vision:2b"},
|
||||
)
|
||||
],
|
||||
),
|
||||
] = None
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import base64
|
||||
import importlib
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import ssl
|
||||
@@ -12,6 +13,7 @@ import certifi
|
||||
import gradio as gr
|
||||
import httpx
|
||||
|
||||
from docling.datamodel.base_models import FormatToExtensions
|
||||
from docling.datamodel.pipeline_options import (
|
||||
PdfBackend,
|
||||
PdfPipeline,
|
||||
@@ -545,19 +547,10 @@ with gr.Blocks(
|
||||
elem_id="file_input_zone",
|
||||
label="Upload File",
|
||||
file_types=[
|
||||
".pdf",
|
||||
".docx",
|
||||
".pptx",
|
||||
".html",
|
||||
".xlsx",
|
||||
".json",
|
||||
".asciidoc",
|
||||
".txt",
|
||||
".md",
|
||||
".jpg",
|
||||
".jpeg",
|
||||
".png",
|
||||
".gif",
|
||||
f".{v}"
|
||||
for v in itertools.chain.from_iterable(
|
||||
FormatToExtensions.values()
|
||||
)
|
||||
],
|
||||
file_count="multiple",
|
||||
scale=4,
|
||||
|
||||
@@ -1,9 +1,30 @@
|
||||
import inspect
|
||||
import json
|
||||
import re
|
||||
from typing import Union
|
||||
from typing import Union, get_args, get_origin
|
||||
|
||||
from fastapi import Depends, Form
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, TypeAdapter
|
||||
|
||||
|
||||
def is_pydantic_model(type_):
|
||||
try:
|
||||
if inspect.isclass(type_) and issubclass(type_, BaseModel):
|
||||
return True
|
||||
|
||||
origin = get_origin(type_)
|
||||
if origin is Union:
|
||||
args = get_args(type_)
|
||||
return any(
|
||||
inspect.isclass(arg) and issubclass(arg, BaseModel)
|
||||
for arg in args
|
||||
if arg is not type(None)
|
||||
)
|
||||
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return False
|
||||
|
||||
|
||||
# Adapted from
|
||||
@@ -12,25 +33,62 @@ def FormDepends(cls: type[BaseModel]):
|
||||
new_parameters = []
|
||||
|
||||
for field_name, model_field in cls.model_fields.items():
|
||||
annotation = model_field.annotation
|
||||
description = model_field.description
|
||||
default = (
|
||||
Form(..., description=description, examples=model_field.examples)
|
||||
if model_field.is_required()
|
||||
else Form(
|
||||
model_field.default,
|
||||
examples=model_field.examples,
|
||||
description=description,
|
||||
)
|
||||
)
|
||||
|
||||
# Flatten nested Pydantic models by accepting them as JSON strings
|
||||
if is_pydantic_model(annotation):
|
||||
annotation = str
|
||||
default = Form(
|
||||
None
|
||||
if model_field.default is None
|
||||
else json.dumps(model_field.default.model_dump(mode="json")),
|
||||
description=description,
|
||||
examples=None
|
||||
if not model_field.examples
|
||||
else [
|
||||
json.dumps(ex.model_dump(mode="json"))
|
||||
for ex in model_field.examples
|
||||
],
|
||||
)
|
||||
|
||||
new_parameters.append(
|
||||
inspect.Parameter(
|
||||
name=field_name,
|
||||
kind=inspect.Parameter.POSITIONAL_ONLY,
|
||||
default=(
|
||||
Form(...)
|
||||
if model_field.is_required()
|
||||
else Form(model_field.default)
|
||||
),
|
||||
annotation=model_field.annotation,
|
||||
default=default,
|
||||
annotation=annotation,
|
||||
)
|
||||
)
|
||||
|
||||
async def as_form_func(**data):
|
||||
for field_name, model_field in cls.model_fields.items():
|
||||
value = data.get(field_name)
|
||||
annotation = model_field.annotation
|
||||
|
||||
# Parse nested models from JSON string
|
||||
if value is not None and is_pydantic_model(annotation):
|
||||
try:
|
||||
validator = TypeAdapter(annotation)
|
||||
data[field_name] = validator.validate_json(value)
|
||||
except Exception as e:
|
||||
raise ValueError(f"Invalid JSON for field '{field_name}': {e}")
|
||||
|
||||
return cls(**data)
|
||||
|
||||
sig = inspect.signature(as_form_func)
|
||||
sig = sig.replace(parameters=new_parameters)
|
||||
as_form_func.__signature__ = sig # type: ignore
|
||||
|
||||
return Depends(as_form_func)
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Dolcing Serve documentation
|
||||
# Docling Serve documentation
|
||||
|
||||
These documentation pages explore the webserver configurations, runtime options, and deployment examples, as well as development best practices.
|
||||
|
||||
|
||||
@@ -0,0 +1,76 @@
|
||||
# This example deployment configures Docling Serve with a Route + Sticky sessions, a Service and cpu image
|
||||
---
|
||||
kind: Route
|
||||
apiVersion: route.openshift.io/v1
|
||||
metadata:
|
||||
name: docling-serve
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
annotations:
|
||||
haproxy.router.openshift.io/disable_cookies: "false" # this annotation enables the sticky sessions
|
||||
spec:
|
||||
path: /
|
||||
to:
|
||||
kind: Service
|
||||
name: docling-serve
|
||||
port:
|
||||
targetPort: http
|
||||
tls:
|
||||
termination: edge
|
||||
insecureEdgeTerminationPolicy: Redirect
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: docling-serve
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
spec:
|
||||
ports:
|
||||
- name: http
|
||||
port: 5001
|
||||
targetPort: http
|
||||
selector:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
---
|
||||
kind: Deployment
|
||||
apiVersion: apps/v1
|
||||
metadata:
|
||||
name: docling-serve
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
spec:
|
||||
replicas: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: docling-serve
|
||||
component: docling-serve-api
|
||||
spec:
|
||||
restartPolicy: Always
|
||||
containers:
|
||||
- name: api
|
||||
resources:
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 2Gi
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 1Gi
|
||||
env:
|
||||
- name: DOCLING_SERVE_ENABLE_UI
|
||||
value: 'true'
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 5001
|
||||
protocol: TCP
|
||||
imagePullPolicy: Always
|
||||
image: 'ghcr.io/docling-project/docling-serve'
|
||||
@@ -192,3 +192,45 @@ curl -X 'POST' \
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}]
|
||||
}'
|
||||
```
|
||||
|
||||
### ReplicaSets with `sticky sessions`
|
||||
|
||||
Manifest example: [docling-serve-replicas-w-sticky-sessions.yaml](./deploy-examples/docling-serve-replicas-w-sticky-sessions.yaml)
|
||||
|
||||
This deployment has the following features:
|
||||
|
||||
- Deployment configuration with 3 replicas
|
||||
- Service configuration
|
||||
- Exposes the service using an OpenShift `Route` and enables sticky sessions
|
||||
|
||||
Install the app with:
|
||||
|
||||
```sh
|
||||
oc apply -f docs/deploy-examples/docling-serve-replicas-w-sticky-sessions.yaml
|
||||
```
|
||||
|
||||
To use the API:
|
||||
|
||||
```sh
|
||||
# Retrieve the endpoint
|
||||
DOCLING_NAME=docling-serve
|
||||
DOCLING_ROUTE="https://$(oc get routes $DOCLING_NAME --template={{.spec.host}})"
|
||||
|
||||
# Make a test query, store the cookie and taskid
|
||||
task_id=$(curl -s -X 'POST' \
|
||||
"${DOCLING_ROUTE}/v1alpha/convert/source/async" \
|
||||
-H "accept: application/json" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}]
|
||||
}' \
|
||||
-c cookies.txt | grep -oP '"task_id":"\K[^"]+')
|
||||
```
|
||||
|
||||
```sh
|
||||
# Grab the taskid and cookie to check the task status
|
||||
curl -v -X 'GET' \
|
||||
"${DOCLING_ROUTE}/v1alpha/status/poll/$task_id?wait=0" \
|
||||
-H "accept: application/json" \
|
||||
-b "cookies.txt"
|
||||
```
|
||||
|
||||
@@ -13,7 +13,7 @@ On top of the source of file (see below), both endpoints support the same parame
|
||||
- `do_ocr` (bool): If enabled, the bitmap content will be processed using OCR. Defaults to `True`.
|
||||
- `image_export_mode`: Image export mode for the document (only in case of JSON, Markdown or HTML). Allowed values: embedded, placeholder, referenced. Optional, defaults to `embedded`.
|
||||
- `force_ocr` (bool): If enabled, replace any existing text with OCR-generated text over the full content. Defaults to `False`.
|
||||
- `ocr_engine` (str): OCR engine to use. Allowed values: `easyocr`, `tesseract_cli`, `tesseract`, `rapidocr`, `ocrmac`. Defaults to `easyocr`.
|
||||
- `ocr_engine` (str): OCR engine to use. Allowed values: `easyocr`, `tesserocr`, `tesseract`, `rapidocr`, `ocrmac`. Defaults to `easyocr`. To use the `tesserocr` engine, `tesserocr` must be installed where docling-serve is running: `pip install tesserocr`
|
||||
- `ocr_lang` (List[str]): List of languages used by the OCR engine. Note that each OCR engine has different values for the language names. Defaults to empty.
|
||||
- `pdf_backend` (str): PDF backend to use. Allowed values: `pypdfium2`, `dlparse_v1`, `dlparse_v2`, `dlparse_v4`. Defaults to `dlparse_v4`.
|
||||
- `table_mode` (str): Table mode to use. Allowed values: `fast`, `accurate`. Defaults to `fast`.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "docling-serve"
|
||||
version = "0.11.0" # DO NOT EDIT, updated automatically
|
||||
version = "0.15.0" # DO NOT EDIT, updated automatically
|
||||
description = "Running Docling as a service"
|
||||
license = {text = "MIT"}
|
||||
authors = [
|
||||
@@ -31,6 +31,7 @@ classifiers = [
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"docling[vlm]~=2.28",
|
||||
"docling-core>=2.32.0",
|
||||
"mlx-vlm~=0.1.12; sys_platform == 'darwin' and platform_machine == 'arm64'",
|
||||
"fastapi[standard]~=0.115",
|
||||
"httpx~=0.28",
|
||||
@@ -41,6 +42,7 @@ dependencies = [
|
||||
"typer~=0.12",
|
||||
"uvicorn[standard]>=0.29.0,<1.0.0",
|
||||
"websockets~=14.0",
|
||||
"scalar-fastapi>=1.0.3",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
@@ -212,6 +214,7 @@ module = [
|
||||
"kfp.*",
|
||||
"kfp_server_api.*",
|
||||
"mlx_vlm.*",
|
||||
"scalar_fastapi.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
|
||||
77
tests/test_file_opts.py
Normal file
77
tests/test_file_opts.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from asgi_lifespan import LifespanManager
|
||||
from httpx import ASGITransport, AsyncClient
|
||||
|
||||
from docling_core.types import DoclingDocument
|
||||
from docling_core.types.doc.document import PictureDescriptionData
|
||||
|
||||
from docling_serve.app import create_app
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def event_loop():
|
||||
return asyncio.get_event_loop()
|
||||
|
||||
|
||||
@pytest_asyncio.fixture(scope="session")
|
||||
async def app():
|
||||
app = create_app()
|
||||
|
||||
async with LifespanManager(app) as manager:
|
||||
print("Launching lifespan of app.")
|
||||
yield manager.app
|
||||
|
||||
|
||||
@pytest_asyncio.fixture(scope="session")
|
||||
async def client(app):
|
||||
async with AsyncClient(
|
||||
transport=ASGITransport(app=app), base_url="http://app.io"
|
||||
) as client:
|
||||
print("Client is ready")
|
||||
yield client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_convert_file(client: AsyncClient):
|
||||
"""Test convert single file to all outputs"""
|
||||
|
||||
endpoint = "/v1alpha/convert/file"
|
||||
options = {
|
||||
"to_formats": ["md", "json"],
|
||||
"image_export_mode": "placeholder",
|
||||
"ocr": False,
|
||||
"do_picture_description": True,
|
||||
"picture_description_api": json.dumps(
|
||||
{
|
||||
"url": "http://localhost:11434/v1/chat/completions", # ollama
|
||||
"params": {"model": "granite3.2-vision:2b"},
|
||||
"timeout": 60,
|
||||
"prompt": "Describe this image in a few sentences. ",
|
||||
}
|
||||
),
|
||||
}
|
||||
|
||||
current_dir = os.path.dirname(__file__)
|
||||
file_path = os.path.join(current_dir, "2206.01062v1.pdf")
|
||||
|
||||
files = {
|
||||
"files": ("2206.01062v1.pdf", open(file_path, "rb"), "application/pdf"),
|
||||
}
|
||||
|
||||
response = await client.post(endpoint, files=files, data=options)
|
||||
assert response.status_code == 200, "Response should be 200 OK"
|
||||
|
||||
data = response.json()
|
||||
|
||||
doc = DoclingDocument.model_validate(data["document"]["json_content"])
|
||||
|
||||
for pic in doc.pictures:
|
||||
for ann in pic.annotations:
|
||||
if isinstance(ann, PictureDescriptionData):
|
||||
print(f"{pic.self_ref}")
|
||||
print(ann.text)
|
||||
Reference in New Issue
Block a user