From 82f890019745859699c1b01f9ccfb64cb7e37906 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Fri, 7 Mar 2025 11:26:50 +0100 Subject: [PATCH] feat: Async api (#60) Signed-off-by: Michele Dolfi --- .pre-commit-config.yaml | 18 +- Containerfile | 2 +- docling_serve/app.py | 182 ++++++++++++- docling_serve/datamodel/__init__.py | 0 docling_serve/datamodel/convert.py | 174 +++++++++++++ docling_serve/datamodel/engines.py | 30 +++ docling_serve/datamodel/requests.py | 62 +++++ docling_serve/datamodel/responses.py | 52 ++++ docling_serve/docling_conversion.py | 242 +----------------- docling_serve/engines/__init__.py | 8 + docling_serve/engines/async_local/__init__.py | 0 .../engines/async_local/orchestrator.py | 101 ++++++++ docling_serve/engines/async_local/worker.py | 116 +++++++++ docling_serve/engines/base_orchestrator.py | 21 ++ docling_serve/engines/block_local/__init__.py | 0 docling_serve/gradio_ui.py | 9 +- docling_serve/response_preparation.py | 38 +-- docling_serve/settings.py | 5 + pyproject.toml | 19 +- tests/test_1-file-all-outputs.py | 2 +- tests/test_1-url-all-outputs.py | 2 +- tests/test_1-url-async-ws.py | 48 ++++ tests/test_1-url-async.py | 60 +++++ tests/test_2-files-all-outputs.py | 18 +- tests/test_2-urls-all-outputs.py | 18 +- uv.lock | 59 ++--- 26 files changed, 919 insertions(+), 367 deletions(-) create mode 100644 docling_serve/datamodel/__init__.py create mode 100644 docling_serve/datamodel/convert.py create mode 100644 docling_serve/datamodel/engines.py create mode 100644 docling_serve/datamodel/requests.py create mode 100644 docling_serve/datamodel/responses.py create mode 100644 docling_serve/engines/__init__.py create mode 100644 docling_serve/engines/async_local/__init__.py create mode 100644 docling_serve/engines/async_local/orchestrator.py create mode 100644 docling_serve/engines/async_local/worker.py create mode 100644 docling_serve/engines/base_orchestrator.py create mode 100644 docling_serve/engines/block_local/__init__.py create mode 100644 tests/test_1-url-async-ws.py create mode 100644 tests/test_1-url-async.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6abf1a6..42bfaa8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,14 @@ fail_fast: true repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.9.6 + hooks: + # Run the Ruff formatter. + - id: ruff-format + args: [--config=pyproject.toml] + # Run the Ruff linter. + - id: ruff + args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml] - repo: local hooks: - id: system @@ -13,12 +22,3 @@ repos: rev: 0.6.1 hooks: - id: uv-lock - - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.6 - hooks: - # Run the Ruff linter. - - id: ruff - args: [--exit-non-zero-on-fix, --config=pyproject.toml] - # Run the Ruff formatter. - # - id: ruff-format - # args: [--config=pyproject.toml] diff --git a/Containerfile b/Containerfile index cc932d7..bf7d20a 100644 --- a/Containerfile +++ b/Containerfile @@ -53,7 +53,7 @@ RUN echo "Downloading models..." && \ chown -R 1001:0 /opt/app-root/src/.cache && \ chmod -R g=u /opt/app-root/src/.cache -COPY --chown=1001:0 --chmod=664 ./docling_serve ./docling_serve +COPY --chown=1001:0 ./docling_serve ./docling_serve RUN --mount=type=cache,target=/opt/app-root/src/.cache/uv,uid=1001 \ uv sync --frozen --no-dev --all-extras ${UV_SYNC_EXTRA_ARGS} # --no-extra ${NO_EXTRA} diff --git a/docling_serve/app.py b/docling_serve/app.py index 13c2e6f..1fa8d36 100644 --- a/docling_serve/app.py +++ b/docling_serve/app.py @@ -1,3 +1,4 @@ +import asyncio import importlib.metadata import logging import tempfile @@ -6,23 +7,46 @@ from io import BytesIO from pathlib import Path from typing import Annotated, Any, Dict, List, Optional, Union -from docling.datamodel.base_models import DocumentStream, InputFormat -from docling.document_converter import DocumentConverter -from fastapi import BackgroundTasks, FastAPI, UploadFile +from fastapi import ( + BackgroundTasks, + Depends, + FastAPI, + HTTPException, + Query, + UploadFile, + WebSocket, + WebSocketDisconnect, +) from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import RedirectResponse -from pydantic import BaseModel -from docling_serve.docling_conversion import ( +from docling.datamodel.base_models import DocumentStream, InputFormat +from docling.document_converter import DocumentConverter + +from docling_serve.datamodel.convert import ConvertDocumentsOptions +from docling_serve.datamodel.requests import ( ConvertDocumentFileSourcesRequest, - ConvertDocumentsOptions, ConvertDocumentsRequest, +) +from docling_serve.datamodel.responses import ( + ConvertDocumentResponse, + HealthCheckResponse, + MessageKind, + TaskStatusResponse, + WebsocketMessage, +) +from docling_serve.docling_conversion import ( convert_documents, converters, get_pdf_pipeline_opts, ) +from docling_serve.engines import get_orchestrator +from docling_serve.engines.async_local.orchestrator import ( + AsyncLocalOrchestrator, + TaskNotFoundError, +) from docling_serve.helper_functions import FormDepends -from docling_serve.response_preparation import ConvertDocumentResponse, process_results +from docling_serve.response_preparation import process_results from docling_serve.settings import docling_serve_settings @@ -72,9 +96,22 @@ async def lifespan(app: FastAPI): converters[options_hash].initialize_pipeline(InputFormat.PDF) + orchestrator = get_orchestrator() + + # Start the background queue processor + queue_task = asyncio.create_task(orchestrator.process_queue()) + yield + # Cancel the background queue processor on shutdown + queue_task.cancel() + try: + await queue_task + except asyncio.CancelledError: + _log.info("Queue processor cancelled.") + converters.clear() + # if WITH_UI: # gradio_ui.close() @@ -84,7 +121,7 @@ async def lifespan(app: FastAPI): ################################## -def create_app(): +def create_app(): # noqa: C901 try: version = importlib.metadata.version("docling_serve") except importlib.metadata.PackageNotFoundError: @@ -145,10 +182,6 @@ def create_app(): ) return response - # Status - class HealthCheckResponse(BaseModel): - status: str = "ok" - @app.get("/health") def health() -> HealthCheckResponse: return HealthCheckResponse() @@ -233,4 +266,129 @@ def create_app(): return response + # Convert a document from URL(s) using the async api + @app.post( + "/v1alpha/convert/source/async", + response_model=TaskStatusResponse, + ) + async def process_url_async( + orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)], + conversion_request: ConvertDocumentsRequest, + ): + task = await orchestrator.enqueue(request=conversion_request) + task_queue_position = await orchestrator.get_queue_position( + task_id=task.task_id + ) + return TaskStatusResponse( + task_id=task.task_id, + task_status=task.task_status, + task_position=task_queue_position, + ) + + # Task status poll + @app.get( + "/v1alpha/status/poll/{task_id}", + response_model=TaskStatusResponse, + ) + async def task_status_poll( + orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)], + task_id: str, + wait: Annotated[ + float, Query(help="Number of seconds to wait for a completed status.") + ] = 0.0, + ): + try: + task = await orchestrator.task_status(task_id=task_id, wait=wait) + task_queue_position = await orchestrator.get_queue_position(task_id=task_id) + except TaskNotFoundError: + raise HTTPException(status_code=404, detail="Task not found.") + return TaskStatusResponse( + task_id=task.task_id, + task_status=task.task_status, + task_position=task_queue_position, + ) + + # Task status websocket + @app.websocket( + "/v1alpha/status/ws/{task_id}", + ) + async def task_status_ws( + websocket: WebSocket, + orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)], + task_id: str, + ): + await websocket.accept() + + if task_id not in orchestrator.tasks: + await websocket.send_text( + WebsocketMessage( + message=MessageKind.ERROR, error="Task not found." + ).model_dump_json() + ) + await websocket.close() + return + + task = orchestrator.tasks[task_id] + + # Track active WebSocket connections for this job + orchestrator.task_subscribers[task_id].add(websocket) + + try: + task_queue_position = await orchestrator.get_queue_position(task_id=task_id) + task_response = TaskStatusResponse( + task_id=task.task_id, + task_status=task.task_status, + task_position=task_queue_position, + ) + await websocket.send_text( + WebsocketMessage( + message=MessageKind.CONNECTION, task=task_response + ).model_dump_json() + ) + while True: + task_queue_position = await orchestrator.get_queue_position( + task_id=task_id + ) + task_response = TaskStatusResponse( + task_id=task.task_id, + task_status=task.task_status, + task_position=task_queue_position, + ) + await websocket.send_text( + WebsocketMessage( + message=MessageKind.UPDATE, task=task_response + ).model_dump_json() + ) + # each client message will be interpreted as a request for update + msg = await websocket.receive_text() + _log.debug(f"Received message: {msg}") + + except WebSocketDisconnect: + _log.info(f"WebSocket disconnected for job {task_id}") + + finally: + orchestrator.task_subscribers[task_id].remove(websocket) + + # Task result + @app.get( + "/v1alpha/result/{task_id}", + response_model=ConvertDocumentResponse, + responses={ + 200: { + "content": {"application/zip": {}}, + } + }, + ) + async def task_result( + orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)], + task_id: str, + ): + result = await orchestrator.task_result(task_id=task_id) + if result is None: + raise HTTPException( + status_code=404, + detail="Task result not found. Please wait for a completion status.", + ) + return result + return app diff --git a/docling_serve/datamodel/__init__.py b/docling_serve/datamodel/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docling_serve/datamodel/convert.py b/docling_serve/datamodel/convert.py new file mode 100644 index 0000000..5d50e18 --- /dev/null +++ b/docling_serve/datamodel/convert.py @@ -0,0 +1,174 @@ +# Define the input options for the API +from typing import Annotated, List, Optional + +from pydantic import BaseModel, Field + +from docling.datamodel.base_models import InputFormat, OutputFormat +from docling.datamodel.pipeline_options import OcrEngine, PdfBackend, TableFormerMode +from docling_core.types.doc import ImageRefMode + + +class ConvertDocumentsOptions(BaseModel): + from_formats: Annotated[ + List[InputFormat], + Field( + description=( + "Input format(s) to convert from. String or list of strings. " + f"Allowed values: {', '.join([v.value for v in InputFormat])}. " + "Optional, defaults to all formats." + ), + examples=[[v.value for v in InputFormat]], + ), + ] = list(InputFormat) + + to_formats: Annotated[ + List[OutputFormat], + Field( + description=( + "Output format(s) to convert to. String or list of strings. " + f"Allowed values: {', '.join([v.value for v in OutputFormat])}. " + "Optional, defaults to Markdown." + ), + examples=[[OutputFormat.MARKDOWN]], + ), + ] = [OutputFormat.MARKDOWN] + + image_export_mode: Annotated[ + ImageRefMode, + Field( + description=( + "Image export mode for the document (in case of JSON," + " Markdown or HTML). " + f"Allowed values: {', '.join([v.value for v in ImageRefMode])}. " + "Optional, defaults to Embedded." + ), + examples=[ImageRefMode.EMBEDDED.value], + # pattern="embedded|placeholder|referenced", + ), + ] = ImageRefMode.EMBEDDED + + do_ocr: Annotated[ + bool, + Field( + description=( + "If enabled, the bitmap content will be processed using OCR. " + "Boolean. Optional, defaults to true" + ), + # examples=[True], + ), + ] = True + + force_ocr: Annotated[ + bool, + Field( + description=( + "If enabled, replace existing text with OCR-generated " + "text over content. Boolean. Optional, defaults to false." + ), + # examples=[False], + ), + ] = False + + # TODO: use a restricted list based on what is installed on the system + ocr_engine: Annotated[ + OcrEngine, + Field( + description=( + "The OCR engine to use. String. " + "Allowed values: easyocr, tesseract, rapidocr. " + "Optional, defaults to easyocr." + ), + examples=[OcrEngine.EASYOCR], + ), + ] = OcrEngine.EASYOCR + + ocr_lang: Annotated[ + Optional[List[str]], + Field( + description=( + "List of languages used by the OCR engine. " + "Note that each OCR engine has " + "different values for the language names. String or list of strings. " + "Optional, defaults to empty." + ), + examples=[["fr", "de", "es", "en"]], + ), + ] = None + + pdf_backend: Annotated[ + PdfBackend, + Field( + description=( + "The PDF backend to use. String. " + f"Allowed values: {', '.join([v.value for v in PdfBackend])}. " + f"Optional, defaults to {PdfBackend.DLPARSE_V2.value}." + ), + examples=[PdfBackend.DLPARSE_V2], + ), + ] = PdfBackend.DLPARSE_V2 + + table_mode: Annotated[ + TableFormerMode, + Field( + TableFormerMode.FAST, + description=( + "Mode to use for table structure, String. " + f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. " + "Optional, defaults to fast." + ), + examples=[TableFormerMode.FAST], + # pattern="fast|accurate", + ), + ] = TableFormerMode.FAST + + abort_on_error: Annotated[ + bool, + Field( + description=( + "Abort on error if enabled. Boolean. Optional, defaults to false." + ), + # examples=[False], + ), + ] = False + + return_as_file: Annotated[ + bool, + Field( + description=( + "Return the output as a zip file " + "(will happen anyway if multiple files are generated). " + "Boolean. Optional, defaults to false." + ), + examples=[False], + ), + ] = False + + do_table_structure: Annotated[ + bool, + Field( + description=( + "If enabled, the table structure will be extracted. " + "Boolean. Optional, defaults to true." + ), + examples=[True], + ), + ] = True + + include_images: Annotated[ + bool, + Field( + description=( + "If enabled, images will be extracted from the document. " + "Boolean. Optional, defaults to true." + ), + examples=[True], + ), + ] = True + + images_scale: Annotated[ + float, + Field( + description="Scale factor for images. Float. Optional, defaults to 2.0.", + examples=[2.0], + ), + ] = 2.0 diff --git a/docling_serve/datamodel/engines.py b/docling_serve/datamodel/engines.py new file mode 100644 index 0000000..ea50538 --- /dev/null +++ b/docling_serve/datamodel/engines.py @@ -0,0 +1,30 @@ +import enum +from typing import Optional + +from pydantic import BaseModel + +from docling_serve.datamodel.requests import ConvertDocumentsRequest +from docling_serve.datamodel.responses import ConvertDocumentResponse + + +class TaskStatus(str, enum.Enum): + SUCCESS = "success" + PENDING = "pending" + STARTED = "started" + FAILURE = "failure" + + +class AsyncEngine(str, enum.Enum): + LOCAL = "local" + + +class Task(BaseModel): + task_id: str + task_status: TaskStatus = TaskStatus.PENDING + request: Optional[ConvertDocumentsRequest] + result: Optional[ConvertDocumentResponse] = None + + def is_completed(self) -> bool: + if self.task_status in [TaskStatus.SUCCESS, TaskStatus.FAILURE]: + return True + return False diff --git a/docling_serve/datamodel/requests.py b/docling_serve/datamodel/requests.py new file mode 100644 index 0000000..129e46f --- /dev/null +++ b/docling_serve/datamodel/requests.py @@ -0,0 +1,62 @@ +import base64 +from io import BytesIO +from typing import Annotated, Any, Dict, List, Union + +from pydantic import BaseModel, Field + +from docling.datamodel.base_models import DocumentStream + +from docling_serve.datamodel.convert import ConvertDocumentsOptions + + +class DocumentsConvertBase(BaseModel): + options: ConvertDocumentsOptions = ConvertDocumentsOptions() + + +class HttpSource(BaseModel): + url: Annotated[ + str, + Field( + description="HTTP url to process", + examples=["https://arxiv.org/pdf/2206.01062"], + ), + ] + headers: Annotated[ + Dict[str, Any], + Field( + description="Additional headers used to fetch the urls, " + "e.g. authorization, agent, etc" + ), + ] = {} + + +class FileSource(BaseModel): + base64_string: Annotated[ + str, + Field( + description="Content of the file serialized in base64. " + "For example it can be obtained via " + "`base64 -w 0 /path/to/file/pdf-to-convert.pdf`." + ), + ] + filename: Annotated[ + str, + Field(description="Filename of the uploaded document", examples=["file.pdf"]), + ] + + def to_document_stream(self) -> DocumentStream: + buf = BytesIO(base64.b64decode(self.base64_string)) + return DocumentStream(stream=buf, name=self.filename) + + +class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase): + http_sources: List[HttpSource] + + +class ConvertDocumentFileSourcesRequest(DocumentsConvertBase): + file_sources: List[FileSource] + + +ConvertDocumentsRequest = Union[ + ConvertDocumentFileSourcesRequest, ConvertDocumentHttpSourcesRequest +] diff --git a/docling_serve/datamodel/responses.py b/docling_serve/datamodel/responses.py new file mode 100644 index 0000000..a6ee9c2 --- /dev/null +++ b/docling_serve/datamodel/responses.py @@ -0,0 +1,52 @@ +import enum +from typing import Dict, List, Optional + +from pydantic import BaseModel + +from docling.datamodel.document import ConversionStatus, ErrorItem +from docling.utils.profiling import ProfilingItem +from docling_core.types.doc import DoclingDocument + + +# Status +class HealthCheckResponse(BaseModel): + status: str = "ok" + + +class DocumentResponse(BaseModel): + filename: str + md_content: Optional[str] = None + json_content: Optional[DoclingDocument] = None + html_content: Optional[str] = None + text_content: Optional[str] = None + doctags_content: Optional[str] = None + + +class ConvertDocumentResponse(BaseModel): + document: DocumentResponse + status: ConversionStatus + errors: List[ErrorItem] = [] + processing_time: float + timings: Dict[str, ProfilingItem] = {} + + +class ConvertDocumentErrorResponse(BaseModel): + status: ConversionStatus + + +class TaskStatusResponse(BaseModel): + task_id: str + task_status: str + task_position: Optional[int] = None + + +class MessageKind(str, enum.Enum): + CONNECTION = "connection" + UPDATE = "update" + ERROR = "error" + + +class WebsocketMessage(BaseModel): + message: MessageKind + task: Optional[TaskStatusResponse] = None + error: Optional[str] = None diff --git a/docling_serve/docling_conversion.py b/docling_serve/docling_conversion.py index aee2c76..7f2ab14 100644 --- a/docling_serve/docling_conversion.py +++ b/docling_serve/docling_conversion.py @@ -1,27 +1,16 @@ -import base64 import hashlib import json import logging -from io import BytesIO from pathlib import Path -from typing import ( - Annotated, - Any, - Dict, - Iterable, - Iterator, - List, - Optional, - Tuple, - Type, - Union, -) +from typing import Any, Dict, Iterable, Iterator, Optional, Tuple, Type, Union + +from fastapi import HTTPException from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend -from docling.datamodel.base_models import DocumentStream, InputFormat, OutputFormat +from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( EasyOcrOptions, @@ -35,235 +24,14 @@ from docling.datamodel.pipeline_options import ( ) from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption from docling_core.types.doc import ImageRefMode -from fastapi import HTTPException -from pydantic import BaseModel, Field +from docling_serve.datamodel.convert import ConvertDocumentsOptions from docling_serve.helper_functions import _to_list_of_strings from docling_serve.settings import docling_serve_settings _log = logging.getLogger(__name__) -# Define the input options for the API -class ConvertDocumentsOptions(BaseModel): - from_formats: Annotated[ - List[InputFormat], - Field( - description=( - "Input format(s) to convert from. String or list of strings. " - f"Allowed values: {', '.join([v.value for v in InputFormat])}. " - "Optional, defaults to all formats." - ), - examples=[[v.value for v in InputFormat]], - ), - ] = list(InputFormat) - - to_formats: Annotated[ - List[OutputFormat], - Field( - description=( - "Output format(s) to convert to. String or list of strings. " - f"Allowed values: {', '.join([v.value for v in OutputFormat])}. " - "Optional, defaults to Markdown." - ), - examples=[[OutputFormat.MARKDOWN]], - ), - ] = [OutputFormat.MARKDOWN] - - image_export_mode: Annotated[ - ImageRefMode, - Field( - description=( - "Image export mode for the document (in case of JSON," - " Markdown or HTML). " - f"Allowed values: {', '.join([v.value for v in ImageRefMode])}. " - "Optional, defaults to Embedded." - ), - examples=[ImageRefMode.EMBEDDED.value], - # pattern="embedded|placeholder|referenced", - ), - ] = ImageRefMode.EMBEDDED - - do_ocr: Annotated[ - bool, - Field( - description=( - "If enabled, the bitmap content will be processed using OCR. " - "Boolean. Optional, defaults to true" - ), - # examples=[True], - ), - ] = True - - force_ocr: Annotated[ - bool, - Field( - description=( - "If enabled, replace existing text with OCR-generated " - "text over content. Boolean. Optional, defaults to false." - ), - # examples=[False], - ), - ] = False - - # TODO: use a restricted list based on what is installed on the system - ocr_engine: Annotated[ - OcrEngine, - Field( - description=( - "The OCR engine to use. String. " - "Allowed values: easyocr, tesseract, rapidocr. " - "Optional, defaults to easyocr." - ), - examples=[OcrEngine.EASYOCR], - ), - ] = OcrEngine.EASYOCR - - ocr_lang: Annotated[ - Optional[List[str]], - Field( - description=( - "List of languages used by the OCR engine. " - "Note that each OCR engine has " - "different values for the language names. String or list of strings. " - "Optional, defaults to empty." - ), - examples=[["fr", "de", "es", "en"]], - ), - ] = None - - pdf_backend: Annotated[ - PdfBackend, - Field( - description=( - "The PDF backend to use. String. " - f"Allowed values: {', '.join([v.value for v in PdfBackend])}. " - f"Optional, defaults to {PdfBackend.DLPARSE_V2.value}." - ), - examples=[PdfBackend.DLPARSE_V2], - ), - ] = PdfBackend.DLPARSE_V2 - - table_mode: Annotated[ - TableFormerMode, - Field( - TableFormerMode.FAST, - description=( - "Mode to use for table structure, String. " - f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. " - "Optional, defaults to fast." - ), - examples=[TableFormerMode.FAST], - # pattern="fast|accurate", - ), - ] = TableFormerMode.FAST - - abort_on_error: Annotated[ - bool, - Field( - description=( - "Abort on error if enabled. Boolean. Optional, defaults to false." - ), - # examples=[False], - ), - ] = False - - return_as_file: Annotated[ - bool, - Field( - description=( - "Return the output as a zip file " - "(will happen anyway if multiple files are generated). " - "Boolean. Optional, defaults to false." - ), - examples=[False], - ), - ] = False - - do_table_structure: Annotated[ - bool, - Field( - description=( - "If enabled, the table structure will be extracted. " - "Boolean. Optional, defaults to true." - ), - examples=[True], - ), - ] = True - - include_images: Annotated[ - bool, - Field( - description=( - "If enabled, images will be extracted from the document. " - "Boolean. Optional, defaults to true." - ), - examples=[True], - ), - ] = True - - images_scale: Annotated[ - float, - Field( - description="Scale factor for images. Float. Optional, defaults to 2.0.", - examples=[2.0], - ), - ] = 2.0 - - -class DocumentsConvertBase(BaseModel): - options: ConvertDocumentsOptions = ConvertDocumentsOptions() - - -class HttpSource(BaseModel): - url: Annotated[ - str, - Field( - description="HTTP url to process", - examples=["https://arxiv.org/pdf/2206.01062"], - ), - ] - headers: Annotated[ - Dict[str, Any], - Field( - description="Additional headers used to fetch the urls, " - "e.g. authorization, agent, etc" - ), - ] = {} - - -class FileSource(BaseModel): - base64_string: Annotated[ - str, - Field( - description="Content of the file serialized in base64. " - "For example it can be obtained via " - "`base64 -w 0 /path/to/file/pdf-to-convert.pdf`." - ), - ] - filename: Annotated[ - str, - Field(description="Filename of the uploaded document", examples=["file.pdf"]), - ] - - def to_document_stream(self) -> DocumentStream: - buf = BytesIO(base64.b64decode(self.base64_string)) - return DocumentStream(stream=buf, name=self.filename) - - -class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase): - http_sources: List[HttpSource] - - -class ConvertDocumentFileSourcesRequest(DocumentsConvertBase): - file_sources: List[FileSource] - - -ConvertDocumentsRequest = Union[ - ConvertDocumentFileSourcesRequest, ConvertDocumentHttpSourcesRequest -] - - # Document converters will be preloaded and stored in a dictionary converters: Dict[bytes, DocumentConverter] = {} diff --git a/docling_serve/engines/__init__.py b/docling_serve/engines/__init__.py new file mode 100644 index 0000000..d2adb7e --- /dev/null +++ b/docling_serve/engines/__init__.py @@ -0,0 +1,8 @@ +from functools import lru_cache + +from docling_serve.engines.async_local.orchestrator import AsyncLocalOrchestrator + + +@lru_cache +def get_orchestrator() -> AsyncLocalOrchestrator: + return AsyncLocalOrchestrator() diff --git a/docling_serve/engines/async_local/__init__.py b/docling_serve/engines/async_local/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docling_serve/engines/async_local/orchestrator.py b/docling_serve/engines/async_local/orchestrator.py new file mode 100644 index 0000000..8de5f50 --- /dev/null +++ b/docling_serve/engines/async_local/orchestrator.py @@ -0,0 +1,101 @@ +import asyncio +import logging +import uuid +from typing import Dict, List, Optional, Set + +from fastapi import WebSocket + +from docling_serve.datamodel.engines import Task, TaskStatus +from docling_serve.datamodel.requests import ConvertDocumentsRequest +from docling_serve.datamodel.responses import ( + MessageKind, + TaskStatusResponse, + WebsocketMessage, +) +from docling_serve.engines.async_local.worker import AsyncLocalWorker +from docling_serve.engines.base_orchestrator import BaseOrchestrator +from docling_serve.settings import docling_serve_settings + +_log = logging.getLogger(__name__) + + +class OrchestratorError(Exception): + pass + + +class TaskNotFoundError(OrchestratorError): + pass + + +class AsyncLocalOrchestrator(BaseOrchestrator): + def __init__(self): + self.task_queue = asyncio.Queue() + self.tasks: Dict[str, Task] = {} + self.queue_list: List[str] = [] + self.task_subscribers: Dict[str, Set[WebSocket]] = {} + + async def enqueue(self, request: ConvertDocumentsRequest) -> Task: + task_id = str(uuid.uuid4()) + task = Task(task_id=task_id, request=request) + self.tasks[task_id] = task + self.queue_list.append(task_id) + self.task_subscribers[task_id] = set() + await self.task_queue.put(task_id) + return task + + async def queue_size(self) -> int: + return self.task_queue.qsize() + + async def get_queue_position(self, task_id: str) -> Optional[int]: + return ( + self.queue_list.index(task_id) + 1 if task_id in self.queue_list else None + ) + + async def task_status(self, task_id: str, wait: float = 0.0) -> Task: + if task_id not in self.tasks: + raise TaskNotFoundError() + return self.tasks[task_id] + + async def task_result(self, task_id: str): + if task_id not in self.tasks: + raise TaskNotFoundError() + return self.tasks[task_id].result + + async def process_queue(self): + # Create a pool of workers + workers = [] + for i in range(docling_serve_settings.eng_loc_num_workers): + _log.debug(f"Starting worker {i}") + w = AsyncLocalWorker(i, self) + worker_task = asyncio.create_task(w.loop()) + workers.append(worker_task) + + # Wait for all workers to complete (they won't, as they run indefinitely) + await asyncio.gather(*workers) + _log.debug("All workers completed.") + + async def notify_task_subscribers(self, task_id: str): + if task_id not in self.task_subscribers: + raise RuntimeError(f"Task {task_id} does not have a subscribers list.") + + task = self.tasks[task_id] + task_queue_position = await self.get_queue_position(task_id) + msg = TaskStatusResponse( + task_id=task.task_id, + task_status=task.task_status, + task_position=task_queue_position, + ) + for websocket in self.task_subscribers[task_id]: + await websocket.send_text( + WebsocketMessage(message=MessageKind.UPDATE, task=msg).model_dump_json() + ) + if task.is_completed(): + await websocket.close() + + async def notify_queue_positions(self): + for task_id in self.task_subscribers.keys(): + # notify only pending tasks + if self.tasks[task_id].task_status != TaskStatus.PENDING: + continue + + await self.notify_task_subscribers(task_id) diff --git a/docling_serve/engines/async_local/worker.py b/docling_serve/engines/async_local/worker.py new file mode 100644 index 0000000..c3258fe --- /dev/null +++ b/docling_serve/engines/async_local/worker.py @@ -0,0 +1,116 @@ +import asyncio +import logging +import time +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from fastapi import BackgroundTasks + +from docling.datamodel.base_models import DocumentStream + +from docling_serve.datamodel.engines import TaskStatus +from docling_serve.datamodel.requests import ConvertDocumentFileSourcesRequest +from docling_serve.datamodel.responses import ConvertDocumentResponse +from docling_serve.docling_conversion import convert_documents +from docling_serve.response_preparation import process_results + +if TYPE_CHECKING: + from docling_serve.engines.async_local.orchestrator import AsyncLocalOrchestrator + +_log = logging.getLogger(__name__) + + +class AsyncLocalWorker: + def __init__(self, worker_id: int, orchestrator: "AsyncLocalOrchestrator"): + self.worker_id = worker_id + self.orchestrator = orchestrator + + async def loop(self): + _log.debug(f"Starting loop for worker {self.worker_id}") + while True: + task_id: str = await self.orchestrator.task_queue.get() + self.orchestrator.queue_list.remove(task_id) + + if task_id not in self.orchestrator.tasks: + raise RuntimeError(f"Task {task_id} not found.") + task = self.orchestrator.tasks[task_id] + + try: + task.task_status = TaskStatus.STARTED + _log.info(f"Worker {self.worker_id} processing task {task_id}") + + # Notify clients about task updates + await self.orchestrator.notify_task_subscribers(task_id) + + # Notify clients about queue updates + await self.orchestrator.notify_queue_positions() + + # Get the current event loop + asyncio.get_event_loop() + + # Define a callback function to send progress updates to the client. + # TODO: send partial updates, e.g. when a document in the batch is done + def run_conversion(): + sources: List[Union[str, DocumentStream]] = [] + headers: Optional[Dict[str, Any]] = None + if isinstance(task.request, ConvertDocumentFileSourcesRequest): + for file_source in task.request.file_sources: + sources.append(file_source.to_document_stream()) + else: + for http_source in task.request.http_sources: + sources.append(http_source.url) + if headers is None and http_source.headers: + headers = http_source.headers + + # Note: results are only an iterator->lazy evaluation + results = convert_documents( + sources=sources, + options=task.request.options, + headers=headers, + ) + + # The real processing will happen here + response = process_results( + background_tasks=BackgroundTasks(), + conversion_options=task.request.options, + conv_results=results, + ) + + return response + + # Run the prediction in a thread to avoid blocking the event loop. + start_time = time.monotonic() + # future = asyncio.run_coroutine_threadsafe( + # run_conversion(), + # loop=loop + # ) + # response = future.result() + + response = await asyncio.to_thread( + run_conversion, + ) + processing_time = time.monotonic() - start_time + + if not isinstance(response, ConvertDocumentResponse): + _log.error( + f"Worker {self.worker_id} got un-processable " + "result for {task_id}: {type(response)}" + ) + task.result = response + task.request = None + + task.task_status = TaskStatus.SUCCESS + _log.info( + f"Worker {self.worker_id} completed job {task_id} " + f"in {processing_time:.2f} seconds" + ) + + except Exception as e: + _log.error( + f"Worker {self.worker_id} failed to process job {task_id}: {e}" + ) + task.task_status = TaskStatus.FAILURE + + finally: + await self.orchestrator.notify_task_subscribers(task_id) + self.orchestrator.task_queue.task_done() + _log.debug(f"Worker {self.worker_id} completely done with {task_id}") diff --git a/docling_serve/engines/base_orchestrator.py b/docling_serve/engines/base_orchestrator.py new file mode 100644 index 0000000..c482c46 --- /dev/null +++ b/docling_serve/engines/base_orchestrator.py @@ -0,0 +1,21 @@ +from abc import ABC, abstractmethod + +from docling_serve.datamodel.engines import Task + + +class BaseOrchestrator(ABC): + @abstractmethod + async def enqueue(self, task) -> Task: + pass + + @abstractmethod + async def queue_size(self) -> int: + pass + + @abstractmethod + async def task_status(self, task_id: str) -> Task: + pass + + @abstractmethod + async def task_result(self, task_id: str): + pass diff --git a/docling_serve/engines/block_local/__init__.py b/docling_serve/engines/block_local/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docling_serve/gradio_ui.py b/docling_serve/gradio_ui.py index 3fd6aa0..9c328b7 100644 --- a/docling_serve/gradio_ui.py +++ b/docling_serve/gradio_ui.py @@ -333,7 +333,6 @@ with gr.Blocks( title="Docling Serve", delete_cache=(3600, 3600), # Delete all files older than 1 hour every hour ) as ui: - # Constants stored in states to be able to pass them as inputs to functions processing_text = gr.State("Processing your document(s), please wait...") true_bool = gr.State(True) @@ -593,9 +592,7 @@ with gr.Blocks( set_outputs_visibility_direct, inputs=[false_bool, false_bool], outputs=[content_output, file_output], - ).then( - clear_url_input, inputs=None, outputs=[url_input] - ) + ).then(clear_url_input, inputs=None, outputs=[url_input]) # File processing file_process_btn.click( @@ -664,6 +661,4 @@ with gr.Blocks( set_outputs_visibility_direct, inputs=[false_bool, false_bool], outputs=[content_output, file_output], - ).then( - clear_file_input, inputs=None, outputs=[file_input] - ) + ).then(clear_file_input, inputs=None, outputs=[file_input]) diff --git a/docling_serve/response_preparation.py b/docling_serve/response_preparation.py index 571d76f..ecf334f 100644 --- a/docling_serve/response_preparation.py +++ b/docling_serve/response_preparation.py @@ -4,42 +4,21 @@ import shutil import tempfile import time from pathlib import Path -from typing import Dict, Iterable, List, Optional, Union +from typing import Iterable, Union -from docling.datamodel.base_models import OutputFormat -from docling.datamodel.document import ConversionResult, ConversionStatus, ErrorItem -from docling.utils.profiling import ProfilingItem -from docling_core.types.doc import DoclingDocument, ImageRefMode from fastapi import BackgroundTasks, HTTPException from fastapi.responses import FileResponse -from pydantic import BaseModel -from docling_serve.docling_conversion import ConvertDocumentsOptions +from docling.datamodel.base_models import OutputFormat +from docling.datamodel.document import ConversionResult, ConversionStatus +from docling_core.types.doc import ImageRefMode + +from docling_serve.datamodel.convert import ConvertDocumentsOptions +from docling_serve.datamodel.responses import ConvertDocumentResponse, DocumentResponse _log = logging.getLogger(__name__) -class DocumentResponse(BaseModel): - filename: str - md_content: Optional[str] = None - json_content: Optional[DoclingDocument] = None - html_content: Optional[str] = None - text_content: Optional[str] = None - doctags_content: Optional[str] = None - - -class ConvertDocumentResponse(BaseModel): - document: DocumentResponse - status: ConversionStatus - errors: List[ErrorItem] = [] - processing_time: float - timings: Dict[str, ProfilingItem] = {} - - -class ConvertDocumentErrorResponse(BaseModel): - status: ConversionStatus - - def _export_document_as_content( conv_res: ConversionResult, export_json: bool, @@ -49,7 +28,6 @@ def _export_document_as_content( export_doctags: bool, image_mode: ImageRefMode, ): - document = DocumentResponse(filename=conv_res.input.file.name) if conv_res.status == ConversionStatus.SUCCESS: @@ -86,7 +64,6 @@ def _export_documents_as_files( export_doctags: bool, image_export_mode: ImageRefMode, ): - success_count = 0 failure_count = 0 @@ -150,7 +127,6 @@ def process_results( conversion_options: ConvertDocumentsOptions, conv_results: Iterable[ConversionResult], ) -> Union[ConvertDocumentResponse, FileResponse]: - # Let's start by processing the documents try: start_time = time.monotonic() diff --git a/docling_serve/settings.py b/docling_serve/settings.py index 0e52bc6..b82e204 100644 --- a/docling_serve/settings.py +++ b/docling_serve/settings.py @@ -3,6 +3,8 @@ from typing import Optional, Union from pydantic_settings import BaseSettings, SettingsConfigDict +from docling_serve.datamodel.engines import AsyncEngine + class UvicornSettings(BaseSettings): model_config = SettingsConfigDict( @@ -28,6 +30,9 @@ class DoclingServeSettings(BaseSettings): enable_ui: bool = False artifacts_path: Optional[Path] = None + eng_kind: AsyncEngine = AsyncEngine.LOCAL + eng_loc_num_workers: int = 2 + uvicorn_settings = UvicornSettings() docling_serve_settings = DoclingServeSettings() diff --git a/pyproject.toml b/pyproject.toml index 124c1e0..e43455b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ classifiers = [ ] requires-python = ">=3.10" dependencies = [ - "docling~=2.23", + "docling~=2.25.1", "fastapi[standard]~=0.115", "httpx~=0.28", "pydantic~=2.10", @@ -38,6 +38,7 @@ dependencies = [ "python-multipart>=0.0.14,<0.1.0", "typer~=0.12", "uvicorn[standard]>=0.29.0,<1.0.0", + "websockets~=14.0", ] [project.optional-dependencies] @@ -164,9 +165,19 @@ ignore = [ [tool.ruff.lint.mccabe] max-complexity = 15 +[tool.ruff.lint.isort.sections] +"docling" = ["docling", "docling_core"] + [tool.ruff.lint.isort] combine-as-imports = true -known-third-party = ["docling", "docling_core"] +section-order = [ + "future", + "standard-library", + "third-party", + "docling", + "first-party", + "local-folder", +] [tool.mypy] pretty = true @@ -180,10 +191,6 @@ module = [ "easyocr.*", "tesserocr.*", "rapidocr_onnxruntime.*", - "docling_conversion.*", - "gradio_ui.*", - "response_preparation.*", - "helper_functions.*", "requests.*", ] ignore_missing_imports = true diff --git a/tests/test_1-file-all-outputs.py b/tests/test_1-file-all-outputs.py index 7627aee..64887a5 100644 --- a/tests/test_1-file-all-outputs.py +++ b/tests/test_1-file-all-outputs.py @@ -89,7 +89,7 @@ async def test_convert_file(async_client): check.is_in( '{"schema_name": "DoclingDocument"', json.dumps(data["document"]["json_content"]), - msg=f"JSON document should contain '{{\\n \"schema_name\": \"DoclingDocument'\". Received: {safe_slice(data['document']['json_content'])}", + msg=f'JSON document should contain \'{{\\n "schema_name": "DoclingDocument\'". Received: {safe_slice(data["document"]["json_content"])}', ) # HTML check check.is_in( diff --git a/tests/test_1-url-all-outputs.py b/tests/test_1-url-all-outputs.py index f971b33..f096401 100644 --- a/tests/test_1-url-all-outputs.py +++ b/tests/test_1-url-all-outputs.py @@ -83,7 +83,7 @@ async def test_convert_url(async_client): check.is_in( '{"schema_name": "DoclingDocument"', json.dumps(data["document"]["json_content"]), - msg=f"JSON document should contain '{{\\n \"schema_name\": \"DoclingDocument'\". Received: {safe_slice(data['document']['json_content'])}", + msg=f'JSON document should contain \'{{\\n "schema_name": "DoclingDocument\'". Received: {safe_slice(data["document"]["json_content"])}', ) # HTML check check.is_in( diff --git a/tests/test_1-url-async-ws.py b/tests/test_1-url-async-ws.py new file mode 100644 index 0000000..ccec51a --- /dev/null +++ b/tests/test_1-url-async-ws.py @@ -0,0 +1,48 @@ +import base64 +from pathlib import Path + +import httpx +import pytest +import pytest_asyncio +from websockets.sync.client import connect + + +@pytest_asyncio.fixture +async def async_client(): + async with httpx.AsyncClient(timeout=60.0) as client: + yield client + + +@pytest.mark.asyncio +async def test_convert_url(async_client: httpx.AsyncClient): + """Test convert URL to all outputs""" + + doc_filename = Path("tests/2408.09869v5.pdf") + encoded_doc = base64.b64encode(doc_filename.read_bytes()).decode() + + base_url = "http://localhost:5001/v1alpha" + payload = { + "options": { + "to_formats": ["md", "json"], + "image_export_mode": "placeholder", + "ocr": True, + "abort_on_error": False, + "return_as_file": False, + }, + # "http_sources": [{"url": "https://arxiv.org/pdf/2501.17887"}], + "file_sources": [{"base64_string": encoded_doc, "filename": doc_filename.name}], + } + # print(json.dumps(payload, indent=2)) + + for n in range(5): + response = await async_client.post( + f"{base_url}/convert/source/async", json=payload + ) + assert response.status_code == 200, "Response should be 200 OK" + + task = response.json() + + uri = f"ws://localhost:5001/v1alpha/status/ws/{task['task_id']}" + with connect(uri) as websocket: + for message in websocket: + print(message) diff --git a/tests/test_1-url-async.py b/tests/test_1-url-async.py new file mode 100644 index 0000000..6f796d7 --- /dev/null +++ b/tests/test_1-url-async.py @@ -0,0 +1,60 @@ +import json +import random +import time + +import httpx +import pytest +import pytest_asyncio + + +@pytest_asyncio.fixture +async def async_client(): + async with httpx.AsyncClient(timeout=60.0) as client: + yield client + + +@pytest.mark.asyncio +async def test_convert_url(async_client): + """Test convert URL to all outputs""" + + example_docs = [ + "https://arxiv.org/pdf/2411.19710", + "https://arxiv.org/pdf/2501.17887", + "https://www.nature.com/articles/s41467-024-50779-y.pdf", + "https://arxiv.org/pdf/2306.12802", + "https://arxiv.org/pdf/2311.18481", + ] + + base_url = "http://localhost:5001/v1alpha" + payload = { + "options": { + "to_formats": ["md", "json"], + "image_export_mode": "placeholder", + "ocr": True, + "abort_on_error": False, + "return_as_file": False, + }, + "http_sources": [{"url": random.choice(example_docs)}], + } + print(json.dumps(payload, indent=2)) + + for n in range(5): + response = await async_client.post( + f"{base_url}/convert/source/async", json=payload + ) + assert response.status_code == 200, "Response should be 200 OK" + + task = response.json() + + print(json.dumps(task, indent=2)) + + while task["task_status"] not in ("success", "failure"): + response = await async_client.get(f"{base_url}/status/poll/{task['task_id']}") + assert response.status_code == 200, "Response should be 200 OK" + task = response.json() + print(f"{task['task_status']=}") + print(f"{task['task_position']=}") + + time.sleep(2) + + assert task["task_status"] == "success" diff --git a/tests/test_2-files-all-outputs.py b/tests/test_2-files-all-outputs.py index 58fc33d..db2a5e9 100644 --- a/tests/test_2-files-all-outputs.py +++ b/tests/test_2-files-all-outputs.py @@ -57,18 +57,18 @@ async def test_convert_file(async_client): content_disposition = response.headers.get("content-disposition") with check: - assert ( - content_disposition is not None - ), "Content-Disposition header should be present" + assert content_disposition is not None, ( + "Content-Disposition header should be present" + ) with check: assert "attachment" in content_disposition, "Response should be an attachment" with check: - assert ( - 'filename="converted_docs.zip"' in content_disposition - ), "Attachment filename should be 'converted_docs.zip'" + assert 'filename="converted_docs.zip"' in content_disposition, ( + "Attachment filename should be 'converted_docs.zip'" + ) content_type = response.headers.get("content-type") with check: - assert ( - content_type == "application/zip" - ), "Content-Type should be 'application/zip'" + assert content_type == "application/zip", ( + "Content-Type should be 'application/zip'" + ) diff --git a/tests/test_2-urls-all-outputs.py b/tests/test_2-urls-all-outputs.py index 5ff75dc..d5d83c8 100644 --- a/tests/test_2-urls-all-outputs.py +++ b/tests/test_2-urls-all-outputs.py @@ -50,18 +50,18 @@ async def test_convert_url(async_client): content_disposition = response.headers.get("content-disposition") with check: - assert ( - content_disposition is not None - ), "Content-Disposition header should be present" + assert content_disposition is not None, ( + "Content-Disposition header should be present" + ) with check: assert "attachment" in content_disposition, "Response should be an attachment" with check: - assert ( - 'filename="converted_docs.zip"' in content_disposition - ), "Attachment filename should be 'converted_docs.zip'" + assert 'filename="converted_docs.zip"' in content_disposition, ( + "Attachment filename should be 'converted_docs.zip'" + ) content_type = response.headers.get("content-type") with check: - assert ( - content_type == "application/zip" - ), "Content-Type should be 'application/zip'" + assert content_type == "application/zip", ( + "Content-Type should be 'application/zip'" + ) diff --git a/uv.lock b/uv.lock index bc49c5e..0040723 100644 --- a/uv.lock +++ b/uv.lock @@ -349,38 +349,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2e/38/3fd83c4690dc7d753a442a284b3826ea5e5c380a411443c66421cd823898/cryptography-44.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7", size = 3134657 }, ] -[[package]] -name = "deepsearch-glm" -version = "1.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pywin32", marker = "sys_platform == 'win32' or (extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124')" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/73/d5/a907234e57f5c4f6480c9ddbc3cdacc47f727c768e502be3d361719fac4e/deepsearch_glm-1.0.0.tar.gz", hash = "sha256:e8dce88ac519a693c260f28bd3c4ec409811e65ade84fb508f6c6e37ca065e62", size = 2401014 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/40/65/4b2013784d5ed8d3664a2efa61f15600c8bf090766b0363c036d78aca550/deepsearch_glm-1.0.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:94792b57df7a1c4ba8b47ebd8f36ea0a090d4f27a4fba39bd7b166b6b537260a", size = 6303790 }, - { url = "https://files.pythonhosted.org/packages/45/2a/1e95260a712948a21b74dcb239032d9e612f7e1a273657008655749f4115/deepsearch_glm-1.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ff46e352e96a2f56ce7ae4fdf04b271ee841c29ff159b1dec0e5ecaaadba8d4d", size = 5945851 }, - { url = "https://files.pythonhosted.org/packages/9e/1a/5c37a98f27644fd02bc447df651e8d5ce484cd6ce7cb178218625b4de5bc/deepsearch_glm-1.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d77d3d94d49641888aa15f3ad23e81158e791aa9d9608dd8168dc71788e56f3", size = 7431282 }, - { url = "https://files.pythonhosted.org/packages/e8/e2/56b5e7ae3ccc4d8ee758427c8c9a403c985e250a468c53538c269897bef2/deepsearch_glm-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:143de0fd111a570be12935d8799a2715fe1775d4dc4e256337860b429cee5d36", size = 7759571 }, - { url = "https://files.pythonhosted.org/packages/61/f4/e39a5090a2bf0d641449918865566ad5adabef156993a922bdbf4a3ebb60/deepsearch_glm-1.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:9f2872dd573cd2206ce7f9e2e6016c38b66d9ecbd983283ff5e8c6023813c311", size = 7904646 }, - { url = "https://files.pythonhosted.org/packages/41/f7/8e8dd9738554f97522b59b0a6d7680ccf2d527bd3471ec4aa4e52acf552a/deepsearch_glm-1.0.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:e64d94ff5209f0a11e8c75c6b28b033ef27b95a22c2fbcbd945e7fe8cc421545", size = 6309301 }, - { url = "https://files.pythonhosted.org/packages/17/37/4d8514d8ef851e44513a71f675a7ebb373f109aece38e324c7d444ced20c/deepsearch_glm-1.0.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:a5702205677b768b51f881d15d933370f6ef3c826dfac3b9aa0b904d2e6c495a", size = 5951522 }, - { url = "https://files.pythonhosted.org/packages/0c/c6/3680318e66df278fa7f0811dc862d6cb3c328ce168b4f36736eb77120b6d/deepsearch_glm-1.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0417a2ae998e1709f03458cfb9adb55423bb1328224eb055300796baa757879f", size = 7434315 }, - { url = "https://files.pythonhosted.org/packages/c3/cd/9ffb616d347d568f868f47585b3261c16e277aa7b37740e8720eee71c539/deepsearch_glm-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f0e1efe9af0d28e9b473fe599246deb3a0be7c3d546a478da284747144d086a", size = 7761264 }, - { url = "https://files.pythonhosted.org/packages/3d/d3/e5ebdda9cee8a1c846e6a960a0e5b97624aff2f248c2bc89ae490b9a1342/deepsearch_glm-1.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:807faf13eb0deea55a1951d479a85d5e20de0ff8b2e0b57b2f7939552759a426", size = 7908603 }, - { url = "https://files.pythonhosted.org/packages/60/ca/6adbadc979910b11594cd0242f1991942c22528eead431d47de064ac2860/deepsearch_glm-1.0.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:56d9575df9eceb8c2ae33e3d15e133924cc195714c3d268599b6f8414c1f6bb8", size = 6308715 }, - { url = "https://files.pythonhosted.org/packages/20/7c/bf1e9c458705c7143c6630cb6847554ad694d25dc6f1f038512b9c86160a/deepsearch_glm-1.0.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:51f5c6522f60ba73eb12eeb7217bd98d871ba7c078337a4059d05878d8baf2d6", size = 5949609 }, - { url = "https://files.pythonhosted.org/packages/21/b1/eb0cd0db50d05f2d7a510a77960e85e6caee727eb3d931ed0ec067917813/deepsearch_glm-1.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6211eaf497ad7cfcb68f80f9b5387940be0204fe149a9fc03988a95145f410a", size = 7433929 }, - { url = "https://files.pythonhosted.org/packages/3a/7e/2b7db77ff02fe9eec41f3605fcd72e3eb4e6b48561b344d432b417a75cfe/deepsearch_glm-1.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b003bf457fce61ea4de79e2d7d0228a1ae349f677eb6570e745f79d4429804f", size = 7760438 }, - { url = "https://files.pythonhosted.org/packages/ab/97/ffb2bb5d2432c7b0e9f3a3e6b5873fbcd6e19e82b620393bfb8e01bdecb1/deepsearch_glm-1.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9d61f66048e6ab60fe9f84c823fd593bf8517755833bd9efb59156d77a2b42d0", size = 7907583 }, - { url = "https://files.pythonhosted.org/packages/38/06/08c5fd0e1144c2c8d76d06da1545a9cf589278a37f8b9e6235b5b416eb52/deepsearch_glm-1.0.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:7d558e8b365c27ee665d0589165fd074fb252c73715f9cc6aeb4304a63683f37", size = 6308867 }, - { url = "https://files.pythonhosted.org/packages/ba/fb/f5f9787876b67ce83d5afa4903901be9f8071530bc0706dc2228afc0b6c0/deepsearch_glm-1.0.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:3199093a9472e5756214b9b6563f827c19c001c7dd8ae00e03eed1140c12930d", size = 5949719 }, - { url = "https://files.pythonhosted.org/packages/83/0f/42b5a4aa798acbc6309d748435b006c489e58102b6cb2278e7b8f0194743/deepsearch_glm-1.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f18d1ee68a0479592e0c714e6cbf9e2d0fa8edd692d580da64431c84cbef5c2", size = 7434981 }, - { url = "https://files.pythonhosted.org/packages/17/6a/c2c4eaa4470b78dde6c03f055cbb09f3f7f15b8a6ff38f5bea5180339e6f/deepsearch_glm-1.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:62c1c0ea0a544219da15c017632f9e0be116ecdc335b865c6c5760429557fe23", size = 7760773 }, - { url = "https://files.pythonhosted.org/packages/01/0a/7c3cf75bad38a8d6ff3842b78b3263dd81ad4eaf1d859f4b8e1ab465cad5/deepsearch_glm-1.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:962f393dcec2204de1a5cb0f635c65258bde2424ad2d4e0f5df770139c3958de", size = 7908766 }, - { url = "https://files.pythonhosted.org/packages/1f/cd/e6507d924aa69e9647f917ed671e2d62e19e41d4f120a15fcbb583661667/deepsearch_glm-1.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e2315cc4ffe7032dada294a0cd72a47dbc6c0121fd07d4b5719f9a9e9519d091", size = 14644989 }, -] - [[package]] name = "dill" version = "0.3.9" @@ -410,12 +378,11 @@ wheels = [ [[package]] name = "docling" -version = "2.23.0" +version = "2.25.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "beautifulsoup4" }, { name = "certifi" }, - { name = "deepsearch-glm" }, { name = "docling-core", extra = ["chunking"] }, { name = "docling-ibm-models" }, { name = "docling-parse" }, @@ -438,9 +405,9 @@ dependencies = [ { name = "tqdm" }, { name = "typer" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/94/45/807f81abeaa1a018998525a98d577508e6c0eba4173e6ec4beeee77f0644/docling-2.23.0.tar.gz", hash = "sha256:7ffde3366b01e2f1c0e47574700501a3b8667082cf3a185efe7e103b8473ee43", size = 106065 } +sdist = { url = "https://files.pythonhosted.org/packages/f9/88/b6d782d2cd7ed602d2bae1a01e87a6347a37295ad450d86159cc7c252290/docling-2.25.1.tar.gz", hash = "sha256:ba2fce77659f4ccf1c8a696531ea9f17253215dbebfac6536012bbc6d1c29ce8", size = 112676 } wheels = [ - { url = "https://files.pythonhosted.org/packages/9f/7f/79985fbb4f115f87688b2a8d18ddc14d45fcccd8989a02104ddc8cf9b02c/docling-2.23.0-py3-none-any.whl", hash = "sha256:bd3d05bf48fc842e502af9f26153c40e2fcc1df1945ec72ab8c4d5dd1f3b6528", size = 137161 }, + { url = "https://files.pythonhosted.org/packages/2a/c1/6c58516672f0f60c432ae331391b6548e4fdcb7b6a6dcd7725605284dcf7/docling-2.25.1-py3-none-any.whl", hash = "sha256:92318591342fc50781134fc553c6c57b703ce43e8095a80d59ed02206d0f560c", size = 145677 }, ] [[package]] @@ -472,14 +439,16 @@ chunking = [ [[package]] name = "docling-ibm-models" -version = "3.3.2" +version = "3.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "docling-core" }, { name = "huggingface-hub" }, { name = "jsonlines" }, { name = "numpy" }, { name = "opencv-python-headless" }, { name = "pillow" }, + { name = "pydantic" }, { name = "safetensors", extra = ["torch"] }, { name = "torch", version = "2.6.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'x86_64' and sys_platform == 'darwin' and extra == 'extra-13-docling-serve-cpu') or (platform_machine == 'x86_64' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (sys_platform != 'darwin' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124')" }, { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (extra != 'extra-13-docling-serve-cpu' and extra != 'extra-13-docling-serve-cu124')" }, @@ -492,9 +461,9 @@ dependencies = [ { name = "tqdm" }, { name = "transformers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6c/e5/7ff58b9481beb43e5b93084b784fe686be553a947c961d09cda557630dd0/docling_ibm_models-3.3.2.tar.gz", hash = "sha256:f6ed59dfb3f98a71ccdd003c13c9a868e3003c22bd5adc554197da7eec227cde", size = 66096 } +sdist = { url = "https://files.pythonhosted.org/packages/eb/a5/88d5b7c970d5e10a06062fe9e9de3cde6acdefcc1f85854f689a82863c2a/docling_ibm_models-3.4.1.tar.gz", hash = "sha256:093b4dff2ea284a4953c3aa009e29945208b8d389b94fb14940a03a93f673e96", size = 69794 } wheels = [ - { url = "https://files.pythonhosted.org/packages/84/8a/1539d9b951a761e141eeabfed7fcfa99d7ae88aa3073711d42e8df3e8d1a/docling_ibm_models-3.3.2-py3-none-any.whl", hash = "sha256:9f82a2ef73c6cd8d729ab2fcc4223079ccb8b6eec0bf0643c56e55352b97b5cb", size = 76659 }, + { url = "https://files.pythonhosted.org/packages/af/8f/0f2b823fa09d06deacbdfc6d5d7809d462ddc508f43146960083d113c4c6/docling_ibm_models-3.4.1-py3-none-any.whl", hash = "sha256:c3582c99dddfa3f0eafcf80cf1267fd8efa39c4a74cc7a88f9dd49684fac2986", size = 80886 }, ] [[package]] @@ -546,14 +515,15 @@ dependencies = [ { name = "python-multipart" }, { name = "typer" }, { name = "uvicorn", extra = ["standard"] }, + { name = "websockets" }, ] [package.optional-dependencies] cpu = [ - { name = "torch", version = "2.6.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'x86_64' and sys_platform == 'darwin' and extra == 'extra-13-docling-serve-cpu') or (platform_machine == 'x86_64' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (sys_platform != 'darwin' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124')" }, - { name = "torch", version = "2.6.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine == 'x86_64' and extra == 'extra-13-docling-serve-cpu') or (platform_machine != 'x86_64' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (sys_platform != 'darwin' and extra == 'extra-13-docling-serve-cpu')" }, - { name = "torchvision", version = "0.21.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'x86_64' and sys_platform == 'darwin' and extra == 'extra-13-docling-serve-cpu') or (platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'extra-13-docling-serve-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (sys_platform == 'darwin' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (sys_platform == 'linux' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124')" }, - { name = "torchvision", version = "0.21.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine == 'x86_64' and sys_platform == 'darwin' and extra == 'extra-13-docling-serve-cpu') or (platform_machine != 'aarch64' and sys_platform == 'linux' and extra == 'extra-13-docling-serve-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-13-docling-serve-cpu') or (sys_platform == 'darwin' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (sys_platform == 'linux' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124')" }, + { name = "torch", version = "2.6.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "platform_machine != 'x86_64' and sys_platform == 'darwin'" }, + { name = "torch", version = "2.6.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "platform_machine == 'x86_64' or sys_platform != 'darwin'" }, + { name = "torchvision", version = "0.21.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux')" }, + { name = "torchvision", version = "0.21.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] cu124 = [ { name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" } }, @@ -583,7 +553,7 @@ dev = [ [package.metadata] requires-dist = [ - { name = "docling", specifier = "~=2.23" }, + { name = "docling", specifier = "~=2.25.1" }, { name = "fastapi", extras = ["standard"], specifier = "~=0.115" }, { name = "gradio", marker = "extra == 'ui'", specifier = "~=5.9" }, { name = "httpx", specifier = "~=0.28" }, @@ -599,6 +569,7 @@ requires-dist = [ { name = "torchvision", marker = "extra == 'cu124'", specifier = ">=0.21.0", index = "https://download.pytorch.org/whl/cu124", conflict = { package = "docling-serve", extra = "cu124" } }, { name = "typer", specifier = "~=0.12" }, { name = "uvicorn", extras = ["standard"], specifier = ">=0.29.0,<1.0.0" }, + { name = "websockets", specifier = "~=14.0" }, ] provides-extras = ["ui", "tesserocr", "rapidocr", "cpu", "cu124"]