mirror of
https://github.com/docling-project/docling-serve.git
synced 2025-11-29 00:23:36 +00:00
63 lines
1.6 KiB
Python
63 lines
1.6 KiB
Python
import base64
|
|
from io import BytesIO
|
|
from typing import Annotated, Any, Dict, List, Union
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
from docling.datamodel.base_models import DocumentStream
|
|
|
|
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
|
|
|
|
|
# Common base for the conversion request payloads defined in this module.
# It contributes a single `options` field; when a request omits it, the
# field falls back to a default-constructed ConvertDocumentsOptions.
class DocumentsConvertBase(BaseModel):
    # Conversion settings applied to every source in the request.
    options: ConvertDocumentsOptions = ConvertDocumentsOptions()
|
|
|
|
|
|
# One remote document to convert: the URL to fetch plus any extra HTTP
# headers to use while fetching it.
class HttpSource(BaseModel):
    # Required: where the document lives.
    url: Annotated[
        str,
        Field(
            description="HTTP url to process",
            examples=["https://arxiv.org/pdf/2206.01062"],
        ),
    ]
    # Optional: header name -> value; an empty mapping when not supplied.
    headers: Annotated[
        Dict[str, Any],
        Field(
            description=(
                "Additional headers used to fetch the urls, "
                "e.g. authorization, agent, etc"
            ),
        ),
    ] = {}
|
|
|
|
|
|
# One inline document to convert: the file's bytes carried as a base64
# string, together with the filename to associate with them.
class FileSource(BaseModel):
    # Required: the file content, base64-encoded.
    base64_string: Annotated[
        str,
        Field(
            description=(
                "Content of the file serialized in base64. "
                "For example it can be obtained via "
                "`base64 -w 0 /path/to/file/pdf-to-convert.pdf`."
            ),
        ),
    ]
    # Required: name given to the resulting document stream.
    filename: Annotated[
        str,
        Field(
            description="Filename of the uploaded document",
            examples=["file.pdf"],
        ),
    ]

    def to_document_stream(self) -> DocumentStream:
        """Decode the base64 payload into an in-memory ``DocumentStream``.

        Returns:
            A ``DocumentStream`` whose ``name`` is ``filename`` and whose
            ``stream`` is a ``BytesIO`` over the decoded bytes.

        Raises:
            binascii.Error: Propagated from ``base64.b64decode`` when
                ``base64_string`` is malformed (e.g. incorrectly padded).
        """
        decoded = base64.b64decode(self.base64_string)
        return DocumentStream(name=self.filename, stream=BytesIO(decoded))
|
|
|
|
|
|
# Conversion request whose documents are fetched over HTTP. Inherits the
# `options` field from DocumentsConvertBase.
class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase):
    # Required: the remote documents to convert.
    http_sources: List[HttpSource]
|
|
|
|
|
|
# Conversion request whose documents are embedded in the payload as base64.
# Inherits the `options` field from DocumentsConvertBase.
class ConvertDocumentFileSourcesRequest(DocumentsConvertBase):
    # Required: the inline documents to convert.
    file_sources: List[FileSource]
|
|
|
|
|
|
# Any accepted conversion request body: either the inline-file form or the
# HTTP-source form. Member order is kept as declared.
ConvertDocumentsRequest = Union[
    ConvertDocumentFileSourcesRequest,
    ConvertDocumentHttpSourcesRequest,
]
|