feat: Async api (#60)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-03-07 11:26:50 +01:00
committed by GitHub
parent ed851c95fe
commit 82f8900197
26 changed files with 919 additions and 367 deletions

View File

@@ -89,7 +89,7 @@ async def test_convert_file(async_client):
check.is_in(
'{"schema_name": "DoclingDocument"',
json.dumps(data["document"]["json_content"]),
msg=f"JSON document should contain '{{\\n \"schema_name\": \"DoclingDocument'\". Received: {safe_slice(data['document']['json_content'])}",
msg=f'JSON document should contain \'{{\\n "schema_name": "DoclingDocument\'". Received: {safe_slice(data["document"]["json_content"])}',
)
# HTML check
check.is_in(

View File

@@ -83,7 +83,7 @@ async def test_convert_url(async_client):
check.is_in(
'{"schema_name": "DoclingDocument"',
json.dumps(data["document"]["json_content"]),
msg=f"JSON document should contain '{{\\n \"schema_name\": \"DoclingDocument'\". Received: {safe_slice(data['document']['json_content'])}",
msg=f'JSON document should contain \'{{\\n "schema_name": "DoclingDocument\'". Received: {safe_slice(data["document"]["json_content"])}',
)
# HTML check
check.is_in(

View File

@@ -0,0 +1,48 @@
import base64
from pathlib import Path
import httpx
import pytest
import pytest_asyncio
from websockets.sync.client import connect
@pytest_asyncio.fixture
async def async_client():
    """Yield an ``httpx.AsyncClient`` with a 60 second timeout.

    The client is opened with ``async with`` so it is closed once the
    test that requested the fixture has finished.
    """
    async with httpx.AsyncClient(timeout=60.0) as http_client:
        yield http_client
@pytest.mark.asyncio
async def test_convert_url(async_client: httpx.AsyncClient):
    """Submit a local PDF to the async convert endpoint and follow it over a websocket.

    The PDF is sent base64-encoded as a file source. The same payload is
    posted five times; after each submission the task's status websocket is
    opened and every message received on it is printed.
    """
    pdf_path = Path("tests/2408.09869v5.pdf")
    pdf_b64 = base64.b64encode(pdf_path.read_bytes()).decode()
    base_url = "http://localhost:5001/v1alpha"

    conversion_options = {
        "to_formats": ["md", "json"],
        "image_export_mode": "placeholder",
        "ocr": True,
        "abort_on_error": False,
        "return_as_file": False,
    }
    file_source = {"base64_string": pdf_b64, "filename": pdf_path.name}
    # An "http_sources" entry (e.g. {"url": "https://arxiv.org/pdf/2501.17887"})
    # could be sent instead of the file source.
    payload = {
        "options": conversion_options,
        "file_sources": [file_source],
    }

    for _ in range(5):
        response = await async_client.post(
            f"{base_url}/convert/source/async", json=payload
        )
        assert response.status_code == 200, "Response should be 200 OK"

        task_id = response.json()["task_id"]
        status_uri = f"ws://localhost:5001/v1alpha/status/ws/{task_id}"

        # The synchronous websockets client is used here: iterating the
        # connection yields messages until the connection is closed.
        with connect(status_uri) as websocket:
            for message in websocket:
                print(message)

60
tests/test_1-url-async.py Normal file
View File

@@ -0,0 +1,60 @@
import asyncio
import json
import random
import time

import httpx
import pytest
import pytest_asyncio
@pytest_asyncio.fixture
async def async_client():
    """Yield an ``httpx.AsyncClient`` with a 60 second timeout.

    The client is opened with ``async with`` so it is closed once the
    test that requested the fixture has finished.
    """
    async with httpx.AsyncClient(timeout=60.0) as http_client:
        yield http_client
@pytest.mark.asyncio
async def test_convert_url(async_client):
    """Submit URL conversions to the async endpoint and poll each task to completion.

    One document URL is chosen at random from a fixed list and the same
    payload is posted five times to ``/convert/source/async``. After each
    submission the task is polled through ``/status/poll/{task_id}`` until
    its ``task_status`` is ``"success"`` or ``"failure"``; the test requires
    ``"success"``.
    """
    example_docs = [
        "https://arxiv.org/pdf/2411.19710",
        "https://arxiv.org/pdf/2501.17887",
        "https://www.nature.com/articles/s41467-024-50779-y.pdf",
        "https://arxiv.org/pdf/2306.12802",
        "https://arxiv.org/pdf/2311.18481",
    ]
    base_url = "http://localhost:5001/v1alpha"
    terminal_states = ("success", "failure")
    poll_interval = 2  # seconds to wait between two status polls

    payload = {
        "options": {
            "to_formats": ["md", "json"],
            "image_export_mode": "placeholder",
            "ocr": True,
            "abort_on_error": False,
            "return_as_file": False,
        },
        "http_sources": [{"url": random.choice(example_docs)}],
    }
    print(json.dumps(payload, indent=2))

    for _ in range(5):
        response = await async_client.post(
            f"{base_url}/convert/source/async", json=payload
        )
        assert response.status_code == 200, "Response should be 200 OK"

        task = response.json()
        print(json.dumps(task, indent=2))

        while task["task_status"] not in terminal_states:
            response = await async_client.get(f"{base_url}/status/poll/{task['task_id']}")
            assert response.status_code == 200, "Response should be 200 OK"

            task = response.json()
            print(f"{task['task_status']=}")
            print(f"{task['task_position']=}")

            if task["task_status"] not in terminal_states:
                # Wait without blocking the event loop: time.sleep() inside a
                # coroutine would stall every other task on the loop.
                await asyncio.sleep(poll_interval)

        assert task["task_status"] == "success"

View File

@@ -57,18 +57,18 @@ async def test_convert_file(async_client):
content_disposition = response.headers.get("content-disposition")
with check:
assert (
content_disposition is not None
), "Content-Disposition header should be present"
assert content_disposition is not None, (
"Content-Disposition header should be present"
)
with check:
assert "attachment" in content_disposition, "Response should be an attachment"
with check:
assert (
'filename="converted_docs.zip"' in content_disposition
), "Attachment filename should be 'converted_docs.zip'"
assert 'filename="converted_docs.zip"' in content_disposition, (
"Attachment filename should be 'converted_docs.zip'"
)
content_type = response.headers.get("content-type")
with check:
assert (
content_type == "application/zip"
), "Content-Type should be 'application/zip'"
assert content_type == "application/zip", (
"Content-Type should be 'application/zip'"
)

View File

@@ -50,18 +50,18 @@ async def test_convert_url(async_client):
content_disposition = response.headers.get("content-disposition")
with check:
assert (
content_disposition is not None
), "Content-Disposition header should be present"
assert content_disposition is not None, (
"Content-Disposition header should be present"
)
with check:
assert "attachment" in content_disposition, "Response should be an attachment"
with check:
assert (
'filename="converted_docs.zip"' in content_disposition
), "Attachment filename should be 'converted_docs.zip'"
assert 'filename="converted_docs.zip"' in content_disposition, (
"Attachment filename should be 'converted_docs.zip'"
)
content_type = response.headers.get("content-type")
with check:
assert (
content_type == "application/zip"
), "Content-Type should be 'application/zip'"
assert content_type == "application/zip", (
"Content-Type should be 'application/zip'"
)