api v1alpha1 (#17)

* api v1alpha1 Signed-off-by: Guillaume Moutier <gmoutier@redhat.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use actual types in request models and refactor Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * make gradio optional and update README Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Run workflow jobs sequentially to avoid disk space outage (#19) Github Action runners are running out of space while building both the images in parallel. This change will build the image sequentially and also clean up the cpu images before starting to build the gpu image. Signed-off-by: Anil Vishnoi <vishnoianil@gmail.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Add github job to build image (and not publish) on PR creation (#20) Signed-off-by: Anil Vishnoi <vishnoianil@gmail.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add start_server script for local dev Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix 3.12-only syntax Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix more py3.10-11 compatibility Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rework output format and background tasks Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * specify return schemas for openapi Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add processing time and update README Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * lint markdown Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add MD033 to config Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use port 5000 Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use port 5001 as default Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * update deps Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * refactor input request Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * return docling document Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * update new payload in README Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add base64 example Signed-off-by: Michele 
Dolfi <dol@zurich.ibm.com> * wrap example in <details> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename /url to /source Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * move main execution to __main__ Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Guillaume Moutier <gmoutier@redhat.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Anil Vishnoi <vishnoianil@gmail.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Anil Vishnoi <vishnoianil@gmail.com>
2025-11-29 08:33:50 +00:00 · 2025-02-03 05:00:54 -05:00
parent ddf3144512
commit c6539c42de
25 changed files with 3642 additions and 1259 deletions
--- a/tests/test_1-url-all-outputs.py
+++ b/tests/test_1-url-all-outputs.py
@@ -0,0 +1,123 @@
+import json
+
+import httpx
+import pytest
+import pytest_asyncio
+from pytest_check import check
+
+
+@pytest_asyncio.fixture
+async def async_client():  # async fixture: yields an httpx.AsyncClient built with timeout=60.0
+    async with httpx.AsyncClient(timeout=60.0) as http_client:  # leaving the `async with` closes the client
+        yield http_client
+
+
+@pytest.mark.asyncio
+async def test_convert_url(async_client):
+    """Convert one PDF URL via the /convert/source endpoint and check every requested output format."""
+    url = "http://localhost:5001/v1alpha/convert/source"
+    payload = {
+        "options": {
+            "from_formats": [
+                "docx",
+                "pptx",
+                "html",
+                "image",
+                "pdf",
+                "asciidoc",
+                "md",
+                "xlsx",
+            ],
+            "to_formats": ["md", "json", "html", "text", "doctags"],
+            "image_export_mode": "placeholder",
+            "ocr": True,
+            "force_ocr": False,
+            "ocr_engine": "easyocr",
+            "ocr_lang": ["en"],
+            "pdf_backend": "dlparse_v2",
+            "table_mode": "fast",
+            "abort_on_error": False,
+            "return_as_file": False,
+        },
+        "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}],
+    }
+    print(json.dumps(payload, indent=2))
+
+    response = await async_client.post(url, json=payload)
+    assert response.status_code == 200, "Response should be 200 OK"
+
+    data = response.json()
+
+    # Response content checks (soft checks: each format is verified independently)
+    # Helper that truncates any value to `length` characters so failure messages stay short
+    def safe_slice(value, length=100):
+        if isinstance(value, str):
+            return value[:length]
+        return str(value)[:length]  # non-strings (e.g. json_content, re-serialized below) are stringified, then truncated too
+
+    # Document check
+    check.is_in(
+        "document",
+        data,
+        msg=f"Response should contain 'document' key. Received keys: {list(data.keys())}",
+    )
+    # MD check
+    check.is_in(
+        "md_content",
+        data.get("document", {}),
+        msg=f"Response should contain 'md_content' key. Received keys: {list(data.get('document', {}).keys())}",
+    )
+    if data.get("document", {}).get("md_content") is not None:
+        check.is_in(
+            "## DocLayNet: ",
+            data["document"]["md_content"],
+            msg=f"Markdown document should contain '## DocLayNet: '. Received: {safe_slice(data['document']['md_content'])}",
+        )
+    # JSON check
+    check.is_in(
+        "json_content",
+        data.get("document", {}),
+        msg=f"Response should contain 'json_content' key. Received keys: {list(data.get('document', {}).keys())}",
+    )
+    if data.get("document", {}).get("json_content") is not None:
+        check.is_in(
+            '{"schema_name": "DoclingDocument"',
+            json.dumps(data["document"]["json_content"]),
+            msg=f"JSON document should contain '{{\"schema_name\": \"DoclingDocument\"'. Received: {safe_slice(data['document']['json_content'])}",
+        )
+    # HTML check
+    check.is_in(
+        "html_content",
+        data.get("document", {}),
+        msg=f"Response should contain 'html_content' key. Received keys: {list(data.get('document', {}).keys())}",
+    )
+    if data.get("document", {}).get("html_content") is not None:
+        check.is_in(
+            '<!DOCTYPE html>\n<html lang="en">\n<head>',
+            data["document"]["html_content"],
+            msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en\">\\n<head>'. Received: {safe_slice(data['document']['html_content'])}",
+        )
+    # Text check
+    check.is_in(
+        "text_content",
+        data.get("document", {}),
+        msg=f"Response should contain 'text_content' key. Received keys: {list(data.get('document', {}).keys())}",
+    )
+    if data.get("document", {}).get("text_content") is not None:
+        check.is_in(
+            "DocLayNet: A Large Human-Annotated Dataset",
+            data["document"]["text_content"],
+            msg=f"Text document should contain 'DocLayNet: A Large Human-Annotated Dataset'. Received: {safe_slice(data['document']['text_content'])}",
+        )
+    # DocTags check
+    check.is_in(
+        "doctags_content",
+        data.get("document", {}),
+        msg=f"Response should contain 'doctags_content' key. Received keys: {list(data.get('document', {}).keys())}",
+    )
+    if data.get("document", {}).get("doctags_content") is not None:
+        check.is_in(
+            "<document>\n<section_header_level_1><location>",
+            data["document"]["doctags_content"],
+            msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(data['document']['doctags_content'])}",
+        )