docs: Generate usage.md automatically (#340)

Signed-off-by: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
2025-11-30 09:03:19 +00:00 · 2025-10-21 13:27:01 +01:00
parent 56e8535a7a
commit 9672f310b1
5 changed files with 212 additions and 29 deletions
--- a/scripts/update_doc_usage.py
+++ b/scripts/update_doc_usage.py
@@ -0,0 +1,163 @@
+import re
+from typing import Annotated, Any, get_args, get_origin
+
+from pydantic import BaseModel
+
+from docling_serve.datamodel.convert import ConvertDocumentsRequestOptions
+
+# Markdown file that update_documentation() reads and rewrites. The path is
+# relative, so it is resolved against the current working directory.
+DOCS_FILE = "docs/usage.md"
+
+# Identifiers that format_variable_names() wraps in backticks when they appear
+# as whole words in the generated text. The listing order is not significant:
+# the function sorts the words by length before matching.
+VARIABLE_WORDS: list[str] = [
+    "picture_description_local",
+    "vlm_pipeline_model",
+    "vlm",
+    "vlm_pipeline_model_api",
+    "ocr_engines_enum",
+    "easyocr",
+    "dlparse_v4",
+    "fast",
+    "picture_description_api",
+    "vlm_pipeline_model_local",
+]
+
+
+def format_variable_names(text: str) -> str:
+    """Format specific words in description to be code-formatted."""
+    sorted_words = sorted(VARIABLE_WORDS, key=len, reverse=True)
+
+    escaped_words = [re.escape(word) for word in sorted_words]
+
+    for word in escaped_words:
+        pattern = rf"(?<!`)\b{word}\b(?!`)"
+        text = re.sub(pattern, f"`{word}`", text)
+
+    return text
+
+
+def format_allowed_values_description(description: str) -> str:
+    """Format description to code-format allowed values."""
+    # Regex pattern to find text after "Allowed values:"
+    match = re.search(r"Allowed values:(.+?)(?:\.|$)", description, re.DOTALL)
+
+    if match:
+        # Extract the allowed values
+        values_str = match.group(1).strip()
+
+        # Split values, handling both comma and 'and' separators
+        values = re.split(r"\s*(?:,\s*|\s+and\s+)", values_str)
+
+        # Remove any remaining punctuation and whitespace
+        values = [value.strip("., ") for value in values]
+
+        # Create code-formatted values
+        formatted_values = ", ".join(f"`{value}`" for value in values)
+
+        # Replace the original allowed values with formatted version
+        formatted_description = re.sub(
+            r"(Allowed values:)(.+?)(?:\.|$)",
+            f"\\1 {formatted_values}.",
+            description,
+            flags=re.DOTALL,
+        )
+
+        return formatted_description
+
+    return description
+
+
+def _format_type(type_hint: Any) -> str:
+    """Format type ccrrectly, like Annotation or Union."""
+    if get_origin(type_hint) is Annotated:
+        base_type = get_args(type_hint)[0]
+        return _format_type(base_type)
+
+    if hasattr(type_hint, "__origin__"):
+        origin = type_hint.__origin__
+        args = get_args(type_hint)
+
+        if origin is list:
+            return f"List[{_format_type(args[0])}]"
+        elif origin is dict:
+            return f"Dict[{_format_type(args[0])}, {_format_type(args[1])}]"
+        elif str(origin).__contains__("Union") or str(origin).__contains__("Optional"):
+            return " or ".join(_format_type(arg) for arg in args)
+        elif origin is None:
+            return "null"
+
+    if hasattr(type_hint, "__name__"):
+        return type_hint.__name__
+
+    return str(type_hint)
+
+
+def generate_model_doc(model: type[BaseModel]) -> str:
+    """Generate documentation for a Pydantic model."""
+    doc = "\n| Field Name | Type | Description |\n"
+    doc += "|------------|------|-------------|\n"
+
+    for base_model in model.__mro__:
+        # Check if this is a Pydantic model
+        if hasattr(base_model, "model_fields"):
+            # Iterate through fields of this model
+            for field_name, field in base_model.model_fields.items():
+                # Extract description from Annotated field if possible
+                description = field.description or "No description provided."
+                description = format_allowed_values_description(description)
+                description = format_variable_names(description)
+
+                # Handle Annotated types
+                original_type = field.annotation
+                if get_origin(original_type) is Annotated:
+                    # Extract base type and additional metadata
+                    type_args = get_args(original_type)
+                    base_type = type_args[0]
+                else:
+                    base_type = original_type
+
+                field_type = _format_type(base_type)
+                field_type = format_variable_names(field_type)
+
+                doc += f"| `{field_name}` | {field_type} | {description} |\n"
+
+            # stop iterating the base classes
+            break
+
+    doc += "\n"
+    return doc
+
+
+def update_documentation():
+    """Update the documentation file with model information."""
+    doc_request = generate_model_doc(ConvertDocumentsRequestOptions)
+
+    with open(DOCS_FILE) as f:
+        content = f.readlines()
+
+    # Prepare to update the content
+    new_content = []
+    in_cp_section = False
+
+    for line in content:
+        if line.startswith("<!-- begin: parameters-docs -->"):
+            in_cp_section = True
+            new_content.append(line)
+            new_content.append(doc_request)
+            continue
+
+        if in_cp_section and line.strip() == "<!-- end: parameters-docs -->":
+            in_cp_section = False
+
+        if not in_cp_section:
+            new_content.append(line)
+
+    # Only write to the file if new_content is different from content
+    if "".join(new_content) != "".join(content):
+        with open(DOCS_FILE, "w") as f:
+            f.writelines(new_content)
+        print(f"Documentation updated in {DOCS_FILE}")
+    else:
+        print("No changes detected. Documentation file remains unchanged.")
+
+
+# Regenerate the documentation only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    update_documentation()