Files
docling-serve/scripts/update_doc_usage.py
Tiago Santana 9672f310b1 docs: Generate usage.md automatically (#340)
Signed-off-by: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
2025-10-21 14:27:01 +02:00

164 lines
5.2 KiB
Python

import re
from typing import Annotated, Any, get_args, get_origin
from pydantic import BaseModel
from docling_serve.datamodel.convert import ConvertDocumentsRequestOptions
# Path of the Markdown file that update_documentation() reads and rewrites.
# It is a relative path, so it is resolved against the current working directory.
DOCS_FILE = "docs/usage.md"

# Words that format_variable_names() wraps in backticks when they appear as
# whole words in the generated text.
VARIABLE_WORDS: list[str] = [
    "picture_description_local",
    "vlm_pipeline_model",
    "vlm",
    "vlm_pipeline_model_api",
    "ocr_engines_enum",
    "easyocr",
    "dlparse_v4",
    "fast",
    "picture_description_api",
    "vlm_pipeline_model_local",
]
def format_variable_names(text: str, words: "list[str] | None" = None) -> str:
    """Wrap known identifier words in backticks so they render as inline code.

    Args:
        text: The text to process.
        words: The words to code-format. When omitted, the module-level
            ``VARIABLE_WORDS`` list is used.

    Returns:
        ``text`` in which every whole-word occurrence of a listed word that
        is not directly preceded or followed by a backtick is wrapped in
        backticks.
    """
    candidates = VARIABLE_WORDS if words is None else words

    # Longer words are processed first (same order as before), so a long
    # name is wrapped before any shorter listed word is looked at.
    for word in sorted(candidates, key=len, reverse=True):
        # Escape the word for the *pattern* only.
        pattern = rf"(?<!`)\b{re.escape(word)}\b(?!`)"
        # A callable replacement inserts the word literally; a replacement
        # string would be parsed as a template, so escaped characters or
        # backslashes in the word would corrupt the output.
        text = re.sub(pattern, lambda _match, w=word: f"`{w}`", text)
    return text
def format_allowed_values_description(description: str) -> str:
    """Code-format the values listed after each "Allowed values:" marker.

    The text between "Allowed values:" and the next period (or the end of
    the description) is split on commas and the word "and"; every value is
    wrapped in backticks and the list is re-joined with ", " and terminated
    with a period.

    Args:
        description: The field description to process.

    Returns:
        The description with every "Allowed values:" list code-formatted.
        Descriptions without such a list are returned unchanged.
    """

    def _format_match(match: "re.Match[str]") -> str:
        # Split on "," (optionally followed by "and", i.e. an Oxford comma)
        # or on a standalone "and".
        raw_values = re.split(
            r"\s*,\s*(?:and\s+)?|\s+and\s+", match.group(2).strip()
        )
        # Drop stray punctuation/whitespace and skip empty entries so no
        # empty code spans are produced.
        values = [value.strip("., ") for value in raw_values]
        values = [value for value in values if value]
        if not values:
            return match.group(0)
        formatted_values = ", ".join(f"`{value}`" for value in values)
        return f"{match.group(1)} {formatted_values}."

    # A callable replacement formats each occurrence from its own values and
    # inserts the result literally (no template escape processing).
    return re.sub(
        r"(Allowed values:)(.+?)(?:\.|$)",
        _format_match,
        description,
        flags=re.DOTALL,
    )
def _format_type(type_hint: Any) -> str:
"""Format type ccrrectly, like Annotation or Union."""
if get_origin(type_hint) is Annotated:
base_type = get_args(type_hint)[0]
return _format_type(base_type)
if hasattr(type_hint, "__origin__"):
origin = type_hint.__origin__
args = get_args(type_hint)
if origin is list:
return f"List[{_format_type(args[0])}]"
elif origin is dict:
return f"Dict[{_format_type(args[0])}, {_format_type(args[1])}]"
elif str(origin).__contains__("Union") or str(origin).__contains__("Optional"):
return " or ".join(_format_type(arg) for arg in args)
elif origin is None:
return "null"
if hasattr(type_hint, "__name__"):
return type_hint.__name__
return str(type_hint)
def generate_model_doc(model: type[BaseModel]) -> str:
    """Generate a Markdown table documenting the fields of a Pydantic model.

    Each row holds the field name, its formatted type and its description
    (or "No description provided." when the field has none).

    Args:
        model: The model class whose ``model_fields`` are documented.

    Returns:
        The table as a string, preceded and followed by a blank line.
    """
    rows = [
        "\n| Field Name | Type | Description |\n",
        "|------------|------|-------------|\n",
    ]

    # Only the first class in the MRO that exposes ``model_fields`` was ever
    # used, and that is ``model`` itself whenever the attribute exists at
    # all (attribute lookup follows inheritance), so read it directly.
    for field_name, field in getattr(model, "model_fields", {}).items():
        description = field.description or "No description provided."
        description = format_allowed_values_description(description)
        description = format_variable_names(description)

        # _format_type() unwraps ``Annotated`` itself, so the annotation can
        # be passed as is.
        field_type = format_variable_names(_format_type(field.annotation))

        rows.append(
            f"| `{field_name}` | {_escape_table_cell(field_type)} "
            f"| {_escape_table_cell(description)} |\n"
        )

    rows.append("\n")
    return "".join(rows)


def _escape_table_cell(text: str) -> str:
    """Make ``text`` safe to embed in a single Markdown table cell."""
    # A line break would end the table row early; fold it into a space.
    text = re.sub(r"\s*\n\s*", " ", text)
    # An unescaped pipe would be read as a column separator.
    return re.sub(r"(?<!\\)\|", r"\\|", text)
def update_documentation() -> None:
    """Regenerate the parameters table inside the documentation file.

    The content of ``DOCS_FILE`` between the
    ``<!-- begin: parameters-docs -->`` and ``<!-- end: parameters-docs -->``
    marker lines is replaced with the table generated for
    ``ConvertDocumentsRequestOptions``. The file is only rewritten when the
    resulting content differs from what is already on disk.

    Raises:
        ValueError: If the begin marker is present but the matching end
            marker is missing. Nothing is written in that case.
    """
    doc_request = generate_model_doc(ConvertDocumentsRequestOptions)

    # Read and write with an explicit encoding so the result does not
    # depend on the platform's locale default.
    with open(DOCS_FILE, encoding="utf-8") as f:
        content = f.readlines()

    # Prepare to update the content
    new_content = []
    in_cp_section = False
    for line in content:
        if line.startswith("<!-- begin: parameters-docs -->"):
            in_cp_section = True
            new_content.append(line)
            new_content.append(doc_request)
            continue
        if in_cp_section and line.strip() == "<!-- end: parameters-docs -->":
            in_cp_section = False
        # Lines inside the section are dropped; the end marker itself is
        # kept because the flag has just been cleared.
        if not in_cp_section:
            new_content.append(line)

    if in_cp_section:
        # Without the end marker every line after the begin marker was
        # skipped above; writing that back would truncate the file.
        raise ValueError(
            f"Missing '<!-- end: parameters-docs -->' marker in {DOCS_FILE}; "
            "refusing to overwrite the file."
        )

    # Only write to the file if new_content is different from content
    if "".join(new_content) != "".join(content):
        with open(DOCS_FILE, "w", encoding="utf-8") as f:
            f.writelines(new_content)
        print(f"Documentation updated in {DOCS_FILE}")
    else:
        print("No changes detected. Documentation file remains unchanged.")
# Regenerate the documentation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    update_documentation()