mirror of
https://github.com/docling-project/docling-serve.git
synced 2025-11-29 16:43:24 +00:00
200 lines
6.3 KiB
Python
200 lines
6.3 KiB
Python
import re
|
|
from typing import Annotated, Any, Union, get_args, get_origin
|
|
|
|
from pydantic import BaseModel
|
|
|
|
from docling_serve.datamodel.convert import ConvertDocumentsRequestOptions
|
|
|
|
# Markdown file rewritten by update_documentation(); the path is opened as-is,
# so it is resolved relative to the current working directory.
DOCS_FILE = "docs/usage.md"

# Identifiers that format_variable_names() wraps in backticks when they occur
# as whole words in generated descriptions and type names.
VARIABLE_WORDS: list[str] = [
    "picture_description_local", "vlm_pipeline_model", "vlm",
    "vlm_pipeline_model_api", "ocr_engines_enum", "easyocr",
    "dlparse_v4", "fast", "picture_description_api",
    "vlm_pipeline_model_local",
]
|
|
|
|
|
|
def format_variable_names(text: str, words: Union[list[str], None] = None) -> str:
    """Wrap known variable names found in ``text`` in Markdown backticks.

    Only whole-word occurrences are wrapped, and an occurrence that already
    has a backtick directly before or after it is left untouched.

    Args:
        text: The text to format.
        words: Names to code-format. Defaults to the module-level
            ``VARIABLE_WORDS`` list when omitted.

    Returns:
        ``text`` with every matching name surrounded by backticks.
    """
    if words is None:
        words = VARIABLE_WORDS

    def _wrap(match: "re.Match[str]") -> str:
        # A callable replacement inserts the matched text verbatim. Reusing
        # the regex-escaped word as a replacement *template* would leak the
        # escaping backslashes into the output for names that contain regex
        # metacharacters.
        return f"`{match.group(0)}`"

    # Longest names are processed first, shorter ones afterwards.
    for word in sorted(words, key=len, reverse=True):
        pattern = rf"(?<!`)\b{re.escape(word)}\b(?!`)"
        text = re.sub(pattern, _wrap, text)

    return text
|
|
|
|
|
|
def format_allowed_values_description(description: str) -> str:
    """Code-format the values listed after "Allowed values:" in a description.

    The value list runs from "Allowed values:" up to the next period (or the
    end of the text). Values may be separated by commas and/or the word
    "and", including an Oxford comma (", and"). Each value is wrapped in
    backticks and the list is re-emitted as ``Allowed values: `a`, `b`.``.

    Args:
        description: The field description to format.

    Returns:
        The description with every "Allowed values:" list code-formatted, or
        the description unchanged when it contains no such list.
    """

    def _format_list(match: "re.Match[str]") -> str:
        raw_values = re.split(
            r"\s*,\s*(?:and\s+)?|\s+and\s+", match.group(1).strip()
        )
        # Trim leftover punctuation and drop empty entries (e.g. from a
        # trailing comma) so no empty code span is emitted.
        values = [
            cleaned
            for cleaned in (raw.strip("., ") for raw in raw_values)
            if cleaned
        ]
        if not values:
            return match.group(0)
        formatted_values = ", ".join(f"`{value}`" for value in values)
        return f"Allowed values: {formatted_values}."

    # A callable replacement formats each occurrence from its own values and
    # inserts the result verbatim, so backslashes inside a value are not
    # interpreted as replacement-template escapes.
    return re.sub(
        r"Allowed values:(.+?)(?:\.|$)",
        _format_list,
        description,
        flags=re.DOTALL,
    )
|
|
|
|
|
|
def _format_type(type_hint: Any) -> str:
|
|
"""Format type ccrrectly, like Annotation or Union."""
|
|
if get_origin(type_hint) is Annotated:
|
|
base_type = get_args(type_hint)[0]
|
|
return _format_type(base_type)
|
|
|
|
if hasattr(type_hint, "__origin__"):
|
|
origin = type_hint.__origin__
|
|
args = get_args(type_hint)
|
|
|
|
if origin is list:
|
|
return f"List[{_format_type(args[0])}]"
|
|
elif origin is dict:
|
|
return f"Dict[{_format_type(args[0])}, {_format_type(args[1])}]"
|
|
elif str(origin).__contains__("Union") or str(origin).__contains__("Optional"):
|
|
return " or ".join(_format_type(arg) for arg in args)
|
|
elif origin is None:
|
|
return "null"
|
|
|
|
if hasattr(type_hint, "__name__"):
|
|
return type_hint.__name__
|
|
|
|
return str(type_hint)
|
|
|
|
|
|
def _unroll_types(tp) -> list[type]:
|
|
"""
|
|
Unrolls typing.Union and typing.Optional types into a flat list of types.
|
|
"""
|
|
origin = get_origin(tp)
|
|
if origin is Union:
|
|
# Recursively unroll each type inside the Union
|
|
types = []
|
|
for arg in get_args(tp):
|
|
types.extend(_unroll_types(arg))
|
|
# Remove duplicates while preserving order
|
|
return list(dict.fromkeys(types))
|
|
else:
|
|
# If it's not a Union, just return it as a single-element list
|
|
return [tp]
|
|
|
|
|
|
def generate_model_doc(model: type[BaseModel]) -> str:
    """Generate documentation for a Pydantic model and its nested models.

    For each model an ``<h4>`` heading with the class name is emitted,
    followed by a Markdown table with one row per field (name, type and
    description). Any ``BaseModel`` subclass found among a field's
    (union-unrolled) types is queued and documented the same way.

    Args:
        model: The Pydantic model class to document.

    Returns:
        The generated documentation as a single string.
    """

    def _is_model_class(candidate: Any) -> bool:
        # Non-class annotations such as ``list[str]`` can make issubclass()
        # raise TypeError; those are never nested models.
        try:
            return isinstance(candidate, type) and issubclass(candidate, BaseModel)
        except TypeError:
            return False

    pending = [model]
    # Every model that was ever queued: prevents duplicate sections and an
    # endless loop for self-referential or mutually recursive models.
    queued = {model}
    parts: list[str] = []

    while pending:
        current_model = pending.pop()

        parts.append(f"<h4>{current_model.__name__}</h4>\n")
        parts.append("\n| Field Name | Type | Description |\n")
        parts.append("|------------|------|-------------|\n")

        for base_model in getattr(current_model, "__mro__", (current_model,)):
            # Skip MRO entries that do not expose ``model_fields``.
            if not hasattr(base_model, "model_fields"):
                continue

            for field_name, field in base_model.model_fields.items():
                description = field.description or "No description provided."
                description = format_allowed_values_description(description)
                description = format_variable_names(description)

                # Unwrap Annotated[...] to get at the underlying type.
                annotation = field.annotation
                if get_origin(annotation) is Annotated:
                    base_type = get_args(annotation)[0]
                else:
                    base_type = annotation

                type_label = format_variable_names(_format_type(base_type))
                parts.append(f"| `{field_name}` | {type_label} | {description} |\n")

                for candidate in _unroll_types(base_type):
                    if _is_model_class(candidate) and candidate not in queued:
                        queued.add(candidate)
                        pending.append(candidate)

            # Only the first class in the MRO that exposes ``model_fields``
            # is used; stop iterating the base classes.
            break

        parts.append("\n")

    return "".join(parts)
|
|
|
|
|
|
def update_documentation() -> None:
    """Update the documentation file with model information.

    Regenerates the documentation for ``ConvertDocumentsRequestOptions`` and
    places it between the ``<!-- begin: parameters-docs -->`` and
    ``<!-- end: parameters-docs -->`` marker lines of ``DOCS_FILE``,
    replacing whatever was there. The file is only rewritten when the
    resulting content differs from the current content.

    Raises:
        ValueError: If a begin marker has no matching end marker. Writing in
            that state would drop everything after the begin marker.
    """
    doc_request = generate_model_doc(ConvertDocumentsRequestOptions)

    # Explicit encoding so the result does not depend on the platform locale.
    with open(DOCS_FILE, encoding="utf-8") as f:
        content = f.readlines()

    # Prepare to update the content
    new_content = []
    in_cp_section = False

    for line in content:
        if line.startswith("<!-- begin: parameters-docs -->"):
            in_cp_section = True
            new_content.append(line)
            new_content.append(doc_request)
            continue

        if in_cp_section and line.strip() == "<!-- end: parameters-docs -->":
            in_cp_section = False

        # Lines inside the marked section are dropped (they are replaced by
        # the generated docs); the end marker itself is kept.
        if not in_cp_section:
            new_content.append(line)

    if in_cp_section:
        raise ValueError(
            f"Missing '<!-- end: parameters-docs -->' marker in {DOCS_FILE}; "
            "refusing to overwrite the file."
        )

    # Only write to the file if new_content is different from content
    if "".join(new_content) != "".join(content):
        with open(DOCS_FILE, "w", encoding="utf-8") as f:
            f.writelines(new_content)
        print(f"Documentation updated in {DOCS_FILE}")
    else:
        print("No changes detected. Documentation file remains unchanged.")


# Run the update when the file is executed as a script.
if __name__ == "__main__":
    update_documentation()
|