docling-serve/docling_serve/ui/preview.px

from collections import defaultdict
from html import escape
from typing import Type

from docling_core.types.doc.document import (
    BaseAnnotation,
    CodeItem,
    ContentLayer,
    DescriptionAnnotation,
    DoclingDocument,
    DocItem,
    FloatingItem,
    Formatting,
    FormulaItem,
    GroupItem,
    GroupLabel,
    ListGroup,
    ListItem,
    NodeItem,
    PictureClassificationData,
    PictureItem,
    ProvenanceItem,
    RefItem,
    Script,
    SectionHeaderItem,
    TableCell,
    TableItem,
    TextItem,
    TitleItem
)
from pyjsx import jsx, JSX, JSXComponent

from .svg import image, path, rect, text


# Registry of rendering components, keyed by the class name of the node or
# annotation type they render. Filled by the `component` decorator below and
# read by `AnnotationComponent` / `NodeComponent`.
_node_components: dict[str, JSXComponent] = {}


def component(*node_types: Type[BaseAnnotation | NodeItem]):
    """Register the decorated component as the renderer for the given types.

    Each type is stored in ``_node_components`` under its class name.

    Args:
        *node_types: Node or annotation classes the component renders.

    Returns:
        A decorator that records the component and returns it unchanged, so
        the decorated module-level name stays bound to the component rather
        than being rebound to ``None``.
    """
    def decorator(comp):
        for node_type in node_types:
            _node_components[node_type.__name__] = comp
        return comp

    return decorator


def AnnotationComponent(children, annotation: BaseAnnotation):
    Comp = _node_components.get(annotation.__class__.__name__)
    element = Comp(annotation=annotation, children=[]) if Comp else (
        <code>{escape(annotation.model_dump_json(indent=2))}</code>
    )

    element.props["class"] = element.props.get("class", "") + " annotation"
    element.props["data-kind"] = annotation.kind

    return element


def NodeComponent(children, node: NodeItem | RefItem, doc: DoclingDocument):
    # Specific component or fallback.
    Comp = _node_components.get(node.__class__.__name__)
    element = Comp(node=node, doc=doc, children=[]) if Comp else (
        <span class="void"></span>
    )

    # Wrap item component with annotations, if any.
    if isinstance(node, DocItem) and (anns := node.get_annotations()):
        element = (
            <div class="annotated">
                {element}
                {[<AnnotationComponent annotation={ann} /> for ann in anns]}
            </div>
        )

    # Extend interaction and styling.
    id = node.self_ref[2:]
    element.props["id"] = id
    element.props["onclick"] = "clickId(event)"

    classes = ["item", node.content_layer.value]
    element.props["class"] = f"{element.props.get("class", "")} {" ".join(classes)}"

    return element


def node_provs(node: NodeItem, doc: DoclingDocument) -> list[ProvenanceItem]:
    """Collect the provenance entries that locate a node on the pages.

    Args:
        node: A document item or a grouping node.
        doc: Document used to resolve child references.

    Returns:
        The node's own ``prov`` list for a ``DocItem``; otherwise the
        concatenated ``prov`` lists of its direct children that are
        ``DocItem`` instances (deeper descendants are not visited).
    """
    if isinstance(node, DocItem):
        return node.prov

    provs: list[ProvenanceItem] = []
    for child_ref in node.children:
        # Resolve each reference once and reuse the result.
        child = child_ref.resolve(doc)
        if isinstance(child, DocItem):
            provs.extend(child.prov)
    return provs


def DocPage(children, page_no: int, items: list[NodeItem], doc: DoclingDocument):
    page = doc.pages[page_no]
    exclusive_items = [
        item
        for item in items
        if min([p.page_no for p in node_provs(item, doc)]) == page_no
    ]

    comps = []
    for i in range(len(exclusive_items)):
        item = exclusive_items[i]
        id = item.self_ref[2:]
        kind, *index = id.split("/")

        parent_class = ""
        if isinstance(item, GroupItem):
            parent_class = "group"
        else:
            parent = item.parent.resolve(doc)
            if isinstance(parent, GroupItem) and parent.label is not GroupLabel.UNSPECIFIED:
                parent_class = "grouped"

        comps.append(
            <div class={f"item-markers {parent_class} {item.content_layer.value}"} data-id={id}>
                <span>{"/".join(index)}</span>
                <span>{item.label.replace("_", " ")}</span>
                {
                    <span>{item.content_layer.value.replace("_", " ")}</span>
                    if item.content_layer is not ContentLayer.BODY
                    else None
                }
                <a href={f"document/{id}"} target="_blank">{"{;}"}</a>
            </div>
        )
        comps.append(<NodeComponent node={item} doc={doc} />)

        pages = set([p.page_no for p in node_provs(item, doc)])
        page_mark_class = "page-marker"
        if i == 0 or len(pages) > 1:
            page_mark_class += " border"
        comps.append(<div class={page_mark_class}></div>)


    def ItemBox(children, item: DocItem, prov: ProvenanceItem):
        item_id = item.self_ref[2:]
        sub_items = [
            (item_id, prov.bbox.to_top_left_origin(page.size.height))
        ]

        # Table cells.
        if isinstance(item, TableItem):
            for cell in item.data.table_cells:
                sub_items.append(
                    (f"{item_id}/{cell.start_col_offset_idx}/{cell.start_row_offset_idx}", cell.bbox)
                )

        return [
            <rect
                data-id={id}
                x={bbox.l - 1}
                y={bbox.t - 1}
                width={bbox.width + 2}
                height={bbox.height + 2}
                vector-effect="non-scaling-stroke"
                onclick="clickId(event)"
            />
            for id, bbox in sub_items
        ]

    # Span extra row to fill up excess space.
    comps.append(
        <svg
            class="page-image"
            style={{ "grid-row": f"span {len(exclusive_items) + 1}" }}
            width="50vw"
            viewBox={f"0 0 {page.size.width} {page.size.height}"}
        >
            <image
                href={f"document/pages/{page_no}"}
                width={page.size.width}
                height={page.size.height}
            />
            {[
                <ItemBox item={item} prov={prov} />
                for item in items
                if isinstance(item, DocItem)
                for prov in item.prov
                if prov.page_no == page_no
            ]}

            <text class="top-no" x={5} y={5}>{page_no}</text>
            <text class="bottom-no" x={5} y={page.size.height - 5}>{page_no}</text>
        </svg>
    )

    return <div class="page">{comps}</div>


def DocPreview(children, doc: DoclingDocument):
    page_items: dict[int, list[NodeItem]] = defaultdict(list)

    for item, level in doc.iterate_items(
        with_groups=True,
        included_content_layers={*ContentLayer}
    ):
        if not isinstance(item, GroupItem) or item.label is not GroupLabel.UNSPECIFIED:
            pages = set([p.page_no for p in node_provs(item, doc)])
            for page in pages:
                page_items[page].append(item)

    return [
        <DocPage page_no={page_no} items={page_items[page_no]} doc={doc} />
        for page_no in sorted(page_items.keys())
    ]


def _text_classes(node: TextItem) -> str:
    """Build the space-separated CSS class string for a text-like node.

    The string starts with the node label. When the node has formatting, the
    names of the active flags (bold, italic, underline, strikethrough) follow
    in that order, then the formatting's script value.
    """
    classes = [node.label]

    formatting = node.formatting
    if formatting:
        for flag in ("bold", "italic", "underline", "strikethrough"):
            if getattr(formatting, flag):
                classes.append(flag)
        classes.append(formatting.script)

    return " ".join(classes)


@component(TextItem)
def TextComponent(children, node: TextItem, doc: DoclingDocument):
    return <p class={_text_classes(node)}>{escape(node.text)}</p>


@component(TitleItem)
def TitleComponent(children, node: TitleItem, doc: DoclingDocument):
    return <h1 class={_text_classes(node)}>{escape(node.text)}</h1>


@component(SectionHeaderItem)
def SectionHeaderComponent(children, node: SectionHeaderItem, doc: DoclingDocument):
    return <h4 class={_text_classes(node)}>{escape(node.text)}</h4>


@component(ListItem)
def ListComponent(children, node: ListItem, doc: DoclingDocument):
    return (
        <li>
            <b>{node.marker}</b>
            <span class={_text_classes(node)}>{escape(node.text)}</span>
        </li>
    )


@component(CodeItem)
def CodeComponent(children, node: CodeItem, doc: DoclingDocument):
    return (
        <figure>
            <code class={_text_classes(node)}>
                {escape(node.text or node.orig)}
            </code>
        </figure>
    )


@component(FormulaItem)
def FormulaComponent(children, node: FormulaItem, doc: DoclingDocument):
    return (
        <figure>
            <code class={_text_classes(node)}>
                {escape(node.text or node.orig)}
            </code>
        </figure>
    )


@component(PictureItem)
def PictureComponent(children, node: PictureItem, doc: DoclingDocument):
    return <figure><img src={f"document/{node.self_ref[2:]}"} loading="lazy" /></figure>


@component(PictureClassificationData)
def PictureClassificationComponent(children, annotation: PictureClassificationData):
    return (
        <table>
            <tbody>
                {[
                    <tr>
                        <td>{cls.class_name.replace("_", " ")}</td>
                        <td>{f"{cls.confidence:.2f}"}</td>
                    </tr>
                    for cls in annotation.predicted_classes
                    if cls.confidence > 0.01
                ]}
            </tbody>
        </table>
    )


@component(DescriptionAnnotation)
def DescriptionAnnotation(children, annotation: DescriptionAnnotation):
    return <span>{escape(annotation.text)}</span>


@component(TableItem)
def TableComponent(children, node: TableItem, doc: DoclingDocument):
    covered_cells: set[(int, int)] = set()

    def check_cover(cell: TableCell):
        is_covered = (cell.start_col_offset_idx, cell.start_row_offset_idx) in covered_cells

        if not is_covered:
            for x in range(cell.start_col_offset_idx, cell.end_col_offset_idx):
                for y in range(cell.start_row_offset_idx, cell.end_row_offset_idx):
                    covered_cells.add((x, y))

        return is_covered

    def Cell(children, cell: TableCell):
        id = f"{node.self_ref[2:]}/{cell.start_col_offset_idx}/{cell.start_row_offset_idx}"

        return (
            <td
                id={id}
                class={"header" if cell.column_header or cell.row_header else None}
                colspan={cell.col_span or 1}
                rowspan={cell.row_span or 1}
                onclick="clickId(event)"
            >
                {escape(cell.text)}
            </td>
        )

    return (
        <div class="table">
            <table>
                <tbody>
                    {[
                        <tr>
                            {[
                                <Cell cell={cell} />
                                for cell in row
                                if not check_cover(cell)
                            ]}
                        </tr>
                        for row in node.data.grid
                    ]}
                </tbody>
            </table>
        </div>
    )