# File: docling-serve/docling_serve/ui/preview.px
# Viewer metadata at capture time: 348 lines, 10 KiB, plaintext.

from collections import defaultdict
from html import escape
from typing import Type
from docling_core.types.doc.document import (
BaseAnnotation,
CodeItem,
ContentLayer,
DescriptionAnnotation,
DoclingDocument,
DocItem,
FloatingItem,
Formatting,
FormulaItem,
GroupItem,
GroupLabel,
ListGroup,
ListItem,
NodeItem,
PictureClassificationData,
PictureItem,
ProvenanceItem,
RefItem,
Script,
SectionHeaderItem,
TableCell,
TableItem,
TextItem,
TitleItem
)
from pyjsx import jsx, JSX, JSXComponent
from .svg import image, path, rect, text
# Registry of render components, keyed by the class name of the docling
# object (node or annotation) each component knows how to render.
_node_components: dict[str, JSXComponent] = {}


def component(*node_types: Type[BaseAnnotation | NodeItem]):
    """Build a decorator that registers a component for the given classes.

    Each class in ``node_types`` is registered in ``_node_components`` under
    its ``__name__``. Registering a class name a second time replaces the
    earlier entry.

    Args:
        node_types: Classes the decorated component should render.

    Returns:
        A decorator that records the component and returns it unchanged.
    """

    def decorator(comp):
        for node_type in node_types:
            _node_components[node_type.__name__] = comp
        # Hand the callable back: without this the decorated module-level
        # name would be rebound to None.
        return comp

    return decorator
def AnnotationComponent(children, annotation: BaseAnnotation):
Comp = _node_components.get(annotation.__class__.__name__)
element = Comp(annotation=annotation, children=[]) if Comp else (
<code>{escape(annotation.model_dump_json(indent=2))}</code>
)
element.props["class"] = element.props.get("class", "") + " annotation"
element.props["data-kind"] = annotation.kind
return element
def NodeComponent(children, node: NodeItem | RefItem, doc: DoclingDocument):
# Specific component or fallback.
Comp = _node_components.get(node.__class__.__name__)
element = Comp(node=node, doc=doc, children=[]) if Comp else (
<span class="void"></span>
)
# Wrap item component with annotations, if any.
if isinstance(node, DocItem) and (anns := node.get_annotations()):
element = (
<div class="annotated">
{element}
{[<AnnotationComponent annotation={ann} /> for ann in anns]}
</div>
)
# Extend interaction and styling.
id = node.self_ref[2:]
element.props["id"] = id
element.props["onclick"] = "clickId(event)"
classes = ["item", node.content_layer.value]
element.props["class"] = f"{element.props.get("class", "")} {" ".join(classes)}"
return element
def node_provs(node: NodeItem, doc: DoclingDocument) -> list[ProvenanceItem]:
    """Collect the provenance entries that locate a node on pages.

    A ``DocItem`` reports its own ``prov``. Any other node (for example a
    group) reports the provenance of its direct children that resolve to a
    ``DocItem``; children that resolve to something else are ignored.

    Args:
        node: The node whose provenance is wanted.
        doc: The document used to resolve child references.

    Returns:
        The provenance entries, in child order for non-item nodes.
    """
    if isinstance(node, DocItem):
        return node.prov

    provs = []
    for ref in node.children:
        # Resolve each reference once and reuse the result.
        child = ref.resolve(doc)
        if isinstance(child, DocItem):
            provs.extend(child.prov)
    return provs
def DocPage(children, page_no: int, items: list[NodeItem], doc: DoclingDocument):
page = doc.pages[page_no]
exclusive_items = [
item
for item in items
if min([p.page_no for p in node_provs(item, doc)]) == page_no
]
comps = []
for i in range(len(exclusive_items)):
item = exclusive_items[i]
id = item.self_ref[2:]
kind, *index = id.split("/")
parent_class = ""
if isinstance(item, GroupItem):
parent_class = "group"
else:
parent = item.parent.resolve(doc)
if isinstance(parent, GroupItem) and parent.label is not GroupLabel.UNSPECIFIED:
parent_class = "grouped"
comps.append(
<div class={f"item-markers {parent_class} {item.content_layer.value}"} data-id={id}>
<span>{"/".join(index)}</span>
<span>{item.label.replace("_", " ")}</span>
{
<span>{item.content_layer.value.replace("_", " ")}</span>
if item.content_layer is not ContentLayer.BODY
else None
}
<a href={f"document/{id}"} target="_blank">{"{;}"}</a>
</div>
)
comps.append(<NodeComponent node={item} doc={doc} />)
pages = set([p.page_no for p in node_provs(item, doc)])
page_mark_class = "page-marker"
if i == 0 or len(pages) > 1:
page_mark_class += " border"
comps.append(<div class={page_mark_class}></div>)
def ItemBox(children, item: DocItem, prov: ProvenanceItem):
item_id = item.self_ref[2:]
sub_items = [
(item_id, prov.bbox.to_top_left_origin(page.size.height))
]
# Table cells.
if isinstance(item, TableItem):
for cell in item.data.table_cells:
sub_items.append(
(f"{item_id}/{cell.start_col_offset_idx}/{cell.start_row_offset_idx}", cell.bbox)
)
return [
<rect
data-id={id}
x={bbox.l - 1}
y={bbox.t - 1}
width={bbox.width + 2}
height={bbox.height + 2}
vector-effect="non-scaling-stroke"
onclick="clickId(event)"
/>
for id, bbox in sub_items
]
# Span extra row to fill up excess space.
comps.append(
<svg
class="page-image"
style={{ "grid-row": f"span {len(exclusive_items) + 1}" }}
width="50vw"
viewBox={f"0 0 {page.size.width} {page.size.height}"}
>
<image
href={f"document/pages/{page_no}"}
width={page.size.width}
height={page.size.height}
/>
{[
<ItemBox item={item} prov={prov} />
for item in items
if isinstance(item, DocItem)
for prov in item.prov
if prov.page_no == page_no
]}
<text class="top-no" x={5} y={5}>{page_no}</text>
<text class="bottom-no" x={5} y={page.size.height - 5}>{page_no}</text>
</svg>
)
return <div class="page">{comps}</div>
def DocPreview(children, doc: DoclingDocument):
page_items: dict[int, list[NodeItem]] = defaultdict(list)
for item, level in doc.iterate_items(
with_groups=True,
included_content_layers={*ContentLayer}
):
if not isinstance(item, GroupItem) or item.label is not GroupLabel.UNSPECIFIED:
pages = set([p.page_no for p in node_provs(item, doc)])
for page in pages:
page_items[page].append(item)
return [
<DocPage page_no={page_no} items={page_items[page_no]} doc={doc} />
for page_no in sorted(page_items.keys())
]
def _text_classes(node: TextItem) -> str:
    """Build the space-separated CSS class string for a text-like node.

    The node's label always comes first. When the node has formatting, the
    names of the active flags (bold, italic, underline, strikethrough, in
    that order) follow, and the formatting's script value comes last.

    Args:
        node: The node whose label and formatting are inspected.

    Returns:
        The CSS class names joined by single spaces.
    """
    names = [node.label]
    style = node.formatting
    if not style:
        return " ".join(names)

    for flag in ("bold", "italic", "underline", "strikethrough"):
        if getattr(style, flag):
            names.append(flag)
    names.append(style.script)
    return " ".join(names)
@component(TextItem)
def TextComponent(children, node: TextItem, doc: DoclingDocument):
return <p class={_text_classes(node)}>{escape(node.text)}</p>
@component(TitleItem)
def TitleComponent(children, node: TitleItem, doc: DoclingDocument):
return <h1 class={_text_classes(node)}>{escape(node.text)}</h1>
@component(SectionHeaderItem)
def SectionHeaderComponent(children, node: SectionHeaderItem, doc: DoclingDocument):
return <h4 class={_text_classes(node)}>{escape(node.text)}</h4>
@component(ListItem)
def ListComponent(children, node: ListItem, doc: DoclingDocument):
return (
<li>
<b>{node.marker}</b>
<span class={_text_classes(node)}>{escape(node.text)}</span>
</li>
)
@component(CodeItem)
def CodeComponent(children, node: CodeItem, doc: DoclingDocument):
return (
<figure>
<code class={_text_classes(node)}>
{escape(node.text or node.orig)}
</code>
</figure>
)
@component(FormulaItem)
def FormulaComponent(children, node: FormulaItem, doc: DoclingDocument):
return (
<figure>
<code class={_text_classes(node)}>
{escape(node.text or node.orig)}
</code>
</figure>
)
@component(PictureItem)
def PictureComponent(children, node: PictureItem, doc: DoclingDocument):
return <figure><img src={f"document/{node.self_ref[2:]}"} loading="lazy" /></figure>
@component(PictureClassificationData)
def PictureClassificationComponent(children, annotation: PictureClassificationData):
return (
<table>
<tbody>
{[
<tr>
<td>{cls.class_name.replace("_", " ")}</td>
<td>{f"{cls.confidence:.2f}"}</td>
</tr>
for cls in annotation.predicted_classes
if cls.confidence > 0.01
]}
</tbody>
</table>
)
@component(DescriptionAnnotation)
def DescriptionAnnotation(children, annotation: DescriptionAnnotation):
return <span>{escape(annotation.text)}</span>
@component(TableItem)
def TableComponent(children, node: TableItem, doc: DoclingDocument):
covered_cells: set[(int, int)] = set()
def check_cover(cell: TableCell):
is_covered = (cell.start_col_offset_idx, cell.start_row_offset_idx) in covered_cells
if not is_covered:
for x in range(cell.start_col_offset_idx, cell.end_col_offset_idx):
for y in range(cell.start_row_offset_idx, cell.end_row_offset_idx):
covered_cells.add((x, y))
return is_covered
def Cell(children, cell: TableCell):
id = f"{node.self_ref[2:]}/{cell.start_col_offset_idx}/{cell.start_row_offset_idx}"
return (
<td
id={id}
class={"header" if cell.column_header or cell.row_header else None}
colspan={cell.col_span or 1}
rowspan={cell.row_span or 1}
onclick="clickId(event)"
>
{escape(cell.text)}
</td>
)
return (
<div class="table">
<table>
<tbody>
{[
<tr>
{[
<Cell cell={cell} />
for cell in row
if not check_cover(cell)
]}
</tr>
for row in node.data.grid
]}
</tbody>
</table>
</div>
)