from collections import defaultdict from html import escape from typing import Type from docling_core.types.doc.document import ( BaseAnnotation, CodeItem, ContentLayer, DescriptionAnnotation, DoclingDocument, DocItem, FloatingItem, Formatting, FormulaItem, GroupItem, GroupLabel, ListGroup, ListItem, NodeItem, PictureClassificationData, PictureItem, ProvenanceItem, RefItem, Script, SectionHeaderItem, TableCell, TableItem, TextItem, TitleItem ) from pyjsx import jsx, JSX, JSXComponent from .svg import image, path, rect, text _node_components: dict[str, JSXComponent] = {} def component(*node_types: list[Type[BaseAnnotation | NodeItem]]): def decorator(component): for t in node_types: _node_components[t.__name__] = component return decorator def AnnotationComponent(children, annotation: BaseAnnotation): Comp = _node_components.get(annotation.__class__.__name__) element = Comp(annotation=annotation, children=[]) if Comp else ( {escape(annotation.model_dump_json(indent=2))} ) element.props["class"] = element.props.get("class", "") + " annotation" element.props["data-kind"] = annotation.kind return element def NodeComponent(children, node: NodeItem | RefItem, doc: DoclingDocument): # Specific component or fallback. Comp = _node_components.get(node.__class__.__name__) element = Comp(node=node, doc=doc, children=[]) if Comp else ( ) # Wrap item component with annotations, if any. if isinstance(node, DocItem) and (anns := node.get_annotations()): element = (
{element} {[ for ann in anns]}
) # Extend interaction and styling. id = node.self_ref[2:] element.props["id"] = id element.props["onclick"] = "clickId(event)" classes = ["item", node.content_layer.value] element.props["class"] = f"{element.props.get("class", "")} {" ".join(classes)}" return element def node_provs(node: NodeItem, doc: DoclingDocument) -> ProvenanceItem: return node.prov if isinstance(node, DocItem) else [ p for c in node.children if isinstance(c.resolve(doc), DocItem) for p in c.resolve(doc).prov ] def DocPage(children, page_no: int, items: list[NodeItem], doc: DoclingDocument): page = doc.pages[page_no] exclusive_items = [ item for item in items if min([p.page_no for p in node_provs(item, doc)]) == page_no ] comps = [] for i in range(len(exclusive_items)): item = exclusive_items[i] id = item.self_ref[2:] kind, *index = id.split("/") parent_class = "" if isinstance(item, GroupItem): parent_class = "group" else: parent = item.parent.resolve(doc) if isinstance(parent, GroupItem) and parent.label is not GroupLabel.UNSPECIFIED: parent_class = "grouped" comps.append(
{"/".join(index)} {item.label.replace("_", " ")} { {item.content_layer.value.replace("_", " ")} if item.content_layer is not ContentLayer.BODY else None } {"{;}"}
) comps.append() pages = set([p.page_no for p in node_provs(item, doc)]) page_mark_class = "page-marker" if i == 0 or len(pages) > 1: page_mark_class += " border" comps.append(
) def ItemBox(children, item: DocItem, prov: ProvenanceItem): item_id = item.self_ref[2:] sub_items = [ (item_id, prov.bbox.to_top_left_origin(page.size.height)) ] # Table cells. if isinstance(item, TableItem): for cell in item.data.table_cells: sub_items.append( (f"{item_id}/{cell.start_col_offset_idx}/{cell.start_row_offset_idx}", cell.bbox) ) return [ for id, bbox in sub_items ] # Span extra row to fill up excess space. comps.append( {[ for item in items if isinstance(item, DocItem) for prov in item.prov if prov.page_no == page_no ]} {page_no} {page_no} ) return
{comps}
def DocPreview(children, doc: DoclingDocument): page_items: dict[int, list[NodeItem]] = defaultdict(list) for item, level in doc.iterate_items( with_groups=True, included_content_layers={*ContentLayer} ): if not isinstance(item, GroupItem) or item.label is not GroupLabel.UNSPECIFIED: pages = set([p.page_no for p in node_provs(item, doc)]) for page in pages: page_items[page].append(item) return [ for page_no in sorted(page_items.keys()) ] def _text_classes(node: TextItem) -> str: classes = [node.label] if frmt := node.formatting: formats = { "bold": frmt.bold, "italic": frmt.italic, "underline": frmt.underline, "strikethrough": frmt.strikethrough } classes.extend([cls for cls, active in formats.items() if active]) classes.append(frmt.script) return " ".join(classes) @component(TextItem) def TextComponent(children, node: TextItem, doc: DoclingDocument): return

{escape(node.text)}

@component(TitleItem)
def TitleComponent(children, node: TitleItem, doc: DoclingDocument):
    """Component registered for ``TitleItem`` nodes.

    NOTE(review): only the HTML-escaped ``node.text`` expression is left of
    the returned element; the markup that presumably wrapped it is missing
    from this copy of the file and has to be restored from the original.
    """
    return

    {escape(node.text)}

@component(SectionHeaderItem)
def SectionHeaderComponent(children, node: SectionHeaderItem, doc: DoclingDocument):
    """Component registered for ``SectionHeaderItem`` nodes.

    NOTE(review): only the HTML-escaped ``node.text`` expression is left of
    the returned element; the markup that presumably wrapped it is missing
    from this copy of the file and has to be restored from the original.
    """
    return

    {escape(node.text)}

@component(ListItem)
def ListComponent(children, node: ListItem, doc: DoclingDocument):
    """Component registered for ``ListItem`` nodes.

    What is left of the element uses ``node.marker`` and the HTML-escaped
    ``node.text``.

    NOTE(review): the element markup is missing from this copy of the file;
    the ``•`` characters below look like rendering residue of list markup —
    TODO confirm and restore from the original source.
    """
    return (
  • {node.marker} {escape(node.text)}
  • )


@component(CodeItem)
def CodeComponent(children, node: CodeItem, doc: DoclingDocument):
    """Component registered for ``CodeItem`` nodes.

    What is left of the element is the HTML-escaped ``node.text``, falling
    back to ``node.orig`` when the text is falsy.

    NOTE(review): the markup that presumably wrapped the expression is
    missing from this copy of the file; restore it from the original source.
    """
    return (
    {escape(node.text or node.orig)}
    )


@component(FormulaItem)
def FormulaComponent(children, node: FormulaItem, doc: DoclingDocument):
    """Component registered for ``FormulaItem`` nodes.

    What is left of the element is the HTML-escaped ``node.text``, falling
    back to ``node.orig`` when the text is falsy.

    NOTE(review): the markup that presumably wrapped the expression is
    missing from this copy of the file; restore it from the original source.
    """
    return (
    {escape(node.text or node.orig)}
    )


@component(PictureItem)
def PictureComponent(children, node: PictureItem, doc: DoclingDocument):
    """Component registered for ``PictureItem`` nodes.

    NOTE(review): nothing is left of the returned element in this copy of the
    file; restore it from the original source.
    """
    return


@component(PictureClassificationData)
def PictureClassificationComponent(children, annotation: PictureClassificationData):
    """Component registered for ``PictureClassificationData`` annotations.

    Only predicted classes with a confidence above 0.01 are considered; the
    remaining fragments show the class name with underscores replaced by
    spaces and the confidence formatted with two decimals.

    NOTE(review): the element markup, including the body of the
    comprehension, is missing from this copy of the file; restore it from the
    original source.
    """
    return ( {[ for cls in annotation.predicted_classes if cls.confidence > 0.01 ]}
    {cls.class_name.replace("_", " ")} {f"{cls.confidence:.2f}"}
    )


# NOTE(review): this function reuses the name of the imported
# ``DescriptionAnnotation`` class, so from here on the module-level name no
# longer refers to that class. The registration itself is unaffected because
# the decorator argument is evaluated before the name is rebound. Consider
# renaming the function once callers have been checked.
@component(DescriptionAnnotation)
def DescriptionAnnotation(children, annotation: DescriptionAnnotation):
    """Component registered for ``DescriptionAnnotation`` annotations.

    NOTE(review): only the HTML-escaped ``annotation.text`` expression is
    left; as written it forms a set literal, so the wrapping markup is
    presumably missing from this copy of the file.
    """
    return {escape(annotation.text)}


@component(TableItem)
def TableComponent(children, node: TableItem, doc: DoclingDocument):
    """Component registered for ``TableItem`` nodes.

    Walks ``node.data.grid`` row by row and skips every cell for which
    ``check_cover`` reports that its start position was already covered.

    NOTE(review): the table/row/cell markup is missing from this copy of the
    file; restore it from the original source.
    """
    # (column, row) positions already claimed by a previously visited cell.
    covered_cells: set[tuple[int, int]] = set()

    def check_cover(cell: TableCell):
        # True when the cell's start position was claimed earlier; otherwise
        # claim every position in the cell's column/row offset ranges.
        is_covered = (cell.start_col_offset_idx, cell.start_row_offset_idx) in covered_cells

        if not is_covered:
            for x in range(cell.start_col_offset_idx, cell.end_col_offset_idx):
                for y in range(cell.start_row_offset_idx, cell.end_row_offset_idx):
                    covered_cells.add((x, y))

        return is_covered

    def Cell(children, cell: TableCell):
        # Identifier built from the table's reference plus the cell's start
        # column and row; presumably used by the stripped cell markup.
        id = f"{node.self_ref[2:]}/{cell.start_col_offset_idx}/{cell.start_row_offset_idx}"
        return ( {escape(cell.text)} )

    return (
    {[ {[ for cell in row if not check_cover(cell) ]} for row in node.data.grid ]}
    )