Cleanup. Text formatting. Fallback picture annotation.

This commit is contained in:
DKL
2025-11-24 15:17:39 +01:00
parent 8d5892b176
commit 025c4c8942
4 changed files with 126 additions and 115 deletions

View File

@@ -16,7 +16,7 @@ from fastapi import (
from fastapi.responses import HTMLResponse, RedirectResponse, Response from fastapi.responses import HTMLResponse, RedirectResponse, Response
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from pydantic import AnyHttpUrl from pydantic import AnyHttpUrl
from pyjsx import auto_setup from pyjsx import auto_setup # type: ignore
from starlette.exceptions import HTTPException as StarletteHTTPException from starlette.exceptions import HTTPException as StarletteHTTPException
from docling.datamodel.base_models import OutputFormat from docling.datamodel.base_models import OutputFormat
@@ -131,7 +131,7 @@ def create_ui_app(process_file, process_url, task_result, task_status_poll) -> F
): ):
tasks = sorted(orchestrator.tasks.values(), key=lambda t: t.created_at) tasks = sorted(orchestrator.tasks.values(), key=lambda t: t.created_at)
return str(TasksPage(tasks=tasks)) return str(TasksPage(tasks))
# Task specific page. # Task specific page.
@ui_app.get("/tasks/{task_id}/", response_class=HTMLResponse) @ui_app.get("/tasks/{task_id}/", response_class=HTMLResponse)

View File

@@ -10,6 +10,7 @@ from docling_core.types.doc.document import (
DoclingDocument, DoclingDocument,
DocItem, DocItem,
FloatingItem, FloatingItem,
Formatting,
FormulaItem, FormulaItem,
GroupItem, GroupItem,
GroupLabel, GroupLabel,
@@ -20,6 +21,7 @@ from docling_core.types.doc.document import (
PictureItem, PictureItem,
ProvenanceItem, ProvenanceItem,
RefItem, RefItem,
Script,
SectionHeaderItem, SectionHeaderItem,
TableCell, TableCell,
TableItem, TableItem,
@@ -28,7 +30,7 @@ from docling_core.types.doc.document import (
) )
from pyjsx import jsx, JSX, JSXComponent from pyjsx import jsx, JSX, JSXComponent
from .svg import circle, clipPath, image, path, rect, text from .svg import image, path, rect, text
_node_components: dict[str, JSXComponent] = {} _node_components: dict[str, JSXComponent] = {}
@@ -63,7 +65,7 @@ def NodeComponent(children, node: NodeItem | RefItem, doc: DoclingDocument):
# Wrap item component with annotations, if any. # Wrap item component with annotations, if any.
if isinstance(node, DocItem) and (anns := node.get_annotations()): if isinstance(node, DocItem) and (anns := node.get_annotations()):
element = ( element = (
<div> <div class="annotated">
{element} {element}
{[<AnnotationComponent annotation={ann} /> for ann in anns]} {[<AnnotationComponent annotation={ann} /> for ann in anns]}
</div> </div>
@@ -73,7 +75,9 @@ def NodeComponent(children, node: NodeItem | RefItem, doc: DoclingDocument):
id = node.self_ref[2:] id = node.self_ref[2:]
element.props["id"] = id element.props["id"] = id
element.props["onclick"] = "clickId(event)" element.props["onclick"] = "clickId(event)"
element.props["class"] = element.props.get("class", "") + f" item {node.content_layer.value}"
classes = ["item", node.content_layer.value]
element.props["class"] = f"{element.props.get("class", "")} {" ".join(classes)}"
return element return element
@@ -203,34 +207,67 @@ def DocPreview(children, doc: DoclingDocument):
] ]
def _text_classes(node: TextItem) -> str:
    """Build the space-separated CSS class string for a text item.

    The string starts with the item's label. When the item carries
    formatting, one class is added per active inline format (``bold``,
    ``italic``, ``underline``, ``strikethrough``), followed by the value of
    ``formatting.script``. Items without formatting yield the label alone.

    Enum members are reduced to their ``.value`` so the result does not rely
    on the label/script enums being ``str`` subclasses; plain strings pass
    through unchanged.
    """

    def _token(item):
        # Enum members expose their class token on ``.value``.
        return getattr(item, "value", item)

    classes = [_token(node.label)]
    if frmt := node.formatting:
        flags = {
            "bold": frmt.bold,
            "italic": frmt.italic,
            "underline": frmt.underline,
            "strikethrough": frmt.strikethrough,
        }
        classes.extend(name for name, active in flags.items() if active)
        # The script class is always emitted, whatever its value.
        classes.append(_token(frmt.script))
    return " ".join(classes)
@component(TextItem) @component(TextItem)
def TextComponent(children, node: TextItem, doc: DoclingDocument): def TextComponent(children, node: TextItem, doc: DoclingDocument):
return <p class={node.label}>{escape(node.text)}</p> return <p class={_text_classes(node)}>{escape(node.text)}</p>
@component(TitleItem) @component(TitleItem)
def TitleComponent(children, node: TitleItem, doc: DoclingDocument): def TitleComponent(children, node: TitleItem, doc: DoclingDocument):
return <h1>{escape(node.text)}</h1> return <h1 class={_text_classes(node)}>{escape(node.text)}</h1>
@component(SectionHeaderItem) @component(SectionHeaderItem)
def SectionHeaderComponent(children, node: SectionHeaderItem, doc: DoclingDocument): def SectionHeaderComponent(children, node: SectionHeaderItem, doc: DoclingDocument):
return <h4>{escape(node.text)}</h4> return <h4 class={_text_classes(node)}>{escape(node.text)}</h4>
@component(ListItem) @component(ListItem)
def ListComponent(children, node: ListItem, doc: DoclingDocument): def ListComponent(children, node: ListItem, doc: DoclingDocument):
return <li><b>{node.marker}</b> {escape(node.text)}</li> return (
<li>
<b>{node.marker}</b>
<span class={_text_classes(node)}>{escape(node.text)}</span>
</li>
)
@component(CodeItem) @component(CodeItem)
def CodeComponent(children, node: CodeItem, doc: DoclingDocument): def CodeComponent(children, node: CodeItem, doc: DoclingDocument):
return <figure><code>{escape(node.text or node.orig)}</code></figure> return (
<figure>
<code class={_text_classes(node)}>
{escape(node.text or node.orig)}
</code>
</figure>
)
@component(FormulaItem) @component(FormulaItem)
def FormulaComponent(children, node: FormulaItem, doc: DoclingDocument): def FormulaComponent(children, node: FormulaItem, doc: DoclingDocument):
return <figure><code>{escape(node.text or node.orig)}</code></figure> return (
<figure>
<code class={_text_classes(node)}>
{escape(node.text or node.orig)}
</code>
</figure>
)
@component(PictureItem) @component(PictureItem)
@@ -240,25 +277,25 @@ def PictureComponent(children, node: PictureItem, doc: DoclingDocument):
@component(PictureClassificationData) @component(PictureClassificationData)
def PictureClassificationComponent(children, annotation: PictureClassificationData): def PictureClassificationComponent(children, annotation: PictureClassificationData):
classes = annotation.predicted_classes[:5]
return ( return (
<div> <table>
<tbody>
{[ {[
<div <tr>
style={{ "width": f"{cls.confidence * 100}%" }} <td>{cls.class_name.replace("_", " ")}</td>
title={f"{cls.class_name}: {cls.confidence:.2f}"} <td>{f"{cls.confidence:.2f}"}</td>
> </tr>
{f"{cls.class_name.replace("_", " ")} {cls.confidence:.2f}"} for cls in annotation.predicted_classes
</div> if cls.confidence > 0.01
for cls in classes
]} ]}
</div> </tbody>
</table>
) )
@component(DescriptionAnnotation) @component(DescriptionAnnotation)
def DescriptionAnnotation(children, annotation: DescriptionAnnotation): def DescriptionAnnotation(children, annotation: DescriptionAnnotation):
return <div>{escape(annotation.text)}</div> return <span>{escape(annotation.text)}</span>
@component(TableItem) @component(TableItem)

View File

@@ -1,79 +1,15 @@
@import "pico.css"; @import "pico.css";
/* Pico configuration. */
:root {
--pico-font-size: 16px;
}
/* Utilities. */
.w-4 {
width: calc(4 * var(--pico-spacing));
}
.w-full {
width: 100%;
}
.max-w-full {
max-width: 100%;
}
.mt-1 {
margin-top: var(--pico-spacing);
}
.mr-auto {
margin-right: auto;
}
.mb-1 {
margin-bottom: var(--pico-spacing);
}
.mb-2 {
margin-bottom: calc(2 * var(--pico-spacing));
}
.flex {
display: flex;
}
.flex.row {
flex-direction: row;
}
.flex.col {
flex-direction: column;
}
.flex-1 {
flex: 1 1 0%;
}
.flex-auto {
flex: 1 1 auto;
}
.gap-0 {
gap: 0;
}
.gap-1 {
gap: 0.25rem;
}
.gap-2 {
gap: 0.5rem;
}
.gap-3 {
gap: 1rem;
}
.hidden {
display: none;
}
.sticky-footer {
position: sticky;
bottom: 0;
padding-top: var(--pico-spacing);
background: var(--pico-background-color);
}
/* Customization. */
@view-transition { @view-transition {
navigation: auto; navigation: auto;
} }
:root { :root {
--pico-font-size: 16px;
--highlight-factor: 0.8; --highlight-factor: 0.8;
--target: hsl(240, 100%, 34%); --target: hsl(240, 100%, 34%);
--mark: hsl(29, 100%, 40%); --mark: hsl(29, 100%, 35%);
} }
@media (prefers-color-scheme: dark) { @media (prefers-color-scheme: dark) {
@@ -84,6 +20,18 @@
} }
} }
/* Utilities. */
.hidden {
display: none;
}
.sticky-footer {
position: sticky;
bottom: 0;
padding-top: var(--pico-spacing);
background: var(--pico-background-color);
}
html { html {
scroll-behavior: smooth; scroll-behavior: smooth;
} }
@@ -232,6 +180,10 @@ main.preview {
main.preview:has(.configDarkImg > input:checked) { main.preview:has(.configDarkImg > input:checked) {
--img-hover-border: white; --img-hover-border: white;
svg.page-image {
  /* Override --mark for the page image while the dark-image option is checked. */
  --mark: hsl(29, 100%, 70%);
}
image, image,
img { img {
filter: invert(1) hue-rotate(180deg) saturate(1.25); filter: invert(1) hue-rotate(180deg) saturate(1.25);
@@ -280,6 +232,38 @@ main.preview {
visibility: hidden; visibility: hidden;
} }
> .item.annotated {
display: flex;
flex-direction: column;
align-items: stretch;
gap: 1rem;
}
/* Formatting. */
.bold {
font-weight: bold;
}
.italic {
font-style: italic;
}
.underline {
text-decoration: underline;
}
.strikethrough {
text-decoration: line-through;
}
.underline.strikethrough {
text-decoration: underline line-through;
}
.sub {
font-size: smaller;
vertical-align: sub;
}
.super {
font-size: smaller;
vertical-align: super;
}
/* Items out of content layer. */ /* Items out of content layer. */
> .item:not(.body), > .item:not(.body),
> .item-markers:not(.body) { > .item-markers:not(.body) {
@@ -319,34 +303,27 @@ main.preview {
} }
.annotation { .annotation {
margin: 0.5rem 1rem; margin: 0;
font-size: 0.9rem;
color: var(--mark);
&::before { &::before {
margin-right: 0.5rem;
content: attr(data-kind); content: attr(data-kind);
opacity: 0.7; opacity: 0.7;
} }
&,
* {
font-size: 0.9rem;
color: var(--mark);
}
} }
.annotation[data-kind="description"] { .annotation[data-kind="description"],
code.annotation {
white-space: pre-line; white-space: pre-line;
} }
.annotation[data-kind="classification"] { .annotation[data-kind="classification"] {
height: 1.5rem; width: fit-content;
display: flex;
align-items: center;
> div {
padding: 0 0.25rem;
background: var(--mark);
border: solid 1px var(--pico-background-color);
color: var(--pico-background-color);
overflow: hidden;
text-wrap: nowrap;
}
} }
> .item-markers { > .item-markers {
@@ -409,6 +386,8 @@ main.preview {
} }
> svg.page-image { > svg.page-image {
--mark: hsl(29, 100%, 35%);
grid-column: 5; grid-column: 5;
position: sticky; position: sticky;
top: 0.5rem; top: 0.5rem;

View File

@@ -1,4 +1,4 @@
from pyjsx import JSX from pyjsx import JSX # type: ignore
def _tag(name: str): def _tag(name: str):
@@ -6,7 +6,7 @@ def _tag(name: str):
props = " ".join([f'{k}="{v}"' for k, v in args.items()]) props = " ".join([f'{k}="{v}"' for k, v in args.items()])
if children: if children:
child_renders = "".join([f"{c}" for c in children]) child_renders = "".join([str(c) for c in children])
return f"<{name} {props}>{child_renders}</{name}>" return f"<{name} {props}>{child_renders}</{name}>"
else: else:
return f"<{name} {props} />" return f"<{name} {props} />"
@@ -14,12 +14,7 @@ def _tag(name: str):
return factory return factory
circle = _tag("circle")
clipPath = _tag("clipPath")
defs = _tag("defs")
foreignObject = _tag("foreignobject")
image = _tag("image") image = _tag("image")
path = _tag("path") path = _tag("path")
rect = _tag("rect") rect = _tag("rect")
text = _tag("text") text = _tag("text")
use = _tag("use")