Change image rendering

This commit is contained in:
Vik Paruchuri
2025-10-26 10:27:49 -04:00
parent 528b58c16f
commit 2d2d7ab331
6 changed files with 154 additions and 48 deletions

View File

@@ -13,6 +13,15 @@ def flatten(page, flag=pdfium_c.FLAT_NORMALDISPLAY):
print(f"Failed to flatten annotations / form fields on page {page}.")
def load_image(filepath: str):
    """Load an image file as RGB, upscaling it when it is too small.

    When either side is shorter than ``settings.MIN_IMAGE_DIM`` the image is
    resized with LANCZOS resampling, keeping its aspect ratio, so that the
    shorter side reaches ``settings.MIN_IMAGE_DIM``. Larger images are
    returned unchanged apart from the RGB conversion.

    Args:
        filepath: Path of the image file to open.

    Returns:
        The RGB image, upscaled if it was below the minimum dimension.
    """
    # convert() yields a new, fully loaded image, so the handle on the source
    # file can be released as soon as the conversion has finished.
    with Image.open(filepath) as source:
        image = source.convert("RGB")

    min_dim = settings.MIN_IMAGE_DIM
    if image.width < min_dim or image.height < min_dim:
        scale = min_dim / min(image.width, image.height)
        # round() instead of int(): float error in `side * scale` can land a
        # hair below the target, and truncation would then leave the short
        # side one pixel under the minimum.
        new_size = (round(image.width * scale), round(image.height * scale))
        image = image.resize(new_size, Image.LANCZOS)
    return image
def load_pdf_images(filepath: str, page_range: List[int]):
doc = pdfium.PdfDocument(filepath)
doc.init_forms()
@@ -22,7 +31,7 @@ def load_pdf_images(filepath: str, page_range: List[int]):
if not page_range or page in page_range:
page_obj = doc[page]
min_page_dim = min(page_obj.get_width(), page_obj.get_height())
scale_dpi = (settings.MIN_IMAGE_DIM / min_page_dim) * 72
scale_dpi = (settings.MIN_PDF_IMAGE_DIM / min_page_dim) * 72
scale_dpi = max(scale_dpi, settings.IMAGE_DPI)
page_obj = doc[page]
flatten(page_obj)
@@ -56,5 +65,5 @@ def load_file(filepath: str, config: dict):
if input_type and input_type.extension == "pdf":
images = load_pdf_images(filepath, page_range)
else:
images = [Image.open(filepath).convert("RGB")]
images = [load_image(filepath)]
return images

View File

@@ -71,6 +71,15 @@ def parse_html(
else:
img = BeautifulSoup(f"<img src='{img_src}'/>", "html.parser")
div.append(img)
if label in ["Text"] and not re.search(
"<.+>", str(div.decode_contents()).strip()
):
# Add inner p tags if missing for text blocks
text_content = str(div.decode_contents()).strip()
text_content = f"<p>{text_content}</p>"
div.clear()
div.append(BeautifulSoup(text_content, "html.parser"))
content = str(div.decode_contents())
out_html += content
return out_html

View File

@@ -143,6 +143,7 @@ def process():
"image_height": img_height,
"blocks": blocks_data,
"html": html_with_images,
"markdown": result.markdown,
}
)

View File

@@ -64,6 +64,20 @@
cursor: not-allowed;
}
.controls label {
display: flex;
align-items: center;
gap: 8px;
color: white;
font-size: 14px;
cursor: pointer;
user-select: none;
}
.controls input[type="checkbox"] {
cursor: pointer;
}
.loading {
display: none;
color: #f39c12;
@@ -75,6 +89,11 @@
font-weight: bold;
}
.success {
color: #27ae60;
font-weight: bold;
}
.screenshot-container {
display: none;
margin-top: 60px;
@@ -88,8 +107,18 @@
display: flex;
}
.left-panel, .right-panel {
flex: 1;
.left-panel {
flex: 0 0 40%;
display: flex;
flex-direction: column;
background: white;
border-radius: 8px;
overflow: hidden;
box-shadow: 0 4px 12px rgba(0,0,0,0.3);
}
.right-panel {
flex: 0 0 60%;
display: flex;
flex-direction: column;
background: white;
@@ -216,8 +245,14 @@
<input type="text" id="filePath" placeholder="Enter file path (e.g., /path/to/document.pdf)">
<input type="number" id="pageNumber" placeholder="Page" value="0" min="0">
<button id="processBtn" onclick="processFile()">Process</button>
<label>
<input type="checkbox" id="showLayoutBoxes" checked onchange="toggleLayoutBoxes()">
Show Layout Boxes
</label>
<button id="copyMarkdownBtn" onclick="copyMarkdown()" style="display: none;">Copy Markdown</button>
<span class="loading" id="loading">Processing...</span>
<span class="error" id="error"></span>
<span class="success" id="success"></span>
</div>
<div class="screenshot-container" id="container">
@@ -243,6 +278,11 @@
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/github-markdown-css/5.8.1/github-markdown.min.css" integrity="sha512-BrOPA520KmDMqieeM7XFe6a3u3Sb3F1JBaQnrIAmWg3EYrciJ+Qqe6ZcKCdfPv26rGcgTrJnZ/IdQEct8h3Zhw==" crossorigin="anonymous" referrerpolicy="no-referrer" />
<script>
// Global state to store markdown and canvas data
let currentMarkdown = null;
let currentData = null;
let currentImageSrc = null;
async function processFile() {
const filePath = document.getElementById('filePath').value;
const pageNumber = parseInt(document.getElementById('pageNumber').value) || 0;
@@ -286,6 +326,10 @@
}
function renderResults(data) {
// Store data for toggle functionality
currentData = data;
currentImageSrc = data.image_base64;
const canvas = document.getElementById('layoutCanvas');
const ctx = canvas.getContext('2d');
const markdownContent = document.getElementById('markdownContent');
@@ -293,51 +337,14 @@
// Draw image with layout overlays
const img = new Image();
img.onload = function() {
canvas.width = data.image_width;
canvas.height = data.image_height;
// Draw image
ctx.drawImage(img, 0, 0, data.image_width, data.image_height);
// Draw layout blocks
ctx.lineWidth = 3;
ctx.font = 'bold 14px -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif';
const labelCounts = {};
data.blocks.forEach((block) => {
const [x1, y1, x2, y2] = block.bbox;
const width = x2 - x1;
const height = y2 - y1;
// Draw rectangle with semi-transparent fill
ctx.strokeStyle = block.color;
ctx.fillStyle = block.color + '33';
ctx.fillRect(x1, y1, width, height);
ctx.strokeRect(x1, y1, width, height);
// Count labels for unique identification
labelCounts[block.label] = (labelCounts[block.label] || 0) + 1;
const labelWithCount = `${block.label} #${labelCounts[block.label]}`;
// Draw label with background
const textMetrics = ctx.measureText(labelWithCount);
const textWidth = textMetrics.width;
const textHeight = 16;
const padding = 6;
const labelX = x1;
const labelY = Math.max(y1 - textHeight - padding, textHeight);
ctx.fillStyle = block.color;
ctx.fillRect(labelX, labelY - textHeight, textWidth + padding * 2, textHeight + padding);
ctx.fillStyle = 'white';
ctx.textBaseline = 'top';
ctx.fillText(labelWithCount, labelX + padding, labelY - textHeight + padding/2);
});
drawCanvas(img, data, ctx);
};
img.src = data.image_base64;
// Store markdown and show copy button
currentMarkdown = data.markdown;
document.getElementById('copyMarkdownBtn').style.display = 'inline-block';
// Render HTML directly (with images embedded)
markdownContent.innerHTML = data.html;
@@ -363,6 +370,85 @@
});
}
/**
 * Paint the page image onto the layout canvas and, when the
 * "Show Layout Boxes" checkbox is ticked, overlay every layout block
 * with a tinted rectangle and a numbered label.
 *
 * @param {HTMLImageElement} img  Loaded page image to draw.
 * @param {Object} data  Result payload; reads `image_width`, `image_height`
 *                       and `blocks` (each with `bbox`, `color`, `label`).
 * @param {CanvasRenderingContext2D} ctx  2D context used for all drawing.
 */
function drawCanvas(img, data, ctx) {
    const { image_width: pageWidth, image_height: pageHeight } = data;

    // Size the canvas to the page before drawing anything
    const layoutCanvas = document.getElementById('layoutCanvas');
    layoutCanvas.width = pageWidth;
    layoutCanvas.height = pageHeight;

    // Page image first, overlays on top
    ctx.drawImage(img, 0, 0, pageWidth, pageHeight);

    // Overlays are optional: bail out when the checkbox is cleared
    if (!document.getElementById('showLayoutBoxes').checked) {
        return;
    }

    ctx.lineWidth = 3;
    ctx.font = 'bold 14px -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif';

    const TEXT_HEIGHT = 16;
    const PADDING = 6;
    const seenPerLabel = {};

    for (const block of data.blocks) {
        const [left, top, right, bottom] = block.bbox;
        const boxWidth = right - left;
        const boxHeight = bottom - top;

        // Outline in the block colour, fill with the same colour plus '33' alpha suffix
        ctx.strokeStyle = block.color;
        ctx.fillStyle = block.color + '33';
        ctx.fillRect(left, top, boxWidth, boxHeight);
        ctx.strokeRect(left, top, boxWidth, boxHeight);

        // Running per-label counter so each caption is unique ("Text #2")
        seenPerLabel[block.label] = (seenPerLabel[block.label] || 0) + 1;
        const caption = `${block.label} #${seenPerLabel[block.label]}`;

        // Caption sits above the box, clamped so it never leaves the top edge
        const captionWidth = ctx.measureText(caption).width;
        const captionY = Math.max(top - TEXT_HEIGHT - PADDING, TEXT_HEIGHT);
        const captionTop = captionY - TEXT_HEIGHT;

        ctx.fillStyle = block.color;
        ctx.fillRect(left, captionTop, captionWidth + PADDING * 2, TEXT_HEIGHT + PADDING);

        ctx.fillStyle = 'white';
        ctx.textBaseline = 'top';
        ctx.fillText(caption, left + PADDING, captionTop + PADDING / 2);
    }
}
/**
 * Redraw the layout canvas from the most recently stored result.
 * Wired to the "Show Layout Boxes" checkbox; drawCanvas reads the checkbox
 * state itself, so a plain redraw is enough. Does nothing until both
 * `currentData` and `currentImageSrc` have been set.
 */
function toggleLayoutBoxes() {
    const hasStoredResult = currentData && currentImageSrc;
    if (!hasStoredResult) {
        return;
    }

    const context = document.getElementById('layoutCanvas').getContext('2d');

    // Reload the stored image source and repaint once it is ready
    const pageImage = new Image();
    pageImage.onload = () => {
        drawCanvas(pageImage, currentData, context);
    };
    pageImage.src = currentImageSrc;
}
/**
 * Copy the markdown stored in `currentMarkdown` to the system clipboard.
 *
 * On success a confirmation is shown in `#success` for two seconds and any
 * earlier error text is cleared; on failure the reason is shown in `#error`.
 */
function copyMarkdown() {
    const errorEl = document.getElementById('error');
    const successEl = document.getElementById('success');

    if (!currentMarkdown) {
        errorEl.textContent = 'No markdown to copy';
        return;
    }

    // navigator.clipboard only exists in secure contexts (HTTPS or localhost).
    // Without this guard the call below throws synchronously, bypassing the
    // promise .catch(), and the user never sees why nothing was copied.
    if (!navigator.clipboard || typeof navigator.clipboard.writeText !== 'function') {
        errorEl.textContent = 'Failed to copy: clipboard access is not available in this context';
        return;
    }

    navigator.clipboard.writeText(currentMarkdown).then(() => {
        // Drop any stale error so success and failure text are never shown together
        errorEl.textContent = '';
        successEl.textContent = 'Markdown copied!';
        setTimeout(() => {
            successEl.textContent = '';
        }, 2000);
    }).catch((err) => {
        errorEl.textContent = 'Failed to copy: ' + err.message;
    });
}
// Allow Enter key to trigger processing
document.getElementById('filePath').addEventListener('keypress', function(e) {
if (e.key === 'Enter') processFile();

View File

@@ -9,7 +9,8 @@ class Settings(BaseSettings):
# Paths
BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
IMAGE_DPI: int = 192
MIN_IMAGE_DIM: int = 1024
MIN_PDF_IMAGE_DIM: int = 1024
MIN_IMAGE_DIM: int = 1536
MODEL_CHECKPOINT: str = "datalab-to/chandra"
TORCH_DEVICE: str | None = None
MAX_OUTPUT_TOKENS: int = 8192

View File

@@ -1,6 +1,6 @@
[project]
name = "chandra-ocr"
version = "0.1.7"
version = "0.1.8"
description = "OCR model that converts documents to markdown, HTML, or JSON."
readme = "README.md"
requires-python = ">=3.10"