This commit is contained in:
Vik Paruchuri
2025-10-20 16:58:52 -04:00
parent 98989faae4
commit 417d210904
10 changed files with 606 additions and 27 deletions

View File

@@ -5,29 +5,43 @@ Chandra is a highly accurate OCR model that converts images and PDFs into struct
## Features
- Convert documents to markdown, html, or json with detailed layout information
- Good handwriting support
- Reconstructs forms accurately, including checkboxes
- Math equation support (LaTeX)
- Reconstructs forms, including checkboxes
- Precise table reconstruction
- Support for 40+ languages
- Two inference modes: local (HuggingFace) and remote (vLLM server)
## Benchmarks
| **Model** | ArXiv | Old Scans Math | Tables | Old Scans | Headers and Footers | Multi column | Long tiny text | Base | Overall |
|:----------|:-----:|:--------------:|:------:|:---------:|:-------------------:|:------------:|:--------------:|:----:|:-------:|
| Datalab Chandra v0.1.0 | 81.4 | **80.3** | **89.4** | **50.0** | 88.3 | **81.0** | **91.6** | **99.9** | **82.7 ± 0.9** |
| Datalab Marker v1.10.0 | **83.8** | 69.7 | 74.8 | 32.3 | 86.6 | 79.4 | 85.7 | 99.6 | 76.5 ± 1.0 |
| Mistral OCR API | 77.2 | 67.5 | 60.6 | 29.3 | 93.6 | 71.3 | 77.1 | 99.4 | 72.0 ± 1.1 |
| Deepseek OCR | 75.2 | 67.9 | 79.1 | 32.9 | 96.1 | 66.3 | 78.5 | 97.7 | 74.2 ± 1.0 |
| Nanonets OCR | 67.0 | 68.6 | 77.7 | 39.5 | 40.7 | 69.9 | 53.4 | 99.3 | 64.5 ± 1.1 |
| GPT-4o (Anchored) | 53.5 | 74.5 | 70.0 | 40.7 | 93.8 | 69.3 | 60.6 | 96.8 | 69.9 ± 1.1 |
| Gemini Flash 2 (Anchored) | 54.5 | 56.1 | 72.1 | 34.2 | 64.7 | 61.5 | 71.5 | 95.6 | 63.8 ± 1.2 |
| Qwen 2.5 VL (No Anchor) | 63.1 | 65.7 | 67.3 | 38.6 | 73.6 | 68.3 | 49.1 | 98.3 | 65.5 ± 1.2 |
| olmOCR v0.3.0 | 78.6 | 79.9 | 72.9 | 43.9 | **95.1** | 77.3 | 81.2 | 98.9 | 78.5 ± 1.1 |
| **Model** | ArXiv | Old Scans Math | Tables | Old Scans | Headers and Footers | Multi column | Long tiny text | Base | Overall |
|:----------|:--------:|:--------------:|:--------:|:---------:|:-------------------:|:------------:|:--------------:|:--------:|:--------------:|
| Datalab Chandra v0.1.0 | 81.4 | **80.3** | **89.4** | **50.0** | 88.3 | **81.0** | **91.6** | **99.9** | **82.7 ± 0.9** |
| Datalab Marker v1.10.0 | **83.8** | 69.7 | 74.8 | 32.3 | 86.6 | 79.4 | 85.7 | 99.6 | 76.5 ± 1.0 |
| Mistral OCR API | 77.2 | 67.5 | 60.6 | 29.3 | 93.6 | 71.3 | 77.1 | 99.4 | 72.0 ± 1.1 |
| Deepseek OCR | 75.2 | 67.9 | 79.1 | 32.9 | 96.1 | 66.3 | 78.5 | 97.7 | 74.2 ± 1.0 |
| Nanonets OCR | 67.0 | 68.6 | 77.7 | 39.5 | 40.7 | 69.9 | 53.4 | 99.3 | 64.5 ± 1.1 |
| GPT-4o (Anchored) | 53.5 | 74.5 | 70.0 | 40.7 | 93.8 | 69.3 | 60.6 | 96.8 | 69.9 ± 1.1 |
| Gemini Flash 2 (Anchored) | 54.5 | 56.1 | 72.1 | 34.2 | 64.7 | 61.5 | 71.5 | 95.6 | 63.8 ± 1.2 |
| Qwen 2.5 VL (No Anchor) | 63.1 | 65.7 | 67.3 | 38.6 | 73.6 | 68.3 | 49.1 | 98.3 | 65.5 ± 1.2 |
| Qwen 3 VL | 70.2 | 75.1 | 45.6 | 37.5 | 89.1 | 62.1 | 43.0 | 94.3 | 64.6 ± 1.1 |
| olmOCR v0.3.0 | 78.6 | 79.9 | 72.9 | 43.9 | **95.1** | 77.3 | 81.2 | 98.9 | 78.5 ± 1.1 |
## Installation
### From PyPI (Recommended)
```bash
pip install chandra-ocr
```
### From Source
```bash
git clone https://github.com/datalab-to/chandra.git
cd chandra
uv sync
source .venv/bin/activate
```
@@ -39,14 +53,14 @@ source .venv/bin/activate
Process single files or entire directories:
```bash
# Process a single PDF with vLLM
python chandra_cli.py input.pdf ./output --method vllm
# Single file, with vllm server (see below for how to launch)
chandra input.pdf ./output --method vllm
# Process all files in a directory with local model
python chandra_cli.py ./documents ./output --method hf
chandra ./documents ./output --method hf
# Process specific pages with custom settings
python chandra_cli.py document.pdf ./output --page-range "1-10,15,20-25" --max-workers 8
chandra document.pdf ./output --page-range "1-10,15,20-25" --max-workers 8
```
**CLI Options:**
@@ -71,7 +85,7 @@ Each processed file creates a subdirectory with:
Launch the interactive demo for single-page processing:
```bash
streamlit run chandra_app.py --server.fileWatcherType none --server.headless true
chandra_app
```
The web interface allows you to:

View File

@@ -15,13 +15,12 @@ def _hash_html(html: str):
return hashlib.md5(html.encode("utf-8")).hexdigest()
def get_image_name(html: str, div_idx: int, image_idx: int):
def get_image_name(html: str, div_idx: int):
html_hash = _hash_html(html)
return f"{html_hash}_{div_idx}_img{image_idx}.webp"
return f"{html_hash}_{div_idx}_img.webp"
def extract_images(html: str, chunks: dict, image: Image.Image):
image_idx = 0
images = {}
div_idx = 0
for idx, chunk in enumerate(chunks):
@@ -31,9 +30,9 @@ def extract_images(html: str, chunks: dict, image: Image.Image):
if not img:
continue
bbox = chunk["bbox"]
image = image.crop(bbox)
img_name = get_image_name(html, div_idx, image_idx)
images[img_name] = image
block_image = image.crop(bbox)
img_name = get_image_name(html, div_idx)
images[img_name] = block_image
return images
@@ -59,7 +58,7 @@ def parse_html(
if label in ["Image", "Figure"]:
img = div.find("img")
img_src = get_image_name(html, div_idx, image_idx)
img_src = get_image_name(html, div_idx)
if img:
img["src"] = img_src
image_idx += 1

View File

View File

@@ -64,7 +64,7 @@ def ocr_layout(
return result, layout_image
st.set_page_config(layout="wide")
st.set_page_config(layout="wide", page_title="Chandra OCR Demo")
col1, col2 = st.columns([0.5, 0.5])
st.markdown("""

View File

@@ -0,0 +1,25 @@
import os
import subprocess
import sys
def main():
    """Launch the Streamlit demo app, forwarding any extra CLI arguments.

    Runs ``streamlit run app.py`` with file watching disabled and headless
    mode on (server-friendly defaults). Any arguments given after the
    console-script name are forwarded to the app itself, after Streamlit's
    ``--`` separator.
    """
    argv = sys.argv[1:]
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    app_path = os.path.join(cur_dir, "app.py")
    cmd = [
        "streamlit",
        "run",
        app_path,
        "--server.fileWatcherType",
        "none",
        "--server.headless",
        "true",
    ]
    if argv:
        cmd += ["--"] + argv
    # Propagate streamlit's exit status so a failed launch is not reported
    # as success by the `chandra_app` entry point.
    sys.exit(subprocess.run(cmd).returncode)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,158 @@
"""
Simple Flask app for generating screenshot-ready OCR visualizations.
Displays original image with layout overlays on the left and extracted markdown on the right.
"""
from flask import Flask, render_template, request, jsonify
import base64
from io import BytesIO
from PIL import Image
from chandra.model import InferenceManager
from chandra.input import load_file
from chandra.model.schema import BatchInputItem
from chandra.output import parse_layout
app = Flask(__name__)

# Process-wide model handle; populated lazily by get_model() on the first
# /process request (not at startup), so importing this module stays cheap.
model = None
def get_model():
    """Return the shared InferenceManager, creating it lazily on first call."""
    global model
    if model is not None:
        return model
    model = InferenceManager(method="vllm")
    return model
def pil_image_to_base64(pil_image: Image.Image, format: str = "PNG") -> str:
    """Encode *pil_image* into a ``data:`` URL string (default PNG)."""
    buf = BytesIO()
    pil_image.save(buf, format=format)
    encoded = base64.b64encode(buf.getvalue()).decode()
    return f"data:image/{format.lower()};base64,{encoded}"
def get_color_palette():
    """Map layout block labels to hex overlay colors.

    Labels not present in the mapping should use the "default" entry.
    """
    label_colors = {
        "Section-Header": "#4ECDC4",
        "Text": "#45B7D1",
        "List-Group": "#96CEB4",
        "Table": "#FFEAA7",
        "Figure": "#DDA15E",
        "Image": "#BC6C25",
        "Caption": "#C77DFF",
        "Equation": "#9D4EDD",
        "Page-Header": "#E0AFA0",
        "Page-Footer": "#D4A5A5",
        "Footnote": "#A8DADC",
        "Form": "#F4A261",
        "default": "#FF00FF",
    }
    return label_colors
@app.route("/")
def index():
    """Serve the single-page screenshot UI (templates/screenshot.html)."""
    return render_template("screenshot.html")
@app.route("/process", methods=["POST"])
def process():
    """OCR one page of a local file and return visualization data as JSON.

    Expects a JSON body: ``{"file_path": str, "page_number": int (default 0)}``.
    On success returns the page image (base64 data URL), its pixel dimensions,
    the detected layout blocks (bbox/label/color), and the extracted HTML with
    cropped images inlined as data URLs.

    NOTE(review): ``file_path`` comes straight from the request and is passed
    to ``load_file`` unvalidated — any file readable by the server process can
    be OCR'd. Presumably intended for trusted/local use only; confirm.
    """
    data = request.json
    file_path = data.get("file_path")
    page_number = data.get("page_number", 0)

    if not file_path:
        return jsonify({"error": "file_path is required"}), 400

    try:
        # Load the requested page as an image.
        images = load_file(file_path, {"page_range": str(page_number)})
        if not images:
            return jsonify({"error": "No images found"}), 400
        img = images[0]

        # Run OCR (local name shadows the module-level `model`; same object).
        model = get_model()
        batch = BatchInputItem(image=img, prompt_type="ocr_layout")
        result = model.generate([batch])[0]

        # Parse layout blocks (bboxes are in page-image coordinates).
        layout_blocks = parse_layout(result.raw, img)

        # Get markdown and HTML
        html = result.html

        # Convert extracted images to base64 and embed in HTML
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html, "html.parser")

        for img_name, pil_img in result.images.items():
            img_base64 = pil_image_to_base64(pil_img, format="PNG")
            # Find all img tags with this src
            img_tags = soup.find_all("img", src=img_name)
            if len(img_tags) == 0:
                print(f"Warning: No img tags found for {img_name}")
            for img_tag in img_tags:
                # Replace src with base64
                img_tag["src"] = img_base64
                # Wrap image with alt text display
                alt_text = img_tag.get("alt", "")
                if alt_text:
                    wrapper = soup.new_tag("div", **{"class": "image-wrapper"})
                    alt_div = soup.new_tag("div", **{"class": "image-alt-text"})
                    alt_div.string = alt_text
                    img_container = soup.new_tag(
                        "div", **{"class": "image-container-wrapper"}
                    )
                    # Move img into container: replace_with detaches img_tag
                    # from the tree; the retained reference is then re-attached
                    # inside the wrapper.
                    img_tag_copy = img_tag
                    img_tag.replace_with(wrapper)
                    img_container.append(img_tag_copy)
                    wrapper.append(alt_div)
                    wrapper.append(img_container)

        # Convert back to HTML string
        html_with_images = str(soup)

        # Prepare response (img_base64 is intentionally reused here for the
        # full-page image).
        img_base64 = pil_image_to_base64(img, format="PNG")
        img_width, img_height = img.size

        color_palette = get_color_palette()

        # Prepare layout blocks data
        blocks_data = []
        for block in layout_blocks:
            color = color_palette.get(block.label, color_palette["default"])
            blocks_data.append(
                {"bbox": block.bbox, "label": block.label, "color": color}
            )

        return jsonify(
            {
                "image_base64": img_base64,
                "image_width": img_width,
                "image_height": img_height,
                "blocks": blocks_data,
                "html": html_with_images,
            }
        )

    except Exception as e:
        # Broad catch: surface any failure to the UI as a JSON 500.
        return jsonify({"error": str(e)}), 500
def main():
    """Run the Flask visualization server on port 8503.

    NOTE(review): binds to 0.0.0.0, exposing /process (which reads arbitrary
    server-readable file paths) to the whole network — confirm this is only
    run in trusted environments.
    """
    app.run(host="0.0.0.0", port=8503)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,373 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Chandra OCR Screenshot Mode</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
background: #1a1a1a;
color: white;
overflow: hidden;
}
.controls {
position: fixed;
top: 0;
left: 0;
right: 0;
background: #2c3e50;
padding: 15px 20px;
display: flex;
gap: 15px;
align-items: center;
z-index: 2000;
box-shadow: 0 2px 8px rgba(0,0,0,0.3);
}
.controls input, .controls button {
padding: 8px 15px;
border: none;
border-radius: 4px;
font-size: 14px;
}
.controls input {
flex: 1;
max-width: 500px;
}
.controls input[type="number"] {
max-width: 100px;
}
.controls button {
background: #3498db;
color: white;
cursor: pointer;
font-weight: bold;
}
.controls button:hover {
background: #2980b9;
}
.controls button:disabled {
background: #7f8c8d;
cursor: not-allowed;
}
.loading {
display: none;
color: #f39c12;
font-weight: bold;
}
.error {
color: #e74c3c;
font-weight: bold;
}
.screenshot-container {
display: none;
margin-top: 60px;
height: calc(100vh - 60px);
gap: 20px;
padding: 20px;
flex-direction: row;
}
.screenshot-container.active {
display: flex;
}
.left-panel, .right-panel {
flex: 1;
display: flex;
flex-direction: column;
background: white;
border-radius: 8px;
overflow: hidden;
box-shadow: 0 4px 12px rgba(0,0,0,0.3);
}
.panel-header {
background: #2c3e50;
color: white;
padding: 15px 20px;
font-size: 18px;
font-weight: bold;
}
.panel-content {
flex: 1;
overflow: auto;
position: relative;
}
.image-container {
position: relative;
width: 100%;
height: 100%;
display: flex;
justify-content: center;
align-items: center;
background: #f5f5f5;
}
.image-alt-text {
border: 1px solid #e5e7eb;
}
#layoutCanvas {
display: block;
max-width: 100%;
max-height: 100%;
object-fit: contain;
}
.markdown-content {
padding: 30px;
line-height: 1.6;
color: #333;
}
.markdown-content h1, .markdown-content h2, .markdown-content h3 {
margin-top: 24px;
margin-bottom: 16px;
}
.markdown-content h1 { font-size: 2em; border-bottom: 1px solid #eee; padding-bottom: 0.3em; }
.markdown-content h2 { font-size: 1.5em; border-bottom: 1px solid #eee; padding-bottom: 0.3em; }
.markdown-content h3 { font-size: 1.25em; }
.markdown-content table {
border-collapse: collapse;
width: 100%;
margin: 20px 0;
}
.markdown-content table th, .markdown-content table td {
border: 1px solid #ddd;
padding: 8px 12px;
text-align: left;
}
.markdown-content table th {
background-color: #f2f2f2;
font-weight: bold;
}
.markdown-content code {
background: #f4f4f4;
padding: 2px 6px;
border-radius: 3px;
font-family: 'Monaco', 'Courier New', monospace;
font-size: 0.9em;
}
.markdown-content pre {
background: #f4f4f4;
padding: 16px;
border-radius: 6px;
overflow-x: auto;
}
.markdown-content pre code {
background: none;
padding: 0;
}
.markdown-content img {
max-width: 100%;
height: auto;
display: block;
margin: 20px auto;
border-radius: 4px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
.markdown-content figure {
margin: 20px 0;
text-align: center;
}
.markdown-content figure img {
margin: 0 auto 10px;
}
.markdown-content figcaption {
font-size: 0.9em;
color: #666;
font-style: italic;
}
</style>
</head>
<body>
<div class="controls">
<input type="text" id="filePath" placeholder="Enter file path (e.g., /path/to/document.pdf)">
<input type="number" id="pageNumber" placeholder="Page" value="0" min="0">
<button id="processBtn" onclick="processFile()">Process</button>
<span class="loading" id="loading">Processing...</span>
<span class="error" id="error"></span>
</div>
<div class="screenshot-container" id="container">
<div class="left-panel">
<div class="panel-header">Original Image with Layout Detection</div>
<div class="panel-content">
<div class="image-container">
<canvas id="layoutCanvas"></canvas>
</div>
</div>
</div>
<div class="right-panel">
<div class="panel-header">Extracted Content</div>
<div class="panel-content">
<div class="markdown-content" id="markdownContent"></div>
</div>
</div>
</div>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.25/dist/katex.min.css" integrity="sha384-WcoG4HRXMzYzfCgiyfrySxx90XSl2rxY5mnVY5TwtWE6KLrArNKn0T/mOgNL0Mmi" crossorigin="anonymous">
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.25/dist/katex.min.js" integrity="sha384-J+9dG2KMoiR9hqcFao0IBLwxt6zpcyN68IgwzsCSkbreXUjmNVRhPFTssqdSGjwQ" crossorigin="anonymous"></script>
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.25/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<script>
// Submit the current file path / page number to the backend and render the result.
async function processFile() {
    const path = document.getElementById('filePath').value;
    const page = parseInt(document.getElementById('pageNumber').value) || 0;
    const loadingEl = document.getElementById('loading');
    const errorEl = document.getElementById('error');
    const btn = document.getElementById('processBtn');
    const containerEl = document.getElementById('container');

    // Guard: nothing to do without a path.
    if (!path) {
        errorEl.textContent = 'Please enter a file path';
        return;
    }

    // Reset UI state before the request.
    errorEl.textContent = '';
    loadingEl.style.display = 'inline';
    btn.disabled = true;
    containerEl.classList.remove('active');

    try {
        const response = await fetch('/process', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ file_path: path, page_number: page })
        });
        if (!response.ok) {
            const payload = await response.json();
            throw new Error(payload.error || 'Processing failed');
        }
        renderResults(await response.json());
        containerEl.classList.add('active');
    } catch (err) {
        errorEl.textContent = `Error: ${err.message}`;
    } finally {
        loadingEl.style.display = 'none';
        btn.disabled = false;
    }
}
// Render one /process response: layout overlay canvas on the left,
// extracted HTML (with KaTeX-rendered math) on the right.
function renderResults(data) {
    const canvas = document.getElementById('layoutCanvas');
    const ctx = canvas.getContext('2d');
    const markdownContent = document.getElementById('markdownContent');

    // Draw image with layout overlays. Drawing happens asynchronously in
    // onload, once the base64 page image has decoded.
    const img = new Image();
    img.onload = function() {
        // Canvas uses the image's native pixel size, so block bboxes
        // (in image coordinates) need no scaling.
        canvas.width = data.image_width;
        canvas.height = data.image_height;

        // Draw image
        ctx.drawImage(img, 0, 0, data.image_width, data.image_height);

        // Draw layout blocks
        ctx.lineWidth = 3;
        ctx.font = 'bold 14px -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif';

        const labelCounts = {};
        data.blocks.forEach((block) => {
            const [x1, y1, x2, y2] = block.bbox;
            const width = x2 - x1;
            const height = y2 - y1;

            // Draw rectangle with semi-transparent fill ('33' hex alpha suffix)
            ctx.strokeStyle = block.color;
            ctx.fillStyle = block.color + '33';
            ctx.fillRect(x1, y1, width, height);
            ctx.strokeRect(x1, y1, width, height);

            // Count labels so repeated types get unique names ("Text #2", ...)
            labelCounts[block.label] = (labelCounts[block.label] || 0) + 1;
            const labelWithCount = `${block.label} #${labelCounts[block.label]}`;

            // Draw label with background; Math.max clamps it on-canvas when
            // the block sits near the top edge.
            const textMetrics = ctx.measureText(labelWithCount);
            const textWidth = textMetrics.width;
            const textHeight = 16;
            const padding = 6;
            const labelX = x1;
            const labelY = Math.max(y1 - textHeight - padding, textHeight);

            ctx.fillStyle = block.color;
            ctx.fillRect(labelX, labelY - textHeight, textWidth + padding * 2, textHeight + padding);
            ctx.fillStyle = 'white';
            ctx.textBaseline = 'top';
            ctx.fillText(labelWithCount, labelX + padding, labelY - textHeight + padding/2);
        });
    };
    img.src = data.image_base64;

    // The server already inlined extracted images as data URLs, so the HTML
    // can be injected directly.
    markdownContent.innerHTML = data.html;

    // Render math with KaTeX - find all <math> tags and render them
    const mathElements = markdownContent.querySelectorAll('math');
    mathElements.forEach(mathEl => {
        const latex = mathEl.textContent;
        const isBlock = mathEl.getAttribute('display') === 'block';
        try {
            const rendered = katex.renderToString(latex, {
                displayMode: isBlock,
                throwOnError: false
            });
            // Create a span to hold the rendered math
            const span = document.createElement('span');
            span.innerHTML = rendered;
            mathEl.replaceWith(span);
        } catch (e) {
            console.error('KaTeX render error:', e);
        }
    });
}
// Pressing Enter in either input field triggers processing.
['filePath', 'pageNumber'].forEach(function(id) {
    document.getElementById(id).addEventListener('keypress', function(e) {
        if (e.key === 'Enter') processFile();
    });
});
</script>
</body>
</html>

View File

@@ -11,7 +11,9 @@ authors = [
keywords = ["ocr", "pdf", "markdown", "layout"]
dependencies = [
"beautifulsoup4>=4.14.2",
"click>=8.0.0",
"filetype>=1.2.0",
"flask>=3.0.0",
"markdownify==1.1.0",
"openai>=2.2.0",
"pillow>=10.2.0",
@@ -26,8 +28,14 @@ dependencies = [
"streamlit>=1.50.0"
]
[build-system]
requires = ["setuptools>=61"] # any PEP 517 backend (e.g. "flit-core", "hatchling") would also work
build-backend = "setuptools.build_meta"
[project.scripts]
chandra = "chandra_cli:main"
chandra = "chandra.scripts.cli:main"
chandra_app = "chandra.scripts.run_app:main"
chandra_screenshot = "chandra.scripts.screenshot_app:main"
[tool.setuptools.packages.find]
include = ["chandra*"]

4
uv.lock generated
View File

@@ -163,9 +163,10 @@ wheels = [
[[package]]
name = "chandra-ocr"
version = "0.1.0"
source = { virtual = "." }
source = { editable = "." }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "click" },
{ name = "filetype" },
{ name = "markdownify" },
{ name = "openai" },
@@ -190,6 +191,7 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "beautifulsoup4", specifier = ">=4.14.2" },
{ name = "click", specifier = ">=8.0.0" },
{ name = "filetype", specifier = ">=1.2.0" },
{ name = "markdownify", specifier = "==1.1.0" },
{ name = "openai", specifier = ">=2.2.0" },