From 417d210904e0b393cf92e3949fbb21b2f933fc03 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Mon, 20 Oct 2025 16:58:52 -0400 Subject: [PATCH] Cleanups --- README.md | 48 ++- chandra/output.py | 13 +- chandra/scripts/__init__.py | 0 chandra_app.py => chandra/scripts/app.py | 2 +- chandra_cli.py => chandra/scripts/cli.py | 0 chandra/scripts/run_app.py | 25 ++ chandra/scripts/screenshot_app.py | 158 +++++++++ chandra/scripts/templates/screenshot.html | 373 ++++++++++++++++++++++ pyproject.toml | 10 +- uv.lock | 4 +- 10 files changed, 606 insertions(+), 27 deletions(-) create mode 100644 chandra/scripts/__init__.py rename chandra_app.py => chandra/scripts/app.py (98%) rename chandra_cli.py => chandra/scripts/cli.py (100%) create mode 100644 chandra/scripts/run_app.py create mode 100644 chandra/scripts/screenshot_app.py create mode 100644 chandra/scripts/templates/screenshot.html diff --git a/README.md b/README.md index 51c4033..9325f70 100644 --- a/README.md +++ b/README.md @@ -5,29 +5,43 @@ Chandra is a highly accurate OCR model that converts images and PDFs into struct ## Features - Convert documents to markdown, html, or json with detailed layout information +- Good handwriting support +- Reconstructs forms accurately, including checkboxes - Math equation support (LaTeX) -- Reconstructs forms, including checkboxes - Precise table reconstruction - Support for 40+ languages - Two inference modes: local (HuggingFace) and remote (vLLM server) + ## Benchmarks -| **Model** | ArXiv | Old Scans Math | Tables | Old Scans | Headers and Footers | Multi column | Long tiny text | Base | Overall | -|:----------|:-----:|:--------------:|:------:|:---------:|:-------------------:|:------------:|:--------------:|:----:|:-------:| -| Datalab Chandra v0.1.0 | 81.4 | **80.3** | **89.4** | **50.0** | 88.3 | **81.0** | **91.6** | **99.9** | **82.7 ± 0.9** | -| Datalab Marker v1.10.0 | **83.8** | 69.7 | 74.8 | 32.3 | 86.6 | 79.4 | 85.7 | 99.6 | 76.5 ± 1.0 | -| Mistral OCR API | 77.2 | 67.5 | 60.6 | 29.3 | 93.6 | 71.3 | 77.1 | 99.4 | 72.0 ± 1.1 | -| Deepseek OCR | 75.2 | 67.9 | 79.1 | 32.9 | 96.1 | 66.3 | 78.5 | 97.7 | 74.2 ± 1.0 | -| Nanonets OCR | 67.0 | 68.6 | 77.7 | 39.5 | 40.7 | 69.9 | 53.4 | 99.3 | 64.5 ± 1.1 | -| GPT-4o (Anchored) | 53.5 | 74.5 | 70.0 | 40.7 | 93.8 | 69.3 | 60.6 | 96.8 | 69.9 ± 1.1 | -| Gemini Flash 2 (Anchored) | 54.5 | 56.1 | 72.1 | 34.2 | 64.7 | 61.5 | 71.5 | 95.6 | 63.8 ± 1.2 | -| Qwen 2.5 VL (No Anchor) | 63.1 | 65.7 | 67.3 | 38.6 | 73.6 | 68.3 | 49.1 | 98.3 | 65.5 ± 1.2 | -| olmOCR v0.3.0 | 78.6 | 79.9 | 72.9 | 43.9 | **95.1** | 77.3 | 81.2 | 98.9 | 78.5 ± 1.1 | +| **Model** | ArXiv | Old Scans Math | Tables | Old Scans | Headers and Footers | Multi column | Long tiny text | Base | Overall | +|:----------|:--------:|:--------------:|:--------:|:---------:|:-------------------:|:------------:|:--------------:|:--------:|:--------------:| +| Datalab Chandra v0.1.0 | 81.4 | **80.3** | **89.4** | **50.0** | 88.3 | **81.0** | **91.6** | **99.9** | **82.7 ± 0.9** | +| Datalab Marker v1.10.0 | **83.8** | 69.7 | 74.8 | 32.3 | 86.6 | 79.4 | 85.7 | 99.6 | 76.5 ± 1.0 | +| Mistral OCR API | 77.2 | 67.5 | 60.6 | 29.3 | 93.6 | 71.3 | 77.1 | 99.4 | 72.0 ± 1.1 | +| Deepseek OCR | 75.2 | 67.9 | 79.1 | 32.9 | 96.1 | 66.3 | 78.5 | 97.7 | 74.2 ± 1.0 | +| Nanonets OCR | 67.0 | 68.6 | 77.7 | 39.5 | 40.7 | 69.9 | 53.4 | 99.3 | 64.5 ± 1.1 | +| GPT-4o (Anchored) | 53.5 | 74.5 | 70.0 | 40.7 | 93.8 | 69.3 | 60.6 | 96.8 | 69.9 ± 1.1 | +| Gemini Flash 2 (Anchored) | 54.5 | 56.1 | 72.1 | 34.2 | 64.7 | 61.5 | 
71.5 | 95.6 | 63.8 ± 1.2 | +| Qwen 2.5 VL (No Anchor) | 63.1 | 65.7 | 67.3 | 38.6 | 73.6 | 68.3 | 49.1 | 98.3 | 65.5 ± 1.2 | +| Qwen 3 VL | 70.2 | 75.1 | 45.6 | 37.5 | 89.1 | 62.1 | 43.0 | 94.3 | 64.6 ± 1.1 | +| olmOCR v0.3.0 | 78.6 | 79.9 | 72.9 | 43.9 | **95.1** | 77.3 | 81.2 | 98.9 | 78.5 ± 1.1 | + ## Installation +### From PyPI (Recommended) + ```bash +pip install chandra-ocr +``` + +### From Source + +```bash +git clone https://github.com/yourusername/chandra.git +cd chandra uv sync source .venv/bin/activate ``` @@ -39,14 +53,14 @@ source .venv/bin/activate Process single files or entire directories: ```bash -# Process a single PDF with vLLM -python chandra_cli.py input.pdf ./output --method vllm +# Single file, with vllm server (see below for how to launch) +chandra input.pdf ./output --method vllm # Process all files in a directory with local model -python chandra_cli.py ./documents ./output --method hf +chandra ./documents ./output --method hf # Process specific pages with custom settings -python chandra_cli.py document.pdf ./output --page-range "1-10,15,20-25" --max-workers 8 +chandra document.pdf ./output --page-range "1-10,15,20-25" --max-workers 8 ``` **CLI Options:** @@ -71,7 +85,7 @@ Each processed file creates a subdirectory with: Launch the interactive demo for single-page processing: ```bash -streamlit run chandra_app.py --server.fileWatcherType none --server.headless true +chandra_app ``` The web interface allows you to: diff --git a/chandra/output.py b/chandra/output.py index 6ff0b6e..e8540c4 100644 --- a/chandra/output.py +++ b/chandra/output.py @@ -15,13 +15,12 @@ def _hash_html(html: str): return hashlib.md5(html.encode("utf-8")).hexdigest() -def get_image_name(html: str, div_idx: int, image_idx: int): +def get_image_name(html: str, div_idx: int): html_hash = _hash_html(html) - return f"{html_hash}_{div_idx}_img{image_idx}.webp" + return f"{html_hash}_{div_idx}_img.webp" def extract_images(html: str, chunks: dict, image: Image.Image): - image_idx = 0 images = {} div_idx = 0 for idx, chunk in enumerate(chunks): @@ -31,9 +30,9 @@ def extract_images(html: str, chunks: dict, image: Image.Image): if not img: continue bbox = chunk["bbox"] - image = image.crop(bbox) - img_name = get_image_name(html, div_idx, image_idx) - images[img_name] = image + block_image = image.crop(bbox) + img_name = get_image_name(html, div_idx) + images[img_name] = block_image return images @@ -59,7 +58,7 @@ def parse_html( if label in ["Image", "Figure"]: img = div.find("img") - img_src = get_image_name(html, div_idx, image_idx) + img_src = get_image_name(html, div_idx) if img: img["src"] = img_src image_idx += 1 diff --git a/chandra/scripts/__init__.py b/chandra/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chandra_app.py b/chandra/scripts/app.py similarity index 98% rename from chandra_app.py rename to chandra/scripts/app.py index 368d4cc..cd1154a 100644 --- a/chandra_app.py +++ b/chandra/scripts/app.py @@ -64,7 +64,7 @@ def ocr_layout( return result, layout_image -st.set_page_config(layout="wide") +st.set_page_config(layout="wide", page_title="Chandra OCR Demo") col1, col2 = st.columns([0.5, 0.5]) st.markdown(""" diff --git a/chandra_cli.py b/chandra/scripts/cli.py similarity index 100% rename from chandra_cli.py rename to chandra/scripts/cli.py diff --git a/chandra/scripts/run_app.py b/chandra/scripts/run_app.py new file mode 100644 index 0000000..23a0777 --- /dev/null +++ b/chandra/scripts/run_app.py @@ -0,0 +1,25 @@ +import os +import subprocess +import sys + + 
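+# Launches the bundled Streamlit demo: resolves app.py relative to this file
+# and invokes `streamlit run` with the file watcher disabled and headless mode
+# enabled, forwarding any extra CLI arguments to the app after the `--` separator.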
+def main(): + argv = sys.argv[1:] + cur_dir = os.path.dirname(os.path.abspath(__file__)) + app_path = os.path.join(cur_dir, "app.py") + cmd = [ + "streamlit", + "run", + app_path, + "--server.fileWatcherType", + "none", + "--server.headless", + "true", + ] + if argv: + cmd += ["--"] + argv + subprocess.run(cmd) + + +if __name__ == "__main__": + main() diff --git a/chandra/scripts/screenshot_app.py b/chandra/scripts/screenshot_app.py new file mode 100644 index 0000000..e7ab0f2 --- /dev/null +++ b/chandra/scripts/screenshot_app.py @@ -0,0 +1,158 @@ +""" +Simple Flask app for generating screenshot-ready OCR visualizations. +Displays original image with layout overlays on the left and extracted markdown on the right. +""" + +from flask import Flask, render_template, request, jsonify +import base64 +from io import BytesIO + +from PIL import Image +from chandra.model import InferenceManager +from chandra.input import load_file +from chandra.model.schema import BatchInputItem +from chandra.output import parse_layout + +app = Flask(__name__) + +# Load model once at startup +model = None + + +def get_model(): + global model + if model is None: + model = InferenceManager(method="vllm") + return model + + +def pil_image_to_base64(pil_image: Image.Image, format: str = "PNG") -> str: + """Convert PIL image to base64 data URL.""" + buffered = BytesIO() + pil_image.save(buffered, format=format) + img_str = base64.b64encode(buffered.getvalue()).decode() + return f"data:image/{format.lower()};base64,{img_str}" + + +def get_color_palette(): + """Return a color palette for different block types.""" + return { + "Section-Header": "#4ECDC4", + "Text": "#45B7D1", + "List-Group": "#96CEB4", + "Table": "#FFEAA7", + "Figure": "#DDA15E", + "Image": "#BC6C25", + "Caption": "#C77DFF", + "Equation": "#9D4EDD", + "Page-Header": "#E0AFA0", + "Page-Footer": "#D4A5A5", + "Footnote": "#A8DADC", + "Form": "#F4A261", + "default": "#FF00FF", + } + + +@app.route("/") +def index(): + return render_template("screenshot.html") + + +@app.route("/process", methods=["POST"]) +def process(): + data = request.json + file_path = data.get("file_path") + page_number = data.get("page_number", 0) + + if not file_path: + return jsonify({"error": "file_path is required"}), 400 + + try: + # Load image + images = load_file(file_path, {"page_range": str(page_number)}) + if not images: + return jsonify({"error": "No images found"}), 400 + + img = images[0] + + # Run OCR + model = get_model() + batch = BatchInputItem(image=img, prompt_type="ocr_layout") + result = model.generate([batch])[0] + + # Parse layout + layout_blocks = parse_layout(result.raw, img) + + # Get markdown and HTML + html = result.html + + # Convert extracted images to base64 and embed in HTML + from bs4 import BeautifulSoup + + soup = BeautifulSoup(html, "html.parser") + + for img_name, pil_img in result.images.items(): + img_base64 = pil_image_to_base64(pil_img, format="PNG") + # Find all img tags with this src + img_tags = soup.find_all("img", src=img_name) + if len(img_tags) == 0: + print(f"Warning: No img tags found for {img_name}") + for img_tag in img_tags: + # Replace src with base64 + img_tag["src"] = img_base64 + + # Wrap image with alt text display + alt_text = img_tag.get("alt", "") + if alt_text: + wrapper = soup.new_tag("div", **{"class": "image-wrapper"}) + alt_div = soup.new_tag("div", **{"class": "image-alt-text"}) + alt_div.string = alt_text + img_container = soup.new_tag( + "div", **{"class": "image-container-wrapper"} + ) + + # Move img into container + 
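+                    # replace_with() swaps the wrapper into the tree and detaches the original
+                    # <img>, which is then re-attached inside the container div below.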
img_tag_copy = img_tag + img_tag.replace_with(wrapper) + img_container.append(img_tag_copy) + + wrapper.append(alt_div) + wrapper.append(img_container) + + # Convert back to HTML string + html_with_images = str(soup) + + # Prepare response + img_base64 = pil_image_to_base64(img, format="PNG") + img_width, img_height = img.size + + color_palette = get_color_palette() + + # Prepare layout blocks data + blocks_data = [] + for block in layout_blocks: + color = color_palette.get(block.label, color_palette["default"]) + blocks_data.append( + {"bbox": block.bbox, "label": block.label, "color": color} + ) + + return jsonify( + { + "image_base64": img_base64, + "image_width": img_width, + "image_height": img_height, + "blocks": blocks_data, + "html": html_with_images, + } + ) + + except Exception as e: + return jsonify({"error": str(e)}), 500 + + +def main(): + app.run(host="0.0.0.0", port=8503) + + +if __name__ == "__main__": + main() diff --git a/chandra/scripts/templates/screenshot.html b/chandra/scripts/templates/screenshot.html new file mode 100644 index 0000000..d5398bc --- /dev/null +++ b/chandra/scripts/templates/screenshot.html @@ -0,0 +1,373 @@ + + + + + + Chandra OCR Screenshot Mode + + + +
[screenshot.html template body (~370 lines of markup, styles, and page script) not reproduced here; it renders a "Processing..." loading indicator, an "Original Image with Layout Detection" panel, and an "Extracted Content" panel]
diff --git a/pyproject.toml b/pyproject.toml index 1d15eb0..10d80c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,9 @@ authors = [ keywords = ["ocr", "pdf", "markdown", "layout"] dependencies = [ "beautifulsoup4>=4.14.2", + "click>=8.0.0", "filetype>=1.2.0", + "flask>=3.0.0", "markdownify==1.1.0", "openai>=2.2.0", "pillow>=10.2.0", @@ -26,8 +28,14 @@ dependencies = [ "streamlit>=1.50.0" ] +[build-system] +requires = ["setuptools>=61"] +build-backend = "setuptools.build_meta" + [project.scripts] -chandra = "chandra_cli:main" +chandra = "chandra.scripts.cli:main" +chandra_app = "chandra.scripts.run_app:main" +chandra_screenshot = "chandra.scripts.screenshot_app:main" [tool.setuptools.packages.find] include = ["chandra*"] diff --git a/uv.lock b/uv.lock index f90f101..0c6e4f9 100644 --- a/uv.lock +++ b/uv.lock @@ -163,9 +163,10 @@ wheels = [ [[package]] name = "chandra-ocr" version = "0.1.0" -source = { virtual = "." } +source = { editable = "." } dependencies = [ { name = "beautifulsoup4" }, + { name = "click" }, { name = "filetype" }, { name = "markdownify" }, { name = "openai" }, @@ -190,6 +191,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "beautifulsoup4", specifier = ">=4.14.2" }, + { name = "click", specifier = ">=8.0.0" }, { name = "filetype", specifier = ">=1.2.0" }, { name = "markdownify", specifier = "==1.1.0" }, { name = "openai", specifier = ">=2.2.0" },