Initial benchmarks

This commit is contained in:
Vik Paruchuri
2025-10-20 15:11:12 -04:00
parent 313f9c71b8
commit 98989faae4
16 changed files with 860 additions and 76 deletions

28
.github/workflows/integration.yml vendored Normal file
View File

@@ -0,0 +1,28 @@
---
# CI: run the integration test suite on every push (self-hosted GPU runner).
name: Marker Plus CI tests

on: [push]

jobs:
  tests:
    # Self-hosted runner label with an NVIDIA T4 GPU attached.
    runs-on: t4_gpu
    steps:
      - uses: actions/checkout@v3
      - name: Install apt requirements
        # Native libraries needed by weasyprint/pango-based rendering.
        run: |
          sudo apt-get update
          sudo apt-get install -y libpango-1.0-0 libharfbuzz0b libpangoft2-1.0-0 libgdk-pixbuf2.0-0 libcairo2 libffi-dev shared-mime-info
      - name: Set up Python 3.12
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML does not coerce the version to a float.
          python-version: "3.12"
      - name: Install uv
        uses: astral-sh/setup-uv@v7
      - name: Install python dependencies
        run: |
          uv sync --group dev
      - name: Run tests
        env:
          # Token for pulling gated model weights from HuggingFace.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          PYTHONPATH: .
        run: |
          uv run pytest tests/integration

43
.github/workflows/publish.yml vendored Normal file
View File

@@ -0,0 +1,43 @@
---
# Release: build and publish to PyPI when a version tag (vX.Y.Z) is pushed.
name: Python package

on:
  push:
    tags:
      - "v*.*.*"

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Install uv
        uses: astral-sh/setup-uv@v7
      - name: Set up Python 3.12
        uses: actions/setup-python@v4
        with:
          python-version: "3.12"
      - name: Extract and verify version
        id: version
        # Fail the release if the git tag does not match pyproject.toml,
        # so a stale tag can never publish the wrong version.
        run: |
          # Get the tag name (e.g., v1.2.3)
          TAG_VERSION="${GITHUB_REF#refs/tags/}"
          echo "Tag version: $TAG_VERSION"
          # Extract version from pyproject.toml (e.g., 1.2.3)
          PYPROJECT_VERSION=$(uv run python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['project']['version'])")
          echo "pyproject.toml version: $PYPROJECT_VERSION"
          # Compare versions (tag should be v{pyproject_version})
          if [ "$TAG_VERSION" != "v$PYPROJECT_VERSION" ]; then
            echo "Error: Tag version ($TAG_VERSION) does not match pyproject.toml version (v$PYPROJECT_VERSION)"
            exit 1
          fi
          echo "Version verification passed!"
          # Quote the redirection target so it survives unusual runner paths.
          echo "version=$TAG_VERSION" >> "$GITHUB_OUTPUT"
      - name: Build package
        run: |
          uv build
      - name: Publish package
        env:
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
        run: |
          uv publish --token "$PYPI_TOKEN"

1
.gitignore vendored
View File

@@ -1,5 +1,6 @@
local.env local.env
experiments experiments
.claude
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/

202
LICENSE Normal file
View File

@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2025 Endless Labs, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -11,6 +11,20 @@ Chandra is a highly accurate OCR model that converts images and PDFs into struct
- Support for 40+ languages - Support for 40+ languages
- Two inference modes: local (HuggingFace) and remote (vLLM server) - Two inference modes: local (HuggingFace) and remote (vLLM server)
## Benchmarks
| **Model** | ArXiv | Old Scans Math | Tables | Old Scans | Headers and Footers | Multi column | Long tiny text | Base | Overall |
|:----------|:-----:|:--------------:|:------:|:---------:|:-------------------:|:------------:|:--------------:|:----:|:-------:|
| Datalab Chandra v0.1.0 | 81.4 | **80.3** | **89.4** | **50.0** | 88.3 | **81.0** | **91.6** | **99.9** | **82.7 ± 0.9** |
| Datalab Marker v1.10.0 | **83.8** | 69.7 | 74.8 | 32.3 | 86.6 | 79.4 | 85.7 | 99.6 | 76.5 ± 1.0 |
| Mistral OCR API | 77.2 | 67.5 | 60.6 | 29.3 | 93.6 | 71.3 | 77.1 | 99.4 | 72.0 ± 1.1 |
| Deepseek OCR | 75.2 | 67.9 | 79.1 | 32.9 | 96.1 | 66.3 | 78.5 | 97.7 | 74.2 ± 1.0 |
| Nanonets OCR | 67.0 | 68.6 | 77.7 | 39.5 | 40.7 | 69.9 | 53.4 | 99.3 | 64.5 ± 1.1 |
| GPT-4o (Anchored) | 53.5 | 74.5 | 70.0 | 40.7 | 93.8 | 69.3 | 60.6 | 96.8 | 69.9 ± 1.1 |
| Gemini Flash 2 (Anchored) | 54.5 | 56.1 | 72.1 | 34.2 | 64.7 | 61.5 | 71.5 | 95.6 | 63.8 ± 1.2 |
| Qwen 2.5 VL (No Anchor) | 63.1 | 65.7 | 67.3 | 38.6 | 73.6 | 68.3 | 49.1 | 98.3 | 65.5 ± 1.2 |
| olmOCR v0.3.0 | 78.6 | 79.9 | 72.9 | 43.9 | **95.1** | 77.3 | 81.2 | 98.9 | 78.5 ± 1.1 |
## Installation ## Installation
```bash ```bash
@@ -20,14 +34,53 @@ source .venv/bin/activate
## Usage ## Usage
### CLI
Process single files or entire directories:
```bash
# Process a single PDF with vLLM
python chandra_cli.py input.pdf ./output --method vllm
# Process all files in a directory with local model
python chandra_cli.py ./documents ./output --method hf
# Process specific pages with custom settings
python chandra_cli.py document.pdf ./output --page-range "1-10,15,20-25" --max-workers 8
```
**CLI Options:**
- `--method [hf|vllm]`: Inference method (default: vllm)
- `--page-range TEXT`: Page range for PDFs (e.g., "1-5,7,9-12")
- `--max-output-tokens INTEGER`: Max tokens per page
- `--max-workers INTEGER`: Parallel workers for vLLM
- `--include-images/--no-images`: Extract and save images (default: include)
- `--include-headers-footers/--no-headers-footers`: Include page headers/footers (default: exclude)
- `--batch-size INTEGER`: Pages per batch (default: 1)
**Output Structure:**
Each processed file creates a subdirectory with:
- `<filename>.md` - Markdown output
- `<filename>.html` - HTML output
- `<filename>_metadata.json` - Metadata (page info, token count, etc.)
- `images/` - Extracted images from the document
### Streamlit Web App ### Streamlit Web App
Launch the interactive demo: Launch the interactive demo for single-page processing:
```bash ```bash
streamlit run chandra_app.py --server.fileWatcherType none --server.headless true streamlit run chandra_app.py --server.fileWatcherType none --server.headless true
``` ```
The web interface allows you to:
- Upload PDFs or images
- Select specific pages from PDFs
- View OCR results with layout visualization
- Download markdown output
- See extracted images embedded in the output
**Inference Modes:** **Inference Modes:**
- **hf**: Loads model locally using HuggingFace Transformers (requires GPU) - **hf**: Loads model locally using HuggingFace Transformers (requires GPU)
- **vllm**: Connects to a running vLLM server for optimized batch inference - **vllm**: Connects to a running vLLM server for optimized batch inference
@@ -45,7 +98,6 @@ This launches a Docker container with optimized inference settings. Configure vi
- `VLLM_API_BASE`: Server URL (default: `http://localhost:8000/v1`) - `VLLM_API_BASE`: Server URL (default: `http://localhost:8000/v1`)
- `VLLM_MODEL_NAME`: Model name for the server (default: `chandra`) - `VLLM_MODEL_NAME`: Model name for the server (default: `chandra`)
- `VLLM_GPUS`: GPU device IDs (default: `0`) - `VLLM_GPUS`: GPU device IDs (default: `0`)
- `HF_TOKEN`: HuggingFace token for model access
### Configuration ### Configuration

View File

@@ -3,7 +3,7 @@ from typing import List
from chandra.model.hf import load_model, generate_hf from chandra.model.hf import load_model, generate_hf
from chandra.model.schema import BatchInputItem, BatchOutputItem from chandra.model.schema import BatchInputItem, BatchOutputItem
from chandra.model.vllm import generate_vllm from chandra.model.vllm import generate_vllm
from chandra.output import parse_markdown, parse_html, parse_chunks from chandra.output import parse_markdown, parse_html, parse_chunks, extract_images
class InferenceManager: class InferenceManager:
@@ -19,6 +19,14 @@ class InferenceManager:
def generate( def generate(
self, batch: List[BatchInputItem], max_output_tokens=None, **kwargs self, batch: List[BatchInputItem], max_output_tokens=None, **kwargs
) -> List[BatchOutputItem]: ) -> List[BatchOutputItem]:
output_kwargs = {}
if "include_images" in kwargs:
output_kwargs["include_images"] = kwargs.pop("include_images")
if "include_headers_footers" in kwargs:
output_kwargs["include_headers_footers"] = kwargs.pop(
"include_headers_footers"
)
if self.method == "vllm": if self.method == "vllm":
results = generate_vllm( results = generate_vllm(
batch, max_output_tokens=max_output_tokens, **kwargs batch, max_output_tokens=max_output_tokens, **kwargs
@@ -30,14 +38,16 @@ class InferenceManager:
output = [] output = []
for result, input_item in zip(results, batch): for result, input_item in zip(results, batch):
chunks = parse_chunks(result.raw, input_item.image)
output.append( output.append(
BatchOutputItem( BatchOutputItem(
markdown=parse_markdown(result.raw), markdown=parse_markdown(result.raw, **output_kwargs),
html=parse_html(result.raw), html=parse_html(result.raw, **output_kwargs),
chunks=parse_chunks(result.raw, input_item.image), chunks=chunks,
raw=result.raw, raw=result.raw,
page_box=[0, 0, input_item.image.width, input_item.image.height], page_box=[0, 0, input_item.image.width, input_item.image.height],
token_count=result.token_count, token_count=result.token_count,
images=extract_images(result.raw, chunks, input_item.image),
) )
) )
return output return output

View File

@@ -1,7 +1,7 @@
from typing import List from typing import List
from qwen_vl_utils import process_vision_info from qwen_vl_utils import process_vision_info
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor from transformers import Qwen3VLForConditionalGeneration, Qwen3VLProcessor
from chandra.model.schema import BatchInputItem, GenerationResult from chandra.model.schema import BatchInputItem, GenerationResult
from chandra.model.util import scale_to_fit from chandra.model.util import scale_to_fit
@@ -31,7 +31,7 @@ def generate_hf(
inputs = inputs.to("cuda") inputs = inputs.to("cuda")
# Inference: Generation of the output # Inference: Generation of the output
generated_ids = model.generate_hf(**inputs, max_new_tokens=max_output_tokens) generated_ids = model.generate(**inputs, max_new_tokens=max_output_tokens)
generated_ids_trimmed = [ generated_ids_trimmed = [
out_ids[len(in_ids) :] out_ids[len(in_ids) :]
for in_ids, out_ids in zip(inputs.input_ids, generated_ids) for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
@@ -65,13 +65,13 @@ def process_batch_element(item: BatchInputItem, processor):
def load_model(): def load_model():
model = Qwen2_5_VLForConditionalGeneration.from_pretrained( model = Qwen3VLForConditionalGeneration.from_pretrained(
settings.MODEL_CHECKPOINT, settings.MODEL_CHECKPOINT,
dtype=settings.TORCH_DTYPE, dtype=settings.TORCH_DTYPE,
device_map="auto", device_map="auto",
attn_implementation=settings.TORCH_ATTN_IMPLEMENTATION, attn_implementation=settings.TORCH_ATTN_IMPLEMENTATION,
).to(settings.TORCH_DEVICE_MODEL) ).to(settings.TORCH_DEVICE_MODEL)
model = model.eval() model = model.eval()
processor = Qwen2_5_VLProcessor.from_pretrained(settings.MODEL_CHECKPOINT) processor = Qwen3VLProcessor.from_pretrained(settings.MODEL_CHECKPOINT)
model.processor = processor model.processor = processor
return model return model

View File

@@ -26,3 +26,4 @@ class BatchOutputItem:
raw: str raw: str
page_box: List[int] page_box: List[int]
token_count: int token_count: int
images: dict

View File

@@ -1,6 +1,8 @@
import hashlib
import json import json
import re import re
from dataclasses import dataclass, asdict from dataclasses import dataclass, asdict
from functools import lru_cache
import six import six
from PIL import Image from PIL import Image
@@ -8,18 +10,59 @@ from bs4 import BeautifulSoup, NavigableString
from markdownify import MarkdownConverter, re_whitespace from markdownify import MarkdownConverter, re_whitespace
def parse_html(html: str, include_headers_footers: bool = False): @lru_cache
def _hash_html(html: str):
return hashlib.md5(html.encode("utf-8")).hexdigest()
def get_image_name(html: str, div_idx: int, image_idx: int):
html_hash = _hash_html(html)
return f"{html_hash}_{div_idx}_img{image_idx}.webp"
def extract_images(html: str, chunks: dict, image: Image.Image):
    """Crop each Image/Figure chunk out of the full page image.

    Args:
        html: Raw model output HTML; hashed to build stable image filenames.
        chunks: Iterable of chunk dicts with "label", "content" (parsed tag),
            and "bbox" keys. (Annotated ``dict`` upstream, but iterated like a
            sequence — NOTE(review): confirm the intended type.)
        image: Full page image the bboxes are relative to.

    Returns:
        Dict mapping generated filenames to cropped PIL images. Filenames are
        produced by get_image_name so they line up with the <img src=...>
        values parse_html writes.
    """
    image_idx = 0
    images = {}
    div_idx = 0
    for chunk in chunks:
        div_idx += 1
        if chunk["label"] in ["Image", "Figure"]:
            img = chunk["content"].find("img")
            if not img:
                continue
            bbox = chunk["bbox"]
            # Bug fix: crop into a new variable instead of rebinding `image`,
            # otherwise every later chunk crops from the previous crop rather
            # than from the full page.
            cropped = image.crop(bbox)
            img_name = get_image_name(html, div_idx, image_idx)
            images[img_name] = cropped
            # Bug fix: advance the counter for each matched <img>, mirroring
            # parse_html, so generated names stay in sync between the two.
            image_idx += 1
    return images
def parse_html(
html: str, include_headers_footers: bool = False, include_images: bool = True
):
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
top_level_divs = soup.find_all("div", recursive=False) top_level_divs = soup.find_all("div", recursive=False)
out_html = "" out_html = ""
image_idx = 0
div_idx = 0
for div in top_level_divs: for div in top_level_divs:
div_idx += 1
label = div.get("data-label") label = div.get("data-label")
# Skip headers and footers if not included # Skip headers and footers if not included
if label and not include_headers_footers: if label and not include_headers_footers:
if label in ["Page-Header", "Page-Footer"]: if label in ["Page-Header", "Page-Footer"]:
continue continue
if label and not include_images:
if label in ["Image", "Figure"]:
continue
if label in ["Image", "Figure"]:
img = div.find("img")
img_src = get_image_name(html, div_idx, image_idx)
if img:
img["src"] = img_src
image_idx += 1
content = str(div.decode_contents()) content = str(div.decode_contents())
out_html += content out_html += content
return out_html return out_html
@@ -125,8 +168,10 @@ class Markdownify(MarkdownConverter):
return text return text
def parse_markdown(html: str, include_headers_footers: bool = False): def parse_markdown(
html = parse_html(html, include_headers_footers) html: str, include_headers_footers: bool = False, include_images: bool = True
):
html = parse_html(html, include_headers_footers, include_images)
md_cls = Markdownify( md_cls = Markdownify(
heading_style="ATX", heading_style="ATX",

View File

@@ -1,6 +1,9 @@
import pypdfium2 as pdfium import pypdfium2 as pdfium
import streamlit as st import streamlit as st
from PIL import Image from PIL import Image
import base64
from io import BytesIO
import re
from chandra.model import InferenceManager from chandra.model import InferenceManager
from chandra.util import draw_layout from chandra.util import draw_layout
@@ -27,6 +30,26 @@ def page_counter(pdf_file):
return doc_len return doc_len
def pil_image_to_base64(pil_image: Image.Image, format: str = "PNG") -> str:
    """Convert PIL image to base64 data URL."""
    # Serialize the image into an in-memory buffer in the requested format.
    buffered = BytesIO()
    pil_image.save(buffered, format=format)
    # Encode the raw bytes as base64 text so it can be inlined in markdown.
    img_str = base64.b64encode(buffered.getvalue()).decode()
    # MIME subtype is the lowercased format name (e.g. "png", "webp").
    return f"data:image/{format.lower()};base64,{img_str}"
def embed_images_in_markdown(markdown: str, images: dict) -> str:
    """Inline extracted images into markdown as base64 data URLs.

    Each key of `images` is a filename referenced by an image link in the
    markdown; its value is the corresponding PIL image. Every matching
    ``![alt](filename)`` (with an optional quoted title) is rewritten so the
    link target is a self-contained data URL.
    """
    result = markdown
    for name, pil_img in images.items():
        # Encode the image once, then rewrite every link that points at it.
        data_url = pil_image_to_base64(pil_img, format="PNG")
        # Matches ![...](name) or ![...](name "title"), keeping the alt text.
        link_re = rf'(!\[.*?\])\({re.escape(name)}(?:\s+"[^"]*")?\)'
        result = re.sub(link_re, rf"\1({data_url})", result)
    return result
def ocr_layout( def ocr_layout(
img: Image.Image, img: Image.Image,
model=None, model=None,
@@ -55,7 +78,7 @@ model_mode = st.sidebar.selectbox(
"Model Mode", "Model Mode",
["None", "hf", "vllm"], ["None", "hf", "vllm"],
index=0, index=0,
help="Select how to run inference: hf loads the model in memory using huggingface transformers, vllm connects to a running vLLM server." help="Select how to run inference: hf loads the model in memory using huggingface transformers, vllm connects to a running vLLM server.",
) )
# Only load model if a mode is selected # Only load model if a mode is selected
@@ -99,10 +122,15 @@ if run_ocr:
model, model,
) )
# Embed images as base64 data URLs in the markdown
markdown_with_images = embed_images_in_markdown(result.markdown, result.images)
with col1: with col1:
html_tab, text_tab, layout_tab = st.tabs(["HTML", "HTML as text", "Layout Image"]) html_tab, text_tab, layout_tab = st.tabs(
["HTML", "HTML as text", "Layout Image"]
)
with html_tab: with html_tab:
st.markdown(result.markdown, unsafe_allow_html=True) st.markdown(markdown_with_images, unsafe_allow_html=True)
st.download_button( st.download_button(
label="Download Markdown", label="Download Markdown",
data=result.markdown, data=result.markdown,
@@ -114,7 +142,11 @@ if run_ocr:
if layout_image: if layout_image:
with layout_tab: with layout_tab:
st.image(layout_image, caption="Detected Layout", use_container_width=True) st.image(
layout_image,
caption="Detected Layout",
use_container_width=True,
)
st.text_area(result.raw) st.text_area(result.raw)
with col2: with col2:

285
chandra_cli.py Executable file
View File

@@ -0,0 +1,285 @@
import json
from pathlib import Path
from typing import List
import click
from chandra.input import load_file
from chandra.model import InferenceManager
from chandra.model.schema import BatchInputItem
def get_supported_files(input_path: Path) -> List[Path]:
    """Get list of supported image/PDF files from path.

    Args:
        input_path: A single file or a directory to scan (non-recursive).

    Returns:
        Sorted list of paths whose suffix (case-insensitive) is a supported
        image/PDF extension.

    Raises:
        click.BadParameter: If a single file has an unsupported extension,
            or the path does not exist.
    """
    supported_extensions = {
        ".pdf",
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".webp",
        ".tiff",
        ".bmp",
    }
    if input_path.is_file():
        if input_path.suffix.lower() in supported_extensions:
            return [input_path]
        raise click.BadParameter(f"Unsupported file type: {input_path.suffix}")
    if input_path.is_dir():
        # Case-insensitive suffix check: also catches mixed-case names like
        # "scan.Pdf" that the old lower+upper double-glob missed, avoids
        # duplicates on case-insensitive filesystems, and skips directories
        # that happen to end in a supported extension.
        return sorted(
            p
            for p in input_path.iterdir()
            if p.is_file() and p.suffix.lower() in supported_extensions
        )
    raise click.BadParameter(f"Path does not exist: {input_path}")
def save_merged_output(
    output_dir: Path,
    file_name: str,
    results: List,
    save_images: bool = True,
    save_html: bool = True,
    paginate_output: bool = False,
):
    """Save merged OCR results for all pages to output directory.

    Writes, under ``output_dir/<stem>/``: a merged markdown file, optionally a
    merged HTML file, a ``*_metadata.json`` summary, and (optionally) every
    extracted image under ``images/``.

    Args:
        output_dir: Root output directory; a per-file subfolder is created.
        file_name: Original input filename (its stem names the subfolder).
        results: Per-page result objects exposing markdown, html, chunks,
            images, token_count and page_box.
        save_images: Write extracted images to disk when True.
        save_html: Write the merged HTML file when True.
        paginate_output: Insert page separators between pages when True.
    """
    # Create subfolder for this file
    safe_name = Path(file_name).stem
    file_output_dir = output_dir / safe_name
    file_output_dir.mkdir(parents=True, exist_ok=True)
    # Merge all pages
    all_markdown = []
    all_html = []
    all_metadata = []
    total_tokens = 0
    total_chunks = 0
    total_images = 0
    # Process each page result
    for page_num, result in enumerate(results):
        # Add page separator for multi-page documents
        if page_num > 0 and paginate_output:
            # Markdown separator uses the 0-based previous page number;
            # HTML comment uses the 1-based page number.
            all_markdown.append(f"\n\n{page_num}" + "-" * 48 + "\n\n")
            all_html.append(f"\n\n<!-- Page {page_num + 1} -->\n\n")
        all_markdown.append(result.markdown)
        all_html.append(result.html)
        # Accumulate metadata
        total_tokens += result.token_count
        total_chunks += len(result.chunks)
        total_images += len(result.images)
        page_metadata = {
            "page_num": page_num,
            "page_box": result.page_box,
            "token_count": result.token_count,
            "num_chunks": len(result.chunks),
            "num_images": len(result.images),
        }
        all_metadata.append(page_metadata)
        # Save extracted images if requested
        if save_images and result.images:
            images_dir = file_output_dir / "images"
            images_dir.mkdir(exist_ok=True)
            for img_name, pil_image in result.images.items():
                img_path = images_dir / img_name
                pil_image.save(img_path)
    # Save merged markdown
    markdown_path = file_output_dir / f"{safe_name}.md"
    with open(markdown_path, "w", encoding="utf-8") as f:
        f.write("".join(all_markdown))
    # Save merged HTML if requested
    if save_html:
        html_path = file_output_dir / f"{safe_name}.html"
        with open(html_path, "w", encoding="utf-8") as f:
            f.write("".join(all_html))
    # Save combined metadata
    metadata = {
        "file_name": file_name,
        "num_pages": len(results),
        "total_token_count": total_tokens,
        "total_chunks": total_chunks,
        "total_images": total_images,
        "pages": all_metadata,
    }
    metadata_path = file_output_dir / f"{safe_name}_metadata.json"
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)
    click.echo(f" Saved: {markdown_path} ({len(results)} page(s))")
@click.command()
@click.argument("input_path", type=click.Path(exists=True, path_type=Path))
@click.argument("output_path", type=click.Path(path_type=Path))
@click.option(
    "--method",
    type=click.Choice(["hf", "vllm"], case_sensitive=False),
    default="vllm",
    help="Inference method: 'hf' for local model, 'vllm' for vLLM server.",
)
@click.option(
    "--page-range",
    type=str,
    default=None,
    help="Page range for PDFs (e.g., '1-5,7,9-12'). Only applicable to PDF files.",
)
@click.option(
    "--max-output-tokens",
    type=int,
    default=None,
    help="Maximum number of output tokens per page.",
)
@click.option(
    "--max-workers",
    type=int,
    default=None,
    help="Maximum number of parallel workers for vLLM inference.",
)
@click.option(
    "--max-retries",
    type=int,
    default=None,
    help="Maximum number of retries for vLLM inference.",
)
@click.option(
    "--include-images/--no-images",
    default=True,
    help="Include images in output.",
)
@click.option(
    "--include-headers-footers/--no-headers-footers",
    default=False,
    help="Include page headers and footers in output.",
)
@click.option(
    "--save-html/--no-html",
    default=True,
    help="Save HTML output files.",
)
@click.option(
    "--batch-size",
    type=int,
    default=1,
    help="Number of pages to process in a batch.",
)
@click.option(
    # Keep the original underscore spelling for backward compatibility; add the
    # dash spelling for consistency with every other option. Both map to the
    # explicit destination name "paginate_output".
    "--paginate_output",
    "--paginate-output",
    "paginate_output",
    is_flag=True,
    default=False,
    help="Paginate the merged output.",
)
def main(
    input_path: Path,
    output_path: Path,
    method: str,
    page_range: str | None,
    max_output_tokens: int | None,
    max_workers: int | None,
    max_retries: int | None,
    include_images: bool,
    include_headers_footers: bool,
    save_html: bool,
    batch_size: int,
    paginate_output: bool,
):
    """Run OCR over INPUT_PATH and write merged per-file results to OUTPUT_PATH.

    Loads the inference model once, discovers supported files under
    INPUT_PATH, processes each file's pages in batches, and saves a merged
    output per file. A file that raises during processing is reported and
    skipped rather than aborting the whole run.
    """
    click.echo("Chandra CLI - Starting OCR processing")
    click.echo(f"Input: {input_path}")
    click.echo(f"Output: {output_path}")
    click.echo(f"Method: {method}")
    # Create output directory
    output_path.mkdir(parents=True, exist_ok=True)
    # Load model once up front; it is reused for every file and batch.
    click.echo(f"\nLoading model with method '{method}'...")
    model = InferenceManager(method=method)
    click.echo("Model loaded successfully.")
    # Get files to process
    files_to_process = get_supported_files(input_path)
    click.echo(f"\nFound {len(files_to_process)} file(s) to process.")
    if not files_to_process:
        click.echo("No supported files found. Exiting.")
        return
    # Process each file independently; one failure must not stop the rest.
    for file_idx, file_path in enumerate(files_to_process, 1):
        click.echo(
            f"\n[{file_idx}/{len(files_to_process)}] Processing: {file_path.name}"
        )
        try:
            # Load images from file (page_range is only meaningful for PDFs).
            config = {"page_range": page_range} if page_range else {}
            images = load_file(str(file_path), config)
            click.echo(f" Loaded {len(images)} page(s)")
            # Accumulate all results for this document
            all_results = []
            # Process pages in batches
            for batch_start in range(0, len(images), batch_size):
                batch_end = min(batch_start + batch_size, len(images))
                batch_images = images[batch_start:batch_end]
                # Create batch input items
                batch = [
                    BatchInputItem(image=img, prompt_type="ocr_layout")
                    for img in batch_images
                ]
                # Run inference
                click.echo(f" Processing pages {batch_start + 1}-{batch_end}...")
                # Build kwargs for generate; only forward options the caller set.
                generate_kwargs = {
                    "include_images": include_images,
                    "include_headers_footers": include_headers_footers,
                }
                if max_output_tokens is not None:
                    generate_kwargs["max_output_tokens"] = max_output_tokens
                # Worker/retry tuning only applies to the vLLM backend.
                if method == "vllm":
                    if max_workers is not None:
                        generate_kwargs["max_workers"] = max_workers
                    if max_retries is not None:
                        generate_kwargs["max_retries"] = max_retries
                results = model.generate(batch, **generate_kwargs)
                all_results.extend(results)
            # Save merged output for all pages
            save_merged_output(
                output_path,
                file_path.name,
                all_results,
                save_images=include_images,
                save_html=save_html,
                paginate_output=paginate_output,
            )
            click.echo(f" Completed: {file_path.name}")
        except Exception as e:
            click.echo(f" Error processing {file_path.name}: {e}", err=True)
            continue
    click.echo(f"\nProcessing complete. Results saved to: {output_path}")
if __name__ == "__main__":
    main()

View File

@@ -1,9 +1,14 @@
[project] [project]
name = "chandra" name = "chandra-ocr"
version = "0.1.0" version = "0.1.0"
description = "Add your description here" description = "OCR model that converts documents to markdown, HTML, or JSON."
readme = "README.md" readme = "README.md"
requires-python = ">=3.10" requires-python = ">=3.10"
license = {text = "Apache-2.0"}
authors = [
{name = "Datalab", email = "hi@datalab.to"}
]
keywords = ["ocr", "pdf", "markdown", "layout"]
dependencies = [ dependencies = [
"beautifulsoup4>=4.14.2", "beautifulsoup4>=4.14.2",
"filetype>=1.2.0", "filetype>=1.2.0",
@@ -15,17 +20,20 @@ dependencies = [
"pypdfium2>=4.30.0", "pypdfium2>=4.30.0",
"python-dotenv>=1.1.1", "python-dotenv>=1.1.1",
"qwen-vl-utils>=0.0.14", "qwen-vl-utils>=0.0.14",
"streamlit>=1.50.0",
"surya-ocr>=0.17.0",
"torch>=2.8.0", "torch>=2.8.0",
"torchvision>=0.23.0", "torchvision>=0.23.0",
"transformers>=4.57.1", "transformers>=4.57.1",
"streamlit>=1.50.0"
] ]
[project.scripts]
chandra = "chandra_cli:main"
[tool.setuptools.packages.find] [tool.setuptools.packages.find]
include = ["chandra*"] include = ["chandra*"]
[dependency-groups] [dependency-groups]
dev = [ dev = [
"pre-commit>=4.3.0", "pre-commit>=4.3.0",
"pytest>=8.4.2",
] ]

4
pytest.ini Normal file
View File

@@ -0,0 +1,4 @@
[pytest]
testpaths=tests
filterwarnings =
ignore::Warning

10
tests/conftest.py Normal file
View File

@@ -0,0 +1,10 @@
import pytest
from PIL import Image, ImageDraw
@pytest.fixture(scope="session")
def simple_text_image() -> Image.Image:
    """Session-scoped fixture: an 800x600 white RGB image with the text
    "Hello, World!" drawn in black near the top-left corner.

    Built once per test session (scope="session"), so tests must treat the
    returned image as read-only.
    """
    image = Image.new("RGB", (800, 600), "white")
    draw = ImageDraw.Draw(image)
    # NOTE(review): the font_size keyword requires a recent Pillow — confirm
    # the project's minimum Pillow version supports it.
    draw.text((50, 50), "Hello, World!", fill="black", font_size=32)
    return image

View File

@@ -0,0 +1,18 @@
from chandra.model import InferenceManager, BatchInputItem
def test_inference_image(simple_text_image):
    """End-to-end OCR smoke test: a rendered "Hello, World!" image must
    round-trip through the model and appear in the markdown output."""
    # "hf" selects the local-model backend (no vLLM server required).
    manager = InferenceManager(method="hf")
    batch = [
        BatchInputItem(
            image=simple_text_image,
            prompt_type="ocr_layout",
        )
    ]
    # A small token budget keeps the test fast; the page holds one short line.
    outputs = manager.generate(batch, max_output_tokens=128)
    assert len(outputs) == 1
    output = outputs[0]
    # The OCR'd markdown must contain the exact rendered text.
    assert "Hello, World!" in output.markdown
    chunks = output.chunks
    # NOTE(review): assumes the single text line yields exactly one layout
    # chunk — confirm against the model's chunking behavior.
    assert len(chunks) == 1

155
uv.lock generated
View File

@@ -161,7 +161,7 @@ wheels = [
] ]
[[package]] [[package]]
name = "chandra" name = "chandra-ocr"
version = "0.1.0" version = "0.1.0"
source = { virtual = "." } source = { virtual = "." }
dependencies = [ dependencies = [
@@ -176,7 +176,6 @@ dependencies = [
{ name = "python-dotenv" }, { name = "python-dotenv" },
{ name = "qwen-vl-utils" }, { name = "qwen-vl-utils" },
{ name = "streamlit" }, { name = "streamlit" },
{ name = "surya-ocr" },
{ name = "torch" }, { name = "torch" },
{ name = "torchvision" }, { name = "torchvision" },
{ name = "transformers" }, { name = "transformers" },
@@ -185,6 +184,7 @@ dependencies = [
[package.dev-dependencies] [package.dev-dependencies]
dev = [ dev = [
{ name = "pre-commit" }, { name = "pre-commit" },
{ name = "pytest" },
] ]
[package.metadata] [package.metadata]
@@ -200,14 +200,16 @@ requires-dist = [
{ name = "python-dotenv", specifier = ">=1.1.1" }, { name = "python-dotenv", specifier = ">=1.1.1" },
{ name = "qwen-vl-utils", specifier = ">=0.0.14" }, { name = "qwen-vl-utils", specifier = ">=0.0.14" },
{ name = "streamlit", specifier = ">=1.50.0" }, { name = "streamlit", specifier = ">=1.50.0" },
{ name = "surya-ocr", specifier = ">=0.17.0" },
{ name = "torch", specifier = ">=2.8.0" }, { name = "torch", specifier = ">=2.8.0" },
{ name = "torchvision", specifier = ">=0.23.0" }, { name = "torchvision", specifier = ">=0.23.0" },
{ name = "transformers", specifier = ">=4.57.1" }, { name = "transformers", specifier = ">=4.57.1" },
] ]
[package.metadata.requires-dev] [package.metadata.requires-dev]
dev = [{ name = "pre-commit", specifier = ">=4.3.0" }] dev = [
{ name = "pre-commit", specifier = ">=4.3.0" },
{ name = "pytest", specifier = ">=8.4.2" },
]
[[package]] [[package]]
name = "charset-normalizer" name = "charset-normalizer"
@@ -312,15 +314,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277 }, { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277 },
] ]
[[package]]
name = "einops"
version = "0.8.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e5/81/df4fbe24dff8ba3934af99044188e20a98ed441ad17a274539b74e82e126/einops-0.8.1.tar.gz", hash = "sha256:de5d960a7a761225532e0f1959e5315ebeafc0cd43394732f103ca44b9837e84", size = 54805 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/87/62/9773de14fe6c45c23649e98b83231fffd7b9892b6cf863251dc2afa73643/einops-0.8.1-py3-none-any.whl", hash = "sha256:919387eb55330f5757c6bea9165c5ff5cfe63a642682ea788a6d472576d81737", size = 64359 },
]
[[package]] [[package]]
name = "exceptiongroup" name = "exceptiongroup"
version = "1.3.0" version = "1.3.0"
@@ -473,6 +466,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
] ]
[[package]]
name = "iniconfig"
version = "2.3.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484 },
]
[[package]] [[package]]
name = "jinja2" name = "jinja2"
version = "3.1.6" version = "3.1.6"
@@ -1041,24 +1043,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cb/92/6aeef1836e66dfec7f7f160a4f06d7041be7f6ccfc47a2f0f5738b332245/openai-2.2.0-py3-none-any.whl", hash = "sha256:d222e63436e33f3134a3d7ce490dc2d2f146fa98036eb65cc225df3ce163916f", size = 998972 }, { url = "https://files.pythonhosted.org/packages/cb/92/6aeef1836e66dfec7f7f160a4f06d7041be7f6ccfc47a2f0f5738b332245/openai-2.2.0-py3-none-any.whl", hash = "sha256:d222e63436e33f3134a3d7ce490dc2d2f146fa98036eb65cc225df3ce163916f", size = 998972 },
] ]
[[package]]
name = "opencv-python-headless"
version = "4.11.0.86"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/36/2f/5b2b3ba52c864848885ba988f24b7f105052f68da9ab0e693cc7c25b0b30/opencv-python-headless-4.11.0.86.tar.gz", hash = "sha256:996eb282ca4b43ec6a3972414de0e2331f5d9cda2b41091a49739c19fb843798", size = 95177929 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/53/2c50afa0b1e05ecdb4603818e85f7d174e683d874ef63a6abe3ac92220c8/opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:48128188ade4a7e517237c8e1e11a9cdf5c282761473383e77beb875bb1e61ca", size = 37326460 },
{ url = "https://files.pythonhosted.org/packages/3b/43/68555327df94bb9b59a1fd645f63fafb0762515344d2046698762fc19d58/opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:a66c1b286a9de872c343ee7c3553b084244299714ebb50fbdcd76f07ebbe6c81", size = 56723330 },
{ url = "https://files.pythonhosted.org/packages/45/be/1438ce43ebe65317344a87e4b150865c5585f4c0db880a34cdae5ac46881/opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6efabcaa9df731f29e5ea9051776715b1bdd1845d7c9530065c7951d2a2899eb", size = 29487060 },
{ url = "https://files.pythonhosted.org/packages/dd/5c/c139a7876099916879609372bfa513b7f1257f7f1a908b0bdc1c2328241b/opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e0a27c19dd1f40ddff94976cfe43066fbbe9dfbb2ec1907d66c19caef42a57b", size = 49969856 },
{ url = "https://files.pythonhosted.org/packages/95/dd/ed1191c9dc91abcc9f752b499b7928aacabf10567bb2c2535944d848af18/opencv_python_headless-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:f447d8acbb0b6f2808da71fddd29c1cdd448d2bc98f72d9bb78a7a898fc9621b", size = 29324425 },
{ url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386 },
]
[[package]] [[package]]
name = "packaging" name = "packaging"
version = "25.0" version = "25.0"
@@ -1198,6 +1182,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/73/cb/ac7874b3e5d58441674fb70742e6c374b28b0c7cb988d37d991cde47166c/platformdirs-4.5.0-py3-none-any.whl", hash = "sha256:e578a81bb873cbb89a41fcc904c7ef523cc18284b7e3b3ccf06aca1403b7ebd3", size = 18651 }, { url = "https://files.pythonhosted.org/packages/73/cb/ac7874b3e5d58441674fb70742e6c374b28b0c7cb988d37d991cde47166c/platformdirs-4.5.0-py3-none-any.whl", hash = "sha256:e578a81bb873cbb89a41fcc904c7ef523cc18284b7e3b3ccf06aca1403b7ebd3", size = 18651 },
] ]
[[package]]
name = "pluggy"
version = "1.6.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538 },
]
[[package]] [[package]]
name = "pre-commit" name = "pre-commit"
version = "4.3.0" version = "4.3.0"
@@ -1416,6 +1409,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ab/4c/b888e6cf58bd9db9c93f40d1c6be8283ff49d88919231afe93a6bcf61626/pydeck-0.9.1-py2.py3-none-any.whl", hash = "sha256:b3f75ba0d273fc917094fa61224f3f6076ca8752b93d46faf3bcfd9f9d59b038", size = 6900403 }, { url = "https://files.pythonhosted.org/packages/ab/4c/b888e6cf58bd9db9c93f40d1c6be8283ff49d88919231afe93a6bcf61626/pydeck-0.9.1-py2.py3-none-any.whl", hash = "sha256:b3f75ba0d273fc917094fa61224f3f6076ca8752b93d46faf3bcfd9f9d59b038", size = 6900403 },
] ]
[[package]]
name = "pygments"
version = "2.19.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217 },
]
[[package]] [[package]]
name = "pypdfium2" name = "pypdfium2"
version = "4.30.0" version = "4.30.0"
@@ -1436,6 +1438,24 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118 }, { url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118 },
] ]
[[package]]
name = "pytest"
version = "8.4.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" },
{ name = "exceptiongroup", marker = "python_full_version < '3.11'" },
{ name = "iniconfig" },
{ name = "packaging" },
{ name = "pluggy" },
{ name = "pygments" },
{ name = "tomli", marker = "python_full_version < '3.11'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750 },
]
[[package]] [[package]]
name = "python-dateutil" name = "python-dateutil"
version = "2.9.0.post0" version = "2.9.0.post0"
@@ -1913,30 +1933,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/38/991bbf9fa3ed3d9c8e69265fc449bdaade8131c7f0f750dbd388c3c477dc/streamlit-1.50.0-py3-none-any.whl", hash = "sha256:9403b8f94c0a89f80cf679c2fcc803d9a6951e0fba542e7611995de3f67b4bb3", size = 10068477 }, { url = "https://files.pythonhosted.org/packages/2a/38/991bbf9fa3ed3d9c8e69265fc449bdaade8131c7f0f750dbd388c3c477dc/streamlit-1.50.0-py3-none-any.whl", hash = "sha256:9403b8f94c0a89f80cf679c2fcc803d9a6951e0fba542e7611995de3f67b4bb3", size = 10068477 },
] ]
[[package]]
name = "surya-ocr"
version = "0.17.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
{ name = "einops" },
{ name = "filetype" },
{ name = "opencv-python-headless" },
{ name = "pillow" },
{ name = "platformdirs" },
{ name = "pre-commit" },
{ name = "pydantic" },
{ name = "pydantic-settings" },
{ name = "pypdfium2" },
{ name = "python-dotenv" },
{ name = "torch" },
{ name = "transformers" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ae/97/f868c1034da3d5788eb0d59f4b314f71bafe491e2524d3de3aa42fac2fd4/surya_ocr-0.17.0.tar.gz", hash = "sha256:3110ec9a2be0d4296968ced02ee4d33941f34c145a2d6ac508f75122014ed170", size = 155481 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/8b/91/7df8763a2d38ce628c3244520e338619b84aedc83ca760e0a0d42c5cf25e/surya_ocr-0.17.0-py3-none-any.whl", hash = "sha256:a728adb1aadd26493f1b937ec411f4b041fa93c8e3524c42b4c627c2e4744d5c", size = 183395 },
]
[[package]] [[package]]
name = "sympy" name = "sympy"
version = "1.14.0" version = "1.14.0"
@@ -1992,6 +1988,55 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/44/6f/7120676b6d73228c96e17f1f794d8ab046fc910d781c8d151120c3f1569e/toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", size = 16588 }, { url = "https://files.pythonhosted.org/packages/44/6f/7120676b6d73228c96e17f1f794d8ab046fc910d781c8d151120c3f1569e/toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", size = 16588 },
] ]
[[package]]
name = "tomli"
version = "2.3.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/52/ed/3f73f72945444548f33eba9a87fc7a6e969915e7b1acc8260b30e1f76a2f/tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549", size = 17392 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b3/2e/299f62b401438d5fe1624119c723f5d877acc86a4c2492da405626665f12/tomli-2.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:88bd15eb972f3664f5ed4b57c1634a97153b4bac4479dcb6a495f41921eb7f45", size = 153236 },
{ url = "https://files.pythonhosted.org/packages/86/7f/d8fffe6a7aefdb61bced88fcb5e280cfd71e08939da5894161bd71bea022/tomli-2.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:883b1c0d6398a6a9d29b508c331fa56adbcdff647f6ace4dfca0f50e90dfd0ba", size = 148084 },
{ url = "https://files.pythonhosted.org/packages/47/5c/24935fb6a2ee63e86d80e4d3b58b222dafaf438c416752c8b58537c8b89a/tomli-2.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1381caf13ab9f300e30dd8feadb3de072aeb86f1d34a8569453ff32a7dea4bf", size = 234832 },
{ url = "https://files.pythonhosted.org/packages/89/da/75dfd804fc11e6612846758a23f13271b76d577e299592b4371a4ca4cd09/tomli-2.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0e285d2649b78c0d9027570d4da3425bdb49830a6156121360b3f8511ea3441", size = 242052 },
{ url = "https://files.pythonhosted.org/packages/70/8c/f48ac899f7b3ca7eb13af73bacbc93aec37f9c954df3c08ad96991c8c373/tomli-2.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a154a9ae14bfcf5d8917a59b51ffd5a3ac1fd149b71b47a3a104ca4edcfa845", size = 239555 },
{ url = "https://files.pythonhosted.org/packages/ba/28/72f8afd73f1d0e7829bfc093f4cb98ce0a40ffc0cc997009ee1ed94ba705/tomli-2.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:74bf8464ff93e413514fefd2be591c3b0b23231a77f901db1eb30d6f712fc42c", size = 245128 },
{ url = "https://files.pythonhosted.org/packages/b6/eb/a7679c8ac85208706d27436e8d421dfa39d4c914dcf5fa8083a9305f58d9/tomli-2.3.0-cp311-cp311-win32.whl", hash = "sha256:00b5f5d95bbfc7d12f91ad8c593a1659b6387b43f054104cda404be6bda62456", size = 96445 },
{ url = "https://files.pythonhosted.org/packages/0a/fe/3d3420c4cb1ad9cb462fb52967080575f15898da97e21cb6f1361d505383/tomli-2.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:4dc4ce8483a5d429ab602f111a93a6ab1ed425eae3122032db7e9acf449451be", size = 107165 },
{ url = "https://files.pythonhosted.org/packages/ff/b7/40f36368fcabc518bb11c8f06379a0fd631985046c038aca08c6d6a43c6e/tomli-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7d86942e56ded512a594786a5ba0a5e521d02529b3826e7761a05138341a2ac", size = 154891 },
{ url = "https://files.pythonhosted.org/packages/f9/3f/d9dd692199e3b3aab2e4e4dd948abd0f790d9ded8cd10cbaae276a898434/tomli-2.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:73ee0b47d4dad1c5e996e3cd33b8a76a50167ae5f96a2607cbe8cc773506ab22", size = 148796 },
{ url = "https://files.pythonhosted.org/packages/60/83/59bff4996c2cf9f9387a0f5a3394629c7efa5ef16142076a23a90f1955fa/tomli-2.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:792262b94d5d0a466afb5bc63c7daa9d75520110971ee269152083270998316f", size = 242121 },
{ url = "https://files.pythonhosted.org/packages/45/e5/7c5119ff39de8693d6baab6c0b6dcb556d192c165596e9fc231ea1052041/tomli-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f195fe57ecceac95a66a75ac24d9d5fbc98ef0962e09b2eddec5d39375aae52", size = 250070 },
{ url = "https://files.pythonhosted.org/packages/45/12/ad5126d3a278f27e6701abde51d342aa78d06e27ce2bb596a01f7709a5a2/tomli-2.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e31d432427dcbf4d86958c184b9bfd1e96b5b71f8eb17e6d02531f434fd335b8", size = 245859 },
{ url = "https://files.pythonhosted.org/packages/fb/a1/4d6865da6a71c603cfe6ad0e6556c73c76548557a8d658f9e3b142df245f/tomli-2.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b0882799624980785240ab732537fcfc372601015c00f7fc367c55308c186f6", size = 250296 },
{ url = "https://files.pythonhosted.org/packages/a0/b7/a7a7042715d55c9ba6e8b196d65d2cb662578b4d8cd17d882d45322b0d78/tomli-2.3.0-cp312-cp312-win32.whl", hash = "sha256:ff72b71b5d10d22ecb084d345fc26f42b5143c5533db5e2eaba7d2d335358876", size = 97124 },
{ url = "https://files.pythonhosted.org/packages/06/1e/f22f100db15a68b520664eb3328fb0ae4e90530887928558112c8d1f4515/tomli-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:1cb4ed918939151a03f33d4242ccd0aa5f11b3547d0cf30f7c74a408a5b99878", size = 107698 },
{ url = "https://files.pythonhosted.org/packages/89/48/06ee6eabe4fdd9ecd48bf488f4ac783844fd777f547b8d1b61c11939974e/tomli-2.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5192f562738228945d7b13d4930baffda67b69425a7f0da96d360b0a3888136b", size = 154819 },
{ url = "https://files.pythonhosted.org/packages/f1/01/88793757d54d8937015c75dcdfb673c65471945f6be98e6a0410fba167ed/tomli-2.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:be71c93a63d738597996be9528f4abe628d1adf5e6eb11607bc8fe1a510b5dae", size = 148766 },
{ url = "https://files.pythonhosted.org/packages/42/17/5e2c956f0144b812e7e107f94f1cc54af734eb17b5191c0bbfb72de5e93e/tomli-2.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4665508bcbac83a31ff8ab08f424b665200c0e1e645d2bd9ab3d3e557b6185b", size = 240771 },
{ url = "https://files.pythonhosted.org/packages/d5/f4/0fbd014909748706c01d16824eadb0307115f9562a15cbb012cd9b3512c5/tomli-2.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4021923f97266babc6ccab9f5068642a0095faa0a51a246a6a02fccbb3514eaf", size = 248586 },
{ url = "https://files.pythonhosted.org/packages/30/77/fed85e114bde5e81ecf9bc5da0cc69f2914b38f4708c80ae67d0c10180c5/tomli-2.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4ea38c40145a357d513bffad0ed869f13c1773716cf71ccaa83b0fa0cc4e42f", size = 244792 },
{ url = "https://files.pythonhosted.org/packages/55/92/afed3d497f7c186dc71e6ee6d4fcb0acfa5f7d0a1a2878f8beae379ae0cc/tomli-2.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad805ea85eda330dbad64c7ea7a4556259665bdf9d2672f5dccc740eb9d3ca05", size = 248909 },
{ url = "https://files.pythonhosted.org/packages/f8/84/ef50c51b5a9472e7265ce1ffc7f24cd4023d289e109f669bdb1553f6a7c2/tomli-2.3.0-cp313-cp313-win32.whl", hash = "sha256:97d5eec30149fd3294270e889b4234023f2c69747e555a27bd708828353ab606", size = 96946 },
{ url = "https://files.pythonhosted.org/packages/b2/b7/718cd1da0884f281f95ccfa3a6cc572d30053cba64603f79d431d3c9b61b/tomli-2.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0c95ca56fbe89e065c6ead5b593ee64b84a26fca063b5d71a1122bf26e533999", size = 107705 },
{ url = "https://files.pythonhosted.org/packages/19/94/aeafa14a52e16163008060506fcb6aa1949d13548d13752171a755c65611/tomli-2.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cebc6fe843e0733ee827a282aca4999b596241195f43b4cc371d64fc6639da9e", size = 154244 },
{ url = "https://files.pythonhosted.org/packages/db/e4/1e58409aa78eefa47ccd19779fc6f36787edbe7d4cd330eeeedb33a4515b/tomli-2.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4c2ef0244c75aba9355561272009d934953817c49f47d768070c3c94355c2aa3", size = 148637 },
{ url = "https://files.pythonhosted.org/packages/26/b6/d1eccb62f665e44359226811064596dd6a366ea1f985839c566cd61525ae/tomli-2.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c22a8bf253bacc0cf11f35ad9808b6cb75ada2631c2d97c971122583b129afbc", size = 241925 },
{ url = "https://files.pythonhosted.org/packages/70/91/7cdab9a03e6d3d2bb11beae108da5bdc1c34bdeb06e21163482544ddcc90/tomli-2.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0eea8cc5c5e9f89c9b90c4896a8deefc74f518db5927d0e0e8d4a80953d774d0", size = 249045 },
{ url = "https://files.pythonhosted.org/packages/15/1b/8c26874ed1f6e4f1fcfeb868db8a794cbe9f227299402db58cfcc858766c/tomli-2.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b74a0e59ec5d15127acdabd75ea17726ac4c5178ae51b85bfe39c4f8a278e879", size = 245835 },
{ url = "https://files.pythonhosted.org/packages/fd/42/8e3c6a9a4b1a1360c1a2a39f0b972cef2cc9ebd56025168c4137192a9321/tomli-2.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5870b50c9db823c595983571d1296a6ff3e1b88f734a4c8f6fc6188397de005", size = 253109 },
{ url = "https://files.pythonhosted.org/packages/22/0c/b4da635000a71b5f80130937eeac12e686eefb376b8dee113b4a582bba42/tomli-2.3.0-cp314-cp314-win32.whl", hash = "sha256:feb0dacc61170ed7ab602d3d972a58f14ee3ee60494292d384649a3dc38ef463", size = 97930 },
{ url = "https://files.pythonhosted.org/packages/b9/74/cb1abc870a418ae99cd5c9547d6bce30701a954e0e721821df483ef7223c/tomli-2.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:b273fcbd7fc64dc3600c098e39136522650c49bca95df2d11cf3b626422392c8", size = 107964 },
{ url = "https://files.pythonhosted.org/packages/54/78/5c46fff6432a712af9f792944f4fcd7067d8823157949f4e40c56b8b3c83/tomli-2.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:940d56ee0410fa17ee1f12b817b37a4d4e4dc4d27340863cc67236c74f582e77", size = 163065 },
{ url = "https://files.pythonhosted.org/packages/39/67/f85d9bd23182f45eca8939cd2bc7050e1f90c41f4a2ecbbd5963a1d1c486/tomli-2.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f85209946d1fe94416debbb88d00eb92ce9cd5266775424ff81bc959e001acaf", size = 159088 },
{ url = "https://files.pythonhosted.org/packages/26/5a/4b546a0405b9cc0659b399f12b6adb750757baf04250b148d3c5059fc4eb/tomli-2.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a56212bdcce682e56b0aaf79e869ba5d15a6163f88d5451cbde388d48b13f530", size = 268193 },
{ url = "https://files.pythonhosted.org/packages/42/4f/2c12a72ae22cf7b59a7fe75b3465b7aba40ea9145d026ba41cb382075b0e/tomli-2.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f3ffd1e098dfc032d4d3af5c0ac64f6d286d98bc148698356847b80fa4de1b", size = 275488 },
{ url = "https://files.pythonhosted.org/packages/92/04/a038d65dbe160c3aa5a624e93ad98111090f6804027d474ba9c37c8ae186/tomli-2.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e01decd096b1530d97d5d85cb4dff4af2d8347bd35686654a004f8dea20fc67", size = 272669 },
{ url = "https://files.pythonhosted.org/packages/be/2f/8b7c60a9d1612a7cbc39ffcca4f21a73bf368a80fc25bccf8253e2563267/tomli-2.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8a35dd0e643bb2610f156cca8db95d213a90015c11fee76c946aa62b7ae7e02f", size = 279709 },
{ url = "https://files.pythonhosted.org/packages/7e/46/cc36c679f09f27ded940281c38607716c86cf8ba4a518d524e349c8b4874/tomli-2.3.0-cp314-cp314t-win32.whl", hash = "sha256:a1f7f282fe248311650081faafa5f4732bdbfef5d45fe3f2e702fbc6f2d496e0", size = 107563 },
{ url = "https://files.pythonhosted.org/packages/84/ff/426ca8683cf7b753614480484f6437f568fd2fda2edbdf57a2d3d8b27a0b/tomli-2.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:70a251f8d4ba2d9ac2542eecf008b3c8a9fc5c3f9f02c56a9d7952612be2fdba", size = 119756 },
{ url = "https://files.pythonhosted.org/packages/77/b8/0135fadc89e73be292b473cb820b4f5a08197779206b33191e801feeae40/tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b", size = 14408 },
]
[[package]] [[package]]
name = "torch" name = "torch"
version = "2.8.0" version = "2.8.0"