From 47bd444f20b629f6455d635a3303ae0f8eba62ca Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 21 Oct 2025 12:11:37 -0400 Subject: [PATCH] Code cleanup --- README.md | 11 ++++++----- chandra/model/util.py | 9 ++++----- chandra/settings.py | 15 --------------- 3 files changed, 10 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index c8a8044..969b236 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Chandra -Chandra is an OCR model that converts images and PDFs into structured HTML/Markdown/JSON while preserving layout information. +Chandra is a highly accurate OCR model that converts images and PDFs into structured HTML/Markdown/JSON while preserving layout information. ## Features @@ -154,7 +154,11 @@ VLLM_MODEL_NAME=chandra VLLM_GPUS=0 ``` -## Benchmark table +# Commercial usage + +This code is Apache 2.0, and our model weights use a modified OpenRAIL-M license (free for research, personal use, and startups under $2M funding/revenue, cannot be used competitively with our API). To remove the OpenRAIL license requirements, or for broader commercial licensing, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-chandra). + +# Benchmark table | **Model** | ArXiv | Old Scans Math | Tables | Old Scans | Headers and Footers | Multi column | Long tiny text | Base | Overall | Source | |:--------------------------|:--------:|:--------------:|:--------:|:---------:|:-------------------:|:------------:|:--------------:|:----:|:--------------:|:------:| @@ -168,9 +172,6 @@ VLLM_GPUS=0 | olmOCR v0.3.0 | 78.6 | 79.9 | 72.9 | 43.9 | **95.1** | 77.3 | 81.2 | 98.9 | 78.5 ± 1.1 | olmocr repo | | dots.ocr | 82.1 | 64.2 | 88.3 | 40.9 | 94.1 | **82.4** | 81.2 | 99.5 | 79.1 ± 1.0 | dots.ocr repo | -# Commercial usage - -This code is Apache 2.0, and our model weights use a modified OpenRAIL-M license (free for research, personal use, and startups under $2M funding/revenue, cannot be used competitively with our API). To remove the OpenRAIL license requirements, or for broader commercial licensing, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-chandra). # Credits diff --git a/chandra/model/util.py b/chandra/model/util.py index 819fb5b..d43c1c6 100644 --- a/chandra/model/util.py +++ b/chandra/model/util.py @@ -43,7 +43,10 @@ def scale_to_fit( def detect_repeat_token( - predicted_tokens: str, max_repeats: int = 4, window_size: int = 500, cut_from_end: int = 0 + predicted_tokens: str, + max_repeats: int = 4, + window_size: int = 500, + cut_from_end: int = 0, ): try: predicted_tokens = parse_markdown(predicted_tokens) @@ -77,7 +80,3 @@ def detect_repeat_token( return True return False - - -def layout_failed(predicted_tokens: str, image: Image.Image): - pass diff --git a/chandra/settings.py b/chandra/settings.py index 2c59ec3..d9a9898 100644 --- a/chandra/settings.py +++ b/chandra/settings.py @@ -22,21 +22,6 @@ class Settings(BaseSettings): VLLM_GPUS: str = "0" MAX_VLLM_RETRIES: int = 6 - # Transformers settings - @computed_field - @property - def TORCH_DEVICE_MODEL(self) -> str: - if self.TORCH_DEVICE is not None: - return self.TORCH_DEVICE - - if torch.cuda.is_available(): - return "cuda" - - if torch.backends.mps.is_available(): - return "mps" - - return "cpu" - @computed_field @property def TORCH_DTYPE(self) -> torch.dtype: