Code cleanup

2026-03-07 14:23:26 +00:00 · 2025-10-21 12:11:37 -04:00
parent 2151833414
commit 47bd444f20
3 changed files with 10 additions and 25 deletions
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Chandra

-Chandra is an OCR model that converts images and PDFs into structured HTML/Markdown/JSON while preserving layout information.
+Chandra is a highly accurate OCR model that converts images and PDFs into structured HTML/Markdown/JSON while preserving layout information.

 ## Features

@@ -154,7 +154,11 @@ VLLM_MODEL_NAME=chandra
 VLLM_GPUS=0
 ```

-## Benchmark table
+# Commercial usage
+
+This code is licensed under Apache 2.0, and our model weights use a modified OpenRAIL-M license (free for research, personal use, and startups under $2M in funding/revenue; the weights cannot be used competitively with our API). To remove the OpenRAIL license requirements, or for broader commercial licensing, visit our [pricing page](https://www.datalab.to/pricing?utm_source=gh-chandra).
+
+# Benchmark table

 | **Model**                 |  ArXiv   | Old Scans Math |  Tables  | Old Scans | Headers and Footers | Multi column | Long tiny text | Base |    Overall     | Source |
 |:--------------------------|:--------:|:--------------:|:--------:|:---------:|:-------------------:|:------------:|:--------------:|:----:|:--------------:|:------:|
@@ -168,9 +172,6 @@ VLLM_GPUS=0
 | olmOCR v0.3.0             |   78.6   | 79.9 |   72.9   |   43.9    |      **95.1**       |     77.3     |      81.2      | 98.9 |   78.5 ± 1.1   | olmocr repo |
 | dots.ocr                  |   82.1   | 64.2 |   88.3   |   40.9    |        94.1         |   **82.4**   |      81.2      | 99.5 |   79.1 ± 1.0   | dots.ocr repo |

-# Commercial usage
-
-This code is Apache 2.0, and our model weights use a modified OpenRAIL-M license (free for research, personal use, and startups under $2M funding/revenue, cannot be used competitively with our API). To remove the OpenRAIL license requirements, or for broader commercial licensing, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-chandra).

 # Credits

--- a/chandra/model/util.py
+++ b/chandra/model/util.py
@@ -43,7 +43,10 @@ def scale_to_fit(


 def detect_repeat_token(
-    predicted_tokens: str, max_repeats: int = 4, window_size: int = 500, cut_from_end: int = 0
+    predicted_tokens: str,
+    max_repeats: int = 4,
+    window_size: int = 500,
+    cut_from_end: int = 0,
 ):
    try:
        predicted_tokens = parse_markdown(predicted_tokens)
@@ -77,7 +80,3 @@ def detect_repeat_token(
            return True

    return False
-
-
-def layout_failed(predicted_tokens: str, image: Image.Image):
-    pass
--- a/chandra/settings.py
+++ b/chandra/settings.py
@@ -22,21 +22,6 @@ class Settings(BaseSettings):
    VLLM_GPUS: str = "0"
    MAX_VLLM_RETRIES: int = 6

-    # Transformers settings
-    @computed_field
-    @property
-    def TORCH_DEVICE_MODEL(self) -> str:
-        if self.TORCH_DEVICE is not None:
-            return self.TORCH_DEVICE
-
-        if torch.cuda.is_available():
-            return "cuda"
-
-        if torch.backends.mps.is_available():
-            return "mps"
-
-        return "cpu"
-
    @computed_field
    @property
    def TORCH_DTYPE(self) -> torch.dtype: