From d1cde9b608d6a61f5b96cf139240973d7ca9c70b Mon Sep 17 00:00:00 2001
From: Zach Nussbaum <zanussbaum@gmail.com>
Date: Tue, 4 Nov 2025 13:16:57 -0500
Subject: [PATCH 1/4] fix: respect max output tokens

---
 chandra/model/vllm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chandra/model/vllm.py b/chandra/model/vllm.py
index 4e36f0e..5528571 100644
--- a/chandra/model/vllm.py
+++ b/chandra/model/vllm.py
@@ -71,7 +71,7 @@ def generate_vllm(
             completion = client.chat.completions.create(
                 model=model_name,
                 messages=[{"role": "user", "content": content}],
-                max_tokens=settings.MAX_OUTPUT_TOKENS,
+                max_tokens=max_output_tokens,
                 temperature=temperature,
                 top_p=top_p,
             )

From a3889b12fb961e9ecb1c47d2796e92ccb923e447 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Wed, 5 Nov 2025 13:45:59 -0500
Subject: [PATCH 2/4] Make bbox scale configurable via settings.BBOX_SCALE

---
 chandra/output.py   | 8 +++++---
 chandra/prompts.py  | 4 +++-
 chandra/settings.py | 1 +
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/chandra/output.py b/chandra/output.py
index b174ef5..9afe54a 100644
--- a/chandra/output.py
+++ b/chandra/output.py
@@ -9,6 +9,8 @@ from PIL import Image
 from bs4 import BeautifulSoup, NavigableString
 from markdownify import MarkdownConverter, re_whitespace
 
+from chandra.settings import settings
+
 
 @lru_cache
 def _hash_html(html: str):
@@ -25,7 +27,7 @@ def fix_raw(html: str):
         numbers = re.findall(r"\d+", match.group(0))
         return "[" + ",".join(numbers) + "]"
 
-    result = re.sub(r"(?:<BBOX\d+>){4}", replace_group, html)
+    result = re.sub(r"(?:\|BBOX\d+\|){4}", replace_group, html)
     return result
 
 
@@ -232,8 +234,8 @@ def parse_layout(html: str, image: Image.Image):
     soup = BeautifulSoup(html, "html.parser")
     top_level_divs = soup.find_all("div", recursive=False)
     width, height = image.size
-    width_scaler = width / 1024
-    height_scaler = height / 1024
+    width_scaler = width / settings.BBOX_SCALE
+    height_scaler = height / settings.BBOX_SCALE
     layout_blocks = []
     for div in top_level_divs:
         bbox = div.get("data-bbox")
diff --git a/chandra/prompts.py b/chandra/prompts.py
index 49d6b15..f5a17bb 100644
--- a/chandra/prompts.py
+++ b/chandra/prompts.py
@@ -1,3 +1,5 @@
+from chandra.settings import settings
+
 ALLOWED_TAGS = [
     "math",
     "br",
@@ -65,7 +67,7 @@ Guidelines:
 """.strip()
 
 OCR_LAYOUT_PROMPT = f"""
-OCR this image to HTML, arranged as layout blocks.  Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format.  Bboxes are normalized 0-1024. The data-label attribute is the label for the block.
+OCR this image to HTML, arranged as layout blocks.  Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format.  Bboxes are normalized 0-{settings.BBOX_SCALE}. The data-label attribute is the label for the block.
 
 Use the following labels:
 - Caption
diff --git a/chandra/settings.py b/chandra/settings.py
index 3aa35bb..d0abd1b 100644
--- a/chandra/settings.py
+++ b/chandra/settings.py
@@ -15,6 +15,7 @@ class Settings(BaseSettings):
     TORCH_DEVICE: str | None = None
     MAX_OUTPUT_TOKENS: int = 12384
     TORCH_ATTN: str | None = None
+    BBOX_SCALE: int = 1024
 
     # vLLM server settings
     VLLM_API_KEY: str = "EMPTY"

From fe28f26fc237fbeb7612bb66a11a99602c6cf5e6 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Fri, 7 Nov 2025 13:18:38 -0500
Subject: [PATCH 3/4] Parse space-separated bboxes; set BBOX_SCALE to 1000

---
 chandra/output.py   | 3 +--
 chandra/settings.py | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/chandra/output.py b/chandra/output.py
index 9afe54a..8dc3ad9 100644
--- a/chandra/output.py
+++ b/chandra/output.py
@@ -1,5 +1,4 @@
 import hashlib
-import json
 import re
 from dataclasses import dataclass, asdict
 from functools import lru_cache
@@ -241,7 +240,7 @@ def parse_layout(html: str, image: Image.Image):
         bbox = div.get("data-bbox")
 
         try:
-            bbox = json.loads(bbox)
+            bbox = bbox.split(" ")
         except Exception:
             bbox = [0, 0, 1, 1]
 
diff --git a/chandra/settings.py b/chandra/settings.py
index d0abd1b..5fa078a 100644
--- a/chandra/settings.py
+++ b/chandra/settings.py
@@ -15,7 +15,7 @@ class Settings(BaseSettings):
     TORCH_DEVICE: str | None = None
     MAX_OUTPUT_TOKENS: int = 12384
     TORCH_ATTN: str | None = None
-    BBOX_SCALE: int = 1024
+    BBOX_SCALE: int = 1000
 
     # vLLM server settings
     VLLM_API_KEY: str = "EMPTY"

From 3958707a80aa86d086b27464bb89dcb808516700 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Mon, 10 Nov 2025 11:12:00 -0500
Subject: [PATCH 4/4] Support JSON and space-separated bbox formats

---
 chandra/model/hf.py   |  3 +--
 chandra/model/vllm.py |  2 --
 chandra/output.py     | 19 ++++++++-----------
 chandra/settings.py   |  2 +-
 4 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/chandra/model/hf.py b/chandra/model/hf.py
index b88eb9e..50aa883 100644
--- a/chandra/model/hf.py
+++ b/chandra/model/hf.py
@@ -5,7 +5,6 @@ from transformers import Qwen3VLForConditionalGeneration, Qwen3VLProcessor
 
 from chandra.model.schema import BatchInputItem, GenerationResult
 from chandra.model.util import scale_to_fit
-from chandra.output import fix_raw
 from chandra.prompts import PROMPT_MAPPING
 from chandra.settings import settings
 
@@ -43,7 +42,7 @@ def generate_hf(
         clean_up_tokenization_spaces=False,
     )
     results = [
-        GenerationResult(raw=fix_raw(out), token_count=len(ids), error=False)
+        GenerationResult(raw=out, token_count=len(ids), error=False)
         for out, ids in zip(output_text, generated_ids_trimmed)
     ]
     return results
diff --git a/chandra/model/vllm.py b/chandra/model/vllm.py
index 5528571..1aabf69 100644
--- a/chandra/model/vllm.py
+++ b/chandra/model/vllm.py
@@ -9,7 +9,6 @@ from openai import OpenAI
 
 from chandra.model.schema import BatchInputItem, GenerationResult
 from chandra.model.util import scale_to_fit, detect_repeat_token
-from chandra.output import fix_raw
 from chandra.prompts import PROMPT_MAPPING
 from chandra.settings import settings
 
@@ -76,7 +75,6 @@ def generate_vllm(
                 top_p=top_p,
             )
             raw = completion.choices[0].message.content
-            raw = fix_raw(raw)
             result = GenerationResult(
                 raw=raw,
                 token_count=completion.usage.completion_tokens,
diff --git a/chandra/output.py b/chandra/output.py
index 8dc3ad9..47e9f98 100644
--- a/chandra/output.py
+++ b/chandra/output.py
@@ -1,4 +1,5 @@
 import hashlib
+import json
 import re
 from dataclasses import dataclass, asdict
 from functools import lru_cache
@@ -21,15 +22,6 @@ def get_image_name(html: str, div_idx: int):
     return f"{html_hash}_{div_idx}_img.webp"
 
 
-def fix_raw(html: str):
-    def replace_group(match):
-        numbers = re.findall(r"\d+", match.group(0))
-        return "[" + ",".join(numbers) + "]"
-
-    result = re.sub(r"(?:\|BBOX\d+\|){4}", replace_group, html)
-    return result
-
-
 def extract_images(html: str, chunks: dict, image: Image.Image):
     images = {}
     div_idx = 0
@@ -240,9 +232,14 @@ def parse_layout(html: str, image: Image.Image):
         bbox = div.get("data-bbox")
 
         try:
-            bbox = bbox.split(" ")
+            bbox = json.loads(bbox)
+            assert len(bbox) == 4, "Invalid bbox length"
         except Exception:
-            bbox = [0, 0, 1, 1]
+            try:
+                bbox = bbox.split(" ")
+                assert len(bbox) == 4, "Invalid bbox length"
+            except Exception:
+                bbox = [0, 0, 1, 1]
 
         bbox = list(map(int, bbox))
         # Normalize bbox
diff --git a/chandra/settings.py b/chandra/settings.py
index 5fa078a..d0abd1b 100644
--- a/chandra/settings.py
+++ b/chandra/settings.py
@@ -15,7 +15,7 @@ class Settings(BaseSettings):
     TORCH_DEVICE: str | None = None
     MAX_OUTPUT_TOKENS: int = 12384
     TORCH_ATTN: str | None = None
-    BBOX_SCALE: int = 1000
+    BBOX_SCALE: int = 1024
 
     # vLLM server settings
     VLLM_API_KEY: str = "EMPTY"