From d1cde9b608d6a61f5b96cf139240973d7ca9c70b Mon Sep 17 00:00:00 2001 From: Zach Nussbaum Date: Tue, 4 Nov 2025 13:16:57 -0500 Subject: [PATCH 1/4] fix: respect max output tokens --- chandra/model/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chandra/model/vllm.py b/chandra/model/vllm.py index 4e36f0e..5528571 100644 --- a/chandra/model/vllm.py +++ b/chandra/model/vllm.py @@ -71,7 +71,7 @@ def generate_vllm( completion = client.chat.completions.create( model=model_name, messages=[{"role": "user", "content": content}], - max_tokens=settings.MAX_OUTPUT_TOKENS, + max_tokens=max_output_tokens, temperature=temperature, top_p=top_p, ) From a3889b12fb961e9ecb1c47d2796e92ccb923e447 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 5 Nov 2025 13:45:59 -0500 Subject: [PATCH 2/4] bbox scale --- chandra/output.py | 8 +++++--- chandra/prompts.py | 4 +++- chandra/settings.py | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/chandra/output.py b/chandra/output.py index b174ef5..9afe54a 100644 --- a/chandra/output.py +++ b/chandra/output.py @@ -9,6 +9,8 @@ from PIL import Image from bs4 import BeautifulSoup, NavigableString from markdownify import MarkdownConverter, re_whitespace +from chandra.settings import settings + @lru_cache def _hash_html(html: str): @@ -25,7 +27,7 @@ def fix_raw(html: str): numbers = re.findall(r"\d+", match.group(0)) return "[" + ",".join(numbers) + "]" - result = re.sub(r"(?:){4}", replace_group, html) + result = re.sub(r"(?:\|BBOX\d+\|){4}", replace_group, html) return result @@ -232,8 +234,8 @@ def parse_layout(html: str, image: Image.Image): soup = BeautifulSoup(html, "html.parser") top_level_divs = soup.find_all("div", recursive=False) width, height = image.size - width_scaler = width / 1024 - height_scaler = height / 1024 + width_scaler = width / settings.BBOX_SCALE + height_scaler = height / settings.BBOX_SCALE layout_blocks = [] for div in top_level_divs: bbox = div.get("data-bbox") diff --git a/chandra/prompts.py b/chandra/prompts.py index 49d6b15..f5a17bb 100644 --- a/chandra/prompts.py +++ b/chandra/prompts.py @@ -1,3 +1,5 @@ +from chandra.settings import settings + ALLOWED_TAGS = [ "math", "br", @@ -65,7 +67,7 @@ Guidelines: """.strip() OCR_LAYOUT_PROMPT = f""" -OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format. Bboxes are normalized 0-1024. The data-label attribute is the label for the block. +OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format. Bboxes are normalized 0-{settings.BBOX_SCALE}. The data-label attribute is the label for the block. Use the following labels: - Caption diff --git a/chandra/settings.py b/chandra/settings.py index 3aa35bb..d0abd1b 100644 --- a/chandra/settings.py +++ b/chandra/settings.py @@ -15,6 +15,7 @@ class Settings(BaseSettings): TORCH_DEVICE: str | None = None MAX_OUTPUT_TOKENS: int = 12384 TORCH_ATTN: str | None = None + BBOX_SCALE: int = 1024 # vLLM server settings VLLM_API_KEY: str = "EMPTY" From fe28f26fc237fbeb7612bb66a11a99602c6cf5e6 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 7 Nov 2025 13:18:38 -0500 Subject: [PATCH 3/4] Adjust bbox format --- chandra/output.py | 3 +-- chandra/settings.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/chandra/output.py b/chandra/output.py index 9afe54a..8dc3ad9 100644 --- a/chandra/output.py +++ b/chandra/output.py @@ -1,5 +1,4 @@ import hashlib -import json import re from dataclasses import dataclass, asdict from functools import lru_cache @@ -241,7 +240,7 @@ def parse_layout(html: str, image: Image.Image): bbox = div.get("data-bbox") try: - bbox = json.loads(bbox) + bbox = bbox.split(" ") except Exception: bbox = [0, 0, 1, 1] diff --git a/chandra/settings.py b/chandra/settings.py index d0abd1b..5fa078a 100644 --- a/chandra/settings.py +++ b/chandra/settings.py @@ -15,7 +15,7 @@ class Settings(BaseSettings): TORCH_DEVICE: str | None = None MAX_OUTPUT_TOKENS: int = 12384 TORCH_ATTN: str | None = None - BBOX_SCALE: int = 1024 + BBOX_SCALE: int = 1000 # vLLM server settings VLLM_API_KEY: str = "EMPTY" From 3958707a80aa86d086b27464bb89dcb808516700 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Mon, 10 Nov 2025 11:12:00 -0500 Subject: [PATCH 4/4] Support multiple formats --- chandra/model/hf.py | 3 +-- chandra/model/vllm.py | 2 -- chandra/output.py | 19 ++++++++----------- chandra/settings.py | 2 +- 4 files changed, 10 insertions(+), 16 deletions(-) diff --git a/chandra/model/hf.py b/chandra/model/hf.py index b88eb9e..50aa883 100644 --- a/chandra/model/hf.py +++ b/chandra/model/hf.py @@ -5,7 +5,6 @@ from transformers import Qwen3VLForConditionalGeneration, Qwen3VLProcessor from chandra.model.schema import BatchInputItem, GenerationResult from chandra.model.util import scale_to_fit -from chandra.output import fix_raw from chandra.prompts import PROMPT_MAPPING from chandra.settings import settings @@ -43,7 +42,7 @@ def generate_hf( clean_up_tokenization_spaces=False, ) results = [ - GenerationResult(raw=fix_raw(out), token_count=len(ids), error=False) + GenerationResult(raw=out, token_count=len(ids), error=False) for out, ids in zip(output_text, generated_ids_trimmed) ] return results diff --git a/chandra/model/vllm.py b/chandra/model/vllm.py index 5528571..1aabf69 100644 --- a/chandra/model/vllm.py +++ b/chandra/model/vllm.py @@ -9,7 +9,6 @@ from openai import OpenAI from chandra.model.schema import BatchInputItem, GenerationResult from chandra.model.util import scale_to_fit, detect_repeat_token -from chandra.output import fix_raw from chandra.prompts import PROMPT_MAPPING from chandra.settings import settings @@ -76,7 +75,6 @@ def generate_vllm( top_p=top_p, ) raw = completion.choices[0].message.content - raw = fix_raw(raw) result = GenerationResult( raw=raw, token_count=completion.usage.completion_tokens, diff --git a/chandra/output.py b/chandra/output.py index 8dc3ad9..47e9f98 100644 --- a/chandra/output.py +++ b/chandra/output.py @@ -1,4 +1,5 @@ import hashlib +import json import re from dataclasses import dataclass, asdict from functools import lru_cache @@ -21,15 +22,6 @@ def get_image_name(html: str, div_idx: int): return f"{html_hash}_{div_idx}_img.webp" -def fix_raw(html: str): - def replace_group(match): - numbers = re.findall(r"\d+", match.group(0)) - return "[" + ",".join(numbers) + "]" - - result = re.sub(r"(?:\|BBOX\d+\|){4}", replace_group, html) - return result - - def extract_images(html: str, chunks: dict, image: Image.Image): images = {} div_idx = 0 @@ -240,9 +232,14 @@ def parse_layout(html: str, image: Image.Image): bbox = div.get("data-bbox") try: - bbox = bbox.split(" ") + bbox = json.loads(bbox) + assert len(bbox) == 4, "Invalid bbox length" except Exception: - bbox = [0, 0, 1, 1] + try: + bbox = bbox.split(" ") + assert len(bbox) == 4, "Invalid bbox length" + except Exception: + bbox = [0, 0, 1, 1] bbox = list(map(int, bbox)) # Normalize bbox diff --git a/chandra/settings.py b/chandra/settings.py index 5fa078a..d0abd1b 100644 --- a/chandra/settings.py +++ b/chandra/settings.py @@ -15,7 +15,7 @@ class Settings(BaseSettings): TORCH_DEVICE: str | None = None MAX_OUTPUT_TOKENS: int = 12384 TORCH_ATTN: str | None = None - BBOX_SCALE: int = 1000 + BBOX_SCALE: int = 1024 # vLLM server settings VLLM_API_KEY: str = "EMPTY"