Make bbox scale configurable via settings.BBOX_SCALE; match |BBOXn| tokens in fix_raw

2026-01-20 05:50:42 +00:00 · 2025-11-05 13:45:59 -05:00
parent aabfed2ed3
commit a3889b12fb
3 changed files with 9 additions and 4 deletions
--- a/chandra/output.py
+++ b/chandra/output.py
@@ -9,6 +9,8 @@ from PIL import Image
 from bs4 import BeautifulSoup, NavigableString
 from markdownify import MarkdownConverter, re_whitespace

+from chandra.settings import settings
+

@lru_cache
 def _hash_html(html: str):
@@ -25,7 +27,7 @@ def fix_raw(html: str):
        numbers = re.findall(r"\d+", match.group(0))
        return "[" + ",".join(numbers) + "]"

-    result = re.sub(r"(?:<BBOX\d+>){4}", replace_group, html)
+    result = re.sub(r"(?:\|BBOX\d+\|){4}", replace_group, html)
    return result


@@ -232,8 +234,8 @@ def parse_layout(html: str, image: Image.Image):
    soup = BeautifulSoup(html, "html.parser")
    top_level_divs = soup.find_all("div", recursive=False)
    width, height = image.size
-    width_scaler = width / 1024
-    height_scaler = height / 1024
+    width_scaler = width / settings.BBOX_SCALE
+    height_scaler = height / settings.BBOX_SCALE
    layout_blocks = []
    for div in top_level_divs:
        bbox = div.get("data-bbox")
--- a/chandra/prompts.py
+++ b/chandra/prompts.py
@@ -1,3 +1,5 @@
+from chandra.settings import settings
+
 ALLOWED_TAGS = [
    "math",
    "br",
@@ -65,7 +67,7 @@ Guidelines:
 """.strip()

 OCR_LAYOUT_PROMPT = f"""
-OCR this image to HTML, arranged as layout blocks.  Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format.  Bboxes are normalized 0-1024. The data-label attribute is the label for the block.
+OCR this image to HTML, arranged as layout blocks.  Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format.  Bboxes are normalized 0-{settings.BBOX_SCALE}. The data-label attribute is the label for the block.

 Use the following labels:
 - Caption
--- a/chandra/settings.py
+++ b/chandra/settings.py
@@ -15,6 +15,7 @@ class Settings(BaseSettings):
    TORCH_DEVICE: str | None = None
    MAX_OUTPUT_TOKENS: int = 12384
    TORCH_ATTN: str | None = None
+    BBOX_SCALE: int = 1024

    # vLLM server settings
    VLLM_API_KEY: str = "EMPTY"