Support a different bbox format: rewrite runs of four <BBOXn> tokens in raw model output as "[a,b,c,d]" lists

2025-11-29 00:23:12 +00:00 · 2025-10-30 13:02:56 -04:00
parent 7cf96f3911
commit 4b01146865
3 changed files with 17 additions and 3 deletions
--- a/chandra/model/hf.py
+++ b/chandra/model/hf.py
@@ -5,6 +5,7 @@ from transformers import Qwen3VLForConditionalGeneration, Qwen3VLProcessor

 from chandra.model.schema import BatchInputItem, GenerationResult
 from chandra.model.util import scale_to_fit
+from chandra.output import fix_raw
 from chandra.prompts import PROMPT_MAPPING
 from chandra.settings import settings

@@ -42,7 +43,7 @@ def generate_hf(
        clean_up_tokenization_spaces=False,
    )
    results = [
-        GenerationResult(raw=out, token_count=len(ids), error=False)
+        GenerationResult(raw=fix_raw(out), token_count=len(ids), error=False)
        for out, ids in zip(output_text, generated_ids_trimmed)
    ]
    return results
--- a/chandra/model/vllm.py
+++ b/chandra/model/vllm.py
@@ -9,6 +9,7 @@ from openai import OpenAI

 from chandra.model.schema import BatchInputItem, GenerationResult
 from chandra.model.util import scale_to_fit, detect_repeat_token
+from chandra.output import fix_raw
 from chandra.prompts import PROMPT_MAPPING
 from chandra.settings import settings

@@ -74,8 +75,10 @@ def generate_vllm(
                temperature=temperature,
                top_p=top_p,
            )
+            raw = completion.choices[0].message.content
+            raw = fix_raw(raw)
            result = GenerationResult(
-                raw=completion.choices[0].message.content,
+                raw=raw,
                token_count=completion.usage.completion_tokens,
                error=False,
            )
--- a/chandra/output.py
+++ b/chandra/output.py
@@ -20,6 +20,15 @@ def get_image_name(html: str, div_idx: int):
    return f"{html_hash}_{div_idx}_img.webp"


+def fix_raw(html: str):  # Rewrite every run of exactly four adjacent <BBOXn> tokens in html as "[a,b,c,d]"; all other text is returned unchanged.
+    def replace_group(match):  # re.sub callback; match.group(0) is the whole four-token run
+        numbers = re.findall(r"\d+", match.group(0))  # digit runs as strings, in order of appearance
+        return "[" + ",".join(numbers) + "]"  # comma-separated with no spaces, wrapped in square brackets
+
+    result = re.sub(r"(?:<BBOX\d+>){4}", replace_group, html)  # only groups of four consecutive tokens match; shorter runs are left as-is
+    return result
+
+
 def extract_images(html: str, chunks: dict, image: Image.Image):
    images = {}
    div_idx = 0
@@ -228,10 +237,11 @@ def parse_layout(html: str, image: Image.Image):
    layout_blocks = []
    for div in top_level_divs:
        bbox = div.get("data-bbox")
+
        try:
            bbox = json.loads(bbox)
        except Exception:
-            bbox = [0, 0, 1, 1]  # Fallback to a default bbox if parsing fails
+            bbox = [0, 0, 1, 1]

        bbox = list(map(int, bbox))
        # Normalize bbox