Merge pull request #32 from datalab-to/dev

Dev
This commit is contained in:
Vik Paruchuri
2025-11-10 11:37:06 -05:00
committed by GitHub
5 changed files with 16 additions and 18 deletions

View File

@@ -5,7 +5,6 @@ from transformers import Qwen3VLForConditionalGeneration, Qwen3VLProcessor
from chandra.model.schema import BatchInputItem, GenerationResult
from chandra.model.util import scale_to_fit
from chandra.output import fix_raw
from chandra.prompts import PROMPT_MAPPING
from chandra.settings import settings
@@ -43,7 +42,7 @@ def generate_hf(
clean_up_tokenization_spaces=False,
)
results = [
GenerationResult(raw=fix_raw(out), token_count=len(ids), error=False)
GenerationResult(raw=out, token_count=len(ids), error=False)
for out, ids in zip(output_text, generated_ids_trimmed)
]
return results

View File

@@ -9,7 +9,6 @@ from openai import OpenAI
from chandra.model.schema import BatchInputItem, GenerationResult
from chandra.model.util import scale_to_fit, detect_repeat_token
from chandra.output import fix_raw
from chandra.prompts import PROMPT_MAPPING
from chandra.settings import settings
@@ -71,12 +70,11 @@ def generate_vllm(
completion = client.chat.completions.create(
model=model_name,
messages=[{"role": "user", "content": content}],
max_tokens=settings.MAX_OUTPUT_TOKENS,
max_tokens=max_output_tokens,
temperature=temperature,
top_p=top_p,
)
raw = completion.choices[0].message.content
raw = fix_raw(raw)
result = GenerationResult(
raw=raw,
token_count=completion.usage.completion_tokens,

View File

@@ -9,6 +9,8 @@ from PIL import Image
from bs4 import BeautifulSoup, NavigableString
from markdownify import MarkdownConverter, re_whitespace
from chandra.settings import settings
@lru_cache
def _hash_html(html: str):
@@ -20,15 +22,6 @@ def get_image_name(html: str, div_idx: int):
return f"{html_hash}_{div_idx}_img.webp"
def fix_raw(html: str):
    """Normalize raw model output: collapse each run of exactly four
    consecutive ``<BBOXn>`` tags into a bracketed, comma-separated list
    of their numbers (e.g. ``<BBOX1><BBOX2><BBOX3><BBOX4>`` -> ``[1,2,3,4]``).

    Text without such runs is returned unchanged.
    """

    def _tags_to_list(m):
        # Pull every digit group out of the matched tag run and join them.
        digits = re.findall(r"\d+", m.group(0))
        return "[" + ",".join(digits) + "]"

    return re.sub(r"(?:<BBOX\d+>){4}", _tags_to_list, html)
def extract_images(html: str, chunks: dict, image: Image.Image):
images = {}
div_idx = 0
@@ -232,16 +225,21 @@ def parse_layout(html: str, image: Image.Image):
soup = BeautifulSoup(html, "html.parser")
top_level_divs = soup.find_all("div", recursive=False)
width, height = image.size
width_scaler = width / 1024
height_scaler = height / 1024
width_scaler = width / settings.BBOX_SCALE
height_scaler = height / settings.BBOX_SCALE
layout_blocks = []
for div in top_level_divs:
bbox = div.get("data-bbox")
try:
bbox = json.loads(bbox)
assert len(bbox) == 4, "Invalid bbox length"
except Exception:
bbox = [0, 0, 1, 1]
try:
bbox = bbox.split(" ")
assert len(bbox) == 4, "Invalid bbox length"
except Exception:
bbox = [0, 0, 1, 1]
bbox = list(map(int, bbox))
# Normalize bbox

View File

@@ -1,3 +1,5 @@
from chandra.settings import settings
ALLOWED_TAGS = [
"math",
"br",
@@ -65,7 +67,7 @@ Guidelines:
""".strip()
OCR_LAYOUT_PROMPT = f"""
OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format. Bboxes are normalized 0-1024. The data-label attribute is the label for the block.
OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format. Bboxes are normalized 0-{settings.BBOX_SCALE}. The data-label attribute is the label for the block.
Use the following labels:
- Caption

View File

@@ -15,6 +15,7 @@ class Settings(BaseSettings):
TORCH_DEVICE: str | None = None
MAX_OUTPUT_TOKENS: int = 12384
TORCH_ATTN: str | None = None
BBOX_SCALE: int = 1024
# vLLM server settings
VLLM_API_KEY: str = "EMPTY"