diff --git a/chandra/output.py b/chandra/output.py index b174ef5..9afe54a 100644 --- a/chandra/output.py +++ b/chandra/output.py @@ -9,6 +9,8 @@ from PIL import Image from bs4 import BeautifulSoup, NavigableString from markdownify import MarkdownConverter, re_whitespace +from chandra.settings import settings + @lru_cache def _hash_html(html: str): @@ -25,7 +27,7 @@ def fix_raw(html: str): numbers = re.findall(r"\d+", match.group(0)) return "[" + ",".join(numbers) + "]" - result = re.sub(r"(?:){4}", replace_group, html) + result = re.sub(r"(?:\|BBOX\d+\|){4}", replace_group, html) return result @@ -232,8 +234,8 @@ def parse_layout(html: str, image: Image.Image): soup = BeautifulSoup(html, "html.parser") top_level_divs = soup.find_all("div", recursive=False) width, height = image.size - width_scaler = width / 1024 - height_scaler = height / 1024 + width_scaler = width / settings.BBOX_SCALE + height_scaler = height / settings.BBOX_SCALE layout_blocks = [] for div in top_level_divs: bbox = div.get("data-bbox") diff --git a/chandra/prompts.py b/chandra/prompts.py index 49d6b15..f5a17bb 100644 --- a/chandra/prompts.py +++ b/chandra/prompts.py @@ -1,3 +1,5 @@ +from chandra.settings import settings + ALLOWED_TAGS = [ "math", "br", @@ -65,7 +67,7 @@ Guidelines: """.strip() OCR_LAYOUT_PROMPT = f""" -OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format. Bboxes are normalized 0-1024. The data-label attribute is the label for the block. +OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format. Bboxes are normalized 0-{settings.BBOX_SCALE}. The data-label attribute is the label for the block. Use the following labels: - Caption diff --git a/chandra/settings.py b/chandra/settings.py index 3aa35bb..d0abd1b 100644 --- a/chandra/settings.py +++ b/chandra/settings.py @@ -15,6 +15,7 @@ class Settings(BaseSettings): TORCH_DEVICE: str | None = None MAX_OUTPUT_TOKENS: int = 12384 TORCH_ATTN: str | None = None + BBOX_SCALE: int = 1024 # vLLM server settings VLLM_API_KEY: str = "EMPTY"