21 Commits

Author SHA1 Message Date
Vik Paruchuri
cba67c6d15 Merge pull request #47 from datalab-to/dev
Shift import
2025-11-19 12:19:23 -05:00
Vik Paruchuri
c049e7524f Shift import 2025-11-19 12:18:57 -05:00
Vik Paruchuri
7ac08e16e1 Merge pull request #46 from datalab-to/dev
Hide imports
2025-11-19 12:06:47 -05:00
Vik Paruchuri
b96eb84094 Hide imports 2025-11-19 12:06:07 -05:00
Vik Paruchuri
7d967717a3 Merge pull request #41 from datalab-to/dev
Enable piping through params
2025-11-12 18:06:41 -05:00
Vik Paruchuri
0f5f3d485c Enable piping through params 2025-11-12 18:06:05 -05:00
Vik Paruchuri
94786516c8 Merge pull request #40 from datalab-to/dev
fix issue with pop
2025-11-12 17:34:47 -05:00
Vik Paruchuri
1bab4bf73a fix issue with pop 2025-11-12 17:33:50 -05:00
Vik Paruchuri
c735484ad4 Merge pull request #39 from datalab-to/dev
Enable passing bbox scale
2025-11-12 17:17:34 -05:00
Vik Paruchuri
34f825351c Enable passing bbox scale 2025-11-12 17:17:15 -05:00
Vik Paruchuri
914d508ddd Merge pull request #38 from datalab-to/dev
Add a small sleep
2025-11-12 16:06:21 -05:00
Vik Paruchuri
068db0311e Add a small sleep 2025-11-12 16:06:02 -05:00
Vik Paruchuri
f04e0d146b Merge pull request #37 from datalab-to/dev
Dev
2025-11-12 16:03:23 -05:00
Vik Paruchuri
aafbb70ce8 Merge pull request #36 from datalab-to/vik/bbox
Fix retry settings
2025-11-12 16:03:10 -05:00
Vik Paruchuri
393d3d53f4 Merge pull request #32 from datalab-to/dev
Dev
2025-11-10 11:37:06 -05:00
Vik Paruchuri
910bcf100f Merge pull request #31 from datalab-to/vik/bbox
Vik/bbox
2025-11-10 11:36:50 -05:00
Vik Paruchuri
5330679cf3 Merge pull request #22 from datalab-to/dev
Dev
2025-11-03 17:16:02 -05:00
Vik Paruchuri
b6320b09bd Merge pull request #10 from datalab-to/dev
Enable passing custom headers
2025-10-30 10:21:48 -04:00
Vik Paruchuri
b99aa32e19 Merge pull request #9 from datalab-to/dev
Improve robustness
2025-10-29 18:17:02 -04:00
Vik Paruchuri
ba6a5f71da Merge pull request #6 from datalab-to/dev
Change image rendering
2025-10-26 10:40:23 -04:00
Vik Paruchuri
57cb163663 Merge pull request #2 from datalab-to/dev
Dev
2025-10-23 16:55:57 -04:00
7 changed files with 59 additions and 68 deletions

View File

@@ -13,16 +13,23 @@ def flatten(page, flag=pdfium_c.FLAT_NORMALDISPLAY):
print(f"Failed to flatten annotations / form fields on page {page}.")
def load_image(filepath: str):
def load_image(
filepath: str, min_image_dim: int = settings.MIN_IMAGE_DIM
) -> Image.Image:
image = Image.open(filepath).convert("RGB")
if image.width < settings.MIN_IMAGE_DIM or image.height < settings.MIN_IMAGE_DIM:
scale = settings.MIN_IMAGE_DIM / min(image.width, image.height)
if image.width < min_image_dim or image.height < min_image_dim:
scale = min_image_dim / min(image.width, image.height)
new_size = (int(image.width * scale), int(image.height * scale))
image = image.resize(new_size, Image.Resampling.LANCZOS)
return image
def load_pdf_images(filepath: str, page_range: List[int]):
def load_pdf_images(
filepath: str,
page_range: List[int],
image_dpi: int = settings.IMAGE_DPI,
min_pdf_image_dim: int = settings.MIN_PDF_IMAGE_DIM,
) -> List[Image.Image]:
doc = pdfium.PdfDocument(filepath)
doc.init_forms()
@@ -31,8 +38,8 @@ def load_pdf_images(filepath: str, page_range: List[int]):
if not page_range or page in page_range:
page_obj = doc[page]
min_page_dim = min(page_obj.get_width(), page_obj.get_height())
scale_dpi = (settings.MIN_PDF_IMAGE_DIM / min_page_dim) * 72
scale_dpi = max(scale_dpi, settings.IMAGE_DPI)
scale_dpi = (min_pdf_image_dim / min_page_dim) * 72
scale_dpi = max(scale_dpi, image_dpi)
page_obj = doc[page]
flatten(page_obj)
page_obj = doc[page]

View File

@@ -4,6 +4,7 @@ from chandra.model.hf import load_model, generate_hf
from chandra.model.schema import BatchInputItem, BatchOutputItem
from chandra.model.vllm import generate_vllm
from chandra.output import parse_markdown, parse_html, parse_chunks, extract_images
from chandra.settings import settings
class InferenceManager:
@@ -26,19 +27,29 @@ class InferenceManager:
output_kwargs["include_headers_footers"] = kwargs.pop(
"include_headers_footers"
)
bbox_scale = kwargs.pop("bbox_scale", settings.BBOX_SCALE)
vllm_api_base = kwargs.pop("vllm_api_base", settings.VLLM_API_BASE)
if self.method == "vllm":
results = generate_vllm(
batch, max_output_tokens=max_output_tokens, **kwargs
batch,
max_output_tokens=max_output_tokens,
bbox_scale=bbox_scale,
vllm_api_base=vllm_api_base,
**kwargs,
)
else:
results = generate_hf(
batch, self.model, max_output_tokens=max_output_tokens, **kwargs
batch,
self.model,
max_output_tokens=max_output_tokens,
bbox_scale=bbox_scale,
**kwargs,
)
output = []
for result, input_item in zip(results, batch):
chunks = parse_chunks(result.raw, input_item.image)
chunks = parse_chunks(result.raw, input_item.image, bbox_scale=bbox_scale)
output.append(
BatchOutputItem(
markdown=parse_markdown(result.raw, **output_kwargs),

View File

@@ -1,8 +1,5 @@
from typing import List
from qwen_vl_utils import process_vision_info
from transformers import Qwen3VLForConditionalGeneration, Qwen3VLProcessor
from chandra.model.schema import BatchInputItem, GenerationResult
from chandra.model.util import scale_to_fit
from chandra.prompts import PROMPT_MAPPING
@@ -10,12 +7,20 @@ from chandra.settings import settings
def generate_hf(
batch: List[BatchInputItem], model, max_output_tokens=None, **kwargs
batch: List[BatchInputItem],
model,
max_output_tokens=None,
bbox_scale: int = settings.BBOX_SCALE,
**kwargs,
) -> List[GenerationResult]:
from qwen_vl_utils import process_vision_info
if max_output_tokens is None:
max_output_tokens = settings.MAX_OUTPUT_TOKENS
messages = [process_batch_element(item, model.processor) for item in batch]
messages = [
process_batch_element(item, model.processor, bbox_scale) for item in batch
]
text = model.processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
@@ -48,12 +53,12 @@ def generate_hf(
return results
def process_batch_element(item: BatchInputItem, processor):
def process_batch_element(item: BatchInputItem, processor, bbox_scale: int):
prompt = item.prompt
prompt_type = item.prompt_type
if not prompt:
prompt = PROMPT_MAPPING[prompt_type]
prompt = PROMPT_MAPPING[prompt_type].replace("{bbox_scale}", str(bbox_scale))
content = []
image = scale_to_fit(item.image) # Guarantee max size
@@ -65,12 +70,15 @@ def process_batch_element(item: BatchInputItem, processor):
def load_model():
import torch
from transformers import Qwen3VLForConditionalGeneration, Qwen3VLProcessor
device_map = "auto"
if settings.TORCH_DEVICE:
device_map = {"": settings.TORCH_DEVICE}
kwargs = {
"dtype": settings.TORCH_DTYPE,
"dtype": torch.bfloat16,
"device_map": device_map,
}
if settings.TORCH_ATTN:

View File

@@ -1,5 +1,6 @@
import base64
import io
import time
from concurrent.futures import ThreadPoolExecutor
from itertools import repeat
from typing import List
@@ -27,10 +28,12 @@ def generate_vllm(
max_workers: int | None = None,
custom_headers: dict | None = None,
max_failure_retries: int | None = None,
bbox_scale: int = settings.BBOX_SCALE,
vllm_api_base: str = settings.VLLM_API_BASE,
) -> List[GenerationResult]:
client = OpenAI(
api_key=settings.VLLM_API_KEY,
base_url=settings.VLLM_API_BASE,
base_url=vllm_api_base,
default_headers=custom_headers,
)
model_name = settings.VLLM_MODEL_NAME
@@ -53,7 +56,9 @@ def generate_vllm(
) -> GenerationResult:
prompt = item.prompt
if not prompt:
prompt = PROMPT_MAPPING[item.prompt_type]
prompt = PROMPT_MAPPING[item.prompt_type].replace(
"{bbox_scale}", str(bbox_scale)
)
content = []
image = scale_to_fit(item.image)
@@ -112,6 +117,7 @@ def generate_vllm(
print(
f"Detected vllm error, retrying generation (attempt {retries + 1})..."
)
time.sleep(2 * (retries + 1)) # Sleeping can help under load
return True
if (
@@ -122,6 +128,7 @@ def generate_vllm(
print(
f"Detected vllm error, retrying generation (attempt {retries + 1})..."
)
time.sleep(2 * (retries + 1)) # Sleeping can help under load
return True
return False

View File

@@ -6,7 +6,7 @@ from functools import lru_cache
import six
from PIL import Image
from bs4 import BeautifulSoup, NavigableString
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter, re_whitespace
from chandra.settings import settings
@@ -89,39 +89,6 @@ def parse_html(
return out_html
def escape_dollars(text):
return text.replace("$", r"\$")
def get_formatted_table_text(element):
text = []
for content in element.contents:
if content is None:
continue
if isinstance(content, NavigableString):
stripped = content.strip()
if stripped:
text.append(escape_dollars(stripped))
elif content.name == "br":
text.append("<br>")
elif content.name == "math":
text.append("$" + content.text + "$")
else:
content_str = escape_dollars(str(content))
text.append(content_str)
full_text = ""
for i, t in enumerate(text):
if t == "<br>":
full_text += t
elif i > 0 and text[i - 1] != "<br>":
full_text += " " + t
else:
full_text += t
return full_text
class Markdownify(MarkdownConverter):
def __init__(
self,
@@ -221,12 +188,12 @@ class LayoutBlock:
content: str
def parse_layout(html: str, image: Image.Image):
def parse_layout(html: str, image: Image.Image, bbox_scale=settings.BBOX_SCALE):
soup = BeautifulSoup(html, "html.parser")
top_level_divs = soup.find_all("div", recursive=False)
width, height = image.size
width_scaler = width / settings.BBOX_SCALE
height_scaler = height / settings.BBOX_SCALE
width_scaler = width / bbox_scale
height_scaler = height / bbox_scale
layout_blocks = []
for div in top_level_divs:
bbox = div.get("data-bbox")
@@ -255,7 +222,7 @@ def parse_layout(html: str, image: Image.Image):
return layout_blocks
def parse_chunks(html: str, image: Image.Image):
layout = parse_layout(html, image)
def parse_chunks(html: str, image: Image.Image, bbox_scale=settings.BBOX_SCALE):
layout = parse_layout(html, image, bbox_scale=bbox_scale)
chunks = [asdict(block) for block in layout]
return chunks

View File

@@ -1,5 +1,3 @@
from chandra.settings import settings
ALLOWED_TAGS = [
"math",
"br",
@@ -67,7 +65,7 @@ Guidelines:
""".strip()
OCR_LAYOUT_PROMPT = f"""
OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format. Bboxes are normalized 0-{settings.BBOX_SCALE}. The data-label attribute is the label for the block.
OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format. Bboxes are normalized 0-{{bbox_scale}}. The data-label attribute is the label for the block.
Use the following labels:
- Caption

View File

@@ -1,7 +1,5 @@
from dotenv import find_dotenv
from pydantic import computed_field
from pydantic_settings import BaseSettings
import torch
import os
@@ -24,11 +22,6 @@ class Settings(BaseSettings):
VLLM_GPUS: str = "0"
MAX_VLLM_RETRIES: int = 6
@computed_field
@property
def TORCH_DTYPE(self) -> torch.dtype:
return torch.bfloat16
class Config:
env_file = find_dotenv("local.env")
extra = "ignore"