Initial commit

This commit is contained in:
Vik Paruchuri
2025-10-08 17:34:01 -04:00
commit 17b1b03bde
13 changed files with 2201 additions and 0 deletions

221
.gitignore vendored Normal file
View File

@@ -0,0 +1,221 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
# Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
# poetry.lock
# poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
# pdm.lock
# pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
# pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# Redis
*.rdb
*.aof
*.pid
# RabbitMQ
mnesia/
rabbitmq/
rabbitmq-data/
# ActiveMQ
activemq-data/
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
# .idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
# Streamlit
.streamlit/secrets.toml
# Virtual environments
.venv
.idea/

1
.python-version Normal file
View File

@@ -0,0 +1 @@
3.12

7
README.md Normal file
View File

@@ -0,0 +1,7 @@
# Chandra
Launch the Streamlit demo app:
```shell
streamlit run chandra_app.py --server.fileWatcherType none --server.headless true
```

0
chandra/__init__.py Normal file
View File

40
chandra/image.py Normal file
View File

@@ -0,0 +1,40 @@
import math
from typing import Tuple
from PIL import Image
def scale_to_fit(
    img: Image.Image,
    max_size: Tuple[int, int] = (1024, 1024),
    min_size: Tuple[int, int] = (28, 28),
):
    """Resize ``img`` so that its total pixel count fits a min/max budget.

    The limits are compared by *area* (``width * height``), not per side:
    an image is shrunk when it has more pixels than ``max_size`` describes
    and enlarged when it has fewer than ``min_size`` describes. The aspect
    ratio is preserved (up to integer rounding).

    Args:
        img: Image to rescale.
        max_size: ``(width, height)`` whose product is the upper pixel budget.
        min_size: ``(width, height)`` whose product is the lower pixel budget.

    Returns:
        ``img`` itself when it is empty or already within budget, otherwise
        a new image resampled with LANCZOS.
    """
    width, height = img.size

    # A zero-sized image cannot be scaled; hand it back untouched.
    if width == 0 or height == 0:
        return img

    current_pixels = width * height
    max_pixels = max_size[0] * max_size[1]
    min_pixels = min_size[0] * min_size[1]

    if current_pixels > max_pixels:
        scale_factor = (max_pixels / current_pixels) ** 0.5
        # floor() keeps the area at or below the budget, but on extremely
        # elongated images it can round the short side down to 0, which
        # resize() rejects. Never go below one pixel per side.
        new_width = max(1, math.floor(width * scale_factor))
        new_height = max(1, math.floor(height * scale_factor))
        # Clamping the short side to 1 may push the area back over budget;
        # trim the long side so the upper bound still holds.
        if new_width * new_height > max_pixels:
            if new_width >= new_height:
                new_width = max(1, max_pixels // new_height)
            else:
                new_height = max(1, max_pixels // new_width)
    elif current_pixels < min_pixels:
        scale_factor = (min_pixels / current_pixels) ** 0.5
        # ceil() guarantees the result reaches at least the minimum area.
        new_width = math.ceil(width * scale_factor)
        new_height = math.ceil(height * scale_factor)
    else:
        return img

    return img.resize((new_width, new_height), resample=Image.Resampling.LANCZOS)

44
chandra/layout.py Normal file
View File

@@ -0,0 +1,44 @@
import json
from dataclasses import dataclass
from PIL import Image
from PIL.ImageDraw import ImageDraw
from bs4 import BeautifulSoup
@dataclass
class LayoutBlock:
    """One layout region parsed from the model's HTML output."""

    # [x0, y0, x1, y1] bounding box of the block.
    bbox: list[int]
    # Value of the block's ``data-label`` attribute.
    label: str
    # Inner HTML of the block.
    content: str
def _parse_bbox(raw) -> list[int]:
    """Decode a ``data-bbox`` attribute into four ints, or a 1x1 fallback box."""
    try:
        values = json.loads(raw)
        if not isinstance(values, list) or len(values) != 4:
            raise ValueError("bbox must be a list of four numbers")
        return [int(v) for v in values]
    except (TypeError, ValueError):
        # Missing attribute (None), invalid JSON, wrong shape or non-numeric
        # entries: keep the block's content but give it a harmless box
        # instead of aborting the whole page.
        return [0, 0, 1, 1]


def parse_layout(html: str, image: Image.Image, bbox_scale: int = 1024):
    """Turn the top-level ``<div>`` elements of ``html`` into layout blocks.

    Each top-level div is expected to carry a ``data-bbox`` attribute holding
    a JSON list ``[x0, y0, x1, y1]`` on a ``0..bbox_scale`` grid and an
    optional ``data-label`` attribute. Boxes are rescaled to the pixel size
    of ``image`` and clamped to its bounds.

    Args:
        html: HTML produced for the page.
        image: Image the HTML was generated from; only its size is used.
        bbox_scale: Extent of the normalised coordinate grid used in
            ``data-bbox``.

    Returns:
        A list of ``LayoutBlock``, one per top-level div, in document order.
        Divs with a missing or malformed ``data-bbox`` are kept with a
        fallback box rather than raising.
    """
    soup = BeautifulSoup(html, "html.parser")
    width, height = image.size
    width_scaler = width / bbox_scale
    height_scaler = height / bbox_scale

    layout_blocks = []
    for div in soup.find_all("div", recursive=False):
        x0, y0, x1, y1 = _parse_bbox(div.get("data-bbox"))
        # Rescale to pixels and clamp every coordinate to the image on both
        # sides, so out-of-range model output cannot produce boxes outside it.
        bbox = [
            min(max(0, int(x0 * width_scaler)), width),
            min(max(0, int(y0 * height_scaler)), height),
            min(max(0, int(x1 * width_scaler)), width),
            min(max(0, int(y1 * height_scaler)), height),
        ]
        label = div.get("data-label", "block")
        content = str(div.decode_contents())
        layout_blocks.append(LayoutBlock(bbox=bbox, label=label, content=content))
    return layout_blocks
def draw_layout(image: Image.Image, layout_blocks: list[LayoutBlock]):
    """Return a copy of ``image`` annotated with each block's box and label.

    The input image is left untouched. Every block gets a red rectangle
    outline and its label written in blue at the box's top-left corner.
    """
    annotated = image.copy()
    canvas = ImageDraw(annotated)
    for blk in layout_blocks:
        canvas.rectangle(blk.bbox, outline="red", width=2)
        top_left = (blk.bbox[0], blk.bbox[1])
        canvas.text(top_left, blk.label, fill="blue")
    return annotated

39
chandra/load.py Normal file
View File

@@ -0,0 +1,39 @@
from typing import List
import filetype
from PIL import Image
import pypdfium2 as pdfium
from chandra.settings import settings
def load_pdf_images(filepath: str, page_range: List[int]):
    """Render pages of a PDF to RGB PIL images.

    Args:
        filepath: PDF input handed to ``pdfium.PdfDocument``.
        page_range: Zero-based page indices to render. A falsy value
            (``None`` or an empty list) renders every page.

    Returns:
        A list of RGB images in page order. Requested indices that do not
        exist in the document are silently skipped.
    """
    # A set gives O(1) membership tests while scanning the page indices.
    wanted = set(page_range) if page_range else None
    # PDF user space is 72 units per inch, so DPI / 72 is the render scale.
    scale = settings.IMAGE_DPI / 72

    doc = pdfium.PdfDocument(filepath)
    try:
        return [
            doc[i].render(scale=scale).to_pil().convert("RGB")
            for i in range(len(doc))
            if wanted is None or i in wanted
        ]
    finally:
        # Release the document even when rendering a page fails.
        doc.close()
def parse_range_str(range_str: str) -> List[int]:
    """Parse a page selection such as ``"0,2-4,7"`` into sorted page numbers.

    Items are comma separated; each item is either a single integer or an
    inclusive ``start-end`` range. Surrounding whitespace and empty items
    (for example a trailing comma or an empty string) are ignored.

    Args:
        range_str: The selection string.

    Returns:
        The selected page numbers, de-duplicated and in ascending order.

    Raises:
        ValueError: If an item is not an integer or a ``start-end`` pair.
    """
    pages = set()
    for part in range_str.split(","):
        part = part.strip()
        if not part:
            # Tolerate "", "1,2," and "1,,2" instead of failing on int("").
            continue
        if "-" in part:
            start, end = part.split("-")
            pages.update(range(int(start), int(end) + 1))
        else:
            pages.add(int(part))
    return sorted(pages)
def load_file(filepath: str, config: dict):
    """Load ``filepath`` as a list of RGB images.

    PDFs (as detected by ``filetype.guess``) are rendered page by page,
    honouring the optional ``"page_range"`` string in ``config``; anything
    else is opened as a single image.
    """
    requested = config.get("page_range")
    pages = parse_range_str(requested) if requested else requested

    detected = filetype.guess(filepath)
    if detected and detected.extension == "pdf":
        return load_pdf_images(filepath, pages)

    single_image = Image.open(filepath).convert("RGB")
    return [single_image]

78
chandra/model.py Normal file
View File

@@ -0,0 +1,78 @@
from dataclasses import dataclass
from typing import List
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
from chandra.image import scale_to_fit
from chandra.prompts import PROMPT_MAPPING
from chandra.settings import settings
from qwen_vl_utils import process_vision_info
@dataclass
class BatchItem:
    """A single generation request: images plus the prompt to run on them."""

    # Images attached to the request, in order.
    images: List[Image.Image]
    # Explicit prompt text; when empty, ``prompt_type`` selects a prompt.
    prompt: str | None = None
    # Key into PROMPT_MAPPING, used when ``prompt`` is not given.
    prompt_type: str | None = None
def load():
    """Load the checkpoint named in settings and attach its processor.

    Returns:
        The model in eval mode, with the matching processor stored on it as
        ``model.processor``.
    """
    checkpoint = settings.MODEL_CHECKPOINT
    load_kwargs = {
        "dtype": settings.TORCH_DTYPE,
        "device_map": "auto",
        "attn_implementation": settings.TORCH_ATTN_IMPLEMENTATION,
    }
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(checkpoint, **load_kwargs)
    # NOTE(review): the model is placed via device_map="auto" and then moved
    # again with .to(); confirm this combination is intended.
    model = model.to(settings.TORCH_DEVICE_MODEL)
    model = model.eval()

    # Keep the processor on the model so callers only pass one object around.
    model.processor = Qwen2_5_VLProcessor.from_pretrained(checkpoint)
    return model
def process_batch_element(item: BatchItem, processor):
    """Build the chat message (role + content parts) for one batch item.

    The message holds one image part per input image, each passed through
    ``scale_to_fit``, followed by a single text part with the prompt. An
    explicit ``item.prompt`` wins; otherwise the prompt is looked up in
    ``PROMPT_MAPPING`` by ``item.prompt_type``. ``processor`` is accepted
    but not used.
    """
    prompt_text = item.prompt or PROMPT_MAPPING[item.prompt_type]

    # scale_to_fit guarantees the image respects the size budget.
    parts = [
        {"type": "image", "image": scale_to_fit(picture)}
        for picture in item.images
    ]
    parts.append({"type": "text", "text": prompt_text})

    return {"role": "user", "content": parts}
def generate(batch: List[BatchItem], model):
    """Run the model on a batch of items and return one decoded string each.

    Args:
        batch: Items to process; each becomes its own single-turn
            conversation.
        model: Model returned by ``load()``; its processor is read from
            ``model.processor``.

    Returns:
        The generated text for every item, in batch order, with the prompt
        tokens stripped. An empty batch yields an empty list.
    """
    if not batch:
        return []

    # One conversation per item. A flat list of messages would be read by the
    # chat template as a single multi-turn conversation, collapsing the batch.
    conversations = [
        [process_batch_element(item, model.processor)] for item in batch
    ]
    texts = [
        model.processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        for conversation in conversations
    ]
    image_inputs, _ = process_vision_info(conversations)
    inputs = model.processor(
        text=texts,
        images=image_inputs,
        padding=True,
        return_tensors="pt",
        padding_side="left",
    )
    # Follow the model's device instead of assuming CUDA, so CPU and MPS
    # configurations from settings work too.
    inputs = inputs.to(model.device)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=settings.MAX_OUTPUT_TOKENS)
    # generate() returns prompt + completion; drop the prompt part per row.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = model.processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return output_text

99
chandra/prompts.py Normal file
View File

@@ -0,0 +1,99 @@
# HTML tag names the model is told it may emit; the list is interpolated
# verbatim (as a Python list repr) into PROMPT_ENDING.
ALLOWED_TAGS = [
"math",
"br",
"i",
"b",
"u",
"del",
"sup",
"sub",
"table",
"tr",
"td",
"p",
"th",
"div",
"pre",
"h1",
"h2",
"h3",
"h4",
"h5",
"ul",
"ol",
"li",
"input",
"a",
"span",
"img",
"hr",
"tbody",
"small",
"caption",
"strong",
"thead",
"big",
"code",
]
# HTML attribute names the model is told it may emit; interpolated verbatim
# (as a Python list repr) into PROMPT_ENDING.
ALLOWED_ATTRIBUTES = [
"class",
"colspan",
"rowspan",
"display",
"checked",
"type",
"border",
"value",
"style",
"href",
"alt",
"align",
]
# Shared tail of every OCR prompt: the allowed tags/attributes followed by the
# formatting guidelines. Appended to OCR_LAYOUT_PROMPT and OCR_PROMPT below.
# The text is part of the model input, so edit it only deliberately.
PROMPT_ENDING = f"""
Only use these tags {ALLOWED_TAGS}, and these attributes {ALLOWED_ATTRIBUTES}.
Guidelines:
* Inline math: Surround math with <math>...</math> tags. Math expressions should be rendered in KaTeX-compatible LaTeX. Use display for block math.
* Tables: Use colspan and rowspan attributes to match table structure.
* Formatting: Maintain consistent formatting with the image, including spacing, indentation, subscripts/superscripts, and special characters.
* Images: Include a description of any images in the alt attribute of an <img> tag. Do not fill out the src property.
* Forms: Mark checkboxes and radio buttons properly.
* Text: join lines together properly into paragraphs using <p>...</p> tags. Use <br> tags for line breaks within paragraphs, but only when absolutely necessary to maintain meaning.
* Use the simplest possible HTML structure that accurately represents the content of the block.
* Make sure the text is accurate and easy for a human to read and interpret. Reading order should be correct and natural.
""".strip()
# Prompt for layout-aware OCR: asks for one div per layout block, each with a
# data-bbox on a 0-1024 grid and a data-label taken from the list below,
# followed by the shared guidelines in PROMPT_ENDING.
OCR_LAYOUT_PROMPT = f"""
OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format. Bboxes are normalized 0-1024. The data-label attribute is the label for the block.
Use the following labels:
- Caption
- Footnote
- Equation-Block
- List-Item
- Page-Header
- Page-Footer
- Image
- Section-Header
- Table
- Text
- Complex-Block
- Code-Block
- Form
- Table-Of-Contents
- Figure
{PROMPT_ENDING}
""".strip()
# Prompt for plain OCR to HTML (no layout blocks), followed by the shared
# guidelines in PROMPT_ENDING.
OCR_PROMPT = f"""
OCR this image to HTML.
{PROMPT_ENDING}
""".strip()
# Maps a prompt-type key to its prompt text.
PROMPT_MAPPING = {
"ocr_layout": OCR_LAYOUT_PROMPT,
"ocr": OCR_PROMPT,
}

48
chandra/settings.py Normal file
View File

@@ -0,0 +1,48 @@
from dotenv import find_dotenv
from pydantic import computed_field
from pydantic_settings import BaseSettings
import torch
import os
class Settings(BaseSettings):
    """Runtime configuration for chandra.

    Plain fields can be overridden through the environment or a
    ``local.env`` file (see ``Config``); the computed fields derive the
    torch device, dtype and attention backend from them.
    """

    # Paths
    BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    IMAGE_DPI: int = 96
    MODEL_CHECKPOINT: str = "datalab-to/chandra-0.2.1"
    TORCH_DEVICE: str | None = None
    MAX_OUTPUT_TOKENS: int = 2048

    @computed_field
    @property
    def TORCH_DEVICE_MODEL(self) -> str:
        """Device to run on: the explicit override, else cuda > mps > cpu."""
        if self.TORCH_DEVICE is not None:
            return self.TORCH_DEVICE

        if torch.cuda.is_available():
            return "cuda"

        if torch.backends.mps.is_available():
            return "mps"

        return "cpu"

    @computed_field
    @property
    def TORCH_DTYPE(self) -> torch.dtype:
        """Dtype the model weights are loaded in."""
        return torch.bfloat16

    @computed_field
    @property
    def TORCH_ATTN_IMPLEMENTATION(self) -> str:
        """Attention backend: FlashAttention 2 on CUDA when usable, else SDPA."""
        import importlib.util

        # startswith() also matches indexed devices such as "cuda:0".
        on_cuda = self.TORCH_DEVICE_MODEL.startswith("cuda")
        # flash_attn is an optional extra; requesting it when it is not
        # installed makes model loading fail, so fall back to SDPA instead.
        if on_cuda and importlib.util.find_spec("flash_attn") is not None:
            return "flash_attention_2"
        return "sdpa"

    class Config:
        env_file = find_dotenv("local.env")
        extra = "ignore"


settings = Settings()

113
chandra_app.py Normal file
View File

@@ -0,0 +1,113 @@
import pypdfium2 as pdfium
import streamlit as st
from PIL import Image
from chandra.layout import parse_layout, draw_layout
from chandra.load import load_pdf_images
from chandra.model import load, BatchItem, generate
@st.cache_resource()
def load_model():
    """Load the model once and let Streamlit reuse it as a cached resource."""
    loaded_model = load()
    return loaded_model
@st.cache_data()
def get_page_image(pdf_file, page_num):
    """Render a single page of ``pdf_file`` and return it as an image."""
    rendered_pages = load_pdf_images(pdf_file, [page_num])
    return rendered_pages[0]
@st.cache_data()
def page_counter(pdf_file):
    """Return the number of pages in ``pdf_file``."""
    document = pdfium.PdfDocument(pdf_file)
    total_pages = len(document)
    # Close explicitly; only the count is needed past this point.
    document.close()
    return total_pages
# Function for OCR
def ocr_layout(
    img: Image.Image,
) -> tuple[str, Image.Image]:
    """Run layout-aware OCR on ``img``.

    Returns:
        ``(html, layout_image)``: the generated HTML and a copy of ``img``
        with the parsed layout boxes drawn on it.
    """
    batch = BatchItem(
        images=[img],
        prompt_type="ocr_layout",
    )
    html = generate([batch], model=model)[0]
    print(f"Generated HTML: {html[:500]}...")
    layout = parse_layout(html, img)
    layout_image = draw_layout(img, layout)
    return html, layout_image
def ocr(
    img: Image.Image,
) -> str:
    """Run plain OCR (no layout blocks) on ``img`` and return the HTML."""
    request = BatchItem(images=[img], prompt_type="ocr")
    outputs = generate([request], model=model)
    return outputs[0]
# ---- Streamlit page: upload a file, pick a page/prompt, run OCR, show output ----
st.set_page_config(layout="wide")
col1, col2 = st.columns([0.5, 0.5])

model = load_model()

st.markdown("""
# Chandra OCR Demo
This app will let you try chandra, a multilingual OCR toolkit.
""")

in_file = st.sidebar.file_uploader(
    "PDF file or image:", type=["pdf", "png", "jpg", "jpeg", "gif", "webp"]
)

# Nothing to do until the user uploads something.
if in_file is None:
    st.stop()

filetype = in_file.type
page_count = None
if "pdf" in filetype:
    page_count = page_counter(in_file)
    # Pages are addressed by zero-based index, so the last valid value is
    # page_count - 1; allowing page_count itself would select no page and
    # fail in get_page_image.
    page_number = st.sidebar.number_input(
        f"Page number out of {page_count}:",
        min_value=0,
        value=0,
        max_value=max(page_count - 1, 0),
    )
    pil_image = get_page_image(in_file, page_number)
else:
    pil_image = Image.open(in_file).convert("RGB")
    page_number = None

run_ocr = st.sidebar.button("Run OCR")
prompt_type = st.sidebar.selectbox(
    "Prompt type",
    ["ocr_layout", "ocr"],
    index=0,
    help="Select the prompt type for OCR.",
)

if pil_image is None:
    st.stop()

if run_ocr:
    if prompt_type == "ocr_layout":
        pred, layout_image = ocr_layout(
            pil_image,
        )
    else:
        pred = ocr(
            pil_image,
        )
        layout_image = None

    with col1:
        html_tab, text_tab, layout_tab = st.tabs(["HTML", "HTML as text", "Layout Image"])
        with html_tab:
            # The model output is rendered as raw HTML on purpose.
            st.markdown(pred, unsafe_allow_html=True)
        with text_tab:
            st.text(pred)

        # Only the layout prompt produces an annotated image.
        if layout_image is not None:
            with layout_tab:
                st.image(layout_image, caption="Detected Layout", use_container_width=True)

with col2:
    st.image(pil_image, caption="Uploaded Image", use_container_width=True)

19
pyproject.toml Normal file
View File

@@ -0,0 +1,19 @@
[project]
name = "chandra"
version = "0.1.0"
description = "Multilingual OCR toolkit that converts document images and PDFs to HTML"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"beautifulsoup4>=4.14.2",
"filetype>=1.2.0",
"pillow>=11.3.0",
"pydantic>=2.12.0",
"pydantic-settings>=2.11.0",
"pypdfium2>=4.30.0",
"python-dotenv>=1.1.1",
"qwen-vl-utils>=0.0.14",
"streamlit>=1.50.0",
"torch>=2.8.0",
"transformers>=4.57.0",
]

1492
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff