Initial commit

2026-02-08 07:10:36 +00:00 · 2025-10-08 17:34:01 -04:00
commit 17b1b03bde
13 changed files with 2201 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,221 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#   Usually these files are written by a python script from a template
+#   before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+# Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+# uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+# poetry.lock
+# poetry.toml
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+# pdm.lock
+# pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+# pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# Redis
+*.rdb
+*.aof
+*.pid
+
+# RabbitMQ
+mnesia/
+rabbitmq/
+rabbitmq-data/
+
+# ActiveMQ
+activemq-data/
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#   JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#   be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#   and can be added to the global gitignore or merged into this file.  For a more nuclear
+#   option (not recommended) you can uncomment the following to ignore the entire idea folder.
+# .idea/
+
+# Abstra
+#   Abstra is an AI-powered process automation framework.
+#   Ignore directories containing user credentials, local state, and settings.
+#   Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+#   Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#   that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#   and can be added to the global gitignore or merged into this file. However, if you prefer,
+#   you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+
+# Streamlit
+.streamlit/secrets.toml
+
+# Virtual environments
+.venv
+
+.idea/
--- a/.python-version
+++ b/.python-version
@@ -0,0 +1 @@
+3.12
--- a/README.md
+++ b/README.md
@@ -0,0 +1,7 @@
+# Chandra
+
+Try:
+
+```shell
+streamlit run chandra_app.py --server.fileWatcherType none --server.headless true
+```
--- a/chandra/__init__.py
+++ b/chandra/__init__.py
--- a/chandra/image.py
+++ b/chandra/image.py
@@ -0,0 +1,40 @@
+import math
+from typing import Tuple
+
+from PIL import Image
+
+
+def scale_to_fit(
+    img: Image.Image,
+    max_size: Tuple[int, int] = (1024, 1024),
+    min_size: Tuple[int, int] = (28, 28),
+) -> Image.Image:
+    """Resize ``img`` so that its pixel count lies within an area budget.
+
+    The limits are areas, not per-side bounds: ``max_size`` and ``min_size``
+    are each multiplied out to a pixel count, and the image is scaled
+    uniformly (both sides by the same factor) until its area fits.
+
+    Args:
+        img: Image to resize.
+        max_size: ``(width, height)`` whose product is the largest allowed
+            pixel count.
+        min_size: ``(width, height)`` whose product is the smallest allowed
+            pixel count.
+
+    Returns:
+        ``img`` itself when it is empty or already within budget, otherwise
+        a new LANCZOS-resampled image.
+    """
+    width, height = img.size
+
+    # An empty image cannot be scaled (and would divide by zero below).
+    if width == 0 or height == 0:
+        return img
+
+    current_pixels = width * height
+    max_pixels = max_size[0] * max_size[1]
+    min_pixels = min_size[0] * min_size[1]
+
+    if current_pixels > max_pixels:
+        scale_factor = (max_pixels / current_pixels) ** 0.5
+        # Round down so the result never exceeds the maximum area.
+        round_side = math.floor
+    elif current_pixels < min_pixels:
+        scale_factor = (min_pixels / current_pixels) ** 0.5
+        # Round up so the result never falls below the minimum area.
+        round_side = math.ceil
+    else:
+        return img
+
+    # A very elongated image can floor one side to 0, which resize() rejects;
+    # keep every side at least one pixel long.
+    new_width = max(1, round_side(width * scale_factor))
+    new_height = max(1, round_side(height * scale_factor))
+
+    return img.resize((new_width, new_height), resample=Image.Resampling.LANCZOS)
--- a/chandra/layout.py
+++ b/chandra/layout.py
@@ -0,0 +1,44 @@
+import json
+from dataclasses import dataclass
+
+from PIL import Image
+from PIL.ImageDraw import ImageDraw
+from bs4 import BeautifulSoup
+
+@dataclass
+class LayoutBlock:
+    """One top-level ``<div>`` of the model's HTML output, as parsed by parse_layout."""
+
+    # [x0, y0, x1, y1]; parse_layout stores these in image pixel coordinates.
+    bbox: list[int]
+    # Value of the div's data-label attribute ("block" when the attribute is absent).
+    label: str
+    # Inner HTML of the div.
+    content: str
+
+def parse_layout(html: str, image: Image.Image, bbox_scale: int = 1024):
+    """Parse the model's layout HTML into ``LayoutBlock`` objects.
+
+    Only top-level ``<div>`` elements are considered. Each div's ``data-bbox``
+    attribute is read as a JSON list ``[x0, y0, x1, y1]`` on a
+    ``0..bbox_scale`` grid and rescaled to the pixel size of ``image``.
+
+    Args:
+        html: HTML produced by the model.
+        image: Image the HTML describes; only its size is used.
+        bbox_scale: Extent of the normalized bbox grid (the layout prompt
+            asks for boxes normalized to 0-1024).
+
+    Returns:
+        A list of ``LayoutBlock`` in document order, with bboxes clamped to
+        the image bounds and ordered so that ``x0 <= x1`` and ``y0 <= y1``.
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    width, height = image.size
+    width_scaler = width / bbox_scale
+    height_scaler = height / bbox_scale
+
+    layout_blocks = []
+    for div in soup.find_all("div", recursive=False):
+        # The HTML is model output, so data-bbox may be missing, not valid
+        # JSON, or not four numbers. Fall back to an empty box at the origin
+        # so the block's content is still returned instead of crashing.
+        try:
+            x0, y0, x1, y1 = (int(v) for v in json.loads(div.get("data-bbox")))
+        except (TypeError, ValueError, OverflowError):
+            x0 = y0 = x1 = y1 = 0
+
+        # Rescale to pixels and clamp to the image. The far corner is also
+        # clamped to be no smaller than the near corner, because drawing a
+        # rectangle with inverted coordinates raises in Pillow.
+        left = min(max(0, int(x0 * width_scaler)), width)
+        top = min(max(0, int(y0 * height_scaler)), height)
+        right = min(max(left, int(x1 * width_scaler)), width)
+        bottom = min(max(top, int(y1 * height_scaler)), height)
+
+        label = div.get("data-label", "block")
+        content = str(div.decode_contents())
+        layout_blocks.append(
+            LayoutBlock(bbox=[left, top, right, bottom], label=label, content=content)
+        )
+    return layout_blocks
+
+def draw_layout(image: Image.Image, layout_blocks: list[LayoutBlock]):
+    """Return a copy of ``image`` with each layout block outlined and labelled.
+
+    Every block gets a red rectangle around its bbox and its label written in
+    blue at the bbox's top-left corner. The input image is not modified.
+    """
+    annotated = image.copy()
+    canvas = ImageDraw(annotated)
+    for item in layout_blocks:
+        top_left = (item.bbox[0], item.bbox[1])
+        canvas.rectangle(item.bbox, outline="red", width=2)
+        canvas.text(top_left, item.label, fill="blue")
+
+    return annotated
--- a/chandra/load.py
+++ b/chandra/load.py
@@ -0,0 +1,39 @@
+from typing import List
+import filetype
+from PIL import Image
+import pypdfium2 as pdfium
+
+from chandra.settings import settings
+
+
+def load_pdf_images(filepath: str, page_range: List[int] | None = None):
+    """Render pages of a PDF to RGB PIL images.
+
+    Args:
+        filepath: PDF input handed to ``pdfium.PdfDocument``.
+        page_range: Zero-based page indices to render. ``None`` or an empty
+            list renders every page. Indices outside the document are
+            ignored.
+
+    Returns:
+        The rendered pages in document order.
+    """
+    # Build the lookup set once instead of scanning the list for every page.
+    wanted = set(page_range) if page_range else None
+    # PDF user space is 72 units per inch, so this scale yields IMAGE_DPI.
+    scale = settings.IMAGE_DPI / 72
+
+    doc = pdfium.PdfDocument(filepath)
+    try:
+        return [
+            doc[i].render(scale=scale).to_pil().convert("RGB")
+            for i in range(len(doc))
+            if wanted is None or i in wanted
+        ]
+    finally:
+        # Release the document even when rendering a page fails.
+        doc.close()
+
+
+def parse_range_str(range_str: str) -> List[int]:
+    """Expand a page-range string such as ``"0,2-4"`` into page numbers.
+
+    Segments are separated by commas. A segment is either a single integer
+    or an inclusive ``start-end`` range. Empty segments (for example from a
+    trailing comma) are ignored.
+
+    Args:
+        range_str: The range specification.
+
+    Returns:
+        The selected page numbers, deduplicated and sorted ascending.
+
+    Raises:
+        ValueError: If a segment is not an integer or a ``start-end`` pair.
+    """
+    pages = set()
+    for segment in range_str.split(","):
+        segment = segment.strip()
+        if not segment:
+            # Tolerate stray commas instead of failing on int("").
+            continue
+        if "-" in segment:
+            start, end = segment.split("-")
+            pages.update(range(int(start), int(end) + 1))
+        else:
+            pages.add(int(segment))
+    return sorted(pages)
+
+
+def load_file(filepath: str, config: dict):
+    """Load a PDF or a single image file as a list of RGB PIL images.
+
+    Args:
+        filepath: Path of the input file. Its type is detected with
+            ``filetype.guess``; anything not detected as a PDF is opened as
+            an image.
+        config: Options dict. The optional ``"page_range"`` entry is a
+            page-range string (see ``parse_range_str``) that selects which PDF
+            pages to render; it is not applied to image inputs.
+
+    Returns:
+        One image per selected PDF page, or a one-element list for an image.
+    """
+    page_range = config.get("page_range")
+    if page_range:
+        page_range = parse_range_str(page_range)
+
+    input_type = filetype.guess(filepath)
+    if input_type and input_type.extension == "pdf":
+        return load_pdf_images(filepath, page_range)
+
+    # convert() returns an independent image, so the underlying file can be
+    # closed as soon as the pixels have been read.
+    with Image.open(filepath) as source:
+        return [source.convert("RGB")]
--- a/chandra/model.py
+++ b/chandra/model.py
@@ -0,0 +1,78 @@
+from dataclasses import dataclass
+from typing import List
+
+from PIL import Image
+from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
+
+from chandra.image import scale_to_fit
+from chandra.prompts import PROMPT_MAPPING
+from chandra.settings import settings
+
+from qwen_vl_utils import process_vision_info
+
+@dataclass
+class BatchItem:
+    """One generation request: the images plus the prompt to run on them."""
+
+    # Images placed, in order, ahead of the prompt text in the chat message.
+    images: List[Image.Image]
+    # Explicit prompt text; when empty or None, prompt_type is looked up instead.
+    prompt: str | None = None
+    # Key into PROMPT_MAPPING, used only when prompt is not given.
+    prompt_type: str | None = None
+
+
+def load():
+    """Load the model and its processor from ``settings.MODEL_CHECKPOINT``.
+
+    Returns:
+        The model in eval mode, with the matching processor attached as
+        ``model.processor``.
+    """
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        settings.MODEL_CHECKPOINT,
+        dtype=settings.TORCH_DTYPE,
+        # Load straight onto the configured device. Using device_map="auto"
+        # and then calling .to(device) lets the loader pick a placement
+        # first and moves the whole model a second time afterwards.
+        device_map=settings.TORCH_DEVICE_MODEL,
+        attn_implementation=settings.TORCH_ATTN_IMPLEMENTATION,
+    )
+    model = model.eval()
+    model.processor = Qwen2_5_VLProcessor.from_pretrained(settings.MODEL_CHECKPOINT)
+    return model
+
+def process_batch_element(item: BatchItem, processor):
+    """Build the user chat message for one batch item.
+
+    The message content lists every image (rescaled with ``scale_to_fit``)
+    followed by the prompt text. ``item.prompt`` is used when set; otherwise
+    the text is looked up in ``PROMPT_MAPPING`` by ``item.prompt_type``.
+    ``processor`` is accepted but not used here.
+    """
+    prompt_text = item.prompt or PROMPT_MAPPING[item.prompt_type]
+
+    # scale_to_fit guarantees the maximum size of each image.
+    parts = [{"type": "image", "image": scale_to_fit(img)} for img in item.images]
+    parts.append({"type": "text", "text": prompt_text})
+
+    return {"role": "user", "content": parts}
+
+
+def generate(batch: List[BatchItem], model):
+    """Run the model on a batch of items and return one decoded string each.
+
+    Args:
+        batch: Items to process; each becomes its own single-turn conversation.
+        model: Model returned by ``load()``, with ``model.processor`` attached.
+
+    Returns:
+        The generated text for every item, in batch order, with the prompt
+        tokens stripped.
+    """
+    if not batch:
+        return []
+
+    # One conversation per item. Passing the bare list of messages to the
+    # chat template would render them as a single multi-turn conversation.
+    conversations = [[process_batch_element(item, model.processor)] for item in batch]
+    text = [
+        model.processor.apply_chat_template(
+            conversation, tokenize=False, add_generation_prompt=True
+        )
+        for conversation in conversations
+    ]
+
+    image_inputs, _ = process_vision_info(conversations)
+    inputs = model.processor(
+        text=text,
+        images=image_inputs,
+        padding=True,
+        return_tensors="pt",
+        padding_side="left"
+    )
+    # Follow the model's device instead of assuming CUDA, so that the mps and
+    # cpu devices selectable through settings work as well.
+    inputs = inputs.to(model.device)
+
+    # Inference: Generation of the output
+    generated_ids = model.generate(**inputs, max_new_tokens=settings.MAX_OUTPUT_TOKENS)
+    # generate() returns prompt + completion; keep only the completion.
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = model.processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    return output_text
+
--- a/chandra/prompts.py
+++ b/chandra/prompts.py
@@ -0,0 +1,99 @@
+# HTML tags the model is told it may use; interpolated into PROMPT_ENDING.
+ALLOWED_TAGS = [
+    "math",
+    "br",
+    "i",
+    "b",
+    "u",
+    "del",
+    "sup",
+    "sub",
+    "table",
+    "tr",
+    "td",
+    "p",
+    "th",
+    "div",
+    "pre",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "ul",
+    "ol",
+    "li",
+    "input",
+    "a",
+    "span",
+    "img",
+    "hr",
+    "tbody",
+    "small",
+    "caption",
+    "strong",
+    "thead",
+    "big",
+    "code",
+]
+# HTML attributes the model is told it may use; interpolated into PROMPT_ENDING.
+ALLOWED_ATTRIBUTES = [
+    "class",
+    "colspan",
+    "rowspan",
+    "display",
+    "checked",
+    "type",
+    "border",
+    "value",
+    "style",
+    "href",
+    "alt",
+    "align",
+]
+
+# Shared tail appended to both OCR prompts: the tag/attribute whitelist
+# (rendered as Python list reprs by the f-string) plus formatting guidelines.
+PROMPT_ENDING = f"""
+Only use these tags {ALLOWED_TAGS}, and these attributes {ALLOWED_ATTRIBUTES}.
+
+Guidelines:
+* Inline math: Surround math with <math>...</math> tags. Math expressions should be rendered in KaTeX-compatible LaTeX. Use display for block math.
+* Tables: Use colspan and rowspan attributes to match table structure.
+* Formatting: Maintain consistent formatting with the image, including spacing, indentation, subscripts/superscripts, and special characters.
+* Images: Include a description of any images in the alt attribute of an <img> tag. Do not fill out the src property.
+* Forms: Mark checkboxes and radio buttons properly.
+* Text: join lines together properly into paragraphs using <p>...</p> tags.  Use <br> tags for line breaks within paragraphs, but only when absolutely necessary to maintain meaning.
+* Use the simplest possible HTML structure that accurately represents the content of the block.
+* Make sure the text is accurate and easy for a human to read and interpret.  Reading order should be correct and natural.
+""".strip()
+
+# Prompt for layout-aware OCR: asks for one div per layout block, each with a
+# data-bbox (normalized to 0-1024) and a data-label taken from the list below.
+OCR_LAYOUT_PROMPT = f"""
+OCR this image to HTML, arranged as layout blocks.  Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format.  Bboxes are normalized 0-1024. The data-label attribute is the label for the block.
+
+Use the following labels:
+- Caption
+- Footnote
+- Equation-Block
+- List-Item
+- Page-Header
+- Page-Footer
+- Image
+- Section-Header
+- Table
+- Text
+- Complex-Block
+- Code-Block
+- Form
+- Table-Of-Contents
+- Figure
+
+{PROMPT_ENDING}
+""".strip()
+
+# Prompt for plain OCR to HTML, without layout blocks or bounding boxes.
+OCR_PROMPT = f"""
+OCR this image to HTML.
+
+{PROMPT_ENDING}
+""".strip()
+
+# Prompt text keyed by prompt type name.
+PROMPT_MAPPING = {
+    "ocr_layout": OCR_LAYOUT_PROMPT,
+    "ocr": OCR_PROMPT,
+}
--- a/chandra/settings.py
+++ b/chandra/settings.py
@@ -0,0 +1,48 @@
+from dotenv import find_dotenv
+from pydantic import computed_field
+from pydantic_settings import BaseSettings
+import torch
+import os
+
+
+class Settings(BaseSettings):
+    # Application settings. Values can be overridden through the environment
+    # or a "local.env" file located with find_dotenv; unknown keys are ignored.
+
+    # Paths
+    BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    IMAGE_DPI: int = 96
+    MODEL_CHECKPOINT: str = "datalab-to/chandra-0.2.1"
+    TORCH_DEVICE: str | None = None
+    MAX_OUTPUT_TOKENS: int = 2048
+
+    @computed_field
+    @property
+    def TORCH_DEVICE_MODEL(self) -> str:
+        # An explicit TORCH_DEVICE wins; otherwise prefer cuda, then mps, then cpu.
+        if self.TORCH_DEVICE is None:
+            if torch.cuda.is_available():
+                return "cuda"
+            return "mps" if torch.backends.mps.is_available() else "cpu"
+
+        return self.TORCH_DEVICE
+
+    @computed_field
+    @property
+    def TORCH_DTYPE(self) -> torch.dtype:
+        # The model is always loaded in bfloat16.
+        return torch.bfloat16
+
+    @computed_field
+    @property
+    def TORCH_ATTN_IMPLEMENTATION(self) -> str:
+        # flash_attention_2 is selected only on cuda; every other device uses sdpa.
+        return "flash_attention_2" if self.TORCH_DEVICE_MODEL == "cuda" else "sdpa"
+
+    class Config:
+        env_file = find_dotenv("local.env")
+        extra = "ignore"
+
+
+settings = Settings()
--- a/chandra_app.py
+++ b/chandra_app.py
@@ -0,0 +1,113 @@
+import pypdfium2 as pdfium
+import streamlit as st
+from PIL import Image
+
+from chandra.layout import parse_layout, draw_layout
+from chandra.load import load_pdf_images
+from chandra.model import load, BatchItem, generate
+
+
+@st.cache_resource()
+def load_model():
+    """Load the model via ``chandra.model.load``, cached with ``st.cache_resource``."""
+    return load()
+
+@st.cache_data()
+def get_page_image(pdf_file, page_num):
+    """Render page ``page_num`` (zero-based) of ``pdf_file`` to an image, cached with ``st.cache_data``."""
+    return load_pdf_images(pdf_file, [page_num])[0]
+
+@st.cache_data()
+def page_counter(pdf_file):
+    """Return the number of pages in ``pdf_file``, cached with ``st.cache_data``."""
+    doc = pdfium.PdfDocument(pdf_file)
+    try:
+        return len(doc)
+    finally:
+        # Close the document even if reading the page count fails.
+        doc.close()
+
+# Function for OCR
+def ocr_layout(
+    img: Image.Image,
+) -> tuple[str, Image.Image]:
+    """Run layout-aware OCR on ``img``.
+
+    Uses the module-level ``model`` with the ``"ocr_layout"`` prompt.
+
+    Returns:
+        ``(html, layout_image)``: the generated HTML and a copy of ``img``
+        with the parsed layout blocks drawn on it.
+    """
+    batch = BatchItem(
+        images=[img],
+        prompt_type="ocr_layout",
+    )
+    html = generate([batch], model=model)[0]
+    # Log only the start of the output to keep the console readable.
+    print(f"Generated HTML: {html[:500]}...")
+    layout = parse_layout(html, img)
+    layout_image = draw_layout(img, layout)
+    return html, layout_image
+
+def ocr(
+    img: Image.Image,
+) -> str:
+    """Run plain OCR (the ``"ocr"`` prompt) on ``img`` and return the generated HTML."""
+    request = BatchItem(images=[img], prompt_type="ocr")
+    outputs = generate([request], model=model)
+    return outputs[0]
+
+# --- Streamlit page: upload a file, pick a page, run OCR, show the results ---
+st.set_page_config(layout="wide")
+col1, col2 = st.columns([0.5, 0.5])
+
+model = load_model()
+
+st.markdown("""
+# Chandra OCR Demo
+
+This app will let you try chandra, a multilingual OCR toolkit.
+""")
+
+in_file = st.sidebar.file_uploader(
+    "PDF file or image:", type=["pdf", "png", "jpg", "jpeg", "gif", "webp"]
+)
+
+# Nothing to show until a file has been uploaded.
+if in_file is None:
+    st.stop()
+
+filetype = in_file.type
+page_count = None
+if "pdf" in filetype:
+    page_count = page_counter(in_file)
+    # Pages are zero-based, so the last selectable page is page_count - 1.
+    # Allowing page_count itself made get_page_image index an empty list.
+    page_number = st.sidebar.number_input(
+        f"Page number out of {page_count}:",
+        min_value=0,
+        value=0,
+        max_value=max(page_count - 1, 0),
+    )
+
+    pil_image = get_page_image(in_file, page_number)
+else:
+    pil_image = Image.open(in_file).convert("RGB")
+    page_number = None
+
+run_ocr = st.sidebar.button("Run OCR")
+prompt_type = st.sidebar.selectbox(
+    "Prompt type",
+    ["ocr_layout", "ocr"],
+    index=0,
+    help="Select the prompt type for OCR.",
+)
+
+if pil_image is None:
+    st.stop()
+
+if run_ocr:
+    if prompt_type == "ocr_layout":
+        pred, layout_image = ocr_layout(
+            pil_image,
+        )
+    else:
+        pred = ocr(
+            pil_image,
+        )
+        layout_image = None
+
+    with col1:
+        html_tab, text_tab, layout_tab = st.tabs(["HTML", "HTML as text", "Layout Image"])
+        with html_tab:
+            # NOTE(review): pred is raw model output rendered as HTML without
+            # sanitization; acceptable for a local demo, revisit before exposing.
+            st.markdown(pred, unsafe_allow_html=True)
+        with text_tab:
+            st.text(pred)
+
+        # Only the "ocr_layout" prompt produces a layout image.
+        if layout_image is not None:
+            with layout_tab:
+                st.image(layout_image, caption="Detected Layout", use_container_width=True)
+
+with col2:
+    st.image(pil_image, caption="Uploaded Image", use_container_width=True)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,19 @@
+[project]
+name = "chandra"
+version = "0.1.0"
+description = "Chandra, a multilingual OCR toolkit"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "beautifulsoup4>=4.14.2",
+    "filetype>=1.2.0",
+    "pillow>=11.3.0",
+    "pydantic>=2.12.0",
+    "pydantic-settings>=2.11.0",
+    "pypdfium2>=4.30.0",
+    "python-dotenv>=1.1.1",
+    "qwen-vl-utils>=0.0.14",
+    "streamlit>=1.50.0",
+    "torch>=2.8.0",
+    "transformers>=4.57.0",
+]
--- a/uv.lock
+++ b/uv.lock