Initial commit

2025-11-29 08:33:13 +00:00 · 2025-10-08 17:34:01 -04:00
commit 17b1b03bde
13 changed files with 2201 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,221 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[codz]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #   Usually these files are written by a python script from a template
 #   before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py.cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 # Pipfile.lock
 # UV
 #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 # uv.lock
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 # poetry.lock
 # poetry.toml
 # pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
 #   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
 #   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
 # pdm.lock
 # pdm.toml
 .pdm-python
 .pdm-build/
 # pixi
 #   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
 # pixi.lock
 #   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
 #   in the .venv directory. It is recommended not to include this directory in version control.
 .pixi
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # Redis
 *.rdb
 *.aof
 *.pid
 # RabbitMQ
 mnesia/
 rabbitmq/
 rabbitmq-data/
 # ActiveMQ
 activemq-data/
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .envrc
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
 # PyCharm
 #   JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #   be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #   and can be added to the global gitignore or merged into this file.  For a more nuclear
 #   option (not recommended) you can uncomment the following to ignore the entire idea folder.
 # .idea/
 # Abstra
 #   Abstra is an AI-powered process automation framework.
 #   Ignore directories containing user credentials, local state, and settings.
 #   Learn more at https://abstra.io/docs
 .abstra/
 # Visual Studio Code
 #   Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
 #   that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
 #   and can be added to the global gitignore or merged into this file. However, if you prefer,
 #   you could uncomment the following to ignore the entire vscode folder
 # .vscode/
 # Ruff stuff:
 .ruff_cache/
 # PyPI configuration file
 .pypirc
 # Marimo
 marimo/_static/
 marimo/_lsp/
 __marimo__/
 # Streamlit
 .streamlit/secrets.toml
 # Virtual environments
 .venv
 .idea/
--- a/.python-version
+++ b/.python-version
@@ -0,0 +1 @@
 3.12
--- a/README.md
+++ b/README.md
@@ -0,0 +1,7 @@
 # Chandra
 To launch the Streamlit demo app, run:
 ```shell
 streamlit run chandra_app.py --server.fileWatcherType none --server.headless true
 ```
--- a/chandra/init.py
+++ b/chandra/init.py
--- a/chandra/image.py
+++ b/chandra/image.py
@@ -0,0 +1,40 @@
 import math
 from typing import Tuple
 from PIL import Image
 def scale_to_fit(
    img: Image.Image,
    max_size: Tuple[int, int] = (1024, 1024),
    min_size: Tuple[int, int] = (28, 28),
 ) -> Image.Image:
    """Resize ``img`` so that its pixel count lies within a min/max budget.

    ``max_size`` and ``min_size`` are not per-side limits: each pair is
    multiplied into a total pixel budget (width * height). Both sides are
    scaled by the same factor, so the aspect ratio is kept up to rounding.

    Args:
        img: Source image.
        max_size: (width, height) whose product is the largest pixel count
            allowed.
        min_size: (width, height) whose product is the smallest pixel count
            allowed.

    Returns:
        ``img`` itself when it is empty or already within budget, otherwise a
        new image resampled with LANCZOS.
    """
    width, height = img.size
    # An empty image cannot be scaled meaningfully; hand it back untouched.
    if width == 0 or height == 0:
        return img

    current_pixels = width * height
    max_pixels = max_size[0] * max_size[1]
    min_pixels = min_size[0] * min_size[1]

    if current_pixels > max_pixels:
        scale_factor = (max_pixels / current_pixels) ** 0.5
        # Round down to stay inside the budget, but never let a side collapse
        # to 0: for extreme aspect ratios (e.g. 1 x 2,000,000) floor() would
        # produce a zero-sized target and resize() would fail.
        new_width = max(1, math.floor(width * scale_factor))
        new_height = max(1, math.floor(height * scale_factor))
    elif current_pixels < min_pixels:
        scale_factor = (min_pixels / current_pixels) ** 0.5
        # Round up so the result is guaranteed to reach the minimum budget.
        new_width = math.ceil(width * scale_factor)
        new_height = math.ceil(height * scale_factor)
    else:
        return img

    return img.resize((new_width, new_height), resample=Image.Resampling.LANCZOS)
--- a/chandra/layout.py
+++ b/chandra/layout.py
@@ -0,0 +1,44 @@
 import json
 from dataclasses import dataclass
 from PIL import Image
 from PIL.ImageDraw import ImageDraw
 from bs4 import BeautifulSoup
@dataclass
 class LayoutBlock:
    """One top-level layout ``div`` extracted from the model's HTML output."""
    # Bounding box as [x0, y0, x1, y1], scaled to the source image's pixel size.
    bbox: list[int]
    # Value of the div's data-label attribute ("block" when the attribute is absent).
    label: str
    # Inner HTML of the div.
    content: str
 def _parse_bbox(raw) -> list[int] | None:
    """Decode a ``data-bbox`` attribute into four ints, or ``None`` if unusable."""
    if not raw:
        return None
    try:
        coords = [int(v) for v in json.loads(raw)]
    except (TypeError, ValueError):
        # Covers invalid JSON, non-iterable JSON values and non-numeric entries.
        return None
    return coords if len(coords) == 4 else None


 def parse_layout(html: str, image: Image.Image, bbox_scale: int = 1024):
    """Parse model HTML into layout blocks with pixel-space bounding boxes.

    Only top-level ``<div>`` elements are considered. Each div's ``data-bbox``
    is expected to be a JSON list ``[x0, y0, x1, y1]`` on a 0..``bbox_scale``
    grid; it is rescaled to the size of ``image``, clamped to the image bounds
    and ordered so that ``x0 <= x1`` and ``y0 <= y1``.

    Args:
        html: HTML produced by the layout OCR prompt.
        image: The image the HTML describes; only its size is read.
        bbox_scale: Extent of the normalised coordinate grid used in
            ``data-bbox`` (defaults to 1024).

    Returns:
        A list of ``LayoutBlock``. A div whose ``data-bbox`` is missing or
        malformed is kept (so its content is not lost) with the zero-area
        box ``[0, 0, 0, 0]``.
    """
    soup = BeautifulSoup(html, "html.parser")
    width, height = image.size
    width_scaler = width / bbox_scale
    height_scaler = height / bbox_scale

    layout_blocks = []
    for div in soup.find_all("div", recursive=False):
        raw = _parse_bbox(div.get("data-bbox")) or [0, 0, 0, 0]
        # Clamp every coordinate into the image and order each pair, so that
        # out-of-range or inverted model output still yields a drawable box.
        x0, x1 = sorted(
            min(max(0, int(v * width_scaler)), width) for v in (raw[0], raw[2])
        )
        y0, y1 = sorted(
            min(max(0, int(v * height_scaler)), height) for v in (raw[1], raw[3])
        )
        layout_blocks.append(
            LayoutBlock(
                bbox=[x0, y0, x1, y1],
                label=div.get("data-label", "block"),
                content=str(div.decode_contents()),
            )
        )
    return layout_blocks
 def draw_layout(image: Image.Image, layout_blocks: list[LayoutBlock]):
    """Return a copy of ``image`` with every block's box and label drawn on it.

    The input image is left unmodified; boxes are outlined in red and the
    label text is written in blue at each box's top-left corner.
    """
    annotated = image.copy()
    canvas = ImageDraw(annotated)
    for blk in layout_blocks:
        box = blk.bbox
        canvas.rectangle(box, outline="red", width=2)
        top_left = (box[0], box[1])
        canvas.text(top_left, blk.label, fill="blue")
    return annotated
--- a/chandra/load.py
+++ b/chandra/load.py
@@ -0,0 +1,39 @@
 from typing import List
 import filetype
 from PIL import Image
 import pypdfium2 as pdfium
 from chandra.settings import settings
 def load_pdf_images(filepath: str, page_range: List[int]):
    """Render pages of a PDF to RGB PIL images.

    Args:
        filepath: Input handed to ``pdfium.PdfDocument``.
        page_range: 0-based page indices to render. An empty or ``None``
            value renders every page.

    Returns:
        Rendered pages in document order (not in ``page_range`` order).
        Indices that do not exist in the document are silently skipped.
    """
    # Set membership is O(1) per page; ``None`` means "no filtering".
    wanted = set(page_range) if page_range else None
    # Render scale is relative to 72 units per inch.
    render_scale = settings.IMAGE_DPI / 72

    doc = pdfium.PdfDocument(filepath)
    try:
        return [
            doc[i].render(scale=render_scale).to_pil().convert("RGB")
            for i in range(len(doc))
            if wanted is None or i in wanted
        ]
    finally:
        # Release the document even when rendering a page raises.
        doc.close()
 def parse_range_str(range_str: str) -> List[int]:
    """Parse a page-range string such as ``"0,2-4,7"`` into page numbers.

    Comma-separated tokens are either a single integer or an inclusive
    ``start-end`` range. Empty tokens (an empty string, doubled or trailing
    commas) are ignored rather than treated as errors.

    Args:
        range_str: The range specification.

    Returns:
        The distinct page numbers in ascending order.

    Raises:
        ValueError: If a token is not an integer or a ``start-end`` pair.
    """
    pages = set()
    for token in range_str.split(","):
        token = token.strip()
        if not token:
            continue
        if "-" in token:
            start, end = token.split("-")
            pages.update(range(int(start), int(end) + 1))
        else:
            pages.add(int(token))
    return sorted(pages)
 def load_file(filepath: str, config: dict):
    """Load a PDF or image file as a list of RGB PIL images.

    Args:
        filepath: Path of the input file.
        config: Options dict; the optional ``"page_range"`` entry is a range
            string (see ``parse_range_str``) and is only used for PDFs.

    Returns:
        One image per selected PDF page, or a single-element list for any
        input that is not detected as a PDF.
    """
    page_range = config.get("page_range")
    if page_range:
        page_range = parse_range_str(page_range)

    input_type = filetype.guess(filepath)
    if input_type and input_type.extension == "pdf":
        return load_pdf_images(filepath, page_range)

    # convert() returns a new, fully loaded image, so the file handle can be
    # closed as soon as the conversion is done instead of lingering until GC.
    with Image.open(filepath) as img:
        return [img.convert("RGB")]
--- a/chandra/model.py
+++ b/chandra/model.py
@@ -0,0 +1,78 @@
 from dataclasses import dataclass
 from typing import List
 from PIL import Image
 from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
 from chandra.image import scale_to_fit
 from chandra.prompts import PROMPT_MAPPING
 from chandra.settings import settings
 from qwen_vl_utils import process_vision_info
@dataclass
 class BatchItem:
    """One generation request: images plus either an explicit prompt or a prompt type."""
    # Images placed ahead of the text prompt in the user message.
    images: List[Image.Image]
    # Explicit prompt text; when empty or None, prompt_type is looked up instead.
    prompt: str | None = None
    # Key into PROMPT_MAPPING, consulted only when no prompt is given.
    prompt_type: str | None = None
 def load():
    """Load the checkpoint named in settings and attach its processor.

    The model is created with the dtype and attention implementation from
    ``settings``, moved to ``settings.TORCH_DEVICE_MODEL`` and switched to
    eval mode. The matching processor is stored on ``model.processor``.

    Returns:
        The ready-to-use model.
    """
    checkpoint = settings.MODEL_CHECKPOINT
    load_kwargs = {
        "dtype": settings.TORCH_DTYPE,
        "device_map": "auto",
        "attn_implementation": settings.TORCH_ATTN_IMPLEMENTATION,
    }
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        checkpoint, **load_kwargs
    )
    model = model.to(settings.TORCH_DEVICE_MODEL).eval()
    model.processor = Qwen2_5_VLProcessor.from_pretrained(checkpoint)
    return model
 def process_batch_element(item: BatchItem, processor):
    """Build the single user chat message for one batch item.

    Every image is passed through ``scale_to_fit`` and placed before the text
    prompt. The prompt is ``item.prompt`` when set, otherwise the
    ``PROMPT_MAPPING`` entry for ``item.prompt_type``.

    Args:
        item: The request to convert.
        processor: Unused here; kept so the signature stays stable.

    Returns:
        A ``{"role": "user", "content": [...]}`` message dict.
    """
    prompt_text = item.prompt or PROMPT_MAPPING[item.prompt_type]
    # scale_to_fit guarantees the max size for each image.
    parts = [{"type": "image", "image": scale_to_fit(img)} for img in item.images]
    parts.append({"type": "text", "text": prompt_text})
    return {"role": "user", "content": parts}
 def generate(batch: List[BatchItem], model):
    """Run the model on a batch and return one decoded string per item.

    Args:
        batch: Requests to process together.
        model: Model returned by ``load()``; its ``processor`` attribute is
            used for templating, tokenisation and decoding.

    Returns:
        The generated text for each batch item, in input order (an empty list
        for an empty batch).
    """
    if not batch:
        return []

    processor = model.processor
    # Each item is its own single-turn conversation. Passing a flat list of
    # messages would be templated as ONE multi-turn chat, not as a batch.
    conversations = [[process_batch_element(item, processor)] for item in batch]
    texts = [
        processor.apply_chat_template(
            conv, tokenize=False, add_generation_prompt=True
        )
        for conv in conversations
    ]
    image_inputs, _ = process_vision_info(conversations)
    inputs = processor(
        text=texts,
        images=image_inputs,
        padding=True,
        return_tensors="pt",
        padding_side="left",
    )
    # Follow the device the model actually lives on (cuda, mps or cpu)
    # instead of assuming CUDA.
    inputs = inputs.to(model.device)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=settings.MAX_OUTPUT_TOKENS)
    # Drop the first len(input_ids) tokens of each output so that only the
    # newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    return processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
--- a/chandra/prompts.py
+++ b/chandra/prompts.py
@@ -0,0 +1,99 @@
 # HTML tag names listed in the prompt as the only tags the model should emit.
 # The list is interpolated into PROMPT_ENDING via its Python repr.
 ALLOWED_TAGS = [
    "math",
    "br",
    "i",
    "b",
    "u",
    "del",
    "sup",
    "sub",
    "table",
    "tr",
    "td",
    "p",
    "th",
    "div",
    "pre",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "ul",
    "ol",
    "li",
    "input",
    "a",
    "span",
    "img",
    "hr",
    "tbody",
    "small",
    "caption",
    "strong",
    "thead",
    "big",
    "code",
 ]
 # HTML attribute names listed in the prompt as the only attributes the model
 # should emit. Also interpolated into PROMPT_ENDING via its Python repr.
 ALLOWED_ATTRIBUTES = [
    "class",
    "colspan",
    "rowspan",
    "display",
    "checked",
    "type",
    "border",
    "value",
    "style",
    "href",
    "alt",
    "align",
 ]
 # Shared tail of every OCR prompt: the allowed tags/attributes plus formatting
 # guidelines. Both OCR_LAYOUT_PROMPT and OCR_PROMPT embed it.
 PROMPT_ENDING = f"""
 Only use these tags {ALLOWED_TAGS}, and these attributes {ALLOWED_ATTRIBUTES}.
 Guidelines:
 * Inline math: Surround math with <math>...</math> tags. Math expressions should be rendered in KaTeX-compatible LaTeX. Use display for block math.
 * Tables: Use colspan and rowspan attributes to match table structure.
 * Formatting: Maintain consistent formatting with the image, including spacing, indentation, subscripts/superscripts, and special characters.
 * Images: Include a description of any images in the alt attribute of an <img> tag. Do not fill out the src property.
 * Forms: Mark checkboxes and radio buttons properly.
 * Text: join lines together properly into paragraphs using <p>...</p> tags.  Use <br> tags for line breaks within paragraphs, but only when absolutely necessary to maintain meaning.
 * Use the simplest possible HTML structure that accurately represents the content of the block.
 * Make sure the text is accurate and easy for a human to read and interpret.  Reading order should be correct and natural.
 """.strip()
 # Layout-aware prompt: asks for one div per block carrying data-bbox
 # (normalized 0-1024) and data-label attributes.
 OCR_LAYOUT_PROMPT = f"""
 OCR this image to HTML, arranged as layout blocks.  Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format.  Bboxes are normalized 0-1024. The data-label attribute is the label for the block.
 Use the following labels:
 - Caption
 - Footnote
 - Equation-Block
 - List-Item
 - Page-Header
 - Page-Footer
 - Image
 - Section-Header
 - Table
 - Text
 - Complex-Block
 - Code-Block
 - Form
 - Table-Of-Contents
 - Figure
 {PROMPT_ENDING}
 """.strip()
 # Plain prompt: HTML output without layout blocks or bounding boxes.
 OCR_PROMPT = f"""
 OCR this image to HTML.
 {PROMPT_ENDING}
 """.strip()
 # Maps a prompt-type key to its prompt text.
 PROMPT_MAPPING = {
    "ocr_layout": OCR_LAYOUT_PROMPT,
    "ocr": OCR_PROMPT,
 }
--- a/chandra/settings.py
+++ b/chandra/settings.py
@@ -0,0 +1,48 @@
 from dotenv import find_dotenv
 from pydantic import computed_field
 from pydantic_settings import BaseSettings
 import torch
 import os
 class Settings(BaseSettings):
    """Runtime configuration for chandra.

    As a pydantic ``BaseSettings`` subclass, the plain fields can be overridden
    from the environment; a ``local.env`` file located by ``find_dotenv`` is
    read as well and unknown entries are ignored (see ``Config``).
    """

    # Paths
    # Directory one level above the one containing this file.
    BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    # DPI used when rasterising PDF pages.
    IMAGE_DPI: int = 96
    # Checkpoint identifier handed to ``from_pretrained``.
    MODEL_CHECKPOINT: str = "datalab-to/chandra-0.2.1"
    # Explicit torch device; ``None`` selects one automatically.
    TORCH_DEVICE: str | None = None
    # Upper bound on newly generated tokens per request.
    MAX_OUTPUT_TOKENS: int = 2048

    @computed_field
    @property
    def TORCH_DEVICE_MODEL(self) -> str:
        """Device to run the model on: the override, else cuda > mps > cpu."""
        if self.TORCH_DEVICE is not None:
            return self.TORCH_DEVICE
        if torch.cuda.is_available():
            return "cuda"
        if torch.backends.mps.is_available():
            return "mps"
        return "cpu"

    @computed_field
    @property
    def TORCH_DTYPE(self) -> torch.dtype:
        """Dtype the model weights are loaded in."""
        return torch.bfloat16

    @computed_field
    @property
    def TORCH_ATTN_IMPLEMENTATION(self) -> str:
        """Attention backend: flash-attention 2 on CUDA when installed, else sdpa."""
        from importlib.util import find_spec

        # startswith() also matches explicit devices such as "cuda:0".
        on_cuda = self.TORCH_DEVICE_MODEL.startswith("cuda")
        # flash-attn is an optional extra, not a declared dependency, so only
        # request it when the package can actually be imported.
        if on_cuda and find_spec("flash_attn") is not None:
            return "flash_attention_2"
        return "sdpa"

    class Config:
        env_file = find_dotenv("local.env")
        extra = "ignore"


 # Shared settings instance imported by the other chandra modules.
 settings = Settings()
--- a/chandra_app.py
+++ b/chandra_app.py
@@ -0,0 +1,113 @@
 import pypdfium2 as pdfium
 import streamlit as st
 from PIL import Image
 from chandra.layout import parse_layout, draw_layout
 from chandra.load import load_pdf_images
 from chandra.model import load, BatchItem, generate
@st.cache_resource()
def load_model():
    """Load the OCR model; wrapped in ``st.cache_resource``."""
    loaded = load()
    return loaded
@st.cache_data()
def get_page_image(pdf_file, page_num):
    """Render page ``page_num`` (0-based) of ``pdf_file`` and return the image."""
    rendered = load_pdf_images(pdf_file, [page_num])
    return rendered[0]
@st.cache_data()
def page_counter(pdf_file):
    """Return the number of pages in ``pdf_file``."""
    pdf = pdfium.PdfDocument(pdf_file)
    n_pages = len(pdf)
    pdf.close()
    return n_pages
 # Layout-aware OCR: raw HTML plus the input image with detected blocks drawn on it.
 def ocr_layout(
    img: Image.Image,
 ) -> tuple[str, Image.Image]:
    """Run the ``ocr_layout`` prompt on ``img``.

    Uses the module-level ``model``.

    Returns:
        ``(html, layout_image)``: the generated HTML and a copy of ``img``
        annotated with the parsed layout blocks.
    """
    request = BatchItem(
        images=[img],
        prompt_type="ocr_layout",
    )
    html = generate([request], model=model)[0]
    print(f"Generated HTML: {html[:500]}...")
    layout = parse_layout(html, img)
    layout_image = draw_layout(img, layout)
    return html, layout_image
 def ocr(
    img: Image.Image,
 ) -> str:
    """Run the plain ``ocr`` prompt on ``img`` and return the generated HTML.

    Uses the module-level ``model``.
    """
    request = BatchItem(images=[img], prompt_type="ocr")
    outputs = generate([request], model=model)
    return outputs[0]
 # ---- Streamlit page: upload a file, pick a page, run OCR, show the results ----
 st.set_page_config(layout="wide")
 col1, col2 = st.columns([0.5, 0.5])
 model = load_model()
 st.markdown("""
 # Chandra OCR Demo
 This app will let you try chandra, a multilingual OCR toolkit.
 """)
 in_file = st.sidebar.file_uploader(
    "PDF file or image:", type=["pdf", "png", "jpg", "jpeg", "gif", "webp"]
 )
 # Nothing to show until the user uploads a file.
 if in_file is None:
    st.stop()
 # MIME type reported for the upload. Named to avoid shadowing the third-party
 # ``filetype`` package name.
 mime_type = in_file.type
 page_count = None
 if "pdf" in mime_type:
    page_count = page_counter(in_file)
    if page_count == 0:
        st.error("The uploaded PDF has no pages.")
        st.stop()
    # Page indices are 0-based, so the last valid value is page_count - 1.
    page_number = st.sidebar.number_input(
        f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count - 1
    )
    pil_image = get_page_image(in_file, page_number)
 else:
    pil_image = Image.open(in_file).convert("RGB")
    page_number = None
 run_ocr = st.sidebar.button("Run OCR")
 prompt_type = st.sidebar.selectbox(
    "Prompt type",
    ["ocr_layout", "ocr"],
    index=0,
    help="Select the prompt type for OCR.",
 )
 if pil_image is None:
    st.stop()
 if run_ocr:
    if prompt_type == "ocr_layout":
        pred, layout_image = ocr_layout(
            pil_image,
        )
    else:
        pred = ocr(
            pil_image,
        )
        # The plain prompt produces no layout to draw.
        layout_image = None
    with col1:
        html_tab, text_tab, layout_tab = st.tabs(["HTML", "HTML as text", "Layout Image"])
        with html_tab:
            # Model output is rendered as raw HTML.
            st.markdown(pred, unsafe_allow_html=True)
        with text_tab:
            st.text(pred)
        # Explicit None check: do not rely on the truthiness of an image object.
        if layout_image is not None:
            with layout_tab:
                st.image(layout_image, caption="Detected Layout", use_container_width=True)
 with col2:
    st.image(pil_image, caption="Uploaded Image", use_container_width=True)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,19 @@
 [project]
 name = "chandra"
 version = "0.1.0"
 description = "Chandra: a multilingual OCR toolkit with a Streamlit demo app"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
    "beautifulsoup4>=4.14.2",
    "filetype>=1.2.0",
    "pillow>=11.3.0",
    "pydantic>=2.12.0",
    "pydantic-settings>=2.11.0",
    "pypdfium2>=4.30.0",
    "python-dotenv>=1.1.1",
    "qwen-vl-utils>=0.0.14",
    "streamlit>=1.50.0",
    "torch>=2.8.0",
    "transformers>=4.57.0",
 ]
--- a/uv.lock
+++ b/uv.lock