mirror of
https://github.com/datalab-to/chandra.git
synced 2025-11-29 08:33:13 +00:00
Initial commit
This commit is contained in:
221
.gitignore
vendored
Normal file
221
.gitignore
vendored
Normal file
@@ -0,0 +1,221 @@
|
|||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[codz]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py.cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
# Pipfile.lock
|
||||||
|
|
||||||
|
# UV
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# uv.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
# poetry.lock
|
||||||
|
# poetry.toml
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
||||||
|
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
||||||
|
# pdm.lock
|
||||||
|
# pdm.toml
|
||||||
|
.pdm-python
|
||||||
|
.pdm-build/
|
||||||
|
|
||||||
|
# pixi
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
||||||
|
# pixi.lock
|
||||||
|
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
||||||
|
# in the .venv directory. It is recommended not to include this directory in version control.
|
||||||
|
.pixi
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# Redis
|
||||||
|
*.rdb
|
||||||
|
*.aof
|
||||||
|
*.pid
|
||||||
|
|
||||||
|
# RabbitMQ
|
||||||
|
mnesia/
|
||||||
|
rabbitmq/
|
||||||
|
rabbitmq-data/
|
||||||
|
|
||||||
|
# ActiveMQ
|
||||||
|
activemq-data/
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.envrc
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
# .idea/
|
||||||
|
|
||||||
|
# Abstra
|
||||||
|
# Abstra is an AI-powered process automation framework.
|
||||||
|
# Ignore directories containing user credentials, local state, and settings.
|
||||||
|
# Learn more at https://abstra.io/docs
|
||||||
|
.abstra/
|
||||||
|
|
||||||
|
# Visual Studio Code
|
||||||
|
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
||||||
|
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
||||||
|
# you could uncomment the following to ignore the entire vscode folder
|
||||||
|
# .vscode/
|
||||||
|
|
||||||
|
# Ruff stuff:
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
# PyPI configuration file
|
||||||
|
.pypirc
|
||||||
|
|
||||||
|
# Marimo
|
||||||
|
marimo/_static/
|
||||||
|
marimo/_lsp/
|
||||||
|
__marimo__/
|
||||||
|
|
||||||
|
# Streamlit
|
||||||
|
.streamlit/secrets.toml
|
||||||
|
|
||||||
|
# Virtual environments
|
||||||
|
.venv
|
||||||
|
|
||||||
|
.idea/
|
||||||
1
.python-version
Normal file
1
.python-version
Normal file
@@ -0,0 +1 @@
|
|||||||
|
3.12
|
||||||
7
README.md
Normal file
7
README.md
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
# Chandra
|
||||||
|
|
||||||
|
To try the demo app, run:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
streamlit run chandra_app.py --server.fileWatcherType none --server.headless true
|
||||||
|
```
|
||||||
0
chandra/__init__.py
Normal file
0
chandra/__init__.py
Normal file
40
chandra/image.py
Normal file
40
chandra/image.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
import math
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
def scale_to_fit(
    img: Image.Image,
    max_size: Tuple[int, int] = (1024, 1024),
    min_size: Tuple[int, int] = (28, 28),
):
    """Resize ``img`` so its pixel count falls inside an area budget.

    ``max_size`` and ``min_size`` are not per-side limits: each pair is
    multiplied out to a pixel count, and the image is scaled uniformly so
    that its total area lands between the two counts.

    Args:
        img: Image to resize.
        max_size: ``(width, height)`` whose product is the largest allowed
            pixel count.
        min_size: ``(width, height)`` whose product is the smallest allowed
            pixel count.

    Returns:
        ``img`` itself when it is empty or already within budget, otherwise
        a new image resampled with LANCZOS.
    """
    width, height = img.size

    # An empty image cannot be scaled; hand it back untouched.
    if width == 0 or height == 0:
        return img

    max_width, max_height = max_size
    min_width, min_height = min_size

    current_pixels = width * height
    max_pixels = max_width * max_height
    min_pixels = min_width * min_height

    if current_pixels > max_pixels:
        # Square root because the factor is applied to both sides.
        scale_factor = (max_pixels / current_pixels) ** 0.5

        # Round down so the area never exceeds the budget, but never let a
        # side collapse to 0 (very thin images), which resize() rejects.
        new_width = max(1, math.floor(width * scale_factor))
        new_height = max(1, math.floor(height * scale_factor))

        # If a side had to be clamped up to 1, trim the other side so the
        # area budget still holds. This is a no-op in the normal case.
        new_width = min(new_width, max(1, max_pixels // new_height))
        new_height = min(new_height, max(1, max_pixels // new_width))
    elif current_pixels < min_pixels:
        scale_factor = (min_pixels / current_pixels) ** 0.5

        # Round up so the result reaches at least the minimum area.
        new_width = math.ceil(width * scale_factor)
        new_height = math.ceil(height * scale_factor)
    else:
        return img

    return img.resize((new_width, new_height), resample=Image.Resampling.LANCZOS)
|
||||||
44
chandra/layout.py
Normal file
44
chandra/layout.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
import json
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
from PIL.ImageDraw import ImageDraw
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
@dataclass
class LayoutBlock:
    """One layout block extracted from the model's HTML output."""

    # [x0, y0, x1, y1]; parse_layout fills this with image pixel coordinates.
    bbox: list[int]
    # Value of the div's data-label attribute ("block" when absent).
    label: str
    # Inner HTML of the div.
    content: str
|
||||||
|
|
||||||
|
def parse_layout(html: str, image: Image.Image, bbox_scale: int = 1024):
    """Turn the model's HTML output into a list of ``LayoutBlock`` objects.

    Only top-level ``<div>`` elements are considered. Each div's
    ``data-bbox`` attribute is read as a JSON list ``[x0, y0, x1, y1]`` on a
    ``0..bbox_scale`` grid and rescaled to the pixel size of ``image``.

    Args:
        html: HTML string containing one div per layout block.
        image: Image the layout refers to; only its size is used.
        bbox_scale: Extent of the normalized coordinate grid used in
            ``data-bbox`` (the layout prompt asks for 0-1024).

    Returns:
        A list of ``LayoutBlock``. A div whose ``data-bbox`` is missing or
        unparsable is kept (its content is still useful) with a zero-area
        box at the origin instead of aborting the whole parse.
    """
    soup = BeautifulSoup(html, "html.parser")
    width, height = image.size
    width_scaler = width / bbox_scale
    height_scaler = height / bbox_scale

    layout_blocks = []
    for div in soup.find_all("div", recursive=False):
        try:
            coords = [int(v) for v in json.loads(div.get("data-bbox"))]
        except (TypeError, ValueError, OverflowError):
            # Attribute missing, not JSON, not a list, or non-numeric entries.
            coords = []
        if len(coords) < 4:
            coords = [0, 0, 0, 0]

        # Rescale to pixels and clamp into the image. The far corner is also
        # kept at or beyond the near corner so the box is never inverted.
        x0 = min(max(0, int(coords[0] * width_scaler)), width)
        y0 = min(max(0, int(coords[1] * height_scaler)), height)
        x1 = max(x0, min(int(coords[2] * width_scaler), width))
        y1 = max(y0, min(int(coords[3] * height_scaler), height))

        label = div.get("data-label", "block")
        content = str(div.decode_contents())
        layout_blocks.append(
            LayoutBlock(bbox=[x0, y0, x1, y1], label=label, content=content)
        )
    return layout_blocks
|
||||||
|
|
||||||
|
def draw_layout(image: Image.Image, layout_blocks: list[LayoutBlock]):
    """Return a copy of ``image`` annotated with every block's box and label.

    The input image is left untouched. Each box is outlined in red and its
    label is written in blue at the box's top-left corner.
    """
    annotated = image.copy()
    canvas = ImageDraw(annotated)

    for blk in layout_blocks:
        canvas.rectangle(blk.bbox, outline="red", width=2)
        anchor = (blk.bbox[0], blk.bbox[1])
        canvas.text(anchor, blk.label, fill="blue")

    return annotated
|
||||||
39
chandra/load.py
Normal file
39
chandra/load.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
from typing import List
|
||||||
|
import filetype
|
||||||
|
from PIL import Image
|
||||||
|
import pypdfium2 as pdfium
|
||||||
|
|
||||||
|
from chandra.settings import settings
|
||||||
|
|
||||||
|
|
||||||
|
def load_pdf_images(filepath: str, page_range: List[int]):
    """Render pages of a PDF to RGB PIL images.

    Args:
        filepath: PDF source passed straight to ``pdfium.PdfDocument``.
        page_range: Zero-based page indices to render. ``None`` or an empty
            list renders every page. Indices outside the document are
            silently ignored.

    Returns:
        List of RGB images in page order.
    """
    # A set gives O(1) membership tests; None means "no filter".
    wanted = set(page_range) if page_range else None
    # The render scale is expressed relative to 72 DPI.
    scale = settings.IMAGE_DPI / 72

    doc = pdfium.PdfDocument(filepath)
    try:
        return [
            doc[i].render(scale=scale).to_pil().convert("RGB")
            for i in range(len(doc))
            if wanted is None or i in wanted
        ]
    finally:
        # Release the document even when rendering a page fails.
        doc.close()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_range_str(range_str: str) -> List[int]:
    """Parse a page selection such as ``"0,2-4"`` into sorted page numbers.

    The string is a comma-separated list of single numbers and inclusive
    ``start-end`` ranges. Empty segments (for example from a trailing comma
    or an empty string) are ignored.

    Args:
        range_str: Page selection string.

    Returns:
        Sorted list of unique page numbers.

    Raises:
        ValueError: If a segment is not a number or a ``start-end`` pair.
    """
    pages = set()
    for part in range_str.split(","):
        part = part.strip()
        if not part:
            # Tolerate "1,2," and "" instead of failing on int("").
            continue
        if "-" in part:
            start, end = part.split("-")
            pages.update(range(int(start), int(end) + 1))
        else:
            pages.add(int(part))
    return sorted(pages)
|
||||||
|
|
||||||
|
|
||||||
|
def load_file(filepath: str, config: dict):
    """Load a PDF or image file as a list of RGB PIL images.

    Args:
        filepath: Path of the file to load.
        config: Options dict. The optional ``"page_range"`` entry is a page
            selection string (see ``parse_range_str``) and only affects PDFs.

    Returns:
        One image per selected PDF page, or a single-element list for any
        other file type.
    """
    page_range = config.get("page_range")
    if page_range:
        page_range = parse_range_str(page_range)

    input_type = filetype.guess(filepath)
    if input_type and input_type.extension == "pdf":
        return load_pdf_images(filepath, page_range)

    # convert() returns an independent, fully loaded image, so the file
    # handle opened by Image.open can be released immediately.
    with Image.open(filepath) as img:
        return [img.convert("RGB")]
|
||||||
78
chandra/model.py
Normal file
78
chandra/model.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
|
||||||
|
|
||||||
|
from chandra.image import scale_to_fit
|
||||||
|
from chandra.prompts import PROMPT_MAPPING
|
||||||
|
from chandra.settings import settings
|
||||||
|
|
||||||
|
from qwen_vl_utils import process_vision_info
|
||||||
|
|
||||||
|
@dataclass
class BatchItem:
    """One generation request: a set of images plus the prompt to apply."""

    # Images placed before the prompt text in the user message.
    images: List[Image.Image]
    # Explicit prompt text; when empty, prompt_type is used to look one up.
    prompt: str | None = None
    # Key into PROMPT_MAPPING, used only when prompt is not given.
    prompt_type: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def load():
    """Load the checkpoint named in settings and attach its processor.

    Returns:
        The model in eval mode, with the matching processor stored on it as
        ``model.processor``.
    """
    checkpoint = settings.MODEL_CHECKPOINT
    load_kwargs = {
        "dtype": settings.TORCH_DTYPE,
        "device_map": "auto",
        "attn_implementation": settings.TORCH_ATTN_IMPLEMENTATION,
    }
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(checkpoint, **load_kwargs)
    # NOTE(review): device_map="auto" already places the weights; confirm the
    # extra .to() is intended and safe on multi-device setups.
    model = model.to(settings.TORCH_DEVICE_MODEL).eval()
    model.processor = Qwen2_5_VLProcessor.from_pretrained(checkpoint)
    return model
|
||||||
|
|
||||||
|
def process_batch_element(item: BatchItem, processor):
    """Build the chat message (role + content parts) for one batch item.

    Args:
        item: Images and prompt information for the request.
        processor: Accepted for interface compatibility; not used here.

    Returns:
        A ``{"role": "user", "content": [...]}`` dict with one image part per
        image (rescaled via ``scale_to_fit``) followed by one text part.

    Raises:
        KeyError: If no prompt is given and ``prompt_type`` is not a key of
            ``PROMPT_MAPPING``.
    """
    # An explicit prompt wins; otherwise look the text up by prompt type.
    prompt_text = item.prompt or PROMPT_MAPPING[item.prompt_type]

    # Images come first, the instruction text last.
    parts = [{"type": "image", "image": scale_to_fit(img)} for img in item.images]
    parts.append({"type": "text", "text": prompt_text})

    return {"role": "user", "content": parts}
|
||||||
|
|
||||||
|
|
||||||
|
def generate(batch: List[BatchItem], model):
    """Run the model on a batch of items and return the decoded text.

    Args:
        batch: Items to process; each becomes its own single-turn
            conversation.
        model: Model returned by ``load()``; its ``processor`` attribute is
            used for templating, tensor preparation and decoding.

    Returns:
        List of generated strings, one per batch item, in batch order.
    """
    if not batch:
        return []

    processor = model.processor

    # One conversation per item. A flat list of messages would be read as a
    # single multi-turn conversation and yield only one output.
    conversations = [[process_batch_element(item, processor)] for item in batch]
    texts = [
        processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        for conversation in conversations
    ]

    image_inputs, _ = process_vision_info(conversations)
    inputs = processor(
        text=texts,
        images=image_inputs,
        padding=True,
        return_tensors="pt",
        padding_side="left",
    )
    # Follow the model's actual device rather than assuming CUDA.
    inputs = inputs.to(model.device)

    # Inference: generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=settings.MAX_OUTPUT_TOKENS)
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    return processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
|
||||||
|
|
||||||
99
chandra/prompts.py
Normal file
99
chandra/prompts.py
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
# HTML tags the prompts tell the model it may use. The list's repr is
# interpolated verbatim into PROMPT_ENDING, so order and spelling matter.
ALLOWED_TAGS = [
    "math",
    "br",
    "i",
    "b",
    "u",
    "del",
    "sup",
    "sub",
    "table",
    "tr",
    "td",
    "p",
    "th",
    "div",
    "pre",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "ul",
    "ol",
    "li",
    "input",
    "a",
    "span",
    "img",
    "hr",
    "tbody",
    "small",
    "caption",
    "strong",
    "thead",
    "big",
    "code",
]
# HTML attributes the prompts tell the model it may use; also interpolated
# verbatim into PROMPT_ENDING.
ALLOWED_ATTRIBUTES = [
    "class",
    "colspan",
    "rowspan",
    "display",
    "checked",
    "type",
    "border",
    "value",
    "style",
    "href",
    "alt",
    "align",
]
|
||||||
|
|
||||||
|
# Shared tail of every prompt: the allowed tags/attributes followed by the
# formatting guidelines. Evaluated once at import time.
PROMPT_ENDING = f"""
Only use these tags {ALLOWED_TAGS}, and these attributes {ALLOWED_ATTRIBUTES}.

Guidelines:
* Inline math: Surround math with <math>...</math> tags. Math expressions should be rendered in KaTeX-compatible LaTeX. Use display for block math.
* Tables: Use colspan and rowspan attributes to match table structure.
* Formatting: Maintain consistent formatting with the image, including spacing, indentation, subscripts/superscripts, and special characters.
* Images: Include a description of any images in the alt attribute of an <img> tag. Do not fill out the src property.
* Forms: Mark checkboxes and radio buttons properly.
* Text: join lines together properly into paragraphs using <p>...</p> tags. Use <br> tags for line breaks within paragraphs, but only when absolutely necessary to maintain meaning.
* Use the simplest possible HTML structure that accurately represents the content of the block.
* Make sure the text is accurate and easy for a human to read and interpret. Reading order should be correct and natural.
""".strip()

# Prompt asking for HTML split into labelled layout blocks, each a div with a
# data-bbox on a 0-1024 grid and a data-label.
OCR_LAYOUT_PROMPT = f"""
OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format. Bboxes are normalized 0-1024. The data-label attribute is the label for the block.

Use the following labels:
- Caption
- Footnote
- Equation-Block
- List-Item
- Page-Header
- Page-Footer
- Image
- Section-Header
- Table
- Text
- Complex-Block
- Code-Block
- Form
- Table-Of-Contents
- Figure

{PROMPT_ENDING}
""".strip()

# Prompt asking for plain HTML without layout blocks.
OCR_PROMPT = f"""
OCR this image to HTML.

{PROMPT_ENDING}
""".strip()

# Maps a prompt type name to the prompt text.
PROMPT_MAPPING = {
    "ocr_layout": OCR_LAYOUT_PROMPT,
    "ocr": OCR_PROMPT,
}
|
||||||
48
chandra/settings.py
Normal file
48
chandra/settings.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
from dotenv import find_dotenv
|
||||||
|
from pydantic import computed_field
|
||||||
|
from pydantic_settings import BaseSettings
|
||||||
|
import torch
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
class Settings(BaseSettings):
    """Runtime configuration for chandra.

    Plain fields can be overridden through the settings sources configured
    in ``Config`` (a ``local.env`` file located with ``find_dotenv``);
    unknown entries are ignored. The computed fields are derived from the
    plain fields and the local torch installation.
    """

    # Paths
    BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    # DPI used when rendering PDF pages to images.
    IMAGE_DPI: int = 96
    MODEL_CHECKPOINT: str = "datalab-to/chandra-0.2.1"
    # Explicit device override; None means auto-detect.
    TORCH_DEVICE: str | None = None
    MAX_OUTPUT_TOKENS: int = 2048

    @computed_field
    @property
    def TORCH_DEVICE_MODEL(self) -> str:
        """Device to run on: the override if set, else cuda, mps, then cpu."""
        if self.TORCH_DEVICE is not None:
            return self.TORCH_DEVICE

        if torch.cuda.is_available():
            return "cuda"

        if torch.backends.mps.is_available():
            return "mps"

        return "cpu"

    @computed_field
    @property
    def TORCH_DTYPE(self) -> torch.dtype:
        """Dtype used to load the model weights."""
        return torch.bfloat16

    @computed_field
    @property
    def TORCH_ATTN_IMPLEMENTATION(self) -> str:
        """Attention backend: FlashAttention 2 on CUDA when usable, else SDPA."""
        from importlib.util import find_spec

        # startswith() also matches indexed devices such as "cuda:0".
        on_cuda = self.TORCH_DEVICE_MODEL.startswith("cuda")
        # flash_attn is an optional extra, not a declared dependency, so only
        # request it when the package is actually importable.
        if on_cuda and find_spec("flash_attn") is not None:
            return "flash_attention_2"
        return "sdpa"

    class Config:
        env_file = find_dotenv("local.env")
        extra = "ignore"


# Module-level singleton imported by the rest of the package.
settings = Settings()
|
||||||
113
chandra_app.py
Normal file
113
chandra_app.py
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
import pypdfium2 as pdfium
|
||||||
|
import streamlit as st
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from chandra.layout import parse_layout, draw_layout
|
||||||
|
from chandra.load import load_pdf_images
|
||||||
|
from chandra.model import load, BatchItem, generate
|
||||||
|
|
||||||
|
|
||||||
|
@st.cache_resource()
def load_model():
    """Load the OCR model; wrapped in ``st.cache_resource`` by the decorator."""
    loaded = load()
    return loaded
|
||||||
|
|
||||||
|
@st.cache_data()
def get_page_image(pdf_file, page_num):
    """Render a single zero-based page of ``pdf_file`` to an image."""
    rendered = load_pdf_images(pdf_file, [page_num])
    return rendered[0]
|
||||||
|
|
||||||
|
@st.cache_data()
def page_counter(pdf_file):
    """Return the number of pages in ``pdf_file``."""
    doc = pdfium.PdfDocument(pdf_file)
    try:
        return len(doc)
    finally:
        # Close the document even if counting fails.
        doc.close()
|
||||||
|
|
||||||
|
# Function for OCR
|
||||||
|
def ocr_layout(
    img: Image.Image,
) -> tuple[str, Image.Image]:
    """Run layout OCR on ``img``.

    Returns:
        ``(html, layout_image)``: the generated HTML and a copy of ``img``
        with the detected layout boxes drawn on it.
    """
    batch = BatchItem(
        images=[img],
        prompt_type="ocr_layout",
    )
    html = generate([batch], model=model)[0]
    print(f"Generated HTML: {html[:500]}...")
    layout = parse_layout(html, img)
    layout_image = draw_layout(img, layout)
    return html, layout_image
|
||||||
|
|
||||||
|
def ocr(
    img: Image.Image,
) -> str:
    """Run plain OCR on ``img`` and return the generated HTML string."""
    item = BatchItem(images=[img], prompt_type="ocr")
    outputs = generate([item], model=model)
    return outputs[0]
|
||||||
|
|
||||||
|
st.set_page_config(layout="wide")
col1, col2 = st.columns([0.5, 0.5])

model = load_model()

st.markdown("""
# Chandra OCR Demo

This app will let you try chandra, a multilingual OCR toolkit.
""")

in_file = st.sidebar.file_uploader(
    "PDF file or image:", type=["pdf", "png", "jpg", "jpeg", "gif", "webp"]
)

# Nothing to do until a file has been uploaded.
if in_file is None:
    st.stop()

filetype = in_file.type
page_count = None
if "pdf" in filetype:
    page_count = page_counter(in_file)
    # Page numbers are zero-based, so the last valid page is page_count - 1.
    page_number = st.sidebar.number_input(
        f"Page number out of {page_count}:",
        min_value=0,
        value=0,
        max_value=max(page_count - 1, 0),
    )

    pil_image = get_page_image(in_file, page_number)
else:
    pil_image = Image.open(in_file).convert("RGB")
    page_number = None

run_ocr = st.sidebar.button("Run OCR")
prompt_type = st.sidebar.selectbox(
    "Prompt type",
    ["ocr_layout", "ocr"],
    index=0,
    help="Select the prompt type for OCR.",
)

if pil_image is None:
    st.stop()

if run_ocr:
    if prompt_type == "ocr_layout":
        pred, layout_image = ocr_layout(
            pil_image,
        )
    else:
        pred = ocr(
            pil_image,
        )
        layout_image = None

    # Left column: model output. Right column: the input image.
    with col1:
        html_tab, text_tab, layout_tab = st.tabs(["HTML", "HTML as text", "Layout Image"])
        with html_tab:
            # NOTE(review): this renders model output as raw HTML.
            st.markdown(pred, unsafe_allow_html=True)
        with text_tab:
            st.text(pred)

        if layout_image is not None:
            with layout_tab:
                st.image(layout_image, caption="Detected Layout", use_container_width=True)

    with col2:
        st.image(pil_image, caption="Uploaded Image", use_container_width=True)
|
||||||
19
pyproject.toml
Normal file
19
pyproject.toml
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
[project]
|
||||||
|
name = "chandra"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Chandra, a multilingual OCR toolkit"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.12"
|
||||||
|
dependencies = [
|
||||||
|
"beautifulsoup4>=4.14.2",
|
||||||
|
"filetype>=1.2.0",
|
||||||
|
"pillow>=11.3.0",
|
||||||
|
"pydantic>=2.12.0",
|
||||||
|
"pydantic-settings>=2.11.0",
|
||||||
|
"pypdfium2>=4.30.0",
|
||||||
|
"python-dotenv>=1.1.1",
|
||||||
|
"qwen-vl-utils>=0.0.14",
|
||||||
|
"streamlit>=1.50.0",
|
||||||
|
"torch>=2.8.0",
|
||||||
|
"transformers>=4.57.0",
|
||||||
|
]
|
||||||
Reference in New Issue
Block a user