import hashlib
import json
import logging
import re
from dataclasses import dataclass, asdict
from functools import lru_cache

import six
from PIL import Image
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter, re_whitespace

from chandra.settings import settings

logger = logging.getLogger(__name__)

# Values of the ``data-label`` attribute that get special treatment.
_IMAGE_LABELS = ("Image", "Figure")
_HEADER_FOOTER_LABELS = ("Page-Header", "Page-Footer")

# Bounding box used when a block's ``data-bbox`` attribute is missing or invalid.
_DEFAULT_BBOX = (0, 0, 1, 1)

# Detects whether a text block already contains inner markup.
_INNER_TAG_RE = re.compile("<.+>")


@lru_cache
def _hash_html(html: str) -> str:
    """Return the hex MD5 digest of ``html`` (UTF-8 encoded), memoized."""
    # The digest is only used to build image file names, not for security.
    return hashlib.md5(html.encode("utf-8"), usedforsecurity=False).hexdigest()


def get_image_name(html: str, div_idx: int) -> str:
    """Return the file name for the image of block number ``div_idx`` in ``html``.

    The name combines the hash of the full HTML with the block index, so
    ``parse_html`` and ``extract_images`` produce the same name for a block.
    """
    html_hash = _hash_html(html)
    return f"{html_hash}_{div_idx}_img.webp"


def extract_images(html: str, chunks: list[dict], image: Image.Image) -> dict:
    """Crop the image/figure blocks out of the page image.

    Args:
        html: The HTML the chunks were parsed from (used for naming only).
        chunks: Block dicts with ``label``, ``content`` and ``bbox`` keys.
        image: The page image that the bounding boxes refer to.

    Returns:
        A dict mapping image file names (see ``get_image_name``) to the
        cropped images.
    """
    images = {}
    # Block indices are 1-based and count every chunk, not only image chunks.
    for div_idx, chunk in enumerate(chunks, start=1):
        if chunk["label"] not in _IMAGE_LABELS:
            continue

        # NOTE(review): when ``content`` is a str (which is what parse_chunks
        # produces), ``str.find`` returns -1 -- a truthy value -- if "img" is
        # absent, so this guard does not skip blocks without an image tag.
        # Left unchanged because parse_html references an image for every
        # image/figure block; confirm the intended behavior.
        img = chunk["content"].find("img")
        if not img:
            continue

        try:
            block_image = image.crop(chunk["bbox"])
        except ValueError:
            # Happens when bbox coordinates are invalid
            continue

        images[get_image_name(html, div_idx)] = block_image
    return images


def parse_html(
    html: str, include_headers_footers: bool = False, include_images: bool = True
) -> str:
    """Clean up the block HTML and return the concatenated block contents.

    Args:
        html: HTML made of top-level ``<div>`` blocks with ``data-label``
            attributes.
        include_headers_footers: Keep ``Page-Header``/``Page-Footer`` blocks.
        include_images: Keep ``Image``/``Figure`` blocks.

    Returns:
        The inner HTML of every kept block, concatenated in document order.
    """
    soup = BeautifulSoup(html, "html.parser")
    parts = []
    # The index counts every top-level div (including skipped ones).
    for div_idx, div in enumerate(soup.find_all("div", recursive=False), start=1):
        label = div.get("data-label")

        # Skip headers and footers if not included
        if not include_headers_footers and label in _HEADER_FOOTER_LABELS:
            continue

        is_image = label in _IMAGE_LABELS
        if is_image and not include_images:
            continue

        if is_image:
            img_src = get_image_name(html, div_idx)
            img = div.find("img")
            if img is not None:
                img["src"] = img_src
            else:
                # If no tag, add one in
                div.append(soup.new_tag("img", src=img_src))

        # Wrap text content in <p> tags if no inner HTML tags exist
        if label == "Text":
            inner = str(div.decode_contents()).strip()
            if not _INNER_TAG_RE.search(inner):
                div.clear()
                div.append(BeautifulSoup(f"<p>{inner}</p>", "html.parser"))

        parts.append(str(div.decode_contents()))
    return "".join(parts)


class Markdownify(MarkdownConverter):
    """Markdown converter with math delimiters, raw tables and ``$`` escaping.

    Args:
        inline_math_delimiters: ``(open, close)`` pair placed around inline math.
        block_math_delimiters: ``(open, close)`` pair placed around block math.
        **kwargs: Options forwarded to ``MarkdownConverter``.
    """

    def __init__(
        self,
        inline_math_delimiters,
        block_math_delimiters,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.inline_math_delimiters = inline_math_delimiters
        self.block_math_delimiters = block_math_delimiters

    def convert_math(self, el, text, parent_tags):
        """Wrap ``<math>`` content in the block or inline math delimiters."""
        body = text.strip()
        if el.has_attr("display") and el["display"] == "block":
            opening, closing = self.block_math_delimiters[:2]
            return "\n" + opening + body + closing + "\n"
        opening, closing = self.inline_math_delimiters[:2]
        return " " + opening + body + closing + " "

    def convert_table(self, el, text, parent_tags):
        """Emit tables as raw HTML surrounded by blank lines."""
        return "\n\n" + str(el) + "\n\n"

    def convert_a(self, el, text, parent_tags):
        """Convert a link after escaping its text."""
        text = self.escape(text)
        # Escape brackets and parentheses in text
        text = re.sub(r"([\[\]()])", r"\\\1", text)
        return super().convert_a(el, text, parent_tags)

    def escape(self, text, parent_tags=None):
        """Apply the base escaping, then escape ``$`` when the option is set."""
        text = super().escape(text, parent_tags)
        # .get() keeps converters built without the custom option working.
        if self.options.get("escape_dollars"):
            text = text.replace("$", r"\$")
        return text

    def process_text(self, el, parent_tags=None):
        """Return the text of a text node, normalized and escaped."""
        text = six.text_type(el) or ""

        # normalize whitespace if we're not inside a preformatted element
        if not el.find_parent("pre"):
            text = re_whitespace.sub(" ", text)

        # escape special characters if we're not inside a preformatted or code element
        if not el.find_parent(["pre", "code", "kbd", "samp", "math"]):
            text = self.escape(text)

        # remove trailing whitespaces if any of the following condition is true:
        # - current text node is the last node in li
        # - current text node is followed by an embedded list
        if el.parent.name == "li" and (
            not el.next_sibling or el.next_sibling.name in ["ul", "ol"]
        ):
            text = text.rstrip()

        return text


def parse_markdown(
    html: str, include_headers_footers: bool = False, include_images: bool = True
) -> str:
    """Convert block HTML to Markdown.

    The HTML is first cleaned with ``parse_html`` and then converted with
    ``Markdownify``. Conversion is best effort: if the converter raises, the
    error is logged and an empty string is returned.
    """
    html = parse_html(html, include_headers_footers, include_images)

    md_cls = Markdownify(
        heading_style="ATX",
        bullets="-",
        escape_misc=False,
        escape_underscores=True,
        escape_asterisks=True,
        escape_dollars=True,
        sub_symbol="",
        sup_symbol="",
        inline_math_delimiters=("$", "$"),
        block_math_delimiters=("$$", "$$"),
    )
    try:
        markdown = md_cls.convert(html)
    except Exception as e:
        logger.exception("Error converting HTML to Markdown: %s", e)
        markdown = ""
    return markdown.strip()


@dataclass
class LayoutBlock:
    """One top-level block of the page layout."""

    bbox: list[int]  # [x0, y0, x1, y1], scaled to image pixels by parse_layout
    label: str  # value of the block's data-label attribute
    content: str  # inner HTML of the block


def _parse_bbox(raw) -> list[int]:
    """Parse a ``data-bbox`` attribute value into four ints.

    Accepts a JSON sequence (``"[1, 2, 3, 4]"``) or whitespace-separated
    values (``"1 2 3 4"``). Returns ``[0, 0, 1, 1]`` when the value is
    missing, does not have exactly four items, or has non-integer items.
    """
    try:
        values = json.loads(raw)
    except (TypeError, ValueError):
        # Not JSON (or attribute missing): try the whitespace-separated form.
        values = raw.split() if isinstance(raw, str) else None

    try:
        if len(values) != 4:
            return list(_DEFAULT_BBOX)
        return [int(value) for value in values]
    except (TypeError, ValueError, OverflowError):
        return list(_DEFAULT_BBOX)


def parse_layout(
    html: str, image: Image.Image, bbox_scale=settings.BBOX_SCALE
) -> list[LayoutBlock]:
    """Parse the top-level ``<div>`` blocks of ``html`` into layout blocks.

    Bounding boxes are read from ``data-bbox``, rescaled from the
    ``bbox_scale`` coordinate space to the pixel size of ``image``, and
    clamped to the image bounds. Blocks without a ``data-label`` get the
    label ``"block"``.
    """
    soup = BeautifulSoup(html, "html.parser")
    width, height = image.size
    width_scaler = width / bbox_scale
    height_scaler = height / bbox_scale

    layout_blocks = []
    for div in soup.find_all("div", recursive=False):
        x0, y0, x1, y1 = _parse_bbox(div.get("data-bbox"))
        # Normalize bbox
        bbox = [
            max(0, int(x0 * width_scaler)),
            max(0, int(y0 * height_scaler)),
            min(int(x1 * width_scaler), width),
            min(int(y1 * height_scaler), height),
        ]
        layout_blocks.append(
            LayoutBlock(
                bbox=bbox,
                label=div.get("data-label", "block"),
                content=str(div.decode_contents()),
            )
        )
    return layout_blocks


def parse_chunks(
    html: str, image: Image.Image, bbox_scale=settings.BBOX_SCALE
) -> list[dict]:
    """Return the layout blocks of ``html`` as plain dicts (see ``parse_layout``)."""
    layout = parse_layout(html, image, bbox_scale=bbox_scale)
    return [asdict(block) for block in layout]