mirror of
https://github.com/datalab-to/chandra.git
synced 2025-11-30 17:13:11 +00:00
85 lines
2.3 KiB
Python
85 lines
2.3 KiB
Python
import math
|
|
from typing import Tuple
|
|
|
|
from PIL import Image
|
|
|
|
from chandra.output import parse_markdown
|
|
|
|
|
|
def scale_to_fit(
|
|
img: Image.Image,
|
|
max_size: Tuple[int, int] = (3072, 2048),
|
|
min_size: Tuple[int, int] = (28, 28),
|
|
):
|
|
resample_method = Image.Resampling.LANCZOS
|
|
|
|
width, height = img.size
|
|
|
|
# Check for empty or invalid image
|
|
if width == 0 or height == 0:
|
|
return img
|
|
|
|
max_width, max_height = max_size
|
|
min_width, min_height = min_size
|
|
|
|
current_pixels = width * height
|
|
max_pixels = max_width * max_height
|
|
min_pixels = min_width * min_height
|
|
|
|
if current_pixels > max_pixels:
|
|
scale_factor = (max_pixels / current_pixels) ** 0.5
|
|
|
|
new_width = math.floor(width * scale_factor)
|
|
new_height = math.floor(height * scale_factor)
|
|
elif current_pixels < min_pixels:
|
|
scale_factor = (min_pixels / current_pixels) ** 0.5
|
|
|
|
new_width = math.ceil(width * scale_factor)
|
|
new_height = math.ceil(height * scale_factor)
|
|
else:
|
|
return img
|
|
|
|
return img.resize((new_width, new_height), resample=resample_method)
|
|
|
|
|
|
def detect_repeat_token(
|
|
predicted_tokens: str,
|
|
base_max_repeats: int = 4,
|
|
window_size: int = 500,
|
|
cut_from_end: int = 0,
|
|
scaling_factor: float = 3.0,
|
|
):
|
|
try:
|
|
predicted_tokens = parse_markdown(predicted_tokens)
|
|
except Exception as e:
|
|
print(f"Error parsing markdown: {e}")
|
|
return True
|
|
|
|
if cut_from_end > 0:
|
|
predicted_tokens = predicted_tokens[:-cut_from_end]
|
|
|
|
for seq_len in range(1, window_size // 2 + 1):
|
|
# Extract the potential repeating sequence from the end
|
|
candidate_seq = predicted_tokens[-seq_len:]
|
|
|
|
# Inverse scaling: shorter sequences need more repeats
|
|
max_repeats = int(base_max_repeats * (1 + scaling_factor / seq_len))
|
|
|
|
# Count how many times this sequence appears consecutively at the end
|
|
repeat_count = 0
|
|
pos = len(predicted_tokens) - seq_len
|
|
if pos < 0:
|
|
continue
|
|
|
|
while pos >= 0:
|
|
if predicted_tokens[pos : pos + seq_len] == candidate_seq:
|
|
repeat_count += 1
|
|
pos -= seq_len
|
|
else:
|
|
break
|
|
|
|
if repeat_count > max_repeats:
|
|
return True
|
|
|
|
return False
|