Fix max repeats

This commit is contained in:
Vik Paruchuri
2025-11-03 17:11:51 -05:00
parent 4b01146865
commit aabfed2ed3
2 changed files with 6 additions and 4 deletions

View File

@@ -44,9 +44,10 @@ def scale_to_fit(
def detect_repeat_token( def detect_repeat_token(
predicted_tokens: str, predicted_tokens: str,
max_repeats: int = 4, base_max_repeats: int = 4,
window_size: int = 500, window_size: int = 500,
cut_from_end: int = 0, cut_from_end: int = 0,
scaling_factor: float = 3.0,
): ):
try: try:
predicted_tokens = parse_markdown(predicted_tokens) predicted_tokens = parse_markdown(predicted_tokens)
@@ -57,11 +58,13 @@ def detect_repeat_token(
if cut_from_end > 0: if cut_from_end > 0:
predicted_tokens = predicted_tokens[:-cut_from_end] predicted_tokens = predicted_tokens[:-cut_from_end]
# Try different sequence lengths (1 to window_size//2)
for seq_len in range(1, window_size // 2 + 1): for seq_len in range(1, window_size // 2 + 1):
# Extract the potential repeating sequence from the end # Extract the potential repeating sequence from the end
candidate_seq = predicted_tokens[-seq_len:] candidate_seq = predicted_tokens[-seq_len:]
# Inverse scaling: shorter sequences need more repeats
max_repeats = int(base_max_repeats * (1 + scaling_factor / seq_len))
# Count how many times this sequence appears consecutively at the end # Count how many times this sequence appears consecutively at the end
repeat_count = 0 repeat_count = 0
pos = len(predicted_tokens) - seq_len pos = len(predicted_tokens) - seq_len
@@ -75,7 +78,6 @@ def detect_repeat_token(
else: else:
break break
# If we found more than max_repeats consecutive occurrences
if repeat_count > max_repeats: if repeat_count > max_repeats:
return True return True

View File

@@ -1,6 +1,6 @@
[project] [project]
name = "chandra-ocr" name = "chandra-ocr"
version = "0.1.8" version = "0.1.9"
description = "OCR model that converts documents to markdown, HTML, or JSON." description = "OCR model that converts documents to markdown, HTML, or JSON."
readme = "README.md" readme = "README.md"
requires-python = ">=3.10" requires-python = ">=3.10"