mirror of
https://github.com/datalab-to/chandra.git
synced 2025-11-29 00:23:12 +00:00
Fix max repeats
This commit is contained in:
@@ -44,9 +44,10 @@ def scale_to_fit(
|
|||||||
|
|
||||||
def detect_repeat_token(
|
def detect_repeat_token(
|
||||||
predicted_tokens: str,
|
predicted_tokens: str,
|
||||||
max_repeats: int = 4,
|
base_max_repeats: int = 4,
|
||||||
window_size: int = 500,
|
window_size: int = 500,
|
||||||
cut_from_end: int = 0,
|
cut_from_end: int = 0,
|
||||||
|
scaling_factor: float = 3.0,
|
||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
predicted_tokens = parse_markdown(predicted_tokens)
|
predicted_tokens = parse_markdown(predicted_tokens)
|
||||||
@@ -57,11 +58,13 @@ def detect_repeat_token(
|
|||||||
if cut_from_end > 0:
|
if cut_from_end > 0:
|
||||||
predicted_tokens = predicted_tokens[:-cut_from_end]
|
predicted_tokens = predicted_tokens[:-cut_from_end]
|
||||||
|
|
||||||
# Try different sequence lengths (1 to window_size//2)
|
|
||||||
for seq_len in range(1, window_size // 2 + 1):
|
for seq_len in range(1, window_size // 2 + 1):
|
||||||
# Extract the potential repeating sequence from the end
|
# Extract the potential repeating sequence from the end
|
||||||
candidate_seq = predicted_tokens[-seq_len:]
|
candidate_seq = predicted_tokens[-seq_len:]
|
||||||
|
|
||||||
|
# Inverse scaling: shorter sequences need more repeats
|
||||||
|
max_repeats = int(base_max_repeats * (1 + scaling_factor / seq_len))
|
||||||
|
|
||||||
# Count how many times this sequence appears consecutively at the end
|
# Count how many times this sequence appears consecutively at the end
|
||||||
repeat_count = 0
|
repeat_count = 0
|
||||||
pos = len(predicted_tokens) - seq_len
|
pos = len(predicted_tokens) - seq_len
|
||||||
@@ -75,7 +78,6 @@ def detect_repeat_token(
|
|||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
|
|
||||||
# If we found more than max_repeats consecutive occurrences
|
|
||||||
if repeat_count > max_repeats:
|
if repeat_count > max_repeats:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "chandra-ocr"
|
name = "chandra-ocr"
|
||||||
version = "0.1.8"
|
version = "0.1.9"
|
||||||
description = "OCR model that converts documents to markdown, HTML, or JSON."
|
description = "OCR model that converts documents to markdown, HTML, or JSON."
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.10"
|
requires-python = ">=3.10"
|
||||||
|
|||||||
Reference in New Issue
Block a user