simulstreaming: cap tokens generated per chunk and pending incomplete tokens to prevent hallucinations

2026-03-07 14:23:18 +00:00 · 2025-11-28 21:41:19 +01:00
parent 45bf3f57d7
commit d45c397c6a
1 changed files with 18 additions and 4 deletions
--- a/whisperlivekit/simul_whisper/simul_whisper.py
+++ b/whisperlivekit/simul_whisper/simul_whisper.py
@@ -484,7 +484,17 @@ class AlignAtt:
        
        accumulated_cross_attns = []
        
+        audio_duration_s = self.segments_len()
+        max_tokens_per_chunk = max(50, int(audio_duration_s * TOKENS_PER_SECOND * 2.0))  # 2x margin, min 50
+        tokens_produced_this_chunk = 0
+        
        while not completed and current_tokens.shape[1] < self.max_text_len:  # bos is 3 tokens
+            tokens_produced_this_chunk += 1
+            
+            if tokens_produced_this_chunk > max_tokens_per_chunk:
+                logger.warning(f"[Loop Detection] Too many tokens ({tokens_produced_this_chunk}) for {audio_duration_s:.2f}s audio. Breaking.")
+                current_tokens = current_tokens[:, :token_len_before_decoding]  # Discard all new tokens
+                break

            if new_segment:
                tokens_for_logits = current_tokens
@@ -631,11 +641,15 @@ class AlignAtt:
            )
            timestamped_words.append(timestamp_entry)

-        # Hold incomplete tokens for next chunk
+        # Hold incomplete tokens for next chunk (with limit to prevent hallucination accumulation)
        self.state.pending_incomplete_tokens = []
+        MAX_PENDING_TOKENS = 10  # Real incomplete UTF-8 chars are at most a few tokens
        if split_words and replacement_char in split_words[-1]:
-            self.state.pending_incomplete_tokens = split_tokens[-1]
-            logger.warning(f"[UTF-8 Fix] Holding {len(self.state.pending_incomplete_tokens)} incomplete tokens for next chunk: {self.state.pending_incomplete_tokens}")
+            if len(split_tokens[-1]) <= MAX_PENDING_TOKENS:
+                self.state.pending_incomplete_tokens = split_tokens[-1]
+                logger.debug(f"[UTF-8 Fix] Holding {len(self.state.pending_incomplete_tokens)} incomplete tokens for next chunk")
+            else:
+                logger.warning(f"[UTF-8 Fix] Skipping {len(split_tokens[-1])} tokens (exceeds limit of {MAX_PENDING_TOKENS}, likely hallucination)")

        return timestamped_words

@@ -702,4 +716,4 @@ class AlignAtt:
        attn_of_alignment_heads = median_filter(attn_of_alignment_heads, 7)
        attn_of_alignment_heads = attn_of_alignment_heads.mean(dim=1)
        attn_of_alignment_heads = attn_of_alignment_heads[:, :, :content_mel_len]
-        return attn_of_alignment_heads
+        return attn_of_alignment_heads