diff --git a/whisperlivekit/simul_whisper/simul_whisper.py b/whisperlivekit/simul_whisper/simul_whisper.py index 55a9ce0..104db15 100644 --- a/whisperlivekit/simul_whisper/simul_whisper.py +++ b/whisperlivekit/simul_whisper/simul_whisper.py @@ -484,7 +484,17 @@ class AlignAtt: accumulated_cross_attns = [] + audio_duration_s = self.segments_len() + max_tokens_per_chunk = max(50, int(audio_duration_s * TOKENS_PER_SECOND * 2.0)) # 2x margin, min 50 + tokens_produced_this_chunk = 0 + while not completed and current_tokens.shape[1] < self.max_text_len: # bos is 3 tokens + tokens_produced_this_chunk += 1 + + if tokens_produced_this_chunk > max_tokens_per_chunk: + logger.warning(f"[Loop Detection] Too many tokens ({tokens_produced_this_chunk}) for {audio_duration_s:.2f}s audio. Breaking.") + current_tokens = current_tokens[:, :token_len_before_decoding] # Discard all new tokens + break if new_segment: tokens_for_logits = current_tokens @@ -631,11 +641,15 @@ class AlignAtt: ) timestamped_words.append(timestamp_entry) - # Hold incomplete tokens for next chunk + # Hold incomplete tokens for next chunk (with limit to prevent hallucination accumulation) self.state.pending_incomplete_tokens = [] + MAX_PENDING_TOKENS = 10 # Real incomplete UTF-8 chars are at most a few tokens if split_words and replacement_char in split_words[-1]: - self.state.pending_incomplete_tokens = split_tokens[-1] - logger.warning(f"[UTF-8 Fix] Holding {len(self.state.pending_incomplete_tokens)} incomplete tokens for next chunk: {self.state.pending_incomplete_tokens}") + if len(split_tokens[-1]) <= MAX_PENDING_TOKENS: + self.state.pending_incomplete_tokens = split_tokens[-1] + logger.debug(f"[UTF-8 Fix] Holding {len(self.state.pending_incomplete_tokens)} incomplete tokens for next chunk") + else: + logger.warning(f"[UTF-8 Fix] Skipping {len(split_tokens[-1])} tokens (exceeds limit of {MAX_PENDING_TOKENS}, likely hallucination)") return timestamped_words @@ -702,4 +716,4 @@ class AlignAtt: attn_of_alignment_heads = median_filter(attn_of_alignment_heads, 7) attn_of_alignment_heads = attn_of_alignment_heads.mean(dim=1) attn_of_alignment_heads = attn_of_alignment_heads[:, :, :content_mel_len] - return attn_of_alignment_heads + return attn_of_alignment_heads \ No newline at end of file