From b63f54e8387438ae8438e2499cd6636daee6a60f Mon Sep 17 00:00:00 2001 From: Chingning Chen Date: Mon, 2 Mar 2026 15:31:43 +0800 Subject: [PATCH] fix(whisper/tokenizer): prevent IndexError from crashing multilingual streams This fix addresses a critical bug in the Whisper tokenizer that causes the transcription server to crash with an `IndexError: string index out of range` when streaming audio in languages that use multi-byte UTF-8 characters (e.g., Cantonese, Japanese, Mandarin). When a 3-byte character is cut off at the boundary of an audio chunk, incomplete bytes are decoded into a single Unicode replacement character (`\ufffd`), artificially shortening the string and breaking the offset mapping assumed by `split_tokens_on_unicode`. This ports the upstream fix from SYSTRAN/faster-whisper (PR #111) to add a strict bounds check before indexing into the string, allowing incomplete bytes to be safely caught and handled in the next chunk. --- whisperlivekit/whisper/tokenizer.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/whisperlivekit/whisper/tokenizer.py b/whisperlivekit/whisper/tokenizer.py index 2af8375..2cb6be7 100644 --- a/whisperlivekit/whisper/tokenizer.py +++ b/whisperlivekit/whisper/tokenizer.py @@ -296,10 +296,13 @@ class Tokenizer: current_tokens.append(token) decoded = self.decode_with_timestamps(current_tokens) - if ( - replacement_char not in decoded - or decoded_full[unicode_offset + decoded.index(replacement_char)] - == replacement_char + try: + replacement_char_index = decoded.index(replacement_char) + replacement_char_index += unicode_offset + except ValueError: + replacement_char_index = None + + if replacement_char_index is None or ( + replacement_char_index < len(decoded_full) + and decoded_full[replacement_char_index] == replacement_char ): words.append(decoded) word_tokens.append(current_tokens)