From b63f54e8387438ae8438e2499cd6636daee6a60f Mon Sep 17 00:00:00 2001 From: Chingning Chen Date: Mon, 2 Mar 2026 15:31:43 +0800 Subject: [PATCH] fix(whisper/tokenizer): prevent IndexError from crashing multilingual streams This fix addresses a critical bug in the Whisper tokenizer that causes the transcription server to crash with an `IndexError: string index out of range` when streaming audio in languages that use multi-byte UTF-8 characters (e.g., Cantonese, Japanese, Mandarin). When a 3-byte character is cut off at the boundary of an audio chunk, incomplete bytes are decoded into a single Unicode replacement character (`\ufffd`), artificially shortening the string and breaking the offset mapping assumed by `split_tokens_on_unicode`. This ports the upstream fix from SYSTRAN/faster-whisper (PR #111) to add a strict bounds check before indexing into the string, allowing incomplete bytes to be safely caught and handled in the next chunk. --- whisperlivekit/whisper/tokenizer.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/whisperlivekit/whisper/tokenizer.py b/whisperlivekit/whisper/tokenizer.py index 2af8375..2cb6be7 100644 --- a/whisperlivekit/whisper/tokenizer.py +++ b/whisperlivekit/whisper/tokenizer.py @@ -296,10 +296,13 @@ class Tokenizer: current_tokens.append(token) decoded = self.decode_with_timestamps(current_tokens) - if ( - replacement_char not in decoded - or decoded_full[unicode_offset + decoded.index(replacement_char)] - == replacement_char + try: + replacement_char_index = decoded.index(replacement_char) + replacement_char_index += unicode_offset + except ValueError: + replacement_char_index = None + + if replacement_char_index is None or ( + replacement_char_index < len(decoded_full) + and decoded_full[replacement_char_index] == replacement_char ): words.append(decoded) word_tokens.append(current_tokens)