From d89622b9c2138d7fc04c5fa27c457f7aeab7bf44 Mon Sep 17 00:00:00 2001
From: Quentin Fuxa
Date: Sun, 23 Feb 2025 23:41:15 +0100
Subject: [PATCH] Improve sentence tokenization handling - MosesSentenceSplitter now works with list input

---
 src/whisper_streaming/online_asr.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py
index 0ad0bed..2aebe99 100644
--- a/src/whisper_streaming/online_asr.py
+++ b/src/whisper_streaming/online_asr.py
@@ -263,11 +263,26 @@ class OnlineASRProcessor:
 
     def words_to_sentences(self, tokens: List[ASRToken]) -> List[Sentence]:
         """
-        Converts a list of tokens to a list of Sentence objects by using the provided
+        Converts a list of tokens to a list of Sentence objects using the provided
         sentence tokenizer.
         """
+        if not tokens:
+            return []
+
         full_text = " ".join(token.text for token in tokens)
+
+        if self.tokenize:
+            try:
+                sentence_texts = self.tokenize(full_text)
+            except Exception as e:
+                # Some tokenizers (e.g., MosesSentenceSplitter) expect a list input.
+                try:
+                    sentence_texts = self.tokenize([full_text])
+                except Exception as e2:
+                    raise ValueError("Tokenization failed") from e2
+        else:
+            sentence_texts = [full_text]
+
         sentences: List[Sentence] = []
         token_index = 0
         for sent_text in sentence_texts:
@@ -276,7 +291,7 @@
                 continue
             sent_tokens = []
             accumulated = ""
-            # Accumulate tokens until roughly matching the sentence text.
+            # Accumulate tokens until roughly matching the length of the sentence text.
             while token_index < len(tokens) and len(accumulated) < len(sent_text):
                 token = tokens[token_index]
                 accumulated = (accumulated + " " + token.text).strip() if accumulated else token.text
@@ -290,7 +305,6 @@
             )
             sentences.append(sentence)
         return sentences
-
     def finish(self) -> Transcript:
         """
         Flush the remaining transcript when processing ends.
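
Note: the retry with [full_text] covers the two call conventions seen in sentence splitters. Some tokenizers are called with a plain string, while MosesSentenceSplitter from the mosestokenizer package is called with a list of lines and returns a list of sentences. A rough sketch of the difference, where naive_splitter is a made-up stand-in for a string-based tokenizer and the MosesSentenceSplitter usage follows that package's documented list-in/list-out convention:

    # Sketch of the two call conventions the fallback accounts for.
    from mosestokenizer import MosesSentenceSplitter

    def naive_splitter(text: str) -> list[str]:
        # Hypothetical string-based tokenizer: one string in, sentences out.
        return [s.strip() + "." for s in text.split(".") if s.strip()]

    full_text = "Hello world. This is a test."

    # String-based tokenizers take the text directly.
    print(naive_splitter(full_text))
    # ['Hello world.', 'This is a test.']

    # MosesSentenceSplitter expects a list of lines; passing the raw string
    # fails, which is what triggers the retry with [full_text] in the patch.
    splitter = MosesSentenceSplitter("en")
    print(splitter([full_text]))
    # ['Hello world.', 'This is a test.']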