Improve sentence tokenization handling: fall back to list input for tokenizers such as MosesSentenceSplitter that reject a plain string

2026-03-07 22:33:36 +00:00 · 2025-02-23 23:41:15 +01:00
parent 296327071d
commit d89622b9c2
1 changed file with 18 additions and 4 deletions
--- a/src/whisper_streaming/online_asr.py
+++ b/src/whisper_streaming/online_asr.py
@@ -263,11 +263,26 @@ class OnlineASRProcessor:

    def words_to_sentences(self, tokens: List[ASRToken]) -> List[Sentence]:
        """
-        Converts a list of tokens to a list of Sentence objects by using the provided
+        Converts a list of tokens to a list of Sentence objects using the provided
        sentence tokenizer.
        """
+        if not tokens:
+            return []
+
        full_text = " ".join(token.text for token in tokens)
-        sentence_texts = self.tokenize(full_text) if self.tokenize else [full_text]
+
+        if self.tokenize:
+            try:
+                sentence_texts = self.tokenize(full_text)
+            except Exception as e:
+                # Some tokenizers (e.g., MosesSentenceSplitter) expect a list input.
+                try:
+                    sentence_texts = self.tokenize([full_text])
+                except Exception as e2:
+                    raise ValueError("Tokenization failed") from e2
+        else:
+            sentence_texts = [full_text]
+
        sentences: List[Sentence] = []
        token_index = 0
        for sent_text in sentence_texts:
@@ -276,7 +291,7 @@ class OnlineASRProcessor:
                continue
            sent_tokens = []
            accumulated = ""
-            # Accumulate tokens until roughly matching the sentence text.
+            # Accumulate tokens until roughly matching the length of the sentence text.
            while token_index < len(tokens) and len(accumulated) < len(sent_text):
                token = tokens[token_index]
                accumulated = (accumulated + " " + token.text).strip() if accumulated else token.text
@@ -290,7 +305,6 @@ class OnlineASRProcessor:
                )
                sentences.append(sentence)
        return sentences
-
    def finish(self) -> Transcript:
        """
        Flush the remaining transcript when processing ends.