From d89622b9c2138d7fc04c5fa27c457f7aeab7bf44 Mon Sep 17 00:00:00 2001
From: Quentin Fuxa
Date: Sun, 23 Feb 2025 23:41:15 +0100
Subject: [PATCH] Improve sentence tokenization handling - MosesSentenceSplitter now works with list input

---
 src/whisper_streaming/online_asr.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py
index 0ad0bed..2aebe99 100644
--- a/src/whisper_streaming/online_asr.py
+++ b/src/whisper_streaming/online_asr.py
@@ -263,11 +263,26 @@ class OnlineASRProcessor:
 
     def words_to_sentences(self, tokens: List[ASRToken]) -> List[Sentence]:
         """
-        Converts a list of tokens to a list of Sentence objects by using the provided
+        Converts a list of tokens to a list of Sentence objects using the provided
         sentence tokenizer.
         """
+        if not tokens:
+            return []
+
         full_text = " ".join(token.text for token in tokens)
+
+        if self.tokenize:
+            try:
+                sentence_texts = self.tokenize(full_text)
+            except Exception as e:
+                # Some tokenizers (e.g., MosesSentenceSplitter) expect a list input.
+                try:
+                    sentence_texts = self.tokenize([full_text])
+                except Exception as e2:
+                    raise ValueError("Tokenization failed") from e2
+        else:
+            sentence_texts = [full_text]
+
         sentences: List[Sentence] = []
         token_index = 0
         for sent_text in sentence_texts:
@@ -276,7 +291,7 @@
                 continue
             sent_tokens = []
             accumulated = ""
-            # Accumulate tokens until roughly matching the sentence text.
+            # Accumulate tokens until roughly matching the length of the sentence text.
             while token_index < len(tokens) and len(accumulated) < len(sent_text):
                 token = tokens[token_index]
                 accumulated = (accumulated + " " + token.text).strip() if accumulated else token.text
@@ -290,7 +305,6 @@
             )
             sentences.append(sentence)
         return sentences
-
     def finish(self) -> Transcript:
         """
         Flush the remaining transcript when processing ends.
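
Note: the retry with [full_text] covers the two call conventions seen in sentence splitters. Some tokenizers are called with a plain string, while MosesSentenceSplitter from the mosestokenizer package is called with a list of lines and returns a list of sentences. A rough sketch of the difference, where naive_splitter is a made-up stand-in for a string-based tokenizer and the MosesSentenceSplitter usage follows that package's documented list-in/list-out convention:

    # Sketch of the two call conventions the fallback accounts for.
    from mosestokenizer import MosesSentenceSplitter

    def naive_splitter(text: str) -> list[str]:
        # Hypothetical string-based tokenizer: one string in, sentences out.
        return [s.strip() + "." for s in text.split(".") if s.strip()]

    full_text = "Hello world. This is a test."

    # String-based tokenizers take the text directly.
    print(naive_splitter(full_text))
    # ['Hello world.', 'This is a test.']

    # MosesSentenceSplitter expects a list of lines; passing the raw string
    # fails, which is what triggers the retry with [full_text] in the patch.
    splitter = MosesSentenceSplitter("en")
    print(splitter([full_text]))
    # ['Hello world.', 'This is a test.']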