From 42d2784c20e9ef776ee41ea4c690fc63c8fbb2df Mon Sep 17 00:00:00 2001
From: Silas Kieser
Date: Tue, 21 Jan 2025 11:18:54 +0100
Subject: [PATCH] clearer log messages for sentence segmentation

---
Notes (not part of the commit message):
 * chunk_completed_sentence: the old debug call passed the word list as a
   logging argument without a matching placeholder in the message; it is
   replaced by an f-string that logs the committed words joined with
   self.asr.sep. All messages of this method now carry a
   "[Sentence-segmentation]" prefix.
 * create_tokenizer: the "not supported by wtpsplit" notice is raised from
   debug to warning level.

 src/whisper_streaming/online_asr.py | 12 +++++++++---
 whisper_online.py                   |  2 +-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py
index dc34fd8..4b4f1ad 100644
--- a/src/whisper_streaming/online_asr.py
+++ b/src/whisper_streaming/online_asr.py
@@ -194,10 +194,16 @@ class OnlineASRProcessor:
     def chunk_completed_sentence(self):
         if self.commited == []:
             return
-        logger.debug("COMPLETED SENTENCE: ", [s[2] for s in self.commited])
+
+        raw_text = self.asr.sep.join([s[2] for s in self.commited])
+        logger.debug(f"[Sentence-segmentation] Raw Text: {raw_text}")
+
         sents = self.words_to_sentences(self.commited)
+
+
+
         for s in sents:
-            logger.debug(f"\t\tSENT: {s}")
+            logger.debug(f"[Sentence-segmentation] completed sentence: {s}")
         if len(sents) < 2:
             return
         while len(sents) > 2:
@@ -205,7 +211,7 @@ class OnlineASRProcessor:
         # we will continue with audio processing at this timestamp
         chunk_at = sents[-2][1]
 
-        logger.debug(f"--- sentence chunked at {chunk_at:2.2f}")
+        logger.debug(f"[Sentence-segmentation]: sentence chunked at {chunk_at:2.2f}")
         self.chunk_at(chunk_at)
 
     def chunk_completed_segment(self, res):
diff --git a/whisper_online.py b/whisper_online.py
index 077e660..55a1183 100644
--- a/whisper_online.py
+++ b/whisper_online.py
@@ -58,7 +58,7 @@ def create_tokenizer(lan):
         lan
         in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split()
     ):
-        logger.debug(
+        logger.warning(
             f"{lan} code is not supported by wtpsplit. Going to use None lang_code option."
         )
         lan = None