From 42935805817602ef4c10a569eec0134590df2d4d Mon Sep 17 00:00:00 2001
From: Silas Kieser
Date: Tue, 21 Jan 2025 12:06:03 +0100
Subject: [PATCH] use moses sentence segmenter instead of tokenizer

---
 src/whisper_streaming/online_asr.py | 41 ++++++++++++++++++-----------
 whisper_online.py                   |  7 ++---
 2 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py
index 4b4f1ad..522a243 100644
--- a/src/whisper_streaming/online_asr.py
+++ b/src/whisper_streaming/online_asr.py
@@ -87,11 +87,20 @@ class OnlineASRProcessor:
         buffer_trimming=("segment", 15),
         logfile=sys.stderr,
     ):
-        """asr: WhisperASR object
-        tokenize_method: sentence tokenizer function for the target language. Must be a callable and behaves like the one of MosesTokenizer. It can be None, if "segment" buffer trimming option is used, then tokenizer is not used at all.
-        ("segment", 15)
-        buffer_trimming: a pair of (option, seconds), where option is either "sentence" or "segment", and seconds is a number. Buffer is trimmed if it is longer than "seconds" threshold. Default is the most recommended option.
-        logfile: where to store the log.
+        """
+        Initialize OnlineASRProcessor.
+
+        Args:
+            asr: WhisperASR object
+            tokenize_method: Sentence tokenizer function for the target language.
+                Must be a function that takes a list of text as input like MosesSentenceSplitter.
+                Can be None if using "segment" buffer trimming option.
+            buffer_trimming: Tuple of (option, seconds) where:
+                - option: Either "sentence" or "segment"
+                - seconds: Number of seconds threshold for buffer trimming
+                Default is ("segment", 15)
+            logfile: File to store logs
+        """
         self.asr = asr
         self.tokenize = tokenize_method
@@ -194,24 +203,25 @@ class OnlineASRProcessor:
 
     def chunk_completed_sentence(self):
         if self.commited == []:
             return
-
-        raw_text = self.asr.sep.join([s[2] for s in self.commited])
-        logger.debug(f"[Sentence-segmentation] Raw Text: {raw_text}")
         sents = self.words_to_sentences(self.commited)
-        for s in sents:
-            logger.debug(f"[Sentence-segmentation] completed sentence: {s}")
         if len(sents) < 2:
+            logger.debug(f"[Sentence-segmentation] no sentence segmented.")
             return
-        while len(sents) > 2:
-            sents.pop(0)
+
+
+
+        identified_sentence = "\n - ".join([f"{s[0]*1000:.0f}-{s[1]*1000:.0f} {s[2]}" for s in sents])
+        logger.debug(f"[Sentence-segmentation] identified sentences:\n - {identified_sentence}")
+
+        # we will continue with audio processing at this timestamp
         chunk_at = sents[-2][1]
-        logger.debug(f"[Sentence-segmentation]: sentence chunked at {chunk_at:2.2f}")
+        logger.debug(f"[Sentence-segmentation]: sentence will be chunked at {chunk_at:2.2f}")
         self.chunk_at(chunk_at)
 
     def chunk_completed_segment(self, res):
@@ -249,8 +259,9 @@ class OnlineASRProcessor:
         """
         cwords = [w for w in words]
-        t = " ".join(o[2] for o in cwords)
-        s = self.tokenize(t)
+        t = self.asr.sep.join(o[2] for o in cwords)
+        logger.debug(f"[Sentence-segmentation] Raw Text: {t}")
+        s = self.tokenize([t])
         out = []
         while s:
             beg = None
diff --git a/whisper_online.py b/whisper_online.py
index 55a1183..f553d45 100644
--- a/whisper_online.py
+++ b/whisper_online.py
@@ -49,16 +49,16 @@ def create_tokenizer(lan):
         lan
         in "as bn ca cs de el en es et fi fr ga gu hi hu is it kn lt lv ml mni mr nl or pa pl pt ro ru sk sl sv ta te yue zh".split()
     ):
-        from mosestokenizer import MosesTokenizer
+        from mosestokenizer import MosesSentenceSplitter
 
-        return MosesTokenizer(lan)
+        return MosesSentenceSplitter(lan)
 
     # the following languages are in Whisper, but not in wtpsplit:
     if (
         lan
         in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split()
     ):
-        logger.warning(
+        logger.debug(
             f"{lan} code is not supported by wtpsplit. Going to use None lang_code option."
         )
         lan = None
@@ -204,6 +204,7 @@
 
     # Create the tokenizer
     if args.buffer_trimming == "sentence":
+        tokenizer = create_tokenizer(tgt_language)
     else:
         tokenizer = None