From 14c2bbef87404f5208760466e3a9a74940388d0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dominik=20Mach=C3=A1=C4=8Dek?= <machacek@ufal.mff.cuni.cz>
Date: Sun, 18 Aug 2024 20:33:08 +0200
Subject: [PATCH] removing duplicated code -- whisper_online_vac

---
 voice_activity_controller.py |   1 +
 whisper_online.py            |  66 +++++++++++-
 whisper_online_server.py     |   4 +-
 whisper_online_vac.py        | 203 -----------------------------------
 4 files changed, 66 insertions(+), 208 deletions(-)
 delete mode 100644 whisper_online_vac.py

diff --git a/voice_activity_controller.py b/voice_activity_controller.py
index ccfbea7..bcf9b04 100644
--- a/voice_activity_controller.py
+++ b/voice_activity_controller.py
@@ -48,6 +48,7 @@ class VoiceActivityController:
             silence_in_wav)
 
         """
+        print("applying vad here")
         x = audio
         if not torch.is_tensor(x):
             try:
diff --git a/whisper_online.py b/whisper_online.py
index 20a9b79..de7cbf4 100644
--- a/whisper_online.py
+++ b/whisper_online.py
@@ -517,6 +517,59 @@ class OnlineASRProcessor:
             e = offset + sents[-1][1]
         return (b,e,t)
 
+class VACOnlineASRProcessor(OnlineASRProcessor):
+    '''Wraps OnlineASRProcessor with VAC (Voice Activity Controller).
+
+    It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds),
+    it runs VAD and continuously detects whether there is speech or not.
+    When it detects end of speech (non-voice for 500ms), it makes OnlineASRProcessor end the utterance immediately.
+    '''
+
+    def __init__(self, online_chunk_size, *a, **kw):
+        '''online_chunk_size: seconds of audio to collect between ASR updates; *a, **kw: passed to OnlineASRProcessor.'''
+        self.online_chunk_size = online_chunk_size
+        self.online = OnlineASRProcessor(*a, **kw)
+        # imported here, so voice_activity_controller is loaded only when this class is instantiated
+        from voice_activity_controller import VoiceActivityController
+        self.vac = VoiceActivityController(use_vad_result = False)
+        self.logfile = self.online.logfile
+        self.init()
+
+    def init(self):
+        '''Resets the wrapped processor, the VAC states, the chunk counter and the end-of-speech flag.'''
+        self.online.init()
+        self.vac.reset_states()
+        self.current_online_chunk_buffer_size = 0
+        self.is_currently_final = False
+
+    def insert_audio_chunk(self, audio):
+        '''Runs VAC on the chunk and passes the audio it returns to the wrapped processor.'''
+        # keep stdout clean: it is used for the transcript output, so nothing is printed here
+        audio, is_final = self.vac.detect_speech_iter(audio,audio_in_int16=False)
+        self.is_currently_final = is_final
+        self.online.insert_audio_chunk(audio)
+        self.current_online_chunk_buffer_size += len(audio)
+
+    def process_iter(self):
+        '''Returns the wrapped processor's output, or (None, None, "") when no ASR update is due yet.'''
+        if self.is_currently_final:
+            # VAC flagged the last inserted chunk as final: close the utterance right away
+            return self.finish()
+        elif self.current_online_chunk_buffer_size > self.SAMPLING_RATE*self.online_chunk_size:
+            self.current_online_chunk_buffer_size = 0
+            return self.online.process_iter()
+        else:
+            print("no online update, only VAD", file=self.logfile)
+            return (None, None, "")
+
+    def finish(self):
+        '''Calls finish() on the wrapped processor, re-initializes it with keep_offset=True, returns the finish() result.'''
+        ret = self.online.finish()
+        self.online.init(keep_offset=True)
+        self.current_online_chunk_buffer_size = 0
+        return ret
+
+
 WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(",")
 
 def create_tokenizer(lan):
@@ -561,6 +614,8 @@ def add_shared_args(parser):
     parser.add_argument('--lan', '--language', type=str, default='auto', help="Source language code, e.g. en,de,cs, or 'auto' for language detection.")
     parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe","translate"],help="Transcribe or translate.")
     parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped", "openai-api"],help='Load only this backend for Whisper processing.')
+    parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.')
+    parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
     parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
     parser.add_argument('--buffer_trimming', type=str, default="segment", choices=["sentence", "segment"],help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.')
     parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.')
@@ -607,7 +662,11 @@ def asr_factory(args, logfile=sys.stderr):
         tokenizer = None
 
     # Create the OnlineASRProcessor
-    online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+    if args.vac:
+        
+        online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+    else:
+        online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
     return asr, online
 
@@ -652,7 +711,10 @@ if __name__ == "__main__":
     logger.info("Audio duration is: %2.2f seconds" % duration)
 
     asr, online = asr_factory(args, logfile=logfile)
-    min_chunk = args.min_chunk_size
+    if args.vac:
+        min_chunk = args.vac_chunk_size
+    else:
+        min_chunk = args.min_chunk_size
 
     # load the audio into the LRU cache before we start the timer
     a = load_audio_chunk(audio_path,0,1)
diff --git a/whisper_online_server.py b/whisper_online_server.py
index 959020e..3892329 100644
--- a/whisper_online_server.py
+++ b/whisper_online_server.py
@@ -13,8 +13,6 @@ parser = argparse.ArgumentParser()
 # server options
 parser.add_argument("--host", type=str, default='localhost')
 parser.add_argument("--port", type=int, default=43007)
-parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.')
-parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
 parser.add_argument("--warmup-file", type=str, dest="warmup_file", 
         help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
 
@@ -108,7 +106,7 @@ class ServerProcessor:
             raw_bytes = self.connection.non_blocking_receive_audio()
             if not raw_bytes:
                 break
-            print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
+#            print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
             sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
             audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32)
             out.append(audio)
diff --git a/whisper_online_vac.py b/whisper_online_vac.py
deleted file mode 100644
index f323d38..0000000
--- a/whisper_online_vac.py
+++ /dev/null
@@ -1,203 +0,0 @@
-from whisper_online import *
-from voice_activity_controller import *
-import soundfile
-import io
-
-SAMPLING_RATE = 16000
-
-class VACOnlineASRProcessor(OnlineASRProcessor):
-
-    def __init__(self, online_chunk_size, *a, **kw):
-        self.online_chunk_size = online_chunk_size
-
-        self.online = OnlineASRProcessor(*a, **kw)
-        self.vac = VoiceActivityController(use_vad_result = False)
-
-        self.logfile = self.online.logfile
-
-        self.init()
-
-    def init(self):
-        self.online.init()
-        self.vac.reset_states()
-        self.current_online_chunk_buffer_size = 0
-        self.is_currently_final = False
-
-
-    def insert_audio_chunk(self, audio):
-        r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
-        audio, is_final = r
-        print(is_final)
-        self.is_currently_final = is_final
-        self.online.insert_audio_chunk(audio)
-        self.current_online_chunk_buffer_size += len(audio)
-
-    def process_iter(self):
-        if self.is_currently_final:
-            return self.finish()
-        elif self.current_online_chunk_buffer_size > SAMPLING_RATE*self.online_chunk_size:
-            self.current_online_chunk_buffer_size = 0
-            ret = self.online.process_iter()
-            return ret
-        else:
-            print("no online update, only VAD", file=self.logfile)
-            return (None, None, "")
-
-    def finish(self):
-        ret = self.online.finish()
-        self.online.init(keep_offset=True)
-        self.current_online_chunk_buffer_size = 0
-        return ret
-
-
-
-
-if __name__ == "__main__":
-
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument('audio_path', type=str, help="Filename of 16kHz mono channel wav, on which live streaming is simulated.")
-    add_shared_args(parser)
-    parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
-    parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
-    parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
-    parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.') 
-    args = parser.parse_args()
-
-    # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
-    logfile = sys.stderr
-
-    if args.offline and args.comp_unaware:
-        print("No or one option from --offline and --comp_unaware are available, not both. Exiting.",file=logfile)
-        sys.exit(1)
-
-    audio_path = args.audio_path
-
-    SAMPLING_RATE = 16000
-    duration = len(load_audio(audio_path))/SAMPLING_RATE
-    print("Audio duration is: %2.2f seconds" % duration, file=logfile)
-
-    size = args.model
-    language = args.lan
-
-    t = time.time()
-    print(f"Loading Whisper {size} model for {language}...",file=logfile,end=" ",flush=True)
-
-    if args.backend == "faster-whisper":
-        asr_cls = FasterWhisperASR
-    else:
-        asr_cls = WhisperTimestampedASR
-
-    asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
-
-    if args.task == "translate":
-        asr.set_translate_task()
-        tgt_language = "en"  # Whisper translates into English
-    else:
-        tgt_language = language  # Whisper transcribes in this language
-
-
-    e = time.time()
-    print(f"done. It took {round(e-t,2)} seconds.",file=logfile)
-
-    if args.vad:
-        print("setting VAD filter",file=logfile)
-        asr.use_vad()
-
-    
-    min_chunk = args.vac_chunk_size
-    if args.buffer_trimming == "sentence":
-        tokenizer = create_tokenizer(tgt_language)
-    else:
-        tokenizer = None
-    online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
-
-
-    # load the audio into the LRU cache before we start the timer
-    a = load_audio_chunk(audio_path,0,1)
-
-    # warm up the ASR, because the very first transcribe takes much more time than the other
-    asr.transcribe(a)
-
-    beg = args.start_at
-    start = time.time()-beg
-
-    def output_transcript(o, now=None):
-        # output format in stdout is like:
-        # 4186.3606 0 1720 Takhle to je
-        # - the first three words are:
-        #    - emission time from beginning of processing, in milliseconds
-        #    - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
-        # - the next words: segment transcript
-        if now is None:
-            now = time.time()-start
-        if o[0] is not None:
-            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),file=logfile,flush=True)
-            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),flush=True)
-        else:
-            print(o,file=logfile,flush=True)
-
-    if args.offline: ## offline mode processing (for testing/debugging)
-        a = load_audio(audio_path)
-        online.insert_audio_chunk(a)
-        try:
-            o = online.process_iter()
-        except AssertionError:
-            print("assertion error",file=logfile)
-            pass
-        else:
-            output_transcript(o)
-        now = None
-    elif args.comp_unaware:  # computational unaware mode 
-        end = beg + min_chunk
-        while True:
-            a = load_audio_chunk(audio_path,beg,end)
-            online.insert_audio_chunk(a)
-            try:
-                o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
-                pass
-            else:
-                output_transcript(o, now=end)
-
-            print(f"## last processed {end:.2f}s",file=logfile,flush=True)
-
-            if end >= duration:
-                break
-
-            beg = end
-
-            if end + min_chunk > duration:
-                end = duration
-            else:
-                end += min_chunk
-        now = duration
-
-    else: # online = simultaneous mode
-        end = 0
-        while True:
-            now = time.time() - start
-            if now < end+min_chunk:
-                time.sleep(min_chunk+end-now)
-            end = time.time() - start
-            a = load_audio_chunk(audio_path,beg,end)
-            beg = end
-            online.insert_audio_chunk(a)
-
-            try:
-                o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
-                pass
-            else:
-                output_transcript(o)
-            now = time.time() - start
-            print(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}",file=logfile,flush=True)
-
-            if end >= duration:
-                break
-        now = None
-
-    o = online.finish()
-    output_transcript(o, now=now)