From 14c2bbef87404f5208760466e3a9a74940388d0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20Mach=C3=A1=C4=8Dek?= Date: Sun, 18 Aug 2024 20:33:08 +0200 Subject: [PATCH] removing duplicated code -- whisper_online_vac --- voice_activity_controller.py | 1 + whisper_online.py | 66 +++++++++++- whisper_online_server.py | 4 +- whisper_online_vac.py | 203 ----------------------------------- 4 files changed, 66 insertions(+), 208 deletions(-) delete mode 100644 whisper_online_vac.py diff --git a/voice_activity_controller.py b/voice_activity_controller.py index ccfbea7..bcf9b04 100644 --- a/voice_activity_controller.py +++ b/voice_activity_controller.py @@ -48,6 +48,7 @@ class VoiceActivityController: silence_in_wav) """ + print("applying vad here") x = audio if not torch.is_tensor(x): try: diff --git a/whisper_online.py b/whisper_online.py index 20a9b79..de7cbf4 100644 --- a/whisper_online.py +++ b/whisper_online.py @@ -517,6 +517,59 @@ class OnlineASRProcessor: e = offset + sents[-1][1] return (b,e,t) +class VACOnlineASRProcessor(OnlineASRProcessor): + '''Wraps OnlineASRProcessor with VAC (Voice Activity Controller). + + It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds), + it runs VAD and continuously detects whether there is speech or not. + When it detects end of speech (non-voice for 500ms), it makes OnlineASRProcessor to end the utterance immediately. + ''' + + def __init__(self, online_chunk_size, *a, **kw): + self.online_chunk_size = online_chunk_size + + self.online = OnlineASRProcessor(*a, **kw) + from voice_activity_controller import VoiceActivityController + self.vac = VoiceActivityController(use_vad_result = False) + + self.logfile = self.online.logfile + + self.init() + + def init(self): + self.online.init() + self.vac.reset_states() + self.current_online_chunk_buffer_size = 0 + self.is_currently_final = False + + + def insert_audio_chunk(self, audio): + r = self.vac.detect_speech_iter(audio,audio_in_int16=False) + audio, is_final = r + print(is_final) + self.is_currently_final = is_final + self.online.insert_audio_chunk(audio) + self.current_online_chunk_buffer_size += len(audio) + + def process_iter(self): + if self.is_currently_final: + return self.finish() + elif self.current_online_chunk_buffer_size > self.SAMPLING_RATE*self.online_chunk_size: + self.current_online_chunk_buffer_size = 0 + ret = self.online.process_iter() + return ret + else: + print("no online update, only VAD", file=self.logfile) + return (None, None, "") + + def finish(self): + ret = self.online.finish() + self.online.init(keep_offset=True) + self.current_online_chunk_buffer_size = 0 + return ret + + + WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(",") def create_tokenizer(lan): @@ -561,6 +614,8 @@ def add_shared_args(parser): parser.add_argument('--lan', '--language', type=str, default='auto', help="Source language code, e.g. en,de,cs, or 'auto' for language detection.") parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe","translate"],help="Transcribe or translate.") parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped", "openai-api"],help='Load only this backend for Whisper processing.') + parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.') + parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.') parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.') parser.add_argument('--buffer_trimming', type=str, default="segment", choices=["sentence", "segment"],help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.') parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.') @@ -607,7 +662,11 @@ def asr_factory(args, logfile=sys.stderr): tokenizer = None # Create the OnlineASRProcessor - online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec)) + if args.vac: + + online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec)) + else: + online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec)) return asr, online @@ -652,7 +711,10 @@ if __name__ == "__main__": logger.info("Audio duration is: %2.2f seconds" % duration) asr, online = asr_factory(args, logfile=logfile) - min_chunk = args.min_chunk_size + if args.vac: + min_chunk = args.vac_chunk_size + else: + min_chunk = args.min_chunk_size # load the audio into the LRU cache before we start the timer a = load_audio_chunk(audio_path,0,1) diff --git a/whisper_online_server.py b/whisper_online_server.py index 959020e..3892329 100644 --- a/whisper_online_server.py +++ b/whisper_online_server.py @@ -13,8 +13,6 @@ parser = argparse.ArgumentParser() # server options parser.add_argument("--host", type=str, default='localhost') parser.add_argument("--port", type=int, default=43007) -parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.') -parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.') parser.add_argument("--warmup-file", type=str, dest="warmup_file", help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .") @@ -108,7 +106,7 @@ class ServerProcessor: raw_bytes = self.connection.non_blocking_receive_audio() if not raw_bytes: break - print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10]) +# print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10]) sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW") audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32) out.append(audio) diff --git a/whisper_online_vac.py b/whisper_online_vac.py deleted file mode 100644 index f323d38..0000000 --- a/whisper_online_vac.py +++ /dev/null @@ -1,203 +0,0 @@ -from whisper_online import * -from voice_activity_controller import * -import soundfile -import io - -SAMPLING_RATE = 16000 - -class VACOnlineASRProcessor(OnlineASRProcessor): - - def __init__(self, online_chunk_size, *a, **kw): - self.online_chunk_size = online_chunk_size - - self.online = OnlineASRProcessor(*a, **kw) - self.vac = VoiceActivityController(use_vad_result = False) - - self.logfile = self.online.logfile - - self.init() - - def init(self): - self.online.init() - self.vac.reset_states() - self.current_online_chunk_buffer_size = 0 - self.is_currently_final = False - - - def insert_audio_chunk(self, audio): - r = self.vac.detect_speech_iter(audio,audio_in_int16=False) - audio, is_final = r - print(is_final) - self.is_currently_final = is_final - self.online.insert_audio_chunk(audio) - self.current_online_chunk_buffer_size += len(audio) - - def process_iter(self): - if self.is_currently_final: - return self.finish() - elif self.current_online_chunk_buffer_size > SAMPLING_RATE*self.online_chunk_size: - self.current_online_chunk_buffer_size = 0 - ret = self.online.process_iter() - return ret - else: - print("no online update, only VAD", file=self.logfile) - return (None, None, "") - - def finish(self): - ret = self.online.finish() - self.online.init(keep_offset=True) - self.current_online_chunk_buffer_size = 0 - return ret - - - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('audio_path', type=str, help="Filename of 16kHz mono channel wav, on which live streaming is simulated.") - add_shared_args(parser) - parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.') - parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.') - parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.') - parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.') - args = parser.parse_args() - - # reset to store stderr to different file stream, e.g. open(os.devnull,"w") - logfile = sys.stderr - - if args.offline and args.comp_unaware: - print("No or one option from --offline and --comp_unaware are available, not both. Exiting.",file=logfile) - sys.exit(1) - - audio_path = args.audio_path - - SAMPLING_RATE = 16000 - duration = len(load_audio(audio_path))/SAMPLING_RATE - print("Audio duration is: %2.2f seconds" % duration, file=logfile) - - size = args.model - language = args.lan - - t = time.time() - print(f"Loading Whisper {size} model for {language}...",file=logfile,end=" ",flush=True) - - if args.backend == "faster-whisper": - asr_cls = FasterWhisperASR - else: - asr_cls = WhisperTimestampedASR - - asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir) - - if args.task == "translate": - asr.set_translate_task() - tgt_language = "en" # Whisper translates into English - else: - tgt_language = language # Whisper transcribes in this language - - - e = time.time() - print(f"done. It took {round(e-t,2)} seconds.",file=logfile) - - if args.vad: - print("setting VAD filter",file=logfile) - asr.use_vad() - - - min_chunk = args.vac_chunk_size - if args.buffer_trimming == "sentence": - tokenizer = create_tokenizer(tgt_language) - else: - tokenizer = None - online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec)) - - - # load the audio into the LRU cache before we start the timer - a = load_audio_chunk(audio_path,0,1) - - # warm up the ASR, because the very first transcribe takes much more time than the other - asr.transcribe(a) - - beg = args.start_at - start = time.time()-beg - - def output_transcript(o, now=None): - # output format in stdout is like: - # 4186.3606 0 1720 Takhle to je - # - the first three words are: - # - emission time from beginning of processing, in milliseconds - # - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway - # - the next words: segment transcript - if now is None: - now = time.time()-start - if o[0] is not None: - print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),file=logfile,flush=True) - print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),flush=True) - else: - print(o,file=logfile,flush=True) - - if args.offline: ## offline mode processing (for testing/debugging) - a = load_audio(audio_path) - online.insert_audio_chunk(a) - try: - o = online.process_iter() - except AssertionError: - print("assertion error",file=logfile) - pass - else: - output_transcript(o) - now = None - elif args.comp_unaware: # computational unaware mode - end = beg + min_chunk - while True: - a = load_audio_chunk(audio_path,beg,end) - online.insert_audio_chunk(a) - try: - o = online.process_iter() - except AssertionError: - print("assertion error",file=logfile) - pass - else: - output_transcript(o, now=end) - - print(f"## last processed {end:.2f}s",file=logfile,flush=True) - - if end >= duration: - break - - beg = end - - if end + min_chunk > duration: - end = duration - else: - end += min_chunk - now = duration - - else: # online = simultaneous mode - end = 0 - while True: - now = time.time() - start - if now < end+min_chunk: - time.sleep(min_chunk+end-now) - end = time.time() - start - a = load_audio_chunk(audio_path,beg,end) - beg = end - online.insert_audio_chunk(a) - - try: - o = online.process_iter() - except AssertionError: - print("assertion error",file=logfile) - pass - else: - output_transcript(o) - now = time.time() - start - print(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}",file=logfile,flush=True) - - if end >= duration: - break - now = None - - o = online.finish() - output_transcript(o, now=now)