diff --git a/mic_test_whisper_simple.py b/mic_test_whisper_simple.py index 63160e0..3b2b61d 100644 --- a/mic_test_whisper_simple.py +++ b/mic_test_whisper_simple.py @@ -72,12 +72,12 @@ SAMPLING_RATE = 16000 model = "large-v2" src_lan = "en" # source language tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used -use_vad_result = True +use_vad = False min_sample_length = 1 * SAMPLING_RATE -vad = VoiceActivityController(use_vad_result = use_vad_result) +vac = VoiceActivityController(use_vad_result = use_vad) asr = FasterWhisperASR(src_lan, "large-v2") # loads and wraps Whisper model tokenizer = create_tokenizer(tgt_lan) @@ -85,7 +85,7 @@ online = SimpleASRProcessor(asr) stream = MicrophoneStream() -stream = vad.detect_user_speech(stream, audio_in_int16 = False) +stream = vac.detect_user_speech(stream, audio_in_int16 = False) stream = online.stream_process(stream) for isFinal, text in stream: diff --git a/mic_test_whisper_streaming.py b/mic_test_whisper_streaming.py index bd68832..b427015 100644 --- a/mic_test_whisper_streaming.py +++ b/mic_test_whisper_streaming.py @@ -13,7 +13,7 @@ model = "large-v2" src_lan = "en" # source language tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used use_vad_result = True -min_sample_length = 1.5 * SAMPLING_RATE +min_sample_length = 1 * SAMPLING_RATE @@ -54,12 +54,12 @@ for iter in vad.detect_user_speech(microphone_stream): # processing loop: if is_final: o = online.finish() - online.init() # final_processing_pending = False print('-----'*10) complete_text = complete_text + o[2] print('FINAL - '+ complete_text) # do something with current partial output print('-----'*10) + online.init() out = [] out_len = 0 diff --git a/voice_activity_controller.py b/voice_activity_controller.py index 59aceca..3ccc29a 100644 --- a/voice_activity_controller.py +++ b/voice_activity_controller.py @@ -76,7 +76,7 @@ class VoiceActivityController: if self.current_sample - self.temp_end < self.min_silence_samples: return audio, 0, window_size_samples else: - return np.array([], dtype=np.float16) , 0, window_size_samples + return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, window_size_samples