vad

2026-03-07 22:33:36 +00:00 · 2023-12-09 17:12:43 -03:00
parent fe4207edca
commit 324dee03e7
3 changed files with 6 additions and 6 deletions
--- a/mic_test_whisper_simple.py
+++ b/mic_test_whisper_simple.py
@@ -72,12 +72,12 @@ SAMPLING_RATE = 16000
 model = "large-v2"
 src_lan = "en"  # source language
 tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
-use_vad_result = True
+use_vad = False
 min_sample_length = 1 * SAMPLING_RATE



-vad = VoiceActivityController(use_vad_result = use_vad_result)
+vac = VoiceActivityController(use_vad_result = use_vad)
 asr = FasterWhisperASR(src_lan, "large-v2")  # loads and wraps Whisper model

 tokenizer = create_tokenizer(tgt_lan)
@@ -85,7 +85,7 @@ online = SimpleASRProcessor(asr)


 stream = MicrophoneStream()
-stream = vad.detect_user_speech(stream, audio_in_int16 = False) 
+stream = vac.detect_user_speech(stream, audio_in_int16 = False) 
 stream = online.stream_process(stream)

 for isFinal, text in stream:
--- a/mic_test_whisper_streaming.py
+++ b/mic_test_whisper_streaming.py
@@ -13,7 +13,7 @@ model = "large-v2"
 src_lan = "en"  # source language
 tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
 use_vad_result = True
-min_sample_length = 1.5 * SAMPLING_RATE
+min_sample_length = 1 * SAMPLING_RATE



@@ -54,12 +54,12 @@ for iter in vad.detect_user_speech(microphone_stream):   # processing loop:

    if is_final:
        o = online.finish()
-        online.init()   
        # final_processing_pending = False         
        print('-----'*10)
        complete_text = complete_text + o[2]
        print('FINAL - '+ complete_text) # do something with current partial output
        print('-----'*10)   
+        online.init()   
        out = []
        out_len = 0    
        
--- a/voice_activity_controller.py
+++ b/voice_activity_controller.py
@@ -76,7 +76,7 @@ class VoiceActivityController:
            if self.current_sample - self.temp_end < self.min_silence_samples:
                return audio, 0, window_size_samples
            else:
-                return np.array([], dtype=np.float16) , 0, window_size_samples
+                return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, window_size_samples