mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-03-07 22:33:36 +00:00
vad
This commit is contained in:
@@ -72,12 +72,12 @@ SAMPLING_RATE = 16000
|
||||
model = "large-v2"
|
||||
src_lan = "en" # source language
|
||||
tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used
|
||||
use_vad_result = True
|
||||
use_vad = False
|
||||
min_sample_length = 1 * SAMPLING_RATE
|
||||
|
||||
|
||||
|
||||
vad = VoiceActivityController(use_vad_result = use_vad_result)
|
||||
vac = VoiceActivityController(use_vad_result = use_vad)
|
||||
asr = FasterWhisperASR(src_lan, "large-v2") # loads and wraps Whisper model
|
||||
|
||||
tokenizer = create_tokenizer(tgt_lan)
|
||||
@@ -85,7 +85,7 @@ online = SimpleASRProcessor(asr)
|
||||
|
||||
|
||||
stream = MicrophoneStream()
|
||||
stream = vad.detect_user_speech(stream, audio_in_int16 = False)
|
||||
stream = vac.detect_user_speech(stream, audio_in_int16 = False)
|
||||
stream = online.stream_process(stream)
|
||||
|
||||
for isFinal, text in stream:
|
||||
|
||||
@@ -13,7 +13,7 @@ model = "large-v2"
|
||||
src_lan = "en" # source language
|
||||
tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used
|
||||
use_vad_result = True
|
||||
min_sample_length = 1.5 * SAMPLING_RATE
|
||||
min_sample_length = 1 * SAMPLING_RATE
|
||||
|
||||
|
||||
|
||||
@@ -54,12 +54,12 @@ for iter in vad.detect_user_speech(microphone_stream): # processing loop:
|
||||
|
||||
if is_final:
|
||||
o = online.finish()
|
||||
online.init()
|
||||
# final_processing_pending = False
|
||||
print('-----'*10)
|
||||
complete_text = complete_text + o[2]
|
||||
print('FINAL - '+ complete_text) # do something with current partial output
|
||||
print('-----'*10)
|
||||
online.init()
|
||||
out = []
|
||||
out_len = 0
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@ class VoiceActivityController:
|
||||
if self.current_sample - self.temp_end < self.min_silence_samples:
|
||||
return audio, 0, window_size_samples
|
||||
else:
|
||||
return np.array([], dtype=np.float16) , 0, window_size_samples
|
||||
return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, window_size_samples
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user