From c8c786af4fb0fdf709cb0748acf91b5001d39bbe Mon Sep 17 00:00:00 2001
From: Rodrigo
Date: Wed, 6 Dec 2023 12:17:55 -0300
Subject: [PATCH] use the Silero model directly instead of the Silero VADIterator

---
 mic_test_whisper_simple.py    |   8 +--
 mic_test_whisper_streaming.py |   2 +-
 microphone_stream.py          |   2 +-
 voice_activity_controller.py  | 106 ++++++++++++++++++----------------
 whisper_online.py             |   9 ++-
 5 files changed, 69 insertions(+), 58 deletions(-)

diff --git a/mic_test_whisper_simple.py b/mic_test_whisper_simple.py
index 58d3a8d..63160e0 100644
--- a/mic_test_whisper_simple.py
+++ b/mic_test_whisper_simple.py
@@ -39,7 +39,6 @@ class SimpleASRProcessor:
             if chunk is not None:
                 sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
                 audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
-                # self.audio_buffer.append(chunk)
                 out = []
                 out.append(audio)
                 a = np.concatenate(out)
@@ -47,15 +46,16 @@ class SimpleASRProcessor:
 
             if is_final and len(self.audio_buffer) > 0:
                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
-                # use custom ts_words
                 tsw = self.ts_words(res)
+                self.init_prompt = self.init_prompt + tsw
                 self.init_prompt = self.init_prompt [-100:]
                 self.audio_buffer.resize(0)
                 iter_in_phrase =0
+
                 yield True, tsw
-            # show progress evry 10 chunks
-            elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0:
+            # show progress every 50 chunks
+            elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                 # use custom ts_words
                 tsw = self.ts_words(res)
 
diff --git a/mic_test_whisper_streaming.py b/mic_test_whisper_streaming.py
index 26c0ba5..bd68832 100644
--- a/mic_test_whisper_streaming.py
+++ b/mic_test_whisper_streaming.py
@@ -13,7 +13,7 @@
 model = "large-v2"
 src_lan = "en"  # source language
 tgt_lan = "en"  # target language -- same as source for ASR, "en" if translate task is used
 
 use_vad_result = True
-min_sample_length = 1 * SAMPLING_RATE
+min_sample_length = 1.5 * SAMPLING_RATE
 
diff --git a/microphone_stream.py b/microphone_stream.py
index c317844..63d5019 100644
--- a/microphone_stream.py
+++ b/microphone_stream.py
@@ -29,7 +29,7 @@ class MicrophoneStream:
         self._pyaudio = pyaudio.PyAudio()
         self.sample_rate = sample_rate
 
-        self._chunk_size = int(self.sample_rate * 0.1)
+        self._chunk_size = int(self.sample_rate * 40 / 1000)
         self._stream = self._pyaudio.open(
             format=pyaudio.paInt16,
             channels=1,
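[Note for reviewers] The mic chunk size drops from 100 ms to 40 ms per read, which is the granularity at which the VAD below gets queried: at the default 16 kHz that is int(16000 * 40 / 1000) = 640 samples. A quick standalone check of the arithmetic (plain Python, no repo code involved):

    sample_rate = 16000                         # default rate used throughout
    chunk_size = int(sample_rate * 40 / 1000)   # samples per mic read
    assert chunk_size == 640
    assert chunk_size / sample_rate == 0.04     # 40 ms of audio per chunk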
diff --git a/voice_activity_controller.py b/voice_activity_controller.py
index e9083cb..533daab 100644
--- a/voice_activity_controller.py
+++ b/voice_activity_controller.py
@@ -3,16 +3,27 @@
 import numpy as np
 # import sounddevice as sd
 import torch
 import numpy as np
+import datetime
+
+
+def int2float(sound):
+    abs_max = np.abs(sound).max()
+    sound = sound.astype('float32')
+    if abs_max > 0:
+        sound *= 1/32768
+    sound = sound.squeeze()  # depends on the use case
+    return sound
 
 
 class VoiceActivityController:
     def __init__(
         self,
         sampling_rate = 16000,
-        second_ofSilence = 0.5,
-        second_ofSpeech = 0.25,
+        min_silence_to_final_ms = 500,
+        min_speech_to_final_ms = 100,
+        min_silence_duration_ms = 100,
         use_vad_result = True,
         activity_detected_callback=None,
+        threshold = 0.3
     ):
         self.activity_detected_callback=activity_detected_callback
         self.model, self.utils = torch.hub.load(
@@ -26,84 +37,77 @@ class VoiceActivityController:
          collect_chunks) = self.utils
 
         self.sampling_rate = sampling_rate
-        self.silence_limit = second_ofSilence * self.sampling_rate
-        self.speech_limit = second_ofSpeech *self.sampling_rate
+        self.final_silence_limit = min_silence_to_final_ms * self.sampling_rate / 1000
+        self.final_speech_limit = min_speech_to_final_ms * self.sampling_rate / 1000
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
 
         self.use_vad_result = use_vad_result
-        self.vad_iterator = VADIterator(
-            model =self.model,
-            threshold = 0.3, # 0.5
-            sampling_rate= self.sampling_rate,
-            min_silence_duration_ms = 500, #100
-            speech_pad_ms = 400 #30
-        )
         self.last_marked_chunk = None
+        self.threshold = threshold
+        self.reset_states()
+
+    def reset_states(self):
+        self.model.reset_states()
+        self.temp_end = 0
+        self.current_sample = 0
 
-
-    def int2float(self, sound):
-        abs_max = np.abs(sound).max()
-        sound = sound.astype('float32')
-        if abs_max > 0:
-            sound *= 1/32768
-        sound = sound.squeeze() # depends on the use case
-        return sound
 
     def apply_vad(self, audio):
-        audio_float32 = self.int2float(audio)
-        chunk = self.vad_iterator(audio_float32, return_seconds=False)
+        x = int2float(audio)
+        if not torch.is_tensor(x):
+            try:
+                x = torch.Tensor(x)
+            except Exception:
+                raise TypeError("Audio cannot be cast to a tensor; cast it manually")
 
-        if chunk is not None:
-            if "start" in chunk:
-                start = chunk["start"]
-                self.last_marked_chunk = chunk
-                return audio[start:] if self.use_vad_result else audio, (len(audio) - start), 0
-
-            if "end" in chunk:
-                #todo: pending get the padding from the next chunk
-                end = chunk["end"] if chunk["end"] < len(audio) else len(audio)
-                self.last_marked_chunk = chunk
-                return audio[:end] if self.use_vad_result else audio, end ,len(audio) - end
+        speech_prob = self.model(x, self.sampling_rate).item()
+
+        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+        self.current_sample += window_size_samples
 
-        if self.last_marked_chunk is not None:
-            if "start" in self.last_marked_chunk:
-                return audio, len(audio) ,0
-            if "end" in self.last_marked_chunk:
-                return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 ,len(audio)
+        if speech_prob >= self.threshold:
+            # speech in this window: reset the silence bookkeeping
+            self.temp_end = 0
+            return audio, window_size_samples, 0
+        else:
+            if not self.temp_end:
+                self.temp_end = self.current_sample
+            if self.current_sample - self.temp_end < self.min_silence_samples:
+                # silence, but still shorter than min_silence_duration_ms
+                return audio, 0, window_size_samples
+            else:
+                # long silence: suppress the audio if VAD filtering is on
+                return (np.array([], dtype=np.int16) if self.use_vad_result else audio), 0, window_size_samples
 
-        return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 , 0
 
     def detect_user_speech(self, audio_stream, audio_in_int16 = False):
-        silence_len= 0
+        last_silence_len = 0
         speech_len = 0
 
         for data in audio_stream:  # replace with your condition of choice
-            # if isinstance(data, EndOfTransmission):
-            #     raise EndOfTransmission("End of transmission detected")
             audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
             wav = audio_block
-
             is_final = False
-            voice_audio, speech_in_wav, last_silent_duration_in_wav = self.apply_vad(wav)
-            # print(f'----r> speech_in_wav: {speech_in_wav} last_silent_duration_in_wav: {last_silent_duration_in_wav}')
+            voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
+
             if speech_in_wav > 0 :
-                silence_len= 0
+                last_silence_len = 0
                 speech_len += speech_in_wav
                 if self.activity_detected_callback is not None:
                     self.activity_detected_callback()
 
-            silence_len = silence_len + last_silent_duration_in_wav
-            if silence_len>= self.silence_limit and speech_len >= self.speech_limit:
-                silence_len= 0
-                speech_len = 0
-
+            last_silence_len += last_silent_in_wav
+            if last_silence_len >= self.final_silence_limit and speech_len >= self.final_speech_limit:
+                is_final = True
+                last_silence_len = 0
+                speech_len = 0
 
             yield voice_audio.tobytes(), is_final
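[Note for reviewers] With VADIterator gone, the controller now calls the Silero model once per chunk for a speech probability and keeps the silence bookkeeping (temp_end / current_sample / min_silence_samples) itself; a phrase is marked final only after final_silence_limit samples of silence following at least final_speech_limit samples of speech. A minimal sketch of driving the new API from a microphone (assumes MicrophoneStream is iterable and yields raw int16 bytes, as the mic tests use it; the print is illustrative only):

    import numpy as np
    from microphone_stream import MicrophoneStream
    from voice_activity_controller import VoiceActivityController

    SAMPLING_RATE = 16000

    vad = VoiceActivityController(
        sampling_rate=SAMPLING_RATE,
        min_silence_to_final_ms=500,  # silence that closes a phrase
        min_speech_to_final_ms=100,   # speech required before a final
        threshold=0.3,                # per-chunk speech probability cutoff
    )

    stream = MicrophoneStream(sample_rate=SAMPLING_RATE)
    for chunk, is_final in vad.detect_user_speech(stream):
        audio = np.frombuffer(chunk, dtype=np.int16)
        if is_final:
            # a phrase just ended; hand the accumulated audio to the ASR
            print(f"phrase finished ({len(audio)} samples in closing chunk)")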
diff --git a/whisper_online.py b/whisper_online.py
index 8efbbab..dc23c18 100644
--- a/whisper_online.py
+++ b/whisper_online.py
@@ -4,7 +4,7 @@ import numpy as np
 import librosa
 from functools import lru_cache
 import time
-
+import datetime
 
 
 @lru_cache
@@ -118,14 +118,21 @@
         return model
 
     def transcribe(self, audio, init_prompt=""):
+
+        # tiempo_inicio = datetime.datetime.now()
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True, **self.transcribe_kargs)
+
+        # print(f'({datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")}) whisper transcribe took {datetime.datetime.now() - tiempo_inicio}')
+
         return list(segments)
 
     def ts_words(self, segments):
         o = []
         for segment in segments:
+            if segment.no_speech_prob > 0.9:
+                continue
             for word in segment.words:
                 # not stripping the spaces -- should not be merged with them!
                 w = word.word
                 t = (word.start, word.end, w)
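[Note for reviewers] The new no_speech_prob gate in ts_words skips segments that faster-whisper itself flags as likely non-speech, which should cut hallucinated words during silence. The same filter as a standalone snippet (assumes only the faster_whisper package; the 0.9 cutoff is this patch's choice, not a library default):

    from faster_whisper import WhisperModel

    model = WhisperModel("large-v2")
    segments, info = model.transcribe("audio.wav", word_timestamps=True)
    words = [(w.start, w.end, w.word)
             for seg in segments
             if seg.no_speech_prob <= 0.9   # drop probable non-speech segments
             for w in seg.words]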