Use the Silero VAD model directly instead of Silero's VADIterator

2026-03-07 22:33:36 +00:00 · 2023-12-06 12:17:55 -03:00
parent 3fad8133b4
commit c8c786af4f
5 changed files with 69 additions and 58 deletions
--- a/mic_test_whisper_simple.py
+++ b/mic_test_whisper_simple.py
@@ -39,7 +39,6 @@ class SimpleASRProcessor:
            if chunk is not None:
                sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
                audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
-                # self.audio_buffer.append(chunk)
                out = []
                out.append(audio)
                a = np.concatenate(out)
@@ -47,15 +46,16 @@ class SimpleASRProcessor:

            if is_final and len(self.audio_buffer) > 0:
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
-                # use custom ts_words
                tsw = self.ts_words(res)
+                
                self.init_prompt = self.init_prompt + tsw
                self.init_prompt  = self.init_prompt [-100:]
                self.audio_buffer.resize(0)
                iter_in_phrase =0
+                
                yield True, tsw
-            # show progress evry 10 chunks
-            elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0:
+            # show progress every 50 chunks
+            elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                # use custom ts_words
                tsw = self.ts_words(res)
--- a/mic_test_whisper_streaming.py
+++ b/mic_test_whisper_streaming.py
@@ -13,7 +13,7 @@ model = "large-v2"
 src_lan = "en"  # source language
 tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
 use_vad_result = True
-min_sample_length = 1 * SAMPLING_RATE
+min_sample_length = 1.5 * SAMPLING_RATE



--- a/microphone_stream.py
+++ b/microphone_stream.py
@@ -29,7 +29,7 @@ class MicrophoneStream:
        self._pyaudio = pyaudio.PyAudio()
        self.sample_rate = sample_rate

-        self._chunk_size = int(self.sample_rate * 0.1)
+        self._chunk_size = int(self.sample_rate * 40  / 1000)
        self._stream = self._pyaudio.open(
            format=pyaudio.paInt16,
            channels=1,
--- a/voice_activity_controller.py
+++ b/voice_activity_controller.py
@@ -3,16 +3,27 @@ import numpy as np
 # import sounddevice as sd
 import torch
 import numpy as np
+import datetime


+def int2float(sound):
+    abs_max = np.abs(sound).max()
+    sound = sound.astype('float32')
+    if abs_max > 0:
+        sound *= 1/32768
+    sound = sound.squeeze()  # depends on the use case
+    return sound
+
 class VoiceActivityController:
    def __init__(
            self, 
            sampling_rate = 16000,
-            second_ofSilence = 0.5,
-            second_ofSpeech = 0.25,
+            min_silence_to_final_ms = 500,
+            min_speech_to_final_ms = 100,
+            min_silence_duration_ms = 100,
            use_vad_result = True,
            activity_detected_callback=None,
+            threshold =0.3
        ):
        self.activity_detected_callback=activity_detected_callback
        self.model, self.utils = torch.hub.load(
@@ -26,84 +37,77 @@ class VoiceActivityController:
        collect_chunks) = self.utils

        self.sampling_rate = sampling_rate  
-        self.silence_limit = second_ofSilence * self.sampling_rate 
-        self.speech_limit = second_ofSpeech *self.sampling_rate 
+        self.final_silence_limit = min_silence_to_final_ms * self.sampling_rate / 1000 
+        self.final_speech_limit = min_speech_to_final_ms *self.sampling_rate / 1000
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000

        self.use_vad_result = use_vad_result
-        self.vad_iterator = VADIterator(
-            model =self.model,
-            threshold = 0.3, # 0.5
-            sampling_rate= self.sampling_rate,
-            min_silence_duration_ms = 500, #100
-            speech_pad_ms = 400 #30
-        )
        self.last_marked_chunk = None
-        
-    
-    def int2float(self, sound):
-        abs_max = np.abs(sound).max()
-        sound = sound.astype('float32')
-        if abs_max > 0:
-            sound *= 1/32768
-        sound = sound.squeeze()  # depends on the use case
-        return sound
+        self.threshold = threshold
+        self.reset_states()
+
+    def reset_states(self):
+        self.model.reset_states()
+        self.temp_end = 0
+        self.current_sample = 0

    def apply_vad(self, audio):
-        audio_float32 = self.int2float(audio)
-        chunk = self.vad_iterator(audio_float32, return_seconds=False)
+        x = int2float(audio)
+        if not torch.is_tensor(x):
+            try:
+                x = torch.Tensor(x)
+            except:
+                raise TypeError("Audio cannot be casted to tensor. Cast it manually")

-        if chunk is not None:        
-            if "start" in chunk:
-                start = chunk["start"]
-                self.last_marked_chunk = chunk
-                return audio[start:] if self.use_vad_result else audio, (len(audio) - start), 0
-            
-            if "end" in chunk:
-                #todo: pending get the padding from the next chunk
-                end = chunk["end"] if chunk["end"] < len(audio) else len(audio)
-                self.last_marked_chunk = chunk
-                return audio[:end] if self.use_vad_result else audio, end ,len(audio) - end
+        speech_prob = self.model(x, self.sampling_rate).item()
+        
+        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+        self.current_sample += window_size_samples 

-        if self.last_marked_chunk is not None:
-            if "start" in self.last_marked_chunk:
-                return audio, len(audio)  ,0

-            if "end" in self.last_marked_chunk:
-                return  np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 ,len(audio) 
+        if (speech_prob >= self.threshold):
+            self.temp_end = 0
+            return audio, window_size_samples, 0
+
+        else :
+            if not self.temp_end:
+                self.temp_end = self.current_sample
+
+            if self.current_sample - self.temp_end < self.min_silence_samples:
+                return audio, 0, window_size_samples
+            else:
+                return np.array([], dtype=np.float16) , 0, window_size_samples
+

-        return  np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 , 0 



    def detect_user_speech(self, audio_stream, audio_in_int16 = False):
-        silence_len= 0
+        last_silence_len= 0
        speech_len = 0

        for data in audio_stream:  # replace with your condition of choice
-            # if isinstance(data, EndOfTransmission):
-            #     raise EndOfTransmission("End of transmission detected")
            
            
            audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
            wav = audio_block
            
-
            is_final = False
-            voice_audio, speech_in_wav, last_silent_duration_in_wav = self.apply_vad(wav)
-            # print(f'----r> speech_in_wav: {speech_in_wav} last_silent_duration_in_wav: {last_silent_duration_in_wav}')
+            voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
+

            if speech_in_wav > 0 :
-                silence_len= 0                
+                last_silence_len= 0                
                speech_len += speech_in_wav
                if self.activity_detected_callback is not None:
                    self.activity_detected_callback()

-            silence_len = silence_len + last_silent_duration_in_wav
-            if silence_len>= self.silence_limit and speech_len >= self.speech_limit:
+            last_silence_len +=  last_silent_in_wav
+            if last_silence_len>= self.final_silence_limit and speech_len >= self.final_speech_limit:
+
                is_final = True
-                silence_len= 0
-                speech_len = 0
-            
+                last_silence_len= 0
+                speech_len = 0                

            yield voice_audio.tobytes(), is_final

--- a/whisper_online.py
+++ b/whisper_online.py
@@ -4,7 +4,7 @@ import numpy as np
 import librosa  
 from functools import lru_cache
 import time
-
+import datetime


@lru_cache
@@ -118,14 +118,21 @@ class FasterWhisperASR(ASRBase):
        return model

    def transcribe(self, audio, init_prompt=""):
+
+        # tiempo_inicio = datetime.datetime.now()
        # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
        segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True, **self.transcribe_kargs)
+        
+        # print(f'({datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")})----------r> whisper transcribe  take { (datetime.datetime.now() -tiempo_inicio)  } ms.')
+
        return list(segments)

    def ts_words(self, segments):
        o = []
        for segment in segments:
            for word in segment.words:
+                if segment.no_speech_prob > 0.9:
+                    continue
                # not stripping the spaces -- should not be merged with them!
                w = word.word
                t = (word.start, word.end, w)