From e6648e4f46a0dbc0d524f5650e26b86db93cee7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20Mach=C3=A1=C4=8Dek?= Date: Thu, 28 Nov 2024 18:13:49 +0100 Subject: [PATCH] fixed silero vad chunk size issues #141 #121 #142 #136 etc. --- silero_vad.py => silero_vad_iterator.py | 30 +++++++++++++++++-------- whisper_online.py | 23 ++++++++++++------- 2 files changed, 36 insertions(+), 17 deletions(-) rename silero_vad.py => silero_vad_iterator.py (79%) diff --git a/silero_vad.py b/silero_vad_iterator.py similarity index 79% rename from silero_vad.py rename to silero_vad_iterator.py index 9e79e9d..1eea7af 100644 --- a/silero_vad.py +++ b/silero_vad_iterator.py @@ -2,6 +2,7 @@ import torch # This is copied from silero-vad's vad_utils.py: # https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340 +# (except changed defaults) # Their licence is MIT, same as ours: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE @@ -10,8 +11,8 @@ class VADIterator: model, threshold: float = 0.5, sampling_rate: int = 16000, - min_silence_duration_ms: int = 100, - speech_pad_ms: int = 30 + min_silence_duration_ms: int = 500, # makes sense on one recording that I checked + speech_pad_ms: int = 100 # same ): """ @@ -95,11 +96,14 @@ class VADIterator: return None ####################### -# this is our workaround for Silero v5 requiring at least 512-sized audio chunks -# (see https://github.com/ufal/whisper_streaming/issues/116 ) +# because Silero now requires exactly 512-sized audio chunks import numpy as np class FixedVADIterator(VADIterator): + '''It fixes VADIterator by allowing it to process audio of any length, not only exactly 512 frames at once. + If the audio to be processed at once is long and multiple voiced segments are detected, + then __call__ returns the start of the first segment, and the end (or middle, which means no end) of the last segment. 
+ ''' def reset_states(self): super().reset_states() @@ -107,11 +111,19 @@ class FixedVADIterator(VADIterator): def __call__(self, x, return_seconds=False): self.buffer = np.append(self.buffer, x) - if len(self.buffer) >= 512: - ret = super().__call__(self.buffer, return_seconds=return_seconds) - self.buffer = np.array([],dtype=np.float32) - return ret - return None + ret = None + while len(self.buffer) >= 512: + r = super().__call__(self.buffer[:512], return_seconds=return_seconds) + self.buffer = self.buffer[512:] + if ret is None: + ret = r + elif r is not None: + if 'end' in r: + ret['end'] = r['end'] # the latter end + if 'start' in r and 'end' in ret: # there is an earlier start. + # Remove end, merging this segment with the previous one. + del ret['end'] + return ret if ret != {} else None if __name__ == "__main__": # test/demonstrate the need for FixedVADIterator: diff --git a/whisper_online.py b/whisper_online.py index 183b0a0..c11e53c 100644 --- a/whisper_online.py +++ b/whisper_online.py @@ -534,8 +534,8 @@ class VACOnlineASRProcessor(OnlineASRProcessor): repo_or_dir='snakers4/silero-vad', model='silero_vad' ) - from silero_vad import FixedVADIterator - self.vac = FixedVADIterator(model) # we use all the default options: 500ms silence, etc. + from silero_vad_iterator import FixedVADIterator + self.vac = FixedVADIterator(model) # we use the default options there: 500ms silence, 100ms padding, etc. 
self.logfile = self.online.logfile self.init() @@ -561,24 +561,31 @@ class VACOnlineASRProcessor(OnlineASRProcessor): self.audio_buffer = np.append(self.audio_buffer, audio) if res is not None: - frame = list(res.values())[0] + frame = list(res.values())[0]-self.buffer_offset if 'start' in res and 'end' not in res: self.status = 'voice' - send_audio = self.audio_buffer[frame-self.buffer_offset:] - self.online.init(offset=frame/self.SAMPLING_RATE) + send_audio = self.audio_buffer[frame:] + self.online.init(offset=(frame+self.buffer_offset)/self.SAMPLING_RATE) self.online.insert_audio_chunk(send_audio) self.current_online_chunk_buffer_size += len(send_audio) self.clear_buffer() elif 'end' in res and 'start' not in res: self.status = 'nonvoice' - send_audio = self.audio_buffer[:frame-self.buffer_offset] + send_audio = self.audio_buffer[:frame] self.online.insert_audio_chunk(send_audio) self.current_online_chunk_buffer_size += len(send_audio) self.is_currently_final = True self.clear_buffer() else: - # It doesn't happen in the current code. - raise NotImplemented("both start and end of voice in one chunk!!!") + beg = res["start"]-self.buffer_offset + end = res["end"]-self.buffer_offset + self.status = 'nonvoice' + send_audio = self.audio_buffer[beg:end] + self.online.init(offset=(beg+self.buffer_offset)/self.SAMPLING_RATE) + self.online.insert_audio_chunk(send_audio) + self.current_online_chunk_buffer_size += len(send_audio) + self.is_currently_final = True + self.clear_buffer() else: if self.status == 'voice': self.online.insert_audio_chunk(self.audio_buffer)