fixed silero vad chunk size

issues #141 #121 #142 #136 etc.
This commit is contained in:
Dominik Macháček
2024-11-28 18:13:49 +01:00
parent 863242f107
commit e6648e4f46
2 changed files with 36 additions and 17 deletions

View File

@@ -2,6 +2,7 @@ import torch
# This is copied from silero-vad's vad_utils.py:
# https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340
# (except changed defaults)
# Their license is MIT, same as ours: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
@@ -10,8 +11,8 @@ class VADIterator:
model,
threshold: float = 0.5,
sampling_rate: int = 16000,
min_silence_duration_ms: int = 100,
speech_pad_ms: int = 30
min_silence_duration_ms: int = 500, # makes sense on one recording that I checked
speech_pad_ms: int = 100 # same
):
"""
@@ -95,11 +96,14 @@ class VADIterator:
return None
#######################
# this is our workaround for Silero v5 requiring at least 512-sized audio chunks
# (see https://github.com/ufal/whisper_streaming/issues/116 )
# because Silero now requires exactly 512-sized audio chunks
import numpy as np
class FixedVADIterator(VADIterator):
'''It fixes VADIterator by allowing it to process audio of any length, not only exactly 512 frames at once.
If the audio to be processed at once is long and multiple voiced segments are detected,
then __call__ returns the start of the first segment, and the end (or middle, which means no end) of the last segment.
'''
def reset_states(self):
super().reset_states()
@@ -107,11 +111,19 @@ class FixedVADIterator(VADIterator):
def __call__(self, x, return_seconds=False):
self.buffer = np.append(self.buffer, x)
if len(self.buffer) >= 512:
ret = super().__call__(self.buffer, return_seconds=return_seconds)
self.buffer = np.array([],dtype=np.float32)
return ret
return None
ret = None
while len(self.buffer) >= 512:
r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
self.buffer = self.buffer[512:]
if ret is None:
ret = r
elif r is not None:
if 'end' in r:
ret['end'] = r['end'] # the latter end
if 'start' in r and 'end' in ret: # there is an earlier start.
# Remove end, merging this segment with the previous one.
del ret['end']
return ret if ret != {} else None
if __name__ == "__main__":
# test/demonstrate the need for FixedVADIterator:

View File

@@ -534,8 +534,8 @@ class VACOnlineASRProcessor(OnlineASRProcessor):
repo_or_dir='snakers4/silero-vad',
model='silero_vad'
)
from silero_vad import FixedVADIterator
self.vac = FixedVADIterator(model) # we use all the default options: 500ms silence, etc.
from silero_vad_iterator import FixedVADIterator
self.vac = FixedVADIterator(model) # we use the default options there: 500ms silence, 100ms padding, etc.
self.logfile = self.online.logfile
self.init()
@@ -561,24 +561,31 @@ class VACOnlineASRProcessor(OnlineASRProcessor):
self.audio_buffer = np.append(self.audio_buffer, audio)
if res is not None:
frame = list(res.values())[0]
frame = list(res.values())[0]-self.buffer_offset
if 'start' in res and 'end' not in res:
self.status = 'voice'
send_audio = self.audio_buffer[frame-self.buffer_offset:]
self.online.init(offset=frame/self.SAMPLING_RATE)
send_audio = self.audio_buffer[frame:]
self.online.init(offset=(frame+self.buffer_offset)/self.SAMPLING_RATE)
self.online.insert_audio_chunk(send_audio)
self.current_online_chunk_buffer_size += len(send_audio)
self.clear_buffer()
elif 'end' in res and 'start' not in res:
self.status = 'nonvoice'
send_audio = self.audio_buffer[:frame-self.buffer_offset]
send_audio = self.audio_buffer[:frame]
self.online.insert_audio_chunk(send_audio)
self.current_online_chunk_buffer_size += len(send_audio)
self.is_currently_final = True
self.clear_buffer()
else:
# It doesn't happen in the current code.
raise NotImplemented("both start and end of voice in one chunk!!!")
beg = res["start"]-self.buffer_offset
end = res["end"]-self.buffer_offset
self.status = 'nonvoice'
send_audio = self.audio_buffer[beg:end]
self.online.init(offset=(beg+self.buffer_offset)/self.SAMPLING_RATE)
self.online.insert_audio_chunk(send_audio)
self.current_online_chunk_buffer_size += len(send_audio)
self.is_currently_final = True
self.clear_buffer()
else:
if self.status == 'voice':
self.online.insert_audio_chunk(self.audio_buffer)