mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-03-07 22:33:36 +00:00
Better handle silences when VAC is used, and fix the offset issue with the WhisperStreaming backend
This commit is contained in:
@@ -309,7 +309,7 @@ class AudioProcessor:
|
||||
|
||||
if type(item) is Silence:
|
||||
cumulative_pcm_duration_stream_time += item.duration
|
||||
self.online.insert_silence(item.duration)
|
||||
self.online.insert_silence(item.duration, self.tokens[-1].end)
|
||||
continue
|
||||
|
||||
if isinstance(item, np.ndarray):
|
||||
@@ -438,7 +438,7 @@ class AudioProcessor:
|
||||
last_end_diarized = 0
|
||||
undiarized_text = []
|
||||
current_time = time() - self.beg_loop
|
||||
tokens = handle_silences(tokens, current_time)
|
||||
tokens = handle_silences(tokens, current_time, self.silence)
|
||||
for token in tokens:
|
||||
speaker = token.speaker
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ import re
|
||||
|
||||
MIN_SILENCE_DURATION = 4 #in seconds
|
||||
END_SILENCE_DURATION = 8 #in seconds. Keep this value large to avoid false positives when the model lag is significant
|
||||
END_SILENCE_DURATION_VAC = 3 #VAC is good at detecting silences, but we want to skip the smallest silences
|
||||
|
||||
def blank_to_silence(tokens):
|
||||
full_string = ''.join([t.text for t in tokens])
|
||||
@@ -76,11 +77,15 @@ def no_token_to_silence(tokens):
|
||||
new_tokens.append(token)
|
||||
return new_tokens
|
||||
|
||||
def ends_with_silence(tokens, current_time):
|
||||
def ends_with_silence(tokens, current_time, vac_detected_silence):
|
||||
if not tokens:
|
||||
return []
|
||||
last_token = tokens[-1]
|
||||
if tokens and current_time - last_token.end >= END_SILENCE_DURATION:
|
||||
if tokens and (
|
||||
current_time - last_token.end >= END_SILENCE_DURATION
|
||||
or
|
||||
(current_time - last_token.end >= 3 and vac_detected_silence)
|
||||
):
|
||||
if last_token.speaker == -2:
|
||||
last_token.end = current_time
|
||||
else:
|
||||
@@ -95,9 +100,9 @@ def ends_with_silence(tokens, current_time):
|
||||
return tokens
|
||||
|
||||
|
||||
def handle_silences(tokens, current_time):
|
||||
def handle_silences(tokens, current_time, vac_detected_silence):
|
||||
tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text
|
||||
tokens = no_token_to_silence(tokens)
|
||||
tokens = ends_with_silence(tokens, current_time)
|
||||
tokens = ends_with_silence(tokens, current_time, vac_detected_silence)
|
||||
return tokens
|
||||
|
||||
@@ -52,7 +52,7 @@ class SimulStreamingOnlineProcessor:
|
||||
cfg=self.asr.cfg,
|
||||
loaded_model=model)
|
||||
|
||||
def insert_silence(self, silence_duration):
|
||||
def insert_silence(self, silence_duration, offset):
|
||||
"""
|
||||
If silences are > 5s, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame
|
||||
"""
|
||||
|
||||
@@ -153,7 +153,7 @@ class OnlineASRProcessor:
|
||||
"""Append an audio chunk (a numpy array) to the current audio buffer."""
|
||||
self.audio_buffer = np.append(self.audio_buffer, audio)
|
||||
|
||||
def insert_silence(self, silence_duration):
|
||||
def insert_silence(self, silence_duration, offset):
|
||||
"""
|
||||
If silences are > 5s, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame
|
||||
"""
|
||||
@@ -161,7 +161,7 @@ class OnlineASRProcessor:
|
||||
gap_silence = np.zeros(int(16000 * silence_duration), dtype=np.int16)
|
||||
self.insert_audio_chunk(gap_silence)
|
||||
else:
|
||||
self.init(offset=(silence_duration + self.buffer_time_offset) / self.SAMPLING_RATE)
|
||||
self.init(offset=silence_duration + offset)
|
||||
self.global_time_offset += silence_duration
|
||||
|
||||
def prompt(self) -> Tuple[str, str]:
|
||||
@@ -244,7 +244,7 @@ class OnlineASRProcessor:
|
||||
)
|
||||
if self.global_time_offset:
|
||||
for token in committed_tokens:
|
||||
token.with_offset(self.global_time_offset)
|
||||
token = token.with_offset(self.global_time_offset)
|
||||
return committed_tokens, current_audio_processed_upto
|
||||
|
||||
def chunk_completed_sentence(self):
|
||||
|
||||
Reference in New Issue
Block a user