Better handle silences when using VAC, and correct an offset issue with the whisperstreaming backend

This commit is contained in:
Quentin Fuxa
2025-08-17 01:27:07 +02:00
parent 7fe0353260
commit e2184d5e06
4 changed files with 15 additions and 10 deletions

View File

@@ -309,7 +309,7 @@ class AudioProcessor:
if type(item) is Silence:
cumulative_pcm_duration_stream_time += item.duration
self.online.insert_silence(item.duration)
self.online.insert_silence(item.duration, self.tokens[-1].end)
continue
if isinstance(item, np.ndarray):
@@ -438,7 +438,7 @@ class AudioProcessor:
last_end_diarized = 0
undiarized_text = []
current_time = time() - self.beg_loop
tokens = handle_silences(tokens, current_time)
tokens = handle_silences(tokens, current_time, self.silence)
for token in tokens:
speaker = token.speaker

View File

@@ -3,6 +3,7 @@ import re
MIN_SILENCE_DURATION = 4 #in seconds
END_SILENCE_DURATION = 8 #in seconds. you should keep it important to not have false positive when the model lag is important
END_SILENCE_DURATION_VAC = 3 #VAC is good at detecting silences, but we want to skip the smallest silences
def blank_to_silence(tokens):
full_string = ''.join([t.text for t in tokens])
@@ -76,11 +77,15 @@ def no_token_to_silence(tokens):
new_tokens.append(token)
return new_tokens
def ends_with_silence(tokens, current_time):
def ends_with_silence(tokens, current_time, vac_detected_silence):
if not tokens:
return []
last_token = tokens[-1]
if tokens and current_time - last_token.end >= END_SILENCE_DURATION:
if tokens and (
current_time - last_token.end >= END_SILENCE_DURATION
or
(current_time - last_token.end >= 3 and vac_detected_silence)
):
if last_token.speaker == -2:
last_token.end = current_time
else:
@@ -95,9 +100,9 @@ def ends_with_silence(tokens, current_time):
return tokens
def handle_silences(tokens, current_time):
def handle_silences(tokens, current_time, vac_detected_silence):
tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text
tokens = no_token_to_silence(tokens)
tokens = ends_with_silence(tokens, current_time)
tokens = ends_with_silence(tokens, current_time, vac_detected_silence)
return tokens

View File

@@ -52,7 +52,7 @@ class SimulStreamingOnlineProcessor:
cfg=self.asr.cfg,
loaded_model=model)
def insert_silence(self, silence_duration):
def insert_silence(self, silence_duration, offset):
"""
If silences are > 5s, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame
"""

View File

@@ -153,7 +153,7 @@ class OnlineASRProcessor:
"""Append an audio chunk (a numpy array) to the current audio buffer."""
self.audio_buffer = np.append(self.audio_buffer, audio)
def insert_silence(self, silence_duration):
def insert_silence(self, silence_duration, offset):
"""
If silences are > 5s, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame
"""
@@ -161,7 +161,7 @@ class OnlineASRProcessor:
gap_silence = np.zeros(int(16000 * silence_duration), dtype=np.int16)
self.insert_audio_chunk(gap_silence)
else:
self.init(offset=(silence_duration + self.buffer_time_offset) / self.SAMPLING_RATE)
self.init(offset=silence_duration + offset)
self.global_time_offset += silence_duration
def prompt(self) -> Tuple[str, str]:
@@ -244,7 +244,7 @@ class OnlineASRProcessor:
)
if self.global_time_offset:
for token in committed_tokens:
token.with_offset(self.global_time_offset)
token = token.with_offset(self.global_time_offset)
return committed_tokens, current_audio_processed_upto
def chunk_completed_sentence(self):