From e2184d5e06c57be36a07ac4e80454a60491781c3 Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Sun, 17 Aug 2025 01:27:07 +0200 Subject: [PATCH] better handle silences when VAC + correct offset issue with whisperstreaming backend --- whisperlivekit/audio_processor.py | 4 ++-- whisperlivekit/remove_silences.py | 13 +++++++++---- whisperlivekit/simul_whisper/backend.py | 2 +- .../whisper_streaming_custom/online_asr.py | 6 +++--- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index 135b420..c3e414a 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -309,7 +309,7 @@ class AudioProcessor: if type(item) is Silence: cumulative_pcm_duration_stream_time += item.duration - self.online.insert_silence(item.duration) + self.online.insert_silence(item.duration, self.tokens[-1].end) continue if isinstance(item, np.ndarray): @@ -438,7 +438,7 @@ class AudioProcessor: last_end_diarized = 0 undiarized_text = [] current_time = time() - self.beg_loop - tokens = handle_silences(tokens, current_time) + tokens = handle_silences(tokens, current_time, self.silence) for token in tokens: speaker = token.speaker diff --git a/whisperlivekit/remove_silences.py b/whisperlivekit/remove_silences.py index 4db095b..ede339c 100644 --- a/whisperlivekit/remove_silences.py +++ b/whisperlivekit/remove_silences.py @@ -3,6 +3,7 @@ import re MIN_SILENCE_DURATION = 4 #in seconds END_SILENCE_DURATION = 8 #in seconds. 
you should keep it important to not have false positive when the model lag is important +END_SILENCE_DURATION_VAC = 3 #VAC is good at detecting silences, but we want to skip the smallest silences def blank_to_silence(tokens): full_string = ''.join([t.text for t in tokens]) @@ -76,11 +77,15 @@ def no_token_to_silence(tokens): new_tokens.append(token) return new_tokens -def ends_with_silence(tokens, current_time): +def ends_with_silence(tokens, current_time, vac_detected_silence): if not tokens: return [] last_token = tokens[-1] - if tokens and current_time - last_token.end >= END_SILENCE_DURATION: + if tokens and ( + current_time - last_token.end >= END_SILENCE_DURATION + or + (current_time - last_token.end >= END_SILENCE_DURATION_VAC and vac_detected_silence) + ): if last_token.speaker == -2: last_token.end = current_time else: @@ -95,9 +100,9 @@ return tokens -def handle_silences(tokens, current_time): +def handle_silences(tokens, current_time, vac_detected_silence): tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text tokens = no_token_to_silence(tokens) - tokens = ends_with_silence(tokens, current_time) + tokens = ends_with_silence(tokens, current_time, vac_detected_silence) return tokens \ No newline at end of file diff --git a/whisperlivekit/simul_whisper/backend.py b/whisperlivekit/simul_whisper/backend.py index de306bc..3573aca 100644 --- a/whisperlivekit/simul_whisper/backend.py +++ b/whisperlivekit/simul_whisper/backend.py @@ -52,7 +52,7 @@ class SimulStreamingOnlineProcessor: cfg=self.asr.cfg, loaded_model=model) - def insert_silence(self, silence_duration): + def insert_silence(self, silence_duration, offset): """ If silences are > 5s, we do a complete context clear. 
Otherwise, we just insert a small silence and shift the last_attend_frame """ diff --git a/whisperlivekit/whisper_streaming_custom/online_asr.py b/whisperlivekit/whisper_streaming_custom/online_asr.py index 1f8a2a0..a68d55e 100644 --- a/whisperlivekit/whisper_streaming_custom/online_asr.py +++ b/whisperlivekit/whisper_streaming_custom/online_asr.py @@ -153,7 +153,7 @@ class OnlineASRProcessor: """Append an audio chunk (a numpy array) to the current audio buffer.""" self.audio_buffer = np.append(self.audio_buffer, audio) - def insert_silence(self, silence_duration): + def insert_silence(self, silence_duration, offset): """ If silences are > 5s, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame """ @@ -161,7 +161,7 @@ gap_silence = np.zeros(int(16000 * silence_duration), dtype=np.int16) self.insert_audio_chunk(gap_silence) else: - self.init(offset=(silence_duration + self.buffer_time_offset) / self.SAMPLING_RATE) + self.init(offset=silence_duration + offset) self.global_time_offset += silence_duration def prompt(self) -> Tuple[str, str]: @@ -244,7 +244,7 @@ ) if self.global_time_offset: - for token in committed_tokens: - token.with_offset(self.global_time_offset) + for i, token in enumerate(committed_tokens): + committed_tokens[i] = token.with_offset(self.global_time_offset) return committed_tokens, current_audio_processed_upto def chunk_completed_sentence():