Better handle silences when VAC is enabled, and fix the offset issue with the whisperstreaming backend

2026-03-07 22:33:36 +00:00 · 2025-08-17 01:27:07 +02:00
parent 7fe0353260
commit e2184d5e06
4 changed files with 15 additions and 10 deletions
--- a/whisperlivekit/audio_processor.py
+++ b/whisperlivekit/audio_processor.py
@@ -309,7 +309,7 @@ class AudioProcessor:
                
                if type(item) is Silence:
                    cumulative_pcm_duration_stream_time += item.duration
-                    self.online.insert_silence(item.duration)
+                    self.online.insert_silence(item.duration, self.tokens[-1].end)
                    continue
                
                if isinstance(item, np.ndarray):
@@ -438,7 +438,7 @@ class AudioProcessor:
                last_end_diarized = 0
                undiarized_text = []
                current_time = time() - self.beg_loop
-                tokens = handle_silences(tokens, current_time)
+                tokens = handle_silences(tokens, current_time, self.silence)
                for token in tokens:
                    speaker = token.speaker
                    
--- a/whisperlivekit/remove_silences.py
+++ b/whisperlivekit/remove_silences.py
@@ -3,6 +3,7 @@ import re

 MIN_SILENCE_DURATION = 4 #in seconds
 END_SILENCE_DURATION = 8 #in seconds. you should keep it important to not have false positive when the model lag is important
+END_SILENCE_DURATION_VAC = 3 #VAC is good at detecting silences, but we want to skip the smallest silences

 def blank_to_silence(tokens):
    full_string = ''.join([t.text for t in tokens])
@@ -76,11 +77,15 @@ def no_token_to_silence(tokens):
            new_tokens.append(token)
    return new_tokens
            
-def ends_with_silence(tokens, current_time):
+def ends_with_silence(tokens, current_time, vac_detected_silence):
    if not tokens:
        return []
    last_token = tokens[-1]
-    if tokens and current_time - last_token.end >= END_SILENCE_DURATION:
+    if tokens and (
+        current_time - last_token.end >= END_SILENCE_DURATION 
+        or 
+        (current_time - last_token.end >= 3 and vac_detected_silence)
+        ):
        if last_token.speaker == -2:
            last_token.end = current_time
        else:
@@ -95,9 +100,9 @@ def ends_with_silence(tokens, current_time):
    return tokens
    

-def handle_silences(tokens, current_time):
+def handle_silences(tokens, current_time, vac_detected_silence):
    tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text
    tokens = no_token_to_silence(tokens)
-    tokens = ends_with_silence(tokens, current_time)
+    tokens = ends_with_silence(tokens, current_time, vac_detected_silence)
    return tokens
     
--- a/whisperlivekit/simul_whisper/backend.py
+++ b/whisperlivekit/simul_whisper/backend.py
@@ -52,7 +52,7 @@ class SimulStreamingOnlineProcessor:
            cfg=self.asr.cfg,
            loaded_model=model)

-    def insert_silence(self, silence_duration):
+    def insert_silence(self, silence_duration, offset):
        """
        If silences are > 5s, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame
        """
--- a/whisperlivekit/whisper_streaming_custom/online_asr.py
+++ b/whisperlivekit/whisper_streaming_custom/online_asr.py
@@ -153,7 +153,7 @@ class OnlineASRProcessor:
        """Append an audio chunk (a numpy array) to the current audio buffer."""
        self.audio_buffer = np.append(self.audio_buffer, audio)

-    def insert_silence(self, silence_duration):
+    def insert_silence(self, silence_duration, offset):
        """
        If silences are > 5s, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame
        """
@@ -161,7 +161,7 @@ class OnlineASRProcessor:
            gap_silence = np.zeros(int(16000 * silence_duration), dtype=np.int16)
            self.insert_audio_chunk(gap_silence)
        else:
-            self.init(offset=(silence_duration + self.buffer_time_offset) / self.SAMPLING_RATE)
+            self.init(offset=silence_duration + offset)
        self.global_time_offset += silence_duration

    def prompt(self) -> Tuple[str, str]:
@@ -244,7 +244,7 @@ class OnlineASRProcessor:
        )
        if self.global_time_offset:
            for token in committed_tokens:
-                token.with_offset(self.global_time_offset)
+                token = token.with_offset(self.global_time_offset)
        return committed_tokens, current_audio_processed_upto

    def chunk_completed_sentence(self):