From e2184d5e06c57be36a07ac4e80454a60491781c3 Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Sun, 17 Aug 2025 01:27:07 +0200 Subject: [PATCH] better handle silences when VAC + correct offset issue with whisperstreaming backend --- whisperlivekit/audio_processor.py | 4 ++-- whisperlivekit/remove_silences.py | 13 +++++++++---- whisperlivekit/simul_whisper/backend.py | 2 +- .../whisper_streaming_custom/online_asr.py | 6 +++--- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index 135b420..c3e414a 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -309,7 +309,7 @@ class AudioProcessor: if type(item) is Silence: cumulative_pcm_duration_stream_time += item.duration - self.online.insert_silence(item.duration) + self.online.insert_silence(item.duration, self.tokens[-1].end) continue if isinstance(item, np.ndarray): @@ -438,7 +438,7 @@ class AudioProcessor: last_end_diarized = 0 undiarized_text = [] current_time = time() - self.beg_loop - tokens = handle_silences(tokens, current_time) + tokens = handle_silences(tokens, current_time, self.silence) for token in tokens: speaker = token.speaker diff --git a/whisperlivekit/remove_silences.py b/whisperlivekit/remove_silences.py index 4db095b..ede339c 100644 --- a/whisperlivekit/remove_silences.py +++ b/whisperlivekit/remove_silences.py @@ -3,6 +3,7 @@ import re MIN_SILENCE_DURATION = 4 #in seconds END_SILENCE_DURATION = 8 #in seconds. 
you should keep it important to not have false positive when the model lag is important +END_SILENCE_DURATION_VAC = 3 #VAC is good at detecting silences, but we want to skip the smallest silences def blank_to_silence(tokens): full_string = ''.join([t.text for t in tokens]) @@ -76,11 +77,15 @@ def no_token_to_silence(tokens): new_tokens.append(token) return new_tokens -def ends_with_silence(tokens, current_time): +def ends_with_silence(tokens, current_time, vac_detected_silence): if not tokens: return [] last_token = tokens[-1] - if tokens and current_time - last_token.end >= END_SILENCE_DURATION: + if tokens and ( + current_time - last_token.end >= END_SILENCE_DURATION + or + (current_time - last_token.end >= END_SILENCE_DURATION_VAC and vac_detected_silence) + ): if last_token.speaker == -2: last_token.end = current_time else: @@ -95,9 +100,9 @@ return tokens -def handle_silences(tokens, current_time): +def handle_silences(tokens, current_time, vac_detected_silence): tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text tokens = no_token_to_silence(tokens) - tokens = ends_with_silence(tokens, current_time) + tokens = ends_with_silence(tokens, current_time, vac_detected_silence) return tokens \ No newline at end of file diff --git a/whisperlivekit/simul_whisper/backend.py b/whisperlivekit/simul_whisper/backend.py index de306bc..3573aca 100644 --- a/whisperlivekit/simul_whisper/backend.py +++ b/whisperlivekit/simul_whisper/backend.py @@ -52,7 +52,7 @@ class SimulStreamingOnlineProcessor: cfg=self.asr.cfg, loaded_model=model) - def insert_silence(self, silence_duration): + def insert_silence(self, silence_duration, offset): """ If silences are > 5s, we do a complete context clear. 
Otherwise, we just insert a small silence and shift the last_attend_frame """ diff --git a/whisperlivekit/whisper_streaming_custom/online_asr.py b/whisperlivekit/whisper_streaming_custom/online_asr.py index 1f8a2a0..a68d55e 100644 --- a/whisperlivekit/whisper_streaming_custom/online_asr.py +++ b/whisperlivekit/whisper_streaming_custom/online_asr.py @@ -153,7 +153,7 @@ class OnlineASRProcessor: """Append an audio chunk (a numpy array) to the current audio buffer.""" self.audio_buffer = np.append(self.audio_buffer, audio) - def insert_silence(self, silence_duration): + def insert_silence(self, silence_duration, offset): """ If silences are > 5s, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame """ @@ -161,7 +161,7 @@ gap_silence = np.zeros(int(16000 * silence_duration), dtype=np.int16) self.insert_audio_chunk(gap_silence) else: - self.init(offset=(silence_duration + self.buffer_time_offset) / self.SAMPLING_RATE) + self.init(offset=silence_duration + offset) self.global_time_offset += silence_duration def prompt(self) -> Tuple[str, str]: @@ -244,7 +244,7 @@ ) if self.global_time_offset: - for token in committed_tokens: - token.with_offset(self.global_time_offset) + for i, token in enumerate(committed_tokens): + committed_tokens[i] = token.with_offset(self.global_time_offset) return committed_tokens, current_audio_processed_upto def chunk_completed_sentence():