use with_offset to add the no-audio (silence) offset to tokens

Quentin Fuxa
2025-08-17 00:33:24 +02:00
parent 55e08474f3
commit 0f2eba507e
2 changed files with 63 additions and 25 deletions


@@ -38,7 +38,7 @@ class SimulStreamingOnlineProcessor:
         self.logfile = logfile
         self.is_last = False
         self.end = 0.0
-        self.cumulative_audio_duration = 0.0
+        self.global_time_offset = 0.0
         self.committed: List[ASRToken] = []
         self.last_result_tokens: List[ASRToken] = []
@@ -59,10 +59,9 @@ class SimulStreamingOnlineProcessor:
         if silence_duration < 5:
             gap_silence = torch.zeros(int(16000*min(silence_duration, 1.0)))
             self.model.insert_audio(gap_silence)
-            self.model.last_attend_frame += int(TOKENS_PER_SECOND * (min(silence_duration, 1.0) - 1.0))
         else:
             self.model.refresh_segment(complete=True)
-            self.model.last_attend_frame += int(TOKENS_PER_SECOND * silence_duration)
+        self.global_time_offset += silence_duration
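
The silence policy above boils down to: short gaps get at most 1 s of zero samples, long gaps clear the decoding context, and both accumulate the gap duration in global_time_offset instead of rewinding last_attend_frame. A minimal runnable sketch of the same policy, with the model stubbed out (StubModel and its methods are hypothetical stand-ins, not the real SimulStreaming API):

    import torch

    class StubModel:
        # Hypothetical stand-in for the SimulStreaming model in the diff.
        def insert_audio(self, audio: torch.Tensor) -> None:
            pass  # the real model would append these samples to its input buffer

        def refresh_segment(self, complete: bool = True) -> None:
            pass  # the real model would drop its decoding context

    class SilenceSketch:
        def __init__(self) -> None:
            self.model = StubModel()
            self.global_time_offset = 0.0

        def insert_silence(self, silence_duration: float) -> None:
            if silence_duration < 5:
                # Short gap: feed at most 1 s of zeros so the decoder sees a pause.
                self.model.insert_audio(torch.zeros(int(16000 * min(silence_duration, 1.0))))
            else:
                # Long gap: clear the stale decoding context entirely.
                self.model.refresh_segment(complete=True)
            # In both branches, remember how much wall-clock time carried no audio.
            self.global_time_offset += silence_duration

    sketch = SilenceSketch()
    sketch.insert_silence(2.0)   # short gap: zeros are inserted
    sketch.insert_silence(12.0)  # long gap: context is cleared
    print(sketch.global_time_offset)  # 14.0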
@@ -83,29 +82,51 @@ class SimulStreamingOnlineProcessor:
         )

     def timestamped_text(self, tokens, generation):
-        # From the simulstreaming repo. self.model to self.asr.model
-        pr = generation["progress"]
-        if "result" not in generation:
-            split_words, split_tokens = self.model.tokenizer.split_to_word_tokens(tokens)
-        else:
-            split_words, split_tokens = generation["result"]["split_words"], generation["result"]["split_tokens"]
-        frames = [p["most_attended_frames"][0] for p in pr]
-        tokens = tokens.copy()
-        ret = []
-        for sw,st in zip(split_words, split_tokens):
-            b = None
-            for stt in st:
-                t,f = tokens.pop(0), frames.pop(0)
-                if t != stt:
-                    raise ValueError(f"Token mismatch: {t} != {stt} at frame {f}.")
-                if b is None:
-                    b = f
-                e = f
-            out = (b*0.02, e*0.02, sw)
-            ret.append(out)
-            logger.debug(f"TS-WORD:\t{' '.join(map(str, out))}")
-        return ret
+        """
+        Generate timestamped text from tokens and generation data.
+
+        Args:
+            tokens: List of tokens to process
+            generation: Dictionary containing generation progress and optionally results
+
+        Returns:
+            List of tuples containing (start_time, end_time, word) for each word
+        """
+        FRAME_DURATION = 0.02
+        if "result" in generation:
+            split_words = generation["result"]["split_words"]
+            split_tokens = generation["result"]["split_tokens"]
+        else:
+            split_words, split_tokens = self.model.tokenizer.split_to_word_tokens(tokens)
+        progress = generation["progress"]
+        frames = [p["most_attended_frames"][0] for p in progress]
+        tokens_queue = tokens.copy()
+        timestamped_words = []
+        for word, word_tokens in zip(split_words, split_tokens):
+            start_frame = None
+            end_frame = None
+            for expected_token in word_tokens:
+                if not tokens_queue or not frames:
+                    raise ValueError(f"Insufficient tokens or frames for word '{word}'")
+                actual_token = tokens_queue.pop(0)
+                current_frame = frames.pop(0)
+                if actual_token != expected_token:
+                    raise ValueError(
+                        f"Token mismatch: expected '{expected_token}', "
+                        f"got '{actual_token}' at frame {current_frame}"
+                    )
+                if start_frame is None:
+                    start_frame = current_frame
+                end_frame = current_frame
+            start_time = start_frame * FRAME_DURATION
+            end_time = end_frame * FRAME_DURATION
+            timestamp_entry = (start_time, end_time, word)
+            timestamped_words.append(timestamp_entry)
+            logger.debug(f"TS-WORD:\t{start_time:.2f}\t{end_time:.2f}\t{word}")
+        return timestamped_words

     def process_iter(self) -> Tuple[List[ASRToken], float]:
         """
@@ -126,6 +147,8 @@ class SimulStreamingOnlineProcessor:
                 end=end,
                 text=word,
                 probability=0.95 # fake prob. Maybe we can extract it from the model?
-            )
+            ).with_offset(
+                self.global_time_offset
+            )
             new_tokens.append(token)
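
The chained call above implies that with_offset returns a shifted copy of the token rather than mutating it in place. A minimal sketch of such a helper, assuming ASRToken is a dataclass with start/end fields (names inferred from the diff, not confirmed against the actual codebase):

    from dataclasses import dataclass, replace

    @dataclass(frozen=True)
    class ASRToken:
        # Hypothetical field layout inferred from the constructor call in the diff.
        start: float
        end: float
        text: str
        probability: float = 1.0

        def with_offset(self, offset: float) -> "ASRToken":
            # Return a new token with both timestamps shifted by `offset`.
            return replace(self, start=self.start + offset, end=self.end + offset)

    token = ASRToken(start=1.0, end=1.4, text="hello", probability=0.95).with_offset(14.0)
    print(token.start, token.end)  # 15.0 15.4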


@@ -122,6 +122,7 @@ class OnlineASRProcessor:
         self.tokenize = tokenize_method
         self.logfile = logfile
         self.confidence_validation = confidence_validation
+        self.global_time_offset = 0.0
         self.init()
         self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
@@ -152,6 +153,17 @@ class OnlineASRProcessor:
         """Append an audio chunk (a numpy array) to the current audio buffer."""
         self.audio_buffer = np.append(self.audio_buffer, audio)

+    def insert_silence(self, silence_duration):
+        """
+        If the silence is 3 s or longer, re-initialize the processor completely; otherwise insert the gap as zero samples. In both cases, add the gap to global_time_offset.
+        """
+        if silence_duration < 3:
+            gap_silence = np.zeros(int(16000 * silence_duration), dtype=np.int16)
+            self.insert_audio_chunk(gap_silence)
+        else:
+            self.init(offset=(silence_duration + self.buffer_time_offset) / self.SAMPLING_RATE)
+        self.global_time_offset += silence_duration
+
     def prompt(self) -> Tuple[str, str]:
         """
         Returns a tuple: (prompt, context), where:
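
The zero-fill branch above converts the gap duration to a sample count at the 16 kHz input rate; a quick check of that arithmetic (values hypothetical):

    import numpy as np

    SAMPLING_RATE = 16000
    silence_duration = 2.5  # seconds; below the 3 s threshold, so it is zero-filled
    gap_silence = np.zeros(int(SAMPLING_RATE * silence_duration), dtype=np.int16)
    print(gap_silence.shape)  # (40000,)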
@@ -230,6 +242,9 @@ class OnlineASRProcessor:
         logger.debug(
             f"Length of audio buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:.2f} seconds"
         )
+        if self.global_time_offset:
+            for i, token in enumerate(committed_tokens):
+                committed_tokens[i] = token.with_offset(self.global_time_offset)
         return committed_tokens, current_audio_processed_upto

     def chunk_completed_sentence(self):