diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py
index ba31e46..552c3af 100644
--- a/whisperlivekit/audio_processor.py
+++ b/whisperlivekit/audio_processor.py
@@ -47,6 +47,7 @@ class AudioProcessor:
         self.last_ffmpeg_activity = time()
         self.ffmpeg_health_check_interval = 5
         self.ffmpeg_max_idle_time = 10
+        self.debug = False
 
         # State management
         self.is_stopping = False
@@ -58,7 +59,7 @@ class AudioProcessor:
         self.end_buffer = 0
         self.end_attributed_speaker = 0
         self.lock = asyncio.Lock()
-        self.beg_loop = time()
+        self.beg_loop = None  # to deal with a potential small lag at websocket initialization, this is now set in process_audio
         self.sep = " "  # Default separator
         self.last_response_content = ""
 
@@ -298,11 +299,10 @@ class AudioProcessor:
 
                 asr_internal_buffer_duration_s = len(getattr(self.online, 'audio_buffer', [])) / self.online.SAMPLING_RATE
                 transcription_lag_s = max(0.0, time() - self.beg_loop - self.end_buffer)
-
-                logger.info(
-                    f"ASR processing: internal_buffer={asr_internal_buffer_duration_s:.2f}s, "
-                    f"lag={transcription_lag_s:.2f}s."
-                )
+                asr_processing_logs = f"internal_buffer={asr_internal_buffer_duration_s:.2f}s | lag={transcription_lag_s:.2f}s |"
+                if type(item) is Silence:
+                    asr_processing_logs += f" + Silence of {item.duration:.2f}s | last_end = {self.tokens[-1].end} |"
+                logger.info(asr_processing_logs)
 
                 if type(item) is Silence:
                     cumulative_pcm_duration_stream_time += item.duration
@@ -444,8 +444,8 @@ class AudioProcessor:
         lines = []
         last_end_diarized = 0
         undiarized_text = []
-        current_time = time() - self.beg_loop
-        tokens = handle_silences(tokens, current_time, self.silence)
+        current_time = time() - self.beg_loop if self.beg_loop else None
+        tokens, buffer_transcription = handle_silences(tokens, buffer_transcription, current_time, self.silence)
 
         for token in tokens:
             speaker = token.speaker
@@ -459,21 +459,23 @@ class AudioProcessor:
             if speaker not in [-1, 0]:
                 last_end_diarized = max(token.end, last_end_diarized)
 
-            # Group by speaker
+            debug_info = ""
+            if self.debug:
+                debug_info = f"[{format_time(token.start)} : {format_time(token.end)}]"
             if speaker != previous_speaker or not lines:
                 lines.append({
                     "speaker": speaker,
-                    "text": token.text,
+                    "text": token.text + debug_info,
                     "beg": format_time(token.start),
                     "end": format_time(token.end),
                     "diff": round(token.end - last_end_diarized, 2)
                 })
                 previous_speaker = speaker
             elif token.text:  # Only append if text isn't empty
-                lines[-1]["text"] += sep + token.text
+                lines[-1]["text"] += sep + token.text + debug_info
                 lines[-1]["end"] = format_time(token.end)
                 lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
-            
+
         # Handle undiarized text
         if undiarized_text:
             combined = sep.join(undiarized_text)
@@ -634,6 +636,10 @@ class AudioProcessor:
 
     async def process_audio(self, message):
         """Process incoming audio data."""
+
+        if not self.beg_loop:
+            self.beg_loop = time()
+
        if not message:
            logger.info("Empty audio message received, initiating stop sequence.")
            self.is_stopping = True
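Note on the audio_processor.py changes above: the stream clock (beg_loop) now starts on the first audio chunk in process_audio rather than in __init__, so any lag between websocket setup and the first chunk is no longer counted as transcription lag, and format_output guards against beg_loop still being None. A minimal illustrative sketch of that pattern, using only the standard library (the StreamClock class and its method names are invented for this note, not part of WhisperLiveKit):

# Illustrative sketch of the lazy beg_loop initialization.
from time import time, sleep

class StreamClock:
    def __init__(self):
        self.beg_loop = None  # set on the first audio chunk, as in process_audio

    def on_audio_chunk(self):
        if not self.beg_loop:
            self.beg_loop = time()

    def current_time(self):
        # None until audio has arrived, mirroring the guard in format_output
        return time() - self.beg_loop if self.beg_loop else None

clock = StreamClock()
assert clock.current_time() is None  # websocket open, but no audio yet
sleep(0.05)                          # handshake lag: not billed as ASR lag
clock.on_audio_chunk()
print(f"stream time = {clock.current_time():.3f}s")  # ~0.000, not ~0.050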
diff --git a/whisperlivekit/remove_silences.py b/whisperlivekit/remove_silences.py
index ede339c..1ea7f6d 100644
--- a/whisperlivekit/remove_silences.py
+++ b/whisperlivekit/remove_silences.py
@@ -77,7 +77,7 @@ def no_token_to_silence(tokens):
         new_tokens.append(token)
     return new_tokens
 
-def ends_with_silence(tokens, current_time, vac_detected_silence):
+def ends_with_silence(tokens, buffer_transcription, current_time, vac_detected_silence):
     if not tokens:
         return []
     last_token = tokens[-1]
@@ -97,12 +97,13 @@ def ends_with_silence(tokens, current_time, vac_detected_silence):
                 probability=0.95
             )
         )
+    # We validate what the buffer contains because of the silence
     return tokens
 
 
-def handle_silences(tokens, current_time, vac_detected_silence):
+def handle_silences(tokens, buffer_transcription, current_time, vac_detected_silence):
     tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text
     tokens = no_token_to_silence(tokens)
-    tokens = ends_with_silence(tokens, current_time, vac_detected_silence)
-    return tokens
+    tokens = ends_with_silence(tokens, buffer_transcription, current_time, vac_detected_silence)
+    return tokens, buffer_transcription
\ No newline at end of file
diff --git a/whisperlivekit/simul_whisper/backend.py b/whisperlivekit/simul_whisper/backend.py
index d0eefa6..e38eda9 100644
--- a/whisperlivekit/simul_whisper/backend.py
+++ b/whisperlivekit/simul_whisper/backend.py
@@ -36,7 +36,6 @@ class SimulStreamingOnlineProcessor:
     ):
         self.asr = asr
         self.logfile = logfile
-        self.is_last = False
         self.end = 0.0
         self.global_time_offset = 0.0
 
@@ -57,12 +56,13 @@ class SimulStreamingOnlineProcessor:
         If silences are > 5s, we do a complete context clear. Otherwise, we just
        insert a small silence and shift the last_attend_frame
         """
         if silence_duration < 5:
-            gap_silence = torch.zeros(int(16000*min(silence_duration, 1.0)))
+            gap_silence = torch.zeros(int(16000*silence_duration))
             self.model.insert_audio(gap_silence)
-            self.global_time_offset = silence_duration - 1.0
+            # self.global_time_offset += silence_duration
         else:
+            self.process_iter(is_last=True)  # we want to fully process what remains in the buffer.
             self.model.refresh_segment(complete=True)
-            self.global_time_offset += silence_duration
+            self.global_time_offset += silence_duration + offset
 
@@ -132,14 +132,14 @@ class SimulStreamingOnlineProcessor:
                 logger.debug(f"TS-WORD:\t{start_time:.2f}\t{end_time:.2f}\t{word}")
         return timestamped_words
 
-    def process_iter(self) -> Tuple[List[ASRToken], float]:
+    def process_iter(self, is_last=False) -> Tuple[List[ASRToken], float]:
         """
         Process accumulated audio chunks using SimulStreaming.
 
         Returns a tuple: (list of committed ASRToken objects,
         float representing the audio processed up to time).
         """
         try:
-            tokens, generation_progress = self.model.infer(is_last=self.is_last)
+            tokens, generation_progress = self.model.infer(is_last=is_last)
             ts_words = self.timestamped_text(tokens, generation_progress)
             new_tokens = []
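Note on the simul_whisper/backend.py changes above: gaps shorter than 5 s are now padded into the model with the full silence duration as zero samples, while longer gaps first flush the pending buffer via process_iter(is_last=True), then clear the context with refresh_segment(complete=True) and shift global_time_offset past the gap. A hedged sketch of that policy follows (model, finalize and reset_context are stand-in names, not SimulStreaming's actual API; offset is assumed to be a parameter of the enclosing method):

# Illustrative sketch of the gap-handling policy; not the real SimulStreaming API.
import torch

SAMPLING_RATE = 16000
RESET_THRESHOLD_S = 5  # matches `if silence_duration < 5` in the diff

def handle_gap(model, silence_duration, global_time_offset, offset=0.0):
    """Return the updated global_time_offset after a detected silence."""
    if silence_duration < RESET_THRESHOLD_S:
        # Short gap: feed real zero samples so the decoder's attention
        # stays aligned with stream time.
        model.insert_audio(torch.zeros(int(SAMPLING_RATE * silence_duration)))
        return global_time_offset
    # Long gap: emit whatever is still buffered, then drop all context
    # and shift subsequent timestamps past the gap.
    model.finalize()       # stands in for process_iter(is_last=True)
    model.reset_context()  # stands in for refresh_segment(complete=True)
    return global_time_offset + silence_duration + offset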