diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index 6229b32..b8b2086 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -233,14 +233,6 @@ class AudioProcessor: buffer_size = max(int(32000 * elapsed_time), 4096) beg = current_time - # Detect idle state much more quickly - if current_time - self.last_ffmpeg_activity > self.ffmpeg_max_idle_time: - logger.warning(f"FFmpeg process idle for {current_time - self.last_ffmpeg_activity:.2f}s. Restarting...") - await self.restart_ffmpeg() - beg = time() - self.last_ffmpeg_activity = time() - continue - chunk = await loop.run_in_executor(None, self.ffmpeg_process.stdout.read, buffer_size) if chunk: self.last_ffmpeg_activity = time() @@ -554,9 +546,9 @@ class AudioProcessor: logger.info(f"{task_name} completed normally.") ffmpeg_idle_time = current_time - self.last_ffmpeg_activity - if ffmpeg_idle_time > 15: + if ffmpeg_idle_time > 10: logger.warning(f"FFmpeg idle for {ffmpeg_idle_time:.2f}s - may need attention.") - if ffmpeg_idle_time > 30 and not self.is_stopping: + if ffmpeg_idle_time > 15 and not self.is_stopping: logger.error("FFmpeg idle for too long and not in stopping phase, forcing restart.") await self.restart_ffmpeg() except asyncio.CancelledError: diff --git a/whisperlivekit/whisper_streaming_custom/online_asr.py b/whisperlivekit/whisper_streaming_custom/online_asr.py index a17e94b..b689e0f 100644 --- a/whisperlivekit/whisper_streaming_custom/online_asr.py +++ b/whisperlivekit/whisper_streaming_custom/online_asr.py @@ -154,6 +154,7 @@ class OnlineASRProcessor: self.buffer_time_offset = offset if offset is not None else 0.0 self.transcript_buffer.last_committed_time = self.buffer_time_offset self.committed: List[ASRToken] = [] + self.time_of_last_asr_output = 0.0 def get_audio_buffer_end_time(self) -> float: """Returns the absolute end time of the current audio_buffer.""" @@ -210,11 +211,26 @@ class OnlineASRProcessor: self.transcript_buffer.insert(tokens, self.buffer_time_offset) committed_tokens = self.transcript_buffer.flush() self.committed.extend(committed_tokens) + + if committed_tokens: + self.time_of_last_asr_output = self.committed[-1].end + completed = self.concatenate_tokens(committed_tokens) logger.debug(f">>>> COMPLETE NOW: {completed.text}") incomp = self.concatenate_tokens(self.transcript_buffer.buffer) logger.debug(f"INCOMPLETE: {incomp.text}") + buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE + if not committed_tokens and buffer_duration > self.buffer_trimming_sec: + time_since_last_output = self.get_audio_buffer_end_time() - self.time_of_last_asr_output + if time_since_last_output > self.buffer_trimming_sec: + logger.warning( + f"No ASR output for {time_since_last_output:.2f}s. " + f"Resetting buffer to prevent freezing." + ) + self.init(offset=self.get_audio_buffer_end_time()) + return [], current_audio_processed_upto + if committed_tokens and self.buffer_trimming_way == "sentence": if len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec: self.chunk_completed_sentence()