From bfd60b39219d8502b8d601672453a3c233f90ae1 Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Mon, 17 Nov 2025 22:52:00 +0100 Subject: [PATCH] Add audio partial silence in chunks handling. bump to 0.2.14.post2 --- pyproject.toml | 2 +- whisperlivekit/audio_processor.py | 42 +++++++++++++++++++++++++------ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5b96028..d387377 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "whisperlivekit" -version = "0.2.14" +version = "0.2.14.post2" description = "Real-time speech-to-text with speaker diarization using Whisper" readme = "README.md" authors = [ diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index 41225e8..b3e0219 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -127,6 +127,7 @@ class AudioProcessor: self.diarization_queue = asyncio.Queue() if self.args.diarization else None self.translation_queue = asyncio.Queue() if self.args.target_language else None self.pcm_buffer = bytearray() + self.total_pcm_samples = 0 self.transcription_task = None self.diarization_task = None @@ -174,6 +175,26 @@ class AudioProcessor: self.start_silence = None self.last_silence_dispatch_time = None + async def _enqueue_active_audio(self, pcm_chunk: np.ndarray): + if pcm_chunk is None or pcm_chunk.size == 0: + return + if not self.diarization_before_transcription and self.transcription_queue: + await self.transcription_queue.put(pcm_chunk.copy()) + if self.args.diarization and self.diarization_queue: + await self.diarization_queue.put(pcm_chunk.copy()) + self.silence_duration = 0.0 + + def _slice_before_silence(self, pcm_array, chunk_sample_start, silence_sample): + if silence_sample is None: + return None + relative_index = int(silence_sample - chunk_sample_start) + if relative_index <= 0: + return None + split_index = min(relative_index, len(pcm_array)) + if split_index <= 0: + return None + return pcm_array[:split_index] + def convert_pcm_to_float(self, pcm_buffer): """Convert PCM buffer in s16le format to normalized NumPy array.""" return np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0 @@ -669,25 +690,30 @@ class AudioProcessor: pcm_array = self.convert_pcm_to_float(self.pcm_buffer[:aligned_chunk_size]) self.pcm_buffer = self.pcm_buffer[aligned_chunk_size:] + num_samples = len(pcm_array) + chunk_sample_start = self.total_pcm_samples + chunk_sample_end = chunk_sample_start + num_samples + res = None if self.args.vac: res = self.vac(pcm_array) if res is not None: - if res.get("end", 0) > res.get("start", 0) and not self.silence: + silence_detected = res.get("end", 0) > res.get("start", 0) + if silence_detected and not self.silence: + pre_silence_chunk = self._slice_before_silence( + pcm_array, chunk_sample_start, res.get("end") + ) + if pre_silence_chunk is not None and pre_silence_chunk.size > 0: + await self._enqueue_active_audio(pre_silence_chunk) await self._begin_silence() elif self.silence: await self._end_silence() - if not self.silence: - if not self.diarization_before_transcription and self.transcription_queue: - await self.transcription_queue.put(pcm_array.copy()) + await self._enqueue_active_audio(pcm_array) - if self.args.diarization and self.diarization_queue: - await self.diarization_queue.put(pcm_array.copy()) - - self.silence_duration = 0.0 + self.total_pcm_samples = chunk_sample_end if not self.args.transcription and not self.args.diarization: await asyncio.sleep(0.1)