diff --git a/README.md b/README.md index 4800f1b..e880f1e 100644 --- a/README.md +++ b/README.md @@ -163,6 +163,7 @@ An important list of parameters can be changed. But what *should* you change? | `--port` | Server port | `8000` | | `--ssl-certfile` | Path to the SSL certificate file (for HTTPS support) | `None` | | `--ssl-keyfile` | Path to the SSL private key file (for HTTPS support) | `None` | +| `--pcm-input` | raw PCM (s16le) data is expected as input and FFmpeg will be bypassed. | `False` | | SimulStreaming backend options | Description | Default | diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index 4e74743..92a3098 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -52,6 +52,7 @@ class AudioProcessor: self.last_ffmpeg_activity = time() self.ffmpeg_health_check_interval = 5 self.ffmpeg_max_idle_time = 10 + self.is_pcm_input = self.args.pcm_input self.debug = False # State management @@ -208,54 +209,7 @@ class AudioProcessor: continue self.pcm_buffer.extend(chunk) - - # Process when enough data - if len(self.pcm_buffer) >= self.bytes_per_sec: - if len(self.pcm_buffer) > self.max_bytes_per_sec: - logger.warning( - f"Audio buffer too large: {len(self.pcm_buffer) / self.bytes_per_sec:.2f}s. " - f"Consider using a smaller model." - ) - - # Process audio chunk - pcm_array = self.convert_pcm_to_float(self.pcm_buffer[:self.max_bytes_per_sec]) - self.pcm_buffer = self.pcm_buffer[self.max_bytes_per_sec:] - - res = None - end_of_audio = False - silence_buffer = None - - if self.args.vac: - res = self.vac(pcm_array) - - if res is not None: - if res.get('end', 0) > res.get('start', 0): - end_of_audio = True - elif self.silence: #end of silence - self.silence = False - silence_buffer = Silence(duration=time() - self.start_silence) - - if silence_buffer: - if self.args.transcription and self.transcription_queue: - await self.transcription_queue.put(silence_buffer) - if self.args.diarization and self.diarization_queue: - await self.diarization_queue.put(silence_buffer) - - if not self.silence: - if self.args.transcription and self.transcription_queue: - await self.transcription_queue.put(pcm_array.copy()) - - if self.args.diarization and self.diarization_queue: - await self.diarization_queue.put(pcm_array.copy()) - - self.silence_duration = 0.0 - if end_of_audio: - self.silence = True - self.start_silence = time() - - # Sleep if no processing is happening - if not self.args.transcription and not self.args.diarization: - await asyncio.sleep(0.1) + await self.handle_pcm_data() @@ -671,10 +625,65 @@ class AudioProcessor: logger.warning("AudioProcessor is stopping. Ignoring incoming audio.") return - success = await self.ffmpeg_manager.write_data(message) - if not success: - ffmpeg_state = await self.ffmpeg_manager.get_state() - if ffmpeg_state == FFmpegState.FAILED: - logger.error("FFmpeg is in FAILED state, cannot process audio") - else: - logger.warning("Failed to write audio data to FFmpeg") + if self.is_pcm_input: + self.pcm_buffer.extend(message) + await self.handle_pcm_data() + else: + success = await self.ffmpeg_manager.write_data(message) + if not success: + ffmpeg_state = await self.ffmpeg_manager.get_state() + if ffmpeg_state == FFmpegState.FAILED: + logger.error("FFmpeg is in FAILED state, cannot process audio") + else: + logger.warning("Failed to write audio data to FFmpeg") + + async def handle_pcm_data(self): + # Process when enough data + if len(self.pcm_buffer) < self.bytes_per_sec: + return + + if len(self.pcm_buffer) > self.max_bytes_per_sec: + logger.warning( + f"Audio buffer too large: {len(self.pcm_buffer) / self.bytes_per_sec:.2f}s. " + f"Consider using a smaller model." + ) + + # Process audio chunk + pcm_array = self.convert_pcm_to_float(self.pcm_buffer[:self.max_bytes_per_sec]) + self.pcm_buffer = self.pcm_buffer[self.max_bytes_per_sec:] + + res = None + end_of_audio = False + silence_buffer = None + + if self.args.vac: + res = self.vac(pcm_array) + + if res is not None: + if res.get("end", 0) > res.get("start", 0): + end_of_audio = True + elif self.silence: #end of silence + self.silence = False + silence_buffer = Silence(duration=time() - self.start_silence) + + if silence_buffer: + if self.args.transcription and self.transcription_queue: + await self.transcription_queue.put(silence_buffer) + if self.args.diarization and self.diarization_queue: + await self.diarization_queue.put(silence_buffer) + + if not self.silence: + if self.args.transcription and self.transcription_queue: + await self.transcription_queue.put(pcm_array.copy()) + + if self.args.diarization and self.diarization_queue: + await self.diarization_queue.put(pcm_array.copy()) + + self.silence_duration = 0.0 + + if end_of_audio: + self.silence = True + self.start_silence = time() + + if not self.args.transcription and not self.args.diarization: + await asyncio.sleep(0.1) diff --git a/whisperlivekit/core.py b/whisperlivekit/core.py index 8fd2d61..4791183 100644 --- a/whisperlivekit/core.py +++ b/whisperlivekit/core.py @@ -42,6 +42,7 @@ class TranscriptionEngine: "ssl_keyfile": None, "transcription": True, "vad": True, + "pcm_input": False, # whisperstreaming params: "buffer_trimming": "segment", "confidence_validation": False, diff --git a/whisperlivekit/parse_args.py b/whisperlivekit/parse_args.py index fe8a2f8..2be546e 100644 --- a/whisperlivekit/parse_args.py +++ b/whisperlivekit/parse_args.py @@ -173,7 +173,12 @@ def parse_args(): ) parser.add_argument("--ssl-certfile", type=str, help="Path to the SSL certificate file.", default=None) parser.add_argument("--ssl-keyfile", type=str, help="Path to the SSL private key file.", default=None) - + parser.add_argument( + "--pcm-input", + action="store_true", + default=False, + help="If set, raw PCM (s16le) data is expected as input and FFmpeg will be bypassed." + ) # SimulStreaming-specific arguments simulstreaming_group = parser.add_argument_group('SimulStreaming arguments (only used with --backend simulstreaming)')