support for raw PCM input option by @YeonjunNotFR

2026-03-08 06:44:09 +00:00 · 2025-09-11 21:32:11 +02:00
parent b06866877a
commit a4e9f3cab7
4 changed files with 72 additions and 56 deletions
--- a/README.md
+++ b/README.md
@@ -163,6 +163,7 @@ An important list of parameters can be changed. But what *should* you change?
 | `--port` | Server port | `8000` |
 | `--ssl-certfile` | Path to the SSL certificate file (for HTTPS support) | `None` |
 | `--ssl-keyfile` | Path to the SSL private key file (for HTTPS support) | `None` |
+| `--pcm-input` | raw PCM (s16le) data is expected as input and FFmpeg will be bypassed. | `False` |


 | SimulStreaming backend options | Description | Default |
--- a/whisperlivekit/audio_processor.py
+++ b/whisperlivekit/audio_processor.py
@@ -52,6 +52,7 @@ class AudioProcessor:
        self.last_ffmpeg_activity = time()
        self.ffmpeg_health_check_interval = 5
        self.ffmpeg_max_idle_time = 10
+        self.is_pcm_input = self.args.pcm_input
        self.debug = False

        # State management
@@ -208,54 +209,7 @@ class AudioProcessor:
                        continue
                    
                self.pcm_buffer.extend(chunk)
-
-                # Process when enough data
-                if len(self.pcm_buffer) >= self.bytes_per_sec:
-                    if len(self.pcm_buffer) > self.max_bytes_per_sec:
-                        logger.warning(
-                            f"Audio buffer too large: {len(self.pcm_buffer) / self.bytes_per_sec:.2f}s. "
-                            f"Consider using a smaller model."
-                        )
-
-                    # Process audio chunk
-                    pcm_array = self.convert_pcm_to_float(self.pcm_buffer[:self.max_bytes_per_sec])
-                    self.pcm_buffer = self.pcm_buffer[self.max_bytes_per_sec:]
-                    
-                    res = None
-                    end_of_audio = False
-                    silence_buffer = None
-                    
-                    if self.args.vac:
-                        res = self.vac(pcm_array)
-                    
-                    if res is not None:
-                        if res.get('end', 0) > res.get('start', 0):
-                            end_of_audio = True
-                        elif self.silence: #end of silence
-                            self.silence = False
-                            silence_buffer = Silence(duration=time() - self.start_silence)
-                            
-                    if silence_buffer:
-                        if self.args.transcription and self.transcription_queue:
-                            await self.transcription_queue.put(silence_buffer)
-                        if self.args.diarization and self.diarization_queue:
-                            await self.diarization_queue.put(silence_buffer)
-
-                    if not self.silence:                            
-                        if self.args.transcription and self.transcription_queue:
-                            await self.transcription_queue.put(pcm_array.copy())
-
-                        if self.args.diarization and self.diarization_queue:
-                            await self.diarization_queue.put(pcm_array.copy())
-                        
-                        self.silence_duration = 0.0
-                        if end_of_audio:
-                            self.silence = True
-                            self.start_silence = time()
-
-                    # Sleep if no processing is happening
-                    if not self.args.transcription and not self.args.diarization:
-                        await asyncio.sleep(0.1)
+                await self.handle_pcm_data()
                    
                    
                    
@@ -671,10 +625,65 @@ class AudioProcessor:
            logger.warning("AudioProcessor is stopping. Ignoring incoming audio.")
            return

-        success = await self.ffmpeg_manager.write_data(message)
-        if not success:
-            ffmpeg_state = await self.ffmpeg_manager.get_state()
-            if ffmpeg_state == FFmpegState.FAILED:
-                logger.error("FFmpeg is in FAILED state, cannot process audio")
-            else:
-                logger.warning("Failed to write audio data to FFmpeg")
+        if self.is_pcm_input:
+            self.pcm_buffer.extend(message)
+            await self.handle_pcm_data()
+        else:
+            success = await self.ffmpeg_manager.write_data(message)
+            if not success:
+                ffmpeg_state = await self.ffmpeg_manager.get_state()
+                if ffmpeg_state == FFmpegState.FAILED:
+                    logger.error("FFmpeg is in FAILED state, cannot process audio")
+                else:
+                    logger.warning("Failed to write audio data to FFmpeg")
+
+    async def handle_pcm_data(self):
+        # Process when enough data
+        if len(self.pcm_buffer) < self.bytes_per_sec:
+            return
+
+        if len(self.pcm_buffer) > self.max_bytes_per_sec:
+            logger.warning(
+                f"Audio buffer too large: {len(self.pcm_buffer) / self.bytes_per_sec:.2f}s. "
+                f"Consider using a smaller model."
+            )
+
+        # Process audio chunk
+        pcm_array = self.convert_pcm_to_float(self.pcm_buffer[:self.max_bytes_per_sec])
+        self.pcm_buffer = self.pcm_buffer[self.max_bytes_per_sec:]
+
+        res = None
+        end_of_audio = False
+        silence_buffer = None
+
+        if self.args.vac:
+            res = self.vac(pcm_array)
+
+        if res is not None:
+            if res.get("end", 0) > res.get("start", 0):
+                end_of_audio = True
+            elif self.silence: #end of silence
+                self.silence = False
+                silence_buffer = Silence(duration=time() - self.start_silence)
+
+        if silence_buffer:
+            if self.args.transcription and self.transcription_queue:
+                await self.transcription_queue.put(silence_buffer)
+            if self.args.diarization and self.diarization_queue:
+                await self.diarization_queue.put(silence_buffer)
+
+        if not self.silence:
+            if self.args.transcription and self.transcription_queue:
+                await self.transcription_queue.put(pcm_array.copy())
+
+            if self.args.diarization and self.diarization_queue:
+                await self.diarization_queue.put(pcm_array.copy())
+
+            self.silence_duration = 0.0
+
+            if end_of_audio:
+                self.silence = True
+                self.start_silence = time()
+
+        if not self.args.transcription and not self.args.diarization:
+            await asyncio.sleep(0.1)
--- a/whisperlivekit/core.py
+++ b/whisperlivekit/core.py
@@ -42,6 +42,7 @@ class TranscriptionEngine:
            "ssl_keyfile": None,
            "transcription": True,
            "vad": True,
+            "pcm_input": False,
            # whisperstreaming params:
            "buffer_trimming": "segment",
            "confidence_validation": False,
--- a/whisperlivekit/parse_args.py
+++ b/whisperlivekit/parse_args.py
@@ -173,7 +173,12 @@ def parse_args():
    )
    parser.add_argument("--ssl-certfile", type=str, help="Path to the SSL certificate file.", default=None)
    parser.add_argument("--ssl-keyfile", type=str, help="Path to the SSL private key file.", default=None)
-
+    parser.add_argument(
+        "--pcm-input",
+        action="store_true",
+        default=False,
+        help="If set, raw PCM (s16le) data is expected as input and FFmpeg will be bypassed."
+    )
    # SimulStreaming-specific arguments
    simulstreaming_group = parser.add_argument_group('SimulStreaming arguments (only used with --backend simulstreaming)')