replace deprecated ScriptProcessorNode with AudioWorklet

feat: support web audio 16kHz PCM input and remove ffmpeg dependency
asyncio.to_thread for transcription and translation
2026-03-09 15:25:34 +00:00 · 2025-09-17 10:53:53 +02:00 · 2025-09-15 23:22:25 +08:00 · 2025-09-15 15:23:22 +02:00 · 2025-09-15 10:19:26 +02:00 · 2025-09-15 10:00:14 +02:00
11 changed files with 242 additions and 446 deletions
--- a/whisperlivekit/audio_processor.py
+++ b/whisperlivekit/audio_processor.py
@@ -4,9 +4,8 @@ from time import time, sleep
 import math
 import logging
 import traceback
-from whisperlivekit.timed_objects import ASRToken, Silence, Line
+from whisperlivekit.timed_objects import ASRToken, Silence, Line, FrontData, State
 from whisperlivekit.core import TranscriptionEngine, online_factory, online_diarization_factory, online_translation_factory
-from whisperlivekit.ffmpeg_manager import FFmpegManager, FFmpegState
 from whisperlivekit.silero_vad_iterator import FixedVADIterator
 from whisperlivekit.results_formater import format_output
 # Set up logging once
@@ -49,10 +48,7 @@ class AudioProcessor:
        self.bytes_per_sample = 2
        self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample
        self.max_bytes_per_sec = 32000 * 5  # 5 seconds of audio at 32 kHz
-        self.last_ffmpeg_activity = time()
-        self.ffmpeg_health_check_interval = 5
-        self.ffmpeg_max_idle_time = 10
-        self.is_pcm_input = self.args.pcm_input
+        self.is_pcm_input = True
        self.debug = False

        # State management
@@ -68,7 +64,7 @@ class AudioProcessor:
        self.lock = asyncio.Lock()
        self.beg_loop = None #to deal with a potential little lag at the websocket initialization, this is now set in process_audio
        self.sep = " "  # Default separator
-        self.last_response_content = ""
+        self.last_response_content = FrontData()
        
        # Models and processing
        self.asr = models.asr
@@ -79,18 +75,6 @@ class AudioProcessor:
        else:
            self.vac = None
            
-        self.ffmpeg_manager = FFmpegManager(
-            sample_rate=self.sample_rate,
-            channels=self.channels
-        )
-        
-        async def handle_ffmpeg_error(error_type: str):
-            logger.error(f"FFmpeg error: {error_type}")
-            self._ffmpeg_error = error_type
-        
-        self.ffmpeg_manager.on_error_callback = handle_ffmpeg_error
-        self._ffmpeg_error = None
-        
        self.transcription_queue = asyncio.Queue() if self.args.transcription else None
        self.diarization_queue = asyncio.Queue() if self.args.diarization else None
        self.translation_queue = asyncio.Queue() if self.args.target_language else None
@@ -98,12 +82,12 @@ class AudioProcessor:

        self.transcription_task = None
        self.diarization_task = None
-        self.ffmpeg_reader_task = None
        self.watchdog_task = None
        self.all_tasks_for_cleanup = []
        
        if self.args.transcription:
-            self.online = online_factory(self.args, models.asr, models.tokenizer)            
+            self.online = online_factory(self.args, models.asr, models.tokenizer)        
+            self.sep = self.online.asr.sep   
        if self.args.diarization:
            self.diarization = online_diarization_factory(self.args, models.diarization_model)
        if self.args.target_language:
@@ -113,13 +97,12 @@ class AudioProcessor:
        """Convert PCM buffer in s16le format to normalized NumPy array."""
        return np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0

-    async def update_transcription(self, new_tokens, buffer, end_buffer, sep):
+    async def update_transcription(self, new_tokens, buffer, end_buffer):
        """Thread-safe update of transcription with new data."""
        async with self.lock:
            self.tokens.extend(new_tokens)
            self.buffer_transcription = buffer
            self.end_buffer = end_buffer
-            self.sep = sep
            
    async def update_diarization(self, end_attributed_speaker, buffer_diarization=""):
        """Thread-safe update of diarization with new data."""
@@ -152,17 +135,16 @@ class AudioProcessor:
                latest_end = max(self.end_buffer, self.tokens[-1].end if self.tokens else 0)
                remaining_diarization = max(0, round(latest_end - self.end_attributed_speaker, 1))
                
-            return {
-                "tokens": self.tokens.copy(),
-                "translated_segments": self.translated_segments.copy(),
-                "buffer_transcription": self.buffer_transcription,
-                "buffer_diarization": self.buffer_diarization,
-                "end_buffer": self.end_buffer,
-                "end_attributed_speaker": self.end_attributed_speaker,
-                "sep": self.sep,
-                "remaining_time_transcription": remaining_transcription,
-                "remaining_time_diarization": remaining_diarization
-            }
+            return State(
+                tokens=self.tokens.copy(),
+                translated_segments=self.translated_segments.copy(),
+                buffer_transcription=self.buffer_transcription,
+                buffer_diarization=self.buffer_diarization,
+                end_buffer=self.end_buffer,
+                end_attributed_speaker=self.end_attributed_speaker,
+                remaining_time_transcription=remaining_transcription,
+                remaining_time_diarization=remaining_diarization
+            )
            
    async def reset(self):
        """Reset all state variables to initial values."""
@@ -173,70 +155,8 @@ class AudioProcessor:
            self.end_buffer = self.end_attributed_speaker = 0
            self.beg_loop = time()

-    async def ffmpeg_stdout_reader(self):
-        """Read audio data from FFmpeg stdout and process it."""
-        beg = time()
-        
-        while True:
-            try:
-                # Check if FFmpeg is running
-                state = await self.ffmpeg_manager.get_state()
-                if state == FFmpegState.FAILED:
-                    logger.error("FFmpeg is in FAILED state, cannot read data")
-                    break
-                elif state == FFmpegState.STOPPED:
-                    logger.info("FFmpeg is stopped")
-                    break
-                elif state != FFmpegState.RUNNING:
-                    logger.warning(f"FFmpeg is in {state} state, waiting...")
-                    await asyncio.sleep(0.5)
-                    continue
-                
-                current_time = time()
-                elapsed_time = math.floor((current_time - beg) * 10) / 10
-                buffer_size = max(int(32000 * elapsed_time), 4096)
-                beg = current_time
-
-                chunk = await self.ffmpeg_manager.read_data(buffer_size)
-                        
-                if not chunk:
-                    if self.is_stopping:
-                        logger.info("FFmpeg stdout closed, stopping.")
-                        break
-                    else:
-                        # No data available, but not stopping - FFmpeg might be restarting
-                        await asyncio.sleep(0.1)
-                        continue
-                    
-                self.pcm_buffer.extend(chunk)
-                await self.handle_pcm_data()
-                    
-                    
-                    
-            except Exception as e:
-                logger.warning(f"Exception in ffmpeg_stdout_reader: {e}")
-                logger.warning(f"Traceback: {traceback.format_exc()}")
-                # Try to recover by waiting a bit
-                await asyncio.sleep(1)
-                
-                # Check if we should exit
-                if self.is_stopping:
-                    break
-        
-        logger.info("FFmpeg stdout processing finished. Signaling downstream processors.")
-        if self.args.transcription and self.transcription_queue:
-            await self.transcription_queue.put(SENTINEL)
-            logger.debug("Sentinel put into transcription_queue.")
-        if self.args.diarization and self.diarization_queue:
-            await self.diarization_queue.put(SENTINEL)
-            logger.debug("Sentinel put into diarization_queue.")
-        if self.args.target_language and self.translation_queue:
-            await self.translation_queue.put(SENTINEL)
-
-
    async def transcription_processor(self):
        """Process audio chunks for transcription."""
-        self.sep = self.online.asr.sep
        cumulative_pcm_duration_stream_time = 0.0
        
        while True:
@@ -276,7 +196,7 @@ class AudioProcessor:
                stream_time_end_of_current_pcm = cumulative_pcm_duration_stream_time

                self.online.insert_audio_chunk(pcm_array, stream_time_end_of_current_pcm)
-                new_tokens, current_audio_processed_upto = self.online.process_iter()
+                new_tokens, current_audio_processed_upto = await asyncio.to_thread(self.online.process_iter)
                
                # Get buffer information
                _buffer_transcript_obj = self.online.get_buffer()
@@ -300,7 +220,7 @@ class AudioProcessor:
                new_end_buffer = max(candidate_end_times)
                
                await self.update_transcription(
-                    new_tokens, buffer_text, new_end_buffer, self.sep
+                    new_tokens, buffer_text, new_end_buffer
                )
                
                if new_tokens and self.args.target_language and self.translation_queue:
@@ -314,6 +234,14 @@ class AudioProcessor:
                logger.warning(f"Traceback: {traceback.format_exc()}")
                if 'pcm_array' in locals() and pcm_array is not SENTINEL : # Check if pcm_array was assigned from queue
                    self.transcription_queue.task_done()
+        
+        if self.is_stopping:
+            logger.info("Transcription processor finishing due to stopping flag.")
+            if self.diarization_queue:
+                await self.diarization_queue.put(SENTINEL)
+            if self.translation_queue:
+                await self.translation_queue.put(SENTINEL)
+
        logger.info("Transcription processor task finished.")


@@ -385,7 +313,7 @@ class AudioProcessor:
                    tokens_to_process.append(additional_token)                
                if tokens_to_process:
                    online_translation.insert_tokens(tokens_to_process)
-                    self.translated_segments = online_translation.process()
+                    self.translated_segments = await asyncio.to_thread(online_translation.process)
                
                self.translation_queue.task_done()
                for _ in additional_tokens:
@@ -407,39 +335,16 @@ class AudioProcessor:

    async def results_formatter(self):
        """Format processing results for output."""
-        last_sent_trans = None
-        last_sent_diar = None
        while True:
            try:
-                ffmpeg_state = await self.ffmpeg_manager.get_state()
-                if ffmpeg_state == FFmpegState.FAILED and self._ffmpeg_error:
-                    yield {
-                        "status": "error",
-                        "error": f"FFmpeg error: {self._ffmpeg_error}",
-                        "lines": [],
-                        "buffer_transcription": "",
-                        "buffer_diarization": "",
-                        "remaining_time_transcription": 0,
-                        "remaining_time_diarization": 0
-                    }
-                    self._ffmpeg_error = None
-                    await asyncio.sleep(1)
-                    continue
-                
                # Get current state
                state = await self.get_current_state()
-                tokens = state["tokens"]
-                buffer_transcription = state["buffer_transcription"]
-                buffer_diarization = state["buffer_diarization"]
-                end_attributed_speaker = state["end_attributed_speaker"]
-                sep = state["sep"]
                                
                # Add dummy tokens if needed
-                if (not tokens or tokens[-1].is_dummy) and not self.args.transcription and self.args.diarization:
+                if (not state.tokens or state.tokens[-1].is_dummy) and not self.args.transcription and self.args.diarization:
                    await self.add_dummy_token()
-                    sleep(0.5)
+                    await asyncio.sleep(0.5)  # non-blocking wait: time.sleep() here would stall every other coroutine on the loop
                    state = await self.get_current_state()
-                    tokens = state["tokens"]
                
                # Format output
                lines, undiarized_text, buffer_transcription, buffer_diarization = format_output(
@@ -447,18 +352,19 @@ class AudioProcessor:
                    self.silence,
                    current_time = time() - self.beg_loop if self.beg_loop else None,
                    args = self.args,
-                    debug = self.debug
+                    debug = self.debug,
+                    sep=self.sep
                )
                # Handle undiarized text
                if undiarized_text:
-                    combined = sep.join(undiarized_text)
+                    combined = self.sep.join(undiarized_text)
                    if buffer_transcription:
-                        combined += sep
-                    await self.update_diarization(end_attributed_speaker, combined)
+                        combined += self.sep
+                    await self.update_diarization(state.end_attributed_speaker, combined)
                    buffer_diarization = combined
                
                response_status = "active_transcription"
-                if not tokens and not buffer_transcription and not buffer_diarization:
+                if not state.tokens and not buffer_transcription and not buffer_diarization:
                    response_status = "no_audio_detected"
                    lines = []
                elif response_status == "active_transcription" and not lines:
@@ -468,32 +374,19 @@ class AudioProcessor:
                        end=state.get("end_buffer", 0)        
                    )]
                
-                response = {
-                    "status": response_status,
-                    "lines": [line.to_dict() for line in lines],
-                    "buffer_transcription": buffer_transcription,
-                    "buffer_diarization": buffer_diarization,
-                    "remaining_time_transcription": state["remaining_time_transcription"],
-                    "remaining_time_diarization": state["remaining_time_diarization"] if self.args.diarization else 0
-                }
-                
-                current_response_signature = f"{response_status} | " + \
-                                           ' '.join([f"{line.speaker} {line.text}" for line in lines]) + \
-                                           f" | {buffer_transcription} | {buffer_diarization}"
-                
-                trans = state["remaining_time_transcription"]
-                diar = state["remaining_time_diarization"]
-                should_push = (
-                    current_response_signature != self.last_response_content
-                    or last_sent_trans is None
-                    or round(trans, 1) != round(last_sent_trans, 1)
-                    or round(diar, 1) != round(last_sent_diar, 1)
+                response = FrontData(
+                    status=response_status,
+                    lines=lines,
+                    buffer_transcription=buffer_transcription,
+                    buffer_diarization=buffer_diarization,
+                    remaining_time_transcription=state.remaining_time_transcription,
+                    remaining_time_diarization=state.remaining_time_diarization if self.args.diarization else 0
                )
-                if should_push and (lines or buffer_transcription or buffer_diarization or response_status == "no_audio_detected" or trans > 0 or diar > 0):
+                                
+                should_push = (response != self.last_response_content)
+                if should_push and (lines or buffer_transcription or buffer_diarization or response_status == "no_audio_detected"):
                    yield response
-                    self.last_response_content = current_response_signature
-                    last_sent_trans = trans
-                    last_sent_diar = diar
+                    self.last_response_content = response
                
                # Check for termination condition
                if self.is_stopping:
@@ -507,33 +400,18 @@ class AudioProcessor:
                        logger.info("Results formatter: All upstream processors are done and in stopping state. Terminating.")
                        return
                
-                await asyncio.sleep(0.1)  # Avoid overwhelming the client
+                await asyncio.sleep(0.05)
                
            except Exception as e:
                logger.warning(f"Exception in results_formatter: {e}")
                logger.warning(f"Traceback: {traceback.format_exc()}")
-                await asyncio.sleep(0.5)  # Back off on error
+                await asyncio.sleep(0.5)
        
    async def create_tasks(self):
        """Create and start processing tasks."""
        self.all_tasks_for_cleanup = []
        processing_tasks_for_watchdog = []

-        success = await self.ffmpeg_manager.start()
-        if not success:
-            logger.error("Failed to start FFmpeg manager")
-            async def error_generator():
-                yield {
-                    "status": "error", 
-                    "error": "FFmpeg failed to start. Please check that FFmpeg is installed.",
-                    "lines": [],
-                    "buffer_transcription": "",
-                    "buffer_diarization": "",
-                    "remaining_time_transcription": 0,
-                    "remaining_time_diarization": 0
-                }
-            return error_generator()
-
        if self.args.transcription and self.online:
            self.transcription_task = asyncio.create_task(self.transcription_processor())
            self.all_tasks_for_cleanup.append(self.transcription_task)
@@ -549,10 +427,6 @@ class AudioProcessor:
            self.all_tasks_for_cleanup.append(self.translation_task)
            processing_tasks_for_watchdog.append(self.translation_task)
        
-        self.ffmpeg_reader_task = asyncio.create_task(self.ffmpeg_stdout_reader())
-        self.all_tasks_for_cleanup.append(self.ffmpeg_reader_task)
-        processing_tasks_for_watchdog.append(self.ffmpeg_reader_task)
-
        # Monitor overall system health
        self.watchdog_task = asyncio.create_task(self.watchdog(processing_tasks_for_watchdog))
        self.all_tasks_for_cleanup.append(self.watchdog_task)
@@ -573,15 +447,6 @@ class AudioProcessor:
                            logger.error(f"{task_name} unexpectedly completed with exception: {exc}")
                        else:
                            logger.info(f"{task_name} completed normally.")
-                
-                # Check FFmpeg status through the manager
-                ffmpeg_state = await self.ffmpeg_manager.get_state()
-                if ffmpeg_state == FFmpegState.FAILED:
-                    logger.error("FFmpeg is in FAILED state, notifying results formatter")
-                    # FFmpeg manager will handle its own recovery
-                elif ffmpeg_state == FFmpegState.STOPPED and not self.is_stopping:
-                    logger.warning("FFmpeg unexpectedly stopped, attempting restart")
-                    await self.ffmpeg_manager.restart()
                    
            except asyncio.CancelledError:
                logger.info("Watchdog task cancelled.")
@@ -601,8 +466,6 @@ class AudioProcessor:
            if created_tasks:
                await asyncio.gather(*created_tasks, return_exceptions=True)
            logger.info("All processing tasks cancelled or finished.")
-            await self.ffmpeg_manager.stop()
-            logger.info("FFmpeg manager stopped.")
            if self.args.diarization and hasattr(self, 'diarization') and hasattr(self.diarization, 'close'):
                self.diarization.close()
            logger.info("AudioProcessor cleanup complete.")
@@ -617,8 +480,10 @@ class AudioProcessor:
        if not message:
            logger.info("Empty audio message received, initiating stop sequence.")
            self.is_stopping = True
-            # Signal FFmpeg manager to stop accepting data
-            await self.ffmpeg_manager.stop()
+            
+            if self.transcription_queue:
+                await self.transcription_queue.put(SENTINEL)
+
            return

        if self.is_stopping:
@@ -628,14 +493,6 @@ class AudioProcessor:
        if self.is_pcm_input:
            self.pcm_buffer.extend(message)
            await self.handle_pcm_data()
-        else:
-            success = await self.ffmpeg_manager.write_data(message)
-            if not success:
-                ffmpeg_state = await self.ffmpeg_manager.get_state()
-                if ffmpeg_state == FFmpegState.FAILED:
-                    logger.error("FFmpeg is in FAILED state, cannot process audio")
-                else:
-                    logger.warning("Failed to write audio data to FFmpeg")

    async def handle_pcm_data(self):
        # Process when enough data
--- a/whisperlivekit/basic_server.py
+++ b/whisperlivekit/basic_server.py
@@ -54,7 +54,7 @@ async def handle_websocket_results(websocket, results_generator):
    """Consumes results from the audio processor and sends them via WebSocket."""
    try:
        async for response in results_generator:
-            await websocket.send_json(response)
+            await websocket.send_json(response.to_dict())
        # when the results_generator finishes it means all audio has been processed
        logger.info("Results generator finished. Sending 'ready_to_stop' to client.")
        await websocket.send_json({"type": "ready_to_stop"})
--- a/whisperlivekit/core.py
+++ b/whisperlivekit/core.py
@@ -4,7 +4,7 @@ try:
 except ImportError:
    from .whisper_streaming_custom.whisper_online import backend_factory
    from .whisper_streaming_custom.online_asr import OnlineASRProcessor
-from whisperlivekit.warmup import warmup_asr, warmup_online
+from whisperlivekit.warmup import warmup_asr
 from argparse import Namespace
 import sys

@@ -120,7 +120,7 @@ class TranscriptionEngine:

            else:
                self.asr, self.tokenizer = backend_factory(self.args)
-            warmup_asr(self.asr, self.args.warmup_file) #for simulstreaming, warmup should be done in the online class not here
+                warmup_asr(self.asr, self.args.warmup_file) #for simulstreaming, warmup should be done in the online class not here

        if self.args.diarization:
            if self.args.diarization_backend == "diart":
@@ -155,7 +155,6 @@ def online_factory(args, asr, tokenizer, logfile=sys.stderr):
            asr,
            logfile=logfile,
        )
-        # warmup_online(online, args.warmup_file)
    else:
        online = OnlineASRProcessor(
            asr,
--- a/whisperlivekit/ffmpeg_manager.py
+++ b/whisperlivekit/ffmpeg_manager.py
@@ -1,193 +0,0 @@
-import asyncio
-import logging
-from enum import Enum
-from typing import Optional, Callable
-import contextlib
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
-
-ERROR_INSTALL_INSTRUCTIONS = """
-FFmpeg is not installed or not found in your system's PATH.
-Please install FFmpeg to enable audio processing.
-
-Installation instructions:
-
-# Ubuntu/Debian:
-sudo apt update && sudo apt install ffmpeg
-
-# macOS (using Homebrew):
-brew install ffmpeg
-
-# Windows:
-# 1. Download the latest static build from https://ffmpeg.org/download.html
-# 2. Extract the archive (e.g., to C:\\FFmpeg).
-# 3. Add the 'bin' directory (e.g., C:\\FFmpeg\\bin) to your system's PATH environment variable.
-
-After installation, please restart the application.
-"""
-
-class FFmpegState(Enum):
-    STOPPED = "stopped"
-    STARTING = "starting"
-    RUNNING = "running"
-    RESTARTING = "restarting"
-    FAILED = "failed"
-
-class FFmpegManager:
-    def __init__(self, sample_rate: int = 16000, channels: int = 1):
-        self.sample_rate = sample_rate
-        self.channels = channels
-
-        self.process: Optional[asyncio.subprocess.Process] = None
-        self._stderr_task: Optional[asyncio.Task] = None
-
-        self.on_error_callback: Optional[Callable[[str], None]] = None
-
-        self.state = FFmpegState.STOPPED
-        self._state_lock = asyncio.Lock()
-
-    async def start(self) -> bool:
-        async with self._state_lock:
-            if self.state != FFmpegState.STOPPED:
-                logger.warning(f"FFmpeg already running in state: {self.state}")
-                return False
-            self.state = FFmpegState.STARTING
-
-        try:
-            cmd = [
-                "ffmpeg",
-                "-hide_banner",
-                "-loglevel", "error",
-                "-i", "pipe:0",
-                "-f", "s16le",
-                "-acodec", "pcm_s16le",
-                "-ac", str(self.channels),
-                "-ar", str(self.sample_rate),
-                "pipe:1"
-            ]
-
-            self.process = await asyncio.create_subprocess_exec(
-                *cmd,
-                stdin=asyncio.subprocess.PIPE,
-                stdout=asyncio.subprocess.PIPE,
-                stderr=asyncio.subprocess.PIPE
-            )
-
-            self._stderr_task = asyncio.create_task(self._drain_stderr())
-
-            async with self._state_lock:
-                self.state = FFmpegState.RUNNING
-
-            logger.info("FFmpeg started.")
-            return True
-
-        except FileNotFoundError:
-            logger.error(ERROR_INSTALL_INSTRUCTIONS)
-            async with self._state_lock:
-                self.state = FFmpegState.FAILED
-            if self.on_error_callback:
-                await self.on_error_callback("ffmpeg_not_found")
-            return False
-
-        except Exception as e:
-            logger.error(f"Error starting FFmpeg: {e}")
-            async with self._state_lock:
-                self.state = FFmpegState.FAILED
-            if self.on_error_callback:
-                await self.on_error_callback("start_failed")
-            return False
-
-    async def stop(self):
-        async with self._state_lock:
-            if self.state == FFmpegState.STOPPED:
-                return
-            self.state = FFmpegState.STOPPED
-
-        if self.process:
-            if self.process.stdin and not self.process.stdin.is_closing():
-                self.process.stdin.close()
-                await self.process.stdin.wait_closed()
-            await self.process.wait()
-            self.process = None
-
-        if self._stderr_task:
-            self._stderr_task.cancel()
-            with contextlib.suppress(asyncio.CancelledError):
-                await self._stderr_task
-
-        logger.info("FFmpeg stopped.")
-
-    async def write_data(self, data: bytes) -> bool:
-        async with self._state_lock:
-            if self.state != FFmpegState.RUNNING:
-                logger.warning(f"Cannot write, FFmpeg state: {self.state}")
-                return False
-
-        try:
-            self.process.stdin.write(data)
-            await self.process.stdin.drain()
-            return True
-        except Exception as e:
-            logger.error(f"Error writing to FFmpeg: {e}")
-            if self.on_error_callback:
-                await self.on_error_callback("write_error")
-            return False
-
-    async def read_data(self, size: int) -> Optional[bytes]:
-        async with self._state_lock:
-            if self.state != FFmpegState.RUNNING:
-                logger.warning(f"Cannot read, FFmpeg state: {self.state}")
-                return None
-
-        try:
-            data = await asyncio.wait_for(
-                self.process.stdout.read(size),
-                timeout=20.0
-            )
-            return data
-        except asyncio.TimeoutError:
-            logger.warning("FFmpeg read timeout.")
-            return None
-        except Exception as e:
-            logger.error(f"Error reading from FFmpeg: {e}")
-            if self.on_error_callback:
-                await self.on_error_callback("read_error")
-            return None
-
-    async def get_state(self) -> FFmpegState:
-        async with self._state_lock:
-            return self.state
-
-    async def restart(self) -> bool:
-        async with self._state_lock:
-            if self.state == FFmpegState.RESTARTING:
-                logger.warning("Restart already in progress.")
-                return False
-            self.state = FFmpegState.RESTARTING
-
-        logger.info("Restarting FFmpeg...")
-
-        try:
-            await self.stop()
-            await asyncio.sleep(1)  # short delay before restarting
-            return await self.start()
-        except Exception as e:
-            logger.error(f"Error during FFmpeg restart: {e}")
-            async with self._state_lock:
-                self.state = FFmpegState.FAILED
-            if self.on_error_callback:
-                await self.on_error_callback("restart_failed")
-            return False
-
-    async def _drain_stderr(self):
-        try:
-            while True:
-                line = await self.process.stderr.readline()
-                if not line:
-                    break
-                logger.debug(f"FFmpeg stderr: {line.decode(errors='ignore').strip()}")
-        except asyncio.CancelledError:
-            logger.info("FFmpeg stderr drain task cancelled.")
-        except Exception as e:
-            logger.error(f"Error draining FFmpeg stderr: {e}")
--- a/whisperlivekit/parse_args.py
+++ b/whisperlivekit/parse_args.py
@@ -20,7 +20,7 @@ def parse_args():
        help="""
        The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast.
        If not set, uses https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav.
-        If False, no warmup is performed.
+        If empty, no warmup is performed.
        """,
    )

--- a/whisperlivekit/results_formater.py
+++ b/whisperlivekit/results_formater.py
@@ -46,15 +46,14 @@ def append_token_to_last_line(lines, sep, token, debug_info):
        lines[-1].text += sep + token.text + debug_info
        lines[-1].end = token.end

-def format_output(state, silence, current_time, args, debug):
+def format_output(state, silence, current_time, args, debug, sep):
    diarization = args.diarization
    disable_punctuation_split = args.disable_punctuation_split
-    tokens = state["tokens"]
-    translated_segments = state["translated_segments"] # Here we will attribute the speakers only based on the timestamps of the segments
-    buffer_transcription = state["buffer_transcription"]
-    buffer_diarization = state["buffer_diarization"]
-    end_attributed_speaker = state["end_attributed_speaker"]
-    sep = state["sep"]
+    tokens = state.tokens
+    translated_segments = state.translated_segments # Here we will attribute the speakers only based on the timestamps of the segments
+    buffer_transcription = state.buffer_transcription
+    buffer_diarization = state.buffer_diarization
+    end_attributed_speaker = state.end_attributed_speaker
    
    previous_speaker = -1
    lines = []
@@ -128,7 +127,7 @@ def format_output(state, silence, current_time, args, debug):
        for line in lines:
            while cts_idx < len(translated_segments):
                ts = translated_segments[cts_idx]
-                if ts.start and ts.start >= line.start and ts.end <= line.end:
+                if ts and ts.start and ts.start >= line.start and ts.end <= line.end:
                    line.translation += ts.text + ' '
                    cts_idx += 1
                else:
--- a/whisperlivekit/timed_objects.py
+++ b/whisperlivekit/timed_objects.py
@@ -1,4 +1,4 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Optional
 from datetime import timedelta

@@ -57,4 +57,38 @@ class Line(TimedText):
            'translation': self.translation,
            'start': format_time(self.start),
            'end': format_time(self.end),
-        }
+        }
+        
+@dataclass  
+class FrontData():
+    status: str = ''
+    error: str = ''
+    lines: list[Line] = field(default_factory=list)
+    buffer_transcription: str = ''
+    buffer_diarization: str = ''
+    remaining_time_transcription: float = 0.
+    remaining_time_diarization: float = 0.
+    
+    def to_dict(self):
+        _dict = {
+            'status': self.status,
+            'lines': [line.to_dict() for line in self.lines],
+            'buffer_transcription': self.buffer_transcription,
+            'buffer_diarization': self.buffer_diarization,
+            'remaining_time_transcription': self.remaining_time_transcription,
+            'remaining_time_diarization': self.remaining_time_diarization,
+        }
+        if self.error:
+            _dict['error'] = self.error
+        return _dict
+    
+@dataclass  
+class State():
+    """Processing state passed to ``format_output``.
+
+    ``format_output`` reads ``tokens``, ``translated_segments``,
+    ``buffer_transcription``, ``buffer_diarization`` and
+    ``end_attributed_speaker`` from this object. All fields are required;
+    none has a default value.
+    """
+    tokens: list
+    # format_output attaches these to lines by comparing start/end timestamps.
+    translated_segments: list
+    buffer_transcription: str
+    buffer_diarization: str
+    end_buffer: float
+    end_attributed_speaker: float
+    remaining_time_transcription: float
+    remaining_time_diarization: float
--- a/whisperlivekit/warmup.py
+++ b/whisperlivekit/warmup.py
@@ -6,57 +6,46 @@ logger = logging.getLogger(__name__)
 def load_file(warmup_file=None, timeout=5):
    import os
    import tempfile
+    import urllib.request
    import librosa
-        
+
+    if warmup_file == "":
+        logger.info(f"Skipping warmup.")
+        return None
+
+    # Download JFK sample if not already present
    if warmup_file is None:
-        # Download JFK sample if not already present
        jfk_url = "https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav"
        temp_dir = tempfile.gettempdir()
        warmup_file = os.path.join(temp_dir, "whisper_warmup_jfk.wav")
-        
-        if not os.path.exists(warmup_file):
-            logger.debug(f"Downloading warmup file from {jfk_url}")
-            print(f"Downloading warmup file from {jfk_url}")
-            import time
-            import urllib.request
-            import urllib.error
-            import socket
-            
-            original_timeout = socket.getdefaulttimeout()
-            socket.setdefaulttimeout(timeout)
-            
-            start_time = time.time()
+        if not os.path.exists(warmup_file) or os.path.getsize(warmup_file) == 0:
            try:
-                urllib.request.urlretrieve(jfk_url, warmup_file)
-                logger.debug(f"Download successful in {time.time() - start_time:.2f}s")
-            except (urllib.error.URLError, socket.timeout) as e:
-                logger.warning(f"Download failed: {e}. Proceeding without warmup.")
+                logger.debug(f"Downloading warmup file from {jfk_url}")
+                with urllib.request.urlopen(jfk_url, timeout=timeout) as r, open(warmup_file, "wb") as f:
+                    f.write(r.read())
+            except Exception as e:
+                logger.warning(f"Warmup file download failed: {e}.")
                return None
-            finally:
-                socket.setdefaulttimeout(original_timeout)
-    elif not warmup_file:
-        return None 
-    
-    if not warmup_file or not os.path.exists(warmup_file) or os.path.getsize(warmup_file) == 0:
-        logger.warning(f"Warmup file {warmup_file} invalid or missing.")
+
+    # Validate file and load
+    if not os.path.exists(warmup_file) or os.path.getsize(warmup_file) == 0:
+        logger.warning(f"Warmup file {warmup_file} is invalid or missing.")
        return None
-    
+
    try:
-        audio, sr = librosa.load(warmup_file, sr=16000)
+        audio, _ = librosa.load(warmup_file, sr=16000)
+        return audio
    except Exception as e:
-        logger.warning(f"Failed to load audio file: {e}")
+        logger.warning(f"Failed to load warmup file: {e}")
        return None
-    return audio

 def warmup_asr(asr, warmup_file=None, timeout=5):
    """
    Warmup the ASR model by transcribing a short audio file.
    """
-    audio = load_file(warmup_file=None, timeout=5)
+    audio = load_file(warmup_file=warmup_file, timeout=timeout)
+    if audio is None:
+        logger.warning("Warmup file unavailable. Skipping ASR warmup.")
+        return
    asr.transcribe(audio)
-    logger.info("ASR model is warmed up")
-    
-def warmup_online(online, warmup_file=None, timeout=5):
-    audio = load_file(warmup_file=None, timeout=5)
-    online.warmup(audio)
-    logger.warning("ASR is warmed up")
+    logger.info("ASR model is warmed up.")
--- a/whisperlivekit/web/live_transcription.js
+++ b/whisperlivekit/web/live_transcription.js
@@ -12,6 +12,8 @@ let timerInterval = null;
 let audioContext = null;
 let analyser = null;
 let microphone = null;
+let workletNode = null;
+let recorderWorker = null;
 let waveCanvas = document.getElementById("waveCanvas");
 let waveCtx = waveCanvas.getContext("2d");
 let animationFrame = null;
@@ -457,13 +459,38 @@ async function startRecording() {
    microphone = audioContext.createMediaStreamSource(stream);
    microphone.connect(analyser);

-    recorder = new MediaRecorder(stream, { mimeType: "audio/webm" });
-    recorder.ondataavailable = (e) => {
+    if (!audioContext.audioWorklet) {
+      throw new Error("AudioWorklet is not supported in this browser");
+    }
+    await audioContext.audioWorklet.addModule("/web/pcm_worklet.js");
+    workletNode = new AudioWorkletNode(audioContext, "pcm-forwarder", { numberOfInputs: 1, numberOfOutputs: 0, channelCount: 1 });
+    microphone.connect(workletNode);
+
+    recorderWorker = new Worker("/web/recorder_worker.js");
+    recorderWorker.postMessage({
+      command: "init",
+      config: {
+        sampleRate: audioContext.sampleRate,
+      },
+    });
+
+    recorderWorker.onmessage = (e) => {
      if (websocket && websocket.readyState === WebSocket.OPEN) {
-        websocket.send(e.data);
+        websocket.send(e.data.buffer);
      }
    };
-    recorder.start(chunkDuration);
+
+    workletNode.port.onmessage = (e) => {
+      const data = e.data;
+      const ab = data instanceof ArrayBuffer ? data : data.buffer;
+      recorderWorker.postMessage(
+        {
+          command: "record",
+          buffer: ab,
+        },
+        [ab]
+      );
+    };

    startTime = Date.now();
    timerInterval = setInterval(updateTimer, 1000);
@@ -501,9 +528,19 @@ async function stopRecording() {
    statusText.textContent = "Recording stopped. Processing final audio...";
  }

-  if (recorder) {
-    recorder.stop();
-    recorder = null;
+  if (recorderWorker) {
+    recorderWorker.terminate();
+    recorderWorker = null;
+  }
+  
+  if (workletNode) {
+    try {
+      workletNode.port.onmessage = null;
+    } catch (e) {}
+    try {
+      workletNode.disconnect();
+    } catch (e) {}
+    workletNode = null;
  }

  if (microphone) {
--- a/whisperlivekit/web/pcm_worklet.js
+++ b/whisperlivekit/web/pcm_worklet.js
@@ -0,0 +1,16 @@
+class PCMForwarder extends AudioWorkletProcessor {
+  process(inputs) {
+    const input = inputs[0];
+    if (input && input[0] && input[0].length) {
+      // Forward mono channel (0). If multi-channel, downmixing can be added here.
+      const channelData = input[0];
+      const copy = new Float32Array(channelData.length);
+      copy.set(channelData);
+      this.port.postMessage(copy, [copy.buffer]);
+    }
+    // Keep processor alive
+    return true;
+  }
+}
+
+registerProcessor('pcm-forwarder', PCMForwarder);
--- a/whisperlivekit/web/recorder_worker.js
+++ b/whisperlivekit/web/recorder_worker.js
@@ -0,0 +1,58 @@
+let sampleRate = 48000;
+let targetSampleRate = 16000;
+
+self.onmessage = function (e) {
+  switch (e.data.command) {
+    case 'init':
+      init(e.data.config);
+      break;
+    case 'record':
+      record(e.data.buffer);
+      break;
+  }
+};
+
+function init(config) {
+  sampleRate = config.sampleRate;
+  targetSampleRate = config.targetSampleRate || 16000;
+}
+
+function record(inputBuffer) {
+  const buffer = new Float32Array(inputBuffer);
+  const resampledBuffer = resample(buffer, sampleRate, targetSampleRate);
+  const pcmBuffer = toPCM(resampledBuffer);
+  self.postMessage({ buffer: pcmBuffer }, [pcmBuffer]);
+}
+
+function resample(buffer, from, to) {
+    if (from === to) {
+        return buffer;
+    }
+    const ratio = from / to;
+    const newLength = Math.round(buffer.length / ratio);
+    const result = new Float32Array(newLength);
+    let offsetResult = 0;
+    let offsetBuffer = 0;
+    while (offsetResult < result.length) {
+        const nextOffsetBuffer = Math.round((offsetResult + 1) * ratio);
+        let accum = 0, count = 0;
+        for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
+            accum += buffer[i];
+            count++;
+        }
+        result[offsetResult] = accum / count;
+        offsetResult++;
+        offsetBuffer = nextOffsetBuffer;
+    }
+    return result;
+}
+
+function toPCM(input) {
+  const buffer = new ArrayBuffer(input.length * 2);
+  const view = new DataView(buffer);
+  for (let i = 0; i < input.length; i++) {
+    const s = Math.max(-1, Math.min(1, input[i]));
+    view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
+  }
+  return buffer;
+}
Author	SHA1	Message	Date
Quentin Fuxa	4d7c487614	replace deprecated ScriptProcessorNode with AudioWorklet	2025-09-17 10:53:53 +02:00
GeorgeCaoJ	2a27d2030a	feat: support web audio 16kHz PCM input and remove ffmpeg dependency	2025-09-15 23:22:25 +08:00
Quentin Fuxa	cd160caaa1	asyncio.to_thread for transcription and translation	2025-09-15 15:23:22 +02:00
Quentin Fuxa	d27b5eb23e	Merge pull request #219 from notV3NOM/main Fix warmup file behavior	2025-09-15 10:19:26 +02:00
Quentin Fuxa	f9d704a900	Merge branch 'main' of https://github.com/notv3nom/whisperlivekit into pr/notV3NOM/219	2025-09-15 10:00:14 +02:00
Quentin Fuxa	2f6e00f512	simulstreaming warmup is done in whisperlivekit.simul_whisper.backend.load_model, not in warmup_online	2025-09-15 09:43:15 +02:00
Quentin Fuxa	5aa312e437	simulstreaming warmup is done in whisperlivekit.simul_whisper.backend.load_model, not in warmup_online	2025-09-13 20:19:19 +01:00
notV3NOM	ebaf36a8be	Fix warmup file behavior	2025-09-13 20:44:24 +05:30