Files
WhisperLiveKit/whisperlivekit/audio_processor.py
2026-01-25 13:48:00 +01:00

751 lines
34 KiB
Python

import asyncio
import logging
import traceback
from time import time
from typing import Any, AsyncGenerator, List, Optional, Union
import numpy as np
from whisperlivekit.core import (
TranscriptionEngine,
online_diarization_factory,
online_factory,
online_translation_factory,
)
from whisperlivekit.ffmpeg_manager import FFmpegManager, FFmpegState
from whisperlivekit.metrics_collector import SessionMetrics
from whisperlivekit.silero_vad_iterator import FixedVADIterator, OnnxWrapper, load_jit_vad
from whisperlivekit.timed_objects import ChangeSpeaker, FrontData, Silence, State
from whisperlivekit.tokens_alignment import TokensAlignment
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
SENTINEL = object() # unique sentinel object for end of stream marker
MIN_DURATION_REAL_SILENCE = 5
async def get_all_from_queue(queue: asyncio.Queue) -> Union[object, Silence, np.ndarray, List[Any]]:
items: List[Any] = []
first_item = await queue.get()
queue.task_done()
if first_item is SENTINEL:
return first_item
if isinstance(first_item, Silence):
return first_item
items.append(first_item)
while True:
if not queue._queue:
break
next_item = queue._queue[0]
if next_item is SENTINEL:
break
if isinstance(next_item, Silence):
break
items.append(await queue.get())
queue.task_done()
if isinstance(items[0], np.ndarray):
return np.concatenate(items)
else: #translation
return items
class AudioProcessor:
"""
Processes audio streams for transcription and diarization.
Handles audio processing, state management, and result formatting.
"""
def __init__(self, **kwargs: Any) -> None:
"""Initialize the audio processor with configuration, models, and state."""
# Extract per-session language override before passing to TranscriptionEngine
session_language = kwargs.pop('language', None)
if 'transcription_engine' in kwargs and isinstance(kwargs['transcription_engine'], TranscriptionEngine):
models = kwargs['transcription_engine']
else:
models = TranscriptionEngine(**kwargs)
# Audio processing settings
self.args = models.args
self.sample_rate = 16000
self.channels = 1
chunk_seconds = self.args.vac_chunk_size if self.args.vac else self.args.min_chunk_size
self.samples_per_sec = int(self.sample_rate * chunk_seconds)
self.bytes_per_sample = 2
self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample
self.max_bytes_per_sec = 32000 * 5 # 5 seconds of audio at 32 kHz
self.is_pcm_input = self.args.pcm_input
# State management
self.is_stopping: bool = False
self.current_silence: Optional[Silence] = None
self.state: State = State()
self.lock: asyncio.Lock = asyncio.Lock()
self.sep: str = " " # Default separator
self.last_response_content: FrontData = FrontData()
self.tokens_alignment: TokensAlignment = TokensAlignment(self.state, self.args, self.sep)
self.beg_loop: Optional[float] = None
# Models and processing
self.asr: Any = models.asr
self.vac: Optional[FixedVADIterator] = None
if self.args.vac:
if models.vac_session is not None:
vac_model = OnnxWrapper(session=models.vac_session)
self.vac = FixedVADIterator(vac_model)
else:
self.vac = FixedVADIterator(load_jit_vad())
self.ffmpeg_manager: Optional[FFmpegManager] = None
self.ffmpeg_reader_task: Optional[asyncio.Task] = None
self._ffmpeg_error: Optional[str] = None
if not self.is_pcm_input:
self.ffmpeg_manager = FFmpegManager(
sample_rate=self.sample_rate,
channels=self.channels
)
async def handle_ffmpeg_error(error_type: str):
logger.error(f"FFmpeg error: {error_type}")
self._ffmpeg_error = error_type
self.ffmpeg_manager.on_error_callback = handle_ffmpeg_error
self.transcription_queue: Optional[asyncio.Queue] = asyncio.Queue() if self.args.transcription else None
self.diarization_queue: Optional[asyncio.Queue] = asyncio.Queue() if self.args.diarization else None
self.translation_queue: Optional[asyncio.Queue] = asyncio.Queue() if self.args.target_language else None
self.pcm_buffer: bytearray = bytearray()
self.total_pcm_samples: int = 0
self.transcription_task: Optional[asyncio.Task] = None
self.diarization_task: Optional[asyncio.Task] = None
self.translation_task: Optional[asyncio.Task] = None
self.watchdog_task: Optional[asyncio.Task] = None
self.all_tasks_for_cleanup: List[asyncio.Task] = []
self.metrics: SessionMetrics = SessionMetrics()
self.transcription: Optional[Any] = None
self.translation: Optional[Any] = None
self.diarization: Optional[Any] = None
if self.args.transcription:
self.transcription = online_factory(self.args, models.asr, language=session_language)
self.sep = self.transcription.asr.sep
if self.args.diarization:
self.diarization = online_diarization_factory(self.args, models.diarization_model)
if models.translation_model:
self.translation = online_translation_factory(self.args, models.translation_model)
async def _push_silence_event(self) -> None:
if self.transcription_queue:
await self.transcription_queue.put(self.current_silence)
if self.args.diarization and self.diarization_queue:
await self.diarization_queue.put(self.current_silence)
if self.translation_queue:
await self.translation_queue.put(self.current_silence)
async def _begin_silence(self, at_sample: Optional[int] = None) -> None:
if self.current_silence:
return
# Use audio stream time (sample-precise) for accurate silence duration
if at_sample is not None:
audio_t = at_sample / self.sample_rate
else:
audio_t = self.total_pcm_samples / self.sample_rate if self.sample_rate else 0.0
self.current_silence = Silence(
is_starting=True, start=audio_t
)
# Push a separate start-only event so _end_silence won't mutate it
start_event = Silence(is_starting=True, start=audio_t)
if self.transcription_queue:
await self.transcription_queue.put(start_event)
if self.args.diarization and self.diarization_queue:
await self.diarization_queue.put(start_event)
if self.translation_queue:
await self.translation_queue.put(start_event)
async def _end_silence(self, at_sample: Optional[int] = None) -> None:
if not self.current_silence:
return
if at_sample is not None:
audio_t = at_sample / self.sample_rate
else:
audio_t = self.total_pcm_samples / self.sample_rate if self.sample_rate else 0.0
self.current_silence.end = audio_t
self.current_silence.is_starting = False
self.current_silence.has_ended = True
self.current_silence.compute_duration()
self.metrics.n_silence_events += 1
if self.current_silence.duration is not None:
self.metrics.total_silence_duration_s += self.current_silence.duration
if self.current_silence.duration and self.current_silence.duration > MIN_DURATION_REAL_SILENCE:
self.state.new_tokens.append(self.current_silence)
# Push the completed silence as the end event (separate from the start event)
await self._push_silence_event()
self.current_silence = None
async def _enqueue_active_audio(self, pcm_chunk: np.ndarray) -> None:
if pcm_chunk is None or pcm_chunk.size == 0:
return
if self.transcription_queue:
await self.transcription_queue.put(pcm_chunk.copy())
if self.args.diarization and self.diarization_queue:
await self.diarization_queue.put(pcm_chunk.copy())
def _slice_before_silence(self, pcm_array: np.ndarray, chunk_sample_start: int, silence_sample: Optional[int]) -> Optional[np.ndarray]:
if silence_sample is None:
return None
relative_index = int(silence_sample - chunk_sample_start)
if relative_index <= 0:
return None
split_index = min(relative_index, len(pcm_array))
if split_index <= 0:
return None
return pcm_array[:split_index]
def convert_pcm_to_float(self, pcm_buffer: Union[bytes, bytearray]) -> np.ndarray:
"""Convert PCM buffer in s16le format to normalized NumPy array."""
return np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0
async def get_current_state(self) -> State:
"""Get current state."""
async with self.lock:
current_time = time()
remaining_transcription = 0
if self.state.end_buffer > 0:
remaining_transcription = max(0, round(current_time - self.beg_loop - self.state.end_buffer, 1))
remaining_diarization = 0
if self.state.tokens:
latest_end = max(self.state.end_buffer, self.state.tokens[-1].end if self.state.tokens else 0)
remaining_diarization = max(0, round(latest_end - self.state.end_attributed_speaker, 1))
self.state.remaining_time_transcription = remaining_transcription
self.state.remaining_time_diarization = remaining_diarization
return self.state
async def ffmpeg_stdout_reader(self) -> None:
"""Read audio data from FFmpeg stdout and process it into the PCM pipeline."""
beg = time()
while True:
try:
if self.is_stopping:
logger.info("Stopping ffmpeg_stdout_reader due to stopping flag.")
break
state = await self.ffmpeg_manager.get_state() if self.ffmpeg_manager else FFmpegState.STOPPED
if state == FFmpegState.FAILED:
logger.error("FFmpeg is in FAILED state, cannot read data")
break
elif state == FFmpegState.STOPPED:
logger.info("FFmpeg is stopped")
break
elif state != FFmpegState.RUNNING:
await asyncio.sleep(0.1)
continue
current_time = time()
elapsed_time = max(0.0, current_time - beg)
buffer_size = max(int(32000 * elapsed_time), 4096) # dynamic read
beg = current_time
chunk = await self.ffmpeg_manager.read_data(buffer_size)
if not chunk:
# No data currently available
await asyncio.sleep(0.05)
continue
self.pcm_buffer.extend(chunk)
await self.handle_pcm_data()
except asyncio.CancelledError:
logger.info("ffmpeg_stdout_reader cancelled.")
break
except Exception as e:
logger.warning(f"Exception in ffmpeg_stdout_reader: {e}")
logger.debug(f"Traceback: {traceback.format_exc()}")
await asyncio.sleep(0.2)
logger.info("FFmpeg stdout processing finished. Signaling downstream processors if needed.")
if self.transcription_queue:
await self.transcription_queue.put(SENTINEL)
if self.diarization:
await self.diarization_queue.put(SENTINEL)
if self.translation:
await self.translation_queue.put(SENTINEL)
async def _finish_transcription(self) -> None:
"""Call finish() on the online processor to flush remaining tokens."""
if not self.transcription:
return
try:
if hasattr(self.transcription, 'finish'):
final_tokens, end_time = await asyncio.to_thread(self.transcription.finish)
else:
# SimulStreamingOnlineProcessor uses start_silence() → process_iter(is_last=True)
final_tokens, end_time = await asyncio.to_thread(self.transcription.start_silence)
final_tokens = final_tokens or []
if final_tokens:
logger.info(f"Finish flushed {len(final_tokens)} tokens")
self.metrics.n_tokens_produced += len(final_tokens)
_buffer_transcript = self.transcription.get_buffer()
async with self.lock:
self.state.tokens.extend(final_tokens)
self.state.buffer_transcription = _buffer_transcript
self.state.end_buffer = max(self.state.end_buffer, end_time)
self.state.new_tokens.extend(final_tokens)
self.state.new_tokens_buffer = _buffer_transcript
if self.translation_queue:
for token in final_tokens:
await self.translation_queue.put(token)
except Exception as e:
logger.warning(f"Error finishing transcription: {e}")
logger.debug(f"Traceback: {traceback.format_exc()}")
async def transcription_processor(self) -> None:
"""Process audio chunks for transcription."""
cumulative_pcm_duration_stream_time = 0.0
while True:
try:
# Use a timeout so we periodically wake up and refresh the
# buffer state. Streaming backends (e.g. voxtral) may
# produce text tokens asynchronously; without a periodic
# drain, those tokens would sit unread until the next audio
# chunk arrives — causing the frontend to show nothing.
try:
item = await asyncio.wait_for(
get_all_from_queue(self.transcription_queue),
timeout=0.5,
)
except asyncio.TimeoutError:
# No new audio — just refresh buffer for streaming backends
_buffer_transcript = self.transcription.get_buffer()
async with self.lock:
self.state.buffer_transcription = _buffer_transcript
continue
if item is SENTINEL:
logger.debug("Transcription processor received sentinel. Finishing.")
await self._finish_transcription()
break
asr_internal_buffer_duration_s = len(getattr(self.transcription, 'audio_buffer', [])) / self.transcription.SAMPLING_RATE
transcription_lag_s = max(0.0, time() - self.beg_loop - self.state.end_buffer)
asr_processing_logs = f"internal_buffer={asr_internal_buffer_duration_s:.2f}s | lag={transcription_lag_s:.2f}s |"
stream_time_end_of_current_pcm = cumulative_pcm_duration_stream_time
new_tokens = []
current_audio_processed_upto = self.state.end_buffer
if isinstance(item, Silence):
if item.is_starting:
new_tokens, current_audio_processed_upto = await asyncio.to_thread(
self.transcription.start_silence
)
asr_processing_logs += " + Silence starting"
if item.has_ended:
asr_processing_logs += f" + Silence of = {item.duration:.2f}s"
cumulative_pcm_duration_stream_time += item.duration
current_audio_processed_upto = cumulative_pcm_duration_stream_time
self.transcription.end_silence(item.duration, self.state.tokens[-1].end if self.state.tokens else 0)
if self.state.tokens:
asr_processing_logs += f" | last_end = {self.state.tokens[-1].end} |"
logger.info(asr_processing_logs)
new_tokens = new_tokens or []
current_audio_processed_upto = max(current_audio_processed_upto, stream_time_end_of_current_pcm)
elif isinstance(item, ChangeSpeaker):
self.transcription.new_speaker(item)
continue
elif isinstance(item, np.ndarray):
pcm_array = item
logger.info(asr_processing_logs)
cumulative_pcm_duration_stream_time += len(pcm_array) / self.sample_rate
stream_time_end_of_current_pcm = cumulative_pcm_duration_stream_time
self.transcription.insert_audio_chunk(pcm_array, stream_time_end_of_current_pcm)
_t0 = time()
new_tokens, current_audio_processed_upto = await asyncio.to_thread(self.transcription.process_iter)
_dur = time() - _t0
self.metrics.transcription_durations.append(_dur)
self.metrics.n_transcription_calls += 1
new_tokens = new_tokens or []
self.metrics.n_tokens_produced += len(new_tokens)
_buffer_transcript = self.transcription.get_buffer()
buffer_text = _buffer_transcript.text
if new_tokens:
validated_text = self.sep.join([t.text for t in new_tokens])
if buffer_text.startswith(validated_text):
_buffer_transcript.text = buffer_text[len(validated_text):].lstrip()
candidate_end_times = [self.state.end_buffer]
if new_tokens:
candidate_end_times.append(new_tokens[-1].end)
if _buffer_transcript.end is not None:
candidate_end_times.append(_buffer_transcript.end)
candidate_end_times.append(current_audio_processed_upto)
async with self.lock:
self.state.tokens.extend(new_tokens)
self.state.buffer_transcription = _buffer_transcript
self.state.end_buffer = max(candidate_end_times)
self.state.new_tokens.extend(new_tokens)
self.state.new_tokens_buffer = _buffer_transcript
if self.translation_queue:
for token in new_tokens:
await self.translation_queue.put(token)
except Exception as e:
logger.warning(f"Exception in transcription_processor: {e}")
logger.warning(f"Traceback: {traceback.format_exc()}")
if 'pcm_array' in locals() and pcm_array is not SENTINEL : # Check if pcm_array was assigned from queue
self.transcription_queue.task_done()
if self.is_stopping:
logger.info("Transcription processor finishing due to stopping flag.")
if self.diarization_queue:
await self.diarization_queue.put(SENTINEL)
if self.translation_queue:
await self.translation_queue.put(SENTINEL)
logger.info("Transcription processor task finished.")
async def diarization_processor(self) -> None:
while True:
try:
item = await get_all_from_queue(self.diarization_queue)
if item is SENTINEL:
break
elif isinstance(item, Silence):
if item.has_ended:
self.diarization.insert_silence(item.duration)
continue
self.diarization.insert_audio_chunk(item)
diarization_segments = await self.diarization.diarize()
diar_end = 0.0
if diarization_segments:
diar_end = max(getattr(s, "end", 0.0) for s in diarization_segments)
async with self.lock:
self.state.new_diarization = diarization_segments
self.state.end_attributed_speaker = max(self.state.end_attributed_speaker, diar_end)
except Exception as e:
logger.warning(f"Exception in diarization_processor: {e}")
logger.warning(f"Traceback: {traceback.format_exc()}")
logger.info("Diarization processor task finished.")
async def translation_processor(self) -> None:
# the idea is to ignore diarization for the moment. We use only transcription tokens.
# And the speaker is attributed given the segments used for the translation
# in the future we want to have different languages for each speaker etc, so it will be more complex.
while True:
try:
item = await get_all_from_queue(self.translation_queue)
if item is SENTINEL:
logger.debug("Translation processor received sentinel. Finishing.")
break
new_translation = None
new_translation_buffer = None
if isinstance(item, Silence):
if item.is_starting:
new_translation, new_translation_buffer = self.translation.validate_buffer_and_reset()
if item.has_ended:
self.translation.insert_silence(item.duration)
continue
elif isinstance(item, ChangeSpeaker):
new_translation, new_translation_buffer = self.translation.validate_buffer_and_reset()
else:
self.translation.insert_tokens(item)
new_translation, new_translation_buffer = await asyncio.to_thread(self.translation.process)
if new_translation is not None:
async with self.lock:
self.state.new_translation.append(new_translation)
self.state.new_translation_buffer = new_translation_buffer
except Exception as e:
logger.warning(f"Exception in translation_processor: {e}")
logger.warning(f"Traceback: {traceback.format_exc()}")
logger.info("Translation processor task finished.")
async def results_formatter(self) -> AsyncGenerator[FrontData, None]:
"""Format processing results for output."""
while True:
try:
if self._ffmpeg_error:
yield FrontData(status="error", error=f"FFmpeg error: {self._ffmpeg_error}")
self._ffmpeg_error = None
await asyncio.sleep(1)
continue
self.tokens_alignment.update()
lines, buffer_diarization_text, buffer_translation_text = self.tokens_alignment.get_lines(
diarization=self.args.diarization,
translation=bool(self.translation),
current_silence=self.current_silence,
audio_time=self.total_pcm_samples / self.sample_rate if self.sample_rate else None,
)
state = await self.get_current_state()
buffer_transcription_text = state.buffer_transcription.text if state.buffer_transcription else ''
response_status = "active_transcription"
if not lines and not buffer_transcription_text and not buffer_diarization_text:
response_status = "no_audio_detected"
response = FrontData(
status=response_status,
lines=lines,
buffer_transcription=buffer_transcription_text,
buffer_diarization=buffer_diarization_text,
buffer_translation=buffer_translation_text,
remaining_time_transcription=state.remaining_time_transcription,
remaining_time_diarization=state.remaining_time_diarization if self.args.diarization else 0
)
should_push = (response != self.last_response_content)
if should_push:
self.metrics.n_responses_sent += 1
yield response
self.last_response_content = response
if self.is_stopping and self._processing_tasks_done():
logger.info("Results formatter: All upstream processors are done and in stopping state. Terminating.")
return
await asyncio.sleep(0.05)
except Exception:
logger.warning(f"Exception in results_formatter. Traceback: {traceback.format_exc()}")
await asyncio.sleep(0.5)
async def create_tasks(self) -> AsyncGenerator[FrontData, None]:
"""Create and start processing tasks."""
self.all_tasks_for_cleanup = []
processing_tasks_for_watchdog: List[asyncio.Task] = []
# If using FFmpeg (non-PCM input), start it and spawn stdout reader
if not self.is_pcm_input:
success = await self.ffmpeg_manager.start()
if not success:
logger.error("Failed to start FFmpeg manager")
async def error_generator() -> AsyncGenerator[FrontData, None]:
yield FrontData(
status="error",
error="FFmpeg failed to start. Please check that FFmpeg is installed."
)
return error_generator()
self.ffmpeg_reader_task = asyncio.create_task(self.ffmpeg_stdout_reader())
self.all_tasks_for_cleanup.append(self.ffmpeg_reader_task)
processing_tasks_for_watchdog.append(self.ffmpeg_reader_task)
if self.transcription:
self.transcription_task = asyncio.create_task(self.transcription_processor())
self.all_tasks_for_cleanup.append(self.transcription_task)
processing_tasks_for_watchdog.append(self.transcription_task)
if self.diarization:
self.diarization_task = asyncio.create_task(self.diarization_processor())
self.all_tasks_for_cleanup.append(self.diarization_task)
processing_tasks_for_watchdog.append(self.diarization_task)
if self.translation:
self.translation_task = asyncio.create_task(self.translation_processor())
self.all_tasks_for_cleanup.append(self.translation_task)
processing_tasks_for_watchdog.append(self.translation_task)
# Monitor overall system health
self.watchdog_task = asyncio.create_task(self.watchdog(processing_tasks_for_watchdog))
self.all_tasks_for_cleanup.append(self.watchdog_task)
return self.results_formatter()
async def watchdog(self, tasks_to_monitor: List[asyncio.Task]) -> None:
"""Monitors the health of critical processing tasks."""
tasks_remaining: List[asyncio.Task] = [task for task in tasks_to_monitor if task]
while True:
try:
if not tasks_remaining:
logger.info("Watchdog task finishing: all monitored tasks completed.")
return
await asyncio.sleep(10)
for i, task in enumerate(list(tasks_remaining)):
if task.done():
exc = task.exception()
task_name = task.get_name() if hasattr(task, 'get_name') else f"Monitored Task {i}"
if exc:
logger.error(f"{task_name} unexpectedly completed with exception: {exc}")
else:
logger.info(f"{task_name} completed normally.")
tasks_remaining.remove(task)
except asyncio.CancelledError:
logger.info("Watchdog task cancelled.")
break
except Exception as e:
logger.error(f"Error in watchdog task: {e}", exc_info=True)
async def cleanup(self) -> None:
"""Clean up resources when processing is complete."""
logger.info("Starting cleanup of AudioProcessor resources.")
self.is_stopping = True
for task in self.all_tasks_for_cleanup:
if task and not task.done():
task.cancel()
created_tasks = [t for t in self.all_tasks_for_cleanup if t]
if created_tasks:
await asyncio.gather(*created_tasks, return_exceptions=True)
logger.info("All processing tasks cancelled or finished.")
if not self.is_pcm_input and self.ffmpeg_manager:
try:
await self.ffmpeg_manager.stop()
logger.info("FFmpeg manager stopped.")
except Exception as e:
logger.warning(f"Error stopping FFmpeg manager: {e}")
if self.diarization:
self.diarization.close()
# Finalize session metrics
self.metrics.total_audio_duration_s = self.total_pcm_samples / self.sample_rate
self.metrics.log_summary()
logger.info("AudioProcessor cleanup complete.")
def _processing_tasks_done(self) -> bool:
"""Return True when all active processing tasks have completed."""
tasks_to_check = [
self.transcription_task,
self.diarization_task,
self.translation_task,
self.ffmpeg_reader_task,
]
return all(task.done() for task in tasks_to_check if task)
async def process_audio(self, message: Optional[bytes]) -> None:
"""Process incoming audio data."""
if not self.beg_loop:
self.beg_loop = time()
self.metrics.session_start = self.beg_loop
self.current_silence = Silence(start=0.0, is_starting=True)
self.tokens_alignment.beg_loop = self.beg_loop
if not message:
logger.info("Empty audio message received, initiating stop sequence.")
self.is_stopping = True
# Flush any remaining PCM data before signaling end-of-stream
if self.is_pcm_input and self.pcm_buffer:
await self._flush_remaining_pcm()
if self.transcription_queue:
await self.transcription_queue.put(SENTINEL)
if not self.is_pcm_input and self.ffmpeg_manager:
await self.ffmpeg_manager.stop()
return
if self.is_stopping:
logger.warning("AudioProcessor is stopping. Ignoring incoming audio.")
return
self.metrics.n_chunks_received += 1
if self.is_pcm_input:
self.pcm_buffer.extend(message)
await self.handle_pcm_data()
else:
if not self.ffmpeg_manager:
logger.error("FFmpeg manager not initialized for non-PCM input.")
return
success = await self.ffmpeg_manager.write_data(message)
if not success:
ffmpeg_state = await self.ffmpeg_manager.get_state()
if ffmpeg_state == FFmpegState.FAILED:
logger.error("FFmpeg is in FAILED state, cannot process audio")
else:
logger.warning("Failed to write audio data to FFmpeg")
async def handle_pcm_data(self) -> None:
# Without VAC, there's no speech detector to end the initial silence.
# Clear it on the first audio chunk so audio actually gets enqueued.
if not self.args.vac and self.current_silence:
await self._end_silence()
# Process when enough data
if len(self.pcm_buffer) < self.bytes_per_sec:
return
if len(self.pcm_buffer) > self.max_bytes_per_sec:
logger.warning(
f"Audio buffer too large: {len(self.pcm_buffer) / self.bytes_per_sec:.2f}s. "
f"Consider using a smaller model."
)
chunk_size = min(len(self.pcm_buffer), self.max_bytes_per_sec)
aligned_chunk_size = (chunk_size // self.bytes_per_sample) * self.bytes_per_sample
if aligned_chunk_size == 0:
return
pcm_array = self.convert_pcm_to_float(self.pcm_buffer[:aligned_chunk_size])
self.pcm_buffer = self.pcm_buffer[aligned_chunk_size:]
num_samples = len(pcm_array)
chunk_sample_start = self.total_pcm_samples
chunk_sample_end = chunk_sample_start + num_samples
res = None
if self.args.vac:
res = self.vac(pcm_array)
if res is not None:
if "start" in res and self.current_silence:
await self._end_silence(at_sample=res.get("start"))
if "end" in res and not self.current_silence:
pre_silence_chunk = self._slice_before_silence(
pcm_array, chunk_sample_start, res.get("end")
)
if pre_silence_chunk is not None and pre_silence_chunk.size > 0:
await self._enqueue_active_audio(pre_silence_chunk)
await self._begin_silence(at_sample=res.get("end"))
if not self.current_silence:
await self._enqueue_active_audio(pcm_array)
self.total_pcm_samples = chunk_sample_end
if not self.args.transcription and not self.args.diarization:
await asyncio.sleep(0.1)
async def _flush_remaining_pcm(self) -> None:
"""Flush whatever PCM data remains in the buffer, regardless of size threshold."""
if not self.pcm_buffer:
return
aligned_size = (len(self.pcm_buffer) // self.bytes_per_sample) * self.bytes_per_sample
if aligned_size == 0:
return
pcm_array = self.convert_pcm_to_float(self.pcm_buffer[:aligned_size])
self.pcm_buffer = self.pcm_buffer[aligned_size:]
# End any active silence so the audio gets enqueued
if self.current_silence:
await self._end_silence(at_sample=self.total_pcm_samples)
await self._enqueue_active_audio(pcm_array)
self.total_pcm_samples += len(pcm_array)
logger.info(f"Flushed remaining PCM buffer: {len(pcm_array)} samples ({len(pcm_array)/self.sample_rate:.2f}s)")