mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-03-07 14:23:18 +00:00
638 lines
28 KiB
Python
638 lines
28 KiB
Python
import asyncio
|
|
import numpy as np
|
|
from time import time, sleep
|
|
import math
|
|
import logging
|
|
import traceback
|
|
from whisperlivekit.timed_objects import ASRToken, Silence, Line, FrontData, State, StateLight, Transcript, ChangeSpeaker
|
|
from whisperlivekit.core import TranscriptionEngine, online_factory, online_diarization_factory, online_translation_factory
|
|
from whisperlivekit.silero_vad_iterator import FixedVADIterator
|
|
from whisperlivekit.ffmpeg_manager import FFmpegManager, FFmpegState
|
|
from whisperlivekit.tokens_alignment import TokensAlignment
|
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
logger = logging.getLogger(__name__)
|
|
logger.setLevel(logging.DEBUG)
|
|
|
|
SENTINEL = object() # unique sentinel object for end of stream marker
|
|
MIN_DURATION_REAL_SILENCE = 5
|
|
|
|
def cut_at(cumulative_pcm, cut_sec):
|
|
cumulative_len = 0
|
|
cut_sample = int(cut_sec * 16000)
|
|
|
|
for ind, pcm_array in enumerate(cumulative_pcm):
|
|
if (cumulative_len + len(pcm_array)) >= cut_sample:
|
|
cut_chunk = cut_sample - cumulative_len
|
|
before = np.concatenate(cumulative_pcm[:ind] + [cumulative_pcm[ind][:cut_chunk]])
|
|
after = [cumulative_pcm[ind][cut_chunk:]] + cumulative_pcm[ind+1:]
|
|
return before, after
|
|
cumulative_len += len(pcm_array)
|
|
return np.concatenate(cumulative_pcm), []
|
|
|
|
async def get_all_from_queue(queue):
|
|
items = []
|
|
|
|
first_item = await queue.get()
|
|
queue.task_done()
|
|
if first_item is SENTINEL:
|
|
return first_item
|
|
if isinstance(first_item, Silence):
|
|
return first_item
|
|
items.append(first_item)
|
|
|
|
while True:
|
|
if not queue._queue:
|
|
break
|
|
next_item = queue._queue[0]
|
|
if next_item is SENTINEL:
|
|
break
|
|
if isinstance(next_item, Silence):
|
|
break
|
|
items.append(await queue.get())
|
|
queue.task_done()
|
|
if isinstance(items[0], np.ndarray):
|
|
return np.concatenate(items)
|
|
else: #translation
|
|
return items
|
|
|
|
class AudioProcessor:
|
|
"""
|
|
Processes audio streams for transcription and diarization.
|
|
Handles audio processing, state management, and result formatting.
|
|
"""
|
|
|
|
def __init__(self, **kwargs):
|
|
"""Initialize the audio processor with configuration, models, and state."""
|
|
|
|
if 'transcription_engine' in kwargs and isinstance(kwargs['transcription_engine'], TranscriptionEngine):
|
|
models = kwargs['transcription_engine']
|
|
else:
|
|
models = TranscriptionEngine(**kwargs)
|
|
|
|
# Audio processing settings
|
|
self.args = models.args
|
|
self.sample_rate = 16000
|
|
self.channels = 1
|
|
self.samples_per_sec = int(self.sample_rate * self.args.min_chunk_size)
|
|
self.bytes_per_sample = 2
|
|
self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample
|
|
self.max_bytes_per_sec = 32000 * 5 # 5 seconds of audio at 32 kHz
|
|
self.is_pcm_input = self.args.pcm_input
|
|
|
|
# State management
|
|
self.is_stopping = False
|
|
self.current_silence = None
|
|
self.state = State()
|
|
self.state_light = StateLight()
|
|
self.lock = asyncio.Lock()
|
|
self.sep = " " # Default separator
|
|
self.last_response_content = FrontData()
|
|
self.last_detected_speaker = None
|
|
self.speaker_languages = {}
|
|
|
|
self.tokens_alignment = TokensAlignment(self.state_light, self.args, self.sep)
|
|
self.beg_loop = None
|
|
|
|
# Models and processing
|
|
self.asr = models.asr
|
|
self.vac_model = models.vac_model
|
|
if self.args.vac:
|
|
self.vac = FixedVADIterator(models.vac_model)
|
|
else:
|
|
self.vac = None
|
|
|
|
self.ffmpeg_manager = None
|
|
self.ffmpeg_reader_task = None
|
|
self._ffmpeg_error = None
|
|
|
|
if not self.is_pcm_input:
|
|
self.ffmpeg_manager = FFmpegManager(
|
|
sample_rate=self.sample_rate,
|
|
channels=self.channels
|
|
)
|
|
async def handle_ffmpeg_error(error_type: str):
|
|
logger.error(f"FFmpeg error: {error_type}")
|
|
self._ffmpeg_error = error_type
|
|
self.ffmpeg_manager.on_error_callback = handle_ffmpeg_error
|
|
|
|
self.transcription_queue = asyncio.Queue() if self.args.transcription else None
|
|
self.diarization_queue = asyncio.Queue() if self.args.diarization else None
|
|
self.translation_queue = asyncio.Queue() if self.args.target_language else None
|
|
self.pcm_buffer = bytearray()
|
|
self.total_pcm_samples = 0
|
|
self.end_buffer = 0.0
|
|
self.transcription_task = None
|
|
self.diarization_task = None
|
|
self.translation_task = None
|
|
self.watchdog_task = None
|
|
self.all_tasks_for_cleanup = []
|
|
|
|
self.transcription = None
|
|
self.translation = None
|
|
self.diarization = None
|
|
|
|
if self.args.transcription:
|
|
self.transcription = online_factory(self.args, models.asr)
|
|
self.sep = self.transcription.asr.sep
|
|
if self.args.diarization:
|
|
self.diarization = online_diarization_factory(self.args, models.diarization_model)
|
|
if models.translation_model:
|
|
self.translation = online_translation_factory(self.args, models.translation_model)
|
|
|
|
async def _push_silence_event(self):
|
|
if self.transcription_queue:
|
|
await self.transcription_queue.put(self.current_silence)
|
|
if self.args.diarization and self.diarization_queue:
|
|
await self.diarization_queue.put(self.current_silence)
|
|
if self.translation_queue:
|
|
await self.translation_queue.put(self.current_silence)
|
|
|
|
async def _begin_silence(self):
|
|
if self.current_silence:
|
|
return
|
|
now = time() - self.beg_loop
|
|
self.current_silence = Silence(
|
|
is_starting=True, start=now
|
|
)
|
|
await self._push_silence_event()
|
|
|
|
async def _end_silence(self):
|
|
if not self.current_silence:
|
|
return
|
|
now = time() - self.beg_loop
|
|
self.current_silence.end = now
|
|
self.current_silence.is_starting=False
|
|
self.current_silence.has_ended=True
|
|
self.current_silence.compute_duration()
|
|
if self.current_silence.duration > MIN_DURATION_REAL_SILENCE:
|
|
self.state_light.new_tokens.append(self.current_silence)
|
|
await self._push_silence_event()
|
|
self.current_silence = None
|
|
|
|
async def _enqueue_active_audio(self, pcm_chunk: np.ndarray):
|
|
if pcm_chunk is None or pcm_chunk.size == 0:
|
|
return
|
|
if self.transcription_queue:
|
|
await self.transcription_queue.put(pcm_chunk.copy())
|
|
if self.args.diarization and self.diarization_queue:
|
|
await self.diarization_queue.put(pcm_chunk.copy())
|
|
|
|
def _slice_before_silence(self, pcm_array, chunk_sample_start, silence_sample):
|
|
if silence_sample is None:
|
|
return None
|
|
relative_index = int(silence_sample - chunk_sample_start)
|
|
if relative_index <= 0:
|
|
return None
|
|
split_index = min(relative_index, len(pcm_array))
|
|
if split_index <= 0:
|
|
return None
|
|
return pcm_array[:split_index]
|
|
|
|
def convert_pcm_to_float(self, pcm_buffer):
|
|
"""Convert PCM buffer in s16le format to normalized NumPy array."""
|
|
return np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0
|
|
|
|
async def get_current_state(self):
|
|
"""Get current state."""
|
|
async with self.lock:
|
|
current_time = time()
|
|
|
|
remaining_transcription = 0
|
|
if self.end_buffer > 0:
|
|
remaining_transcription = max(0, round(current_time - self.beg_loop - self.end_buffer, 1))
|
|
|
|
remaining_diarization = 0
|
|
if self.state.tokens:
|
|
latest_end = max(self.end_buffer, self.state.tokens[-1].end if self.state.tokens else 0)
|
|
remaining_diarization = max(0, round(latest_end - self.state.end_attributed_speaker, 1))
|
|
|
|
self.state.remaining_time_transcription = remaining_transcription
|
|
self.state.remaining_time_diarization = remaining_diarization
|
|
|
|
return self.state
|
|
|
|
async def ffmpeg_stdout_reader(self):
|
|
"""Read audio data from FFmpeg stdout and process it into the PCM pipeline."""
|
|
beg = time()
|
|
while True:
|
|
try:
|
|
if self.is_stopping:
|
|
logger.info("Stopping ffmpeg_stdout_reader due to stopping flag.")
|
|
break
|
|
|
|
state = await self.ffmpeg_manager.get_state() if self.ffmpeg_manager else FFmpegState.STOPPED
|
|
if state == FFmpegState.FAILED:
|
|
logger.error("FFmpeg is in FAILED state, cannot read data")
|
|
break
|
|
elif state == FFmpegState.STOPPED:
|
|
logger.info("FFmpeg is stopped")
|
|
break
|
|
elif state != FFmpegState.RUNNING:
|
|
await asyncio.sleep(0.1)
|
|
continue
|
|
|
|
current_time = time()
|
|
elapsed_time = max(0.0, current_time - beg)
|
|
buffer_size = max(int(32000 * elapsed_time), 4096) # dynamic read
|
|
beg = current_time
|
|
|
|
chunk = await self.ffmpeg_manager.read_data(buffer_size)
|
|
if not chunk:
|
|
# No data currently available
|
|
await asyncio.sleep(0.05)
|
|
continue
|
|
|
|
self.pcm_buffer.extend(chunk)
|
|
await self.handle_pcm_data()
|
|
|
|
except asyncio.CancelledError:
|
|
logger.info("ffmpeg_stdout_reader cancelled.")
|
|
break
|
|
except Exception as e:
|
|
logger.warning(f"Exception in ffmpeg_stdout_reader: {e}")
|
|
logger.debug(f"Traceback: {traceback.format_exc()}")
|
|
await asyncio.sleep(0.2)
|
|
|
|
logger.info("FFmpeg stdout processing finished. Signaling downstream processors if needed.")
|
|
if self.transcription_queue:
|
|
await self.transcription_queue.put(SENTINEL)
|
|
if self.diarization:
|
|
await self.diarization_queue.put(SENTINEL)
|
|
if self.translation:
|
|
await self.translation_queue.put(SENTINEL)
|
|
|
|
async def transcription_processor(self):
|
|
"""Process audio chunks for transcription."""
|
|
cumulative_pcm_duration_stream_time = 0.0
|
|
|
|
while True:
|
|
try:
|
|
# item = await self.transcription_queue.get()
|
|
item = await get_all_from_queue(self.transcription_queue)
|
|
if item is SENTINEL:
|
|
logger.debug("Transcription processor received sentinel. Finishing.")
|
|
break
|
|
|
|
asr_internal_buffer_duration_s = len(getattr(self.transcription, 'audio_buffer', [])) / self.transcription.SAMPLING_RATE
|
|
transcription_lag_s = max(0.0, time() - self.beg_loop - self.end_buffer)
|
|
asr_processing_logs = f"internal_buffer={asr_internal_buffer_duration_s:.2f}s | lag={transcription_lag_s:.2f}s |"
|
|
stream_time_end_of_current_pcm = cumulative_pcm_duration_stream_time
|
|
new_tokens = []
|
|
current_audio_processed_upto = self.end_buffer
|
|
|
|
if isinstance(item, Silence):
|
|
if item.is_starting:
|
|
new_tokens, current_audio_processed_upto = await asyncio.to_thread(
|
|
self.transcription.start_silence
|
|
)
|
|
asr_processing_logs += f" + Silence starting"
|
|
if item.has_ended:
|
|
asr_processing_logs += f" + Silence of = {item.duration:.2f}s"
|
|
cumulative_pcm_duration_stream_time += item.duration
|
|
current_audio_processed_upto = cumulative_pcm_duration_stream_time
|
|
self.transcription.end_silence(item.duration, self.state.tokens[-1].end if self.state.tokens else 0)
|
|
if self.state.tokens:
|
|
asr_processing_logs += f" | last_end = {self.state.tokens[-1].end} |"
|
|
logger.info(asr_processing_logs)
|
|
new_tokens = new_tokens or []
|
|
current_audio_processed_upto = max(current_audio_processed_upto, stream_time_end_of_current_pcm)
|
|
elif isinstance(item, ChangeSpeaker):
|
|
self.transcription.new_speaker(item)
|
|
continue
|
|
elif isinstance(item, np.ndarray):
|
|
pcm_array = item
|
|
logger.info(asr_processing_logs)
|
|
cumulative_pcm_duration_stream_time += len(pcm_array) / self.sample_rate
|
|
stream_time_end_of_current_pcm = cumulative_pcm_duration_stream_time
|
|
self.transcription.insert_audio_chunk(pcm_array, stream_time_end_of_current_pcm)
|
|
new_tokens, current_audio_processed_upto = await asyncio.to_thread(self.transcription.process_iter)
|
|
new_tokens = new_tokens or []
|
|
|
|
_buffer_transcript = self.transcription.get_buffer()
|
|
buffer_text = _buffer_transcript.text
|
|
|
|
if new_tokens:
|
|
validated_text = self.sep.join([t.text for t in new_tokens])
|
|
if buffer_text.startswith(validated_text):
|
|
_buffer_transcript.text = buffer_text[len(validated_text):].lstrip()
|
|
|
|
candidate_end_times = [self.end_buffer]
|
|
|
|
if new_tokens:
|
|
candidate_end_times.append(new_tokens[-1].end)
|
|
|
|
if _buffer_transcript.end is not None:
|
|
candidate_end_times.append(_buffer_transcript.end)
|
|
|
|
candidate_end_times.append(current_audio_processed_upto)
|
|
|
|
async with self.lock:
|
|
self.state.tokens.extend(new_tokens)
|
|
self.state.buffer_transcription = _buffer_transcript
|
|
self.end_buffer = max(candidate_end_times)
|
|
self.state_light.new_tokens.extend(new_tokens)
|
|
self.state_light.new_tokens_buffer = _buffer_transcript
|
|
|
|
if self.translation_queue:
|
|
for token in new_tokens:
|
|
await self.translation_queue.put(token)
|
|
except Exception as e:
|
|
logger.warning(f"Exception in transcription_processor: {e}")
|
|
logger.warning(f"Traceback: {traceback.format_exc()}")
|
|
if 'pcm_array' in locals() and pcm_array is not SENTINEL : # Check if pcm_array was assigned from queue
|
|
self.transcription_queue.task_done()
|
|
|
|
if self.is_stopping:
|
|
logger.info("Transcription processor finishing due to stopping flag.")
|
|
if self.diarization_queue:
|
|
await self.diarization_queue.put(SENTINEL)
|
|
if self.translation_queue:
|
|
await self.translation_queue.put(SENTINEL)
|
|
|
|
logger.info("Transcription processor task finished.")
|
|
|
|
|
|
async def diarization_processor(self):
|
|
while True:
|
|
try:
|
|
item = await get_all_from_queue(self.diarization_queue)
|
|
if item is SENTINEL:
|
|
break
|
|
elif type(item) is Silence:
|
|
if item.has_ended:
|
|
self.diarization.insert_silence(item.duration)
|
|
continue
|
|
|
|
self.diarization.insert_audio_chunk(item)
|
|
diarization_segments = await self.diarization.diarize()
|
|
self.state_light.new_diarization = diarization_segments
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Exception in diarization_processor: {e}")
|
|
logger.warning(f"Traceback: {traceback.format_exc()}")
|
|
logger.info("Diarization processor task finished.")
|
|
|
|
async def translation_processor(self):
|
|
# the idea is to ignore diarization for the moment. We use only transcription tokens.
|
|
# And the speaker is attributed given the segments used for the translation
|
|
# in the future we want to have different languages for each speaker etc, so it will be more complex.
|
|
while True:
|
|
try:
|
|
tokens_to_process = await get_all_from_queue(self.translation_queue)
|
|
if tokens_to_process is SENTINEL:
|
|
logger.debug("Translation processor received sentinel. Finishing.")
|
|
self.translation_queue.task_done()
|
|
break
|
|
elif type(tokens_to_process) is Silence:
|
|
if tokens_to_process.has_ended:
|
|
self.translation.insert_silence(tokens_to_process.duration)
|
|
continue
|
|
if tokens_to_process:
|
|
self.translation.insert_tokens(tokens_to_process)
|
|
translation_validated_segments, buffer_translation = await asyncio.to_thread(self.translation.process)
|
|
async with self.lock:
|
|
self.state_light.new_translation = translation_validated_segments
|
|
self.state_light.new_translation_buffer = buffer_translation
|
|
except Exception as e:
|
|
logger.warning(f"Exception in translation_processor: {e}")
|
|
logger.warning(f"Traceback: {traceback.format_exc()}")
|
|
logger.info("Translation processor task finished.")
|
|
|
|
async def results_formatter(self):
|
|
"""Format processing results for output."""
|
|
while True:
|
|
try:
|
|
if self._ffmpeg_error:
|
|
yield FrontData(status="error", error=f"FFmpeg error: {self._ffmpeg_error}")
|
|
self._ffmpeg_error = None
|
|
await asyncio.sleep(1)
|
|
continue
|
|
|
|
self.tokens_alignment.update()
|
|
lines, buffer_diarization_text, buffer_translation_text = self.tokens_alignment.get_lines(
|
|
diarization=self.args.diarization,
|
|
translation=bool(self.translation),
|
|
current_silence=self.current_silence
|
|
)
|
|
state = await self.get_current_state()
|
|
|
|
buffer_transcription_text = ''
|
|
buffer_diarization_text = ''
|
|
|
|
response_status = "active_transcription"
|
|
if not lines and not buffer_transcription_text and not buffer_diarization_text:
|
|
response_status = "no_audio_detected"
|
|
|
|
response = FrontData(
|
|
status=response_status,
|
|
lines=lines,
|
|
buffer_transcription=buffer_transcription_text,
|
|
buffer_diarization=buffer_diarization_text,
|
|
buffer_translation=buffer_translation_text,
|
|
remaining_time_transcription=state.remaining_time_transcription,
|
|
remaining_time_diarization=state.remaining_time_diarization if self.args.diarization else 0
|
|
)
|
|
|
|
should_push = (response != self.last_response_content)
|
|
if should_push:
|
|
yield response
|
|
self.last_response_content = response
|
|
|
|
if self.is_stopping and self._processing_tasks_done():
|
|
logger.info("Results formatter: All upstream processors are done and in stopping state. Terminating.")
|
|
return
|
|
|
|
await asyncio.sleep(0.05)
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Exception in results_formatter. Traceback: {traceback.format_exc()}")
|
|
await asyncio.sleep(0.5)
|
|
|
|
async def create_tasks(self):
|
|
"""Create and start processing tasks."""
|
|
self.all_tasks_for_cleanup = []
|
|
processing_tasks_for_watchdog = []
|
|
|
|
# If using FFmpeg (non-PCM input), start it and spawn stdout reader
|
|
if not self.is_pcm_input:
|
|
success = await self.ffmpeg_manager.start()
|
|
if not success:
|
|
logger.error("Failed to start FFmpeg manager")
|
|
async def error_generator():
|
|
yield FrontData(
|
|
status="error",
|
|
error="FFmpeg failed to start. Please check that FFmpeg is installed."
|
|
)
|
|
return error_generator()
|
|
self.ffmpeg_reader_task = asyncio.create_task(self.ffmpeg_stdout_reader())
|
|
self.all_tasks_for_cleanup.append(self.ffmpeg_reader_task)
|
|
processing_tasks_for_watchdog.append(self.ffmpeg_reader_task)
|
|
|
|
if self.transcription:
|
|
self.transcription_task = asyncio.create_task(self.transcription_processor())
|
|
self.all_tasks_for_cleanup.append(self.transcription_task)
|
|
processing_tasks_for_watchdog.append(self.transcription_task)
|
|
|
|
if self.diarization:
|
|
self.diarization_task = asyncio.create_task(self.diarization_processor())
|
|
self.all_tasks_for_cleanup.append(self.diarization_task)
|
|
processing_tasks_for_watchdog.append(self.diarization_task)
|
|
|
|
if self.translation:
|
|
self.translation_task = asyncio.create_task(self.translation_processor())
|
|
self.all_tasks_for_cleanup.append(self.translation_task)
|
|
processing_tasks_for_watchdog.append(self.translation_task)
|
|
|
|
# Monitor overall system health
|
|
self.watchdog_task = asyncio.create_task(self.watchdog(processing_tasks_for_watchdog))
|
|
self.all_tasks_for_cleanup.append(self.watchdog_task)
|
|
|
|
return self.results_formatter()
|
|
|
|
async def watchdog(self, tasks_to_monitor):
|
|
"""Monitors the health of critical processing tasks."""
|
|
tasks_remaining = [task for task in tasks_to_monitor if task]
|
|
while True:
|
|
try:
|
|
if not tasks_remaining:
|
|
logger.info("Watchdog task finishing: all monitored tasks completed.")
|
|
return
|
|
|
|
await asyncio.sleep(10)
|
|
|
|
for i, task in enumerate(list(tasks_remaining)):
|
|
if task.done():
|
|
exc = task.exception()
|
|
task_name = task.get_name() if hasattr(task, 'get_name') else f"Monitored Task {i}"
|
|
if exc:
|
|
logger.error(f"{task_name} unexpectedly completed with exception: {exc}")
|
|
else:
|
|
logger.info(f"{task_name} completed normally.")
|
|
tasks_remaining.remove(task)
|
|
|
|
except asyncio.CancelledError:
|
|
logger.info("Watchdog task cancelled.")
|
|
break
|
|
except Exception as e:
|
|
logger.error(f"Error in watchdog task: {e}", exc_info=True)
|
|
|
|
async def cleanup(self):
|
|
"""Clean up resources when processing is complete."""
|
|
logger.info("Starting cleanup of AudioProcessor resources.")
|
|
self.is_stopping = True
|
|
for task in self.all_tasks_for_cleanup:
|
|
if task and not task.done():
|
|
task.cancel()
|
|
|
|
created_tasks = [t for t in self.all_tasks_for_cleanup if t]
|
|
if created_tasks:
|
|
await asyncio.gather(*created_tasks, return_exceptions=True)
|
|
logger.info("All processing tasks cancelled or finished.")
|
|
|
|
if not self.is_pcm_input and self.ffmpeg_manager:
|
|
try:
|
|
await self.ffmpeg_manager.stop()
|
|
logger.info("FFmpeg manager stopped.")
|
|
except Exception as e:
|
|
logger.warning(f"Error stopping FFmpeg manager: {e}")
|
|
if self.diarization:
|
|
self.diarization.close()
|
|
logger.info("AudioProcessor cleanup complete.")
|
|
|
|
def _processing_tasks_done(self):
|
|
"""Return True when all active processing tasks have completed."""
|
|
tasks_to_check = [
|
|
self.transcription_task,
|
|
self.diarization_task,
|
|
self.translation_task,
|
|
self.ffmpeg_reader_task,
|
|
]
|
|
return all(task.done() for task in tasks_to_check if task)
|
|
|
|
|
|
async def process_audio(self, message):
|
|
"""Process incoming audio data."""
|
|
|
|
if not self.beg_loop:
|
|
self.beg_loop = time()
|
|
self.current_silence = Silence(start=0.0, is_starting=True)
|
|
self.tokens_alignment.beg_loop = self.beg_loop
|
|
|
|
if not message:
|
|
logger.info("Empty audio message received, initiating stop sequence.")
|
|
self.is_stopping = True
|
|
|
|
if self.transcription_queue:
|
|
await self.transcription_queue.put(SENTINEL)
|
|
|
|
if not self.is_pcm_input and self.ffmpeg_manager:
|
|
await self.ffmpeg_manager.stop()
|
|
|
|
return
|
|
|
|
if self.is_stopping:
|
|
logger.warning("AudioProcessor is stopping. Ignoring incoming audio.")
|
|
return
|
|
|
|
if self.is_pcm_input:
|
|
self.pcm_buffer.extend(message)
|
|
await self.handle_pcm_data()
|
|
else:
|
|
if not self.ffmpeg_manager:
|
|
logger.error("FFmpeg manager not initialized for non-PCM input.")
|
|
return
|
|
success = await self.ffmpeg_manager.write_data(message)
|
|
if not success:
|
|
ffmpeg_state = await self.ffmpeg_manager.get_state()
|
|
if ffmpeg_state == FFmpegState.FAILED:
|
|
logger.error("FFmpeg is in FAILED state, cannot process audio")
|
|
else:
|
|
logger.warning("Failed to write audio data to FFmpeg")
|
|
|
|
async def handle_pcm_data(self):
|
|
# Process when enough data
|
|
if len(self.pcm_buffer) < self.bytes_per_sec:
|
|
return
|
|
|
|
if len(self.pcm_buffer) > self.max_bytes_per_sec:
|
|
logger.warning(
|
|
f"Audio buffer too large: {len(self.pcm_buffer) / self.bytes_per_sec:.2f}s. "
|
|
f"Consider using a smaller model."
|
|
)
|
|
|
|
chunk_size = min(len(self.pcm_buffer), self.max_bytes_per_sec)
|
|
aligned_chunk_size = (chunk_size // self.bytes_per_sample) * self.bytes_per_sample
|
|
|
|
if aligned_chunk_size == 0:
|
|
return
|
|
pcm_array = self.convert_pcm_to_float(self.pcm_buffer[:aligned_chunk_size])
|
|
self.pcm_buffer = self.pcm_buffer[aligned_chunk_size:]
|
|
|
|
num_samples = len(pcm_array)
|
|
chunk_sample_start = self.total_pcm_samples
|
|
chunk_sample_end = chunk_sample_start + num_samples
|
|
|
|
res = None
|
|
if self.args.vac:
|
|
res = self.vac(pcm_array)
|
|
|
|
if res is not None:
|
|
silence_detected = res.get("end", 0) > res.get("start", 0)
|
|
if silence_detected and not self.current_silence:
|
|
pre_silence_chunk = self._slice_before_silence(
|
|
pcm_array, chunk_sample_start, res.get("end")
|
|
)
|
|
if pre_silence_chunk is not None and pre_silence_chunk.size > 0:
|
|
await self._enqueue_active_audio(pre_silence_chunk)
|
|
await self._begin_silence()
|
|
elif self.current_silence:
|
|
await self._end_silence()
|
|
|
|
if not self.current_silence:
|
|
await self._enqueue_active_audio(pcm_array)
|
|
|
|
self.total_pcm_samples = chunk_sample_end
|
|
|
|
if not self.args.transcription and not self.args.diarization:
|
|
await asyncio.sleep(0.1)
|