typo and simpler conditions

This commit is contained in:
Quentin Fuxa
2025-09-26 20:38:26 +02:00
parent 1fa9e1f656
commit d55490cd27
2 changed files with 25 additions and 21 deletions

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "whisperlivekit"
version = "0.2.11"
version = "0.2.11.post1"
description = "Real-time speech-to-text with speaker diarization using Whisper"
readme = "README.md"
authors = [

View File

@@ -9,7 +9,7 @@ from whisperlivekit.core import TranscriptionEngine, online_factory, online_diar
from whisperlivekit.silero_vad_iterator import FixedVADIterator
from whisperlivekit.results_formater import format_output
from whisperlivekit.ffmpeg_manager import FFmpegManager, FFmpegState
# Set up logging once
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
@@ -100,17 +100,21 @@ class AudioProcessor:
self.transcription_task = None
self.diarization_task = None
self.translation_task = None
self.watchdog_task = None
self.all_tasks_for_cleanup = []
self.online_translation = None
self.transcription = None
self.translation = None
self.diarization = None
if self.args.transcription:
self.online = online_factory(self.args, models.asr, models.tokenizer)
self.sep = self.online.asr.sep
self.transcription = online_factory(self.args, models.asr, models.tokenizer)
self.sep = self.transcription.asr.sep
if self.args.diarization:
self.diarization = online_diarization_factory(self.args, models.diarization_model)
if models.translation_model:
self.online_translation = online_translation_factory(self.args, models.translation_model)
self.translation = online_translation_factory(self.args, models.translation_model)
def convert_pcm_to_float(self, pcm_buffer):
"""Convert PCM buffer in s16le format to normalized NumPy array."""
@@ -204,9 +208,9 @@ class AudioProcessor:
logger.info("FFmpeg stdout processing finished. Signaling downstream processors if needed.")
if not self.diarization_before_transcription and self.transcription_queue:
await self.transcription_queue.put(SENTINEL)
if self.args.diarization and self.diarization_queue:
if self.diarization:
await self.diarization_queue.put(SENTINEL)
if self.online_translation:
if self.translation:
await self.translation_queue.put(SENTINEL)
async def transcription_processor(self):
@@ -221,7 +225,7 @@ class AudioProcessor:
self.transcription_queue.task_done()
break
asr_internal_buffer_duration_s = len(getattr(self.online, 'audio_buffer', [])) / self.online.SAMPLING_RATE
asr_internal_buffer_duration_s = len(getattr(self.transcription, 'audio_buffer', [])) / self.transcription.SAMPLING_RATE
transcription_lag_s = max(0.0, time() - self.beg_loop - self.end_buffer)
asr_processing_logs = f"internal_buffer={asr_internal_buffer_duration_s:.2f}s | lag={transcription_lag_s:.2f}s |"
if type(item) is Silence:
@@ -230,10 +234,10 @@ class AudioProcessor:
asr_processing_logs += f" | last_end = {self.tokens[-1].end} |"
logger.info(asr_processing_logs)
cumulative_pcm_duration_stream_time += item.duration
self.online.insert_silence(item.duration, self.tokens[-1].end if self.tokens else 0)
self.transcription.insert_silence(item.duration, self.tokens[-1].end if self.tokens else 0)
continue
elif isinstance(item, ChangeSpeaker):
self.online.new_speaker(item)
self.transcription.new_speaker(item)
elif isinstance(item, np.ndarray):
pcm_array = item
@@ -243,10 +247,10 @@ class AudioProcessor:
cumulative_pcm_duration_stream_time += duration_this_chunk
stream_time_end_of_current_pcm = cumulative_pcm_duration_stream_time
self.online.insert_audio_chunk(pcm_array, stream_time_end_of_current_pcm)
new_tokens, current_audio_processed_upto = await asyncio.to_thread(self.online.process_iter)
self.transcription.insert_audio_chunk(pcm_array, stream_time_end_of_current_pcm)
new_tokens, current_audio_processed_upto = await asyncio.to_thread(self.transcription.process_iter)
_buffer_transcript = self.online.get_buffer()
_buffer_transcript = self.transcription.get_buffer()
buffer_text = _buffer_transcript.text
if new_tokens:
@@ -352,7 +356,7 @@ class AudioProcessor:
self.translation_queue.task_done()
break
elif type(item) is Silence:
self.online_translation.insert_silence(item.duration)
self.translation.insert_silence(item.duration)
continue
# get all the available tokens for translation. The more words, the more precise
@@ -366,8 +370,8 @@ class AudioProcessor:
break
tokens_to_process.append(additional_token)
if tokens_to_process:
self.online_translation.insert_tokens(tokens_to_process)
self.translated_segments = await asyncio.to_thread(self.online_translation.process)
self.translation.insert_tokens(tokens_to_process)
self.translated_segments = await asyncio.to_thread(self.translation.process)
self.translation_queue.task_done()
for _ in additional_tokens:
self.translation_queue.task_done()
@@ -494,17 +498,17 @@ class AudioProcessor:
self.all_tasks_for_cleanup.append(self.ffmpeg_reader_task)
processing_tasks_for_watchdog.append(self.ffmpeg_reader_task)
if self.args.transcription and self.online:
if self.transcription:
self.transcription_task = asyncio.create_task(self.transcription_processor())
self.all_tasks_for_cleanup.append(self.transcription_task)
processing_tasks_for_watchdog.append(self.transcription_task)
if self.args.diarization and self.diarization:
if self.diarization:
self.diarization_task = asyncio.create_task(self.diarization_processor(self.diarization))
self.all_tasks_for_cleanup.append(self.diarization_task)
processing_tasks_for_watchdog.append(self.diarization_task)
if self.online_translation:
if self.translation:
self.translation_task = asyncio.create_task(self.translation_processor())
self.all_tasks_for_cleanup.append(self.translation_task)
processing_tasks_for_watchdog.append(self.translation_task)
@@ -555,7 +559,7 @@ class AudioProcessor:
logger.info("FFmpeg manager stopped.")
except Exception as e:
logger.warning(f"Error stopping FFmpeg manager: {e}")
if self.args.diarization and hasattr(self, 'dianization') and hasattr(self.diarization, 'close'):
if self.diarization:
self.diarization.close()
logger.info("AudioProcessor cleanup complete.")