From 2bbdc70187ce6ada70f2ed569713bcc3fae9c227 Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Sat, 9 Aug 2025 23:11:05 +0200 Subject: [PATCH] lags are now updated every 0.1s --- whisperlivekit/audio_processor.py | 19 ++++++++-- whisperlivekit/core.py | 13 ++++--- whisperlivekit/web/live_transcription.html | 43 +++++++++++++++++++--- 3 files changed, 60 insertions(+), 15 deletions(-) diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index beee001..2826f1f 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -128,12 +128,12 @@ class AudioProcessor: # Calculate remaining times remaining_transcription = 0 if self.end_buffer > 0: - remaining_transcription = max(0, round(current_time - self.beg_loop - self.end_buffer, 2)) + remaining_transcription = max(0, round(current_time - self.beg_loop - self.end_buffer, 1)) remaining_diarization = 0 if self.tokens: latest_end = max(self.end_buffer, self.tokens[-1].end if self.tokens else 0) - remaining_diarization = max(0, round(latest_end - self.end_attributed_speaker, 2)) + remaining_diarization = max(0, round(latest_end - self.end_attributed_speaker, 1)) return { "tokens": self.tokens.copy(), @@ -343,6 +343,8 @@ class AudioProcessor: async def results_formatter(self): """Format processing results for output.""" + last_sent_trans = None + last_sent_diar = None while True: try: ffmpeg_state = await self.ffmpeg_manager.get_state() @@ -446,10 +448,19 @@ class AudioProcessor: ' '.join([f"{line['speaker']} {line['text']}" for line in final_lines_for_response]) + \ f" | {buffer_transcription} | {buffer_diarization}" - if current_response_signature != self.last_response_content and \ - (final_lines_for_response or buffer_transcription or buffer_diarization or response_status == "no_audio_detected"): + trans = state["remaining_time_transcription"] + diar = state["remaining_time_diarization"] + should_push = ( + current_response_signature != self.last_response_content + or last_sent_trans is None + or round(trans, 1) != round(last_sent_trans, 1) + or round(diar, 1) != round(last_sent_diar, 1) + ) + if should_push and (final_lines_for_response or buffer_transcription or buffer_diarization or response_status == "no_audio_detected" or trans > 0 or diar > 0): yield response self.last_response_content = current_response_signature + last_sent_trans = trans + last_sent_diar = diar # Check for termination condition if self.is_stopping: diff --git a/whisperlivekit/core.py b/whisperlivekit/core.py index f187a35..b3da054 100644 --- a/whisperlivekit/core.py +++ b/whisperlivekit/core.py @@ -25,7 +25,6 @@ class TranscriptionEngine: "host": "localhost", "port": 8000, "warmup_file": None, - "confidence_validation": False, "diarization": False, "punctuation_split": False, "min_chunk_size": 0.5, @@ -37,15 +36,15 @@ class TranscriptionEngine: "backend": "faster-whisper", "vac": False, "vac_chunk_size": 0.04, - "buffer_trimming": "segment", - "buffer_trimming_sec": 15, "log_level": "DEBUG", "ssl_certfile": None, "ssl_keyfile": None, "transcription": True, "vad": True, - "segmentation_model": "pyannote/segmentation-3.0", - "embedding_model": "pyannote/embedding", + # whisperstreaming params: + "buffer_trimming": "segment", + "confidence_validation": False, + "buffer_trimming_sec": 15, # simulstreaming params: "frame_threshold": 25, "beams": 1, @@ -58,6 +57,10 @@ class TranscriptionEngine: "static_init_prompt": None, "max_context_tokens": None, "model_path": './base.pt', + # diart params: + "segmentation_model": "pyannote/segmentation-3.0", + "embedding_model": "pyannote/embedding", + } config_dict = {**defaults, **kwargs} diff --git a/whisperlivekit/web/live_transcription.html b/whisperlivekit/web/live_transcription.html index e3e242c..542d252 100644 --- a/whisperlivekit/web/live_transcription.html +++ b/whisperlivekit/web/live_transcription.html @@ -4,7 +4,7 @@ - Audio Transcription + WhisperLiveKit