From 8d9be88fe6fc506209ca4df804958699b53ee910 Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Mon, 10 Nov 2025 15:22:26 +0100 Subject: [PATCH] translation buffer is now displayed in frontend --- whisperlivekit/audio_processor.py | 11 +++++++++-- whisperlivekit/results_formater.py | 1 - whisperlivekit/timed_objects.py | 6 ++++-- whisperlivekit/web/live_transcription.css | 5 +++++ whisperlivekit/web/live_transcription.js | 21 ++++++++++++++++++--- 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index 0e42bb9..5e74819 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -387,10 +387,10 @@ class AudioProcessor: tokens_to_process.append(additional_token) if tokens_to_process: self.translation.insert_tokens(tokens_to_process) - translation_validated_segments, translation_buffer = await asyncio.to_thread(self.translation.process) + translation_validated_segments, buffer_translation = await asyncio.to_thread(self.translation.process) async with self.lock: self.state.translation_validated_segments = translation_validated_segments - self.state.translation_buffer = translation_buffer + self.state.buffer_translation = buffer_translation self.translation_queue.task_done() for _ in additional_tokens: self.translation_queue.task_done() @@ -438,6 +438,12 @@ class AudioProcessor: async with self.lock: self.state.end_attributed_speaker = state.end_attributed_speaker + + buffer_translation_text = '' + if state.buffer_translation: + raw_buffer_translation = getattr(state.buffer_translation, 'text', state.buffer_translation) + if raw_buffer_translation: + buffer_translation_text = raw_buffer_translation.strip() response_status = "active_transcription" if not state.tokens and not buffer_transcription and not buffer_diarization: @@ -455,6 +461,7 @@ class AudioProcessor: lines=lines, buffer_transcription=buffer_transcription.text.strip(), 
buffer_diarization=buffer_diarization, + buffer_translation=buffer_translation_text, remaining_time_transcription=state.remaining_time_transcription, remaining_time_diarization=state.remaining_time_diarization if self.args.diarization else 0 ) diff --git a/whisperlivekit/results_formater.py b/whisperlivekit/results_formater.py index 1e600ec..f5a64bf 100644 --- a/whisperlivekit/results_formater.py +++ b/whisperlivekit/results_formater.py @@ -57,7 +57,6 @@ def format_output(state, silence, args, sep): disable_punctuation_split = args.disable_punctuation_split tokens = state.tokens translation_validated_segments = state.translation_validated_segments # Here we will attribute the speakers only based on the timestamps of the segments - translation_buffer = state.translation_buffer last_validated_token = state.last_validated_token previous_speaker = 1 diff --git a/whisperlivekit/timed_objects.py b/whisperlivekit/timed_objects.py index 7641178..6d9aff9 100644 --- a/whisperlivekit/timed_objects.py +++ b/whisperlivekit/timed_objects.py @@ -151,6 +151,7 @@ class FrontData(): lines: list[Line] = field(default_factory=list) buffer_transcription: str = '' buffer_diarization: str = '' + buffer_translation: str = '' remaining_time_transcription: float = 0. remaining_time_diarization: float = 0. 
@@ -160,6 +161,7 @@ class FrontData(): 'lines': [line.to_dict() for line in self.lines if (line.text or line.speaker == -2)], 'buffer_transcription': self.buffer_transcription, 'buffer_diarization': self.buffer_diarization, + 'buffer_translation': self.buffer_translation, 'remaining_time_transcription': self.remaining_time_transcription, 'remaining_time_diarization': self.remaining_time_diarization, } @@ -178,10 +180,10 @@ class State(): last_validated_token: int = 0 last_punctuation_index: Optional[int] = None translation_validated_segments: list = field(default_factory=list) - translation_buffer: list = field(default_factory=list) + buffer_translation: str = field(default_factory=Transcript) buffer_transcription: str = field(default_factory=Transcript) end_buffer: float = 0.0 end_attributed_speaker: float = 0.0 remaining_time_transcription: float = 0.0 remaining_time_diarization: float = 0.0 - beg_loop: Optional[int] = None \ No newline at end of file + beg_loop: Optional[int] = None diff --git a/whisperlivekit/web/live_transcription.css b/whisperlivekit/web/live_transcription.css index 363c01c..1e4867c 100644 --- a/whisperlivekit/web/live_transcription.css +++ b/whisperlivekit/web/live_transcription.css @@ -490,6 +490,11 @@ label { margin-left: 4px; } +.buffer_translation { + color: #a0a0a0; + margin-left: 6px; +} + .spinner { display: inline-block; width: 8px; diff --git a/whisperlivekit/web/live_transcription.js b/whisperlivekit/web/live_transcription.js index 2d61e41..90fba8c 100644 --- a/whisperlivekit/web/live_transcription.js +++ b/whisperlivekit/web/live_transcription.js @@ -232,10 +232,11 @@ function setupWebSocket() { if (waitingForStop) { statusText.textContent = "Processing finalized or connection closed."; if (lastReceivedData) { - renderLinesWithBuffer( + renderLinesWithBuffer( lastReceivedData.lines || [], lastReceivedData.buffer_diarization || "", lastReceivedData.buffer_transcription || "", + lastReceivedData.buffer_translation || "", 0, 0, true 
@@ -281,6 +282,7 @@ function setupWebSocket() { lastReceivedData.lines || [], lastReceivedData.buffer_diarization || "", lastReceivedData.buffer_transcription || "", + lastReceivedData.buffer_translation || "", 0, 0, true @@ -301,6 +303,7 @@ function setupWebSocket() { lines = [], buffer_transcription = "", buffer_diarization = "", + buffer_translation = "", remaining_time_transcription = 0, remaining_time_diarization = 0, status = "active_transcription", @@ -310,6 +313,7 @@ function setupWebSocket() { lines, buffer_diarization, buffer_transcription, + buffer_translation, remaining_time_diarization, remaining_time_transcription, false, @@ -323,6 +327,7 @@ function renderLinesWithBuffer( lines, buffer_diarization, buffer_transcription, + buffer_translation, remaining_time_diarization, remaining_time_transcription, isFinalizing = false, @@ -341,6 +346,7 @@ function renderLinesWithBuffer( lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, start: it.start, end: it.end, detected_language: it.detected_language })), buffer_transcription: buffer_transcription || "", buffer_diarization: buffer_diarization || "", + buffer_translation: buffer_translation, status: current_status, showLoading, showTransLag, @@ -415,13 +421,22 @@ function renderLinesWithBuffer( } } } - + let translationContent = ""; if (item.translation) { + translationContent += item.translation.trim(); + } + if (idx === lines.length - 1 && buffer_translation) { + const bufferPiece = isFinalizing + ? buffer_translation + : `<span class="buffer_translation">${buffer_translation}</span>`; + translationContent += translationContent ? ` ${bufferPiece}` : bufferPiece; + } + if (translationContent.trim().length > 0) { currentLineText += `
${translationIcon} - ${item.translation} + ${translationContent}
`; }