diff --git a/chrome-extension/live_transcription.js b/chrome-extension/live_transcription.js index 51a6c46..84a5472 100644 --- a/chrome-extension/live_transcription.js +++ b/chrome-extension/live_transcription.js @@ -310,7 +310,7 @@ function renderLinesWithBuffer( const showTransLag = !isFinalizing && remaining_time_transcription > 0; const showDiaLag = !isFinalizing && !!buffer_diarization && remaining_time_diarization > 0; const signature = JSON.stringify({ - lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, beg: it.beg, end: it.end })), + lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, start: it.start, end: it.end })), buffer_transcription: buffer_transcription || "", buffer_diarization: buffer_diarization || "", status: current_status, @@ -333,8 +333,8 @@ function renderLinesWithBuffer( const linesHtml = (lines || []) .map((item, idx) => { let timeInfo = ""; - if (item.beg !== undefined && item.end !== undefined) { - timeInfo = ` ${item.beg} - ${item.end}`; + if (item.start !== undefined && item.end !== undefined) { + timeInfo = ` ${item.start} - ${item.end}`; } let speakerLabel = ""; diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index c99d013..942c9ed 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -4,11 +4,11 @@ from time import time, sleep import math import logging import traceback -from whisperlivekit.timed_objects import ASRToken, Silence +from whisperlivekit.timed_objects import ASRToken, Silence, Line from whisperlivekit.core import TranscriptionEngine, online_factory, online_diarization_factory, online_translation_factory from whisperlivekit.ffmpeg_manager import FFmpegManager, FFmpegState from whisperlivekit.silero_vad_iterator import FixedVADIterator -from whisperlivekit.results_formater import format_output, format_time +from whisperlivekit.results_formater import format_output # Set up logging once 
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) @@ -59,7 +59,7 @@ class AudioProcessor: self.silence = False self.silence_duration = 0.0 self.tokens = [] - self.translated_tokens = [] + self.translated_segments = [] self.buffer_transcription = "" self.buffer_diarization = "" self.end_buffer = 0 @@ -153,7 +153,7 @@ class AudioProcessor: return { "tokens": self.tokens.copy(), - "translated_tokens": self.translated_tokens.copy(), + "translated_segments": self.translated_segments.copy(), "buffer_transcription": self.buffer_transcription, "buffer_diarization": self.buffer_diarization, "end_buffer": self.end_buffer, @@ -167,7 +167,7 @@ class AudioProcessor: """Reset all state variables to initial values.""" async with self.lock: self.tokens = [] - self.translated_tokens = [] + self.translated_segments = [] self.buffer_transcription = self.buffer_diarization = "" self.end_buffer = self.end_attributed_speaker = 0 self.beg_loop = time() @@ -431,8 +431,7 @@ class AudioProcessor: tokens_to_process.append(additional_token) if tokens_to_process: online_translation.insert_tokens(tokens_to_process) - translations = online_translation.process() - print(translations) + self.translated_segments = online_translation.process() self.translation_queue.task_done() for _ in additional_tokens: @@ -505,23 +504,19 @@ class AudioProcessor: buffer_diarization = combined response_status = "active_transcription" - final_lines_for_response = lines.copy() - if not tokens and not buffer_transcription and not buffer_diarization: response_status = "no_audio_detected" - final_lines_for_response = [] - elif response_status == "active_transcription" and not final_lines_for_response: - final_lines_for_response = [{ - "speaker": 1, - "text": "", - "beg": format_time(state.get("end_buffer", 0)), - "end": format_time(state.get("end_buffer", 0)), - "diff": 0 - }] + lines = [] + elif response_status == "active_transcription" and 
not lines: + lines = [Line( + speaker=1, + start=state.get("end_buffer", 0), + end=state.get("end_buffer", 0) + )] response = { "status": response_status, - "lines": final_lines_for_response, + "lines": [line.to_dict() for line in lines], "buffer_transcription": buffer_transcription, "buffer_diarization": buffer_diarization, "remaining_time_transcription": state["remaining_time_transcription"], @@ -529,7 +524,7 @@ class AudioProcessor: } current_response_signature = f"{response_status} | " + \ - ' '.join([f"{line['speaker']} {line['text']}" for line in final_lines_for_response]) + \ + ' '.join([f"{line.speaker} {line.text}" for line in lines]) + \ f" | {buffer_transcription} | {buffer_diarization}" trans = state["remaining_time_transcription"] @@ -540,7 +535,7 @@ class AudioProcessor: or round(trans, 1) != round(last_sent_trans, 1) or round(diar, 1) != round(last_sent_diar, 1) ) - if should_push and (final_lines_for_response or buffer_transcription or buffer_diarization or response_status == "no_audio_detected" or trans > 0 or diar > 0): + if should_push and (lines or buffer_transcription or buffer_diarization or response_status == "no_audio_detected" or trans > 0 or diar > 0): yield response self.last_response_content = current_response_signature last_sent_trans = trans @@ -556,7 +551,6 @@ class AudioProcessor: if all_processors_done: logger.info("Results formatter: All upstream processors are done and in stopping state. 
Terminating.") - final_state = await self.get_current_state() return await asyncio.sleep(0.1) # Avoid overwhelming the client diff --git a/whisperlivekit/results_formater.py b/whisperlivekit/results_formater.py index e6f664d..f8d84a7 100644 --- a/whisperlivekit/results_formater.py +++ b/whisperlivekit/results_formater.py @@ -1,7 +1,7 @@ import logging -from datetime import timedelta from whisperlivekit.remove_silences import handle_silences +from whisperlivekit.timed_objects import Line, format_time logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -9,11 +9,6 @@ logger.setLevel(logging.DEBUG) PUNCTUATION_MARKS = {'.', '!', '?', '。', '!', '?'} CHECK_AROUND = 4 -def format_time(seconds: float) -> str: """Format seconds as HH:MM:SS.""" return str(timedelta(seconds=int(seconds))) - - def is_punctuation(token): if token.text.strip() in PUNCTUATION_MARKS: return True @@ -34,32 +29,26 @@ def next_speaker_change(i, tokens, speaker): return ind, token.speaker return None, speaker - def new_line( token, speaker, - last_end_diarized, debug_info = "" ): - return { - "speaker": int(speaker), - "text": token.text + debug_info, - "beg": format_time(token.start), - "end": format_time(token.end), - "diff": round(token.end - last_end_diarized, 2) - } + return Line( + speaker = speaker, + text = token.text + debug_info, + start = token.start, + end = token.end, + ) - -def append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized): +def append_token_to_last_line(lines, sep, token, debug_info): if token.text: - lines[-1]["text"] += sep + token.text + debug_info - lines[-1]["end"] = format_time(token.end) - lines[-1]["diff"] = round(token.end - last_end_diarized, 2) - + lines[-1].text += sep + token.text + debug_info + lines[-1].end = token.end def format_output(state, silence, current_time, diarization, debug): tokens = state["tokens"] - translated_tokens = state["translated_tokens"] # Here we will attribute the speakers only based on the timestamps of the
segments + translated_segments = state["translated_segments"] # Here we will attribute the speakers only based on the timestamps of the segments buffer_transcription = state["buffer_transcription"] buffer_diarization = state["buffer_diarization"] end_attributed_speaker = state["end_attributed_speaker"] @@ -67,13 +56,11 @@ def format_output(state, silence, current_time, diarization, debug): previous_speaker = -1 lines = [] - last_end_diarized = 0 undiarized_text = [] tokens, buffer_transcription, buffer_diarization = handle_silences(tokens, buffer_transcription, buffer_diarization, current_time, silence) last_punctuation = None for i, token in enumerate(tokens): speaker = token.speaker - if not diarization and speaker == -1: #Speaker -1 means no attributed by diarization. In the frontend, it should appear under 'Speaker 1' speaker = 1 if diarization and not tokens[-1].speaker == -2: @@ -82,18 +69,15 @@ def format_output(state, silence, current_time, diarization, debug): continue elif (speaker in [-1, 0]) and token.end < end_attributed_speaker: speaker = previous_speaker - if speaker not in [-1, 0]: - last_end_diarized = max(token.end, last_end_diarized) - debug_info = "" if debug: debug_info = f"[{format_time(token.start)} : {format_time(token.end)}]" if not lines: - lines.append(new_line(token, speaker, last_end_diarized, debug_info = "")) + lines.append(new_line(token, speaker, debug_info = "")) continue else: - previous_speaker = lines[-1]['speaker'] + previous_speaker = lines[-1].speaker if is_punctuation(token): last_punctuation = i @@ -102,7 +86,7 @@ def format_output(state, silence, current_time, diarization, debug): if last_punctuation == i-1: if speaker != previous_speaker: # perfect, diarization perfectly aligned - lines.append(new_line(token, speaker, last_end_diarized, debug_info = "")) + lines.append(new_line(token, speaker, debug_info = "")) last_punctuation, next_punctuation = None, None continue @@ -112,28 +96,38 @@ def format_output(state, silence, 
current_time, diarization, debug): # That was the idea. Okay haha |SPLIT SPEAKER| that's a good one # should become: # That was the idea. |SPLIT SPEAKER| Okay haha that's a good one - lines.append(new_line(token, new_speaker, last_end_diarized, debug_info = "")) + lines.append(new_line(token, new_speaker, debug_info = "")) else: # No speaker change to come - append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized) + append_token_to_last_line(lines, sep, token, debug_info) continue if speaker != previous_speaker: if speaker == -2 or previous_speaker == -2: #silences can happen anytime - lines.append(new_line(token, speaker, last_end_diarized, debug_info = "")) + lines.append(new_line(token, speaker, debug_info = "")) continue elif next_punctuation_change(i, tokens): # Corrects advance: # Are you |SPLIT SPEAKER| okay? yeah, sure. Absolutely # should become: # Are you okay? |SPLIT SPEAKER| yeah, sure. Absolutely - append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized) + append_token_to_last_line(lines, sep, token, debug_info) continue else: #we create a new speaker, but that's no ideal. We are not sure about the split. 
We prefer to append to previous line - # lines.append(new_line(token, speaker, last_end_diarized, debug_info = "")) + # lines.append(new_line(token, speaker, debug_info = "")) pass - append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized) + append_token_to_last_line(lines, sep, token, debug_info) + if lines and translated_segments: + cts_idx = 0 # current_translated_segment_idx + for line in lines: + while cts_idx < len(translated_segments): + ts = translated_segments[cts_idx] + if ts.start and ts.start >= line.start and ts.end <= line.end: + line.translation += ts.text + ' ' + cts_idx += 1 + else: + break return lines, undiarized_text, buffer_transcription, '' diff --git a/whisperlivekit/timed_objects.py b/whisperlivekit/timed_objects.py index c8ad3ec..ab4045a 100644 --- a/whisperlivekit/timed_objects.py +++ b/whisperlivekit/timed_objects.py @@ -1,5 +1,11 @@ from dataclasses import dataclass from typing import Optional +from datetime import timedelta + +def format_time(seconds: float) -> str: + """Format seconds as HH:MM:SS.""" + return str(timedelta(seconds=int(seconds))) + @dataclass class TimedText: @@ -37,4 +43,18 @@ class Translation(TimedText): @dataclass class Silence(): - duration: float \ No newline at end of file + duration: float + + +@dataclass +class Line(TimedText): + translation: str = '' + + def to_dict(self): + return { + 'speaker': int(self.speaker), + 'text': self.text, + 'translation': self.translation, + 'start': format_time(self.start), + 'end': format_time(self.end), + } \ No newline at end of file diff --git a/whisperlivekit/web/live_transcription.js b/whisperlivekit/web/live_transcription.js index af804b5..c51ce35 100644 --- a/whisperlivekit/web/live_transcription.js +++ b/whisperlivekit/web/live_transcription.js @@ -293,7 +293,7 @@ function renderLinesWithBuffer( const showTransLag = !isFinalizing && remaining_time_transcription > 0; const showDiaLag = !isFinalizing && !!buffer_diarization && remaining_time_diarization 
> 0; const signature = JSON.stringify({ - lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, beg: it.beg, end: it.end })), + lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, start: it.start, end: it.end })), buffer_transcription: buffer_transcription || "", buffer_diarization: buffer_diarization || "", status: current_status, @@ -316,8 +316,8 @@ function renderLinesWithBuffer( const linesHtml = (lines || []) .map((item, idx) => { let timeInfo = ""; - if (item.beg !== undefined && item.end !== undefined) { - timeInfo = ` ${item.beg} - ${item.end}`; + if (item.start !== undefined && item.end !== undefined) { + timeInfo = ` ${item.start} - ${item.end}`; } let speakerLabel = "";