audio processor: lines now use Line objects instead of dicts

This commit is contained in:
Quentin Fuxa
2025-09-09 21:45:00 +02:00
parent add7ea07ee
commit cb2d4ea88a
5 changed files with 73 additions and 65 deletions

View File

@@ -310,7 +310,7 @@ function renderLinesWithBuffer(
const showTransLag = !isFinalizing && remaining_time_transcription > 0;
const showDiaLag = !isFinalizing && !!buffer_diarization && remaining_time_diarization > 0;
const signature = JSON.stringify({
lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, beg: it.beg, end: it.end })),
lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, start: it.start, end: it.end })),
buffer_transcription: buffer_transcription || "",
buffer_diarization: buffer_diarization || "",
status: current_status,
@@ -333,8 +333,8 @@ function renderLinesWithBuffer(
const linesHtml = (lines || [])
.map((item, idx) => {
let timeInfo = "";
if (item.beg !== undefined && item.end !== undefined) {
timeInfo = ` ${item.beg} - ${item.end}`;
if (item.start !== undefined && item.end !== undefined) {
timeInfo = ` ${item.start} - ${item.end}`;
}
let speakerLabel = "";

View File

@@ -4,11 +4,11 @@ from time import time, sleep
import math
import logging
import traceback
from whisperlivekit.timed_objects import ASRToken, Silence
from whisperlivekit.timed_objects import ASRToken, Silence, Line
from whisperlivekit.core import TranscriptionEngine, online_factory, online_diarization_factory, online_translation_factory
from whisperlivekit.ffmpeg_manager import FFmpegManager, FFmpegState
from whisperlivekit.silero_vad_iterator import FixedVADIterator
from whisperlivekit.results_formater import format_output, format_time
from whisperlivekit.results_formater import format_output
# Set up logging once
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
@@ -59,7 +59,7 @@ class AudioProcessor:
self.silence = False
self.silence_duration = 0.0
self.tokens = []
self.translated_tokens = []
self.translated_segments = []
self.buffer_transcription = ""
self.buffer_diarization = ""
self.end_buffer = 0
@@ -153,7 +153,7 @@ class AudioProcessor:
return {
"tokens": self.tokens.copy(),
"translated_tokens": self.translated_tokens.copy(),
"translated_segments": self.translated_segments.copy(),
"buffer_transcription": self.buffer_transcription,
"buffer_diarization": self.buffer_diarization,
"end_buffer": self.end_buffer,
@@ -167,7 +167,7 @@ class AudioProcessor:
"""Reset all state variables to initial values."""
async with self.lock:
self.tokens = []
self.translated_tokens = []
self.translated_segments = []
self.buffer_transcription = self.buffer_diarization = ""
self.end_buffer = self.end_attributed_speaker = 0
self.beg_loop = time()
@@ -431,8 +431,7 @@ class AudioProcessor:
tokens_to_process.append(additional_token)
if tokens_to_process:
online_translation.insert_tokens(tokens_to_process)
translations = online_translation.process()
print(translations)
self.translated_segments = online_translation.process()
self.translation_queue.task_done()
for _ in additional_tokens:
@@ -505,23 +504,19 @@ class AudioProcessor:
buffer_diarization = combined
response_status = "active_transcription"
final_lines_for_response = lines.copy()
if not tokens and not buffer_transcription and not buffer_diarization:
response_status = "no_audio_detected"
final_lines_for_response = []
elif response_status == "active_transcription" and not final_lines_for_response:
final_lines_for_response = [{
"speaker": 1,
"text": "",
"beg": format_time(state.get("end_buffer", 0)),
"end": format_time(state.get("end_buffer", 0)),
"diff": 0
}]
lines = []
elif response_status == "active_transcription" and not lines:
lines = [Line(
speaker=1,
start=state.get("end_buffer", 0),
end=state.get("end_buffer", 0)
)]
response = {
"status": response_status,
"lines": final_lines_for_response,
"lines": [line.to_dict() for line in lines],
"buffer_transcription": buffer_transcription,
"buffer_diarization": buffer_diarization,
"remaining_time_transcription": state["remaining_time_transcription"],
@@ -529,7 +524,7 @@ class AudioProcessor:
}
current_response_signature = f"{response_status} | " + \
' '.join([f"{line['speaker']} {line['text']}" for line in final_lines_for_response]) + \
' '.join([f"{line.speaker} {line.text}" for line in lines]) + \
f" | {buffer_transcription} | {buffer_diarization}"
trans = state["remaining_time_transcription"]
@@ -540,7 +535,7 @@ class AudioProcessor:
or round(trans, 1) != round(last_sent_trans, 1)
or round(diar, 1) != round(last_sent_diar, 1)
)
if should_push and (final_lines_for_response or buffer_transcription or buffer_diarization or response_status == "no_audio_detected" or trans > 0 or diar > 0):
if should_push and (lines or buffer_transcription or buffer_diarization or response_status == "no_audio_detected" or trans > 0 or diar > 0):
yield response
self.last_response_content = current_response_signature
last_sent_trans = trans
@@ -556,7 +551,6 @@ class AudioProcessor:
if all_processors_done:
logger.info("Results formatter: All upstream processors are done and in stopping state. Terminating.")
final_state = await self.get_current_state()
return
await asyncio.sleep(0.1) # Avoid overwhelming the client

View File

@@ -1,7 +1,7 @@
import logging
from datetime import timedelta
from whisperlivekit.remove_silences import handle_silences
from whisperlivekit.timed_objects import Line, format_time
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
@@ -9,11 +9,6 @@ logger.setLevel(logging.DEBUG)
PUNCTUATION_MARKS = {'.', '!', '?', '。', '！', '？'}
CHECK_AROUND = 4
def format_time(seconds: float) -> str:
"""Format seconds as HH:MM:SS."""
return str(timedelta(seconds=int(seconds)))
def is_punctuation(token):
if token.text.strip() in PUNCTUATION_MARKS:
return True
@@ -34,32 +29,26 @@ def next_speaker_change(i, tokens, speaker):
return ind, token.speaker
return None, speaker
def new_line(
token,
speaker,
last_end_diarized,
debug_info = ""
):
return {
"speaker": int(speaker),
"text": token.text + debug_info,
"beg": format_time(token.start),
"end": format_time(token.end),
"diff": round(token.end - last_end_diarized, 2)
}
return Line(
speaker = speaker,
text = token.text + debug_info,
start = token.start,
end = token.end,
)
def append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized):
def append_token_to_last_line(lines, sep, token, debug_info):
if token.text:
lines[-1]["text"] += sep + token.text + debug_info
lines[-1]["end"] = format_time(token.end)
lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
lines[-1].text += sep + token.text + debug_info
lines[-1].end = token.end
def format_output(state, silence, current_time, diarization, debug):
tokens = state["tokens"]
translated_tokens = state["translated_tokens"] # Here we will attribute the speakers only based on the timestamps of the segments
translated_segments = state["translated_segments"] # Here we will attribute the speakers only based on the timestamps of the segments
buffer_transcription = state["buffer_transcription"]
buffer_diarization = state["buffer_diarization"]
end_attributed_speaker = state["end_attributed_speaker"]
@@ -67,13 +56,11 @@ def format_output(state, silence, current_time, diarization, debug):
previous_speaker = -1
lines = []
last_end_diarized = 0
undiarized_text = []
tokens, buffer_transcription, buffer_diarization = handle_silences(tokens, buffer_transcription, buffer_diarization, current_time, silence)
last_punctuation = None
for i, token in enumerate(tokens):
speaker = token.speaker
if not diarization and speaker == -1: #Speaker -1 means no attributed by diarization. In the frontend, it should appear under 'Speaker 1'
speaker = 1
if diarization and not tokens[-1].speaker == -2:
@@ -82,18 +69,15 @@ def format_output(state, silence, current_time, diarization, debug):
continue
elif (speaker in [-1, 0]) and token.end < end_attributed_speaker:
speaker = previous_speaker
if speaker not in [-1, 0]:
last_end_diarized = max(token.end, last_end_diarized)
debug_info = ""
if debug:
debug_info = f"[{format_time(token.start)} : {format_time(token.end)}]"
if not lines:
lines.append(new_line(token, speaker, last_end_diarized, debug_info = ""))
lines.append(new_line(token, speaker, debug_info = ""))
continue
else:
previous_speaker = lines[-1]['speaker']
previous_speaker = lines[-1].speaker
if is_punctuation(token):
last_punctuation = i
@@ -102,7 +86,7 @@ def format_output(state, silence, current_time, diarization, debug):
if last_punctuation == i-1:
if speaker != previous_speaker:
# perfect, diarization perfectly aligned
lines.append(new_line(token, speaker, last_end_diarized, debug_info = ""))
lines.append(new_line(token, speaker, debug_info = ""))
last_punctuation, next_punctuation = None, None
continue
@@ -112,28 +96,38 @@ def format_output(state, silence, current_time, diarization, debug):
# That was the idea. Okay haha |SPLIT SPEAKER| that's a good one
# should become:
# That was the idea. |SPLIT SPEAKER| Okay haha that's a good one
lines.append(new_line(token, new_speaker, last_end_diarized, debug_info = ""))
lines.append(new_line(token, new_speaker, debug_info = ""))
else:
# No speaker change to come
append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized)
append_token_to_last_line(lines, sep, token, debug_info)
continue
if speaker != previous_speaker:
if speaker == -2 or previous_speaker == -2: #silences can happen anytime
lines.append(new_line(token, speaker, last_end_diarized, debug_info = ""))
lines.append(new_line(token, speaker, debug_info = ""))
continue
elif next_punctuation_change(i, tokens):
# Corrects advance:
# Are you |SPLIT SPEAKER| okay? yeah, sure. Absolutely
# should become:
# Are you okay? |SPLIT SPEAKER| yeah, sure. Absolutely
append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized)
append_token_to_last_line(lines, sep, token, debug_info)
continue
else: #we create a new speaker, but that's no ideal. We are not sure about the split. We prefer to append to previous line
# lines.append(new_line(token, speaker, last_end_diarized, debug_info = ""))
# lines.append(new_line(token, speaker, debug_info = ""))
pass
append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized)
append_token_to_last_line(lines, sep, token, debug_info)
if lines and translated_segments:
cts_idx = 0 # current_translated_segment_idx
for line in lines:
while cts_idx < len(translated_segments):
ts = translated_segments[cts_idx]
if ts.start and ts.start >= line.start and ts.end <= line.end:
line.translation += ts.text + ' '
cts_idx += 1
else:
break
return lines, undiarized_text, buffer_transcription, ''

View File

@@ -1,5 +1,11 @@
from dataclasses import dataclass
from typing import Optional
from datetime import timedelta
def format_time(seconds: float) -> str:
    """Render a duration in seconds as a clock string via ``timedelta``.

    NOTE(review): the output is ``H:MM:SS`` with a single-digit hour field
    (e.g. ``0:01:05``), and durations of a day or more render as
    ``"1 day, 0:00:00"`` — not strictly zero-padded HH:MM:SS. Fractional
    seconds are truncated, not rounded.
    """
    whole_seconds = int(seconds)
    return str(timedelta(seconds=whole_seconds))
@dataclass
class TimedText:
@@ -37,4 +43,18 @@ class Translation(TimedText):
@dataclass
class Silence():
duration: float
duration: float
@dataclass
class Line(TimedText):
    # Translated text accumulated from overlapping translation segments
    # (appended with trailing spaces by the results formatter).
    translation: str = ''

    def to_dict(self):
        """Serialize this line for the websocket/frontend payload.

        Times are rendered as clock strings via ``format_time``; the speaker
        id is coerced to ``int``. NOTE(review): assumes ``speaker``, ``start``
        and ``end`` are non-None by the time a Line is serialized — confirm
        against the formatter that builds these objects.
        """
        payload = {'speaker': int(self.speaker)}
        payload['text'] = self.text
        payload['translation'] = self.translation
        payload['start'] = format_time(self.start)
        payload['end'] = format_time(self.end)
        return payload

View File

@@ -293,7 +293,7 @@ function renderLinesWithBuffer(
const showTransLag = !isFinalizing && remaining_time_transcription > 0;
const showDiaLag = !isFinalizing && !!buffer_diarization && remaining_time_diarization > 0;
const signature = JSON.stringify({
lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, beg: it.beg, end: it.end })),
lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, start: it.start, end: it.end })),
buffer_transcription: buffer_transcription || "",
buffer_diarization: buffer_diarization || "",
status: current_status,
@@ -316,8 +316,8 @@ function renderLinesWithBuffer(
const linesHtml = (lines || [])
.map((item, idx) => {
let timeInfo = "";
if (item.beg !== undefined && item.end !== undefined) {
timeInfo = ` ${item.beg} - ${item.end}`;
if (item.start !== undefined && item.end !== undefined) {
timeInfo = ` ${item.start} - ${item.end}`;
}
let speakerLabel = "";