audio processor: lines now use Line objects instead of dicts

This commit is contained in:
Quentin Fuxa
2025-09-09 21:45:00 +02:00
parent add7ea07ee
commit cb2d4ea88a
5 changed files with 73 additions and 65 deletions

View File

@@ -310,7 +310,7 @@ function renderLinesWithBuffer(
const showTransLag = !isFinalizing && remaining_time_transcription > 0;
const showDiaLag = !isFinalizing && !!buffer_diarization && remaining_time_diarization > 0;
const signature = JSON.stringify({
lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, beg: it.beg, end: it.end })),
lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, start: it.start, end: it.end })),
buffer_transcription: buffer_transcription || "",
buffer_diarization: buffer_diarization || "",
status: current_status,
@@ -333,8 +333,8 @@ function renderLinesWithBuffer(
const linesHtml = (lines || [])
.map((item, idx) => {
let timeInfo = "";
if (item.beg !== undefined && item.end !== undefined) {
timeInfo = ` ${item.beg} - ${item.end}`;
if (item.start !== undefined && item.end !== undefined) {
timeInfo = ` ${item.start} - ${item.end}`;
}
let speakerLabel = "";

View File

@@ -4,11 +4,11 @@ from time import time, sleep
import math
import logging
import traceback
from whisperlivekit.timed_objects import ASRToken, Silence
from whisperlivekit.timed_objects import ASRToken, Silence, Line
from whisperlivekit.core import TranscriptionEngine, online_factory, online_diarization_factory, online_translation_factory
from whisperlivekit.ffmpeg_manager import FFmpegManager, FFmpegState
from whisperlivekit.silero_vad_iterator import FixedVADIterator
from whisperlivekit.results_formater import format_output, format_time
from whisperlivekit.results_formater import format_output
# Set up logging once
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
@@ -59,7 +59,7 @@ class AudioProcessor:
self.silence = False
self.silence_duration = 0.0
self.tokens = []
self.translated_tokens = []
self.translated_segments = []
self.buffer_transcription = ""
self.buffer_diarization = ""
self.end_buffer = 0
@@ -153,7 +153,7 @@ class AudioProcessor:
return {
"tokens": self.tokens.copy(),
"translated_tokens": self.translated_tokens.copy(),
"translated_segments": self.translated_segments.copy(),
"buffer_transcription": self.buffer_transcription,
"buffer_diarization": self.buffer_diarization,
"end_buffer": self.end_buffer,
@@ -167,7 +167,7 @@ class AudioProcessor:
"""Reset all state variables to initial values."""
async with self.lock:
self.tokens = []
self.translated_tokens = []
self.translated_segments = []
self.buffer_transcription = self.buffer_diarization = ""
self.end_buffer = self.end_attributed_speaker = 0
self.beg_loop = time()
@@ -431,8 +431,7 @@ class AudioProcessor:
tokens_to_process.append(additional_token)
if tokens_to_process:
online_translation.insert_tokens(tokens_to_process)
translations = online_translation.process()
print(translations)
self.translated_segments = online_translation.process()
self.translation_queue.task_done()
for _ in additional_tokens:
@@ -505,23 +504,19 @@ class AudioProcessor:
buffer_diarization = combined
response_status = "active_transcription"
final_lines_for_response = lines.copy()
if not tokens and not buffer_transcription and not buffer_diarization:
response_status = "no_audio_detected"
final_lines_for_response = []
elif response_status == "active_transcription" and not final_lines_for_response:
final_lines_for_response = [{
"speaker": 1,
"text": "",
"beg": format_time(state.get("end_buffer", 0)),
"end": format_time(state.get("end_buffer", 0)),
"diff": 0
}]
lines = []
elif response_status == "active_transcription" and not lines:
lines = [Line(
speaker=1,
start=state.get("end_buffer", 0),
end=state.get("end_buffer", 0)
)]
response = {
"status": response_status,
"lines": final_lines_for_response,
"lines": [line.to_dict() for line in lines],
"buffer_transcription": buffer_transcription,
"buffer_diarization": buffer_diarization,
"remaining_time_transcription": state["remaining_time_transcription"],
@@ -529,7 +524,7 @@ class AudioProcessor:
}
current_response_signature = f"{response_status} | " + \
' '.join([f"{line['speaker']} {line['text']}" for line in final_lines_for_response]) + \
' '.join([f"{line.speaker} {line.text}" for line in lines]) + \
f" | {buffer_transcription} | {buffer_diarization}"
trans = state["remaining_time_transcription"]
@@ -540,7 +535,7 @@ class AudioProcessor:
or round(trans, 1) != round(last_sent_trans, 1)
or round(diar, 1) != round(last_sent_diar, 1)
)
if should_push and (final_lines_for_response or buffer_transcription or buffer_diarization or response_status == "no_audio_detected" or trans > 0 or diar > 0):
if should_push and (lines or buffer_transcription or buffer_diarization or response_status == "no_audio_detected" or trans > 0 or diar > 0):
yield response
self.last_response_content = current_response_signature
last_sent_trans = trans
@@ -556,7 +551,6 @@ class AudioProcessor:
if all_processors_done:
logger.info("Results formatter: All upstream processors are done and in stopping state. Terminating.")
final_state = await self.get_current_state()
return
await asyncio.sleep(0.1) # Avoid overwhelming the client

View File

@@ -1,7 +1,7 @@
import logging
from datetime import timedelta
from whisperlivekit.remove_silences import handle_silences
from whisperlivekit.timed_objects import Line, format_time
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
@@ -9,11 +9,6 @@ logger.setLevel(logging.DEBUG)
PUNCTUATION_MARKS = {'.', '!', '?', '。', '！', '？'}
CHECK_AROUND = 4
def format_time(seconds: float) -> str:
"""Format seconds as HH:MM:SS."""
return str(timedelta(seconds=int(seconds)))
def is_punctuation(token):
if token.text.strip() in PUNCTUATION_MARKS:
return True
@@ -34,32 +29,26 @@ def next_speaker_change(i, tokens, speaker):
return ind, token.speaker
return None, speaker
def new_line(
token,
speaker,
last_end_diarized,
debug_info = ""
):
return {
"speaker": int(speaker),
"text": token.text + debug_info,
"beg": format_time(token.start),
"end": format_time(token.end),
"diff": round(token.end - last_end_diarized, 2)
}
return Line(
speaker = speaker,
text = token.text + debug_info,
start = token.start,
end = token.end,
)
def append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized):
def append_token_to_last_line(lines, sep, token, debug_info):
if token.text:
lines[-1]["text"] += sep + token.text + debug_info
lines[-1]["end"] = format_time(token.end)
lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
lines[-1].text += sep + token.text + debug_info
lines[-1].end = token.end
def format_output(state, silence, current_time, diarization, debug):
tokens = state["tokens"]
translated_tokens = state["translated_tokens"] # Here we will attribute the speakers only based on the timestamps of the segments
translated_segments = state["translated_segments"] # Here we will attribute the speakers only based on the timestamps of the segments
buffer_transcription = state["buffer_transcription"]
buffer_diarization = state["buffer_diarization"]
end_attributed_speaker = state["end_attributed_speaker"]
@@ -67,13 +56,11 @@ def format_output(state, silence, current_time, diarization, debug):
previous_speaker = -1
lines = []
last_end_diarized = 0
undiarized_text = []
tokens, buffer_transcription, buffer_diarization = handle_silences(tokens, buffer_transcription, buffer_diarization, current_time, silence)
last_punctuation = None
for i, token in enumerate(tokens):
speaker = token.speaker
if not diarization and speaker == -1: #Speaker -1 means no attributed by diarization. In the frontend, it should appear under 'Speaker 1'
speaker = 1
if diarization and not tokens[-1].speaker == -2:
@@ -82,18 +69,15 @@ def format_output(state, silence, current_time, diarization, debug):
continue
elif (speaker in [-1, 0]) and token.end < end_attributed_speaker:
speaker = previous_speaker
if speaker not in [-1, 0]:
last_end_diarized = max(token.end, last_end_diarized)
debug_info = ""
if debug:
debug_info = f"[{format_time(token.start)} : {format_time(token.end)}]"
if not lines:
lines.append(new_line(token, speaker, last_end_diarized, debug_info = ""))
lines.append(new_line(token, speaker, debug_info = ""))
continue
else:
previous_speaker = lines[-1]['speaker']
previous_speaker = lines[-1].speaker
if is_punctuation(token):
last_punctuation = i
@@ -102,7 +86,7 @@ def format_output(state, silence, current_time, diarization, debug):
if last_punctuation == i-1:
if speaker != previous_speaker:
# perfect, diarization perfectly aligned
lines.append(new_line(token, speaker, last_end_diarized, debug_info = ""))
lines.append(new_line(token, speaker, debug_info = ""))
last_punctuation, next_punctuation = None, None
continue
@@ -112,28 +96,38 @@ def format_output(state, silence, current_time, diarization, debug):
# That was the idea. Okay haha |SPLIT SPEAKER| that's a good one
# should become:
# That was the idea. |SPLIT SPEAKER| Okay haha that's a good one
lines.append(new_line(token, new_speaker, last_end_diarized, debug_info = ""))
lines.append(new_line(token, new_speaker, debug_info = ""))
else:
# No speaker change to come
append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized)
append_token_to_last_line(lines, sep, token, debug_info)
continue
if speaker != previous_speaker:
if speaker == -2 or previous_speaker == -2: #silences can happen anytime
lines.append(new_line(token, speaker, last_end_diarized, debug_info = ""))
lines.append(new_line(token, speaker, debug_info = ""))
continue
elif next_punctuation_change(i, tokens):
# Corrects advance:
# Are you |SPLIT SPEAKER| okay? yeah, sure. Absolutely
# should become:
# Are you okay? |SPLIT SPEAKER| yeah, sure. Absolutely
append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized)
append_token_to_last_line(lines, sep, token, debug_info)
continue
else: #we create a new speaker, but that's no ideal. We are not sure about the split. We prefer to append to previous line
# lines.append(new_line(token, speaker, last_end_diarized, debug_info = ""))
# lines.append(new_line(token, speaker, debug_info = ""))
pass
append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized)
append_token_to_last_line(lines, sep, token, debug_info)
if lines and translated_segments:
cts_idx = 0 # current_translated_segment_idx
for line in lines:
while cts_idx < len(translated_segments):
ts = translated_segments[cts_idx]
if ts.start and ts.start >= line.start and ts.end <= line.end:
line.translation += ts.text + ' '
cts_idx += 1
else:
break
return lines, undiarized_text, buffer_transcription, ''

View File

@@ -1,5 +1,11 @@
from dataclasses import dataclass
from typing import Optional
from datetime import timedelta
def format_time(seconds: float) -> str:
    """Render a duration in seconds as a clock string via ``timedelta``.

    NOTE(review): the output is ``H:MM:SS`` with a single-digit hour field
    (e.g. ``0:01:05``), and durations of a day or more render as
    ``"1 day, 0:00:00"`` — not strictly zero-padded HH:MM:SS. Fractional
    seconds are truncated, not rounded.
    """
    whole_seconds = int(seconds)
    return str(timedelta(seconds=whole_seconds))
@dataclass
class TimedText:
@@ -37,4 +43,18 @@ class Translation(TimedText):
@dataclass
class Silence():
duration: float
duration: float
@dataclass
class Line(TimedText):
    # Translated text accumulated from overlapping translation segments
    # (appended with trailing spaces by the results formatter).
    translation: str = ''

    def to_dict(self):
        """Serialize this line for the websocket/frontend payload.

        Times are rendered as clock strings via ``format_time``; the speaker
        id is coerced to ``int``. NOTE(review): assumes ``speaker``, ``start``
        and ``end`` are non-None by the time a Line is serialized — confirm
        against the formatter that builds these objects.
        """
        payload = {'speaker': int(self.speaker)}
        payload['text'] = self.text
        payload['translation'] = self.translation
        payload['start'] = format_time(self.start)
        payload['end'] = format_time(self.end)
        return payload

View File

@@ -293,7 +293,7 @@ function renderLinesWithBuffer(
const showTransLag = !isFinalizing && remaining_time_transcription > 0;
const showDiaLag = !isFinalizing && !!buffer_diarization && remaining_time_diarization > 0;
const signature = JSON.stringify({
lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, beg: it.beg, end: it.end })),
lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, start: it.start, end: it.end })),
buffer_transcription: buffer_transcription || "",
buffer_diarization: buffer_diarization || "",
status: current_status,
@@ -316,8 +316,8 @@ function renderLinesWithBuffer(
const linesHtml = (lines || [])
.map((item, idx) => {
let timeInfo = "";
if (item.beg !== undefined && item.end !== undefined) {
timeInfo = ` ${item.beg} - ${item.end}`;
if (item.start !== undefined && item.end !== undefined) {
timeInfo = ` ${item.start} - ${item.end}`;
}
let speakerLabel = "";