From 61edb70fffae5b64f839da52462e553bf7642139 Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Sun, 26 Oct 2025 18:54:47 +0100 Subject: [PATCH] audioProcessor state variables are now uniquely in State dataclass --- whisperlivekit/audio_processor.py | 83 ++++++++++++------------------ whisperlivekit/remove_silences.py | 8 +-- whisperlivekit/results_formater.py | 18 +++---- whisperlivekit/timed_objects.py | 18 ++++--- 4 files changed, 54 insertions(+), 73 deletions(-) diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index 2106383..0fc9176 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -67,20 +67,17 @@ class AudioProcessor: self.is_stopping = False self.silence = False self.silence_duration = 0.0 - self.tokens = [] - self.last_validated_token = 0 - self.translated_segments = [] - self.buffer_transcription = Transcript() - self.end_buffer = 0 - self.end_attributed_speaker = 0 + self.state = State() self.lock = asyncio.Lock() - self.beg_loop = 0.0 #to deal with a potential little lag at the websocket initialization, this is now set in process_audio self.sep = " " # Default separator self.last_response_content = FrontData() self.last_detected_speaker = None self.speaker_languages = {} self.diarization_before_transcription = False + self.segments = [] + + if self.diarization_before_transcription: self.cumulative_pcm = [] self.last_start = 0.0 @@ -138,8 +135,8 @@ class AudioProcessor: async def add_dummy_token(self): """Placeholder token when no transcription is available.""" async with self.lock: - current_time = time() - self.beg_loop - self.tokens.append(ASRToken( + current_time = time() - self.state.beg_loop + self.state.tokens.append(ASRToken( start=current_time, end=current_time + 1, text=".", speaker=-1, is_dummy=True )) @@ -149,35 +146,19 @@ class AudioProcessor: async with self.lock: current_time = time() - # Calculate remaining times remaining_transcription = 0 - if self.end_buffer > 0: - remaining_transcription = max(0, round(current_time - self.beg_loop - self.end_buffer, 1)) + if self.state.end_buffer > 0: + remaining_transcription = max(0, round(current_time - self.state.beg_loop - self.state.end_buffer, 1)) remaining_diarization = 0 - if self.tokens: - latest_end = max(self.end_buffer, self.tokens[-1].end if self.tokens else 0) - remaining_diarization = max(0, round(latest_end - self.end_attributed_speaker, 1)) + if self.state.tokens: + latest_end = max(self.state.end_buffer, self.state.tokens[-1].end if self.state.tokens else 0) + remaining_diarization = max(0, round(latest_end - self.state.end_attributed_speaker, 1)) - return State( - tokens=self.tokens.copy(), - last_validated_token=self.last_validated_token, - translated_segments=self.translated_segments.copy(), - buffer_transcription=self.buffer_transcription, - end_buffer=self.end_buffer, - end_attributed_speaker=self.end_attributed_speaker, - remaining_time_transcription=remaining_transcription, - remaining_time_diarization=remaining_diarization - ) + self.state.remaining_time_transcription = remaining_transcription + self.state.remaining_time_diarization = remaining_diarization - async def reset(self): - """Reset all state variables to initial values.""" - async with self.lock: - self.tokens = [] - self.translated_segments = [] - self.buffer_transcription = Transcript() - self.end_buffer = self.end_attributed_speaker = 0 - self.beg_loop = time() + return self.state async def ffmpeg_stdout_reader(self): """Read audio data from FFmpeg stdout and process it into the PCM pipeline.""" @@ -242,15 +223,15 @@ class AudioProcessor: break asr_internal_buffer_duration_s = len(getattr(self.transcription, 'audio_buffer', [])) / self.transcription.SAMPLING_RATE - transcription_lag_s = max(0.0, time() - self.beg_loop - self.end_buffer) + transcription_lag_s = max(0.0, time() - self.state.beg_loop - self.state.end_buffer) asr_processing_logs = f"internal_buffer={asr_internal_buffer_duration_s:.2f}s | lag={transcription_lag_s:.2f}s |" if type(item) is Silence: asr_processing_logs += f" + Silence of = {item.duration:.2f}s" - if self.tokens: - asr_processing_logs += f" | last_end = {self.tokens[-1].end} |" + if self.state.tokens: + asr_processing_logs += f" | last_end = {self.state.tokens[-1].end} |" logger.info(asr_processing_logs) cumulative_pcm_duration_stream_time += item.duration - self.transcription.insert_silence(item.duration, self.tokens[-1].end if self.tokens else 0) + self.transcription.insert_silence(item.duration, self.state.tokens[-1].end if self.state.tokens else 0) continue elif isinstance(item, ChangeSpeaker): self.transcription.new_speaker(item) @@ -274,7 +255,7 @@ class AudioProcessor: if buffer_text.startswith(validated_text): _buffer_transcript.text = buffer_text[len(validated_text):].lstrip() - candidate_end_times = [self.end_buffer] + candidate_end_times = [self.state.end_buffer] if new_tokens: candidate_end_times.append(new_tokens[-1].end) @@ -285,9 +266,9 @@ class AudioProcessor: candidate_end_times.append(current_audio_processed_upto) async with self.lock: - self.tokens.extend(new_tokens) - self.buffer_transcription = _buffer_transcript - self.end_buffer = max(candidate_end_times) + self.state.tokens.extend(new_tokens) + self.state.buffer_transcription = _buffer_transcript + self.state.end_buffer = max(candidate_end_times) if self.translation_queue: for token in new_tokens: @@ -360,12 +341,12 @@ class AudioProcessor: self.last_end = last_segment.end elif not self.diarization_before_transcription: async with self.lock: - self.tokens = diarization_obj.assign_speakers_to_tokens( - self.tokens, + self.state.tokens = diarization_obj.assign_speakers_to_tokens( + self.state.tokens, use_punctuation_split=self.args.punctuation_split ) - if len(self.tokens) > 0: - self.end_attributed_speaker = max(self.tokens[-1].end, self.end_attributed_speaker) + if len(self.state.tokens) > 0: + self.state.end_attributed_speaker = max(self.state.tokens[-1].end, self.state.end_attributed_speaker) self.diarization_queue.task_done() except Exception as e: @@ -406,7 +387,10 @@ class AudioProcessor: tokens_to_process.append(additional_token) if tokens_to_process: self.translation.insert_tokens(tokens_to_process) - self.translated_segments = await asyncio.to_thread(self.translation.process) + translation_validated_segments, translation_buffer = await asyncio.to_thread(self.translation.process) + async with self.lock: + self.state.translation_validated_segments = translation_validated_segments + self.state.translation_buffer = translation_buffer self.translation_queue.task_done() for _ in additional_tokens: self.translation_queue.task_done() @@ -440,7 +424,6 @@ class AudioProcessor: lines, undiarized_text = format_output( state, self.silence, - current_time = time() - self.beg_loop, args = self.args, sep=self.sep ) @@ -454,7 +437,7 @@ class AudioProcessor: buffer_diarization = self.sep.join(undiarized_text) async with self.lock: - self.end_attributed_speaker = state.end_attributed_speaker + self.state.end_attributed_speaker = state.end_attributed_speaker response_status = "active_transcription" if not state.tokens and not buffer_transcription and not buffer_diarization: @@ -580,8 +563,8 @@ class AudioProcessor: async def process_audio(self, message): """Process incoming audio data.""" - if not self.beg_loop: - self.beg_loop = time() + if not self.state.beg_loop: + self.state.beg_loop = time() if not message: logger.info("Empty audio message received, initiating stop sequence.") diff --git a/whisperlivekit/remove_silences.py b/whisperlivekit/remove_silences.py index 6ff472f..785c793 100644 --- a/whisperlivekit/remove_silences.py +++ b/whisperlivekit/remove_silences.py @@ -1,4 +1,5 @@ from whisperlivekit.timed_objects import ASRToken +from time import time import re MIN_SILENCE_DURATION = 4 #in seconds @@ -77,7 +78,8 @@ def no_token_to_silence(tokens): new_tokens.append(token) return new_tokens -def ends_with_silence(tokens, current_time, vac_detected_silence): +def ends_with_silence(tokens, beg_loop, vac_detected_silence): + current_time = time() - (beg_loop if beg_loop else 0.0) last_token = tokens[-1] if vac_detected_silence or (current_time - last_token.end >= END_SILENCE_DURATION): if last_token.speaker == -2: @@ -94,11 +96,11 @@ def ends_with_silence(tokens, current_time, vac_detected_silence): return tokens -def handle_silences(tokens, current_time, vac_detected_silence): +def handle_silences(tokens, beg_loop, vac_detected_silence): if not tokens: return [] tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text tokens = no_token_to_silence(tokens) - tokens = ends_with_silence(tokens, current_time, vac_detected_silence) + tokens = ends_with_silence(tokens, beg_loop, vac_detected_silence) return tokens \ No newline at end of file diff --git a/whisperlivekit/results_formater.py b/whisperlivekit/results_formater.py index 22df26d..9598611 100644 --- a/whisperlivekit/results_formater.py +++ b/whisperlivekit/results_formater.py @@ -52,16 +52,17 @@ def append_token_to_last_line(lines, sep, token): lines[-1].detected_language = token.detected_language -def format_output(state, silence, current_time, args, sep): +def format_output(state, silence, args, sep): diarization = args.diarization disable_punctuation_split = args.disable_punctuation_split tokens = state.tokens - translated_segments = state.translated_segments # Here we will attribute the speakers only based on the timestamps of the segments + translation_validated_segments = state.translation_validated_segments # Here we will attribute the speakers only based on the timestamps of the segments + translation_buffer = state.translation_buffer last_validated_token = state.last_validated_token previous_speaker = 1 undiarized_text = [] - tokens = handle_silences(tokens, current_time, silence) + tokens = handle_silences(tokens, state.beg_loop, silence) last_punctuation = None for i, token in enumerate(tokens[last_validated_token:]): speaker = int(token.speaker) @@ -71,13 +72,6 @@ def format_output(state, silence, current_time, args, sep): token.corrected_speaker = 1 token.validated_speaker = True else: - # if token.end > end_attributed_speaker and token.speaker != -2: - # if tokens[-1].speaker == -2: #if it finishes by a silence, we want to append the undiarized text to the last speaker. - # token.corrected_speaker = previous_speaker - # else: - # undiarized_text.append(token.text) - # continue - # else: if is_punctuation(token): last_punctuation = i @@ -123,9 +117,9 @@ def format_output(state, silence, current_time, args, sep): previous_speaker = token.corrected_speaker - if lines and translated_segments: + if lines: unassigned_translated_segments = [] - for ts in translated_segments: + for ts in translation_validated_segments: assigned = False for line in lines: if ts and ts.overlaps_with(line): diff --git a/whisperlivekit/timed_objects.py b/whisperlivekit/timed_objects.py index 103ed33..23dd3f5 100644 --- a/whisperlivekit/timed_objects.py +++ b/whisperlivekit/timed_objects.py @@ -174,11 +174,13 @@ class ChangeSpeaker: @dataclass class State(): - tokens: list - last_validated_token: int - translated_segments: list - buffer_transcription: str - end_buffer: float - end_attributed_speaker: float - remaining_time_transcription: float - remaining_time_diarization: float + tokens: list = field(default_factory=list) + last_validated_token: int = 0 + translation_validated_segments: list = field(default_factory=list) + translation_buffer: list = field(default_factory=list) + buffer_transcription: str = field(default_factory=Transcript) + end_buffer: float = 0.0 + end_attributed_speaker: float = 0.0 + remaining_time_transcription: float = 0.0 + remaining_time_diarization: float = 0.0 + beg_loop: Optional[int] = None \ No newline at end of file