From e1823dd99c48006f57189940da529d0e010caf6b Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Sat, 17 Jan 2026 09:35:00 +0100 Subject: [PATCH] Improve online ASR processor --- whisperlivekit/local_agreement/backends.py | 6 +-- whisperlivekit/local_agreement/online_asr.py | 37 +++++++++---------- .../local_agreement/whisper_online.py | 5 +-- 3 files changed, 22 insertions(+), 26 deletions(-) diff --git a/whisperlivekit/local_agreement/backends.py b/whisperlivekit/local_agreement/backends.py index b669ef2..3748944 100644 --- a/whisperlivekit/local_agreement/backends.py +++ b/whisperlivekit/local_agreement/backends.py @@ -44,13 +44,13 @@ class WhisperASR(ASRBase): from whisperlivekit.whisper import load_model as load_whisper_model if model_dir is not None: - resolved_path = resolve_model_path(model_dir) + resolved_path = resolve_model_path(model_dir) if resolved_path.is_dir(): model_info = detect_model_format(resolved_path) if not model_info.has_pytorch: raise FileNotFoundError( f"No supported PyTorch checkpoint found under {resolved_path}" - ) + ) logger.debug(f"Loading Whisper model from custom path {resolved_path}") return load_whisper_model(str(resolved_path), lora_path=self.lora_path) @@ -116,7 +116,7 @@ class FasterWhisperASR(ASRBase): raise ValueError("Either model_size or model_dir must be set") device = "auto" # Allow CTranslate2 to decide available device compute_type = "auto" # Allow CTranslate2 to decide faster compute type - + model = WhisperModel( model_size_or_path, diff --git a/whisperlivekit/local_agreement/online_asr.py b/whisperlivekit/local_agreement/online_asr.py index 28869df..c9e076a 100644 --- a/whisperlivekit/local_agreement/online_asr.py +++ b/whisperlivekit/local_agreement/online_asr.py @@ -28,8 +28,8 @@ class HypothesisBuffer: def insert(self, new_tokens: List[ASRToken], offset: float): """ - Insert new tokens (after applying a time offset) and compare them with the - already committed tokens. Only tokens that extend the committed hypothesis + Insert new tokens (after applying a time offset) and compare them with the + already committed tokens. Only tokens that extend the committed hypothesis are added. """ # Apply the offset to each token. @@ -98,7 +98,7 @@ class OnlineASRProcessor: """ Processes incoming audio in a streaming fashion, calling the ASR system periodically, and uses a hypothesis buffer to commit and trim recognized text. - + The processor supports two types of buffer trimming: - "sentence": trims at sentence boundaries (using a sentence tokenizer) - "segment": trims at fixed segment durations. @@ -187,7 +187,7 @@ class OnlineASRProcessor: def prompt(self) -> Tuple[str, str]: """ Returns a tuple: (prompt, context), where: - - prompt is a 200-character suffix of committed text that falls + - prompt is a 200-character suffix of committed text that falls outside the current audio buffer. - context is the committed text within the current audio buffer. """ @@ -213,7 +213,7 @@ class OnlineASRProcessor: Get the unvalidated buffer in string format. """ return self.concatenate_tokens(self.transcript_buffer.buffer) - + def process_iter(self) -> Tuple[List[ASRToken], float]: """ @@ -262,9 +262,6 @@ class OnlineASRProcessor: logger.debug( f"Length of audio buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:.2f} seconds" ) - if self.global_time_offset: - for token in committed_tokens: - token = token.with_offset(self.global_time_offset) return committed_tokens, current_audio_processed_upto def chunk_completed_sentence(self): @@ -273,19 +270,19 @@ class OnlineASRProcessor: buffer at the end time of the penultimate sentence. Also ensures chunking happens if audio buffer exceeds a time limit. """ - buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE + buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE if not self.committed: if buffer_duration > self.buffer_trimming_sec: chunk_time = self.buffer_time_offset + (buffer_duration / 2) logger.debug(f"--- No speech detected, forced chunking at {chunk_time:.2f}") self.chunk_at(chunk_time) return - + logger.debug("COMPLETED SENTENCE: " + " ".join(token.text for token in self.committed)) sentences = self.words_to_sentences(self.committed) for sentence in sentences: logger.debug(f"\tSentence: {sentence.text}") - + chunk_done = False if len(sentences) >= 2: while len(sentences) > 2: @@ -294,7 +291,7 @@ class OnlineASRProcessor: logger.debug(f"--- Sentence chunked at {chunk_time:.2f}") self.chunk_at(chunk_time) chunk_done = True - + if not chunk_done and buffer_duration > self.buffer_trimming_sec: last_committed_time = self.committed[-1].end logger.debug(f"--- Not enough sentences, chunking at last committed time {last_committed_time:.2f}") @@ -305,17 +302,17 @@ class OnlineASRProcessor: Chunk the audio buffer based on segment-end timestamps reported by the ASR. Also ensures chunking happens if audio buffer exceeds a time limit. """ - buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE + buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE if not self.committed: if buffer_duration > self.buffer_trimming_sec: chunk_time = self.buffer_time_offset + (buffer_duration / 2) logger.debug(f"--- No speech detected, forced chunking at {chunk_time:.2f}") self.chunk_at(chunk_time) return - + logger.debug("Processing committed tokens for segmenting") ends = self.asr.segments_end_ts(res) - last_committed_time = self.committed[-1].end + last_committed_time = self.committed[-1].end chunk_done = False if len(ends) > 1: logger.debug("Multiple segments available for chunking") @@ -331,13 +328,13 @@ class OnlineASRProcessor: logger.debug("--- Last segment not within committed area") else: logger.debug("--- Not enough segments to chunk") - + if not chunk_done and buffer_duration > self.buffer_trimming_sec: logger.debug(f"--- Buffer too large, chunking at last committed time {last_committed_time:.2f}") self.chunk_at(last_committed_time) - + logger.debug("Segment chunking complete") - + def chunk_at(self, time: float): """ Trim both the hypothesis and audio buffer at the given time. @@ -367,7 +364,7 @@ class OnlineASRProcessor: if self.tokenize: try: sentence_texts = self.tokenize(full_text) - except Exception as e: + except Exception: # Some tokenizers (e.g., MosesSentenceSplitter) expect a list input. try: sentence_texts = self.tokenize([full_text]) @@ -398,7 +395,7 @@ class OnlineASRProcessor: ) sentences.append(sentence) return sentences - + def finish(self) -> Tuple[List[ASRToken], float]: """ Flush the remaining transcript when processing ends. diff --git a/whisperlivekit/local_agreement/whisper_online.py b/whisperlivekit/local_agreement/whisper_online.py index 4ab3062..993ca7e 100644 --- a/whisperlivekit/local_agreement/whisper_online.py +++ b/whisperlivekit/local_agreement/whisper_online.py @@ -3,8 +3,7 @@ import logging import platform import time -from whisperlivekit.backend_support import (faster_backend_available, - mlx_backend_available) +from whisperlivekit.backend_support import faster_backend_available, mlx_backend_available from whisperlivekit.model_paths import detect_model_format, resolve_model_path from whisperlivekit.warmup import warmup_asr @@ -39,7 +38,7 @@ def create_tokenizer(lan): lan in "as bn ca cs de el en es et fi fr ga gu hi hu is it kn lt lv ml mni mr nl or pa pl pt ro ru sk sl sv ta te yue zh".split() ): - from mosestokenizer import MosesSentenceSplitter + from mosestokenizer import MosesSentenceSplitter return MosesSentenceSplitter(lan)