From e9b4ceeee5f4a47bb353445c9f4837901a2d0f8e Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Mon, 17 Nov 2025 22:52:00 +0100 Subject: [PATCH] Add audio partial silence in chunks handling. bump to 0.2.14.post3 --- pyproject.toml | 2 +- whisperlivekit/simul_whisper/backend.py | 12 +++--- .../simul_whisper/generation_progress.py | 43 ------------------- whisperlivekit/simul_whisper/simul_whisper.py | 27 +++--------- 4 files changed, 13 insertions(+), 71 deletions(-) delete mode 100644 whisperlivekit/simul_whisper/generation_progress.py diff --git a/pyproject.toml b/pyproject.toml index d387377..e6decb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "whisperlivekit" -version = "0.2.14.post2" +version = "0.2.14.post3" description = "Real-time speech-to-text with speaker diarization using Whisper" readme = "README.md" authors = [ diff --git a/whisperlivekit/simul_whisper/backend.py b/whisperlivekit/simul_whisper/backend.py index 2c18823..f3714f8 100644 --- a/whisperlivekit/simul_whisper/backend.py +++ b/whisperlivekit/simul_whisper/backend.py @@ -18,7 +18,7 @@ from whisperlivekit.backend_support import ( import torch from whisperlivekit.simul_whisper.config import AlignAttConfig -from whisperlivekit.simul_whisper.simul_whisper import PaddedAlignAttWhisper +from whisperlivekit.simul_whisper.simul_whisper import AlignAtt logger = logging.getLogger(__name__) @@ -34,6 +34,8 @@ if HAS_FASTER_WHISPER: else: WhisperModel = None +MIN_DURATION_REAL_SILENCE = 5 + class SimulStreamingOnlineProcessor: SAMPLING_RATE = 16000 @@ -56,7 +58,7 @@ class SimulStreamingOnlineProcessor: def load_new_backend(self): model = self.asr.get_new_model_instance() - self.model = PaddedAlignAttWhisper( + self.model = AlignAtt( cfg=self.asr.cfg, loaded_model=model, mlx_encoder=self.asr.mlx_encoder, @@ -69,10 +71,10 @@ class SimulStreamingOnlineProcessor: def end_silence(self, silence_duration, offset): """ - If silences are > 5s, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame + If silences are > MIN_DURATION_REAL_SILENCE, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame """ self.end += silence_duration - long_silence = silence_duration >= 5 + long_silence = silence_duration >= MIN_DURATION_REAL_SILENCE if not long_silence: gap_len = int(16000 * silence_duration) if gap_len > 0: @@ -306,7 +308,7 @@ class SimulStreamingASR(): if warmup_audio is not None: warmup_audio = torch.from_numpy(warmup_audio).float() if self.fast_encoder: - temp_model = PaddedAlignAttWhisper( + temp_model = AlignAtt( cfg=self.cfg, loaded_model=whisper_model, mlx_encoder=self.mlx_encoder, diff --git a/whisperlivekit/simul_whisper/generation_progress.py b/whisperlivekit/simul_whisper/generation_progress.py deleted file mode 100644 index 86c6263..0000000 --- a/whisperlivekit/simul_whisper/generation_progress.py +++ /dev/null @@ -1,43 +0,0 @@ -class Tokens: - def __init__(self, tokens): - self.tokens = tokens - -# def clone(self): -# return Tokens(self.tokens.clone()) - - def __str__(self): - return str(self.tokens.tolist()) - - def __repr__(self): - return self.__str__() - -class BeamTokens(Tokens): - def __init__(self, tokens, beam_size): - self.tokens = tokens - self.beam_size = beam_size - - def clone(self): - return BeamTokens(self.tokens.clone()) - - def __str__(self): - return f"BeamTokens({self.tokens.tolist()}, beam_size={self.beam_size})" - - def __repr__(self): - return self.__str__() - - def as_text(self, tokenizer): - return tokenizer.decode(self.tokens) - -class Logits(Tokens): - def __init__(self, logits): - super().__init__(logits) - -# def clone(self): -# return Logits(self.tokens.clone(), self.beam_size) - - def __str__(self): -# return "abc" - return f"Logits({self.tokens.shape})" - - def __repr__(self): - return self.__str__() \ No newline at end of file diff --git a/whisperlivekit/simul_whisper/simul_whisper.py b/whisperlivekit/simul_whisper/simul_whisper.py index 0f362f3..692639d 100644 --- a/whisperlivekit/simul_whisper/simul_whisper.py +++ b/whisperlivekit/simul_whisper/simul_whisper.py @@ -1,17 +1,16 @@ -# This code was originally in simul_whisper/transcriber/simul_whisper.py . It is adapted a lot for SimulStreaming. - import os import logging import torch import torch.nn.functional as F +import numpy as np -from whisperlivekit.whisper import load_model, DecodingOptions, tokenizer +from whisperlivekit.whisper import DecodingOptions, tokenizer from .config import AlignAttConfig from whisperlivekit.timed_objects import ASRToken from whisperlivekit.whisper.audio import log_mel_spectrogram, TOKENS_PER_SECOND, pad_or_trim, N_SAMPLES, N_FRAMES from whisperlivekit.whisper.timing import median_filter -from whisperlivekit.whisper.decoding import GreedyDecoder, BeamSearchDecoder, SuppressTokens, detect_language +from whisperlivekit.whisper.decoding import GreedyDecoder, BeamSearchDecoder, SuppressTokens from .beam import BeamPyTorchInference from .eow_detection import fire_at_boundary, load_cif import os @@ -22,26 +21,18 @@ from whisperlivekit.backend_support import ( faster_backend_available, ) -import numpy as np from ..timed_objects import PUNCTUATION_MARKS -from .generation_progress import * DEC_PAD = 50257 logger = logging.getLogger(__name__) - -HAS_MLX_WHISPER = False -HAS_FASTER_WHISPER = False - if mlx_backend_available(): from mlx_whisper.audio import log_mel_spectrogram as mlx_log_mel_spectrogram from mlx_whisper.transcribe import pad_or_trim as mlx_pad_or_trim - HAS_MLX_WHISPER = True if faster_backend_available(): from faster_whisper.audio import pad_or_trim as fw_pad_or_trim from faster_whisper.feature_extractor import FeatureExtractor - HAS_FASTER_WHISPER = True USE_MLCORE = False @@ -60,7 +51,7 @@ def load_coreml_encoder(): return _coreml_encoder, _coreml_input_name, _coreml_output_name -class PaddedAlignAttWhisper: +class AlignAtt: def __init__( self, cfg: AlignAttConfig, @@ -72,7 +63,7 @@ class PaddedAlignAttWhisper: self.model = loaded_model self.mlx_encoder = mlx_encoder - self.fw_encoder = fw_encoder + self.fw_encoder = fw_encoder if fw_encoder: self.fw_feature_extractor = FeatureExtractor(feature_size=self.model.dims.n_mels) self.coreml_encoder_tuple = None @@ -414,14 +405,6 @@ class PaddedAlignAttWhisper: else: input_segments = self.segments[0] - # if self.cfg.language == "auto" and self.reset_tokenizer_to_auto_next_call: - # logger.debug("Resetting tokenizer to auto for new sentence.") - # self.create_tokenizer(None) - # self.detected_language = None - # self.init_tokens() - # self.reset_tokenizer_to_auto_next_call = False - - # NEW : we can use a different encoder, before using standart whisper for cross attention with the hooks on the decoder beg_encode = time() if self.use_mlcore: coreml_encoder, coreml_input_name, coreml_output_name = self.coreml_encoder_tuple