From 437641fb43d6f52bc365c00208eca37064d799ff Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Sun, 27 Apr 2025 23:52:00 +0200 Subject: [PATCH] reduce min-chunk-size to 0.1, set default model to base --- whisperlivekit/core.py | 4 ++-- whisperlivekit/parse_args.py | 4 ++-- whisperlivekit/simul_whisper/simul_whisper.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/whisperlivekit/core.py b/whisperlivekit/core.py index 5811f7c..f80b815 100644 --- a/whisperlivekit/core.py +++ b/whisperlivekit/core.py @@ -52,8 +52,8 @@ class TranscriptionEngine: transcription_common_params = { "warmup_file": None, - "min_chunk_size": 0.5, - "model_size": "tiny", + "min_chunk_size": 0.1, + "model_size": "base", "model_cache_dir": None, "model_dir": None, "model_path": None, diff --git a/whisperlivekit/parse_args.py b/whisperlivekit/parse_args.py index 624c59c..b24c029 100644 --- a/whisperlivekit/parse_args.py +++ b/whisperlivekit/parse_args.py @@ -81,14 +81,14 @@ def parse_args(): parser.add_argument( "--min-chunk-size", type=float, - default=0.5, + default=0.1, help="Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.", ) parser.add_argument( "--model", type=str, - default="small", + default="base", dest='model_size', help="Name size of the Whisper model to use (default: tiny). Suggested values: tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo. The model is automatically downloaded from the model hub if not present in model cache dir.", ) diff --git a/whisperlivekit/simul_whisper/simul_whisper.py b/whisperlivekit/simul_whisper/simul_whisper.py index bbcbf07..0f362f3 100644 --- a/whisperlivekit/simul_whisper/simul_whisper.py +++ b/whisperlivekit/simul_whisper/simul_whisper.py @@ -472,7 +472,7 @@ class PaddedAlignAttWhisper: content_mel_len = int((mel_padded.shape[2] - mel.shape[2])/2) encoder_feature = self.model.encoder(mel) end_encode = time() - print('Encoder duration:', end_encode-beg_encode) + # print('Encoder duration:', end_encode-beg_encode) if self.cfg.language == "auto" and self.detected_language is None and self.first_timestamp: seconds_since_start = self.segments_len() - self.first_timestamp