diff --git a/whisperlivekit/core.py b/whisperlivekit/core.py index 5811f7c..f80b815 100644 --- a/whisperlivekit/core.py +++ b/whisperlivekit/core.py @@ -52,8 +52,8 @@ class TranscriptionEngine: transcription_common_params = { "warmup_file": None, - "min_chunk_size": 0.5, - "model_size": "tiny", + "min_chunk_size": 0.1, + "model_size": "base", "model_cache_dir": None, "model_dir": None, "model_path": None, diff --git a/whisperlivekit/parse_args.py b/whisperlivekit/parse_args.py index 624c59c..b24c029 100644 --- a/whisperlivekit/parse_args.py +++ b/whisperlivekit/parse_args.py @@ -81,14 +81,14 @@ def parse_args(): parser.add_argument( "--min-chunk-size", type=float, - default=0.5, + default=0.1, help="Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.", ) parser.add_argument( "--model", type=str, - default="small", + default="base", dest='model_size', - help="Name size of the Whisper model to use (default: tiny). Suggested values: tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo. The model is automatically downloaded from the model hub if not present in model cache dir.", + help="Name size of the Whisper model to use (default: base). Suggested values: tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo. The model is automatically downloaded from the model hub if not present in model cache dir.", ) diff --git a/whisperlivekit/simul_whisper/simul_whisper.py b/whisperlivekit/simul_whisper/simul_whisper.py index bbcbf07..0f362f3 100644 --- a/whisperlivekit/simul_whisper/simul_whisper.py +++ b/whisperlivekit/simul_whisper/simul_whisper.py @@ -472,7 +472,7 @@ class PaddedAlignAttWhisper: content_mel_len = int((mel_padded.shape[2] - mel.shape[2])/2) encoder_feature = self.model.encoder(mel) end_encode = time() - print('Encoder duration:', end_encode-beg_encode) + # print('Encoder duration:', end_encode-beg_encode) if self.cfg.language == "auto" and self.detected_language is None and self.first_timestamp: seconds_since_start = self.segments_len() - self.first_timestamp