Reduce the default min-chunk-size to 0.1 s and set the default model to base

This commit is contained in:
Quentin Fuxa
2025-04-27 23:52:00 +02:00
parent bfd60b3921
commit 437641fb43
3 changed files with 5 additions and 5 deletions

View File

@@ -52,8 +52,8 @@ class TranscriptionEngine:
transcription_common_params = {
"warmup_file": None,
"min_chunk_size": 0.5,
"model_size": "tiny",
"min_chunk_size": 0.1,
"model_size": "base",
"model_cache_dir": None,
"model_dir": None,
"model_path": None,

View File

@@ -81,14 +81,14 @@ def parse_args():
parser.add_argument(
"--min-chunk-size",
type=float,
default=0.5,
default=0.1,
help="Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.",
)
parser.add_argument(
"--model",
type=str,
default="small",
default="base",
dest='model_size',
help="Name size of the Whisper model to use (default: tiny). Suggested values: tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo. The model is automatically downloaded from the model hub if not present in model cache dir.",
)

View File

@@ -472,7 +472,7 @@ class PaddedAlignAttWhisper:
content_mel_len = int((mel_padded.shape[2] - mel.shape[2])/2)
encoder_feature = self.model.encoder(mel)
end_encode = time()
print('Encoder duration:', end_encode-beg_encode)
# print('Encoder duration:', end_encode-beg_encode)
if self.cfg.language == "auto" and self.detected_language is None and self.first_timestamp:
seconds_since_start = self.segments_len() - self.first_timestamp