diff --git a/README.md b/README.md index 02387ed..9c00570 100644 --- a/README.md +++ b/README.md @@ -12,12 +12,12 @@ This project extends the [Whisper Streaming](https://github.com/ufal/whisper_str 5. **MLX Whisper backend**: Integrates the alternative backend option MLX Whisper, optimized for efficient speech recognition on Apple silicon. -![Demo Screenshot](src/demo.png) +![Demo Screenshot](src/web/demo.png) ## Code Origins This project reuses and extends code from the original Whisper Streaming repository: -- whisper_online.py: Contains code from whisper_streaming +- whisper_online.py, backends.py and online_asr.py: Contains code from whisper_streaming - silero_vad_iterator.py: Originally from the Silero VAD repository, included in the whisper_streaming project. ## Installation diff --git a/src/demo.png b/src/web/demo.png similarity index 100% rename from src/demo.png rename to src/web/demo.png diff --git a/src/live_transcription.html b/src/web/live_transcription.html similarity index 100% rename from src/live_transcription.html rename to src/web/live_transcription.html diff --git a/src/whisper_streaming/backends.py b/src/whisper_streaming/backends.py new file mode 100644 index 0000000..682cfc5 --- /dev/null +++ b/src/whisper_streaming/backends.py @@ -0,0 +1,368 @@ +import sys +import logging + +import io +import soundfile as sf +import math + + +logger = logging.getLogger(__name__) + +class ASRBase: + sep = " " # join transcribe words with this character (" " for whisper_timestamped, + # "" for faster-whisper because it emits the spaces when neeeded) + + def __init__( + self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr + ): + self.logfile = logfile + + self.transcribe_kargs = {} + if lan == "auto": + self.original_language = None + else: + self.original_language = lan + + self.model = self.load_model(modelsize, cache_dir, model_dir) + + def load_model(self, modelsize, cache_dir): + raise NotImplemented("must be implemented in the child class") + + def transcribe(self, audio, init_prompt=""): + raise NotImplemented("must be implemented in the child class") + + def use_vad(self): + raise NotImplemented("must be implemented in the child class") + + +class WhisperTimestampedASR(ASRBase): + """Uses whisper_timestamped library as the backend. Initially, we tested the code on this backend. It worked, but slower than faster-whisper. + On the other hand, the installation for GPU could be easier. + """ + + sep = " " + + def load_model(self, modelsize=None, cache_dir=None, model_dir=None): + import whisper + import whisper_timestamped + from whisper_timestamped import transcribe_timestamped + + self.transcribe_timestamped = transcribe_timestamped + if model_dir is not None: + logger.debug("ignoring model_dir, not implemented") + return whisper.load_model(modelsize, download_root=cache_dir) + + def transcribe(self, audio, init_prompt=""): + result = self.transcribe_timestamped( + self.model, + audio, + language=self.original_language, + initial_prompt=init_prompt, + verbose=None, + condition_on_previous_text=True, + **self.transcribe_kargs, + ) + return result + + def ts_words(self, r): + # return: transcribe result object to [(beg,end,"word1"), ...] + o = [] + for s in r["segments"]: + for w in s["words"]: + t = (w["start"], w["end"], w["text"]) + o.append(t) + return o + + def segments_end_ts(self, res): + return [s["end"] for s in res["segments"]] + + def use_vad(self): + self.transcribe_kargs["vad"] = True + + def set_translate_task(self): + self.transcribe_kargs["task"] = "translate" + + +class FasterWhisperASR(ASRBase): + """Uses faster-whisper library as the backend. Works much faster, appx 4-times (in offline mode). For GPU, it requires installation with a specific CUDNN version.""" + + sep = "" + + def load_model(self, modelsize=None, cache_dir=None, model_dir=None): + from faster_whisper import WhisperModel + + # logging.getLogger("faster_whisper").setLevel(logger.level) + if model_dir is not None: + logger.debug( + f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used." + ) + model_size_or_path = model_dir + elif modelsize is not None: + model_size_or_path = modelsize + else: + raise ValueError("modelsize or model_dir parameter must be set") + + # this worked fast and reliably on NVIDIA L40 + model = WhisperModel( + model_size_or_path, + device="cuda", + compute_type="float16", + download_root=cache_dir, + ) + + # or run on GPU with INT8 + # tested: the transcripts were different, probably worse than with FP16, and it was slightly (appx 20%) slower + # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16") + + # or run on CPU with INT8 + # tested: works, but slow, appx 10-times than cuda FP16 + # model = WhisperModel(modelsize, device="cpu", compute_type="int8") #, download_root="faster-disk-cache-dir/") + return model + + def transcribe(self, audio, init_prompt=""): + + # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01) + segments, info = self.model.transcribe( + audio, + language=self.original_language, + initial_prompt=init_prompt, + beam_size=5, + word_timestamps=True, + condition_on_previous_text=True, + **self.transcribe_kargs, + ) + # print(info) # info contains language detection result + + return list(segments) + + def ts_words(self, segments): + o = [] + for segment in segments: + for word in segment.words: + if segment.no_speech_prob > 0.9: + continue + # not stripping the spaces -- should not be merged with them! + w = word.word + t = (word.start, word.end, w) + o.append(t) + return o + + def segments_end_ts(self, res): + return [s.end for s in res] + + def use_vad(self): + self.transcribe_kargs["vad_filter"] = True + + def set_translate_task(self): + self.transcribe_kargs["task"] = "translate" + + +class MLXWhisper(ASRBase): + """ + Uses MPX Whisper library as the backend, optimized for Apple Silicon. + Models available: https://huggingface.co/collections/mlx-community/whisper-663256f9964fbb1177db93dc + Significantly faster than faster-whisper (without CUDA) on Apple M1. + """ + + sep = " " + + def load_model(self, modelsize=None, cache_dir=None, model_dir=None): + """ + Loads the MLX-compatible Whisper model. + + Args: + modelsize (str, optional): The size or name of the Whisper model to load. + If provided, it will be translated to an MLX-compatible model path using the `translate_model_name` method. + Example: "large-v3-turbo" -> "mlx-community/whisper-large-v3-turbo". + cache_dir (str, optional): Path to the directory for caching models. + **Note**: This is not supported by MLX Whisper and will be ignored. + model_dir (str, optional): Direct path to a custom model directory. + If specified, it overrides the `modelsize` parameter. + """ + from mlx_whisper.transcribe import ModelHolder, transcribe + import mlx.core as mx + + if model_dir is not None: + logger.debug( + f"Loading whisper model from model_dir {model_dir}. modelsize parameter is not used." + ) + model_size_or_path = model_dir + elif modelsize is not None: + model_size_or_path = self.translate_model_name(modelsize) + logger.debug( + f"Loading whisper model {modelsize}. You use mlx whisper, so {model_size_or_path} will be used." + ) + + self.model_size_or_path = model_size_or_path + + # In mlx_whisper.transcribe, dtype is defined as: + # dtype = mx.float16 if decode_options.get("fp16", True) else mx.float32 + # Since we do not use decode_options in self.transcribe, we will set dtype to mx.float16 + dtype = mx.float16 + ModelHolder.get_model(model_size_or_path, dtype) + return transcribe + + def translate_model_name(self, model_name): + """ + Translates a given model name to its corresponding MLX-compatible model path. + + Args: + model_name (str): The name of the model to translate. + + Returns: + str: The MLX-compatible model path. + """ + # Dictionary mapping model names to MLX-compatible paths + model_mapping = { + "tiny.en": "mlx-community/whisper-tiny.en-mlx", + "tiny": "mlx-community/whisper-tiny-mlx", + "base.en": "mlx-community/whisper-base.en-mlx", + "base": "mlx-community/whisper-base-mlx", + "small.en": "mlx-community/whisper-small.en-mlx", + "small": "mlx-community/whisper-small-mlx", + "medium.en": "mlx-community/whisper-medium.en-mlx", + "medium": "mlx-community/whisper-medium-mlx", + "large-v1": "mlx-community/whisper-large-v1-mlx", + "large-v2": "mlx-community/whisper-large-v2-mlx", + "large-v3": "mlx-community/whisper-large-v3-mlx", + "large-v3-turbo": "mlx-community/whisper-large-v3-turbo", + "large": "mlx-community/whisper-large-mlx", + } + + # Retrieve the corresponding MLX model path + mlx_model_path = model_mapping.get(model_name) + + if mlx_model_path: + return mlx_model_path + else: + raise ValueError( + f"Model name '{model_name}' is not recognized or not supported." + ) + + def transcribe(self, audio, init_prompt=""): + if self.transcribe_kargs: + logger.warning("Transcribe kwargs (vad, task) are not compatible with MLX Whisper and will be ignored.") + segments = self.model( + audio, + language=self.original_language, + initial_prompt=init_prompt, + word_timestamps=True, + condition_on_previous_text=True, + path_or_hf_repo=self.model_size_or_path, + ) + return segments.get("segments", []) + + def ts_words(self, segments): + """ + Extract timestamped words from transcription segments and skips words with high no-speech probability. + """ + return [ + (word["start"], word["end"], word["word"]) + for segment in segments + for word in segment.get("words", []) + if segment.get("no_speech_prob", 0) <= 0.9 + ] + + def segments_end_ts(self, res): + return [s["end"] for s in res] + + def use_vad(self): + self.transcribe_kargs["vad_filter"] = True + + def set_translate_task(self): + self.transcribe_kargs["task"] = "translate" + + +class OpenaiApiASR(ASRBase): + """Uses OpenAI's Whisper API for audio transcription.""" + + def __init__(self, lan=None, temperature=0, logfile=sys.stderr): + self.logfile = logfile + + self.modelname = "whisper-1" + self.original_language = ( + None if lan == "auto" else lan + ) # ISO-639-1 language code + self.response_format = "verbose_json" + self.temperature = temperature + + self.load_model() + + self.use_vad_opt = False + + # reset the task in set_translate_task + self.task = "transcribe" + + def load_model(self, *args, **kwargs): + from openai import OpenAI + + self.client = OpenAI() + + self.transcribed_seconds = ( + 0 # for logging how many seconds were processed by API, to know the cost + ) + + def ts_words(self, segments): + no_speech_segments = [] + if self.use_vad_opt: + for segment in segments.segments: + # TODO: threshold can be set from outside + if segment["no_speech_prob"] > 0.8: + no_speech_segments.append( + (segment.get("start"), segment.get("end")) + ) + + o = [] + for word in segments.words: + start = word.start + end = word.end + if any(s[0] <= start <= s[1] for s in no_speech_segments): + # print("Skipping word", word.get("word"), "because it's in a no-speech segment") + continue + o.append((start, end, word.word)) + return o + + def segments_end_ts(self, res): + return [s.end for s in res.words] + + def transcribe(self, audio_data, prompt=None, *args, **kwargs): + # Write the audio data to a buffer + buffer = io.BytesIO() + buffer.name = "temp.wav" + sf.write(buffer, audio_data, samplerate=16000, format="WAV", subtype="PCM_16") + buffer.seek(0) # Reset buffer's position to the beginning + + self.transcribed_seconds += math.ceil( + len(audio_data) / 16000 + ) # it rounds up to the whole seconds + + params = { + "model": self.modelname, + "file": buffer, + "response_format": self.response_format, + "temperature": self.temperature, + "timestamp_granularities": ["word", "segment"], + } + if self.task != "translate" and self.original_language: + params["language"] = self.original_language + if prompt: + params["prompt"] = prompt + + if self.task == "translate": + proc = self.client.audio.translations + else: + proc = self.client.audio.transcriptions + + # Process transcription/translation + transcript = proc.create(**params) + logger.debug( + f"OpenAI API processed accumulated {self.transcribed_seconds} seconds" + ) + + return transcript + + def use_vad(self): + self.use_vad_opt = True + + def set_translate_task(self): + self.task = "translate" diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py new file mode 100644 index 0000000..dc34fd8 --- /dev/null +++ b/src/whisper_streaming/online_asr.py @@ -0,0 +1,401 @@ +import sys +import numpy as np +import logging + +logger = logging.getLogger(__name__) + +class HypothesisBuffer: + + def __init__(self, logfile=sys.stderr): + self.commited_in_buffer = [] + self.buffer = [] + self.new = [] + + self.last_commited_time = 0 + self.last_commited_word = None + + self.logfile = logfile + + def insert(self, new, offset): + # compare self.commited_in_buffer and new. It inserts only the words in new that extend the commited_in_buffer, it means they are roughly behind last_commited_time and new in content + # the new tail is added to self.new + + new = [(a + offset, b + offset, t) for a, b, t in new] + self.new = [(a, b, t) for a, b, t in new if a > self.last_commited_time - 0.1] + + if len(self.new) >= 1: + a, b, t = self.new[0] + if abs(a - self.last_commited_time) < 1: + if self.commited_in_buffer: + # it's going to search for 1, 2, ..., 5 consecutive words (n-grams) that are identical in commited and new. If they are, they're dropped. + cn = len(self.commited_in_buffer) + nn = len(self.new) + for i in range(1, min(min(cn, nn), 5) + 1): # 5 is the maximum + c = " ".join( + [self.commited_in_buffer[-j][2] for j in range(1, i + 1)][ + ::-1 + ] + ) + tail = " ".join(self.new[j - 1][2] for j in range(1, i + 1)) + if c == tail: + words = [] + for j in range(i): + words.append(repr(self.new.pop(0))) + words_msg = " ".join(words) + logger.debug(f"removing last {i} words: {words_msg}") + break + + def flush(self): + # returns commited chunk = the longest common prefix of 2 last inserts. + + commit = [] + while self.new: + na, nb, nt = self.new[0] + + if len(self.buffer) == 0: + break + + if nt == self.buffer[0][2]: + commit.append((na, nb, nt)) + self.last_commited_word = nt + self.last_commited_time = nb + self.buffer.pop(0) + self.new.pop(0) + else: + break + self.buffer = self.new + self.new = [] + self.commited_in_buffer.extend(commit) + return commit + + def pop_commited(self, time): + while self.commited_in_buffer and self.commited_in_buffer[0][1] <= time: + self.commited_in_buffer.pop(0) + + def complete(self): + return self.buffer + + +class OnlineASRProcessor: + + SAMPLING_RATE = 16000 + + def __init__( + self, + asr, + tokenize_method=None, + buffer_trimming=("segment", 15), + logfile=sys.stderr, + ): + """asr: WhisperASR object + tokenize_method: sentence tokenizer function for the target language. Must be a callable and behaves like the one of MosesTokenizer. It can be None, if "segment" buffer trimming option is used, then tokenizer is not used at all. + ("segment", 15) + buffer_trimming: a pair of (option, seconds), where option is either "sentence" or "segment", and seconds is a number. Buffer is trimmed if it is longer than "seconds" threshold. Default is the most recommended option. + logfile: where to store the log. + """ + self.asr = asr + self.tokenize = tokenize_method + self.logfile = logfile + + self.init() + + self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming + + def init(self, offset=None): + """run this when starting or restarting processing""" + self.audio_buffer = np.array([], dtype=np.float32) + self.transcript_buffer = HypothesisBuffer(logfile=self.logfile) + self.buffer_time_offset = 0 + if offset is not None: + self.buffer_time_offset = offset + self.transcript_buffer.last_commited_time = self.buffer_time_offset + self.commited = [] + + def insert_audio_chunk(self, audio): + self.audio_buffer = np.append(self.audio_buffer, audio) + + def prompt(self): + """Returns a tuple: (prompt, context), where "prompt" is a 200-character suffix of commited text that is inside of the scrolled away part of audio buffer. + "context" is the commited text that is inside the audio buffer. It is transcribed again and skipped. It is returned only for debugging and logging reasons. + """ + k = max(0, len(self.commited) - 1) + while k > 0 and self.commited[k - 1][1] > self.buffer_time_offset: + k -= 1 + + p = self.commited[:k] + p = [t for _, _, t in p] + prompt = [] + l = 0 + while p and l < 200: # 200 characters prompt size + x = p.pop(-1) + l += len(x) + 1 + prompt.append(x) + non_prompt = self.commited[k:] + return self.asr.sep.join(prompt[::-1]), self.asr.sep.join( + t for _, _, t in non_prompt + ) + + def process_iter(self): + """Runs on the current audio buffer. + Returns: a tuple (beg_timestamp, end_timestamp, "text"), or (None, None, ""). + The non-emty text is confirmed (committed) partial transcript. + """ + + prompt, non_prompt = self.prompt() + logger.debug(f"PROMPT: {prompt}") + logger.debug(f"CONTEXT: {non_prompt}") + logger.debug( + f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}" + ) + res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt) + + # transform to [(beg,end,"word1"), ...] + tsw = self.asr.ts_words(res) + + self.transcript_buffer.insert(tsw, self.buffer_time_offset) + o = self.transcript_buffer.flush() + self.commited.extend(o) + completed = self.to_flush(o) + logger.debug(f">>>>COMPLETE NOW: {completed[2]}") + the_rest = self.to_flush(self.transcript_buffer.complete()) + logger.debug(f"INCOMPLETE: {the_rest[2]}") + + # there is a newly confirmed text + + if o and self.buffer_trimming_way == "sentence": # trim the completed sentences + if ( + len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec + ): # longer than this + self.chunk_completed_sentence() + + if self.buffer_trimming_way == "segment": + s = self.buffer_trimming_sec # trim the completed segments longer than s, + else: + s = 30 # if the audio buffer is longer than 30s, trim it + + if len(self.audio_buffer) / self.SAMPLING_RATE > s: + self.chunk_completed_segment(res) + + # alternative: on any word + # l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10 + # let's find commited word that is less + # k = len(self.commited)-1 + # while k>0 and self.commited[k][1] > l: + # k -= 1 + # t = self.commited[k][1] + logger.debug("chunking segment") + # self.chunk_at(t) + + logger.debug( + f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}" + ) + return self.to_flush(o) + + def chunk_completed_sentence(self): + if self.commited == []: + return + logger.debug("COMPLETED SENTENCE: ", [s[2] for s in self.commited]) + sents = self.words_to_sentences(self.commited) + for s in sents: + logger.debug(f"\t\tSENT: {s}") + if len(sents) < 2: + return + while len(sents) > 2: + sents.pop(0) + # we will continue with audio processing at this timestamp + chunk_at = sents[-2][1] + + logger.debug(f"--- sentence chunked at {chunk_at:2.2f}") + self.chunk_at(chunk_at) + + def chunk_completed_segment(self, res): + if self.commited == []: + return + + ends = self.asr.segments_end_ts(res) + + t = self.commited[-1][1] + + if len(ends) > 1: + + e = ends[-2] + self.buffer_time_offset + while len(ends) > 2 and e > t: + ends.pop(-1) + e = ends[-2] + self.buffer_time_offset + if e <= t: + logger.debug(f"--- segment chunked at {e:2.2f}") + self.chunk_at(e) + else: + logger.debug(f"--- last segment not within commited area") + else: + logger.debug(f"--- not enough segments to chunk") + + def chunk_at(self, time): + """trims the hypothesis and audio buffer at "time" """ + self.transcript_buffer.pop_commited(time) + cut_seconds = time - self.buffer_time_offset + self.audio_buffer = self.audio_buffer[int(cut_seconds * self.SAMPLING_RATE) :] + self.buffer_time_offset = time + + def words_to_sentences(self, words): + """Uses self.tokenize for sentence segmentation of words. + Returns: [(beg,end,"sentence 1"),...] + """ + + cwords = [w for w in words] + t = " ".join(o[2] for o in cwords) + s = self.tokenize(t) + out = [] + while s: + beg = None + end = None + sent = s.pop(0).strip() + fsent = sent + while cwords: + b, e, w = cwords.pop(0) + w = w.strip() + if beg is None and sent.startswith(w): + beg = b + elif end is None and sent == w: + end = e + out.append((beg, end, fsent)) + break + sent = sent[len(w) :].strip() + return out + + def finish(self): + """Flush the incomplete text when the whole processing ends. + Returns: the same format as self.process_iter() + """ + o = self.transcript_buffer.complete() + f = self.to_flush(o) + logger.debug(f"last, noncommited: {f}") + self.buffer_time_offset += len(self.audio_buffer) / 16000 + return f + + def to_flush( + self, + sents, + sep=None, + offset=0, + ): + # concatenates the timestamped words or sentences into one sequence that is flushed in one line + # sents: [(beg1, end1, "sentence1"), ...] or [] if empty + # return: (beg1,end-of-last-sentence,"concatenation of sentences") or (None, None, "") if empty + if sep is None: + sep = self.asr.sep + t = sep.join(s[2] for s in sents) + if len(sents) == 0: + b = None + e = None + else: + b = offset + sents[0][0] + e = offset + sents[-1][1] + return (b, e, t) + + +class VACOnlineASRProcessor(OnlineASRProcessor): + """Wraps OnlineASRProcessor with VAC (Voice Activity Controller). + + It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds), + it runs VAD and continuously detects whether there is speech or not. + When it detects end of speech (non-voice for 500ms), it makes OnlineASRProcessor to end the utterance immediately. + """ + + def __init__(self, online_chunk_size, *a, **kw): + self.online_chunk_size = online_chunk_size + + self.online = OnlineASRProcessor(*a, **kw) + + # VAC: + import torch + + model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad") + from silero_vad_iterator import FixedVADIterator + + self.vac = FixedVADIterator( + model + ) # we use the default options there: 500ms silence, 100ms padding, etc. + + self.logfile = self.online.logfile + self.init() + + def init(self): + self.online.init() + self.vac.reset_states() + self.current_online_chunk_buffer_size = 0 + + self.is_currently_final = False + + self.status = None # or "voice" or "nonvoice" + self.audio_buffer = np.array([], dtype=np.float32) + self.buffer_offset = 0 # in frames + + def clear_buffer(self): + self.buffer_offset += len(self.audio_buffer) + self.audio_buffer = np.array([], dtype=np.float32) + + def insert_audio_chunk(self, audio): + res = self.vac(audio) + self.audio_buffer = np.append(self.audio_buffer, audio) + + if res is not None: + frame = list(res.values())[0] - self.buffer_offset + if "start" in res and "end" not in res: + self.status = "voice" + send_audio = self.audio_buffer[frame:] + self.online.init( + offset=(frame + self.buffer_offset) / self.SAMPLING_RATE + ) + self.online.insert_audio_chunk(send_audio) + self.current_online_chunk_buffer_size += len(send_audio) + self.clear_buffer() + elif "end" in res and "start" not in res: + self.status = "nonvoice" + send_audio = self.audio_buffer[:frame] + self.online.insert_audio_chunk(send_audio) + self.current_online_chunk_buffer_size += len(send_audio) + self.is_currently_final = True + self.clear_buffer() + else: + beg = res["start"] - self.buffer_offset + end = res["end"] - self.buffer_offset + self.status = "nonvoice" + send_audio = self.audio_buffer[beg:end] + self.online.init(offset=(beg + self.buffer_offset) / self.SAMPLING_RATE) + self.online.insert_audio_chunk(send_audio) + self.current_online_chunk_buffer_size += len(send_audio) + self.is_currently_final = True + self.clear_buffer() + else: + if self.status == "voice": + self.online.insert_audio_chunk(self.audio_buffer) + self.current_online_chunk_buffer_size += len(self.audio_buffer) + self.clear_buffer() + else: + # We keep 1 second because VAD may later find start of voice in it. + # But we trim it to prevent OOM. + self.buffer_offset += max( + 0, len(self.audio_buffer) - self.SAMPLING_RATE + ) + self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE :] + + def process_iter(self): + if self.is_currently_final: + return self.finish() + elif ( + self.current_online_chunk_buffer_size + > self.SAMPLING_RATE * self.online_chunk_size + ): + self.current_online_chunk_buffer_size = 0 + ret = self.online.process_iter() + return ret + else: + print("no online update, only VAD", self.status, file=self.logfile) + return (None, None, "") + + def finish(self): + ret = self.online.finish() + self.current_online_chunk_buffer_size = 0 + self.is_currently_final = False + return ret diff --git a/whisper_fastapi_online_server.py b/whisper_fastapi_online_server.py index b6ffd05..6dd09e3 100644 --- a/whisper_fastapi_online_server.py +++ b/whisper_fastapi_online_server.py @@ -43,7 +43,7 @@ args = parser.parse_args() asr, tokenizer = backend_factory(args) # Load demo HTML for the root endpoint -with open("src/live_transcription.html", "r", encoding="utf-8") as f: +with open("src/web/live_transcription.html", "r", encoding="utf-8") as f: html = f.read() diff --git a/whisper_online.py b/whisper_online.py index 7b438ab..077e660 100644 --- a/whisper_online.py +++ b/whisper_online.py @@ -5,10 +5,8 @@ import librosa from functools import lru_cache import time import logging - -import io -import soundfile as sf -import math +from src.whisper_streaming.backends import FasterWhisperASR, MLXWhisper, WhisperTimestampedASR, OpenaiApiASR +from src.whisper_streaming.online_asr import OnlineASRProcessor, VACOnlineASRProcessor logger = logging.getLogger(__name__) @@ -25,768 +23,6 @@ def load_audio_chunk(fname, beg, end): end_s = int(end * 16000) return audio[beg_s:end_s] - -# Whisper backend - - -class ASRBase: - - sep = " " # join transcribe words with this character (" " for whisper_timestamped, - # "" for faster-whisper because it emits the spaces when neeeded) - - def __init__( - self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr - ): - self.logfile = logfile - - self.transcribe_kargs = {} - if lan == "auto": - self.original_language = None - else: - self.original_language = lan - - self.model = self.load_model(modelsize, cache_dir, model_dir) - - def load_model(self, modelsize, cache_dir): - raise NotImplemented("must be implemented in the child class") - - def transcribe(self, audio, init_prompt=""): - raise NotImplemented("must be implemented in the child class") - - def use_vad(self): - raise NotImplemented("must be implemented in the child class") - - -class WhisperTimestampedASR(ASRBase): - """Uses whisper_timestamped library as the backend. Initially, we tested the code on this backend. It worked, but slower than faster-whisper. - On the other hand, the installation for GPU could be easier. - """ - - sep = " " - - def load_model(self, modelsize=None, cache_dir=None, model_dir=None): - import whisper - import whisper_timestamped - from whisper_timestamped import transcribe_timestamped - - self.transcribe_timestamped = transcribe_timestamped - if model_dir is not None: - logger.debug("ignoring model_dir, not implemented") - return whisper.load_model(modelsize, download_root=cache_dir) - - def transcribe(self, audio, init_prompt=""): - result = self.transcribe_timestamped( - self.model, - audio, - language=self.original_language, - initial_prompt=init_prompt, - verbose=None, - condition_on_previous_text=True, - **self.transcribe_kargs, - ) - return result - - def ts_words(self, r): - # return: transcribe result object to [(beg,end,"word1"), ...] - o = [] - for s in r["segments"]: - for w in s["words"]: - t = (w["start"], w["end"], w["text"]) - o.append(t) - return o - - def segments_end_ts(self, res): - return [s["end"] for s in res["segments"]] - - def use_vad(self): - self.transcribe_kargs["vad"] = True - - def set_translate_task(self): - self.transcribe_kargs["task"] = "translate" - - -class FasterWhisperASR(ASRBase): - """Uses faster-whisper library as the backend. Works much faster, appx 4-times (in offline mode). For GPU, it requires installation with a specific CUDNN version.""" - - sep = "" - - def load_model(self, modelsize=None, cache_dir=None, model_dir=None): - from faster_whisper import WhisperModel - - # logging.getLogger("faster_whisper").setLevel(logger.level) - if model_dir is not None: - logger.debug( - f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used." - ) - model_size_or_path = model_dir - elif modelsize is not None: - model_size_or_path = modelsize - else: - raise ValueError("modelsize or model_dir parameter must be set") - - # this worked fast and reliably on NVIDIA L40 - model = WhisperModel( - model_size_or_path, - device="cuda", - compute_type="float16", - download_root=cache_dir, - ) - - # or run on GPU with INT8 - # tested: the transcripts were different, probably worse than with FP16, and it was slightly (appx 20%) slower - # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16") - - # or run on CPU with INT8 - # tested: works, but slow, appx 10-times than cuda FP16 - # model = WhisperModel(modelsize, device="cpu", compute_type="int8") #, download_root="faster-disk-cache-dir/") - return model - - def transcribe(self, audio, init_prompt=""): - - # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01) - segments, info = self.model.transcribe( - audio, - language=self.original_language, - initial_prompt=init_prompt, - beam_size=5, - word_timestamps=True, - condition_on_previous_text=True, - **self.transcribe_kargs, - ) - # print(info) # info contains language detection result - - return list(segments) - - def ts_words(self, segments): - o = [] - for segment in segments: - for word in segment.words: - if segment.no_speech_prob > 0.9: - continue - # not stripping the spaces -- should not be merged with them! - w = word.word - t = (word.start, word.end, w) - o.append(t) - return o - - def segments_end_ts(self, res): - return [s.end for s in res] - - def use_vad(self): - self.transcribe_kargs["vad_filter"] = True - - def set_translate_task(self): - self.transcribe_kargs["task"] = "translate" - - -class MLXWhisper(ASRBase): - """ - Uses MPX Whisper library as the backend, optimized for Apple Silicon. - Models available: https://huggingface.co/collections/mlx-community/whisper-663256f9964fbb1177db93dc - Significantly faster than faster-whisper (without CUDA) on Apple M1. - """ - - sep = " " - - def load_model(self, modelsize=None, cache_dir=None, model_dir=None): - """ - Loads the MLX-compatible Whisper model. - - Args: - modelsize (str, optional): The size or name of the Whisper model to load. - If provided, it will be translated to an MLX-compatible model path using the `translate_model_name` method. - Example: "large-v3-turbo" -> "mlx-community/whisper-large-v3-turbo". - cache_dir (str, optional): Path to the directory for caching models. - **Note**: This is not supported by MLX Whisper and will be ignored. - model_dir (str, optional): Direct path to a custom model directory. - If specified, it overrides the `modelsize` parameter. - """ - from mlx_whisper.transcribe import ModelHolder, transcribe - import mlx.core as mx - - if model_dir is not None: - logger.debug( - f"Loading whisper model from model_dir {model_dir}. modelsize parameter is not used." - ) - model_size_or_path = model_dir - elif modelsize is not None: - model_size_or_path = self.translate_model_name(modelsize) - logger.debug( - f"Loading whisper model {modelsize}. You use mlx whisper, so {model_size_or_path} will be used." - ) - - self.model_size_or_path = model_size_or_path - - # In mlx_whisper.transcribe, dtype is defined as: - # dtype = mx.float16 if decode_options.get("fp16", True) else mx.float32 - # Since we do not use decode_options in self.transcribe, we will set dtype to mx.float16 - dtype = mx.float16 - ModelHolder.get_model(model_size_or_path, dtype) - return transcribe - - def translate_model_name(self, model_name): - """ - Translates a given model name to its corresponding MLX-compatible model path. - - Args: - model_name (str): The name of the model to translate. - - Returns: - str: The MLX-compatible model path. - """ - # Dictionary mapping model names to MLX-compatible paths - model_mapping = { - "tiny.en": "mlx-community/whisper-tiny.en-mlx", - "tiny": "mlx-community/whisper-tiny-mlx", - "base.en": "mlx-community/whisper-base.en-mlx", - "base": "mlx-community/whisper-base-mlx", - "small.en": "mlx-community/whisper-small.en-mlx", - "small": "mlx-community/whisper-small-mlx", - "medium.en": "mlx-community/whisper-medium.en-mlx", - "medium": "mlx-community/whisper-medium-mlx", - "large-v1": "mlx-community/whisper-large-v1-mlx", - "large-v2": "mlx-community/whisper-large-v2-mlx", - "large-v3": "mlx-community/whisper-large-v3-mlx", - "large-v3-turbo": "mlx-community/whisper-large-v3-turbo", - "large": "mlx-community/whisper-large-mlx", - } - - # Retrieve the corresponding MLX model path - mlx_model_path = model_mapping.get(model_name) - - if mlx_model_path: - return mlx_model_path - else: - raise ValueError( - f"Model name '{model_name}' is not recognized or not supported." - ) - - def transcribe(self, audio, init_prompt=""): - if self.transcribe_kargs: - logger.warning("Transcribe kwargs (vad, task) are not compatible with MLX Whisper and will be ignored.") - segments = self.model( - audio, - language=self.original_language, - initial_prompt=init_prompt, - word_timestamps=True, - condition_on_previous_text=True, - path_or_hf_repo=self.model_size_or_path, - ) - return segments.get("segments", []) - - def ts_words(self, segments): - """ - Extract timestamped words from transcription segments and skips words with high no-speech probability. - """ - return [ - (word["start"], word["end"], word["word"]) - for segment in segments - for word in segment.get("words", []) - if segment.get("no_speech_prob", 0) <= 0.9 - ] - - def segments_end_ts(self, res): - return [s["end"] for s in res] - - def use_vad(self): - self.transcribe_kargs["vad_filter"] = True - - def set_translate_task(self): - self.transcribe_kargs["task"] = "translate" - - -class OpenaiApiASR(ASRBase): - """Uses OpenAI's Whisper API for audio transcription.""" - - def __init__(self, lan=None, temperature=0, logfile=sys.stderr): - self.logfile = logfile - - self.modelname = "whisper-1" - self.original_language = ( - None if lan == "auto" else lan - ) # ISO-639-1 language code - self.response_format = "verbose_json" - self.temperature = temperature - - self.load_model() - - self.use_vad_opt = False - - # reset the task in set_translate_task - self.task = "transcribe" - - def load_model(self, *args, **kwargs): - from openai import OpenAI - - self.client = OpenAI() - - self.transcribed_seconds = ( - 0 # for logging how many seconds were processed by API, to know the cost - ) - - def ts_words(self, segments): - no_speech_segments = [] - if self.use_vad_opt: - for segment in segments.segments: - # TODO: threshold can be set from outside - if segment["no_speech_prob"] > 0.8: - no_speech_segments.append( - (segment.get("start"), segment.get("end")) - ) - - o = [] - for word in segments.words: - start = word.start - end = word.end - if any(s[0] <= start <= s[1] for s in no_speech_segments): - # print("Skipping word", word.get("word"), "because it's in a no-speech segment") - continue - o.append((start, end, word.word)) - return o - - def segments_end_ts(self, res): - return [s.end for s in res.words] - - def transcribe(self, audio_data, prompt=None, *args, **kwargs): - # Write the audio data to a buffer - buffer = io.BytesIO() - buffer.name = "temp.wav" - sf.write(buffer, audio_data, samplerate=16000, format="WAV", subtype="PCM_16") - buffer.seek(0) # Reset buffer's position to the beginning - - self.transcribed_seconds += math.ceil( - len(audio_data) / 16000 - ) # it rounds up to the whole seconds - - params = { - "model": self.modelname, - "file": buffer, - "response_format": self.response_format, - "temperature": self.temperature, - "timestamp_granularities": ["word", "segment"], - } - if self.task != "translate" and self.original_language: - params["language"] = self.original_language - if prompt: - params["prompt"] = prompt - - if self.task == "translate": - proc = self.client.audio.translations - else: - proc = self.client.audio.transcriptions - - # Process transcription/translation - transcript = proc.create(**params) - logger.debug( - f"OpenAI API processed accumulated {self.transcribed_seconds} seconds" - ) - - return transcript - - def use_vad(self): - self.use_vad_opt = True - - def set_translate_task(self): - self.task = "translate" - - -class HypothesisBuffer: - - def __init__(self, logfile=sys.stderr): - self.commited_in_buffer = [] - self.buffer = [] - self.new = [] - - self.last_commited_time = 0 - self.last_commited_word = None - - self.logfile = logfile - - def insert(self, new, offset): - # compare self.commited_in_buffer and new. It inserts only the words in new that extend the commited_in_buffer, it means they are roughly behind last_commited_time and new in content - # the new tail is added to self.new - - new = [(a + offset, b + offset, t) for a, b, t in new] - self.new = [(a, b, t) for a, b, t in new if a > self.last_commited_time - 0.1] - - if len(self.new) >= 1: - a, b, t = self.new[0] - if abs(a - self.last_commited_time) < 1: - if self.commited_in_buffer: - # it's going to search for 1, 2, ..., 5 consecutive words (n-grams) that are identical in commited and new. If they are, they're dropped. - cn = len(self.commited_in_buffer) - nn = len(self.new) - for i in range(1, min(min(cn, nn), 5) + 1): # 5 is the maximum - c = " ".join( - [self.commited_in_buffer[-j][2] for j in range(1, i + 1)][ - ::-1 - ] - ) - tail = " ".join(self.new[j - 1][2] for j in range(1, i + 1)) - if c == tail: - words = [] - for j in range(i): - words.append(repr(self.new.pop(0))) - words_msg = " ".join(words) - logger.debug(f"removing last {i} words: {words_msg}") - break - - def flush(self): - # returns commited chunk = the longest common prefix of 2 last inserts. - - commit = [] - while self.new: - na, nb, nt = self.new[0] - - if len(self.buffer) == 0: - break - - if nt == self.buffer[0][2]: - commit.append((na, nb, nt)) - self.last_commited_word = nt - self.last_commited_time = nb - self.buffer.pop(0) - self.new.pop(0) - else: - break - self.buffer = self.new - self.new = [] - self.commited_in_buffer.extend(commit) - return commit - - def pop_commited(self, time): - while self.commited_in_buffer and self.commited_in_buffer[0][1] <= time: - self.commited_in_buffer.pop(0) - - def complete(self): - return self.buffer - - -class OnlineASRProcessor: - - SAMPLING_RATE = 16000 - - def __init__( - self, - asr, - tokenize_method=None, - buffer_trimming=("segment", 15), - logfile=sys.stderr, - ): - """asr: WhisperASR object - tokenize_method: sentence tokenizer function for the target language. Must be a callable and behaves like the one of MosesTokenizer. It can be None, if "segment" buffer trimming option is used, then tokenizer is not used at all. - ("segment", 15) - buffer_trimming: a pair of (option, seconds), where option is either "sentence" or "segment", and seconds is a number. Buffer is trimmed if it is longer than "seconds" threshold. Default is the most recommended option. - logfile: where to store the log. - """ - self.asr = asr - self.tokenize = tokenize_method - self.logfile = logfile - - self.init() - - self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming - - def init(self, offset=None): - """run this when starting or restarting processing""" - self.audio_buffer = np.array([], dtype=np.float32) - self.transcript_buffer = HypothesisBuffer(logfile=self.logfile) - self.buffer_time_offset = 0 - if offset is not None: - self.buffer_time_offset = offset - self.transcript_buffer.last_commited_time = self.buffer_time_offset - self.commited = [] - - def insert_audio_chunk(self, audio): - self.audio_buffer = np.append(self.audio_buffer, audio) - - def prompt(self): - """Returns a tuple: (prompt, context), where "prompt" is a 200-character suffix of commited text that is inside of the scrolled away part of audio buffer. - "context" is the commited text that is inside the audio buffer. It is transcribed again and skipped. It is returned only for debugging and logging reasons. - """ - k = max(0, len(self.commited) - 1) - while k > 0 and self.commited[k - 1][1] > self.buffer_time_offset: - k -= 1 - - p = self.commited[:k] - p = [t for _, _, t in p] - prompt = [] - l = 0 - while p and l < 200: # 200 characters prompt size - x = p.pop(-1) - l += len(x) + 1 - prompt.append(x) - non_prompt = self.commited[k:] - return self.asr.sep.join(prompt[::-1]), self.asr.sep.join( - t for _, _, t in non_prompt - ) - - def process_iter(self): - """Runs on the current audio buffer. - Returns: a tuple (beg_timestamp, end_timestamp, "text"), or (None, None, ""). - The non-emty text is confirmed (committed) partial transcript. - """ - - prompt, non_prompt = self.prompt() - logger.debug(f"PROMPT: {prompt}") - logger.debug(f"CONTEXT: {non_prompt}") - logger.debug( - f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}" - ) - res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt) - - # transform to [(beg,end,"word1"), ...] - tsw = self.asr.ts_words(res) - - self.transcript_buffer.insert(tsw, self.buffer_time_offset) - o = self.transcript_buffer.flush() - self.commited.extend(o) - completed = self.to_flush(o) - logger.debug(f">>>>COMPLETE NOW: {completed[2]}") - the_rest = self.to_flush(self.transcript_buffer.complete()) - logger.debug(f"INCOMPLETE: {the_rest[2]}") - - # there is a newly confirmed text - - if o and self.buffer_trimming_way == "sentence": # trim the completed sentences - if ( - len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec - ): # longer than this - self.chunk_completed_sentence() - - if self.buffer_trimming_way == "segment": - s = self.buffer_trimming_sec # trim the completed segments longer than s, - else: - s = 30 # if the audio buffer is longer than 30s, trim it - - if len(self.audio_buffer) / self.SAMPLING_RATE > s: - self.chunk_completed_segment(res) - - # alternative: on any word - # l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10 - # let's find commited word that is less - # k = len(self.commited)-1 - # while k>0 and self.commited[k][1] > l: - # k -= 1 - # t = self.commited[k][1] - logger.debug("chunking segment") - # self.chunk_at(t) - - logger.debug( - f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}" - ) - return self.to_flush(o) - - def chunk_completed_sentence(self): - if self.commited == []: - return - logger.debug("COMPLETED SENTENCE: ", [s[2] for s in self.commited]) - sents = self.words_to_sentences(self.commited) - for s in sents: - logger.debug(f"\t\tSENT: {s}") - if len(sents) < 2: - return - while len(sents) > 2: - sents.pop(0) - # we will continue with audio processing at this timestamp - chunk_at = sents[-2][1] - - logger.debug(f"--- sentence chunked at {chunk_at:2.2f}") - self.chunk_at(chunk_at) - - def chunk_completed_segment(self, res): - if self.commited == []: - return - - ends = self.asr.segments_end_ts(res) - - t = self.commited[-1][1] - - if len(ends) > 1: - - e = ends[-2] + self.buffer_time_offset - while len(ends) > 2 and e > t: - ends.pop(-1) - e = ends[-2] + self.buffer_time_offset - if e <= t: - logger.debug(f"--- segment chunked at {e:2.2f}") - self.chunk_at(e) - else: - logger.debug(f"--- last segment not within commited area") - else: - logger.debug(f"--- not enough segments to chunk") - - def chunk_at(self, time): - """trims the hypothesis and audio buffer at "time" """ - self.transcript_buffer.pop_commited(time) - cut_seconds = time - self.buffer_time_offset - self.audio_buffer = self.audio_buffer[int(cut_seconds * self.SAMPLING_RATE) :] - self.buffer_time_offset = time - - def words_to_sentences(self, words): - """Uses self.tokenize for sentence segmentation of words. - Returns: [(beg,end,"sentence 1"),...] - """ - - cwords = [w for w in words] - t = " ".join(o[2] for o in cwords) - s = self.tokenize(t) - out = [] - while s: - beg = None - end = None - sent = s.pop(0).strip() - fsent = sent - while cwords: - b, e, w = cwords.pop(0) - w = w.strip() - if beg is None and sent.startswith(w): - beg = b - elif end is None and sent == w: - end = e - out.append((beg, end, fsent)) - break - sent = sent[len(w) :].strip() - return out - - def finish(self): - """Flush the incomplete text when the whole processing ends. - Returns: the same format as self.process_iter() - """ - o = self.transcript_buffer.complete() - f = self.to_flush(o) - logger.debug(f"last, noncommited: {f}") - self.buffer_time_offset += len(self.audio_buffer) / 16000 - return f - - def to_flush( - self, - sents, - sep=None, - offset=0, - ): - # concatenates the timestamped words or sentences into one sequence that is flushed in one line - # sents: [(beg1, end1, "sentence1"), ...] or [] if empty - # return: (beg1,end-of-last-sentence,"concatenation of sentences") or (None, None, "") if empty - if sep is None: - sep = self.asr.sep - t = sep.join(s[2] for s in sents) - if len(sents) == 0: - b = None - e = None - else: - b = offset + sents[0][0] - e = offset + sents[-1][1] - return (b, e, t) - - -class VACOnlineASRProcessor(OnlineASRProcessor): - """Wraps OnlineASRProcessor with VAC (Voice Activity Controller). - - It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds), - it runs VAD and continuously detects whether there is speech or not. - When it detects end of speech (non-voice for 500ms), it makes OnlineASRProcessor to end the utterance immediately. - """ - - def __init__(self, online_chunk_size, *a, **kw): - self.online_chunk_size = online_chunk_size - - self.online = OnlineASRProcessor(*a, **kw) - - # VAC: - import torch - - model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad") - from silero_vad_iterator import FixedVADIterator - - self.vac = FixedVADIterator( - model - ) # we use the default options there: 500ms silence, 100ms padding, etc. - - self.logfile = self.online.logfile - self.init() - - def init(self): - self.online.init() - self.vac.reset_states() - self.current_online_chunk_buffer_size = 0 - - self.is_currently_final = False - - self.status = None # or "voice" or "nonvoice" - self.audio_buffer = np.array([], dtype=np.float32) - self.buffer_offset = 0 # in frames - - def clear_buffer(self): - self.buffer_offset += len(self.audio_buffer) - self.audio_buffer = np.array([], dtype=np.float32) - - def insert_audio_chunk(self, audio): - res = self.vac(audio) - self.audio_buffer = np.append(self.audio_buffer, audio) - - if res is not None: - frame = list(res.values())[0] - self.buffer_offset - if "start" in res and "end" not in res: - self.status = "voice" - send_audio = self.audio_buffer[frame:] - self.online.init( - offset=(frame + self.buffer_offset) / self.SAMPLING_RATE - ) - self.online.insert_audio_chunk(send_audio) - self.current_online_chunk_buffer_size += len(send_audio) - self.clear_buffer() - elif "end" in res and "start" not in res: - self.status = "nonvoice" - send_audio = self.audio_buffer[:frame] - self.online.insert_audio_chunk(send_audio) - self.current_online_chunk_buffer_size += len(send_audio) - self.is_currently_final = True - self.clear_buffer() - else: - beg = res["start"] - self.buffer_offset - end = res["end"] - self.buffer_offset - self.status = "nonvoice" - send_audio = self.audio_buffer[beg:end] - self.online.init(offset=(beg + self.buffer_offset) / self.SAMPLING_RATE) - self.online.insert_audio_chunk(send_audio) - self.current_online_chunk_buffer_size += len(send_audio) - self.is_currently_final = True - self.clear_buffer() - else: - if self.status == "voice": - self.online.insert_audio_chunk(self.audio_buffer) - self.current_online_chunk_buffer_size += len(self.audio_buffer) - self.clear_buffer() - else: - # We keep 1 second because VAD may later find start of voice in it. - # But we trim it to prevent OOM. - self.buffer_offset += max( - 0, len(self.audio_buffer) - self.SAMPLING_RATE - ) - self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE :] - - def process_iter(self): - if self.is_currently_final: - return self.finish() - elif ( - self.current_online_chunk_buffer_size - > self.SAMPLING_RATE * self.online_chunk_size - ): - self.current_online_chunk_buffer_size = 0 - ret = self.online.process_iter() - return ret - else: - print("no online update, only VAD", self.status, file=self.logfile) - return (None, None, "") - - def finish(self): - ret = self.online.finish() - self.current_online_chunk_buffer_size = 0 - self.is_currently_final = False - return ret - - WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split( "," ) @@ -852,7 +88,7 @@ def add_shared_args(parser): parser.add_argument( "--model", type=str, - default="tiny", + default="large-v3-turbo", choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo".split( "," ), @@ -887,14 +123,14 @@ def add_shared_args(parser): parser.add_argument( "--backend", type=str, - default="mlx-whisper", + default="faster-whisper", choices=["faster-whisper", "whisper_timestamped", "mlx-whisper", "openai-api"], help="Load only this backend for Whisper processing.", ) parser.add_argument( "--vac", action="store_true", - default=True, + default=False, help="Use VAC = voice activity controller. Recommended. Requires torch.", ) parser.add_argument( @@ -903,7 +139,7 @@ def add_shared_args(parser): parser.add_argument( "--vad", action="store_true", - default=True, + default=False, help="Use VAD = voice activity detection, with the default parameters.", ) parser.add_argument(