From 16461052ed6baeac7fd8bb7c766ca24cab9a22ce Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Mon, 10 Nov 2025 13:20:26 +0100 Subject: [PATCH] task to direct-english-translation --- README.md | 3 +-- whisperlivekit/core.py | 2 +- whisperlivekit/local_agreement/backends.py | 21 +++---------------- .../local_agreement/whisper_online.py | 4 ++-- whisperlivekit/parse_args.py | 9 ++++---- whisperlivekit/simul_whisper/backend.py | 4 ++-- 6 files changed, 13 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index f570eb7..00dede3 100644 --- a/README.md +++ b/README.md @@ -141,8 +141,7 @@ async def websocket_endpoint(websocket: WebSocket): | `--model` | Whisper model size. List and recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/available_models.md) | `small` | | `--model-path` | .pt file/directory containing whisper model. Overrides `--model`. Recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/models_compatible_formats.md) | `None` | | `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/simul_whisper/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` | -| `--target-language` | If sets, translate to using NLLB. Ex: `fr`. [200 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/supported_languages.md). If you want to translate to english, you should rather use `--task translate`, since Whisper can do it directly. | `None` | -| `--task` | Set to `translate` to translate *only* to english, using Whisper translation. | `transcribe` | +| `--target-language` | If set, translates using [NLLW](https://github.com/QuentinFuxa/NoLanguageLeftWaiting). [200 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/supported_languages.md). 
If you want to translate to English, you can also use `--direct-english-translation`. The STT model will try to directly output the translation. | `None` | | `--diarization` | Enable speaker identification | `False` | | `--backend` | Processing backend. You can switch to `faster-whisper` if `simulstreaming` does not work correctly | `simulstreaming` | | `--no-vac` | Disable Voice Activity Controller | `False` | diff --git a/whisperlivekit/core.py b/whisperlivekit/core.py index 3cdbbf1..31df810 100644 --- a/whisperlivekit/core.py +++ b/whisperlivekit/core.py @@ -52,7 +52,7 @@ class TranscriptionEngine: "model_cache_dir": None, "model_dir": None, "lan": "auto", - "task": "transcribe", + "direct_english_translation": False, } transcription_common_params = update_with_kwargs(transcription_common_params, kwargs) diff --git a/whisperlivekit/local_agreement/backends.py b/whisperlivekit/local_agreement/backends.py index 39c04ec..4fce26d 100644 --- a/whisperlivekit/local_agreement/backends.py +++ b/whisperlivekit/local_agreement/backends.py @@ -80,10 +80,6 @@ class WhisperTimestampedASR(ASRBase): def use_vad(self): self.transcribe_kargs["vad"] = True - def set_translate_task(self): - self.transcribe_kargs["task"] = "translate" - - class FasterWhisperASR(ASRBase): """Uses faster-whisper as the backend.""" sep = "" @@ -139,10 +135,6 @@ class FasterWhisperASR(ASRBase): def use_vad(self): self.transcribe_kargs["vad_filter"] = True - def set_translate_task(self): - self.transcribe_kargs["task"] = "translate" - - class MLXWhisper(ASRBase): """ Uses MLX Whisper optimized for Apple Silicon. 
@@ -218,10 +210,6 @@ class MLXWhisper(ASRBase): def use_vad(self): self.transcribe_kargs["vad_filter"] = True - def set_translate_task(self): - self.transcribe_kargs["task"] = "translate" - - class OpenaiApiASR(ASRBase): """Uses OpenAI's Whisper API for transcription.""" def __init__(self, lan=None, temperature=0, logfile=sys.stderr): @@ -232,7 +220,7 @@ class OpenaiApiASR(ASRBase): self.temperature = temperature self.load_model() self.use_vad_opt = False - self.task = "transcribe" + self.direct_english_translation = False def load_model(self, *args, **kwargs): from openai import OpenAI @@ -274,7 +262,7 @@ class OpenaiApiASR(ASRBase): "temperature": self.temperature, "timestamp_granularities": ["word", "segment"], } - if self.task != "translate" and self.original_language: + if not self.direct_english_translation and self.original_language: params["language"] = self.original_language if prompt: params["prompt"] = prompt @@ -284,7 +272,4 @@ class OpenaiApiASR(ASRBase): return transcript def use_vad(self): - self.use_vad_opt = True - - def set_translate_task(self): - self.task = "translate" \ No newline at end of file + self.use_vad_opt = True \ No newline at end of file diff --git a/whisperlivekit/local_agreement/whisper_online.py b/whisperlivekit/local_agreement/whisper_online.py index 6fae3ab..aac85b5 100644 --- a/whisperlivekit/local_agreement/whisper_online.py +++ b/whisperlivekit/local_agreement/whisper_online.py @@ -70,7 +70,7 @@ def backend_factory( model_size, model_cache_dir, model_dir, - task, + direct_english_translation, buffer_trimming, buffer_trimming_sec, confidence_validation, @@ -102,7 +102,7 @@ def backend_factory( e = time.time() logger.info(f"done. 
It took {round(e-t,2)} seconds.") - if task == "translate": + if direct_english_translation: tgt_language = "en" # Whisper translates into English else: tgt_language = lan # Whisper transcribes in this language diff --git a/whisperlivekit/parse_args.py b/whisperlivekit/parse_args.py index e7db271..cbaea22 100644 --- a/whisperlivekit/parse_args.py +++ b/whisperlivekit/parse_args.py @@ -114,11 +114,10 @@ def parse_args(): help="Source language code, e.g. en,de,cs, or 'auto' for language detection.", ) parser.add_argument( - "--task", - type=str, - default="transcribe", - choices=["transcribe", "translate"], - help="Transcribe or translate.", + "--direct-english-translation", + action="store_true", + default=False, + help="Use Whisper to directly translate to english.", ) parser.add_argument( diff --git a/whisperlivekit/simul_whisper/backend.py b/whisperlivekit/simul_whisper/backend.py index 0e4a4f2..dbfbd1f 100644 --- a/whisperlivekit/simul_whisper/backend.py +++ b/whisperlivekit/simul_whisper/backend.py @@ -211,7 +211,7 @@ class SimulStreamingASR(): cif_ckpt_path=self.cif_ckpt_path, decoder_type="beam", beam_size=self.beams, - task=self.task, + task=self.direct_english_translation, never_fire=self.never_fire, init_prompt=self.init_prompt, max_context_tokens=self.max_context_tokens, @@ -219,7 +219,7 @@ class SimulStreamingASR(): ) # Set up tokenizer for translation if needed - if self.task == "translate": + if self.direct_english_translation: self.tokenizer = self.set_translate_task() else: self.tokenizer = None