task to direct-english-translation

2026-05-01 19:37:53 +00:00 · 2025-11-10 13:20:26 +01:00
parent 5491dbd824
commit 16461052ed
6 changed files with 13 additions and 30 deletions
--- a/README.md
+++ b/README.md
@@ -141,8 +141,7 @@ async def websocket_endpoint(websocket: WebSocket):
 | `--model` | Whisper model size. List and recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/available_models.md) | `small` |
 | `--model-path` | .pt file/directory containing whisper model. Overrides `--model`. Recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/models_compatible_formats.md) | `None` |
 | `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/simul_whisper/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` |
-| `--target-language` | If sets, translate to using NLLB. Ex: `fr`. [200 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/supported_languages.md). If you want to translate to english, you should rather use `--task translate`, since Whisper can do it directly. | `None` |
+| `--target-language` | If set, translates using [NLLW](https://github.com/QuentinFuxa/NoLanguageLeftWaiting). [200 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/supported_languages.md). If you want to translate to English, you can also use `--direct-english-translation`, in which case the STT model will try to output the translation directly. | `None` |
 | `--task` | Set to `translate` to translate *only* to english, using Whisper translation. | `transcribe` |
 | `--diarization` | Enable speaker identification | `False` |
 | `--backend` | Processing backend. You can switch to `faster-whisper` if  `simulstreaming` does not work correctly | `simulstreaming` |
 | `--no-vac` | Disable Voice Activity Controller | `False` |
--- a/whisperlivekit/core.py
+++ b/whisperlivekit/core.py
@@ -52,7 +52,7 @@ class TranscriptionEngine:
            "model_cache_dir": None,
            "model_dir": None,
            "lan": "auto",
-            "task": "transcribe",
+            "direct_english_translation": False,
        }
        transcription_common_params = update_with_kwargs(transcription_common_params, kwargs)                                            
--- a/whisperlivekit/local_agreement/backends.py
+++ b/whisperlivekit/local_agreement/backends.py
@@ -80,10 +80,6 @@ class WhisperTimestampedASR(ASRBase):
    def use_vad(self):
        self.transcribe_kargs["vad"] = True
-    def set_translate_task(self):
-        self.transcribe_kargs["task"] = "translate"
 class FasterWhisperASR(ASRBase):
    """Uses faster-whisper as the backend."""
    sep = ""
@@ -139,10 +135,6 @@ class FasterWhisperASR(ASRBase):
    def use_vad(self):
        self.transcribe_kargs["vad_filter"] = True
-    def set_translate_task(self):
-        self.transcribe_kargs["task"] = "translate"
 class MLXWhisper(ASRBase):
    """
    Uses MLX Whisper optimized for Apple Silicon.
@@ -218,10 +210,6 @@ class MLXWhisper(ASRBase):
    def use_vad(self):
        self.transcribe_kargs["vad_filter"] = True
-    def set_translate_task(self):
-        self.transcribe_kargs["task"] = "translate"
 class OpenaiApiASR(ASRBase):
    """Uses OpenAI's Whisper API for transcription."""
    def __init__(self, lan=None, temperature=0, logfile=sys.stderr):
@@ -232,7 +220,7 @@ class OpenaiApiASR(ASRBase):
        self.temperature = temperature
        self.load_model()
        self.use_vad_opt = False
-        self.task = "transcribe"
+        self.direct_english_translation = False
    def load_model(self, *args, **kwargs):
        from openai import OpenAI
@@ -274,7 +262,7 @@ class OpenaiApiASR(ASRBase):
            "temperature": self.temperature,
            "timestamp_granularities": ["word", "segment"],
        }
-        if self.task != "translate" and self.original_language:
+        if not self.direct_english_translation and self.original_language:
            params["language"] = self.original_language
        if prompt:
            params["prompt"] = prompt
@@ -284,7 +272,4 @@ class OpenaiApiASR(ASRBase):
        return transcript
    def use_vad(self):
-        self.use_vad_opt = True
+        self.use_vad_opt = True
-    def set_translate_task(self):
-        self.task = "translate"
--- a/whisperlivekit/local_agreement/whisper_online.py
+++ b/whisperlivekit/local_agreement/whisper_online.py
@@ -70,7 +70,7 @@ def backend_factory(
            model_size,
            model_cache_dir,
            model_dir,
-            task,
+            direct_english_translation,
            buffer_trimming,
            buffer_trimming_sec,
            confidence_validation,
@@ -102,7 +102,7 @@ def backend_factory(
        e = time.time()
        logger.info(f"done. It took {round(e-t,2)} seconds.")
-    if task == "translate":
+    if direct_english_translation:
        tgt_language = "en"  # Whisper translates into English
    else:
        tgt_language = lan  # Whisper transcribes in this language
--- a/whisperlivekit/parse_args.py
+++ b/whisperlivekit/parse_args.py
@@ -114,11 +114,10 @@ def parse_args():
        help="Source language code, e.g. en,de,cs, or 'auto' for language detection.",
    )
    parser.add_argument(
-        "--task",
+        "--direct-english-translation",
-        type=str,
+        action="store_true",
-        default="transcribe",
+        default=False,
-        choices=["transcribe", "translate"],
+        help="Use Whisper to directly translate to english.",
-        help="Transcribe or translate.",
    )
    parser.add_argument(
--- a/whisperlivekit/simul_whisper/backend.py
+++ b/whisperlivekit/simul_whisper/backend.py
@@ -211,7 +211,7 @@ class SimulStreamingASR():
                cif_ckpt_path=self.cif_ckpt_path,
                decoder_type="beam",
                beam_size=self.beams,
-                task=self.task,
+                task=self.direct_english_translation,
                never_fire=self.never_fire,
                init_prompt=self.init_prompt,
                max_context_tokens=self.max_context_tokens,
@@ -219,7 +219,7 @@ class SimulStreamingASR():
        )  
        # Set up tokenizer for translation if needed
-        if self.task == "translate":
+        if self.direct_english_translation:
            self.tokenizer = self.set_translate_task()
        else:
            self.tokenizer = None