task to direct-english-translation

This commit is contained in:
Quentin Fuxa
2025-11-10 13:20:26 +01:00
parent 5491dbd824
commit 16461052ed
6 changed files with 13 additions and 30 deletions

View File

@@ -141,8 +141,7 @@ async def websocket_endpoint(websocket: WebSocket):
| `--model` | Whisper model size. List and recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/available_models.md) | `small` | | `--model` | Whisper model size. List and recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/available_models.md) | `small` |
| `--model-path` | .pt file/directory containing whisper model. Overrides `--model`. Recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/models_compatible_formats.md) | `None` | | `--model-path` | .pt file/directory containing whisper model. Overrides `--model`. Recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/models_compatible_formats.md) | `None` |
| `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/simul_whisper/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` | | `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/simul_whisper/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` |
| `--target-language` | If sets, translate to using NLLB. Ex: `fr`. [200 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/supported_languages.md). If you want to translate to english, you should rather use `--task translate`, since Whisper can do it directly. | `None` | | `--target-language` | If sets, translates using [NLLW](https://github.com/QuentinFuxa/NoLanguageLeftWaiting). [200 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/supported_languages.md). If you want to translate to english, you can also use `--direct-english-translation`. The STT model will try to directly output the translation. | `None` |
| `--task` | Set to `translate` to translate *only* to english, using Whisper translation. | `transcribe` |
| `--diarization` | Enable speaker identification | `False` | | `--diarization` | Enable speaker identification | `False` |
| `--backend` | Processing backend. You can switch to `faster-whisper` if `simulstreaming` does not work correctly | `simulstreaming` | | `--backend` | Processing backend. You can switch to `faster-whisper` if `simulstreaming` does not work correctly | `simulstreaming` |
| `--no-vac` | Disable Voice Activity Controller | `False` | | `--no-vac` | Disable Voice Activity Controller | `False` |

View File

@@ -52,7 +52,7 @@ class TranscriptionEngine:
"model_cache_dir": None, "model_cache_dir": None,
"model_dir": None, "model_dir": None,
"lan": "auto", "lan": "auto",
"task": "transcribe", "direct_english_translation": False,
} }
transcription_common_params = update_with_kwargs(transcription_common_params, kwargs) transcription_common_params = update_with_kwargs(transcription_common_params, kwargs)

View File

@@ -80,10 +80,6 @@ class WhisperTimestampedASR(ASRBase):
def use_vad(self): def use_vad(self):
self.transcribe_kargs["vad"] = True self.transcribe_kargs["vad"] = True
def set_translate_task(self):
self.transcribe_kargs["task"] = "translate"
class FasterWhisperASR(ASRBase): class FasterWhisperASR(ASRBase):
"""Uses faster-whisper as the backend.""" """Uses faster-whisper as the backend."""
sep = "" sep = ""
@@ -139,10 +135,6 @@ class FasterWhisperASR(ASRBase):
def use_vad(self): def use_vad(self):
self.transcribe_kargs["vad_filter"] = True self.transcribe_kargs["vad_filter"] = True
def set_translate_task(self):
self.transcribe_kargs["task"] = "translate"
class MLXWhisper(ASRBase): class MLXWhisper(ASRBase):
""" """
Uses MLX Whisper optimized for Apple Silicon. Uses MLX Whisper optimized for Apple Silicon.
@@ -218,10 +210,6 @@ class MLXWhisper(ASRBase):
def use_vad(self): def use_vad(self):
self.transcribe_kargs["vad_filter"] = True self.transcribe_kargs["vad_filter"] = True
def set_translate_task(self):
self.transcribe_kargs["task"] = "translate"
class OpenaiApiASR(ASRBase): class OpenaiApiASR(ASRBase):
"""Uses OpenAI's Whisper API for transcription.""" """Uses OpenAI's Whisper API for transcription."""
def __init__(self, lan=None, temperature=0, logfile=sys.stderr): def __init__(self, lan=None, temperature=0, logfile=sys.stderr):
@@ -232,7 +220,7 @@ class OpenaiApiASR(ASRBase):
self.temperature = temperature self.temperature = temperature
self.load_model() self.load_model()
self.use_vad_opt = False self.use_vad_opt = False
self.task = "transcribe" self.direct_english_translation = False
def load_model(self, *args, **kwargs): def load_model(self, *args, **kwargs):
from openai import OpenAI from openai import OpenAI
@@ -274,7 +262,7 @@ class OpenaiApiASR(ASRBase):
"temperature": self.temperature, "temperature": self.temperature,
"timestamp_granularities": ["word", "segment"], "timestamp_granularities": ["word", "segment"],
} }
if self.task != "translate" and self.original_language: if not self.direct_english_translation and self.original_language:
params["language"] = self.original_language params["language"] = self.original_language
if prompt: if prompt:
params["prompt"] = prompt params["prompt"] = prompt
@@ -284,7 +272,4 @@ class OpenaiApiASR(ASRBase):
return transcript return transcript
def use_vad(self): def use_vad(self):
self.use_vad_opt = True self.use_vad_opt = True
def set_translate_task(self):
self.task = "translate"

View File

@@ -70,7 +70,7 @@ def backend_factory(
model_size, model_size,
model_cache_dir, model_cache_dir,
model_dir, model_dir,
task, direct_english_translation,
buffer_trimming, buffer_trimming,
buffer_trimming_sec, buffer_trimming_sec,
confidence_validation, confidence_validation,
@@ -102,7 +102,7 @@ def backend_factory(
e = time.time() e = time.time()
logger.info(f"done. It took {round(e-t,2)} seconds.") logger.info(f"done. It took {round(e-t,2)} seconds.")
if task == "translate": if direct_english_translation:
tgt_language = "en" # Whisper translates into English tgt_language = "en" # Whisper translates into English
else: else:
tgt_language = lan # Whisper transcribes in this language tgt_language = lan # Whisper transcribes in this language

View File

@@ -114,11 +114,10 @@ def parse_args():
help="Source language code, e.g. en,de,cs, or 'auto' for language detection.", help="Source language code, e.g. en,de,cs, or 'auto' for language detection.",
) )
parser.add_argument( parser.add_argument(
"--task", "--direct-english-translation",
type=str, action="store_true",
default="transcribe", default=False,
choices=["transcribe", "translate"], help="Use Whisper to directly translate to english.",
help="Transcribe or translate.",
) )
parser.add_argument( parser.add_argument(

View File

@@ -211,7 +211,7 @@ class SimulStreamingASR():
cif_ckpt_path=self.cif_ckpt_path, cif_ckpt_path=self.cif_ckpt_path,
decoder_type="beam", decoder_type="beam",
beam_size=self.beams, beam_size=self.beams,
task=self.task, task=self.direct_english_translation,
never_fire=self.never_fire, never_fire=self.never_fire,
init_prompt=self.init_prompt, init_prompt=self.init_prompt,
max_context_tokens=self.max_context_tokens, max_context_tokens=self.max_context_tokens,
@@ -219,7 +219,7 @@ class SimulStreamingASR():
) )
# Set up tokenizer for translation if needed # Set up tokenizer for translation if needed
if self.task == "translate": if self.direct_english_translation:
self.tokenizer = self.set_translate_task() self.tokenizer = self.set_translate_task()
else: else:
self.tokenizer = None self.tokenizer = None