task to direct-english-translation

This commit is contained in:
Quentin Fuxa
2025-11-10 13:20:26 +01:00
parent 5491dbd824
commit 16461052ed
6 changed files with 13 additions and 30 deletions

View File

@@ -141,8 +141,7 @@ async def websocket_endpoint(websocket: WebSocket):
| `--model` | Whisper model size. List and recommendations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/available_models.md) | `small` |
| `--model-path` | .pt file/directory containing whisper model. Overrides `--model`. Recommendations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/models_compatible_formats.md) | `None` |
| `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/simul_whisper/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` |
| `--target-language` | If set, translates using NLLB. Ex: `fr`. [200 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/supported_languages.md). If you want to translate to English, you should instead use `--task translate`, since Whisper can do it directly. | `None` |
| `--task` | Set to `translate` to translate *only* to English, using Whisper translation. | `transcribe` |
| `--target-language` | If set, translates using [NLLW](https://github.com/QuentinFuxa/NoLanguageLeftWaiting). [200 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/supported_languages.md). If you want to translate to English, you can also use `--direct-english-translation`. The STT model will try to directly output the translation. | `None` |
| `--diarization` | Enable speaker identification | `False` |
| `--backend` | Processing backend. You can switch to `faster-whisper` if `simulstreaming` does not work correctly | `simulstreaming` |
| `--no-vac` | Disable Voice Activity Controller | `False` |

View File

@@ -52,7 +52,7 @@ class TranscriptionEngine:
"model_cache_dir": None,
"model_dir": None,
"lan": "auto",
"task": "transcribe",
"direct_english_translation": False,
}
transcription_common_params = update_with_kwargs(transcription_common_params, kwargs)

View File

@@ -80,10 +80,6 @@ class WhisperTimestampedASR(ASRBase):
def use_vad(self):
self.transcribe_kargs["vad"] = True
def set_translate_task(self):
self.transcribe_kargs["task"] = "translate"
class FasterWhisperASR(ASRBase):
"""Uses faster-whisper as the backend."""
sep = ""
@@ -139,10 +135,6 @@ class FasterWhisperASR(ASRBase):
def use_vad(self):
self.transcribe_kargs["vad_filter"] = True
def set_translate_task(self):
self.transcribe_kargs["task"] = "translate"
class MLXWhisper(ASRBase):
"""
Uses MLX Whisper optimized for Apple Silicon.
@@ -218,10 +210,6 @@ class MLXWhisper(ASRBase):
def use_vad(self):
self.transcribe_kargs["vad_filter"] = True
def set_translate_task(self):
self.transcribe_kargs["task"] = "translate"
class OpenaiApiASR(ASRBase):
"""Uses OpenAI's Whisper API for transcription."""
def __init__(self, lan=None, temperature=0, logfile=sys.stderr):
@@ -232,7 +220,7 @@ class OpenaiApiASR(ASRBase):
self.temperature = temperature
self.load_model()
self.use_vad_opt = False
self.task = "transcribe"
self.direct_english_translation = False
def load_model(self, *args, **kwargs):
from openai import OpenAI
@@ -274,7 +262,7 @@ class OpenaiApiASR(ASRBase):
"temperature": self.temperature,
"timestamp_granularities": ["word", "segment"],
}
if self.task != "translate" and self.original_language:
if not self.direct_english_translation and self.original_language:
params["language"] = self.original_language
if prompt:
params["prompt"] = prompt
@@ -284,7 +272,4 @@ class OpenaiApiASR(ASRBase):
return transcript
def use_vad(self):
self.use_vad_opt = True
def set_translate_task(self):
self.task = "translate"
self.use_vad_opt = True

View File

@@ -70,7 +70,7 @@ def backend_factory(
model_size,
model_cache_dir,
model_dir,
task,
direct_english_translation,
buffer_trimming,
buffer_trimming_sec,
confidence_validation,
@@ -102,7 +102,7 @@ def backend_factory(
e = time.time()
logger.info(f"done. It took {round(e-t,2)} seconds.")
if task == "translate":
if direct_english_translation:
tgt_language = "en" # Whisper translates into English
else:
tgt_language = lan # Whisper transcribes in this language

View File

@@ -114,11 +114,10 @@ def parse_args():
help="Source language code, e.g. en,de,cs, or 'auto' for language detection.",
)
parser.add_argument(
"--task",
type=str,
default="transcribe",
choices=["transcribe", "translate"],
help="Transcribe or translate.",
"--direct-english-translation",
action="store_true",
default=False,
help="Use Whisper to directly translate to english.",
)
parser.add_argument(

View File

@@ -211,7 +211,7 @@ class SimulStreamingASR():
cif_ckpt_path=self.cif_ckpt_path,
decoder_type="beam",
beam_size=self.beams,
task=self.task,
task=self.direct_english_translation,
never_fire=self.never_fire,
init_prompt=self.init_prompt,
max_context_tokens=self.max_context_tokens,
@@ -219,7 +219,7 @@ class SimulStreamingASR():
)
# Set up tokenizer for translation if needed
if self.task == "translate":
if self.direct_english_translation:
self.tokenizer = self.set_translate_task()
else:
self.tokenizer = None