From 9b2c3ee844ea3b1da97fb7912e764e1a200ee1a9 Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Sun, 22 Feb 2026 23:27:57 +0100 Subject: [PATCH] docs: update README with voxtral backend, benchmarks, testing sections - Add Voxtral Backend section explaining voxtral-mlx and voxtral (HF). - Add Testing & Benchmarks section with commands to run tests/benchmarks. - Update --backend parameter docs to include voxtral-mlx and voxtral. - Update optional dependencies table with Voxtral entry. - Link to BENCHMARK.md for detailed performance comparisons. --- README.md | 55 +++++++++++++++++++++++++++++++++--- whisperlivekit/core.py | 10 ++++++- whisperlivekit/parse_args.py | 4 +-- 3 files changed, 62 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 948f015..d434a25 100644 --- a/README.md +++ b/README.md @@ -75,15 +75,35 @@ Go to `chrome-extension` for instructions. |-----------|-------------| | **Windows/Linux optimizations** | `faster-whisper` | | **Apple Silicon optimizations** | `mlx-whisper` | +| **Voxtral (multilingual, auto-detect)** | `transformers torch` (or use built-in `voxtral-mlx` on Apple Silicon) | | **Translation** | `nllw` | | **Speaker diarization** | `git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]` | | OpenAI API | `openai` | | *[Not recommanded]* Speaker diarization with Diart | `diart` | -See **Parameters & Configuration** below on how to use them. +See **Parameters & Configuration** below on how to use them. +See **[BENCHMARK.md](BENCHMARK.md)** for detailed performance comparisons across all backends. +### Voxtral Backend + +WhisperLiveKit supports [Voxtral Mini](https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602), +a 4B-parameter speech model from Mistral AI that natively handles 100+ languages with automatic +language detection. Unlike Whisper-based backends, Voxtral does not require specifying `--language`. 
+ +```bash +# Apple Silicon (native MLX, recommended) +wlk --backend voxtral-mlx + +# Linux/GPU (HuggingFace transformers) +pip install transformers torch +wlk --backend voxtral +``` + +Voxtral uses its own streaming policy and does not use LocalAgreement or SimulStreaming. +See [BENCHMARK.md](BENCHMARK.md) for performance numbers. + ### Usage Examples **Command-line Interface**: Start the transcription server with various options: @@ -92,8 +112,11 @@ See **Parameters & Configuration** below on how to use them. # Large model and translate from french to danish wlk --model large-v3 --language fr --target-language da -# Diarization and server listening on */80 +# Diarization and server listening on */80 wlk --host 0.0.0.0 --port 80 --model medium --diarization --language fr + +# Voxtral multilingual (auto-detects language) +wlk --backend voxtral-mlx ``` @@ -151,7 +174,7 @@ async def websocket_endpoint(websocket: WebSocket): | `--target-language` | If sets, translates using [NLLW](https://github.com/QuentinFuxa/NoLanguageLeftWaiting). [200 languages available](docs/supported_languages.md). If you want to translate to english, you can also use `--direct-english-translation`. The STT model will try to directly output the translation. | `None` | | `--diarization` | Enable speaker identification | `False` | | `--backend-policy` | Streaming strategy: `1`/`simulstreaming` uses AlignAtt SimulStreaming, `2`/`localagreement` uses the LocalAgreement policy | `simulstreaming` | -| `--backend` | Whisper implementation selector. `auto` picks MLX on macOS (if installed), otherwise Faster-Whisper, otherwise vanilla Whisper. You can also force `mlx-whisper`, `faster-whisper`, `whisper`, or `openai-api` (LocalAgreement only) | `auto` | +| `--backend` | ASR backend selector. `auto` picks MLX on macOS (if installed), otherwise Faster-Whisper, otherwise vanilla Whisper. 
Options: `mlx-whisper`, `faster-whisper`, `whisper`, `openai-api` (LocalAgreement only), `voxtral-mlx` (Apple Silicon), `voxtral` (HuggingFace) | `auto` | | `--no-vac` | Disable Voice Activity Controller. NOT ADVISED | `False` | | `--no-vad` | Disable Voice Activity Detection. NOT ADVISED | `False` | | `--warmup-file` | Audio file path for model warmup | `jfk.wav` | @@ -271,5 +294,29 @@ docker run --gpus all -p 8000:8000 --name wlk wlk --model large-v3 --language fr - `HF_PRECACHE_DIR="./.cache/"` - Pre-load a model cache for faster first-time start - `HF_TKN_FILE="./token"` - Add your Hugging Face Hub access token to download gated models -## 🔮 Use Cases +## Testing & Benchmarks + +WhisperLiveKit includes a unit test suite and an offline benchmark harness. + +```bash +# Install test dependencies +pip install -e ".[test]" + +# Run unit tests (no model download required) +pytest tests/ -v + +# Benchmark a single backend +python test_backend_offline.py --backend faster-whisper --no-realtime + +# Benchmark all installed backends +python test_backend_offline.py --benchmark --no-realtime + +# Export benchmark results as JSON +python test_backend_offline.py --benchmark --no-realtime --json results.json +``` + +See [BENCHMARK.md](BENCHMARK.md) for a full comparison of backends, policies, WER, speed, and +timestamp accuracy on Apple Silicon. + +## Use Cases Capture discussions in real-time for meeting transcription, help hearing-impaired users follow conversations through accessibility tools, transcribe podcasts or videos automatically for content creation, transcribe support calls with speaker identification for customer service... 
diff --git a/whisperlivekit/core.py b/whisperlivekit/core.py index 7cf1041..c306f52 100644 --- a/whisperlivekit/core.py +++ b/whisperlivekit/core.py @@ -92,7 +92,12 @@ class TranscriptionEngine: } if config.transcription: - if config.backend == "voxtral": + if config.backend == "voxtral-mlx": + from whisperlivekit.voxtral_mlx_asr import VoxtralMLXASR + self.tokenizer = None + self.asr = VoxtralMLXASR(**transcription_common_params) + logger.info("Using Voxtral MLX native backend") + elif config.backend == "voxtral": from whisperlivekit.voxtral_hf_streaming import VoxtralHFStreamingASR self.tokenizer = None self.asr = VoxtralHFStreamingASR(**transcription_common_params) @@ -169,6 +174,9 @@ class TranscriptionEngine: def online_factory(args, asr): + if getattr(args, 'backend', None) == "voxtral-mlx": + from whisperlivekit.voxtral_mlx_asr import VoxtralMLXOnlineProcessor + return VoxtralMLXOnlineProcessor(asr) if getattr(args, 'backend', None) == "voxtral": from whisperlivekit.voxtral_hf_streaming import VoxtralHFStreamingOnlineProcessor return VoxtralHFStreamingOnlineProcessor(asr) diff --git a/whisperlivekit/parse_args.py b/whisperlivekit/parse_args.py index d89aaca..94518d7 100644 --- a/whisperlivekit/parse_args.py +++ b/whisperlivekit/parse_args.py @@ -147,8 +147,8 @@ def parse_args(): "--backend", type=str, default="auto", - choices=["auto", "mlx-whisper", "faster-whisper", "whisper", "openai-api", "voxtral"], - help="Select the ASR backend implementation (auto: prefer MLX on macOS, otherwise Faster-Whisper, else Whisper). Use 'voxtral' for Voxtral streaming via HuggingFace Transformers (CUDA/CPU/MPS).", + choices=["auto", "mlx-whisper", "faster-whisper", "whisper", "openai-api", "voxtral", "voxtral-mlx"], + help="Select the ASR backend implementation (auto: prefer MLX on macOS, otherwise Faster-Whisper, else Whisper). Use 'voxtral' for HF Transformers Voxtral (CUDA/CPU/MPS). 
Use 'voxtral-mlx' for native MLX Voxtral on Apple Silicon.", ) parser.add_argument( "--no-vac",