mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-03-21 16:40:35 +00:00
1681 lines
62 KiB
Python
1681 lines
62 KiB
Python
"""CLI entry point for WhisperLiveKit.
|
|
|
|
Provides subcommands:
|
|
wlk serve — Start the transcription server (default when no args)
|
|
wlk listen — Live microphone transcription
|
|
wlk run — Auto-pull model and start server
|
|
wlk transcribe — Transcribe audio files offline
|
|
wlk bench — Benchmark speed and accuracy on standard test audio
|
|
wlk models — List available and installed backends/models
|
|
wlk pull — Download a model for offline use
|
|
wlk rm — Delete downloaded models
|
|
wlk check — Verify system dependencies (ffmpeg, etc.)
|
|
wlk diagnose — Run pipeline diagnostics on audio file
|
|
"""
|
|
|
|
import importlib.util
|
|
import logging
|
|
import platform
|
|
import sys
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Backend detection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _module_available(name: str) -> bool:
|
|
return importlib.util.find_spec(name) is not None
|
|
|
|
|
|
def _gpu_info() -> str:
    """Summarise the compute accelerators detected on this machine.

    PyTorch, when importable, is asked about CUDA and MPS. On macOS/arm64
    the presence of the ``mlx`` module is reported as well.

    Returns:
        Comma-separated accelerator labels, or ``"CPU only"`` if none of the
        probes succeeded.
    """
    detected = []

    try:
        import torch

        if torch.cuda.is_available():
            device_name = torch.cuda.get_device_name(0)
            detected.append(f"CUDA ({device_name})")

        # Older torch builds have no `backends.mps` attribute at all.
        has_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        if has_mps:
            detected.append("MPS (Apple Silicon)")
    except ImportError:
        # PyTorch is optional; without it only the MLX probe below applies.
        pass

    on_apple_silicon = platform.system() == "Darwin" and platform.machine() == "arm64"
    if on_apple_silicon and _module_available("mlx"):
        detected.append("MLX")

    if not detected:
        return "CPU only"
    return ", ".join(detected)
|
|
|
|
|
|
# Registry of ASR backends known to the CLI. Each entry carries:
#   id          – backend identifier used on the command line
#   name        – human-readable label
#   module      – importable module whose presence marks the backend installed
#   install     – pip command shown when the backend is missing
#   description – one-line summary
#   policy      – streaming policy name (only set for some Whisper backends)
#   platform    – optional platform requirement ("darwin-arm64")
#   streaming   – "chunk", "native" or "cloud"
#   devices     – device kinds the backend can run on
BACKENDS = [
    {
        "id": "faster-whisper",
        "name": "Faster Whisper",
        "module": "faster_whisper",
        "install": "pip install faster-whisper",
        "description": "CTranslate2-based Whisper (fast, CPU/CUDA)",
        "policy": "localagreement",
        "streaming": "chunk",  # batch inference with LocalAgreement/SimulStreaming
        "devices": ["cpu", "cuda"],
    },
    {
        "id": "whisper",
        "name": "OpenAI Whisper",
        "module": "whisper",
        "install": "pip install openai-whisper",
        "description": "Original OpenAI Whisper (PyTorch)",
        "policy": "simulstreaming",
        "streaming": "chunk",
        "devices": ["cpu", "cuda"],
    },
    {
        "id": "mlx-whisper",
        "name": "MLX Whisper",
        "module": "mlx_whisper",
        "install": "pip install mlx-whisper",
        "description": "Apple Silicon native Whisper (MLX)",
        "policy": "localagreement",
        "platform": "darwin-arm64",
        "streaming": "chunk",
        "devices": ["mlx"],
    },
    {
        "id": "voxtral-mlx",
        "name": "Voxtral MLX",
        "module": "mlx",
        "install": "pip install whisperlivekit[voxtral-mlx]",
        "description": "Mistral Voxtral Mini on Apple Silicon (MLX, native streaming)",
        "platform": "darwin-arm64",
        "streaming": "native",  # truly streaming (token-by-token)
        "devices": ["mlx"],
    },
    {
        "id": "voxtral",
        "name": "Voxtral HF",
        "module": "transformers",
        "install": "pip install whisperlivekit[voxtral-hf]",
        "description": "Mistral Voxtral Mini (HF Transformers, native streaming)",
        "streaming": "native",
        "devices": ["cuda", "mps", "cpu"],
    },
    {
        "id": "qwen3",
        "name": "Qwen3 ASR",
        "module": "qwen_asr",
        "install": "pip install qwen-asr",
        "description": "Qwen3-ASR with ForcedAligner timestamps",
        "streaming": "chunk",
        "devices": ["cuda", "mps", "cpu"],
    },
    {
        "id": "qwen3-mlx",
        "name": "Qwen3 MLX",
        "module": "mlx_qwen3_asr",
        "install": "pip install mlx-qwen3-asr",
        "description": "Qwen3-ASR on Apple Silicon (MLX, native streaming)",
        "platform": "darwin-arm64",
        "streaming": "native",
        "devices": ["mlx"],
    },
    {
        "id": "openai-api",
        "name": "OpenAI API",
        "module": "openai",
        "install": "pip install openai",
        "description": "Cloud-based transcription via OpenAI API",
        "streaming": "cloud",
        "devices": ["cloud"],
    },
]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Model catalog — maps "wlk pull <name>" to download actions
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Whisper model sizes available across backends
|
|
WHISPER_SIZES = [
    # English-only variants carry the ".en" suffix.
    "tiny",
    "tiny.en",
    "base",
    "base.en",
    "small",
    "small.en",
    "medium",
    "medium.en",
    "large-v1",
    "large-v2",
    "large-v3",
    "large-v3-turbo",
]
|
|
|
|
# Faster-Whisper uses Systran HuggingFace repos
|
|
FASTER_WHISPER_REPOS = {
    # size -> HuggingFace repo id
    "tiny":      "Systran/faster-whisper-tiny",
    "tiny.en":   "Systran/faster-whisper-tiny.en",
    "base":      "Systran/faster-whisper-base",
    "base.en":   "Systran/faster-whisper-base.en",
    "small":     "Systran/faster-whisper-small",
    "small.en":  "Systran/faster-whisper-small.en",
    "medium":    "Systran/faster-whisper-medium",
    "medium.en": "Systran/faster-whisper-medium.en",
    "large-v1":  "Systran/faster-whisper-large-v1",
    "large-v2":  "Systran/faster-whisper-large-v2",
    "large-v3":  "Systran/faster-whisper-large-v3",
    # NOTE(review): this entry points at a "distil" repo rather than a
    # "turbo" one — confirm it matches what the faster-whisper backend
    # actually loads for "large-v3-turbo".
    "large-v3-turbo": "Systran/faster-distil-whisper-large-v3",
}
|
|
|
|
# MLX Whisper repos from model_mapping.py
|
|
MLX_WHISPER_REPOS = {
    # size -> HuggingFace repo id (English-only variant listed first)
    "tiny.en":   "mlx-community/whisper-tiny.en-mlx",
    "tiny":      "mlx-community/whisper-tiny-mlx",
    "base.en":   "mlx-community/whisper-base.en-mlx",
    "base":      "mlx-community/whisper-base-mlx",
    "small.en":  "mlx-community/whisper-small.en-mlx",
    "small":     "mlx-community/whisper-small-mlx",
    "medium.en": "mlx-community/whisper-medium.en-mlx",
    "medium":    "mlx-community/whisper-medium-mlx",
    "large-v1":  "mlx-community/whisper-large-v1-mlx",
    "large-v2":  "mlx-community/whisper-large-v2-mlx",
    "large-v3":  "mlx-community/whisper-large-v3-mlx",
    # The turbo repo name has no "-mlx" suffix.
    "large-v3-turbo": "mlx-community/whisper-large-v3-turbo",
    "large":     "mlx-community/whisper-large-mlx",
}
|
|
|
|
# Voxtral/Qwen3 model repos
|
|
# Voxtral ships as a single model per backend.
VOXTRAL_HF_REPO = "mistralai/Voxtral-Mini-4B-Realtime-2602"
VOXTRAL_MLX_REPO = "mlx-community/Voxtral-Mini-4B-Realtime-6bit"

# Qwen3-ASR checkpoints keyed by size tag (the part after "qwen3:").
QWEN3_REPOS = {
    "1.7b": "Qwen/Qwen3-ASR-1.7B",
    "0.6b": "Qwen/Qwen3-ASR-0.6B",
}
# Forced-aligner companion model, pulled alongside the "qwen3" backend.
QWEN3_ALIGNER_REPO = "Qwen/Qwen3-ForcedAligner-0.6B"
|
|
|
|
# Model catalog: metadata for display in `wlk models`
|
|
# params = approximate parameter count, disk = approximate download size
|
|
# Column names for the rows below; every catalog entry is a dict with
# exactly these keys.
_CATALOG_FIELDS = ("name", "family", "params", "disk", "languages", "quality", "speed")

MODEL_CATALOG = [
    dict(zip(_CATALOG_FIELDS, row))
    for row in (
        # Whisper family (available across faster-whisper, mlx-whisper, whisper backends)
        ("tiny",           "whisper",   "39M",  "75 MB",  99, "low",   "fastest"),
        ("tiny.en",        "whisper",   "39M",  "75 MB",  1,  "low",   "fastest"),
        ("base",           "whisper",   "74M",  "142 MB", 99, "fair",  "fast"),
        ("base.en",        "whisper",   "74M",  "142 MB", 1,  "fair",  "fast"),
        ("small",          "whisper",   "244M", "466 MB", 99, "good",  "medium"),
        ("small.en",       "whisper",   "244M", "466 MB", 1,  "good",  "medium"),
        ("medium",         "whisper",   "769M", "1.5 GB", 99, "great", "slow"),
        ("medium.en",      "whisper",   "769M", "1.5 GB", 1,  "great", "slow"),
        ("large-v3",       "whisper",   "1.5B", "3.1 GB", 99, "best",  "slowest"),
        ("large-v3-turbo", "whisper",   "809M", "1.6 GB", 99, "great", "medium"),
        # Voxtral (native streaming, single model)
        ("voxtral",        "voxtral",   "4B",   "8.2 GB", 15, "great", "medium"),
        ("voxtral-mlx",    "voxtral",   "4B",   "2.7 GB", 15, "great", "medium"),
        # Qwen3 ASR
        ("qwen3:1.7b",     "qwen3",     "1.7B", "3.6 GB", 12, "good",  "fast"),
        ("qwen3:0.6b",     "qwen3",     "0.6B", "1.4 GB", 12, "fair",  "fastest"),
        # Qwen3 MLX (native streaming on Apple Silicon)
        ("qwen3-mlx:1.7b", "qwen3-mlx", "1.7B", "1.8 GB", 12, "good",  "fast"),
        ("qwen3-mlx:0.6b", "qwen3-mlx", "0.6B", "0.7 GB", 12, "fair",  "fastest"),
    )
]
|
|
|
|
|
|
def _check_platform(backend: dict) -> bool:
|
|
"""Check if backend is compatible with current platform."""
|
|
req = backend.get("platform")
|
|
if req is None:
|
|
return True
|
|
if req == "darwin-arm64":
|
|
return platform.system() == "Darwin" and platform.machine() == "arm64"
|
|
return True
|
|
|
|
|
|
def _is_installed(backend: dict) -> bool:
    """Return True when the module named by ``backend["module"]`` can be found."""
    module_name = backend["module"]
    return _module_available(module_name)
|
|
|
|
|
|
def _check_ffmpeg() -> bool:
|
|
"""Check if ffmpeg is available."""
|
|
import shutil
|
|
return shutil.which("ffmpeg") is not None
|
|
|
|
|
|
def _scan_downloaded_models() -> dict:
|
|
"""Scan HuggingFace and Whisper caches to find downloaded models.
|
|
|
|
Returns:
|
|
dict mapping repo_id → cached path (or True if found).
|
|
"""
|
|
found = {}
|
|
|
|
# 1. Scan HuggingFace hub cache
|
|
try:
|
|
from huggingface_hub import scan_cache_dir
|
|
cache_info = scan_cache_dir()
|
|
for repo in cache_info.repos:
|
|
found[repo.repo_id] = str(repo.repo_path)
|
|
except Exception:
|
|
pass
|
|
|
|
# 2. Scan native Whisper cache (~/.cache/whisper)
|
|
import os
|
|
whisper_cache = os.path.join(os.getenv("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")), "whisper")
|
|
if os.path.isdir(whisper_cache):
|
|
for f in os.listdir(whisper_cache):
|
|
if f.endswith(".pt"):
|
|
# e.g. "base.pt" or "large-v3.pt"
|
|
size = f.rsplit(".", 1)[0]
|
|
found[f"openai/whisper-{size}"] = os.path.join(whisper_cache, f)
|
|
|
|
return found
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Startup banner
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def print_banner(config, host: str, port: int, ssl: bool = False):
    """Print a clean startup banner with server info to stderr.

    Args:
        config: Object read via ``getattr`` for ``backend``, ``model_size``
            and ``lan``; missing attributes fall back to ``"auto"``,
            ``"base"`` and ``"auto"`` respectively.
        host: Address the server binds to. The wildcard addresses
            ``0.0.0.0`` and ``::`` are displayed as ``localhost``.
        port: TCP port the server listens on.
        ssl: When True the URLs use ``https``/``wss`` instead of ``http``/``ws``.
    """
    scheme = "https" if ssl else "http"
    ws_scheme = "wss" if ssl else "ws"

    # Resolve display host
    display_host = "localhost" if host in ("0.0.0.0", "::") else host
    # An IPv6 literal must be bracketed inside a URL, otherwise its colons
    # are indistinguishable from the port separator.
    if ":" in display_host and not display_host.startswith("["):
        display_host = f"[{display_host}]"

    base_url = f"{scheme}://{display_host}:{port}"
    ws_url = f"{ws_scheme}://{display_host}:{port}"

    backend = getattr(config, "backend", "auto")
    model = getattr(config, "model_size", "base")
    language = getattr(config, "lan", "auto")

    # Resolve actual backend name
    backend_label = "auto (will resolve on first request)" if backend == "auto" else backend

    lines = [
        "",
        " WhisperLiveKit",
        f" Backend: {backend_label} | Model: {model} | Language: {language}",
        f" Accelerator: {_gpu_info()}",
        "",
        f" Web UI: {base_url}/",
        f" WebSocket: {ws_url}/asr",
        f" Deepgram: {ws_url}/v1/listen",
        f" REST API: {base_url}/v1/audio/transcriptions",
        f" Models: {base_url}/v1/models",
        f" Health: {base_url}/health",
        "",
    ]
    print("\n".join(lines), file=sys.stderr)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# `wlk models` subcommand
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _model_is_downloaded(model_entry: dict, downloaded: dict) -> bool:
    """Tell whether a catalog entry is already present in a cache scan.

    Args:
        model_entry: Catalog entry; its ``"name"`` and ``"family"`` are read.
        downloaded: Mapping keyed by repo id, as produced by the cache scan.

    Returns:
        True if any repo associated with the entry is a key of *downloaded*.
    """
    name = model_entry["name"]
    family = model_entry["family"]

    if family == "whisper":
        # A Whisper size counts as pulled if any backend's copy is cached.
        candidates = (
            FASTER_WHISPER_REPOS.get(name),
            MLX_WHISPER_REPOS.get(name),
            f"openai/whisper-{name}",
        )
        return any(repo in downloaded for repo in candidates if repo)

    if name == "voxtral":
        return VOXTRAL_HF_REPO in downloaded
    if name == "voxtral-mlx":
        return VOXTRAL_MLX_REPO in downloaded

    if family in ("qwen3", "qwen3-mlx"):
        # Both Qwen3 families are looked up in the same repo table; the size
        # tag follows the colon and defaults to "1.7b".
        size = name.split(":")[1] if ":" in name else "1.7b"
        return QWEN3_REPOS.get(size, "") in downloaded

    return False
|
|
|
|
|
|
def _best_backend_for_model(model_entry: dict) -> str:
|
|
"""Suggest the best available backend for a model."""
|
|
family = model_entry["family"]
|
|
is_apple = platform.system() == "Darwin" and platform.machine() == "arm64"
|
|
|
|
if family == "voxtral":
|
|
if "mlx" in model_entry["name"]:
|
|
return "voxtral-mlx"
|
|
return "voxtral"
|
|
elif family == "qwen3":
|
|
return "qwen3"
|
|
elif family == "qwen3-mlx":
|
|
return "qwen3-mlx"
|
|
elif family == "whisper":
|
|
if is_apple and _module_available("mlx_whisper"):
|
|
return "mlx-whisper"
|
|
if _module_available("faster_whisper"):
|
|
return "faster-whisper"
|
|
if _module_available("whisper"):
|
|
return "whisper"
|
|
# Suggest best installable
|
|
return "mlx-whisper" if is_apple else "faster-whisper"
|
|
return "auto"
|
|
|
|
|
|
def cmd_models():
    """List available models and backends (ollama-style).

    Prints four sections to stdout: the backend registry with an
    installed/compatible marker, system information (platform, accelerator,
    ffmpeg), the model catalog with a pulled/available status, and a few
    quick-start commands.
    """
    is_apple_silicon = platform.system() == "Darwin" and platform.machine() == "arm64"
    downloaded = _scan_downloaded_models()

    # --- Installed backends ---
    print("\n Backends:\n")

    stream_labels = {"native": "streaming", "chunk": "chunked", "cloud": "cloud"}
    max_name = max(len(b["name"]) for b in BACKENDS)
    for b in BACKENDS:
        compatible = _check_platform(b)
        installed = _is_installed(b)
        streaming = b.get("streaming", "chunk")
        stream_label = stream_labels.get(streaming, streaming)

        if installed:
            status = "\033[32m+\033[0m"  # green: installed
        elif not compatible:
            status = "\033[90m-\033[0m"  # grey: not usable on this platform
        else:
            status = "\033[33m-\033[0m"  # yellow: installable

        name_pad = b["name"].ljust(max_name)
        desc_short = b["description"]
        print(f" {status} {name_pad} {desc_short} [{stream_label}]")

        if not installed and compatible:
            # Install hint, aligned under the description column.
            print(f" {''.ljust(max_name)} \033[90m{b['install']}\033[0m")

    # --- System info ---
    print(f"\n Platform: {platform.system()} {platform.machine()}")
    print(f" Accelerator: {_gpu_info()}")
    # Built outside the f-string: a backslash inside a replacement field is a
    # SyntaxError before Python 3.12.
    ffmpeg_status = "found" if _check_ffmpeg() else "\033[31mNOT FOUND\033[0m (required)"
    print(f" ffmpeg: {ffmpeg_status}")

    # --- Model catalog ---
    print("\n Models:\n")

    # Table header
    hdr = f" {'NAME':<20} {'PARAMS':>7} {'SIZE':>8} {'QUALITY':<8} {'SPEED':<8} {'LANGS':>5} {'STATUS':<10}"
    print(hdr)
    print(f" {'─' * 20} {'─' * 7} {'─' * 8} {'─' * 8} {'─' * 8} {'─' * 5} {'─' * 10}")

    for m in MODEL_CATALOG:
        name = m["name"]
        # Skip platform-incompatible models
        if name == "voxtral-mlx" and not is_apple_silicon:
            continue
        if m["family"] == "qwen3-mlx" and not is_apple_silicon:
            continue

        if _model_is_downloaded(m, downloaded):
            status = "\033[32mpulled\033[0m "
        else:
            status = "\033[90mavailable\033[0m "

        langs = str(m["languages"]) if m["languages"] < 99 else "99+"

        print(
            f" {name:<20} {m['params']:>7} {m['disk']:>8} "
            f"{m['quality']:<8} {m['speed']:<8} {langs:>5} {status}"
        )

    # --- Quick start ---
    print("\n Quick start:\n")
    if is_apple_silicon:
        print(" wlk run voxtral-mlx # Best streaming on Apple Silicon")
        print(" wlk run large-v3-turbo # Best quality/speed balance")
    else:
        print(" wlk run large-v3-turbo # Best quality/speed balance")
        print(" wlk run voxtral # Native streaming (CUDA/CPU)")
    print(" wlk pull base # Download smallest multilingual model")
    print(" wlk transcribe audio.mp3 # Offline transcription")
    print()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# `wlk pull` subcommand
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _hf_download(repo_id: str, label: str):
    """Fetch a HuggingFace model repo into the local cache.

    Args:
        repo_id: HuggingFace repo id passed to ``snapshot_download``.
        label: Human-readable name used in the progress message.

    Returns:
        The value returned by ``snapshot_download`` (printed as the save
        location).
    """
    import huggingface_hub

    print(f" Downloading {label} ({repo_id})...")
    local_path = huggingface_hub.snapshot_download(repo_id)
    print(f" Saved to: {local_path}")
    return local_path
|
|
|
|
|
|
def _resolve_pull_target(spec: str):
    """Translate a pull spec into the list of repos to download.

    A spec is either ``<backend>:<size>`` (e.g. ``faster-whisper:large-v3``)
    or a bare model name (e.g. ``base``, ``voxtral``, ``qwen3-mlx``).

    Returns: list of (backend_id, repo_id, label) tuples to download. The
    list is empty when the spec cannot be resolved; in most of those cases
    an explanatory message is printed first.
    """
    # Split "backend:size"; a bare spec has no backend part.
    prefix, colon, remainder = spec.partition(":")
    if colon:
        backend_part, size_part = prefix, remainder
    else:
        backend_part, size_part = None, spec

    # Voxtral: a single repo per backend, no size to pick.
    if "voxtral" in (size_part, backend_part):
        return [("voxtral", VOXTRAL_HF_REPO, "Voxtral Mini (HF)")]
    if "voxtral-mlx" in (size_part, backend_part):
        return [("voxtral-mlx", VOXTRAL_MLX_REPO, "Voxtral Mini (MLX)")]

    # Qwen3: the MLX variant must be recognised before the generic prefix,
    # because "qwen3-mlx" also starts with "qwen3".
    wants_qwen_mlx = backend_part == "qwen3-mlx" or size_part.startswith("qwen3-mlx")
    wants_qwen = backend_part == "qwen3" or size_part.startswith("qwen3")
    if wants_qwen_mlx or wants_qwen:
        qwen_size = size_part.split(":")[-1] if colon else "1.7b"
        if qwen_size.startswith("qwen3"):
            qwen_size = "1.7b"  # default
        repo = QWEN3_REPOS.get(qwen_size)
        if not repo:
            print(f" Unknown Qwen3 size: {qwen_size}. Available: {', '.join(QWEN3_REPOS.keys())}")
            return []
        if wants_qwen_mlx:
            return [("qwen3-mlx", repo, f"Qwen3-ASR MLX {qwen_size}")]
        return [
            ("qwen3", repo, f"Qwen3-ASR {qwen_size}"),
            ("qwen3-aligner", QWEN3_ALIGNER_REPO, "Qwen3 ForcedAligner"),
        ]

    # Whisper family with an explicit backend prefix.
    if backend_part:
        if backend_part == "whisper":
            # OpenAI whisper downloads on first use; we can at least pull HF version
            return [("whisper", f"openai/whisper-{size_part}", f"Whisper {size_part}")]

        repo_tables = {
            "faster-whisper": (FASTER_WHISPER_REPOS, "Faster Whisper"),
            "mlx-whisper": (MLX_WHISPER_REPOS, "MLX Whisper"),
        }
        if backend_part not in repo_tables:
            print(f" Unknown backend: {backend_part}")
            return []

        table, title = repo_tables[backend_part]
        repo = table.get(size_part)
        if not repo:
            print(f" Unknown size: {size_part}. Available: {', '.join(table.keys())}")
            return []
        return [(backend_part, repo, f"{title} {size_part}")]

    # No backend specified — download for the best available backend.
    if size_part not in WHISPER_SIZES:
        print(f" Unknown model: {spec}")
        print(f" Available sizes: {', '.join(WHISPER_SIZES)}")
        print(" Other models: voxtral, voxtral-mlx, qwen3:1.7b, qwen3:0.6b, qwen3-mlx:1.7b, qwen3-mlx:0.6b")
        return []

    targets = []
    is_apple = platform.system() == "Darwin" and platform.machine() == "arm64"

    if is_apple and _module_available("mlx_whisper"):
        repo = MLX_WHISPER_REPOS.get(size_part)
        if repo:
            targets.append(("mlx-whisper", repo, f"MLX Whisper {size_part}"))

    if _module_available("faster_whisper"):
        repo = FASTER_WHISPER_REPOS.get(size_part)
        if repo:
            targets.append(("faster-whisper", repo, f"Faster Whisper {size_part}"))

    if not targets:
        # Fallback: no Whisper backend installed, pull the faster-whisper copy.
        repo = FASTER_WHISPER_REPOS.get(size_part)
        if repo:
            targets.append(("faster-whisper", repo, f"Faster Whisper {size_part}"))

    return targets
|
|
|
|
|
|
def cmd_pull(spec: str):
    """Download the model(s) named by *spec* for offline use.

    Args:
        spec: Pull spec such as ``base`` or ``faster-whisper:large-v3``.

    Returns:
        0 on success; 1 when the spec resolves to nothing or any download
        fails (remaining downloads are not attempted).
    """
    targets = _resolve_pull_target(spec)
    if not targets:
        return 1

    print(f"\n Pulling model: {spec}\n")

    for _backend_id, repo_id, label in targets:
        try:
            _hf_download(repo_id, label)
        except Exception as e:
            print(f" Failed to download {label}: {e}")
            return 1

    # Trailing "\n" plus print's own newline leaves one blank line after it.
    print("\n Done. Model ready for offline use.\n")
    return 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# `wlk transcribe` subcommand
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def cmd_transcribe(args: list):
    """Transcribe audio files offline through the full pipeline.

    Usage: wlk transcribe [options] <audio_file> [audio_file ...]

    Args:
        args: Command-line arguments following the ``transcribe`` subcommand.
    """
    import argparse
    import asyncio

    parser = argparse.ArgumentParser(
        prog="wlk transcribe",
        description="Transcribe audio files offline using WhisperLiveKit.",
    )
    add = parser.add_argument
    add("files", nargs="+", help="Audio files to transcribe")
    add("--backend", default="auto", help="ASR backend (default: auto)")
    add("--model", default="base", dest="model_size", help="Model size (default: base)")
    add("--language", "--lan", default="auto", dest="lan", help="Language code (default: auto)")
    add(
        "--format",
        default="text",
        choices=["text", "json", "srt", "vtt", "verbose_json"],
        help="Output format (default: text)",
    )
    add("--output", "-o", default=None, help="Output file (default: stdout)")
    add("--diarization", action="store_true", help="Enable speaker diarization")
    add("--verbose", "-v", action="store_true", help="Show detailed processing logs")

    parsed = parser.parse_args(args)

    # Unless --verbose is given, run through the wrapper that silences
    # logging before handing over to the real transcription coroutine.
    entry_point = _transcribe_files if parsed.verbose else _transcribe_files_quiet
    asyncio.run(entry_point(parsed))
|
|
|
|
|
|
async def _transcribe_files_quiet(parsed):
    """Silence warnings and logging, then run ``_transcribe_files``.

    Args:
        parsed: Parsed ``wlk transcribe`` arguments, passed through unchanged.
    """
    import warnings

    warnings.filterwarnings("ignore")

    quiet_level = logging.ERROR

    # Root logger and its handlers first …
    root = logging.root
    root.setLevel(quiet_level)
    for root_handler in root.handlers:
        root_handler.setLevel(quiet_level)

    # … then every logger registered so far (snapshot the names, since
    # getLogger may touch the registry while we iterate).
    for logger_name in list(logging.Logger.manager.loggerDict.keys()):
        logging.getLogger(logger_name).setLevel(quiet_level)

    await _transcribe_files(parsed)
|
|
|
|
|
|
async def _transcribe_files(parsed):
    """Run transcription on one or more audio files and emit the result.

    Each file is pushed through its own ``TestHarness`` session. The
    per-file outputs are joined with newlines and written to
    ``parsed.output`` (UTF-8) or printed to stdout. Progress messages go to
    stderr.

    Args:
        parsed: Parsed ``wlk transcribe`` arguments; reads ``files``,
            ``model_size``, ``lan``, ``backend``, ``diarization``,
            ``format`` and ``output``.
    """
    import json as json_module

    from whisperlivekit.test_harness import TestHarness, load_audio_pcm

    results = []

    for audio_path in parsed.files:
        print(f" Transcribing: {audio_path}", file=sys.stderr)

        kwargs = {
            "model_size": parsed.model_size,
            "lan": parsed.lan,
            "pcm_input": True,
        }
        # "auto" means: let the harness choose, so the key is omitted.
        if parsed.backend != "auto":
            kwargs["backend"] = parsed.backend
        if parsed.diarization:
            kwargs["diarization"] = True

        async with TestHarness(**kwargs) as h:
            await h.feed(audio_path, speed=0)
            await h.drain(5.0)
            result = await h.finish(timeout=120)

        text = result.committed_text or result.text

        if parsed.format == "text":
            results.append(text)
        elif parsed.format == "json":
            results.append(json_module.dumps({"text": text}))
        elif parsed.format == "verbose_json":
            # Only this format reports a duration, so the audio is loaded
            # here rather than for every file.
            # NOTE(review): the divisor assumes load_audio_pcm returns
            # 16 kHz 16-bit mono PCM bytes — confirm against test_harness.
            duration = len(load_audio_pcm(audio_path)) / (16000 * 2)
            results.append(json_module.dumps({
                "text": text,
                "duration": round(duration, 2),
                "language": parsed.lan,
                "segments": [
                    {
                        "text": line.get("text", ""),
                        "start": line.get("start", "0:00:00"),
                        "end": line.get("end", "0:00:00"),
                        "speaker": line.get("speaker", 0),
                    }
                    for line in result.lines
                    # Drop empty lines and those tagged with speaker -2.
                    if line.get("text") and line.get("speaker", 0) != -2
                ],
            }, indent=2))
        elif parsed.format in ("srt", "vtt"):
            results.append(_format_subtitle(result, parsed.format))

    # Output
    output_text = "\n".join(results)
    if parsed.output:
        # Explicit UTF-8: transcripts are multilingual and the platform
        # default encoding may not be able to represent them.
        with open(parsed.output, "w", encoding="utf-8") as f:
            f.write(output_text)
        print(f" Output written to: {parsed.output}", file=sys.stderr)
    else:
        print(output_text)
|
|
|
|
|
|
def _format_subtitle(result, fmt: str) -> str:
    """Render ``result.lines`` as SRT or WebVTT subtitle text.

    Lines without text, or tagged with speaker ``-2``, are left out. SRT
    cues are numbered from 1; VTT output starts with a ``WEBVTT`` header.

    Args:
        result: Object whose ``lines`` is an iterable of dicts with
            ``text``, ``start``, ``end`` and ``speaker`` keys.
        fmt: ``"srt"`` or ``"vtt"``.

    Returns:
        The subtitle document as a single string.
    """
    from whisperlivekit.test_harness import _parse_time

    is_srt = fmt == "srt"
    out = ["WEBVTT\n"] if fmt == "vtt" else []

    cue_number = 0
    for entry in result.lines:
        if entry.get("speaker") == -2 or not entry.get("text"):
            continue
        cue_number += 1

        start_s = _parse_time(entry.get("start", "0:00:00"))
        end_s = _parse_time(entry.get("end", "0:00:00"))
        start_ts = _subtitle_timestamp(start_s, fmt)
        end_ts = _subtitle_timestamp(end_s, fmt)

        if is_srt:
            out.append(str(cue_number))
        # Timing line, cue text, then the blank line that ends the cue.
        out.extend((f"{start_ts} --> {end_ts}", entry["text"], ""))

    return "\n".join(out)
|
|
|
|
|
|
def _subtitle_timestamp(seconds: float, fmt: str) -> str:
|
|
"""Format seconds as SRT or VTT timestamp."""
|
|
h = int(seconds // 3600)
|
|
m = int((seconds % 3600) // 60)
|
|
s = int(seconds % 60)
|
|
ms = int(round((seconds % 1) * 1000))
|
|
sep = "," if fmt == "srt" else "."
|
|
return f"{h:02d}:{m:02d}:{s:02d}{sep}{ms:03d}"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# `wlk bench` subcommand
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def cmd_bench(args: list):
    """Benchmark the transcription pipeline on public test audio.

    Samples come from LibriSpeech, Multilingual LibriSpeech, FLEURS and AMI
    and are downloaded on first run. Supports multilingual benchmarking
    across all available backends.

    Usage: wlk bench [options]

    Args:
        args: Command-line arguments following the ``bench`` subcommand.
    """
    import argparse
    import asyncio

    parser = argparse.ArgumentParser(
        prog="wlk bench",
        description="Benchmark WhisperLiveKit on public test audio.",
    )
    add = parser.add_argument
    add("--backend", default="auto", help="ASR backend (default: auto-detect)")
    add("--model", default="base", dest="model_size", help="Model size (default: base)")
    add(
        "--languages", "--lan", default=None,
        help="Comma-separated language codes, or 'all' (default: en)",
    )
    add(
        "--categories", default=None,
        help="Comma-separated categories: clean,noisy,multilingual,meeting",
    )
    add("--quick", action="store_true", help="Quick mode: small subset for smoke tests")
    add("--json", default=None, dest="json_out", help="Export full report to JSON file")
    add(
        "--transcriptions", action="store_true",
        help="Show hypothesis vs reference for each sample",
    )
    add("--verbose", "-v", action="store_true", help="Show detailed logs")

    parsed = parser.parse_args(args)

    # Language selection: omitted -> English only; "all" (or an empty
    # value) -> None, i.e. no language filter; otherwise the listed codes.
    if parsed.languages is None:
        languages = ["en"]
    elif parsed.languages and parsed.languages != "all":
        languages = [code.strip() for code in parsed.languages.split(",")]
    else:
        languages = None

    if parsed.categories:
        categories = [category.strip() for category in parsed.categories.split(",")]
    else:
        categories = None

    if not parsed.verbose:
        _suppress_logging()

    asyncio.run(_run_bench_new(parsed, languages, categories))
|
|
|
|
|
|
def _suppress_logging():
|
|
"""Suppress noisy logs during benchmark."""
|
|
import warnings
|
|
warnings.filterwarnings("ignore")
|
|
logging.root.setLevel(logging.ERROR)
|
|
for handler in logging.root.handlers:
|
|
handler.setLevel(logging.ERROR)
|
|
for name in list(logging.Logger.manager.loggerDict.keys()):
|
|
logging.getLogger(name).setLevel(logging.ERROR)
|
|
|
|
|
|
async def _run_bench_new(parsed, languages, categories):
    """Run the benchmark through ``whisperlivekit.benchmark`` and report it.

    Args:
        parsed: Parsed ``wlk bench`` arguments; reads ``backend``,
            ``model_size``, ``quick``, ``transcriptions`` and ``json_out``.
        languages: Language codes handed to the runner, or None.
        categories: Category names handed to the runner, or None.
    """
    from whisperlivekit.benchmark.report import print_report, print_transcriptions, write_json
    from whisperlivekit.benchmark.runner import BenchmarkRunner

    def on_progress(name, i, total):
        # Single status line on stderr, rewritten in place via "\r"; the
        # trailing spaces wipe leftovers from a longer previous message.
        if name != "done":
            print(f"\r [{i + 1}/{total}] {name}...{' ' * 20}",
                  end="", file=sys.stderr, flush=True)
            return
        print(f"\r [{total}/{total}] Done.{' ' * 30}", file=sys.stderr)

    runner = BenchmarkRunner(
        backend=parsed.backend,
        model_size=parsed.model_size,
        languages=languages,
        categories=categories,
        quick=parsed.quick,
        on_progress=on_progress,
    )

    print("\n Downloading benchmark samples (cached after first run)...",
          file=sys.stderr)

    report = await runner.run()
    print_report(report)

    if parsed.transcriptions:
        print_transcriptions(report)

    json_path = parsed.json_out
    if json_path:
        write_json(report, json_path)
        print(f" Results exported to: {json_path}\n", file=sys.stderr)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# `wlk listen` subcommand
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def cmd_listen(args: list):
    """Transcribe live microphone input.

    Usage: wlk listen [options]

    Exits with status 1 if the ``sounddevice`` package cannot be imported.

    Args:
        args: Command-line arguments following the ``listen`` subcommand.
    """
    import argparse
    import asyncio

    parser = argparse.ArgumentParser(
        prog="wlk listen",
        description="Transcribe live microphone input in real-time.",
    )
    add = parser.add_argument
    add("--backend", default="auto", help="ASR backend (default: auto)")
    add("--model", default="base", dest="model_size", help="Model size (default: base)")
    add("--language", "--lan", default="auto", dest="lan", help="Language code (default: auto)")
    add("--diarization", action="store_true", help="Enable speaker diarization")
    add("--output", "-o", default=None, help="Save transcription to file on exit")
    add("--verbose", "-v", action="store_true", help="Show detailed logs")

    parsed = parser.parse_args(args)

    # Fail early, with an install hint, before any model gets loaded.
    try:
        import sounddevice  # noqa: F401
    except ImportError:
        print("\n sounddevice is required for microphone input.", file=sys.stderr)
        print(" Install it with: pip install sounddevice\n", file=sys.stderr)
        sys.exit(1)

    entry_point = _listen_main if parsed.verbose else _listen_quiet
    asyncio.run(entry_point(parsed))
|
|
|
|
|
|
async def _listen_quiet(parsed):
    """Silence warnings and logging, then run ``_listen_main``.

    Args:
        parsed: Parsed ``wlk listen`` arguments, passed through unchanged.
    """
    import warnings

    warnings.filterwarnings("ignore")

    quiet_level = logging.ERROR
    root = logging.root
    root.setLevel(quiet_level)
    for root_handler in root.handlers:
        root_handler.setLevel(quiet_level)

    # Snapshot the names: getLogger may modify the registry while iterating.
    for logger_name in list(logging.Logger.manager.loggerDict.keys()):
        logging.getLogger(logger_name).setLevel(quiet_level)

    await _listen_main(parsed)
|
|
|
|
|
|
async def _listen_main(parsed):
    """Live microphone transcription loop.

    Opens an ``sd.InputStream`` at 16 kHz mono in 500 ms blocks, converts each
    block to 16-bit PCM bytes and feeds it to a ``TestHarness`` pipeline.
    Committed lines are written to stderr (and mirrored to stdout when stdout
    is not a TTY, so the command can be piped); the in-progress buffer is
    shown as an ephemeral grey line that is overwritten on each update.

    On Ctrl+C the stream is stopped, the pipeline is finished, remaining text
    is printed and, if ``parsed.output`` is set, the full text is saved there.

    Args:
        parsed: Namespace with ``model_size``, ``lan``, ``backend``,
            ``diarization`` and ``output`` attributes (see ``cmd_listen``).
    """
    import asyncio

    import numpy as np
    import sounddevice as sd

    from whisperlivekit.test_harness import TestHarness

    SAMPLE_RATE = 16000
    BLOCK_SIZE = int(SAMPLE_RATE * 0.5)  # 500ms chunks

    kwargs = {
        "model_size": parsed.model_size,
        "lan": parsed.lan,
        "pcm_input": True,
    }
    if parsed.backend != "auto":
        kwargs["backend"] = parsed.backend
    if parsed.diarization:
        kwargs["diarization"] = True

    out = sys.stderr

    out.write("\n Loading model...")
    out.flush()

    async with TestHarness(**kwargs) as h:
        out.write(" done.\n")
        out.write(" Listening (Ctrl+C to stop)\n\n")
        out.flush()

        n_lines_printed = 0
        pipe_stdout = not sys.stdout.isatty()

        def emit_text(text):
            """Write one finished line to stderr, mirroring to a piped stdout."""
            out.write(f" {text}\n")
            if pipe_stdout:
                sys.stdout.write(f"{text}\n")
                sys.stdout.flush()

        def emit_new_lines(speech):
            """Print committed lines that have not been printed yet."""
            nonlocal n_lines_printed
            while n_lines_printed < len(speech):
                emit_text(speech[n_lines_printed].get("text", ""))
                n_lines_printed += 1

        def on_state_update(state):
            buf = state.buffer_transcription.strip()

            # Clear the buffer line
            out.write("\r\033[K")

            # Print new committed lines
            emit_new_lines(state.speech_lines)

            # Show buffer (ephemeral, overwritten next update)
            if buf:
                out.write(f" \033[90m| {buf}\033[0m")
            out.flush()

        h.on_update(on_state_update)

        # Bridge sounddevice thread -> async event loop
        feed_queue = asyncio.Queue()
        loop = asyncio.get_running_loop()

        def audio_callback(indata, frames, time_info, status):
            # Clip before scaling so out-of-range float samples saturate
            # instead of wrapping around when cast to int16.
            samples = np.clip(indata[:, 0], -1.0, 1.0)
            pcm = (samples * 32767).astype(np.int16).tobytes()
            loop.call_soon_threadsafe(feed_queue.put_nowait, pcm)

        try:
            stream = sd.InputStream(
                samplerate=SAMPLE_RATE,
                channels=1,
                dtype="float32",
                blocksize=BLOCK_SIZE,
                callback=audio_callback,
            )
            stream.start()
        except Exception as e:
            out.write(f"\n Could not open microphone: {e}\n")
            out.write(" Check that a microphone is connected and permissions are granted.\n\n")
            return

        try:
            while True:
                try:
                    # Short timeout keeps the loop responsive to cancellation.
                    pcm_data = await asyncio.wait_for(feed_queue.get(), timeout=0.1)
                    await h.feed_pcm(pcm_data, speed=0)
                except asyncio.TimeoutError:
                    pass
        except (KeyboardInterrupt, asyncio.CancelledError):
            # Under asyncio.run() on Python 3.11+, the first Ctrl+C is
            # delivered by cancelling the main task, so it shows up here as
            # CancelledError rather than KeyboardInterrupt. Treat both as
            # "stop listening" so the transcript is still finalized below.
            task = asyncio.current_task()
            if task is not None and hasattr(task, "uncancel"):
                # We are deliberately suppressing the cancellation; clear the
                # request so the awaits that follow behave normally.
                task.uncancel()
        finally:
            stream.stop()
            stream.close()

        out.write("\r\033[K\n Finishing...\n")
        out.flush()

        result = await h.finish(timeout=30)

        # Print any remaining committed lines
        emit_new_lines(result.speech_lines)

        # Print remaining buffer
        buf = result.buffer_transcription.strip()
        if buf:
            emit_text(buf)

        out.write("\n")
        out.flush()

        if parsed.output:
            # Explicit UTF-8: transcripts are often non-ASCII and the platform
            # default encoding may not be able to represent them.
            with open(parsed.output, "w", encoding="utf-8") as f:
                f.write(result.text + "\n")
            out.write(f" Saved to: {parsed.output}\n\n")
            out.flush()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# `wlk run` subcommand
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _resolve_run_spec(spec: str):
|
|
"""Map a model spec to (backend, model_size).
|
|
|
|
Returns (backend_id_or_None, model_size_or_None).
|
|
"""
|
|
if ":" in spec:
|
|
backend_part, model_part = spec.split(":", 1)
|
|
return backend_part, model_part
|
|
|
|
backend_ids = {b["id"] for b in BACKENDS}
|
|
if spec in backend_ids:
|
|
return spec, None
|
|
|
|
if spec == "voxtral-mlx":
|
|
return "voxtral-mlx", None
|
|
|
|
if spec == "qwen3-mlx":
|
|
return "qwen3-mlx", None
|
|
|
|
if spec in WHISPER_SIZES:
|
|
return None, spec
|
|
|
|
return None, spec
|
|
|
|
|
|
def cmd_run(args: list):
    """Entry point for ``wlk run``: fetch the model if missing, then serve.

    The optional positional ``model`` is resolved to backend/model flags via
    ``_resolve_run_spec``. If any of the model's pull targets is not already
    downloaded, ``cmd_pull`` is invoked first (exiting with status 1 when it
    reports failure). Unrecognized arguments are forwarded untouched to the
    server, which is started by rewriting ``sys.argv`` and calling the server
    ``main``.

    Usage: wlk run [model] [server options]

    Args:
        args: Command-line arguments following the ``run`` subcommand.
    """
    from argparse import ArgumentParser

    cli = ArgumentParser(
        prog="wlk run",
        description="Download model (if needed) and start the transcription server.",
    )
    cli.add_argument(
        "model",
        nargs="?",
        default=None,
        help="Model spec (e.g., voxtral, large-v3, faster-whisper:base)",
    )
    options, passthrough = cli.parse_known_args(args)

    spec = options.model
    backend_flag = model_flag = None

    if spec:
        backend_flag, model_flag = _resolve_run_spec(spec)

        # Report the resolution when the spec is a catalog model.
        entry = None
        for candidate in MODEL_CATALOG:
            if candidate["name"] == spec:
                entry = candidate
                break
        if entry:
            print(
                f"\n Model: {entry['name']} ({entry['params']} params, {entry['disk']})",
                file=sys.stderr,
            )
            if backend_flag:
                backend_note = backend_flag
            else:
                backend_note = f"{_best_backend_for_model(entry)} (auto-detected)"
            print(f" Backend: {backend_note}", file=sys.stderr)

        # Download only when at least one target repo is absent locally.
        downloaded = _scan_downloaded_models()
        targets = _resolve_pull_target(spec)
        missing = [repo_id for _, repo_id, _ in targets if repo_id not in downloaded]
        if missing:
            print("\n Model not found locally. Downloading...\n", file=sys.stderr)
            if cmd_pull(spec) != 0:
                sys.exit(1)
            print(file=sys.stderr)

    # The server parses sys.argv itself, so hand it a rewritten argv.
    forwarded = [sys.argv[0]]
    for flag, value in (("--backend", backend_flag), ("--model", model_flag)):
        if value:
            forwarded += [flag, value]
    forwarded += passthrough
    sys.argv = forwarded

    from whisperlivekit.basic_server import main as serve_main

    serve_main()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# `wlk rm` subcommand
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def cmd_rm(spec: str):
    """Delete a downloaded model from the local cache.

    Each pull target of ``spec`` that is currently downloaded is removed. The
    HuggingFace cache is searched first (all revisions of the matching repo
    are deleted); if the repo is not found there, the path recorded by
    ``_scan_downloaded_models`` is removed when it is a plain file.

    Returns:
        0 on success; 1 if the spec resolves to no targets, nothing is
        downloaded, or a deletion raises.
    """
    import os

    targets = _resolve_pull_target(spec)
    if not targets:
        return 1

    downloaded = _scan_downloaded_models()
    present = [(repo_id, label) for _, repo_id, label in targets if repo_id in downloaded]
    if not present:
        print(f"\n Model '{spec}' is not downloaded.\n", file=sys.stderr)
        return 1

    print(file=sys.stderr)

    for repo_id, label in present:
        try:
            # Try HuggingFace cache first
            from huggingface_hub import scan_cache_dir

            cache = scan_cache_dir()
            cached_repo = next((r for r in cache.repos if r.repo_id == repo_id), None)

            if cached_repo is not None:
                n_bytes = cached_repo.size_on_disk
                if n_bytes > 1e9:
                    freed = f"{n_bytes / 1e9:.1f} GB"
                else:
                    freed = f"{n_bytes / 1e6:.0f} MB"
                commits = [rev.commit_hash for rev in cached_repo.revisions]
                plan = cache.delete_revisions(*commits)
                print(f" Deleting {label} ({repo_id})...", file=sys.stderr)
                plan.execute()
                print(f" Freed {freed}", file=sys.stderr)
                continue

            # Native whisper cache — plain file
            local_path = downloaded.get(repo_id)
            if local_path and os.path.isfile(local_path):
                n_bytes = os.path.getsize(local_path)
                freed = f"{n_bytes / 1e6:.0f} MB"
                os.remove(local_path)
                print(f" Deleted {label} ({local_path})", file=sys.stderr)
                print(f" Freed {freed}", file=sys.stderr)

        except Exception as exc:
            print(f" Failed to delete {label}: {exc}", file=sys.stderr)
            return 1

    print(file=sys.stderr)
    return 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# `wlk check` subcommand
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def cmd_check():
    """Print a dependency report and return a process exit status.

    Checks the Python version (>= 3.11), ffmpeg (via ``_check_ffmpeg``) and
    whether torch, torchaudio, faster-whisper, uvicorn and fastapi are
    importable, printing one coloured OK/MISSING row per item.

    Returns:
        0 when every check passes, 1 otherwise.
    """
    print("\nSystem check:\n")

    results = [
        ("Python >= 3.11", sys.version_info >= (3, 11)),
        ("ffmpeg", _check_ffmpeg()),
    ]
    # (display label, importable module name)
    required_modules = (
        ("torch", "torch"),
        ("torchaudio", "torchaudio"),
        ("faster-whisper", "faster_whisper"),
        ("uvicorn", "uvicorn"),
        ("fastapi", "fastapi"),
    )
    for label, module_name in required_modules:
        results.append((label, _module_available(module_name)))

    for label, passed in results:
        if passed:
            badge = "\033[32m OK\033[0m"
        else:
            badge = "\033[31m MISSING\033[0m"
        print(f" {badge} {label}")

    healthy = all(passed for _, passed in results)

    print()
    verdict = (
        " All dependencies OK. Ready to serve."
        if healthy
        else " Some dependencies are missing. Install them before running the server."
    )
    print(verdict)
    print()
    return 0 if healthy else 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# `wlk diagnose` subcommand
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def cmd_diagnose(args: list):
    """Entry point for ``wlk diagnose``: run pipeline diagnostics on audio.

    Parses the subcommand options and runs ``_diagnose_main``, which feeds the
    audio through the full pipeline while probing backend state at regular
    intervals, records a timeline, flags anomalies (stuck tokens, generate
    thread errors, etc.) and prints a pass/fail summary.

    Usage: wlk diagnose [audio_file] [options]

    Args:
        args: Command-line arguments following the ``diagnose`` subcommand.
    """
    import asyncio
    from argparse import ArgumentParser

    cli = ArgumentParser(
        prog="wlk diagnose",
        description="Run pipeline diagnostics to debug transcription issues.",
    )
    add = cli.add_argument
    add(
        "file",
        nargs="?",
        default=None,
        help="Audio file to diagnose (default: built-in test sample)",
    )
    add("--backend", default="auto", help="ASR backend (default: auto)")
    add("--model", default="base", dest="model_size", help="Model size (default: base)")
    add("--language", "--lan", default="auto", dest="lan", help="Language code (default: auto)")
    add(
        "--speed",
        type=float,
        default=1.0,
        help="Playback speed (1.0=realtime, 0=instant, default: 1.0)",
    )
    add(
        "--probe-interval",
        type=float,
        default=2.0,
        help="Seconds between state probes (default: 2.0)",
    )
    add("--diarization", action="store_true", help="Enable speaker diarization")

    options = cli.parse_args(args)
    asyncio.run(_diagnose_main(options))
|
|
|
|
|
|
def _probe_backend_state(processor) -> dict:
|
|
"""Probe internal state of whatever ASR backend is running.
|
|
|
|
Returns a dict of diagnostic key-value pairs specific to the backend.
|
|
"""
|
|
info = {}
|
|
transcription = processor.transcription
|
|
if transcription is None:
|
|
info["error"] = "no transcription processor"
|
|
return info
|
|
|
|
# Common: audio buffer size
|
|
audio_buf = getattr(transcription, "audio_buffer", None)
|
|
if audio_buf is not None:
|
|
info["audio_buffer_samples"] = len(audio_buf)
|
|
info["audio_buffer_sec"] = round(len(audio_buf) / 16000, 2)
|
|
|
|
# Common: get_buffer result
|
|
try:
|
|
buf = transcription.get_buffer()
|
|
info["buffer_text"] = buf.text if buf else ""
|
|
except Exception as e:
|
|
info["buffer_error"] = str(e)
|
|
|
|
# Voxtral HF streaming specifics
|
|
if hasattr(transcription, "_generate_started"):
|
|
info["backend_type"] = "voxtral-hf-streaming"
|
|
info["generate_started"] = transcription._generate_started
|
|
info["generate_finished"] = transcription._generate_finished
|
|
info["n_audio_tokens_fed"] = transcription._n_audio_tokens_fed
|
|
info["n_text_tokens_received"] = transcription._n_text_tokens_received
|
|
info["n_committed_words"] = transcription._n_committed_words
|
|
info["pending_audio_samples"] = transcription._pending_len
|
|
with transcription._text_lock:
|
|
info["accumulated_text"] = transcription._get_accumulated_text()
|
|
if transcription._generate_error:
|
|
info["generate_error"] = str(transcription._generate_error)
|
|
# Audio queue depth
|
|
info["audio_queue_depth"] = transcription._audio_queue.qsize()
|
|
|
|
# Voxtral MLX specifics
|
|
elif hasattr(transcription, "_mlx_processor"):
|
|
info["backend_type"] = "voxtral-mlx"
|
|
|
|
# Qwen3 MLX specifics
|
|
elif hasattr(transcription, "_session") and hasattr(transcription, "_state"):
|
|
info["backend_type"] = "qwen3-mlx"
|
|
info["samples_fed"] = getattr(transcription, "_samples_fed", 0)
|
|
info["committed_words"] = getattr(transcription, "_n_committed_words", 0)
|
|
|
|
# SimulStreaming specifics
|
|
elif hasattr(transcription, "prev_output"):
|
|
info["backend_type"] = "simulstreaming"
|
|
info["prev_output_len"] = len(getattr(transcription, "prev_output", "") or "")
|
|
|
|
# LocalAgreement (OnlineASRProcessor) specifics
|
|
elif hasattr(transcription, "hypothesis_buffer"):
|
|
info["backend_type"] = "localagreement"
|
|
hb = transcription.hypothesis_buffer
|
|
if hasattr(hb, "committed"):
|
|
info["committed_words"] = len(hb.committed)
|
|
if hasattr(hb, "buffer"):
|
|
info["hypothesis_buffer_words"] = len(hb.buffer)
|
|
|
|
else:
|
|
info["backend_type"] = "unknown"
|
|
|
|
return info
|
|
|
|
|
|
def _probe_pipeline_state(processor) -> dict:
|
|
"""Probe pipeline-level state (queues, tasks, ffmpeg)."""
|
|
info = {}
|
|
if processor.transcription_queue:
|
|
info["transcription_queue_size"] = processor.transcription_queue.qsize()
|
|
if processor.diarization_queue:
|
|
info["diarization_queue_size"] = processor.diarization_queue.qsize()
|
|
if processor.translation_queue:
|
|
info["translation_queue_size"] = processor.translation_queue.qsize()
|
|
info["total_pcm_samples"] = processor.total_pcm_samples
|
|
info["total_audio_sec"] = round(processor.total_pcm_samples / 16000, 2)
|
|
info["is_stopping"] = processor.is_stopping
|
|
info["in_silence"] = processor.current_silence is not None
|
|
info["n_state_lines"] = len(processor.state.tokens)
|
|
info["n_state_updates"] = len(getattr(processor.state, "new_tokens", []))
|
|
return info
|
|
|
|
|
|
async def _diagnose_main(parsed):
    """Run the full diagnostic pipeline.

    Steps, all reported on stderr:

    1. Resolve the audio file (``parsed.file``, or a bundled test sample,
       preferring one whose language equals ``parsed.lan``) and load it as PCM.
    2. Start a ``TestHarness`` and feed the audio in 0.5 s chunks, sleeping
       between chunks according to ``parsed.speed`` (no sleep when <= 0).
    3. Every ``parsed.probe_interval`` seconds of wall time, probe backend,
       pipeline and harness state, print a timeline row and record anomalies.
    4. Drain, probe once more, finish, then print a summary, the anomalies
       and a list of pass/fail health checks.

    Exits with status 1 if no audio can be resolved or loaded.

    Args:
        parsed: Namespace produced by the ``wlk diagnose`` argument parser.
    """
    import asyncio
    import time as time_module

    from whisperlivekit.test_harness import TestHarness, load_audio_pcm

    out = sys.stderr

    # Resolve audio file
    audio_path = parsed.file
    if audio_path is None:
        try:
            from whisperlivekit.test_data import get_samples
            samples = get_samples()
            # Prefer a sample matching the requested language
            lang_match = [s for s in samples if s.language == parsed.lan]
            sample = lang_match[0] if lang_match else samples[0]
            audio_path = sample.path
            out.write(f"\n Using test sample: {sample.name} ({sample.duration:.1f}s)\n")
        except Exception as e:
            out.write(f"\n No audio file provided and couldn't load test sample: {e}\n")
            out.write(" Usage: wlk diagnose <audio_file> [options]\n\n")
            sys.exit(1)

    # Load audio
    try:
        pcm = load_audio_pcm(audio_path)
    except Exception as e:
        out.write(f"\n Failed to load audio: {e}\n\n")
        sys.exit(1)

    # Duration is computed as bytes / (16000 * 2); this presumably means
    # 16 kHz, 2 bytes per sample, mono — confirm against load_audio_pcm.
    audio_duration = len(pcm) / (16000 * 2)

    # Print header
    out.write(f"\n {'━' * 70}\n")
    out.write(" WhisperLiveKit Pipeline Diagnostic\n")
    out.write(f" {'━' * 70}\n\n")
    out.write(f" Audio: {audio_path}\n")
    out.write(f" Duration: {audio_duration:.1f}s\n")
    out.write(f" Backend: {parsed.backend}\n")
    out.write(f" Model: {parsed.model_size}\n")
    out.write(f" Language: {parsed.lan}\n")
    out.write(f" Speed: {parsed.speed}x\n")
    out.write(f" Probe every: {parsed.probe_interval}s\n")
    out.write(f" Platform: {platform.system()} {platform.machine()}\n")
    out.write(f" Accelerator: {_gpu_info()}\n")
    out.write(f"\n {'─' * 70}\n")
    out.write(" Loading model...\n")
    out.flush()

    kwargs = {
        "model_size": parsed.model_size,
        "lan": parsed.lan,
        "pcm_input": True,
    }
    # "auto" means: do not pass a backend and let the harness pick one.
    if parsed.backend != "auto":
        kwargs["backend"] = parsed.backend
    if parsed.diarization:
        kwargs["diarization"] = True

    t_load_start = time_module.perf_counter()

    probes = []  # one dict per periodic probe (timeline)
    anomalies = []  # human-readable anomaly messages

    async with TestHarness(**kwargs) as h:
        # Entering the harness context is what is timed as "model load".
        t_load = time_module.perf_counter() - t_load_start
        out.write(f" Model loaded in {t_load:.1f}s\n")
        out.write(f" {'─' * 70}\n")
        out.write(" Feeding audio...\n\n")
        out.flush()

        # NOTE(review): reaches into the harness's private processor so that
        # audio can be fed chunk by chunk with probes in between.
        processor = h._processor
        chunk_duration = 0.5  # seconds per chunk
        chunk_bytes = int(chunk_duration * 16000 * 2)
        offset = 0
        t_start = time_module.perf_counter()
        last_probe = t_start
        probe_idx = 0

        # Feed audio with periodic probes
        while offset < len(pcm):
            end = min(offset + chunk_bytes, len(pcm))
            await processor.process_audio(pcm[offset:end])
            chunk_seconds = (end - offset) / (16000 * 2)
            # The harness is bypassed above, so its audio position is
            # advanced by hand here.
            h._audio_position += chunk_seconds
            offset = end

            # speed <= 0 means "feed as fast as possible" (no pacing sleep).
            if parsed.speed > 0:
                await asyncio.sleep(chunk_duration / parsed.speed)

            # Probe at intervals
            now = time_module.perf_counter()
            if now - last_probe >= parsed.probe_interval:
                probe_idx += 1
                elapsed = now - t_start
                audio_pos = h._audio_position

                backend_state = _probe_backend_state(processor)
                pipeline_state = _probe_pipeline_state(processor)
                harness_state = {
                    "n_history": len(h.history),
                    "state_text_len": len(h.state.text),
                    "state_lines": len(h.state.lines),
                    "state_speech_lines": len(h.state.speech_lines),
                    "buffer": h.state.buffer_transcription[:80] if h.state.buffer_transcription else "",
                }

                probe = {
                    "idx": probe_idx,
                    "wall_time": round(elapsed, 1),
                    "audio_pos": round(audio_pos, 1),
                    "backend": backend_state,
                    "pipeline": pipeline_state,
                    "harness": harness_state,
                }
                probes.append(probe)

                # Print probe
                out.write(f" [{probe_idx:3d}] wall={elapsed:5.1f}s audio={audio_pos:5.1f}s")

                # Backend-specific columns; other backend types print none.
                bt = backend_state.get("backend_type", "?")
                if bt == "voxtral-hf-streaming":
                    out.write(
                        f" | gen={'Y' if backend_state.get('generate_started') else 'N'}"
                        f" fin={'Y' if backend_state.get('generate_finished') else 'N'}"
                        f" audio_tok={backend_state.get('n_audio_tokens_fed', 0)}"
                        f" text_tok={backend_state.get('n_text_tokens_received', 0)}"
                        f" words={backend_state.get('n_committed_words', 0)}"
                        f" q={backend_state.get('audio_queue_depth', 0)}"
                    )
                    if backend_state.get("generate_error"):
                        out.write(f" \033[31mERROR: {backend_state['generate_error']}\033[0m")
                elif bt == "localagreement":
                    out.write(
                        f" | committed={backend_state.get('committed_words', 0)}"
                        f" buf_words={backend_state.get('hypothesis_buffer_words', 0)}"
                    )
                elif bt == "simulstreaming":
                    out.write(
                        f" | prev_out_len={backend_state.get('prev_output_len', 0)}"
                    )

                buf_text = backend_state.get("buffer_text", "")
                if buf_text:
                    # Truncate long buffers so each probe stays compact.
                    display = buf_text[:50] + ("..." if len(buf_text) > 50 else "")
                    out.write(f'\n buf="{display}"')

                out.write("\n")
                out.flush()

                # Anomaly detection
                if bt == "voxtral-hf-streaming":
                    if backend_state.get("generate_started") and not backend_state.get("generate_finished"):
                        if backend_state.get("n_audio_tokens_fed", 0) > 10 and backend_state.get("n_text_tokens_received", 0) == 0:
                            anomalies.append(f"[probe {probe_idx}] {backend_state['n_audio_tokens_fed']} audio tokens fed but 0 text tokens received — model may be stalled")
                    if backend_state.get("generate_error"):
                        anomalies.append(f"[probe {probe_idx}] Generate thread error: {backend_state['generate_error']}")

                # Checked on every probe, so a stuck pipeline yields one
                # anomaly entry per probe taken after the 5 s mark.
                if harness_state["n_history"] == 0 and elapsed > 5:
                    anomalies.append(f"[probe {probe_idx}] No state updates after {elapsed:.0f}s — pipeline may be stuck")

                last_probe = now

        # Done feeding — drain and finish
        out.write(f"\n {'─' * 70}\n")
        out.write(" Audio feeding complete. Draining pipeline...\n")
        out.flush()

        await h.drain(3.0)

        # One more probe after drain
        backend_state = _probe_backend_state(processor)
        pipeline_state = _probe_pipeline_state(processor)
        probe_idx += 1
        elapsed = time_module.perf_counter() - t_start
        out.write(f" [{probe_idx:3d}] wall={elapsed:5.1f}s audio={h._audio_position:5.1f}s (post-drain)\n")

        bt = backend_state.get("backend_type", "?")
        if bt == "voxtral-hf-streaming":
            out.write(
                f" text_tok={backend_state.get('n_text_tokens_received', 0)}"
                f" words={backend_state.get('n_committed_words', 0)}"
                f" accumulated_text_len={len(backend_state.get('accumulated_text', ''))}\n"
            )

        result = await h.finish(timeout=60)
        # Total wall time runs from the first fed chunk to the end of finish();
        # model loading is excluded.
        t_total = time_module.perf_counter() - t_start

    # === Summary ===
    out.write(f"\n {'━' * 70}\n")
    out.write(" Diagnostic Summary\n")
    out.write(f" {'━' * 70}\n\n")

    out.write(f" Wall time: {t_total:.1f}s\n")
    out.write(f" Audio duration: {audio_duration:.1f}s\n")
    # RTF = wall time / audio duration; includes any pacing sleeps, so with
    # speed > 0 it is bounded below by roughly 1 / speed.
    rtf = t_total / audio_duration if audio_duration > 0 else 0
    out.write(f" RTF: {rtf:.3f}x\n")
    out.write(f" Model load: {t_load:.1f}s\n")
    out.write(f" Probes taken: {probe_idx}\n\n")

    # Text output summary
    text = result.committed_text or result.text
    n_words = len(text.split()) if text.strip() else 0
    n_lines = len(result.speech_lines)
    has_silence = result.has_silence

    out.write(f" Output words: {n_words}\n")
    out.write(f" Output lines: {n_lines}\n")
    out.write(f" Has silence: {has_silence}\n")
    out.write(f" Timing valid: {result.timing_valid}\n")
    out.write(f" Timing monotonic: {result.timing_monotonic}\n")

    timing_errors = result.timing_errors()
    if timing_errors:
        out.write("\n Timing errors:\n")
        # Only the first 10 errors are listed.
        for err in timing_errors[:10]:
            out.write(f" - {err}\n")

    # Transcription preview
    if text:
        preview = text[:200] + ("..." if len(text) > 200 else "")
        out.write(f'\n Transcription:\n "{preview}"\n')
    else:
        out.write("\n \033[31mNo transcription output!\033[0m\n")

    # Anomalies
    out.write(f"\n {'─' * 70}\n")
    if anomalies:
        out.write(f" \033[33mAnomalies detected ({len(anomalies)}):\033[0m\n")
        for a in anomalies:
            out.write(f" ⚠ {a}\n")
    else:
        out.write(" \033[32mNo anomalies detected.\033[0m\n")

    # Pass/fail checks
    out.write(f"\n {'─' * 70}\n")
    out.write(" Health checks:\n\n")

    # (label, passed) pairs; thresholds are the literal values below.
    checks = [
        ("Model loaded successfully", t_load < 300),
        ("Audio processed without errors", not anomalies),
        ("Transcription produced output", n_words > 0),
        ("At least one committed line", n_lines > 0),
        ("Timestamps are valid", result.timing_valid),
        ("Timestamps are monotonic", result.timing_monotonic),
        ("RTF < 2.0x (faster than half real-time)", rtf < 2.0),
    ]

    all_pass = True
    for label, ok in checks:
        icon = "\033[32m PASS\033[0m" if ok else "\033[31m FAIL\033[0m"
        out.write(f" {icon} {label}\n")
        if not ok:
            all_pass = False

    out.write(f"\n {'━' * 70}\n")
    if all_pass:
        out.write(" \033[32mAll checks passed.\033[0m\n")
    else:
        out.write(" \033[31mSome checks failed. Review the timeline above for details.\033[0m\n")
    out.write(f" {'━' * 70}\n\n")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main entry point
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _print_version():
|
|
"""Print version."""
|
|
from importlib.metadata import version
|
|
try:
|
|
v = version("whisperlivekit")
|
|
except Exception:
|
|
v = "dev"
|
|
print(f"WhisperLiveKit {v}")
|
|
|
|
|
|
def _print_help():
    """Print the top-level ``wlk`` help text to stdout.

    The text lists every subcommand with a one-line description, followed by
    usage examples. It is a static string: when a subcommand is added to or
    removed from the routing in ``main``, this text must be updated by hand.
    """
    print("""
WhisperLiveKit — Local speech-to-text toolkit

Usage: wlk <command> [options]

Commands:
serve Start the transcription server (default)
listen Live microphone transcription
run Auto-pull model and start server
transcribe Transcribe audio files offline
bench Benchmark speed and accuracy
diagnose Run pipeline diagnostics on audio
models List available backends and models
pull Download models for offline use
rm Delete downloaded models
check Verify system dependencies

Examples:
wlk # Start server with defaults
wlk listen # Transcribe from microphone
wlk listen --backend voxtral # Listen with specific backend
wlk run voxtral # Auto-pull + start server
wlk run large-v3 # Auto-pull + start server
wlk transcribe audio.wav # Transcribe a file
wlk transcribe --format srt audio.wav # Generate SRT subtitles
wlk bench # Benchmark current backend
wlk diagnose audio.wav --backend voxtral # Diagnose pipeline issues
wlk models # List backends + models
wlk pull large-v3 # Download model
wlk rm large-v3 # Delete downloaded model
wlk check # Check dependencies

Run 'wlk <command> --help' for command-specific help.
""")
|
|
|
|
|
|
def main():
    """CLI entry point: dispatch to a subcommand, defaulting to the server.

    The first argument is matched by hand before any argparse setup, so that
    lightweight commands such as ``wlk models`` do not import the full server
    stack. ``check``, ``pull`` and ``rm`` terminate the process with the
    handler's return value as exit status. Any other unrecognized first
    argument (or none at all) starts the server, with a leading ``serve``
    stripped from ``sys.argv`` first.
    """
    argv = sys.argv
    subcmd = argv[1] if len(argv) >= 2 else None
    rest = argv[2:]

    if subcmd == "models":
        cmd_models()
    elif subcmd == "check":
        sys.exit(cmd_check())
    elif subcmd in ("pull", "rm"):
        # Both take exactly one required model spec.
        if not rest:
            print(f"Usage: wlk {subcmd} <model>")
            if subcmd == "pull":
                print(" e.g.: wlk pull base, wlk pull faster-whisper:large-v3, wlk pull voxtral")
            else:
                print(" e.g.: wlk rm base, wlk rm voxtral")
            sys.exit(1)
        if subcmd == "pull":
            sys.exit(cmd_pull(rest[0]))
        sys.exit(cmd_rm(rest[0]))
    elif subcmd == "transcribe":
        cmd_transcribe(rest)
    elif subcmd == "bench":
        cmd_bench(rest)
    elif subcmd == "listen":
        cmd_listen(rest)
    elif subcmd == "diagnose":
        cmd_diagnose(rest)
    elif subcmd == "run":
        cmd_run(rest)
    elif subcmd in ("-h", "--help", "help"):
        _print_help()
    elif subcmd in ("version", "--version", "-V"):
        _print_version()
    else:
        if subcmd == "serve":
            # Strip "serve" and pass remaining args to the server
            sys.argv = [argv[0]] + rest

        # Default: serve
        from whisperlivekit.basic_server import main as serve_main

        serve_main()
|