mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-03-07 06:14:05 +00:00
- Extend test_backend_offline.py with WER and timestamp accuracy metrics computed via whisperlivekit.metrics against ground truth transcripts. - Add --benchmark flag to auto-detect all installed backends and run each (backend, policy) combination in sequence. - Add --policy flag to override the streaming policy. - Add detect_available_backends() probing faster-whisper, mlx-whisper, voxtral-mlx, voxtral (HF), and openai-whisper. - Add print_cross_backend_comparison() with per-combo averages. - Add run_benchmark.py for comprehensive multi-model benchmarking. - Add BENCHMARK.md with full results on Apple M4: speed, WER, timestamp accuracy, VAC impact, and recommendations. - Add ground truth transcript JSON files for all audio test files.
292 lines
8.7 KiB
Python
292 lines
8.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Comprehensive benchmark runner for WhisperLiveKit.
|
|
|
|
Tests all available backend+policy combinations across multiple audio files,
|
|
model sizes, and VAC on/off configurations. Outputs structured JSON that
|
|
is consumed by the report generator.
|
|
|
|
Usage:
|
|
python run_benchmark.py # full benchmark
|
|
python run_benchmark.py --quick # subset (tiny models, fewer combos)
|
|
python run_benchmark.py --json results.json # custom output path
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import gc
|
|
import json
|
|
import logging
|
|
import platform
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from dataclasses import asdict
|
|
from pathlib import Path
|
|
|
|
# Root logging is configured at WARNING; only the dedicated "benchmark"
# logger below is raised to INFO so progress messages remain visible.
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
logger = logging.getLogger("benchmark")
logger.setLevel(logging.INFO)

# Put this script's directory first on sys.path so the sibling harness
# module (test_backend_offline) can be imported below.
sys.path.insert(0, str(Path(__file__).parent))
|
|
from test_backend_offline import (
|
|
AUDIO_TESTS_DIR,
|
|
SAMPLE_RATE,
|
|
TestResult,
|
|
create_engine,
|
|
discover_audio_files,
|
|
download_sample_audio,
|
|
load_audio,
|
|
run_test,
|
|
)
|
|
|
|
# Directory next to this script where the cached JFK sample is looked up.
CACHE_DIR = Path(__file__).parent.joinpath(".test_cache")
|
|
|
|
|
|
def get_system_info() -> dict:
    """Collect host and backend metadata for the benchmark report.

    Returns:
        A dict with the keys ``platform``, ``machine``, ``processor``,
        ``python_version``, ``cpu``, ``ram_gb`` (``None`` when the amount of
        RAM cannot be determined) and ``backend_versions`` (a dict mapping
        each importable backend package to a version string).
    """
    # Local imports: these modules are only needed by this function.
    import importlib
    import os

    info = {
        "platform": platform.platform(),
        "machine": platform.machine(),
        "processor": platform.processor(),
        "python_version": platform.python_version(),
    }

    # Everything the sysctl probes can reasonably raise: missing binary
    # (OSError), non-zero exit status (SubprocessError), unparsable or
    # undecodable output (ValueError).
    probe_errors = (OSError, subprocess.SubprocessError, ValueError)

    # macOS: get chip info. stderr is discarded so that hosts where the
    # key does not exist do not print sysctl's error message.
    try:
        chip = subprocess.check_output(
            ["sysctl", "-n", "machdep.cpu.brand_string"],
            text=True,
            stderr=subprocess.DEVNULL,
        ).strip()
    except probe_errors:
        chip = ""
    info["cpu"] = chip or platform.processor()

    # RAM: try the macOS sysctl key first, then fall back to POSIX sysconf
    # so hosts without that key still report a value.
    mem_bytes = None
    try:
        mem_bytes = int(
            subprocess.check_output(
                ["sysctl", "-n", "hw.memsize"],
                text=True,
                stderr=subprocess.DEVNULL,
            ).strip()
        )
    except probe_errors:
        try:
            page_size = os.sysconf("SC_PAGE_SIZE")
            page_count = os.sysconf("SC_PHYS_PAGES")
        except (AttributeError, ValueError, OSError):
            # os.sysconf is missing or the names are unsupported here.
            pass
        else:
            if page_size > 0 and page_count > 0:
                mem_bytes = page_size * page_count
    info["ram_gb"] = (
        round(mem_bytes / (1024**3)) if mem_bytes is not None else None
    )

    # Backend versions: (report label, module to import, read __version__?).
    # mlx-whisper is only ever reported as "installed".
    probes = (
        ("faster-whisper", "faster_whisper", True),
        ("mlx-whisper", "mlx_whisper", False),
        ("mlx", "mlx.core", True),
        ("transformers", "transformers", True),
        ("torch", "torch", True),
    )
    versions = {}
    for label, module_name, read_version in probes:
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            continue
        if read_version:
            # Fall back to a marker instead of raising AttributeError when
            # a module does not expose __version__.
            versions[label] = getattr(module, "__version__", "installed")
        else:
            versions[label] = "installed"

    info["backend_versions"] = versions
    return info
|
|
|
|
|
|
def detect_combos(quick: bool = False) -> list:
    """Build the list of backend/policy/model combinations to benchmark.

    Each entry is a dict with the keys ``backend``, ``policy`` and ``model``.
    A backend contributes entries only if its probe import succeeds.

    Args:
        quick: When true, whisper backends are limited to the ``tiny`` and
            ``base`` models; otherwise ``small`` is included as well.

    Returns:
        The combos, in backend order: faster-whisper, mlx-whisper,
        voxtral-mlx, voxtral.
    """
    sizes = ["tiny", "base"] if quick else ["tiny", "base", "small"]
    whisper_policies = ("localagreement", "simulstreaming")

    def whisper_grid(backend_name):
        # For every model size, both policies, localagreement first.
        return [
            {"backend": backend_name, "policy": policy_name, "model": size}
            for size in sizes
            for policy_name in whisper_policies
        ]

    found = []

    # faster-whisper
    try:
        import faster_whisper  # noqa: F401
    except ImportError:
        pass
    else:
        found += whisper_grid("faster-whisper")

    # mlx-whisper
    try:
        import mlx_whisper  # noqa: F401
    except ImportError:
        pass
    else:
        found += whisper_grid("mlx-whisper")

    # voxtral-mlx (single model, single policy)
    try:
        from whisperlivekit.voxtral_mlx import VoxtralMLXModel  # noqa: F401
    except ImportError:
        pass
    else:
        found.append({"backend": "voxtral-mlx", "policy": "voxtral", "model": ""})

    # voxtral HF (single model, single policy)
    try:
        from transformers import AutoModelForSpeechSeq2Seq  # noqa: F401
    except ImportError:
        pass
    else:
        found.append({"backend": "voxtral", "policy": "voxtral", "model": ""})

    return found
|
|
|
|
|
|
def collect_audio_files() -> list:
    """Gather the audio files the benchmark runs against.

    Returns:
        Whatever ``discover_audio_files`` yields for ``AUDIO_TESTS_DIR``
        (if that directory exists), followed by the JFK sample when it is
        present on disk.
    """
    # audio_tests/ directory
    if AUDIO_TESTS_DIR.is_dir():
        collected = list(discover_audio_files(str(AUDIO_TESTS_DIR)))
    else:
        collected = []

    # JFK sample: use the cached copy when there is one, otherwise ask the
    # harness for it (presumably a download -- see download_sample_audio).
    cached_sample = CACHE_DIR / "jfk.wav"
    jfk = cached_sample if cached_sample.exists() else download_sample_audio()
    if jfk.exists():
        collected.append(jfk)

    return collected
|
|
|
|
|
|
async def run_single_combo(
    combo: dict, audio_files: list, vac: bool, lan: str, max_duration: float,
) -> list:
    """Run one backend+policy+model combo across all audio files.

    Args:
        combo: Dict with the keys ``backend``, ``policy`` and ``model``.
        audio_files: Paths of the audio files to process; each must expose
            a ``.name`` attribute.
        vac: Forwarded to ``create_engine`` and recorded on every result.
        lan: Language code forwarded to ``create_engine``. When it is
            ``"en"`` and a file name contains "french", that file is run
            with ``"fr"`` instead.
        max_duration: Files longer than this many seconds are skipped.

    Returns:
        One dict per processed file: ``asdict`` of the value returned by
        ``run_test`` plus ``model_size`` and ``vac`` keys. If an exception
        is raised, it is logged with a traceback and the results gathered
        up to that point are returned.
    """
    backend = combo["backend"]
    policy = combo["policy"]
    model = combo["model"]

    results = []
    try:
        engine = create_engine(
            backend=backend,
            model_size=model,
            lan=lan,
            vac=vac,
            policy=policy,
        )

        # Quiet noisy loggers
        for mod in (
            "whisperlivekit.audio_processor",
            "whisperlivekit.simul_whisper",
            "whisperlivekit.tokens_alignment",
            "whisperlivekit.simul_whisper.align_att_base",
            "whisperlivekit.simul_whisper.simul_whisper",
        ):
            logging.getLogger(mod).setLevel(logging.WARNING)

        for audio_path in audio_files:
            # Decode each file once and reuse the samples for both the
            # duration check and the run itself.
            audio = load_audio(str(audio_path))
            duration = len(audio) / SAMPLE_RATE
            if duration > max_duration:
                logger.info(f" Skipping {audio_path.name} ({duration:.0f}s > {max_duration:.0f}s)")
                continue

            # Files named "*french*" are run as French when the default
            # English language was requested.
            file_lan = lan
            if "french" in audio_path.name.lower() and lan == "en":
                file_lan = "fr"

            result = await run_test(
                engine, audio, chunk_ms=100, realtime=False,
                audio_file=audio_path.name, backend=backend,
                policy=policy, lan=file_lan,
            )
            # Tag with extra metadata
            result_dict = asdict(result)
            result_dict["model_size"] = model
            result_dict["vac"] = vac
            results.append(result_dict)

    except Exception as e:
        # Best effort: a failing combo must not abort the whole benchmark.
        logger.error(f" FAILED: {e}")
        import traceback
        traceback.print_exc()

    return results
|
|
|
|
|
|
async def run_full_benchmark(combos, audio_files, max_duration=60.0):
    """Run every combo twice -- VAC enabled, then VAC disabled.

    Args:
        combos: List of dicts with ``backend``, ``policy`` and ``model`` keys.
        audio_files: Audio files handed to ``run_single_combo`` for each run.
        max_duration: Forwarded to ``run_single_combo``.

    Returns:
        The concatenated result lists of all runs, in execution order.
    """
    total = len(combos) * 2  # x2 for VAC on/off
    banner = "=" * 70
    schedule = [(combo, vac) for combo in combos for vac in (True, False)]

    collected = []
    for idx, (combo, vac) in enumerate(schedule, start=1):
        model_part = f" / {combo['model']}" if combo["model"] else ""
        vac_part = "VAC=on" if vac else "VAC=off"
        desc = f"{combo['backend']} / {combo['policy']}{model_part} / {vac_part}"

        print(f"\n{banner}")
        print(f"[{idx}/{total}] {desc}")
        print(banner)

        run_results = await run_single_combo(
            combo, audio_files, vac=vac, lan="en", max_duration=max_duration,
        )
        collected.extend(run_results)

        # Free memory between combos
        gc.collect()

    return collected
|
|
|
|
|
|
def main():
    """Command-line entry point: run the benchmark and write a JSON report.

    Flags:
        --quick         Pass ``quick=True`` to ``detect_combos``.
        --json PATH     Output path (default ``benchmark_results.json``).
        --max-duration  Skip audio longer than this many seconds (default 60).

    The report holds the system info, a timestamp, the total wall-clock
    time, the number of runs and audio files, and the list of results.
    """
    parser = argparse.ArgumentParser(description="Run comprehensive WhisperLiveKit benchmark")
    parser.add_argument("--quick", action="store_true", help="Quick mode: fewer models and combos")
    parser.add_argument("--json", default="benchmark_results.json", dest="json_output", help="Output JSON path")
    parser.add_argument("--max-duration", type=float, default=60.0, help="Max audio duration in seconds")
    args = parser.parse_args()

    # Create the output directory before the benchmark starts, so a bad
    # path fails up front rather than after the whole run.
    output_path = Path(args.json_output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    system_info = get_system_info()
    combos = detect_combos(quick=args.quick)
    audio_files = collect_audio_files()

    print(f"System: {system_info.get('cpu', 'unknown')}, {system_info.get('ram_gb', '?')}GB RAM")
    print(f"Backends: {list(system_info['backend_versions'].keys())}")
    print(f"Combos to test: {len(combos)} x 2 (VAC on/off) = {len(combos)*2}")
    print(f"Audio files: {[f.name for f in audio_files]}")
    print()

    t0 = time.time()
    all_results = asyncio.run(
        run_full_benchmark(combos, audio_files, max_duration=args.max_duration)
    )
    total_time = time.time() - t0

    output = {
        "system_info": system_info,
        "benchmark_date": time.strftime("%Y-%m-%d %H:%M"),
        "total_benchmark_time_s": round(total_time, 1),
        "n_combos": len(combos) * 2,
        "n_audio_files": len(audio_files),
        "results": all_results,
    }

    # ensure_ascii=False keeps non-ASCII characters as-is, so the file must
    # be written as UTF-8 explicitly; the locale default encoding may be
    # unable to represent them.
    output_path.write_text(
        json.dumps(output, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"\nBenchmark complete in {total_time:.0f}s. Results: {args.json_output}")
|
|
|
|
|
|
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|