Files
WhisperLiveKit/audio_tests/generate_transcripts.py
Quentin Fuxa 83d0fa3fac feat: benchmark suite with WER, timestamp accuracy, cross-backend comparison
- Extend test_backend_offline.py with WER and timestamp accuracy metrics
  computed via whisperlivekit.metrics against ground truth transcripts.
- Add --benchmark flag to auto-detect all installed backends and run
  each (backend, policy) combination in sequence.
- Add --policy flag to override the streaming policy.
- Add detect_available_backends() probing faster-whisper, mlx-whisper,
  voxtral-mlx, voxtral (HF), and openai-whisper.
- Add print_cross_backend_comparison() with per-combo averages.
- Add run_benchmark.py for comprehensive multi-model benchmarking.
- Add BENCHMARK.md with full results on Apple M4: speed, WER,
  timestamp accuracy, VAC impact, and recommendations.
- Add ground truth transcript JSON files for all audio test files.
2026-02-22 23:27:50 +01:00

58 lines
1.7 KiB
Python

#!/usr/bin/env python3
"""Generate word-level timestamped transcripts using faster-whisper (offline).
Produces one JSON file per audio with: [{word, start, end}, ...]
"""
import json
import os
from faster_whisper import WhisperModel
AUDIO_DIR = os.path.dirname(os.path.abspath(__file__))
FILES = [
("00_00_07_english_1_speaker.wav", "en"),
("00_00_16_french_1_speaker.wav", "fr"),
("00_00_30_english_3_speakers.wav", "en"),
]
def main():
print("Loading faster-whisper model (base, cpu, float32)...")
model = WhisperModel("base", device="cpu", compute_type="float32")
for filename, lang in FILES:
audio_path = os.path.join(AUDIO_DIR, filename)
out_path = os.path.join(
AUDIO_DIR, filename.rsplit(".", 1)[0] + ".transcript.json"
)
print(f"\n{'='*60}")
print(f"Transcribing: {filename} (language={lang})")
print(f"{'='*60}")
segments, info = model.transcribe(
audio_path, word_timestamps=True, language=lang
)
words = []
for segment in segments:
if segment.words:
for w in segment.words:
words.append({
"word": w.word.strip(),
"start": round(w.start, 3),
"end": round(w.end, 3),
})
print(f" {w.start:6.2f} - {w.end:6.2f} {w.word.strip()}")
with open(out_path, "w", encoding="utf-8") as f:
json.dump(words, f, indent=2, ensure_ascii=False)
print(f"\n -> {len(words)} words written to {os.path.basename(out_path)}")
print("\nDone.")
if __name__ == "__main__":
main()