mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-04-26 16:45:46 +00:00
533 lines
20 KiB
Python
533 lines
20 KiB
Python
"""End-to-end pipeline tests using real models and real audio.

Run with: pytest tests/test_pipeline.py -v

Tests exercise the full pipeline through TestHarness + AudioPlayer:
audio feeding, play/pause/resume, silence detection, buffer inspection,
timing validation, and WER evaluation.

Each test is parameterized by backend so that adding a new backend
automatically gets test coverage. Tests use AudioPlayer for timeline
control — play segments, pause (inject silence), resume, cut.

Designed for AI agent automation: an agent can modify code, run these
tests, and validate transcription quality, timing, and streaming behavior.
"""
|
|
|
|
import logging
|
|
|
|
import pytest
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Backend detection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Backends whose optional dependencies import cleanly in this environment.
# Every test below is parametrized over this list, in insertion order.
AVAILABLE_BACKENDS = []

try:
    import mlx.core  # noqa: F401

    from whisperlivekit.voxtral_mlx.loader import load_voxtral_model  # noqa: F401
except ImportError:
    pass
else:
    AVAILABLE_BACKENDS.append("voxtral-mlx")

# "whisper" is registered unconditionally; no optional import guards it.
AVAILABLE_BACKENDS.append("whisper")

try:
    from transformers import VoxtralRealtimeForConditionalGeneration  # noqa: F401
except ImportError:
    pass
else:
    AVAILABLE_BACKENDS.append("voxtral-hf")

try:
    from qwen_asr import Qwen3ASRModel  # noqa: F401
except ImportError:
    pass
else:
    AVAILABLE_BACKENDS.append("qwen3")

# Keyword arguments handed to TestHarness for each backend name.
BACKEND_CONFIG = {
    "whisper": {
        "model_size": "tiny",
        "lan": "en",
    },
    "voxtral-mlx": {
        "backend": "voxtral-mlx",
        "lan": "en",
    },
    "voxtral-hf": {
        "backend": "voxtral",
        "lan": "en",
    },
    "qwen3": {
        "backend": "qwen3",
        "lan": "en",
    },
}

# Voxtral backends flush all words at once with proportionally-distributed
# timestamps. After a silence gap the speech line that follows may start
# before the silence segment, making the sequence non-monotonic. This is
# a known limitation of the batch-flush architecture, not a bug.
VOXTRAL_BACKENDS = {"voxtral-mlx", "voxtral-hf"}

# Every backend that batch-flushes (and may therefore report non-monotonic
# timestamps): the Voxtral pair plus qwen3.
BATCH_FLUSH_BACKENDS = VOXTRAL_BACKENDS | {"qwen3"}
|
|
|
|
|
|
def backend_kwargs(backend: str) -> dict:
    """Return the TestHarness keyword arguments configured for *backend*.

    A backend with no entry in ``BACKEND_CONFIG`` falls back to the tiny
    English configuration.
    """
    fallback = {"model_size": "tiny", "lan": "en"}
    return BACKEND_CONFIG.get(backend, fallback)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fixtures
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.fixture(scope="session")
def samples():
    """Fetch the test samples once per session, keyed by sample name."""
    from whisperlivekit.test_data import get_samples

    by_name = {}
    for sample in get_samples():
        by_name[sample.name] = sample
    return by_name
|
|
|
|
|
|
@pytest.fixture(scope="session")
def short_sample(samples):
    """Return the sample registered under the name ``librispeech_short``."""
    sample = samples["librispeech_short"]
    return sample
|
|
|
|
|
|
@pytest.fixture(scope="session")
def medium_sample(samples):
    """Return the sample registered under the name ``librispeech_1``."""
    sample = samples["librispeech_1"]
    return sample
|
|
|
|
|
|
@pytest.fixture(scope="session")
def meeting_sample(samples):
    """Return the sample registered under the name ``ami_meeting``."""
    sample = samples["ami_meeting"]
    return sample
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 1. Transcription Quality
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.parametrize("backend", AVAILABLE_BACKENDS)
@pytest.mark.asyncio
async def test_transcription_quality(backend, short_sample):
    """Transcribe the short clip: expect text, no timing errors, WER below 50%."""
    from whisperlivekit.test_harness import TestHarness

    harness_args = backend_kwargs(backend)
    async with TestHarness(**harness_args) as harness:
        # Push the whole clip, give the pipeline time to settle, then finalize.
        await harness.feed(short_sample.path, speed=0)
        await harness.drain(5.0)
        outcome = await harness.finish(timeout=60)

        assert outcome.text.strip(), f"No text produced for {backend}"

        timing_problems = outcome.timing_errors()
        assert not timing_problems, f"Timing errors: {timing_problems}"

        # Word error rate against the sample's reference transcript.
        error_rate = outcome.wer(short_sample.reference)
        assert error_rate < 0.50, f"WER too high for {backend}: {error_rate:.2%}"

        logger.info("[%s] WER=%.2f%% text='%s'", backend, error_rate * 100, outcome.text[:80])
|
|
|
|
|
|
@pytest.mark.parametrize("backend", AVAILABLE_BACKENDS)
@pytest.mark.asyncio
async def test_medium_clip_timing_spans_audio(backend, medium_sample):
    """Transcribe the medium clip and check the last speech entry ends past its midpoint.

    Also requires non-empty text, no timing errors and a WER below 50%.
    """
    from whisperlivekit.test_harness import TestHarness

    harness_args = backend_kwargs(backend)
    async with TestHarness(**harness_args) as harness:
        await harness.feed(medium_sample.path, speed=0, chunk_duration=1.0)
        await harness.drain(5.0)
        outcome = await harness.finish(timeout=60)

        assert outcome.text.strip(), f"No text for {backend}"
        assert not outcome.timing_errors(), f"Timing errors: {outcome.timing_errors()}"

        error_rate = outcome.wer(medium_sample.reference)
        assert error_rate < 0.50, f"WER too high: {error_rate:.2%}"

        # Keep only entries that are not tagged speaker == -2 (the tag these
        # tests treat as a silence segment).
        spoken = []
        for entry in outcome.timestamps:
            if entry["speaker"] != -2:
                spoken.append(entry)

        # The coverage check only applies when speech entries exist.
        if spoken:
            last_end = spoken[-1]["end"]
            assert last_end > medium_sample.duration * 0.5, (
                f"Speech ends at {last_end:.1f}s but audio is {medium_sample.duration:.1f}s"
            )

        logger.info("[%s] medium: WER=%.2f%% lines=%d", backend, error_rate * 100, len(outcome.lines))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 2. Streaming Behavior
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.parametrize("backend", AVAILABLE_BACKENDS)
@pytest.mark.asyncio
async def test_text_appears_progressively(backend, medium_sample):
    """Text must show up in update callbacks while streaming, not only at finish."""
    from whisperlivekit.test_harness import TestHarness

    # One entry per update callback: the text visible at that moment.
    updates = []

    async with TestHarness(**backend_kwargs(backend)) as harness:
        harness.on_update(lambda state: updates.append(state.text))
        await harness.feed(medium_sample.path, speed=2.0, chunk_duration=0.5)
        await harness.drain(5.0)
        await harness.finish(timeout=60)

        with_text = [text for text in updates if text.strip()]
        assert len(with_text) >= 2, (
            f"Expected progressive updates for {backend}, got {len(with_text)} non-empty"
        )

        # With enough updates, the final text must be longer than the one
        # seen halfway through the non-empty updates.
        if len(with_text) >= 3:
            midpoint = len(with_text) // 2
            assert len(with_text[-1]) > len(with_text[midpoint]), (
                f"Text not growing during streaming for {backend}"
            )

        logger.info("[%s] streaming: %d updates, %d non-empty", backend, len(updates), len(with_text))
|
|
|
|
|
|
@pytest.mark.parametrize("backend", AVAILABLE_BACKENDS)
@pytest.mark.asyncio
async def test_buffer_lifecycle(backend, medium_sample):
    """After finish() the buffer is empty and more than 5 words are committed."""
    from whisperlivekit.test_harness import TestHarness

    harness_args = backend_kwargs(backend)
    async with TestHarness(**harness_args) as harness:
        await harness.feed(medium_sample.path, speed=0, chunk_duration=1.0)
        await harness.drain(5.0)
        outcome = await harness.finish(timeout=60)

        # Nothing may remain in the uncommitted buffer once finish() returns.
        assert not outcome.buffer_transcription.strip(), (
            f"Buffer not empty after finish for {backend}: '{outcome.buffer_transcription}'"
        )

        # The committed transcript must carry a meaningful number of words.
        assert outcome.committed_word_count > 5, (
            f"Too few committed words for {backend}: {outcome.committed_word_count}"
        )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 3. Play / Pause / Resume
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.parametrize("backend", AVAILABLE_BACKENDS)
@pytest.mark.asyncio
async def test_silence_flushes_all_words(backend, medium_sample):
    """Silence must flush pending words immediately — none held back for next speech.

    This catches a critical bug where the last few words only appeared when
    the user started speaking again, instead of being committed at silence time.
    Root cause: non-blocking streamer drain racing with the generate thread.

    The enforced threshold is deliberately lenient: when finish() reports
    more than 3 committed words, at least 50% of them must already have been
    committed once the injected silence was processed.
    """
    # Imported locally, like TestHarness, so the block is self-contained.
    import asyncio

    from whisperlivekit.test_harness import TestHarness

    async with TestHarness(**backend_kwargs(backend)) as h:
        # Feed all audio and let pipeline fully process
        await h.feed(medium_sample.path, speed=0, chunk_duration=1.0)
        await h.drain(8.0)

        # Inject silence → triggers start_silence() which must flush everything
        await h.pause(7.0, speed=0)

        # Wait for start_silence() to complete (may block while generate thread
        # catches up) AND for results_formatter to turn tokens into lines.
        try:
            await h.wait_for(
                lambda s: s.has_silence and s.committed_word_count > 0,
                timeout=30,
            )
        except (TimeoutError, asyncio.TimeoutError):
            # Best-effort wait: the assertions below decide pass/fail.
            # asyncio.TimeoutError is a distinct class before Python 3.11, so
            # both are caught; the timeout is logged instead of hidden.
            logger.warning(
                "[%s] timed out waiting for silence + committed words; "
                "continuing with current state",
                backend,
            )
        await h.drain(2.0)

        # Capture state AFTER silence processing, BEFORE finish()
        words_at_silence = h.state.committed_word_count
        buffer_at_silence = h.state.buffer_transcription.strip()

        # finish() joins the generate thread and flushes any stragglers
        result = await h.finish(timeout=60)
        words_at_finish = result.committed_word_count

        # Key assertion: silence must have committed most words.
        # Some backends (voxtral-hf) produce extra words from right-padding
        # at finish(), and MPS inference may leave some words in the pipeline.
        # At least 50% of final words must be committed at silence time.
        # The ratio is skipped for very short transcripts (<= 3 words).
        if words_at_finish > 3:
            flushed_pct = words_at_silence / words_at_finish
            assert flushed_pct >= 0.50, (
                f"[{backend}] Only {flushed_pct:.0%} of words flushed at silence. "
                f"At silence: {words_at_silence}, at finish: {words_at_finish}. "
                f"Buffer at silence: '{buffer_at_silence}'"
            )

        logger.info(
            "[%s] silence flush: at_silence=%d, at_finish=%d, buffer='%s'",
            backend, words_at_silence, words_at_finish, buffer_at_silence[:40],
        )
|
|
|
|
|
|
@pytest.mark.parametrize("backend", AVAILABLE_BACKENDS)
@pytest.mark.asyncio
async def test_play_pause_resume(backend, medium_sample):
    """Play 3s, pause 7s, play 5s more; expect text, a silence segment and valid timing."""
    from whisperlivekit.test_harness import TestHarness

    harness_args = backend_kwargs(backend)
    async with TestHarness(**harness_args) as harness:
        player = harness.load_audio(medium_sample)

        # Opening 3 seconds of speech.
        await player.play(3.0, speed=0)
        await harness.drain(3.0)

        # 7s pause (above MIN_DURATION_REAL_SILENCE=5).
        await harness.pause(7.0, speed=0)
        await harness.drain(3.0)

        # Another 5 seconds of speech after the pause.
        await player.play(5.0, speed=0)
        await harness.drain(3.0)

        outcome = await harness.finish(timeout=60)

        # Text and silence must both be present.
        assert outcome.text.strip(), f"No text for {backend}"
        assert outcome.has_silence, f"No silence detected for {backend}"

        # Per-line timing sanity (start <= end for each line).
        assert outcome.timing_valid, f"Invalid timing: {outcome.timing_errors()}"

        # Ordering across lines is only enforced for backends that do not
        # batch-flush: those can emit a silence segment ahead of the speech
        # line it precedes.
        enforce_order = backend not in BATCH_FLUSH_BACKENDS
        if enforce_order:
            assert outcome.timing_monotonic, f"Non-monotonic: {outcome.timing_errors()}"

        # One or more silence segments must have been recorded.
        assert len(outcome.silence_segments) >= 1

        logger.info(
            "[%s] play/pause/resume: %d lines, %d silence segs",
            backend, len(outcome.lines), len(outcome.silence_segments),
        )
|
|
|
|
|
|
@pytest.mark.parametrize("backend", AVAILABLE_BACKENDS)
@pytest.mark.asyncio
async def test_multiple_pauses(backend, medium_sample):
    """Two play/pause cycles followed by the remaining audio yield >= 2 silence segments."""
    from whisperlivekit.test_harness import TestHarness

    harness_args = backend_kwargs(backend)
    async with TestHarness(**harness_args) as harness:
        player = harness.load_audio(medium_sample)

        # Two identical cycles: 2s of speech, then a 6s pause.
        for _cycle in range(2):
            await player.play(2.0, speed=0)
            await harness.drain(2.0)
            await harness.pause(6.0, speed=0)
            await harness.drain(2.0)

        # Play whatever is left of the clip.
        await player.play(speed=0)
        await harness.drain(3.0)

        outcome = await harness.finish(timeout=60)

        assert outcome.has_silence, f"No silence for {backend}"
        assert len(outcome.silence_segments) >= 2, (
            f"Expected >= 2 silence segments, got {len(outcome.silence_segments)} for {backend}"
        )

        assert outcome.timing_valid, f"Invalid timing: {outcome.timing_errors()}"

        # Batch-flush backends are exempt from the ordering check.
        enforce_order = backend not in BATCH_FLUSH_BACKENDS
        if enforce_order:
            assert outcome.timing_monotonic, f"Non-monotonic: {outcome.timing_errors()}"

        logger.info(
            "[%s] multiple pauses: %d silence segs, %d speech lines",
            backend, len(outcome.silence_segments), len(outcome.speech_lines),
        )
|
|
|
|
|
|
@pytest.mark.parametrize("backend", AVAILABLE_BACKENDS)
@pytest.mark.asyncio
async def test_short_pause_no_silence(backend, medium_sample):
    """A 2s pause between speech segments must not be reported as silence."""
    from whisperlivekit.test_harness import TestHarness

    harness_args = backend_kwargs(backend)
    async with TestHarness(**harness_args) as harness:
        player = harness.load_audio(medium_sample)

        # First stretch of speech.
        await player.play(4.0, speed=0)
        await harness.drain(2.0)

        # 2s pause — well below MIN_DURATION_REAL_SILENCE=5.
        await harness.pause(2.0, speed=0)
        await harness.drain(1.0)

        # Resuming speech triggers _end_silence with duration=2s < 5s threshold.
        await player.play(4.0, speed=0)
        await harness.drain(3.0)

        outcome = await harness.finish(timeout=60)

        # No silence segment is expected for such a short gap.
        assert not outcome.has_silence, (
            f"Silence detected for {backend} on 2s pause (should be below 5s threshold)"
        )

        logger.info("[%s] short pause: no silence segment (correct)", backend)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 4. Cutoff
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.parametrize("backend", AVAILABLE_BACKENDS)
@pytest.mark.asyncio
async def test_abrupt_cutoff(backend, medium_sample):
    """Cutting the stream after 4s must not crash and must keep the partial text."""
    from whisperlivekit.test_harness import TestHarness

    # Batch-flush backends get a longer settle time before the cut and a
    # longer timeout for the cut itself.
    if backend in BATCH_FLUSH_BACKENDS:
        settle_seconds = 8.0
    else:
        settle_seconds = 3.0
    if backend in BATCH_FLUSH_BACKENDS:
        cut_timeout = 15
    else:
        cut_timeout = 10

    async with TestHarness(**backend_kwargs(backend)) as harness:
        player = harness.load_audio(medium_sample)

        # Only the first 4 seconds of the clip are played.
        await player.play(4.0, speed=0)
        await harness.drain(settle_seconds)

        # Abrupt end of stream.
        outcome = await harness.cut(timeout=cut_timeout)

        # Some text, even partial, must survive the cut.
        assert outcome.text.strip(), f"No text after cutoff for {backend}"

        # Per-line timing must still be valid; ordering is not checked here.
        assert outcome.timing_valid, f"Invalid timing after cutoff: {outcome.timing_errors()}"

        logger.info("[%s] cutoff at 4s: text='%s'", backend, outcome.text[:60])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 5. Timing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.parametrize("backend", AVAILABLE_BACKENDS)
@pytest.mark.asyncio
async def test_timing_precision_and_monotonicity(backend, medium_sample):
    """Line timestamps carry a fractional part and pass the validity and ordering checks."""
    from whisperlivekit.test_harness import TestHarness

    harness_args = backend_kwargs(backend)
    async with TestHarness(**harness_args) as harness:
        await harness.feed(medium_sample.path, speed=0, chunk_duration=1.0)
        await harness.drain(5.0)
        # Trailing silence so timing is also checked across a silence boundary.
        await harness.silence(7.0, speed=0)
        await harness.drain(3.0)
        outcome = await harness.finish(timeout=60)

        # Sub-second precision (format is "H:MM:SS.cc"): one "start" or "end"
        # value containing a "." is enough.
        has_subsecond = False
        for line in outcome.lines:
            if "." in line.get("start", "") or "." in line.get("end", ""):
                has_subsecond = True
                break
        assert has_subsecond, f"No sub-second precision for {backend}: {outcome.lines}"

        assert outcome.timing_valid, f"Invalid timing: {outcome.timing_errors()}"
        # NOTE(review): unlike the play/pause tests, ordering is asserted for
        # every backend here, including BATCH_FLUSH_BACKENDS — presumably fine
        # because no speech follows the silence; confirm.
        assert outcome.timing_monotonic, f"Non-monotonic: {outcome.timing_errors()}"
|
|
|
|
|
|
@pytest.mark.parametrize("backend", AVAILABLE_BACKENDS)
@pytest.mark.asyncio
async def test_silence_timing_reflects_pause(backend, short_sample):
    """Each reported silence segment lasts more than 30% of the injected 8s pause."""
    from whisperlivekit.test_harness import TestHarness

    pause_duration = 8.0

    harness_args = backend_kwargs(backend)
    async with TestHarness(**harness_args) as harness:
        await harness.feed(short_sample.path, speed=0)
        await harness.drain(3.0)
        await harness.pause(pause_duration, speed=0)
        await harness.drain(3.0)
        outcome = await harness.finish(timeout=60)

        assert outcome.has_silence, f"No silence detected for {backend}"

        # Only entries tagged speaker == -2 are checked as silence segments.
        for entry in outcome.timestamps:
            if entry["speaker"] != -2:
                continue
            span = entry["end"] - entry["start"]
            # Generous tolerance (VAC detection + processing lag).
            assert span > pause_duration * 0.3, (
                f"Silence too short for {backend}: {span:.1f}s "
                f"vs {pause_duration}s pause"
            )

        logger.info("[%s] silence timing OK", backend)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 6. State Inspection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.parametrize("backend", AVAILABLE_BACKENDS)
@pytest.mark.asyncio
async def test_snapshot_history(backend, medium_sample):
    """History holds several entries and a later snapshot never has less text."""
    from whisperlivekit.test_harness import TestHarness

    harness_args = backend_kwargs(backend)
    async with TestHarness(**harness_args) as harness:
        await harness.feed(medium_sample.path, speed=2.0, chunk_duration=0.5)
        await harness.drain(5.0)
        await harness.finish(timeout=60)

        # At least two history entries are expected.
        assert len(harness.history) >= 2, f"Too few history entries: {len(harness.history)}"

        # Compare a snapshot near 2.0s with one at the clip's full duration.
        # The comparison only runs when both exist and are at distinct,
        # increasing audio positions.
        early = harness.snapshot_at(2.0)
        late = harness.snapshot_at(medium_sample.duration)
        if early and late:
            if early.audio_position < late.audio_position:
                assert len(late.text) >= len(early.text), (
                    f"Late snapshot has less text than early for {backend}"
                )

        logger.info("[%s] snapshots: %d history entries", backend, len(harness.history))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 7. Metrics
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.parametrize("backend", AVAILABLE_BACKENDS)
@pytest.mark.asyncio
async def test_metrics_collected(backend, short_sample):
    """Processing a clip populates the harness metrics counters."""
    from whisperlivekit.test_harness import TestHarness

    harness_args = backend_kwargs(backend)
    async with TestHarness(**harness_args) as harness:
        await harness.feed(short_sample.path, speed=0)
        await harness.drain(3.0)
        await harness.finish(timeout=60)

        stats = harness.metrics
        assert stats is not None, "Metrics not available"

        # Each counter must show that work actually happened.
        assert stats.n_chunks_received > 0, "No chunks recorded"
        assert stats.n_transcription_calls > 0, "No transcription calls"
        assert len(stats.transcription_durations) > 0, "No transcription durations"
        assert stats.n_tokens_produced > 0, "No tokens produced"

        logger.info(
            "[%s] metrics: chunks=%d calls=%d tokens=%d avg_lat=%.1fms",
            backend, stats.n_chunks_received, stats.n_transcription_calls,
            stats.n_tokens_produced, stats.avg_latency_ms,
        )
|