Files
WhisperLiveKit/whisperlivekit/metrics_collector.py
Quentin Fuxa 4b2377c243 fix: correct false auto-detect claim, median bug, RTF inflation
- BENCHMARK.md: whisper also supports --language auto, voxtral is not
  the only one. Fixed mlx-whisper speed comparison (LA is actually
  faster than SS for mlx-whisper, not comparable).
- metrics.py: median calculation was wrong for even-length lists
  (took upper middle instead of averaging the two middle values).
- metrics_collector.py: RTF was inflated because log_summary() used
  wall-clock elapsed time instead of sum of actual ASR call durations.
- README.md: clarified that whisper also supports auto language
  detection, voxtral just does it better.
- Added 2 new median tests (even + odd length).
2026-02-22 23:38:04 +01:00

85 lines
3.0 KiB
Python

"""Lightweight runtime metrics for AudioProcessor sessions.
Zero external dependencies. Negligible overhead when not queried —
just integer increments and list appends during normal operation.
"""
import logging
import time
from dataclasses import dataclass, field
from typing import Dict, List
logger = logging.getLogger(__name__)
@dataclass
class SessionMetrics:
"""Per-session metrics collected by AudioProcessor."""
session_start: float = 0.0
total_audio_duration_s: float = 0.0
total_processing_time_s: float = 0.0
# Chunk / call counters
n_chunks_received: int = 0
n_transcription_calls: int = 0
n_tokens_produced: int = 0
n_responses_sent: int = 0
# Per-call ASR latency (seconds)
transcription_durations: List[float] = field(default_factory=list)
# Silence
n_silence_events: int = 0
total_silence_duration_s: float = 0.0
# --- Computed properties ---
@property
def rtf(self) -> float:
"""Real-time factor: processing_time / audio_duration."""
if self.total_audio_duration_s <= 0:
return 0.0
return self.total_processing_time_s / self.total_audio_duration_s
@property
def avg_latency_ms(self) -> float:
"""Average per-call ASR latency in milliseconds."""
if not self.transcription_durations:
return 0.0
return (sum(self.transcription_durations) / len(self.transcription_durations)) * 1000
@property
def p95_latency_ms(self) -> float:
"""95th percentile per-call ASR latency in milliseconds."""
if not self.transcription_durations:
return 0.0
sorted_d = sorted(self.transcription_durations)
idx = int(len(sorted_d) * 0.95)
idx = min(idx, len(sorted_d) - 1)
return sorted_d[idx] * 1000
def to_dict(self) -> Dict:
"""Serialize to a plain dict (JSON-safe)."""
return {
"session_start": self.session_start,
"total_audio_duration_s": round(self.total_audio_duration_s, 3),
"total_processing_time_s": round(self.total_processing_time_s, 3),
"rtf": round(self.rtf, 3),
"n_chunks_received": self.n_chunks_received,
"n_transcription_calls": self.n_transcription_calls,
"n_tokens_produced": self.n_tokens_produced,
"n_responses_sent": self.n_responses_sent,
"avg_latency_ms": round(self.avg_latency_ms, 2),
"p95_latency_ms": round(self.p95_latency_ms, 2),
"n_silence_events": self.n_silence_events,
"total_silence_duration_s": round(self.total_silence_duration_s, 3),
}
def log_summary(self) -> None:
"""Emit a structured log line summarising the session."""
self.total_processing_time_s = sum(self.transcription_durations)
d = self.to_dict()
d["session_elapsed_s"] = round(time.time() - self.session_start, 3) if self.session_start else 0
logger.info(f"SESSION_METRICS {d}")