mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-03-21 16:40:35 +00:00
217 lines
12 KiB
Python
217 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""Generate the architecture.png diagram for WhisperLiveKit README."""
|
|
|
|
import matplotlib
|
|
matplotlib.use("Agg")
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.patches as mpatches
|
|
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
|
|
|
|
# ── Colours ──
# Dark-theme palette; every drawing call below picks its colours from here.
C_BG = "#1a1a2e"       # figure background, reused when saving the PNG
C_PANEL = "#16213e"    # default fill for section_box panels
C_PANEL2 = "#0f3460"   # alternate panel shade (appears unused in this script)
C_ACCENT = "#e94560"   # section titles and TranscriptionEngine highlights
C_GREEN = "#4ecca3"    # FastAPI server and Qwen3 elements
C_ORANGE = "#f5a623"   # streaming policies and the benchmark box
C_BLUE = "#4a9eff"     # audio-processor pipeline
C_PURPLE = "#b06af2"   # Whisper family
C_PINK = "#ff6b9d"     # Voxtral
C_YELLOW = "#f0e68c"   # appears unused in this script
C_TEXT = "#e8e8e8"     # primary label text
C_TEXTDIM = "#a0a0b0"  # secondary text and default arrow colour
C_BOX_BG = "#1e2d4a"   # default fill for box()
C_BOX_BG2 = "#2a1a3a"  # fill of the Whisper section panel
C_BOX_BG3 = "#1a3a2a"  # fill of the Qwen3 section panel
C_BORDER = "#3a4a6a"   # default outline colour

# One 20 x 12 inch figure with a single axes. The axes use a 0..20 by 0..12
# data range with equal aspect, and every coordinate passed to the helpers
# below is expressed in those data units.
fig, ax = plt.subplots(figsize=(20, 12), facecolor=C_BG)
ax.set(xlim=(0, 20), ylim=(0, 12))
ax.set_aspect("equal")
ax.set_axis_off()  # no ticks, spines or axis background
fig.subplots_adjust(bottom=0.01, top=0.97, left=0.01, right=0.99)
|
|
|
|
|
|
def box(x, y, w, h, label, color=C_BORDER, bg=C_BOX_BG, fontsize=8, bold=False,
        text_color=C_TEXT, radius=0.15):
    """Draw a rounded, labelled rectangle on the module-level ``ax``.

    Args:
        x, y: Lower-left corner of the rectangle, in data coordinates.
        w, h: Width and height of the rectangle.
        label: Text centred inside the rectangle (may contain newlines).
        color: Edge colour of the rectangle.
        bg: Fill colour of the rectangle.
        fontsize: Label font size.
        bold: When truthy the label is drawn with a bold weight.
        text_color: Colour of the label text.
        radius: ``rounding_size`` used in the patch's box style.

    Returns:
        The ``FancyBboxPatch`` that was added to the axes.
    """
    outline = FancyBboxPatch(
        (x, y), w, h,
        boxstyle=f"round,pad=0.05,rounding_size={radius}",
        facecolor=bg,
        edgecolor=color,
        linewidth=1.2,
    )
    ax.add_patch(outline)

    if bold:
        weight = "bold"
    else:
        weight = "normal"

    # Anchor the label on the geometric centre of the rectangle.
    ax.text(
        x + w/2, y + h/2, label,
        ha="center", va="center",
        fontsize=fontsize,
        color=text_color,
        fontweight=weight,
        family="monospace",
    )
    return outline
|
|
|
|
|
|
def arrow(x1, y1, x2, y2, color=C_TEXTDIM, style="->", lw=1.2):
    """Draw an arrow from ``(x1, y1)`` to ``(x2, y2)`` on the module-level ``ax``.

    The arrow is an annotation with empty text, so only the arrow itself is
    drawn.

    Args:
        x1, y1: Tail of the arrow, in data coordinates.
        x2, y2: Head of the arrow, in data coordinates.
        color: Arrow colour.
        style: Matplotlib ``arrowstyle`` string.
        lw: Line width of the arrow.
    """
    arrow_look = {"arrowstyle": style, "color": color, "lw": lw}
    ax.annotate("", xy=(x2, y2), xytext=(x1, y1), arrowprops=arrow_look)
|
|
|
|
|
|
def section_box(x, y, w, h, title, bg=C_PANEL, border=C_BORDER, title_color=C_ACCENT):
    """Draw a titled background panel on the module-level ``ax``.

    Args:
        x, y: Lower-left corner of the panel, in data coordinates.
        w, h: Width and height of the panel.
        title: Heading written in the panel's top-left corner.
        bg: Fill colour of the panel.
        border: Edge colour of the panel.
        title_color: Colour of the heading text.
    """
    ax.add_patch(
        FancyBboxPatch(
            xy=(x, y), width=w, height=h,
            boxstyle="round,pad=0.05,rounding_size=0.2",
            facecolor=bg,
            edgecolor=border,
            linewidth=1.5,
        )
    )
    # The heading is top-left aligned, inset slightly from the panel corner.
    ax.text(
        x + 0.15, y + h - 0.25, title,
        ha="left", va="top",
        fontsize=9,
        color=title_color,
        fontweight="bold",
        family="monospace",
    )
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════
# Title
# ═══════════════════════════════════════════════════════════════════
# Both header lines are centred on x=10 and share the monospace family;
# only size, colour and weight differ.
_header_common = {"ha": "center", "va": "center", "family": "monospace"}

ax.text(
    10, 11.7, "WhisperLiveKit Architecture",
    fontsize=16, color=C_TEXT, fontweight="bold",
    **_header_common,
)

_cli_summary = (
    "CLI commands: serve | listen | run | transcribe | bench | "
    "diagnose | models | pull | rm | check"
)
ax.text(10, 11.35, _cli_summary, fontsize=7, color=C_TEXTDIM, **_header_common)
|
|
|
|
# ═══════════════════════════════════════════════════════════════════
# Left: Client / Server
# ═══════════════════════════════════════════════════════════════════
section_box(0.1, 7.0, 3.5, 4.0, title="FastAPI Server", border=C_GREEN)

# Server-side boxes, listed top to bottom: (x, y, w, h, label, bold).
for _bx, _by, _bw, _bh, _text, _strong in (
    (0.3, 10.0, 1.5, 0.5, "Web UI\nHTML + JS", False),
    (2.0, 10.0, 1.4, 0.5, "Frontend\n(optional)", False),
    (0.3, 9.1, 3.1, 0.6, "WebSocket /asr • /v1/listen", True),
    (0.3, 8.3, 3.1, 0.5, "REST /v1/audio/transcriptions", False),
    (0.3, 7.4, 3.1, 0.5, "Health • /v1/models", False),
):
    box(_bx, _by, _bw, _bh, _text, color=C_GREEN, fontsize=7, bold=_strong)

# Clients
ax.text(0.2, 6.5, "Clients:", fontsize=7, color=C_TEXTDIM, family="monospace")

# One small box per client, laid out left to right at a 0.9 pitch.
_client_names = ("Browser", "OpenAI SDK", "Deepgram SDK", "TestHarness")
for _slot, _client in enumerate(_client_names):
    box(
        0.3 + _slot * 0.9, 5.8, 0.8, 0.5, _client,
        fontsize=5.5, bg="#1a2a1a", color="#3a6a3a",
    )
|
|
|
|
# ═══════════════════════════════════════════════════════════════════
# Centre: Audio Processor (per-session pipeline)
# ═══════════════════════════════════════════════════════════════════
section_box(4.0, 5.5, 5.5, 5.5, title="Audio Processor (per session)", border=C_BLUE)

_stage_look = {"color": C_BLUE, "bg": "#1a2a4a"}
_policy_look = {"color": C_ORANGE, "bg": "#2a2a1a", "fontsize": 7}

# Each stage is drawn together with the arrow that feeds it, so the call
# order (and therefore the stacking order) stays box, arrow, box, arrow, ...
box(4.3, 10.0, 2.0, 0.6, "FFmpeg\nDecoding", bold=True, **_stage_look)
arrow(3.6, 9.4, 4.3, 10.2, color=C_GREEN)      # server panel edge → FFmpeg

box(6.6, 10.0, 2.6, 0.6, "Silero VAD\nspeech / silence", **_stage_look)
arrow(6.3, 10.3, 6.6, 10.3, color=C_BLUE)      # FFmpeg → VAD

box(
    4.3, 8.8, 4.9, 0.8,
    "SessionASRProxy\nthread-safe per-session language override",
    color=C_BLUE, fontsize=7,
)
arrow(6.0, 10.0, 6.0, 9.6, color=C_BLUE)       # FFmpeg row down to the proxy

box(4.3, 7.6, 2.3, 0.8, "DiffTracker\n(opt-in ?mode=diff)", color="#5a5a7a", fontsize=7)
box(6.9, 7.6, 2.3, 0.8, "Result Formatter\n→ FrontData.to_dict()", color=C_BLUE, fontsize=7)

# Streaming policies
ax.text(
    4.3, 7.1, "Streaming policies:",
    fontsize=7, color=C_ORANGE, fontweight="bold", family="monospace",
)
box(4.3, 6.2, 2.3, 0.7, "LocalAgreement\nHypothesisBuffer", **_policy_look)
box(6.9, 6.2, 2.3, 0.7, "SimulStreaming\nAlignAtt (Whisper)", **_policy_look)
|
|
|
|
# ═══════════════════════════════════════════════════════════════════
# Right: TranscriptionEngine (singleton)
# ═══════════════════════════════════════════════════════════════════
# Outer frame; the backend and shared-component panels are drawn inside it.
section_box(
    10.0, 0.3, 9.8, 10.7,
    title="TranscriptionEngine (singleton — shared across sessions)",
    border=C_ACCENT,
    bg="#1e1520",
)

ax.text(
    10.2, 10.5, "6 ASR Backends",
    fontsize=9, color=C_ACCENT, fontweight="bold", family="monospace",
)
|
|
|
|
# ── Whisper backends ──
section_box(10.2, 7.3, 4.5, 3.0, title="Whisper Family (chunk-based)",
            border=C_PURPLE, bg=C_BOX_BG2)

# Backend boxes share one row: (x, width, label, bold).
for _bx, _bw, _text, _strong in (
    (10.4, 1.3, "Faster\nWhisper", True),
    (11.9, 1.3, "MLX\nWhisper", True),
    (13.4, 1.1, "OpenAI\nWhisper", False),
):
    box(_bx, 9.2, _bw, 0.6, _text,
        color=C_PURPLE, bg="#2a1a3a", fontsize=7, bold=_strong)

# Descriptive notes under the boxes: (y, text, font size, colour).
for _ty, _note, _size, _tint in (
    (8.7, "PCM → Encoder → Decoder → Tokens", 6.5, C_TEXTDIM),
    (8.3, "Uses LocalAgreement or SimulStreaming (AlignAtt)", 6, C_PURPLE),
    (7.9, "Language detection • Buffer trimming", 6, C_TEXTDIM),
    (7.5, "CPU / CUDA / MLX", 6, C_TEXTDIM),
):
    ax.text(10.4, _ty, _note, fontsize=_size, color=_tint, family="monospace")
|
|
|
|
# ── Voxtral backends ──
section_box(10.2, 3.8, 4.5, 3.2, title="Voxtral (native streaming)",
            border=C_PINK, bg="#2a1520")

# Two backend boxes on one row: (x, width, label).
for _bx, _bw, _text in (
    (10.4, 1.8, "Voxtral MLX\n(Apple Silicon)"),
    (12.5, 2.0, "Voxtral HF\n(CUDA/MPS/CPU)"),
):
    box(_bx, 5.9, _bw, 0.6, _text,
        color=C_PINK, bg="#2a1520", fontsize=7, bold=True)

# Descriptive notes under the boxes: (y, text, font size, colour).
for _ty, _note, _size, _tint in (
    (5.4, "Incremental encoder → Autoregressive decoder", 6.5, C_TEXTDIM),
    (5.0, "Sliding KV cache • Token-by-token output", 6, C_PINK),
    (4.6, "No chunking needed — truly streams audio", 6, C_TEXTDIM),
    (4.2, "4B params • 15 languages • 6-bit quant (MLX)", 6, C_TEXTDIM),
):
    ax.text(10.4, _ty, _note, fontsize=_size, color=_tint, family="monospace")
|
|
|
|
# ── Qwen3 backend ──
section_box(15.0, 3.8, 4.6, 3.2, title="Qwen3 ASR (batch + aligner)",
            border=C_GREEN, bg=C_BOX_BG3)

# Component boxes on one row: (x, width, label, font size, bold).
for _bx, _bw, _text, _size, _strong in (
    (15.2, 1.5, "Qwen3 ASR\n1.7B / 0.6B", 7, True),
    (16.9, 1.5, "Qwen3\nSimul", 7, True),
    (18.6, 1.0, "Forced\nAligner", 6.5, False),
):
    box(_bx, 5.9, _bw, 0.6, _text,
        color=C_GREEN, bg="#1a3a2a", fontsize=_size, bold=_strong)

# Descriptive notes under the boxes: (y, text, font size, colour).
for _ty, _note, _size, _tint in (
    (5.4, "Batch + SimulStreaming (AlignAtt)", 6.5, C_TEXTDIM),
    (5.0, "ForcedAligner provides word timestamps", 6, C_GREEN),
    (4.6, "LocalAgreement or border-distance policy", 6, C_TEXTDIM),
    (4.2, "29 languages • CUDA/MPS/CPU", 6, C_TEXTDIM),
):
    ax.text(15.2, _ty, _note, fontsize=_size, color=_tint, family="monospace")
|
|
|
|
# ── OpenAI API ──
# A single box with a one-line caption directly beneath it.
box(
    15.2, 7.7, 4.2, 0.6, "OpenAI API (cloud)",
    color="#5a6a7a",
    fontsize=7,
)
ax.text(
    15.2, 7.4, "Remote transcription • API key required",
    fontsize=6,
    color=C_TEXTDIM,
    family="monospace",
)
|
|
|
|
# ── Shared components ──
section_box(10.2, 0.5, 9.4, 3.0, title="Shared Components",
            border="#5a6a7a", bg="#151520")

# Two rows of 0.8-high boxes: (x, y, width, label, edge colour, bold).
for _bx, _by, _bw, _text, _edge, _strong in (
    (10.4, 2.2, 2.5, "Mel Spectrogram\ncached DFT + filterbank", "#5a6a7a", False),
    (13.2, 2.2, 2.5, "Diarization\nSortformer / pyannote", "#5a6a7a", False),
    (16.0, 2.2, 3.4, "Translation\nNLLB • CTranslate2", "#5a6a7a", False),
    (10.4, 0.8, 4.0, "WhisperLiveKitConfig\n(single source of truth)", C_ACCENT, True),
    (14.8, 0.8, 2.3, "TestHarness\npipeline testing", "#5a6a7a", False),
    (17.3, 0.8, 2.3, "Benchmark\n8 langs • 13 samples", C_ORANGE, True),
):
    box(_bx, _by, _bw, 0.8, _text, color=_edge, fontsize=7, bold=_strong)
|
|
|
|
# ═══════════════════════════════════════════════════════════════════
# Arrows: main data flow
# ═══════════════════════════════════════════════════════════════════

# Audio processor → TranscriptionEngine (PCM audio), then
# TranscriptionEngine → Audio processor (results). Each thick arrow is
# followed by its caption: (tail, head, colour, caption y, caption).
for _tail, _head, _tint, _caption_y, _caption in (
    ((9.5, 8.5), (10.2, 8.5), C_ACCENT, 8.8, "PCM audio"),
    ((10.2, 7.0), (9.5, 7.0), C_GREEN, 7.3, "ASRTokens"),
):
    arrow(*_tail, *_head, color=_tint, lw=2)
    ax.text(9.6, _caption_y, _caption, fontsize=6, color=_tint, family="monospace")

# Streaming policy connections: one downward arrow under each policy box,
# then the backend list each policy applies to.
for _col_x in (5.5, 8.1):
    arrow(_col_x, 6.2, _col_x, 5.5, color=C_ORANGE, style="->")
for _text_x, _backends in (
    (4.3, "Whisper + Qwen3"),
    (6.9, "Whisper + Qwen3-simul"),
):
    ax.text(_text_x, 5.6, _backends, fontsize=5.5, color=C_ORANGE, family="monospace")

# Voxtral note (no policy needed)
ax.text(
    10.2, 3.5, "Voxtral: own streaming processor (no external policy)",
    fontsize=6, color=C_PINK, family="monospace", style="italic",
)
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════
# Legend
# ═══════════════════════════════════════════════════════════════════
legend_y = 5.0  # y of the legend heading; entries are stacked below it
ax.text(
    0.3, legend_y, "Streaming modes:",
    fontsize=7, color=C_TEXT, fontweight="bold", family="monospace",
)

_legend_rows = (
    ("Native streaming (Voxtral)", C_PINK),
    ("Chunk-based (Whisper)", C_PURPLE),
    ("Batch + aligner (Qwen3)", C_GREEN),
)
for _row, (_mode, _tint) in enumerate(_legend_rows):
    # Square swatch and its label share one y, stepping down 0.35 per row.
    _row_y = legend_y - 0.4 - _row * 0.35
    ax.plot([0.3], [_row_y], "s", color=_tint, markersize=6)
    ax.text(
        0.6, _row_y, _mode,
        fontsize=6.5, color=_tint, va="center", family="monospace",
    )
|
|
|
|
|
|
# Write the diagram into the current working directory. The background
# colour is passed again so the saved image keeps the dark canvas, and the
# tight bounding box trims the margins down to a 0.1 inch pad.
_save_options = {
    "dpi": 200,
    "facecolor": C_BG,
    "bbox_inches": "tight",
    "pad_inches": 0.1,
}
plt.savefig("architecture.png", **_save_options)
print("Saved architecture.png")
|