diff --git a/available_models.md b/available_models.md index 75714e6..6495fc5 100644 --- a/available_models.md +++ b/available_models.md @@ -1,4 +1,4 @@ -# Available model sizes: +# Available Whisper model sizes: - tiny.en (english only) - tiny @@ -70,4 +70,40 @@ 2. Limited resources or need speed? → `small` or smaller 3. Good hardware and want best quality? → `large-v3` 4. Need fast, high-quality transcription without translation? → `large-v3-turbo` -5. Need translation capabilities? → `large-v2` or `large-v3` (avoid turbo) \ No newline at end of file +5. Need translation capabilities? → `large-v2` or `large-v3` (avoid turbo) + + +_______________________ + +# Translation Models and Backend + +**Language Support**: ~200 languages + +## Distilled Model Sizes Available + +| Model | Size | Parameters | VRAM (FP16) | VRAM (INT8) | Quality | +|-------|------|------------|-------------|-------------|---------| +| 600M | 2.46 GB | 600M | ~1.5GB | ~800MB | Good, understandable | +| 1.3B | 5.48 GB | 1.3B | ~3GB | ~1.5GB | Better accuracy, context | + +**Quality Impact**: 1.3B has ~15-25% better BLEU scores vs 600M across language pairs. 
+ +## Backend Performance + +| Backend | Speed vs Base | Memory Usage | Quality Loss | +|---------|---------------|--------------|--------------| +| CTranslate2 | 6-10x faster | 40-60% less | ~5% BLEU drop | +| Transformers | Baseline | High | None | +| Transformers + MPS (on Apple Silicon) | 2x faster | Medium | None | + +**Metrics**: +- CTranslate2: 50-100+ tokens/sec +- Transformers: 10-30 tokens/sec +- Transformers + MPS (on Apple Silicon): Up to 2x faster than CTranslate2 + +## Quick Decision Matrix + +**Choose 600M**: Limited resources, near-zero lag +**Choose 1.3B**: Quality matters +**Choose Transformers**: On Apple Silicon + diff --git a/demo.png b/demo.png index bf25527..e903a3a 100644 Binary files a/demo.png and b/demo.png differ diff --git a/pyproject.toml b/pyproject.toml index 8749e1d..148d706 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "whisperlivekit" -version = "0.2.9" +version = "0.2.10" description = "Real-time speech-to-text with speaker diarization using Whisper" readme = "README.md" authors = [ diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index 999ddd1..9dcd98b 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -445,8 +445,8 @@ class AudioProcessor: elif not lines: lines = [Line( speaker=1, - start=state.get("end_buffer", 0), - end=state.get("end_buffer", 0) + start=state.end_buffer, + end=state.end_buffer )] response = FrontData( diff --git a/whisperlivekit/web/live_transcription.js b/whisperlivekit/web/live_transcription.js index 714e6a8..a527d85 100644 --- a/whisperlivekit/web/live_transcription.js +++ b/whisperlivekit/web/live_transcription.js @@ -345,13 +345,6 @@ function renderLinesWithBuffer( } let currentLineText = item.text || ""; - - if (item.translation) { - currentLineText += `
${speakerLabel}