mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-03-07 14:23:18 +00:00
0.2.10
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
# Available model sizes:
|
||||
# Available Whisper model sizes:
|
||||
|
||||
- tiny.en (english only)
|
||||
- tiny
|
||||
@@ -70,4 +70,40 @@
|
||||
2. Limited resources or need speed? → `small` or smaller
|
||||
3. Good hardware and want best quality? → `large-v3`
|
||||
4. Need fast, high-quality transcription without translation? → `large-v3-turbo`
|
||||
5. Need translation capabilities? → `large-v2` or `large-v3` (avoid turbo)
|
||||
5. Need translation capabilities? → `large-v2` or `large-v3` (avoid turbo)
|
||||
|
||||
|
||||
_______________________
|
||||
|
||||
# Translation Models and Backend
|
||||
|
||||
**Language Support**: ~200 languages
|
||||
|
||||
## Distilled Model Sizes Available
|
||||
|
||||
| Model | Size | Parameters | VRAM (FP16) | VRAM (INT8) | Quality |
|
||||
|-------|------|------------|-------------|-------------|---------|
|
||||
| 600M | 2.46 GB | 600M | ~1.5GB | ~800MB | Good, understandable |
|
||||
| 1.3B | 5.48 GB | 1.3B | ~3GB | ~1.5GB | Better accuracy, context |
|
||||
|
||||
**Quality Impact**: 1.3B has ~15-25% better BLEU scores vs 600M across language pairs.
|
||||
|
||||
## Backend Performance
|
||||
|
||||
| Backend | Speed vs Base | Memory Usage | Quality Loss |
|
||||
|---------|---------------|--------------|--------------|
|
||||
| CTranslate2 | 6-10x faster | 40-60% less | ~5% BLEU drop |
|
||||
| Transformers | Baseline | High | None |
|
||||
| Transformers + MPS (on Apple Silicon) | 2x faster | Medium | None |
|
||||
|
||||
**Metrics**:
|
||||
- CTranslate2: 50-100+ tokens/sec
|
||||
- Transformers: 10-30 tokens/sec
|
||||
- Apple Silicon with MPS: Up to 2x faster than CTranslate2
|
||||
|
||||
## Quick Decision Matrix
|
||||
|
||||
**Choose 600M**: Limited resources, close to 0 lag
|
||||
**Choose 1.3B**: Quality matters
|
||||
**Choose Transformers**: On Apple Silicon
|
||||
|
||||
|
||||
BIN
demo.png
BIN
demo.png
Binary file not shown.
|
Before Width: | Height: | Size: 449 KiB After Width: | Height: | Size: 1.2 MiB |
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "whisperlivekit"
|
||||
version = "0.2.9"
|
||||
version = "0.2.10"
|
||||
description = "Real-time speech-to-text with speaker diarization using Whisper"
|
||||
readme = "README.md"
|
||||
authors = [
|
||||
|
||||
@@ -445,8 +445,8 @@ class AudioProcessor:
|
||||
elif not lines:
|
||||
lines = [Line(
|
||||
speaker=1,
|
||||
start=state.get("end_buffer", 0),
|
||||
end=state.get("end_buffer", 0)
|
||||
start=state.end_buffer,
|
||||
end=state.end_buffer
|
||||
)]
|
||||
|
||||
response = FrontData(
|
||||
|
||||
@@ -345,13 +345,6 @@ function renderLinesWithBuffer(
|
||||
}
|
||||
|
||||
let currentLineText = item.text || "";
|
||||
|
||||
if (item.translation) {
|
||||
currentLineText += `<div class="label_translation">
|
||||
<img src="/web/src/translate.svg" alt="Translation" width="12" height="12" />
|
||||
<span>${item.translation}</span>
|
||||
</div>`;
|
||||
}
|
||||
|
||||
if (idx === lines.length - 1) {
|
||||
if (!isFinalizing && item.speaker !== -2) {
|
||||
@@ -385,6 +378,13 @@ function renderLinesWithBuffer(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (item.translation) {
|
||||
currentLineText += `<div class="label_translation">
|
||||
<img src="/web/src/translate.svg" alt="Translation" width="12" height="12" />
|
||||
<span>${item.translation}</span>
|
||||
</div>`;
|
||||
}
|
||||
|
||||
return currentLineText.trim().length > 0 || speakerLabel.length > 0
|
||||
? `<p>${speakerLabel}<br/><div class='textcontent'>${currentLineText}</div></p>`
|
||||
|
||||
Reference in New Issue
Block a user