mirror of https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-03-07 22:33:36 +00:00
Compare commits
16 Commits
- 1833e7c921
- 777ec63a71
- 0a6e5ae9c1
- ee448a37e9
- 9c051052b0
- 4d7c487614
- 65025cc448
- bbba1d9bb7
- 99dc96c644
- 2a27d2030a
- cd160caaa1
- d27b5eb23e
- f9d704a900
- 2f6e00f512
- 5aa312e437
- ebaf36a8be
DEV_NOTES.md (25 changed lines)

@@ -18,8 +18,29 @@ Decoder weights: 59110771 bytes
 Encoder weights: 15268874 bytes
 
-# 2. SortFormer Diarization: 4-to-2 Speaker Constraint Algorithm
+# 2. Translation: Faster model for each system
+
+## Benchmark Results
+
+Testing on MacBook M3 with NLLB-200-distilled-600M model:
+
+### Standard Transformers vs CTranslate2
+
+| Test Text | Standard Inference Time | CTranslate2 Inference Time | Speedup |
+|-----------|-------------------------|---------------------------|---------|
+| UN Chief says there is no military solution in Syria | 0.9395s | 2.0472s | 0.5x |
+| The rapid advancement of AI technology is transforming various industries | 0.7171s | 1.7516s | 0.4x |
+| Climate change poses a significant threat to global ecosystems | 0.8533s | 1.8323s | 0.5x |
+| International cooperation is essential for addressing global challenges | 0.7209s | 1.3575s | 0.5x |
+| The development of renewable energy sources is crucial for a sustainable future | 0.8760s | 1.5589s | 0.6x |
+
+**Results:**
+- Total Standard time: 4.1068s
+- Total CTranslate2 time: 8.5476s
+- CTranslate2 is slower on this system --> Use Transformers, and ideally we would have an mlx implementation.
+
+# 3. SortFormer Diarization: 4-to-2 Speaker Constraint Algorithm
+
 Transform a diarization model that predicts up to 4 speakers into one that predicts up to 2 speakers by mapping the output predictions.
 
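The timings above come from the project's own test on a MacBook M3; the benchmark script itself is not part of this changeset. A minimal sketch of how such a comparison could be reproduced, where the model names, language pair, and converted-model directory are assumptions, might look like:

```python
import time
import ctranslate2                               # pip install ctranslate2
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

SENTENCES = [
    "UN Chief says there is no military solution in Syria",
    "The rapid advancement of AI technology is transforming various industries",
]
SRC, TGT = "eng_Latn", "fra_Latn"                # NLLB language codes (assumed pair)

tok = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang=SRC)

# --- Standard Transformers ---
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
t0 = time.perf_counter()
for text in SENTENCES:
    inputs = tok(text, return_tensors="pt")
    model.generate(**inputs, forced_bos_token_id=tok.convert_tokens_to_ids(TGT))
print(f"Transformers total: {time.perf_counter() - t0:.4f}s")

# --- CTranslate2 (expects a converted model directory, e.g. from ct2-transformers-converter) ---
translator = ctranslate2.Translator("nllb-200-distilled-600M-ct2", device="cpu")
t0 = time.perf_counter()
for text in SENTENCES:
    tokens = tok.convert_ids_to_tokens(tok.encode(text))
    translator.translate_batch([tokens], target_prefix=[[TGT]])
print(f"CTranslate2 total: {time.perf_counter() - t0:.4f}s")
```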
@@ -67,4 +88,4 @@ ELSE:
     AS_2 ← B
 
 to finish
 ```
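The 4-to-2 constraint algorithm itself sits in the unchanged part of DEV_NOTES.md, so only its tail is visible in this hunk. Purely as an illustration of the general idea, and not the algorithm described in the file, collapsing 4-speaker activity into 2 speakers could look like:

```python
import numpy as np

def constrain_4_to_2(probs: np.ndarray) -> np.ndarray:
    """probs: (frames, 4) speaker-activity probabilities from a 4-speaker model.
    Returns (frames, 2): keep the two most active speakers overall and fold the
    other two into whichever kept speaker they co-occur with most."""
    activity = probs.sum(axis=0)                  # total activity per speaker
    keep = np.argsort(activity)[-2:]              # two most active speakers
    fold = [s for s in range(4) if s not in keep]
    out = probs[:, keep].copy()
    for s in fold:
        # correlation of the folded speaker with each kept speaker
        corr = [np.dot(probs[:, s], probs[:, k]) for k in keep]
        target = int(np.argmax(corr))
        out[:, target] = np.maximum(out[:, target], probs[:, s])
    return out
```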
README.md (72 changed lines)

@@ -18,9 +18,9 @@ Real-time speech transcription directly to your browser, with a ready-to-use bac
 
 #### Powered by Leading Research:
 
-- [SimulStreaming](https://github.com/ufal/SimulStreaming) (SOTA 2025) - Ultra-low latency transcription with AlignAtt policy
+- [SimulStreaming](https://github.com/ufalSimulStreaming) (SOTA 2025) - Ultra-low latency transcription using [AlignAtt policy](https://arxiv.org/pdf/2305.11408)
 - [NLLB](https://arxiv.org/abs/2207.04672), ([distilled](https://huggingface.co/entai2965/nllb-200-distilled-600M-ctranslate2)) (2024) - Translation to more than 100 languages.
-- [WhisperStreaming](https://github.com/ufal/whisper_streaming) (SOTA 2023) - Low latency transcription with LocalAgreement policy
+- [WhisperStreaming](https://github.com/ufal/whisper_streaming) (SOTA 2023) - Low latency transcription using [LocalAgreement policy](https://www.isca-archive.org/interspeech_2020/liu20s_interspeech.pdf)
 - [Streaming Sortformer](https://arxiv.org/abs/2507.18446) (SOTA 2025) - Advanced real-time speaker diarization
 - [Diart](https://github.com/juanmc2005/diart) (SOTA 2021) - Real-time speaker diarization
 - [Silero VAD](https://github.com/snakers4/silero-vad) (2024) - Enterprise-grade Voice Activity Detection
@@ -42,15 +42,6 @@ pip install whisperlivekit
 ```
 > You can also clone the repo and `pip install -e .` for the latest version.
 
-> **FFmpeg is required** and must be installed before using WhisperLiveKit
->
-> | OS | How to install |
-> |-----------|-------------|
-> | Ubuntu/Debian | `sudo apt install ffmpeg` |
-> | MacOS | `brew install ffmpeg` |
-> | Windows | Download .exe from https://ffmpeg.org/download.html and add to PATH |
 
 #### Quick Start
 1. **Start the transcription server:**
 ```bash
@@ -86,11 +77,11 @@ See **Parameters & Configuration** below on how to use them.
 **Command-line Interface**: Start the transcription server with various options:
 
 ```bash
-# Use better model than default (small)
-whisperlivekit-server --model large-v3
+# Large model and translate from french to danish
+whisperlivekit-server --model large-v3 --language fr --target-language da
 
-# Advanced configuration with diarization and language
-whisperlivekit-server --host 0.0.0.0 --port 8000 --model medium --diarization --language fr
+# Diarization and server listening on */80
+whisperlivekit-server --host 0.0.0.0 --port 80 --model medium --diarization --language fr
 ```
 
@@ -137,26 +128,15 @@ async def websocket_endpoint(websocket: WebSocket):
 
 ## Parameters & Configuration
 
-An important list of parameters can be changed. But what *should* you change?
-- the `--model` size. List and recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/available_models.md)
-- the `--language`. List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/simul_whisper/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English.
-- the `--backend` ? you can switch to `--backend faster-whisper` if `simulstreaming` does not work correctly or if you prefer to avoid the dual-license requirements.
-- `--warmup-file`, if you have one
-- `--task translate`, to translate in english
-- `--host`, `--port`, `--ssl-certfile`, `--ssl-keyfile`, if you set up a server
-- `--diarization`, if you want to use it.
-- [BETA] `--target-language`, to translate using NLLB. [118 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/translation/mapping_languages.py). If you want to translate to english, you should rather use `--task translate`, since Whisper can do it directly.
-
-### Full list of parameters :
-
 | Parameter | Description | Default |
 |-----------|-------------|---------|
-| `--model` | Whisper model size. | `small` |
+| `--model` | Whisper model size. List and recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/available_models.md) | `small` |
-| `--language` | Source language code or `auto` | `auto` |
+| `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/simul_whisper/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` |
-| `--task` | Set to `translate` to translate to english | `transcribe` |
+| `--target-language` | If sets, activates translation using NLLB. Ex: `fr`. [118 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/translation/mapping_languages.py). If you want to translate to english, you should rather use `--task translate`, since Whisper can do it directly. | `None` |
-| `--target-language` | [BETA] Translation language target. Ex: `fr` | `None` |
+| `--task` | Set to `translate` to translate *only* to english, using Whisper translation. | `transcribe` |
-| `--backend` | Processing backend | `simulstreaming` |
+| `--diarization` | Enable speaker identification | `False` |
-| `--min-chunk-size` | Minimum audio chunk size (seconds) | `1.0` |
+| `--backend` | Processing backend. You can switch to `faster-whisper` if `simulstreaming` does not work correctly | `simulstreaming` |
 | `--no-vac` | Disable Voice Activity Controller | `False` |
 | `--no-vad` | Disable Voice Activity Detection | `False` |
 | `--warmup-file` | Audio file path for model warmup | `jfk.wav` |
@@ -164,8 +144,19 @@ An important list of parameters can be changed. But what *should* you change?
 | `--port` | Server port | `8000` |
 | `--ssl-certfile` | Path to the SSL certificate file (for HTTPS support) | `None` |
 | `--ssl-keyfile` | Path to the SSL private key file (for HTTPS support) | `None` |
-| `--pcm-input` | raw PCM (s16le) data is expected as input and FFmpeg will be bypassed. | `False` |
+| `--pcm-input` | raw PCM (s16le) data is expected as input and FFmpeg will be bypassed. Frontend will use AudioWorklet instead of MediaRecorder | `False` |
+
+| Translation options | Description | Default |
+|-----------|-------------|---------|
+| `--nllb-backend` | `transformers` or `ctranslate2` | `ctranslate2` |
+| `--nllb-size` | `600M` or `1.3B` | `600M` |
+
+| Diarization options | Description | Default |
+|-----------|-------------|---------|
+| `--diarization-backend` | `diart` or `sortformer` | `sortformer` |
+| `--disable-punctuation-split` | Disable punctuation based splits. See #214 | `False` |
+| `--segmentation-model` | Hugging Face model ID for Diart segmentation model. [Available models](https://github.com/juanmc2005/diart/tree/main?tab=readme-ov-file#pre-trained-models) | `pyannote/segmentation-3.0` |
+| `--embedding-model` | Hugging Face model ID for Diart embedding model. [Available models](https://github.com/juanmc2005/diart/tree/main?tab=readme-ov-file#pre-trained-models) | `speechbrain/spkrec-ecapa-voxceleb` |
 
 | SimulStreaming backend options | Description | Default |
 |-----------|-------------|---------|
@@ -184,25 +175,16 @@ An important list of parameters can be changed. But what *should* you change?
 | `--preload-model-count` | Optional. Number of models to preload in memory to speed up loading (set up to the expected number of concurrent users) | `1` |
 
 | WhisperStreaming backend options | Description | Default |
 |-----------|-------------|---------|
 | `--confidence-validation` | Use confidence scores for faster validation | `False` |
 | `--buffer_trimming` | Buffer trimming strategy (`sentence` or `segment`) | `segment` |
 
-| Diarization options | Description | Default |
-|-----------|-------------|---------|
-| `--diarization` | Enable speaker identification | `False` |
-| `--diarization-backend` | `diart` or `sortformer` | `sortformer` |
-| `--disable-punctuation-split` | Disable punctuation based splits. See #214 | `False` |
-| `--segmentation-model` | Hugging Face model ID for Diart segmentation model. [Available models](https://github.com/juanmc2005/diart/tree/main?tab=readme-ov-file#pre-trained-models) | `pyannote/segmentation-3.0` |
-| `--embedding-model` | Hugging Face model ID for Diart embedding model. [Available models](https://github.com/juanmc2005/diart/tree/main?tab=readme-ov-file#pre-trained-models) | `speechbrain/spkrec-ecapa-voxceleb` |
 
-> For diarization using Diart, you need access to pyannote.audio models:
-> 1. [Accept user conditions](https://huggingface.co/pyannote/segmentation) for the `pyannote/segmentation` model
-> 2. [Accept user conditions](https://huggingface.co/pyannote/segmentation-3.0) for the `pyannote/segmentation-3.0` model
-> 3. [Accept user conditions](https://huggingface.co/pyannote/embedding) for the `pyannote/embedding` model
->4. Login with HuggingFace: `huggingface-cli login`
+> For diarization using Diart, you need to accept user conditions [here](https://huggingface.co/pyannote/segmentation) for the `pyannote/segmentation` model, [here](https://huggingface.co/pyannote/segmentation-3.0) for the `pyannote/segmentation-3.0` model and [here](https://huggingface.co/pyannote/embedding) for the `pyannote/embedding` model. **Then**, login to HuggingFace: `huggingface-cli login`
 
 ### 🚀 Deployment Guide
 
available_models.md

@@ -1,4 +1,4 @@
-# Available model sizes:
+# Available Whisper model sizes:
 
 - tiny.en (english only)
 - tiny
@@ -70,4 +70,40 @@
 2. Limited resources or need speed? → `small` or smaller
 3. Good hardware and want best quality? → `large-v3`
 4. Need fast, high-quality transcription without translation? → `large-v3-turbo`
 5. Need translation capabilities? → `large-v2` or `large-v3` (avoid turbo)
+
+_______________________
+
+# Translation Models and Backend
+
+**Language Support**: ~200 languages
+
+## Distilled Model Sizes Available
+
+| Model | Size | Parameters | VRAM (FP16) | VRAM (INT8) | Quality |
+|-------|------|------------|-------------|-------------|---------|
+| 600M | 2.46 GB | 600M | ~1.5GB | ~800MB | Good, understandable |
+| 1.3B | 5.48 GB | 1.3B | ~3GB | ~1.5GB | Better accuracy, context |
+
+**Quality Impact**: 1.3B has ~15-25% better BLEU scores vs 600M across language pairs.
+
+## Backend Performance
+
+| Backend | Speed vs Base | Memory Usage | Quality Loss |
+|---------|---------------|--------------|--------------|
+| CTranslate2 | 6-10x faster | 40-60% less | ~5% BLEU drop |
+| Transformers | Baseline | High | None |
+| Transformers + MPS (on Apple Silicon) | 2x faster | Medium | None |
+
+**Metrics**:
+- CTranslate2: 50-100+ tokens/sec
+- Transformers: 10-30 tokens/sec
+- Apple Silicon with MPS: Up to 2x faster than CTranslate2
+
+## Quick Decision Matrix
+
+**Choose 600M**: Limited resources, close to 0 lag
+**Choose 1.3B**: Quality matters
+**Choose Transformers**: On Apple Silicon
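The VRAM figures in the size table are roughly parameters times bytes-per-parameter plus runtime overhead. A quick sanity check, where the 20% overhead factor is an assumption:

```python
def estimate_vram_gb(params_millions: float, bytes_per_param: float, overhead: float = 1.2) -> float:
    """Rough weight-memory estimate: parameter count x precision, plus ~20% for
    activations and runtime buffers (the overhead factor is an assumption)."""
    return params_millions * 1e6 * bytes_per_param * overhead / 1e9

print(estimate_vram_gb(600, 2))    # 600M at FP16  -> ~1.4 GB, close to the ~1.5GB above
print(estimate_vram_gb(600, 1))    # 600M at INT8  -> ~0.7 GB, close to the ~800MB above
print(estimate_vram_gb(1300, 2))   # 1.3B at FP16  -> ~3.1 GB
```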
demo.png (binary file, not shown; 449 KiB → 1.2 MiB)
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "whisperlivekit"
-version = "0.2.9"
+version = "0.2.10"
 description = "Real-time speech-to-text with speaker diarization using Whisper"
 readme = "README.md"
 authors = [
whisperlivekit audio processor module (class AudioProcessor)

@@ -4,11 +4,11 @@ from time import time, sleep
 import math
 import logging
 import traceback
-from whisperlivekit.timed_objects import ASRToken, Silence, Line
+from whisperlivekit.timed_objects import ASRToken, Silence, Line, FrontData, State
 from whisperlivekit.core import TranscriptionEngine, online_factory, online_diarization_factory, online_translation_factory
-from whisperlivekit.ffmpeg_manager import FFmpegManager, FFmpegState
 from whisperlivekit.silero_vad_iterator import FixedVADIterator
 from whisperlivekit.results_formater import format_output
+from whisperlivekit.ffmpeg_manager import FFmpegManager, FFmpegState
 # Set up logging once
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)

@@ -49,9 +49,6 @@ class AudioProcessor:
         self.bytes_per_sample = 2
         self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample
         self.max_bytes_per_sec = 32000 * 5 # 5 seconds of audio at 32 kHz
-        self.last_ffmpeg_activity = time()
-        self.ffmpeg_health_check_interval = 5
-        self.ffmpeg_max_idle_time = 10
         self.is_pcm_input = self.args.pcm_input
         self.debug = False
 

@@ -68,7 +65,7 @@ class AudioProcessor:
         self.lock = asyncio.Lock()
         self.beg_loop = None #to deal with a potential little lag at the websocket initialization, this is now set in process_audio
         self.sep = " " # Default separator
-        self.last_response_content = ""
+        self.last_response_content = FrontData()
 
         # Models and processing
         self.asr = models.asr

@@ -78,19 +75,21 @@ class AudioProcessor:
             self.vac = FixedVADIterator(models.vac_model)
         else:
             self.vac = None
 
-        self.ffmpeg_manager = FFmpegManager(
-            sample_rate=self.sample_rate,
-            channels=self.channels
-        )
-
-        async def handle_ffmpeg_error(error_type: str):
-            logger.error(f"FFmpeg error: {error_type}")
-            self._ffmpeg_error = error_type
-
-        self.ffmpeg_manager.on_error_callback = handle_ffmpeg_error
+        self.ffmpeg_manager = None
+        self.ffmpeg_reader_task = None
         self._ffmpeg_error = None
 
+        if not self.is_pcm_input:
+            self.ffmpeg_manager = FFmpegManager(
+                sample_rate=self.sample_rate,
+                channels=self.channels
+            )
+            async def handle_ffmpeg_error(error_type: str):
+                logger.error(f"FFmpeg error: {error_type}")
+                self._ffmpeg_error = error_type
+            self.ffmpeg_manager.on_error_callback = handle_ffmpeg_error
+
         self.transcription_queue = asyncio.Queue() if self.args.transcription else None
         self.diarization_queue = asyncio.Queue() if self.args.diarization else None
         self.translation_queue = asyncio.Queue() if self.args.target_language else None

@@ -98,12 +97,12 @@ class AudioProcessor:
 
         self.transcription_task = None
         self.diarization_task = None
-        self.ffmpeg_reader_task = None
         self.watchdog_task = None
         self.all_tasks_for_cleanup = []
 
         if self.args.transcription:
             self.online = online_factory(self.args, models.asr, models.tokenizer)
+            self.sep = self.online.asr.sep
         if self.args.diarization:
             self.diarization = online_diarization_factory(self.args, models.diarization_model)
         if self.args.target_language:

@@ -113,13 +112,12 @@ class AudioProcessor:
         """Convert PCM buffer in s16le format to normalized NumPy array."""
         return np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0
 
-    async def update_transcription(self, new_tokens, buffer, end_buffer, sep):
+    async def update_transcription(self, new_tokens, buffer, end_buffer):
         """Thread-safe update of transcription with new data."""
         async with self.lock:
             self.tokens.extend(new_tokens)
             self.buffer_transcription = buffer
             self.end_buffer = end_buffer
-            self.sep = sep
 
     async def update_diarization(self, end_attributed_speaker, buffer_diarization=""):
         """Thread-safe update of diarization with new data."""

@@ -152,17 +150,16 @@ class AudioProcessor:
             latest_end = max(self.end_buffer, self.tokens[-1].end if self.tokens else 0)
             remaining_diarization = max(0, round(latest_end - self.end_attributed_speaker, 1))
 
-            return {
-                "tokens": self.tokens.copy(),
-                "translated_segments": self.translated_segments.copy(),
-                "buffer_transcription": self.buffer_transcription,
-                "buffer_diarization": self.buffer_diarization,
-                "end_buffer": self.end_buffer,
-                "end_attributed_speaker": self.end_attributed_speaker,
-                "sep": self.sep,
-                "remaining_time_transcription": remaining_transcription,
-                "remaining_time_diarization": remaining_diarization
-            }
+            return State(
+                tokens=self.tokens.copy(),
+                translated_segments=self.translated_segments.copy(),
+                buffer_transcription=self.buffer_transcription,
+                buffer_diarization=self.buffer_diarization,
+                end_buffer=self.end_buffer,
+                end_attributed_speaker=self.end_attributed_speaker,
+                remaining_time_transcription=remaining_transcription,
+                remaining_time_diarization=remaining_diarization
+            )
 
     async def reset(self):
         """Reset all state variables to initial values."""
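This changeset swaps the plain dicts that were passed around for `State` and `FrontData` objects from `whisperlivekit.timed_objects`. Their definitions are not included in the diff; a minimal sketch of what such dataclasses could look like, inferred only from the fields used in `get_current_state` and `results_formatter`, is:

```python
from dataclasses import dataclass, field, asdict
from typing import Any, Optional

@dataclass
class State:
    """Snapshot of the processing state (fields inferred from get_current_state)."""
    tokens: list = field(default_factory=list)
    translated_segments: list = field(default_factory=list)
    buffer_transcription: str = ""
    buffer_diarization: str = ""
    end_buffer: float = 0.0
    end_attributed_speaker: float = 0.0
    remaining_time_transcription: float = 0.0
    remaining_time_diarization: float = 0.0

@dataclass
class FrontData:
    """Payload pushed to the browser (fields inferred from results_formatter)."""
    status: str = ""
    error: Optional[str] = None
    lines: list = field(default_factory=list)
    buffer_transcription: str = ""
    buffer_diarization: str = ""
    remaining_time_transcription: float = 0.0
    remaining_time_diarization: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        # Line objects would need their own serialization; plain asdict as a sketch.
        return asdict(self)
```

Dataclass equality is also what lets the later `should_push = (response != self.last_response_content)` comparison work without building a manual signature string.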
@@ -174,13 +171,15 @@ class AudioProcessor:
         self.beg_loop = time()
 
     async def ffmpeg_stdout_reader(self):
-        """Read audio data from FFmpeg stdout and process it."""
+        """Read audio data from FFmpeg stdout and process it into the PCM pipeline."""
         beg = time()
 
         while True:
             try:
-                # Check if FFmpeg is running
-                state = await self.ffmpeg_manager.get_state()
+                if self.is_stopping:
+                    logger.info("Stopping ffmpeg_stdout_reader due to stopping flag.")
+                    break
+
+                state = await self.ffmpeg_manager.get_state() if self.ffmpeg_manager else FFmpegState.STOPPED
                 if state == FFmpegState.FAILED:
                     logger.error("FFmpeg is in FAILED state, cannot read data")
                     break

@@ -188,55 +187,41 @@ class AudioProcessor:
                     logger.info("FFmpeg is stopped")
                     break
                 elif state != FFmpegState.RUNNING:
-                    logger.warning(f"FFmpeg is in {state} state, waiting...")
-                    await asyncio.sleep(0.5)
+                    await asyncio.sleep(0.1)
                     continue
 
                 current_time = time()
-                elapsed_time = math.floor((current_time - beg) * 10) / 10
-                buffer_size = max(int(32000 * elapsed_time), 4096)
+                elapsed_time = max(0.0, current_time - beg)
+                buffer_size = max(int(32000 * elapsed_time), 4096) # dynamic read
                 beg = current_time
 
                 chunk = await self.ffmpeg_manager.read_data(buffer_size)
 
                 if not chunk:
-                    if self.is_stopping:
-                        logger.info("FFmpeg stdout closed, stopping.")
-                        break
-                    else:
-                        # No data available, but not stopping - FFmpeg might be restarting
-                        await asyncio.sleep(0.1)
-                        continue
+                    # No data currently available
+                    await asyncio.sleep(0.05)
+                    continue
 
                 self.pcm_buffer.extend(chunk)
                 await self.handle_pcm_data()
 
+            except asyncio.CancelledError:
+                logger.info("ffmpeg_stdout_reader cancelled.")
+                break
             except Exception as e:
                 logger.warning(f"Exception in ffmpeg_stdout_reader: {e}")
-                logger.warning(f"Traceback: {traceback.format_exc()}")
-                # Try to recover by waiting a bit
-                await asyncio.sleep(1)
-            # Check if we should exit
-            if self.is_stopping:
-                break
-
-        logger.info("FFmpeg stdout processing finished. Signaling downstream processors.")
+                logger.debug(f"Traceback: {traceback.format_exc()}")
+                await asyncio.sleep(0.2)
+
+        logger.info("FFmpeg stdout processing finished. Signaling downstream processors if needed.")
         if self.args.transcription and self.transcription_queue:
             await self.transcription_queue.put(SENTINEL)
-            logger.debug("Sentinel put into transcription_queue.")
         if self.args.diarization and self.diarization_queue:
             await self.diarization_queue.put(SENTINEL)
-            logger.debug("Sentinel put into diarization_queue.")
         if self.args.target_language and self.translation_queue:
             await self.translation_queue.put(SENTINEL)
 
     async def transcription_processor(self):
         """Process audio chunks for transcription."""
-        self.sep = self.online.asr.sep
         cumulative_pcm_duration_stream_time = 0.0
 
         while True:

@@ -259,12 +244,11 @@ class AudioProcessor:
                     asr_processing_logs += f" + Silence of = {item.duration:.2f}s"
                     if self.tokens:
                         asr_processing_logs += f" | last_end = {self.tokens[-1].end} |"
-                logger.info(asr_processing_logs)
-
-                if type(item) is Silence:
+                    logger.info(asr_processing_logs)
                     cumulative_pcm_duration_stream_time += item.duration
                     self.online.insert_silence(item.duration, self.tokens[-1].end if self.tokens else 0)
                     continue
+                logger.info(asr_processing_logs)
 
                 if isinstance(item, np.ndarray):
                     pcm_array = item

@@ -276,7 +260,7 @@ class AudioProcessor:
                     stream_time_end_of_current_pcm = cumulative_pcm_duration_stream_time
 
                     self.online.insert_audio_chunk(pcm_array, stream_time_end_of_current_pcm)
-                    new_tokens, current_audio_processed_upto = self.online.process_iter()
+                    new_tokens, current_audio_processed_upto = await asyncio.to_thread(self.online.process_iter)
 
                     # Get buffer information
                     _buffer_transcript_obj = self.online.get_buffer()
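`process_iter` is the CPU-heavy Whisper step; wrapping it in `asyncio.to_thread` keeps it from stalling the event loop that also services FFmpeg reads, diarization, and the WebSocket. A self-contained illustration of the pattern (not project code):

```python
import asyncio, time

def blocking_inference(chunk: bytes) -> str:
    time.sleep(0.5)                      # stand-in for a heavy Whisper call
    return f"processed {len(chunk)} bytes"

async def ticker():
    for _ in range(5):
        print("event loop still responsive")
        await asyncio.sleep(0.1)

async def main():
    tick = asyncio.create_task(ticker())
    # Without to_thread, the 0.5 s call would freeze the ticker as well.
    result = await asyncio.to_thread(blocking_inference, b"\x00" * 3200)
    print(result)
    await tick

asyncio.run(main())
```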
@@ -300,10 +284,10 @@ class AudioProcessor:
                     new_end_buffer = max(candidate_end_times)
 
                     await self.update_transcription(
-                        new_tokens, buffer_text, new_end_buffer, self.sep
+                        new_tokens, buffer_text, new_end_buffer
                     )
 
-                    if new_tokens and self.args.target_language and self.translation_queue:
+                    if self.translation_queue:
                         for token in new_tokens:
                             await self.translation_queue.put(token)
 

@@ -314,6 +298,14 @@ class AudioProcessor:
                 logger.warning(f"Traceback: {traceback.format_exc()}")
                 if 'pcm_array' in locals() and pcm_array is not SENTINEL : # Check if pcm_array was assigned from queue
                     self.transcription_queue.task_done()
 
+        if self.is_stopping:
+            logger.info("Transcription processor finishing due to stopping flag.")
+            if self.diarization_queue:
+                await self.diarization_queue.put(SENTINEL)
+            if self.translation_queue:
+                await self.translation_queue.put(SENTINEL)
+
         logger.info("Transcription processor task finished.")
 

@@ -328,13 +320,11 @@ class AudioProcessor:
                     logger.debug("Diarization processor received sentinel. Finishing.")
                     self.diarization_queue.task_done()
                     break
-
-                if type(item) is Silence:
+                elif type(item) is Silence:
                     cumulative_pcm_duration_stream_time += item.duration
                     diarization_obj.insert_silence(item.duration)
                     continue
-
-                if isinstance(item, np.ndarray):
+                elif isinstance(item, np.ndarray):
                     pcm_array = item
                 else:
                     raise Exception('item should be pcm_array')

@@ -367,14 +357,17 @@ class AudioProcessor:
         # in the future we want to have different languages for each speaker etc, so it will be more complex.
         while True:
             try:
-                token = await self.translation_queue.get() #block until at least 1 token
-                if token is SENTINEL:
+                item = await self.translation_queue.get() #block until at least 1 token
+                if item is SENTINEL:
                     logger.debug("Translation processor received sentinel. Finishing.")
                     self.translation_queue.task_done()
                     break
+                elif type(item) is Silence:
+                    online_translation.insert_silence(item.duration)
+                    continue
 
                 # get all the available tokens for translation. The more words, the more precise
-                tokens_to_process = [token]
+                tokens_to_process = [item]
                 additional_tokens = await get_all_from_queue(self.translation_queue)
 
                 sentinel_found = False

@@ -385,7 +378,7 @@ class AudioProcessor:
                         tokens_to_process.append(additional_token)
                 if tokens_to_process:
                     online_translation.insert_tokens(tokens_to_process)
-                    self.translated_segments = online_translation.process()
+                    self.translated_segments = await asyncio.to_thread(online_translation.process)
 
                 self.translation_queue.task_done()
                 for _ in additional_tokens:

@@ -398,7 +391,7 @@ class AudioProcessor:
             except Exception as e:
                 logger.warning(f"Exception in translation_processor: {e}")
                 logger.warning(f"Traceback: {traceback.format_exc()}")
-                if 'token' in locals() and token is not SENTINEL:
+                if 'token' in locals() and item is not SENTINEL:
                     self.translation_queue.task_done()
                 if 'additional_tokens' in locals():
                     for _ in additional_tokens:

@@ -407,39 +400,26 @@ class AudioProcessor:
 
     async def results_formatter(self):
         """Format processing results for output."""
-        last_sent_trans = None
-        last_sent_diar = None
         while True:
             try:
-                ffmpeg_state = await self.ffmpeg_manager.get_state()
-                if ffmpeg_state == FFmpegState.FAILED and self._ffmpeg_error:
-                    yield {
-                        "status": "error",
-                        "error": f"FFmpeg error: {self._ffmpeg_error}",
-                        "lines": [],
-                        "buffer_transcription": "",
-                        "buffer_diarization": "",
-                        "remaining_time_transcription": 0,
-                        "remaining_time_diarization": 0
-                    }
+                # If FFmpeg error occurred, notify front-end
+                if self._ffmpeg_error:
+                    yield FrontData(
+                        status="error",
+                        error=f"FFmpeg error: {self._ffmpeg_error}"
+                    )
                     self._ffmpeg_error = None
                     await asyncio.sleep(1)
                     continue
 
                 # Get current state
                 state = await self.get_current_state()
-                tokens = state["tokens"]
-                buffer_transcription = state["buffer_transcription"]
-                buffer_diarization = state["buffer_diarization"]
-                end_attributed_speaker = state["end_attributed_speaker"]
-                sep = state["sep"]
 
                 # Add dummy tokens if needed
-                if (not tokens or tokens[-1].is_dummy) and not self.args.transcription and self.args.diarization:
+                if (not state.tokens or state.tokens[-1].is_dummy) and not self.args.transcription and self.args.diarization:
                     await self.add_dummy_token()
                     sleep(0.5)
                     state = await self.get_current_state()
-                    tokens = state["tokens"]
 
                 # Format output
                 lines, undiarized_text, buffer_transcription, buffer_diarization = format_output(

@@ -447,53 +427,41 @@ class AudioProcessor:
                     self.silence,
                     current_time = time() - self.beg_loop if self.beg_loop else None,
                     args = self.args,
-                    debug = self.debug
+                    debug = self.debug,
+                    sep=self.sep
                 )
                 # Handle undiarized text
                 if undiarized_text:
-                    combined = sep.join(undiarized_text)
+                    combined = self.sep.join(undiarized_text)
                     if buffer_transcription:
-                        combined += sep
-                    await self.update_diarization(end_attributed_speaker, combined)
+                        combined += self.sep
+                    await self.update_diarization(state.end_attributed_speaker, combined)
                     buffer_diarization = combined
 
                 response_status = "active_transcription"
-                if not tokens and not buffer_transcription and not buffer_diarization:
+                if not state.tokens and not buffer_transcription and not buffer_diarization:
                     response_status = "no_audio_detected"
                     lines = []
-                elif response_status == "active_transcription" and not lines:
+                elif not lines:
                     lines = [Line(
                         speaker=1,
-                        start=state.get("end_buffer", 0),
-                        end=state.get("end_buffer", 0)
+                        start=state.end_buffer,
+                        end=state.end_buffer
                     )]
 
-                response = {
-                    "status": response_status,
-                    "lines": [line.to_dict() for line in lines],
-                    "buffer_transcription": buffer_transcription,
-                    "buffer_diarization": buffer_diarization,
-                    "remaining_time_transcription": state["remaining_time_transcription"],
-                    "remaining_time_diarization": state["remaining_time_diarization"] if self.args.diarization else 0
-                }
-
-                current_response_signature = f"{response_status} | " + \
-                    ' '.join([f"{line.speaker} {line.text}" for line in lines]) + \
-                    f" | {buffer_transcription} | {buffer_diarization}"
-
-                trans = state["remaining_time_transcription"]
-                diar = state["remaining_time_diarization"]
-                should_push = (
-                    current_response_signature != self.last_response_content
-                    or last_sent_trans is None
-                    or round(trans, 1) != round(last_sent_trans, 1)
-                    or round(diar, 1) != round(last_sent_diar, 1)
-                )
-                if should_push and (lines or buffer_transcription or buffer_diarization or response_status == "no_audio_detected" or trans > 0 or diar > 0):
+                response = FrontData(
+                    status=response_status,
+                    lines=lines,
+                    buffer_transcription=buffer_transcription,
+                    buffer_diarization=buffer_diarization,
+                    remaining_time_transcription=state.remaining_time_transcription,
+                    remaining_time_diarization=state.remaining_time_diarization if self.args.diarization else 0
+                )
+
+                should_push = (response != self.last_response_content)
+                if should_push and (lines or buffer_transcription or buffer_diarization or response_status == "no_audio_detected"):
                     yield response
-                    self.last_response_content = current_response_signature
-                    last_sent_trans = trans
-                    last_sent_diar = diar
+                    self.last_response_content = response
 
                 # Check for termination condition
                 if self.is_stopping:

@@ -507,32 +475,32 @@ class AudioProcessor:
                     logger.info("Results formatter: All upstream processors are done and in stopping state. Terminating.")
                     return
 
-                await asyncio.sleep(0.1) # Avoid overwhelming the client
+                await asyncio.sleep(0.05)
 
             except Exception as e:
                 logger.warning(f"Exception in results_formatter: {e}")
                 logger.warning(f"Traceback: {traceback.format_exc()}")
-                await asyncio.sleep(0.5) # Back off on error
+                await asyncio.sleep(0.5)
 
     async def create_tasks(self):
         """Create and start processing tasks."""
         self.all_tasks_for_cleanup = []
         processing_tasks_for_watchdog = []
 
-        success = await self.ffmpeg_manager.start()
-        if not success:
-            logger.error("Failed to start FFmpeg manager")
-            async def error_generator():
-                yield {
-                    "status": "error",
-                    "error": "FFmpeg failed to start. Please check that FFmpeg is installed.",
-                    "lines": [],
-                    "buffer_transcription": "",
-                    "buffer_diarization": "",
-                    "remaining_time_transcription": 0,
-                    "remaining_time_diarization": 0
-                }
-            return error_generator()
+        # If using FFmpeg (non-PCM input), start it and spawn stdout reader
+        if not self.is_pcm_input:
+            success = await self.ffmpeg_manager.start()
+            if not success:
+                logger.error("Failed to start FFmpeg manager")
+                async def error_generator():
+                    yield FrontData(
+                        status="error",
+                        error="FFmpeg failed to start. Please check that FFmpeg is installed."
+                    )
+                return error_generator()
+            self.ffmpeg_reader_task = asyncio.create_task(self.ffmpeg_stdout_reader())
+            self.all_tasks_for_cleanup.append(self.ffmpeg_reader_task)
+            processing_tasks_for_watchdog.append(self.ffmpeg_reader_task)
 
         if self.args.transcription and self.online:
             self.transcription_task = asyncio.create_task(self.transcription_processor())

@@ -549,10 +517,6 @@ class AudioProcessor:
             self.all_tasks_for_cleanup.append(self.translation_task)
             processing_tasks_for_watchdog.append(self.translation_task)
 
-        self.ffmpeg_reader_task = asyncio.create_task(self.ffmpeg_stdout_reader())
-        self.all_tasks_for_cleanup.append(self.ffmpeg_reader_task)
-        processing_tasks_for_watchdog.append(self.ffmpeg_reader_task)
-
         # Monitor overall system health
         self.watchdog_task = asyncio.create_task(self.watchdog(processing_tasks_for_watchdog))
         self.all_tasks_for_cleanup.append(self.watchdog_task)

@@ -573,15 +537,6 @@ class AudioProcessor:
                     logger.error(f"{task_name} unexpectedly completed with exception: {exc}")
                 else:
                     logger.info(f"{task_name} completed normally.")
 
-                # Check FFmpeg status through the manager
-                ffmpeg_state = await self.ffmpeg_manager.get_state()
-                if ffmpeg_state == FFmpegState.FAILED:
-                    logger.error("FFmpeg is in FAILED state, notifying results formatter")
-                    # FFmpeg manager will handle its own recovery
-                elif ffmpeg_state == FFmpegState.STOPPED and not self.is_stopping:
-                    logger.warning("FFmpeg unexpectedly stopped, attempting restart")
-                    await self.ffmpeg_manager.restart()
-
         except asyncio.CancelledError:
             logger.info("Watchdog task cancelled.")

@@ -601,9 +556,14 @@ class AudioProcessor:
         if created_tasks:
             await asyncio.gather(*created_tasks, return_exceptions=True)
         logger.info("All processing tasks cancelled or finished.")
-        await self.ffmpeg_manager.stop()
-        logger.info("FFmpeg manager stopped.")
-        if self.args.diarization and hasattr(self, 'diarization') and hasattr(self.diarization, 'close'):
+        if not self.is_pcm_input and self.ffmpeg_manager:
+            try:
+                await self.ffmpeg_manager.stop()
+                logger.info("FFmpeg manager stopped.")
+            except Exception as e:
+                logger.warning(f"Error stopping FFmpeg manager: {e}")
+        if self.args.diarization and hasattr(self, 'dianization') and hasattr(self.diarization, 'close'):
             self.diarization.close()
         logger.info("AudioProcessor cleanup complete.")
 

@@ -617,8 +577,13 @@ class AudioProcessor:
         if not message:
             logger.info("Empty audio message received, initiating stop sequence.")
             self.is_stopping = True
-            # Signal FFmpeg manager to stop accepting data
-            await self.ffmpeg_manager.stop()
+            if self.transcription_queue:
+                await self.transcription_queue.put(SENTINEL)
+
+            if not self.is_pcm_input and self.ffmpeg_manager:
+                await self.ffmpeg_manager.stop()
+
             return
 
         if self.is_stopping:

@@ -629,6 +594,9 @@ class AudioProcessor:
             self.pcm_buffer.extend(message)
             await self.handle_pcm_data()
         else:
+            if not self.ffmpeg_manager:
+                logger.error("FFmpeg manager not initialized for non-PCM input.")
+                return
             success = await self.ffmpeg_manager.write_data(message)
             if not success:
                 ffmpeg_state = await self.ffmpeg_manager.get_state()

@@ -671,6 +639,8 @@ class AudioProcessor:
                 await self.transcription_queue.put(silence_buffer)
             if self.args.diarization and self.diarization_queue:
                 await self.diarization_queue.put(silence_buffer)
+            if self.translation_queue:
+                await self.translation_queue.put(silence_buffer)
 
         if not self.silence:
             if self.args.transcription and self.transcription_queue:
Server module (`whisperlivekit-server` FastAPI app)

@@ -18,16 +18,7 @@ args = parse_args()
 transcription_engine = None
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-
-    #to remove after 0.2.8
-    if args.backend == "simulstreaming" and not args.disable_fast_encoder:
-        logger.warning(f"""
-{'='*50}
-WhisperLiveKit 0.2.8 has introduced a new fast encoder feature using MLX Whisper or Faster Whisper for improved speed. Use --disable-fast-encoder to disable if you encounter issues.
-{'='*50}
-""")
-
     global transcription_engine
     transcription_engine = TranscriptionEngine(
         **vars(args),

@@ -54,7 +45,7 @@ async def handle_websocket_results(websocket, results_generator):
     """Consumes results from the audio processor and sends them via WebSocket."""
     try:
         async for response in results_generator:
-            await websocket.send_json(response)
+            await websocket.send_json(response.to_dict())
         # when the results_generator finishes it means all audio has been processed
         logger.info("Results generator finished. Sending 'ready_to_stop' to client.")
         await websocket.send_json({"type": "ready_to_stop"})

@@ -72,6 +63,11 @@ async def websocket_endpoint(websocket: WebSocket):
     )
     await websocket.accept()
     logger.info("WebSocket connection opened.")
 
+    try:
+        await websocket.send_json({"type": "config", "useAudioWorklet": bool(args.pcm_input)})
+    except Exception as e:
+        logger.warning(f"Failed to send config to client: {e}")
+
     results_generator = await audio_processor.create_tasks()
     websocket_task = asyncio.create_task(handle_websocket_results(websocket, results_generator))
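With the config message above, a client can tell whether the server expects raw PCM (AudioWorklet) or compressed audio (MediaRecorder plus FFmpeg). A rough Python client sketch, where the endpoint path and the audio file are assumptions and the payload fields mirror `FrontData`, could be:

```python
import asyncio, json
import websockets  # pip install websockets

async def listen(url: str = "ws://localhost:8000/asr"):  # endpoint path is an assumption
    async with websockets.connect(url) as ws:
        cfg = json.loads(await ws.recv())         # first message: {"type": "config", "useAudioWorklet": ...}
        print("server config:", cfg)
        with open("audio.webm", "rb") as f:       # any container FFmpeg can decode, sent in small chunks
            while chunk := f.read(4096):
                await ws.send(chunk)
        await ws.send(b"")                        # empty message asks the server to stop
        async for message in ws:
            data = json.loads(message)
            if data.get("type") == "ready_to_stop":
                break
            print(data.get("lines"), data.get("buffer_transcription"))

asyncio.run(listen())
```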
whisperlivekit/core.py

@@ -4,7 +4,7 @@ try:
 except ImportError:
     from .whisper_streaming_custom.whisper_online import backend_factory
     from .whisper_streaming_custom.online_asr import OnlineASRProcessor
-from whisperlivekit.warmup import warmup_asr, warmup_online
+from whisperlivekit.warmup import warmup_asr
 from argparse import Namespace
 import sys
 

@@ -43,10 +43,12 @@ class TranscriptionEngine:
             "transcription": True,
             "vad": True,
             "pcm_input": False,
+
             # whisperstreaming params:
             "buffer_trimming": "segment",
             "confidence_validation": False,
             "buffer_trimming_sec": 15,
+
             # simulstreaming params:
             "disable_fast_encoder": False,
             "frame_threshold": 25,

@@ -61,10 +63,15 @@ class TranscriptionEngine:
             "max_context_tokens": None,
             "model_path": './base.pt',
             "diarization_backend": "sortformer",
+
             # diarization params:
             "disable_punctuation_split" : False,
             "segmentation_model": "pyannote/segmentation-3.0",
             "embedding_model": "pyannote/embedding",
+
+            # translation params:
+            "nllb_backend": "ctranslate2",
+            "nllb_size": "600M"
         }
 
         config_dict = {**defaults, **kwargs}

@@ -120,7 +127,7 @@ class TranscriptionEngine:
 
         else:
             self.asr, self.tokenizer = backend_factory(self.args)
             warmup_asr(self.asr, self.args.warmup_file) #for simulstreaming, warmup should be done in the online class not here
 
         if self.args.diarization:
             if self.args.diarization_backend == "diart":

@@ -142,8 +149,7 @@ class TranscriptionEngine:
                 raise Exception('Translation cannot be set with language auto')
             else:
                 from whisperlivekit.translation.translation import load_model
-                self.translation_model = load_model([self.args.lan]) #in the future we want to handle different languages for different speakers
+                self.translation_model = load_model([self.args.lan], backend=self.args.nllb_backend, model_size=self.args.nllb_size) #in the future we want to handle different languages for different speakers
-
         TranscriptionEngine._initialized = True
 

@@ -155,7 +161,6 @@ def online_factory(args, asr, tokenizer, logfile=sys.stderr):
             asr,
             logfile=logfile,
         )
-        # warmup_online(online, args.warmup_file)
     else:
         online = OnlineASRProcessor(
             asr,
|||||||
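The two `nllb_*` entries added to the defaults above are ordinary constructor options, since the engine merges `kwargs` over `defaults`. A minimal usage sketch, not taken from the repository; only the two keys shown in the diff are assumed to exist:

```python
# Sketch: overriding the new translation defaults when building the engine.
# kwargs are merged over the defaults dict shown above.
from whisperlivekit import TranscriptionEngine

engine = TranscriptionEngine(
    nllb_backend="transformers",  # "transformers" or "ctranslate2"
    nllb_size="600M",             # "600M" or "1.3B"
)
```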
@@ -7,11 +7,12 @@ import contextlib
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)

-ERROR_INSTALL_INSTRUCTIONS = """
+ERROR_INSTALL_INSTRUCTIONS = f"""
+{'='*50}
 FFmpeg is not installed or not found in your system's PATH.
-Please install FFmpeg to enable audio processing.
+Alternative Solution: You can still use WhisperLiveKit without FFmpeg by adding the --pcm-input parameter. Note that when using this option, audio will not be compressed between the frontend and backend, which may result in higher bandwidth usage.

-Installation instructions:
+If you want to install FFmpeg:

 # Ubuntu/Debian:
 sudo apt update && sudo apt install ffmpeg
@@ -25,6 +26,7 @@ brew install ffmpeg
 # 3. Add the 'bin' directory (e.g., C:\\FFmpeg\\bin) to your system's PATH environment variable.

 After installation, please restart the application.
+{'='*50}
 """

 class FFmpegState(Enum):
@@ -183,6 +185,8 @@ class FFmpegManager:
 async def _drain_stderr(self):
 try:
 while True:
+if not self.process or not self.process.stderr:
+break
 line = await self.process.stderr.readline()
 if not line:
 break
@@ -190,4 +194,4 @@ class FFmpegManager:
 except asyncio.CancelledError:
 logger.info("FFmpeg stderr drain task cancelled.")
 except Exception as e:
 logger.error(f"Error draining FFmpeg stderr: {e}")
@@ -20,7 +20,7 @@ def parse_args():
 help="""
 The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast.
 If not set, uses https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav.
-If False, no warmup is performed.
+If empty, no warmup is performed.
 """,
 )

@@ -177,7 +177,7 @@ def parse_args():
 "--pcm-input",
 action="store_true",
 default=False,
-help="If set, raw PCM (s16le) data is expected as input and FFmpeg will be bypassed."
+help="If set, raw PCM (s16le) data is expected as input and FFmpeg will be bypassed. Frontend will use AudioWorklet instead of MediaRecorder."
 )
 # SimulStreaming-specific arguments
 simulstreaming_group = parser.add_argument_group('SimulStreaming arguments (only used with --backend simulstreaming)')
@@ -287,6 +287,20 @@ def parse_args():
 help="Optional. Number of models to preload in memory to speed up loading (set up to the expected number of concurrent instances).",
 )

+simulstreaming_group.add_argument(
+"--nllb-backend",
+type=str,
+default="ctranslate2",
+help="transformers or ctranslate2",
+)
+
+simulstreaming_group.add_argument(
+"--nllb-size",
+type=str,
+default="600M",
+help="600M or 1.3B",
+)
+
 args = parser.parse_args()

 args.transcription = not args.no_transcription
@@ -39,7 +39,7 @@ def blank_to_silence(tokens):
 )
 else:
 if silence_token: #there was silence but no more
-if silence_token.end - silence_token.start >= MIN_SILENCE_DURATION:
+if silence_token.duration() >= MIN_SILENCE_DURATION:
 cleaned_tokens.append(
 silence_token
 )
@@ -46,15 +46,14 @@ def append_token_to_last_line(lines, sep, token, debug_info):
 lines[-1].text += sep + token.text + debug_info
 lines[-1].end = token.end

-def format_output(state, silence, current_time, args, debug):
+def format_output(state, silence, current_time, args, debug, sep):
 diarization = args.diarization
 disable_punctuation_split = args.disable_punctuation_split
-tokens = state["tokens"]
-translated_segments = state["translated_segments"] # Here we will attribute the speakers only based on the timestamps of the segments
-buffer_transcription = state["buffer_transcription"]
-buffer_diarization = state["buffer_diarization"]
-end_attributed_speaker = state["end_attributed_speaker"]
-sep = state["sep"]
+tokens = state.tokens
+translated_segments = state.translated_segments # Here we will attribute the speakers only based on the timestamps of the segments
+buffer_transcription = state.buffer_transcription
+buffer_diarization = state.buffer_diarization
+end_attributed_speaker = state.end_attributed_speaker

 previous_speaker = -1
 lines = []
@@ -124,14 +123,33 @@ def format_output(state, silence, current_time, args, debug):

 append_token_to_last_line(lines, sep, token, debug_info)
 if lines and translated_segments:
-cts_idx = 0 # current_translated_segment_idx
-for line in lines:
-while cts_idx < len(translated_segments):
-ts = translated_segments[cts_idx]
-if ts.start and ts.start >= line.start and ts.end <= line.end:
-line.translation += ts.text + ' '
-cts_idx += 1
-else:
-break
-return lines, undiarized_text, buffer_transcription, ''
+unassigned_translated_segments = []
+for ts in translated_segments:
+assigned = False
+for line in lines:
+if ts and ts.overlaps_with(line):
+if ts.is_within(line):
+line.translation += ts.text + ' '
+assigned = True
+break
+else:
+ts0, ts1 = ts.approximate_cut_at(line.end)
+if ts0 and line.overlaps_with(ts0):
+line.translation += ts0.text + ' '
+if ts1:
+unassigned_translated_segments.append(ts1)
+assigned = True
+break
+if not assigned:
+unassigned_translated_segments.append(ts)
+
+if unassigned_translated_segments:
+for line in lines:
+remaining_segments = []
+for ts in unassigned_translated_segments:
+if ts and ts.overlaps_with(line):
+line.translation += ts.text + ' '
+else:
+remaining_segments.append(ts)
+unassigned_translated_segments = remaining_segments #maybe do smth in the future about that
+return lines, undiarized_text, buffer_transcription, ''
@@ -1,5 +1,5 @@
-from dataclasses import dataclass
-from typing import Optional
+from dataclasses import dataclass, field
+from typing import Optional, Any
 from datetime import timedelta

 def format_time(seconds: float) -> str:
@@ -15,6 +15,21 @@ class TimedText:
 speaker: Optional[int] = -1
 probability: Optional[float] = None
 is_dummy: Optional[bool] = False

+def overlaps_with(self, other: 'TimedText') -> bool:
+return not (self.end <= other.start or other.end <= self.start)
+
+def is_within(self, other: 'TimedText') -> bool:
+return other.contains_timespan(self)
+
+def duration(self) -> float:
+return self.end - self.start
+
+def contains_time(self, time: float) -> bool:
+return self.start <= time <= self.end
+
+def contains_timespan(self, other: 'TimedText') -> bool:
+return self.start <= other.start and self.end >= other.end
+
 @dataclass
 class ASRToken(TimedText):
@@ -41,6 +56,34 @@ class SpeakerSegment(TimedText):
 class Translation(TimedText):
 pass

+def approximate_cut_at(self, cut_time):
+"""
+Each word in text is considered to be of duration (end-start)/len(words in text)
+"""
+if not self.text or not self.contains_time(cut_time):
+return self, None
+
+words = self.text.split()
+num_words = len(words)
+if num_words == 0:
+return self, None
+
+duration_per_word = self.duration() / num_words
+
+cut_word_index = int((cut_time - self.start) / duration_per_word)
+
+if cut_word_index >= num_words:
+cut_word_index = num_words - 1
+
+text0 = " ".join(words[:cut_word_index])
+text1 = " ".join(words[cut_word_index:])
+
+segment0 = Translation(start=self.start, end=cut_time, text=text0)
+segment1 = Translation(start=cut_time, end=self.end, text=text1)
+
+return segment0, segment1
+
 @dataclass
 class Silence():
 duration: float
@@ -57,4 +100,38 @@ class Line(TimedText):
 'translation': self.translation,
 'start': format_time(self.start),
 'end': format_time(self.end),
 }

+@dataclass
+class FrontData():
+status: str = ''
+error: str = ''
+lines: list[Line] = field(default_factory=list)
+buffer_transcription: str = ''
+buffer_diarization: str = ''
+remaining_time_transcription: float = 0.
+remaining_time_diarization: float = 0.
+
+def to_dict(self):
+_dict = {
+'status': self.status,
+'lines': [line.to_dict() for line in self.lines],
+'buffer_transcription': self.buffer_transcription,
+'buffer_diarization': self.buffer_diarization,
+'remaining_time_transcription': self.remaining_time_transcription,
+'remaining_time_diarization': self.remaining_time_diarization,
+}
+if self.error:
+_dict['error'] = self.error
+return _dict
+
+@dataclass
+class State():
+tokens: list
+translated_segments: list
+buffer_transcription: str
+buffer_diarization: str
+end_buffer: float
+end_attributed_speaker: float
+remaining_time_transcription: float
+remaining_time_diarization: float
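A quick worked example of the `TimedText` helpers and `Translation.approximate_cut_at` added above. This is illustrative only and assumes both dataclasses accept these fields as keyword arguments: a translated segment spanning from 10.0 s to 14.0 s with four words, cut at a line boundary of 12.0 s, splits into two two-word halves.

```python
# Illustrative sketch of the helpers introduced in this diff.
from whisperlivekit.timed_objects import Line, Translation

line = Line(start=9.0, end=12.0)
ts = Translation(start=10.0, end=14.0, text="bonjour tout le monde")

ts.overlaps_with(line)   # True: [10, 14] intersects [9, 12]
ts.is_within(line)       # False: ts ends after the line does

first, rest = ts.approximate_cut_at(line.end)
# duration_per_word = 4.0 s / 4 words = 1.0 s; cut index = int((12.0 - 10.0) / 1.0) = 2
# first -> Translation(start=10.0, end=12.0, text="bonjour tout")
# rest  -> Translation(start=12.0, end=14.0, text="le monde")
```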
@@ -1,3 +1,5 @@
+import logging
+import time
 import ctranslate2
 import torch
 import transformers
@@ -6,38 +8,42 @@ import huggingface_hub
 from whisperlivekit.translation.mapping_languages import get_nllb_code
 from whisperlivekit.timed_objects import Translation

+logger = logging.getLogger(__name__)

 #In diarization case, we may want to translate just one speaker, or at least start the sentences there

 PUNCTUATION_MARKS = {'.', '!', '?', '。', '!', '?'}

+MIN_SILENCE_DURATION_DEL_BUFFER = 3 #After a silence of x seconds, we consider the model should not use the buffer, even if the previous
+# sentence is not finished.
+
 @dataclass
 class TranslationModel():
 translator: ctranslate2.Translator
 tokenizer: dict
+device: str
+backend_type: str = 'ctranslate2'

-def load_model(src_langs):
-MODEL = 'nllb-200-distilled-600M-ctranslate2'
-MODEL_GUY = 'entai2965'
-huggingface_hub.snapshot_download(MODEL_GUY + '/' + MODEL,local_dir=MODEL)
+def load_model(src_langs, backend='ctranslate2', model_size='600M'):
 device = "cuda" if torch.cuda.is_available() else "cpu"
-translator = ctranslate2.Translator(MODEL,device=device)
+MODEL = f'nllb-200-distilled-{model_size}-ctranslate2'
+if backend=='ctranslate2':
+MODEL_GUY = 'entai2965'
+huggingface_hub.snapshot_download(MODEL_GUY + '/' + MODEL,local_dir=MODEL)
+translator = ctranslate2.Translator(MODEL,device=device)
+elif backend=='transformers':
+translator = transformers.AutoModelForSeq2SeqLM.from_pretrained(f"facebook/nllb-200-distilled-{model_size}")
 tokenizer = dict()
 for src_lang in src_langs:
 tokenizer[src_lang] = transformers.AutoTokenizer.from_pretrained(MODEL, src_lang=src_lang, clean_up_tokenization_spaces=True)

 return TranslationModel(
 translator=translator,
-tokenizer=tokenizer
+tokenizer=tokenizer,
+backend_type=backend,
+device = device
 )

-def translate(input, translation_model, tgt_lang):
-source = translation_model.tokenizer.convert_ids_to_tokens(translation_model.tokenizer.encode(input))
-target_prefix = [tgt_lang]
-results = translation_model.translator.translate_batch([source], target_prefix=[target_prefix])
-target = results[0].hypotheses[0][1:]
-return translation_model.tokenizer.decode(translation_model.tokenizer.convert_tokens_to_ids(target))

 class OnlineTranslation:
 def __init__(self, translation_model: TranslationModel, input_languages: list, output_languages: list):
 self.buffer = []
@@ -68,12 +74,19 @@ class OnlineTranslation:
 output_lang = self.output_languages[0]
 nllb_output_lang = get_nllb_code(output_lang)

-source = self.translation_model.tokenizer[input_lang].convert_ids_to_tokens(self.translation_model.tokenizer[input_lang].encode(input))
-results = self.translation_model.translator.translate_batch([source], target_prefix=[[nllb_output_lang]]) #we can use return_attention=True to try to optimize the stuff.
-target = results[0].hypotheses[0][1:]
-results = self.translation_model.tokenizer[input_lang].decode(self.translation_model.tokenizer[input_lang].convert_tokens_to_ids(target))
-return results
+tokenizer = self.translation_model.tokenizer[input_lang]
+tokenizer_output = tokenizer(input, return_tensors="pt").to(self.translation_model.device)
+if self.translation_model.backend_type == 'ctranslate2':
+source = tokenizer.convert_ids_to_tokens(tokenizer_output['input_ids'][0])
+results = self.translation_model.translator.translate_batch([source], target_prefix=[[nllb_output_lang]])
+target = results[0].hypotheses[0][1:]
+result = tokenizer.decode(tokenizer.convert_tokens_to_ids(target))
+else:
+translated_tokens = self.translation_model.translator.generate(**tokenizer_output, forced_bos_token_id=tokenizer.convert_tokens_to_ids(nllb_output_lang))
+result = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
+return result

 def translate_tokens(self, tokens):
 if tokens:
 text = ' '.join([token.text for token in tokens])
@@ -88,7 +101,6 @@ class OnlineTranslation:
 return translation
 return None

 def insert_tokens(self, tokens):
 self.buffer.extend(tokens)
@@ -109,7 +121,11 @@ class OnlineTranslation:
 self.translation_remaining = self.translate_tokens(self.buffer)
 self.len_processed_buffer = len(self.buffer)
 return self.validated + [self.translation_remaining]

+def insert_silence(self, silence_duration: float):
+if silence_duration >= MIN_SILENCE_DURATION_DEL_BUFFER:
+self.buffer = []
+self.validated += [self.translation_remaining]

 if __name__ == '__main__':
 output_lang = 'fr'
@@ -122,16 +138,13 @@ if __name__ == '__main__':
 test = test_string.split(' ')
 step = len(test) // 3

-shared_model = load_model([input_lang])
+shared_model = load_model([input_lang], backend='ctranslate2')
 online_translation = OnlineTranslation(shared_model, input_languages=[input_lang], output_languages=[output_lang])

+beg_inference = time.time()
 for id in range(5):
 val = test[id*step : (id+1)*step]
 val_str = ' '.join(val)
 result = online_translation.translate(val_str)
 print(result)
+print('inference time:', time.time() - beg_inference)

-# print(result)
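The transformers branch above follows the standard Hugging Face NLLB pattern. For reference, a standalone sketch outside the `TranslationModel` wrapper; the model name and language-code convention match what the diff uses, the rest is purely illustrative:

```python
# Standalone sketch of the transformers backend added in this diff;
# the repository wraps this in TranslationModel/OnlineTranslation.
import transformers

tok = transformers.AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="eng_Latn")
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-distilled-600M")

inputs = tok("The weather is nice today.", return_tensors="pt")
out = model.generate(
    **inputs,
    forced_bos_token_id=tok.convert_tokens_to_ids("fra_Latn"))  # target language code
print(tok.batch_decode(out, skip_special_tokens=True)[0])
```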
@@ -6,57 +6,46 @@ logger = logging.getLogger(__name__)
 def load_file(warmup_file=None, timeout=5):
 import os
 import tempfile
+import urllib.request
 import librosa

+if warmup_file == "":
+logger.info(f"Skipping warmup.")
+return None

+# Download JFK sample if not already present
 if warmup_file is None:
-# Download JFK sample if not already present
 jfk_url = "https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav"
 temp_dir = tempfile.gettempdir()
 warmup_file = os.path.join(temp_dir, "whisper_warmup_jfk.wav")
-if not os.path.exists(warmup_file):
-logger.debug(f"Downloading warmup file from {jfk_url}")
-print(f"Downloading warmup file from {jfk_url}")
-import time
-import urllib.request
-import urllib.error
-import socket
-original_timeout = socket.getdefaulttimeout()
-socket.setdefaulttimeout(timeout)
-start_time = time.time()
+if not os.path.exists(warmup_file) or os.path.getsize(warmup_file) == 0:
 try:
-urllib.request.urlretrieve(jfk_url, warmup_file)
-logger.debug(f"Download successful in {time.time() - start_time:.2f}s")
-except (urllib.error.URLError, socket.timeout) as e:
-logger.warning(f"Download failed: {e}. Proceeding without warmup.")
+logger.debug(f"Downloading warmup file from {jfk_url}")
+with urllib.request.urlopen(jfk_url, timeout=timeout) as r, open(warmup_file, "wb") as f:
+f.write(r.read())
+except Exception as e:
+logger.warning(f"Warmup file download failed: {e}.")
 return None
-finally:
-socket.setdefaulttimeout(original_timeout)
-elif not warmup_file:
-return None
-if not warmup_file or not os.path.exists(warmup_file) or os.path.getsize(warmup_file) == 0:
-logger.warning(f"Warmup file {warmup_file} invalid or missing.")
+# Validate file and load
+if not os.path.exists(warmup_file) or os.path.getsize(warmup_file) == 0:
+logger.warning(f"Warmup file {warmup_file} is invalid or missing.")
 return None

 try:
-audio, sr = librosa.load(warmup_file, sr=16000)
+audio, _ = librosa.load(warmup_file, sr=16000)
+return audio
 except Exception as e:
-logger.warning(f"Failed to load audio file: {e}")
+logger.warning(f"Failed to load warmup file: {e}")
 return None
-return audio

 def warmup_asr(asr, warmup_file=None, timeout=5):
 """
 Warmup the ASR model by transcribing a short audio file.
 """
-audio = load_file(warmup_file=None, timeout=5)
+audio = load_file(warmup_file=warmup_file, timeout=timeout)
+if audio is None:
+logger.warning("Warmup file unavailable. Skipping ASR warmup.")
+return
 asr.transcribe(audio)
-logger.info("ASR model is warmed up")
+logger.info("ASR model is warmed up.")

-def warmup_online(online, warmup_file=None, timeout=5):
-audio = load_file(warmup_file=None, timeout=5)
-online.warmup(audio)
-logger.warning("ASR is warmed up")
@@ -438,7 +438,6 @@ label {
 font-size: 13px;
 border-radius: 30px;
 padding: 2px 10px;
-display: none;
 }

 .loading {

@@ -12,6 +12,8 @@ let timerInterval = null;
 let audioContext = null;
 let analyser = null;
 let microphone = null;
+let workletNode = null;
+let recorderWorker = null;
 let waveCanvas = document.getElementById("waveCanvas");
 let waveCtx = waveCanvas.getContext("2d");
 let animationFrame = null;
@@ -20,6 +22,9 @@ let lastReceivedData = null;
 let lastSignature = null;
 let availableMicrophones = [];
 let selectedMicrophoneId = null;
+let serverUseAudioWorklet = null;
+let configReadyResolve;
+const configReady = new Promise((r) => (configReadyResolve = r));

 waveCanvas.width = 60 * (window.devicePixelRatio || 1);
 waveCanvas.height = 30 * (window.devicePixelRatio || 1);
@@ -226,6 +231,14 @@ function setupWebSocket() {

 websocket.onmessage = (event) => {
 const data = JSON.parse(event.data);
+if (data.type === "config") {
+serverUseAudioWorklet = !!data.useAudioWorklet;
+statusText.textContent = serverUseAudioWorklet
+? "Connected. Using AudioWorklet (PCM)."
+: "Connected. Using MediaRecorder (WebM).";
+if (configReadyResolve) configReadyResolve();
+return;
+}

 if (data.type === "ready_to_stop") {
 console.log("Ready to stop received, finalizing display and closing WebSocket.");
@@ -332,13 +345,6 @@ function renderLinesWithBuffer(
 }

 let currentLineText = item.text || "";

-if (item.translation) {
-currentLineText += `<div class="label_translation">
-<img src="/web/src/translate.svg" alt="Translation" width="12" height="12" />
-<span>${item.translation}</span>
-</div>`;
-}

 if (idx === lines.length - 1) {
 if (!isFinalizing && item.speaker !== -2) {
@@ -372,6 +378,13 @@ function renderLinesWithBuffer(
 }
 }
 }

+if (item.translation) {
+currentLineText += `<div class="label_translation">
+<img src="/web/src/translate.svg" alt="Translation" width="12" height="12" />
+<span>${item.translation}</span>
+</div>`;
+}

 return currentLineText.trim().length > 0 || speakerLabel.length > 0
 ? `<p>${speakerLabel}<br/><div class='textcontent'>${currentLineText}</div></p>`
@@ -457,13 +470,54 @@ async function startRecording() {
 microphone = audioContext.createMediaStreamSource(stream);
 microphone.connect(analyser);

-recorder = new MediaRecorder(stream, { mimeType: "audio/webm" });
-recorder.ondataavailable = (e) => {
-if (websocket && websocket.readyState === WebSocket.OPEN) {
-websocket.send(e.data);
-}
-};
-recorder.start(chunkDuration);
+if (serverUseAudioWorklet) {
+if (!audioContext.audioWorklet) {
+throw new Error("AudioWorklet is not supported in this browser");
+}
+await audioContext.audioWorklet.addModule("/web/pcm_worklet.js");
+workletNode = new AudioWorkletNode(audioContext, "pcm-forwarder", { numberOfInputs: 1, numberOfOutputs: 0, channelCount: 1 });
+microphone.connect(workletNode);
+
+recorderWorker = new Worker("/web/recorder_worker.js");
+recorderWorker.postMessage({
+command: "init",
+config: {
+sampleRate: audioContext.sampleRate,
+},
+});
+
+recorderWorker.onmessage = (e) => {
+if (websocket && websocket.readyState === WebSocket.OPEN) {
+websocket.send(e.data.buffer);
+}
+};
+
+workletNode.port.onmessage = (e) => {
+const data = e.data;
+const ab = data instanceof ArrayBuffer ? data : data.buffer;
+recorderWorker.postMessage(
+{
+command: "record",
+buffer: ab,
+},
+[ab]
+);
+};
+} else {
+try {
+recorder = new MediaRecorder(stream, { mimeType: "audio/webm" });
+} catch (e) {
+recorder = new MediaRecorder(stream);
+}
+recorder.ondataavailable = (e) => {
+if (websocket && websocket.readyState === WebSocket.OPEN) {
+if (e.data && e.data.size > 0) {
+websocket.send(e.data);
+}
+}
+};
+recorder.start(chunkDuration);
+}

 startTime = Date.now();
 timerInterval = setInterval(updateTimer, 1000);
@@ -502,10 +556,28 @@ async function stopRecording() {
 }

 if (recorder) {
-recorder.stop();
+try {
+recorder.stop();
+} catch (e) {
+}
 recorder = null;
 }

+if (recorderWorker) {
+recorderWorker.terminate();
+recorderWorker = null;
+}
+
+if (workletNode) {
+try {
+workletNode.port.onmessage = null;
+} catch (e) {}
+try {
+workletNode.disconnect();
+} catch (e) {}
+workletNode = null;
+}

 if (microphone) {
 microphone.disconnect();
 microphone = null;
@@ -549,9 +621,11 @@ async function toggleRecording() {
 console.log("Connecting to WebSocket");
 try {
 if (websocket && websocket.readyState === WebSocket.OPEN) {
+await configReady;
 await startRecording();
 } else {
 await setupWebSocket();
+await configReady;
 await startRecording();
 }
 } catch (err) {
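The `config` message the frontend now waits for is a small JSON object of the form `{"type": "config", "useAudioWorklet": <bool>}`. A hedged sketch of what the server side presumably sends right after accepting the WebSocket; this is illustrative only and not the project's actual handler:

```python
# Illustrative sketch: the frontend above expects a first message
# {"type": "config", "useAudioWorklet": bool} before it starts recording.
from fastapi import FastAPI, WebSocket

app = FastAPI()

@app.websocket("/asr")
async def asr_endpoint(websocket: WebSocket):
    await websocket.accept()
    use_audio_worklet = True  # assumption: mirrors the server's --pcm-input flag
    await websocket.send_json({"type": "config", "useAudioWorklet": use_audio_worklet})
    # ... audio frames are then received and fed to the processing pipeline
```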
whisperlivekit/web/pcm_worklet.js (new file, 16 lines)
@@ -0,0 +1,16 @@
+class PCMForwarder extends AudioWorkletProcessor {
+process(inputs) {
+const input = inputs[0];
+if (input && input[0] && input[0].length) {
+// Forward mono channel (0). If multi-channel, downmixing can be added here.
+const channelData = input[0];
+const copy = new Float32Array(channelData.length);
+copy.set(channelData);
+this.port.postMessage(copy, [copy.buffer]);
+}
+// Keep processor alive
+return true;
+}
+}
+
+registerProcessor('pcm-forwarder', PCMForwarder);
whisperlivekit/web/recorder_worker.js (new file, 58 lines)
@@ -0,0 +1,58 @@
+let sampleRate = 48000;
+let targetSampleRate = 16000;
+
+self.onmessage = function (e) {
+switch (e.data.command) {
+case 'init':
+init(e.data.config);
+break;
+case 'record':
+record(e.data.buffer);
+break;
+}
+};
+
+function init(config) {
+sampleRate = config.sampleRate;
+targetSampleRate = config.targetSampleRate || 16000;
+}
+
+function record(inputBuffer) {
+const buffer = new Float32Array(inputBuffer);
+const resampledBuffer = resample(buffer, sampleRate, targetSampleRate);
+const pcmBuffer = toPCM(resampledBuffer);
+self.postMessage({ buffer: pcmBuffer }, [pcmBuffer]);
+}
+
+function resample(buffer, from, to) {
+if (from === to) {
+return buffer;
+}
+const ratio = from / to;
+const newLength = Math.round(buffer.length / ratio);
+const result = new Float32Array(newLength);
+let offsetResult = 0;
+let offsetBuffer = 0;
+while (offsetResult < result.length) {
+const nextOffsetBuffer = Math.round((offsetResult + 1) * ratio);
+let accum = 0, count = 0;
+for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
+accum += buffer[i];
+count++;
+}
+result[offsetResult] = accum / count;
+offsetResult++;
+offsetBuffer = nextOffsetBuffer;
+}
+return result;
+}
+
+function toPCM(input) {
+const buffer = new ArrayBuffer(input.length * 2);
+const view = new DataView(buffer);
+for (let i = 0; i < input.length; i++) {
+const s = Math.max(-1, Math.min(1, input[i]));
+view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
+}
+return buffer;
+}
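On the receiving end, the little-endian 16-bit frames produced by `toPCM` above decode back into 16 kHz float audio with a couple of lines of NumPy. A hedged sketch; the function name is illustrative and not taken from the repository:

```python
# Illustrative sketch of decoding the s16le PCM chunks sent by recorder_worker.js.
import numpy as np

def pcm_s16le_to_float32(chunk: bytes) -> np.ndarray:
    # int16 range [-32768, 32767] -> float32 range [-1.0, 1.0], 16 kHz mono
    samples = np.frombuffer(chunk, dtype="<i2")
    return samples.astype(np.float32) / 32768.0
```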