mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-03-08 06:44:09 +00:00
Compare commits
41 Commits
0.1.3
...
windows_au
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e704b0b0db | ||
|
|
2dd974ade0 | ||
|
|
bc7c32100f | ||
|
|
c4150894af | ||
|
|
25bf242ce1 | ||
|
|
14cc601a5c | ||
|
|
34d5d513fa | ||
|
|
2ab3dac948 | ||
|
|
b56fcffde1 | ||
|
|
2def194893 | ||
|
|
29978da301 | ||
|
|
b708890788 | ||
|
|
3ac4c514cf | ||
|
|
3c58bfcfa2 | ||
|
|
d53b7a323a | ||
|
|
02de5993e6 | ||
|
|
d94560ef37 | ||
|
|
f62baa80b7 | ||
|
|
0b43035701 | ||
|
|
704170ccf3 | ||
|
|
09279c572a | ||
|
|
23e41f993f | ||
|
|
c791b1e125 | ||
|
|
3de2990ec4 | ||
|
|
51e6a6f6f9 | ||
|
|
f6e53b2fab | ||
|
|
5d6f08ff7a | ||
|
|
583a26da88 | ||
|
|
5b3d8969e8 | ||
|
|
40cca184c1 | ||
|
|
47ed345f9e | ||
|
|
9c9c179684 | ||
|
|
b870c12f62 | ||
|
|
cfd5905fd4 | ||
|
|
2399487e45 | ||
|
|
afd88310fd | ||
|
|
080f446b0d | ||
|
|
8bd2b36488 | ||
|
|
25fd924bf9 | ||
|
|
ff8fd0ec72 | ||
|
|
e99f53e649 |
82
Dockerfile
Normal file
82
Dockerfile
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu22.04
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
ENV PYTHONUNBUFFERED=1
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
ARG EXTRAS
|
||||||
|
ARG HF_PRECACHE_DIR
|
||||||
|
ARG HF_TKN_FILE
|
||||||
|
|
||||||
|
# Install system dependencies
|
||||||
|
#RUN apt-get update && \
|
||||||
|
# apt-get install -y ffmpeg git && \
|
||||||
|
# apt-get clean && \
|
||||||
|
# rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# 2) Install system dependencies + Python + pip
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
python3 \
|
||||||
|
python3-pip \
|
||||||
|
ffmpeg \
|
||||||
|
git && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Install WhisperLiveKit directly, allowing for optional dependencies
|
||||||
|
# Note: For gated models, you need to add your HF token. See README.md
|
||||||
|
# for more details.
|
||||||
|
RUN if [ -n "$EXTRAS" ]; then \
|
||||||
|
echo "Installing with extras: [$EXTRAS]"; \
|
||||||
|
pip install --no-cache-dir .[$EXTRAS]; \
|
||||||
|
else \
|
||||||
|
echo "Installing base package only"; \
|
||||||
|
pip install --no-cache-dir .; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Enable in-container caching for Hugging Face models by:
|
||||||
|
# Note: If running multiple containers, better to map a shared
|
||||||
|
# bucket.
|
||||||
|
#
|
||||||
|
# A) Make the cache directory persistent via an anonymous volume.
|
||||||
|
# Note: This only persists for a single, named container. This is
|
||||||
|
# only for convenience at de/test stage.
|
||||||
|
# For prod, it is better to use a named volume via host mount/k8s.
|
||||||
|
VOLUME ["/root/.cache/huggingface/hub"]
|
||||||
|
|
||||||
|
# or
|
||||||
|
# B) Conditionally copy a local pre-cache from the build context to the
|
||||||
|
# container's cache via the HF_PRECACHE_DIR build-arg.
|
||||||
|
# WARNING: This will copy ALL files in the pre-cache location.
|
||||||
|
|
||||||
|
# Conditionally copy a cache directory if provided
|
||||||
|
RUN if [ -n "$HF_PRECACHE_DIR" ]; then \
|
||||||
|
echo "Copying Hugging Face cache from $HF_PRECACHE_DIR"; \
|
||||||
|
mkdir -p /root/.cache/huggingface/hub && \
|
||||||
|
cp -r $HF_PRECACHE_DIR/* /root/.cache/huggingface/hub; \
|
||||||
|
else \
|
||||||
|
echo "No local Hugging Face cache specified, skipping copy"; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Conditionally copy a Hugging Face token if provided
|
||||||
|
|
||||||
|
RUN if [ -n "$HF_TKN_FILE" ]; then \
|
||||||
|
echo "Copying Hugging Face token from $HF_TKN_FILE"; \
|
||||||
|
mkdir -p /root/.cache/huggingface && \
|
||||||
|
cp $HF_TKN_FILE /root/.cache/huggingface/token; \
|
||||||
|
else \
|
||||||
|
echo "No Hugging Face token file specified, skipping token setup"; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Expose port for the transcription server
|
||||||
|
EXPOSE 8000
|
||||||
|
|
||||||
|
ENTRYPOINT ["whisperlivekit-server", "--host", "0.0.0.0"]
|
||||||
|
|
||||||
|
# Default args
|
||||||
|
CMD ["--model", "tiny.en"]
|
||||||
38
LICENSE
38
LICENSE
@@ -1,21 +1,33 @@
|
|||||||
MIT License
|
MIT License
|
||||||
|
|
||||||
Copyright (c) 2023 ÚFAL
|
Copyright (c) 2025 Quentin Fuxa.
|
||||||
|
Based on:
|
||||||
|
- The original work by ÚFAL. License: https://github.com/ufal/whisper_streaming/blob/main/LICENSE
|
||||||
|
- The work by Snakers4 (silero-vad). License: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
|
||||||
|
- The work in Diart by juanmc2005. License: https://github.com/juanmc2005/diart/blob/main/LICENSE
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
in the Software without restriction, including without limitation the rights
|
in the Software without restriction, including without limitation the rights
|
||||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
copies of the Software, and to permit persons to whom the Software is
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
furnished to do so, subject to the following conditions:
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be included in all
|
The above copyright notice and this permission notice shall be included in all
|
||||||
copies or substantial portions of the Software.
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
SOFTWARE.
|
SOFTWARE.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Third-party components included in this software:
|
||||||
|
|
||||||
|
- **whisper_streaming** by ÚFAL – MIT License – https://github.com/ufal/whisper_streaming
|
||||||
|
- **silero-vad** by Snakers4 – MIT License – https://github.com/snakers4/silero-vad
|
||||||
|
- **Diart** by juanmc2005 – MIT License – https://github.com/juanmc2005/diart
|
||||||
|
|||||||
336
README.md
336
README.md
@@ -1,43 +1,77 @@
|
|||||||
<h1 align="center">WhisperLiveKit</h1>
|
<h1 align="center">WhisperLiveKit</h1>
|
||||||
<p align="center"><b>Real-time, Fully Local Whisper's Speech-to-Text and Speaker Diarization</b></p>
|
|
||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<img alt="PyPI Version" src="https://img.shields.io/pypi/v/whisperlivekit?color=g">
|
<img src="https://raw.githubusercontent.com/QuentinFuxa/WhisperLiveKit/refs/heads/main/demo.png" alt="WhisperLiveKit Demo" width="730">
|
||||||
<img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/whisperlivekit">
|
|
||||||
<img alt="Python Versions" src="https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-dark_green">
|
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
This project is based on [Whisper Streaming](https://github.com/ufal/whisper_streaming) and lets you transcribe audio directly from your browser. Simply launch the local server and grant microphone access. Everything runs locally on your machine ✨
|
<p align="center"><b>Real-time, Fully Local Speech-to-Text with Speaker Diarization</b></p>
|
||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<img src="https://raw.githubusercontent.com/QuentinFuxa/WhisperLiveKit/refs/heads/main/demo.png" alt="Demo Screenshot" width="730">
|
<a href="https://pypi.org/project/whisperlivekit/"><img alt="PyPI Version" src="https://img.shields.io/pypi/v/whisperlivekit?color=g"></a>
|
||||||
|
<a href="https://pepy.tech/project/whisperlivekit"><img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/whisperlivekit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads"></a>
|
||||||
|
<a href="https://pypi.org/project/whisperlivekit/"><img alt="Python Versions" src="https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-dark_green"></a>
|
||||||
|
<a href="https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/QuentinFuxa/WhisperLiveKit?color=blue"></a>
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
### Differences from [Whisper Streaming](https://github.com/ufal/whisper_streaming)
|
## 🚀 Overview
|
||||||
|
|
||||||
#### ⚙️ **Core Improvements**
|
This project is based on [Whisper Streaming](https://github.com/ufal/whisper_streaming) and lets you transcribe audio directly from your browser. WhisperLiveKit provides a complete backend solution for real-time speech transcription with a functional and simple frontend that you can customize for your own needs. Everything runs locally on your machine ✨
|
||||||
|
|
||||||
|
### 🔄 Architecture
|
||||||
|
|
||||||
|
WhisperLiveKit consists of three main components:
|
||||||
|
|
||||||
|
- **Frontend**: A basic HTML & JavaScript interface that captures microphone audio and streams it to the backend via WebSockets. You can use and adapt the provided template at [whisperlivekit/web/live_transcription.html](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/web/live_transcription.html) for your specific use case.
|
||||||
|
- **Backend (Web Server)**: A FastAPI-based WebSocket server that receives streamed audio data, processes it in real time, and returns transcriptions to the frontend. This is where the WebSocket logic and routing live.
|
||||||
|
- **Core Backend (Library Logic)**: A server-agnostic core that handles audio processing, ASR, and diarization. It exposes reusable components that take in audio bytes and return transcriptions. This makes it easy to plug into any WebSocket or audio stream pipeline.
|
||||||
|
|
||||||
|
|
||||||
|
### ✨ Key Features
|
||||||
|
|
||||||
|
- **🎙️ Real-time Transcription** - Convert speech to text instantly as you speak
|
||||||
|
- **👥 Speaker Diarization** - Identify different speakers in real-time using [Diart](https://github.com/juanmc2005/diart)
|
||||||
|
- **🔒 Fully Local** - All processing happens on your machine - no data sent to external servers
|
||||||
|
- **📱 Multi-User Support** - Handle multiple users simultaneously with a single backend/server
|
||||||
|
|
||||||
|
### ⚙️ Core differences from [Whisper Streaming](https://github.com/ufal/whisper_streaming)
|
||||||
|
|
||||||
|
- **Automatic Silence Chunking** – Automatically chunks when no audio is detected to limit buffer size
|
||||||
|
- **Multi-User Support** – Handles multiple users simultaneously by decoupling backend and online ASR
|
||||||
|
- **Confidence Validation** – Immediately validate high-confidence tokens for faster inference
|
||||||
|
- **MLX Whisper Backend** – Optimized for Apple Silicon for faster local processing
|
||||||
- **Buffering Preview** – Displays unvalidated transcription segments
|
- **Buffering Preview** – Displays unvalidated transcription segments
|
||||||
- **Multi-User Support** – Handles multiple users simultaneously by decoupling backend and online asr
|
|
||||||
- **MLX Whisper Backend** – Optimized for Apple Silicon for faster local processing.
|
|
||||||
- **Confidence validation** – Immediately validate high-confidence tokens for faster inference
|
|
||||||
|
|
||||||
#### 🎙️ **Speaker Identification**
|
## 📖 Quick Start
|
||||||
- **Real-Time Diarization** – Identify different speakers in real time using [Diart](https://github.com/juanmc2005/diart)
|
|
||||||
|
|
||||||
#### 🌐 **Web & API**
|
```bash
|
||||||
- **Built-in Web UI** – Simple raw html browser interface with no frontend setup required
|
# Install the package
|
||||||
- **FastAPI WebSocket Server** – Real-time speech-to-text processing with async FFmpeg streaming.
|
pip install whisperlivekit
|
||||||
- **JavaScript Client** – Ready-to-use MediaRecorder implementation for seamless client-side integration.
|
|
||||||
|
|
||||||
## Installation
|
# Start the transcription server
|
||||||
|
whisperlivekit-server --model tiny.en
|
||||||
|
|
||||||
### Via pip (recommended)
|
# Open your browser at http://localhost:8000
|
||||||
|
```
|
||||||
|
|
||||||
|
### Quick Start with SSL
|
||||||
|
```bash
|
||||||
|
# You must provide a certificate and key
|
||||||
|
whisperlivekit-server --ssl-certfile public.crt --ssl-keyfile private.key
|
||||||
|
|
||||||
|
# Open your browser at https://localhost:8000
|
||||||
|
```
|
||||||
|
|
||||||
|
That's it! Start speaking and watch your words appear on screen.
|
||||||
|
|
||||||
|
## 🛠️ Installation Options
|
||||||
|
|
||||||
|
### Install from PyPI (Recommended)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install whisperlivekit
|
pip install whisperlivekit
|
||||||
```
|
```
|
||||||
|
|
||||||
### From source
|
### Install from Source
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/QuentinFuxa/WhisperLiveKit
|
git clone https://github.com/QuentinFuxa/WhisperLiveKit
|
||||||
@@ -47,78 +81,92 @@ pip install -e .
|
|||||||
|
|
||||||
### System Dependencies
|
### System Dependencies
|
||||||
|
|
||||||
You need to install FFmpeg on your system:
|
FFmpeg is required:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# For Ubuntu/Debian:
|
# Ubuntu/Debian
|
||||||
sudo apt install ffmpeg
|
sudo apt install ffmpeg
|
||||||
|
|
||||||
# For macOS:
|
# macOS
|
||||||
brew install ffmpeg
|
brew install ffmpeg
|
||||||
|
|
||||||
# For Windows:
|
# Windows
|
||||||
# Download from https://ffmpeg.org/download.html and add to PATH
|
# Download from https://ffmpeg.org/download.html and add to PATH
|
||||||
```
|
```
|
||||||
|
|
||||||
### Optional Dependencies
|
### Optional Dependencies
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# If you want to use VAC (Voice Activity Controller). Useful for preventing hallucinations
|
# Voice Activity Controller (prevents hallucinations)
|
||||||
pip install torch
|
pip install torch
|
||||||
|
|
||||||
# If you choose sentences as buffer trimming strategy
|
# Sentence-based buffer trimming
|
||||||
pip install mosestokenizer wtpsplit
|
pip install mosestokenizer wtpsplit
|
||||||
pip install tokenize_uk # If you work with Ukrainian text
|
pip install tokenize_uk # If you work with Ukrainian text
|
||||||
|
|
||||||
# If you want to use diarization
|
# Speaker diarization
|
||||||
pip install diart
|
pip install diart
|
||||||
|
|
||||||
# Optional backends. Default is faster-whisper
|
# Alternative Whisper backends (default is faster-whisper)
|
||||||
pip install whisperlivekit[whisper] # Original Whisper backend
|
pip install whisperlivekit[whisper] # Original Whisper
|
||||||
pip install whisperlivekit[whisper-timestamped] # Whisper with improved timestamps
|
pip install whisperlivekit[whisper-timestamped] # Improved timestamps
|
||||||
pip install whisperlivekit[mlx-whisper] # Optimized for Apple Silicon
|
pip install whisperlivekit[mlx-whisper] # Apple Silicon optimization
|
||||||
pip install whisperlivekit[openai] # OpenAI API backend
|
pip install whisperlivekit[openai] # OpenAI API
|
||||||
|
|
||||||
|
# System audio capture (Windows only)
|
||||||
|
pip install whisperlivekit[pyaudiowpatch] # Use PyAudioWPatch for system audio loopback
|
||||||
```
|
```
|
||||||
|
|
||||||
### Get access to 🎹 pyannote models
|
### 🎹 Pyannote Models Setup
|
||||||
|
|
||||||
By default, diart is based on [pyannote.audio](https://github.com/pyannote/pyannote-audio) models from the [huggingface](https://huggingface.co/) hub.
|
For diarization, you need access to pyannote.audio models:
|
||||||
In order to use them, please follow these steps:
|
|
||||||
|
|
||||||
1) [Accept user conditions](https://huggingface.co/pyannote/segmentation) for the `pyannote/segmentation` model
|
1. [Accept user conditions](https://huggingface.co/pyannote/segmentation) for the `pyannote/segmentation` model
|
||||||
2) [Accept user conditions](https://huggingface.co/pyannote/segmentation-3.0) for the newest `pyannote/segmentation-3.0` model
|
2. [Accept user conditions](https://huggingface.co/pyannote/segmentation-3.0) for the `pyannote/segmentation-3.0` model
|
||||||
3) [Accept user conditions](https://huggingface.co/pyannote/embedding) for the `pyannote/embedding` model
|
3. [Accept user conditions](https://huggingface.co/pyannote/embedding) for the `pyannote/embedding` model
|
||||||
4) Install [huggingface-cli](https://huggingface.co/docs/huggingface_hub/quick-start#install-the-hub-library) and [log in](https://huggingface.co/docs/huggingface_hub/quick-start#login) with your user access token (or provide it manually in diart CLI or API).
|
4. Login with HuggingFace:
|
||||||
|
```bash
|
||||||
|
pip install huggingface_hub
|
||||||
|
huggingface-cli login
|
||||||
|
```
|
||||||
|
|
||||||
|
## 💻 Usage Examples
|
||||||
|
|
||||||
|
### Command-line Interface
|
||||||
|
|
||||||
## Usage
|
Start the transcription server with various options:
|
||||||
|
|
||||||
### Using the command-line tool
|
|
||||||
|
|
||||||
After installation, you can start the server using the provided command-line tool:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
whisperlivekit-server --host 0.0.0.0 --port 8000 --model tiny.en
|
# Basic server with English model
|
||||||
|
whisperlivekit-server --model tiny.en
|
||||||
|
|
||||||
|
# Advanced configuration with diarization
|
||||||
|
whisperlivekit-server --host 0.0.0.0 --port 8000 --model medium --diarization --language auto
|
||||||
|
|
||||||
|
# Using PyAudioWPatch for system audio input (Windows only)
|
||||||
|
whisperlivekit-server --model tiny.en --audio-input pyaudiowpatch
|
||||||
```
|
```
|
||||||
|
|
||||||
Then open your browser at `http://localhost:8000` (or your specified host and port).
|
### Python API Integration (Backend)
|
||||||
|
|
||||||
### Using the library in your code
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from whisperlivekit import WhisperLiveKit
|
from whisperlivekit import WhisperLiveKit
|
||||||
from whisperlivekit.audio_processor import AudioProcessor
|
from whisperlivekit.audio_processor import AudioProcessor
|
||||||
from fastapi import FastAPI, WebSocket
|
from fastapi import FastAPI, WebSocket
|
||||||
|
import asyncio
|
||||||
|
from fastapi.responses import HTMLResponse
|
||||||
|
|
||||||
|
# Initialize components
|
||||||
|
app = FastAPI()
|
||||||
kit = WhisperLiveKit(model="medium", diarization=True)
|
kit = WhisperLiveKit(model="medium", diarization=True)
|
||||||
app = FastAPI() # Create a FastAPI application
|
|
||||||
|
|
||||||
|
# Serve the web interface
|
||||||
@app.get("/")
|
@app.get("/")
|
||||||
async def get():
|
async def get():
|
||||||
return HTMLResponse(kit.web_interface()) # Use the built-in web interface
|
return HTMLResponse(kit.web_interface()) # Use the built-in web interface
|
||||||
|
|
||||||
async def handle_websocket_results(websocket, results_generator): # Sends results to frontend
|
# Process WebSocket connections
|
||||||
|
async def handle_websocket_results(websocket, results_generator):
|
||||||
async for response in results_generator:
|
async for response in results_generator:
|
||||||
await websocket.send_json(response)
|
await websocket.send_json(response)
|
||||||
|
|
||||||
@@ -127,57 +175,163 @@ async def websocket_endpoint(websocket: WebSocket):
|
|||||||
audio_processor = AudioProcessor()
|
audio_processor = AudioProcessor()
|
||||||
await websocket.accept()
|
await websocket.accept()
|
||||||
results_generator = await audio_processor.create_tasks()
|
results_generator = await audio_processor.create_tasks()
|
||||||
websocket_task = asyncio.create_task(handle_websocket_results(websocket, results_generator))
|
websocket_task = asyncio.create_task(
|
||||||
|
handle_websocket_results(websocket, results_generator)
|
||||||
|
)
|
||||||
|
|
||||||
while True:
|
try:
|
||||||
message = await websocket.receive_bytes()
|
while True:
|
||||||
await audio_processor.process_audio(message)
|
message = await websocket.receive_bytes()
|
||||||
|
await audio_processor.process_audio(message)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"WebSocket error: {e}")
|
||||||
|
websocket_task.cancel()
|
||||||
```
|
```
|
||||||
|
|
||||||
For a complete audio processing example, check [whisper_fastapi_online_server.py](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisper_fastapi_online_server.py)
|
### Frontend Implementation
|
||||||
|
|
||||||
|
The package includes a simple HTML/JavaScript implementation that you can adapt for your project. You can find it in [whisperlivekit/web/live_transcription.html](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/web/live_transcription.html), or by using:
|
||||||
|
|
||||||
## Configuration Options
|
```python
|
||||||
|
kit.web_interface()
|
||||||
|
```
|
||||||
|
|
||||||
The following parameters are supported when initializing `WhisperLiveKit`:
|
## ⚙️ Configuration Reference
|
||||||
|
|
||||||
- `--host` and `--port` let you specify the server's IP/port.
|
WhisperLiveKit offers extensive configuration options:
|
||||||
- `--min-chunk-size` sets the minimum chunk size for audio processing. Make sure this value aligns with the chunk size selected in the frontend. If not aligned, the system will work but may unnecessarily over-process audio data.
|
|
||||||
- `--no-transcription`: Disable transcription (enabled by default)
|
|
||||||
- `--diarization`: Enable speaker diarization (disabled by default)
|
|
||||||
- `--confidence-validation`: Use confidence scores for faster validation. Transcription will be faster but punctuation might be less accurate (disabled by default)
|
|
||||||
- `--warmup-file`: The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast:
|
|
||||||
- If not set, uses https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav.
|
|
||||||
- If False, no warmup is performed.
|
|
||||||
- `--min-chunk-size` Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.
|
|
||||||
- `--model`: Name size of the Whisper model to use (default: tiny). Suggested values: tiny.en, tiny, base.en, base, small.en, small, medium.en, medium, large-v1, large-v2, large-v3, large, large-v3-turbo. The model is automatically downloaded from the model hub if not present in model cache dir.
|
|
||||||
- `--model_cache_dir`: Overriding the default model cache dir where models downloaded from the hub are saved
|
|
||||||
- `--model_dir`: Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter.
|
|
||||||
- `--lan`, `--language`: Source language code, e.g. en,de,cs, or 'auto' for language detection.
|
|
||||||
- `--task` {_transcribe, translate_}: Transcribe or translate. If translate is set, we recommend avoiding the _large-v3-turbo_ backend, as it [performs significantly worse](https://github.com/QuentinFuxa/whisper_streaming_web/issues/40#issuecomment-2652816533) than other models for translation.
|
|
||||||
- `--backend` {_faster-whisper, whisper_timestamped, openai-api, mlx-whisper_}: Load only this backend for Whisper processing.
|
|
||||||
- `--vac`: Use VAC = voice activity controller. Requires torch. (disabled by default)
|
|
||||||
- `--vac-chunk-size`: VAC sample size in seconds.
|
|
||||||
- `--no-vad`: Disable VAD (voice activity detection), which is enabled by default.
|
|
||||||
- `--buffer_trimming` {_sentence, segment_}: Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.
|
|
||||||
- `--buffer_trimming_sec`: Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.
|
|
||||||
|
|
||||||
|
| Parameter | Description | Default |
|
||||||
|
|-----------|-------------|---------|
|
||||||
|
| `--host` | Server host address | `localhost` |
|
||||||
|
| `--port` | Server port | `8000` |
|
||||||
|
| `--model` | Whisper model size | `tiny` |
|
||||||
|
| `--language` | Source language code or `auto` | `en` |
|
||||||
|
| `--task` | `transcribe` or `translate` | `transcribe` |
|
||||||
|
| `--backend` | Processing backend | `faster-whisper` |
|
||||||
|
| `--diarization` | Enable speaker identification | `False` |
|
||||||
|
| `--confidence-validation` | Use confidence scores for faster validation | `False` |
|
||||||
|
| `--min-chunk-size` | Minimum audio chunk size (seconds) | `1.0` |
|
||||||
|
| `--vac` | Use Voice Activity Controller | `False` |
|
||||||
|
| `--no-vad` | Disable Voice Activity Detection | `False` |
|
||||||
|
| `--buffer_trimming` | Buffer trimming strategy (`sentence` or `segment`) | `segment` |
|
||||||
|
| `--warmup-file` | Audio file path for model warmup | `jfk.wav` |
|
||||||
|
| `--audio-input` | Source of audio (`websocket` or `pyaudiowpatch`) | `websocket` |
|
||||||
|
| `--ssl-certfile` | Path to the SSL certificate file (for HTTPS support) | `None` |
|
||||||
|
| `--ssl-keyfile` | Path to the SSL private key file (for HTTPS support) | `None` |
|
||||||
|
|
||||||
## How the Live Interface Works
|
## 🔧 How It Works
|
||||||
|
|
||||||
- Once you **allow microphone access**, the page records small chunks of audio using the **MediaRecorder** API in **webm/opus** format.
|
<p align="center">
|
||||||
- These chunks are sent over a **WebSocket** to the FastAPI endpoint at `/asr`.
|
<img src="https://raw.githubusercontent.com/QuentinFuxa/WhisperLiveKit/refs/heads/main/demo.png" alt="WhisperLiveKit in Action" width="500">
|
||||||
- The Python server decodes `.webm` chunks on the fly using **FFmpeg** and streams them into the **whisper streaming** implementation for transcription.
|
</p>
|
||||||
- **Partial transcription** appears as soon as enough audio is processed. The "unvalidated" text is shown in **lighter or grey color** (i.e., an 'aperçu') to indicate it's still buffered partial output. Once Whisper finalizes that segment, it's displayed in normal text.
|
|
||||||
|
|
||||||
### Deploying to a Remote Server
|
1. **Audio Input**:
|
||||||
|
- **WebSocket (Default)**: Browser's MediaRecorder API captures audio (webm/opus), streams via WebSocket.
|
||||||
|
- **PyAudioWPatch (Windows Only)**: Captures system audio output directly using WASAPI loopback. Requires `--audio-input pyaudiowpatch`.
|
||||||
|
2. **Processing**:
|
||||||
|
- **WebSocket**: Server decodes webm/opus audio with FFmpeg.
|
||||||
|
- **PyAudioWPatch**: Server receives raw PCM audio directly.
|
||||||
|
- Audio is streamed into Whisper for transcription.
|
||||||
|
3. **Real-time Output**:
|
||||||
|
- Partial transcriptions appear immediately in light gray (the 'aperçu').
|
||||||
|
- Finalized text appears in normal color.
|
||||||
|
- (When enabled) Different speakers are identified and highlighted
|
||||||
|
|
||||||
If you want to **deploy** this setup:
|
## 🚀 Deployment Guide
|
||||||
|
|
||||||
1. **Host the FastAPI app** behind a production-grade HTTP(S) server (like **Uvicorn + Nginx** or Docker). If you use HTTPS, use "wss" instead of "ws" in WebSocket URL.
|
To deploy WhisperLiveKit in production:
|
||||||
2. The **HTML/JS page** can be served by the same FastAPI app or a separate static host.
|
|
||||||
3. Users open the page in **Chrome/Firefox** (any modern browser that supports MediaRecorder + WebSocket). No additional front-end libraries or frameworks are required.
|
|
||||||
|
|
||||||
## Acknowledgments
|
1. **Server Setup** (Backend):
|
||||||
|
```bash
|
||||||
|
# Install production ASGI server
|
||||||
|
pip install uvicorn gunicorn
|
||||||
|
|
||||||
This project builds upon the foundational work of the Whisper Streaming and Diart projects. We extend our gratitude to the original authors for their contributions.
|
# Launch with multiple workers
|
||||||
|
gunicorn -k uvicorn.workers.UvicornWorker -w 4 your_app:app
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Frontend Integration**:
|
||||||
|
- Host your customized version of the example HTML/JS in your web application
|
||||||
|
- Ensure WebSocket connection points to your server's address
|
||||||
|
|
||||||
|
3. **Nginx Configuration** (recommended for production):
|
||||||
|
```nginx
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
server_name your-domain.com;
|
||||||
|
|
||||||
|
location / {
|
||||||
|
proxy_pass http://localhost:8000;
|
||||||
|
proxy_set_header Upgrade $http_upgrade;
|
||||||
|
proxy_set_header Connection "upgrade";
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **HTTPS Support**: For secure deployments, use "wss://" instead of "ws://" in WebSocket URL
|
||||||
|
|
||||||
|
### 🐋 Docker
|
||||||
|
|
||||||
|
A basic Dockerfile is provided which allows re-use of Python package installation options. See below usage examples:
|
||||||
|
|
||||||
|
**NOTE:** For **larger** models, ensure that your **docker runtime** has enough **memory** available.
|
||||||
|
|
||||||
|
#### All defaults
|
||||||
|
- Create a reusable image with only the basics and then run as a named container:
|
||||||
|
```bash
|
||||||
|
docker build -t whisperlivekit-defaults .
|
||||||
|
docker create --gpus all --name whisperlivekit -p 8000:8000 whisperlivekit-defaults
|
||||||
|
docker start -i whisperlivekit
|
||||||
|
```
|
||||||
|
|
||||||
|
> **Note**: If you're running on a system without NVIDIA GPU support (such as Mac with Apple Silicon or any system without CUDA capabilities), you need to **remove the `--gpus all` flag** from the `docker create` command. Without GPU acceleration, transcription will use CPU only, which may be significantly slower. Consider using small models for better performance on CPU-only systems.
|
||||||
|
|
||||||
|
#### Customization
|
||||||
|
- Customize the container options:
|
||||||
|
```bash
|
||||||
|
docker build -t whisperlivekit-defaults .
|
||||||
|
docker create --gpus all --name whisperlivekit-base -p 8000:8000 whisperlivekit-defaults --model base
|
||||||
|
docker start -i whisperlivekit-base
|
||||||
|
```
|
||||||
|
|
||||||
|
- `--build-arg` Options:
|
||||||
|
- `EXTRAS="whisper-timestamped"` - Add extras to the image's installation (no spaces). Remember to set necessary container options!
|
||||||
|
- `HF_PRECACHE_DIR="./.cache/"` - Pre-load a model cache for faster first-time start
|
||||||
|
  - `HF_TKN_FILE="./token"` - Add your Hugging Face Hub access token file to download gated models
|
||||||
|
|
||||||
|
## 🔮 Use Cases
|
||||||
|
|
||||||
|
- **Meeting Transcription**: Capture discussions in real-time
|
||||||
|
- **Accessibility Tools**: Help hearing-impaired users follow conversations
|
||||||
|
- **Content Creation**: Transcribe podcasts or videos automatically
|
||||||
|
- **Customer Service**: Transcribe support calls with speaker identification
|
||||||
|
|
||||||
|
## 🤝 Contributing
|
||||||
|
|
||||||
|
Contributions are welcome! Here's how to get started:
|
||||||
|
|
||||||
|
1. Fork the repository
|
||||||
|
2. Create a feature branch: `git checkout -b feature/amazing-feature`
|
||||||
|
3. Commit your changes: `git commit -m 'Add amazing feature'`
|
||||||
|
4. Push to your branch: `git push origin feature/amazing-feature`
|
||||||
|
5. Open a Pull Request
|
||||||
|
|
||||||
|
## 🙏 Acknowledgments
|
||||||
|
|
||||||
|
This project builds upon the foundational work of:
|
||||||
|
- [Whisper Streaming](https://github.com/ufal/whisper_streaming)
|
||||||
|
- [Diart](https://github.com/juanmc2005/diart)
|
||||||
|
- [OpenAI Whisper](https://github.com/openai/whisper)
|
||||||
|
|
||||||
|
We extend our gratitude to the original authors for their contributions.
|
||||||
|
|
||||||
|
## 📄 License
|
||||||
|
|
||||||
|
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
||||||
|
|
||||||
|
## 🔗 Links
|
||||||
|
|
||||||
|
- [GitHub Repository](https://github.com/QuentinFuxa/WhisperLiveKit)
|
||||||
|
- [PyPI Package](https://pypi.org/project/whisperlivekit/)
|
||||||
|
- [Issue Tracker](https://github.com/QuentinFuxa/WhisperLiveKit/issues)
|
||||||
|
|||||||
BIN
demo.png
BIN
demo.png
Binary file not shown.
|
Before Width: | Height: | Size: 463 KiB After Width: | Height: | Size: 438 KiB |
3
setup.py
3
setup.py
@@ -1,7 +1,7 @@
|
|||||||
from setuptools import setup, find_packages
|
from setuptools import setup, find_packages
|
||||||
setup(
|
setup(
|
||||||
name="whisperlivekit",
|
name="whisperlivekit",
|
||||||
version="0.1.3",
|
version="0.1.5",
|
||||||
description="Real-time, Fully Local Whisper's Speech-to-Text and Speaker Diarization",
|
description="Real-time, Fully Local Whisper's Speech-to-Text and Speaker Diarization",
|
||||||
long_description=open("README.md", "r", encoding="utf-8").read(),
|
long_description=open("README.md", "r", encoding="utf-8").read(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
@@ -25,6 +25,7 @@ setup(
|
|||||||
"whisper-timestamped": ["whisper-timestamped"],
|
"whisper-timestamped": ["whisper-timestamped"],
|
||||||
"mlx-whisper": ["mlx-whisper"],
|
"mlx-whisper": ["mlx-whisper"],
|
||||||
"openai": ["openai"],
|
"openai": ["openai"],
|
||||||
|
"pyaudiowpatch": ["PyAudioWPatch"],
|
||||||
},
|
},
|
||||||
package_data={
|
package_data={
|
||||||
'whisperlivekit': ['web/*.html'],
|
'whisperlivekit': ['web/*.html'],
|
||||||
|
|||||||
@@ -1,82 +0,0 @@
|
|||||||
from contextlib import asynccontextmanager
|
|
||||||
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
|
||||||
from fastapi.responses import HTMLResponse
|
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
|
||||||
|
|
||||||
from whisperlivekit import WhisperLiveKit
|
|
||||||
from whisperlivekit.audio_processor import AudioProcessor
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
||||||
logging.getLogger().setLevel(logging.WARNING)
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
logger.setLevel(logging.DEBUG)
|
|
||||||
|
|
||||||
kit = None
|
|
||||||
|
|
||||||
@asynccontextmanager
|
|
||||||
async def lifespan(app: FastAPI):
|
|
||||||
global kit
|
|
||||||
kit = WhisperLiveKit()
|
|
||||||
yield
|
|
||||||
|
|
||||||
app = FastAPI(lifespan=lifespan)
|
|
||||||
app.add_middleware(
|
|
||||||
CORSMiddleware,
|
|
||||||
allow_origins=["*"],
|
|
||||||
allow_credentials=True,
|
|
||||||
allow_methods=["*"],
|
|
||||||
allow_headers=["*"],
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/")
|
|
||||||
async def get():
|
|
||||||
return HTMLResponse(kit.web_interface())
|
|
||||||
|
|
||||||
|
|
||||||
async def handle_websocket_results(websocket, results_generator):
|
|
||||||
"""Consumes results from the audio processor and sends them via WebSocket."""
|
|
||||||
try:
|
|
||||||
async for response in results_generator:
|
|
||||||
await websocket.send_json(response)
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Error in WebSocket results handler: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
@app.websocket("/asr")
|
|
||||||
async def websocket_endpoint(websocket: WebSocket):
|
|
||||||
audio_processor = AudioProcessor()
|
|
||||||
|
|
||||||
await websocket.accept()
|
|
||||||
logger.info("WebSocket connection opened.")
|
|
||||||
|
|
||||||
results_generator = await audio_processor.create_tasks()
|
|
||||||
websocket_task = asyncio.create_task(handle_websocket_results(websocket, results_generator))
|
|
||||||
|
|
||||||
try:
|
|
||||||
while True:
|
|
||||||
message = await websocket.receive_bytes()
|
|
||||||
await audio_processor.process_audio(message)
|
|
||||||
except WebSocketDisconnect:
|
|
||||||
logger.warning("WebSocket disconnected.")
|
|
||||||
finally:
|
|
||||||
websocket_task.cancel()
|
|
||||||
await audio_processor.cleanup()
|
|
||||||
logger.info("WebSocket endpoint cleaned up.")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
import uvicorn
|
|
||||||
|
|
||||||
temp_kit = WhisperLiveKit(transcription=False, diarization=False)
|
|
||||||
|
|
||||||
uvicorn.run(
|
|
||||||
"whisper_fastapi_online_server:app",
|
|
||||||
host=temp_kit.args.host,
|
|
||||||
port=temp_kit.args.port,
|
|
||||||
reload=True,
|
|
||||||
log_level="info"
|
|
||||||
)
|
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
from .core import WhisperLiveKit, parse_args
|
from .core import WhisperLiveKit, _parse_args_internal, get_parsed_args
|
||||||
from .audio_processor import AudioProcessor
|
from .audio_processor import AudioProcessor
|
||||||
|
|
||||||
__all__ = ['WhisperLiveKit', 'AudioProcessor', 'parse_args']
|
__all__ = ['WhisperLiveKit', 'AudioProcessor', '_parse_args_internal', 'get_parsed_args']
|
||||||
@@ -2,11 +2,18 @@ import asyncio
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import ffmpeg
|
import ffmpeg
|
||||||
from time import time, sleep
|
from time import time, sleep
|
||||||
|
import platform # To check OS
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pyaudiowpatch as pyaudio
|
||||||
|
PYAUDIOWPATCH_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
pyaudio = None
|
||||||
|
PYAUDIOWPATCH_AVAILABLE = False
|
||||||
import math
|
import math
|
||||||
import logging
|
import logging
|
||||||
import traceback
|
import traceback
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from typing import List, Dict, Any
|
|
||||||
from whisperlivekit.timed_objects import ASRToken
|
from whisperlivekit.timed_objects import ASRToken
|
||||||
from whisperlivekit.whisper_streaming_custom.whisper_online import online_factory
|
from whisperlivekit.whisper_streaming_custom.whisper_online import online_factory
|
||||||
from whisperlivekit.core import WhisperLiveKit
|
from whisperlivekit.core import WhisperLiveKit
|
||||||
@@ -14,7 +21,6 @@ from whisperlivekit.core import WhisperLiveKit
|
|||||||
# Set up logging once
|
# Set up logging once
|
||||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
logger.setLevel(logging.DEBUG)
|
|
||||||
|
|
||||||
def format_time(seconds: float) -> str:
|
def format_time(seconds: float) -> str:
|
||||||
"""Format seconds as HH:MM:SS."""
|
"""Format seconds as HH:MM:SS."""
|
||||||
@@ -39,7 +45,10 @@ class AudioProcessor:
|
|||||||
self.bytes_per_sample = 2
|
self.bytes_per_sample = 2
|
||||||
self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample
|
self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample
|
||||||
self.max_bytes_per_sec = 32000 * 5 # 5 seconds of audio at 32 kHz
|
self.max_bytes_per_sec = 32000 * 5 # 5 seconds of audio at 32 kHz
|
||||||
|
self.last_ffmpeg_activity = time()
|
||||||
|
self.ffmpeg_health_check_interval = 5
|
||||||
|
self.ffmpeg_max_idle_time = 10
|
||||||
|
|
||||||
# State management
|
# State management
|
||||||
self.tokens = []
|
self.tokens = []
|
||||||
self.buffer_transcription = ""
|
self.buffer_transcription = ""
|
||||||
@@ -56,18 +65,80 @@ class AudioProcessor:
|
|||||||
self.asr = models.asr
|
self.asr = models.asr
|
||||||
self.tokenizer = models.tokenizer
|
self.tokenizer = models.tokenizer
|
||||||
self.diarization = models.diarization
|
self.diarization = models.diarization
|
||||||
self.ffmpeg_process = self.start_ffmpeg_decoder()
|
|
||||||
self.transcription_queue = asyncio.Queue() if self.args.transcription else None
|
self.transcription_queue = asyncio.Queue() if self.args.transcription else None
|
||||||
self.diarization_queue = asyncio.Queue() if self.args.diarization else None
|
self.diarization_queue = asyncio.Queue() if self.args.diarization else None
|
||||||
self.pcm_buffer = bytearray()
|
self.pcm_buffer = bytearray()
|
||||||
|
self.ffmpeg_process = None
|
||||||
|
self.pyaudio_instance = None
|
||||||
|
self.pyaudio_stream = None
|
||||||
|
|
||||||
|
# Initialize audio input based on args
|
||||||
|
if self.args.audio_input == "websocket":
|
||||||
|
self.ffmpeg_process = self.start_ffmpeg_decoder()
|
||||||
|
elif self.args.audio_input == "pyaudiowpatch":
|
||||||
|
if not PYAUDIOWPATCH_AVAILABLE:
|
||||||
|
logger.error("PyAudioWPatch selected but not installed. Please install it: pip install whisperlivekit[pyaudiowpatch]")
|
||||||
|
raise ImportError("PyAudioWPatch not found.")
|
||||||
|
if platform.system() != "Windows":
|
||||||
|
logger.error("PyAudioWPatch is only supported on Windows.")
|
||||||
|
raise OSError("PyAudioWPatch requires Windows.")
|
||||||
|
self.initialize_pyaudiowpatch()
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unsupported audio input type: {self.args.audio_input}")
|
||||||
|
|
||||||
# Initialize transcription engine if enabled
|
# Initialize transcription engine if enabled
|
||||||
if self.args.transcription:
|
if self.args.transcription:
|
||||||
self.online = online_factory(self.args, models.asr, models.tokenizer)
|
self.online = online_factory(self.args, models.asr, models.tokenizer)
|
||||||
|
|
||||||
|
def initialize_pyaudiowpatch(self):
|
||||||
|
"""Initialize PyAudioWPatch for audio input."""
|
||||||
|
logger.info("Initializing PyAudioWPatch...")
|
||||||
|
try:
|
||||||
|
self.pyaudio_instance = pyaudio.PyAudio()
|
||||||
|
# Find the default WASAPI loopback device
|
||||||
|
wasapi_info = self.pyaudio_instance.get_host_api_info_by_type(pyaudio.paWASAPI)
|
||||||
|
default_speakers = self.pyaudio_instance.get_device_info_by_index(wasapi_info["defaultOutputDevice"])
|
||||||
|
|
||||||
|
if not default_speakers["isLoopbackDevice"]:
|
||||||
|
for loopback in self.pyaudio_instance.get_loopback_device_info_generator():
|
||||||
|
if default_speakers["name"] in loopback["name"]:
|
||||||
|
default_speakers = loopback
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
logger.error("Default loopback output device not found.")
|
||||||
|
raise OSError("Default loopback output device not found.")
|
||||||
|
|
||||||
|
logger.info(f"Using loopback device: {default_speakers['name']}")
|
||||||
|
self.pyaudio_stream = self.pyaudio_instance.open(
|
||||||
|
format=pyaudio.paInt16,
|
||||||
|
channels=default_speakers["maxInputChannels"],
|
||||||
|
rate=int(default_speakers["defaultSampleRate"]),
|
||||||
|
input=True,
|
||||||
|
input_device_index=default_speakers["index"],
|
||||||
|
frames_per_buffer=int(self.sample_rate * self.args.min_chunk_size)
|
||||||
|
)
|
||||||
|
self.sample_rate = int(default_speakers["defaultSampleRate"])
|
||||||
|
self.channels = default_speakers["maxInputChannels"]
|
||||||
|
self.samples_per_sec = int(self.sample_rate * self.args.min_chunk_size)
|
||||||
|
self.bytes_per_sample = 2
|
||||||
|
self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample
|
||||||
|
logger.info(f"PyAudioWPatch initialized with {self.channels} channels and {self.sample_rate} Hz sample rate.")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to initialize PyAudioWPatch: {e}")
|
||||||
|
logger.error(traceback.format_exc())
|
||||||
|
if self.pyaudio_instance:
|
||||||
|
self.pyaudio_instance.terminate()
|
||||||
|
raise
|
||||||
|
|
||||||
def convert_pcm_to_float(self, pcm_buffer):
|
def convert_pcm_to_float(self, pcm_buffer):
|
||||||
"""Convert PCM buffer in s16le format to normalized NumPy array."""
|
"""Convert PCM buffer in s16le format to normalized NumPy array."""
|
||||||
return np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0
|
if isinstance(pcm_buffer, (bytes, bytearray)):
|
||||||
|
return np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0
|
||||||
|
else:
|
||||||
|
logger.error(f"Invalid buffer type for PCM conversion: {type(pcm_buffer)}")
|
||||||
|
return np.array([], dtype=np.float32)
|
||||||
|
|
||||||
|
|
||||||
def start_ffmpeg_decoder(self):
|
def start_ffmpeg_decoder(self):
|
||||||
"""Start FFmpeg process for WebM to PCM conversion."""
|
"""Start FFmpeg process for WebM to PCM conversion."""
|
||||||
@@ -78,14 +149,89 @@ class AudioProcessor:
|
|||||||
|
|
||||||
async def restart_ffmpeg(self):
|
async def restart_ffmpeg(self):
|
||||||
"""Restart the FFmpeg process after failure."""
|
"""Restart the FFmpeg process after failure."""
|
||||||
|
logger.warning("Restarting FFmpeg process...")
|
||||||
|
|
||||||
if self.ffmpeg_process:
|
if self.ffmpeg_process:
|
||||||
try:
|
try:
|
||||||
self.ffmpeg_process.kill()
|
# we check if process is still running
|
||||||
await asyncio.get_event_loop().run_in_executor(None, self.ffmpeg_process.wait)
|
if self.ffmpeg_process.poll() is None:
|
||||||
|
logger.info("Terminating existing FFmpeg process")
|
||||||
|
self.ffmpeg_process.stdin.close()
|
||||||
|
self.ffmpeg_process.terminate()
|
||||||
|
|
||||||
|
# wait for termination with timeout
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(
|
||||||
|
asyncio.get_event_loop().run_in_executor(None, self.ffmpeg_process.wait),
|
||||||
|
timeout=5.0
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
logger.warning("FFmpeg process did not terminate, killing forcefully")
|
||||||
|
self.ffmpeg_process.kill()
|
||||||
|
await asyncio.get_event_loop().run_in_executor(None, self.ffmpeg_process.wait)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Error killing FFmpeg process: {e}")
|
logger.error(f"Error during FFmpeg process termination: {e}")
|
||||||
|
logger.error(traceback.format_exc())
|
||||||
|
|
||||||
|
# we start new process
|
||||||
|
try:
|
||||||
|
logger.info("Starting new FFmpeg process")
|
||||||
self.ffmpeg_process = self.start_ffmpeg_decoder()
|
self.ffmpeg_process = self.start_ffmpeg_decoder()
|
||||||
self.pcm_buffer = bytearray()
|
self.pcm_buffer = bytearray()
|
||||||
|
self.last_ffmpeg_activity = time()
|
||||||
|
logger.info("FFmpeg process restarted successfully")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to restart FFmpeg process: {e}")
|
||||||
|
logger.error(traceback.format_exc())
|
||||||
|
# try again after 5s
|
||||||
|
await asyncio.sleep(5)
|
||||||
|
try:
|
||||||
|
self.ffmpeg_process = self.start_ffmpeg_decoder()
|
||||||
|
self.pcm_buffer = bytearray()
|
||||||
|
self.last_ffmpeg_activity = time()
|
||||||
|
logger.info("FFmpeg process restarted successfully on second attempt")
|
||||||
|
except Exception as e2:
|
||||||
|
logger.critical(f"Failed to restart FFmpeg process on second attempt: {e2}")
|
||||||
|
logger.critical(traceback.format_exc())
|
||||||
|
|
||||||
|
async def pyaudiowpatch_reader(self):
|
||||||
|
"""Read audio data from PyAudioWPatch stream and process it."""
|
||||||
|
logger.info("Starting PyAudioWPatch reader task.")
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
chunk = await loop.run_in_executor(
|
||||||
|
None,
|
||||||
|
self.pyaudio_stream.read,
|
||||||
|
int(self.sample_rate * self.args.min_chunk_size),
|
||||||
|
False
|
||||||
|
)
|
||||||
|
|
||||||
|
if not chunk:
|
||||||
|
logger.info("PyAudioWPatch stream closed or read empty chunk.")
|
||||||
|
await asyncio.sleep(0.1)
|
||||||
|
continue
|
||||||
|
|
||||||
|
pcm_array = self.convert_pcm_to_float(chunk)
|
||||||
|
|
||||||
|
if self.args.diarization and self.diarization_queue:
|
||||||
|
await self.diarization_queue.put(pcm_array.copy())
|
||||||
|
|
||||||
|
if self.args.transcription and self.transcription_queue:
|
||||||
|
await self.transcription_queue.put(pcm_array.copy())
|
||||||
|
|
||||||
|
except OSError as e:
|
||||||
|
logger.error(f"PyAudioWPatch stream error: {e}")
|
||||||
|
logger.error(traceback.format_exc())
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Exception in pyaudiowpatch_reader: {e}")
|
||||||
|
logger.error(traceback.format_exc())
|
||||||
|
await asyncio.sleep(1) # Wait before retrying or breaking
|
||||||
|
break
|
||||||
|
logger.info("PyAudioWPatch reader task finished.")
|
||||||
|
|
||||||
|
|
||||||
async def update_transcription(self, new_tokens, buffer, end_buffer, full_transcription, sep):
|
async def update_transcription(self, new_tokens, buffer, end_buffer, full_transcription, sep):
|
||||||
"""Thread-safe update of transcription with new data."""
|
"""Thread-safe update of transcription with new data."""
|
||||||
@@ -154,23 +300,23 @@ class AudioProcessor:
|
|||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
# Calculate buffer size based on elapsed time
|
current_time = time()
|
||||||
elapsed_time = math.floor((time() - beg) * 10) / 10 # Round to 0.1 sec
|
elapsed_time = math.floor((current_time - beg) * 10) / 10
|
||||||
buffer_size = max(int(32000 * elapsed_time), 4096)
|
buffer_size = max(int(32000 * elapsed_time), 4096)
|
||||||
beg = time()
|
beg = current_time
|
||||||
|
|
||||||
# Read chunk with timeout
|
# Detect idle state much more quickly
|
||||||
try:
|
if current_time - self.last_ffmpeg_activity > self.ffmpeg_max_idle_time:
|
||||||
chunk = await asyncio.wait_for(
|
logger.warning(f"FFmpeg process idle for {current_time - self.last_ffmpeg_activity:.2f}s. Restarting...")
|
||||||
loop.run_in_executor(None, self.ffmpeg_process.stdout.read, buffer_size),
|
|
||||||
timeout=15.0
|
|
||||||
)
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
logger.warning("FFmpeg read timeout. Restarting...")
|
|
||||||
await self.restart_ffmpeg()
|
await self.restart_ffmpeg()
|
||||||
beg = time()
|
beg = time()
|
||||||
|
self.last_ffmpeg_activity = time()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
chunk = await loop.run_in_executor(None, self.ffmpeg_process.stdout.read, buffer_size)
|
||||||
|
if chunk:
|
||||||
|
self.last_ffmpeg_activity = time()
|
||||||
|
|
||||||
if not chunk:
|
if not chunk:
|
||||||
logger.info("FFmpeg stdout closed.")
|
logger.info("FFmpeg stdout closed.")
|
||||||
break
|
break
|
||||||
@@ -183,7 +329,7 @@ class AudioProcessor:
|
|||||||
self.convert_pcm_to_float(self.pcm_buffer).copy()
|
self.convert_pcm_to_float(self.pcm_buffer).copy()
|
||||||
)
|
)
|
||||||
|
|
||||||
# Process when we have enough data
|
# Process when enough data
|
||||||
if len(self.pcm_buffer) >= self.bytes_per_sec:
|
if len(self.pcm_buffer) >= self.bytes_per_sec:
|
||||||
if len(self.pcm_buffer) > self.max_bytes_per_sec:
|
if len(self.pcm_buffer) > self.max_bytes_per_sec:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
@@ -366,18 +512,55 @@ class AudioProcessor:
|
|||||||
logger.warning(f"Exception in results_formatter: {e}")
|
logger.warning(f"Exception in results_formatter: {e}")
|
||||||
logger.warning(f"Traceback: {traceback.format_exc()}")
|
logger.warning(f"Traceback: {traceback.format_exc()}")
|
||||||
await asyncio.sleep(0.5) # Back off on error
|
await asyncio.sleep(0.5) # Back off on error
|
||||||
|
|
||||||
async def create_tasks(self):
|
async def create_tasks(self):
|
||||||
"""Create and start processing tasks."""
|
"""Create and start processing tasks."""
|
||||||
|
|
||||||
tasks = []
|
tasks = []
|
||||||
if self.args.transcription and self.online:
|
if self.args.transcription and self.online:
|
||||||
tasks.append(asyncio.create_task(self.transcription_processor()))
|
tasks.append(asyncio.create_task(self.transcription_processor()))
|
||||||
|
|
||||||
if self.args.diarization and self.diarization:
|
if self.args.diarization and self.diarization:
|
||||||
tasks.append(asyncio.create_task(self.diarization_processor(self.diarization)))
|
tasks.append(asyncio.create_task(self.diarization_processor(self.diarization))) # Corrected indentation
|
||||||
|
|
||||||
tasks.append(asyncio.create_task(self.ffmpeg_stdout_reader()))
|
if self.args.audio_input == "websocket":
|
||||||
|
tasks.append(asyncio.create_task(self.ffmpeg_stdout_reader()))
|
||||||
|
elif self.args.audio_input == "pyaudiowpatch":
|
||||||
|
tasks.append(asyncio.create_task(self.pyaudiowpatch_reader()))
|
||||||
|
|
||||||
|
# Monitor overall system health
|
||||||
|
async def watchdog():
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
await asyncio.sleep(10) # Check every 10 seconds instead of 60
|
||||||
|
|
||||||
|
current_time = time()
|
||||||
|
# Check for stalled tasks
|
||||||
|
for i, task in enumerate(tasks):
|
||||||
|
if task.done():
|
||||||
|
exc = task.exception() if task.done() else None
|
||||||
|
task_name = task.get_name() if hasattr(task, 'get_name') else f"Task {i}"
|
||||||
|
logger.error(f"{task_name} unexpectedly completed with exception: {exc}")
|
||||||
|
|
||||||
|
if self.args.audio_input == "websocket":
|
||||||
|
ffmpeg_idle_time = current_time - self.last_ffmpeg_activity
|
||||||
|
if ffmpeg_idle_time > 15: # 15 seconds instead of 180
|
||||||
|
logger.warning(f"FFmpeg idle for {ffmpeg_idle_time:.2f}s - may need attention")
|
||||||
|
|
||||||
|
# Force restart after 30 seconds of inactivity (instead of 600)
|
||||||
|
if ffmpeg_idle_time > 30:
|
||||||
|
logger.error("FFmpeg idle for too long, forcing restart")
|
||||||
|
await self.restart_ffmpeg()
|
||||||
|
|
||||||
|
elif self.args.audio_input == "pyaudiowpatch":
|
||||||
|
if self.pyaudio_stream and not self.pyaudio_stream.is_active():
|
||||||
|
logger.warning("PyAudioWPatch stream is not active. Attempting to restart or handle.")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in watchdog task: {e}")
|
||||||
|
logger.error(traceback.format_exc())
|
||||||
|
|
||||||
|
tasks.append(asyncio.create_task(watchdog()))
|
||||||
self.tasks = tasks
|
self.tasks = tasks
|
||||||
|
|
||||||
return self.results_formatter()
|
return self.results_formatter()
|
||||||
@@ -389,21 +572,100 @@ class AudioProcessor:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
await asyncio.gather(*self.tasks, return_exceptions=True)
|
await asyncio.gather(*self.tasks, return_exceptions=True)
|
||||||
self.ffmpeg_process.stdin.close()
|
if self.args.audio_input == "websocket" and self.ffmpeg_process:
|
||||||
self.ffmpeg_process.wait()
|
if self.ffmpeg_process.stdin:
|
||||||
|
self.ffmpeg_process.stdin.close()
|
||||||
|
if self.ffmpeg_process.poll() is None:
|
||||||
|
self.ffmpeg_process.wait()
|
||||||
|
elif self.args.audio_input == "pyaudiowpatch":
|
||||||
|
if self.pyaudio_stream:
|
||||||
|
self.pyaudio_stream.stop_stream()
|
||||||
|
self.pyaudio_stream.close()
|
||||||
|
logger.info("PyAudioWPatch stream closed.")
|
||||||
|
if self.pyaudio_instance:
|
||||||
|
self.pyaudio_instance.terminate()
|
||||||
|
logger.info("PyAudioWPatch instance terminated.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Error during cleanup: {e}")
|
logger.warning(f"Error during cleanup: {e}")
|
||||||
|
logger.warning(traceback.format_exc())
|
||||||
|
|
||||||
if self.args.diarization and hasattr(self, 'diarization'):
|
if self.args.diarization and hasattr(self, 'diarization'):
|
||||||
self.diarization.close()
|
self.diarization.close()
|
||||||
|
|
||||||
async def process_audio(self, message):
|
async def process_audio(self, message):
|
||||||
"""Process incoming audio data."""
|
"""Process incoming audio data."""
|
||||||
try:
|
retry_count = 0
|
||||||
self.ffmpeg_process.stdin.write(message)
|
max_retries = 3
|
||||||
self.ffmpeg_process.stdin.flush()
|
|
||||||
except (BrokenPipeError, AttributeError) as e:
|
# Log periodic heartbeats showing ongoing audio proc
|
||||||
logger.warning(f"Error writing to FFmpeg: {e}. Restarting...")
|
current_time = time()
|
||||||
await self.restart_ffmpeg()
|
if not hasattr(self, '_last_heartbeat') or current_time - self._last_heartbeat >= 10:
|
||||||
self.ffmpeg_process.stdin.write(message)
|
logger.debug(f"Processing audio chunk, last FFmpeg activity: {current_time - self.last_ffmpeg_activity:.2f}s ago")
|
||||||
self.ffmpeg_process.stdin.flush()
|
self._last_heartbeat = current_time
|
||||||
|
|
||||||
|
if self.args.audio_input != "websocket":
|
||||||
|
# logger.debug("Audio input is not WebSocket, skipping process_audio.")
|
||||||
|
return # Do nothing if input is not WebSocket
|
||||||
|
|
||||||
|
while retry_count < max_retries:
|
||||||
|
try:
|
||||||
|
|
||||||
|
if not self.ffmpeg_process or self.ffmpeg_process.poll() is not None:
|
||||||
|
logger.warning("FFmpeg process not running or unavailable, attempting restart...")
|
||||||
|
await self.restart_ffmpeg()
|
||||||
|
|
||||||
|
if not self.ffmpeg_process or self.ffmpeg_process.poll() is not None:
|
||||||
|
logger.error("FFmpeg restart failed or process terminated immediately.")
|
||||||
|
# maybe raise an error or break after retries
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
retry_count += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Ensure stdin is available
|
||||||
|
if not hasattr(self.ffmpeg_process, 'stdin') or self.ffmpeg_process.stdin.closed:
|
||||||
|
logger.warning("FFmpeg stdin is not available or closed. Restarting...")
|
||||||
|
await self.restart_ffmpeg()
|
||||||
|
if not hasattr(self.ffmpeg_process, 'stdin') or self.ffmpeg_process.stdin.closed:
|
||||||
|
logger.error("FFmpeg stdin still unavailable after restart.")
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
retry_count += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(
|
||||||
|
loop.run_in_executor(None, lambda: self.ffmpeg_process.stdin.write(message)),
|
||||||
|
timeout=2.0
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
logger.warning("FFmpeg write operation timed out, restarting...")
|
||||||
|
await self.restart_ffmpeg()
|
||||||
|
retry_count += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(
|
||||||
|
loop.run_in_executor(None, self.ffmpeg_process.stdin.flush),
|
||||||
|
timeout=2.0
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
logger.warning("FFmpeg flush operation timed out, restarting...")
|
||||||
|
await self.restart_ffmpeg()
|
||||||
|
retry_count += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
self.last_ffmpeg_activity = time()
|
||||||
|
return
|
||||||
|
|
||||||
|
except (BrokenPipeError, AttributeError, OSError) as e:
|
||||||
|
retry_count += 1
|
||||||
|
logger.warning(f"Error writing to FFmpeg: {e}. Retry {retry_count}/{max_retries}...")
|
||||||
|
|
||||||
|
if retry_count < max_retries:
|
||||||
|
await self.restart_ffmpeg()
|
||||||
|
await asyncio.sleep(0.5)
|
||||||
|
else:
|
||||||
|
logger.error("Maximum retries reached for FFmpeg process")
|
||||||
|
await self.restart_ffmpeg()
|
||||||
|
return
|
||||||
@@ -3,26 +3,47 @@ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
|||||||
from fastapi.responses import HTMLResponse
|
from fastapi.responses import HTMLResponse
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
|
||||||
from whisperlivekit import WhisperLiveKit
|
from whisperlivekit import WhisperLiveKit, get_parsed_args
|
||||||
from whisperlivekit.audio_processor import AudioProcessor
|
from whisperlivekit.audio_processor import AudioProcessor
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os, sys
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||||
logging.getLogger().setLevel(logging.WARNING)
|
logging.getLogger().setLevel(logging.WARNING)
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
logger.setLevel(logging.DEBUG)
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
kit = None
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app: FastAPI):
|
async def lifespan(app: FastAPI):
|
||||||
global kit
|
logger.info("Starting up...")
|
||||||
kit = WhisperLiveKit()
|
kit = WhisperLiveKit()
|
||||||
|
app.state.kit = kit
|
||||||
|
logger.info(f"Audio Input mode: {kit.args.audio_input}")
|
||||||
|
|
||||||
|
audio_processor = AudioProcessor()
|
||||||
|
app.state.audio_processor = audio_processor
|
||||||
|
app.state.results_generator = None # Initialize
|
||||||
|
|
||||||
|
if kit.args.audio_input == "pyaudiowpatch":
|
||||||
|
logger.info("Starting PyAudioWPatch processing tasks...")
|
||||||
|
try:
|
||||||
|
app.state.results_generator = await audio_processor.create_tasks()
|
||||||
|
except Exception as e:
|
||||||
|
logger.critical(f"Failed to start PyAudioWPatch processing: {e}", exc_info=True)
|
||||||
|
else:
|
||||||
|
logger.info("WebSocket input mode selected. Processing will start on client connection.")
|
||||||
|
|
||||||
yield
|
yield
|
||||||
|
|
||||||
|
logger.info("Shutting down...")
|
||||||
|
if hasattr(app.state, 'audio_processor') and app.state.audio_processor:
|
||||||
|
logger.info("Cleaning up AudioProcessor...")
|
||||||
|
await app.state.audio_processor.cleanup()
|
||||||
|
logger.info("Shutdown complete.")
|
||||||
|
|
||||||
|
|
||||||
app = FastAPI(lifespan=lifespan)
|
app = FastAPI(lifespan=lifespan)
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
CORSMiddleware,
|
CORSMiddleware,
|
||||||
@@ -35,10 +56,10 @@ app.add_middleware(
|
|||||||
|
|
||||||
@app.get("/")
|
@app.get("/")
|
||||||
async def get():
|
async def get():
|
||||||
return HTMLResponse(kit.web_interface())
|
return HTMLResponse(app.state.kit.web_interface())
|
||||||
|
|
||||||
|
|
||||||
async def handle_websocket_results(websocket, results_generator):
|
async def handle_websocket_results(websocket: WebSocket, results_generator):
|
||||||
"""Consumes results from the audio processor and sends them via WebSocket."""
|
"""Consumes results from the audio processor and sends them via WebSocket."""
|
||||||
try:
|
try:
|
||||||
async for response in results_generator:
|
async for response in results_generator:
|
||||||
@@ -49,38 +70,126 @@ async def handle_websocket_results(websocket, results_generator):
|
|||||||
|
|
||||||
@app.websocket("/asr")
|
@app.websocket("/asr")
|
||||||
async def websocket_endpoint(websocket: WebSocket):
|
async def websocket_endpoint(websocket: WebSocket):
|
||||||
audio_processor = AudioProcessor()
|
|
||||||
|
|
||||||
await websocket.accept()
|
await websocket.accept()
|
||||||
logger.info("WebSocket connection opened.")
|
logger.info("WebSocket connection accepted.")
|
||||||
|
|
||||||
results_generator = await audio_processor.create_tasks()
|
audio_processor = app.state.audio_processor
|
||||||
websocket_task = asyncio.create_task(handle_websocket_results(websocket, results_generator))
|
kit_args = app.state.kit.args
|
||||||
|
results_generator = None
|
||||||
|
websocket_task = None
|
||||||
|
receive_task = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
while True:
|
if kit_args.audio_input == "websocket":
|
||||||
message = await websocket.receive_bytes()
|
logger.info("WebSocket mode: Starting processing tasks for this connection.")
|
||||||
await audio_processor.process_audio(message)
|
results_generator = await audio_processor.create_tasks()
|
||||||
|
websocket_task = asyncio.create_task(handle_websocket_results(websocket, results_generator))
|
||||||
|
|
||||||
|
async def receive_audio():
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
message = await websocket.receive_bytes()
|
||||||
|
await audio_processor.process_audio(message)
|
||||||
|
except WebSocketDisconnect:
|
||||||
|
logger.info("WebSocket disconnected by client (receive_audio).")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error receiving audio: {e}", exc_info=True)
|
||||||
|
finally:
|
||||||
|
logger.debug("Receive audio task finished.")
|
||||||
|
|
||||||
|
|
||||||
|
receive_task = asyncio.create_task(receive_audio())
|
||||||
|
done, pending = await asyncio.wait(
|
||||||
|
{websocket_task, receive_task},
|
||||||
|
return_when=asyncio.FIRST_COMPLETED,
|
||||||
|
)
|
||||||
|
for task in pending:
|
||||||
|
task.cancel() # Cancel the other task
|
||||||
|
|
||||||
|
elif kit_args.audio_input == "pyaudiowpatch":
|
||||||
|
logger.info("PyAudioWPatch mode: Streaming existing results.")
|
||||||
|
results_generator = app.state.results_generator
|
||||||
|
if results_generator is None:
|
||||||
|
logger.error("PyAudioWPatch results generator not available. Was startup successful?")
|
||||||
|
await websocket.close(code=1011, reason="Server error: Audio processing not started.")
|
||||||
|
return
|
||||||
|
|
||||||
|
websocket_task = asyncio.create_task(handle_websocket_results(websocket, results_generator))
|
||||||
|
await websocket_task
|
||||||
|
|
||||||
|
else:
|
||||||
|
logger.error(f"Unsupported audio input mode configured: {kit_args.audio_input}")
|
||||||
|
await websocket.close(code=1011, reason="Server configuration error.")
|
||||||
|
|
||||||
except WebSocketDisconnect:
|
except WebSocketDisconnect:
|
||||||
logger.warning("WebSocket disconnected.")
|
logger.info("WebSocket disconnected by client.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in WebSocket endpoint: {e}", exc_info=True)
|
||||||
|
# Attempt to close gracefully
|
||||||
|
try:
|
||||||
|
await websocket.close(code=1011, reason=f"Server error: {e}")
|
||||||
|
except Exception:
|
||||||
|
pass # Ignore errors during close after another error
|
||||||
finally:
|
finally:
|
||||||
websocket_task.cancel()
|
logger.info("Cleaning up WebSocket connection...")
|
||||||
await audio_processor.cleanup()
|
if websocket_task and not websocket_task.done():
|
||||||
logger.info("WebSocket endpoint cleaned up.")
|
websocket_task.cancel()
|
||||||
|
if receive_task and not receive_task.done():
|
||||||
|
receive_task.cancel()
|
||||||
|
|
||||||
|
if kit_args.audio_input == "websocket":
|
||||||
|
pass
|
||||||
|
|
||||||
|
logger.info("WebSocket connection closed.")
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Entry point for the CLI command."""
|
"""Entry point for the CLI command."""
|
||||||
import uvicorn
|
import uvicorn
|
||||||
|
|
||||||
|
# Get the globally parsed arguments
|
||||||
|
args = get_parsed_args()
|
||||||
|
|
||||||
|
# Set logger level based on args
|
||||||
|
log_level_name = args.log_level.upper()
|
||||||
|
# Ensure the level name is valid for the logging module
|
||||||
|
numeric_level = getattr(logging, log_level_name, None)
|
||||||
|
if not isinstance(numeric_level, int):
|
||||||
|
logging.warning(f"Invalid log level: {args.log_level}. Defaulting to INFO.")
|
||||||
|
numeric_level = logging.INFO
|
||||||
|
logging.getLogger().setLevel(numeric_level) # Set root logger level
|
||||||
|
# Set our specific logger level too
|
||||||
|
logger.setLevel(numeric_level)
|
||||||
|
logger.info(f"Log level set to: {log_level_name}")
|
||||||
|
|
||||||
|
# Determine uvicorn log level (map CRITICAL to critical, etc.)
|
||||||
|
uvicorn_log_level = log_level_name.lower()
|
||||||
|
if uvicorn_log_level == "debug": # Uvicorn uses 'trace' for more verbose than debug
|
||||||
|
uvicorn_log_level = "trace"
|
||||||
|
|
||||||
|
|
||||||
|
uvicorn_kwargs = {
|
||||||
|
"app": "whisperlivekit.basic_server:app",
|
||||||
|
"host":args.host,
|
||||||
|
"port":args.port,
|
||||||
|
"reload": False,
|
||||||
|
"log_level": uvicorn_log_level,
|
||||||
|
"lifespan": "on",
|
||||||
|
}
|
||||||
|
|
||||||
temp_kit = WhisperLiveKit(transcription=False, diarization=False)
|
ssl_kwargs = {}
|
||||||
|
if args.ssl_certfile or args.ssl_keyfile:
|
||||||
uvicorn.run(
|
if not (args.ssl_certfile and args.ssl_keyfile):
|
||||||
"whisperlivekit.basic_server:app",
|
raise ValueError("Both --ssl-certfile and --ssl-keyfile must be specified together.")
|
||||||
host=temp_kit.args.host,
|
ssl_kwargs = {
|
||||||
port=temp_kit.args.port,
|
"ssl_certfile": args.ssl_certfile,
|
||||||
reload=True,
|
"ssl_keyfile": args.ssl_keyfile
|
||||||
log_level="info"
|
}
|
||||||
)
|
|
||||||
|
|
||||||
|
if ssl_kwargs:
|
||||||
|
uvicorn_kwargs = {**uvicorn_kwargs, **ssl_kwargs}
|
||||||
|
|
||||||
|
uvicorn.run(**uvicorn_kwargs)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -1,10 +1,13 @@
|
|||||||
|
import sys
|
||||||
|
from argparse import Namespace, ArgumentParser
|
||||||
try:
|
try:
|
||||||
from whisperlivekit.whisper_streaming_custom.whisper_online import backend_factory, warmup_asr
|
from whisperlivekit.whisper_streaming_custom.whisper_online import backend_factory, warmup_asr
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from .whisper_streaming_custom.whisper_online import backend_factory, warmup_asr
|
if '.' not in sys.path:
|
||||||
from argparse import Namespace, ArgumentParser
|
sys.path.insert(0, '.')
|
||||||
|
from whisperlivekit.whisper_streaming_custom.whisper_online import backend_factory, warmup_asr
|
||||||
|
|
||||||
def parse_args():
|
def _parse_args_internal():
|
||||||
parser = ArgumentParser(description="Whisper FastAPI Online Server")
|
parser = ArgumentParser(description="Whisper FastAPI Online Server")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--host",
|
"--host",
|
||||||
@@ -130,35 +133,55 @@ def parse_args():
|
|||||||
help="Set the log level",
|
help="Set the log level",
|
||||||
default="DEBUG",
|
default="DEBUG",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--audio-input",
|
||||||
|
type=str,
|
||||||
|
default="websocket",
|
||||||
|
choices=["websocket", "pyaudiowpatch"],
|
||||||
|
help="Source of the audio input. 'websocket' expects audio via WebSocket (default). 'pyaudiowpatch' uses PyAudioWPatch to capture system audio output.",
|
||||||
|
)
|
||||||
|
parser.add_argument("--ssl-certfile", type=str, help="Path to the SSL certificate file.", default=None)
|
||||||
|
parser.add_argument("--ssl-keyfile", type=str, help="Path to the SSL private key file.", default=None)
|
||||||
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
args.transcription = not args.no_transcription
|
args.transcription = not args.no_transcription
|
||||||
args.vad = not args.no_vad
|
args.vad = not args.no_vad
|
||||||
delattr(args, 'no_transcription')
|
delattr(args, 'no_transcription')
|
||||||
delattr(args, 'no_vad')
|
delattr(args, 'no_vad')
|
||||||
|
|
||||||
return args
|
return args
|
||||||
|
|
||||||
|
_cli_args = _parse_args_internal()
|
||||||
|
|
||||||
|
def get_parsed_args() -> Namespace:
|
||||||
|
"""Returns the globally parsed command-line arguments."""
|
||||||
|
return _cli_args
|
||||||
|
|
||||||
|
# --- WhisperLiveKit Class ---
|
||||||
class WhisperLiveKit:
|
class WhisperLiveKit:
|
||||||
_instance = None
|
_instance = None
|
||||||
_initialized = False
|
_initialized = False
|
||||||
|
|
||||||
def __new__(cls, *args, **kwargs):
|
def __new__(cls, args: Namespace = None, **kwargs):
|
||||||
if cls._instance is None:
|
if cls._instance is None:
|
||||||
cls._instance = super().__new__(cls)
|
cls._instance = super().__new__(cls)
|
||||||
return cls._instance
|
return cls._instance
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, args: Namespace = None, **kwargs):
|
||||||
|
"""
|
||||||
|
Initializes WhisperLiveKit.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
args (Namespace, optional): Pre-parsed arguments. If None, uses globally parsed args.
|
||||||
|
Defaults to None.
|
||||||
|
**kwargs: Additional keyword arguments (currently not used directly but captured).
|
||||||
|
"""
|
||||||
if WhisperLiveKit._initialized:
|
if WhisperLiveKit._initialized:
|
||||||
return
|
return
|
||||||
|
|
||||||
default_args = vars(parse_args())
|
self.args = args if args is not None else get_parsed_args()
|
||||||
|
|
||||||
merged_args = {**default_args, **kwargs}
|
|
||||||
|
|
||||||
self.args = Namespace(**merged_args)
|
|
||||||
|
|
||||||
self.asr = None
|
self.asr = None
|
||||||
self.tokenizer = None
|
self.tokenizer = None
|
||||||
self.diarization = None
|
self.diarization = None
|
||||||
|
|||||||
@@ -38,7 +38,6 @@
|
|||||||
transform: scale(0.95);
|
transform: scale(0.95);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Shape inside the button */
|
|
||||||
.shape-container {
|
.shape-container {
|
||||||
width: 25px;
|
width: 25px;
|
||||||
height: 25px;
|
height: 25px;
|
||||||
@@ -56,6 +55,10 @@
|
|||||||
transition: all 0.3s ease;
|
transition: all 0.3s ease;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#recordButton:disabled .shape {
|
||||||
|
background-color: #6e6d6d;
|
||||||
|
}
|
||||||
|
|
||||||
#recordButton.recording .shape {
|
#recordButton.recording .shape {
|
||||||
border-radius: 5px;
|
border-radius: 5px;
|
||||||
width: 25px;
|
width: 25px;
|
||||||
@@ -279,7 +282,7 @@
|
|||||||
</div>
|
</div>
|
||||||
<div>
|
<div>
|
||||||
<label for="websocketInput">WebSocket URL:</label>
|
<label for="websocketInput">WebSocket URL:</label>
|
||||||
<input id="websocketInput" type="text" value="ws://localhost:8000/asr" />
|
<input id="websocketInput" type="text" />
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -304,6 +307,7 @@
|
|||||||
let waveCanvas = document.getElementById("waveCanvas");
|
let waveCanvas = document.getElementById("waveCanvas");
|
||||||
let waveCtx = waveCanvas.getContext("2d");
|
let waveCtx = waveCanvas.getContext("2d");
|
||||||
let animationFrame = null;
|
let animationFrame = null;
|
||||||
|
let waitingForStop = false;
|
||||||
waveCanvas.width = 60 * (window.devicePixelRatio || 1);
|
waveCanvas.width = 60 * (window.devicePixelRatio || 1);
|
||||||
waveCanvas.height = 30 * (window.devicePixelRatio || 1);
|
waveCanvas.height = 30 * (window.devicePixelRatio || 1);
|
||||||
waveCtx.scale(window.devicePixelRatio || 1, window.devicePixelRatio || 1);
|
waveCtx.scale(window.devicePixelRatio || 1, window.devicePixelRatio || 1);
|
||||||
@@ -315,6 +319,13 @@
|
|||||||
const linesTranscriptDiv = document.getElementById("linesTranscript");
|
const linesTranscriptDiv = document.getElementById("linesTranscript");
|
||||||
const timerElement = document.querySelector(".timer");
|
const timerElement = document.querySelector(".timer");
|
||||||
|
|
||||||
|
const host = window.location.hostname || "localhost";
|
||||||
|
const port = window.location.port || "8000";
|
||||||
|
const protocol = window.location.protocol === "https:" ? "wss" : "ws";
|
||||||
|
const defaultWebSocketUrl = `${protocol}://${host}:${port}/asr`;
|
||||||
|
websocketInput.value = defaultWebSocketUrl;
|
||||||
|
websocketUrl = defaultWebSocketUrl;
|
||||||
|
|
||||||
chunkSelector.addEventListener("change", () => {
|
chunkSelector.addEventListener("change", () => {
|
||||||
chunkDuration = parseInt(chunkSelector.value);
|
chunkDuration = parseInt(chunkSelector.value);
|
||||||
});
|
});
|
||||||
@@ -346,10 +357,16 @@
|
|||||||
|
|
||||||
websocket.onclose = () => {
|
websocket.onclose = () => {
|
||||||
if (userClosing) {
|
if (userClosing) {
|
||||||
statusText.textContent = "WebSocket closed by user.";
|
if (!statusText.textContent.includes("Recording stopped. Processing final audio")) { // This is a bit of a hack. We should have a better way to handle this. eg. using a status code.
|
||||||
|
statusText.textContent = "Finished processing audio! Ready to record again.";
|
||||||
|
}
|
||||||
|
waitingForStop = false;
|
||||||
} else {
|
} else {
|
||||||
statusText.textContent =
|
statusText.textContent =
|
||||||
"Disconnected from the WebSocket server. (Check logs if model is loading.)";
|
"Disconnected from the WebSocket server. (Check logs if model is loading.)";
|
||||||
|
if (isRecording) {
|
||||||
|
stopRecording();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
userClosing = false;
|
userClosing = false;
|
||||||
};
|
};
|
||||||
@@ -363,6 +380,27 @@
|
|||||||
websocket.onmessage = (event) => {
|
websocket.onmessage = (event) => {
|
||||||
const data = JSON.parse(event.data);
|
const data = JSON.parse(event.data);
|
||||||
|
|
||||||
|
// Check for status messages
|
||||||
|
if (data.type === "ready_to_stop") {
|
||||||
|
console.log("Ready to stop, closing WebSocket");
|
||||||
|
|
||||||
|
// signal that we are not waiting for stop anymore
|
||||||
|
waitingForStop = false;
|
||||||
|
recordButton.disabled = false; // this should be elsewhere
|
||||||
|
console.log("Record button enabled");
|
||||||
|
|
||||||
|
//Now we can close the WebSocket
|
||||||
|
if (websocket) {
|
||||||
|
websocket.close();
|
||||||
|
websocket = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle normal transcription updates
|
||||||
const {
|
const {
|
||||||
lines = [],
|
lines = [],
|
||||||
buffer_transcription = "",
|
buffer_transcription = "",
|
||||||
@@ -494,8 +532,17 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function stopRecording() {
|
async function stopRecording() {
|
||||||
userClosing = true;
|
userClosing = true;
|
||||||
|
waitingForStop = true;
|
||||||
|
|
||||||
|
if (websocket && websocket.readyState === WebSocket.OPEN) {
|
||||||
|
// Send empty audio buffer as stop signal
|
||||||
|
const emptyBlob = new Blob([], { type: 'audio/webm' });
|
||||||
|
websocket.send(emptyBlob);
|
||||||
|
statusText.textContent = "Recording stopped. Processing final audio...";
|
||||||
|
}
|
||||||
|
|
||||||
if (recorder) {
|
if (recorder) {
|
||||||
recorder.stop();
|
recorder.stop();
|
||||||
recorder = null;
|
recorder = null;
|
||||||
@@ -531,34 +578,67 @@
|
|||||||
timerElement.textContent = "00:00";
|
timerElement.textContent = "00:00";
|
||||||
startTime = null;
|
startTime = null;
|
||||||
|
|
||||||
isRecording = false;
|
if (websocket && websocket.readyState === WebSocket.OPEN) {
|
||||||
|
try {
|
||||||
if (websocket) {
|
await websocket.send(JSON.stringify({
|
||||||
websocket.close();
|
type: "stop",
|
||||||
websocket = null;
|
message: "User stopped recording"
|
||||||
|
}));
|
||||||
|
statusText.textContent = "Recording stopped. Processing final audio...";
|
||||||
|
} catch (e) {
|
||||||
|
console.error("Could not send stop message:", e);
|
||||||
|
statusText.textContent = "Recording stopped. Error during final audio processing.";
|
||||||
|
websocket.close();
|
||||||
|
websocket = null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
isRecording = false;
|
||||||
updateUI();
|
updateUI();
|
||||||
}
|
}
|
||||||
|
|
||||||
async function toggleRecording() {
|
async function toggleRecording() {
|
||||||
if (!isRecording) {
|
if (!isRecording) {
|
||||||
linesTranscriptDiv.innerHTML = "";
|
if (waitingForStop) {
|
||||||
|
console.log("Waiting for stop, early return");
|
||||||
|
return; // Early return, UI is already updated
|
||||||
|
}
|
||||||
|
console.log("Connecting to WebSocket");
|
||||||
try {
|
try {
|
||||||
await setupWebSocket();
|
// If we have an active WebSocket that's still processing, just restart audio capture
|
||||||
await startRecording();
|
if (websocket && websocket.readyState === WebSocket.OPEN) {
|
||||||
|
await startRecording();
|
||||||
|
} else {
|
||||||
|
// If no active WebSocket or it's closed, create new one
|
||||||
|
await setupWebSocket();
|
||||||
|
await startRecording();
|
||||||
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
statusText.textContent = "Could not connect to WebSocket or access mic. Aborted.";
|
statusText.textContent = "Could not connect to WebSocket or access mic. Aborted.";
|
||||||
console.error(err);
|
console.error(err);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
console.log("Stopping recording");
|
||||||
stopRecording();
|
stopRecording();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function updateUI() {
|
function updateUI() {
|
||||||
recordButton.classList.toggle("recording", isRecording);
|
recordButton.classList.toggle("recording", isRecording);
|
||||||
statusText.textContent = isRecording ? "Recording..." : "Click to start transcription";
|
|
||||||
|
if (waitingForStop) {
|
||||||
|
statusText.textContent = "Please wait for processing to complete...";
|
||||||
|
recordButton.disabled = true; // Optionally disable the button while waiting
|
||||||
|
console.log("Record button disabled");
|
||||||
|
} else if (isRecording) {
|
||||||
|
statusText.textContent = "Recording...";
|
||||||
|
recordButton.disabled = false;
|
||||||
|
console.log("Record button enabled");
|
||||||
|
} else {
|
||||||
|
statusText.textContent = "Click to start transcription";
|
||||||
|
recordButton.disabled = false;
|
||||||
|
console.log("Record button enabled");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
recordButton.addEventListener("click", toggleRecording);
|
recordButton.addEventListener("click", toggleRecording);
|
||||||
|
|||||||
@@ -105,8 +105,9 @@ class FasterWhisperASR(ASRBase):
|
|||||||
model_size_or_path = modelsize
|
model_size_or_path = modelsize
|
||||||
else:
|
else:
|
||||||
raise ValueError("Either modelsize or model_dir must be set")
|
raise ValueError("Either modelsize or model_dir must be set")
|
||||||
device = "cuda" if torch and torch.cuda.is_available() else "cpu"
|
device = "auto" # Allow CTranslate2 to decide available device
|
||||||
compute_type = "float16" if device == "cuda" else "float32"
|
compute_type = "auto" # Allow CTranslate2 to decide faster compute type
|
||||||
|
|
||||||
|
|
||||||
model = WhisperModel(
|
model = WhisperModel(
|
||||||
model_size_or_path,
|
model_size_or_path,
|
||||||
@@ -252,8 +253,8 @@ class OpenaiApiASR(ASRBase):
|
|||||||
no_speech_segments = []
|
no_speech_segments = []
|
||||||
if self.use_vad_opt:
|
if self.use_vad_opt:
|
||||||
for segment in segments.segments:
|
for segment in segments.segments:
|
||||||
if segment["no_speech_prob"] > 0.8:
|
if segment.no_speech_prob > 0.8:
|
||||||
no_speech_segments.append((segment.get("start"), segment.get("end")))
|
no_speech_segments.append((segment.start, segment.end))
|
||||||
tokens = []
|
tokens = []
|
||||||
for word in segments.words:
|
for word in segments.words:
|
||||||
start = word.start
|
start = word.start
|
||||||
|
|||||||
@@ -216,31 +216,54 @@ class OnlineASRProcessor:
|
|||||||
"""
|
"""
|
||||||
If the committed tokens form at least two sentences, chunk the audio
|
If the committed tokens form at least two sentences, chunk the audio
|
||||||
buffer at the end time of the penultimate sentence.
|
buffer at the end time of the penultimate sentence.
|
||||||
|
Also ensures chunking happens if audio buffer exceeds a time limit.
|
||||||
"""
|
"""
|
||||||
|
buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE
|
||||||
if not self.committed:
|
if not self.committed:
|
||||||
|
if buffer_duration > self.buffer_trimming_sec:
|
||||||
|
chunk_time = self.buffer_time_offset + (buffer_duration / 2)
|
||||||
|
logger.debug(f"--- No speech detected, forced chunking at {chunk_time:.2f}")
|
||||||
|
self.chunk_at(chunk_time)
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.debug("COMPLETED SENTENCE: " + " ".join(token.text for token in self.committed))
|
logger.debug("COMPLETED SENTENCE: " + " ".join(token.text for token in self.committed))
|
||||||
sentences = self.words_to_sentences(self.committed)
|
sentences = self.words_to_sentences(self.committed)
|
||||||
for sentence in sentences:
|
for sentence in sentences:
|
||||||
logger.debug(f"\tSentence: {sentence.text}")
|
logger.debug(f"\tSentence: {sentence.text}")
|
||||||
if len(sentences) < 2:
|
|
||||||
return
|
chunk_done = False
|
||||||
# Keep the last two sentences.
|
if len(sentences) >= 2:
|
||||||
while len(sentences) > 2:
|
while len(sentences) > 2:
|
||||||
sentences.pop(0)
|
sentences.pop(0)
|
||||||
chunk_time = sentences[-2].end
|
chunk_time = sentences[-2].end
|
||||||
logger.debug(f"--- Sentence chunked at {chunk_time:.2f}")
|
logger.debug(f"--- Sentence chunked at {chunk_time:.2f}")
|
||||||
self.chunk_at(chunk_time)
|
self.chunk_at(chunk_time)
|
||||||
|
chunk_done = True
|
||||||
|
|
||||||
|
if not chunk_done and buffer_duration > self.buffer_trimming_sec:
|
||||||
|
last_committed_time = self.committed[-1].end
|
||||||
|
logger.debug(f"--- Not enough sentences, chunking at last committed time {last_committed_time:.2f}")
|
||||||
|
self.chunk_at(last_committed_time)
|
||||||
|
|
||||||
def chunk_completed_segment(self, res):
|
def chunk_completed_segment(self, res):
|
||||||
"""
|
"""
|
||||||
Chunk the audio buffer based on segment-end timestamps reported by the ASR.
|
Chunk the audio buffer based on segment-end timestamps reported by the ASR.
|
||||||
|
Also ensures chunking happens if audio buffer exceeds a time limit.
|
||||||
"""
|
"""
|
||||||
|
buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE
|
||||||
if not self.committed:
|
if not self.committed:
|
||||||
|
if buffer_duration > self.buffer_trimming_sec:
|
||||||
|
chunk_time = self.buffer_time_offset + (buffer_duration / 2)
|
||||||
|
logger.debug(f"--- No speech detected, forced chunking at {chunk_time:.2f}")
|
||||||
|
self.chunk_at(chunk_time)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
logger.debug("Processing committed tokens for segmenting")
|
||||||
ends = self.asr.segments_end_ts(res)
|
ends = self.asr.segments_end_ts(res)
|
||||||
last_committed_time = self.committed[-1].end
|
last_committed_time = self.committed[-1].end
|
||||||
|
chunk_done = False
|
||||||
if len(ends) > 1:
|
if len(ends) > 1:
|
||||||
|
logger.debug("Multiple segments available for chunking")
|
||||||
e = ends[-2] + self.buffer_time_offset
|
e = ends[-2] + self.buffer_time_offset
|
||||||
while len(ends) > 2 and e > last_committed_time:
|
while len(ends) > 2 and e > last_committed_time:
|
||||||
ends.pop(-1)
|
ends.pop(-1)
|
||||||
@@ -248,11 +271,18 @@ class OnlineASRProcessor:
|
|||||||
if e <= last_committed_time:
|
if e <= last_committed_time:
|
||||||
logger.debug(f"--- Segment chunked at {e:.2f}")
|
logger.debug(f"--- Segment chunked at {e:.2f}")
|
||||||
self.chunk_at(e)
|
self.chunk_at(e)
|
||||||
|
chunk_done = True
|
||||||
else:
|
else:
|
||||||
logger.debug("--- Last segment not within committed area")
|
logger.debug("--- Last segment not within committed area")
|
||||||
else:
|
else:
|
||||||
logger.debug("--- Not enough segments to chunk")
|
logger.debug("--- Not enough segments to chunk")
|
||||||
|
|
||||||
|
if not chunk_done and buffer_duration > self.buffer_trimming_sec:
|
||||||
|
logger.debug(f"--- Buffer too large, chunking at last committed time {last_committed_time:.2f}")
|
||||||
|
self.chunk_at(last_committed_time)
|
||||||
|
|
||||||
|
logger.debug("Segment chunking complete")
|
||||||
|
|
||||||
def chunk_at(self, time: float):
|
def chunk_at(self, time: float):
|
||||||
"""
|
"""
|
||||||
Trim both the hypothesis and audio buffer at the given time.
|
Trim both the hypothesis and audio buffer at the given time.
|
||||||
@@ -358,7 +388,7 @@ class VACOnlineASRProcessor:
|
|||||||
# Load a VAD model (e.g. Silero VAD)
|
# Load a VAD model (e.g. Silero VAD)
|
||||||
import torch
|
import torch
|
||||||
model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
|
model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
|
||||||
from silero_vad_iterator import FixedVADIterator
|
from .silero_vad_iterator import FixedVADIterator
|
||||||
|
|
||||||
self.vac = FixedVADIterator(model)
|
self.vac = FixedVADIterator(model)
|
||||||
self.logfile = self.online.logfile
|
self.logfile = self.online.logfile
|
||||||
|
|||||||
@@ -179,7 +179,7 @@ def warmup_asr(asr, warmup_file=None, timeout=5):
|
|||||||
logger.warning(f"Warmup file {warmup_file} invalid or missing.")
|
logger.warning(f"Warmup file {warmup_file} invalid or missing.")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
print(f"Warmping up Whisper with {warmup_file}")
|
print(f"Warming up Whisper with {warmup_file}")
|
||||||
try:
|
try:
|
||||||
import librosa
|
import librosa
|
||||||
audio, sr = librosa.load(warmup_file, sr=16000)
|
audio, sr = librosa.load(warmup_file, sr=16000)
|
||||||
|
|||||||
Reference in New Issue
Block a user