34 Commits
0.0.1 ... 0.1.4

Author SHA1 Message Date
Quentin Fuxa
d94560ef37 Merge branch 'main' of https://github.com/QuentinFuxa/whisper_streaming_web 2025-04-09 11:38:57 +02:00
Quentin Fuxa
f62baa80b7 to 0.1.4 2025-04-09 11:35:22 +02:00
Quentin Fuxa
0b43035701 enhance chunking to handle audio buffer time limits 2025-04-09 11:34:59 +02:00
Quentin Fuxa
704170ccf3 Logs for https://github.com/QuentinFuxa/WhisperLiveKit/issues/110 https://github.com/QuentinFuxa/WhisperLiveKit/issues/106
https://github.com/QuentinFuxa/WhisperLiveKit/issues/90
https://github.com/QuentinFuxa/WhisperLiveKit/issues/87
https://github.com/QuentinFuxa/WhisperLiveKit/issues/81
https://github.com/QuentinFuxa/WhisperLiveKit/issues/2
2025-04-09 11:34:27 +02:00
Quentin Fuxa
09279c572a Merge pull request #116 from needabetterusername/solve-115
Solve #115: VAC Broken import
2025-04-09 10:16:49 +02:00
Quentin Fuxa
23e41f993f typos 2025-04-09 10:14:23 +02:00
Quentin Fuxa
c791b1e125 Merge pull request #114 from needabetterusername/implement-69-clean
(Re-) Implement #69 (Dockerfile)
2025-04-09 10:10:08 +02:00
Quentin Fuxa
3de2990ec4 Update README to clarify Docker usage for non-GPU systems 2025-04-09 10:08:48 +02:00
Chris Margach
51e6a6f6f9 update import after moving target file 2025-04-09 13:33:39 +09:00
Chris Margach
f6e53b2fab return silero_vad_iterator.py to whisper_streaming(_custom) package. 2025-04-09 11:59:21 +09:00
Chris Margach
5d6f08ff7a Update readme for Dockerfile 2025-04-08 19:06:42 +09:00
Chris Margach
583a26da88 Add Dockerfile w/ GPU support. 2025-04-08 19:06:11 +09:00
Chris Margach
5b3d8969e8 Merge branch 'main' of https://github.com/QuentinFuxa/WhisperLiveKit 2025-04-08 09:44:20 +09:00
Quentin Fuxa
40cca184c1 Merge pull request #113 from needabetterusername/implement-107
Allow CTranslate2 backend to choose device and compute types.
2025-04-07 14:42:57 +02:00
Chris Margach
47ed345f9e Merge branch 'implement-107' 2025-04-07 17:40:08 +09:00
Chris Margach
9c9c179684 Allow CTranslate2 backend to choose device and compute types. 2025-04-07 14:47:29 +09:00
Quentin Fuxa
b870c12f62 Merge pull request #109 from QuentinFuxa/needabetterusername/implement-69
Needabetterusername/implement 69
2025-04-04 11:10:08 +02:00
Quentin Fuxa
cfd5905fd4 Improve WebSocket fallback logic
Use window.location.hostname and port if available,
otherwise fallback to localhost:8000.

Co-authored-by: Chris Margach <hcagramc@gmail.com>
2025-04-04 11:08:05 +02:00
Chris Margach
2399487e45 Implement #107 2025-04-04 10:54:15 +09:00
Quentin Fuxa
afd88310fd Merge branch 'main' of https://github.com/QuentinFuxa/whisper_streaming_web 2025-04-02 11:56:25 +02:00
Quentin Fuxa
080f446b0d start implementing frontend part of https://github.com/QuentinFuxa/WhisperLiveKit/pull/80 2025-04-02 11:56:02 +02:00
Quentin Fuxa
8bd2b36488 Add files via upload 2025-04-01 11:03:22 +02:00
Quentin Fuxa
25fd924bf9 Merge pull request #103 from QuentinFuxa/readme
Update README.md
2025-03-28 14:30:35 +01:00
Quentin Fuxa
ff8fd0ec72 Update README.md 2025-03-28 14:30:14 +01:00
Quentin Fuxa
e99f53e649 Corrects 'TranscriptionSegment' object is not subscriptable 2025-03-24 21:16:08 +01:00
Quentin Fuxa
e9022894b2 solve #100 2025-03-24 20:38:47 +01:00
Quentin Fuxa
ccf99cecdf Solve #95 and #96 2025-03-24 17:55:52 +01:00
Quentin Fuxa
40e2814cd7 0.1.2 2025-03-20 11:08:40 +01:00
Quentin Fuxa
cd29eace3d Update README.md 2025-03-20 10:23:14 +01:00
Quentin Fuxa
38cb54640f Update README.md 2025-03-19 15:49:39 +01:00
Quentin Fuxa
81268a7ca3 update CLI launch 2025-03-19 15:40:54 +01:00
Quentin Fuxa
33cbd24964 Update README.md 2025-03-19 15:14:38 +01:00
Quentin Fuxa
e966e78584 Merge pull request #92 from QuentinFuxa/refacto_lib
script to lib
2025-03-19 15:13:42 +01:00
Quentin Fuxa
c13d36b5e7 Merge pull request #91 from QuentinFuxa/refacto_lib
move all audio processing out of /asr endpoint
2025-03-19 11:20:28 +01:00
14 changed files with 734 additions and 183 deletions

82
Dockerfile Normal file
View File

@@ -0,0 +1,82 @@
FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
WORKDIR /app
ARG EXTRAS
ARG HF_PRECACHE_DIR
ARG HF_TKN_FILE
# Install system dependencies
#RUN apt-get update && \
# apt-get install -y ffmpeg git && \
# apt-get clean && \
# rm -rf /var/lib/apt/lists/*
# 2) Install system dependencies + Python + pip
RUN apt-get update && \
apt-get install -y --no-install-recommends \
python3 \
python3-pip \
ffmpeg \
git && \
rm -rf /var/lib/apt/lists/*
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
COPY . .
# Install WhisperLiveKit directly, allowing for optional dependencies
# Note: For gated models, you need to add your HF token. See README.md
# for more details.
RUN if [ -n "$EXTRAS" ]; then \
echo "Installing with extras: [$EXTRAS]"; \
pip install --no-cache-dir .[$EXTRAS]; \
else \
echo "Installing base package only"; \
pip install --no-cache-dir .; \
fi
# Enable in-container caching for Hugging Face models by:
# Note: If running multiple containers, better to map a shared
# bucket.
#
# A) Make the cache directory persistent via an anonymous volume.
# Note: This only persists for a single, named container. This is
# only for convenience at the dev/test stage.
# For prod, it is better to use a named volume via host mount/k8s.
VOLUME ["/root/.cache/huggingface/hub"]
# or
# B) Conditionally copy a local pre-cache from the build context to the
# container's cache via the HF_PRECACHE_DIR build-arg.
# WARNING: This will copy ALL files in the pre-cache location.
# Conditionally copy a cache directory if provided
RUN if [ -n "$HF_PRECACHE_DIR" ]; then \
echo "Copying Hugging Face cache from $HF_PRECACHE_DIR"; \
mkdir -p /root/.cache/huggingface/hub && \
cp -r $HF_PRECACHE_DIR/* /root/.cache/huggingface/hub; \
else \
echo "No local Hugging Face cache specified, skipping copy"; \
fi
# Conditionally copy a Hugging Face token if provided
RUN if [ -n "$HF_TKN_FILE" ]; then \
echo "Copying Hugging Face token from $HF_TKN_FILE"; \
mkdir -p /root/.cache/huggingface && \
cp $HF_TKN_FILE /root/.cache/huggingface/token; \
else \
echo "No Hugging Face token file specified, skipping token setup"; \
fi
# Expose port for the transcription server
EXPOSE 8000
ENTRYPOINT ["whisperlivekit-server", "--host", "0.0.0.0"]
# Default args
CMD ["--model", "tiny.en"]

386
README.md
View File

@@ -1,158 +1,316 @@
<h1 align="center">WhisperLiveKit</h1>
<p align="center"><b>Real-time, Fully Local Whisper's Speech-to-Text and Speaker Diarization</b></p>
This project is based on [Whisper Streaming](https://github.com/ufal/whisper_streaming) and lets you transcribe audio directly from your browser. Simply launch the local server and grant microphone access. Everything runs locally on your machine ✨
<p align="center">
<img src="https://raw.githubusercontent.com/QuentinFuxa/WhisperLiveKit/demo.png" alt="Demo Screenshot" width="730">
<img src="https://raw.githubusercontent.com/QuentinFuxa/WhisperLiveKit/refs/heads/main/demo.png" alt="WhisperLiveKit Demo" width="730">
</p>
### Differences from [Whisper Streaming](https://github.com/ufal/whisper_streaming)
<p align="center"><b>Real-time, Fully Local Speech-to-Text with Speaker Diarization</b></p>
#### ⚙️ **Core Improvements**
<p align="center">
<a href="https://pypi.org/project/whisperlivekit/"><img alt="PyPI Version" src="https://img.shields.io/pypi/v/whisperlivekit?color=g"></a>
<a href="https://pepy.tech/project/whisperlivekit"><img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/whisperlivekit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads"></a>
<a href="https://pypi.org/project/whisperlivekit/"><img alt="Python Versions" src="https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-dark_green"></a>
<a href="https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/QuentinFuxa/WhisperLiveKit?color=blue"></a>
</p>
## 🚀 Overview
This project is based on [Whisper Streaming](https://github.com/ufal/whisper_streaming) and lets you transcribe audio directly from your browser. WhisperLiveKit provides a complete backend solution for real-time speech transcription with an example frontend that you can customize for your own needs. Everything runs locally on your machine ✨
### 🔄 Architecture
WhisperLiveKit consists of two main components:
- **Backend (Server)**: FastAPI WebSocket server that processes audio and provides real-time transcription
- **Frontend Example**: Basic HTML & JavaScript implementation that demonstrates how to capture and stream audio
> **Note**: We recommend installing this library on the server/backend. For the frontend, you can use and adapt the provided HTML template from [whisperlivekit/web/live_transcription.html](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/web/live_transcription.html) for your specific use case.
### ✨ Key Features
- **🎙️ Real-time Transcription** - Convert speech to text instantly as you speak
- **👥 Speaker Diarization** - Identify different speakers in real-time using [Diart](https://github.com/juanmc2005/diart)
- **🔒 Fully Local** - All processing happens on your machine - no data sent to external servers
- **📱 Multi-User Support** - Handle multiple users simultaneously with a single backend/server
### ⚙️ Differences from [Whisper Streaming](https://github.com/ufal/whisper_streaming)
- **Multi-User Support** Handles multiple users simultaneously by decoupling backend and online ASR
- **MLX Whisper Backend** Optimized for Apple Silicon for faster local processing
- **Buffering Preview** Displays unvalidated transcription segments
- **Multi-User Support** Handles multiple users simultaneously by decoupling backend and online asr
- **MLX Whisper Backend** Optimized for Apple Silicon for faster local processing.
- **Confidence validation** Immediately validate high-confidence tokens for faster inference
- **Confidence Validation** Immediately validate high-confidence tokens for faster inference
- **Apple Silicon Optimized** - MLX backend for faster local processing on Mac
#### 🎙️ **Speaker Identification**
- **Real-Time Diarization** Identify different speakers in real time using [Diart](https://github.com/juanmc2005/diart)
## 📖 Quick Start
#### 🌐 **Web & API**
- **Built-in Web UI** Simple raw html browser interface with no frontend setup required
- **FastAPI WebSocket Server** Real-time speech-to-text processing with async FFmpeg streaming.
- **JavaScript Client** Ready-to-use MediaRecorder implementation for seamless client-side integration.
```bash
# Install the package
pip install whisperlivekit
## Installation
# Start the transcription server
whisperlivekit-server --model tiny.en
### Via pip
# Open your browser at http://localhost:8000
```
That's it! Start speaking and watch your words appear on screen.
## 🛠️ Installation Options
### Install from PyPI (Recommended)
```bash
pip install whisperlivekit
```
### From source
### Install from Source
1. **Clone the Repository**:
```bash
git clone https://github.com/QuentinFuxa/WhisperLiveKit
cd WhisperLiveKit
pip install -e .
```
```bash
git clone https://github.com/QuentinFuxa/WhisperLiveKit
cd WhisperLiveKit
pip install -e .
```
### System Dependencies
You need to install FFmpeg on your system:
FFmpeg is required:
- Install system dependencies:
```bash
# Install FFmpeg on your system (required for audio processing)
# For Ubuntu/Debian:
sudo apt install ffmpeg
# For macOS:
brew install ffmpeg
# For Windows:
# Download from https://ffmpeg.org/download.html and add to PATH
```
```bash
# Ubuntu/Debian
sudo apt install ffmpeg
- Install required Python dependencies:
# macOS
brew install ffmpeg
```bash
# Whisper streaming required dependencies
pip install librosa soundfile
# Windows
# Download from https://ffmpeg.org/download.html and add to PATH
```
# Whisper streaming web required dependencies
pip install fastapi ffmpeg-python
```
- Install at least one whisper backend among:
### Optional Dependencies
```
whisper
whisper-timestamped
faster-whisper (faster backend on NVIDIA GPU)
mlx-whisper (faster backend on Apple Silicon)
```bash
# Voice Activity Controller (prevents hallucinations)
pip install torch
# Sentence-based buffer trimming
pip install mosestokenizer wtpsplit
pip install tokenize_uk # If you work with Ukrainian text
# Speaker diarization
pip install diart
# Alternative Whisper backends (default is faster-whisper)
pip install whisperlivekit[whisper] # Original Whisper
pip install whisperlivekit[whisper-timestamped] # Improved timestamps
pip install whisperlivekit[mlx-whisper] # Apple Silicon optimization
pip install whisperlivekit[openai] # OpenAI API
```
### 🎹 Pyannote Models Setup
For diarization, you need access to pyannote.audio models:
1. [Accept user conditions](https://huggingface.co/pyannote/segmentation) for the `pyannote/segmentation` model
2. [Accept user conditions](https://huggingface.co/pyannote/segmentation-3.0) for the `pyannote/segmentation-3.0` model
3. [Accept user conditions](https://huggingface.co/pyannote/embedding) for the `pyannote/embedding` model
4. Login with HuggingFace:
```bash
pip install huggingface_hub
huggingface-cli login
```
- Optional dependencies
```
# If you want to use VAC (Voice Activity Controller). Useful for preventing hallucinations
torch
# If you choose sentences as buffer trimming strategy
mosestokenizer
wtpsplit
tokenize_uk # If you work with Ukrainian text
## 💻 Usage Examples
# If you want to run the server using uvicorn (recommended)
uvicorn
### Command-line Interface
# If you want to use diarization
diart
```
Start the transcription server with various options:
Diart uses by default [pyannote.audio](https://github.com/pyannote/pyannote-audio) models from the _huggingface hub_. To use them, please follow the steps described [here](https://github.com/juanmc2005/diart?tab=readme-ov-file#get-access-to--pyannote-models).
```bash
# Basic server with English model
whisperlivekit-server --model tiny.en
# Advanced configuration with diarization
whisperlivekit-server --host 0.0.0.0 --port 8000 --model medium --diarization --language auto
```
3. **Run the FastAPI Server**:
### Python API Integration (Backend)
```bash
python whisper_fastapi_online_server.py --host 0.0.0.0 --port 8000
```
```python
from whisperlivekit import WhisperLiveKit
from whisperlivekit.audio_processor import AudioProcessor
from fastapi import FastAPI, WebSocket
import asyncio
from fastapi.responses import HTMLResponse
**Parameters**
The following parameters are supported:
- `--host` and `--port` let you specify the server's IP/port.
- `--min-chunk-size` sets the minimum chunk size for audio processing. Make sure this value aligns with the chunk size selected in the frontend. If not aligned, the system will work but may unnecessarily over-process audio data.
- `--transcription`: Enable/disable transcription (default: True)
- `--diarization`: Enable/disable speaker diarization (default: False)
- `--confidence-validation`: Use confidence scores for faster validation. Transcription will be faster but punctuation might be less accurate (default: True)
- `--warmup-file`: The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast:
- If not set, uses https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav.
- If False, no warmup is performed.
- `--min-chunk-size` Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.
- `--model` {_tiny.en, tiny, base.en, base, small.en, small, medium.en, medium, large-v1, large-v2, large-v3, large, large-v3-turbo_}
Name size of the Whisper model to use (default: tiny). The model is automatically downloaded from the model hub if not present in model cache dir.
- `--model_cache_dir` Overriding the default model cache dir where models downloaded from the hub are saved
- `--model_dir` Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter.
- `--lan`, `--language` Source language code, e.g. en,de,cs, or 'auto' for language detection.
- `--task` {_transcribe, translate_} Transcribe or translate. If translate is set, we recommend avoiding the _large-v3-turbo_ backend, as it [performs significantly worse](https://github.com/QuentinFuxa/whisper_streaming_web/issues/40#issuecomment-2652816533) than other models for translation.
- `--backend` {_faster-whisper, whisper_timestamped, openai-api, mlx-whisper_} Load only this backend for Whisper processing.
- `--vac` Use VAC = voice activity controller. Requires torch.
- `--vac-chunk-size` VAC sample size in seconds.
- `--vad` Use VAD = voice activity detection, with the default parameters.
- `--buffer_trimming` {_sentence, segment_} Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.
- `--buffer_trimming_sec` Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.
# Initialize components
app = FastAPI()
kit = WhisperLiveKit(model="medium", diarization=True)
5. **Open the Provided HTML**:
# Serve the web interface
@app.get("/")
async def get():
return HTMLResponse(kit.web_interface()) # Use the built-in web interface
- By default, the server root endpoint `/` serves a simple `live_transcription.html` page.
- Open your browser at `http://localhost:8000` (or replace `localhost` and `8000` with whatever you specified).
- The page uses vanilla JavaScript and the WebSocket API to capture your microphone and stream audio to the server in real time.
# Process WebSocket connections
async def handle_websocket_results(websocket, results_generator):
async for response in results_generator:
await websocket.send_json(response)
### How the Live Interface Works
@app.websocket("/asr")
async def websocket_endpoint(websocket: WebSocket):
audio_processor = AudioProcessor()
await websocket.accept()
results_generator = await audio_processor.create_tasks()
websocket_task = asyncio.create_task(
handle_websocket_results(websocket, results_generator)
)
- Once you **allow microphone access**, the page records small chunks of audio using the **MediaRecorder** API in **webm/opus** format.
- These chunks are sent over a **WebSocket** to the FastAPI endpoint at `/asr`.
- The Python server decodes `.webm` chunks on the fly using **FFmpeg** and streams them into the **whisper streaming** implementation for transcription.
- **Partial transcription** appears as soon as enough audio is processed. The “unvalidated” text is shown in **lighter or grey color** (i.e., an aperçu) to indicate it's still buffered partial output. Once Whisper finalizes that segment, it's displayed in normal text.
- You can watch the transcription update in near real time, ideal for demos, prototyping, or quick debugging.
try:
while True:
message = await websocket.receive_bytes()
await audio_processor.process_audio(message)
except Exception as e:
print(f"WebSocket error: {e}")
websocket_task.cancel()
```
### Deploying to a Remote Server
### Frontend Implementation
If you want to **deploy** this setup:
The package includes a simple HTML/JavaScript implementation that you can adapt for your project. You can get it in [whisperlivekit/web/live_transcription.html](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/web/live_transcription.html), or by calling:
1. **Host the FastAPI app** behind a production-grade HTTP(S) server (like **Uvicorn + Nginx** or Docker). If you use HTTPS, use "wss" instead of "ws" in WebSocket URL.
2. The **HTML/JS page** can be served by the same FastAPI app or a separate static host.
3. Users open the page in **Chrome/Firefox** (any modern browser that supports MediaRecorder + WebSocket).
```python
kit.web_interface()
```
No additional front-end libraries or frameworks are required. The WebSocket logic in `live_transcription.html` is minimal enough to adapt for your own custom UI or embed in other pages.
## ⚙️ Configuration Reference
## Acknowledgments
WhisperLiveKit offers extensive configuration options:
This project builds upon the foundational work of the Whisper Streaming project. We extend our gratitude to the original authors for their contributions.
| Parameter | Description | Default |
|-----------|-------------|---------|
| `--host` | Server host address | `localhost` |
| `--port` | Server port | `8000` |
| `--model` | Whisper model size | `tiny` |
| `--language` | Source language code or `auto` | `en` |
| `--task` | `transcribe` or `translate` | `transcribe` |
| `--backend` | Processing backend | `faster-whisper` |
| `--diarization` | Enable speaker identification | `False` |
| `--confidence-validation` | Use confidence scores for faster validation | `False` |
| `--min-chunk-size` | Minimum audio chunk size (seconds) | `1.0` |
| `--vac` | Use Voice Activity Controller | `False` |
| `--no-vad` | Disable Voice Activity Detection | `False` |
| `--buffer_trimming` | Buffer trimming strategy (`sentence` or `segment`) | `segment` |
| `--warmup-file` | Audio file path for model warmup | `jfk.wav` |
## 🔧 How It Works
<p align="center">
<img src="https://raw.githubusercontent.com/QuentinFuxa/WhisperLiveKit/refs/heads/main/demo.png" alt="WhisperLiveKit in Action" width="500">
</p>
1. **Audio Capture**: Browser's MediaRecorder API captures audio in webm/opus format
2. **Streaming**: Audio chunks are sent to the server via WebSocket
3. **Processing**: Server decodes audio with FFmpeg and streams into Whisper for transcription
4. **Real-time Output**:
- Partial transcriptions appear immediately in light gray (the 'aperçu')
- Finalized text appears in normal color
- (When enabled) Different speakers are identified and highlighted
## 🚀 Deployment Guide
To deploy WhisperLiveKit in production:
1. **Server Setup** (Backend):
```bash
# Install production ASGI server
pip install uvicorn gunicorn
# Launch with multiple workers
gunicorn -k uvicorn.workers.UvicornWorker -w 4 your_app:app
```
2. **Frontend Integration**:
- Host your customized version of the example HTML/JS in your web application
- Ensure WebSocket connection points to your server's address
3. **Nginx Configuration** (recommended for production):
```nginx
server {
listen 80;
server_name your-domain.com;
location / {
proxy_pass http://localhost:8000;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
proxy_set_header Host $host;
}
}
```
4. **HTTPS Support**: For secure deployments, use "wss://" instead of "ws://" in WebSocket URL
### 🐋 Docker
A basic Dockerfile is provided which allows re-use of Python package installation options. See the usage examples below:
**NOTE:** For **larger** models, ensure that your **docker runtime** has enough **memory** available.
#### All defaults
- Create a reusable image with only the basics and then run as a named container:
```bash
docker build -t whisperlivekit-defaults .
docker create --gpus all --name whisperlivekit -p 8000:8000 whisperlivekit-defaults
docker start -i whisperlivekit
```
> **Note**: If you're running on a system without NVIDIA GPU support (such as Mac with Apple Silicon or any system without CUDA capabilities), you need to **remove the `--gpus all` flag** from the `docker create` command. Without GPU acceleration, transcription will use CPU only, which may be significantly slower. Consider using small models for better performance on CPU-only systems.
#### Customization
- Customize the container options:
```bash
docker build -t whisperlivekit-defaults .
docker create --gpus all --name whisperlivekit-base -p 8000:8000 whisperlivekit-defaults --model base
docker start -i whisperlivekit-base
```
- `--build-arg` Options:
- `EXTRAS="whisper-timestamped"` - Add extras to the image's installation (no spaces). Remember to set necessary container options!
- `HF_PRECACHE_DIR="./.cache/"` - Pre-load a model cache for faster first-time start
- `HF_TKN_FILE="./token"` - Path to a file containing your Hugging Face Hub access token, needed to download gated models
## 🔮 Use Cases
- **Meeting Transcription**: Capture discussions in real-time
- **Accessibility Tools**: Help hearing-impaired users follow conversations
- **Content Creation**: Transcribe podcasts or videos automatically
- **Customer Service**: Transcribe support calls with speaker identification
## 🤝 Contributing
Contributions are welcome! Here's how to get started:
1. Fork the repository
2. Create a feature branch: `git checkout -b feature/amazing-feature`
3. Commit your changes: `git commit -m 'Add amazing feature'`
4. Push to your branch: `git push origin feature/amazing-feature`
5. Open a Pull Request
## 🙏 Acknowledgments
This project builds upon the foundational work of:
- [Whisper Streaming](https://github.com/ufal/whisper_streaming)
- [Diart](https://github.com/juanmc2005/diart)
- [OpenAI Whisper](https://github.com/openai/whisper)
We extend our gratitude to the original authors for their contributions.
## 📄 License
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
## 🔗 Links
- [GitHub Repository](https://github.com/QuentinFuxa/WhisperLiveKit)
- [PyPI Package](https://pypi.org/project/whisperlivekit/)
- [Issue Tracker](https://github.com/QuentinFuxa/WhisperLiveKit/issues)

BIN
demo.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 469 KiB

After

Width:  |  Height:  |  Size: 424 KiB

View File

@@ -1,8 +1,7 @@
from setuptools import setup, find_packages
setup(
name="whisperlivekit",
version="0.1.0",
version="0.1.4",
description="Real-time, Fully Local Whisper's Speech-to-Text and Speaker Diarization",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
@@ -22,13 +21,17 @@ setup(
"diarization": ["diart"],
"vac": ["torch"],
"sentence": ["mosestokenizer", "wtpsplit"],
"whisper": ["whisper"],
"whisper-timestamped": ["whisper-timestamped"],
"mlx-whisper": ["mlx-whisper"],
"openai": ["openai"],
},
package_data={
'whisperlivekit': ['web/*.html'],
},
entry_points={
'console_scripts': [
'whisperlivekit-server=whisperlivekit.server:run_server',
'whisperlivekit-server=whisperlivekit.basic_server:main',
],
},
classifiers=[

View File

@@ -6,7 +6,6 @@ import math
import logging
import traceback
from datetime import timedelta
from typing import List, Dict, Any
from whisperlivekit.timed_objects import ASRToken
from whisperlivekit.whisper_streaming_custom.whisper_online import online_factory
from whisperlivekit.core import WhisperLiveKit
@@ -39,7 +38,10 @@ class AudioProcessor:
self.bytes_per_sample = 2
self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample
self.max_bytes_per_sec = 32000 * 5 # 5 seconds of audio at 32 kHz
self.last_ffmpeg_activity = time()
self.ffmpeg_health_check_interval = 5
self.ffmpeg_max_idle_time = 10
# State management
self.tokens = []
self.buffer_transcription = ""
@@ -78,14 +80,50 @@ class AudioProcessor:
async def restart_ffmpeg(self):
"""Restart the FFmpeg process after failure."""
logger.warning("Restarting FFmpeg process...")
if self.ffmpeg_process:
try:
self.ffmpeg_process.kill()
await asyncio.get_event_loop().run_in_executor(None, self.ffmpeg_process.wait)
# we check if process is still running
if self.ffmpeg_process.poll() is None:
logger.info("Terminating existing FFmpeg process")
self.ffmpeg_process.stdin.close()
self.ffmpeg_process.terminate()
# wait for termination with timeout
try:
await asyncio.wait_for(
asyncio.get_event_loop().run_in_executor(None, self.ffmpeg_process.wait),
timeout=5.0
)
except asyncio.TimeoutError:
logger.warning("FFmpeg process did not terminate, killing forcefully")
self.ffmpeg_process.kill()
await asyncio.get_event_loop().run_in_executor(None, self.ffmpeg_process.wait)
except Exception as e:
logger.warning(f"Error killing FFmpeg process: {e}")
logger.error(f"Error during FFmpeg process termination: {e}")
logger.error(traceback.format_exc())
# we start new process
try:
logger.info("Starting new FFmpeg process")
self.ffmpeg_process = self.start_ffmpeg_decoder()
self.pcm_buffer = bytearray()
self.last_ffmpeg_activity = time()
logger.info("FFmpeg process restarted successfully")
except Exception as e:
logger.error(f"Failed to restart FFmpeg process: {e}")
logger.error(traceback.format_exc())
# try again after 5s
await asyncio.sleep(5)
try:
self.ffmpeg_process = self.start_ffmpeg_decoder()
self.pcm_buffer = bytearray()
self.last_ffmpeg_activity = time()
logger.info("FFmpeg process restarted successfully on second attempt")
except Exception as e2:
logger.critical(f"Failed to restart FFmpeg process on second attempt: {e2}")
logger.critical(traceback.format_exc())
async def update_transcription(self, new_tokens, buffer, end_buffer, full_transcription, sep):
"""Thread-safe update of transcription with new data."""
@@ -154,21 +192,33 @@ class AudioProcessor:
while True:
try:
# Calculate buffer size based on elapsed time
elapsed_time = math.floor((time() - beg) * 10) / 10 # Round to 0.1 sec
current_time = time()
elapsed_time = math.floor((current_time - beg) * 10) / 10
buffer_size = max(int(32000 * elapsed_time), 4096)
beg = time()
beg = current_time
# Read chunk with timeout
# Detect idle state much more quickly
if current_time - self.last_ffmpeg_activity > self.ffmpeg_max_idle_time:
logger.warning(f"FFmpeg process idle for {current_time - self.last_ffmpeg_activity:.2f}s. Restarting...")
await self.restart_ffmpeg()
beg = time()
self.last_ffmpeg_activity = time()
continue
# Reduce timeout for reading from FFmpeg
try:
chunk = await asyncio.wait_for(
loop.run_in_executor(None, self.ffmpeg_process.stdout.read, buffer_size),
timeout=15.0
timeout=5.0 # Shorter timeout (5 seconds instead of 15)
)
if chunk:
self.last_ffmpeg_activity = time()
except asyncio.TimeoutError:
logger.warning("FFmpeg read timeout. Restarting...")
await self.restart_ffmpeg()
beg = time()
self.last_ffmpeg_activity = time()
continue
if not chunk:
@@ -366,7 +416,7 @@ class AudioProcessor:
logger.warning(f"Exception in results_formatter: {e}")
logger.warning(f"Traceback: {traceback.format_exc()}")
await asyncio.sleep(0.5) # Back off on error
async def create_tasks(self):
"""Create and start processing tasks."""
@@ -378,6 +428,35 @@ class AudioProcessor:
tasks.append(asyncio.create_task(self.diarization_processor(self.diarization)))
tasks.append(asyncio.create_task(self.ffmpeg_stdout_reader()))
# Monitor overall system health
async def watchdog():
    """Periodically report finished tasks and restart FFmpeg when it has been idle too long.

    Runs forever: every 10 seconds it logs any task in ``tasks`` that has
    completed, warns once FFmpeg has been idle for more than 15 seconds, and
    forces a restart via ``self.restart_ffmpeg()`` after more than 30 seconds.
    Errors raised during a check are logged and the loop continues.
    """
    while True:
        try:
            await asyncio.sleep(10)  # Check every 10 seconds instead of 60
            current_time = time()

            # Check for stalled tasks
            for i, task in enumerate(tasks):
                if not task.done():
                    continue
                # Task.exception() raises CancelledError for a cancelled task.
                # That is a BaseException, so it would bypass the handler
                # below and terminate the watchdog itself.
                exc = None if task.cancelled() else task.exception()
                task_name = task.get_name() if hasattr(task, 'get_name') else f"Task {i}"
                logger.error(f"{task_name} unexpectedly completed with exception: {exc}")

            # Check for FFmpeg process health with shorter thresholds
            ffmpeg_idle_time = current_time - self.last_ffmpeg_activity
            if ffmpeg_idle_time > 15:  # 15 seconds instead of 180
                logger.warning(f"FFmpeg idle for {ffmpeg_idle_time:.2f}s - may need attention")

                # Force restart after 30 seconds of inactivity (instead of 600)
                if ffmpeg_idle_time > 30:
                    logger.error("FFmpeg idle for too long, forcing restart")
                    await self.restart_ffmpeg()
        except Exception as e:
            logger.error(f"Error in watchdog task: {e}")
tasks.append(asyncio.create_task(watchdog()))
self.tasks = tasks
return self.results_formatter()
@@ -399,11 +478,34 @@ class AudioProcessor:
async def process_audio(self, message):
"""Process incoming audio data."""
try:
self.ffmpeg_process.stdin.write(message)
self.ffmpeg_process.stdin.flush()
except (BrokenPipeError, AttributeError) as e:
logger.warning(f"Error writing to FFmpeg: {e}. Restarting...")
await self.restart_ffmpeg()
self.ffmpeg_process.stdin.write(message)
self.ffmpeg_process.stdin.flush()
retry_count = 0
max_retries = 3
# Log periodic heartbeats showing ongoing audio proc
current_time = time()
if not hasattr(self, '_last_heartbeat') or current_time - self._last_heartbeat >= 10:
logger.debug(f"Processing audio chunk, last FFmpeg activity: {current_time - self.last_ffmpeg_activity:.2f}s ago")
self._last_heartbeat = current_time
while retry_count < max_retries:
try:
if not self.ffmpeg_process or not hasattr(self.ffmpeg_process, 'stdin') or self.ffmpeg_process.poll() is not None:
logger.warning("FFmpeg process not available, restarting...")
await self.restart_ffmpeg()
self.ffmpeg_process.stdin.write(message)
self.ffmpeg_process.stdin.flush()
self.last_ffmpeg_activity = time() # Update activity timestamp
return
except (BrokenPipeError, AttributeError, OSError) as e:
retry_count += 1
logger.warning(f"Error writing to FFmpeg: {e}. Retry {retry_count}/{max_retries}...")
if retry_count < max_retries:
await self.restart_ffmpeg()
await asyncio.sleep(0.5) # Shorter pause between retries
else:
logger.error("Maximum retries reached for FFmpeg process")
await self.restart_ffmpeg()
return

View File

@@ -0,0 +1,86 @@
from contextlib import asynccontextmanager
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from whisperlivekit import WhisperLiveKit
from whisperlivekit.audio_processor import AudioProcessor
import asyncio
import logging
import os
# Install the root handler and message format, then pin the root logger to
# WARNING so loggers inheriting from it stay quiet. This module's own logger
# is switched to DEBUG right after.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
_root_logger = logging.getLogger()
_root_logger.setLevel(logging.WARNING)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Shared WhisperLiveKit instance; stays None until the lifespan handler runs.
kit = None
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared ``WhisperLiveKit`` instance for the application.

    The instance is stored in the module-level ``kit`` variable before control
    is yielded. No statement follows the ``yield``, so this handler performs
    no teardown of its own.
    """
    global kit
    kit = WhisperLiveKit()
    yield
app = FastAPI(lifespan=lifespan)

# CORS is fully open: any origin, method and header is accepted.
# NOTE(review): a wildcard origin together with allow_credentials=True is very
# permissive - confirm this is intended outside local development.
_CORS_OPTIONS = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)
@app.get("/")
async def get():
    """Serve the page produced by ``kit.web_interface()`` at the root URL."""
    page_html = kit.web_interface()
    return HTMLResponse(page_html)
async def handle_websocket_results(websocket, results_generator):
    """Forward every item of an async results stream to a WebSocket as JSON.

    Args:
        websocket: Object exposing an awaitable ``send_json(payload)`` method.
        results_generator: Async iterable yielding the payloads to send.

    Any ``Exception`` raised while iterating or sending is logged as a warning
    and ends the forwarding loop; it is not re-raised.
    """
    try:
        async for payload in results_generator:
            await websocket.send_json(payload)
    except Exception as err:
        logger.warning(f"Error in WebSocket results handler: {err}")
@app.websocket("/asr")
async def websocket_endpoint(websocket: WebSocket):
    """Stream audio from a WebSocket client through an ``AudioProcessor``.

    Each binary message received from the client is handed to
    ``AudioProcessor.process_audio`` while a background task sends the
    processor's results back over the same socket as JSON. When the receive
    loop ends, the background task is cancelled and awaited, and the
    processor is then cleaned up.
    """
    audio_processor = AudioProcessor()

    await websocket.accept()
    logger.info("WebSocket connection opened.")

    results_generator = await audio_processor.create_tasks()
    websocket_task = asyncio.create_task(
        handle_websocket_results(websocket, results_generator)
    )

    try:
        while True:
            message = await websocket.receive_bytes()
            await audio_processor.process_audio(message)
    except WebSocketDisconnect:
        logger.warning("WebSocket disconnected.")
    finally:
        try:
            # cancel() only *requests* cancellation; awaiting the task makes
            # sure the sender has really stopped before cleanup starts.
            websocket_task.cancel()
            try:
                await websocket_task
            except asyncio.CancelledError:
                pass
        finally:
            # Always release the processor, even if waiting for the sender
            # task was itself interrupted.
            await audio_processor.cleanup()
            logger.info("WebSocket endpoint cleaned up.")
def main():
    """Entry point for the CLI command: launch the app with uvicorn."""
    import uvicorn

    # A kit built with transcription and diarization switched off is used only
    # as the source of the host/port values read from its ``args``.
    # NOTE(review): presumably this avoids loading models just to parse the
    # command line - confirm against WhisperLiveKit.
    temp_kit = WhisperLiveKit(transcription=False, diarization=False)
    cli_args = temp_kit.args

    server_options = {
        "host": cli_args.host,
        "port": cli_args.port,
        "reload": True,
        "log_level": "info",
    }
    uvicorn.run("whisperlivekit.basic_server:app", **server_options)


if __name__ == "__main__":
    main()

View File

@@ -1,4 +1,7 @@
from whisperlivekit.whisper_streaming_custom.whisper_online import backend_factory, warmup_asr
try:
from whisperlivekit.whisper_streaming_custom.whisper_online import backend_factory, warmup_asr
except ImportError:
from .whisper_streaming_custom.whisper_online import backend_factory, warmup_asr
from argparse import Namespace, ArgumentParser
def parse_args():
@@ -26,23 +29,21 @@ def parse_args():
parser.add_argument(
"--confidence-validation",
type=bool,
default=False,
action="store_true",
help="Accelerates validation of tokens using confidence scores. Transcription will be faster but punctuation might be less accurate.",
)
parser.add_argument(
"--diarization",
type=bool,
default=True,
help="Whether to enable speaker diarization.",
action="store_true",
default=False,
help="Enable speaker diarization.",
)
parser.add_argument(
"--transcription",
type=bool,
default=True,
help="To disable to only see live diarization results.",
"--no-transcription",
action="store_true",
help="Disable transcription to only see live diarization results.",
)
parser.add_argument(
@@ -51,15 +52,14 @@ def parse_args():
default=0.5,
help="Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.",
)
parser.add_argument(
"--model",
type=str,
default="tiny",
choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo".split(
","
),
help="Name size of the Whisper model to use (default: large-v2). The model is automatically downloaded from the model hub if not present in model cache dir.",
help="Name size of the Whisper model to use (default: tiny). Suggested values: tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo. The model is automatically downloaded from the model hub if not present in model cache dir.",
)
parser.add_argument(
"--model_cache_dir",
type=str,
@@ -102,12 +102,13 @@ def parse_args():
parser.add_argument(
"--vac-chunk-size", type=float, default=0.04, help="VAC sample size in seconds."
)
parser.add_argument(
"--vad",
"--no-vad",
action="store_true",
default=True,
help="Use VAD = voice activity detection, with the default parameters.",
help="Disable VAD (voice activity detection).",
)
parser.add_argument(
"--buffer_trimming",
type=str,
@@ -131,6 +132,12 @@ def parse_args():
)
args = parser.parse_args()
args.transcription = not args.no_transcription
args.vad = not args.no_vad
delattr(args, 'no_transcription')
delattr(args, 'no_vad')
return args
class WhisperLiveKit:

View File

View File

View File

@@ -38,7 +38,6 @@
transform: scale(0.95);
}
/* Shape inside the button */
.shape-container {
width: 25px;
height: 25px;
@@ -56,6 +55,10 @@
transition: all 0.3s ease;
}
#recordButton:disabled .shape {
background-color: #6e6d6d;
}
#recordButton.recording .shape {
border-radius: 5px;
width: 25px;
@@ -279,7 +282,7 @@
</div>
<div>
<label for="websocketInput">WebSocket URL:</label>
<input id="websocketInput" type="text" value="ws://localhost:8000/asr" />
<input id="websocketInput" type="text" />
</div>
</div>
</div>
@@ -304,6 +307,7 @@
let waveCanvas = document.getElementById("waveCanvas");
let waveCtx = waveCanvas.getContext("2d");
let animationFrame = null;
let waitingForStop = false;
waveCanvas.width = 60 * (window.devicePixelRatio || 1);
waveCanvas.height = 30 * (window.devicePixelRatio || 1);
waveCtx.scale(window.devicePixelRatio || 1, window.devicePixelRatio || 1);
@@ -315,6 +319,12 @@
const linesTranscriptDiv = document.getElementById("linesTranscript");
const timerElement = document.querySelector(".timer");
const host = window.location.hostname || "localhost";
const port = window.location.port || "8000";
const defaultWebSocketUrl = `ws://${host}:${port}/asr`;
websocketInput.value = defaultWebSocketUrl;
websocketUrl = defaultWebSocketUrl;
chunkSelector.addEventListener("change", () => {
chunkDuration = parseInt(chunkSelector.value);
});
@@ -346,10 +356,16 @@
websocket.onclose = () => {
if (userClosing) {
statusText.textContent = "WebSocket closed by user.";
if (!statusText.textContent.includes("Recording stopped. Processing final audio")) { // This is a bit of a hack. We should have a better way to handle this. eg. using a status code.
statusText.textContent = "Finished processing audio! Ready to record again.";
}
waitingForStop = false;
} else {
statusText.textContent =
"Disconnected from the WebSocket server. (Check logs if model is loading.)";
if (isRecording) {
stopRecording();
}
}
userClosing = false;
};
@@ -363,6 +379,27 @@
websocket.onmessage = (event) => {
const data = JSON.parse(event.data);
// Check for status messages
if (data.type === "ready_to_stop") {
console.log("Ready to stop, closing WebSocket");
// signal that we are not waiting for stop anymore
waitingForStop = false;
recordButton.disabled = false; // this should be elsewhere
console.log("Record button enabled");
//Now we can close the WebSocket
if (websocket) {
websocket.close();
websocket = null;
}
return;
}
// Handle normal transcription updates
const {
lines = [],
buffer_transcription = "",
@@ -494,8 +531,17 @@
}
}
function stopRecording() {
async function stopRecording() {
userClosing = true;
waitingForStop = true;
if (websocket && websocket.readyState === WebSocket.OPEN) {
// Send empty audio buffer as stop signal
const emptyBlob = new Blob([], { type: 'audio/webm' });
websocket.send(emptyBlob);
statusText.textContent = "Recording stopped. Processing final audio...";
}
if (recorder) {
recorder.stop();
recorder = null;
@@ -531,34 +577,67 @@
timerElement.textContent = "00:00";
startTime = null;
isRecording = false;
if (websocket) {
websocket.close();
websocket = null;
if (websocket && websocket.readyState === WebSocket.OPEN) {
try {
await websocket.send(JSON.stringify({
type: "stop",
message: "User stopped recording"
}));
statusText.textContent = "Recording stopped. Processing final audio...";
} catch (e) {
console.error("Could not send stop message:", e);
statusText.textContent = "Recording stopped. Error during final audio processing.";
websocket.close();
websocket = null;
}
}
isRecording = false;
updateUI();
}
async function toggleRecording() {
if (!isRecording) {
linesTranscriptDiv.innerHTML = "";
if (waitingForStop) {
console.log("Waiting for stop, early return");
return; // Early return, UI is already updated
}
console.log("Connecting to WebSocket");
try {
await setupWebSocket();
await startRecording();
// If we have an active WebSocket that's still processing, just restart audio capture
if (websocket && websocket.readyState === WebSocket.OPEN) {
await startRecording();
} else {
// If no active WebSocket or it's closed, create new one
await setupWebSocket();
await startRecording();
}
} catch (err) {
statusText.textContent = "Could not connect to WebSocket or access mic. Aborted.";
console.error(err);
}
} else {
console.log("Stopping recording");
stopRecording();
}
}
function updateUI() {
  // Reflect the current recording / waiting state in the button and status text.
  recordButton.classList.toggle("recording", isRecording);

  // Pick the status message once; waiting for the final audio takes priority
  // over the recording state.
  let statusMessage;
  if (waitingForStop) {
    statusMessage = "Please wait for processing to complete...";
  } else if (isRecording) {
    statusMessage = "Recording...";
  } else {
    statusMessage = "Click to start transcription";
  }
  statusText.textContent = statusMessage;

  // The button is locked only while the final audio is still being processed.
  const lockButton = Boolean(waitingForStop);
  recordButton.disabled = lockButton;
  console.log(lockButton ? "Record button disabled" : "Record button enabled");
}
recordButton.addEventListener("click", toggleRecording);

View File

@@ -3,7 +3,10 @@ import logging
import io
import soundfile as sf
import math
import torch
try:
import torch
except ImportError:
torch = None
from typing import List
import numpy as np
from whisperlivekit.timed_objects import ASRToken
@@ -102,8 +105,9 @@ class FasterWhisperASR(ASRBase):
model_size_or_path = modelsize
else:
raise ValueError("Either modelsize or model_dir must be set")
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "float32"
device = "auto" # Allow CTranslate2 to decide available device
compute_type = "auto" # Allow CTranslate2 to decide faster compute type
model = WhisperModel(
model_size_or_path,
@@ -249,8 +253,8 @@ class OpenaiApiASR(ASRBase):
no_speech_segments = []
if self.use_vad_opt:
for segment in segments.segments:
if segment["no_speech_prob"] > 0.8:
no_speech_segments.append((segment.get("start"), segment.get("end")))
if segment.no_speech_prob > 0.8:
no_speech_segments.append((segment.start, segment.end))
tokens = []
for word in segments.words:
start = word.start

View File

@@ -216,31 +216,54 @@ class OnlineASRProcessor:
"""
If the committed tokens form at least two sentences, chunk the audio
buffer at the end time of the penultimate sentence.
Also ensures chunking happens if audio buffer exceeds a time limit.
"""
buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE
if not self.committed:
if buffer_duration > self.buffer_trimming_sec:
chunk_time = self.buffer_time_offset + (buffer_duration / 2)
logger.debug(f"--- No speech detected, forced chunking at {chunk_time:.2f}")
self.chunk_at(chunk_time)
return
logger.debug("COMPLETED SENTENCE: " + " ".join(token.text for token in self.committed))
sentences = self.words_to_sentences(self.committed)
for sentence in sentences:
logger.debug(f"\tSentence: {sentence.text}")
if len(sentences) < 2:
return
# Keep the last two sentences.
while len(sentences) > 2:
sentences.pop(0)
chunk_time = sentences[-2].end
logger.debug(f"--- Sentence chunked at {chunk_time:.2f}")
self.chunk_at(chunk_time)
chunk_done = False
if len(sentences) >= 2:
while len(sentences) > 2:
sentences.pop(0)
chunk_time = sentences[-2].end
logger.debug(f"--- Sentence chunked at {chunk_time:.2f}")
self.chunk_at(chunk_time)
chunk_done = True
if not chunk_done and buffer_duration > self.buffer_trimming_sec:
last_committed_time = self.committed[-1].end
logger.debug(f"--- Not enough sentences, chunking at last committed time {last_committed_time:.2f}")
self.chunk_at(last_committed_time)
def chunk_completed_segment(self, res):
"""
Chunk the audio buffer based on segment-end timestamps reported by the ASR.
Also ensures chunking happens if audio buffer exceeds a time limit.
"""
buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE
if not self.committed:
if buffer_duration > self.buffer_trimming_sec:
chunk_time = self.buffer_time_offset + (buffer_duration / 2)
logger.debug(f"--- No speech detected, forced chunking at {chunk_time:.2f}")
self.chunk_at(chunk_time)
return
logger.debug("Processing committed tokens for segmenting")
ends = self.asr.segments_end_ts(res)
last_committed_time = self.committed[-1].end
last_committed_time = self.committed[-1].end
chunk_done = False
if len(ends) > 1:
logger.debug("Multiple segments available for chunking")
e = ends[-2] + self.buffer_time_offset
while len(ends) > 2 and e > last_committed_time:
ends.pop(-1)
@@ -248,11 +271,18 @@ class OnlineASRProcessor:
if e <= last_committed_time:
logger.debug(f"--- Segment chunked at {e:.2f}")
self.chunk_at(e)
chunk_done = True
else:
logger.debug("--- Last segment not within committed area")
else:
logger.debug("--- Not enough segments to chunk")
if not chunk_done and buffer_duration > self.buffer_trimming_sec:
logger.debug(f"--- Buffer too large, chunking at last committed time {last_committed_time:.2f}")
self.chunk_at(last_committed_time)
logger.debug("Segment chunking complete")
def chunk_at(self, time: float):
"""
Trim both the hypothesis and audio buffer at the given time.
@@ -358,7 +388,7 @@ class VACOnlineASRProcessor:
# Load a VAD model (e.g. Silero VAD)
import torch
model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
from silero_vad_iterator import FixedVADIterator
from .silero_vad_iterator import FixedVADIterator
self.vac = FixedVADIterator(model)
self.logfile = self.online.logfile