21 Commits

Author SHA1 Message Date
Quentin Fuxa
349c7dcb9e bump version ro 0.2.5 2025-08-13 10:04:31 +02:00
Quentin Fuxa
1c42b867cf Merge branch 'main' of https://github.com/QuentinFuxa/whisper_streaming_web 2025-08-13 10:04:04 +02:00
Quentin Fuxa
d4771e563e Increase END_SILENCE_DURATION to reduce false positives 2025-08-13 10:04:00 +02:00
Quentin Fuxa
b0a5fc0693 Merge pull request #155 from davidgumberg/keepawakescrolldown
frontend: Keep screen awake and scroll down when transcribing.
2025-08-13 10:02:52 +02:00
David Gumberg
3b96fb8776 frontend: Scroll down when appending transcription 2025-08-12 17:31:32 -07:00
David Gumberg
7f93c4b978 frontend: Don't let screen sleep when transcribing. 2025-08-12 17:30:57 -07:00
Quentin Fuxa
15c3df1cba warmup base whisper when using simulstreaming 2025-08-12 18:52:52 +02:00
Quentin Fuxa
7fb8e66c01 typo 2025-08-12 18:36:32 +02:00
Quentin Fuxa
728e1f1290 simulstreaming warmup is done for each instance of online, not for the backend 2025-08-12 18:35:04 +02:00
Quentin Fuxa
87b9ed6ecd nonspeech_prob from 1 to 0.5 2025-08-12 18:34:37 +02:00
Quentin Fuxa
38b4ebe8ba Handle 3 types of silences: Indicated by whisper, between tokens, and at the end of the input. Display them in the frontend 2025-08-11 17:56:57 +02:00
Quentin Fuxa
d098af3185 each SimulStreamingOnlineProcessor now contains PaddedAlignAttWhisper instance. SimulStreamingASR only contains loaded whisper model 2025-08-11 08:24:14 +02:00
Quentin Fuxa
4e56130a40 frontend supports dark theme 2025-08-11 08:22:23 +02:00
Quentin Fuxa
2bbdc70187 lags are now updated every 0.1s 2025-08-09 23:11:05 +02:00
Quentin Fuxa
b678a55f63 remove duplicate file 2025-08-09 23:10:34 +02:00
Quentin Fuxa
5491964e81 clean SimulStreamingOnlineProcessor initialization + audio processing 2025-08-09 20:16:27 +02:00
Quentin Fuxa
b05297a96d clean simulwhisper backend and online 2025-08-09 18:02:15 +02:00
Quentin Fuxa
197293e25e refactor(simulstreaming): extract backend + online module into separate files from whisper streaming 2025-08-08 18:07:51 +02:00
Quentin Fuxa
ba41c4ab56 Remove download_simulstreaming_backend 2025-08-08 18:06:40 +02:00
Quentin Fuxa
bda72b8bc0 setup.py to pyproject.toml. Remove <2.0.0 condition on numpy dep 2025-08-03 16:32:31 +02:00
Quentin Fuxa
bb6b9f4cb1 architecture diagram : available backends for whisper streaming & diarization 2025-08-03 12:25:36 +02:00
20 changed files with 773 additions and 788 deletions

View File

@@ -13,13 +13,16 @@
<a href="https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/License-MIT/Dual Licensed-dark_green"></a>
</p>
Built on [WhisperStreaming](https://github.com/ufal/whisper_streaming) and [SimulStreaming](https://github.com/ufal/SimulStreaming), WhisperLiveKit provides real-time speech transcription in your browser, with a ready-to-use backend and a simple, customizable frontend. ✨
WhisperLiveKit brings real-time speech transcription directly to your browser, with a ready-to-use backend+server and a simple frontend. ✨
Built on [SimulStreaming](https://github.com/ufal/SimulStreaming) (SOTA 2025) and [WhisperStreaming](https://github.com/ufal/whisper_streaming) (SOTA 2023) for transcription, plus [Streaming Sortformer](https://arxiv.org/abs/2507.18446) (SOTA 2025) and [Diart](https://github.com/juanmc2005/diart) (SOTA 2021) for diarization.
### Key Features
- **Real-time Transcription** - Locally (or on-prem) convert speech to text instantly as you speak
- **Speaker Diarization** - Identify different speakers in real-time using [Diart](https://github.com/juanmc2005/diart)
- **Speaker Diarization** - Identify different speakers in real-time. (⚠️ backend Streaming Sortformer in developement)
- **Multi-User Support** - Handle multiple users simultaneously with a single backend/server
- **Automatic Silence Chunking** Automatically chunks when no audio is detected to limit buffer size
- **Confidence Validation** Immediately validate high-confidence tokens for faster inference (WhisperStreaming only)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 342 KiB

After

Width:  |  Height:  |  Size: 382 KiB

59
pyproject.toml Normal file
View File

@@ -0,0 +1,59 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "whisperlivekit"
version = "0.2.5"
description = "Real-time, Fully Local Whisper's Speech-to-Text and Speaker Diarization"
readme = "README.md"
authors = [
{ name = "Quentin Fuxa" }
]
license = { file = "LICENSE" }
requires-python = ">=3.9"
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Multimedia :: Sound/Audio :: Speech"
]
dependencies = [
"fastapi",
"librosa",
"soundfile",
"faster-whisper",
"uvicorn",
"websockets"
]
[project.optional-dependencies]
diarization = ["diart"]
vac = ["torch"]
sentence = ["mosestokenizer", "wtpsplit"]
whisper = ["whisper"]
whisper-timestamped = ["whisper-timestamped"]
mlx-whisper = ["mlx-whisper"]
openai = ["openai"]
simulstreaming = [
"torch",
"tqdm",
"tiktoken",
'triton>=2.0.0,<3; platform_machine == "x86_64" and (sys_platform == "linux" or sys_platform == "linux2")'
]
[project.urls]
Homepage = "https://github.com/QuentinFuxa/WhisperLiveKit"
[project.scripts]
whisperlivekit-server = "whisperlivekit.basic_server:main"
[tool.setuptools]
packages = ["whisperlivekit", "whisperlivekit.diarization", "whisperlivekit.simul_whisper", "whisperlivekit.simul_whisper.whisper", "whisperlivekit.simul_whisper.whisper.assets", "whisperlivekit.simul_whisper.whisper.normalizers", "whisperlivekit.web", "whisperlivekit.whisper_streaming_custom"]
[tool.setuptools.package-data]
whisperlivekit = ["web/*.html"]
"whisperlivekit.simul_whisper.whisper.assets" = ["*.tiktoken", "*.npz"]

View File

@@ -1,54 +0,0 @@
from setuptools import setup, find_packages
setup(
name="whisperlivekit",
version="0.2.4.dev0",
description="Real-time, Fully Local Whisper's Speech-to-Text and Speaker Diarization",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
author="Quentin Fuxa",
url="https://github.com/QuentinFuxa/WhisperLiveKit",
packages=find_packages(),
install_requires=[
"fastapi",
"librosa",
"soundfile",
"faster-whisper",
"uvicorn",
"websockets",
],
extras_require={
"diarization": ["diart"],
"vac": ["torch"],
"sentence": ["mosestokenizer", "wtpsplit"],
"whisper": ["whisper"],
"whisper-timestamped": ["whisper-timestamped"],
"mlx-whisper": ["mlx-whisper"],
"openai": ["openai"],
"simulstreaming": [
"torch",
"tqdm",
"tiktoken",
"numpy<2.0.0",
"triton>=2.0.0,<3;platform_machine==\"x86_64\" and sys_platform==\"linux\" or sys_platform==\"linux2\"",
],
},
package_data={
'whisperlivekit': ['web/*.html'],
'whisperlivekit.simul_whisper.whisper.assets': ['*.tiktoken', '*.npz'],
},
entry_points={
'console_scripts': [
'whisperlivekit-server=whisperlivekit.basic_server:main',
],
},
classifiers=[
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Multimedia :: Sound/Audio :: Speech",
],
python_requires=">=3.9",
)

View File

@@ -1,4 +1,3 @@
from .download_simulstreaming_backend import download_simulstreaming_backend
from .audio_processor import AudioProcessor
from .core import TranscriptionEngine
from .parse_args import parse_args

View File

@@ -6,10 +6,9 @@ import logging
import traceback
from datetime import timedelta
from whisperlivekit.timed_objects import ASRToken
from whisperlivekit.whisper_streaming_custom.whisper_online import online_factory
from whisperlivekit.core import TranscriptionEngine
from whisperlivekit.core import TranscriptionEngine, online_factory
from whisperlivekit.ffmpeg_manager import FFmpegManager, FFmpegState
from .remove_silences import handle_silences
# Set up logging once
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
@@ -52,7 +51,6 @@ class AudioProcessor:
self.tokens = []
self.buffer_transcription = ""
self.buffer_diarization = ""
self.full_transcription = ""
self.end_buffer = 0
self.end_attributed_speaker = 0
self.lock = asyncio.Lock()
@@ -96,13 +94,12 @@ class AudioProcessor:
"""Convert PCM buffer in s16le format to normalized NumPy array."""
return np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0
async def update_transcription(self, new_tokens, buffer, end_buffer, full_transcription, sep):
async def update_transcription(self, new_tokens, buffer, end_buffer, sep):
"""Thread-safe update of transcription with new data."""
async with self.lock:
self.tokens.extend(new_tokens)
self.buffer_transcription = buffer
self.end_buffer = end_buffer
self.full_transcription = full_transcription
self.sep = sep
async def update_diarization(self, end_attributed_speaker, buffer_diarization=""):
@@ -129,12 +126,12 @@ class AudioProcessor:
# Calculate remaining times
remaining_transcription = 0
if self.end_buffer > 0:
remaining_transcription = max(0, round(current_time - self.beg_loop - self.end_buffer, 2))
remaining_transcription = max(0, round(current_time - self.beg_loop - self.end_buffer, 1))
remaining_diarization = 0
if self.tokens:
latest_end = max(self.end_buffer, self.tokens[-1].end if self.tokens else 0)
remaining_diarization = max(0, round(latest_end - self.end_attributed_speaker, 2))
remaining_diarization = max(0, round(latest_end - self.end_attributed_speaker, 1))
return {
"tokens": self.tokens.copy(),
@@ -153,7 +150,6 @@ class AudioProcessor:
self.tokens = []
self.buffer_transcription = self.buffer_diarization = ""
self.end_buffer = self.end_attributed_speaker = 0
self.full_transcription = self.last_response_content = ""
self.beg_loop = time()
async def ffmpeg_stdout_reader(self):
@@ -238,7 +234,6 @@ class AudioProcessor:
async def transcription_processor(self):
"""Process audio chunks for transcription."""
self.full_transcription = ""
self.sep = self.online.asr.sep
cumulative_pcm_duration_stream_time = 0.0
@@ -250,7 +245,7 @@ class AudioProcessor:
self.transcription_queue.task_done()
break
if not self.online: # Should not happen if queue is used
if not self.online:
logger.warning("Transcription processor: self.online not initialized.")
self.transcription_queue.task_done()
continue
@@ -277,8 +272,6 @@ class AudioProcessor:
if new_tokens:
validated_text = self.sep.join([t.text for t in new_tokens])
self.full_transcription += validated_text
if buffer_text.startswith(validated_text):
buffer_text = buffer_text[len(validated_text):].lstrip()
@@ -295,7 +288,7 @@ class AudioProcessor:
new_end_buffer = max(candidate_end_times)
await self.update_transcription(
new_tokens, buffer_text, new_end_buffer, self.full_transcription, self.sep
new_tokens, buffer_text, new_end_buffer, self.sep
)
self.transcription_queue.task_done()
@@ -344,6 +337,8 @@ class AudioProcessor:
async def results_formatter(self):
"""Format processing results for output."""
last_sent_trans = None
last_sent_diar = None
while True:
try:
ffmpeg_state = await self.ffmpeg_manager.get_state()
@@ -381,8 +376,8 @@ class AudioProcessor:
lines = []
last_end_diarized = 0
undiarized_text = []
# Process each token
current_time = time() - self.beg_loop
tokens = handle_silences(tokens, current_time)
for token in tokens:
speaker = token.speaker
@@ -447,10 +442,19 @@ class AudioProcessor:
' '.join([f"{line['speaker']} {line['text']}" for line in final_lines_for_response]) + \
f" | {buffer_transcription} | {buffer_diarization}"
if current_response_signature != self.last_response_content and \
(final_lines_for_response or buffer_transcription or buffer_diarization or response_status == "no_audio_detected"):
trans = state["remaining_time_transcription"]
diar = state["remaining_time_diarization"]
should_push = (
current_response_signature != self.last_response_content
or last_sent_trans is None
or round(trans, 1) != round(last_sent_trans, 1)
or round(diar, 1) != round(last_sent_diar, 1)
)
if should_push and (final_lines_for_response or buffer_transcription or buffer_diarization or response_status == "no_audio_detected" or trans > 0 or diar > 0):
yield response
self.last_response_content = current_response_signature
last_sent_trans = trans
last_sent_diar = diar
# Check for termination condition
if self.is_stopping:

View File

@@ -1,9 +1,12 @@
try:
from whisperlivekit.whisper_streaming_custom.whisper_online import backend_factory, warmup_asr
from whisperlivekit.whisper_streaming_custom.whisper_online import backend_factory
from whisperlivekit.whisper_streaming_custom.online_asr import VACOnlineASRProcessor, OnlineASRProcessor
except ImportError:
from .whisper_streaming_custom.whisper_online import backend_factory, warmup_asr
from .whisper_streaming_custom.whisper_online import backend_factory
from .whisper_streaming_custom.online_asr import VACOnlineASRProcessor, OnlineASRProcessor
from whisperlivekit.warmup import warmup_asr, warmup_online
from argparse import Namespace
import sys
class TranscriptionEngine:
_instance = None
@@ -22,7 +25,6 @@ class TranscriptionEngine:
"host": "localhost",
"port": 8000,
"warmup_file": None,
"confidence_validation": False,
"diarization": False,
"punctuation_split": False,
"min_chunk_size": 0.5,
@@ -34,15 +36,15 @@ class TranscriptionEngine:
"backend": "faster-whisper",
"vac": False,
"vac_chunk_size": 0.04,
"buffer_trimming": "segment",
"buffer_trimming_sec": 15,
"log_level": "DEBUG",
"ssl_certfile": None,
"ssl_keyfile": None,
"transcription": True,
"vad": True,
"segmentation_model": "pyannote/segmentation-3.0",
"embedding_model": "pyannote/embedding",
# whisperstreaming params:
"buffer_trimming": "segment",
"confidence_validation": False,
"buffer_trimming_sec": 15,
# simulstreaming params:
"frame_threshold": 25,
"beams": 1,
@@ -55,6 +57,10 @@ class TranscriptionEngine:
"static_init_prompt": None,
"max_context_tokens": None,
"model_path": './base.pt',
# diart params:
"segmentation_model": "pyannote/segmentation-3.0",
"embedding_model": "pyannote/embedding",
}
config_dict = {**defaults, **kwargs}
@@ -78,8 +84,32 @@ class TranscriptionEngine:
self.diarization = None
if self.args.transcription:
self.asr, self.tokenizer = backend_factory(self.args)
warmup_asr(self.asr, self.args.warmup_file)
if self.args.backend == "simulstreaming":
from simul_whisper import SimulStreamingASR
self.tokenizer = None
simulstreaming_kwargs = {}
for attr in ['frame_threshold', 'beams', 'decoder_type', 'audio_max_len', 'audio_min_len',
'cif_ckpt_path', 'never_fire', 'init_prompt', 'static_init_prompt',
'max_context_tokens', 'model_path']:
if hasattr(self.args, attr):
simulstreaming_kwargs[attr] = getattr(self.args, attr)
# Add segment_length from min_chunk_size
simulstreaming_kwargs['segment_length'] = getattr(self.args, 'min_chunk_size', 0.5)
simulstreaming_kwargs['task'] = self.args.task
size = self.args.model
self.asr = SimulStreamingASR(
modelsize=size,
lan=self.args.lan,
cache_dir=getattr(self.args, 'model_cache_dir', None),
model_dir=getattr(self.args, 'model_dir', None),
**simulstreaming_kwargs
)
else:
self.asr, self.tokenizer = backend_factory(self.args)
warmup_asr(self.asr, self.args.warmup_file) #for simulstreaming, warmup should be done in the online class not here
if self.args.diarization:
from whisperlivekit.diarization.diarization_online import DiartDiarization
@@ -90,3 +120,33 @@ class TranscriptionEngine:
)
TranscriptionEngine._initialized = True
def online_factory(args, asr, tokenizer, logfile=sys.stderr):
if args.backend == "simulstreaming":
from simul_whisper import SimulStreamingOnlineProcessor
online = SimulStreamingOnlineProcessor(
asr,
logfile=logfile,
)
# warmup_online(online, args.warmup_file)
elif args.vac:
online = VACOnlineASRProcessor(
args.min_chunk_size,
asr,
tokenizer,
logfile=logfile,
buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec),
confidence_validation = args.confidence_validation
)
else:
online = OnlineASRProcessor(
asr,
tokenizer,
logfile=logfile,
buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec),
confidence_validation = args.confidence_validation
)
return online

View File

@@ -1,32 +0,0 @@
import os
import requests
import inspect
def get_module_path():
return os.path.dirname(inspect.getfile(inspect.currentframe()))
GITHUB_API_URL = "https://api.github.com/repos/ufal/SimulStreaming/contents/simul_whisper/whisper"
RAW_BASE_URL = "https://raw.githubusercontent.com/ufal/SimulStreaming/main/simul_whisper/whisper"
TARGET_DIR = os.path.join(get_module_path(), "simul_whisper", "whisper")
def download_files_from_github(api_url, local_dir):
os.makedirs(local_dir, exist_ok=True)
response = requests.get(api_url)
response.raise_for_status()
items = response.json()
for item in items:
if item['type'] == 'file':
download_url = item['download_url']
file_name = item['name']
file_response = requests.get(download_url)
file_response.raise_for_status()
with open(os.path.join(local_dir, file_name), 'wb') as f:
f.write(file_response.content)
elif item['type'] == 'dir':
# Recursive call for subdirectories
download_files_from_github(item['url'], os.path.join(local_dir, item['name']))
def download_simulstreaming_backend():
print(f"Downloading files into {TARGET_DIR} ...")
download_files_from_github(GITHUB_API_URL, TARGET_DIR)
print("✅ Download of SimulStreaming backend files completed successfully.")

View File

@@ -0,0 +1,103 @@
from whisperlivekit.timed_objects import ASRToken
import re
MIN_SILENCE_DURATION = 4 #in seconds
END_SILENCE_DURATION = 8 #in seconds. you should keep it important to not have false positive when the model lag is important
def blank_to_silence(tokens):
full_string = ''.join([t.text for t in tokens])
patterns = [re.compile(r'(?:\s*\[BLANK_AUDIO\]\s*)+'), re.compile(r'(?:\s*\[typing\]\s*)+')]
matches = []
for pattern in patterns:
for m in pattern.finditer(full_string):
matches.append({
'start': m.start(),
'end': m.end()
})
if matches:
# cleaned = pattern.sub(' ', full_string).strip()
# print("Cleaned:", cleaned)
cumulated_len = 0
silence_token = None
cleaned_tokens = []
for token in tokens:
if matches:
start = cumulated_len
end = cumulated_len + len(token.text)
cumulated_len = end
if start >= matches[0]['start'] and end <= matches[0]['end']:
if silence_token: #previous token was already silence
silence_token.start = min(silence_token.start, token.start)
silence_token.end = max(silence_token.end, token.end)
else: #new silence
silence_token = ASRToken(
start=token.start,
end=token.end,
speaker=-2,
probability=0.95
)
else:
if silence_token: #there was silence but no more
if silence_token.end - silence_token.start >= MIN_SILENCE_DURATION:
cleaned_tokens.append(
silence_token
)
silence_token = None
matches.pop(0)
cleaned_tokens.append(token)
# print(cleaned_tokens)
return cleaned_tokens
return tokens
def no_token_to_silence(tokens):
new_tokens = []
silence_token = None
for token in tokens:
if token.speaker == -2:
if new_tokens and new_tokens[-1].speaker == -2: #if token is silence and previous one too
new_tokens[-1].end = token.end
else:
new_tokens.append(token)
last_end = new_tokens[-1].end if new_tokens else 0.0
if token.start - last_end >= MIN_SILENCE_DURATION: #if token is not silence but important gap
if new_tokens and new_tokens[-1].speaker == -2:
new_tokens[-1].end = token.start
else:
silence_token = ASRToken(
start=last_end,
end=token.start,
speaker=-2,
probability=0.95
)
new_tokens.append(silence_token)
if token.speaker != -2:
new_tokens.append(token)
return new_tokens
def ends_with_silence(tokens, current_time):
if not tokens:
return []
last_token = tokens[-1]
if tokens and current_time - last_token.end >= END_SILENCE_DURATION:
if last_token.speaker == -2:
last_token.end = current_time
else:
tokens.append(
ASRToken(
start=tokens[-1].end,
end=current_time,
speaker=-2,
probability=0.95
)
)
return tokens
def handle_silences(tokens, current_time):
tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text
tokens = no_token_to_silence(tokens)
tokens = ends_with_silence(tokens, current_time)
return tokens

View File

@@ -0,0 +1,6 @@
from .backend import SimulStreamingASR, SimulStreamingOnlineProcessor
__all__ = [
"SimulStreamingASR",
"SimulStreamingOnlineProcessor",
]

View File

@@ -0,0 +1,223 @@
import sys
import numpy as np
import logging
from typing import List, Tuple, Optional
import logging
from whisperlivekit.timed_objects import ASRToken, Transcript
from whisperlivekit.simul_whisper.license_simulstreaming import SIMULSTREAMING_LICENSE
from .whisper import load_model, tokenizer
import os
logger = logging.getLogger(__name__)
try:
import torch
from whisperlivekit.simul_whisper.config import AlignAttConfig
from whisperlivekit.simul_whisper.simul_whisper import PaddedAlignAttWhisper
from whisperlivekit.simul_whisper.whisper import tokenizer
except ImportError as e:
raise ImportError(
"""SimulStreaming dependencies are not available.
Please install WhisperLiveKit using pip install "whisperlivekit[simulstreaming]".""")
class SimulStreamingOnlineProcessor:
SAMPLING_RATE = 16000
def __init__(
self,
asr,
logfile=sys.stderr,
warmup_file=None
):
self.asr = asr
self.logfile = logfile
self.is_last = False
self.beg = 0.0
self.end = 0.0
self.cumulative_audio_duration = 0.0
self.committed: List[ASRToken] = []
self.last_result_tokens: List[ASRToken] = []
self.model = PaddedAlignAttWhisper(
cfg=asr.cfg,
loaded_model=asr.whisper_model)
if asr.tokenizer:
self.model.tokenizer = asr.tokenizer
def insert_audio_chunk(self, audio: np.ndarray, audio_stream_end_time: Optional[float] = None):
"""Append an audio chunk to be processed by SimulStreaming."""
# Convert numpy array to torch tensor
audio_tensor = torch.from_numpy(audio).float()
# Update timing
chunk_duration = len(audio) / self.SAMPLING_RATE
self.cumulative_audio_duration += chunk_duration
if audio_stream_end_time is not None:
self.end = audio_stream_end_time
else:
self.end = self.cumulative_audio_duration
self.model.insert_audio(audio_tensor)
def get_buffer(self):
return Transcript(
start=None,
end=None,
text='',
probability=None
)
def timestamped_text(self, tokens, generation):
# From the simulstreaming repo. self.model to self.asr.model
pr = generation["progress"]
if "result" not in generation:
split_words, split_tokens = self.model.tokenizer.split_to_word_tokens(tokens)
else:
split_words, split_tokens = generation["result"]["split_words"], generation["result"]["split_tokens"]
frames = [p["most_attended_frames"][0] for p in pr]
tokens = tokens.copy()
ret = []
for sw,st in zip(split_words,split_tokens):
b = None
for stt in st:
t,f = tokens.pop(0), frames.pop(0)
if t != stt:
raise ValueError(f"Token mismatch: {t} != {stt} at frame {f}.")
if b is None:
b = f
e = f
out = (b*0.02, e*0.02, sw)
ret.append(out)
logger.debug(f"TS-WORD:\t{' '.join(map(str, out))}")
return ret
def process_iter(self) -> Tuple[List[ASRToken], float]:
"""
Process accumulated audio chunks using SimulStreaming.
Returns a tuple: (list of committed ASRToken objects, float representing the audio processed up to time).
"""
try:
tokens, generation_progress = self.model.infer(is_last=self.is_last)
ts_words = self.timestamped_text(tokens, generation_progress)
new_tokens = []
for ts_word in ts_words:
start, end, word = ts_word
token = ASRToken(
start=start,
end=end,
text=word,
probability=0.95 # fake prob. Maybe we can extract it from the model?
)
new_tokens.append(token)
self.committed.extend(new_tokens)
return new_tokens, self.end
except Exception as e:
logger.exception(f"SimulStreaming processing error: {e}")
return [], self.end
def warmup(self, audio, init_prompt=""):
"""Warmup the SimulStreaming model."""
try:
self.model.insert_audio(audio)
self.model.infer(True)
self.model.refresh_segment(complete=True)
logger.info("SimulStreaming model warmed up successfully")
except Exception as e:
logger.exception(f"SimulStreaming warmup failed: {e}")
class SimulStreamingASR():
"""SimulStreaming backend with AlignAtt policy."""
sep = ""
def __init__(self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr, **kwargs):
logger.warning(SIMULSTREAMING_LICENSE)
self.logfile = logfile
self.transcribe_kargs = {}
self.original_language = None if lan == "auto" else lan
self.model_path = kwargs.get('model_path', './large-v3.pt')
self.frame_threshold = kwargs.get('frame_threshold', 25)
self.audio_max_len = kwargs.get('audio_max_len', 30.0)
self.audio_min_len = kwargs.get('audio_min_len', 0.0)
self.segment_length = kwargs.get('segment_length', 0.5)
self.beams = kwargs.get('beams', 1)
self.decoder_type = kwargs.get('decoder_type', 'greedy' if self.beams == 1 else 'beam')
self.task = kwargs.get('task', 'transcribe')
self.cif_ckpt_path = kwargs.get('cif_ckpt_path', None)
self.never_fire = kwargs.get('never_fire', False)
self.init_prompt = kwargs.get('init_prompt', None)
self.static_init_prompt = kwargs.get('static_init_prompt', None)
self.max_context_tokens = kwargs.get('max_context_tokens', None)
if model_dir is not None:
self.model_path = model_dir
elif modelsize is not None:
model_mapping = {
'tiny': './tiny.pt',
'base': './base.pt',
'small': './small.pt',
'medium': './medium.pt',
'medium.en': './medium.en.pt',
'large-v1': './large-v1.pt',
'base.en': './base.en.pt',
'small.en': './small.en.pt',
'tiny.en': './tiny.en.pt',
'large-v2': './large-v2.pt',
'large-v3': './large-v3.pt',
'large': './large-v3.pt'
}
self.model_path = model_mapping.get(modelsize, f'./{modelsize}.pt')
self.model = self.load_model(modelsize)
# Set up tokenizer for translation if needed
if self.task == "translate":
self.tokenizer = self.set_translate_task()
else:
self.tokenizer = None
def load_model(self, modelsize):
self.cfg = AlignAttConfig(
model_path=self.model_path,
segment_length=self.segment_length,
frame_threshold=self.frame_threshold,
language=self.original_language,
audio_max_len=self.audio_max_len,
audio_min_len=self.audio_min_len,
cif_ckpt_path=self.cif_ckpt_path,
decoder_type="beam",
beam_size=self.beams,
task=self.task,
never_fire=self.never_fire,
init_prompt=self.init_prompt,
max_context_tokens=self.max_context_tokens,
static_init_prompt=self.static_init_prompt,
)
model_name = os.path.basename(self.cfg.model_path).replace(".pt", "")
model_path = os.path.dirname(os.path.abspath(self.cfg.model_path))
self.whisper_model = load_model(name=model_name, download_root=model_path)
def set_translate_task(self):
"""Set up translation task."""
return tokenizer.get_tokenizer(
multilingual=True,
language=self.model.cfg.language,
num_languages=self.model.model.num_languages,
task="translate"
)
def transcribe(self, audio):
"""
Only used for warmup. It's a direct whisper call, not a simulstreaming call
"""
self.whisper_model.transcribe(audio, language=self.original_language)

View File

@@ -8,7 +8,7 @@ class SimulWhisperConfig:
'''Options that are common for all simul policies that could be implemented in SimulWhisper.'''
model_path: str
language: str = field(default="zh")
nonspeech_prob: float = 1.0
nonspeech_prob: float = 0.5
audio_min_len: float = 1.0
decoder_type: Literal["greedy","beam"] = "greedy"
beam_size: int = 5

View File

@@ -1,18 +1,5 @@
SIMULSTREAMING_LICENSE = f"""
{"*"*80}
SimulStreaming (https://github.com/ufal/SimulStreaming) is dual-licensed:
🔹 Non-Commercial Use
You may use SimulStreaming under the PolyForm Noncommercial License 1.0.0 if you obtain the code through the GitHub repository. This license is free of charge and comes with no obligations for non-commercial users.
🔸 Commercial Use
Understanding who uses SimulStreaming commercially helps us improve and
prioritize development. Therefore, we want to require registration of those who acquire a commercial licence.
We plan to make the commercial licenceses affordable to SMEs and individuals. We are considering to provide commercial licenses either for free or for symbolic one-time fee, and maybe also provide additional support. You can share your preference via the questionnaire https://forms.cloud.microsoft/e/7tCxb4gJfB.
You can also leave your contact there: https://forms.cloud.microsoft/e/7tCxb4gJfB to be notified when the commercial licenses become
available.
✉️ Contact
Dominik Macháček (https://ufal.mff.cuni.cz/dominik-machacek/), machacek@ufal.mff.cuni.cz
{"*"*80}
SimulStreaming backend is dual-licensed:
• Non-Commercial Use: PolyForm Noncommercial License 1.0.0.
• Commercial Use: Check SimulStreaming README (github.com/ufal/SimulStreaming) for more details.
"""

View File

@@ -15,7 +15,7 @@ from .beam import BeamPyTorchInference
from .eow_detection import fire_at_boundary, load_cif
import os
from token_buffer import TokenBuffer
from .token_buffer import TokenBuffer
import numpy as np
from .generation_progress import *
@@ -33,11 +33,14 @@ import wave
# - prompt -- static vs. non-static
# - context
class PaddedAlignAttWhisper:
def __init__(self, cfg: AlignAttConfig) -> None:
def __init__(self, cfg: AlignAttConfig, loaded_model=None) -> None:
self.log_segments = 0
model_name = os.path.basename(cfg.model_path).replace(".pt", "")
model_path = os.path.dirname(os.path.abspath(cfg.model_path))
self.model = load_model(name=model_name, download_root=model_path)
if loaded_model:
self.model = loaded_model
else:
self.model = load_model(name=model_name, download_root=model_path)
logger.info(f"Model dimensions: {self.model.dims}")

View File

@@ -1,73 +0,0 @@
import torch
import sys
class TokenBuffer:
def __init__(self, text="", tokenizer=None, device=None, prefix_token_ids=[]):
self.text = text
self.prefix_token_ids = prefix_token_ids
self.tokenizer = tokenizer
self.device = device
def as_token_ids(self, tokenizer=None):
if tokenizer is None:
tokenizer = self.tokenizer
if tokenizer is None:
raise ValueError("Tokenizer is not set.")
return self.prefix_token_ids + tokenizer.encode(self.text)
def as_tensor(self, device=None):
if device is None:
device = self.device
if device is None:
raise ValueError("Device is not set.")
tok_ids = self.as_token_ids()
return torch.tensor(tok_ids,
dtype=torch.long, device=device).unsqueeze(0)
def as_tensor_beam(self, beam, device=None):
t = self.as_tensor(device=device)
return t.repeat_interleave(beam, dim=0)
def as_text(self):
return self.text
@staticmethod
def empty(*a, **kw):
return TokenBuffer(*a,**kw)
@staticmethod
def from_text(text, *a, **kw):
return TokenBuffer(*a, text=text, **kw)
def is_empty(self):
return self.text is None or self.text == ""
def trim_words(self, num=1, after=0):
'''
num: how many words to trim from the beginning
after: how many characters to skip (length of the static prompt)
'''
tokenizer = self.tokenizer
assert tokenizer is not None, "Tokenizer is not set."
ids = tokenizer.encode(self.text[after:])
words, wids = self.tokenizer.split_to_word_tokens(ids)
print(words, file=sys.stderr)
print(wids, file=sys.stderr)
if not words:
return 0
self.text = self.text[:after] + "".join(words[num:])
return sum(len(wi) for wi in wids[:num])
def append_token_ids(self, token_ids):
tokenizer = self.tokenizer
assert tokenizer is not None, "Tokenizer is not set."
self.text += self.tokenizer.decode(token_ids)
def as_split_word_tokens(self):
tokenizer = self.tokenizer
assert tokenizer is not None, "Tokenizer is not set."
ids = tokenizer.encode(self.text)
return tokenizer.split_to_word_tokens(ids)

62
whisperlivekit/warmup.py Normal file
View File

@@ -0,0 +1,62 @@
import logging
logger = logging.getLogger(__name__)
def load_file(warmup_file=None, timeout=5):
import os
import tempfile
import librosa
if warmup_file is None:
# Download JFK sample if not already present
jfk_url = "https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav"
temp_dir = tempfile.gettempdir()
warmup_file = os.path.join(temp_dir, "whisper_warmup_jfk.wav")
if not os.path.exists(warmup_file):
logger.debug(f"Downloading warmup file from {jfk_url}")
print(f"Downloading warmup file from {jfk_url}")
import time
import urllib.request
import urllib.error
import socket
original_timeout = socket.getdefaulttimeout()
socket.setdefaulttimeout(timeout)
start_time = time.time()
try:
urllib.request.urlretrieve(jfk_url, warmup_file)
logger.debug(f"Download successful in {time.time() - start_time:.2f}s")
except (urllib.error.URLError, socket.timeout) as e:
logger.warning(f"Download failed: {e}. Proceeding without warmup.")
return False
finally:
socket.setdefaulttimeout(original_timeout)
elif not warmup_file:
return False
if not warmup_file or not os.path.exists(warmup_file) or os.path.getsize(warmup_file) == 0:
logger.warning(f"Warmup file {warmup_file} invalid or missing.")
return False
try:
audio, sr = librosa.load(warmup_file, sr=16000)
except Exception as e:
logger.warning(f"Failed to load audio file: {e}")
return False
return audio
def warmup_asr(asr, warmup_file=None, timeout=5):
"""
Warmup the ASR model by transcribing a short audio file.
"""
audio = load_file(warmup_file=None, timeout=5)
asr.transcribe(audio)
logger.info("ASR model is warmed up")
def warmup_online(online, warmup_file=None, timeout=5):
audio = load_file(warmup_file=None, timeout=5)
online.warmup(audio)
logger.warning("ASR is warmed up")

View File

@@ -4,12 +4,87 @@
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Audio Transcription</title>
<title>WhisperLiveKit</title>
<style>
:root {
--bg: #ffffff;
--text: #111111;
--muted: #666666;
--border: #e5e5e5;
--chip-bg: rgba(0, 0, 0, 0.04);
--chip-text: #000000;
--spinner-border: #8d8d8d5c;
--spinner-top: #b0b0b0;
--silence-bg: #f3f3f3;
--loading-bg: rgba(255, 77, 77, 0.06);
--button-bg: #ffffff;
--button-border: #e9e9e9;
--wave-stroke: #000000;
--label-dia-text: #868686;
--label-trans-text: #111111;
}
@media (prefers-color-scheme: dark) {
:root:not([data-theme="light"]) {
--bg: #0b0b0b;
--text: #e6e6e6;
--muted: #9aa0a6;
--border: #333333;
--chip-bg: rgba(255, 255, 255, 0.08);
--chip-text: #e6e6e6;
--spinner-border: #555555;
--spinner-top: #dddddd;
--silence-bg: #1a1a1a;
--loading-bg: rgba(255, 77, 77, 0.12);
--button-bg: #111111;
--button-border: #333333;
--wave-stroke: #e6e6e6;
--label-dia-text: #b3b3b3;
--label-trans-text: #ffffff;
}
}
:root[data-theme="dark"] {
--bg: #0b0b0b;
--text: #e6e6e6;
--muted: #9aa0a6;
--border: #333333;
--chip-bg: rgba(255, 255, 255, 0.08);
--chip-text: #e6e6e6;
--spinner-border: #555555;
--spinner-top: #dddddd;
--silence-bg: #1a1a1a;
--loading-bg: rgba(255, 77, 77, 0.12);
--button-bg: #111111;
--button-border: #333333;
--wave-stroke: #e6e6e6;
--label-dia-text: #b3b3b3;
--label-trans-text: #ffffff;
}
:root[data-theme="light"] {
--bg: #ffffff;
--text: #111111;
--muted: #666666;
--border: #e5e5e5;
--chip-bg: rgba(0, 0, 0, 0.04);
--chip-text: #000000;
--spinner-border: #8d8d8d5c;
--spinner-top: #b0b0b0;
--silence-bg: #f3f3f3;
--loading-bg: rgba(255, 77, 77, 0.06);
--button-bg: #ffffff;
--button-border: #e9e9e9;
--wave-stroke: #000000;
--label-dia-text: #868686;
--label-trans-text: #111111;
}
body {
font-family: ui-sans-serif, system-ui, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji';
margin: 20px;
text-align: center;
background-color: var(--bg);
color: var(--text);
}
#recordButton {
@@ -17,10 +92,10 @@
height: 50px;
border: none;
border-radius: 50%;
background-color: white;
background-color: var(--button-bg);
cursor: pointer;
transition: all 0.3s ease;
border: 1px solid rgb(233, 233, 233);
border: 1px solid var(--button-border);
display: flex;
align-items: center;
justify-content: center;
@@ -94,14 +169,14 @@
.timer {
font-size: 14px;
font-weight: 500;
color: #333;
color: var(--text);
margin-left: 10px;
}
#status {
margin-top: 20px;
font-size: 16px;
color: #333;
color: var(--text);
}
.settings-container {
@@ -120,12 +195,14 @@
}
#chunkSelector,
#websocketInput {
#websocketInput,
#themeSelector {
font-size: 16px;
padding: 5px;
border-radius: 5px;
border: 1px solid #ddd;
background-color: #ffffff;
border: 1px solid var(--border);
background-color: var(--button-bg);
color: var(--text);
max-height: 30px;
}
@@ -134,7 +211,8 @@
}
#chunkSelector:focus,
#websocketInput:focus {
#websocketInput:focus,
#themeSelector:focus {
outline: none;
border-color: #007bff;
}
@@ -156,18 +234,18 @@
}
#linesTranscript strong {
color: #333;
color: var(--text);
}
#speaker {
border: 1px solid rgb(229, 229, 229);
border: 1px solid var(--border);
border-radius: 100px;
padding: 2px 10px;
font-size: 14px;
margin-bottom: 0px;
}
.label_diarization {
background-color: #ffffff66;
background-color: var(--chip-bg);
border-radius: 8px 8px 8px 8px;
padding: 2px 10px;
margin-left: 10px;
@@ -175,11 +253,11 @@
white-space: nowrap;
font-size: 14px;
margin-bottom: 0px;
color: rgb(134, 134, 134)
color: var(--label-dia-text)
}
.label_transcription {
background-color: #ffffff66;
background-color: var(--chip-bg);
border-radius: 8px 8px 8px 8px;
padding: 2px 10px;
display: inline-block;
@@ -187,11 +265,11 @@
margin-left: 10px;
font-size: 14px;
margin-bottom: 0px;
color: #000000
color: var(--label-trans-text)
}
#timeInfo {
color: #666;
color: var(--muted);
margin-left: 10px;
}
@@ -206,7 +284,7 @@
}
.buffer_diarization {
color: rgb(134, 134, 134);
color: var(--label-dia-text);
margin-left: 4px;
}
@@ -220,10 +298,10 @@
display: inline-block;
width: 8px;
height: 8px;
border: 2px solid #8d8d8d5c;
border-top: 2px solid #6c6c6ce5;
border: 2px solid var(--spinner-border);
border-top: 2px solid var(--spinner-top);
border-radius: 50%;
animation: spin 0.6s linear infinite;
animation: spin 0.7s linear infinite;
vertical-align: middle;
margin-bottom: 2px;
margin-right: 5px;
@@ -236,16 +314,16 @@
}
.silence {
color: #666;
background-color: #f3f3f3;
color: var(--muted);
background-color: var(--silence-bg);
font-size: 13px;
border-radius: 30px;
padding: 2px 10px;
}
.loading {
color: #666;
background-color: #ff4d4d0f;
color: var(--muted);
background-color: var(--loading-bg);
border-radius: 8px 8px 8px 0px;
padding: 2px 10px;
font-size: 14px;
@@ -284,6 +362,14 @@
<label for="websocketInput">WebSocket URL:</label>
<input id="websocketInput" type="text" />
</div>
<div>
<label for="themeSelector">Theme:</label>
<select id="themeSelector">
<option value="system" selected>System</option>
<option value="light">Light</option>
<option value="dark">Dark</option>
</select>
</div>
</div>
</div>
@@ -299,6 +385,7 @@
let chunkDuration = 1000;
let websocketUrl = "ws://localhost:8000/asr";
let userClosing = false;
let wakeLock = null;
let startTime = null;
let timerInterval = null;
let audioContext = null;
@@ -309,6 +396,7 @@
let animationFrame = null;
let waitingForStop = false;
let lastReceivedData = null;
let lastSignature = null;
waveCanvas.width = 60 * (window.devicePixelRatio || 1);
waveCanvas.height = 30 * (window.devicePixelRatio || 1);
waveCtx.scale(window.devicePixelRatio || 1, window.devicePixelRatio || 1);
@@ -319,6 +407,57 @@
const websocketInput = document.getElementById("websocketInput");
const linesTranscriptDiv = document.getElementById("linesTranscript");
const timerElement = document.querySelector(".timer");
const themeSelector = document.getElementById("themeSelector");
function getWaveStroke() {
const styles = getComputedStyle(document.documentElement);
const v = styles.getPropertyValue("--wave-stroke").trim();
return v || "#000";
}
let waveStroke = getWaveStroke();
function updateWaveStroke() {
waveStroke = getWaveStroke();
}
function applyTheme(pref) {
if (pref === "light") {
document.documentElement.setAttribute("data-theme", "light");
} else if (pref === "dark") {
document.documentElement.setAttribute("data-theme", "dark");
} else {
document.documentElement.removeAttribute("data-theme");
}
updateWaveStroke();
}
const savedThemePref = localStorage.getItem("themePreference") || "system";
applyTheme(savedThemePref);
if (themeSelector) {
themeSelector.value = savedThemePref;
themeSelector.addEventListener("change", () => {
const val = themeSelector.value;
localStorage.setItem("themePreference", val);
applyTheme(val);
});
}
const darkMq = window.matchMedia && window.matchMedia("(prefers-color-scheme: dark)");
const handleOsThemeChange = () => {
const pref = localStorage.getItem("themePreference") || "system";
if (pref === "system") updateWaveStroke();
};
if (darkMq && darkMq.addEventListener) {
darkMq.addEventListener("change", handleOsThemeChange);
} else if (darkMq && darkMq.addListener) {
darkMq.addListener(handleOsThemeChange);
}
function fmt1(x) {
const n = Number(x);
return Number.isFinite(n) ? n.toFixed(1) : x;
}
const host = window.location.hostname || "localhost";
const port = window.location.port;
@@ -446,10 +585,35 @@
function renderLinesWithBuffer(lines, buffer_diarization, buffer_transcription, remaining_time_diarization, remaining_time_transcription, isFinalizing = false, current_status = "active_transcription") {
if (current_status === "no_audio_detected") {
linesTranscriptDiv.innerHTML = "<p style='text-align: center; color: #666; margin-top: 20px;'><em>No audio detected...</em></p>";
linesTranscriptDiv.innerHTML = "<p style='text-align: center; color: var(--muted); margin-top: 20px;'><em>No audio detected...</em></p>";
return;
}
// try to keep stable DOM despite having updates every 0.1s. only update numeric lag values if structure hasn't changed
const showLoading = (!isFinalizing) && (lines || []).some(it => it.speaker == 0);
const showTransLag = !isFinalizing && remaining_time_transcription > 0;
const showDiaLag = !isFinalizing && !!buffer_diarization && remaining_time_diarization > 0;
const signature = JSON.stringify({
lines: (lines || []).map(it => ({ speaker: it.speaker, text: it.text, beg: it.beg, end: it.end })),
buffer_transcription: buffer_transcription || "",
buffer_diarization: buffer_diarization || "",
status: current_status,
showLoading,
showTransLag,
showDiaLag,
isFinalizing: !!isFinalizing
});
if (lastSignature === signature) {
const t = document.querySelector(".lag-transcription-value");
if (t) t.textContent = fmt1(remaining_time_transcription);
const d = document.querySelector(".lag-diarization-value");
if (d) d.textContent = fmt1(remaining_time_diarization);
const ld = document.querySelector(".loading-diarization-value");
if (ld) ld.textContent = fmt1(remaining_time_diarization);
return;
}
lastSignature = signature;
const linesHtml = lines.map((item, idx) => {
let timeInfo = "";
if (item.beg !== undefined && item.end !== undefined) {
@@ -460,7 +624,7 @@
if (item.speaker === -2) {
speakerLabel = `<span class="silence">Silence<span id='timeInfo'>${timeInfo}</span></span>`;
} else if (item.speaker == 0 && !isFinalizing) {
speakerLabel = `<span class='loading'><span class="spinner"></span><span id='timeInfo'>${remaining_time_diarization} second(s) of audio are undergoing diarization</span></span>`;
speakerLabel = `<span class='loading'><span class="spinner"></span><span id='timeInfo'><span class="loading-diarization-value">${fmt1(remaining_time_diarization)}</span> second(s) of audio are undergoing diarization</span></span>`;
} else if (item.speaker == -1) {
speakerLabel = `<span id="speaker">Speaker 1<span id='timeInfo'>${timeInfo}</span></span>`;
} else if (item.speaker !== -1 && item.speaker !== 0) {
@@ -471,12 +635,12 @@
let currentLineText = item.text || "";
if (idx === lines.length - 1) {
if (!isFinalizing) {
if (!isFinalizing && item.speaker !== -2) {
if (remaining_time_transcription > 0) {
speakerLabel += `<span class="label_transcription"><span class="spinner"></span>Transcription lag <span id='timeInfo'>${remaining_time_transcription}s</span></span>`;
speakerLabel += `<span class="label_transcription"><span class="spinner"></span>Transcription lag <span id='timeInfo'><span class="lag-transcription-value">${fmt1(remaining_time_transcription)}</span>s</span></span>`;
}
if (buffer_diarization && remaining_time_diarization > 0) {
speakerLabel += `<span class="label_diarization"><span class="spinner"></span>Diarization lag<span id='timeInfo'>${remaining_time_diarization}s</span></span>`;
speakerLabel += `<span class="label_diarization"><span class="spinner"></span>Diarization lag<span id='timeInfo'><span class="lag-diarization-value">${fmt1(remaining_time_diarization)}</span>s</span></span>`;
}
}
@@ -502,6 +666,7 @@
}).join("");
linesTranscriptDiv.innerHTML = linesHtml;
window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });
}
function updateTimer() {
@@ -522,7 +687,7 @@
waveCtx.clearRect(0, 0, waveCanvas.width / (window.devicePixelRatio || 1), waveCanvas.height / (window.devicePixelRatio || 1));
waveCtx.lineWidth = 1;
waveCtx.strokeStyle = 'rgb(0, 0, 0)';
waveCtx.strokeStyle = waveStroke;
waveCtx.beginPath();
const sliceWidth = (waveCanvas.width / (window.devicePixelRatio || 1)) / bufferLength;
@@ -549,6 +714,16 @@
async function startRecording() {
try {
// https://developer.mozilla.org/en-US/docs/Web/API/Screen_Wake_Lock_API
// create an async function to request a wake lock
try {
wakeLock = await navigator.wakeLock.request("screen");
} catch (err) {
// The Wake Lock request has failed - usually system related, such as battery.
console.log("Error acquiring wake lock.")
}
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
audioContext = new (window.AudioContext || window.webkitAudioContext)();
@@ -578,6 +753,10 @@
}
async function stopRecording() {
wakeLock.release().then(() => {
wakeLock = null;
});
userClosing = true;
waitingForStop = true;

View File

@@ -3,32 +3,10 @@ import logging
import io
import soundfile as sf
import math
try:
import torch
except ImportError:
torch = None
from typing import List
import numpy as np
from whisperlivekit.timed_objects import ASRToken
from whisperlivekit.simul_whisper.license_simulstreaming import SIMULSTREAMING_LICENSE
logger = logging.getLogger(__name__)
SIMULSTREAMING_ERROR_AND_INSTALLATION_INSTRUCTIONS = ImportError(
"""SimulStreaming dependencies are not available.
Please install WhisperLiveKit using pip install "whisperlivekit[simulstreaming]"
""")
try:
from whisperlivekit.simul_whisper.config import AlignAttConfig
from whisperlivekit.simul_whisper.simul_whisper import PaddedAlignAttWhisper, DEC_PAD
from whisperlivekit.simul_whisper.whisper import tokenizer
SIMULSTREAMING_AVAILABLE = True
except ImportError:
SIMULSTREAMING_AVAILABLE = False
AlignAttConfig = None
PaddedAlignAttWhisper = None
DEC_PAD = None
tokenizer = None
class ASRBase:
sep = " " # join transcribe words with this character (" " for whisper_timestamped,
# "" for faster-whisper because it emits the spaces when needed)
@@ -309,181 +287,4 @@ class OpenaiApiASR(ASRBase):
self.use_vad_opt = True
def set_translate_task(self):
self.task = "translate"
class SimulStreamingASR(ASRBase):
"""SimulStreaming backend with AlignAtt policy."""
sep = ""
def __init__(self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr, **kwargs):
if not SIMULSTREAMING_AVAILABLE:
raise SIMULSTREAMING_ERROR_AND_INSTALLATION_INSTRUCTIONS
logger.warning(SIMULSTREAMING_LICENSE)
self.logfile = logfile
self.transcribe_kargs = {}
self.original_language = None if lan == "auto" else lan
self.model_path = kwargs.get('model_path', './large-v3.pt')
self.frame_threshold = kwargs.get('frame_threshold', 25)
self.audio_max_len = kwargs.get('audio_max_len', 30.0)
self.audio_min_len = kwargs.get('audio_min_len', 0.0)
self.segment_length = kwargs.get('segment_length', 0.5)
self.beams = kwargs.get('beams', 1)
self.decoder_type = kwargs.get('decoder_type', 'greedy' if self.beams == 1 else 'beam')
self.task = kwargs.get('task', 'transcribe')
self.cif_ckpt_path = kwargs.get('cif_ckpt_path', None)
self.never_fire = kwargs.get('never_fire', False)
self.init_prompt = kwargs.get('init_prompt', None)
self.static_init_prompt = kwargs.get('static_init_prompt', None)
self.max_context_tokens = kwargs.get('max_context_tokens', None)
if model_dir is not None:
self.model_path = model_dir
elif modelsize is not None: #For the moment the .en.pt models do not work!
model_mapping = {
'tiny': './tiny.pt',
'base': './base.pt',
'small': './small.pt',
'medium': './medium.pt',
'medium.en': './medium.en.pt',
'large-v1': './large-v1.pt',
'base.en': './base.en.pt',
'small.en': './small.en.pt',
'tiny.en': './tiny.en.pt',
'large-v2': './large-v2.pt',
'large-v3': './large-v3.pt',
'large': './large-v3.pt'
}
self.model_path = model_mapping.get(modelsize, f'./{modelsize}.pt')
self.model = self.load_model(modelsize, cache_dir, model_dir)
# Set up tokenizer for translation if needed
if self.task == "translate":
self.set_translate_task()
def load_model(self, modelsize, cache_dir, model_dir):
try:
cfg = AlignAttConfig(
model_path=self.model_path,
segment_length=self.segment_length,
frame_threshold=self.frame_threshold,
language=self.original_language,
audio_max_len=self.audio_max_len,
audio_min_len=self.audio_min_len,
cif_ckpt_path=self.cif_ckpt_path,
decoder_type="beam",
beam_size=self.beams,
task=self.task,
never_fire=self.never_fire,
init_prompt=self.init_prompt,
max_context_tokens=self.max_context_tokens,
static_init_prompt=self.static_init_prompt,
)
logger.info(f"Loading SimulStreaming model with language: {self.original_language}")
model = PaddedAlignAttWhisper(cfg)
return model
except Exception as e:
logger.error(f"Failed to load SimulStreaming model: {e}")
raise
def transcribe(self, audio, init_prompt=""):
"""Transcribe audio using SimulStreaming."""
try:
if isinstance(audio, np.ndarray):
audio_tensor = torch.from_numpy(audio).float()
else:
audio_tensor = audio
prompt = init_prompt if init_prompt else (self.init_prompt or "")
result = self.model.infer(audio_tensor, init_prompt=prompt)
if torch.is_tensor(result):
result = result[result < DEC_PAD]
logger.debug(f"SimulStreaming transcription result: {result}")
return result
except Exception as e:
logger.error(f"SimulStreaming transcription failed: {e}")
raise
def ts_words(self, result) -> List[ASRToken]:
"""Convert SimulStreaming result to ASRToken list."""
tokens = []
try:
if torch.is_tensor(result):
text = self.model.tokenizer.decode(result.cpu().numpy())
else:
text = str(result)
if not text or len(text.strip()) == 0:
return tokens
# We dont have word-level timestamps here. 1rst approach, should be improved later.
words = text.strip().split()
if not words:
return tokens
duration_per_word = 0.1 # this will be modified based on actual audio duration
#with the SimulStreamingOnlineProcessor
for i, word in enumerate(words):
start_time = i * duration_per_word
end_time = (i + 1) * duration_per_word
token = ASRToken(
start=start_time,
end=end_time,
text=word,
probability=1.0
)
tokens.append(token)
except Exception as e:
logger.error(f"Error converting SimulStreaming result to tokens: {e}")
return tokens
def segments_end_ts(self, result) -> List[float]:
"""Get segment end timestamps."""
if torch.is_tensor(result):
num_tokens = len(result)
return [num_tokens * 0.1] # rough estimate
return [1.0]
def use_vad(self):
"""Enable VAD - SimulStreaming has different VAD handling."""
logger.info("VAD requested for SimulStreaming - handled internally by the model")
pass
def set_translate_task(self):
"""Set up translation task."""
try:
self.model.tokenizer = tokenizer.get_tokenizer(
multilingual=True,
language=self.model.cfg.language,
num_languages=self.model.model.num_languages,
task="translate"
)
logger.info("SimulStreaming configured for translation task")
except Exception as e:
logger.error(f"Failed to configure SimulStreaming for translation: {e}")
raise
def warmup(self, audio, init_prompt=""):
"""Warmup the SimulStreaming model."""
try:
if isinstance(audio, np.ndarray):
audio = torch.from_numpy(audio).float()
self.model.insert_audio(audio)
self.model.infer(True)
self.model.refresh_segment(complete=True)
logger.info("SimulStreaming model warmed up successfully")
except Exception as e:
logger.exception(f"SimulStreaming warmup failed: {e}")
self.task = "translate"

View File

@@ -6,18 +6,6 @@ from whisperlivekit.timed_objects import ASRToken, Sentence, Transcript
logger = logging.getLogger(__name__)
# simulStreaming imports - we check if the files are here
try:
import torch
from whisperlivekit.simul_whisper.config import AlignAttConfig
SIMULSTREAMING_AVAILABLE = True
except ImportError:
logger.warning("SimulStreaming dependencies not available for online processor.")
SIMULSTREAMING_AVAILABLE = False
OnlineProcessorInterface = None
torch = None
class HypothesisBuffer:
"""
Buffer to store and process ASR hypothesis tokens.
@@ -528,204 +516,3 @@ class VACOnlineASRProcessor:
"""
return self.online.concatenate_tokens(self.online.transcript_buffer.buffer)
class SimulStreamingOnlineProcessor:
SAMPLING_RATE = 16000
def __init__(
self,
asr,
tokenize_method: Optional[callable] = None,
buffer_trimming: Tuple[str, float] = ("segment", 15),
confidence_validation = False,
logfile=sys.stderr,
):
if not SIMULSTREAMING_AVAILABLE:
raise ImportError("SimulStreaming dependencies are not available.")
self.asr = asr
self.tokenize = tokenize_method
self.logfile = logfile
self.confidence_validation = confidence_validation
self.init()
# buffer does not work yet
self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
def init(self, offset: Optional[float] = None):
"""Initialize or reset the processing state."""
self.audio_chunks = []
self.offset = offset if offset is not None else 0.0
self.is_last = False
self.beg = self.offset
self.end = self.offset
self.cumulative_audio_duration = 0.0
self.last_audio_stream_end_time = self.offset
self.committed: List[ASRToken] = []
self.last_result_tokens: List[ASRToken] = []
self.buffer_content = ""
self.processed_audio_duration = 0.0
def get_audio_buffer_end_time(self) -> float:
"""Returns the absolute end time of the current audio buffer."""
return self.end
def insert_audio_chunk(self, audio: np.ndarray, audio_stream_end_time: Optional[float] = None):
"""Append an audio chunk to be processed by SimulStreaming."""
if torch is None:
raise ImportError("PyTorch is required for SimulStreaming but not available")
# Convert numpy array to torch tensor
audio_tensor = torch.from_numpy(audio).float()
self.audio_chunks.append(audio_tensor)
# Update timing
chunk_duration = len(audio) / self.SAMPLING_RATE
self.cumulative_audio_duration += chunk_duration
if audio_stream_end_time is not None:
self.last_audio_stream_end_time = audio_stream_end_time
self.end = audio_stream_end_time
else:
self.end = self.offset + self.cumulative_audio_duration
def prompt(self) -> Tuple[str, str]:
"""
Returns a tuple: (prompt, context).
SimulStreaming handles prompting internally, so we return empty strings.
"""
return "", ""
def get_buffer(self):
"""
Get the unvalidated buffer content.
"""
buffer_end = self.end if hasattr(self, 'end') else None
return Transcript(
start=None,
end=buffer_end,
text=self.buffer_content,
probability=None
)
def timestamped_text(self, tokens, generation):
# From the simulstreaming repo. self.model to self.asr.model
pr = generation["progress"]
if "result" not in generation:
split_words, split_tokens = self.asr.model.tokenizer.split_to_word_tokens(tokens)
else:
split_words, split_tokens = generation["result"]["split_words"], generation["result"]["split_tokens"]
frames = [p["most_attended_frames"][0] for p in pr]
tokens = tokens.copy()
ret = []
for sw,st in zip(split_words,split_tokens):
b = None
for stt in st:
t,f = tokens.pop(0), frames.pop(0)
if t != stt:
raise ValueError(f"Token mismatch: {t} != {stt} at frame {f}.")
if b is None:
b = f
e = f
out = (b*0.02, e*0.02, sw)
ret.append(out)
logger.debug(f"TS-WORD:\t{' '.join(map(str, out))}")
return ret
def process_iter(self) -> Tuple[List[ASRToken], float]:
"""
Process accumulated audio chunks using SimulStreaming.
Returns a tuple: (list of committed ASRToken objects, float representing the audio processed up to time).
"""
if not self.audio_chunks:
return [], self.end
try:
# concatenate all audio chunks
if len(self.audio_chunks) == 1:
audio = self.audio_chunks[0]
else:
audio = torch.cat(self.audio_chunks, dim=0)
audio_duration = audio.shape[0] / self.SAMPLING_RATE if audio.shape[0] > 0 else 0
self.processed_audio_duration += audio_duration
self.audio_chunks = []
logger.debug(f"SimulStreaming processing audio shape: {audio.shape}, duration: {audio_duration:.2f}s")
logger.debug(f"Current end time: {self.end:.2f}s, last stream time: {self.last_audio_stream_end_time:.2f}s")
self.asr.model.insert_audio(audio)
tokens, generation_progress = self.asr.model.infer(is_last=self.is_last)
ts_words = self.timestamped_text(tokens, generation_progress)
text = self.asr.model.tokenizer.decode(tokens)
new_tokens = []
for ts_word in ts_words:
start, end, word = ts_word
token = ASRToken(
start=start,
end=end,
text=word,
probability=0.95 # fake prob. Maybe we can extract it from the model?
)
new_tokens.append(token)
self.committed.extend(new_tokens)
return new_tokens, self.end
except Exception as e:
logger.exception(f"SimulStreaming processing error: {e}")
return [], self.end
def finish(self) -> Tuple[List[ASRToken], float]:
logger.debug("SimulStreaming finish() called")
self.is_last = True
final_tokens, final_time = self.process_iter()
self.is_last = False
return final_tokens, final_time
def concatenate_tokens(
self,
tokens: List[ASRToken],
sep: Optional[str] = None,
offset: float = 0
) -> Transcript:
"""Concatenate tokens into a Transcript object."""
sep = sep if sep is not None else self.asr.sep
text = sep.join(token.text for token in tokens)
probability = sum(token.probability for token in tokens if token.probability) / len(tokens) if tokens else None
if tokens:
start = offset + tokens[0].start
end = offset + tokens[-1].end
else:
start = None
end = None
return Transcript(start, end, text, probability=probability)
def chunk_at(self, time: float):
"""
useless but kept for compatibility
"""
logger.debug(f"SimulStreaming chunk_at({time:.2f}) - handled internally")
pass
def words_to_sentences(self, tokens: List[ASRToken]) -> List[Sentence]:
"""
Create simple sentences.
"""
if not tokens:
return []
full_text = " ".join(token.text for token in tokens)
sentence = Sentence(
start=tokens[0].start,
end=tokens[-1].end,
text=full_text
)
return [sentence]

View File

@@ -5,8 +5,7 @@ import librosa
from functools import lru_cache
import time
import logging
from .backends import FasterWhisperASR, MLXWhisper, WhisperTimestampedASR, OpenaiApiASR, SimulStreamingASR, SIMULSTREAMING_AVAILABLE, SIMULSTREAMING_ERROR_AND_INSTALLATION_INSTRUCTIONS
from .online_asr import OnlineASRProcessor, VACOnlineASRProcessor, SimulStreamingOnlineProcessor, SIMULSTREAMING_AVAILABLE as SIMULSTREAMING_ONLINE_AVAILABLE
from .backends import FasterWhisperASR, MLXWhisper, WhisperTimestampedASR, OpenaiApiASR
logger = logging.getLogger(__name__)
@@ -68,35 +67,7 @@ def backend_factory(args):
backend = args.backend
if backend == "openai-api":
logger.debug("Using OpenAI API.")
asr = OpenaiApiASR(lan=args.lan)
elif backend == "simulstreaming":
logger.debug("Using SimulStreaming backend.")
if not SIMULSTREAMING_AVAILABLE:
raise SIMULSTREAMING_ERROR_AND_INSTALLATION_INSTRUCTIONS
simulstreaming_kwargs = {}
for attr in ['frame_threshold', 'beams', 'decoder_type', 'audio_max_len', 'audio_min_len',
'cif_ckpt_path', 'never_fire', 'init_prompt', 'static_init_prompt',
'max_context_tokens', 'model_path']:
if hasattr(args, attr):
simulstreaming_kwargs[attr] = getattr(args, attr)
# Add segment_length from min_chunk_size
simulstreaming_kwargs['segment_length'] = getattr(args, 'min_chunk_size', 0.5)
simulstreaming_kwargs['task'] = args.task
size = args.model
t = time.time()
logger.info(f"Loading SimulStreaming {size} model for language {args.lan}...")
asr = SimulStreamingASR(
modelsize=size,
lan=args.lan,
cache_dir=getattr(args, 'model_cache_dir', None),
model_dir=getattr(args, 'model_dir', None),
**simulstreaming_kwargs
)
e = time.time()
logger.info(f"done. It took {round(e-t,2)} seconds.")
asr = OpenaiApiASR(lan=args.lan)
else:
if backend == "faster-whisper":
asr_cls = FasterWhisperASR
@@ -136,107 +107,4 @@ def backend_factory(args):
tokenizer = create_tokenizer(tgt_language)
else:
tokenizer = None
return asr, tokenizer
def online_factory(args, asr, tokenizer, logfile=sys.stderr):
if args.backend == "simulstreaming":
if not SIMULSTREAMING_ONLINE_AVAILABLE:
raise SIMULSTREAMING_ERROR_AND_INSTALLATION_INSTRUCTIONS
logger.debug("Creating SimulStreaming online processor")
online = SimulStreamingOnlineProcessor(
asr,
tokenizer,
logfile=logfile,
buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec),
confidence_validation=args.confidence_validation
)
elif args.vac:
online = VACOnlineASRProcessor(
args.min_chunk_size,
asr,
tokenizer,
logfile=logfile,
buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec),
confidence_validation = args.confidence_validation
)
else:
online = OnlineASRProcessor(
asr,
tokenizer,
logfile=logfile,
buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec),
confidence_validation = args.confidence_validation
)
return online
def asr_factory(args, logfile=sys.stderr):
"""
Creates and configures an ASR and ASR Online instance based on the specified backend and arguments.
"""
asr, tokenizer = backend_factory(args)
online = online_factory(args, asr, tokenizer, logfile=logfile)
return asr, online
def warmup_asr(asr, warmup_file=None, timeout=5):
"""
Warmup the ASR model by transcribing a short audio file.
"""
import os
import tempfile
is_simulstreaming = hasattr(asr, 'warmup') and callable(getattr(asr, 'warmup'))
if warmup_file is None:
# Download JFK sample if not already present
jfk_url = "https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav"
temp_dir = tempfile.gettempdir()
warmup_file = os.path.join(temp_dir, "whisper_warmup_jfk.wav")
if not os.path.exists(warmup_file):
logger.debug(f"Downloading warmup file from {jfk_url}")
print(f"Downloading warmup file from {jfk_url}")
import time
import urllib.request
import urllib.error
import socket
original_timeout = socket.getdefaulttimeout()
socket.setdefaulttimeout(timeout)
start_time = time.time()
try:
urllib.request.urlretrieve(jfk_url, warmup_file)
logger.debug(f"Download successful in {time.time() - start_time:.2f}s")
except (urllib.error.URLError, socket.timeout) as e:
logger.warning(f"Download failed: {e}. Proceeding without warmup.")
return False
finally:
socket.setdefaulttimeout(original_timeout)
elif not warmup_file:
return False
if not warmup_file or not os.path.exists(warmup_file) or os.path.getsize(warmup_file) == 0:
logger.warning(f"Warmup file {warmup_file} invalid or missing.")
return False
print(f"Warming up {'SimulStreaming' if is_simulstreaming else 'Whisper'} with {warmup_file}")
try:
import librosa
audio, sr = librosa.load(warmup_file, sr=16000)
except Exception as e:
logger.warning(f"Failed to load audio file: {e}")
return False
try:
if is_simulstreaming:
asr.warmup(audio)
else:
asr.transcribe(audio)
logger.info(f"{'SimulStreaming' if is_simulstreaming else 'Whisper'} is warmed up")
return True
except Exception as e:
logger.warning(f"Warmup failed: {e}")
return False
return asr, tokenizer