mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-03-07 22:33:36 +00:00
move files
This commit is contained in:
@@ -1,90 +0,0 @@
|
||||
import asyncio
|
||||
import re
|
||||
import threading
|
||||
import numpy as np
|
||||
|
||||
from diart import SpeakerDiarization
|
||||
from diart.inference import StreamingInference
|
||||
from diart.sources import AudioSource
|
||||
from src.whisper_streaming.timed_objects import SpeakerSegment
|
||||
|
||||
def extract_number(s: str) -> "int | None":
    """Return the first run of decimal digits found in ``s`` as an integer.

    Args:
        s: Text to scan, e.g. a speaker label containing a numeric id.

    Returns:
        The integer value of the first digit run, or ``None`` when ``s``
        contains no digit at all. Callers must handle the ``None`` case
        before doing arithmetic on the result.
    """
    m = re.search(r'\d+', s)
    if m is None:
        return None
    return int(m.group())
|
||||
|
||||
|
||||
class WebSocketAudioSource(AudioSource):
    """
    AudioSource that is fed manually instead of reading from a device.

    read() blocks the calling thread until close() has been called;
    PCM chunks are injected with push_audio().
    """

    def __init__(self, uri: str = "websocket", sample_rate: int = 16000):
        super().__init__(uri, sample_rate)
        # Event that releases read(); set exactly once, by close().
        self._close_event = threading.Event()
        self._closed = False

    def read(self):
        """Block until the source has been closed."""
        self._close_event.wait()

    def close(self):
        """Complete the stream and unblock read(); later calls are no-ops."""
        if self._closed:
            return
        self._closed = True
        self.stream.on_completed()
        self._close_event.set()

    def push_audio(self, chunk: np.ndarray):
        """Emit ``chunk`` on the stream with a new leading axis; ignored once closed."""
        if self._closed:
            return
        self.stream.on_next(np.expand_dims(chunk, axis=0))
|
||||
|
||||
|
||||
class DiartDiarization:
    """Streaming speaker diarization driven by audio pushed over a websocket.

    Builds a diart ``SpeakerDiarization`` pipeline fed by a
    ``WebSocketAudioSource`` and collects the speaker segments reported by
    the inference hook so they can be matched against transcription tokens.
    """

    def __init__(self, sample_rate: int):
        # Largest segment end (or audio extent end) reported by the hook so far.
        self.processed_time = 0
        # Segments drained from the queue by the most recent diarize() call.
        self.segment_speakers = []
        self.speakers_queue = asyncio.Queue()
        self.pipeline = SpeakerDiarization()
        self.source = WebSocketAudioSource(uri="websocket_source", sample_rate=sample_rate)
        self.inference = StreamingInference(
            pipeline=self.pipeline,
            source=self.source,
            do_plot=False,
            show_progress=False,
        )
        # Attach the hook function and start inference in the background.
        self.inference.attach_hooks(self._diar_hook)
        asyncio.get_event_loop().run_in_executor(None, self.inference)

    def _diar_hook(self, result):
        """Inference hook: queue one SpeakerSegment per label in ``result``.

        ``result`` is unpacked as ``(annotation, audio)``. When the annotation
        carries no labels, only ``processed_time`` is advanced, to the end of
        the audio extent.
        """
        annotation, audio = result
        # NOTE(review): relies on the private `_labels` mapping of the
        # annotation object — confirm against the installed diart/pyannote version.
        if annotation._labels:
            for speaker, label in annotation._labels.items():
                start = label.segments_boundaries_[0]
                end = label.segments_boundaries_[-1]
                if end > self.processed_time:
                    self.processed_time = end
                # NOTE(review): create_task needs a running event loop in the
                # calling thread — confirm which thread invokes this hook.
                asyncio.create_task(self.speakers_queue.put(SpeakerSegment(
                    speaker=speaker,
                    start=start,
                    end=end,
                )))
        else:
            dur = audio.extent.end
            if dur > self.processed_time:
                self.processed_time = dur

    async def diarize(self, pcm_array: np.ndarray):
        """Push ``pcm_array`` to the source and refresh ``segment_speakers``.

        The previous segment list is discarded and replaced by whatever is
        currently waiting in the queue.
        """
        self.source.push_audio(pcm_array)
        self.segment_speakers.clear()
        while not self.speakers_queue.empty():
            self.segment_speakers.append(await self.speakers_queue.get())

    def close(self):
        """Close the underlying audio source."""
        self.source.close()

    def assign_speakers_to_tokens(self, end_attributed_speaker, tokens: list) -> float:
        """Set ``token.speaker`` for every token overlapping a known segment.

        Args:
            end_attributed_speaker: End time up to which speakers had already
                been attributed before this call.
            tokens: Objects exposing ``start``, ``end`` and a writable
                ``speaker`` attribute.

        Returns:
            The updated attribution end time: the maximum of the incoming
            value and the end of every token that received a speaker.
        """
        for token in tokens:
            for segment in self.segment_speakers:
                # Overlap test: the intervals are disjoint only when one ends
                # before (or exactly when) the other starts.
                if segment.end <= token.start or segment.start >= token.end:
                    continue
                speaker_index = extract_number(segment.speaker)
                if speaker_index is None:
                    # Label without a numeric id: leave the token untouched
                    # instead of failing on `None + 1`.
                    continue
                # Speaker numbers exposed on tokens are 1-based.
                token.speaker = speaker_index + 1
                end_attributed_speaker = max(token.end, end_attributed_speaker)
        return end_attributed_speaker
|
||||
BIN
src/web/demo.png
BIN
src/web/demo.png
Binary file not shown.
|
Before Width: | Height: | Size: 202 KiB |
@@ -1,425 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Audio Transcription</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: 'Inter', sans-serif;
|
||||
margin: 20px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
#recordButton {
|
||||
width: 80px;
|
||||
height: 80px;
|
||||
border: none;
|
||||
border-radius: 50%;
|
||||
background-color: white;
|
||||
cursor: pointer;
|
||||
transition: background-color 0.3s ease, transform 0.2s ease;
|
||||
border: 1px solid rgb(252, 230, 229);
|
||||
border-bottom: 5px solid rgb(252, 230, 229);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
}
|
||||
|
||||
#recordButton.recording {
|
||||
background-color: rgba(255, 130, 123, 0.032);
|
||||
border: 1px solid rgb(240, 198, 195);
|
||||
border-bottom: 5px solid rgb(240, 198, 195);
|
||||
color: white;
|
||||
}
|
||||
|
||||
#recordButton:active {
|
||||
transform: scale(0.95);
|
||||
}
|
||||
|
||||
.shape-container {
|
||||
width: 40px;
|
||||
height: 40px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
}
|
||||
|
||||
.shape {
|
||||
width: 40px;
|
||||
height: 40px;
|
||||
background-color: rgb(209, 61, 53);
|
||||
border-radius: 50%;
|
||||
transition: border-radius 0.3s ease, background-color 0.3s ease;
|
||||
}
|
||||
|
||||
#recordButton.recording .shape {
|
||||
border-radius: 10px;
|
||||
width: 30px;
|
||||
height: 30px;
|
||||
|
||||
}
|
||||
|
||||
#status {
|
||||
margin-top: 20px;
|
||||
font-size: 16px;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.settings-container {
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
align-items: center;
|
||||
gap: 15px;
|
||||
margin-top: 20px;
|
||||
}
|
||||
|
||||
.settings {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: flex-start;
|
||||
gap: 5px;
|
||||
}
|
||||
|
||||
#chunkSelector,
|
||||
#websocketInput {
|
||||
font-size: 16px;
|
||||
padding: 5px;
|
||||
border-radius: 5px;
|
||||
border: 1px solid #ddd;
|
||||
background-color: #ffffff;
|
||||
max-height: 30px;
|
||||
}
|
||||
|
||||
#websocketInput {
|
||||
width: 200px;
|
||||
}
|
||||
|
||||
#chunkSelector:focus,
|
||||
#websocketInput:focus {
|
||||
outline: none;
|
||||
border-color: #007bff;
|
||||
}
|
||||
|
||||
label {
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
/* Speaker-labeled transcript area */
|
||||
#linesTranscript {
|
||||
margin: 20px auto;
|
||||
max-width: 600px;
|
||||
text-align: left;
|
||||
font-size: 16px;
|
||||
}
|
||||
|
||||
#linesTranscript p {
|
||||
margin: 0px 0;
|
||||
}
|
||||
|
||||
#linesTranscript strong {
|
||||
color: #333;
|
||||
}
|
||||
|
||||
#speaker {
|
||||
background-color: rgb(252, 230, 229);
|
||||
border-radius: 8px 8px 8px 0px;
|
||||
padding: 2px 10px;
|
||||
font-size: 14px;
|
||||
margin-bottom: 0px;
|
||||
}
|
||||
.label_diarization {
|
||||
background-color: #ffffff66;
|
||||
border-radius: 8px 8px 8px 8px;
|
||||
padding: 2px 10px;
|
||||
margin-left: 10px;
|
||||
font-size: 14px;
|
||||
margin-bottom: 0px;
|
||||
border-bottom: 3px solid rgb(155, 84, 84);
|
||||
color: rgb(155, 84, 84)
|
||||
}
|
||||
|
||||
.label_transcription {
|
||||
background-color: #ffffff66;
|
||||
border-radius: 8px 8px 8px 8px;
|
||||
padding: 2px 10px;
|
||||
margin-left: 10px;
|
||||
font-size: 14px;
|
||||
margin-bottom: 0px;
|
||||
border-bottom: 3px solid #8825255c;
|
||||
color: #8825255c
|
||||
}
|
||||
|
||||
#timeInfo {
|
||||
color: #666;
|
||||
margin-left: 10px;
|
||||
}
|
||||
|
||||
.textcontent {
|
||||
font-size: 16px;
|
||||
/* margin-left: 10px; */
|
||||
padding-left: 10px;
|
||||
border-left: 3px solid rgb(252, 230, 229);
|
||||
margin-bottom: 10px;
|
||||
margin-top: 1px;
|
||||
padding-top: 5px;
|
||||
border-radius: 0px 0px 0px 10px;
|
||||
}
|
||||
|
||||
.buffer_diarization {
|
||||
color: rgb(155, 84, 84);
|
||||
margin-left: 4px;
|
||||
}
|
||||
|
||||
.buffer_transcription {
|
||||
color: #8825255c;
|
||||
margin-left: 4px;
|
||||
}
|
||||
|
||||
|
||||
.spinner {
|
||||
display: inline-block;
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
border: 2px solid #8825255c;
|
||||
border-top: 2px solid #882525e5;
|
||||
border-radius: 50%;
|
||||
animation: spin 0.6s linear infinite;
|
||||
vertical-align: middle;
|
||||
margin-bottom: 2px;
|
||||
margin-right: 5px;
|
||||
}
|
||||
|
||||
@keyframes spin {
|
||||
to {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
|
||||
.silence {
|
||||
color: #666;
|
||||
background-color: #f3f3f3;
|
||||
font-size: 13px;
|
||||
border-radius: 30px;
|
||||
padding: 2px 10px;
|
||||
}
|
||||
|
||||
.loading {
|
||||
color: #666;
|
||||
background-color: #ff4d4d0f;
|
||||
border-radius: 8px 8px 8px 0px;
|
||||
padding: 2px 10px;
|
||||
font-size: 14px;
|
||||
margin-bottom: 0px;
|
||||
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="settings-container">
|
||||
<button id="recordButton">
|
||||
<div class="shape-container">
|
||||
<div class="shape"></div>
|
||||
</div>
|
||||
</button>
|
||||
<div class="settings">
|
||||
<div>
|
||||
<label for="chunkSelector">Chunk size (ms):</label>
|
||||
<select id="chunkSelector">
|
||||
<option value="500">500 ms</option>
|
||||
<option value="1000" selected>1000 ms</option>
|
||||
<option value="2000">2000 ms</option>
|
||||
<option value="3000">3000 ms</option>
|
||||
<option value="4000">4000 ms</option>
|
||||
<option value="5000">5000 ms</option>
|
||||
</select>
|
||||
</div>
|
||||
<div>
|
||||
<label for="websocketInput">WebSocket URL:</label>
|
||||
<input id="websocketInput" type="text" value="ws://localhost:8000/asr" />
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<p id="status"></p>
|
||||
|
||||
<!-- Speaker-labeled transcript -->
|
||||
<div id="linesTranscript"></div>
|
||||
|
||||
<script>
|
||||
// ---- Mutable client state (reassigned by the functions below) ----
let isRecording = false;
let websocket = null;
let recorder = null;
let chunkDuration = 1000;
let websocketUrl = "ws://localhost:8000/asr";
let userClosing = false;

// ---- Cached DOM handles ----
const statusText = document.getElementById("status");
const recordButton = document.getElementById("recordButton");
const chunkSelector = document.getElementById("chunkSelector");
const websocketInput = document.getElementById("websocketInput");
const linesTranscriptDiv = document.getElementById("linesTranscript");

// Keep the recorder chunk length in sync with the selector.
chunkSelector.addEventListener("change", function onChunkSizeChange() {
  chunkDuration = parseInt(chunkSelector.value);
});

// Accept a new server URL only when it uses a WebSocket scheme.
websocketInput.addEventListener("change", function onWebsocketUrlChange() {
  const candidate = websocketInput.value.trim();
  const hasWsScheme = candidate.startsWith("ws://") || candidate.startsWith("wss://");
  if (!hasWsScheme) {
    statusText.textContent = "Invalid WebSocket URL (must start with ws:// or wss://)";
    return;
  }
  websocketUrl = candidate;
  statusText.textContent = "WebSocket URL updated. Ready to connect.";
});
|
||||
|
||||
// Opens a WebSocket to `websocketUrl` and installs its handlers.
// The returned promise resolves when the connection opens and rejects when
// the constructor throws or the socket reports an error.
function setupWebSocket() {
  const connect = (resolve, reject) => {
    try {
      websocket = new WebSocket(websocketUrl);
    } catch (error) {
      statusText.textContent = "Invalid WebSocket URL. Please check and try again.";
      reject(error);
      return;
    }

    websocket.onopen = () => {
      statusText.textContent = "Connected to server.";
      resolve();
    };

    websocket.onclose = () => {
      // `userClosing` distinguishes a deliberate stop from a dropped connection.
      statusText.textContent = userClosing
        ? "WebSocket closed by user."
        : "Disconnected from the WebSocket server. (Check logs if model is loading.)";
      userClosing = false;
    };

    websocket.onerror = () => {
      statusText.textContent = "Error connecting to WebSocket.";
      reject(new Error("Error connecting to WebSocket"));
    };

    // Handle messages from server: each one carries the full transcript state.
    websocket.onmessage = (event) => {
      const {
        lines = [],
        buffer_transcription = "",
        buffer_diarization = "",
        remaining_time_transcription = 0,
        remaining_time_diarization = 0
      } = JSON.parse(event.data);

      renderLinesWithBuffer(
        lines,
        buffer_diarization,
        buffer_transcription,
        remaining_time_diarization,
        remaining_time_transcription
      );
    };
  };

  return new Promise(connect);
}
|
||||
|
||||
// Re-renders the whole transcript area from the latest server state.
//   lines: array of { speaker, text, beg, end } items
//   buffer_diarization / buffer_transcription: pending text appended to the last line
//   remaining_time_*: lag values shown next to the corresponding buffer
// Every server-provided value is HTML-escaped before being interpolated,
// because the result is assigned to innerHTML.
function renderLinesWithBuffer(lines, buffer_diarization, buffer_transcription, remaining_time_diarization, remaining_time_transcription) {
  const escapeHtml = (value) => {
    if (value === undefined || value === null) {
      return "";
    }
    return String(value)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&#39;");
  };

  const diarizationLag = escapeHtml(remaining_time_diarization);
  const transcriptionLag = escapeHtml(remaining_time_transcription);

  const linesHtml = lines.map((item, idx) => {
    let timeInfo = "";
    if (item.beg !== undefined && item.end !== undefined) {
      timeInfo = ` ${escapeHtml(item.beg)} - ${escapeHtml(item.end)}`;
    }

    // Speaker codes: -2 silence, 0 diarization still pending, -1 no speaker label.
    let speakerLabel = "";
    if (item.speaker === -2) {
      speakerLabel = `<span class="silence">Silence<span id='timeInfo'>${timeInfo}</span></span>`;
    } else if (item.speaker == 0) {
      speakerLabel = `<span class='loading'><span class="spinner"></span><span id='timeInfo'>${diarizationLag} second(s) of audio are undergoing diarization</span></span>`;
    } else if (item.speaker == -1) {
      speakerLabel = `<span id="speaker"><span id='timeInfo'>${timeInfo}</span></span>`;
    } else {
      speakerLabel = `<span id="speaker">Speaker ${escapeHtml(item.speaker)}<span id='timeInfo'>${timeInfo}</span></span>`;
    }

    // Pending buffers are only shown on the last line.
    const isLastLine = idx === lines.length - 1;
    let textContent = escapeHtml(item.text);
    if (isLastLine && buffer_diarization) {
      speakerLabel += `<span class="label_diarization"><span class="spinner"></span>Diarization lag<span id='timeInfo'>${diarizationLag}s</span></span>`
      textContent += `<span class="buffer_diarization">${escapeHtml(buffer_diarization)}</span>`;
    }
    if (isLastLine && buffer_transcription) {
      speakerLabel += `<span class="label_transcription"><span class="spinner"></span>Transcription lag <span id='timeInfo'>${transcriptionLag}s</span></span>`
      textContent += `<span class="buffer_transcription">${escapeHtml(buffer_transcription)}</span>`;
    }

    return textContent
      ? `<p>${speakerLabel}<br/><div class='textcontent'>${textContent}</div></p>`
      : `<p>${speakerLabel}<br/></p>`;
  }).join("");

  linesTranscriptDiv.innerHTML = linesHtml;
}
|
||||
|
||||
// Requests microphone access and starts a MediaRecorder that forwards each
// recorded chunk to the server. Any failure is reported in the status line
// rather than rethrown.
async function startRecording() {
  // Send a chunk only while the socket is open.
  const forwardChunk = (event) => {
    const socketReady = websocket && websocket.readyState === WebSocket.OPEN;
    if (socketReady) {
      websocket.send(event.data);
    }
  };

  try {
    const micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
    recorder = new MediaRecorder(micStream, { mimeType: "audio/webm" });
    recorder.ondataavailable = forwardChunk;
    recorder.start(chunkDuration);
    isRecording = true;
    updateUI();
  } catch (err) {
    statusText.textContent = "Error accessing microphone. Please allow microphone access.";
  }
}
|
||||
|
||||
// Stops recording, releases the microphone, closes the socket and refreshes the UI.
function stopRecording() {
  // Tell the onclose handler that this shutdown was requested by the user.
  userClosing = true;

  if (recorder) {
    const capturedStream = recorder.stream;
    recorder.stop();
    // Stopping the recorder does not release the capture device; stop the
    // tracks explicitly so the microphone is actually freed.
    if (capturedStream) {
      capturedStream.getTracks().forEach((track) => track.stop());
    }
    recorder = null;
  }
  isRecording = false;

  if (websocket) {
    websocket.close();
    websocket = null;
  }

  updateUI();
}
|
||||
|
||||
// Click handler: stops an active session, otherwise clears the transcript,
// connects to the server and then starts recording.
async function toggleRecording() {
  if (isRecording) {
    stopRecording();
    return;
  }

  linesTranscriptDiv.innerHTML = "";
  try {
    await setupWebSocket();
    await startRecording();
  } catch (err) {
    statusText.textContent = "Could not connect to WebSocket or access mic. Aborted.";
  }
}
|
||||
|
||||
// Mirrors `isRecording` onto the record button style and the status line.
function updateUI() {
  let statusMessage;
  if (isRecording) {
    statusMessage = "Recording...";
  } else {
    statusMessage = "Click to start transcription";
  }
  recordButton.classList.toggle("recording", isRecording);
  statusText.textContent = statusMessage;
}

// The record button toggles between starting and stopping a session.
recordButton.addEventListener("click", toggleRecording);
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
@@ -1,292 +0,0 @@
|
||||
import sys
|
||||
import logging
|
||||
import io
|
||||
import soundfile as sf
|
||||
import math
|
||||
import torch
|
||||
from typing import List
|
||||
import numpy as np
|
||||
from src.whisper_streaming.timed_objects import ASRToken
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ASRBase:
    """Common base for ASR backends.

    Subclasses implement ``load_model``, ``transcribe`` and ``use_vad``.
    The constructor stores the language, prepares an empty dict of extra
    transcription keyword arguments and loads the model.
    """

    sep = " "  # join transcribe words with this character (" " for whisper_timestamped,
               # "" for faster-whisper because it emits the spaces when needed)

    def __init__(self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr):
        """Store settings and load the model via ``self.load_model``.

        Args:
            lan: Language code; the value ``"auto"`` is stored as ``None``.
            modelsize: Forwarded to ``load_model``.
            cache_dir: Forwarded to ``load_model``.
            model_dir: Forwarded to ``load_model``.
            logfile: Stored on the instance as ``self.logfile``.
        """
        self.logfile = logfile
        self.transcribe_kargs = {}
        self.original_language = None if lan == "auto" else lan
        self.model = self.load_model(modelsize, cache_dir, model_dir)

    def with_offset(self, offset: float) -> "ASRToken":
        # This method is kept for compatibility (typically you will use ASRToken.with_offset)
        # NOTE(review): ASRBase never sets start/end/text, so calling this on an
        # ASR backend raises AttributeError; it looks copied from ASRToken.
        return ASRToken(self.start + offset, self.end + offset, self.text)

    def __repr__(self):
        # Describe the backend itself. The previous implementation read
        # self.start/end/text, which this class never defines, so repr() raised.
        language = getattr(self, "original_language", None)
        return f"{type(self).__name__}(language={language!r}, sep={self.sep!r})"

    def load_model(self, modelsize, cache_dir, model_dir):
        """Load and return the backend model. Must be overridden."""
        raise NotImplementedError("must be implemented in the child class")

    def transcribe(self, audio, init_prompt=""):
        """Transcribe ``audio``, optionally primed with ``init_prompt``. Must be overridden."""
        raise NotImplementedError("must be implemented in the child class")

    def use_vad(self):
        """Enable voice-activity detection for the backend. Must be overridden."""
        raise NotImplementedError("must be implemented in the child class")
|
||||
|
||||
|
||||
class WhisperTimestampedASR(ASRBase):
    """Uses whisper_timestamped as the backend."""

    sep = " "

    def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
        """Load a whisper model and keep a handle on ``transcribe_timestamped``.

        ``model_dir`` is not supported by this backend and is only logged.
        """
        import whisper
        import whisper_timestamped
        from whisper_timestamped import transcribe_timestamped

        if model_dir is not None:
            logger.debug("ignoring model_dir, not implemented")
        self.transcribe_timestamped = transcribe_timestamped
        return whisper.load_model(modelsize, download_root=cache_dir)

    def transcribe(self, audio, init_prompt=""):
        """Run ``transcribe_timestamped`` on ``audio`` and return its raw result."""
        return self.transcribe_timestamped(
            self.model,
            audio,
            language=self.original_language,
            initial_prompt=init_prompt,
            verbose=None,
            condition_on_previous_text=True,
            **self.transcribe_kargs,
        )

    def ts_words(self, r) -> List[ASRToken]:
        """
        Converts the whisper_timestamped result to a list of ASRToken objects.
        """
        words = []
        for segment in r["segments"]:
            words.extend(
                ASRToken(word["start"], word["end"], word["text"])
                for word in segment["words"]
            )
        return words

    def segments_end_ts(self, res) -> List[float]:
        """Return the ``end`` value of every segment in ``res``."""
        ends = []
        for segment in res["segments"]:
            ends.append(segment["end"])
        return ends

    def use_vad(self):
        """Pass ``vad=True`` to subsequent transcriptions."""
        self.transcribe_kargs["vad"] = True

    def set_translate_task(self):
        """Pass ``task="translate"`` to subsequent transcriptions."""
        self.transcribe_kargs["task"] = "translate"
|
||||
|
||||
|
||||
class FasterWhisperASR(ASRBase):
    """Uses faster-whisper as the backend."""

    sep = ""

    def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
        """Create a ``WhisperModel`` from ``model_dir`` or, failing that, ``modelsize``.

        Raises:
            ValueError: If neither ``model_dir`` nor ``modelsize`` is given.
        """
        from faster_whisper import WhisperModel

        if model_dir is None and modelsize is None:
            raise ValueError("Either modelsize or model_dir must be set")

        if model_dir is not None:
            logger.debug(f"Loading whisper model from model_dir {model_dir}. "
                         f"modelsize and cache_dir parameters are not used.")
            model_size_or_path = model_dir
        else:
            model_size_or_path = modelsize

        # Half precision on GPU, full precision on CPU.
        if torch.cuda.is_available():
            device, compute_type = "cuda", "float16"
        else:
            device, compute_type = "cpu", "float32"

        return WhisperModel(
            model_size_or_path,
            device=device,
            compute_type=compute_type,
            download_root=cache_dir,
        )

    def transcribe(self, audio: np.ndarray, init_prompt: str = "") -> list:
        """Transcribe ``audio`` with word timestamps and return the segments as a list."""
        segments, info = self.model.transcribe(
            audio,
            language=self.original_language,
            initial_prompt=init_prompt,
            beam_size=5,
            word_timestamps=True,
            condition_on_previous_text=True,
            **self.transcribe_kargs,
        )
        return list(segments)

    def ts_words(self, segments) -> List[ASRToken]:
        """Convert segment words to ASRToken objects, skipping likely non-speech segments."""
        words = []
        for segment in segments:
            # Segments with a no-speech probability above 0.9 are dropped.
            if not segment.no_speech_prob > 0.9:
                words.extend(
                    ASRToken(word.start, word.end, word.word)
                    for word in segment.words
                )
        return words

    def segments_end_ts(self, segments) -> List[float]:
        """Return the ``end`` attribute of every segment."""
        ends = []
        for segment in segments:
            ends.append(segment.end)
        return ends

    def use_vad(self):
        """Pass ``vad_filter=True`` to subsequent transcriptions."""
        self.transcribe_kargs["vad_filter"] = True

    def set_translate_task(self):
        """Pass ``task="translate"`` to subsequent transcriptions."""
        self.transcribe_kargs["task"] = "translate"
|
||||
|
||||
|
||||
class MLXWhisper(ASRBase):
    """
    Uses MLX Whisper optimized for Apple Silicon.
    """

    sep = ""

    def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
        """Resolve the model path, load it through ``ModelHolder`` and return ``transcribe``.

        The returned value (stored as ``self.model`` by the base class) is the
        ``mlx_whisper`` transcribe function, not a model object.

        Raises:
            ValueError: If neither ``model_dir`` nor ``modelsize`` is given,
                or if ``modelsize`` is not a known model name.
        """
        from mlx_whisper.transcribe import ModelHolder, transcribe
        import mlx.core as mx

        if model_dir is None and modelsize is None:
            raise ValueError("Either modelsize or model_dir must be set")

        if model_dir is not None:
            logger.debug(f"Loading whisper model from model_dir {model_dir}. modelsize parameter is not used.")
            model_size_or_path = model_dir
        else:
            model_size_or_path = self.translate_model_name(modelsize)
            logger.debug(f"Loading whisper model {modelsize}. You use mlx whisper, so {model_size_or_path} will be used.")

        self.model_size_or_path = model_size_or_path
        ModelHolder.get_model(model_size_or_path, mx.float16)
        return transcribe

    def translate_model_name(self, model_name):
        """Map a Whisper model name to its ``mlx-community`` repository path.

        Raises:
            ValueError: If ``model_name`` is not in the mapping.
        """
        model_mapping = {
            "tiny.en": "mlx-community/whisper-tiny.en-mlx",
            "tiny": "mlx-community/whisper-tiny-mlx",
            "base.en": "mlx-community/whisper-base.en-mlx",
            "base": "mlx-community/whisper-base-mlx",
            "small.en": "mlx-community/whisper-small.en-mlx",
            "small": "mlx-community/whisper-small-mlx",
            "medium.en": "mlx-community/whisper-medium.en-mlx",
            "medium": "mlx-community/whisper-medium-mlx",
            "large-v1": "mlx-community/whisper-large-v1-mlx",
            "large-v2": "mlx-community/whisper-large-v2-mlx",
            "large-v3": "mlx-community/whisper-large-v3-mlx",
            "large-v3-turbo": "mlx-community/whisper-large-v3-turbo",
            "large": "mlx-community/whisper-large-mlx",
        }
        mlx_model_path = model_mapping.get(model_name)
        if not mlx_model_path:
            raise ValueError(f"Model name '{model_name}' is not recognized or not supported.")
        return mlx_model_path

    def transcribe(self, audio, init_prompt=""):
        """Transcribe ``audio`` and return the list under the result's ``"segments"`` key.

        Extra transcribe kwargs set through ``use_vad`` / ``set_translate_task``
        are not forwarded; a warning is logged when any are present.
        """
        if self.transcribe_kargs:
            logger.warning("Transcribe kwargs (vad, task) are not compatible with MLX Whisper and will be ignored.")
        result = self.model(
            audio,
            language=self.original_language,
            initial_prompt=init_prompt,
            word_timestamps=True,
            condition_on_previous_text=True,
            path_or_hf_repo=self.model_size_or_path,
        )
        return result.get("segments", [])

    def ts_words(self, segments) -> List[ASRToken]:
        """Convert segment words to ASRToken objects, skipping likely non-speech segments."""
        words = []
        for segment in segments:
            # Segments with a no-speech probability above 0.9 are dropped.
            if not segment.get("no_speech_prob", 0) > 0.9:
                words.extend(
                    ASRToken(word["start"], word["end"], word["word"])
                    for word in segment.get("words", [])
                )
        return words

    def segments_end_ts(self, res) -> List[float]:
        """Return the ``"end"`` value of every segment in ``res``."""
        ends = []
        for segment in res:
            ends.append(segment["end"])
        return ends

    def use_vad(self):
        """Record ``vad_filter=True`` in the transcribe kwargs."""
        self.transcribe_kargs["vad_filter"] = True

    def set_translate_task(self):
        """Record ``task="translate"`` in the transcribe kwargs."""
        self.transcribe_kargs["task"] = "translate"
|
||||
|
||||
|
||||
class OpenaiApiASR(ASRBase):
    """Uses OpenAI's Whisper API for transcription."""

    def __init__(self, lan=None, temperature=0, logfile=sys.stderr):
        """Configure the API client.

        ``ASRBase.__init__`` is not called here; every attribute this class
        uses is set directly.

        Args:
            lan: Language code; the value ``"auto"`` is stored as ``None``.
            temperature: Sampling temperature sent with each request.
            logfile: Stored on the instance as ``self.logfile``.
        """
        self.logfile = logfile
        self.modelname = "whisper-1"
        self.original_language = None if lan == "auto" else lan
        self.response_format = "verbose_json"
        self.temperature = temperature
        self.load_model()
        self.use_vad_opt = False
        self.task = "transcribe"

    def load_model(self, *args, **kwargs):
        """Create the OpenAI client and reset the transcribed-seconds counter."""
        from openai import OpenAI
        self.client = OpenAI()
        self.transcribed_seconds = 0

    def ts_words(self, segments) -> List[ASRToken]:
        """
        Converts OpenAI API response words into ASRToken objects while
        optionally skipping words that fall into no-speech segments.
        """
        no_speech_segments = []
        if self.use_vad_opt:
            # NOTE(review): segments are accessed like dicts here while words
            # are accessed by attribute below — confirm against the response
            # type of the installed openai client.
            for segment in segments.segments:
                if segment["no_speech_prob"] > 0.8:
                    no_speech_segments.append((segment.get("start"), segment.get("end")))
        tokens = []
        for word in segments.words:
            start = word.start
            end = word.end
            # Drop words whose start falls inside a no-speech segment.
            if any(s[0] <= start <= s[1] for s in no_speech_segments):
                continue
            tokens.append(ASRToken(start, end, word.word))
        return tokens

    def segments_end_ts(self, res) -> List[float]:
        """Return the ``end`` of every word in ``res`` (words, not segments)."""
        return [s.end for s in res.words]

    def transcribe(self, audio_data, prompt=None, *args, **kwargs):
        """Send ``audio_data`` to the API and return the raw response.

        Args:
            audio_data: Audio samples, written to an in-memory 16 kHz
                PCM_16 WAV file before upload.
            prompt: Optional text used to prime the request.
            **kwargs: ``init_prompt`` is accepted as an alias for ``prompt``,
                because the other backends and the streaming processor pass
                the prompt under that name; other keys are ignored.

        Returns:
            The response object of the transcriptions (or translations) endpoint.
        """
        if prompt is None:
            # Without this, a prompt passed as init_prompt= would be dropped.
            prompt = kwargs.get("init_prompt")

        buffer = io.BytesIO()
        buffer.name = "temp.wav"
        sf.write(buffer, audio_data, samplerate=16000, format="WAV", subtype="PCM_16")
        buffer.seek(0)
        self.transcribed_seconds += math.ceil(len(audio_data) / 16000)
        params = {
            "model": self.modelname,
            "file": buffer,
            "response_format": self.response_format,
            "temperature": self.temperature,
            "timestamp_granularities": ["word", "segment"],
        }
        # The language is only sent for transcription, not for translation.
        if self.task != "translate" and self.original_language:
            params["language"] = self.original_language
        if prompt:
            params["prompt"] = prompt
        proc = self.client.audio.translations if self.task == "translate" else self.client.audio.transcriptions
        transcript = proc.create(**params)
        logger.debug(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds")
        return transcript

    def use_vad(self):
        """Enable filtering of words that fall in no-speech segments (see ``ts_words``)."""
        self.use_vad_opt = True

    def set_translate_task(self):
        """Route subsequent requests to the translations endpoint."""
        self.task = "translate"
|
||||
@@ -1,444 +0,0 @@
|
||||
import sys
|
||||
import numpy as np
|
||||
import logging
|
||||
from typing import List, Tuple, Optional
|
||||
from src.whisper_streaming.timed_objects import ASRToken, Sentence, Transcript
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HypothesisBuffer:
    """
    Holds ASR hypothesis tokens and decides which of them are confirmed.

    Attributes:
      - committed_in_buffer: tokens already confirmed (committed)
      - buffer: the previous hypothesis, not yet committed
      - new: tokens from the latest recognizer pass
    """

    def __init__(self, logfile=sys.stderr):
        self.committed_in_buffer: List[ASRToken] = []
        self.buffer: List[ASRToken] = []
        self.new: List[ASRToken] = []
        self.last_committed_time = 0.0
        self.last_committed_word: Optional[str] = None
        self.logfile = logfile

    def insert(self, new_tokens: List[ASRToken], offset: float):
        """
        Store ``new_tokens`` shifted by ``offset`` as the new hypothesis.

        Tokens starting more than 0.1 s before the last committed time are
        discarded. If the remaining hypothesis starts within 1 s of that time,
        a leading n-gram (n from 1 to 5) that repeats the tail of the committed
        tokens is removed.
        """
        shifted = [token.with_offset(offset) for token in new_tokens]
        self.new = [token for token in shifted if token.start > self.last_committed_time - 0.1]

        if not self.new:
            return
        if not abs(self.new[0].start - self.last_committed_time) < 1:
            return
        if not self.committed_in_buffer:
            return

        # Try to match 1 to 5 consecutive tokens, shortest n-gram first.
        max_ngram = min(len(self.committed_in_buffer), len(self.new), 5)
        for i in range(1, max_ngram + 1):
            committed_ngram = " ".join(token.text for token in self.committed_in_buffer[-i:])
            new_ngram = " ".join(token.text for token in self.new[:i])
            if committed_ngram != new_ngram:
                continue
            removed = [repr(self.new.pop(0)) for _ in range(i)]
            logger.debug(f"Removing last {i} words: {' '.join(removed)}")
            break

    def flush(self) -> List[ASRToken]:
        """
        Commit and return the longest common prefix (compared by text) of the
        previous hypothesis and the new tokens; the rest of the new tokens
        becomes the pending hypothesis.
        """
        committed: List[ASRToken] = []
        while self.new and self.buffer and self.new[0].text == self.buffer[0].text:
            confirmed = self.new.pop(0)
            self.buffer.pop(0)
            committed.append(confirmed)
            self.last_committed_word = confirmed.text
            self.last_committed_time = confirmed.end
        self.buffer = self.new
        self.new = []
        self.committed_in_buffer.extend(committed)
        return committed

    def pop_committed(self, time: float):
        """
        Drop leading committed tokens whose end is at or before ``time``.
        """
        while True:
            if not self.committed_in_buffer:
                break
            if self.committed_in_buffer[0].end > time:
                break
            self.committed_in_buffer.pop(0)
|
||||
|
||||
|
||||
|
||||
class OnlineASRProcessor:
|
||||
"""
|
||||
Processes incoming audio in a streaming fashion, calling the ASR system
|
||||
periodically, and uses a hypothesis buffer to commit and trim recognized text.
|
||||
|
||||
The processor supports two types of buffer trimming:
|
||||
- "sentence": trims at sentence boundaries (using a sentence tokenizer)
|
||||
- "segment": trims at fixed segment durations.
|
||||
"""
|
||||
SAMPLING_RATE = 16000
|
||||
|
||||
def __init__(
    self,
    asr,
    tokenize_method: Optional[callable] = None,
    buffer_trimming: Tuple[str, float] = ("segment", 15),
    logfile=sys.stderr,
):
    """
    Args:
        asr: ASR backend object providing `transcribe`, `ts_words`
            (token extraction), `segments_end_ts` and a separator
            attribute `sep`.
        tokenize_method: Function that receives text and returns a list of
            sentence strings.
        buffer_trimming: Tuple (option, seconds); option is either
            "sentence" or "segment".

    Raises:
        ValueError: If the option is unknown or seconds is not positive.
    """
    self.asr = asr
    self.tokenize = tokenize_method
    self.logfile = logfile

    # Buffers are created before the trimming settings are validated.
    self.init()

    trimming_way, trimming_sec = buffer_trimming
    self.buffer_trimming_way = trimming_way
    self.buffer_trimming_sec = trimming_sec

    if self.buffer_trimming_way not in ("sentence", "segment"):
        raise ValueError("buffer_trimming must be either 'sentence' or 'segment'")
    if self.buffer_trimming_sec <= 0:
        raise ValueError("buffer_trimming_sec must be positive")
    if self.buffer_trimming_sec > 30:
        logger.warning(
            f"buffer_trimming_sec is set to {self.buffer_trimming_sec}, which is very long. It may cause OOM."
        )
|
||||
|
||||
def init(self, offset: Optional[float] = None):
    """Create fresh audio and transcript buffers, starting at ``offset`` (0.0 when omitted)."""
    start_time = 0.0 if offset is None else offset
    self.audio_buffer = np.array([], dtype=np.float32)
    self.transcript_buffer = HypothesisBuffer(logfile=self.logfile)
    self.buffer_time_offset = start_time
    # The hypothesis buffer starts committing from the same point in time.
    self.transcript_buffer.last_committed_time = self.buffer_time_offset
    self.committed: List[ASRToken] = []
|
||||
|
||||
def insert_audio_chunk(self, audio: np.ndarray):
|
||||
"""Append an audio chunk (a numpy array) to the current audio buffer."""
|
||||
self.audio_buffer = np.append(self.audio_buffer, audio)
|
||||
|
||||
def prompt(self) -> Tuple[str, str]:
    """
    Split the committed text around the start of the current audio buffer.

    Returns `(prompt, context)`:
    - prompt: the trailing committed words that end at or before
      `buffer_time_offset`, taken from the end until about 200 characters
      have been collected (each word counts as its length plus one);
    - context: the committed words that end after `buffer_time_offset`.
    Both are joined with `self.asr.sep`.
    """
    # Walk back from the end to the first token that lies outside the buffer.
    split = len(self.committed)
    while split > 0 and self.committed[split - 1].end > self.buffer_time_offset:
        split -= 1
    before_buffer = self.committed[:split]
    inside_buffer = self.committed[split:]

    picked = []
    budget_used = 0
    for token in reversed(before_buffer):
        if budget_used >= 200:
            break
        budget_used += len(token.text) + 1
        picked.append(token.text)
    picked.reverse()  # restore chronological order

    separator = self.asr.sep
    return separator.join(picked), separator.join(token.text for token in inside_buffer)
|
||||
|
||||
def get_buffer(self):
    """
    Return the not-yet-committed hypothesis.

    The value is whatever `concatenate_tokens` builds from the tokens
    currently held in `transcript_buffer.buffer`.
    """
    pending_tokens = self.transcript_buffer.buffer
    return self.concatenate_tokens(pending_tokens)
|
||||
|
||||
|
||||
def process_iter(self) -> List[ASRToken]:
    """
    Transcribe the current audio buffer and commit what has become stable.

    Steps: build the prompt from earlier committed text, run the ASR on the
    whole audio buffer, feed the resulting tokens to the hypothesis buffer,
    flush the tokens it considers confirmed, then trim the audio buffer
    according to the configured trimming mode.

    Returns the tokens committed by this call (the value returned by
    `transcript_buffer.flush()`), not a `Transcript`.
    """
    prompt_text, _ = self.prompt()
    logger.debug(
        f"Transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:.2f} seconds from {self.buffer_time_offset:.2f}"
    )
    res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt_text)
    tokens = self.asr.ts_words(res)  # Expecting List[ASRToken]
    self.transcript_buffer.insert(tokens, self.buffer_time_offset)
    committed_tokens = self.transcript_buffer.flush()
    self.committed.extend(committed_tokens)

    # The two transcripts below are built for debug logging only.
    completed = self.concatenate_tokens(committed_tokens)
    logger.debug(f">>>> COMPLETE NOW: {completed.text}")
    incomp = self.concatenate_tokens(self.transcript_buffer.buffer)
    logger.debug(f"INCOMPLETE: {incomp.text}")

    # Sentence mode: trim only when something new was committed and the
    # buffer has grown past the configured length.
    if committed_tokens and self.buffer_trimming_way == "sentence":
        if len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec:
            self.chunk_completed_sentence()

    # Segment trimming always applies: at the configured length in
    # "segment" mode, otherwise as a fallback once 30 seconds is exceeded.
    s = self.buffer_trimming_sec if self.buffer_trimming_way == "segment" else 30
    if len(self.audio_buffer) / self.SAMPLING_RATE > s:
        self.chunk_completed_segment(res)
        logger.debug("Chunking segment")
    logger.debug(
        f"Length of audio buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:.2f} seconds"
    )
    return committed_tokens
|
||||
|
||||
def chunk_completed_sentence(self):
    """
    Trim the buffers at the end of the penultimate committed sentence.

    Does nothing when nothing has been committed or when the committed
    tokens form fewer than two sentences.
    """
    if not self.committed:
        return

    logger.debug("COMPLETED SENTENCE: " + " ".join(token.text for token in self.committed))
    sentences = self.words_to_sentences(self.committed)
    for sentence in sentences:
        logger.debug(f"\tSentence: {sentence.text}")

    if len(sentences) < 2:
        return

    # Only the last two sentences matter; the first of them marks the cut.
    del sentences[:-2]
    cut_point = sentences[0].end
    logger.debug(f"--- Sentence chunked at {cut_point:.2f}")
    self.chunk_at(cut_point)
|
||||
|
||||
def chunk_completed_segment(self, res):
    """
    Trim the buffers at a segment boundary reported by the ASR.

    res: the ASR result passed on to `self.asr.segments_end_ts`.

    The candidate cut is the end of the second-to-last segment (shifted by
    `buffer_time_offset`). While it lies after the last committed token and
    more than two segments remain, the last segment is dropped and the
    candidate is recomputed. The cut is applied only if it does not exceed
    the end of the last committed token.
    """
    if not self.committed:
        return

    ends = self.asr.segments_end_ts(res)
    last_committed_time = self.committed[-1].end

    if len(ends) <= 1:
        logger.debug("--- Not enough segments to chunk")
        return

    candidate = ends[-2] + self.buffer_time_offset
    while len(ends) > 2 and candidate > last_committed_time:
        ends.pop()
        candidate = ends[-2] + self.buffer_time_offset

    if not (candidate <= last_committed_time):
        logger.debug("--- Last segment not within committed area")
        return

    logger.debug(f"--- Segment chunked at {candidate:.2f}")
    self.chunk_at(candidate)
|
||||
|
||||
def chunk_at(self, time: float):
    """
    Trim both the hypothesis buffer and the audio buffer at `time`.

    time: absolute cut position in seconds.

    Committed hypotheses up to `time` are popped from the transcript
    buffer, the audio before `time` is dropped and `buffer_time_offset`
    is moved to `time`. A `time` that is not after the current
    `buffer_time_offset` leaves the audio buffer and the offset untouched.
    """
    logger.debug(f"Chunking at {time:.2f}s")
    logger.debug(
        f"Audio buffer length before chunking: {len(self.audio_buffer)/self.SAMPLING_RATE:.2f}s"
    )
    self.transcript_buffer.pop_committed(time)

    cut_seconds = time - self.buffer_time_offset
    if cut_seconds <= 0:
        # A negative slice start would keep only the tail of the audio and
        # move the offset backwards, misaligning every later timestamp.
        logger.debug(
            f"Chunk time {time:.2f}s is not after buffer start {self.buffer_time_offset:.2f}s; audio buffer left unchanged"
        )
        return

    self.audio_buffer = self.audio_buffer[int(cut_seconds * self.SAMPLING_RATE):]
    self.buffer_time_offset = time
    logger.debug(
        f"Audio buffer length after chunking: {len(self.audio_buffer)/self.SAMPLING_RATE:.2f}s"
    )
|
||||
|
||||
def words_to_sentences(self, tokens: List[ASRToken]) -> List[Sentence]:
    """
    Group tokens into Sentence objects using the configured sentence splitter.

    tokens: tokens in chronological order.

    The token texts are joined with single spaces and handed to
    `self.tokenize`. The splitter may be a callable or an object exposing a
    `split` method; it is first called with the text and, if that raises,
    with a one-element list. Without a splitter the whole text is treated
    as one sentence. Tokens are then assigned to sentences by accumulated
    text length, so the mapping is approximate.

    Returns an empty list for empty input.
    Raises ValueError("Tokenization failed") when the splitter cannot be
    applied in either calling convention.
    """
    if not tokens:
        return []

    full_text = " ".join(token.text for token in tokens)

    if self.tokenize:
        splitter = self.tokenize
        if not callable(splitter):
            # Tokenizer objects may only offer a `split` method.
            splitter = getattr(splitter, "split", None)
            if splitter is None:
                raise ValueError("Tokenization failed")
        try:
            sentence_texts = splitter(full_text)
        except Exception:
            # Some tokenizers (e.g., MosesSentenceSplitter) expect a list input.
            try:
                sentence_texts = splitter([full_text])
            except Exception as err:
                raise ValueError("Tokenization failed") from err
    else:
        sentence_texts = [full_text]

    sentences: List[Sentence] = []
    token_index = 0
    for sent_text in sentence_texts:
        sent_text = sent_text.strip()
        if not sent_text:
            continue
        sent_tokens = []
        accumulated = ""
        # Accumulate tokens until roughly matching the length of the sentence text.
        while token_index < len(tokens) and len(accumulated) < len(sent_text):
            token = tokens[token_index]
            accumulated = (accumulated + " " + token.text).strip() if accumulated else token.text
            sent_tokens.append(token)
            token_index += 1
        if sent_tokens:
            sentences.append(
                Sentence(
                    start=sent_tokens[0].start,
                    end=sent_tokens[-1].end,
                    text=" ".join(t.text for t in sent_tokens),
                )
            )
    return sentences
|
||||
def finish(self) -> Transcript:
    """
    Return the still-uncommitted hypothesis at the end of processing.

    Also advances `buffer_time_offset` by the duration of the audio that is
    currently buffered. The buffers themselves are not cleared here.
    """
    leftover = self.concatenate_tokens(self.transcript_buffer.buffer)
    logger.debug(f"Final non-committed transcript: {leftover}")

    buffered_seconds = len(self.audio_buffer) / self.SAMPLING_RATE
    self.buffer_time_offset += buffered_seconds
    return leftover
|
||||
|
||||
def concatenate_tokens(
    self,
    tokens: List[ASRToken],
    sep: Optional[str] = None,
    offset: float = 0
) -> Transcript:
    """
    Merge tokens into a single Transcript.

    tokens: tokens to merge, in order.
    sep: separator placed between token texts; `self.asr.sep` when None.
    offset: value added to the first token's start and the last token's end.

    For an empty token list, start and end are None and the text is empty.
    """
    joiner = self.asr.sep if sep is None else sep
    text = joiner.join(token.text for token in tokens)
    if not tokens:
        return Transcript(None, None, text)
    first, last = tokens[0], tokens[-1]
    return Transcript(offset + first.start, offset + last.end, text)
|
||||
|
||||
|
||||
class VACOnlineASRProcessor:
    """
    Wraps an OnlineASRProcessor with a Voice Activity Controller (VAC).

    It receives small chunks of audio, applies VAD (e.g. with Silero),
    and when the system detects a pause in speech (or end of an utterance)
    it finalizes the utterance immediately.
    """
    SAMPLING_RATE = 16000

    def __init__(self, online_chunk_size: float, *args, **kwargs):
        """
        online_chunk_size: amount of voiced audio (seconds) to accumulate
            before the wrapped processor is asked to transcribe.
        *args, **kwargs: forwarded unchanged to OnlineASRProcessor.
        """
        self.online_chunk_size = online_chunk_size
        self.online = OnlineASRProcessor(*args, **kwargs)

        # Load a VAD model (e.g. Silero VAD)
        import torch
        model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
        from src.whisper_streaming.silero_vad_iterator import FixedVADIterator

        self.vac = FixedVADIterator(model)
        self.logfile = self.online.logfile
        self.init()

    def init(self):
        """Reset the wrapped processor, the VAD state and the local buffers."""
        self.online.init()
        self.vac.reset_states()
        self.current_online_chunk_buffer_size = 0
        self.is_currently_final = False
        self.status: Optional[str] = None  # "voice" or "nonvoice"
        self.audio_buffer = np.array([], dtype=np.float32)
        self.buffer_offset = 0  # in frames

    def clear_buffer(self):
        """Drop the local audio buffer, advancing `buffer_offset` by its length."""
        self.buffer_offset += len(self.audio_buffer)
        self.audio_buffer = np.array([], dtype=np.float32)

    def insert_audio_chunk(self, audio: np.ndarray):
        """
        Process an incoming small audio chunk:
        - run VAD on the chunk,
        - decide whether to send the audio to the online ASR processor immediately,
        - and/or to mark the current utterance as finished.

        VAD positions are absolute sample indices; they are converted to
        indices into the local buffer and clamped at 0, because a position
        before the retained audio would otherwise act as a negative slice
        index and select audio counted from the end of the buffer.
        """
        res = self.vac(audio)
        self.audio_buffer = np.append(self.audio_buffer, audio)

        if res is not None:
            if "start" in res and "end" not in res:
                # Speech started: restart the online processor at that point
                # and hand over everything from the start onwards.
                frame = max(0, res["start"] - self.buffer_offset)
                self.status = "voice"
                send_audio = self.audio_buffer[frame:]
                self.online.init(offset=(frame + self.buffer_offset) / self.SAMPLING_RATE)
                self.online.insert_audio_chunk(send_audio)
                self.current_online_chunk_buffer_size += len(send_audio)
                self.clear_buffer()
            elif "end" in res and "start" not in res:
                # Speech ended: forward the audio up to the end and mark the
                # utterance as final.
                frame = max(0, res["end"] - self.buffer_offset)
                self.status = "nonvoice"
                send_audio = self.audio_buffer[:frame]
                self.online.insert_audio_chunk(send_audio)
                self.current_online_chunk_buffer_size += len(send_audio)
                self.is_currently_final = True
                self.clear_buffer()
            else:
                # A complete utterance (start and end) inside this call.
                beg = max(0, res["start"] - self.buffer_offset)
                end = max(0, res["end"] - self.buffer_offset)
                self.status = "nonvoice"
                send_audio = self.audio_buffer[beg:end]
                self.online.init(offset=(beg + self.buffer_offset) / self.SAMPLING_RATE)
                self.online.insert_audio_chunk(send_audio)
                self.current_online_chunk_buffer_size += len(send_audio)
                self.is_currently_final = True
                self.clear_buffer()
        else:
            if self.status == "voice":
                # Still inside speech: forward everything.
                self.online.insert_audio_chunk(self.audio_buffer)
                self.current_online_chunk_buffer_size += len(self.audio_buffer)
                self.clear_buffer()
            else:
                # Keep 1 second worth of audio in case VAD later detects voice,
                # but trim to avoid unbounded memory usage.
                self.buffer_offset += max(0, len(self.audio_buffer) - self.SAMPLING_RATE)
                self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]

    def process_iter(self) -> Transcript:
        """
        Depending on the VAD status and the amount of accumulated audio,
        process the current audio chunk.

        Returns the result of `finish()` when an utterance has just ended,
        the result of the wrapped processor's `process_iter()` once more than
        `online_chunk_size` seconds of voiced audio are pending, and an
        empty Transcript otherwise.
        """
        if self.is_currently_final:
            return self.finish()
        elif self.current_online_chunk_buffer_size > self.SAMPLING_RATE * self.online_chunk_size:
            self.current_online_chunk_buffer_size = 0
            return self.online.process_iter()
        else:
            logger.debug("No online update, only VAD")
            return Transcript(None, None, "")

    def finish(self) -> Transcript:
        """Finish processing by flushing any remaining text."""
        result = self.online.finish()
        self.current_online_chunk_buffer_size = 0
        self.is_currently_final = False
        return result

    def get_buffer(self):
        """
        Get the unvalidated buffer in string format.
        """
        return self.online.concatenate_tokens(self.online.transcript_buffer.buffer).text
|
||||
@@ -1,163 +0,0 @@
|
||||
import torch
|
||||
|
||||
# This is copied from silero-vad's vad_utils.py:
|
||||
# https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340
|
||||
# (except changed defaults)
|
||||
|
||||
# Their licence is MIT, same as ours: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
|
||||
|
||||
|
||||
class VADIterator:
    """Streaming wrapper that turns per-chunk speech probabilities into start/end events."""

    def __init__(
        self,
        model,
        threshold: float = 0.5,
        sampling_rate: int = 16000,
        min_silence_duration_ms: int = 500,  # makes sense on one recording that I checked
        speech_pad_ms: int = 100,  # same
    ):
        """
        Class for stream imitation

        Parameters
        ----------
        model: preloaded .jit silero VAD model

        threshold: float (default - 0.5)
            Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
            It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.

        sampling_rate: int (default - 16000)
            Currently silero VAD models support 8000 and 16000 sample rates

        min_silence_duration_ms: int (default - 500 milliseconds)
            In the end of each speech chunk wait for min_silence_duration_ms before separating it

        speech_pad_ms: int (default - 100 milliseconds)
            Final speech chunks are padded by speech_pad_ms each side

        Raises
        ------
        ValueError if sampling_rate is neither 8000 nor 16000.
        """

        self.model = model
        self.threshold = threshold
        self.sampling_rate = sampling_rate

        if sampling_rate not in [8000, 16000]:
            raise ValueError(
                "VADIterator does not support sampling rates other than [8000, 16000]"
            )

        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
        self.reset_states()

    def reset_states(self):
        """Reset the model state and the iterator's own counters."""
        self.model.reset_states()
        self.triggered = False
        self.temp_end = 0
        self.current_sample = 0

    def __call__(self, x, return_seconds=False):
        """
        x: torch.Tensor
            audio chunk (see examples in repo)

        return_seconds: bool (default - False)
            whether return timestamps in seconds (default - samples)

        Returns {"start": t} when speech begins, {"end": t} once silence has
        lasted at least min_silence_samples, and None otherwise.
        Raises TypeError when x cannot be converted to a tensor.
        """

        if not torch.is_tensor(x):
            try:
                x = torch.Tensor(x)
            except Exception as err:
                raise TypeError("Audio cannot be casted to tensor. Cast it manually") from err

        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
        self.current_sample += window_size_samples

        speech_prob = self.model(x, self.sampling_rate).item()

        # Speech resumed before the silence lasted long enough: forget the
        # tentative end.
        if (speech_prob >= self.threshold) and self.temp_end:
            self.temp_end = 0

        if (speech_prob >= self.threshold) and not self.triggered:
            self.triggered = True
            # The padding can place the start before sample 0.
            speech_start = self.current_sample - self.speech_pad_samples
            return {
                "start": (
                    int(speech_start)
                    if not return_seconds
                    else round(speech_start / self.sampling_rate, 1)
                )
            }

        # The end threshold sits 0.15 below the start threshold.
        if (speech_prob < self.threshold - 0.15) and self.triggered:
            if not self.temp_end:
                self.temp_end = self.current_sample
            if self.current_sample - self.temp_end < self.min_silence_samples:
                return None
            else:
                speech_end = self.temp_end + self.speech_pad_samples
                self.temp_end = 0
                self.triggered = False
                return {
                    "end": (
                        int(speech_end)
                        if not return_seconds
                        else round(speech_end / self.sampling_rate, 1)
                    )
                }

        return None
|
||||
|
||||
|
||||
#######################
|
||||
# because Silero now requires exactly 512-sized audio chunks
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class FixedVADIterator(VADIterator):
    """VADIterator variant that accepts audio of any length.

    Incoming audio is queued and fed to the parent iterator in windows of
    exactly 512 samples; a remainder shorter than 512 samples stays queued
    for the next call. When several events occur within one call, the
    result carries the start of the first segment and the end of the last
    one, or no end if speech is still ongoing.
    """

    def reset_states(self):
        super().reset_states()
        # Samples received but not yet passed to the parent iterator.
        self.buffer = np.array([], dtype=np.float32)

    def __call__(self, x, return_seconds=False):
        self.buffer = np.append(self.buffer, x)
        merged = None
        while len(self.buffer) >= 512:
            event = super().__call__(self.buffer[:512], return_seconds=return_seconds)
            self.buffer = self.buffer[512:]
            if merged is None:
                merged = event
                continue
            if event is None:
                continue
            if "end" in event:
                merged["end"] = event["end"]  # keep the latest end
            if "start" in event and "end" in merged:
                # A new start after an earlier end: merge both segments by
                # dropping the end.
                del merged["end"]
        return None if merged == {} else merged
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Demonstrates why FixedVADIterator is needed.

    import torch

    model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
    vac = FixedVADIterator(model)
    # With the plain VADIterator(model), the second call below crashes.

    # A chunk of exactly 512 samples works with both iterators.
    audio_buffer = np.zeros(512, dtype=np.float32)
    vac(audio_buffer)

    # A shorter chunk crashes the plain VADIterator with
    # ops.prim.RaiseException("Input audio chunk is too short", "builtins.ValueError")
    audio_buffer = np.zeros(512 - 1, dtype=np.float32)
    vac(audio_buffer)
|
||||
@@ -1,27 +0,0 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
@dataclass
class TimedText:
    """A piece of text with optional start/end times and a speaker id."""

    # Start and end time; both may be None.
    start: Optional[float]
    end: Optional[float]
    # Text content; empty by default.
    text: Optional[str] = ""
    # Speaker id; defaults to -1.
    speaker: Optional[int] = -1
|
||||
|
||||
@dataclass
class ASRToken(TimedText):
    """A single timed token produced by the ASR."""

    def with_offset(self, offset: float) -> "ASRToken":
        """Return a new token with the time offset added.

        Text and speaker are carried over unchanged.
        """
        return ASRToken(self.start + offset, self.end + offset, self.text, self.speaker)
|
||||
|
||||
@dataclass
class Sentence(TimedText):
    """Timed text for a sentence; adds no fields to TimedText."""
|
||||
|
||||
@dataclass
class Transcript(TimedText):
    """Timed text for a transcript; adds no fields to TimedText."""
|
||||
|
||||
@dataclass
class SpeakerSegment(TimedText):
    """Timed text for a speaker segment; adds no fields to TimedText."""
|
||||
@@ -1,235 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import numpy as np
|
||||
import librosa
|
||||
from functools import lru_cache
|
||||
import time
|
||||
import logging
|
||||
from .backends import FasterWhisperASR, MLXWhisper, WhisperTimestampedASR, OpenaiApiASR
|
||||
from .online_asr import OnlineASRProcessor, VACOnlineASRProcessor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
||||
WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(
|
||||
","
|
||||
)
|
||||
|
||||
|
||||
def create_tokenizer(lan):
    """Return a sentence splitter for the Whisper language code `lan`.

    Ukrainian uses `tokenize_uk`, languages covered by fast-mosestokenizer
    use `MosesSentenceSplitter`, and everything else uses a WtP model
    (without a language code when WtP does not know the language). The
    Ukrainian and WtP splitters are objects with a `split(text)` method.
    The required third-party package is imported only when selected.
    """
    assert (
        lan in WHISPER_LANG_CODES
    ), "language must be Whisper's supported lang code: " + " ".join(WHISPER_LANG_CODES)

    if lan == "uk":
        import tokenize_uk

        class UkrainianTokenizer:
            def split(self, text):
                return tokenize_uk.tokenize_sents(text)

        return UkrainianTokenizer()

    # supported by fast-mosestokenizer
    moses_languages = "as bn ca cs de el en es et fi fr ga gu hi hu is it kn lt lv ml mni mr nl or pa pl pt ro ru sk sl sv ta te yue zh".split()
    if lan in moses_languages:
        from mosestokenizer import MosesSentenceSplitter

        return MosesSentenceSplitter(lan)

    # the following languages are in Whisper, but not in wtpsplit:
    wtp_unsupported = "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split()
    if lan in wtp_unsupported:
        logger.debug(
            f"{lan} code is not supported by wtpsplit. Going to use None lang_code option."
        )
        lan = None

    from wtpsplit import WtP

    # downloads the model from huggingface on the first use
    wtp = WtP("wtp-canine-s-12l-no-adapters")

    class WtPtok:
        def split(self, sent):
            return wtp.split(sent, lang_code=lan)

    return WtPtok()
|
||||
|
||||
|
||||
def add_shared_args(parser):
    """Register the options shared by the simulation entry point and the server.

    parser: argparse.ArgumentParser object; it is modified in place.
    """
    parser.add_argument(
        "--min-chunk-size",
        type=float,
        default=1.0,
        help="Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="large-v3-turbo",
        choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo".split(
            ","
        ),
        # %(default)s keeps the help text in sync with the actual default.
        help="Name size of the Whisper model to use (default: %(default)s). The model is automatically downloaded from the model hub if not present in model cache dir.",
    )
    parser.add_argument(
        "--model_cache_dir",
        type=str,
        default=None,
        help="Overriding the default model cache dir where models downloaded from the hub are saved",
    )
    parser.add_argument(
        "--model_dir",
        type=str,
        default=None,
        help="Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter.",
    )
    parser.add_argument(
        "--lan",
        "--language",
        type=str,
        default="auto",
        help="Source language code, e.g. en,de,cs, or 'auto' for language detection.",
    )
    parser.add_argument(
        "--task",
        type=str,
        default="transcribe",
        choices=["transcribe", "translate"],
        help="Transcribe or translate.",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default="faster-whisper",
        choices=["faster-whisper", "whisper_timestamped", "mlx-whisper", "openai-api"],
        help="Load only this backend for Whisper processing.",
    )
    parser.add_argument(
        "--vac",
        action="store_true",
        default=False,
        help="Use VAC = voice activity controller. Recommended. Requires torch.",
    )
    parser.add_argument(
        "--vac-chunk-size", type=float, default=0.04, help="VAC sample size in seconds."
    )
    parser.add_argument(
        "--vad",
        action="store_true",
        default=False,
        help="Use VAD = voice activity detection, with the default parameters.",
    )
    parser.add_argument(
        "--buffer_trimming",
        type=str,
        default="segment",
        choices=["sentence", "segment"],
        help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.',
    )
    parser.add_argument(
        "--buffer_trimming_sec",
        type=float,
        default=15,
        help="Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.",
    )
    parser.add_argument(
        "-l",
        "--log-level",
        dest="log_level",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help="Set the log level",
        default="DEBUG",
    )
|
||||
|
||||
def backend_factory(args):
    """Build the ASR backend and, if needed, the sentence tokenizer.

    args: parsed options; reads `backend`, `lan`, `model`,
        `model_cache_dir`, `model_dir`, `task`, `buffer_trimming` and,
        when present, `vad`.

    Returns `(asr, tokenizer)`; `tokenizer` is None unless the buffer
    trimming mode is "sentence".
    """
    if args.backend == "openai-api":
        logger.debug("Using OpenAI API.")
        asr = OpenaiApiASR(lan=args.lan)
    else:
        # Any backend name not listed here falls back to WhisperTimestampedASR.
        local_backends = {
            "faster-whisper": FasterWhisperASR,
            "mlx-whisper": MLXWhisper,
        }
        asr_cls = local_backends.get(args.backend, WhisperTimestampedASR)

        model_size = args.model
        started = time.time()
        logger.info(f"Loading Whisper {model_size} model for language {args.lan}...")
        asr = asr_cls(
            modelsize=model_size,
            lan=args.lan,
            cache_dir=args.model_cache_dir,
            model_dir=args.model_dir,
        )
        finished = time.time()
        logger.info(f"done. It took {round(finished-started,2)} seconds.")

    # Apply common configurations
    if getattr(args, "vad", False):  # tolerate an args object without `vad`
        logger.info("Setting VAD filter")
        asr.use_vad()

    if args.task == "translate":
        asr.set_translate_task()
        tgt_language = "en"  # Whisper translates into English
    else:
        tgt_language = args.lan  # Whisper transcribes in this language

    # The tokenizer is only needed for sentence-based trimming.
    tokenizer = create_tokenizer(tgt_language) if args.buffer_trimming == "sentence" else None
    return asr, tokenizer
|
||||
|
||||
def online_factory(args, asr, tokenizer, logfile=sys.stderr):
    """Create the online processor, VAC-wrapped when `args.vac` is set.

    args: parsed options; reads `vac`, `min_chunk_size`, `buffer_trimming`
        and `buffer_trimming_sec`.
    asr, tokenizer: typically the pair returned by `backend_factory`.
    logfile: stream passed through to the processor.
    """
    shared_kwargs = dict(
        logfile=logfile,
        buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec),
    )
    if args.vac:
        return VACOnlineASRProcessor(args.min_chunk_size, asr, tokenizer, **shared_kwargs)
    return OnlineASRProcessor(asr, tokenizer, **shared_kwargs)
|
||||
|
||||
def asr_factory(args, logfile=sys.stderr):
    """
    Build the ASR backend and its online processor from `args`.

    Returns `(asr, online)`.
    """
    asr, tokenizer = backend_factory(args)
    return asr, online_factory(args, asr, tokenizer, logfile=logfile)
|
||||
|
||||
def set_logging(args, logger, others=()):
    """Configure basic logging and apply `args.log_level`.

    args: object with a `log_level` attribute (level name or number).
    logger: logger whose level is set.
    others: names of additional loggers that receive the same level.
    """
    logging.basicConfig(format="%(levelname)s\t%(message)s")  # format='%(name)s
    logger.setLevel(args.log_level)

    for other in others:
        logging.getLogger(other).setLevel(args.log_level)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user