mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-03-07 22:33:36 +00:00
96 lines
2.9 KiB
Python
96 lines
2.9 KiB
Python
from microphone_stream import MicrophoneStream
|
|
from voice_activity_controller import VoiceActivityController
|
|
from whisper_online import *
|
|
import numpy as np
|
|
import librosa
|
|
import io
|
|
import soundfile
|
|
import sys
|
|
|
|
|
|
|
|
|
|
class SimpleASRProcessor:
|
|
|
|
def __init__(self, asr, sampling_rate = 16000):
|
|
"""run this when starting or restarting processing"""
|
|
self.audio_buffer = np.array([],dtype=np.float32)
|
|
self.prompt_buffer = ""
|
|
self.asr = asr
|
|
self.sampling_rate = sampling_rate
|
|
self.init_prompt = ''
|
|
|
|
def ts_words(self, segments):
|
|
result = ""
|
|
for segment in segments:
|
|
if segment.no_speech_prob > 0.9:
|
|
continue
|
|
for word in segment.words:
|
|
w = word.word
|
|
t = (word.start, word.end, w)
|
|
result +=w
|
|
return result
|
|
|
|
def stream_process(self, vad_result):
|
|
iter_in_phrase = 0
|
|
for chunk, is_final in vad_result:
|
|
iter_in_phrase += 1
|
|
|
|
if chunk is not None:
|
|
sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
|
|
audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
|
|
# self.audio_buffer.append(chunk)
|
|
out = []
|
|
out.append(audio)
|
|
a = np.concatenate(out)
|
|
self.audio_buffer = np.append(self.audio_buffer, a)
|
|
|
|
if is_final and len(self.audio_buffer) > 0:
|
|
res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
|
|
# use custom ts_words
|
|
tsw = self.ts_words(res)
|
|
self.init_prompt = self.init_prompt + tsw
|
|
self.init_prompt = self.init_prompt [-100:]
|
|
self.audio_buffer.resize(0)
|
|
iter_in_phrase =0
|
|
yield True, tsw
|
|
# show progress evry 10 chunks
|
|
elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0:
|
|
res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
|
|
# use custom ts_words
|
|
tsw = self.ts_words(res)
|
|
yield False, tsw
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
SAMPLING_RATE = 16000
|
|
|
|
model = "large-v2"
|
|
src_lan = "en" # source language
|
|
tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used
|
|
use_vad_result = True
|
|
min_sample_length = 1 * SAMPLING_RATE
|
|
|
|
|
|
|
|
vad = VoiceActivityController(use_vad_result = use_vad_result)
|
|
asr = FasterWhisperASR(src_lan, "large-v2") # loads and wraps Whisper model
|
|
|
|
tokenizer = create_tokenizer(tgt_lan)
|
|
online = SimpleASRProcessor(asr)
|
|
|
|
|
|
stream = MicrophoneStream()
|
|
stream = vad.detect_user_speech(stream, audio_in_int16 = False)
|
|
stream = online.stream_process(stream)
|
|
|
|
for isFinal, text in stream:
|
|
if isFinal:
|
|
print( text, end="\r\n")
|
|
else:
|
|
print( text, end="\r")
|