diff --git a/mic_test_whisper_simple.py b/mic_test_whisper_simple.py deleted file mode 100644 index 3b2b61d..0000000 --- a/mic_test_whisper_simple.py +++ /dev/null @@ -1,95 +0,0 @@ -from microphone_stream import MicrophoneStream -from voice_activity_controller import VoiceActivityController -from whisper_online import * -import numpy as np -import librosa -import io -import soundfile -import sys - - - - -class SimpleASRProcessor: - - def __init__(self, asr, sampling_rate = 16000): - """run this when starting or restarting processing""" - self.audio_buffer = np.array([],dtype=np.float32) - self.prompt_buffer = "" - self.asr = asr - self.sampling_rate = sampling_rate - self.init_prompt = '' - - def ts_words(self, segments): - result = "" - for segment in segments: - if segment.no_speech_prob > 0.9: - continue - for word in segment.words: - w = word.word - t = (word.start, word.end, w) - result +=w - return result - - def stream_process(self, vad_result): - iter_in_phrase = 0 - for chunk, is_final in vad_result: - iter_in_phrase += 1 - - if chunk is not None: - sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW") - audio, _ = librosa.load(sf,sr=SAMPLING_RATE) - out = [] - out.append(audio) - a = np.concatenate(out) - self.audio_buffer = np.append(self.audio_buffer, a) - - if is_final and len(self.audio_buffer) > 0: - res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt) - tsw = self.ts_words(res) - - self.init_prompt = self.init_prompt + tsw - self.init_prompt = self.init_prompt [-100:] - self.audio_buffer.resize(0) - iter_in_phrase =0 - - yield True, tsw - # show progress evry 50 chunks - elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0: - res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt) - # use custom ts_words - tsw = self.ts_words(res) - yield False, tsw - - - - - - - -SAMPLING_RATE = 16000 - -model = "large-v2" -src_lan = "en" # source language -tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used -use_vad = False -min_sample_length = 1 * SAMPLING_RATE - - - -vac = VoiceActivityController(use_vad_result = use_vad) -asr = FasterWhisperASR(src_lan, "large-v2") # loads and wraps Whisper model - -tokenizer = create_tokenizer(tgt_lan) -online = SimpleASRProcessor(asr) - - -stream = MicrophoneStream() -stream = vac.detect_user_speech(stream, audio_in_int16 = False) -stream = online.stream_process(stream) - -for isFinal, text in stream: - if isFinal: - print( text, end="\r\n") - else: - print( text, end="\r") diff --git a/mic_test_whisper_streaming.py b/mic_test_whisper_streaming.py deleted file mode 100644 index b427015..0000000 --- a/mic_test_whisper_streaming.py +++ /dev/null @@ -1,71 +0,0 @@ -from microphone_stream import MicrophoneStream -from voice_activity_controller import VoiceActivityController -from whisper_online import * -import numpy as np -import librosa -import io -import soundfile -import sys - - -SAMPLING_RATE = 16000 -model = "large-v2" -src_lan = "en" # source language -tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used -use_vad_result = True -min_sample_length = 1 * SAMPLING_RATE - - - -asr = FasterWhisperASR(src_lan, model) # loads and wraps Whisper model -tokenizer = create_tokenizer(tgt_lan) # sentence segmenter for the target language -online = OnlineASRProcessor(asr, tokenizer) # create processing object - -microphone_stream = MicrophoneStream() -vad = VoiceActivityController(use_vad_result = use_vad_result) - -complete_text = '' -final_processing_pending = False -out = [] -out_len = 0 -for iter in vad.detect_user_speech(microphone_stream): # processing loop: - raw_bytes= iter[0] - is_final = iter[1] - - if raw_bytes: - sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW") - audio, _ = librosa.load(sf,sr=SAMPLING_RATE) - out.append(audio) - out_len += len(audio) - - - if (is_final or out_len >= min_sample_length) and out_len>0: - a = np.concatenate(out) - online.insert_audio_chunk(a) - - if out_len > min_sample_length: - o = online.process_iter() - print('-----'*10) - complete_text = complete_text + o[2] - print('PARTIAL - '+ complete_text) # do something with current partial output - print('-----'*10) - out = [] - out_len = 0 - - if is_final: - o = online.finish() - # final_processing_pending = False - print('-----'*10) - complete_text = complete_text + o[2] - print('FINAL - '+ complete_text) # do something with current partial output - print('-----'*10) - online.init() - out = [] - out_len = 0 - - - - - - - diff --git a/microphone_stream.py b/microphone_stream.py deleted file mode 100644 index 63d5019..0000000 --- a/microphone_stream.py +++ /dev/null @@ -1,82 +0,0 @@ - - -### mic stream - -import queue -import re -import sys -import pyaudio - - -class MicrophoneStream: - def __init__( - self, - sample_rate: int = 16000, - ): - """ - Creates a stream of audio from the microphone. - - Args: - chunk_size: The size of each chunk of audio to read from the microphone. - channels: The number of channels to record audio from. - sample_rate: The sample rate to record audio at. - """ - try: - import pyaudio - except ImportError: - raise Exception('py audio not installed') - - self._pyaudio = pyaudio.PyAudio() - self.sample_rate = sample_rate - - self._chunk_size = int(self.sample_rate * 40 / 1000) - self._stream = self._pyaudio.open( - format=pyaudio.paInt16, - channels=1, - rate=sample_rate, - input=True, - frames_per_buffer=self._chunk_size, - ) - - self._open = True - - def __iter__(self): - """ - Returns the iterator object. - """ - - return self - - def __next__(self): - """ - Reads a chunk of audio from the microphone. - """ - if not self._open: - raise StopIteration - - try: - return self._stream.read(self._chunk_size) - except KeyboardInterrupt: - raise StopIteration - - def close(self): - """ - Closes the stream. - """ - - self._open = False - - if self._stream.is_active(): - self._stream.stop_stream() - - self._stream.close() - self._pyaudio.terminate() - - - - - - - - -