diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index fa454c2..135b420 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -66,10 +66,12 @@ class AudioProcessor: self.asr = models.asr self.tokenizer = models.tokenizer self.diarization = models.diarization - import torch - model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad") - self.vac = FixedVADIterator(model) - self.vac.reset_states() + self.vac_model = models.vac_model + if self.args.vac: + self.vac = FixedVADIterator(models.vac_model) + else: + self.vac = None + self.ffmpeg_manager = FFmpegManager( sample_rate=self.sample_rate, channels=self.channels @@ -218,11 +220,14 @@ class AudioProcessor: # Process audio chunk pcm_array = self.convert_pcm_to_float(self.pcm_buffer[:self.max_bytes_per_sec]) self.pcm_buffer = self.pcm_buffer[self.max_bytes_per_sec:] - res = self.vac(pcm_array) - + + res = None end_of_audio = False silence_buffer = None + if self.args.vac: + res = self.vac(pcm_array) + if self.silence: print('NO AUDIO') diff --git a/whisperlivekit/core.py b/whisperlivekit/core.py index da7fdab..5bdd88f 100644 --- a/whisperlivekit/core.py +++ b/whisperlivekit/core.py @@ -34,7 +34,7 @@ class TranscriptionEngine: "lan": "auto", "task": "transcribe", "backend": "faster-whisper", - "vac": False, + "vac": True, "vac_chunk_size": 0.04, "log_level": "DEBUG", "ssl_certfile": None, @@ -82,6 +82,11 @@ class TranscriptionEngine: self.asr = None self.tokenizer = None self.diarization = None + self.vac_model = None + + if self.args.vac: + import torch + self.vac_model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad") if self.args.transcription: if self.args.backend == "simulstreaming": @@ -131,7 +136,7 @@ def online_factory(args, asr, tokenizer, logfile=sys.stderr): logfile=logfile, ) # warmup_online(online, args.warmup_file) - elif args.vac: + elif False: #args.vac: #vac is now handled in audio_processor online = VACOnlineASRProcessor( args.min_chunk_size, asr, diff --git a/whisperlivekit/parse_args.py b/whisperlivekit/parse_args.py index 0806ff5..391b91b 100644 --- a/whisperlivekit/parse_args.py +++ b/whisperlivekit/parse_args.py @@ -113,8 +113,8 @@ def parse_args(): ) parser.add_argument( "--vac", - action="store_true", - default=False, + # action="store_true", + default=True, help="Use VAC = voice activity controller. Recommended. Requires torch.", ) parser.add_argument(