Merge pull request #110 from oplatek/vad-streaming-oplatek

polishing code and note about installing deps for VAD
This commit is contained in:
Dominik Macháček
2024-08-16 17:53:03 +02:00
committed by GitHub
4 changed files with 14 additions and 13 deletions

View File

@@ -33,6 +33,8 @@ Please, cite us. [Bibtex citation](http://www.afnlp.org/conferences/ijcnlp2023/p
1) ``pip install librosa`` -- audio processing library
Note: the VAD additionally requires `pip install torch torchaudio`.
2) Whisper backend.
Two alternative backends are integrated. The recommended one is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install with `pip install faster-whisper`.

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
import sys
import numpy as np
import librosa
import librosa
from functools import lru_cache
import time
import datetime

View File

@@ -30,11 +30,12 @@ print(f"Loading Whisper {size} model for {language}...",file=sys.stderr,end=" ",
if args.backend == "faster-whisper":
from faster_whisper import WhisperModel
asr_cls = FasterWhisperASR
else:
elif args.backend == "whisper_timestamped":
import whisper
import whisper_timestamped
# from whisper_timestamped_model import WhisperTimestampedASR
from whisper_online import WhisperTimestampedASR
asr_cls = WhisperTimestampedASR
else:
raise ValueError(f"Unknown {args.backend=}")
asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
@@ -44,25 +45,23 @@ if args.task == "translate":
else:
tgt_language = language
e = time.time()
print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
print(f"done. It took {round(time.time()-t,2)} seconds.",file=sys.stderr)
if args.vad:
print("setting VAD filter",file=sys.stderr)
asr.use_vad()
min_chunk = args.min_chunk_size
if args.buffer_trimming == "sentence":
tokenizer = create_tokenizer(tgt_language)
else:
tokenizer = None
if not args.vac:
from whisper_online import OnlineASRProcessor
online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
else:
from whisper_online_vac import *
online = VACOnlineASRProcessor(min_chunk, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
from whisper_online_vac import VACOnlineASRProcessor
online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
demo_audio_path = "cs-maji-2.16k.wav"
@@ -219,7 +218,7 @@ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
conn, addr = s.accept()
logging.info('INFO: Connected to client on {}'.format(addr))
connection = Connection(conn)
proc = ServerProcessor(connection, online, min_chunk)
proc = ServerProcessor(connection, online, args.min_chunk_size)
proc.process()
conn.close()
logging.info('INFO: Connection to client closed')

View File

@@ -165,9 +165,9 @@ if __name__ == "__main__":
if end >= duration:
break
beg = end
if end + min_chunk > duration:
end = duration
else: