diff --git a/README.md b/README.md
index 26b6525..dd77ed0 100644
--- a/README.md
+++ b/README.md
@@ -33,6 +33,8 @@ Please, cite us. [Bibtex citation](http://www.afnlp.org/conferences/ijcnlp2023/p
 
 1) ``pip install librosa`` -- audio processing library
 
+Note: the VAD requires `pip install torch torchaudio`.
+
 2) Whisper backend.
 
 Two alternative backends are integrated. The most recommended one is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install with `pip install faster-whisper`.
diff --git a/whisper_online.py b/whisper_online.py
index 7df7682..4ecd2f7 100644
--- a/whisper_online.py
+++ b/whisper_online.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 import sys
 import numpy as np
-import librosa 
+import librosa
 from functools import lru_cache
 import time
 import datetime
diff --git a/whisper_online_server.py b/whisper_online_server.py
index ffd597f..571bfb6 100644
--- a/whisper_online_server.py
+++ b/whisper_online_server.py
@@ -30,11 +30,12 @@ print(f"Loading Whisper {size} model for {language}...",file=sys.stderr,end=" ",
 if args.backend == "faster-whisper":
     from faster_whisper import WhisperModel
     asr_cls = FasterWhisperASR
-else:
+elif args.backend == "whisper_timestamped":
     import whisper
-    import whisper_timestamped
-#    from whisper_timestamped_model import WhisperTimestampedASR
+    from whisper_online import WhisperTimestampedASR
     asr_cls = WhisperTimestampedASR
+else:
+    raise ValueError(f"Unknown {args.backend=}")
 
 asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
 
@@ -44,25 +45,23 @@ if args.task == "translate":
 else:
     tgt_language = language
 
-e = time.time()
-print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
+print(f"done. It took {round(time.time()-t,2)} seconds.",file=sys.stderr)
 
 if args.vad:
     print("setting VAD filter",file=sys.stderr)
     asr.use_vad()
 
-min_chunk = args.min_chunk_size
-
 if args.buffer_trimming == "sentence":
     tokenizer = create_tokenizer(tgt_language)
 else:
     tokenizer = None
 
 if not args.vac:
+    from whisper_online import OnlineASRProcessor
     online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 else:
-    from whisper_online_vac import *
-    online = VACOnlineASRProcessor(min_chunk, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+    from whisper_online_vac import VACOnlineASRProcessor
+    online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
 demo_audio_path = "cs-maji-2.16k.wav"
 
@@ -219,7 +218,7 @@ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
         conn, addr = s.accept()
         logging.info('INFO: Connected to client on {}'.format(addr))
         connection = Connection(conn)
-        proc = ServerProcessor(connection, online, min_chunk)
+        proc = ServerProcessor(connection, online, args.min_chunk_size)
         proc.process()
         conn.close()
         logging.info('INFO: Connection to client closed')
diff --git a/whisper_online_vac.py b/whisper_online_vac.py
index 3f7c739..f323d38 100644
--- a/whisper_online_vac.py
+++ b/whisper_online_vac.py
@@ -165,9 +165,9 @@ if __name__ == "__main__":
 
         if end >= duration:
             break
-        
+
         beg = end
-        
+
         if end + min_chunk > duration:
             end = duration
         else: