From 52da12120caac819d7e0195f3beef0119553cfb7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dominik=20Mach=C3=A1=C4=8Dek?= <machacek@ufal.mff.cuni.cz>
Date: Mon, 19 Aug 2024 00:19:16 +0200
Subject: [PATCH] cleaner code

---
 README.md                    | 26 +++++++++++++++++---------
 silero_vad.py                |  4 +++-
 voice_activity_controller.py | 35 -----------------------------------
 whisper_online.py            |  2 +-
 4 files changed, 21 insertions(+), 46 deletions(-)
 delete mode 100644 voice_activity_controller.py

diff --git a/README.md b/README.md
index 0e217c0..43f849d 100644
--- a/README.md
+++ b/README.md
@@ -36,8 +36,6 @@ Please, cite us. [ACL Anthology](https://aclanthology.org/2023.ijcnlp-demo.3/),
 
 1) ``pip install librosa soundfile`` -- audio processing library
 
-Note: for the VAD I need to `pip install torch torchaudio`.
-
 2) Whisper backend.
 
  Several alternative backends are integrated. The most recommended one is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install with `pip install faster-whisper`.
@@ -51,7 +49,9 @@ For running with the openai-api backend, make sure that your [OpenAI api key](ht
 
 The backend is loaded only when chosen. The unused one does not have to be installed.
 
-3) Optional, not recommended: sentence segmenter (aka sentence tokenizer) 
+3) For the voice activity controller: `pip install torch torchaudio`. Optional, but highly recommended.
+
+4) Optional, not recommended: sentence segmenter (aka sentence tokenizer) 
 
 Two buffer trimming options are integrated and evaluated. They have impact on
 the quality and latency. The default "segment" option performs better according
@@ -78,8 +78,10 @@ In case of installation issues of opus-fast-mosestokenizer, especially on Window
 ### Real-time simulation from audio file
 
 ```
-usage: whisper_online.py [-h] [--min-chunk-size MIN_CHUNK_SIZE] [--model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large}] [--model_cache_dir MODEL_CACHE_DIR] [--model_dir MODEL_DIR] [--lan LAN] [--task {transcribe,translate}]
-                         [--backend {faster-whisper,whisper_timestamped,openai-api}] [--vad] [--buffer_trimming {sentence,segment}] [--buffer_trimming_sec BUFFER_TRIMMING_SEC] [--start_at START_AT] [--offline] [--comp_unaware]
+whisper_online.py -h
+usage: whisper_online.py [-h] [--min-chunk-size MIN_CHUNK_SIZE] [--model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large}] [--model_cache_dir MODEL_CACHE_DIR]
+                         [--model_dir MODEL_DIR] [--lan LAN] [--task {transcribe,translate}] [--backend {faster-whisper,whisper_timestamped,openai-api}] [--vac] [--vac-chunk-size VAC_CHUNK_SIZE] [--vad]
+                         [--buffer_trimming {sentence,segment}] [--buffer_trimming_sec BUFFER_TRIMMING_SEC] [-l {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--start_at START_AT] [--offline] [--comp_unaware]
                          audio_path
 
 positional arguments:
@@ -88,7 +90,8 @@ positional arguments:
 options:
   -h, --help            show this help message and exit
   --min-chunk-size MIN_CHUNK_SIZE
-                        Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.
+                        Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was
+                        received by this time.
   --model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large}
                         Name size of the Whisper model to use (default: large-v2). The model is automatically downloaded from the model hub if not present in model cache dir.
   --model_cache_dir MODEL_CACHE_DIR
@@ -101,11 +104,17 @@ options:
                         Transcribe or translate.
   --backend {faster-whisper,whisper_timestamped,openai-api}
                         Load only this backend for Whisper processing.
+  --vac                 Use VAC = voice activity controller. Recommended. Requires torch.
+  --vac-chunk-size VAC_CHUNK_SIZE
+                        VAC sample size in seconds.
   --vad                 Use VAD = voice activity detection, with the default parameters.
   --buffer_trimming {sentence,segment}
-                        Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.
+                        Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter
+                        must be installed for "sentence" option.
   --buffer_trimming_sec BUFFER_TRIMMING_SEC
                         Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.
+  -l {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
+                        Set the log level
   --start_at START_AT   Start processing audio at this time.
   --offline             Offline mode.
   --comp_unaware        Computationally unaware simulation.
@@ -240,11 +249,10 @@ Contributions are welcome. We acknowledge especially:
 - [Ondřej Plátek](https://opla.cz/) for the paper pre-review.
 - [Peter Polák](https://ufal.mff.cuni.cz/peter-polak) for the original idea.
 - The UEDIN team of the [ELITR project](https://elitr.eu) for the original line_packet.py.
+- Silero Team for their VAD [model](https://github.com/snakers4/silero-vad) and [VADIterator](https://github.com/ufal/whisper_streaming/blob/main/silero_vad.py).
 
 
 ## Contact
 
 Dominik Macháček, machacek@ufal.mff.cuni.cz
 
-
-
diff --git a/silero_vad.py b/silero_vad.py
index 7f85b69..7735215 100644
--- a/silero_vad.py
+++ b/silero_vad.py
@@ -1,8 +1,10 @@
 import torch
 
-# this is copypasted from silero-vad's vad_utils.py:
+# This is copied from silero-vad's vad_utils.py:
 # https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340
 
+# Their licence is MIT, same as ours: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
+
 class VADIterator:
     def __init__(self,
                  model,
diff --git a/voice_activity_controller.py b/voice_activity_controller.py
deleted file mode 100644
index d000cbf..0000000
--- a/voice_activity_controller.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import torch
-from silero_vad import VADIterator
-import time
-
-class VoiceActivityController:
-    SAMPLING_RATE = 16000
-    def __init__(self):
-        self.model, _ = torch.hub.load(
-            repo_or_dir='snakers4/silero-vad',
-            model='silero_vad'
-        )
-        # we use the default options: 500ms silence, etc.
-        self.iterator = VADIterator(self.model)
-
-    def reset(self):
-        self.iterator.reset_states()
-
-    def __call__(self, audio):
-        '''
-        audio: audio chunk in the current np.array format
-        returns: 
-        - { 'start': time_frame } ... when voice start was detected. time_frame is number of frame (can be converted to seconds)
-        - { 'end': time_frame }   ... when voice end is detected
-        - None                    ... when no change detected by current chunk 
-        '''
-        x = audio
-#        if not torch.is_tensor(x):
-#            try:
-#                x = torch.Tensor(x)
-#            except:
-#                raise TypeError("Audio cannot be casted to tensor. Cast it manually")
-        t = time.time()
-        a = self.iterator(x)
-        print("VAD took ",time.time()-t,"seconds")
-        return a
diff --git a/whisper_online.py b/whisper_online.py
index 86d98dc..d3b1e4c 100644
--- a/whisper_online.py
+++ b/whisper_online.py
@@ -656,7 +656,7 @@ def add_shared_args(parser):
     parser.add_argument('--lan', '--language', type=str, default='auto', help="Source language code, e.g. en,de,cs, or 'auto' for language detection.")
     parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe","translate"],help="Transcribe or translate.")
     parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped", "openai-api"],help='Load only this backend for Whisper processing.')
-    parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.')
+    parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller. Recommended. Requires torch.')
     parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
     parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
     parser.add_argument('--buffer_trimming', type=str, default="segment", choices=["sentence", "segment"],help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.')