Merge branch 'main' into ayo-logging-fixes

This commit is contained in:
Alex Young
2024-04-17 20:47:55 +01:00
4 changed files with 36 additions and 43 deletions

View File

@@ -183,7 +183,7 @@ online.init() # refresh if you're going to re-use the object for the next audio
### Server -- real-time from mic
`whisper_online_server.py` has the same model options as `whisper_online.py`, plus `--host` and `--port` of the TCP connection. See help message (`-h` option).
`whisper_online_server.py` has the same model options as `whisper_online.py`, plus `--host` and `--port` for the TCP connection, and the `--warmup-file` option. See the help message (`-h` option).
Client example:

View File

@@ -2,8 +2,6 @@
"""Functions for sending and receiving individual lines of text over a socket.
Used by marian-server-server.py to communicate with the Marian worker.
A line is transmitted using one or more fixed-size packets of UTF-8 bytes
containing:
@@ -11,6 +9,7 @@ containing:
- Zero or more \0 bytes as required to pad the packet to PACKET_SIZE
Originally from the UEDIN team of the ELITR project.
"""
PACKET_SIZE = 65536

View File

@@ -559,7 +559,7 @@ def add_shared_args(parser):
def asr_factory(args, logfile=sys.stderr):
"""
Creates and configures an ASR instance based on the specified backend and arguments.
Creates and configures an ASR and ASR Online instance based on the specified backend and arguments.
"""
backend = args.backend
if backend == "openai-api":
@@ -584,8 +584,23 @@ def asr_factory(args, logfile=sys.stderr):
logging.info("Setting VAD filter")
asr.use_vad()
return asr
language = args.lan
if args.task == "translate":
asr.set_translate_task()
tgt_language = "en" # Whisper translates into English
else:
tgt_language = language # Whisper transcribes in this language
# Create the tokenizer
if args.buffer_trimming == "sentence":
tokenizer = create_tokenizer(tgt_language)
else:
tokenizer = None
# Create the OnlineASRProcessor
online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
return asr, online
## main:
if __name__ == "__main__":
@@ -613,27 +628,13 @@ if __name__ == "__main__":
duration = len(load_audio(audio_path))/SAMPLING_RATE
logging.info("Audio duration is: %2.2f seconds" % duration)
asr = asr_factory(args, logfile=logfile)
language = args.lan
if args.task == "translate":
asr.set_translate_task()
tgt_language = "en" # Whisper translates into English
else:
tgt_language = language # Whisper transcribes in this language
asr, online = asr_factory(args, logfile=logfile)
min_chunk = args.min_chunk_size
if args.buffer_trimming == "sentence":
tokenizer = create_tokenizer(tgt_language)
else:
tokenizer = None
online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
# load the audio into the LRU cache before we start the timer
a = load_audio_chunk(audio_path,0,1)
# warm up the ASR, because the very first transcribe takes much more time than the other
# warm up the ASR because the very first transcribe takes much more time than the others
asr.transcribe(a)
beg = args.start_at

View File

@@ -12,6 +12,8 @@ parser = argparse.ArgumentParser()
# server options
parser.add_argument("--host", type=str, default='localhost')
parser.add_argument("--port", type=int, default=43007)
parser.add_argument("--warmup-file", type=str, dest="warmup_file",
help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
parser.add_argument("-l", "--log-level", dest="log_level",
choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
@@ -33,37 +35,28 @@ SAMPLING_RATE = 16000
size = args.model
language = args.lan
asr = asr_factory(args)
if args.task == "translate":
asr.set_translate_task()
tgt_language = "en"
else:
tgt_language = language
asr, online = asr_factory(args)
min_chunk = args.min_chunk_size
if args.buffer_trimming == "sentence":
tokenizer = create_tokenizer(tgt_language)
else:
tokenizer = None
online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
demo_audio_path = "cs-maji-2.16k.wav"
if os.path.exists(demo_audio_path):
# load the audio into the LRU cache before we start the timer
logging.debug(f"Warming up on {demo_audio_path}")
a = load_audio_chunk(demo_audio_path,0,1)
# TODO: it should be tested whether it's meaningful
# warm up the ASR, because the very first transcribe takes much more time than the other
asr.transcribe(a)
logging.debug("Whisper is warmed up")
# warm up the ASR because the very first transcribe takes more time than the others.
# Test results in https://github.com/ufal/whisper_streaming/pull/81
msg = "Whisper is not warmed up. The first chunk processing may take longer."
if args.warmup_file:
if os.path.isfile(args.warmup_file):
a = load_audio_chunk(args.warmup_file,0,1)
asr.transcribe(a)
print("INFO: Whisper is warmed up.",file=sys.stderr)
else:
print("WARNING: The warm up file is not available. "+msg,file=sys.stderr)
else:
logging.debug("Whisper is not warmed up")
print("WARNING: " + msg, file=sys.stderr)
######### Server objects