unfork project, indicate files from whisper streaming

This commit is contained in:
Quentin Fuxa
2024-12-19 12:01:07 +01:00
parent baa01728be
commit 0553b75415
3 changed files with 5 additions and 277 deletions

View File

@@ -12,6 +12,11 @@ This project extends the [Whisper Streaming](https://github.com/ufal/whisper_str
![Demo Screenshot](src/demo.png)
## Code Origins
This project reuses and extends code from the original Whisper Streaming repository:
- whisper_online.py: Contains code from whisper_streaming with the addition of the **MLX Whisper** backend for Apple Silicon, which is not present in the original repository.
- silero_vad_iterator.py: Originally from the Silero VAD repository, included in the whisper_streaming project.
## Installation

View File

@@ -1,93 +0,0 @@
#!/usr/bin/env python3
"""Functions for sending and receiving individual lines of text over a socket.
A line is transmitted using one or more fixed-size packets of UTF-8 bytes
containing:
- Zero or more bytes of UTF-8, excluding \n and \0, followed by
- Zero or more \0 bytes as required to pad the packet to PACKET_SIZE
Originally from the UEDIN team of the ELITR project.
"""
# Size in bytes of one fixed-size packet of the line protocol.
PACKET_SIZE = 65536


def send_one_line(socket, text, pad_zeros=False):
    """Sends a line of text over the given socket.

    The 'text' argument should contain a single line of text (line break
    characters are optional). Line boundaries are determined by Python's
    str.splitlines() function [1]. A NUL character is also counted as a line
    terminator. If 'text' contains multiple lines then only the first will
    be sent.

    If the send fails then an exception will be raised.

    [1] https://docs.python.org/3.5/library/stdtypes.html#str.splitlines

    Args:
        socket: a socket object (only its sendall() method is used).
        text: string containing a line of text for transmission.
        pad_zeros: when true, a NUL byte is appended after the newline and
            the last packet is padded with NUL bytes up to PACKET_SIZE.
            When false (the default) the encoded line plus newline is sent
            without any padding.
    """
    # str.replace returns a new string; the result must be kept, otherwise
    # NUL would never act as a line terminator.
    text = text.replace('\0', '\n')
    lines = text.splitlines()
    first_line = lines[0] if lines else ''
    # TODO Is there a better way of handling bad input than 'replace'?
    data = first_line.encode('utf-8', errors='replace') + b'\n'
    if pad_zeros:
        data += b'\0'
    for offset in range(0, len(data), PACKET_SIZE):
        packet = data[offset:offset + PACKET_SIZE]
        if pad_zeros and len(packet) < PACKET_SIZE:
            # Only the final, short packet is padded to the fixed size.
            packet += b'\0' * (PACKET_SIZE - len(packet))
        socket.sendall(packet)
def receive_one_line(socket):
    """Receives a line of text from the given socket.

    Packets of up to PACKET_SIZE bytes are read (blocking on a blocking
    socket) until one of them contains a NUL byte. The accumulated bytes are
    then decoded as UTF-8, surrounding NUL characters are stripped, and only
    the text before the first newline is kept.

    NOTE(review): the loop only stops on a packet containing a NUL byte, so
    it appears to expect a NUL-padding sender -- confirm against the peer.

    Args:
        socket: a socket object (only its recv() method is used).

    Returns:
        The first received line with a terminating newline appended, or
        None if recv() returned no data (connection closed) before a packet
        containing a NUL byte was seen.
    """
    received = []
    while True:
        chunk = socket.recv(PACKET_SIZE)
        if not chunk:
            # Connection has been closed; anything gathered so far is dropped.
            return None
        received.append(chunk)
        if b'\0' in chunk:
            break
    # TODO Is there a better way of handling bad input than 'replace'?
    decoded = b''.join(received).decode('utf-8', errors='replace').strip('\0')
    first_line, _, _ = decoded.partition('\n')
    return first_line + '\n'
def receive_lines(socket):
    """Receives whatever lines are available in one recv() call.

    A single recv() of up to PACKET_SIZE bytes is performed. The bytes are
    decoded as UTF-8 (invalid sequences are replaced), surrounding NUL
    characters are stripped and the text is split on newlines.

    Args:
        socket: a socket object (only its recv() method is used).

    Returns:
        - [] if recv() raised BlockingIOError (no data available right now),
        - None if the connection has been closed or the received data holds
          no text after stripping NUL characters,
        - otherwise the list of strings obtained by splitting on newlines.
    """
    try:
        data = socket.recv(PACKET_SIZE)
    except BlockingIOError:
        return []
    # recv() reports a closed connection with an empty bytes object, never
    # with None, so test for emptiness rather than identity with None.
    if not data:
        return None
    # TODO Is there a better way of handling bad input than 'replace'?
    text = data.decode('utf-8', errors='replace').strip('\0')
    lines = text.split('\n')
    if len(lines) == 1 and not lines[0]:
        return None
    return lines

View File

@@ -1,184 +0,0 @@
#!/usr/bin/env python3
from whisper_online import *
import sys
import argparse
import os
import logging
import numpy as np
logger = logging.getLogger(__name__)

# ---- command line ----------------------------------------------------------
parser = argparse.ArgumentParser()

# server options
parser.add_argument("--host", type=str, default='localhost')
parser.add_argument("--port", type=int, default=43007)
parser.add_argument(
    "--warmup-file",
    type=str,
    dest="warmup_file",
    help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .",
)

# options from whisper_online
add_shared_args(parser)

args = parser.parse_args()
set_logging(args, logger, other="")

# ---- ASR objects built from the parsed arguments ---------------------------
SAMPLING_RATE = 16000
size = args.model
language = args.lan
asr, online = asr_factory(args)
min_chunk = args.min_chunk_size

# ---- warm-up ---------------------------------------------------------------
# The very first transcribe call takes more time than the following ones, so
# one is run up front when a warm-up file is supplied.
# Test results in https://github.com/ufal/whisper_streaming/pull/81
msg = "Whisper is not warmed up. The first chunk processing may take longer."
if not args.warmup_file:
    logger.warning(msg)
elif os.path.isfile(args.warmup_file):
    a = load_audio_chunk(args.warmup_file, 0, 1)
    asr.transcribe(a)
    logger.info("Whisper is warmed up.")
else:
    # A warm-up file was requested but cannot be found: abort start-up.
    logger.critical("The warm up file is not available. "+msg)
    sys.exit(1)
######### Server objects
import line_packet
import socket
class Connection:
    '''Thin wrapper around the socket object of one client connection.'''

    # Maximum number of bytes requested from the socket per recv() call.
    # Original note: "5 minutes" (was: 65536) -- presumably 5 minutes of
    # audio at 32000 bytes per second; confirm against the client.
    PACKET_SIZE = 32000*5*60

    def __init__(self, conn):
        '''Store the socket, reset the last-sent line, switch to blocking mode.'''
        self.conn = conn
        self.last_line = ""
        self.conn.setblocking(True)

    def send(self, line):
        '''Send one line unless it equals the previously sent one.

        Sending the same line twice was problematic in
        online-text-flow-events, so repeats are silently skipped.
        '''
        if line != self.last_line:
            line_packet.send_one_line(self.conn, line)
            self.last_line = line

    def receive_lines(self):
        '''Return whatever line_packet.receive_lines yields for this socket.'''
        return line_packet.receive_lines(self.conn)

    def non_blocking_receive_audio(self):
        '''Return up to PACKET_SIZE bytes from the socket.

        Returns None when the peer reset the connection.
        '''
        try:
            return self.conn.recv(self.PACKET_SIZE)
        except ConnectionResetError:
            return None
import io
import soundfile
# wraps socket and ASR object, and serves one client connection.
# next client should be served by a new instance of this object
class ServerProcessor:
    """Serves one client connection: receives audio, runs the online ASR
    processor on it and sends the formatted transcript back.

    A new instance should be created for every client.
    """

    def __init__(self, c, online_asr_proc, min_chunk):
        """
        Args:
            c: connection wrapper providing non_blocking_receive_audio()
                and send(line).
            online_asr_proc: online ASR processor providing init(),
                insert_audio_chunk(audio) and process_iter().
            min_chunk: minimum chunk length in seconds.
        """
        self.connection = c
        self.online_asr_proc = online_asr_proc
        self.min_chunk = min_chunk
        self.last_end = None  # end timestamp (ms) of the last emitted segment
        self.is_first = True  # no chunk has been returned yet

    def receive_audio_chunk(self):
        """Receives audio until at least min_chunk seconds are collected.

        Keeps reading from the connection until the collected audio reaches
        min_chunk * SAMPLING_RATE samples or the connection yields no data.

        Returns:
            The received audio as one float32 array, or None when nothing
            was received, or when the very first chunk is shorter than the
            minimum length.
        """
        out = []
        received = 0
        minlimit = self.min_chunk*SAMPLING_RATE
        while received < minlimit:
            raw_bytes = self.connection.non_blocking_receive_audio()
            if not raw_bytes:
                break
            # Interpret the raw bytes as 16-bit little-endian mono PCM.
            sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1, endian="LITTLE", samplerate=SAMPLING_RATE, subtype="PCM_16", format="RAW")
            audio, _ = librosa.load(sf, sr=SAMPLING_RATE, dtype=np.float32)
            out.append(audio)
            received += len(audio)
        if not out:
            return None
        conc = np.concatenate(out)
        if self.is_first and len(conc) < minlimit:
            return None
        self.is_first = False
        # Reuse the array built above instead of concatenating a second time.
        return conc

    def format_output_transcript(self, o):
        """Formats one ASR result as "<beg_ms> <end_ms> <text>".

        This differs from whisper_online.output_transcript in that
        succeeding [beg, end] intervals never overlap, because the ELITR
        protocol (implemented in online-text-flow events) requires it.
        Therefore beg is the max of the previous end and the current beg.
        Usually it differs negligibly, by approximately 20 ms.

        Args:
            o: sequence (beg, end, text); beg and end are multiplied by 1000
                for output. o[0] is None when there is no text.

        Returns:
            The formatted line (also printed to stderr), or None when the
            segment carries no text.
        """
        if o[0] is None:
            logger.debug("No text in this segment")
            return None
        beg, end = o[0]*1000, o[1]*1000
        if self.last_end is not None:
            beg = max(beg, self.last_end)
        self.last_end = end
        line = "%1.0f %1.0f %s" % (beg, end, o[2])
        print(line, flush=True, file=sys.stderr)
        return line

    def send_result(self, o):
        """Formats the ASR result and sends it if it contains text."""
        msg = self.format_output_transcript(o)
        if msg is not None:
            self.connection.send(msg)

    def process(self):
        """Handles one client connection until no more audio arrives or the
        pipe to the client breaks."""
        self.online_asr_proc.init()
        while True:
            a = self.receive_audio_chunk()
            if a is None:
                break
            self.online_asr_proc.insert_audio_chunk(a)
            # Use the processor handed to this instance rather than a module
            # global, so the class works with whatever object was injected.
            o = self.online_asr_proc.process_iter()
            try:
                self.send_result(o)
            except BrokenPipeError:
                logger.info("broken pipe -- connection closed?")
                break
        # TODO: flushing the remaining text should be working:
        # o = self.online_asr_proc.finish()
        # self.send_result(o)
# server loop: accept clients one at a time and serve each with a fresh
# ServerProcessor.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.bind((args.host, args.port))
    s.listen(1)
    logger.info('Listening on'+str((args.host, args.port)))
    while True:
        conn, addr = s.accept()
        try:
            logger.info('Connected to client on {}'.format(addr))
            connection = Connection(conn)
            proc = ServerProcessor(connection, online, args.min_chunk_size)
            proc.process()
        finally:
            # Close the client socket even when serving it raised, so a
            # failing client does not leak its connection.
            conn.close()
        logger.info('Connection to client closed')
logger.info('Connection closed, terminating.')