From 0553b75415ddf041f217775828759c3cc40bb047 Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Thu, 19 Dec 2024 12:01:07 +0100 Subject: [PATCH] unfork project, indicate files from whisper streaming --- README.md | 5 ++ line_packet.py | 93 -------------------- whisper_online_server.py | 184 --------------------------------------- 3 files changed, 5 insertions(+), 277 deletions(-) delete mode 100644 line_packet.py delete mode 100644 whisper_online_server.py diff --git a/README.md b/README.md index 87021a6..0cc721b 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,11 @@ This project extends the [Whisper Streaming](https://github.com/ufal/whisper_str ![Demo Screenshot](src/demo.png) +## Code Origins + +This project reuses and extends code from the original Whisper Streaming repository: +- whisper_online.py: Contains code from whisper_streaming with the addition of the **MLX Whisper** backend for Apple Silicon, which is not present in the original repository. +- silero_vad_iterator.py: Originally from the Silero VAD repository, included in the whisper_streaming project. ## Installation diff --git a/line_packet.py b/line_packet.py deleted file mode 100644 index 0664fea..0000000 --- a/line_packet.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python3 - -"""Functions for sending and receiving individual lines of text over a socket. - -A line is transmitted using one or more fixed-size packets of UTF-8 bytes -containing: - - - Zero or more bytes of UTF-8, excluding \n and \0, followed by - - - Zero or more \0 bytes as required to pad the packet to PACKET_SIZE - -Originally from the UEDIN team of the ELITR project. -""" - -PACKET_SIZE = 65536 - - -def send_one_line(socket, text, pad_zeros=False): - """Sends a line of text over the given socket. - - The 'text' argument should contain a single line of text (line break - characters are optional). Line boundaries are determined by Python's - str.splitlines() function [1]. We also count '\0' as a line terminator. - If 'text' contains multiple lines then only the first will be sent. - - If the send fails then an exception will be raised. - - [1] https://docs.python.org/3.5/library/stdtypes.html#str.splitlines - - Args: - socket: a socket object. - text: string containing a line of text for transmission. - """ - text.replace('\0', '\n') - lines = text.splitlines() - first_line = '' if len(lines) == 0 else lines[0] - # TODO Is there a better way of handling bad input than 'replace'? - data = first_line.encode('utf-8', errors='replace') + b'\n' + (b'\0' if pad_zeros else b'') - for offset in range(0, len(data), PACKET_SIZE): - bytes_remaining = len(data) - offset - if bytes_remaining < PACKET_SIZE: - padding_length = PACKET_SIZE - bytes_remaining - packet = data[offset:] + (b'\0' * padding_length if pad_zeros else b'') - else: - packet = data[offset:offset+PACKET_SIZE] - socket.sendall(packet) - - -def receive_one_line(socket): - """Receives a line of text from the given socket. - - This function will (attempt to) receive a single line of text. If data is - currently unavailable then it will block until data becomes available or - the sender has closed the connection (in which case it will return an - empty string). - - The string should not contain any newline characters, but if it does then - only the first line will be returned. - - Args: - socket: a socket object. - - Returns: - A string representing a single line with a terminating newline or - None if the connection has been closed. - """ - data = b'' - while True: - packet = socket.recv(PACKET_SIZE) - if not packet: # Connection has been closed. - return None - data += packet - if b'\0' in packet: - break - # TODO Is there a better way of handling bad input than 'replace'? - text = data.decode('utf-8', errors='replace').strip('\0') - lines = text.split('\n') - return lines[0] + '\n' - - -def receive_lines(socket): - try: - data = socket.recv(PACKET_SIZE) - except BlockingIOError: - return [] - if data is None: # Connection has been closed. - return None - # TODO Is there a better way of handling bad input than 'replace'? - text = data.decode('utf-8', errors='replace').strip('\0') - lines = text.split('\n') - if len(lines)==1 and not lines[0]: - return None - return lines diff --git a/whisper_online_server.py b/whisper_online_server.py deleted file mode 100644 index 3892329..0000000 --- a/whisper_online_server.py +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env python3 -from whisper_online import * - -import sys -import argparse -import os -import logging -import numpy as np - -logger = logging.getLogger(__name__) -parser = argparse.ArgumentParser() - -# server options -parser.add_argument("--host", type=str, default='localhost') -parser.add_argument("--port", type=int, default=43007) -parser.add_argument("--warmup-file", type=str, dest="warmup_file", - help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .") - -# options from whisper_online -add_shared_args(parser) -args = parser.parse_args() - -set_logging(args,logger,other="") - -# setting whisper object by args - -SAMPLING_RATE = 16000 - -size = args.model -language = args.lan -asr, online = asr_factory(args) -min_chunk = args.min_chunk_size - -# warm up the ASR because the very first transcribe takes more time than the others. -# Test results in https://github.com/ufal/whisper_streaming/pull/81 -msg = "Whisper is not warmed up. The first chunk processing may take longer." -if args.warmup_file: - if os.path.isfile(args.warmup_file): - a = load_audio_chunk(args.warmup_file,0,1) - asr.transcribe(a) - logger.info("Whisper is warmed up.") - else: - logger.critical("The warm up file is not available. "+msg) - sys.exit(1) -else: - logger.warning(msg) - - -######### Server objects - -import line_packet -import socket - -class Connection: - '''it wraps conn object''' - PACKET_SIZE = 32000*5*60 # 5 minutes # was: 65536 - - def __init__(self, conn): - self.conn = conn - self.last_line = "" - - self.conn.setblocking(True) - - def send(self, line): - '''it doesn't send the same line twice, because it was problematic in online-text-flow-events''' - if line == self.last_line: - return - line_packet.send_one_line(self.conn, line) - self.last_line = line - - def receive_lines(self): - in_line = line_packet.receive_lines(self.conn) - return in_line - - def non_blocking_receive_audio(self): - try: - r = self.conn.recv(self.PACKET_SIZE) - return r - except ConnectionResetError: - return None - - -import io -import soundfile - -# wraps socket and ASR object, and serves one client connection. -# next client should be served by a new instance of this object -class ServerProcessor: - - def __init__(self, c, online_asr_proc, min_chunk): - self.connection = c - self.online_asr_proc = online_asr_proc - self.min_chunk = min_chunk - - self.last_end = None - - self.is_first = True - - def receive_audio_chunk(self): - # receive all audio that is available by this time - # blocks operation if less than self.min_chunk seconds is available - # unblocks if connection is closed or a chunk is available - out = [] - minlimit = self.min_chunk*SAMPLING_RATE - while sum(len(x) for x in out) < minlimit: - raw_bytes = self.connection.non_blocking_receive_audio() - if not raw_bytes: - break -# print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10]) - sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW") - audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32) - out.append(audio) - if not out: - return None - conc = np.concatenate(out) - if self.is_first and len(conc) < minlimit: - return None - self.is_first = False - return np.concatenate(out) - - def format_output_transcript(self,o): - # output format in stdout is like: - # 0 1720 Takhle to je - # - the first two words are: - # - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway - # - the next words: segment transcript - - # This function differs from whisper_online.output_transcript in the following: - # succeeding [beg,end] intervals are not overlapping because ELITR protocol (implemented in online-text-flow events) requires it. - # Therefore, beg, is max of previous end and current beg outputed by Whisper. - # Usually it differs negligibly, by appx 20 ms. - - if o[0] is not None: - beg, end = o[0]*1000,o[1]*1000 - if self.last_end is not None: - beg = max(beg, self.last_end) - - self.last_end = end - print("%1.0f %1.0f %s" % (beg,end,o[2]),flush=True,file=sys.stderr) - return "%1.0f %1.0f %s" % (beg,end,o[2]) - else: - logger.debug("No text in this segment") - return None - - def send_result(self, o): - msg = self.format_output_transcript(o) - if msg is not None: - self.connection.send(msg) - - def process(self): - # handle one client connection - self.online_asr_proc.init() - while True: - a = self.receive_audio_chunk() - if a is None: - break - self.online_asr_proc.insert_audio_chunk(a) - o = online.process_iter() - try: - self.send_result(o) - except BrokenPipeError: - logger.info("broken pipe -- connection closed?") - break - -# o = online.finish() # this should be working -# self.send_result(o) - - - -# server loop - -with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind((args.host, args.port)) - s.listen(1) - logger.info('Listening on'+str((args.host, args.port))) - while True: - conn, addr = s.accept() - logger.info('Connected to client on {}'.format(addr)) - connection = Connection(conn) - proc = ServerProcessor(connection, online, args.min_chunk_size) - proc.process() - conn.close() - logger.info('Connection to client closed') -logger.info('Connection closed, terminating.')