unfork project, indicate files from whisper streaming

2026-03-07 22:33:36 +00:00 · 2024-12-19 12:01:07 +01:00
parent baa01728be
commit 0553b75415
3 changed files with 5 additions and 277 deletions
--- a/README.md
+++ b/README.md
@@ -12,6 +12,11 @@ This project extends the [Whisper Streaming](https://github.com/ufal/whisper_str

 ![Demo Screenshot](src/demo.png)

+## Code Origins
+
+This project reuses and extends code from the original Whisper Streaming repository:
+- `whisper_online.py`: Contains code from whisper_streaming, with the addition of the **MLX Whisper** backend for Apple Silicon, which is not present in the original repository.
+- `silero_vad_iterator.py`: Originally from the Silero VAD repository and included in the whisper_streaming project.

 ## Installation

--- a/line_packet.py
+++ b/line_packet.py
@@ -1,93 +0,0 @@
-#!/usr/bin/env python3
-
-"""Functions for sending and receiving individual lines of text over a socket.
-
-A line is transmitted using one or more fixed-size packets of UTF-8 bytes
-containing:
-
-  - Zero or more bytes of UTF-8, excluding \n and \0, followed by
-
-  - Zero or more \0 bytes as required to pad the packet to PACKET_SIZE
-
-Originally from the UEDIN team of the ELITR project. 
-"""
-
-PACKET_SIZE = 65536
-
-
-def send_one_line(socket, text, pad_zeros=False):
-    """Sends a line of text over the given socket.
-
-    The 'text' argument should contain a single line of text (line break
-    characters are optional). Line boundaries are determined by Python's
-    str.splitlines() function [1]. We also count '\0' as a line terminator.
-    If 'text' contains multiple lines then only the first will be sent.
-
-    If the send fails then an exception will be raised.
-
-    [1] https://docs.python.org/3.5/library/stdtypes.html#str.splitlines
-
-    Args:
-        socket: a socket object.
-        text: string containing a line of text for transmission.
-    """
-    text.replace('\0', '\n')
-    lines = text.splitlines()
-    first_line = '' if len(lines) == 0 else lines[0]
-    # TODO Is there a better way of handling bad input than 'replace'?
-    data = first_line.encode('utf-8', errors='replace') + b'\n' + (b'\0' if pad_zeros else b'')
-    for offset in range(0, len(data), PACKET_SIZE):
-        bytes_remaining = len(data) - offset
-        if bytes_remaining < PACKET_SIZE:
-            padding_length = PACKET_SIZE - bytes_remaining
-            packet = data[offset:] + (b'\0' * padding_length if pad_zeros else b'')
-        else:
-            packet = data[offset:offset+PACKET_SIZE]
-        socket.sendall(packet)
-
-
-def receive_one_line(socket):
-    """Receives a line of text from the given socket.
-
-    This function will (attempt to) receive a single line of text. If data is
-    currently unavailable then it will block until data becomes available or
-    the sender has closed the connection (in which case it will return an
-    empty string).
-
-    The string should not contain any newline characters, but if it does then
-    only the first line will be returned.
-
-    Args:
-        socket: a socket object.
-
-    Returns:
-        A string representing a single line with a terminating newline or
-        None if the connection has been closed.
-    """
-    data = b''
-    while True:
-        packet = socket.recv(PACKET_SIZE)
-        if not packet:  # Connection has been closed.
-            return None
-        data += packet
-        if b'\0' in packet:
-            break
-    # TODO Is there a better way of handling bad input than 'replace'?
-    text = data.decode('utf-8', errors='replace').strip('\0')
-    lines = text.split('\n')
-    return lines[0] + '\n'
-
-
-def receive_lines(socket):
-    try:
-        data = socket.recv(PACKET_SIZE)
-    except BlockingIOError:
-        return []
-    if data is None:  # Connection has been closed.
-        return None
-    # TODO Is there a better way of handling bad input than 'replace'?
-    text = data.decode('utf-8', errors='replace').strip('\0')
-    lines = text.split('\n')
-    if len(lines)==1 and not lines[0]:
-        return None
-    return lines
--- a/whisper_online_server.py
+++ b/whisper_online_server.py
@@ -1,184 +0,0 @@
-#!/usr/bin/env python3
-from whisper_online import *
-
-import sys
-import argparse
-import os
-import logging
-import numpy as np
-
-logger = logging.getLogger(__name__)
-parser = argparse.ArgumentParser()
-
-# server options
-parser.add_argument("--host", type=str, default='localhost')
-parser.add_argument("--port", type=int, default=43007)
-parser.add_argument("--warmup-file", type=str, dest="warmup_file", 
-        help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
-
-# options from whisper_online
-add_shared_args(parser)
-args = parser.parse_args()
-
-set_logging(args,logger,other="")
-
-# setting whisper object by args 
-
-SAMPLING_RATE = 16000
-
-size = args.model
-language = args.lan
-asr, online = asr_factory(args)
-min_chunk = args.min_chunk_size
-
-# warm up the ASR because the very first transcribe takes more time than the others. 
-# Test results in https://github.com/ufal/whisper_streaming/pull/81
-msg = "Whisper is not warmed up. The first chunk processing may take longer."
-if args.warmup_file:
-    if os.path.isfile(args.warmup_file):
-        a = load_audio_chunk(args.warmup_file,0,1)
-        asr.transcribe(a)
-        logger.info("Whisper is warmed up.")
-    else:
-        logger.critical("The warm up file is not available. "+msg)
-        sys.exit(1)
-else:
-    logger.warning(msg)
-
-
-######### Server objects
-
-import line_packet
-import socket
-
-class Connection:
-    '''it wraps conn object'''
-    PACKET_SIZE = 32000*5*60 # 5 minutes # was: 65536
-
-    def __init__(self, conn):
-        self.conn = conn
-        self.last_line = ""
-
-        self.conn.setblocking(True)
-
-    def send(self, line):
-        '''it doesn't send the same line twice, because it was problematic in online-text-flow-events'''
-        if line == self.last_line:
-            return
-        line_packet.send_one_line(self.conn, line)
-        self.last_line = line
-
-    def receive_lines(self):
-        in_line = line_packet.receive_lines(self.conn)
-        return in_line
-
-    def non_blocking_receive_audio(self):
-        try:
-            r = self.conn.recv(self.PACKET_SIZE)
-            return r
-        except ConnectionResetError:
-            return None
-
-
-import io
-import soundfile
-
-# wraps socket and ASR object, and serves one client connection. 
-# next client should be served by a new instance of this object
-class ServerProcessor:
-
-    def __init__(self, c, online_asr_proc, min_chunk):
-        self.connection = c
-        self.online_asr_proc = online_asr_proc
-        self.min_chunk = min_chunk
-
-        self.last_end = None
-
-        self.is_first = True
-
-    def receive_audio_chunk(self):
-        # receive all audio that is available by this time
-        # blocks operation if less than self.min_chunk seconds is available
-        # unblocks if connection is closed or a chunk is available
-        out = []
-        minlimit = self.min_chunk*SAMPLING_RATE
-        while sum(len(x) for x in out) < minlimit:
-            raw_bytes = self.connection.non_blocking_receive_audio()
-            if not raw_bytes:
-                break
-#            print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
-            sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
-            audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32)
-            out.append(audio)
-        if not out:
-            return None
-        conc = np.concatenate(out)
-        if self.is_first and len(conc) < minlimit:
-            return None
-        self.is_first = False
-        return np.concatenate(out)
-
-    def format_output_transcript(self,o):
-        # output format in stdout is like:
-        # 0 1720 Takhle to je
-        # - the first two words are:
-        #    - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
-        # - the next words: segment transcript
-
-        # This function differs from whisper_online.output_transcript in the following:
-        # succeeding [beg,end] intervals are not overlapping because ELITR protocol (implemented in online-text-flow events) requires it.
-        # Therefore, beg, is max of previous end and current beg outputed by Whisper.
-        # Usually it differs negligibly, by appx 20 ms.
-
-        if o[0] is not None:
-            beg, end = o[0]*1000,o[1]*1000
-            if self.last_end is not None:
-                beg = max(beg, self.last_end)
-
-            self.last_end = end
-            print("%1.0f %1.0f %s" % (beg,end,o[2]),flush=True,file=sys.stderr)
-            return "%1.0f %1.0f %s" % (beg,end,o[2])
-        else:
-            logger.debug("No text in this segment")
-            return None
-
-    def send_result(self, o):
-        msg = self.format_output_transcript(o)
-        if msg is not None:
-            self.connection.send(msg)
-
-    def process(self):
-        # handle one client connection
-        self.online_asr_proc.init()
-        while True:
-            a = self.receive_audio_chunk()
-            if a is None:
-                break
-            self.online_asr_proc.insert_audio_chunk(a)
-            o = online.process_iter()
-            try:
-                self.send_result(o)
-            except BrokenPipeError:
-                logger.info("broken pipe -- connection closed?")
-                break
-
-#        o = online.finish()  # this should be working
-#        self.send_result(o)
-
-
-
-# server loop
-
-with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-    s.bind((args.host, args.port))
-    s.listen(1)
-    logger.info('Listening on'+str((args.host, args.port)))
-    while True:
-        conn, addr = s.accept()
-        logger.info('Connected to client on {}'.format(addr))
-        connection = Connection(conn)
-        proc = ServerProcessor(connection, online, args.min_chunk_size)
-        proc.process()
-        conn.close()
-        logger.info('Connection to client closed')
-logger.info('Connection closed, terminating.')