From 4f87ac3ea4e16d812ce81c55079b20a7a84b4456 Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Mon, 17 Mar 2025 11:46:45 +0100 Subject: [PATCH] Refactor PCM conversion to a dedicated function; immediate chunk addition to the diarization queue --- README.md | 4 ++-- whisper_fastapi_online_server.py | 23 ++++++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 42b7692..7c43451 100644 --- a/README.md +++ b/README.md @@ -30,8 +30,8 @@ This project is based on [Whisper Streaming](https://github.com/ufal/whisper_str 1. **Clone the Repository**: ```bash - git clone https://github.com/QuentinFuxa/whisper_streaming_web - cd whisper_streaming_web + git clone https://github.com/QuentinFuxa/WhisperLiveKit + cd WhisperLiveKit ``` diff --git a/whisper_fastapi_online_server.py b/whisper_fastapi_online_server.py index 7684705..24597bc 100644 --- a/whisper_fastapi_online_server.py +++ b/whisper_fastapi_online_server.py @@ -190,6 +190,16 @@ app.add_middleware( with open("web/live_transcription.html", "r", encoding="utf-8") as f: html = f.read() +def convert_pcm_to_float(pcm_buffer): + """ + Converts a PCM buffer in s16le format to a normalized NumPy array. + Arg: pcm_buffer. PCM buffer containing raw audio data in s16le format + Returns: np.ndarray. NumPy array of float32 type normalized between -1.0 and 1.0 + """ + pcm_array = (np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) + / 32768.0) + return pcm_array + async def start_ffmpeg_decoder(): """ Start an FFmpeg process in async streaming mode that reads WebM from stdin @@ -444,24 +454,23 @@ async def websocket_endpoint(websocket: WebSocket): logger.info("FFmpeg stdout closed.") break pcm_buffer.extend(chunk) + + if args.diarization and diarization_queue: + await diarization_queue.put(convert_pcm_to_float(pcm_buffer).copy()) + if len(pcm_buffer) >= BYTES_PER_SEC: if len(pcm_buffer) > MAX_BYTES_PER_SEC: logger.warning( f"""Audio buffer is too large: {len(pcm_buffer) / BYTES_PER_SEC:.2f} seconds. The model probably struggles to keep up. Consider using a smaller model. """) - # Convert int16 -> float32 - pcm_array = ( - np.frombuffer(pcm_buffer[:MAX_BYTES_PER_SEC], dtype=np.int16).astype(np.float32) - / 32768.0 - ) + + pcm_array = convert_pcm_to_float(pcm_buffer[:MAX_BYTES_PER_SEC]) pcm_buffer = pcm_buffer[MAX_BYTES_PER_SEC:] if args.transcription and transcription_queue: await transcription_queue.put(pcm_array.copy()) - if args.diarization and diarization_queue: - await diarization_queue.put(pcm_array.copy()) if not args.transcription and not args.diarization: await asyncio.sleep(0.1)