diff --git a/README.md b/README.md index 1670ba9..b0c7916 100644 --- a/README.md +++ b/README.md @@ -208,6 +208,51 @@ arecord -f S16_LE -c1 -r 16000 -t raw -D default | nc localhost 43001 - nc is netcat with server's host and port +## Live Transcription Web Interface + +This repository also includes a **FastAPI server** and an **HTML/JavaScript client** for quick testing of live speech transcription in the browser. The client uses native WebSockets and the `MediaRecorder` API to capture microphone audio in **WebM** format and send it to the server—**no additional front-end framework** is required. + +![Demo Screenshot](src/demo.png) + +### How to Launch the Server + +1. **Install Dependencies**: + + ```bash + pip install -r requirements.txt + ``` + +2. **Run the FastAPI Server**: + + ```bash + python whisper_fastapi_online_server.py --host 0.0.0.0 --port 8000 + ``` + + - `--host` and `--port` let you specify the server’s IP/port. + +3. **Open the Provided HTML**: + + - By default, the server root endpoint `/` serves a simple `live_transcription.html` page. + - Open your browser at `http://localhost:8000` (or replace `localhost` and `8000` with whatever you specified). + - The page uses vanilla JavaScript and the WebSocket API to capture your microphone and stream audio to the server in real time. + +### How the Live Interface Works + +- Once you **allow microphone access**, the page records small chunks of audio using the **MediaRecorder** API in **webm/opus** format. +- These chunks are sent over a **WebSocket** to the FastAPI endpoint at `/ws`. +- The Python server decodes `.webm` chunks on the fly using **FFmpeg** and streams them into **Whisper** for transcription. +- **Partial transcription** appears as soon as enough audio is processed. The “unvalidated” text is shown in **lighter or grey color** (i.e., an ‘aperçu’) to indicate it’s still buffered partial output. Once Whisper finalizes that segment, it’s displayed in normal text. 
+- You can watch the transcription update in near real time, ideal for demos, prototyping, or quick debugging. + +### Deploying to a Remote Server + +If you want to **deploy** this setup: + +1. **Host the FastAPI app** behind a production-grade HTTP server (like **Uvicorn + Nginx** or Docker). +2. The **HTML/JS page** can be served by the same FastAPI app or a separate static host. +3. Users open the page in **Chrome/Firefox** (any modern browser that supports MediaRecorder + WebSocket). + +No additional front-end libraries or frameworks are required. The WebSocket logic in `live_transcription.html` is minimal enough to adapt for your own custom UI or embed in other pages. ## Background diff --git a/src/demo.png b/src/demo.png new file mode 100644 index 0000000..1c58a4d Binary files /dev/null and b/src/demo.png differ diff --git a/src/live_transcription.html b/src/live_transcription.html new file mode 100644 index 0000000..8d215ae --- /dev/null +++ b/src/live_transcription.html @@ -0,0 +1,111 @@ + + + + + + Audio Transcription + + + +

Click to start transcription

+ +
"""FastAPI server for live Whisper transcription over a WebSocket.

File: whisper_fastapi_online_server.py

The browser client streams WebM audio chunks to the ``/ws`` endpoint. Each
connection pipes those chunks through an FFmpeg subprocess that emits raw
16 kHz mono s16le PCM, which is fed to the online Whisper processor. After
every processed chunk the committed text and the still-buffered text are
sent back to the client as JSON.
"""
import io
import argparse
import asyncio
import contextlib
import functools
import subprocess
from pathlib import Path

import numpy as np
import ffmpeg

from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware

from whisper_online import asr_factory, add_shared_args

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Argument parsing
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default='localhost')
parser.add_argument("--port", type=int, default=8000)
# NOTE(review): nothing in this file reads args.warmup_file; confirm whether
# asr_factory(args) consumes it, otherwise the option has no effect.
parser.add_argument("--warmup-file", type=str, dest="warmup_file",
        help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
add_shared_args(parser)
args = parser.parse_args()

# Initialize Whisper
# NOTE(review): `online` is a single module-level object used by every
# WebSocket connection, so concurrent clients feed the same transcription
# state. Confirm that single-client use is the intended contract.
asr, online = asr_factory(args)


def _load_demo_html(filename="live_transcription.html"):
    """Return the demo page's HTML text.

    The file is looked up, in order, relative to the current working
    directory (the original behavior), next to this script, and in a
    ``src/`` directory next to this script, so the server also starts when
    launched from the repository root or from another directory.

    Raises:
        FileNotFoundError: if the page is in none of those locations.
    """
    script_dir = Path(__file__).resolve().parent
    candidates = (
        Path(filename),
        script_dir / filename,
        script_dir / "src" / filename,
    )
    for candidate in candidates:
        if candidate.is_file():
            return candidate.read_text(encoding="utf-8")
    searched = ", ".join(str(c) for c in candidates)
    raise FileNotFoundError(f"{filename} not found (searched: {searched})")


# Load demo HTML for the root endpoint
html = _load_demo_html()


@app.get("/")
async def get():
    """Serve the demo transcription page."""
    return HTMLResponse(html)


# Streaming constants
SAMPLE_RATE = 16000
CHANNELS = 1
# Despite their names, SAMPLES_PER_SEC / BYTES_PER_SEC describe one
# *processing chunk* of `--min-chunk-size` seconds, not one second.
# The product is truncated only after multiplying (so fractional chunk sizes
# are honored) and clamped to at least one sample: a value of 0 would make
# the `len(pcm_buffer) >= BYTES_PER_SEC` loop below spin forever.
SAMPLES_PER_SEC = max(1, int(SAMPLE_RATE * float(args.min_chunk_size)))
BYTES_PER_SAMPLE = 2  # s16le = 2 bytes per sample
BYTES_PER_SEC = SAMPLES_PER_SEC * BYTES_PER_SAMPLE


async def start_ffmpeg_decoder():
    """Start an FFmpeg subprocess that decodes WebM to raw PCM.

    FFmpeg reads WebM from its stdin and writes s16le PCM (``CHANNELS``
    channel(s) at ``SAMPLE_RATE`` Hz) to its stdout.

    stderr is deliberately NOT piped: nothing here reads it, and an
    undrained pipe eventually fills up and blocks FFmpeg. Logging is
    reduced to errors, which go to the server's own stderr.

    Returns:
        The running process object, with ``stdin`` and ``stdout`` pipes.
    """
    process = (
        ffmpeg
        .input('pipe:0', format='webm')
        .output('pipe:1', format='s16le', acodec='pcm_s16le', ac=CHANNELS, ar=str(SAMPLE_RATE))
        .global_args('-hide_banner', '-loglevel', 'error')
        .run_async(pipe_stdin=True, pipe_stdout=True)
    )
    return process


@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Handle one live-transcription WebSocket session.

    Incoming binary messages (WebM audio) are written to a per-connection
    FFmpeg decoder. A background task reads the decoded PCM, feeds it to the
    Whisper online processor in chunks of ``BYTES_PER_SEC`` bytes, and sends
    ``{"transcription": ..., "buffer": ...}`` JSON messages to the client.
    The decoder and the background task are torn down when the client
    disconnects or an error occurs.
    """
    await websocket.accept()
    print("WebSocket connection opened.")

    loop = asyncio.get_running_loop()
    ffmpeg_process = await start_ffmpeg_decoder()
    pcm_buffer = bytearray()

    # Continuously read decoded PCM from ffmpeg stdout in a background task
    async def ffmpeg_stdout_reader():
        while True:
            try:
                # The pipe read blocks, so it runs in the default executor.
                chunk = await loop.run_in_executor(None, ffmpeg_process.stdout.read, 4096)
                if not chunk:  # FFmpeg might have closed
                    print("FFmpeg stdout closed.")
                    break

                pcm_buffer.extend(chunk)

                # Process in batches of one min-chunk-size worth of audio
                while len(pcm_buffer) >= BYTES_PER_SEC:
                    pcm_chunk = bytes(pcm_buffer[:BYTES_PER_SEC])
                    del pcm_buffer[:BYTES_PER_SEC]

                    # Convert int16 -> float32 in [-1.0, 1.0)
                    pcm_array = np.frombuffer(pcm_chunk, dtype=np.int16).astype(np.float32) / 32768.0

                    # Send PCM data to Whisper
                    # NOTE: process_iter() runs on the event loop; while it
                    # runs, no other coroutine in this process makes progress.
                    online.insert_audio_chunk(pcm_array)
                    transcription = online.process_iter()
                    buffer = online.to_flush(online.transcript_buffer.buffer)

                    # Return partial transcription results to the client.
                    # Index 2 is used as the text field of both results.
                    await websocket.send_json({
                        "transcription": transcription[2],
                        "buffer": buffer[2]
                    })
            except Exception as e:
                print(f"Exception in ffmpeg_stdout_reader: {e}")
                break

        print("Exiting ffmpeg_stdout_reader...")

    def feed_ffmpeg(data):
        # Blocking pipe write; always invoked through the executor.
        ffmpeg_process.stdin.write(data)
        ffmpeg_process.stdin.flush()

    stdout_reader_task = asyncio.create_task(ffmpeg_stdout_reader())

    try:
        while True:
            # Receive incoming WebM audio chunks from the client
            message = await websocket.receive_bytes()
            # Pass them to ffmpeg via stdin without blocking the event loop
            # (the write stalls whenever ffmpeg's input pipe is full).
            await loop.run_in_executor(None, feed_ffmpeg, message)

    except WebSocketDisconnect:
        print("WebSocket connection closed.")
    except Exception as e:
        print(f"Error in websocket loop: {e}")
    finally:
        # Clean up ffmpeg and the reader task.
        # 1) Closing stdin signals EOF so ffmpeg can flush and exit.
        with contextlib.suppress(OSError, ValueError):
            ffmpeg_process.stdin.close()

        # 2) Wait for ffmpeg off the event loop; kill it if it does not exit
        #    (e.g. it is blocked writing to a stdout nobody reads anymore).
        try:
            await loop.run_in_executor(
                None, functools.partial(ffmpeg_process.wait, timeout=5)
            )
        except subprocess.TimeoutExpired:
            ffmpeg_process.kill()
            await loop.run_in_executor(None, ffmpeg_process.wait)

        # 3) Stop the reader task and wait until it has actually finished.
        #    gather(return_exceptions=True) absorbs the reader's own
        #    cancellation without hiding a cancellation of this handler.
        stdout_reader_task.cancel()
        await asyncio.gather(stdout_reader_task, return_exceptions=True)

        # 4) Only now close stdout: the process is gone, so no executor
        #    thread remains blocked reading from it.
        with contextlib.suppress(OSError, ValueError):
            ffmpeg_process.stdout.close()


if __name__ == "__main__":
    import uvicorn
    # Pass the app object rather than an import string with reload=True:
    # the reloader re-imports this module in a child process, which would
    # parse the arguments and load the Whisper model a second time.
    uvicorn.run(app, host=args.host, port=args.port)