Mirror of https://github.com/QuentinFuxa/WhisperLiveKit.git
Synced 2026-03-07 22:33:36 +00:00

**Compare commits** — 9 commits
- eabd1b199a
- f7644268c1
- 34e8fe260e
- debfefaf3e
- 101ca9ef90
- 94bb05d53e
- 6797b88176
- 46770efd6c
- b23ef3ec3e
**LICENSE** (13 lines changed)
```diff
@@ -1,10 +1,6 @@
 MIT License
 
 Copyright (c) 2025 Quentin Fuxa.
-Based on:
-- The original work by ÚFAL. License: https://github.com/ufal/whisper_streaming/blob/main/LICENSE
-- The work by Snakers4 (silero-vad). License: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
-- The work in Diart by juanmc2005. License: https://github.com/juanmc2005/diart/blob/main/LICENSE
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,8 +22,7 @@ SOFTWARE.
 
 ---
 
-Third-party components included in this software:
-
-- **whisper_streaming** by ÚFAL – MIT License – https://github.com/ufal/whisper_streaming
-- **silero-vad** by Snakers4 – MIT License – https://github.com/snakers4/silero-vad
-- **Diart** by juanmc2005 – MIT License – https://github.com/juanmc2005/diart
+Based on:
+- **whisper_streaming** by ÚFAL – MIT License – https://github.com/ufal/whisper_streaming. The original work by ÚFAL. License: https://github.com/ufal/whisper_streaming/blob/main/LICENSE
+- **silero-vad** by Snakers4 – MIT License – https://github.com/snakers4/silero-vad. The work by Snakers4 (silero-vad). License: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
+- **Diart** by juanmc2005 – MIT License – https://github.com/juanmc2005/diart. The work in Diart by juanmc2005. License: https://github.com/juanmc2005/diart/blob/main/LICENSE
```
**README.md**

```diff
@@ -9,8 +9,8 @@
 <p align="center">
 <a href="https://pypi.org/project/whisperlivekit/"><img alt="PyPI Version" src="https://img.shields.io/pypi/v/whisperlivekit?color=g"></a>
 <a href="https://pepy.tech/project/whisperlivekit"><img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/whisperlivekit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads"></a>
-<a href="https://pypi.org/project/whisperlivekit/"><img alt="Python Versions" src="https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-dark_green"></a>
-<a href="https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/QuentinFuxa/WhisperLiveKit?color=blue"></a>
+<a href="https://pypi.org/project/whisperlivekit/"><img alt="Python Versions" src="https://img.shields.io/badge/python-3.9--3.13-dark_green"></a>
+<a href="https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/License-MIT-dark_green"></a>
 </p>
 
 ## 🚀 Overview
```
**setup.py** (2 lines changed)
```diff
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
 setup(
     name="whisperlivekit",
-    version="0.1.6",
+    version="0.1.7",
     description="Real-time, Fully Local Whisper's Speech-to-Text and Speaker Diarization",
     long_description=open("README.md", "r", encoding="utf-8").read(),
```
**audio_processor.py**

```diff
@@ -83,10 +83,33 @@ class AudioProcessor:
 
     def start_ffmpeg_decoder(self):
         """Start FFmpeg process for WebM to PCM conversion."""
-        return (ffmpeg.input("pipe:0", format="webm")
-                .output("pipe:1", format="s16le", acodec="pcm_s16le",
-                        ac=self.channels, ar=str(self.sample_rate))
-                .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True))
+        try:
+            return (ffmpeg.input("pipe:0", format="webm")
+                    .output("pipe:1", format="s16le", acodec="pcm_s16le",
+                            ac=self.channels, ar=str(self.sample_rate))
+                    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True))
+        except FileNotFoundError:
+            error = """
+            FFmpeg is not installed or not found in your system's PATH.
+            Please install FFmpeg to enable audio processing.
+
+            Installation instructions:
+
+            # Ubuntu/Debian:
+            sudo apt update && sudo apt install ffmpeg
+
+            # macOS (using Homebrew):
+            brew install ffmpeg
+
+            # Windows:
+            # 1. Download the latest static build from https://ffmpeg.org/download.html
+            # 2. Extract the archive (e.g., to C:\\FFmpeg).
+            # 3. Add the 'bin' directory (e.g., C:\\FFmpeg\\bin) to your system's PATH environment variable.
+
+            After installation, please restart the application.
+            """
+            logger.error(error)
+            raise FileNotFoundError(error)
 
     async def restart_ffmpeg(self):
         """Restart the FFmpeg process after failure."""
```
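This wraps the decoder launch so a missing `ffmpeg` binary produces an actionable message instead of a bare traceback: ffmpeg-python only spawns the executable at `run_async`, so an absent binary surfaces as `FileNotFoundError`. A minimal sketch of an equivalent preflight check, assuming only the standard library (the helper name is hypothetical):

```python
import shutil

def assert_ffmpeg_on_path() -> None:
    """Hypothetical preflight: fail fast before any audio arrives."""
    if shutil.which("ffmpeg") is None:
        raise FileNotFoundError(
            "FFmpeg is not installed or not on PATH. Install it, e.g. "
            "`sudo apt install ffmpeg` (Debian/Ubuntu) or `brew install ffmpeg` "
            "(macOS), then restart the application."
        )
```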
```diff
@@ -269,6 +292,7 @@ class AudioProcessor:
         """Process audio chunks for transcription."""
         self.full_transcription = ""
         self.sep = self.online.asr.sep
+        cumulative_pcm_duration_stream_time = 0.0
 
         while True:
             try:
@@ -292,25 +316,38 @@
             )
 
             # Process transcription
-            self.online.insert_audio_chunk(pcm_array)
-            new_tokens = self.online.process_iter()
+            duration_this_chunk = len(pcm_array) / self.sample_rate if isinstance(pcm_array, np.ndarray) else 0
+            cumulative_pcm_duration_stream_time += duration_this_chunk
+            stream_time_end_of_current_pcm = cumulative_pcm_duration_stream_time
+
+            self.online.insert_audio_chunk(pcm_array, stream_time_end_of_current_pcm)
+            new_tokens, current_audio_processed_upto = self.online.process_iter()
 
             if new_tokens:
                 self.full_transcription += self.sep.join([t.text for t in new_tokens])
 
             # Get buffer information
-            _buffer = self.online.get_buffer()
-            buffer = _buffer.text
-            end_buffer = _buffer.end if _buffer.end else (
-                new_tokens[-1].end if new_tokens else 0
-            )
+            _buffer_transcript_obj = self.online.get_buffer()
+            buffer_text = _buffer_transcript_obj.text
+
+            candidate_end_times = [self.end_buffer]
+
+            if new_tokens:
+                candidate_end_times.append(new_tokens[-1].end)
+
+            if _buffer_transcript_obj.end is not None:
+                candidate_end_times.append(_buffer_transcript_obj.end)
+
+            candidate_end_times.append(current_audio_processed_upto)
+
+            new_end_buffer = max(candidate_end_times)
 
             # Avoid duplicating content
-            if buffer in self.full_transcription:
-                buffer = ""
+            if buffer_text in self.full_transcription:
+                buffer_text = ""
 
             await self.update_transcription(
-                new_tokens, buffer, end_buffer, self.full_transcription, self.sep
+                new_tokens, buffer_text, new_end_buffer, self.full_transcription, self.sep
             )
             self.transcription_queue.task_done()
```
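Two changes land together here: each PCM chunk's duration is accumulated into an absolute stream clock that is handed to `insert_audio_chunk`, and the displayed end time becomes a monotonic maximum over every available clock rather than a single fallback chain. A minimal sketch of that end-time rule (function and argument names are hypothetical):

```python
def advance_end_buffer(prev_end_buffer, new_tokens, buffer_end, processed_upto):
    """The end time shown to the client only ever moves forward.

    Candidates: the previous value, the last committed token's end, the
    uncommitted buffer's end (when known), and how far into the stream the
    ASR has actually consumed audio -- the max wins.
    """
    candidates = [prev_end_buffer, processed_upto]
    if new_tokens:
        candidates.append(new_tokens[-1].end)
    if buffer_end is not None:
        candidates.append(buffer_end)
    return max(candidates)
```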
```diff
@@ -416,31 +453,38 @@ class AudioProcessor:
                 await self.update_diarization(end_attributed_speaker, combined)
                 buffer_diarization = combined
 
             # Create response object
-            if not lines:
-                lines = [{
+            response_status = "active_transcription"
+            final_lines_for_response = lines.copy()
+
+            if not tokens and not buffer_transcription and not buffer_diarization:
+                response_status = "no_audio_detected"
+                final_lines_for_response = []
+            elif response_status == "active_transcription" and not final_lines_for_response:
+                final_lines_for_response = [{
                     "speaker": 1,
                     "text": "",
-                    "beg": format_time(0),
-                    "end": format_time(tokens[-1].end if tokens else 0),
+                    "beg": format_time(state.get("end_buffer", 0)),
+                    "end": format_time(state.get("end_buffer", 0)),
                     "diff": 0
                 }]
 
             response = {
-                "lines": lines,
+                "status": response_status,
+                "lines": final_lines_for_response,
                 "buffer_transcription": buffer_transcription,
                 "buffer_diarization": buffer_diarization,
                 "remaining_time_transcription": state["remaining_time_transcription"],
                 "remaining_time_diarization": state["remaining_time_diarization"]
             }
 
             # Only yield if content has changed
-            response_content = ' '.join([f"{line['speaker']} {line['text']}" for line in lines]) + \
-                               f" | {buffer_transcription} | {buffer_diarization}"
+            current_response_signature = f"{response_status} | " + \
+                ' '.join([f"{line['speaker']} {line['text']}" for line in final_lines_for_response]) + \
+                f" | {buffer_transcription} | {buffer_diarization}"
 
-            if response_content != self.last_response_content and (lines or buffer_transcription or buffer_diarization):
+            if current_response_signature != self.last_response_content and \
+               (final_lines_for_response or buffer_transcription or buffer_diarization or response_status == "no_audio_detected"):
                 yield response
-                self.last_response_content = response_content
+                self.last_response_content = current_response_signature
 
             # Check for termination condition
             if self.is_stopping:
```
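The response gains an explicit `status` field, and the change-detection key folds it in so that entering `no_audio_detected` is itself an update worth pushing. A condensed sketch of the signature-based dedup (helper names hypothetical):

```python
def response_signature(status: str, lines: list, buffer_transcription: str,
                       buffer_diarization: str) -> str:
    """Cheap comparison key: any visible change alters the string."""
    rendered = ' '.join(f"{line['speaker']} {line['text']}" for line in lines)
    return f"{status} | {rendered} | {buffer_transcription} | {buffer_diarization}"

last_signature = None

def should_emit(response: dict) -> bool:
    """Emit only when the rendered content (or status) actually moved."""
    global last_signature
    sig = response_signature(response["status"], response["lines"],
                             response["buffer_transcription"],
                             response["buffer_diarization"])
    if sig == last_signature:
        return False
    last_signature = sig
    return True
```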
**live_transcription.html** (embedded frontend JavaScript)

```diff
@@ -427,7 +427,8 @@
                 buffer_transcription = "",
                 buffer_diarization = "",
                 remaining_time_transcription = 0,
-                remaining_time_diarization = 0
+                remaining_time_diarization = 0,
+                status = "active_transcription"
             } = data;
 
             renderLinesWithBuffer(
@@ -436,13 +437,19 @@
                 buffer_transcription,
                 remaining_time_diarization,
                 remaining_time_transcription,
-                false // isFinalizing = false for normal updates
+                false,
+                status
             );
         };
     });
 }
 
-function renderLinesWithBuffer(lines, buffer_diarization, buffer_transcription, remaining_time_diarization, remaining_time_transcription, isFinalizing = false) {
+function renderLinesWithBuffer(lines, buffer_diarization, buffer_transcription, remaining_time_diarization, remaining_time_transcription, isFinalizing = false, current_status = "active_transcription") {
+    if (current_status === "no_audio_detected") {
+        linesTranscriptDiv.innerHTML = "<p style='text-align: center; color: #666; margin-top: 20px;'><em>No audio detected...</em></p>";
+        return;
+    }
+
     const linesHtml = lines.map((item, idx) => {
         let timeInfo = "";
         if (item.beg !== undefined && item.end !== undefined) {
```
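For reference, a sketch of the websocket message the frontend destructures above; the field names come from this diff, the values are invented for illustration:

```python
# Illustrative payload only: field names from the diff, values invented.
example_message = {
    "status": "active_transcription",   # or "no_audio_detected"
    "lines": [
        {"speaker": 1, "text": "hello world", "beg": "0:00:00", "end": "0:00:02", "diff": 0},
    ],
    "buffer_transcription": "not yet committed text",
    "buffer_diarization": "",
    "remaining_time_transcription": 0.4,
    "remaining_time_diarization": 0.0,
}
```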
**whisper_online.py**

```diff
@@ -144,7 +144,11 @@ class OnlineASRProcessor:
         self.transcript_buffer.last_committed_time = self.buffer_time_offset
         self.committed: List[ASRToken] = []
 
-    def insert_audio_chunk(self, audio: np.ndarray):
+    def get_audio_buffer_end_time(self) -> float:
+        """Returns the absolute end time of the current audio_buffer."""
+        return self.buffer_time_offset + (len(self.audio_buffer) / self.SAMPLING_RATE)
+
+    def insert_audio_chunk(self, audio: np.ndarray, audio_stream_end_time: Optional[float] = None):
         """Append an audio chunk (a numpy array) to the current audio buffer."""
         self.audio_buffer = np.append(self.audio_buffer, audio)
```
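`get_audio_buffer_end_time` converts buffer length to absolute stream time: the offset of audio already trimmed away plus the seconds still buffered. A toy check of the arithmetic, with made-up numbers:

```python
import numpy as np

SAMPLING_RATE = 16000          # Whisper's expected sample rate
buffer_time_offset = 12.5      # hypothetical: seconds already trimmed/committed
audio_buffer = np.zeros(8000)  # 0.5 s of samples still buffered

end_time = buffer_time_offset + len(audio_buffer) / SAMPLING_RATE
assert end_time == 13.0        # absolute stream time of the buffer's end
```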
```diff
@@ -179,18 +183,19 @@ class OnlineASRProcessor:
         return self.concatenate_tokens(self.transcript_buffer.buffer)
 
-    def process_iter(self) -> Transcript:
+    def process_iter(self) -> Tuple[List[ASRToken], float]:
         """
         Processes the current audio buffer.
 
-        Returns a Transcript object representing the committed transcript.
+        Returns a tuple: (list of committed ASRToken objects, float representing the audio processed up to time).
         """
+        current_audio_processed_upto = self.get_audio_buffer_end_time()
         prompt_text, _ = self.prompt()
         logger.debug(
             f"Transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:.2f} seconds from {self.buffer_time_offset:.2f}"
         )
         res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt_text)
-        tokens = self.asr.ts_words(res)  # Expecting List[ASRToken]
+        tokens = self.asr.ts_words(res)
         self.transcript_buffer.insert(tokens, self.buffer_time_offset)
         committed_tokens = self.transcript_buffer.flush()
         self.committed.extend(committed_tokens)
@@ -210,7 +215,7 @@ class OnlineASRProcessor:
         logger.debug(
             f"Length of audio buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:.2f} seconds"
         )
-        return committed_tokens
+        return committed_tokens, current_audio_processed_upto
 
     def chunk_completed_sentence(self):
         """
```
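With the new signature, every call site unpacks a `(tokens, processed_upto)` pair instead of a `Transcript`. A hypothetical caller, using only the token attributes (`.text`, `.end`) that appear elsewhere in this diff:

```python
# Hypothetical driver loop for an OnlineASRProcessor instance `online`.
committed_tokens, processed_upto = online.process_iter()
if committed_tokens:
    text = online.asr.sep.join(t.text for t in committed_tokens)
    print(f"committed until {committed_tokens[-1].end:.2f}s: {text}")
# processed_upto advances even when nothing was committed this iteration,
# which is what keeps the UI clock moving during pauses.
print(f"audio consumed up to {processed_upto:.2f}s")
```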
```diff
@@ -343,15 +348,17 @@ class OnlineASRProcessor:
             )
             sentences.append(sentence)
         return sentences
-    def finish(self) -> Transcript:
+
+    def finish(self) -> Tuple[List[ASRToken], float]:
         """
         Flush the remaining transcript when processing ends.
+        Returns a tuple: (list of remaining ASRToken objects, float representing the final audio processed up to time).
         """
         remaining_tokens = self.transcript_buffer.buffer
-        final_transcript = self.concatenate_tokens(remaining_tokens)
-        logger.debug(f"Final non-committed transcript: {final_transcript}")
-        self.buffer_time_offset += len(self.audio_buffer) / self.SAMPLING_RATE
-        return final_transcript
+        logger.debug(f"Final non-committed tokens: {remaining_tokens}")
+        final_processed_upto = self.buffer_time_offset + (len(self.audio_buffer) / self.SAMPLING_RATE)
+        self.buffer_time_offset = final_processed_upto
+        return remaining_tokens, final_processed_upto
 
     def concatenate_tokens(
         self,
```
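`finish` now returns the raw uncommitted tokens and pins `buffer_time_offset` to the end-of-stream time, so the processor's clock stays consistent if it is reused. A hypothetical end-of-stream call site:

```python
# Hypothetical: flushing the tail once the input stream closes.
remaining_tokens, final_upto = online.finish()
if remaining_tokens:
    tail = online.asr.sep.join(t.text for t in remaining_tokens)
    print(f"flushed tail up to {final_upto:.2f}s: {tail}")
```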
```diff
@@ -384,7 +391,8 @@ class VACOnlineASRProcessor:
     def __init__(self, online_chunk_size: float, *args, **kwargs):
         self.online_chunk_size = online_chunk_size
         self.online = OnlineASRProcessor(*args, **kwargs)
+        self.asr = self.online.asr
 
         # Load a VAD model (e.g. Silero VAD)
         import torch
         model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
```
```diff
@@ -392,28 +400,35 @@
 
         self.vac = FixedVADIterator(model)
         self.logfile = self.online.logfile
+        self.last_input_audio_stream_end_time: float = 0.0
         self.init()
 
     def init(self):
         self.online.init()
         self.vac.reset_states()
         self.current_online_chunk_buffer_size = 0
+        self.last_input_audio_stream_end_time = self.online.buffer_time_offset
         self.is_currently_final = False
         self.status: Optional[str] = None  # "voice" or "nonvoice"
         self.audio_buffer = np.array([], dtype=np.float32)
         self.buffer_offset = 0  # in frames
 
+    def get_audio_buffer_end_time(self) -> float:
+        """Returns the absolute end time of the audio processed by the underlying OnlineASRProcessor."""
+        return self.online.get_audio_buffer_end_time()
+
     def clear_buffer(self):
         self.buffer_offset += len(self.audio_buffer)
         self.audio_buffer = np.array([], dtype=np.float32)
 
-    def insert_audio_chunk(self, audio: np.ndarray):
+    def insert_audio_chunk(self, audio: np.ndarray, audio_stream_end_time: float):
         """
         Process an incoming small audio chunk:
         - run VAD on the chunk,
         - decide whether to send the audio to the online ASR processor immediately,
         - and/or to mark the current utterance as finished.
         """
+        self.last_input_audio_stream_end_time = audio_stream_end_time
         res = self.vac(audio)
         self.audio_buffer = np.append(self.audio_buffer, audio)
```
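Recording `last_input_audio_stream_end_time` on every chunk is what lets the silent path report progress: during non-voice stretches no ASR call happens, yet the clock returned to the caller must still advance. A toy illustration (names hypothetical):

```python
def vad_step(tokens_from_asr, chunk_end_time, is_voice):
    """Sketch: even a silent chunk advances the reported stream time."""
    if not is_voice:
        return [], chunk_end_time   # no tokens, but the clock still moves
    return tokens_from_asr, chunk_end_time
```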
```diff
@@ -455,10 +470,11 @@ class VACOnlineASRProcessor:
             self.buffer_offset += max(0, len(self.audio_buffer) - self.SAMPLING_RATE)
             self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]
 
-    def process_iter(self) -> Transcript:
+    def process_iter(self) -> Tuple[List[ASRToken], float]:
         """
         Depending on the VAD status and the amount of accumulated audio,
         process the current audio chunk.
+        Returns a tuple: (list of committed ASRToken objects, float representing the audio processed up to time).
         """
         if self.is_currently_final:
             return self.finish()
@@ -467,17 +483,20 @@ class VACOnlineASRProcessor:
             return self.online.process_iter()
         else:
             logger.debug("No online update, only VAD")
-            return Transcript(None, None, "")
+            return [], self.last_input_audio_stream_end_time
 
-    def finish(self) -> Transcript:
-        """Finish processing by flushing any remaining text."""
-        result = self.online.finish()
+    def finish(self) -> Tuple[List[ASRToken], float]:
+        """
+        Finish processing by flushing any remaining text.
+        Returns a tuple: (list of remaining ASRToken objects, float representing the final audio processed up to time).
+        """
+        result_tokens, processed_upto = self.online.finish()
         self.current_online_chunk_buffer_size = 0
         self.is_currently_final = False
-        return result
+        return result_tokens, processed_upto
 
     def get_buffer(self):
         """
         Get the unvalidated buffer in string format.
         """
-        return self.online.concatenate_tokens(self.online.transcript_buffer.buffer).text
+        return self.online.concatenate_tokens(self.online.transcript_buffer.buffer)
```
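`get_buffer` now hands back the whole `Transcript` object instead of its `.text`, which is what allows `AudioProcessor` to read both the text and the end time of the uncommitted buffer (see the `candidate_end_times` hunk above). A hypothetical call site after the change (`vac_online` is an assumed instance name):

```python
buffer_transcript = vac_online.get_buffer()    # Transcript, no longer str
print(buffer_transcript.text)                  # uncommitted text, as before
if buffer_transcript.end is not None:          # end time is now available too
    print(f"buffer reaches {buffer_transcript.end:.2f}s")
```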