0.2.15

2026-03-07 22:33:36 +00:00 · 2025-11-21 23:52:00 +01:00
parent b5067249c0
commit 6206fff118
9 changed files with 75 additions and 20 deletions
--- a/README.md
+++ b/README.md
@@ -141,7 +141,7 @@ async def websocket_endpoint(websocket: WebSocket):
 |-----------|-------------|---------|
 | `--model` | Whisper model size. List and recommendations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/available_models.md) | `small` |
 | `--model-path` | Local .pt file/directory **or** Hugging Face repo ID containing the Whisper model. Overrides `--model`. Recommendations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/models_compatible_formats.md) | `None` |
-| `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/simul_whisper/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` |
+| `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` |
 | `--target-language` | If set, translates using [NLLW](https://github.com/QuentinFuxa/NoLanguageLeftWaiting). [200 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/supported_languages.md). If you want to translate to English, you can also use `--direct-english-translation`. The STT model will try to directly output the translation. | `None` |
 | `--diarization` | Enable speaker identification | `False` |
 | `--backend-policy` | Streaming strategy: `1`/`simulstreaming` uses AlignAtt SimulStreaming, `2`/`localagreement` uses the LocalAgreement policy | `simulstreaming` |
--- a/docs/alignement_principles.md
+++ b/docs/alignement_principles.md
@@ -4,7 +4,7 @@
 - Example 2: The punctuation from STT comes from prediction `t`, but the speaker change from Diarization comes in the prediction `t-1`
 - Example 3: The punctuation from STT comes from prediction `t-1`, but the speaker change from Diarization comes in the prediction `t`

-> `#` Is the split between the `t-1` prediction and t prediction.  
+> `#` Is the split between the `t-1` prediction and `t` prediction.  


 ## Example 1:
--- a/docs/technical_integration.md
+++ b/docs/technical_integration.md
@@ -0,0 +1,43 @@
+# Technical Integration Guide
+
+This document explains how to reuse the core components when you do **not** want to ship the bundled frontend, FastAPI server, or even the provided CLI.
+
+---
+
+## 1. Runtime Components
+
+| Layer | File(s) | Purpose |
+|-------|---------|---------|
+| Transport | `whisperlivekit/basic_server.py`, any ASGI/WebSocket server | Accepts audio over WebSocket (MediaRecorder WebM or raw PCM chunks) and streams JSON updates back |
+| Audio processing | `whisperlivekit/audio_processor.py` | Buffers audio, orchestrates transcription, diarization, translation, handles FFmpeg/PCM input |
+| Engines | `whisperlivekit/core.py`, `whisperlivekit/simul_whisper/*`, `whisperlivekit/local_agreement/*` | Load models once (SimulStreaming or LocalAgreement), expose `TranscriptionEngine` and helpers |
+| Frontends | `whisperlivekit/web/*`, `chrome-extension/*` | Optional UI layers feeding the WebSocket endpoint |
+
+**Key idea:** The server boundary is just `AudioProcessor.process_audio()` for incoming bytes and the async generator returned by `AudioProcessor.create_tasks()` for outgoing updates (`FrontData`). Everything else is optional.
+
+---
+
+## 2. Running Without the Bundled Frontend
+
+1. Start the server/engine however you like:
+   ```bash
+   wlk --model small --language en --host 0.0.0.0 --port 9000
+   # or launch your own app that instantiates TranscriptionEngine(...)
+   ```
+2. Build your own client (browser, mobile, desktop) that:
+   - Opens `ws(s)://<host>:<port>/asr`
+   - Sends either MediaRecorder/Opus WebM blobs **or** raw PCM (`--pcm-input` on the server tells the client to use the AudioWorklet).
+   - Consumes the JSON payload defined in `docs/API.md`.
+
+---
+
+## 3. Running Without FastAPI
+
+`whisperlivekit/basic_server.py` is just an example. Any async framework works, as long as you:
+
+1. Create a global `TranscriptionEngine` (expensive to initialize; reuse it).
+2. Instantiate `AudioProcessor(transcription_engine=engine)` for each connection.
+3. Call `create_tasks()` to get the async generator, `process_audio()` with incoming bytes, and ensure `cleanup()` runs when the client disconnects.
+
+
+If you prefer to send compressed audio, instantiate `AudioProcessor(pcm_input=False)` and pipe encoded chunks through `FFmpegManager` transparently—just ensure `ffmpeg` is available or be ready to handle the `"ffmpeg_not_found"` error in the streamed `FrontData`.
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "whisperlivekit"
-version = "0.2.14.post4"
+version = "0.2.15"
 description = "Real-time speech-to-text with speaker diarization using Whisper"
 readme = "README.md"
 authors = [
--- a/whisperlivekit/local_agreement/backends.py
+++ b/whisperlivekit/local_agreement/backends.py
@@ -224,7 +224,8 @@ class MLXWhisper(ASRBase):
            if segment.get("no_speech_prob", 0) > 0.9:
                continue
            for word in segment.get("words", []):
-                token = ASRToken(word["start"], word["end"], word["word"], probability=word["probability"])
+                probability=word["probability"]
+                token = ASRToken(word["start"], word["end"], word["word"])
                tokens.append(token)
        return tokens

--- a/whisperlivekit/local_agreement/online_asr.py
+++ b/whisperlivekit/local_agreement/online_asr.py
@@ -411,11 +411,11 @@ class OnlineASRProcessor:
    ) -> Transcript:
        sep = sep if sep is not None else self.asr.sep
        text = sep.join(token.text for token in tokens)
-        probability = sum(token.probability for token in tokens if token.probability) / len(tokens) if tokens else None
+        # probability = sum(token.probability for token in tokens if token.probability) / len(tokens) if tokens else None
        if tokens:
            start = offset + tokens[0].start
            end = offset + tokens[-1].end
        else:
            start = None
            end = None
-        return Transcript(start, end, text, probability=probability)
+        return Transcript(start, end, text)
--- a/whisperlivekit/simul_whisper/simul_whisper.py
+++ b/whisperlivekit/simul_whisper/simul_whisper.py
@@ -266,7 +266,7 @@ class AlignAtt:
        logger.debug("Refreshing segment:")
        self.init_tokens()
        self.last_attend_frame = -self.cfg.rewind_threshold       
-        self.detected_language = None
+        # self.detected_language = None
        self.cumulative_time_offset = 0.0
        self.init_context()
        logger.debug(f"Context: {self.context}")
--- a/whisperlivekit/timed_objects.py
+++ b/whisperlivekit/timed_objects.py
@@ -19,8 +19,8 @@ class TimedText(Timed):
    speaker: Optional[int] = -1
    detected_language: Optional[str] = None
    
-    def is_punctuation(self) -> bool:
-        return self.text.strip() in PUNCTUATION_MARKS
+    def has_punctuation(self) -> bool:
+        return any(char in PUNCTUATION_MARKS for char in self.text.strip())
    
    def is_within(self, other: 'TimedText') -> bool:
        return other.contains_timespan(self)
@@ -65,6 +65,7 @@ class Transcript(TimedText):
        sep: Optional[str] = None,
        offset: float = 0
    ) -> "Transcript":
+        """Collapse multiple ASR tokens into a single transcript span."""
        sep = sep if sep is not None else ' '
        text = sep.join(token.text for token in tokens)
        if tokens:
@@ -107,18 +108,19 @@ class Silence():


@dataclass
-class Segment():
+class Segment(TimedText):
+    """Generic contiguous span built from tokens or silence markers."""
    start: Optional[float]
    end: Optional[float]
    text: Optional[str]
    speaker: Optional[str]
-
    @classmethod
    def from_tokens(
        cls,
        tokens: List[Union[ASRToken, Silence]],
        is_silence: bool = False
    ) -> Optional["Segment"]:
+        """Return a normalized segment representing the provided tokens."""
        if not tokens:
            return None
        
@@ -129,16 +131,18 @@ class Segment():
                start=start_token.start,
                end=end_token.end,
                text=None,
-                speaker = -2    
+                speaker=-2
            )
        else:
            return cls(
                start=start_token.start,
                end=end_token.end,
                text=''.join(token.text for token in tokens),
-                speaker = -1
+                speaker=-1,
+                detected_language=start_token.detected_language
            )
    def is_silence(self) -> bool:
+        """True when this segment represents a silence gap."""
        return self.speaker == -2


@@ -147,6 +151,7 @@ class Line(TimedText):
    translation: str = ''
    
    def to_dict(self) -> Dict[str, Any]:
+        """Serialize the line for frontend consumption."""
        _dict: Dict[str, Any] = {
            'speaker': int(self.speaker) if self.speaker != -1 else 1,
            'text': self.text,
@@ -160,17 +165,21 @@ class Line(TimedText):
        return _dict
    
    def build_from_tokens(self, tokens: List[ASRToken]) -> "Line":
+        """Populate line attributes from a contiguous token list."""
        self.text = ''.join([token.text for token in tokens])
        self.start = tokens[0].start
        self.end = tokens[-1].end
        self.speaker = 1
+        self.detected_language = tokens[0].detected_language
        return self

    def build_from_segment(self, segment: Segment) -> "Line":
+        """Populate the line fields from a pre-built segment."""
        self.text = segment.text
        self.start = segment.start
        self.end = segment.end
        self.speaker = segment.speaker
+        self.detected_language = segment.detected_language
        return self

    def is_silent(self) -> bool:
@@ -195,6 +204,7 @@ class FrontData():
    remaining_time_diarization: float = 0.
    
    def to_dict(self) -> Dict[str, Any]:
+        """Serialize the front-end data payload."""
        _dict: Dict[str, Any] = {
            'status': self.status,
            'lines': [line.to_dict() for line in self.lines if (line.text or line.speaker == -2)],
--- a/whisperlivekit/tokens_alignment.py
+++ b/whisperlivekit/tokens_alignment.py
@@ -26,6 +26,7 @@ class TokensAlignment:
        self.beg_loop: Optional[float] = None

    def update(self) -> None:
+        """Drain state buffers into the running alignment context."""
        self.new_tokens, self.state.new_tokens = self.state.new_tokens, []
        self.new_diarization, self.state.new_diarization = self.state.new_diarization, []
        self.new_translation, self.state.new_translation = self.state.new_translation, []
@@ -37,6 +38,7 @@ class TokensAlignment:
        self.new_translation_buffer = self.state.new_translation_buffer

    def add_translation(self, line: Line) -> None:
+        """Append translated text segments that overlap with a line."""
        for ts in self.all_translation_segments:
            if ts.is_within(line):
                line.translation += ts.text + (self.sep if ts.text else '')
@@ -45,6 +47,7 @@ class TokensAlignment:


    def compute_punctuations_segments(self, tokens: Optional[List[ASRToken]] = None) -> List[Segment]:
+        """Group tokens into segments split by punctuation and explicit silence."""
        segments = []
        segment_start_idx = 0
        for i, token in enumerate(self.all_tokens):
@@ -61,7 +64,7 @@ class TokensAlignment:
                segments.append(segment)
                segment_start_idx = i+1
            else:
-                if token.is_punctuation():
+                if token.has_punctuation():
                    segment = Segment.from_tokens(
                        tokens=self.all_tokens[segment_start_idx: i+1],
                    )
@@ -77,6 +80,7 @@ class TokensAlignment:


    def concatenate_diar_segments(self) -> List[SpeakerSegment]:
+        """Merge consecutive diarization slices that share the same speaker."""
        if not self.all_diarization_segments:
            return []
        merged = [self.all_diarization_segments[0]]
@@ -90,15 +94,14 @@ class TokensAlignment:

    @staticmethod
    def intersection_duration(seg1: TimedText, seg2: TimedText) -> float:
+        """Return the overlap duration between two timed segments."""
        start = max(seg1.start, seg2.start)
        end = min(seg1.end, seg2.end)

        return max(0, end - start)

    def get_lines_diarization(self) -> Tuple[List[Line], str]:
-        """
-        use compute_punctuations_segments, concatenate_diar_segments, intersection_duration
-        """
+        """Build lines when diarization is enabled and track overflow buffer."""
        diarization_buffer = ''
        punctuation_segments = self.compute_punctuations_segments()
        diarization_segments = self.concatenate_diar_segments()
@@ -136,9 +139,7 @@ class TokensAlignment:
            translation: bool = False,
            current_silence: Optional[Silence] = None
        ) -> Tuple[List[Line], str, Union[str, TimedText]]:
-        """
-        In the case without diarization
-        """
+        """Return the formatted lines plus buffers, optionally with diarization/translation."""
        if diarization:
            lines, diarization_buffer = self.get_lines_diarization()
        else: