From 6206fff118b9808ded4b036a1ba8a89ebf420c92 Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Fri, 21 Nov 2025 23:52:00 +0100 Subject: [PATCH] 0.2.15 --- README.md | 2 +- docs/alignement_principles.md | 2 +- docs/technical_integration.md | 43 +++++++++++++++++++ pyproject.toml | 2 +- whisperlivekit/local_agreement/backends.py | 3 +- whisperlivekit/local_agreement/online_asr.py | 4 +- whisperlivekit/simul_whisper/simul_whisper.py | 2 +- whisperlivekit/timed_objects.py | 22 +++++++--- whisperlivekit/tokens_alignment.py | 15 ++++--- 9 files changed, 75 insertions(+), 20 deletions(-) create mode 100644 docs/technical_integration.md diff --git a/README.md b/README.md index f5af1d4..5c62a64 100644 --- a/README.md +++ b/README.md @@ -141,7 +141,7 @@ async def websocket_endpoint(websocket: WebSocket): |-----------|-------------|---------| | `--model` | Whisper model size. List and recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/available_models.md) | `small` | | `--model-path` | Local .pt file/directory **or** Hugging Face repo ID containing the Whisper model. Overrides `--model`. Recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/models_compatible_formats.md) | `None` | -| `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/simul_whisper/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` | +| `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` | | `--target-language` | If sets, translates using [NLLW](https://github.com/QuentinFuxa/NoLanguageLeftWaiting). [200 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/supported_languages.md). 
If you want to translate to english, you can also use `--direct-english-translation`. The STT model will try to directly output the translation. | `None` | | `--diarization` | Enable speaker identification | `False` | | `--backend-policy` | Streaming strategy: `1`/`simulstreaming` uses AlignAtt SimulStreaming, `2`/`localagreement` uses the LocalAgreement policy | `simulstreaming` | diff --git a/docs/alignement_principles.md b/docs/alignement_principles.md index 4ce8fae..f187005 100644 --- a/docs/alignement_principles.md +++ b/docs/alignement_principles.md @@ -4,7 +4,7 @@ - Example 2: The punctuation from STT comes from prediction `t`, but the speaker change from Diarization comes in the prediction `t-1` - Example 3: The punctuation from STT comes from prediction `t-1`, but the speaker change from Diarization comes in the prediction `t` -> `#` Is the split between the `t-1` prediction and t prediction. +> `#` Is the split between the `t-1` prediction and `t` prediction. ## Example 1: diff --git a/docs/technical_integration.md b/docs/technical_integration.md new file mode 100644 index 0000000..c8083d2 --- /dev/null +++ b/docs/technical_integration.md @@ -0,0 +1,43 @@ +# Technical Integration Guide + +This document introduces how to reuse the core components when you do **not** want to ship the bundled frontend, FastAPI server, or even the provided CLI. + +--- + +## 1. 
Runtime Components + +| Layer | File(s) | Purpose | +|-------|---------|---------| +| Transport | `whisperlivekit/basic_server.py`, any ASGI/WebSocket server | Accepts audio over WebSocket (MediaRecorder WebM or raw PCM chunks) and streams JSON updates back | +| Audio processing | `whisperlivekit/audio_processor.py` | Buffers audio, orchestrates transcription, diarization, translation, handles FFmpeg/PCM input | +| Engines | `whisperlivekit/core.py`, `whisperlivekit/simul_whisper/*`, `whisperlivekit/local_agreement/*` | Load models once (SimulStreaming or LocalAgreement), expose `TranscriptionEngine` and helpers | +| Frontends | `whisperlivekit/web/*`, `chrome-extension/*` | Optional UI layers feeding the WebSocket endpoint | + +**Key idea:** The server boundary is just `AudioProcessor.process_audio()` for incoming bytes and the async generator returned by `AudioProcessor.create_tasks()` for outgoing updates (`FrontData`). Everything else is optional. + +--- + +## 2. Running Without the Bundled Frontend + +1. Start the server/engine however you like: + ```bash + wlk --model small --language en --host 0.0.0.0 --port 9000 + # or launch your own app that instantiates TranscriptionEngine(...) + ``` +2. Build your own client (browser, mobile, desktop) that: + - Opens `ws(s)://<host>:<port>/asr` + - Sends either MediaRecorder/Opus WebM blobs **or** raw PCM (`--pcm-input` on the server tells the client to use the AudioWorklet). + - Consumes the JSON payload defined in `docs/API.md`. + +--- + +## 3. Running Without FastAPI + +`whisperlivekit/basic_server.py` is just an example. Any async framework works, as long as you: + +1. Create a global `TranscriptionEngine` (expensive to initialize; reuse it). +2. Instantiate `AudioProcessor(transcription_engine=engine)` for each connection. +3. Call `create_tasks()` to get the async generator, `process_audio()` with incoming bytes, and ensure `cleanup()` runs when the client disconnects. 
+ + +If you prefer to send compressed audio, instantiate `AudioProcessor(pcm_input=False)` and pipe encoded chunks through `FFmpegManager` transparently—just ensure `ffmpeg` is available or be ready to handle the `"ffmpeg_not_found"` error in the streamed `FrontData`. \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index e672609..d4cb040 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "whisperlivekit" -version = "0.2.14.post4" +version = "0.2.15" description = "Real-time speech-to-text with speaker diarization using Whisper" readme = "README.md" authors = [ diff --git a/whisperlivekit/local_agreement/backends.py b/whisperlivekit/local_agreement/backends.py index 360df0e..a4c67f9 100644 --- a/whisperlivekit/local_agreement/backends.py +++ b/whisperlivekit/local_agreement/backends.py @@ -224,7 +224,8 @@ class MLXWhisper(ASRBase): if segment.get("no_speech_prob", 0) > 0.9: continue for word in segment.get("words", []): - token = ASRToken(word["start"], word["end"], word["word"], probability=word["probability"]) + probability=word["probability"] + token = ASRToken(word["start"], word["end"], word["word"]) tokens.append(token) return tokens diff --git a/whisperlivekit/local_agreement/online_asr.py b/whisperlivekit/local_agreement/online_asr.py index 40a4551..26403cd 100644 --- a/whisperlivekit/local_agreement/online_asr.py +++ b/whisperlivekit/local_agreement/online_asr.py @@ -411,11 +411,11 @@ class OnlineASRProcessor: ) -> Transcript: sep = sep if sep is not None else self.asr.sep text = sep.join(token.text for token in tokens) - probability = sum(token.probability for token in tokens if token.probability) / len(tokens) if tokens else None + # probability = sum(token.probability for token in tokens if token.probability) / len(tokens) if tokens else None if tokens: start = offset + tokens[0].start end = offset + tokens[-1].end else: start = None end = None - return 
Transcript(start, end, text, probability=probability) + return Transcript(start, end, text) diff --git a/whisperlivekit/simul_whisper/simul_whisper.py b/whisperlivekit/simul_whisper/simul_whisper.py index 0a7d5e3..61d93f3 100644 --- a/whisperlivekit/simul_whisper/simul_whisper.py +++ b/whisperlivekit/simul_whisper/simul_whisper.py @@ -266,7 +266,7 @@ class AlignAtt: logger.debug("Refreshing segment:") self.init_tokens() self.last_attend_frame = -self.cfg.rewind_threshold - self.detected_language = None + # self.detected_language = None self.cumulative_time_offset = 0.0 self.init_context() logger.debug(f"Context: {self.context}") diff --git a/whisperlivekit/timed_objects.py b/whisperlivekit/timed_objects.py index d75bf50..dc2a729 100644 --- a/whisperlivekit/timed_objects.py +++ b/whisperlivekit/timed_objects.py @@ -19,8 +19,8 @@ class TimedText(Timed): speaker: Optional[int] = -1 detected_language: Optional[str] = None - def is_punctuation(self) -> bool: - return self.text.strip() in PUNCTUATION_MARKS + def has_punctuation(self) -> bool: + return any(char in PUNCTUATION_MARKS for char in self.text.strip()) def is_within(self, other: 'TimedText') -> bool: return other.contains_timespan(self) @@ -65,6 +65,7 @@ class Transcript(TimedText): sep: Optional[str] = None, offset: float = 0 ) -> "Transcript": + """Collapse multiple ASR tokens into a single transcript span.""" sep = sep if sep is not None else ' ' text = sep.join(token.text for token in tokens) if tokens: @@ -107,18 +108,19 @@ class Silence(): @dataclass -class Segment(): +class Segment(TimedText): + """Generic contiguous span built from tokens or silence markers.""" start: Optional[float] end: Optional[float] text: Optional[str] speaker: Optional[str] - @classmethod def from_tokens( cls, tokens: List[Union[ASRToken, Silence]], is_silence: bool = False ) -> Optional["Segment"]: + """Return a normalized segment representing the provided tokens.""" if not tokens: return None @@ -129,16 +131,18 @@ class 
Segment(): start=start_token.start, end=end_token.end, text=None, - speaker = -2 + speaker=-2 ) else: return cls( start=start_token.start, end=end_token.end, text=''.join(token.text for token in tokens), - speaker = -1 + speaker=-1, + detected_language=start_token.detected_language ) def is_silence(self) -> bool: + """True when this segment represents a silence gap.""" return self.speaker == -2 @@ -147,6 +151,7 @@ class Line(TimedText): translation: str = '' def to_dict(self) -> Dict[str, Any]: + """Serialize the line for frontend consumption.""" _dict: Dict[str, Any] = { 'speaker': int(self.speaker) if self.speaker != -1 else 1, 'text': self.text, @@ -160,17 +165,21 @@ class Line(TimedText): return _dict def build_from_tokens(self, tokens: List[ASRToken]) -> "Line": + """Populate line attributes from a contiguous token list.""" self.text = ''.join([token.text for token in tokens]) self.start = tokens[0].start self.end = tokens[-1].end self.speaker = 1 + self.detected_language = tokens[0].detected_language return self def build_from_segment(self, segment: Segment) -> "Line": + """Populate the line fields from a pre-built segment.""" self.text = segment.text self.start = segment.start self.end = segment.end self.speaker = segment.speaker + self.detected_language = segment.detected_language return self def is_silent(self) -> bool: @@ -195,6 +204,7 @@ class FrontData(): remaining_time_diarization: float = 0. 
def to_dict(self) -> Dict[str, Any]: + """Serialize the front-end data payload.""" _dict: Dict[str, Any] = { 'status': self.status, 'lines': [line.to_dict() for line in self.lines if (line.text or line.speaker == -2)], diff --git a/whisperlivekit/tokens_alignment.py b/whisperlivekit/tokens_alignment.py index 3b694a5..dd72913 100644 --- a/whisperlivekit/tokens_alignment.py +++ b/whisperlivekit/tokens_alignment.py @@ -26,6 +26,7 @@ class TokensAlignment: self.beg_loop: Optional[float] = None def update(self) -> None: + """Drain state buffers into the running alignment context.""" self.new_tokens, self.state.new_tokens = self.state.new_tokens, [] self.new_diarization, self.state.new_diarization = self.state.new_diarization, [] self.new_translation, self.state.new_translation = self.state.new_translation, [] @@ -37,6 +38,7 @@ class TokensAlignment: self.new_translation_buffer = self.state.new_translation_buffer def add_translation(self, line: Line) -> None: + """Append translated text segments that overlap with a line.""" for ts in self.all_translation_segments: if ts.is_within(line): line.translation += ts.text + (self.sep if ts.text else '') @@ -45,6 +47,7 @@ class TokensAlignment: def compute_punctuations_segments(self, tokens: Optional[List[ASRToken]] = None) -> List[Segment]: + """Group tokens into segments split by punctuation and explicit silence.""" segments = [] segment_start_idx = 0 for i, token in enumerate(self.all_tokens): @@ -61,7 +64,7 @@ class TokensAlignment: segments.append(segment) segment_start_idx = i+1 else: - if token.is_punctuation(): + if token.has_punctuation(): segment = Segment.from_tokens( tokens=self.all_tokens[segment_start_idx: i+1], ) @@ -77,6 +80,7 @@ class TokensAlignment: def concatenate_diar_segments(self) -> List[SpeakerSegment]: + """Merge consecutive diarization slices that share the same speaker.""" if not self.all_diarization_segments: return [] merged = [self.all_diarization_segments[0]] @@ -90,15 +94,14 @@ class 
TokensAlignment: @staticmethod def intersection_duration(seg1: TimedText, seg2: TimedText) -> float: + """Return the overlap duration between two timed segments.""" start = max(seg1.start, seg2.start) end = min(seg1.end, seg2.end) return max(0, end - start) def get_lines_diarization(self) -> Tuple[List[Line], str]: - """ - use compute_punctuations_segments, concatenate_diar_segments, intersection_duration - """ + """Build lines when diarization is enabled and track overflow buffer.""" diarization_buffer = '' punctuation_segments = self.compute_punctuations_segments() diarization_segments = self.concatenate_diar_segments() @@ -136,9 +139,7 @@ class TokensAlignment: translation: bool = False, current_silence: Optional[Silence] = None ) -> Tuple[List[Line], str, Union[str, TimedText]]: - """ - In the case without diarization - """ + """Return the formatted lines plus buffers, optionally with diarization/translation.""" if diarization: lines, diarization_buffer = self.get_lines_diarization() else: