From 6206fff118b9808ded4b036a1ba8a89ebf420c92 Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Fri, 21 Nov 2025 23:52:00 +0100 Subject: [PATCH] 0.2.15 --- README.md | 2 +- docs/alignement_principles.md | 2 +- docs/technical_integration.md | 43 +++++++++++++++++++ pyproject.toml | 2 +- whisperlivekit/local_agreement/backends.py | 3 +- whisperlivekit/local_agreement/online_asr.py | 4 +- whisperlivekit/simul_whisper/simul_whisper.py | 2 +- whisperlivekit/timed_objects.py | 22 +++++++--- whisperlivekit/tokens_alignment.py | 15 ++++--- 9 files changed, 75 insertions(+), 20 deletions(-) create mode 100644 docs/technical_integration.md diff --git a/README.md b/README.md index f5af1d4..5c62a64 100644 --- a/README.md +++ b/README.md @@ -141,7 +141,7 @@ async def websocket_endpoint(websocket: WebSocket): |-----------|-------------|---------| | `--model` | Whisper model size. List and recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/available_models.md) | `small` | | `--model-path` | Local .pt file/directory **or** Hugging Face repo ID containing the Whisper model. Overrides `--model`. Recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/models_compatible_formats.md) | `None` | -| `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/simul_whisper/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` | +| `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` | | `--target-language` | If sets, translates using [NLLW](https://github.com/QuentinFuxa/NoLanguageLeftWaiting). [200 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/supported_languages.md). 
If you want to translate to english, you can also use `--direct-english-translation`. The STT model will try to directly output the translation. | `None` | | `--diarization` | Enable speaker identification | `False` | | `--backend-policy` | Streaming strategy: `1`/`simulstreaming` uses AlignAtt SimulStreaming, `2`/`localagreement` uses the LocalAgreement policy | `simulstreaming` | diff --git a/docs/alignement_principles.md b/docs/alignement_principles.md index 4ce8fae..f187005 100644 --- a/docs/alignement_principles.md +++ b/docs/alignement_principles.md @@ -4,7 +4,7 @@ - Example 2: The punctuation from STT comes from prediction `t`, but the speaker change from Diarization comes in the prediction `t-1` - Example 3: The punctuation from STT comes from prediction `t-1`, but the speaker change from Diarization comes in the prediction `t` -> `#` Is the split between the `t-1` prediction and t prediction. +> `#` Is the split between the `t-1` prediction and `t` prediction. ## Example 1: diff --git a/docs/technical_integration.md b/docs/technical_integration.md new file mode 100644 index 0000000..c8083d2 --- /dev/null +++ b/docs/technical_integration.md @@ -0,0 +1,43 @@ +# Technical Integration Guide + +This document introduces how to reuse the core components when you do **not** want to ship the bundled frontend, FastAPI server, or even the provided CLI. + +--- + +## 1. 
Runtime Components + +| Layer | File(s) | Purpose | +|-------|---------|---------| +| Transport | `whisperlivekit/basic_server.py`, any ASGI/WebSocket server | Accepts audio over WebSocket (MediaRecorder WebM or raw PCM chunks) and streams JSON updates back | +| Audio processing | `whisperlivekit/audio_processor.py` | Buffers audio, orchestrates transcription, diarization, translation, handles FFmpeg/PCM input | +| Engines | `whisperlivekit/core.py`, `whisperlivekit/simul_whisper/*`, `whisperlivekit/local_agreement/*` | Load models once (SimulStreaming or LocalAgreement), expose `TranscriptionEngine` and helpers | +| Frontends | `whisperlivekit/web/*`, `chrome-extension/*` | Optional UI layers feeding the WebSocket endpoint | + +**Key idea:** The server boundary is just `AudioProcessor.process_audio()` for incoming bytes and the async generator returned by `AudioProcessor.create_tasks()` for outgoing updates (`FrontData`). Everything else is optional. + +--- + +## 2. Running Without the Bundled Frontend + +1. Start the server/engine however you like: + ```bash + wlk --model small --language en --host 0.0.0.0 --port 9000 + # or launch your own app that instantiates TranscriptionEngine(...) + ``` +2. Build your own client (browser, mobile, desktop) that: + - Opens `ws(s)://<host>:<port>/asr` + - Sends either MediaRecorder/Opus WebM blobs **or** raw PCM (`--pcm-input` on the server tells the client to use the AudioWorklet). + - Consumes the JSON payload defined in `docs/API.md`. + +--- + +## 3. Running Without FastAPI + +`whisperlivekit/basic_server.py` is just an example. Any async framework works, as long as you: + +1. Create a global `TranscriptionEngine` (expensive to initialize; reuse it). +2. Instantiate `AudioProcessor(transcription_engine=engine)` for each connection. +3. Call `create_tasks()` to get the async generator, `process_audio()` with incoming bytes, and ensure `cleanup()` runs when the client disconnects. 
+ + +If you prefer to send compressed audio, instantiate `AudioProcessor(pcm_input=False)` and pipe encoded chunks through `FFmpegManager` transparently—just ensure `ffmpeg` is available or be ready to handle the `"ffmpeg_not_found"` error in the streamed `FrontData`. \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index e672609..d4cb040 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "whisperlivekit" -version = "0.2.14.post4" +version = "0.2.15" description = "Real-time speech-to-text with speaker diarization using Whisper" readme = "README.md" authors = [ diff --git a/whisperlivekit/local_agreement/backends.py b/whisperlivekit/local_agreement/backends.py index 360df0e..a4c67f9 100644 --- a/whisperlivekit/local_agreement/backends.py +++ b/whisperlivekit/local_agreement/backends.py @@ -224,7 +224,8 @@ class MLXWhisper(ASRBase): if segment.get("no_speech_prob", 0) > 0.9: continue for word in segment.get("words", []): - token = ASRToken(word["start"], word["end"], word["word"], probability=word["probability"]) + probability=word["probability"] + token = ASRToken(word["start"], word["end"], word["word"]) tokens.append(token) return tokens diff --git a/whisperlivekit/local_agreement/online_asr.py b/whisperlivekit/local_agreement/online_asr.py index 40a4551..26403cd 100644 --- a/whisperlivekit/local_agreement/online_asr.py +++ b/whisperlivekit/local_agreement/online_asr.py @@ -411,11 +411,11 @@ class OnlineASRProcessor: ) -> Transcript: sep = sep if sep is not None else self.asr.sep text = sep.join(token.text for token in tokens) - probability = sum(token.probability for token in tokens if token.probability) / len(tokens) if tokens else None + # probability = sum(token.probability for token in tokens if token.probability) / len(tokens) if tokens else None if tokens: start = offset + tokens[0].start end = offset + tokens[-1].end else: start = None end = None - return 
Transcript(start, end, text, probability=probability) + return Transcript(start, end, text) diff --git a/whisperlivekit/simul_whisper/simul_whisper.py b/whisperlivekit/simul_whisper/simul_whisper.py index 0a7d5e3..61d93f3 100644 --- a/whisperlivekit/simul_whisper/simul_whisper.py +++ b/whisperlivekit/simul_whisper/simul_whisper.py @@ -266,7 +266,7 @@ class AlignAtt: logger.debug("Refreshing segment:") self.init_tokens() self.last_attend_frame = -self.cfg.rewind_threshold - self.detected_language = None + # self.detected_language = None self.cumulative_time_offset = 0.0 self.init_context() logger.debug(f"Context: {self.context}") diff --git a/whisperlivekit/timed_objects.py b/whisperlivekit/timed_objects.py index d75bf50..dc2a729 100644 --- a/whisperlivekit/timed_objects.py +++ b/whisperlivekit/timed_objects.py @@ -19,8 +19,8 @@ class TimedText(Timed): speaker: Optional[int] = -1 detected_language: Optional[str] = None - def is_punctuation(self) -> bool: - return self.text.strip() in PUNCTUATION_MARKS + def has_punctuation(self) -> bool: + return any(char in PUNCTUATION_MARKS for char in self.text.strip()) def is_within(self, other: 'TimedText') -> bool: return other.contains_timespan(self) @@ -65,6 +65,7 @@ class Transcript(TimedText): sep: Optional[str] = None, offset: float = 0 ) -> "Transcript": + """Collapse multiple ASR tokens into a single transcript span.""" sep = sep if sep is not None else ' ' text = sep.join(token.text for token in tokens) if tokens: @@ -107,18 +108,19 @@ class Silence(): @dataclass -class Segment(): +class Segment(TimedText): + """Generic contiguous span built from tokens or silence markers.""" start: Optional[float] end: Optional[float] text: Optional[str] speaker: Optional[str] - @classmethod def from_tokens( cls, tokens: List[Union[ASRToken, Silence]], is_silence: bool = False ) -> Optional["Segment"]: + """Return a normalized segment representing the provided tokens.""" if not tokens: return None @@ -129,16 +131,18 @@ class 
Segment(): start=start_token.start, end=end_token.end, text=None, - speaker = -2 + speaker=-2 ) else: return cls( start=start_token.start, end=end_token.end, text=''.join(token.text for token in tokens), - speaker = -1 + speaker=-1, + detected_language=start_token.detected_language ) def is_silence(self) -> bool: + """True when this segment represents a silence gap.""" return self.speaker == -2 @@ -147,6 +151,7 @@ class Line(TimedText): translation: str = '' def to_dict(self) -> Dict[str, Any]: + """Serialize the line for frontend consumption.""" _dict: Dict[str, Any] = { 'speaker': int(self.speaker) if self.speaker != -1 else 1, 'text': self.text, @@ -160,17 +165,21 @@ class Line(TimedText): return _dict def build_from_tokens(self, tokens: List[ASRToken]) -> "Line": + """Populate line attributes from a contiguous token list.""" self.text = ''.join([token.text for token in tokens]) self.start = tokens[0].start self.end = tokens[-1].end self.speaker = 1 + self.detected_language = tokens[0].detected_language return self def build_from_segment(self, segment: Segment) -> "Line": + """Populate the line fields from a pre-built segment.""" self.text = segment.text self.start = segment.start self.end = segment.end self.speaker = segment.speaker + self.detected_language = segment.detected_language return self def is_silent(self) -> bool: @@ -195,6 +204,7 @@ class FrontData(): remaining_time_diarization: float = 0. 
def to_dict(self) -> Dict[str, Any]: + """Serialize the front-end data payload.""" _dict: Dict[str, Any] = { 'status': self.status, 'lines': [line.to_dict() for line in self.lines if (line.text or line.speaker == -2)], diff --git a/whisperlivekit/tokens_alignment.py b/whisperlivekit/tokens_alignment.py index 3b694a5..dd72913 100644 --- a/whisperlivekit/tokens_alignment.py +++ b/whisperlivekit/tokens_alignment.py @@ -26,6 +26,7 @@ class TokensAlignment: self.beg_loop: Optional[float] = None def update(self) -> None: + """Drain state buffers into the running alignment context.""" self.new_tokens, self.state.new_tokens = self.state.new_tokens, [] self.new_diarization, self.state.new_diarization = self.state.new_diarization, [] self.new_translation, self.state.new_translation = self.state.new_translation, [] @@ -37,6 +38,7 @@ class TokensAlignment: self.new_translation_buffer = self.state.new_translation_buffer def add_translation(self, line: Line) -> None: + """Append translated text segments that overlap with a line.""" for ts in self.all_translation_segments: if ts.is_within(line): line.translation += ts.text + (self.sep if ts.text else '') @@ -45,6 +47,7 @@ class TokensAlignment: def compute_punctuations_segments(self, tokens: Optional[List[ASRToken]] = None) -> List[Segment]: + """Group tokens into segments split by punctuation and explicit silence.""" segments = [] segment_start_idx = 0 for i, token in enumerate(self.all_tokens): @@ -61,7 +64,7 @@ class TokensAlignment: segments.append(segment) segment_start_idx = i+1 else: - if token.is_punctuation(): + if token.has_punctuation(): segment = Segment.from_tokens( tokens=self.all_tokens[segment_start_idx: i+1], ) @@ -77,6 +80,7 @@ class TokensAlignment: def concatenate_diar_segments(self) -> List[SpeakerSegment]: + """Merge consecutive diarization slices that share the same speaker.""" if not self.all_diarization_segments: return [] merged = [self.all_diarization_segments[0]] @@ -90,15 +94,14 @@ class 
TokensAlignment: @staticmethod def intersection_duration(seg1: TimedText, seg2: TimedText) -> float: + """Return the overlap duration between two timed segments.""" start = max(seg1.start, seg2.start) end = min(seg1.end, seg2.end) return max(0, end - start) def get_lines_diarization(self) -> Tuple[List[Line], str]: - """ - use compute_punctuations_segments, concatenate_diar_segments, intersection_duration - """ + """Build lines when diarization is enabled and track overflow buffer.""" diarization_buffer = '' punctuation_segments = self.compute_punctuations_segments() diarization_segments = self.concatenate_diar_segments() @@ -136,9 +139,7 @@ class TokensAlignment: translation: bool = False, current_silence: Optional[Silence] = None ) -> Tuple[List[Line], str, Union[str, TimedText]]: - """ - In the case without diarization - """ + """Return the formatted lines plus buffers, optionally with diarization/translation.""" if diarization: lines, diarization_buffer = self.get_lines_diarization() else: