From d9a4c8dcb2401ad55561df42e4d5566ff3489ed0 Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Sun, 30 Nov 2025 16:39:27 +0100 Subject: [PATCH] Refactor transcription and diarization handling with token-by-token validation. Introduce segment buffers for ephemeral content and update API to return structured segment data. Enhance silence handling and improve web interface for text transcripts. --- docs/API.md | 299 ++++++++--------- docs/alignement_principles.md | 124 +++---- whisperlivekit/__init__.py | 3 +- whisperlivekit/audio_processor.py | 44 ++- whisperlivekit/basic_server.py | 8 +- whisperlivekit/timed_objects.py | 59 ++-- whisperlivekit/tokens_alignment.py | 264 +++++++++++---- whisperlivekit/web/live_transcription.css | 5 +- whisperlivekit/web/live_transcription.js | 141 ++++---- whisperlivekit/web/text_transcript.html | 377 ++++++++++++++++++++++ whisperlivekit/web/web_interface.py | 31 ++ 11 files changed, 978 insertions(+), 377 deletions(-) create mode 100644 whisperlivekit/web/text_transcript.html diff --git a/docs/API.md b/docs/API.md index 8f312ca..837ccf2 100644 --- a/docs/API.md +++ b/docs/API.md @@ -1,53 +1,22 @@ # WhisperLiveKit WebSocket API Documentation -> !! **Note**: The new API structure described in this document is currently under deployment. -This documentation is intended for devs who want to build custom frontends. - -WLK provides real-time speech transcription, speaker diarization, and translation through a WebSocket API. The server sends incremental updates as audio is processed, allowing clients to display live transcription results with minimal latency. +WLK provides real-time speech transcription, speaker diarization, and translation through a WebSocket API. The server sends updates as audio is processed, allowing clients to display live transcription results with minimal latency. 
--- -## Legacy API (Current) +## Endpoints -### Message Structure - -The current API sends complete state snapshots on each update (several time per second) - -```typescript -{ - "type": str, - "status": str, - "lines": [ - { - "speaker": int, - "text": str, - "start": float, - "end": float, - "translation": str | null, - "detected_language": str - } - ], - "buffer_transcription": str, - "buffer_diarization": str, - "remaining_time_transcription": float, - "remaining_time_diarization": float -} -``` +| Endpoint | Description | +|----------|-------------| +| `/` | Main web interface with visual styling | +| `/text` | Simple text-based interface for easy copy/paste (debug/development) | +| `/asr` | WebSocket endpoint for audio streaming | --- -## New API (Under Development) - -### Philosophy - -Principles: - -- **Incremental Updates**: Only updates and new segments are sent -- **Ephemeral Buffers**: Temporary, unvalidated data displayed in real-time but overwritten on next update, at speaker level - - ## Message Format +### Transcript Update (Server → Client) ```typescript { @@ -58,22 +27,11 @@ Principles: "id": number, "speaker": number, "text": string, - "start_speaker": float, - "start": float, - "end": float, + "start_speaker": string, // HH:MM:SS format + "start": string, // HH:MM:SS format + "end": string, // HH:MM:SS format "language": string | null, "translation": string, - "words": [ - { - "text": string, - "start": float, - "end": float, - "validated": { - "text": boolean, - "speaker": boolean, - } - } - ], "buffer": { "transcription": string, "diarization": string, @@ -94,9 +52,10 @@ Principles: ```json { "type": "config", - "useAudioWorklet": true / false + "useAudioWorklet": true } ``` +- `useAudioWorklet`: If `true`, client should use AudioWorklet for PCM streaming. If `false`, use MediaRecorder for WebM. 
#### Ready to Stop Message (sent after processing complete) ```json @@ -104,6 +63,7 @@ Principles: "type": "ready_to_stop" } ``` +Indicates all audio has been processed and the client can safely close the connection. --- @@ -113,152 +73,179 @@ Principles: | Field | Type | Description | |-------|------|-------------| -| `id` | `number` | Unique identifier for this segment. Used by clients to update specific segments efficiently. | +| `id` | `number` | Unique identifier for this segment. | | `speaker` | `number` | Speaker ID (1, 2, 3...). Special value `-2` indicates silence. | -| `text` | `string` | Validated transcription text for this update. Should be **appended** to the segment's text on the client side. | -| `start_speaker` | `float` | Timestamp (seconds) when this speaker segment began. | -| `start` | `float` | Timestamp (seconds) of the first word in this update. | -| `end` | `float` | Timestamp (seconds) of the last word in this update. | -| `language` | `string \| null` | ISO language code (e.g., "en", "fr"). `null` until language is detected. | -| `translation` | `string` | Validated translation text for this update. Should be **appended** to the segment's translation on the client side. | -| `words` | `Array` | Array of word-level objects with timing and validation information. | -| `buffer` | `Object` | Per-segment temporary buffers, see below | - -### Word Object - -| Field | Type | Description | -|-------|------|-------------| -| `text` | `string` | The word text. | -| `start` | `number` | Start timestamp (seconds) of this word. | -| `end` | `number` | End timestamp (seconds) of this word. | -| `validated.text` | `boolean` | Whether the transcription text has been validated. if false, word is also in buffer: transcription | -| `validated.speaker` | `boolean` | Whether the speaker assignment has been validated. if false, word is also in buffer: diarization | -| `validated.language` | `boolean` | Whether the language detection has been validated. 
if false, word is also in buffer: translation | +| `text` | `string` | Validated transcription text. | +| `start_speaker` | `string` | Timestamp (`H:MM:SS`, e.g. `0:00:03`) when this speaker segment began. | +| `start` | `string` | Timestamp (`H:MM:SS`, e.g. `0:00:03`) of the first word. | +| `end` | `string` | Timestamp (`H:MM:SS`, e.g. `0:00:11`) of the last word. | +| `language` | `string \| null` | ISO language code (e.g., "en", "fr"). `null` until detected. | +| `translation` | `string` | Validated translation text. | +| `buffer` | `Object` | Per-segment temporary buffers (see below). | ### Buffer Object (Per-Segment) -Buffers are **ephemeral**. They should be displayed to the user but not stored permanently in the frontend. Each update may contain a completely different buffer value, and previous buffer is likely to be in the next validated text. +Buffers are **ephemeral**. They should be displayed to the user but are overwritten on each update. Only the **last non-silent segment** contains buffer content. | Field | Type | Description | |-------|------|-------------| -| `transcription` | `string` | Pending transcription text. Displayed immediately but **overwritten** on next update. | -| `diarization` | `string` | Pending diarization text (text waiting for speaker assignment). Displayed immediately but **overwritten** on next update. | -| `translation` | `string` | Pending translation text. Displayed immediately but **overwritten** on next update. | - +| `transcription` | `string` | Text pending validation (waiting for more context). | +| `diarization` | `string` | Text pending speaker assignment (diarization hasn't caught up). | +| `translation` | `string` | Translation pending validation. | ### Metadata Fields | Field | Type | Description | |-------|------|-------------| -| `remaining_time_transcription` | `float` | Seconds of audio waiting for transcription processing. | -| `remaining_time_diarization` | `float` | Seconds of audio waiting for speaker diarization. 
| +| `remaining_time_transcription` | `float` | Seconds of audio waiting for transcription. | +| `remaining_time_diarization` | `float` | Seconds of audio waiting for diarization. | ### Status Values | Status | Description | |--------|-------------| | `active_transcription` | Normal operation, transcription is active. | -| `no_audio_detected` | No audio has been detected yet. | +| `no_audio_detected` | No audio/speech has been detected yet. | --- -## Update Behavior +## Behavior Notes -### Incremental Updates +### Silence Handling -The API sends **only changed or new segments**. Clients should: +- **Short silences (< 2 seconds)** are filtered out and not displayed. +- Only significant pauses appear as silence segments with `speaker: -2`. +- Consecutive same-speaker segments are merged even across short silences. -1. Maintain a local map of segments by ID -2. When receiving an update, merge/update segments by ID -3. Render only the changed segments +### Update Frequency -### Language Detection +- **Active transcription**: ~20 updates/second (every 50ms) +- **During silence** (no pending buffers and no transcription backlog above 0.5s): ~2 updates/second (every 500ms) to reduce bandwidth -When language is detected for a segment: +### Token-by-Token Validation (Diarization Mode) -```jsonc -// Update 1: No language yet -{ - "segments": [ - {"id": 1, "speaker": 1, "text": "May see", "language": null} - ] -} - -// Update 2: Same segment ID, language now detected -{ - "segments": [ - {"id": 1, "speaker": 1, "text": "Merci", "language": "fr"} - ] -} -``` - -**Client behavior**: **Replace** the existing segment with the same ID. - -### Buffer Behavior - -Buffers are **per-segment** to handle multi-speaker scenarios correctly. - -#### Example: Translation with diarization and translation - -```jsonc -// Update 1 +When diarization is enabled, text is validated **token-by-token** as soon as diarization covers each token, rather than waiting for punctuation. 
This provides: +- Faster text validation +- More responsive speaker attribution +- Buffer only contains tokens that diarization hasn't processed yet + +--- + +## Example Messages + +### Normal Transcription + +```json { + "type": "transcript_update", + "status": "active_transcription", "segments": [ { "id": 1, "speaker": 1, - "text": "Hello world, how are", + "text": "Hello, how are you today?", + "start_speaker": "0:00:02", + "start": "0:00:02", + "end": "0:00:05", + "language": "en", + "translation": "", + "buffer": { + "transcription": " I'm doing", + "diarization": "", + "translation": "" + } + } + ], + "metadata": { + "remaining_time_transcription": 0.5, + "remaining_time_diarization": 0 + } +} +``` + +### With Diarization Buffer + +```json +{ + "type": "transcript_update", + "status": "active_transcription", + "segments": [ + { + "id": 1, + "speaker": 1, + "text": "The meeting starts at nine.", + "start_speaker": "0:00:03", + "start": "0:00:03", + "end": "0:00:06", + "language": "en", "translation": "", "buffer": { "transcription": "", - "diarization": " you on", - "translation": "Bonjour le monde" + "diarization": " Let me check my calendar", + "translation": "" } } - ] + ], + "metadata": { + "remaining_time_transcription": 0.3, + "remaining_time_diarization": 2.1 + } } - - -// ==== Frontend ==== -// 1 -// Hello world, how are you on -// Bonjour le monde - - -// Update 2 -{ - "segments": [ - { - "id": 1, - "speaker": 1, - "text": " you on this", - "translation": "Bonjour tout le monde", - "buffer": { - "transcription": "", - "diarization": " beautiful day", - "translation": ",comment" - } - }, - ] -} - - -// ==== Frontend ==== -// 1 -// Hello world, how are you on this beautiful day -// Bonjour tout le monde, comment ``` -### Silence Segments +### Silence Segment -Silence is represented with the speaker id = `-2`: - -```jsonc +```json { "id": 5, "speaker": -2, "text": "", - "start": 10.5, - "end": 12.3 + "start_speaker": "0:00:10", + "start": "0:00:10", + 
"end": "0:00:15", + "language": null, + "translation": "", + "buffer": { + "transcription": "", + "diarization": "", + "translation": "" + } } ``` + +--- + +## Text Transcript Endpoint (`/text`) + +The `/text` endpoint provides a simple, monospace text interface designed for: +- Easy copy/paste of transcripts +- Debugging and development +- Integration testing + +Output uses text markers instead of HTML styling: + +``` +[METADATA transcription_lag=0.5s diarization_lag=1.2s] + +[SPEAKER 1] 0:00:03 - 0:00:11 [LANG: en] +Hello world, how are you doing today?[DIAR_BUFFER] I'm doing fine[/DIAR_BUFFER] + +[SILENCE 0:00:15 - 0:00:18] + +[SPEAKER 2] 0:00:18 - 0:00:22 [LANG: en] +That's great to hear! +[TRANSLATION]C'est super à entendre![/TRANSLATION] +``` + +### Markers + +| Marker | Description | +|--------|-------------| +| `[SPEAKER N]` | Speaker label with ID | +| `[SILENCE start - end]` | Silence segment | +| `[LANG: xx]` | Detected language code | +| `[DIAR_BUFFER]...[/DIAR_BUFFER]` | Text pending speaker assignment | +| `[TRANS_BUFFER]...[/TRANS_BUFFER]` | Text pending validation | +| `[TRANSLATION]...[/TRANSLATION]` | Translation content | +| `[METADATA ...]` | Lag/timing information | + diff --git a/docs/alignement_principles.md b/docs/alignement_principles.md index f187005..77b4106 100644 --- a/docs/alignement_principles.md +++ b/docs/alignement_principles.md @@ -1,13 +1,73 @@ -### Alignment between STT Tokens and Diarization Segments +# Alignment Principles -- Example 1: The punctuation from STT and the speaker change from Diariation come in the prediction `t` -- Example 2: The punctuation from STT comes from prediction `t`, but the speaker change from Diariation come in the prediction `t-1` -- Example 3: The punctuation from STT comes from prediction `t-1`, but the speaker change from Diariation come in the prediction `t` +This document explains how transcription tokens are aligned with diarization (speaker identification) segments. 
-> `#` Is the split between the `t-1` prediction and `t` prediction. +--- +## Token-by-Token Validation + +When diarization is enabled, text is validated **token-by-token** rather than waiting for sentence boundaries. As soon as diarization covers a token's time range, that token is validated and assigned to the appropriate speaker. + +### How It Works + +1. **Transcription produces tokens** with timestamps (start, end) +2. **Diarization produces speaker segments** with timestamps +3. **For each token**: Check if diarization has caught up to that token's time + - If yes → Find speaker with maximum overlap, validate token + - If no → Keep token in "pending" (becomes diarization buffer) + +``` +Timeline: 0s -------- 5s -------- 10s -------- 15s + | | | | +Transcription: [Hello, how are you doing today?] + |_______|___|____|_____|_____|_____| + tok1 tok2 tok3 tok4 tok5 tok6 + +Diarization: [SPEAKER 1 ][SPEAKER 2 ] + |__________________|__________________| + 0s 8s 15s + +At time t when diarization covers up to 8s: +- Tokens 1-4 (0s-7s) → Validated as SPEAKER 1 +- Tokens 5-6 (7s-10s) → In buffer (diarization hasn't caught up) +``` + +--- + +## Silence Handling + +- **Short silences (< 2 seconds)**: Filtered out, not displayed +- **Significant silences (≥ 2 seconds)**: Displayed as silence segments with `speaker: -2` +- **Same speaker across gaps**: Segments are merged even if separated by short silences + +``` +Before filtering: +[SPK1 0:00-0:03] [SILENCE 0:03-0:04] [SPK1 0:04-0:08] + +After filtering (silence < 2s): +[SPK1 0:00-0:08] ← Merged into single segment +``` + +--- + +## Buffer Types + +| Buffer | Contains | Displayed When | +|--------|----------|----------------| +| `transcription` | Text awaiting validation (more context needed) | Always on last segment | +| `diarization` | Text awaiting speaker assignment | When diarization lags behind transcription | +| `translation` | Translation awaiting validation | When translation is enabled | + +--- + +## Legacy: 
Punctuation-Based Splitting + +The previous approach split segments at punctuation marks and aligned with diarization at those boundaries. This is now replaced by token-by-token validation for faster, more responsive results. + +### Historical Examples (for reference) + +Example of punctuation-based alignment: -## Example 1: ```text punctuations_segments : __#_______.__________________!____ diarization_segments: @@ -16,56 +76,6 @@ SPK2 # ___________________ --> ALIGNED SPK1 __#_______. ALIGNED SPK2 # __________________!____ - -t-1 output: -SPK1: __# -SPK2: NO -DIARIZATION BUFFER: NO - -t output: -SPK1: __#__. -SPK2: __________________!____ -DIARIZATION BUFFER: No ``` -## Example 2: -```text -punctuations_segments : _____#__.___________ -diarization_segments: -SPK1 ___ # -SPK2 __#______________ ---> -ALIGNED SPK1 _____#__. -ALIGNED SPK2 # ___________ - -t-1 output: -SPK1: ___ # -SPK2: -DIARIZATION BUFFER: __# - -t output: -SPK1: __#__. -SPK2: ___________ -DIARIZATION BUFFER: No -``` - -## Example 3: -```text -punctuations_segments : ___.__#__________ -diarization_segments: -SPK1 ______#__ -SPK2 # ________ ---> -ALIGNED SPK1 ___. # -ALIGNED SPK2 __#__________ - -t-1 output: -SPK1: ___. # -SPK2: -DIARIZATION BUFFER: __# - -t output: -SPK1: # -SPK2: __#___________ -DIARIZATION BUFFER: NO -``` +With token-by-token validation, the alignment happens continuously rather than at punctuation boundaries. 
diff --git a/whisperlivekit/__init__.py b/whisperlivekit/__init__.py index 5e7e884..9373895 100644 --- a/whisperlivekit/__init__.py +++ b/whisperlivekit/__init__.py @@ -1,7 +1,7 @@ from .audio_processor import AudioProcessor from .core import TranscriptionEngine from .parse_args import parse_args -from .web.web_interface import get_inline_ui_html, get_web_interface_html +from .web.web_interface import get_inline_ui_html, get_text_transcript_html, get_web_interface_html __all__ = [ "TranscriptionEngine", @@ -9,5 +9,6 @@ __all__ = [ "parse_args", "get_web_interface_html", "get_inline_ui_html", + "get_text_transcript_html", "download_simulstreaming_backend", ] diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index 1962bab..b1c141c 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -393,6 +393,10 @@ class AudioProcessor: async def results_formatter(self) -> AsyncGenerator[FrontData, None]: """Format processing results for output.""" + # Update intervals + ACTIVE_INTERVAL = 0.05 # 20 updates/sec during active transcription + SILENCE_INTERVAL = 0.5 # 2 updates/sec during silence + while True: try: if self._ffmpeg_error: @@ -402,25 +406,35 @@ class AudioProcessor: continue self.tokens_alignment.update() - lines, buffer_diarization_text, buffer_translation_text = self.tokens_alignment.get_lines( + state = await self.get_current_state() + + # Get transcription buffer text to pass to get_lines + buffer_transcription_text = state.buffer_transcription.text if state.buffer_transcription else '' + + # get_lines now returns segments with per-segment buffers + segments = self.tokens_alignment.get_lines( diarization=self.args.diarization, translation=bool(self.translation), - current_silence=self.current_silence + current_silence=self.current_silence, + buffer_transcription=buffer_transcription_text ) - state = await self.get_current_state() - - buffer_transcription_text = state.buffer_transcription.text if 
state.buffer_transcription else '' response_status = "active_transcription" - if not lines and not buffer_transcription_text and not buffer_diarization_text: + # Check if there's any content (segments with text or buffers) + has_active_content = any( + seg.buffer and (seg.buffer.transcription or seg.buffer.diarization) + for seg in segments if not seg.is_silence() + ) + has_any_content = any( + seg.text or (seg.buffer and (seg.buffer.transcription or seg.buffer.diarization)) + for seg in segments if not seg.is_silence() + ) + if not segments or not has_any_content: response_status = "no_audio_detected" response = FrontData( status=response_status, - lines=lines, - buffer_transcription=buffer_transcription_text, - buffer_diarization=buffer_diarization_text, - buffer_translation=buffer_translation_text, + segments=segments, remaining_time_transcription=state.remaining_time_transcription, remaining_time_diarization=state.remaining_time_diarization if self.args.diarization else 0 ) @@ -434,7 +448,15 @@ class AudioProcessor: logger.info("Results formatter: All upstream processors are done and in stopping state. Terminating.") return - await asyncio.sleep(0.05) + # Throttle updates during silence: use slower interval when in silence mode + # with no pending buffers (nothing actively being processed) + is_in_silence = self.current_silence is not None + has_pending_work = has_active_content or state.remaining_time_transcription > 0.5 + + if is_in_silence and not has_pending_work: + await asyncio.sleep(SILENCE_INTERVAL) + else: + await asyncio.sleep(ACTIVE_INTERVAL) except Exception as e: logger.warning(f"Exception in results_formatter. 
Traceback: {traceback.format_exc()}") diff --git a/whisperlivekit/basic_server.py b/whisperlivekit/basic_server.py index 1694e0c..5763d37 100644 --- a/whisperlivekit/basic_server.py +++ b/whisperlivekit/basic_server.py @@ -7,7 +7,7 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import HTMLResponse from whisperlivekit import (AudioProcessor, TranscriptionEngine, - get_inline_ui_html, parse_args) + get_inline_ui_html, get_text_transcript_html, parse_args) logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logging.getLogger().setLevel(logging.WARNING) @@ -39,6 +39,12 @@ async def get(): return HTMLResponse(get_inline_ui_html()) +@app.get("/text") +async def get_text(): + """Simple text-based transcript view for easy copy/paste.""" + return HTMLResponse(get_text_transcript_html()) + + async def handle_websocket_results(websocket, results_generator): """Consumes results from the audio processor and sends them via WebSocket.""" try: diff --git a/whisperlivekit/timed_objects.py b/whisperlivekit/timed_objects.py index 0004a21..44bd46f 100644 --- a/whisperlivekit/timed_objects.py +++ b/whisperlivekit/timed_objects.py @@ -107,6 +107,21 @@ class Silence(): return True +@dataclass +class SegmentBuffer: + """Per-segment buffer for ephemeral/unvalidated content.""" + transcription: str = '' + diarization: str = '' + translation: str = '' + + def to_dict(self) -> Dict[str, str]: + return { + 'transcription': self.transcription, + 'diarization': self.diarization, + 'translation': self.translation + } + + @dataclass class Segment(TimedText): """Generic contiguous span built from tokens or silence markers.""" @@ -114,14 +129,18 @@ class Segment(TimedText): end: Optional[float] text: Optional[str] speaker: Optional[str] + id: Optional[int] = None + start_speaker: Optional[float] = None tokens: Optional[ASRToken] = None translation: Optional[Translation] = None + buffer: Optional[SegmentBuffer] = None @classmethod 
def from_tokens( cls, tokens: List[Union[ASRToken, Silence]], - is_silence: bool = False + is_silence: bool = False, + segment_id: Optional[int] = None ) -> Optional["Segment"]: """Return a normalized segment representing the provided tokens.""" if not tokens: @@ -134,7 +153,9 @@ class Segment(TimedText): start=start_token.start, end=end_token.end, text=None, - speaker=-2 + speaker=-2, + id=segment_id, + start_speaker=start_token.start ) else: return cls( @@ -142,6 +163,8 @@ class Segment(TimedText): end=end_token.end, text=''.join(token.text for token in tokens), speaker=-1, + id=segment_id, + start_speaker=start_token.start, detected_language=start_token.detected_language ) @@ -150,17 +173,18 @@ class Segment(TimedText): return self.speaker == -2 def to_dict(self) -> Dict[str, Any]: - """Serialize the segment for frontend consumption.""" + """Serialize the segment for frontend consumption (new API format).""" _dict: Dict[str, Any] = { + 'id': self.id if self.id is not None else 0, 'speaker': int(self.speaker) if self.speaker != -1 else 1, - 'text': self.text, + 'text': self.text or '', + 'start_speaker': format_time(self.start_speaker) if self.start_speaker is not None else format_time(self.start), 'start': format_time(self.start), 'end': format_time(self.end), + 'language': self.detected_language, + 'translation': self.translation or '', + 'buffer': self.buffer.to_dict() if self.buffer else SegmentBuffer().to_dict() } - if self.translation: - _dict['translation'] = self.translation - if self.detected_language: - _dict['detected_language'] = self.detected_language return _dict @@ -179,23 +203,20 @@ class SilentSegment(Segment): class FrontData(): status: str = '' error: str = '' - lines: list[Segment] = field(default_factory=list) - buffer_transcription: str = '' - buffer_diarization: str = '' - buffer_translation: str = '' + segments: list[Segment] = field(default_factory=list) remaining_time_transcription: float = 0. remaining_time_diarization: float = 0. 
def to_dict(self) -> Dict[str, Any]: - """Serialize the front-end data payload.""" + """Serialize the front-end data payload (new API format).""" _dict: Dict[str, Any] = { + 'type': 'transcript_update', 'status': self.status, - 'lines': [line.to_dict() for line in self.lines if (line.text or line.speaker == -2)], - 'buffer_transcription': self.buffer_transcription, - 'buffer_diarization': self.buffer_diarization, - 'buffer_translation': self.buffer_translation, - 'remaining_time_transcription': self.remaining_time_transcription, - 'remaining_time_diarization': self.remaining_time_diarization, + 'segments': [seg.to_dict() for seg in self.segments if (seg.text or seg.speaker == -2)], + 'metadata': { + 'remaining_time_transcription': self.remaining_time_transcription, + 'remaining_time_diarization': self.remaining_time_diarization, + } } if self.error: _dict['error'] = self.error diff --git a/whisperlivekit/tokens_alignment.py b/whisperlivekit/tokens_alignment.py index e43b5aa..0b262d5 100644 --- a/whisperlivekit/tokens_alignment.py +++ b/whisperlivekit/tokens_alignment.py @@ -1,12 +1,14 @@ from time import time from typing import Any, List, Optional, Tuple, Union -from whisperlivekit.timed_objects import (ASRToken, Segment, PuncSegment, Silence, +from whisperlivekit.timed_objects import (ASRToken, Segment, SegmentBuffer, PuncSegment, Silence, SilentSegment, SpeakerSegment, TimedText) class TokensAlignment: + # Minimum duration (seconds) for a silence to be displayed + MIN_SILENCE_DISPLAY_DURATION = 2.0 def __init__(self, state: Any, args: Any, sep: Optional[str]) -> None: self.state = state @@ -34,7 +36,14 @@ class TokensAlignment: self.last_punctuation = None self.last_uncompleted_punc_segment: PuncSegment = None self.tokens_after_last_punctuation: PuncSegment = [] - self.all_validated_segments = [] + self.all_validated_segments: List[Segment] = [] + + # For token-by-token validation with diarization + self.pending_tokens: List[ASRToken] = [] + 
self.last_validated_token_end: float = 0.0 + + # Segment ID counter for the new API + self._next_segment_id: int = 1 def update(self) -> None: """Drain state buffers into the running alignment context.""" @@ -139,68 +148,189 @@ class TokensAlignment: return max(0, end - start) + def _get_speaker_for_token(self, token: ASRToken, diarization_segments: List[SpeakerSegment]) -> Optional[int]: + """Get speaker ID for a token based on diarization overlap. Returns None if not covered.""" + if not diarization_segments: + return None + + # Check if token is beyond diarization coverage + if token.start >= diarization_segments[-1].end: + return None + + # Find speaker with max overlap + max_overlap = 0.0 + best_speaker = None + for diar_seg in diarization_segments: + overlap = self.intersection_duration(token, diar_seg) + if overlap > max_overlap: + max_overlap = overlap + best_speaker = diar_seg.speaker + 1 # 1-indexed + + return best_speaker if max_overlap > 0 else None + def get_lines_diarization(self) -> Tuple[List[Segment], str]: - """Build segments when diarization is enabled and track overflow buffer.""" - # diarization_buffer = '' - unvalidated_segments = [] - # punctuation_segments = self.compute_punctuations_segments() - new_punc_segments = self.compute_new_punctuations_segments() + """Build segments with token-by-token validation when diarization covers them.""" diarization_segments = self.concatenate_diar_segments() - - for new_punctuation_segment in new_punc_segments: - if not new_punctuation_segment.is_silence(): - if diarization_segments and new_punctuation_segment.start >= diarization_segments[-1].end: - unvalidated_segments.append(new_punctuation_segment) + + # Add new tokens to pending + self.pending_tokens.extend(self.new_tokens) + + # Process pending tokens - validate those covered by diarization + still_pending = [] + for token in self.pending_tokens: + if token.is_silence(): + # Handle silence tokens + silence_duration = (token.end or 0) - (token.start 
or 0) + if silence_duration >= self.MIN_SILENCE_DISPLAY_DURATION: + # Significant silence - add as separate segment + if self.all_validated_segments and not self.all_validated_segments[-1].is_silence(): + self.all_validated_segments.append(SilentSegment( + start=token.start, + end=token.end + )) + elif self.all_validated_segments and self.all_validated_segments[-1].is_silence(): + # Extend existing silence + self.all_validated_segments[-1].end = token.end + else: + self.all_validated_segments.append(SilentSegment( + start=token.start, + end=token.end + )) + # Short silences are ignored (don't go to pending either) + continue + + speaker = self._get_speaker_for_token(token, diarization_segments) + + if speaker is not None: + # Token is covered by diarization - validate it + if self.all_validated_segments: + last_seg = self.all_validated_segments[-1] + if not last_seg.is_silence() and last_seg.speaker == speaker: + # Same speaker - append to existing segment + last_seg.text += token.text + last_seg.end = token.end + else: + # Different speaker or after silence - new segment + new_seg = Segment( + start=token.start, + end=token.end, + text=token.text, + speaker=speaker, + start_speaker=token.start, + detected_language=token.detected_language + ) + self.all_validated_segments.append(new_seg) else: - max_overlap = 0.0 - max_overlap_speaker = 1 - for diarization_segment in diarization_segments: - intersec = self.intersection_duration(new_punctuation_segment, diarization_segment) - if intersec > max_overlap: - max_overlap = intersec - max_overlap_speaker = diarization_segment.speaker + 1 - new_punctuation_segment.speaker = max_overlap_speaker - - for new_punctuation_segment in new_punc_segments: - if self.all_validated_segments and new_punctuation_segment.speaker == self.all_validated_segments[-1].speaker: - if not new_punctuation_segment.is_silence(): - self.all_validated_segments[-1].text += new_punctuation_segment.text - self.all_validated_segments[-1].end = 
new_punctuation_segment.end + # First segment + new_seg = Segment( + start=token.start, + end=token.end, + text=token.text, + speaker=speaker, + start_speaker=token.start, + detected_language=token.detected_language + ) + self.all_validated_segments.append(new_seg) + + self.last_validated_token_end = token.end else: - self.all_validated_segments.append(new_punctuation_segment) + # Token not yet covered by diarization - keep pending + still_pending.append(token) + + self.pending_tokens = still_pending + + # Build diarization buffer from pending tokens + diarization_buffer = ''.join(t.text for t in self.pending_tokens if not t.is_silence()) + + return self.all_validated_segments, diarization_buffer - last_partial_segment = PuncSegment.from_tokens(self.tokens_after_last_punctuation) - if last_partial_segment: - unvalidated_segments.append(last_partial_segment) - return self.all_validated_segments, ''.join([seg.text for seg in unvalidated_segments]) + def _assign_segment_ids(self, segments: List[Segment]) -> None: + """Assign unique IDs to segments that don't have one yet.""" + for segment in segments: + if segment.id is None: + segment.id = self._next_segment_id + self._next_segment_id += 1 + def _assign_buffers_to_last_segment( + self, + segments: List[Segment], + buffer_transcription: str, + buffer_diarization: str, + buffer_translation: str + ) -> None: + """Assign buffer content to the last non-silent segment.""" + # First, clear ALL buffers (they're ephemeral and shouldn't persist) + for segment in segments: + segment.buffer = SegmentBuffer() + + # Find the last non-silent segment and assign buffers to it + for segment in reversed(segments): + if not segment.is_silence(): + segment.buffer = SegmentBuffer( + transcription=buffer_transcription, + diarization=buffer_diarization, + translation=buffer_translation + ) + break + + def _filter_and_merge_segments(self, segments: List[Segment]) -> List[Segment]: + """Filter parasitic silences and merge consecutive 
same-speaker segments.""" + if not segments: + return segments + + result = [] + for seg in segments: + if seg.is_silence(): + # Filter short silences + duration = (seg.end or 0) - (seg.start or 0) + if duration < self.MIN_SILENCE_DISPLAY_DURATION: + continue + # Merge consecutive silences + if result and result[-1].is_silence(): + result[-1].end = seg.end + continue + else: + # Merge same speaker segments (across filtered silences) + if result and not result[-1].is_silence() and result[-1].speaker == seg.speaker: + result[-1].text += seg.text + result[-1].end = seg.end + continue + + result.append(seg) + + return result def get_lines( self, diarization: bool = False, translation: bool = False, - current_silence: Optional[Silence] = None - ) -> Tuple[List[Segment], str, Union[str, TimedText]]: - """Return the formatted segments plus buffers, optionally with diarization/translation.""" + current_silence: Optional[Silence] = None, + buffer_transcription: str = '' + ) -> List[Segment]: + """Return the formatted segments with per-segment buffers, optionally with diarization/translation.""" + diarization_buffer = '' + if diarization: segments, diarization_buffer = self.get_lines_diarization() else: - diarization_buffer = '' for token in self.new_tokens: if token.is_silence(): - if self.current_line_tokens: - self.validated_segments.append(Segment().from_tokens(self.current_line_tokens)) - self.current_line_tokens = [] - - end_silence = token.end if token.has_ended else time() - self.beg_loop - if self.validated_segments and self.validated_segments[-1].is_silence(): - self.validated_segments[-1].end = end_silence - else: - self.validated_segments.append(SilentSegment( - start=token.start, - end=end_silence - )) + # Check silence duration before adding + silence_duration = (token.end or 0) - (token.start or 0) + if silence_duration >= self.MIN_SILENCE_DISPLAY_DURATION: + if self.current_line_tokens: + 
self.validated_segments.append(Segment().from_tokens(self.current_line_tokens)) + self.current_line_tokens = [] + + end_silence = token.end if token.has_ended else time() - self.beg_loop + if self.validated_segments and self.validated_segments[-1].is_silence(): + self.validated_segments[-1].end = end_silence + else: + self.validated_segments.append(SilentSegment( + start=token.start, + end=end_silence + )) else: self.current_line_tokens.append(token) @@ -208,15 +338,37 @@ class TokensAlignment: if self.current_line_tokens: segments.append(Segment().from_tokens(self.current_line_tokens)) + # Handle current ongoing silence if current_silence: - end_silence = current_silence.end if current_silence.has_ended else time() - self.beg_loop - if segments and segments[-1].is_silence(): - segments[-1] = SilentSegment(start=segments[-1].start, end=end_silence) - else: - segments.append(SilentSegment( - start=current_silence.start, - end=end_silence - )) + silence_duration = (current_silence.end or time() - self.beg_loop) - (current_silence.start or 0) + if silence_duration >= self.MIN_SILENCE_DISPLAY_DURATION: + end_silence = current_silence.end if current_silence.has_ended else time() - self.beg_loop + if segments and segments[-1].is_silence(): + segments[-1] = SilentSegment(start=segments[-1].start, end=end_silence) + else: + segments.append(SilentSegment( + start=current_silence.start, + end=end_silence + )) + if translation: [self.add_translation(segment) for segment in segments if not segment.is_silence()] - return segments, diarization_buffer, self.new_translation_buffer.text + + # Get translation buffer text + translation_buffer = self.new_translation_buffer.text if self.new_translation_buffer else '' + + # Filter parasitic silences and merge same-speaker segments + segments = self._filter_and_merge_segments(segments) + + # Assign unique IDs to all segments + self._assign_segment_ids(segments) + + # Assign buffers to the last active segment + 
self._assign_buffers_to_last_segment( + segments, + buffer_transcription=buffer_transcription, + buffer_diarization=diarization_buffer, + buffer_translation=translation_buffer + ) + + return segments diff --git a/whisperlivekit/web/live_transcription.css b/whisperlivekit/web/live_transcription.css index 1e4867c..b12d475 100644 --- a/whisperlivekit/web/live_transcription.css +++ b/whisperlivekit/web/live_transcription.css @@ -454,8 +454,9 @@ label { gap: 4px; } -.lag-diarization-value { - margin-left: 10px; +.lag-diarization-value, +.lag-transcription-value { + font-weight: 600; } .label_translation img { diff --git a/whisperlivekit/web/live_transcription.js b/whisperlivekit/web/live_transcription.js index c4db1cd..c069547 100644 --- a/whisperlivekit/web/live_transcription.js +++ b/whisperlivekit/web/live_transcription.js @@ -232,11 +232,8 @@ function setupWebSocket() { if (waitingForStop) { statusText.textContent = "Processing finalized or connection closed."; if (lastReceivedData) { - renderLinesWithBuffer( - lastReceivedData.lines || [], - lastReceivedData.buffer_diarization || "", - lastReceivedData.buffer_transcription || "", - lastReceivedData.buffer_translation || "", + renderSegments( + lastReceivedData.segments || [], 0, 0, true @@ -278,11 +275,8 @@ function setupWebSocket() { waitingForStop = false; if (lastReceivedData) { - renderLinesWithBuffer( - lastReceivedData.lines || [], - lastReceivedData.buffer_diarization || "", - lastReceivedData.buffer_transcription || "", - lastReceivedData.buffer_translation || "", + renderSegments( + lastReceivedData.segments || [], 0, 0, true @@ -299,21 +293,20 @@ function setupWebSocket() { lastReceivedData = data; + // New API format: segments with per-segment buffers, metadata wrapper const { - lines = [], - buffer_transcription = "", - buffer_diarization = "", - buffer_translation = "", - remaining_time_transcription = 0, - remaining_time_diarization = 0, + segments = [], + metadata = {}, status = 
"active_transcription", } = data; + + const { + remaining_time_transcription = 0, + remaining_time_diarization = 0, + } = metadata; - renderLinesWithBuffer( - lines, - buffer_diarization, - buffer_transcription, - buffer_translation, + renderSegments( + segments, remaining_time_diarization, remaining_time_transcription, false, @@ -323,11 +316,8 @@ function setupWebSocket() { }); } -function renderLinesWithBuffer( - lines, - buffer_diarization, - buffer_transcription, - buffer_translation, +function renderSegments( + segments, remaining_time_diarization, remaining_time_transcription, isFinalizing = false, @@ -339,33 +329,38 @@ function renderLinesWithBuffer( return; } - const showLoading = !isFinalizing && (lines || []).some((it) => it.speaker == 0); - const showTransLag = !isFinalizing && remaining_time_transcription > 0; - const showDiaLag = !isFinalizing && !!buffer_diarization && remaining_time_diarization > 0; + // Build signature for change detection const signature = JSON.stringify({ - lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, start: it.start, end: it.end, detected_language: it.detected_language })), - buffer_transcription: buffer_transcription || "", - buffer_diarization: buffer_diarization || "", - buffer_translation: buffer_translation, + segments: (segments || []).map((it) => ({ + id: it.id, + speaker: it.speaker, + text: it.text, + start: it.start, + end: it.end, + language: it.language, + buffer: it.buffer || {} + })), status: current_status, - showLoading, - showTransLag, - showDiaLag, isFinalizing: !!isFinalizing, }); + + // Only update lag values if signature unchanged if (lastSignature === signature) { const t = document.querySelector(".lag-transcription-value"); if (t) t.textContent = fmt1(remaining_time_transcription); const d = document.querySelector(".lag-diarization-value"); if (d) d.textContent = fmt1(remaining_time_diarization); - const ld = document.querySelector(".loading-diarization-value"); - if (ld) 
ld.textContent = fmt1(remaining_time_diarization); return; } lastSignature = signature; - const linesHtml = (lines || []) + const segmentsHtml = (segments || []) .map((item, idx) => { + const buffer = item.buffer || {}; + const buffer_transcription = buffer.transcription || ""; + const buffer_diarization = buffer.diarization || ""; + const buffer_translation = buffer.translation || ""; + let timeInfo = ""; if (item.start !== undefined && item.end !== undefined) { timeInfo = ` ${item.start} - ${item.end}`; @@ -373,80 +368,78 @@ function renderLinesWithBuffer( let speakerLabel = ""; if (item.speaker === -2) { + // Silence segment speakerLabel = `${silenceIcon}${timeInfo}`; - } else if (item.speaker == 0 && !isFinalizing) { - speakerLabel = `${fmt1( - remaining_time_diarization - )} second(s) of audio are undergoing diarization`; } else if (item.speaker !== 0) { + // Normal speaker segment const speakerNum = `${item.speaker}`; speakerLabel = `${speakerIcon}${speakerNum}${timeInfo}`; - if (item.detected_language) { - speakerLabel += `${languageIcon}${item.detected_language}`; + if (item.language) { + speakerLabel += `${languageIcon}${item.language}`; } } let currentLineText = item.text || ""; - - if (idx === lines.length - 1) { - if (!isFinalizing && item.speaker !== -2) { - speakerLabel += `Transcription lag ${fmt1( - remaining_time_transcription - )}s`; - - if (buffer_diarization && remaining_time_diarization) { - speakerLabel += `Diarization lag${fmt1( - remaining_time_diarization - )}s`; - } + const isLastSegment = idx === segments.length - 1; + const hasBufferContent = buffer_diarization || buffer_transcription; + + // Show lag indicators on last non-silent segment (without spinners) + if (isLastSegment && item.speaker !== -2 && !isFinalizing) { + if (remaining_time_transcription > 0) { + speakerLabel += `Transcription lag: ${fmt1(remaining_time_transcription)}s`; } + if (buffer_diarization && remaining_time_diarization > 0) { + speakerLabel += `Diarization lag: 
${fmt1(remaining_time_diarization)}s`; + } + } + // Render buffers + if (hasBufferContent && item.speaker !== -2) { if (buffer_diarization) { if (isFinalizing) { - currentLineText += - (currentLineText.length > 0 && buffer_diarization.trim().length > 0 ? " " : "") + buffer_diarization.trim(); + currentLineText += (currentLineText.length > 0 ? " " : "") + buffer_diarization.trim(); } else { currentLineText += `${buffer_diarization}`; } } if (buffer_transcription) { if (isFinalizing) { - currentLineText += - (currentLineText.length > 0 && buffer_transcription.trim().length > 0 ? " " : "") + - buffer_transcription.trim(); + currentLineText += (currentLineText.length > 0 ? " " : "") + buffer_transcription.trim(); } else { currentLineText += `${buffer_transcription}`; } } } + + // Translation let translationContent = ""; if (item.translation) { translationContent += item.translation.trim(); } - if (idx === lines.length - 1 && buffer_translation) { + if (buffer_translation) { const bufferPiece = isFinalizing ? buffer_translation : `${buffer_translation}`; - translationContent += translationContent ? `${bufferPiece}` : bufferPiece; + translationContent += translationContent ? bufferPiece : bufferPiece; } if (translationContent.trim().length > 0) { currentLineText += ` -
-
- ${translationIcon} - ${translationContent} -
-
`; +
+ ${translationIcon} + ${translationContent} +
`; } - return currentLineText.trim().length > 0 || speakerLabel.length > 0 - ? `

${speakerLabel}

${currentLineText}

` - : `

${speakerLabel}

`; + if (currentLineText.trim().length > 0 || speakerLabel.length > 0) { + return `

${speakerLabel}

${currentLineText}

`; + } + return speakerLabel ? `

${speakerLabel}

` : ""; }) + .filter(html => html.length > 0) .join(""); - linesTranscriptDiv.innerHTML = linesHtml; + linesTranscriptDiv.innerHTML = segmentsHtml; const transcriptContainer = document.querySelector('.transcript-container'); if (transcriptContainer) { transcriptContainer.scrollTo({ top: transcriptContainer.scrollHeight, behavior: "smooth" }); diff --git a/whisperlivekit/web/text_transcript.html b/whisperlivekit/web/text_transcript.html new file mode 100644 index 0000000..0704d17 --- /dev/null +++ b/whisperlivekit/web/text_transcript.html @@ -0,0 +1,377 @@ + + + + + + WhisperLiveKit Transcript + + + + + +
+ + + + diff --git a/whisperlivekit/web/web_interface.py b/whisperlivekit/web/web_interface.py index d8e13bc..74677e4 100644 --- a/whisperlivekit/web/web_interface.py +++ b/whisperlivekit/web/web_interface.py @@ -13,6 +13,37 @@ def get_web_interface_html(): logger.error(f"Error loading web interface HTML: {e}") return "

Error loading interface

" + +def get_text_transcript_html(): + """Loads the simple text-based transcript HTML for easy copy/paste.""" + try: + with resources.files('whisperlivekit.web').joinpath('text_transcript.html').open('r', encoding='utf-8') as f: + html_content = f.read() + + # Inline the worker scripts + with resources.files('whisperlivekit.web').joinpath('pcm_worklet.js').open('r', encoding='utf-8') as f: + worklet_code = f.read() + with resources.files('whisperlivekit.web').joinpath('recorder_worker.js').open('r', encoding='utf-8') as f: + worker_code = f.read() + + html_content = html_content.replace( + "await audioContext.audioWorklet.addModule('/web/pcm_worklet.js');", + 'const workletBlob = new Blob([`' + worklet_code + '`], { type: "application/javascript" });\n' + + 'const workletUrl = URL.createObjectURL(workletBlob);\n' + + 'await audioContext.audioWorklet.addModule(workletUrl);' + ) + html_content = html_content.replace( + "recorderWorker = new Worker('/web/recorder_worker.js');", + 'const workerBlob = new Blob([`' + worker_code + '`], { type: "application/javascript" });\n' + + 'const workerUrl = URL.createObjectURL(workerBlob);\n' + + 'recorderWorker = new Worker(workerUrl);' + ) + + return html_content + except Exception as e: + logger.error(f"Error loading text transcript HTML: {e}") + return "

Error loading text interface

" + def get_inline_ui_html(): """Returns the complete web interface HTML with all assets embedded in a single call.""" try: