diff --git a/demo.png b/demo.png index e903a3a..14989d5 100644 Binary files a/demo.png and b/demo.png differ diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index 9dcd98b..ed87dab 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -66,6 +66,8 @@ class AudioProcessor: self.beg_loop = None #to deal with a potential little lag at the websocket initialization, this is now set in process_audio self.sep = " " # Default separator self.last_response_content = FrontData() + self.last_detected_speaker = None + self.speaker_languages = {} # Models and processing self.asr = models.asr @@ -333,7 +335,7 @@ class AudioProcessor: await diarization_obj.diarize(pcm_array) async with self.lock: - self.tokens = diarization_obj.assign_speakers_to_tokens( + self.tokens, last_segment = diarization_obj.assign_speakers_to_tokens( self.tokens, use_punctuation_split=self.args.punctuation_split ) @@ -341,7 +343,12 @@ class AudioProcessor: self.end_attributed_speaker = max(self.tokens[-1].end, self.end_attributed_speaker) if buffer_diarization: self.buffer_diarization = buffer_diarization - + + # if last_segment is not None and last_segment.speaker != self.last_detected_speaker: + # if not self.speaker_languages.get(last_segment.speaker, None): + # self.last_detected_speaker = last_segment.speaker + # self.online.on_new_speaker(last_segment) + self.diarization_queue.task_done() except Exception as e: @@ -552,20 +559,20 @@ class AudioProcessor: if task and not task.done(): task.cancel() - created_tasks = [t for t in self.all_tasks_for_cleanup if t] - if created_tasks: - await asyncio.gather(*created_tasks, return_exceptions=True) - logger.info("All processing tasks cancelled or finished.") + created_tasks = [t for t in self.all_tasks_for_cleanup if t] + if created_tasks: + await asyncio.gather(*created_tasks, return_exceptions=True) + logger.info("All processing tasks cancelled or finished.") - if not self.is_pcm_input 
and self.ffmpeg_manager: - try: - await self.ffmpeg_manager.stop() - logger.info("FFmpeg manager stopped.") - except Exception as e: - logger.warning(f"Error stopping FFmpeg manager: {e}") - if self.args.diarization and hasattr(self, 'dianization') and hasattr(self.diarization, 'close'): - self.diarization.close() - logger.info("AudioProcessor cleanup complete.") + if not self.is_pcm_input and self.ffmpeg_manager: + try: + await self.ffmpeg_manager.stop() + logger.info("FFmpeg manager stopped.") + except Exception as e: + logger.warning(f"Error stopping FFmpeg manager: {e}") + if self.args.diarization and hasattr(self, 'diarization') and hasattr(self.diarization, 'close'): + self.diarization.close() + logger.info("AudioProcessor cleanup complete.") async def process_audio(self, message): diff --git a/whisperlivekit/diarization/diart_backend.py b/whisperlivekit/diarization/diart_backend.py index 6c578cb..de8b794 100644 --- a/whisperlivekit/diarization/diart_backend.py +++ b/whisperlivekit/diarization/diart_backend.py @@ -242,7 +242,7 @@ class DiartDiarization: token.speaker = extract_number(segment.speaker) + 1 else: tokens = add_speaker_to_tokens(segments, tokens) - return tokens + return tokens, segments[-1] if segments else None def concatenate_speakers(segments): segments_concatenated = [{"speaker": 1, "begin": 0.0, "end": 0.0}] diff --git a/whisperlivekit/diarization/sortformer_backend.py b/whisperlivekit/diarization/sortformer_backend.py index 84652a3..9b7d1a5 100644 --- a/whisperlivekit/diarization/sortformer_backend.py +++ b/whisperlivekit/diarization/sortformer_backend.py @@ -289,13 +289,14 @@ class SortformerDiarizationOnline: Returns: List of tokens with speaker assignments + Last speaker_segment """ with self.segment_lock: segments = self.speaker_segments.copy() if not segments or not tokens: logger.debug("No segments or tokens available for speaker assignment") - return tokens + return tokens, None logger.debug(f"Assigning speakers to {len(tokens)} tokens using
{len(segments)} segments") use_punctuation_split = False @@ -312,7 +313,7 @@ class SortformerDiarizationOnline: # Use punctuation-aware assignment (similar to diart_backend) tokens = self._add_speaker_to_tokens_with_punctuation(segments, tokens) - return tokens + return tokens, segments[-1] def _add_speaker_to_tokens_with_punctuation(self, segments: List[SpeakerSegment], tokens: list) -> list: """ diff --git a/whisperlivekit/timed_objects.py b/whisperlivekit/timed_objects.py index a9df490..c3954ea 100644 --- a/whisperlivekit/timed_objects.py +++ b/whisperlivekit/timed_objects.py @@ -92,16 +92,22 @@ class Silence(): @dataclass class Line(TimedText): translation: str = '' + detected_language: str = None def to_dict(self): - return { + _dict = { 'speaker': int(self.speaker), 'text': self.text, - 'translation': self.translation, 'start': format_time(self.start), 'end': format_time(self.end), } - + if self.translation: + _dict['translation'] = self.translation + if self.detected_language: + _dict['detected_language'] = self.detected_language + return _dict + + @dataclass class FrontData(): status: str = '' diff --git a/whisperlivekit/web/live_transcription.css b/whisperlivekit/web/live_transcription.css index 3cf5007..40d32f0 100644 --- a/whisperlivekit/web/live_transcription.css +++ b/whisperlivekit/web/live_transcription.css @@ -346,7 +346,7 @@ label { .label_diarization { background-color: var(--chip-bg); - border-radius: 8px 8px 8px 8px; + border-radius: 100px; padding: 2px 10px; margin-left: 10px; display: inline-block; @@ -358,7 +358,7 @@ label { .label_transcription { background-color: var(--chip-bg); - border-radius: 8px 8px 8px 8px; + border-radius: 100px; padding: 2px 10px; display: inline-block; white-space: nowrap; @@ -370,16 +370,20 @@ label { .label_translation { background-color: var(--chip-bg); + display: inline-flex; border-radius: 10px; padding: 4px 8px; margin-top: 4px; font-size: 14px; color: var(--text); - display: flex; align-items: flex-start; 
gap: 4px; } +.lag-diarization-value { + margin-left: 10px; +} + .label_translation img { margin-top: 2px; } @@ -391,7 +395,7 @@ label { #timeInfo { color: var(--muted); - margin-left: 10px; + margin-left: 0px; } .textcontent { @@ -514,3 +518,49 @@ label { padding: 10px; } } + +.label_language { + background-color: var(--chip-bg); + margin-bottom: 0px; + margin-top: 5px; + height: 18.5px; + border-radius: 100px; + padding: 2px 8px; + margin-left: 10px; + display: inline-flex; + align-items: center; + gap: 4px; + font-size: 14px; + color: var(--muted); +} + +.label_language img { + width: 12px; + height: 12px; +} + +.silence-icon { + width: 14px; + height: 14px; + vertical-align: text-bottom; +} + +.speaker-icon { + width: 16px; + height: 16px; + vertical-align: text-bottom; +} + +.speaker-badge { + display: inline-flex; + align-items: center; + justify-content: center; + width: 16px; + height: 16px; + margin-left: -5px; + border-radius: 50%; + font-size: 11px; + line-height: 1; + font-weight: 800; + color: var(--muted); +} diff --git a/whisperlivekit/web/live_transcription.js b/whisperlivekit/web/live_transcription.js index a527d85..83fce97 100644 --- a/whisperlivekit/web/live_transcription.js +++ b/whisperlivekit/web/live_transcription.js @@ -306,7 +306,7 @@ function renderLinesWithBuffer( const showTransLag = !isFinalizing && remaining_time_transcription > 0; const showDiaLag = !isFinalizing && !!buffer_diarization && remaining_time_diarization > 0; const signature = JSON.stringify({ - lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, start: it.start, end: it.end })), + lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, start: it.start, end: it.end, detected_language: it.detected_language })), buffer_transcription: buffer_transcription || "", buffer_diarization: buffer_diarization || "", status: current_status, @@ -335,13 +335,20 @@ function renderLinesWithBuffer( let speakerLabel = ""; if (item.speaker === -2) { - 
speakerLabel = `Silence${timeInfo}`; + const silenceIcon = `Silence`; + speakerLabel = `${silenceIcon}${timeInfo}`; } else if (item.speaker == 0 && !isFinalizing) { speakerLabel = `${fmt1( remaining_time_diarization )} second(s) of audio are undergoing diarization`; } else if (item.speaker !== 0) { - speakerLabel = `Speaker ${item.speaker}${timeInfo}`; + const speakerIcon = `Speaker ${item.speaker}`; + const speakerNum = `${item.speaker}`; + speakerLabel = `${speakerIcon}${speakerNum}${timeInfo}`; + + if (item.detected_language) { + speakerLabel += `Detected language${item.detected_language}`; + } } let currentLineText = item.text || ""; diff --git a/whisperlivekit/web/src/language.svg b/whisperlivekit/web/src/language.svg new file mode 100644 index 0000000..1725332 --- /dev/null +++ b/whisperlivekit/web/src/language.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/whisperlivekit/web/src/silence.svg b/whisperlivekit/web/src/silence.svg new file mode 100644 index 0000000..9be58ed --- /dev/null +++ b/whisperlivekit/web/src/silence.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/whisperlivekit/web/src/speaker.svg b/whisperlivekit/web/src/speaker.svg new file mode 100644 index 0000000..241f610 --- /dev/null +++ b/whisperlivekit/web/src/speaker.svg @@ -0,0 +1 @@ + \ No newline at end of file