diff --git a/demo.png b/demo.png
index e903a3a..14989d5 100644
Binary files a/demo.png and b/demo.png differ
diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py
index 9dcd98b..ed87dab 100644
--- a/whisperlivekit/audio_processor.py
+++ b/whisperlivekit/audio_processor.py
@@ -66,6 +66,8 @@ class AudioProcessor:
self.beg_loop = None #to deal with a potential little lag at the websocket initialization, this is now set in process_audio
self.sep = " " # Default separator
self.last_response_content = FrontData()
+ self.last_detected_speaker = None
+ self.speaker_languages = {}
# Models and processing
self.asr = models.asr
@@ -333,7 +335,7 @@ class AudioProcessor:
await diarization_obj.diarize(pcm_array)
async with self.lock:
- self.tokens = diarization_obj.assign_speakers_to_tokens(
+ self.tokens, last_segment = diarization_obj.assign_speakers_to_tokens(
self.tokens,
use_punctuation_split=self.args.punctuation_split
)
@@ -341,7 +343,12 @@ class AudioProcessor:
self.end_attributed_speaker = max(self.tokens[-1].end, self.end_attributed_speaker)
if buffer_diarization:
self.buffer_diarization = buffer_diarization
-
+
+ # if last_segment is not None and last_segment.speaker != self.last_detected_speaker:
+ # if not self.speaker_languages.get(last_segment.speaker, None):
+ # self.last_detected_speaker = last_segment.speaker
+ # self.online.on_new_speaker(last_segment)
+
self.diarization_queue.task_done()
except Exception as e:
@@ -552,20 +559,20 @@ class AudioProcessor:
if task and not task.done():
task.cancel()
- created_tasks = [t for t in self.all_tasks_for_cleanup if t]
- if created_tasks:
- await asyncio.gather(*created_tasks, return_exceptions=True)
- logger.info("All processing tasks cancelled or finished.")
+ created_tasks = [t for t in self.all_tasks_for_cleanup if t]
+ if created_tasks:
+ await asyncio.gather(*created_tasks, return_exceptions=True)
+ logger.info("All processing tasks cancelled or finished.")
- if not self.is_pcm_input and self.ffmpeg_manager:
- try:
- await self.ffmpeg_manager.stop()
- logger.info("FFmpeg manager stopped.")
- except Exception as e:
- logger.warning(f"Error stopping FFmpeg manager: {e}")
- if self.args.diarization and hasattr(self, 'dianization') and hasattr(self.diarization, 'close'):
- self.diarization.close()
- logger.info("AudioProcessor cleanup complete.")
+ if not self.is_pcm_input and self.ffmpeg_manager:
+ try:
+ await self.ffmpeg_manager.stop()
+ logger.info("FFmpeg manager stopped.")
+ except Exception as e:
+ logger.warning(f"Error stopping FFmpeg manager: {e}")
+ if self.args.diarization and hasattr(self, 'diarization') and hasattr(self.diarization, 'close'):
+ self.diarization.close()
+ logger.info("AudioProcessor cleanup complete.")
async def process_audio(self, message):
diff --git a/whisperlivekit/diarization/diart_backend.py b/whisperlivekit/diarization/diart_backend.py
index 6c578cb..de8b794 100644
--- a/whisperlivekit/diarization/diart_backend.py
+++ b/whisperlivekit/diarization/diart_backend.py
@@ -242,7 +242,7 @@ class DiartDiarization:
token.speaker = extract_number(segment.speaker) + 1
else:
tokens = add_speaker_to_tokens(segments, tokens)
- return tokens
+ return tokens, segments[-1]
def concatenate_speakers(segments):
segments_concatenated = [{"speaker": 1, "begin": 0.0, "end": 0.0}]
diff --git a/whisperlivekit/diarization/sortformer_backend.py b/whisperlivekit/diarization/sortformer_backend.py
index 84652a3..9b7d1a5 100644
--- a/whisperlivekit/diarization/sortformer_backend.py
+++ b/whisperlivekit/diarization/sortformer_backend.py
@@ -289,13 +289,14 @@ class SortformerDiarizationOnline:
Returns:
List of tokens with speaker assignments
+ Last speaker_segment
"""
with self.segment_lock:
segments = self.speaker_segments.copy()
if not segments or not tokens:
logger.debug("No segments or tokens available for speaker assignment")
- return tokens
+ return tokens, None
logger.debug(f"Assigning speakers to {len(tokens)} tokens using {len(segments)} segments")
use_punctuation_split = False
@@ -312,7 +313,7 @@ class SortformerDiarizationOnline:
# Use punctuation-aware assignment (similar to diart_backend)
tokens = self._add_speaker_to_tokens_with_punctuation(segments, tokens)
- return tokens
+ return tokens, segments[-1]
def _add_speaker_to_tokens_with_punctuation(self, segments: List[SpeakerSegment], tokens: list) -> list:
"""
diff --git a/whisperlivekit/timed_objects.py b/whisperlivekit/timed_objects.py
index a9df490..c3954ea 100644
--- a/whisperlivekit/timed_objects.py
+++ b/whisperlivekit/timed_objects.py
@@ -92,16 +92,22 @@ class Silence():
@dataclass
class Line(TimedText):
translation: str = ''
+ detected_language: str = None
def to_dict(self):
- return {
+ _dict = {
'speaker': int(self.speaker),
'text': self.text,
- 'translation': self.translation,
'start': format_time(self.start),
'end': format_time(self.end),
}
-
+ if self.translation:
+ _dict['translation'] = self.translation
+ if self.detected_language:
+ _dict['detected_language'] = self.detected_language
+ return _dict
+
+
@dataclass
class FrontData():
status: str = ''
diff --git a/whisperlivekit/web/live_transcription.css b/whisperlivekit/web/live_transcription.css
index 3cf5007..40d32f0 100644
--- a/whisperlivekit/web/live_transcription.css
+++ b/whisperlivekit/web/live_transcription.css
@@ -346,7 +346,7 @@ label {
.label_diarization {
background-color: var(--chip-bg);
- border-radius: 8px 8px 8px 8px;
+ border-radius: 100px;
padding: 2px 10px;
margin-left: 10px;
display: inline-block;
@@ -358,7 +358,7 @@ label {
.label_transcription {
background-color: var(--chip-bg);
- border-radius: 8px 8px 8px 8px;
+ border-radius: 100px;
padding: 2px 10px;
display: inline-block;
white-space: nowrap;
@@ -370,16 +370,20 @@ label {
.label_translation {
background-color: var(--chip-bg);
+ display: inline-flex;
border-radius: 10px;
padding: 4px 8px;
margin-top: 4px;
font-size: 14px;
color: var(--text);
- display: flex;
align-items: flex-start;
gap: 4px;
}
+.lag-diarization-value {
+ margin-left: 10px;
+}
+
.label_translation img {
margin-top: 2px;
}
@@ -391,7 +395,7 @@ label {
#timeInfo {
color: var(--muted);
- margin-left: 10px;
+ margin-left: 0px;
}
.textcontent {
@@ -514,3 +518,49 @@ label {
padding: 10px;
}
}
+
+.label_language {
+ background-color: var(--chip-bg);
+ margin-bottom: 0px;
+ margin-top: 5px;
+ height: 18.5px;
+ border-radius: 100px;
+ padding: 2px 8px;
+ margin-left: 10px;
+ display: inline-flex;
+ align-items: center;
+ gap: 4px;
+ font-size: 14px;
+ color: var(--muted);
+}
+
+.label_language img {
+ width: 12px;
+ height: 12px;
+}
+
+.silence-icon {
+ width: 14px;
+ height: 14px;
+ vertical-align: text-bottom;
+}
+
+.speaker-icon {
+ width: 16px;
+ height: 16px;
+ vertical-align: text-bottom;
+}
+
+.speaker-badge {
+ display: inline-flex;
+ align-items: center;
+ justify-content: center;
+ width: 16px;
+ height: 16px;
+ margin-left: -5px;
+ border-radius: 50%;
+ font-size: 11px;
+ line-height: 1;
+ font-weight: 800;
+ color: var(--muted);
+}
diff --git a/whisperlivekit/web/live_transcription.js b/whisperlivekit/web/live_transcription.js
index a527d85..83fce97 100644
--- a/whisperlivekit/web/live_transcription.js
+++ b/whisperlivekit/web/live_transcription.js
@@ -306,7 +306,7 @@ function renderLinesWithBuffer(
const showTransLag = !isFinalizing && remaining_time_transcription > 0;
const showDiaLag = !isFinalizing && !!buffer_diarization && remaining_time_diarization > 0;
const signature = JSON.stringify({
- lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, start: it.start, end: it.end })),
+ lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, start: it.start, end: it.end, detected_language: it.detected_language })),
buffer_transcription: buffer_transcription || "",
buffer_diarization: buffer_diarization || "",
status: current_status,
@@ -335,13 +335,20 @@ function renderLinesWithBuffer(
let speakerLabel = "";
if (item.speaker === -2) {
- speakerLabel = `Silence${timeInfo}`;
+ const silenceIcon = `
`;
+ speakerLabel = `${silenceIcon}${timeInfo}`;
} else if (item.speaker == 0 && !isFinalizing) {
speakerLabel = `${fmt1(
remaining_time_diarization
)} second(s) of audio are undergoing diarization`;
} else if (item.speaker !== 0) {
- speakerLabel = `Speaker ${item.speaker}${timeInfo}`;
+ const speakerIcon = `
`;
+ const speakerNum = `${item.speaker}`;
+ speakerLabel = `${speakerIcon}${speakerNum}${timeInfo}`;
+
+ if (item.detected_language) {
+ speakerLabel += `
${item.detected_language}`;
+ }
}
let currentLineText = item.text || "";
diff --git a/whisperlivekit/web/src/language.svg b/whisperlivekit/web/src/language.svg
new file mode 100644
index 0000000..1725332
--- /dev/null
+++ b/whisperlivekit/web/src/language.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/whisperlivekit/web/src/silence.svg b/whisperlivekit/web/src/silence.svg
new file mode 100644
index 0000000..9be58ed
--- /dev/null
+++ b/whisperlivekit/web/src/silence.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/whisperlivekit/web/src/speaker.svg b/whisperlivekit/web/src/speaker.svg
new file mode 100644
index 0000000..241f610
--- /dev/null
+++ b/whisperlivekit/web/src/speaker.svg
@@ -0,0 +1 @@
+
\ No newline at end of file