diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py
index 8e23ef2..9a39b3b 100644
--- a/whisperlivekit/audio_processor.py
+++ b/whisperlivekit/audio_processor.py
@@ -301,7 +301,9 @@ class AudioProcessor:
transcription_lag_s = max(0.0, time() - self.beg_loop - self.end_buffer)
asr_processing_logs = f"internal_buffer={asr_internal_buffer_duration_s:.2f}s | lag={transcription_lag_s:.2f}s |"
if type(item) is Silence:
- asr_processing_logs += f" + Silence of = {item.duration:.2f}s | last_end = {self.tokens[-1].end} |"
+ asr_processing_logs += f" + Silence of = {item.duration:.2f}s"
+ if self.tokens:
+ asr_processing_logs += f" | last_end = {self.tokens[-1].end} |"
logger.info(asr_processing_logs)
if type(item) is Silence:
@@ -445,12 +447,15 @@ class AudioProcessor:
last_end_diarized = 0
undiarized_text = []
current_time = time() - self.beg_loop if self.beg_loop else None
- tokens, buffer_transcription = handle_silences(tokens, buffer_transcription, current_time, self.silence)
+ tokens, buffer_transcription, buffer_diarization = handle_silences(tokens, buffer_transcription, buffer_diarization, current_time, self.silence)
for token in tokens:
speaker = token.speaker
+ if speaker == -1: # Speaker -1 means not attributed by diarization. In the frontend, it should appear under 'Speaker 1'
+ speaker = 1
+
# Handle diarization
- if self.args.diarization:
+ if self.args.diarization and not tokens[-1].speaker == -2:
if (speaker in [-1, 0]) and token.end >= end_attributed_speaker:
undiarized_text.append(token.text)
continue
diff --git a/whisperlivekit/remove_silences.py b/whisperlivekit/remove_silences.py
index 1ea7f6d..5091f00 100644
--- a/whisperlivekit/remove_silences.py
+++ b/whisperlivekit/remove_silences.py
@@ -77,9 +77,9 @@ def no_token_to_silence(tokens):
new_tokens.append(token)
return new_tokens
-def ends_with_silence(tokens, buffer_transcription, current_time, vac_detected_silence):
+def ends_with_silence(tokens, buffer_transcription, buffer_diarization, current_time, vac_detected_silence):
if not tokens:
- return []
+ return [], buffer_transcription, buffer_diarization
last_token = tokens[-1]
if tokens and (
current_time - last_token.end >= END_SILENCE_DURATION
@@ -97,13 +97,14 @@ def ends_with_silence(tokens, buffer_transcription, current_time, vac_detected_s
probability=0.95
)
)
- #We validate the buffer has because of the silence
- return tokens
+ buffer_transcription = "" # buffer is cleared here; TODO(review): for the whisperstreaming backend it should probably be validated instead, because of the silence
+ buffer_diarization = ""
+ return tokens, buffer_transcription, buffer_diarization
-def handle_silences(tokens, buffer_transcription, current_time, vac_detected_silence):
+def handle_silences(tokens, buffer_transcription, buffer_diarization, current_time, vac_detected_silence): # returns (tokens, buffer_transcription, buffer_diarization)
tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text
tokens = no_token_to_silence(tokens)
- tokens = ends_with_silence(tokens, buffer_transcription, current_time, vac_detected_silence)
- return tokens, buffer_transcription
+ tokens, buffer_transcription, buffer_diarization = ends_with_silence(tokens, buffer_transcription, buffer_diarization, current_time, vac_detected_silence)
+ return tokens, buffer_transcription, buffer_diarization
\ No newline at end of file
diff --git a/whisperlivekit/web/live_transcription.js b/whisperlivekit/web/live_transcription.js
index 8661219..f2efb18 100644
--- a/whisperlivekit/web/live_transcription.js
+++ b/whisperlivekit/web/live_transcription.js
@@ -3,7 +3,7 @@
let isRecording = false;
let websocket = null;
let recorder = null;
-let chunkDuration = 1000;
+let chunkDuration = 100;
let websocketUrl = "ws://localhost:8000/asr";
let userClosing = false;
let wakeLock = null;
@@ -269,9 +269,7 @@ function renderLinesWithBuffer(
speakerLabel = `${fmt1(
remaining_time_diarization
)} second(s) of audio are undergoing diarization`;
- } else if (item.speaker == -1) {
- speakerLabel = `Speaker 1${timeInfo}`;
- } else if (item.speaker !== -1 && item.speaker !== 0) {
+ } else if (item.speaker !== 0) {
speakerLabel = `Speaker ${item.speaker}${timeInfo}`;
}