diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index 8e23ef2..9a39b3b 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -301,7 +301,9 @@ class AudioProcessor: transcription_lag_s = max(0.0, time() - self.beg_loop - self.end_buffer) asr_processing_logs = f"internal_buffer={asr_internal_buffer_duration_s:.2f}s | lag={transcription_lag_s:.2f}s |" if type(item) is Silence: - asr_processing_logs += f" + Silence of = {item.duration:.2f}s | last_end = {self.tokens[-1].end} |" + asr_processing_logs += f" + Silence of = {item.duration:.2f}s" + if self.tokens: + asr_processing_logs += f" | last_end = {self.tokens[-1].end} |" logger.info(asr_processing_logs) if type(item) is Silence: @@ -445,12 +447,15 @@ class AudioProcessor: last_end_diarized = 0 undiarized_text = [] current_time = time() - self.beg_loop if self.beg_loop else None - tokens, buffer_transcription = handle_silences(tokens, buffer_transcription, current_time, self.silence) + tokens, buffer_transcription, buffer_diarization = handle_silences(tokens, buffer_transcription, buffer_diarization, current_time, self.silence) for token in tokens: speaker = token.speaker + if speaker == -1: #Speaker -1 means not attributed by diarization. 
In the frontend, it should appear under 'Speaker 1' + speaker = 1 + # Handle diarization - if self.args.diarization: + if self.args.diarization and not tokens[-1].speaker == -2: if (speaker in [-1, 0]) and token.end >= end_attributed_speaker: undiarized_text.append(token.text) continue diff --git a/whisperlivekit/remove_silences.py b/whisperlivekit/remove_silences.py index 1ea7f6d..5091f00 100644 --- a/whisperlivekit/remove_silences.py +++ b/whisperlivekit/remove_silences.py @@ -77,9 +77,9 @@ def no_token_to_silence(tokens): new_tokens.append(token) return new_tokens -def ends_with_silence(tokens, buffer_transcription, current_time, vac_detected_silence): +def ends_with_silence(tokens, buffer_transcription, buffer_diarization, current_time, vac_detected_silence): if not tokens: - return [] + return [], buffer_transcription, buffer_diarization last_token = tokens[-1] if tokens and ( current_time - last_token.end >= END_SILENCE_DURATION @@ -97,13 +97,14 @@ def ends_with_silence(tokens, buffer_transcription, current_time, vac_detected_s probability=0.95 ) ) - #We validate the buffer has because of the silence - return tokens + buffer_transcription = "" # for whisperstreaming backend, we should probably validate the buffer has because of the silence + buffer_diarization = "" + return tokens, buffer_transcription, buffer_diarization -def handle_silences(tokens, buffer_transcription, current_time, vac_detected_silence): +def handle_silences(tokens, buffer_transcription, buffer_diarization, current_time, vac_detected_silence): tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text tokens = no_token_to_silence(tokens) - tokens = ends_with_silence(tokens, buffer_transcription, current_time, vac_detected_silence) - return tokens, buffer_transcription + tokens, buffer_transcription, buffer_diarization = ends_with_silence(tokens, buffer_transcription, buffer_diarization, current_time, vac_detected_silence) + return 
tokens, buffer_transcription, buffer_diarization \ No newline at end of file diff --git a/whisperlivekit/web/live_transcription.js b/whisperlivekit/web/live_transcription.js index 8661219..f2efb18 100644 --- a/whisperlivekit/web/live_transcription.js +++ b/whisperlivekit/web/live_transcription.js @@ -3,7 +3,7 @@ let isRecording = false; let websocket = null; let recorder = null; -let chunkDuration = 1000; +let chunkDuration = 100; let websocketUrl = "ws://localhost:8000/asr"; let userClosing = false; let wakeLock = null; @@ -269,9 +269,7 @@ function renderLinesWithBuffer( speakerLabel = `${fmt1( remaining_time_diarization )} second(s) of audio are undergoing diarization`; - } else if (item.speaker == -1) { - speakerLabel = `Speaker 1${timeInfo}`; - } else if (item.speaker !== -1 && item.speaker !== 0) { + } else if (item.speaker !== 0) { speakerLabel = `Speaker ${item.speaker}${timeInfo}`; }