diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py
index 2826f1f..c9ff1f7 100644
--- a/whisperlivekit/audio_processor.py
+++ b/whisperlivekit/audio_processor.py
@@ -8,7 +8,7 @@ from datetime import timedelta
 from whisperlivekit.timed_objects import ASRToken
 from whisperlivekit.core import TranscriptionEngine, online_factory
 from whisperlivekit.ffmpeg_manager import FFmpegManager, FFmpegState
-
+from .remove_silences import handle_silences
 # Set up logging once
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
@@ -51,7 +51,6 @@ class AudioProcessor:
         self.tokens = []
         self.buffer_transcription = ""
         self.buffer_diarization = ""
-        self.full_transcription = ""
         self.end_buffer = 0
         self.end_attributed_speaker = 0
         self.lock = asyncio.Lock()
@@ -95,13 +94,12 @@ class AudioProcessor:
         """Convert PCM buffer in s16le format to normalized NumPy array."""
         return np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0
 
-    async def update_transcription(self, new_tokens, buffer, end_buffer, full_transcription, sep):
+    async def update_transcription(self, new_tokens, buffer, end_buffer, sep):
         """Thread-safe update of transcription with new data."""
         async with self.lock:
             self.tokens.extend(new_tokens)
             self.buffer_transcription = buffer
             self.end_buffer = end_buffer
-            self.full_transcription = full_transcription
             self.sep = sep
 
     async def update_diarization(self, end_attributed_speaker, buffer_diarization=""):
@@ -152,7 +150,6 @@ class AudioProcessor:
         self.tokens = []
         self.buffer_transcription = self.buffer_diarization = ""
         self.end_buffer = self.end_attributed_speaker = 0
-        self.full_transcription = self.last_response_content = ""
         self.beg_loop = time()
 
     async def ffmpeg_stdout_reader(self):
@@ -237,7 +234,6 @@ class AudioProcessor:
 
     async def transcription_processor(self):
         """Process audio chunks for transcription."""
-        self.full_transcription = ""
         self.sep = self.online.asr.sep
 
         cumulative_pcm_duration_stream_time = 0.0
@@ -249,7 +245,7 @@ class AudioProcessor:
                     self.transcription_queue.task_done()
                     break
 
-                if not self.online: # Should not happen if queue is used
+                if not self.online:
                     logger.warning("Transcription processor: self.online not initialized.")
                     self.transcription_queue.task_done()
                     continue
@@ -276,8 +272,6 @@ class AudioProcessor:
 
                 if new_tokens:
                     validated_text = self.sep.join([t.text for t in new_tokens])
-                    self.full_transcription += validated_text
-
                     if buffer_text.startswith(validated_text):
                         buffer_text = buffer_text[len(validated_text):].lstrip()
 
@@ -294,7 +288,7 @@ class AudioProcessor:
                     new_end_buffer = max(candidate_end_times)
 
                 await self.update_transcription(
-                    new_tokens, buffer_text, new_end_buffer, self.full_transcription, self.sep
+                    new_tokens, buffer_text, new_end_buffer, self.sep
                 )
 
                 self.transcription_queue.task_done()
@@ -382,8 +376,8 @@ class AudioProcessor:
 
         lines = []
         last_end_diarized = 0
         undiarized_text = []
-
-        # Process each token
+        current_time = time() - self.beg_loop
+        tokens = handle_silences(tokens, current_time)
         for token in tokens:
             speaker = token.speaker
diff --git a/whisperlivekit/remove_silences.py b/whisperlivekit/remove_silences.py
new file mode 100644
index 0000000..e32f2c7
--- /dev/null
+++ b/whisperlivekit/remove_silences.py
@@ -0,0 +1,110 @@
+from whisperlivekit.timed_objects import ASRToken
+import re
+
+MIN_SILENCE_DURATION = 4  # in seconds
+
+# Non-speech markers that some backends (e.g. simulstreaming) emit as text.
+NON_SPEECH_PATTERNS = [
+    re.compile(r'(?:\s*\[BLANK_AUDIO\]\s*)+'),
+    re.compile(r'(?:\s*\[typing\]\s*)+'),
+]
+
+def blank_to_silence(tokens):
+    """Collapse runs of non-speech marker tokens into silence tokens.
+
+    Tokens whose concatenated text falls inside a non-speech pattern match
+    are merged into a single token with speaker == -2; merged runs shorter
+    than MIN_SILENCE_DURATION are dropped entirely.
+    """
+    full_string = ''.join(t.text for t in tokens)
+    matches = []
+    for pattern in NON_SPEECH_PATTERNS:
+        for m in pattern.finditer(full_string):
+            matches.append({'start': m.start(), 'end': m.end()})
+    if not matches:
+        return tokens
+    # The patterns are scanned independently, so their matches arrive
+    # grouped per pattern: sort so matches[0] is always the next run
+    # in reading order of full_string.
+    matches.sort(key=lambda m: m['start'])
+    cumulated_len = 0
+    silence_token = None
+    cleaned_tokens = []
+    for token in tokens:
+        start = cumulated_len
+        end = cumulated_len + len(token.text)
+        cumulated_len = end
+        if matches and start >= matches[0]['start'] and end <= matches[0]['end']:
+            if silence_token:  # previous token was already silence
+                silence_token.start = min(silence_token.start, token.start)
+                silence_token.end = max(silence_token.end, token.end)
+            else:  # a new silence run starts here
+                silence_token = ASRToken(
+                    start=token.start,
+                    end=token.end,
+                    speaker=-2,
+                    probability=0.95
+                )
+        else:
+            if silence_token:  # a silence run just ended
+                if silence_token.end - silence_token.start >= MIN_SILENCE_DURATION:
+                    cleaned_tokens.append(silence_token)
+                silence_token = None
+                matches.pop(0)
+            # Speech tokens are always kept, even after all matches
+            # have been consumed.
+            cleaned_tokens.append(token)
+    # Flush a silence run that extends to the end of the token list.
+    if silence_token and silence_token.end - silence_token.start >= MIN_SILENCE_DURATION:
+        cleaned_tokens.append(silence_token)
+    return cleaned_tokens
+
+def no_token_to_silence(tokens):
+    """Merge consecutive silence tokens (speaker == -2) and turn timestamp
+    gaps >= MIN_SILENCE_DURATION between tokens into explicit silence tokens."""
+    new_tokens = []
+    for token in tokens:
+        if token.speaker == -2:
+            if new_tokens and new_tokens[-1].speaker == -2:  # merge with previous silence
+                new_tokens[-1].end = token.end
+            else:
+                new_tokens.append(token)
+
+        last_end = new_tokens[-1].end if new_tokens else 0.0
+        if token.start - last_end >= MIN_SILENCE_DURATION:  # important gap with no tokens
+            if new_tokens and new_tokens[-1].speaker == -2:
+                new_tokens[-1].end = token.start
+            else:
+                new_tokens.append(ASRToken(
+                    start=last_end,
+                    end=token.start,
+                    speaker=-2,
+                    probability=0.95
+                ))
+
+        if token.speaker != -2:
+            new_tokens.append(token)
+    return new_tokens
+
+def ends_with_silence(tokens, current_time):
+    """Extend or append a trailing silence token when nothing has been
+    produced for at least MIN_SILENCE_DURATION seconds up to current_time."""
+    if not tokens:
+        return []
+    last_token = tokens[-1]
+    if current_time - last_token.end >= MIN_SILENCE_DURATION:
+        if last_token.speaker == -2:
+            last_token.end = current_time
+        else:
+            tokens.append(ASRToken(
+                start=last_token.end,
+                end=current_time,
+                speaker=-2,
+                probability=0.95
+            ))
+    return tokens
+
+def handle_silences(tokens, current_time):
+    """Post-process ASR tokens: collapse textual non-speech markers, insert
+    silence tokens for gaps, and close with a trailing silence if needed."""
+    tokens = blank_to_silence(tokens)  # simulstreaming tends to emit [BLANK_AUDIO] text
+    tokens = no_token_to_silence(tokens)
+    tokens = ends_with_silence(tokens, current_time)
+    return tokens