punctuation is now checked in timed_object

This commit is contained in:
Quentin Fuxa
2025-09-22 22:40:39 +02:00
parent 426d70a790
commit e61afdefa3
3 changed files with 7 additions and 5 deletions

View File

@@ -6,11 +6,10 @@ from whisperlivekit.timed_objects import Line, format_time
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
PUNCTUATION_MARKS = {'.', '!', '?', '', '', ''}
CHECK_AROUND = 4
def is_punctuation(token):
if token.text.strip() in PUNCTUATION_MARKS:
if token.is_punctuation():
return True
return False

View File

@@ -2,6 +2,8 @@ from dataclasses import dataclass, field
from typing import Optional, Any
from datetime import timedelta
PUNCTUATION_MARKS = {'.', '!', '?', '', '', ''}
def format_time(seconds: float) -> str:
"""Format seconds as HH:MM:SS."""
return str(timedelta(seconds=int(seconds)))
@@ -16,6 +18,9 @@ class TimedText:
probability: Optional[float] = None
is_dummy: Optional[bool] = False
def is_punctuation(self):
return self.text.strip() in PUNCTUATION_MARKS
def overlaps_with(self, other: 'TimedText') -> bool:
return not (self.end <= other.start or other.end <= self.start)

View File

@@ -12,8 +12,6 @@ logger = logging.getLogger(__name__)
#In diarization case, we may want to translate just one speaker, or at least start the sentences there
PUNCTUATION_MARKS = {'.', '!', '?', '', '', ''}
MIN_SILENCE_DURATION_DEL_BUFFER = 3 #After a silence of x seconds, we consider the model should not use the buffer, even if the previous
# sentence is not finished.
@@ -111,7 +109,7 @@ class OnlineTranslation:
if len(self.buffer) < self.len_processed_buffer + 3: #nothing new to process
return self.validated + [self.translation_remaining]
while i < len(self.buffer):
if self.buffer[i].text in PUNCTUATION_MARKS:
if self.buffer[i].is_punctuation():
translation_sentence = self.translate_tokens(self.buffer[:i+1])
self.validated.append(translation_sentence)
self.buffer = self.buffer[i+1:]