add --disable-punctuation-split option

This commit is contained in:
Quentin Fuxa
2025-09-11 21:03:00 +02:00
parent 967cdfebc8
commit b06866877a
5 changed files with 19 additions and 5 deletions

View File

@@ -19,6 +19,7 @@ Real-time speech transcription directly to your browser, with a ready-to-use bac
#### Powered by Leading Research:
- [SimulStreaming](https://github.com/ufal/SimulStreaming) (SOTA 2025) - Ultra-low latency transcription with AlignAtt policy
- [NLLB](https://arxiv.org/abs/2207.04672), ([distilled](https://huggingface.co/entai2965/nllb-200-distilled-600M-ctranslate2)) (2024) - Translation to more than 100 languages.
- [WhisperStreaming](https://github.com/ufal/whisper_streaming) (SOTA 2023) - Low latency transcription with LocalAgreement policy
- [Streaming Sortformer](https://arxiv.org/abs/2507.18446) (SOTA 2025) - Advanced real-time speaker diarization
- [Diart](https://github.com/juanmc2005/diart) (SOTA 2021) - Real-time speaker diarization
@@ -143,8 +144,9 @@ An important list of parameters can be changed. But what *should* you change?
- `--task translate`, to translate in english
- `--host`, `--port`, `--ssl-certfile`, `--ssl-keyfile`, if you set up a server
- `--diarization`, if you want to use it.
- [BETA] `--target-language`, to translate using NLLB. [118 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/translation/mapping_languages.py). If you want to translate to english, you should rather use `--task translate`, since Whisper can do it directly.
The rest I don't recommend. But below are your options.
### Full list of parameters :
| Parameter | Description | Default |
|-----------|-------------|---------|
@@ -189,6 +191,7 @@ The rest I don't recommend. But below are your options.
|-----------|-------------|---------|
| `--diarization` | Enable speaker identification | `False` |
| `--diarization-backend` | `diart` or `sortformer` | `sortformer` |
| `--disable-punctuation-split` | Disable punctuation based splits. See #214 | `False` |
| `--segmentation-model` | Hugging Face model ID for Diart segmentation model. [Available models](https://github.com/juanmc2005/diart/tree/main?tab=readme-ov-file#pre-trained-models) | `pyannote/segmentation-3.0` |
| `--embedding-model` | Hugging Face model ID for Diart embedding model. [Available models](https://github.com/juanmc2005/diart/tree/main?tab=readme-ov-file#pre-trained-models) | `speechbrain/spkrec-ecapa-voxceleb` |

View File

@@ -492,7 +492,7 @@ class AudioProcessor:
state,
self.silence,
current_time = time() - self.beg_loop if self.beg_loop else None,
diarization = self.args.diarization,
args = self.args,
debug = self.debug
)
# Handle undiarized text

View File

@@ -60,7 +60,8 @@ class TranscriptionEngine:
"max_context_tokens": None,
"model_path": './base.pt',
"diarization_backend": "sortformer",
# diart params:
# diarization params:
"disable_punctuation_split" : False,
"segmentation_model": "pyannote/segmentation-3.0",
"embedding_model": "pyannote/embedding",
}

View File

@@ -72,6 +72,12 @@ def parse_args():
help="Disable transcription to only see live diarization results.",
)
parser.add_argument(
"--disable-punctuation-split",
action="store_true",
help="Disable the split parameter.",
)
parser.add_argument(
"--min-chunk-size",
type=float,

View File

@@ -46,7 +46,9 @@ def append_token_to_last_line(lines, sep, token, debug_info):
lines[-1].text += sep + token.text + debug_info
lines[-1].end = token.end
def format_output(state, silence, current_time, diarization, debug):
def format_output(state, silence, current_time, args, debug):
diarization = args.diarization
disable_punctuation_split = args.disable_punctuation_split
tokens = state["tokens"]
translated_segments = state["translated_segments"] # Here we will attribute the speakers only based on the timestamps of the segments
buffer_transcription = state["buffer_transcription"]
@@ -115,7 +117,9 @@ def format_output(state, silence, current_time, diarization, debug):
append_token_to_last_line(lines, sep, token, debug_info)
continue
else: #we create a new speaker, but that's no ideal. We are not sure about the split. We prefer to append to previous line
# lines.append(new_line(token, speaker, debug_info = ""))
if disable_punctuation_split:
lines.append(new_line(token, speaker, debug_info = ""))
continue
pass
append_token_to_last_line(lines, sep, token, debug_info)