diff --git a/README.md b/README.md
index 8dffe79..4800f1b 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ Real-time speech transcription directly to your browser, with a ready-to-use bac
 
 #### Powered by Leading Research:
 - [SimulStreaming](https://github.com/ufal/SimulStreaming) (SOTA 2025) - Ultra-low latency transcription with AlignAtt policy
+- [NLLB](https://arxiv.org/abs/2207.04672), ([distilled](https://huggingface.co/entai2965/nllb-200-distilled-600M-ctranslate2)) (2024) - Translation to more than 100 languages.
 - [WhisperStreaming](https://github.com/ufal/whisper_streaming) (SOTA 2023) - Low latency transcription with LocalAgreement policy
 - [Streaming Sortformer](https://arxiv.org/abs/2507.18446) (SOTA 2025) - Advanced real-time speaker diarization
 - [Diart](https://github.com/juanmc2005/diart) (SOTA 2021) - Real-time speaker diarization
@@ -143,8 +144,9 @@ An important list of parameters can be changed. But what *should* you change?
 - `--task translate`, to translate in english
 - `--host`, `--port`, `--ssl-certfile`, `--ssl-keyfile`, if you set up a server
 - `--diarization`, if you want to use it.
+- [BETA] `--target-language`, to translate using NLLB. [118 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/translation/mapping_languages.py). If you want to translate to english, you should rather use `--task translate`, since Whisper can do it directly.
 
-The rest I don't recommend. But below are your options.
+### Full list of parameters :
 
 | Parameter | Description | Default |
 |-----------|-------------|---------|
@@ -189,6 +191,7 @@ The rest I don't recommend. But below are your options.
 |-----------|-------------|---------|
 | `--diarization` | Enable speaker identification | `False` |
 | `--diarization-backend` | `diart` or `sortformer` | `sortformer` |
+| `--disable-punctuation-split` | Disable punctuation based splits. See #214 | `False` |
 | `--segmentation-model` | Hugging Face model ID for Diart segmentation model. [Available models](https://github.com/juanmc2005/diart/tree/main?tab=readme-ov-file#pre-trained-models) | `pyannote/segmentation-3.0` |
 | `--embedding-model` | Hugging Face model ID for Diart embedding model. [Available models](https://github.com/juanmc2005/diart/tree/main?tab=readme-ov-file#pre-trained-models) | `speechbrain/spkrec-ecapa-voxceleb` |
 
diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py
index 4a6a194..4e74743 100644
--- a/whisperlivekit/audio_processor.py
+++ b/whisperlivekit/audio_processor.py
@@ -492,7 +492,7 @@ class AudioProcessor:
                 state,
                 self.silence,
                 current_time = time() - self.beg_loop if self.beg_loop else None,
-                diarization = self.args.diarization,
+                args = self.args,
                 debug = self.debug
             )
             # Handle undiarized text
diff --git a/whisperlivekit/core.py b/whisperlivekit/core.py
index f59e629..8fd2d61 100644
--- a/whisperlivekit/core.py
+++ b/whisperlivekit/core.py
@@ -60,7 +60,8 @@ class TranscriptionEngine:
             "max_context_tokens": None,
             "model_path": './base.pt',
             "diarization_backend": "sortformer",
-            # diart params:
+            # diarization params:
+            "disable_punctuation_split" : False,
             "segmentation_model": "pyannote/segmentation-3.0",
             "embedding_model": "pyannote/embedding",
         }
diff --git a/whisperlivekit/parse_args.py b/whisperlivekit/parse_args.py
index 14a46b8..fe8a2f8 100644
--- a/whisperlivekit/parse_args.py
+++ b/whisperlivekit/parse_args.py
@@ -72,6 +72,12 @@ def parse_args():
         help="Disable transcription to only see live diarization results.",
     )
 
+    parser.add_argument(
+        "--disable-punctuation-split",
+        action="store_true",
+        help="Disable the split parameter.",
+    )
+
     parser.add_argument(
         "--min-chunk-size",
         type=float,
diff --git a/whisperlivekit/results_formater.py b/whisperlivekit/results_formater.py
index fb516be..dee4402 100644
--- a/whisperlivekit/results_formater.py
+++ b/whisperlivekit/results_formater.py
@@ -46,7 +46,9 @@ def append_token_to_last_line(lines, sep, token, debug_info):
     lines[-1].text += sep + token.text + debug_info
     lines[-1].end = token.end
 
-def format_output(state, silence, current_time, diarization, debug):
+def format_output(state, silence, current_time, args, debug):
+    diarization = args.diarization
+    disable_punctuation_split = args.disable_punctuation_split
     tokens = state["tokens"]
     translated_segments = state["translated_segments"] # Here we will attribute the speakers only based on the timestamps of the segments
     buffer_transcription = state["buffer_transcription"]
@@ -115,7 +117,9 @@ def format_output(state, silence, current_time, diarization, debug):
                     append_token_to_last_line(lines, sep, token, debug_info)
                     continue
                 else: #we create a new speaker, but that's no ideal. We are not sure about the split. We prefer to append to previous line
-                    # lines.append(new_line(token, speaker, debug_info = ""))
+                    if disable_punctuation_split:
+                        lines.append(new_line(token, speaker, debug_info = ""))
+                        continue
                     pass
             append_token_to_last_line(lines, sep, token, debug_info)
 