add --disable-punctuation-split option

2026-03-07 14:23:18 +00:00 · 2025-09-11 21:03:00 +02:00
parent 967cdfebc8
commit b06866877a
5 changed files with 19 additions and 5 deletions
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ Real-time speech transcription directly to your browser, with a ready-to-use bac
 #### Powered by Leading Research:

 - [SimulStreaming](https://github.com/ufal/SimulStreaming) (SOTA 2025) - Ultra-low latency transcription with AlignAtt policy
+- [NLLB](https://arxiv.org/abs/2207.04672) ([distilled](https://huggingface.co/entai2965/nllb-200-distilled-600M-ctranslate2)) (2024) - Translation to more than 100 languages
 - [WhisperStreaming](https://github.com/ufal/whisper_streaming) (SOTA 2023) - Low latency transcription with LocalAgreement policy
 - [Streaming Sortformer](https://arxiv.org/abs/2507.18446) (SOTA 2025) - Advanced real-time speaker diarization
 - [Diart](https://github.com/juanmc2005/diart) (SOTA 2021) - Real-time speaker diarization
@@ -143,8 +144,9 @@ An important list of parameters can be changed. But what *should* you change?
 - `--task translate`, to translate in english
 - `--host`, `--port`, `--ssl-certfile`, `--ssl-keyfile`, if you set up a server
 - `--diarization`, if you want to use it.
+- [BETA] `--target-language`, to translate using NLLB. [118 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/translation/mapping_languages.py). If you want to translate to English, prefer `--task translate`, since Whisper can do that directly.

-The rest I don't recommend. But below are your options.
+### Full list of parameters:

 | Parameter | Description | Default |
 |-----------|-------------|---------|
@@ -189,6 +191,7 @@ The rest I don't recommend. But below are your options.
 |-----------|-------------|---------|
 | `--diarization` | Enable speaker identification | `False` |
 | `--diarization-backend` |  `diart` or `sortformer` | `sortformer` |
+| `--disable-punctuation-split` |  Disable punctuation-based splits. See #214 | `False` |
 | `--segmentation-model` | Hugging Face model ID for Diart segmentation model. [Available models](https://github.com/juanmc2005/diart/tree/main?tab=readme-ov-file#pre-trained-models) | `pyannote/segmentation-3.0` |
 | `--embedding-model` | Hugging Face model ID for Diart embedding model. [Available models](https://github.com/juanmc2005/diart/tree/main?tab=readme-ov-file#pre-trained-models) | `speechbrain/spkrec-ecapa-voxceleb` |

--- a/whisperlivekit/audio_processor.py
+++ b/whisperlivekit/audio_processor.py
@@ -492,7 +492,7 @@ class AudioProcessor:
                    state,
                    self.silence,
                    current_time = time() - self.beg_loop if self.beg_loop else None,
-                    diarization = self.args.diarization,
+                    args = self.args,
                    debug = self.debug
                )
                # Handle undiarized text
--- a/whisperlivekit/core.py
+++ b/whisperlivekit/core.py
@@ -60,7 +60,8 @@ class TranscriptionEngine:
            "max_context_tokens": None,
            "model_path": './base.pt',
            "diarization_backend": "sortformer",
-            # diart params:
+            # diarization params:
+            "disable_punctuation_split" : False,
            "segmentation_model": "pyannote/segmentation-3.0",
            "embedding_model": "pyannote/embedding",         
        }
--- a/whisperlivekit/parse_args.py
+++ b/whisperlivekit/parse_args.py
@@ -72,6 +72,12 @@ def parse_args():
        help="Disable transcription to only see live diarization results.",
    )
    
+    parser.add_argument(
+        "--disable-punctuation-split",
+        action="store_true",
+        help="Disable the split parameter.",
+    )
+    
    parser.add_argument(
        "--min-chunk-size",
        type=float,
--- a/whisperlivekit/results_formater.py
+++ b/whisperlivekit/results_formater.py
@@ -46,7 +46,9 @@ def append_token_to_last_line(lines, sep, token, debug_info):
        lines[-1].text += sep + token.text + debug_info
        lines[-1].end = token.end

-def format_output(state, silence, current_time, diarization, debug):
+def format_output(state, silence, current_time, args, debug):
+    diarization = args.diarization
+    disable_punctuation_split = args.disable_punctuation_split
    tokens = state["tokens"]
    translated_segments = state["translated_segments"] # Here we will attribute the speakers only based on the timestamps of the segments
    buffer_transcription = state["buffer_transcription"]
@@ -115,7 +117,9 @@ def format_output(state, silence, current_time, diarization, debug):
                append_token_to_last_line(lines, sep, token, debug_info)
                continue
            else: #we create a new speaker, but that's no ideal. We are not sure about the split. We prefer to append to previous line
-                # lines.append(new_line(token, speaker, debug_info = ""))
+                if disable_punctuation_split:
+                    lines.append(new_line(token, speaker, debug_info = ""))
+                    continue
                pass
            
        append_token_to_last_line(lines, sep, token, debug_info)