adapt online for mlx detection

2026-03-07 14:23:18 +00:00 · 2025-11-25 23:52:00 +01:00
parent 59db08e961
commit f1b47178d8
2 changed files with 88 additions and 45 deletions
--- a/whisperlivekit/core.py
+++ b/whisperlivekit/core.py
@@ -176,12 +176,10 @@ class TranscriptionEngine:


 def online_factory(args, asr):
-    if args.backend_policy == "simulstreaming":    
+    if args.backend_policy == "simulstreaming":
        from whisperlivekit.simul_whisper import SimulStreamingOnlineProcessor
-        online = SimulStreamingOnlineProcessor(asr)
-    else:
-        online = OnlineASRProcessor(asr)
-    return online
+        return SimulStreamingOnlineProcessor(asr)
+    return OnlineASRProcessor(asr)
  
  
 def online_diarization_factory(args, diarization_backend):
--- a/whisperlivekit/simul_whisper/backend.py
+++ b/whisperlivekit/simul_whisper/backend.py
@@ -24,9 +24,11 @@ logger = logging.getLogger(__name__)

 HAS_MLX_WHISPER = mlx_backend_available(warn_on_missing=True)
 if HAS_MLX_WHISPER:
-    from .mlx_encoder import load_mlx_encoder, mlx_model_mapping
+    from .mlx_encoder import load_mlx_encoder, load_mlx_model, mlx_model_mapping
+    from .mlx import MLXAlignAtt
 else:
    mlx_model_mapping = {}
+    MLXAlignAtt = None
 HAS_FASTER_WHISPER = faster_backend_available(warn_on_missing=not HAS_MLX_WHISPER)
 if HAS_FASTER_WHISPER:
    from faster_whisper import WhisperModel
@@ -36,50 +38,49 @@ else:
 MIN_DURATION_REAL_SILENCE = 5

 class SimulStreamingOnlineProcessor:
+    """Online processor for SimulStreaming ASR."""
    SAMPLING_RATE = 16000

-    def __init__(
-        self,
-        asr,
-        logfile=sys.stderr,
-    ):        
+    def __init__(self, asr, logfile=sys.stderr):
        self.asr = asr
        self.logfile = logfile
        self.end = 0.0
        self.buffer = []
        self.committed: List[ASRToken] = []
-        self.last_result_tokens: List[ASRToken] = []
-        self.load_new_alignatt_instance()
+        self.last_result_tokens: List[ASRToken] = []        
+        self.model = self._create_alignatt()
        
        if asr.tokenizer:
            self.model.tokenizer = asr.tokenizer
+            self.model.state.tokenizer = asr.tokenizer

-    def load_new_alignatt_instance(self):
-        """Initialize AlignAtt decoder using the shared model."""
-        self.model = AlignAtt(
-            cfg=self.asr.cfg,
-            loaded_model=self.asr.shared_model,
-            mlx_encoder=self.asr.mlx_encoder,
-            fw_encoder=self.asr.fw_encoder,
-        )
+    def _create_alignatt(self):
+        """Create the AlignAtt decoder instance based on ASR mode."""
+        if self.asr.use_full_mlx and HAS_MLX_WHISPER:
+            return MLXAlignAtt(cfg=self.asr.cfg, mlx_model=self.asr.mlx_model)
+        else:
+            return AlignAtt(
+                cfg=self.asr.cfg,
+                loaded_model=self.asr.shared_model,
+                mlx_encoder=self.asr.mlx_encoder,
+                fw_encoder=self.asr.fw_encoder,
+            )

    def start_silence(self):
        tokens, processed_upto = self.process_iter(is_last=True)
        return tokens, processed_upto

    def end_silence(self, silence_duration, offset):
-        """
-        Handle silence period.
-        
-        If silence > MIN_DURATION_REAL_SILENCE, do a complete context clear.
-        Otherwise, insert a small silence and shift the last_attend_frame.
-        """
+        """Handle silence period."""
        self.end += silence_duration
        long_silence = silence_duration >= MIN_DURATION_REAL_SILENCE
        if not long_silence:
            gap_len = int(16000 * silence_duration)
            if gap_len > 0:
-                gap_silence = torch.zeros(gap_len)
+                if self.asr.use_full_mlx:
+                    gap_silence = np.zeros(gap_len, dtype=np.float32)
+                else:
+                    gap_silence = torch.zeros(gap_len)
                self.model.insert_audio(gap_silence)
        if long_silence:
            self.model.refresh_segment(complete=True)
@@ -87,11 +88,12 @@ class SimulStreamingOnlineProcessor:

    def insert_audio_chunk(self, audio: np.ndarray, audio_stream_end_time):
        """Append an audio chunk to be processed by SimulStreaming."""
-            
-        # Convert numpy array to torch tensor
-        audio_tensor = torch.from_numpy(audio).float()
-        self.end = audio_stream_end_time  # Aligned with whisperstreaming backend behavior
-        self.model.insert_audio(audio_tensor)
+        self.end = audio_stream_end_time
+        if self.asr.use_full_mlx:
+            self.model.insert_audio(audio)
+        else:
+            audio_tensor = torch.from_numpy(audio).float()
+            self.model.insert_audio(audio_tensor)

    def new_speaker(self, change_speaker: ChangeSpeaker):
        """Handle speaker change event."""
@@ -130,6 +132,10 @@ class SimulStreamingOnlineProcessor:
    def warmup(self, audio, init_prompt=""):
        """Warmup the SimulStreaming model."""
        try:
+            if self.asr.use_full_mlx:
+                # MLX mode: ensure numpy array
+                if hasattr(audio, 'numpy'):
+                    audio = audio.numpy()
            self.model.insert_audio(audio)
            self.model.infer(True)
            self.model.refresh_segment(complete=True)
@@ -139,9 +145,14 @@ class SimulStreamingOnlineProcessor:

    def __del__(self):
        gc.collect()
-        torch.cuda.empty_cache()
+        if not getattr(self.asr, 'use_full_mlx', True) and torch is not None:
+            try:
+                torch.cuda.empty_cache()
+            except Exception:
+                pass

-class SimulStreamingASR():
+
+class SimulStreamingASR:
    """SimulStreaming backend with AlignAtt policy."""
    sep = ""

@@ -158,6 +169,7 @@ class SimulStreamingASR():
        self.fast_encoder = False
        self._resolved_model_path = None
        self.encoder_backend = "whisper"
+        self.use_full_mlx = getattr(self, "use_full_mlx", False)
        preferred_backend = getattr(self, "backend", "auto")
        compatible_whisper_mlx, compatible_faster_whisper = True, True
        
@@ -170,7 +182,7 @@ class SimulStreamingASR():
            compatible_whisper_mlx = model_info.compatible_whisper_mlx
            compatible_faster_whisper = model_info.compatible_faster_whisper
            
-            if not model_info.has_pytorch:
+            if not self.use_full_mlx and not model_info.has_pytorch:
                raise FileNotFoundError(
                    f"No PyTorch checkpoint (.pt/.bin/.safetensors) found under {self.model_path}"
                )            
@@ -190,6 +202,10 @@ class SimulStreamingASR():
        self.fast_encoder = self.encoder_backend in ("mlx-whisper", "faster-whisper")
        if self.encoder_backend == "whisper":
            self.disable_fast_encoder = True
+        
+        if self.encoder_backend == "mlx-whisper" and platform.system() == "Darwin":
+            if not hasattr(self, '_full_mlx_disabled'):
+                self.use_full_mlx = True
                    
        self.cfg = AlignAttConfig(
                tokenizer_is_multilingual= is_multilingual,
@@ -214,20 +230,36 @@ class SimulStreamingASR():
        else:
            self.tokenizer = None

-        self.mlx_encoder, self.fw_encoder = None, None
-        if self.encoder_backend == "mlx-whisper":
-            print('Simulstreaming will use MLX whisper to increase encoding speed.')
+        self.mlx_encoder, self.fw_encoder, self.mlx_model = None, None, None
+        self.shared_model = None
+        
+        if self.use_full_mlx and HAS_MLX_WHISPER:
+            logger.info('MLX Whisper backend used.')
            if self._resolved_model_path is not None:
-                mlx_model = str(self._resolved_model_path)
+                mlx_model_path = str(self._resolved_model_path)
            else:
-                mlx_model = mlx_model_mapping.get(self.model_name)
-            if not mlx_model:
+                mlx_model_path = mlx_model_mapping.get(self.model_name)
+            if not mlx_model_path:
                raise FileNotFoundError(
                    f"MLX Whisper backend requested but no compatible weights found for model '{self.model_name}'."
                )
-            self.mlx_encoder = load_mlx_encoder(path_or_hf_repo=mlx_model)
+            self.mlx_model = load_mlx_model(path_or_hf_repo=mlx_model_path)
+            self._warmup_mlx_model()
+        elif self.encoder_backend == "mlx-whisper":
+            # hybrid mode: mlx encoder + pytorch decoder
+            logger.info('SimulStreaming will use MLX Whisper encoder with PyTorch decoder.')
+            if self._resolved_model_path is not None:
+                mlx_model_path = str(self._resolved_model_path)
+            else:
+                mlx_model_path = mlx_model_mapping.get(self.model_name)
+            if not mlx_model_path:
+                raise FileNotFoundError(
+                    f"MLX Whisper backend requested but no compatible weights found for model '{self.model_name}'."
+                )
+            self.mlx_encoder = load_mlx_encoder(path_or_hf_repo=mlx_model_path)
+            self.shared_model = self.load_model()
        elif self.encoder_backend == "faster-whisper":
-            print('Simulstreaming will use Faster Whisper for the encoder.')
+            print('SimulStreaming will use Faster Whisper for the encoder.')
            if self._resolved_model_path is not None:
                fw_model = str(self._resolved_model_path)
            else:
@@ -237,7 +269,20 @@ class SimulStreamingASR():
                device='auto',
                compute_type='auto',
            )
-        self.shared_model = self.load_model()
+            self.shared_model = self.load_model()
+        else:
+            self.shared_model = self.load_model()
+    
+    def _warmup_mlx_model(self):
+        """Warmup the full MLX model."""
+        warmup_audio = load_file(self.warmup_file)
+        if warmup_audio is not None:
+            temp_model = MLXAlignAtt(
+                cfg=self.cfg,
+                mlx_model=self.mlx_model,
+            )
+            temp_model.warmup(warmup_audio)
+            logger.info("Full MLX model warmed up successfully")


    def _resolve_encoder_backend(self, preferred_backend, compatible_whisper_mlx, compatible_faster_whisper):