From 953697cd861c3231379ab3ae01b8b6866442b08b Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Wed, 3 Sep 2025 20:11:00 +0200 Subject: [PATCH] torch.Tensor to torch.as_tensor --- whisperlivekit/simul_whisper/simul_whisper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/whisperlivekit/simul_whisper/simul_whisper.py b/whisperlivekit/simul_whisper/simul_whisper.py index c1f8c2e..0b8649e 100644 --- a/whisperlivekit/simul_whisper/simul_whisper.py +++ b/whisperlivekit/simul_whisper/simul_whisper.py @@ -399,17 +399,17 @@ class PaddedAlignAttWhisper: mlx_mel_padded = mlx_log_mel_spectrogram(audio=input_segments.detach(), n_mels=self.model.dims.n_mels, padding=N_SAMPLES) mlx_mel = mlx_pad_or_trim(mlx_mel_padded, N_FRAMES, axis=-2) mlx_encoder_feature = self.mlx_encoder.encoder(mlx_mel[None]) - encoder_feature = torch.tensor(np.array(mlx_encoder_feature)) + encoder_feature = torch.as_tensor(mlx_encoder_feature) content_mel_len = int((mlx_mel_padded.shape[0] - mlx_mel.shape[0])/2) - device = 'cpu' + device = encoder_feature.device #'cpu' is apple silicon elif self.fw_encoder: audio_length_seconds = len(input_segments) / 16000 content_mel_len = int(audio_length_seconds * 100)//2 mel_padded_2 = self.fw_feature_extractor(waveform=input_segments.numpy(), padding=N_SAMPLES)[None, :] mel = fw_pad_or_trim(mel_padded_2, N_FRAMES, axis=-1) encoder_feature_ctranslate = self.fw_encoder.encode(mel) - encoder_feature = torch.Tensor(np.array(encoder_feature_ctranslate)) - device = 'cpu' + encoder_feature = torch.as_tensor(encoder_feature_ctranslate) + device = encoder_feature.device else: # mel + padding to 30s mel_padded = log_mel_spectrogram(input_segments, n_mels=self.model.dims.n_mels, padding=N_SAMPLES,