diff --git a/whisperlivekit/simul_whisper/simul_whisper.py b/whisperlivekit/simul_whisper/simul_whisper.py index 3b0e7c5..b537510 100644 --- a/whisperlivekit/simul_whisper/simul_whisper.py +++ b/whisperlivekit/simul_whisper/simul_whisper.py @@ -408,8 +408,13 @@ class PaddedAlignAttWhisper: content_mel_len = int(audio_length_seconds * 100)//2 mel_padded_2 = self.fw_feature_extractor(waveform=input_segments.numpy(), padding=N_SAMPLES)[None, :] mel = fw_pad_or_trim(mel_padded_2, N_FRAMES, axis=-1) - encoder_feature_ctranslate = np.array(self.fw_encoder.encode(mel)) - encoder_feature = torch.as_tensor(encoder_feature_ctranslate, device=self.device) + encoder_feature_ctranslate = self.fw_encoder.encode(mel) + if self.device == 'cpu': #it seems that on gpu, passing StorageView to torch.as_tensor fails and wrapping in the array works + encoder_feature_ctranslate = np.array(encoder_feature_ctranslate) + try: + encoder_feature = torch.as_tensor(encoder_feature_ctranslate, device=self.device) + except TypeError: # Normally the cpu condition should prevent having exceptions, but just in case: + encoder_feature = torch.as_tensor(np.array(encoder_feature_ctranslate), device=self.device) else: # mel + padding to 30s mel_padded = log_mel_spectrogram(input_segments, n_mels=self.model.dims.n_mels, padding=N_SAMPLES,