/**
 * Helper function to deep-merge objects.
 *
 * Copies every enumerable property of each source into `target`, left to
 * right, so later sources win. Plain (non-array, non-null) object values are
 * merged recursively into a fresh or existing nested object, which means the
 * sources themselves are never mutated. Arrays, `null` and primitives are
 * assigned as-is.
 *
 * @param {Object} target - Object that receives the properties (mutated).
 * @param {...Object} sources - Objects whose properties are merged in.
 * @returns {Object} The same `target` reference.
 */
const extend = function(target, ...sources) {
  // `typeof null` and `typeof []` are both "object"; neither may be recursed into.
  const isMergeable = (value) =>
    value !== null && typeof value === "object" && !Array.isArray(value);

  for (const src of sources) {
    // `const key` keeps the loop variable local; an undeclared `key` leaks a
    // global in sloppy mode and throws a ReferenceError in strict/module code.
    for (const key in src) {
      // Never let a source rewrite the target's prototype chain.
      if (key === "__proto__") continue;
      const val = src[key];
      target[key] = isMergeable(val)
        ? extend(isMergeable(target[key]) ? target[key] : {}, val)
        : val;
    }
  }
  return target;
};
/**
 * Starts capturing the audio of the current tab and records it with `Recorder`.
 *
 * Registers keyboard-command and runtime-message listeners that stop or cancel
 * the capture, and forwards encoding progress/completion to the result tab
 * opened from "complete.html".
 *
 * @param {number} timeLimit - Recording limit in milliseconds (divided by 1000
 *   before being handed to the recorder, whose limit is in seconds).
 * @param {boolean} muteTab - When falsy, the captured stream is played back
 *   through an `Audio` element.
 * @param {string} format - Encoding passed to `Recorder#setEncoding`
 *   (the file's WORKER_FILE table lists "wav" and "mp3").
 * @param {number} quality - Bit rate, applied only when `format === "mp3"`.
 * @param {boolean} limitRemoved - When truthy, `timeLimit` is ignored and a
 *   fixed 10800 second limit is used instead.
 */
const audioCapture = (timeLimit, muteTab, format, quality, limitRemoved) => {
  chrome.tabCapture.capture({audio: true}, (stream) => { // sets up stream for capture
    let startTabId; //tab when the capture is started
    let timeout; // NOTE(review): declared but never read or written in this block
    let completeTabID; //tab when the capture is stopped
    let audioURL = null; //resulting object when encoding is completed
    chrome.tabs.query({active:true, currentWindow: true}, (tabs) => startTabId = tabs[0].id) //saves start tab
    const liveStream = stream;
    const audioCtx = new AudioContext();
    const source = audioCtx.createMediaStreamSource(stream);
    let mediaRecorder = new Recorder(source); //initiates the recorder based on the current stream
    mediaRecorder.setEncoding(format); //sets encoding based on options
    if(limitRemoved) { //removes time limit
      mediaRecorder.setOptions({timeLimit: 10800}); // 10800 s = 3 hours
    } else {
      mediaRecorder.setOptions({timeLimit: timeLimit/1000}); // ms -> s
    }
    if(format === "mp3") {
      mediaRecorder.setOptions({mp3: {bitRate: quality}});
    }
    mediaRecorder.startRecording();

    // Handlers are named function declarations so the exact same references
    // can be passed to removeListener() in closeStream().
    function onStopCommand(command) { //keypress
      if (command === "stop") {
        stopCapture();
      }
    }
    function onStopClick(request) { //click on popup
      if(request === "stopCapture") {
        stopCapture();
      } else if (request === "cancelCapture") {
        cancelCapture();
      } else if (request.cancelEncodeID) {
        // Only cancel encoding for the tab this capture was started on.
        if(request.cancelEncodeID === startTabId && mediaRecorder) {
          mediaRecorder.cancelEncoding();
        }
      }
    }
    chrome.commands.onCommand.addListener(onStopCommand);
    chrome.runtime.onMessage.addListener(onStopClick);
    // Encoding finished: expose the blob as an object URL and, if the result
    // tab already exists, tell it where to find the audio.
    mediaRecorder.onComplete = (recorder, blob) => {
      audioURL = window.URL.createObjectURL(blob);
      if(completeTabID) {
        chrome.tabs.sendMessage(completeTabID, {type: "encodingComplete", audioURL});
      }
      mediaRecorder = null; // also makes later stop/cancel requests no-ops
    }
    mediaRecorder.onEncodingProgress = (recorder, progress) => {
      if(completeTabID) {
        chrome.tabs.sendMessage(completeTabID, {type: "encodingProgress", progress: progress});
      }
    }

    // Finishes the recording, opens the result tab and releases the stream.
    const stopCapture = function() {
      let endTabId;
      //check to make sure the current tab is the tab being captured
      chrome.tabs.query({active: true, currentWindow: true}, (tabs) => {
        endTabId = tabs[0].id;
        if(mediaRecorder && startTabId === endTabId){
          mediaRecorder.finishRecording();
          chrome.tabs.create({url: "complete.html"}, (tab) => {
            completeTabID = tab.id;
            let completeCallback = () => {
              chrome.tabs.sendMessage(tab.id, {type: "createTab", format: format, audioURL, startID: startTabId});
            }
            // Message is delayed by 500 ms — presumably to let the new tab
            // load its listener; TODO confirm.
            setTimeout(completeCallback, 500);
          });
          closeStream(endTabId);
        }
      })
    }

    // Discards the recording (no result tab) and releases the stream.
    const cancelCapture = function() {
      let endTabId;
      chrome.tabs.query({active: true, currentWindow: true}, (tabs) => {
        endTabId = tabs[0].id;
        if(mediaRecorder && startTabId === endTabId){
          mediaRecorder.cancelRecording();
          closeStream(endTabId);
        }
      })
    }

//removes the audio context and closes recorder to save memory
    const closeStream = function(endTabId) {
      chrome.commands.onCommand.removeListener(onStopCommand);
      chrome.runtime.onMessage.removeListener(onStopClick);
      mediaRecorder.onTimeout = () => {}; // a late timeout must not stop again
      audioCtx.close();
      liveStream.getAudioTracks()[0].stop();
      sessionStorage.removeItem(endTabId); // clears the per-tab "capturing" marker
      chrome.runtime.sendMessage({captureStopped: endTabId});
    }

    // Reaching the recorder's time limit behaves like a manual stop.
    mediaRecorder.onTimeout = stopCapture;

    if(!muteTab) {
      // Play the captured stream back so the user keeps hearing the tab.
      let audio = new Audio();
      audio.srcObject = liveStream;
      audio.play();
    }
  });
}
Date.now()); + chrome.storage.sync.get({ + maxTime: 1200000, + muteTab: false, + format: "mp3", + quality: 192, + limitRemoved: false + }, (options) => { + let time = options.maxTime; + if(time > 1200000) { + time = 1200000 + } + audioCapture(time, options.muteTab, options.format, options.quality, options.limitRemoved); + }); + chrome.runtime.sendMessage({captureStarted: tabs[0].id, startTime: Date.now()}); + } + // } + }); +}; + + +chrome.commands.onCommand.addListener((command) => { + if (command === "start") { + startCapture(); + } +}); \ No newline at end of file diff --git a/chrome-extension/manifest.json b/chrome-extension/manifest.json new file mode 100644 index 0000000..a925ee5 --- /dev/null +++ b/chrome-extension/manifest.json @@ -0,0 +1,17 @@ +{ + "manifest_version": 3, + "name": "WhisperLiveKit Tab Capture", + "version": "1.0", + "description": "Capture and transcribe audio from browser tabs using WhisperLiveKit.", + "action": { + "default_title": "WhisperLiveKit Tab Capture", + "default_popup": "popup.html" + }, + "permissions": ["scripting", "tabCapture", "offscreen", "activeTab", "storage"], + "web_accessible_resources": [ + { + "resources": ["requestPermissions.html", "requestPermissions.js"], + "matches": [""] + } + ] +} diff --git a/chrome-extension/popup.html b/chrome-extension/popup.html new file mode 100644 index 0000000..1677c5d --- /dev/null +++ b/chrome-extension/popup.html @@ -0,0 +1,73 @@ + + + + + + + WhisperLiveKit + + + + +
+ + +
+
+ + +
+ + + +
+
+ + + + + + + + +
+
+ +
+
+ + + + +

+ +
// Handle messages from popup
//
// The listener itself must NOT be `async`: an async function returns a
// Promise, and where Chrome requires the literal `true` to keep the
// sendResponse channel open it closes the channel before the awaited
// start/stop result is ready. The async work therefore runs in an inner
// function and the listener synchronously returns `true`.
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
  console.log("Service worker received message:", message);

  const respond = async () => {
    try {
      switch (message.type) {
        case 'start-capture': {
          const startResult = await startTabCapture(message.tabId, message.websocketUrl);
          sendResponse(startResult);
          break;
        }

        case 'stop-capture': {
          const stopResult = await stopTabCapture();
          sendResponse(stopResult);
          break;
        }

        case 'get-recording-state':
          sendResponse({ isRecording: isRecording });
          break;

        default:
          sendResponse({ success: false, error: 'Unknown message type' });
      }
    } catch (error) {
      // Any failure is reported to the sender instead of leaving it waiting.
      console.error('Error handling message:', error);
      sendResponse({ success: false, error: error.message });
    }
  };

  // respond() handles its own errors, so the returned promise never rejects.
  void respond();

  return true; // Keep message channel open for async response
});
initialize + console.log('Service worker: Waiting for offscreen document to initialize...'); + await new Promise(resolve => setTimeout(resolve, 500)); + console.log('Service worker: Offscreen document initialization delay complete'); + + } catch (offscreenError) { + console.error('Service worker: Failed to create offscreen document:', offscreenError); + return { success: false, error: 'Failed to create offscreen document: ' + offscreenError.message }; + } + } + + // Get media stream ID for the tab + console.log('Service worker: Getting media stream ID for tab:', tabId); + try { + currentStreamId = await chrome.tabCapture.getMediaStreamId({ + targetTabId: tabId + }); + console.log('Service worker: Media stream ID:', currentStreamId); + } catch (tabCaptureError) { + console.error('Service worker: Failed to get media stream ID:', tabCaptureError); + return { success: false, error: 'Failed to get media stream ID: ' + tabCaptureError.message }; + } + + if (!currentStreamId) { + console.log('Service worker: No media stream ID returned'); + return { success: false, error: 'Failed to get media stream ID - no stream returned' }; + } + + // Send message to offscreen document to start capture with retry logic + console.log('Service worker: Sending start message to offscreen document...'); + + let response; + let retryCount = 0; + const maxRetries = 5; + + while (!response && retryCount < maxRetries) { + try { + console.log(`Service worker: Attempt ${retryCount + 1}/${maxRetries} to communicate with offscreen document`); + + // Send message to offscreen document without target property + response = await chrome.runtime.sendMessage({ + type: 'start-recording', + target: 'offscreen', + data: { + streamId: currentStreamId, + websocketUrl: websocketUrl + } + }); + + if (!response) { + console.warn(`Service worker: No response from offscreen document, waiting before retry...`); + await new Promise(resolve => setTimeout(resolve, 200)); + retryCount++; + } else { + 
/**
 * Stops the current tab capture.
 *
 * Asks the offscreen document to stop recording, resets the service worker's
 * recording state and notifies the popup (best effort).
 *
 * @returns {Promise<{success: boolean, error?: string}>} `success: false` with
 *   an `error` message when nothing is being recorded or the stop request fails.
 */
async function stopTabCapture() {
  try {
    if (!isRecording) {
      return { success: false, error: 'Not currently recording' };
    }

    // Send message to offscreen document to stop capture
    await chrome.runtime.sendMessage({
      type: 'stop-recording',
      target: 'offscreen'
    });

    isRecording = false;
    currentStreamId = null;

    // Notify popup of state change. sendMessage returns a Promise that rejects
    // when no receiver exists (popup closed); a synchronous try/catch alone
    // cannot catch that, so the rejection is handled on the promise as well.
    try {
      Promise.resolve(chrome.runtime.sendMessage({
        type: 'recording-state',
        isRecording: false
      })).catch(() => {
        // Popup might be closed, ignore error
      });
    } catch (e) {
      // Popup might be closed, ignore error
    }

    return { success: true };

  } catch (error) {
    console.error('Error stopping tab capture:', error);
    // Reset state even on failure so a new capture can be started.
    isRecording = false;
    currentStreamId = null;
    return { success: false, error: error.message };
  }
}
// Handle messages from offscreen document
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
  // Only messages explicitly addressed to the service worker are handled here.
  if (message?.target !== 'service-worker') {
    return;
  }

  // Best-effort notification to the popup. sendMessage returns a Promise that
  // rejects when no receiver exists (popup closed), so the rejection has to be
  // handled on the promise; a synchronous try/catch alone would miss it.
  const notifyPopup = (payload) => {
    try {
      Promise.resolve(chrome.runtime.sendMessage(payload)).catch(() => {
        // Popup might be closed, ignore error
      });
    } catch (e) {
      // Popup might be closed, ignore error
    }
  };

  switch (message.type) {
    case 'recording-stopped':
      isRecording = false;
      currentStreamId = null;

      // Notify popup
      notifyPopup({
        type: 'recording-state',
        isRecording: false
      });
      break;

    case 'recording-error':
      isRecording = false;
      currentStreamId = null;

      // Notify popup
      notifyPopup({
        type: 'status-update',
        status: 'error',
        message: message.error || 'Recording error occurred'
      });
      break;
  }
});
#000000; + --spinner-border: #8d8d8d5c; + --spinner-top: #b0b0b0; + --silence-bg: #f3f3f3; + --loading-bg: rgba(255, 77, 77, 0.06); + --button-bg: #ffffff; + --button-border: #e9e9e9; + --wave-stroke: #000000; + --label-dia-text: #868686; + --label-trans-text: #111111; +} + +@media (prefers-color-scheme: dark) { + :root:not([data-theme="light"]) { + --bg: #0b0b0b; + --text: #e6e6e6; + --muted: #9aa0a6; + --border: #333333; + --chip-bg: rgba(255, 255, 255, 0.08); + --chip-text: #e6e6e6; + --spinner-border: #555555; + --spinner-top: #dddddd; + --silence-bg: #1a1a1a; + --loading-bg: rgba(255, 77, 77, 0.12); + --button-bg: #111111; + --button-border: #333333; + --wave-stroke: #e6e6e6; + --label-dia-text: #b3b3b3; + --label-trans-text: #ffffff; + } +} + +:root[data-theme="dark"] { + --bg: #0b0b0b; + --text: #e6e6e6; + --muted: #9aa0a6; + --border: #333333; + --chip-bg: rgba(255, 255, 255, 0.08); + --chip-text: #e6e6e6; + --spinner-border: #555555; + --spinner-top: #dddddd; + --silence-bg: #1a1a1a; + --loading-bg: rgba(255, 77, 77, 0.12); + --button-bg: #111111; + --button-border: #333333; + --wave-stroke: #e6e6e6; + --label-dia-text: #b3b3b3; + --label-trans-text: #ffffff; +} + +:root[data-theme="light"] { + --bg: #ffffff; + --text: #111111; + --muted: #666666; + --border: #e5e5e5; + --chip-bg: rgba(0, 0, 0, 0.04); + --chip-text: #000000; + --spinner-border: #8d8d8d5c; + --spinner-top: #b0b0b0; + --silence-bg: #f3f3f3; + --loading-bg: rgba(255, 77, 77, 0.06); + --button-bg: #ffffff; + --button-border: #e9e9e9; + --wave-stroke: #000000; + --label-dia-text: #868686; + --label-trans-text: #111111; +} + +body { + font-family: ui-sans-serif, system-ui, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji'; + margin: 20px; + text-align: center; + background-color: var(--bg); + color: var(--text); +} + +/* Record button */ +#recordButton { + width: 50px; + height: 50px; + border: none; + border-radius: 50%; + background-color: 
var(--button-bg); + cursor: pointer; + transition: all 0.3s ease; + border: 1px solid var(--button-border); + display: flex; + align-items: center; + justify-content: center; + position: relative; +} + +#recordButton.recording { + width: 180px; + border-radius: 40px; + justify-content: flex-start; + padding-left: 20px; +} + +#recordButton:active { + transform: scale(0.95); +} + +.shape-container { + width: 25px; + height: 25px; + display: flex; + align-items: center; + justify-content: center; + flex-shrink: 0; +} + +.shape { + width: 25px; + height: 25px; + background-color: rgb(209, 61, 53); + border-radius: 50%; + transition: all 0.3s ease; +} + +#recordButton:disabled .shape { + background-color: #6e6d6d; +} + +#recordButton.recording .shape { + border-radius: 5px; + width: 25px; + height: 25px; +} + +/* Recording elements */ +.recording-info { + display: none; + align-items: center; + margin-left: 15px; + flex-grow: 1; +} + +#recordButton.recording .recording-info { + display: flex; +} + +.wave-container { + width: 60px; + height: 30px; + position: relative; + display: flex; + align-items: center; + justify-content: center; +} + +#waveCanvas { + width: 100%; + height: 100%; +} + +.timer { + font-size: 14px; + font-weight: 500; + color: var(--text); + margin-left: 10px; +} + +#status { + margin-top: 20px; + font-size: 16px; + color: var(--text); +} + +/* Settings */ +.settings-container { + display: flex; + justify-content: center; + align-items: center; + gap: 15px; + margin-top: 20px; +} + +.settings { + display: flex; + flex-wrap: wrap; + align-items: flex-start; + gap: 12px; +} + +.field { + display: flex; + flex-direction: column; + align-items: flex-start; + gap: 3px; +} + +#chunkSelector, +#websocketInput, +#themeSelector, +#microphoneSelect { + font-size: 16px; + padding: 5px 8px; + border-radius: 8px; + border: 1px solid var(--border); + background-color: var(--button-bg); + color: var(--text); + max-height: 30px; +} + +#microphoneSelect { + width: 
100%; + max-width: 190px; + min-width: 120px; +} + +#chunkSelector:focus, +#websocketInput:focus, +#themeSelector:focus, +#microphoneSelect:focus { + outline: none; + border-color: #007bff; + box-shadow: 0 0 0 3px rgba(0, 123, 255, 0.15); +} + +label { + font-size: 13px; + color: var(--muted); +} + +.ws-default { + font-size: 12px; + color: var(--muted); +} + +/* Segmented pill control for Theme */ +.segmented { + display: inline-flex; + align-items: stretch; + border: 1px solid var(--button-border); + background-color: var(--button-bg); + border-radius: 999px; + overflow: hidden; +} + +.segmented input[type="radio"] { + position: absolute; + opacity: 0; + pointer-events: none; +} + +.theme-selector-container { + display: flex; + align-items: center; + margin-top: 17px; +} + +.segmented label { + display: inline-flex; + align-items: center; + gap: 6px; + padding: 6px 12px; + font-size: 14px; + color: var(--muted); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease, color 0.2s ease; +} + +.segmented label span { + display: none; +} + +.segmented label:hover span { + display: inline; +} + +.segmented label:hover { + background-color: var(--chip-bg); +} + +.segmented img { + width: 16px; + height: 16px; +} + +.segmented input[type="radio"]:checked + label { + background-color: var(--chip-bg); + color: var(--text); +} + +.segmented input[type="radio"]:focus-visible + label, +.segmented input[type="radio"]:focus + label { + outline: 2px solid #007bff; + outline-offset: 2px; + border-radius: 999px; +} + +/* Transcript area */ +#linesTranscript { + margin: 20px auto; + max-width: 700px; + text-align: left; + font-size: 16px; +} + +#linesTranscript p { + margin: 0px 0; +} + +#linesTranscript strong { + color: var(--text); +} + +#speaker { + border: 1px solid var(--border); + border-radius: 100px; + padding: 2px 10px; + font-size: 14px; + margin-bottom: 0px; +} + +.label_diarization { + background-color: var(--chip-bg); + border-radius: 8px 8px 
8px 8px; + padding: 2px 10px; + margin-left: 10px; + display: inline-block; + white-space: nowrap; + font-size: 14px; + margin-bottom: 0px; + color: var(--label-dia-text); +} + +.label_transcription { + background-color: var(--chip-bg); + border-radius: 8px 8px 8px 8px; + padding: 2px 10px; + display: inline-block; + white-space: nowrap; + margin-left: 10px; + font-size: 14px; + margin-bottom: 0px; + color: var(--label-trans-text); +} + +#timeInfo { + color: var(--muted); + margin-left: 10px; +} + +.textcontent { + font-size: 16px; + padding-left: 10px; + margin-bottom: 10px; + margin-top: 1px; + padding-top: 5px; + border-radius: 0px 0px 0px 10px; +} + +.buffer_diarization { + color: var(--label-dia-text); + margin-left: 4px; +} + +.buffer_transcription { + color: #7474748c; + margin-left: 4px; +} + +.spinner { + display: inline-block; + width: 8px; + height: 8px; + border: 2px solid var(--spinner-border); + border-top: 2px solid var(--spinner-top); + border-radius: 50%; + animation: spin 0.7s linear infinite; + vertical-align: middle; + margin-bottom: 2px; + margin-right: 5px; +} + +@keyframes spin { + to { + transform: rotate(360deg); + } +} + +.silence { + color: var(--muted); + background-color: var(--silence-bg); + font-size: 13px; + border-radius: 30px; + padding: 2px 10px; +} + +.loading { + color: var(--muted); + background-color: var(--loading-bg); + border-radius: 8px 8px 8px 0px; + padding: 2px 10px; + font-size: 14px; + margin-bottom: 0px; +} + +/* for smaller screens */ +@media (max-width: 768px) { + .settings-container { + flex-direction: column; + gap: 10px; + } + + .settings { + justify-content: center; + gap: 8px; + } + + .field { + align-items: center; + } + + #websocketInput, + #microphoneSelect { + min-width: 200px; + max-width: 400px; + } + + .theme-selector-container { + margin-top: 10px; + } +} + +@media (max-width: 480px) { + body { + margin: 10px; + } + + .settings { + flex-direction: column; + align-items: center; + gap: 6px; + } + + 
#websocketInput, + #microphoneSelect { + max-width: 400px; + } + + .segmented label { + padding: 4px 8px; + font-size: 12px; + } + + .segmented img { + width: 14px; + height: 14px; + } +} + + +html +{ + width: 400px; /* max: 800px */ + height: 600px; /* max: 600px */ + border-radius: 10px; + +} \ No newline at end of file diff --git a/chrome-extension/web/live_transcription.js b/chrome-extension/web/live_transcription.js new file mode 100644 index 0000000..5d1703c --- /dev/null +++ b/chrome-extension/web/live_transcription.js @@ -0,0 +1,619 @@ +/* Theme, WebSocket, recording, rendering logic extracted from inline script and adapted for segmented theme control and WS caption */ +let isRecording = false; +let websocket = null; +let recorder = null; +let chunkDuration = 100; +let websocketUrl = "ws://localhost:8000/asr"; +let userClosing = false; +let wakeLock = null; +let startTime = null; +let timerInterval = null; +let audioContext = null; +let analyser = null; +let microphone = null; +let waveCanvas = document.getElementById("waveCanvas"); +let waveCtx = waveCanvas.getContext("2d"); +let animationFrame = null; +let waitingForStop = false; +let lastReceivedData = null; +let lastSignature = null; +let availableMicrophones = []; +let selectedMicrophoneId = null; + +waveCanvas.width = 60 * (window.devicePixelRatio || 1); +waveCanvas.height = 30 * (window.devicePixelRatio || 1); +waveCtx.scale(window.devicePixelRatio || 1, window.devicePixelRatio || 1); + +const statusText = document.getElementById("status"); +const recordButton = document.getElementById("recordButton"); +const chunkSelector = document.getElementById("chunkSelector"); +const websocketInput = document.getElementById("websocketInput"); +const websocketDefaultSpan = document.getElementById("wsDefaultUrl"); +const linesTranscriptDiv = document.getElementById("linesTranscript"); +const timerElement = document.querySelector(".timer"); +const themeRadios = document.querySelectorAll('input[name="theme"]'); 
+const microphoneSelect = document.getElementById("microphoneSelect"); + +function getWaveStroke() { + const styles = getComputedStyle(document.documentElement); + const v = styles.getPropertyValue("--wave-stroke").trim(); + return v || "#000"; +} + +let waveStroke = getWaveStroke(); +function updateWaveStroke() { + waveStroke = getWaveStroke(); +} + +function applyTheme(pref) { + if (pref === "light") { + document.documentElement.setAttribute("data-theme", "light"); + } else if (pref === "dark") { + document.documentElement.setAttribute("data-theme", "dark"); + } else { + document.documentElement.removeAttribute("data-theme"); + } + updateWaveStroke(); +} + +// Persisted theme preference +const savedThemePref = localStorage.getItem("themePreference") || "system"; +applyTheme(savedThemePref); +if (themeRadios.length) { + themeRadios.forEach((r) => { + r.checked = r.value === savedThemePref; + r.addEventListener("change", () => { + if (r.checked) { + localStorage.setItem("themePreference", r.value); + applyTheme(r.value); + } + }); + }); +} + +// React to OS theme changes when in "system" mode +const darkMq = window.matchMedia && window.matchMedia("(prefers-color-scheme: dark)"); +const handleOsThemeChange = () => { + const pref = localStorage.getItem("themePreference") || "system"; + if (pref === "system") updateWaveStroke(); +}; +if (darkMq && darkMq.addEventListener) { + darkMq.addEventListener("change", handleOsThemeChange); +} else if (darkMq && darkMq.addListener) { + // deprecated, but included for Safari compatibility + darkMq.addListener(handleOsThemeChange); +} + +async function enumerateMicrophones() { + try { + const micPermission = await navigator.permissions.query({ + name: "microphone", + }); + + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + stream.getTracks().forEach(track => track.stop()); + + const devices = await navigator.mediaDevices.enumerateDevices(); + availableMicrophones = devices.filter(device => device.kind 
=== 'audioinput'); + + populateMicrophoneSelect(); + console.log(`Found ${availableMicrophones.length} microphone(s)`); + } catch (error) { + console.error('Error enumerating microphones:', error); + statusText.textContent = "Error accessing microphones. Please grant permission."; + } +} + +function populateMicrophoneSelect() { + if (!microphoneSelect) return; + + microphoneSelect.innerHTML = ''; + + availableMicrophones.forEach((device, index) => { + const option = document.createElement('option'); + option.value = device.deviceId; + option.textContent = device.label || `Microphone ${index + 1}`; + microphoneSelect.appendChild(option); + }); + + const savedMicId = localStorage.getItem('selectedMicrophone'); + if (savedMicId && availableMicrophones.some(mic => mic.deviceId === savedMicId)) { + microphoneSelect.value = savedMicId; + selectedMicrophoneId = savedMicId; + } +} + +function handleMicrophoneChange() { + selectedMicrophoneId = microphoneSelect.value || null; + localStorage.setItem('selectedMicrophone', selectedMicrophoneId || ''); + + const selectedDevice = availableMicrophones.find(mic => mic.deviceId === selectedMicrophoneId); + const deviceName = selectedDevice ? selectedDevice.label : 'Default Microphone'; + + console.log(`Selected microphone: ${deviceName}`); + statusText.textContent = `Microphone changed to: ${deviceName}`; + + if (isRecording) { + statusText.textContent = "Switching microphone... Please wait."; + stopRecording().then(() => { + setTimeout(() => { + toggleRecording(); + }, 1000); + }); + } +} + +// Helpers +function fmt1(x) { + const n = Number(x); + return Number.isFinite(n) ? n.toFixed(1) : x; +} + +// Default WebSocket URL computation +const host = window.location.hostname || "localhost"; +const port = window.location.port; +const protocol = window.location.protocol === "https:" ? 
"wss" : "ws"; +const defaultWebSocketUrl = websocketUrl; + +// Populate default caption and input +if (websocketDefaultSpan) websocketDefaultSpan.textContent = defaultWebSocketUrl; +websocketInput.value = defaultWebSocketUrl; +websocketUrl = defaultWebSocketUrl; + +// Optional chunk selector (guard for presence) +if (chunkSelector) { + chunkSelector.addEventListener("change", () => { + chunkDuration = parseInt(chunkSelector.value); + }); +} + +// WebSocket input change handling +websocketInput.addEventListener("change", () => { + const urlValue = websocketInput.value.trim(); + if (!urlValue.startsWith("ws://") && !urlValue.startsWith("wss://")) { + statusText.textContent = "Invalid WebSocket URL (must start with ws:// or wss://)"; + return; + } + websocketUrl = urlValue; + statusText.textContent = "WebSocket URL updated. Ready to connect."; +}); + +function setupWebSocket() { + return new Promise((resolve, reject) => { + try { + websocket = new WebSocket(websocketUrl); + } catch (error) { + statusText.textContent = "Invalid WebSocket URL. Please check and try again."; + reject(error); + return; + } + + websocket.onopen = () => { + statusText.textContent = "Connected to server."; + resolve(); + }; + + websocket.onclose = () => { + if (userClosing) { + if (waitingForStop) { + statusText.textContent = "Processing finalized or connection closed."; + if (lastReceivedData) { + renderLinesWithBuffer( + lastReceivedData.lines || [], + lastReceivedData.buffer_diarization || "", + lastReceivedData.buffer_transcription || "", + 0, + 0, + true + ); + } + } + } else { + statusText.textContent = "Disconnected from the WebSocket server. 
(Check logs if model is loading.)"; + if (isRecording) { + stopRecording(); + } + } + isRecording = false; + waitingForStop = false; + userClosing = false; + lastReceivedData = null; + websocket = null; + updateUI(); + }; + + websocket.onerror = () => { + statusText.textContent = "Error connecting to WebSocket."; + reject(new Error("Error connecting to WebSocket")); + }; + + websocket.onmessage = (event) => { + const data = JSON.parse(event.data); + + if (data.type === "ready_to_stop") { + console.log("Ready to stop received, finalizing display and closing WebSocket."); + waitingForStop = false; + + if (lastReceivedData) { + renderLinesWithBuffer( + lastReceivedData.lines || [], + lastReceivedData.buffer_diarization || "", + lastReceivedData.buffer_transcription || "", + 0, + 0, + true + ); + } + statusText.textContent = "Finished processing audio! Ready to record again."; + recordButton.disabled = false; + + if (websocket) { + websocket.close(); + } + return; + } + + lastReceivedData = data; + + const { + lines = [], + buffer_transcription = "", + buffer_diarization = "", + remaining_time_transcription = 0, + remaining_time_diarization = 0, + status = "active_transcription", + } = data; + + renderLinesWithBuffer( + lines, + buffer_diarization, + buffer_transcription, + remaining_time_diarization, + remaining_time_transcription, + false, + status + ); + }; + }); +} + +function renderLinesWithBuffer( + lines, + buffer_diarization, + buffer_transcription, + remaining_time_diarization, + remaining_time_transcription, + isFinalizing = false, + current_status = "active_transcription" +) { + if (current_status === "no_audio_detected") { + linesTranscriptDiv.innerHTML = + "

No audio detected...

"; + return; + } + + const showLoading = !isFinalizing && (lines || []).some((it) => it.speaker == 0); + const showTransLag = !isFinalizing && remaining_time_transcription > 0; + const showDiaLag = !isFinalizing && !!buffer_diarization && remaining_time_diarization > 0; + const signature = JSON.stringify({ + lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, beg: it.beg, end: it.end })), + buffer_transcription: buffer_transcription || "", + buffer_diarization: buffer_diarization || "", + status: current_status, + showLoading, + showTransLag, + showDiaLag, + isFinalizing: !!isFinalizing, + }); + if (lastSignature === signature) { + const t = document.querySelector(".lag-transcription-value"); + if (t) t.textContent = fmt1(remaining_time_transcription); + const d = document.querySelector(".lag-diarization-value"); + if (d) d.textContent = fmt1(remaining_time_diarization); + const ld = document.querySelector(".loading-diarization-value"); + if (ld) ld.textContent = fmt1(remaining_time_diarization); + return; + } + lastSignature = signature; + + const linesHtml = (lines || []) + .map((item, idx) => { + let timeInfo = ""; + if (item.beg !== undefined && item.end !== undefined) { + timeInfo = ` ${item.beg} - ${item.end}`; + } + + let speakerLabel = ""; + if (item.speaker === -2) { + speakerLabel = `Silence${timeInfo}`; + } else if (item.speaker == 0 && !isFinalizing) { + speakerLabel = `${fmt1( + remaining_time_diarization + )} second(s) of audio are undergoing diarization`; + } else if (item.speaker !== 0) { + speakerLabel = `Speaker ${item.speaker}${timeInfo}`; + } + + let currentLineText = item.text || ""; + + if (idx === lines.length - 1) { + if (!isFinalizing && item.speaker !== -2) { + if (remaining_time_transcription > 0) { + speakerLabel += `Lag ${fmt1( + remaining_time_transcription + )}s`; + } + if (buffer_diarization && remaining_time_diarization > 0) { + speakerLabel += `Lag${fmt1( + remaining_time_diarization + )}s`; + } + } + + if 
(buffer_diarization) { + if (isFinalizing) { + currentLineText += + (currentLineText.length > 0 && buffer_diarization.trim().length > 0 ? " " : "") + buffer_diarization.trim(); + } else { + currentLineText += `${buffer_diarization}`; + } + } + if (buffer_transcription) { + if (isFinalizing) { + currentLineText += + (currentLineText.length > 0 && buffer_transcription.trim().length > 0 ? " " : "") + + buffer_transcription.trim(); + } else { + currentLineText += `${buffer_transcription}`; + } + } + } + + return currentLineText.trim().length > 0 || speakerLabel.length > 0 + ? `

${speakerLabel}

${currentLineText}

` + : `

${speakerLabel}

`; + }) + .join(""); + + linesTranscriptDiv.innerHTML = linesHtml; + window.scrollTo({ top: document.body.scrollHeight, behavior: "smooth" }); +} + +function updateTimer() { + if (!startTime) return; + + const elapsed = Math.floor((Date.now() - startTime) / 1000); + const minutes = Math.floor(elapsed / 60).toString().padStart(2, "0"); + const seconds = (elapsed % 60).toString().padStart(2, "0"); + timerElement.textContent = `${minutes}:${seconds}`; +} + +function drawWaveform() { + if (!analyser) return; + + const bufferLength = analyser.frequencyBinCount; + const dataArray = new Uint8Array(bufferLength); + analyser.getByteTimeDomainData(dataArray); + + waveCtx.clearRect( + 0, + 0, + waveCanvas.width / (window.devicePixelRatio || 1), + waveCanvas.height / (window.devicePixelRatio || 1) + ); + waveCtx.lineWidth = 1; + waveCtx.strokeStyle = waveStroke; + waveCtx.beginPath(); + + const sliceWidth = (waveCanvas.width / (window.devicePixelRatio || 1)) / bufferLength; + let x = 0; + + for (let i = 0; i < bufferLength; i++) { + const v = dataArray[i] / 128.0; + const y = (v * (waveCanvas.height / (window.devicePixelRatio || 1))) / 2; + + if (i === 0) { + waveCtx.moveTo(x, y); + } else { + waveCtx.lineTo(x, y); + } + + x += sliceWidth; + } + + waveCtx.lineTo( + waveCanvas.width / (window.devicePixelRatio || 1), + (waveCanvas.height / (window.devicePixelRatio || 1)) / 2 + ); + waveCtx.stroke(); + + animationFrame = requestAnimationFrame(drawWaveform); +} + +async function startRecording() { + try { + try { + wakeLock = await navigator.wakeLock.request("screen"); + } catch (err) { + console.log("Error acquiring wake lock."); + } + + let stream; + try { + // Try tab capture first + stream = await new Promise((resolve, reject) => { + chrome.tabCapture.capture({audio: true}, (s) => { + if (s) { + resolve(s); + } else { + reject(new Error('Tab capture failed or not available')); + } + }); + }); + statusText.textContent = "Using tab audio capture."; + } catch (tabError) { + 
console.log('Tab capture not available, falling back to microphone', tabError); + // Fallback to microphone + const audioConstraints = selectedMicrophoneId + ? { audio: { deviceId: { exact: selectedMicrophoneId } } } + : { audio: true }; + stream = await navigator.mediaDevices.getUserMedia(audioConstraints); + statusText.textContent = "Using microphone audio."; + } + + audioContext = new (window.AudioContext || window.webkitAudioContext)(); + analyser = audioContext.createAnalyser(); + analyser.fftSize = 256; + microphone = audioContext.createMediaStreamSource(stream); + microphone.connect(analyser); + + recorder = new MediaRecorder(stream, { mimeType: "audio/webm" }); + recorder.ondataavailable = (e) => { + if (websocket && websocket.readyState === WebSocket.OPEN) { + websocket.send(e.data); + } + }; + recorder.start(chunkDuration); + + startTime = Date.now(); + timerInterval = setInterval(updateTimer, 1000); + drawWaveform(); + + isRecording = true; + updateUI(); + } catch (err) { + if (window.location.hostname === "0.0.0.0") { + statusText.textContent = + "Error accessing audio input. Browsers may block audio access on 0.0.0.0. Try using localhost:8000 instead."; + } else { + statusText.textContent = "Error accessing audio input. Please check permissions."; + } + console.error(err); + } +} + +async function stopRecording() { + if (wakeLock) { + try { + await wakeLock.release(); + } catch (e) { + // ignore + } + wakeLock = null; + } + + userClosing = true; + waitingForStop = true; + + if (websocket && websocket.readyState === WebSocket.OPEN) { + const emptyBlob = new Blob([], { type: "audio/webm" }); + websocket.send(emptyBlob); + statusText.textContent = "Recording stopped. 
Processing final audio..."; + } + + if (recorder) { + recorder.stop(); + recorder = null; + } + + if (microphone) { + microphone.disconnect(); + microphone = null; + } + + if (analyser) { + analyser = null; + } + + if (audioContext && audioContext.state !== "closed") { + try { + await audioContext.close(); + } catch (e) { + console.warn("Could not close audio context:", e); + } + audioContext = null; + } + + if (animationFrame) { + cancelAnimationFrame(animationFrame); + animationFrame = null; + } + + if (timerInterval) { + clearInterval(timerInterval); + timerInterval = null; + } + timerElement.textContent = "00:00"; + startTime = null; + + isRecording = false; + updateUI(); +} + +async function toggleRecording() { + if (!isRecording) { + if (waitingForStop) { + console.log("Waiting for stop, early return"); + return; + } + console.log("Connecting to WebSocket"); + try { + if (websocket && websocket.readyState === WebSocket.OPEN) { + await startRecording(); + } else { + await setupWebSocket(); + await startRecording(); + } + } catch (err) { + statusText.textContent = "Could not connect to WebSocket or access mic. Aborted."; + console.error(err); + } + } else { + console.log("Stopping recording"); + stopRecording(); + } +} + +function updateUI() { + recordButton.classList.toggle("recording", isRecording); + recordButton.disabled = waitingForStop; + + if (waitingForStop) { + if (statusText.textContent !== "Recording stopped. Processing final audio...") { + statusText.textContent = "Please wait for processing to complete..."; + } + } else if (isRecording) { + statusText.textContent = "Recording..."; + } else { + if ( + statusText.textContent !== "Finished processing audio! Ready to record again." && + statusText.textContent !== "Processing finalized or connection closed." 
+ ) { + statusText.textContent = "Click to start transcription"; + } + } + if (!waitingForStop) { + recordButton.disabled = false; + } +} + +recordButton.addEventListener("click", toggleRecording); + +if (microphoneSelect) { + microphoneSelect.addEventListener("change", handleMicrophoneChange); +} +// document.addEventListener('DOMContentLoaded', async () => { +// try { +// await enumerateMicrophones(); +// } catch (error) { +// console.log("Could not enumerate microphones on load:", error); +// } +// }); +// navigator.mediaDevices.addEventListener('devicechange', async () => { +// console.log('Device change detected, re-enumerating microphones'); +// try { +// await enumerateMicrophones(); +// } catch (error) { +// console.log("Error re-enumerating microphones:", error); +// } +// }); diff --git a/chrome-extension/web/src/dark_mode.svg b/chrome-extension/web/src/dark_mode.svg new file mode 100644 index 0000000..a083e1a --- /dev/null +++ b/chrome-extension/web/src/dark_mode.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/chrome-extension/web/src/light_mode.svg b/chrome-extension/web/src/light_mode.svg new file mode 100644 index 0000000..66b6e74 --- /dev/null +++ b/chrome-extension/web/src/light_mode.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/chrome-extension/web/src/system_mode.svg b/chrome-extension/web/src/system_mode.svg new file mode 100644 index 0000000..7a8a0d2 --- /dev/null +++ b/chrome-extension/web/src/system_mode.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/whisperlivekit/simul_whisper/simul_whisper.py b/whisperlivekit/simul_whisper/simul_whisper.py index 0b8649e..c1f8c2e 100644 --- a/whisperlivekit/simul_whisper/simul_whisper.py +++ b/whisperlivekit/simul_whisper/simul_whisper.py @@ -399,17 +399,17 @@ class PaddedAlignAttWhisper: mlx_mel_padded = mlx_log_mel_spectrogram(audio=input_segments.detach(), n_mels=self.model.dims.n_mels, padding=N_SAMPLES) mlx_mel = mlx_pad_or_trim(mlx_mel_padded, N_FRAMES, 
axis=-2) mlx_encoder_feature = self.mlx_encoder.encoder(mlx_mel[None]) - encoder_feature = torch.as_tensor(mlx_encoder_feature) + encoder_feature = torch.tensor(np.array(mlx_encoder_feature)) content_mel_len = int((mlx_mel_padded.shape[0] - mlx_mel.shape[0])/2) - device = encoder_feature.device #'cpu' is apple silicon + device = 'cpu' elif self.fw_encoder: audio_length_seconds = len(input_segments) / 16000 content_mel_len = int(audio_length_seconds * 100)//2 mel_padded_2 = self.fw_feature_extractor(waveform=input_segments.numpy(), padding=N_SAMPLES)[None, :] mel = fw_pad_or_trim(mel_padded_2, N_FRAMES, axis=-1) encoder_feature_ctranslate = self.fw_encoder.encode(mel) - encoder_feature = torch.as_tensor(encoder_feature_ctranslate) - device = encoder_feature.device + encoder_feature = torch.Tensor(np.array(encoder_feature_ctranslate)) + device = 'cpu' else: # mel + padding to 30s mel_padded = log_mel_spectrogram(input_segments, n_mels=self.model.dims.n_mels, padding=N_SAMPLES,