diff --git a/.gitignore b/.gitignore index b5d3257e7e6..e24f5cb79b1 100644 --- a/.gitignore +++ b/.gitignore @@ -120,3 +120,4 @@ dist/protocol.schema.json # Synthing **/.stfolder/ +apps/android/.kotlin/ diff --git a/apps/android/app/src/main/java/ai/openclaw/android/MainViewModel.kt b/apps/android/app/src/main/java/ai/openclaw/android/MainViewModel.kt index 9cb7d626ce7..6af97c87543 100644 --- a/apps/android/app/src/main/java/ai/openclaw/android/MainViewModel.kt +++ b/apps/android/app/src/main/java/ai/openclaw/android/MainViewModel.kt @@ -130,6 +130,10 @@ class MainViewModel(app: Application) : AndroidViewModel(app) { runtime.setCanvasDebugStatusEnabled(value) } + fun setVoiceScreenActive(active: Boolean) { + runtime.setVoiceScreenActive(active) + } + fun setMicEnabled(enabled: Boolean) { runtime.setMicEnabled(enabled) } diff --git a/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt b/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt index f462413669b..d85673bf75d 100644 --- a/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt +++ b/apps/android/app/src/main/java/ai/openclaw/android/NodeRuntime.kt @@ -387,11 +387,22 @@ class NodeRuntime(context: Context) { val micIsSending: StateFlow get() = micCapture.isSending + private val talkMode: TalkModeManager by lazy { + TalkModeManager( + context = appContext, + scope = scope, + session = operatorSession, + supportsChatSubscribe = true, + isConnected = { operatorConnected }, + ) + } + private fun applyMainSessionKey(candidate: String?) { val trimmed = normalizeMainKey(candidate) ?: return if (isCanonicalMainSessionKey(_mainSessionKey.value)) return if (_mainSessionKey.value == trimmed) return _mainSessionKey.value = trimmed + talkMode.setMainSessionKey(trimmed) chat.applyMainSessionKey(trimmed) } @@ -529,7 +540,14 @@ class NodeRuntime(context: Context) { scope.launch { prefs.talkEnabled.collect { enabled -> + // MicCaptureManager handles STT + send to gateway. + // TalkModeManager plays TTS on assistant responses. micCapture.setMicEnabled(enabled) + if (enabled) { + // Mic on = user is on voice screen and wants TTS responses. + talkMode.ttsOnAllResponses = true + scope.launch { talkMode.ensureChatSubscribed() } + } externalAudioCaptureActive.value = enabled } } @@ -637,8 +655,25 @@ class NodeRuntime(context: Context) { prefs.setCanvasDebugStatusEnabled(value) } + fun setVoiceScreenActive(active: Boolean) { + if (!active) { + // User left voice screen — stop mic and TTS + talkMode.ttsOnAllResponses = false + talkMode.stopTts() + micCapture.setMicEnabled(false) + prefs.setTalkEnabled(false) + } + // Don't re-enable on active=true; mic toggle drives that + } + fun setMicEnabled(value: Boolean) { prefs.setTalkEnabled(value) + if (value) { + // Tapping mic on interrupts any active TTS (barge-in) + talkMode.stopTts() + talkMode.ttsOnAllResponses = true + scope.launch { talkMode.ensureChatSubscribed() } + } micCapture.setMicEnabled(value) externalAudioCaptureActive.value = value } @@ -834,6 +869,7 @@ class NodeRuntime(context: Context) { private fun handleGatewayEvent(event: String, payloadJson: String?) { micCapture.handleGatewayEvent(event, payloadJson) + talkMode.handleGatewayEvent(event, payloadJson) chat.handleGatewayEvent(event, payloadJson) } diff --git a/apps/android/app/src/main/java/ai/openclaw/android/ui/PostOnboardingTabs.kt b/apps/android/app/src/main/java/ai/openclaw/android/ui/PostOnboardingTabs.kt index 1345d8e3cb9..e7adf00b18f 100644 --- a/apps/android/app/src/main/java/ai/openclaw/android/ui/PostOnboardingTabs.kt +++ b/apps/android/app/src/main/java/ai/openclaw/android/ui/PostOnboardingTabs.kt @@ -31,6 +31,7 @@ import androidx.compose.material3.Surface import androidx.compose.material3.Text import androidx.compose.runtime.Composable import androidx.compose.runtime.collectAsState +import androidx.compose.runtime.LaunchedEffect import androidx.compose.runtime.getValue import androidx.compose.runtime.mutableStateOf import androidx.compose.runtime.remember @@ -68,6 +69,11 @@ private enum class StatusVisual { fun PostOnboardingTabs(viewModel: MainViewModel, modifier: Modifier = Modifier) { var activeTab by rememberSaveable { mutableStateOf(HomeTab.Connect) } + // Stop TTS when user navigates away from voice tab + LaunchedEffect(activeTab) { + viewModel.setVoiceScreenActive(activeTab == HomeTab.Voice) + } + val statusText by viewModel.statusText.collectAsState() val isConnected by viewModel.isConnected.collectAsState() diff --git a/apps/android/app/src/main/java/ai/openclaw/android/ui/VoiceTabScreen.kt b/apps/android/app/src/main/java/ai/openclaw/android/ui/VoiceTabScreen.kt index 7233135be83..fd0e0a8a4b9 100644 --- a/apps/android/app/src/main/java/ai/openclaw/android/ui/VoiceTabScreen.kt +++ b/apps/android/app/src/main/java/ai/openclaw/android/ui/VoiceTabScreen.kt @@ -101,7 +101,11 @@ fun VoiceTabScreen(viewModel: MainViewModel) { } } lifecycleOwner.lifecycle.addObserver(observer) - onDispose { lifecycleOwner.lifecycle.removeObserver(observer) } + onDispose { + lifecycleOwner.lifecycle.removeObserver(observer) + // Stop TTS when leaving the voice screen + viewModel.setVoiceScreenActive(false) + } } val requestMicPermission = diff --git a/apps/android/app/src/main/java/ai/openclaw/android/voice/TalkModeManager.kt b/apps/android/app/src/main/java/ai/openclaw/android/voice/TalkModeManager.kt index 8c5dc9adbcb..8bafd603b85 100644 --- a/apps/android/app/src/main/java/ai/openclaw/android/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/ai/openclaw/android/voice/TalkModeManager.kt @@ -5,6 +5,7 @@ import android.content.Context import android.content.Intent import android.content.pm.PackageManager import android.media.AudioAttributes +import android.media.AudioFocusRequest import android.media.AudioFormat import android.media.AudioManager import android.media.AudioTrack @@ -23,16 +24,19 @@ import androidx.core.content.ContextCompat import ai.openclaw.android.gateway.GatewaySession import ai.openclaw.android.isCanonicalMainSessionKey import ai.openclaw.android.normalizeMainKey +import android.os.Build +import java.io.File import java.net.HttpURLConnection import java.net.URL import java.util.UUID import java.util.concurrent.atomic.AtomicLong -import kotlinx.coroutines.CompletableDeferred import kotlinx.coroutines.CancellationException +import kotlinx.coroutines.CompletableDeferred import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.Job import kotlinx.coroutines.delay +import kotlinx.coroutines.ensureActive import kotlinx.coroutines.flow.MutableStateFlow import kotlinx.coroutines.flow.StateFlow import kotlinx.coroutines.launch @@ -56,7 +60,12 @@ class TalkModeManager( private const val tag = "TalkMode" private const val defaultModelIdFallback = "eleven_v3" private const val defaultOutputFormatFallback = "pcm_24000" - private const val defaultTalkProvider = "elevenlabs" +private const val defaultTalkProvider = "elevenlabs" + private const val silenceWindowMs = 500L + private const val listenWatchdogMs = 12_000L + private const val chatFinalWaitWithSubscribeMs = 45_000L + private const val chatFinalWaitWithoutSubscribeMs = 6_000L + private const val maxCachedRunCompletions = 128 internal data class TalkProviderConfigSelection( val provider: String, @@ -140,26 +149,55 @@ class TalkModeManager( private var defaultOutputFormat: String? = null private var apiKey: String? = null private var voiceAliases: Map = emptyMap() - private var interruptOnSpeech: Boolean = true + // Interrupt-on-speech is disabled by default: starting a SpeechRecognizer during + // TTS creates an audio session conflict on OxygenOS/OnePlus that causes AudioTrack + // write to return 0 and MediaPlayer to error. Can be enabled via gateway talk config. + private var activeProviderIsElevenLabs: Boolean = true + private var interruptOnSpeech: Boolean = false private var voiceOverrideActive = false private var modelOverrideActive = false private var mainSessionKey: String = "main" - private var pendingRunId: String? = null + @Volatile private var pendingRunId: String? = null private var pendingFinal: CompletableDeferred? = null + private val completedRunsLock = Any() + private val completedRunStates = LinkedHashMap() + private val completedRunTexts = LinkedHashMap() private var chatSubscribedSessionKey: String? = null private var configLoaded = false @Volatile private var playbackEnabled = true private val playbackGeneration = AtomicLong(0L) + private var ttsJob: Job? = null private var player: MediaPlayer? = null private var streamingSource: StreamingMediaDataSource? = null private var pcmTrack: AudioTrack? = null @Volatile private var pcmStopRequested = false + @Volatile private var finalizeInFlight = false + private var listenWatchdogJob: Job? = null private var systemTts: TextToSpeech? = null private var systemTtsPending: CompletableDeferred? = null private var systemTtsPendingId: String? = null + private var audioFocusRequest: AudioFocusRequest? = null + private val audioFocusListener = AudioManager.OnAudioFocusChangeListener { focusChange -> + when (focusChange) { + AudioManager.AUDIOFOCUS_LOSS, + AudioManager.AUDIOFOCUS_LOSS_TRANSIENT -> { + if (_isSpeaking.value) { + Log.d(tag, "audio focus lost; stopping TTS") + stopSpeaking(resetInterrupt = true) + } + } + else -> { /* regained or duck — ignore */ } + } + } + + suspend fun ensureChatSubscribed() { + reloadConfig() + subscribeChatIfNeeded(session = session, sessionKey = mainSessionKey.ifBlank { "main" }) + } + fun setMainSessionKey(sessionKey: String?) { val trimmed = sessionKey?.trim().orEmpty() if (trimmed.isEmpty()) return @@ -179,10 +217,174 @@ class TalkModeManager( } } + /** + * Speak a wake-word command through TalkMode's full pipeline: + * chat.send → wait for final → read assistant text → TTS. + * Calls [onComplete] when done so the caller can disable TalkMode and re-arm VoiceWake. + */ + fun speakWakeCommand(command: String, onComplete: () -> Unit) { + scope.launch { + try { + reloadConfig() + subscribeChatIfNeeded(session = session, sessionKey = mainSessionKey.ifBlank { "main" }) + val startedAt = System.currentTimeMillis().toDouble() / 1000.0 + val prompt = buildPrompt(command) + val runId = sendChat(prompt, session) + val ok = waitForChatFinal(runId) + val assistant = consumeRunText(runId) + ?: waitForAssistantText(session, startedAt, if (ok) 12_000 else 25_000) + if (!assistant.isNullOrBlank()) { + val playbackToken = playbackGeneration.incrementAndGet() + _statusText.value = "Speaking…" + playAssistant(assistant, playbackToken) + } else { + _statusText.value = "No reply" + } + } catch (err: Throwable) { + Log.w(tag, "speakWakeCommand failed: ${err.message}") + } + onComplete() + } + } + + /** When true, play TTS for all final chat responses (even ones we didn't initiate). */ + @Volatile var ttsOnAllResponses = false + + // Streaming TTS: active session keyed by runId + private var streamingTts: ElevenLabsStreamingTts? = null + private var streamingFullText: String = "" + @Volatile private var lastHandledStreamingRunId: String? = null + private var drainingTts: ElevenLabsStreamingTts? = null + + private fun stopActiveStreamingTts() { + streamingTts?.stop() + streamingTts = null + drainingTts?.stop() + drainingTts = null + streamingFullText = "" + } + + /** Handle agent stream events — only speak assistant text, not tool calls or thinking. */ + private fun handleAgentStreamEvent(payloadJson: String?) { + if (payloadJson.isNullOrBlank()) return + val payload = try { + json.parseToJsonElement(payloadJson).asObjectOrNull() + } catch (_: Throwable) { null } ?: return + + // Only speak events for the active session — prevents TTS leaking from + // concurrent sessions/channels (privacy + correctness). + val eventSession = payload["sessionKey"]?.asStringOrNull() + val activeSession = mainSessionKey.ifBlank { "main" } + if (eventSession != null && eventSession != activeSession) return + + val stream = payload["stream"]?.asStringOrNull() ?: return + if (stream != "assistant") return // Only speak assistant text + val data = payload["data"]?.asObjectOrNull() ?: return + if (data["type"]?.asStringOrNull() == "thinking") return // Skip thinking tokens + val text = data["text"]?.asStringOrNull()?.trim() ?: return + if (text.isEmpty()) return + if (!playbackEnabled) { + stopActiveStreamingTts() + return + } + + // Start streaming session if not already active + if (streamingTts == null) { + if (!activeProviderIsElevenLabs) return // Non-ElevenLabs provider — skip streaming TTS + val voiceId = currentVoiceId ?: defaultVoiceId + val apiKey = this.apiKey + if (voiceId == null || apiKey == null) { + Log.w(tag, "streaming TTS: missing voiceId or apiKey") + return + } + val modelId = currentModelId ?: defaultModelId ?: "" + val streamModel = if (ElevenLabsStreamingTts.supportsStreaming(modelId)) { + modelId + } else { + "eleven_flash_v2_5" + } + val tts = ElevenLabsStreamingTts( + scope = scope, + voiceId = voiceId, + apiKey = apiKey, + modelId = streamModel, + outputFormat = "pcm_24000", + sampleRate = 24000, + ) + streamingTts = tts + streamingFullText = "" + _isSpeaking.value = true + _statusText.value = "Speaking…" + tts.start() + Log.d(tag, "streaming TTS started for agent assistant text") + lastHandledStreamingRunId = null // will be set on final + } + + val accepted = streamingTts?.sendText(text) ?: false + if (!accepted && streamingTts != null) { + Log.d(tag, "text diverged, restarting streaming TTS") + streamingTts?.stop() + streamingTts = null + // Restart with the new text + val voiceId2 = currentVoiceId ?: defaultVoiceId + val apiKey2 = this.apiKey + if (voiceId2 != null && apiKey2 != null) { + val modelId2 = currentModelId ?: defaultModelId ?: "" + val streamModel2 = if (ElevenLabsStreamingTts.supportsStreaming(modelId2)) modelId2 else "eleven_flash_v2_5" + val newTts = ElevenLabsStreamingTts( + scope = scope, voiceId = voiceId2, apiKey = apiKey2, + modelId = streamModel2, outputFormat = "pcm_24000", sampleRate = 24000, + ) + streamingTts = newTts + streamingFullText = text + newTts.start() + newTts.sendText(streamingFullText) + Log.d(tag, "streaming TTS restarted with new text") + } + } + } + + /** Called when chat final/error/aborted arrives — finish any active streaming TTS. */ + private fun finishStreamingTts() { + streamingFullText = "" + val tts = streamingTts ?: return + // Null out immediately so the next response creates a fresh TTS instance. + // The drain coroutine below holds a reference to this instance for cleanup. + streamingTts = null + drainingTts = tts + tts.finish() + scope.launch { + delay(500) + while (tts.isPlaying.value) { delay(200) } + if (drainingTts === tts) drainingTts = null + _isSpeaking.value = false + _statusText.value = "Ready" + } + } + + fun playTtsForText(text: String) { + val playbackToken = playbackGeneration.incrementAndGet() + ttsJob?.cancel() + ttsJob = scope.launch { + reloadConfig() + ensurePlaybackActive(playbackToken) + _isSpeaking.value = true + _statusText.value = "Speaking…" + playAssistant(text, playbackToken) + ttsJob = null + } + } + fun handleGatewayEvent(event: String, payloadJson: String?) { + if (ttsOnAllResponses) { + Log.d(tag, "gateway event: $event") + } + if (event == "agent" && ttsOnAllResponses) { + handleAgentStreamEvent(payloadJson) + return + } if (event != "chat") return if (payloadJson.isNullOrBlank()) return - val pending = pendingRunId ?: return val obj = try { json.parseToJsonElement(payloadJson).asObjectOrNull() @@ -190,13 +392,68 @@ class TalkModeManager( null } ?: return val runId = obj["runId"].asStringOrNull() ?: return - if (runId != pending) return val state = obj["state"].asStringOrNull() ?: return - if (state == "final") { - pendingFinal?.complete(true) - pendingFinal = null - pendingRunId = null + + // Only speak events for the active session — prevents TTS from other + // sessions/channels leaking into voice mode (privacy + correctness). + val eventSession = obj["sessionKey"]?.asStringOrNull() + val activeSession = mainSessionKey.ifBlank { "main" } + if (eventSession != null && eventSession != activeSession) return + + // If this is a response we initiated, handle normally below. + // Otherwise, if ttsOnAllResponses, finish streaming TTS on terminal events. + val pending = pendingRunId + if (pending == null || runId != pending) { + if (ttsOnAllResponses && state in listOf("final", "error", "aborted")) { + // Skip if we already handled TTS for this run (multiple final events + // can arrive on different threads for the same run). + if (lastHandledStreamingRunId == runId) { + if (pending == null || runId != pending) return + } + lastHandledStreamingRunId = runId + val stts = streamingTts + if (stts != null) { + // Finish streaming and let the drain coroutine handle playback completion. + // Don’t check hasReceivedAudio synchronously — audio may still be in flight + // from the WebSocket (EOS was just sent). The drain coroutine in finishStreamingTts + // waits for playback to complete; if ElevenLabs truly fails, the user just won’t + // hear anything (silent failure is better than double-speaking with system TTS). + finishStreamingTts() + } else if (state == "final") { + // No streaming was active — fall back to non-streaming + val text = extractTextFromChatEventMessage(obj["message"]) + if (!text.isNullOrBlank()) { + playTtsForText(text) + } + } + } + if (pending == null || runId != pending) return } + Log.d(tag, "chat event arrived runId=$runId state=$state pendingRunId=$pendingRunId") + val terminal = + when (state) { + "final" -> true + "aborted", "error" -> false + else -> null + } ?: return + // Cache text from final event so we never need to poll chat.history + if (terminal) { + val text = extractTextFromChatEventMessage(obj["message"]) + if (!text.isNullOrBlank()) { + synchronized(completedRunsLock) { + completedRunTexts[runId] = text + while (completedRunTexts.size > maxCachedRunCompletions) { + completedRunTexts.entries.firstOrNull()?.let { completedRunTexts.remove(it.key) } + } + } + } + } + cacheRunCompletion(runId, terminal) + + if (runId != pendingRunId) return + pendingFinal?.complete(terminal) + pendingFinal = null + pendingRunId = null } fun setPlaybackEnabled(enabled: Boolean) { @@ -204,6 +461,7 @@ class TalkModeManager( playbackEnabled = enabled if (!enabled) { playbackGeneration.incrementAndGet() + stopActiveStreamingTts() stopSpeaking() } } @@ -258,6 +516,7 @@ class TalkModeManager( private fun stop() { stopRequested = true + finalizeInFlight = false listeningMode = false restartJob?.cancel() restartJob = null @@ -270,6 +529,13 @@ class TalkModeManager( stopSpeaking() _usingFallbackTts.value = false chatSubscribedSessionKey = null + pendingRunId = null + pendingFinal?.cancel() + pendingFinal = null + synchronized(completedRunsLock) { + completedRunStates.clear() + completedRunTexts.clear() + } mainHandler.post { recognizer?.cancel() @@ -290,6 +556,10 @@ class TalkModeManager( putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true) putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3) putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName) + // Use cloud recognition — it handles natural speech and pauses better + // than on-device which cuts off aggressively after short silences. + putExtra(RecognizerIntent.EXTRA_SPEECH_INPUT_COMPLETE_SILENCE_LENGTH_MILLIS, 2500L) + putExtra(RecognizerIntent.EXTRA_SPEECH_INPUT_POSSIBLY_COMPLETE_SILENCE_LENGTH_MILLIS, 1800L) } if (markListening) { @@ -309,8 +579,8 @@ class TalkModeManager( if (stopRequested) return@post try { recognizer?.cancel() - val shouldListen = listeningMode - val shouldInterrupt = _isSpeaking.value && interruptOnSpeech + val shouldListen = listeningMode && !finalizeInFlight + val shouldInterrupt = _isSpeaking.value && interruptOnSpeech && shouldAllowSpeechInterrupt() if (!shouldListen && !shouldInterrupt) return@post startListeningInternal(markListening = shouldListen) } catch (_: Throwable) { @@ -338,6 +608,9 @@ class TalkModeManager( if (isFinal) { lastTranscript = trimmed + // Don't finalize immediately — let the silence monitor trigger after + // silenceWindowMs. This allows the recognizer to fire onResults and + // still give the user a natural pause before we send. } } @@ -359,7 +632,15 @@ class TalkModeManager( val lastHeard = lastHeardAtMs ?: return val elapsed = SystemClock.elapsedRealtime() - lastHeard if (elapsed < silenceWindowMs) return - scope.launch { finalizeTranscript(transcript) } + if (finalizeInFlight) return + finalizeInFlight = true + scope.launch { + try { + finalizeTranscript(transcript) + } finally { + finalizeInFlight = false + } + } } private suspend fun finalizeTranscript(transcript: String) { @@ -368,6 +649,16 @@ class TalkModeManager( _statusText.value = "Thinking…" lastTranscript = "" lastHeardAtMs = null + // Release SpeechRecognizer before making the API call and playing TTS. + // Must use withContext(Main) — not post() — so we WAIT for destruction before + // proceeding. A fire-and-forget post() races with TTS startup: the recognizer + // stays alive, picks up TTS audio as speech (onBeginningOfSpeech), and the + // OS kills the AudioTrack write (returns 0) on OxygenOS/OnePlus devices. + withContext(Dispatchers.Main) { + recognizer?.cancel() + recognizer?.destroy() + recognizer = null + } ensureConfigLoaded() val prompt = buildPrompt(transcript) @@ -388,7 +679,9 @@ class TalkModeManager( if (!ok) { Log.w(tag, "chat final timeout runId=$runId; attempting history fallback") } - val assistant = waitForAssistantText(session, startedAt, if (ok) 12_000 else 25_000) + // Use text cached from the final event first — avoids chat.history polling + val assistant = consumeRunText(runId) + ?: waitForAssistantText(session, startedAt, if (ok) 12_000 else 25_000) if (assistant.isNullOrBlank()) { _statusText.value = "No reply" Log.w(tag, "assistant text timeout runId=$runId") @@ -482,6 +775,36 @@ class TalkModeManager( return result } + private fun cacheRunCompletion(runId: String, isFinal: Boolean) { + synchronized(completedRunsLock) { + completedRunStates[runId] = isFinal + while (completedRunStates.size > maxCachedRunCompletions) { + val first = completedRunStates.entries.firstOrNull() ?: break + completedRunStates.remove(first.key) + } + } + } + + private fun consumeRunCompletion(runId: String): Boolean? { + synchronized(completedRunsLock) { + return completedRunStates.remove(runId) + } + } + + private fun consumeRunText(runId: String): String? { + synchronized(completedRunsLock) { + return completedRunTexts.remove(runId) + } + } + + private fun extractTextFromChatEventMessage(messageEl: JsonElement?): String? { + val msg = messageEl?.asObjectOrNull() ?: return null + val content = msg["content"] as? JsonArray ?: return null + return content.mapNotNull { entry -> + entry.asObjectOrNull()?.get("text")?.asStringOrNull()?.trim() + }.filter { it.isNotEmpty() }.joinToString("\n").takeIf { it.isNotBlank() } + } + private suspend fun waitForAssistantText( session: GatewaySession, sinceSeconds: Double, @@ -566,6 +889,7 @@ class TalkModeManager( _isSpeaking.value = true lastSpokenText = cleaned ensureInterruptListener() + requestAudioFocusForTts() try { val canUseElevenLabs = !voiceId.isNullOrBlank() && !apiKey.isNullOrEmpty() @@ -623,6 +947,7 @@ class TalkModeManager( Log.w(tag, "system voice failed: ${fallbackErr.message ?: fallbackErr::class.simpleName}") } } finally { + _isSpeaking.value = false } } @@ -655,8 +980,14 @@ class TalkModeManager( } } - ensurePlaybackActive(playbackToken) - streamAndPlayMp3(voiceId = voiceId, apiKey = apiKey, request = request, playbackToken = playbackToken) + // When falling back from PCM, rewrite format to MP3 and download to file. + // File-based playback avoids custom DataSource races and is reliable across OEMs. + val mp3Request = if (request.outputFormat?.startsWith("pcm_") == true) { + request.copy(outputFormat = "mp3_44100_128") + } else { + request + } + streamAndPlayMp3(voiceId = voiceId, apiKey = apiKey, request = mp3Request, playbackToken = playbackToken) } private suspend fun streamAndPlayMp3( @@ -677,7 +1008,7 @@ class TalkModeManager( player.setAudioAttributes( AudioAttributes.Builder() .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) - .setUsage(AudioAttributes.USAGE_ASSISTANT) + .setUsage(AudioAttributes.USAGE_MEDIA) .build(), ) player.setOnPreparedListener { @@ -724,6 +1055,74 @@ class TalkModeManager( Log.d(tag, "play done") } + /** + * Download ElevenLabs audio to a temp file, then play from disk via MediaPlayer. + * Simpler and more reliable than streaming: avoids custom DataSource races and + * AudioTrack underrun issues on OxygenOS/OnePlus. + */ + private suspend fun streamAndPlayViaFile(voiceId: String, apiKey: String, request: ElevenLabsRequest) { + val tempFile = withContext(Dispatchers.IO) { + val file = File.createTempFile("tts_", ".mp3", context.cacheDir) + val conn = openTtsConnection(voiceId = voiceId, apiKey = apiKey, request = request) + try { + val payload = buildRequestPayload(request) + conn.outputStream.use { it.write(payload.toByteArray()) } + val code = conn.responseCode + if (code >= 400) { + val body = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: "" + file.delete() + throw IllegalStateException("ElevenLabs failed: $code $body") + } + Log.d(tag, "elevenlabs http code=$code voiceId=$voiceId format=${request.outputFormat}") + // Manual loop so cancellation is honoured on every chunk. + // input.copyTo() is a single blocking call with no yield points; if the + // coroutine is cancelled mid-download the entire response would finish + // before cancellation was observed. + conn.inputStream.use { input -> + file.outputStream().use { out -> + val buf = ByteArray(8192) + var n: Int + while (input.read(buf).also { n = it } != -1) { + ensureActive() + out.write(buf, 0, n) + } + } + } + } catch (err: Throwable) { + file.delete() + throw err + } finally { + conn.disconnect() + } + file + } + try { + val player = MediaPlayer() + this.player = player + val finished = CompletableDeferred() + player.setAudioAttributes( + AudioAttributes.Builder() + .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) + .setUsage(AudioAttributes.USAGE_MEDIA) + .build(), + ) + player.setOnCompletionListener { finished.complete(Unit) } + player.setOnErrorListener { _, what, extra -> + finished.completeExceptionally(IllegalStateException("MediaPlayer error what=$what extra=$extra")) + true + } + player.setDataSource(tempFile.absolutePath) + withContext(Dispatchers.IO) { player.prepare() } + Log.d(tag, "file play start bytes=${tempFile.length()}") + player.start() + finished.await() + Log.d(tag, "file play done") + } finally { + try { cleanupPlayer() } catch (_: Throwable) {} + tempFile.delete() + } + } + private suspend fun streamAndPlayPcm( voiceId: String, apiKey: String, @@ -747,7 +1146,7 @@ class TalkModeManager( AudioTrack( AudioAttributes.Builder() .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) - .setUsage(AudioAttributes.USAGE_ASSISTANT) + .setUsage(AudioAttributes.USAGE_MEDIA) .build(), AudioFormat.Builder() .setSampleRate(sampleRate) @@ -763,7 +1162,10 @@ class TalkModeManager( throw IllegalStateException("AudioTrack init failed") } pcmTrack = track - track.play() + // Don't call track.play() yet — start the track only when the first audio + // chunk arrives from ElevenLabs (see streamPcm). OxygenOS/OnePlus kills an + // AudioTrack that underruns (no data written) for ~1+ seconds, causing + // write() to return 0. Deferring play() until first data avoids the underrun. Log.d(tag, "pcm play start sampleRate=$sampleRate bufferSize=$bufferSize") try { @@ -869,6 +1271,14 @@ class TalkModeManager( } } + /** Stop any active TTS immediately — call when user taps mic to barge in. */ + fun stopTts() { + stopActiveStreamingTts() + stopSpeaking(resetInterrupt = true) + _isSpeaking.value = false + _statusText.value = "Listening" + } + private fun stopSpeaking(resetInterrupt: Boolean = true) { pcmStopRequested = true if (!_isSpeaking.value) { @@ -878,6 +1288,7 @@ class TalkModeManager( systemTtsPending?.cancel() systemTtsPending = null systemTtsPendingId = null + abandonAudioFocus() return } if (resetInterrupt) { @@ -891,6 +1302,57 @@ class TalkModeManager( systemTtsPending = null systemTtsPendingId = null _isSpeaking.value = false + abandonAudioFocus() + } + + private fun shouldAllowSpeechInterrupt(): Boolean { + return !finalizeInFlight + } + + private fun clearListenWatchdog() { + listenWatchdogJob?.cancel() + listenWatchdogJob = null + } + + private fun requestAudioFocusForTts(): Boolean { + val am = context.getSystemService(Context.AUDIO_SERVICE) as? AudioManager ?: return true + return if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + val req = AudioFocusRequest.Builder(AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_MAY_DUCK) + .setAudioAttributes( + AudioAttributes.Builder() + .setUsage(AudioAttributes.USAGE_MEDIA) + .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) + .build() + ) + .setOnAudioFocusChangeListener(audioFocusListener) + .build() + audioFocusRequest = req + val result = am.requestAudioFocus(req) + Log.d(tag, "audio focus request result=$result") + result == AudioManager.AUDIOFOCUS_REQUEST_GRANTED || result == AudioManager.AUDIOFOCUS_REQUEST_DELAYED + } else { + @Suppress("DEPRECATION") + val result = am.requestAudioFocus( + audioFocusListener, + AudioManager.STREAM_MUSIC, + AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_MAY_DUCK, + ) + result == AudioManager.AUDIOFOCUS_REQUEST_GRANTED + } + } + + private fun abandonAudioFocus() { + val am = context.getSystemService(Context.AUDIO_SERVICE) as? AudioManager ?: return + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + audioFocusRequest?.let { + am.abandonAudioFocusRequest(it) + Log.d(tag, "audio focus abandoned") + } + audioFocusRequest = null + } else { + @Suppress("DEPRECATION") + am.abandonAudioFocus(audioFocusListener) + } } private fun cleanupPlayer() { @@ -980,14 +1442,15 @@ class TalkModeManager( defaultModelId = model ?: defaultModelIdFallback if (!modelOverrideActive) currentModelId = defaultModelId defaultOutputFormat = outputFormat ?: defaultOutputFormatFallback - apiKey = - if (activeProvider == defaultTalkProvider) { - key ?: envKey?.takeIf { it.isNotEmpty() } - } else { - null - } + apiKey = key ?: envKey?.takeIf { it.isNotEmpty() } + Log.d(tag, "reloadConfig apiKey=${if (apiKey != null) "set" else "null"} voiceId=$defaultVoiceId") if (interrupt != null) interruptOnSpeech = interrupt - if (activeProvider != defaultTalkProvider) { + activeProviderIsElevenLabs = activeProvider == defaultTalkProvider + if (!activeProviderIsElevenLabs) { + // Clear ElevenLabs credentials so playAssistant won't attempt ElevenLabs calls + apiKey = null + defaultVoiceId = null + if (!voiceOverrideActive) currentVoiceId = null Log.w(tag, "talk provider $activeProvider unsupported; using system voice fallback") } else if (selection?.normalizedPayload == true) { Log.d(tag, "talk config provider=elevenlabs") @@ -1025,8 +1488,10 @@ class TalkModeManager( conn.outputStream.use { it.write(payload.toByteArray()) } val code = conn.responseCode + Log.d(tag, "elevenlabs http code=$code voiceId=$voiceId format=${request.outputFormat} keyLen=${apiKey.length}") if (code >= 400) { val message = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: "" + Log.w(tag, "elevenlabs error code=$code voiceId=$voiceId body=$message") sink.fail() throw IllegalStateException("ElevenLabs failed: $code $message") } @@ -1068,12 +1533,21 @@ class TalkModeManager( throw IllegalStateException("ElevenLabs failed: $code $message") } + var totalBytesWritten = 0L + var trackStarted = false val buffer = ByteArray(8 * 1024) conn.inputStream.use { input -> while (true) { if (pcmStopRequested || isPlaybackCancelled(null, playbackToken)) return@withContext val read = input.read(buffer) if (read <= 0) break + // Start the AudioTrack only when the first chunk is ready — avoids + // the ~1.4s underrun window while ElevenLabs prepares audio. + // OxygenOS kills a track that underruns for >1s (write() returns 0). + if (!trackStarted) { + track.play() + trackStarted = true + } var offset = 0 while (offset < read) { if (pcmStopRequested || isPlaybackCancelled(null, playbackToken)) return@withContext @@ -1098,6 +1572,20 @@ class TalkModeManager( } } + private suspend fun waitForPcmDrain(track: AudioTrack, totalFrames: Long, sampleRate: Int) { + if (totalFrames <= 0) return + withContext(Dispatchers.IO) { + val drainDeadline = SystemClock.elapsedRealtime() + 15_000 + while (!pcmStopRequested && SystemClock.elapsedRealtime() < drainDeadline) { + val played = track.playbackHeadPosition.toLong().and(0xFFFFFFFFL) + if (played >= totalFrames) break + val remainingFrames = totalFrames - played + val sleepMs = ((remainingFrames * 1000L) / sampleRate.toLong()).coerceIn(12L, 120L) + delay(sleepMs) + } + } + } + private fun openTtsConnection( voiceId: String, apiKey: String, @@ -1248,9 +1736,13 @@ class TalkModeManager( } private fun ensureInterruptListener() { - if (!interruptOnSpeech || !_isEnabled.value) return + if (!interruptOnSpeech || !_isEnabled.value || !shouldAllowSpeechInterrupt()) return + // Don't create a new recognizer when we just destroyed one for TTS (finalizeInFlight=true). + // Starting a new recognizer mid-TTS causes audio session conflict that kills AudioTrack + // writes (returns 0) and MediaPlayer on OxygenOS/OnePlus devices. + if (finalizeInFlight) return mainHandler.post { - if (stopRequested) return@post + if (stopRequested || finalizeInFlight) return@post if (!SpeechRecognizer.isRecognitionAvailable(context)) return@post try { if (recognizer == null) { @@ -1277,8 +1769,9 @@ class TalkModeManager( val trimmed = preferred?.trim().orEmpty() if (trimmed.isNotEmpty()) { val resolved = resolveVoiceAlias(trimmed) - if (resolved != null) return resolved - Log.w(tag, "unknown voice alias $trimmed") + // If it resolves as an alias, use the alias target. + // Otherwise treat it as a direct voice ID (e.g. "21m00Tcm4TlvDq8ikWAM"). + return resolved ?: trimmed } fallbackVoiceId?.let { return it } @@ -1354,7 +1847,12 @@ class TalkModeManager( override fun onBufferReceived(buffer: ByteArray?) {} override fun onEndOfSpeech() { - scheduleRestart() + clearListenWatchdog() + // Don't restart while a transcript is being processed — the recognizer + // competing for audio resources kills AudioTrack PCM playback. + if (!finalizeInFlight) { + scheduleRestart() + } } override fun onError(error: Int) {