refactor(android): remove legacy elevenlabs talk stack

2026-04-26 16:06:16 +00:00 · 2026-03-20 10:29:06 +05:30
parent e3afaca1a6
commit 4386a0ace8
4 changed files with 0 additions and 650 deletions
--- a/apps/android/app/src/main/java/ai/openclaw/app/voice/ElevenLabsStreamingTts.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/ElevenLabsStreamingTts.kt
@@ -1,338 +0,0 @@
-package ai.openclaw.app.voice
-
-import android.media.AudioAttributes
-import android.media.AudioFormat
-import android.media.AudioManager
-import android.media.AudioTrack
-import android.util.Base64
-import android.util.Log
-import kotlinx.coroutines.*
-import kotlinx.coroutines.flow.MutableStateFlow
-import kotlinx.coroutines.flow.StateFlow
-import okhttp3.*
-import org.json.JSONObject
-import kotlin.math.max
-
-/**
- * Streams text chunks to ElevenLabs WebSocket API and plays audio in real-time.
- *
- * Usage:
- *   1. Create instance with voice/API config
- *   2. Call [start] to open WebSocket + AudioTrack
- *   3. Call [sendText] with incremental text chunks as they arrive
- *   4. Call [finish] when the full response is ready (sends EOS to ElevenLabs)
- *   5. Call [stop] to cancel/cleanup at any time
- *
- * Audio playback begins as soon as the first audio chunk arrives from ElevenLabs,
- * typically within ~100ms of the first text chunk for eleven_flash_v2_5.
- *
- * Note: eleven_v3 does NOT support WebSocket streaming. Use eleven_flash_v2_5
- * or eleven_flash_v2 for lowest latency.
- */
-class ElevenLabsStreamingTts(
-  private val scope: CoroutineScope,
-  private val voiceId: String,
-  private val apiKey: String,
-  private val modelId: String = "eleven_flash_v2_5",
-  private val outputFormat: String = "pcm_24000",
-  private val sampleRate: Int = 24000,
-) {
-  companion object {
-    private const val TAG = "ElevenLabsStreamTTS"
-    private const val BASE_URL = "wss://api.elevenlabs.io/v1/text-to-speech"
-
-    /** Models that support WebSocket input streaming */
-    val STREAMING_MODELS = setOf(
-      "eleven_flash_v2_5",
-      "eleven_flash_v2",
-      "eleven_multilingual_v2",
-      "eleven_turbo_v2_5",
-      "eleven_turbo_v2",
-      "eleven_monolingual_v1",
-    )
-
-    fun supportsStreaming(modelId: String): Boolean = modelId in STREAMING_MODELS
-  }
-
-  private val _isPlaying = MutableStateFlow(false)
-  val isPlaying: StateFlow<Boolean> = _isPlaying
-
-  private var webSocket: WebSocket? = null
-  private var audioTrack: AudioTrack? = null
-  private var trackStarted = false
-  private var client: OkHttpClient? = null
-  @Volatile private var stopped = false
-  @Volatile private var finished = false
-  @Volatile var hasReceivedAudio = false
-    private set
-  private var drainJob: Job? = null
-
-  // Track text already sent so we only send incremental chunks
-  private var sentTextLength = 0
-  @Volatile private var wsReady = false
-  private val pendingText = mutableListOf<String>()
-
-  /**
-   * Open the WebSocket connection and prepare AudioTrack.
-   * Must be called before [sendText].
-   */
-  fun start() {
-    stopped = false
-    finished = false
-    hasReceivedAudio = false
-    sentTextLength = 0
-    trackStarted = false
-    wsReady = false
-    sentFullText = ""
-    synchronized(pendingText) { pendingText.clear() }
-
-    // Prepare AudioTrack
-    val minBuffer = AudioTrack.getMinBufferSize(
-      sampleRate,
-      AudioFormat.CHANNEL_OUT_MONO,
-      AudioFormat.ENCODING_PCM_16BIT,
-    )
-    val bufferSize = max(minBuffer * 2, 8 * 1024)
-    val track = AudioTrack(
-      AudioAttributes.Builder()
-        .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
-        .setUsage(AudioAttributes.USAGE_MEDIA)
-        .build(),
-      AudioFormat.Builder()
-        .setSampleRate(sampleRate)
-        .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
-        .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
-        .build(),
-      bufferSize,
-      AudioTrack.MODE_STREAM,
-      AudioManager.AUDIO_SESSION_ID_GENERATE,
-    )
-    if (track.state != AudioTrack.STATE_INITIALIZED) {
-      track.release()
-      Log.e(TAG, "AudioTrack init failed")
-      return
-    }
-    audioTrack = track
-    _isPlaying.value = true
-
-    // Open WebSocket
-    val url = "$BASE_URL/$voiceId/stream-input?model_id=$modelId&output_format=$outputFormat"
-    val okClient = OkHttpClient.Builder()
-      .readTimeout(30, java.util.concurrent.TimeUnit.SECONDS)
-      .writeTimeout(10, java.util.concurrent.TimeUnit.SECONDS)
-      .build()
-    client = okClient
-
-    val request = Request.Builder()
-      .url(url)
-      .header("xi-api-key", apiKey)
-      .build()
-
-    webSocket = okClient.newWebSocket(request, object : WebSocketListener() {
-      override fun onOpen(webSocket: WebSocket, response: Response) {
-        Log.d(TAG, "WebSocket connected")
-        // Send initial config with voice settings
-        val config = JSONObject().apply {
-          put("text", " ")
-          put("voice_settings", JSONObject().apply {
-            put("stability", 0.5)
-            put("similarity_boost", 0.8)
-            put("use_speaker_boost", false)
-          })
-          put("generation_config", JSONObject().apply {
-            put("chunk_length_schedule", org.json.JSONArray(listOf(120, 160, 250, 290)))
-          })
-        }
-        webSocket.send(config.toString())
-        wsReady = true
-        // Flush any text that was queued before WebSocket was ready
-        synchronized(pendingText) {
-          for (queued in pendingText) {
-            val msg = JSONObject().apply { put("text", queued) }
-            webSocket.send(msg.toString())
-            Log.d(TAG, "flushed queued chunk: ${queued.length} chars")
-          }
-          pendingText.clear()
-        }
-        // Send deferred EOS if finish() was called before WebSocket was ready
-        if (finished) {
-          val eos = JSONObject().apply { put("text", "") }
-          webSocket.send(eos.toString())
-          Log.d(TAG, "sent deferred EOS")
-        }
-      }
-
-      override fun onMessage(webSocket: WebSocket, text: String) {
-        if (stopped) return
-        try {
-          val json = JSONObject(text)
-          val audio = json.optString("audio", "")
-          if (audio.isNotEmpty()) {
-            val pcmBytes = Base64.decode(audio, Base64.DEFAULT)
-            writeToTrack(pcmBytes)
-          }
-        } catch (e: Exception) {
-          Log.e(TAG, "Error parsing WebSocket message: ${e.message}")
-        }
-      }
-
-      override fun onFailure(webSocket: WebSocket, t: Throwable, response: Response?) {
-        Log.e(TAG, "WebSocket error: ${t.message}")
-        stopped = true
-        cleanup()
-      }
-
-      override fun onClosed(webSocket: WebSocket, code: Int, reason: String) {
-        Log.d(TAG, "WebSocket closed: $code $reason")
-        // Wait for AudioTrack to finish playing buffered audio, then cleanup
-        drainJob = scope.launch(Dispatchers.IO) {
-          drainAudioTrack()
-          cleanup()
-        }
-      }
-    })
-  }
-
-  /**
-   * Send incremental text. Call with the full accumulated text so far —
-   * only the new portion (since last send) will be transmitted.
-   */
-  // Track the full text we've sent so we can detect replacement vs append
-  private var sentFullText = ""
-
-  /**
-      // If we already sent a superset of this text, it's just a stale/out-of-order
-      // event from a different thread — not a real divergence. Ignore it.
-      if (sentFullText.startsWith(fullText)) return true
-   * Returns true if text was accepted, false if text diverged (caller should restart).
-   */
-  @Synchronized
-  fun sendText(fullText: String): Boolean {
-    if (stopped) return false
-    if (finished) return true  // Already finishing — not a diverge, don't restart
-
-    // Detect text replacement: if the new text doesn't start with what we already sent,
-    // the stream has diverged (e.g., tool call interrupted and text was replaced).
-    if (sentFullText.isNotEmpty() && !fullText.startsWith(sentFullText)) {
-      // If we already sent a superset of this text, it's just a stale/out-of-order
-      // event from a different thread — not a real divergence. Ignore it.
-      if (sentFullText.startsWith(fullText)) return true
-      Log.d(TAG, "text diverged — sent='${sentFullText.take(60)}' new='${fullText.take(60)}'")
-      return false
-    }
-
-    if (fullText.length > sentTextLength) {
-      val newText = fullText.substring(sentTextLength)
-      sentTextLength = fullText.length
-      sentFullText = fullText
-
-      val ws = webSocket
-      if (ws != null && wsReady) {
-        val msg = JSONObject().apply { put("text", newText) }
-        ws.send(msg.toString())
-        Log.d(TAG, "sent chunk: ${newText.length} chars")
-      } else {
-        // Queue if WebSocket not connected yet (ws null = still connecting, wsReady false = handshake pending)
-        synchronized(pendingText) { pendingText.add(newText) }
-        Log.d(TAG, "queued chunk: ${newText.length} chars (ws not ready)")
-      }
-    }
-    return true
-  }
-
-  /**
-   * Signal that no more text is coming. Sends EOS to ElevenLabs.
-   * The WebSocket will close after generating remaining audio.
-   */
-  @Synchronized
-  fun finish() {
-    if (stopped || finished) return
-    finished = true
-    val ws = webSocket
-    if (ws != null && wsReady) {
-      // Send empty text to signal end of stream
-      val eos = JSONObject().apply { put("text", "") }
-      ws.send(eos.toString())
-      Log.d(TAG, "sent EOS")
-    }
-    // else: WebSocket not ready yet; onOpen will send EOS after flushing queued text
-  }
-
-  /**
-   * Immediately stop playback and close everything.
-   */
-  fun stop() {
-    stopped = true
-    finished = true
-    drainJob?.cancel()
-    drainJob = null
-    webSocket?.cancel()
-    webSocket = null
-    val track = audioTrack
-    audioTrack = null
-    if (track != null) {
-      try {
-        track.pause()
-        track.flush()
-        track.release()
-      } catch (_: Throwable) {}
-    }
-    _isPlaying.value = false
-    client?.dispatcher?.executorService?.shutdown()
-    client = null
-  }
-
-  private fun writeToTrack(pcmBytes: ByteArray) {
-    val track = audioTrack ?: return
-    if (stopped) return
-
-    // Start playback on first audio chunk — avoids underrun
-    if (!trackStarted) {
-      track.play()
-      trackStarted = true
-      hasReceivedAudio = true
-      Log.d(TAG, "AudioTrack started on first chunk")
-    }
-
-    var offset = 0
-    while (offset < pcmBytes.size && !stopped) {
-      val wrote = track.write(pcmBytes, offset, pcmBytes.size - offset)
-      if (wrote <= 0) {
-        if (stopped) return
-        Log.w(TAG, "AudioTrack write returned $wrote")
-        break
-      }
-      offset += wrote
-    }
-  }
-
-  private fun drainAudioTrack() {
-    if (stopped) return
-    // Wait up to 10s for audio to finish playing
-    val deadline = System.currentTimeMillis() + 10_000
-    while (!stopped && System.currentTimeMillis() < deadline) {
-      // Check if track is still playing
-      val track = audioTrack ?: return
-      if (track.playState != AudioTrack.PLAYSTATE_PLAYING) return
-      try {
-        Thread.sleep(100)
-      } catch (_: InterruptedException) {
-        return
-      }
-    }
-  }
-
-  private fun cleanup() {
-    val track = audioTrack
-    audioTrack = null
-    if (track != null) {
-      try {
-        track.stop()
-        track.release()
-      } catch (_: Throwable) {}
-    }
-    _isPlaying.value = false
-    client?.dispatcher?.executorService?.shutdown()
-    client = null
-  }
-}
--- a/apps/android/app/src/main/java/ai/openclaw/app/voice/StreamingMediaDataSource.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/StreamingMediaDataSource.kt
@@ -1,98 +0,0 @@
-package ai.openclaw.app.voice
-
-import android.media.MediaDataSource
-import kotlin.math.min
-
-internal class StreamingMediaDataSource : MediaDataSource() {
-  private data class Chunk(val start: Long, val data: ByteArray)
-
-  private val lock = Object()
-  private val chunks = ArrayList<Chunk>()
-  private var totalSize: Long = 0
-  private var closed = false
-  private var finished = false
-  private var lastReadIndex = 0
-
-  fun append(data: ByteArray) {
-    if (data.isEmpty()) return
-    synchronized(lock) {
-      if (closed || finished) return
-      val chunk = Chunk(totalSize, data)
-      chunks.add(chunk)
-      totalSize += data.size.toLong()
-      lock.notifyAll()
-    }
-  }
-
-  fun finish() {
-    synchronized(lock) {
-      if (closed) return
-      finished = true
-      lock.notifyAll()
-    }
-  }
-
-  fun fail() {
-    synchronized(lock) {
-      closed = true
-      lock.notifyAll()
-    }
-  }
-
-  override fun readAt(position: Long, buffer: ByteArray, offset: Int, size: Int): Int {
-    if (position < 0) return -1
-    synchronized(lock) {
-      while (!closed && !finished && position >= totalSize) {
-        lock.wait()
-      }
-      if (closed) return -1
-      if (position >= totalSize && finished) return -1
-
-      val available = (totalSize - position).toInt()
-      val toRead = min(size, available)
-      var remaining = toRead
-      var destOffset = offset
-      var pos = position
-
-      var index = findChunkIndex(pos)
-      while (remaining > 0 && index < chunks.size) {
-        val chunk = chunks[index]
-        val inChunkOffset = (pos - chunk.start).toInt()
-        if (inChunkOffset >= chunk.data.size) {
-          index++
-          continue
-        }
-        val copyLen = min(remaining, chunk.data.size - inChunkOffset)
-        System.arraycopy(chunk.data, inChunkOffset, buffer, destOffset, copyLen)
-        remaining -= copyLen
-        destOffset += copyLen
-        pos += copyLen
-        if (inChunkOffset + copyLen >= chunk.data.size) {
-          index++
-        }
-      }
-
-      return toRead - remaining
-    }
-  }
-
-  override fun getSize(): Long = -1
-
-  override fun close() {
-    synchronized(lock) {
-      closed = true
-      lock.notifyAll()
-    }
-  }
-
-  private fun findChunkIndex(position: Long): Int {
-    var index = lastReadIndex
-    while (index < chunks.size) {
-      val chunk = chunks[index]
-      if (position < chunk.start + chunk.data.size) break
-      index++
-    }
-    lastReadIndex = index
-    return index
-  }
-}
--- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeVoiceResolver.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeVoiceResolver.kt
@@ -1,122 +0,0 @@
-package ai.openclaw.app.voice
-
-import java.net.HttpURLConnection
-import java.net.URL
-import kotlinx.coroutines.Dispatchers
-import kotlinx.coroutines.withContext
-import kotlinx.serialization.json.Json
-import kotlinx.serialization.json.JsonArray
-import kotlinx.serialization.json.JsonElement
-import kotlinx.serialization.json.JsonObject
-import kotlinx.serialization.json.JsonPrimitive
-
-internal data class ElevenLabsVoice(val voiceId: String, val name: String?)
-
-internal data class TalkModeResolvedVoice(
-  val voiceId: String?,
-  val fallbackVoiceId: String?,
-  val defaultVoiceId: String?,
-  val currentVoiceId: String?,
-  val selectedVoiceName: String? = null,
-)
-
-internal object TalkModeVoiceResolver {
-  fun resolveVoiceAlias(value: String?, voiceAliases: Map<String, String>): String? {
-    val trimmed = value?.trim().orEmpty()
-    if (trimmed.isEmpty()) return null
-    val normalized = normalizeAliasKey(trimmed)
-    voiceAliases[normalized]?.let { return it }
-    if (voiceAliases.values.any { it.equals(trimmed, ignoreCase = true) }) return trimmed
-    return if (isLikelyVoiceId(trimmed)) trimmed else null
-  }
-
-  suspend fun resolveVoiceId(
-    preferred: String?,
-    fallbackVoiceId: String?,
-    defaultVoiceId: String?,
-    currentVoiceId: String?,
-    voiceOverrideActive: Boolean,
-    listVoices: suspend () -> List<ElevenLabsVoice>,
-  ): TalkModeResolvedVoice {
-    val trimmed = preferred?.trim().orEmpty()
-    if (trimmed.isNotEmpty()) {
-      return TalkModeResolvedVoice(
-        voiceId = trimmed,
-        fallbackVoiceId = fallbackVoiceId,
-        defaultVoiceId = defaultVoiceId,
-        currentVoiceId = currentVoiceId,
-      )
-    }
-    if (!fallbackVoiceId.isNullOrBlank()) {
-      return TalkModeResolvedVoice(
-        voiceId = fallbackVoiceId,
-        fallbackVoiceId = fallbackVoiceId,
-        defaultVoiceId = defaultVoiceId,
-        currentVoiceId = currentVoiceId,
-      )
-    }
-
-    val first = listVoices().firstOrNull()
-    if (first == null) {
-      return TalkModeResolvedVoice(
-        voiceId = null,
-        fallbackVoiceId = fallbackVoiceId,
-        defaultVoiceId = defaultVoiceId,
-        currentVoiceId = currentVoiceId,
-      )
-    }
-
-    return TalkModeResolvedVoice(
-      voiceId = first.voiceId,
-      fallbackVoiceId = first.voiceId,
-      defaultVoiceId = if (defaultVoiceId.isNullOrBlank()) first.voiceId else defaultVoiceId,
-      currentVoiceId = if (voiceOverrideActive) currentVoiceId else first.voiceId,
-      selectedVoiceName = first.name,
-    )
-  }
-
-  suspend fun listVoices(apiKey: String, json: Json): List<ElevenLabsVoice> {
-    return withContext(Dispatchers.IO) {
-      val url = URL("https://api.elevenlabs.io/v1/voices")
-      val conn = url.openConnection() as HttpURLConnection
-      try {
-        conn.requestMethod = "GET"
-        conn.connectTimeout = 15_000
-        conn.readTimeout = 15_000
-        conn.setRequestProperty("xi-api-key", apiKey)
-
-        val code = conn.responseCode
-        val stream = if (code >= 400) conn.errorStream else conn.inputStream
-        val data = stream?.use { it.readBytes() } ?: byteArrayOf()
-        if (code >= 400) {
-          val message = data.toString(Charsets.UTF_8)
-          throw IllegalStateException("ElevenLabs voices failed: $code $message")
-        }
-
-        val root = json.parseToJsonElement(data.toString(Charsets.UTF_8)).asObjectOrNull()
-        val voices = (root?.get("voices") as? JsonArray) ?: JsonArray(emptyList())
-        voices.mapNotNull { entry ->
-          val obj = entry.asObjectOrNull() ?: return@mapNotNull null
-          val voiceId = obj["voice_id"].asStringOrNull() ?: return@mapNotNull null
-          val name = obj["name"].asStringOrNull()
-          ElevenLabsVoice(voiceId, name)
-        }
-      } finally {
-        conn.disconnect()
-      }
-    }
-  }
-
-  private fun isLikelyVoiceId(value: String): Boolean {
-    if (value.length < 10) return false
-    return value.all { it.isLetterOrDigit() || it == '-' || it == '_' }
-  }
-
-  private fun normalizeAliasKey(value: String): String =
-    value.trim().lowercase()
-}
-
-private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? JsonObject
-
-private fun JsonElement?.asStringOrNull(): String? =
-  (this as? JsonPrimitive)?.takeIf { it.isString }?.content
--- a/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeVoiceResolverTest.kt
+++ b/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeVoiceResolverTest.kt
@@ -1,92 +0,0 @@
-package ai.openclaw.app.voice
-
-import kotlinx.coroutines.runBlocking
-import org.junit.Assert.assertEquals
-import org.junit.Assert.assertNull
-import org.junit.Test
-
-class TalkModeVoiceResolverTest {
-  @Test
-  fun resolvesVoiceAliasCaseInsensitively() {
-    val resolved =
-      TalkModeVoiceResolver.resolveVoiceAlias(
-        " Clawd ",
-        mapOf("clawd" to "voice-123"),
-      )
-
-    assertEquals("voice-123", resolved)
-  }
-
-  @Test
-  fun acceptsDirectVoiceIds() {
-    val resolved = TalkModeVoiceResolver.resolveVoiceAlias("21m00Tcm4TlvDq8ikWAM", emptyMap())
-
-    assertEquals("21m00Tcm4TlvDq8ikWAM", resolved)
-  }
-
-  @Test
-  fun rejectsUnknownAliases() {
-    val resolved = TalkModeVoiceResolver.resolveVoiceAlias("nickname", emptyMap())
-
-    assertNull(resolved)
-  }
-
-  @Test
-  fun reusesCachedFallbackVoiceBeforeFetchingCatalog() =
-    runBlocking {
-      var fetchCount = 0
-
-      val resolved =
-        TalkModeVoiceResolver.resolveVoiceId(
-          preferred = null,
-          fallbackVoiceId = "cached-voice",
-          defaultVoiceId = null,
-          currentVoiceId = null,
-          voiceOverrideActive = false,
-          listVoices = {
-            fetchCount += 1
-            emptyList()
-          },
-        )
-
-      assertEquals("cached-voice", resolved.voiceId)
-      assertEquals(0, fetchCount)
-    }
-
-  @Test
-  fun seedsDefaultVoiceFromCatalogWhenNeeded() =
-    runBlocking {
-      val resolved =
-        TalkModeVoiceResolver.resolveVoiceId(
-          preferred = null,
-          fallbackVoiceId = null,
-          defaultVoiceId = null,
-          currentVoiceId = null,
-          voiceOverrideActive = false,
-          listVoices = { listOf(ElevenLabsVoice("voice-1", "First")) },
-        )
-
-      assertEquals("voice-1", resolved.voiceId)
-      assertEquals("voice-1", resolved.fallbackVoiceId)
-      assertEquals("voice-1", resolved.defaultVoiceId)
-      assertEquals("voice-1", resolved.currentVoiceId)
-      assertEquals("First", resolved.selectedVoiceName)
-    }
-
-  @Test
-  fun preservesCurrentVoiceWhenOverrideIsActive() =
-    runBlocking {
-      val resolved =
-        TalkModeVoiceResolver.resolveVoiceId(
-          preferred = null,
-          fallbackVoiceId = null,
-          defaultVoiceId = null,
-          currentVoiceId = null,
-          voiceOverrideActive = true,
-          listVoices = { listOf(ElevenLabsVoice("voice-1", "First")) },
-        )
-
-      assertEquals("voice-1", resolved.voiceId)
-      assertNull(resolved.currentVoiceId)
-    }
-}