refactor(android): remove legacy elevenlabs talk stack

This commit is contained in:
Ayaan Zaidi
2026-03-20 10:29:06 +05:30
parent e3afaca1a6
commit 4386a0ace8
4 changed files with 0 additions and 650 deletions

View File

@@ -1,338 +0,0 @@
package ai.openclaw.app.voice
import android.media.AudioAttributes
import android.media.AudioFormat
import android.media.AudioManager
import android.media.AudioTrack
import android.util.Base64
import android.util.Log
import kotlinx.coroutines.*
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.flow.StateFlow
import okhttp3.*
import org.json.JSONObject
import kotlin.math.max
/**
* Streams text chunks to ElevenLabs WebSocket API and plays audio in real-time.
*
* Usage:
* 1. Create instance with voice/API config
* 2. Call [start] to open WebSocket + AudioTrack
* 3. Call [sendText] with incremental text chunks as they arrive
* 4. Call [finish] when the full response is ready (sends EOS to ElevenLabs)
* 5. Call [stop] to cancel/cleanup at any time
*
* Audio playback begins as soon as the first audio chunk arrives from ElevenLabs,
* typically within ~100ms of the first text chunk for eleven_flash_v2_5.
*
* Note: eleven_v3 does NOT support WebSocket streaming. Use eleven_flash_v2_5
* or eleven_flash_v2 for lowest latency.
*/
class ElevenLabsStreamingTts(
private val scope: CoroutineScope,
private val voiceId: String,
private val apiKey: String,
private val modelId: String = "eleven_flash_v2_5",
private val outputFormat: String = "pcm_24000",
private val sampleRate: Int = 24000,
) {
companion object {
private const val TAG = "ElevenLabsStreamTTS"
private const val BASE_URL = "wss://api.elevenlabs.io/v1/text-to-speech"
/** Models that support WebSocket input streaming */
val STREAMING_MODELS = setOf(
"eleven_flash_v2_5",
"eleven_flash_v2",
"eleven_multilingual_v2",
"eleven_turbo_v2_5",
"eleven_turbo_v2",
"eleven_monolingual_v1",
)
fun supportsStreaming(modelId: String): Boolean = modelId in STREAMING_MODELS
}
private val _isPlaying = MutableStateFlow(false)
val isPlaying: StateFlow<Boolean> = _isPlaying
private var webSocket: WebSocket? = null
private var audioTrack: AudioTrack? = null
private var trackStarted = false
private var client: OkHttpClient? = null
@Volatile private var stopped = false
@Volatile private var finished = false
@Volatile var hasReceivedAudio = false
private set
private var drainJob: Job? = null
// Track text already sent so we only send incremental chunks
private var sentTextLength = 0
@Volatile private var wsReady = false
private val pendingText = mutableListOf<String>()
/**
* Open the WebSocket connection and prepare AudioTrack.
* Must be called before [sendText].
*/
fun start() {
stopped = false
finished = false
hasReceivedAudio = false
sentTextLength = 0
trackStarted = false
wsReady = false
sentFullText = ""
synchronized(pendingText) { pendingText.clear() }
// Prepare AudioTrack
val minBuffer = AudioTrack.getMinBufferSize(
sampleRate,
AudioFormat.CHANNEL_OUT_MONO,
AudioFormat.ENCODING_PCM_16BIT,
)
val bufferSize = max(minBuffer * 2, 8 * 1024)
val track = AudioTrack(
AudioAttributes.Builder()
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.setUsage(AudioAttributes.USAGE_MEDIA)
.build(),
AudioFormat.Builder()
.setSampleRate(sampleRate)
.setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
.setEncoding(AudioFormat.ENCODING_PCM_16BIT)
.build(),
bufferSize,
AudioTrack.MODE_STREAM,
AudioManager.AUDIO_SESSION_ID_GENERATE,
)
if (track.state != AudioTrack.STATE_INITIALIZED) {
track.release()
Log.e(TAG, "AudioTrack init failed")
return
}
audioTrack = track
_isPlaying.value = true
// Open WebSocket
val url = "$BASE_URL/$voiceId/stream-input?model_id=$modelId&output_format=$outputFormat"
val okClient = OkHttpClient.Builder()
.readTimeout(30, java.util.concurrent.TimeUnit.SECONDS)
.writeTimeout(10, java.util.concurrent.TimeUnit.SECONDS)
.build()
client = okClient
val request = Request.Builder()
.url(url)
.header("xi-api-key", apiKey)
.build()
webSocket = okClient.newWebSocket(request, object : WebSocketListener() {
override fun onOpen(webSocket: WebSocket, response: Response) {
Log.d(TAG, "WebSocket connected")
// Send initial config with voice settings
val config = JSONObject().apply {
put("text", " ")
put("voice_settings", JSONObject().apply {
put("stability", 0.5)
put("similarity_boost", 0.8)
put("use_speaker_boost", false)
})
put("generation_config", JSONObject().apply {
put("chunk_length_schedule", org.json.JSONArray(listOf(120, 160, 250, 290)))
})
}
webSocket.send(config.toString())
wsReady = true
// Flush any text that was queued before WebSocket was ready
synchronized(pendingText) {
for (queued in pendingText) {
val msg = JSONObject().apply { put("text", queued) }
webSocket.send(msg.toString())
Log.d(TAG, "flushed queued chunk: ${queued.length} chars")
}
pendingText.clear()
}
// Send deferred EOS if finish() was called before WebSocket was ready
if (finished) {
val eos = JSONObject().apply { put("text", "") }
webSocket.send(eos.toString())
Log.d(TAG, "sent deferred EOS")
}
}
override fun onMessage(webSocket: WebSocket, text: String) {
if (stopped) return
try {
val json = JSONObject(text)
val audio = json.optString("audio", "")
if (audio.isNotEmpty()) {
val pcmBytes = Base64.decode(audio, Base64.DEFAULT)
writeToTrack(pcmBytes)
}
} catch (e: Exception) {
Log.e(TAG, "Error parsing WebSocket message: ${e.message}")
}
}
override fun onFailure(webSocket: WebSocket, t: Throwable, response: Response?) {
Log.e(TAG, "WebSocket error: ${t.message}")
stopped = true
cleanup()
}
override fun onClosed(webSocket: WebSocket, code: Int, reason: String) {
Log.d(TAG, "WebSocket closed: $code $reason")
// Wait for AudioTrack to finish playing buffered audio, then cleanup
drainJob = scope.launch(Dispatchers.IO) {
drainAudioTrack()
cleanup()
}
}
})
}
/**
* Send incremental text. Call with the full accumulated text so far —
* only the new portion (since last send) will be transmitted.
*/
// Track the full text we've sent so we can detect replacement vs append
private var sentFullText = ""
/**
// If we already sent a superset of this text, it's just a stale/out-of-order
// event from a different thread — not a real divergence. Ignore it.
if (sentFullText.startsWith(fullText)) return true
* Returns true if text was accepted, false if text diverged (caller should restart).
*/
@Synchronized
fun sendText(fullText: String): Boolean {
if (stopped) return false
if (finished) return true // Already finishing — not a diverge, don't restart
// Detect text replacement: if the new text doesn't start with what we already sent,
// the stream has diverged (e.g., tool call interrupted and text was replaced).
if (sentFullText.isNotEmpty() && !fullText.startsWith(sentFullText)) {
// If we already sent a superset of this text, it's just a stale/out-of-order
// event from a different thread — not a real divergence. Ignore it.
if (sentFullText.startsWith(fullText)) return true
Log.d(TAG, "text diverged — sent='${sentFullText.take(60)}' new='${fullText.take(60)}'")
return false
}
if (fullText.length > sentTextLength) {
val newText = fullText.substring(sentTextLength)
sentTextLength = fullText.length
sentFullText = fullText
val ws = webSocket
if (ws != null && wsReady) {
val msg = JSONObject().apply { put("text", newText) }
ws.send(msg.toString())
Log.d(TAG, "sent chunk: ${newText.length} chars")
} else {
// Queue if WebSocket not connected yet (ws null = still connecting, wsReady false = handshake pending)
synchronized(pendingText) { pendingText.add(newText) }
Log.d(TAG, "queued chunk: ${newText.length} chars (ws not ready)")
}
}
return true
}
/**
* Signal that no more text is coming. Sends EOS to ElevenLabs.
* The WebSocket will close after generating remaining audio.
*/
@Synchronized
fun finish() {
if (stopped || finished) return
finished = true
val ws = webSocket
if (ws != null && wsReady) {
// Send empty text to signal end of stream
val eos = JSONObject().apply { put("text", "") }
ws.send(eos.toString())
Log.d(TAG, "sent EOS")
}
// else: WebSocket not ready yet; onOpen will send EOS after flushing queued text
}
/**
* Immediately stop playback and close everything.
*/
fun stop() {
stopped = true
finished = true
drainJob?.cancel()
drainJob = null
webSocket?.cancel()
webSocket = null
val track = audioTrack
audioTrack = null
if (track != null) {
try {
track.pause()
track.flush()
track.release()
} catch (_: Throwable) {}
}
_isPlaying.value = false
client?.dispatcher?.executorService?.shutdown()
client = null
}
private fun writeToTrack(pcmBytes: ByteArray) {
val track = audioTrack ?: return
if (stopped) return
// Start playback on first audio chunk — avoids underrun
if (!trackStarted) {
track.play()
trackStarted = true
hasReceivedAudio = true
Log.d(TAG, "AudioTrack started on first chunk")
}
var offset = 0
while (offset < pcmBytes.size && !stopped) {
val wrote = track.write(pcmBytes, offset, pcmBytes.size - offset)
if (wrote <= 0) {
if (stopped) return
Log.w(TAG, "AudioTrack write returned $wrote")
break
}
offset += wrote
}
}
private fun drainAudioTrack() {
if (stopped) return
// Wait up to 10s for audio to finish playing
val deadline = System.currentTimeMillis() + 10_000
while (!stopped && System.currentTimeMillis() < deadline) {
// Check if track is still playing
val track = audioTrack ?: return
if (track.playState != AudioTrack.PLAYSTATE_PLAYING) return
try {
Thread.sleep(100)
} catch (_: InterruptedException) {
return
}
}
}
private fun cleanup() {
val track = audioTrack
audioTrack = null
if (track != null) {
try {
track.stop()
track.release()
} catch (_: Throwable) {}
}
_isPlaying.value = false
client?.dispatcher?.executorService?.shutdown()
client = null
}
}

View File

@@ -1,98 +0,0 @@
package ai.openclaw.app.voice
import android.media.MediaDataSource
import kotlin.math.min
internal class StreamingMediaDataSource : MediaDataSource() {
private data class Chunk(val start: Long, val data: ByteArray)
private val lock = Object()
private val chunks = ArrayList<Chunk>()
private var totalSize: Long = 0
private var closed = false
private var finished = false
private var lastReadIndex = 0
fun append(data: ByteArray) {
if (data.isEmpty()) return
synchronized(lock) {
if (closed || finished) return
val chunk = Chunk(totalSize, data)
chunks.add(chunk)
totalSize += data.size.toLong()
lock.notifyAll()
}
}
fun finish() {
synchronized(lock) {
if (closed) return
finished = true
lock.notifyAll()
}
}
fun fail() {
synchronized(lock) {
closed = true
lock.notifyAll()
}
}
override fun readAt(position: Long, buffer: ByteArray, offset: Int, size: Int): Int {
if (position < 0) return -1
synchronized(lock) {
while (!closed && !finished && position >= totalSize) {
lock.wait()
}
if (closed) return -1
if (position >= totalSize && finished) return -1
val available = (totalSize - position).toInt()
val toRead = min(size, available)
var remaining = toRead
var destOffset = offset
var pos = position
var index = findChunkIndex(pos)
while (remaining > 0 && index < chunks.size) {
val chunk = chunks[index]
val inChunkOffset = (pos - chunk.start).toInt()
if (inChunkOffset >= chunk.data.size) {
index++
continue
}
val copyLen = min(remaining, chunk.data.size - inChunkOffset)
System.arraycopy(chunk.data, inChunkOffset, buffer, destOffset, copyLen)
remaining -= copyLen
destOffset += copyLen
pos += copyLen
if (inChunkOffset + copyLen >= chunk.data.size) {
index++
}
}
return toRead - remaining
}
}
override fun getSize(): Long = -1
override fun close() {
synchronized(lock) {
closed = true
lock.notifyAll()
}
}
private fun findChunkIndex(position: Long): Int {
var index = lastReadIndex
while (index < chunks.size) {
val chunk = chunks[index]
if (position < chunk.start + chunk.data.size) break
index++
}
lastReadIndex = index
return index
}
}

View File

@@ -1,122 +0,0 @@
package ai.openclaw.app.voice
import java.net.HttpURLConnection
import java.net.URL
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.withContext
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonArray
import kotlinx.serialization.json.JsonElement
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
internal data class ElevenLabsVoice(val voiceId: String, val name: String?)
internal data class TalkModeResolvedVoice(
val voiceId: String?,
val fallbackVoiceId: String?,
val defaultVoiceId: String?,
val currentVoiceId: String?,
val selectedVoiceName: String? = null,
)
internal object TalkModeVoiceResolver {
fun resolveVoiceAlias(value: String?, voiceAliases: Map<String, String>): String? {
val trimmed = value?.trim().orEmpty()
if (trimmed.isEmpty()) return null
val normalized = normalizeAliasKey(trimmed)
voiceAliases[normalized]?.let { return it }
if (voiceAliases.values.any { it.equals(trimmed, ignoreCase = true) }) return trimmed
return if (isLikelyVoiceId(trimmed)) trimmed else null
}
suspend fun resolveVoiceId(
preferred: String?,
fallbackVoiceId: String?,
defaultVoiceId: String?,
currentVoiceId: String?,
voiceOverrideActive: Boolean,
listVoices: suspend () -> List<ElevenLabsVoice>,
): TalkModeResolvedVoice {
val trimmed = preferred?.trim().orEmpty()
if (trimmed.isNotEmpty()) {
return TalkModeResolvedVoice(
voiceId = trimmed,
fallbackVoiceId = fallbackVoiceId,
defaultVoiceId = defaultVoiceId,
currentVoiceId = currentVoiceId,
)
}
if (!fallbackVoiceId.isNullOrBlank()) {
return TalkModeResolvedVoice(
voiceId = fallbackVoiceId,
fallbackVoiceId = fallbackVoiceId,
defaultVoiceId = defaultVoiceId,
currentVoiceId = currentVoiceId,
)
}
val first = listVoices().firstOrNull()
if (first == null) {
return TalkModeResolvedVoice(
voiceId = null,
fallbackVoiceId = fallbackVoiceId,
defaultVoiceId = defaultVoiceId,
currentVoiceId = currentVoiceId,
)
}
return TalkModeResolvedVoice(
voiceId = first.voiceId,
fallbackVoiceId = first.voiceId,
defaultVoiceId = if (defaultVoiceId.isNullOrBlank()) first.voiceId else defaultVoiceId,
currentVoiceId = if (voiceOverrideActive) currentVoiceId else first.voiceId,
selectedVoiceName = first.name,
)
}
suspend fun listVoices(apiKey: String, json: Json): List<ElevenLabsVoice> {
return withContext(Dispatchers.IO) {
val url = URL("https://api.elevenlabs.io/v1/voices")
val conn = url.openConnection() as HttpURLConnection
try {
conn.requestMethod = "GET"
conn.connectTimeout = 15_000
conn.readTimeout = 15_000
conn.setRequestProperty("xi-api-key", apiKey)
val code = conn.responseCode
val stream = if (code >= 400) conn.errorStream else conn.inputStream
val data = stream?.use { it.readBytes() } ?: byteArrayOf()
if (code >= 400) {
val message = data.toString(Charsets.UTF_8)
throw IllegalStateException("ElevenLabs voices failed: $code $message")
}
val root = json.parseToJsonElement(data.toString(Charsets.UTF_8)).asObjectOrNull()
val voices = (root?.get("voices") as? JsonArray) ?: JsonArray(emptyList())
voices.mapNotNull { entry ->
val obj = entry.asObjectOrNull() ?: return@mapNotNull null
val voiceId = obj["voice_id"].asStringOrNull() ?: return@mapNotNull null
val name = obj["name"].asStringOrNull()
ElevenLabsVoice(voiceId, name)
}
} finally {
conn.disconnect()
}
}
}
private fun isLikelyVoiceId(value: String): Boolean {
if (value.length < 10) return false
return value.all { it.isLetterOrDigit() || it == '-' || it == '_' }
}
private fun normalizeAliasKey(value: String): String =
value.trim().lowercase()
}
private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? JsonObject
private fun JsonElement?.asStringOrNull(): String? =
(this as? JsonPrimitive)?.takeIf { it.isString }?.content

View File

@@ -1,92 +0,0 @@
package ai.openclaw.app.voice
import kotlinx.coroutines.runBlocking
import org.junit.Assert.assertEquals
import org.junit.Assert.assertNull
import org.junit.Test
class TalkModeVoiceResolverTest {
@Test
fun resolvesVoiceAliasCaseInsensitively() {
val resolved =
TalkModeVoiceResolver.resolveVoiceAlias(
" Clawd ",
mapOf("clawd" to "voice-123"),
)
assertEquals("voice-123", resolved)
}
@Test
fun acceptsDirectVoiceIds() {
val resolved = TalkModeVoiceResolver.resolveVoiceAlias("21m00Tcm4TlvDq8ikWAM", emptyMap())
assertEquals("21m00Tcm4TlvDq8ikWAM", resolved)
}
@Test
fun rejectsUnknownAliases() {
val resolved = TalkModeVoiceResolver.resolveVoiceAlias("nickname", emptyMap())
assertNull(resolved)
}
@Test
fun reusesCachedFallbackVoiceBeforeFetchingCatalog() =
runBlocking {
var fetchCount = 0
val resolved =
TalkModeVoiceResolver.resolveVoiceId(
preferred = null,
fallbackVoiceId = "cached-voice",
defaultVoiceId = null,
currentVoiceId = null,
voiceOverrideActive = false,
listVoices = {
fetchCount += 1
emptyList()
},
)
assertEquals("cached-voice", resolved.voiceId)
assertEquals(0, fetchCount)
}
@Test
fun seedsDefaultVoiceFromCatalogWhenNeeded() =
runBlocking {
val resolved =
TalkModeVoiceResolver.resolveVoiceId(
preferred = null,
fallbackVoiceId = null,
defaultVoiceId = null,
currentVoiceId = null,
voiceOverrideActive = false,
listVoices = { listOf(ElevenLabsVoice("voice-1", "First")) },
)
assertEquals("voice-1", resolved.voiceId)
assertEquals("voice-1", resolved.fallbackVoiceId)
assertEquals("voice-1", resolved.defaultVoiceId)
assertEquals("voice-1", resolved.currentVoiceId)
assertEquals("First", resolved.selectedVoiceName)
}
@Test
fun preservesCurrentVoiceWhenOverrideIsActive() =
runBlocking {
val resolved =
TalkModeVoiceResolver.resolveVoiceId(
preferred = null,
fallbackVoiceId = null,
defaultVoiceId = null,
currentVoiceId = null,
voiceOverrideActive = true,
listVoices = { listOf(ElevenLabsVoice("voice-1", "First")) },
)
assertEquals("voice-1", resolved.voiceId)
assertNull(resolved.currentVoiceId)
}
}