From 1531123d3547f9ff8ee8094590fc773305f018f2 Mon Sep 17 00:00:00 2001 From: Rui Xu Date: Sat, 25 Apr 2026 23:34:57 +0100 Subject: [PATCH] feat(tts): add BytePlus Seed Speech provider Add Volcengine/BytePlus Seed Speech as a bundled TTS provider with current API-key auth, legacy AppID/token fallback, native Ogg/Opus voice-note output, and MP3 audio-file output. Co-authored-by: Peter Steinberger --- CHANGELOG.md | 1 + docs/.i18n/glossary.zh-CN.json | 4 + docs/providers/volcengine.md | 76 +++++- docs/tools/tts.md | 62 ++++- extensions/volcengine/index.ts | 2 + extensions/volcengine/openclaw.plugin.json | 11 +- extensions/volcengine/speech-provider.ts | 229 +++++++++++++++++ extensions/volcengine/tts.live.test.ts | 30 +++ extensions/volcengine/tts.test.ts | 272 +++++++++++++++++++++ extensions/volcengine/tts.ts | 266 ++++++++++++++++++++ 10 files changed, 937 insertions(+), 16 deletions(-) create mode 100644 extensions/volcengine/speech-provider.ts create mode 100644 extensions/volcengine/tts.live.test.ts create mode 100644 extensions/volcengine/tts.test.ts create mode 100644 extensions/volcengine/tts.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f51386c364..d5404d930a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -55,6 +55,7 @@ Docs: https://docs.openclaw.ai - Providers/ElevenLabs: include `eleven_v3` in the bundled TTS model catalog so model selection surfaces can offer ElevenLabs v3. (#68321) Thanks @itsuzef. - Providers/Local CLI TTS: add a bundled local command speech provider with file/stdout input, voice-note Opus conversion, and telephony PCM output. (#56239) Thanks @solar2ain. - Providers/Inworld: add Inworld as a bundled speech provider with streaming TTS synthesis, voice listing, voice-note output, and PCM telephony output. (#55972) Thanks @cshape. +- Providers/Volcengine: add Volcengine/BytePlus Seed Speech as a bundled TTS provider with API-key auth, native Ogg/Opus voice-note output, and MP3 audio-file output. (#55641) Thanks @xuruiray. 
- Android/Talk Mode: expose Talk Mode in the Voice tab with runtime-owned voice capture modes and microphone foreground-service escalation. Thanks @alex-latitude. - Providers/LiteLLM: register `litellm` as an image-generation provider so `image_generate model=litellm/...` calls and `agents.defaults.imageGenerationModel.fallbacks` entries resolve through the LiteLLM proxy. Thanks @zqchris. - Codex harness: require Codex app-server `0.125.0` or newer and cover native MCP `PreToolUse`, `PostToolUse`, and `PermissionRequest` payloads through the OpenClaw hook relay. diff --git a/docs/.i18n/glossary.zh-CN.json b/docs/.i18n/glossary.zh-CN.json index 05f4859ae63..6c12a9ae53a 100644 --- a/docs/.i18n/glossary.zh-CN.json +++ b/docs/.i18n/glossary.zh-CN.json @@ -111,6 +111,10 @@ "source": "BytePlus (International)", "target": "BytePlus(国际版)" }, + { + "source": "Volcengine TTS HTTP API", + "target": "Volcengine TTS HTTP API" + }, { "source": "Amazon Bedrock Mantle", "target": "Amazon Bedrock Mantle" diff --git a/docs/providers/volcengine.md b/docs/providers/volcengine.md index 9d686de6d4a..477fb6ec32c 100644 --- a/docs/providers/volcengine.md +++ b/docs/providers/volcengine.md @@ -1,20 +1,23 @@ --- -summary: "Volcano Engine setup (Doubao models, general + coding endpoints)" +summary: "Volcano Engine setup (Doubao models, coding endpoints, and Seed Speech TTS)" title: "Volcengine (Doubao)" read_when: - You want to use Volcano Engine or Doubao models with OpenClaw - You need the Volcengine API key setup + - You want to use Volcengine Speech text-to-speech --- The Volcengine provider gives access to Doubao models and third-party models hosted on Volcano Engine, with separate endpoints for general and coding -workloads. +workloads. The same bundled plugin can also register Volcengine Speech as a TTS +provider. 
-| Detail | Value | -| --------- | --------------------------------------------------- | -| Providers | `volcengine` (general) + `volcengine-plan` (coding) | -| Auth | `VOLCANO_ENGINE_API_KEY` | -| API | OpenAI-compatible | +| Detail | Value | +| ---------- | ---------------------------------------------------------- | +| Providers | `volcengine` (general + TTS) + `volcengine-plan` (coding) | +| Model auth | `VOLCANO_ENGINE_API_KEY` | +| TTS auth | `VOLCENGINE_TTS_API_KEY` or `BYTEPLUS_SEED_SPEECH_API_KEY` | +| API | OpenAI-compatible models, BytePlus Seed Speech TTS | ## Getting started @@ -95,6 +98,59 @@ Both providers are configured from a single API key. Setup registers both automa +## Text-to-speech + +Volcengine TTS uses the BytePlus Seed Speech HTTP API and is configured +separately from the OpenAI-compatible Doubao model API key. In the BytePlus +console, open Seed Speech > Settings > API Keys and copy the API key, then set: + +```bash +export VOLCENGINE_TTS_API_KEY="byteplus_seed_speech_api_key" +export VOLCENGINE_TTS_RESOURCE_ID="seed-tts-1.0" +``` + +Then enable it in `openclaw.json`: + +```json5 +{ + messages: { + tts: { + auto: "always", + provider: "volcengine", + providers: { + volcengine: { + apiKey: "byteplus_seed_speech_api_key", + voice: "en_female_anna_mars_bigtts", + speedRatio: 1.0, + }, + }, + }, + }, +} +``` + +For voice-note targets, OpenClaw asks Volcengine for provider-native +`ogg_opus`. For normal audio attachments, it asks for `mp3`. Provider aliases +`bytedance` and `doubao` also resolve to the same speech provider. + +The default resource id is `seed-tts-1.0` because that is what BytePlus grants +to newly created Seed Speech API keys in the default project. If your project +has TTS 2.0 entitlement, set `VOLCENGINE_TTS_RESOURCE_ID=seed-tts-2.0`. + + +`VOLCANO_ENGINE_API_KEY` is for the ModelArk/Doubao model endpoints and is not a +Seed Speech API key. 
TTS needs a Seed Speech API key from the BytePlus Speech +Console, or a legacy Speech Console AppID/token pair. + + +Legacy AppID/token auth remains supported for older Speech Console applications: + +```bash +export VOLCENGINE_TTS_APPID="speech_app_id" +export VOLCENGINE_TTS_TOKEN="speech_access_token" +export VOLCENGINE_TTS_CLUSTER="volcano_tts" +``` + ## Advanced configuration @@ -112,8 +168,10 @@ Both providers are configured from a single API key. Setup registers both automa - If the Gateway runs as a daemon (launchd/systemd), make sure - `VOLCANO_ENGINE_API_KEY` is available to that process (for example, in + If the Gateway runs as a daemon (launchd/systemd), make sure model and TTS + env vars such as `VOLCANO_ENGINE_API_KEY`, `VOLCENGINE_TTS_API_KEY`, + `BYTEPLUS_SEED_SPEECH_API_KEY`, `VOLCENGINE_TTS_APPID`, and + `VOLCENGINE_TTS_TOKEN` are available to that process (for example, in `~/.openclaw/.env` or via `env.shellEnv`). diff --git a/docs/tools/tts.md b/docs/tools/tts.md index a2e23e22fc8..0a8607c7f35 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -7,7 +7,7 @@ read_when: title: "Text-to-speech" --- -OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Inworld, Local CLI, Microsoft, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo. +OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Inworld, Local CLI, Microsoft, MiniMax, OpenAI, Volcengine, Vydra, xAI, or Xiaomi MiMo. It works anywhere OpenClaw can send audio. ## Supported services @@ -20,6 +20,7 @@ It works anywhere OpenClaw can send audio. 
- **Microsoft** (primary or fallback provider; current bundled implementation uses `node-edge-tts`) - **MiniMax** (primary or fallback provider; uses the T2A v2 API) - **OpenAI** (primary or fallback provider; also used for summaries) +- **Volcengine** (primary or fallback provider; uses the BytePlus Seed Speech HTTP API) - **Vydra** (primary or fallback provider; shared image, video, and speech provider) - **xAI** (primary or fallback provider; uses the xAI TTS API) - **Xiaomi MiMo** (primary or fallback provider; uses MiMo TTS through Xiaomi chat completions) @@ -39,7 +40,7 @@ or ElevenLabs. ## Optional keys -If you want ElevenLabs, Google Gemini, Gradium, Inworld, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo: +If you want ElevenLabs, Google Gemini, Gradium, Inworld, MiniMax, OpenAI, Volcengine, Vydra, xAI, or Xiaomi MiMo: - `ELEVENLABS_API_KEY` (or `XI_API_KEY`) - `GEMINI_API_KEY` (or `GOOGLE_API_KEY`) @@ -49,6 +50,9 @@ If you want ElevenLabs, Google Gemini, Gradium, Inworld, MiniMax, OpenAI, Vydra, `MINIMAX_OAUTH_TOKEN`, `MINIMAX_CODE_PLAN_KEY`, or `MINIMAX_CODING_API_KEY` - `OPENAI_API_KEY` +- `VOLCENGINE_TTS_API_KEY` (or `BYTEPLUS_SEED_SPEECH_API_KEY`); + legacy AppID/token auth also accepts `VOLCENGINE_TTS_APPID` and + `VOLCENGINE_TTS_TOKEN` - `VYDRA_API_KEY` - `XAI_API_KEY` - `XIAOMI_API_KEY` @@ -68,6 +72,7 @@ so that provider must also be authenticated if you enable summaries. - [Gradium](/providers/gradium) - [Inworld TTS API](https://docs.inworld.ai/tts/tts) - [MiniMax T2A v2 API](https://platform.minimaxi.com/document/T2A%20V2) +- [Volcengine TTS HTTP API](/providers/volcengine#text-to-speech) - [Xiaomi MiMo speech synthesis](/providers/xiaomi#text-to-speech) - [node-edge-tts](https://github.com/SchneeHertz/node-edge-tts) - [Microsoft Speech output formats](https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs) @@ -249,6 +254,35 @@ encoding, so do not pass a raw bearer token and do not Base64-encode it yourself. 
The key falls back to the `INWORLD_API_KEY` env var. See [Inworld provider](/providers/inworld) for full setup. +### Volcengine primary + +```json5 +{ + messages: { + tts: { + auto: "always", + provider: "volcengine", + providers: { + volcengine: { + apiKey: "byteplus_seed_speech_api_key", + resourceId: "seed-tts-1.0", + voice: "en_female_anna_mars_bigtts", + speedRatio: 1.0, + }, + }, + }, + }, +} +``` + +Volcengine TTS uses the BytePlus Seed Speech API key from the Speech Console, +not the OpenAI-compatible `VOLCANO_ENGINE_API_KEY` used for Doubao model +providers. Resolution order is `messages.tts.providers.volcengine.apiKey` -> +`VOLCENGINE_TTS_API_KEY` -> `BYTEPLUS_SEED_SPEECH_API_KEY`. Legacy AppID/token +auth still works through `messages.tts.providers.volcengine.appId` / `token` or +`VOLCENGINE_TTS_APPID` / `VOLCENGINE_TTS_TOKEN`. Voice-note targets request +provider-native `ogg_opus`; normal audio-file targets request `mp3`. + ### xAI primary ```json5 @@ -447,7 +481,7 @@ Then run: - `tagged` only sends audio when the reply includes `[[tts:key=value]]` directives or a `[[tts:text]]...[[/tts:text]]` block. - `enabled`: legacy toggle (doctor migrates this to `auto`). - `mode`: `"final"` (default) or `"all"` (includes tool/block replies). -- `provider`: speech provider id such as `"elevenlabs"`, `"google"`, `"gradium"`, `"inworld"`, `"microsoft"`, `"minimax"`, `"openai"`, `"vydra"`, `"xai"`, or `"xiaomi"` (fallback is automatic). +- `provider`: speech provider id such as `"elevenlabs"`, `"google"`, `"gradium"`, `"inworld"`, `"microsoft"`, `"minimax"`, `"openai"`, `"volcengine"`, `"vydra"`, `"xai"`, or `"xiaomi"` (fallback is automatic). - If `provider` is **unset**, OpenClaw uses the first configured speech provider in registry auto-select order. - Legacy `provider: "edge"` config is repaired by `openclaw doctor --fix` and rewritten to `provider: "microsoft"`. @@ -461,7 +495,7 @@ Then run: - `maxTextLength`: hard cap for TTS input (chars). 
`/tts audio` fails if exceeded. - `timeoutMs`: request timeout (ms). - `prefsPath`: override the local prefs JSON path (provider/limit/summary). -- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `GRADIUM_API_KEY`, `INWORLD_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`, `VYDRA_API_KEY`, `XAI_API_KEY`, `XIAOMI_API_KEY`). +- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `GRADIUM_API_KEY`, `INWORLD_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`, `VYDRA_API_KEY`, `XAI_API_KEY`, `XIAOMI_API_KEY`). Volcengine falls back to `VOLCENGINE_TTS_API_KEY`/`BYTEPLUS_SEED_SPEECH_API_KEY`; legacy `appId`/`token` auth is also supported. - `providers.elevenlabs.baseUrl`: override ElevenLabs API base URL. - `providers.openai.baseUrl`: override the OpenAI TTS endpoint. - Resolution order: `messages.tts.providers.openai.baseUrl` -> `OPENAI_TTS_BASE_URL` -> `https://api.openai.com/v1` @@ -497,6 +531,21 @@ Then run: - If `messages.tts.providers.google.apiKey` is omitted, TTS can reuse `models.providers.google.apiKey` before env fallback. - `providers.gradium.baseUrl`: override Gradium API base URL (default `https://api.gradium.ai`). - `providers.gradium.voiceId`: Gradium voice identifier (default Emma, `YTpq7expH9539ERJ`). +- `providers.volcengine.apiKey`: BytePlus Seed Speech API key (env: + `VOLCENGINE_TTS_API_KEY` or `BYTEPLUS_SEED_SPEECH_API_KEY`). +- `providers.volcengine.resourceId`: BytePlus Seed Speech resource id (default + `seed-tts-1.0`, env: `VOLCENGINE_TTS_RESOURCE_ID`; use `seed-tts-2.0` when + your BytePlus project has TTS 2.0 entitlement). +- `providers.volcengine.appKey`: BytePlus Seed Speech app key header (default + `aGjiRDfUWi`, env: `VOLCENGINE_TTS_APP_KEY`). +- `providers.volcengine.baseUrl`: override the Seed Speech TTS HTTP endpoint + (env: `VOLCENGINE_TTS_BASE_URL`). +- `providers.volcengine.appId`: legacy Volcengine Speech Console application id (env: `VOLCENGINE_TTS_APPID`). 
+- `providers.volcengine.token`: legacy Volcengine Speech Console access token (env: `VOLCENGINE_TTS_TOKEN`). +- `providers.volcengine.cluster`: legacy Volcengine TTS cluster (default `volcano_tts`, env: `VOLCENGINE_TTS_CLUSTER`). +- `providers.volcengine.voice`: voice type (default `en_female_anna_mars_bigtts`, env: `VOLCENGINE_TTS_VOICE`). +- `providers.volcengine.speedRatio`: provider-native speed ratio. +- `providers.volcengine.emotion`: provider-native emotion tag. - `providers.xai.apiKey`: xAI TTS API key (env: `XAI_API_KEY`). - `providers.xai.baseUrl`: override the xAI TTS base URL (default `https://api.x.ai/v1`, env: `XAI_BASE_URL`). - `providers.xai.voiceId`: xAI voice id (default `eve`; current live voices: `ara`, `eve`, `leo`, `rex`, `sal`, `una`). @@ -550,12 +599,13 @@ Here you go. Available directive keys (when enabled): -- `provider` (registered speech provider id, for example `openai`, `elevenlabs`, `google`, `gradium`, `minimax`, `microsoft`, `vydra`, `xai`, or `xiaomi`; requires `allowProvider: true`) -- `voice` (OpenAI, Gradium, or Xiaomi voice), `voiceName` / `voice_name` / `google_voice` (Google voice), or `voiceId` (ElevenLabs / Gradium / MiniMax / xAI) +- `provider` (registered speech provider id, for example `openai`, `elevenlabs`, `google`, `gradium`, `minimax`, `microsoft`, `volcengine`, `vydra`, `xai`, or `xiaomi`; requires `allowProvider: true`) +- `voice` (OpenAI, Gradium, Volcengine, or Xiaomi voice), `voiceName` / `voice_name` / `google_voice` (Google voice), or `voiceId` (ElevenLabs / Gradium / MiniMax / xAI) - `model` (OpenAI TTS model, ElevenLabs model id, MiniMax model, or Xiaomi MiMo TTS model) or `google_model` (Google TTS model) - `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost` - `vol` / `volume` (MiniMax volume, 0-10) - `pitch` (MiniMax integer pitch, -12 to 12; fractional values are truncated before the MiniMax request) +- `emotion` (Volcengine emotion tag) - `applyTextNormalization` (`auto|on|off`) - 
`languageCode` (ISO 639-1) - `seed` diff --git a/extensions/volcengine/index.ts b/extensions/volcengine/index.ts index a21f08b72b6..6e82de08471 100644 --- a/extensions/volcengine/index.ts +++ b/extensions/volcengine/index.ts @@ -3,6 +3,7 @@ import { createProviderApiKeyAuthMethod } from "openclaw/plugin-sdk/provider-aut import { ensureModelAllowlistEntry } from "openclaw/plugin-sdk/provider-onboard"; import { DOUBAO_CODING_MODEL_CATALOG, DOUBAO_MODEL_CATALOG } from "./models.js"; import { buildDoubaoCodingProvider, buildDoubaoProvider } from "./provider-catalog.js"; +import { buildVolcengineSpeechProvider } from "./speech-provider.js"; const PROVIDER_ID = "volcengine"; const VOLCENGINE_DEFAULT_MODEL_REF = "volcengine-plan/ark-code-latest"; @@ -78,5 +79,6 @@ export default definePluginEntry({ return [...volcengineModels, ...volcenginePlanModels]; }, }); + api.registerSpeechProvider(buildVolcengineSpeechProvider()); }, }); diff --git a/extensions/volcengine/openclaw.plugin.json b/extensions/volcengine/openclaw.plugin.json index e64f0a4048c..4958628cfd7 100644 --- a/extensions/volcengine/openclaw.plugin.json +++ b/extensions/volcengine/openclaw.plugin.json @@ -4,7 +4,13 @@ "providerDiscoveryEntry": "./provider-discovery.ts", "providers": ["volcengine", "volcengine-plan"], "providerAuthEnvVars": { - "volcengine": ["VOLCANO_ENGINE_API_KEY"] + "volcengine": ["VOLCANO_ENGINE_API_KEY"], + "volcengine-tts": [ + "VOLCENGINE_TTS_API_KEY", + "BYTEPLUS_SEED_SPEECH_API_KEY", + "VOLCENGINE_TTS_APPID", + "VOLCENGINE_TTS_TOKEN" + ] }, "providerAuthAliases": { "volcengine-plan": "volcengine" @@ -28,5 +34,8 @@ "type": "object", "additionalProperties": false, "properties": {} + }, + "contracts": { + "speechProviders": ["volcengine"] } } diff --git a/extensions/volcengine/speech-provider.ts b/extensions/volcengine/speech-provider.ts new file mode 100644 index 00000000000..17a2c25453d --- /dev/null +++ b/extensions/volcengine/speech-provider.ts @@ -0,0 +1,229 @@ +import { 
normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; +import type { + SpeechDirectiveTokenParseContext, + SpeechProviderConfig, + SpeechProviderOverrides, + SpeechProviderPlugin, +} from "openclaw/plugin-sdk/speech-core"; +import { asFiniteNumber, asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core"; +import { volcengineTTS, type VolcengineTtsEncoding } from "./tts.js"; + +const DEFAULT_VOICE = "en_female_anna_mars_bigtts"; +const DEFAULT_CLUSTER = "volcano_tts"; +const DEFAULT_RESOURCE_ID = "seed-tts-1.0"; +const DEFAULT_APP_KEY = "aGjiRDfUWi"; + +export const VOLCENGINE_VOICES: readonly string[] = [ + "en_female_anna_mars_bigtts", + "en_male_adam_mars_bigtts", + "en_female_sarah_mars_bigtts", + "en_male_smith_mars_bigtts", + "zh_female_cancan_mars_bigtts", + "zh_female_qingxinnvsheng_mars_bigtts", + "zh_female_linjia_mars_bigtts", + "zh_male_wennuanahu_moon_bigtts", + "zh_male_shaonianzixin_moon_bigtts", + "zh_female_shuangkuaisisi_moon_bigtts", +]; + +type VolcengineTtsProviderConfig = { + apiKey?: string; + appId?: string; + token?: string; + voice: string; + cluster: string; + resourceId: string; + appKey: string; + baseUrl?: string; + speedRatio?: number; + emotion?: string; +}; + +type VolcengineTtsProviderOverrides = { + voice?: string; + speedRatio?: number; + emotion?: string; +}; + +function normalizeVolcengineProviderConfig( + rawConfig: Record, +): VolcengineTtsProviderConfig { + const providers = asObject(rawConfig.providers); + const raw = asObject(providers?.volcengine) ?? asObject(rawConfig.volcengine); + return { + apiKey: normalizeResolvedSecretInputString({ + value: raw?.apiKey, + path: "messages.tts.providers.volcengine.apiKey", + }), + appId: trimToUndefined(raw?.appId), + token: normalizeResolvedSecretInputString({ + value: raw?.token, + path: "messages.tts.providers.volcengine.token", + }), + voice: + trimToUndefined(raw?.voice) ?? + trimToUndefined(process.env.VOLCENGINE_TTS_VOICE) ?? 
+ DEFAULT_VOICE, + cluster: + trimToUndefined(raw?.cluster) ?? + trimToUndefined(process.env.VOLCENGINE_TTS_CLUSTER) ?? + DEFAULT_CLUSTER, + resourceId: + trimToUndefined(raw?.resourceId) ?? + trimToUndefined(process.env.VOLCENGINE_TTS_RESOURCE_ID) ?? + DEFAULT_RESOURCE_ID, + appKey: + trimToUndefined(raw?.appKey) ?? + trimToUndefined(process.env.VOLCENGINE_TTS_APP_KEY) ?? + DEFAULT_APP_KEY, + baseUrl: trimToUndefined(raw?.baseUrl) ?? trimToUndefined(process.env.VOLCENGINE_TTS_BASE_URL), + speedRatio: asFiniteNumber(raw?.speedRatio), + emotion: trimToUndefined(raw?.emotion), + }; +} + +function resolveSeedSpeechApiKey(configApiKey?: string): string | undefined { + return ( + configApiKey ?? + trimToUndefined(process.env.VOLCENGINE_TTS_API_KEY) ?? + trimToUndefined(process.env.BYTEPLUS_SEED_SPEECH_API_KEY) + ); +} + +function readProviderConfig(config: SpeechProviderConfig): VolcengineTtsProviderConfig { + const normalized = normalizeVolcengineProviderConfig({}); + return { + apiKey: + normalizeResolvedSecretInputString({ + value: config.apiKey, + path: "messages.tts.providers.volcengine.apiKey", + }) ?? normalized.apiKey, + appId: trimToUndefined(config.appId) ?? normalized.appId, + token: trimToUndefined(config.token) ?? normalized.token, + voice: trimToUndefined(config.voice) ?? normalized.voice, + cluster: trimToUndefined(config.cluster) ?? normalized.cluster, + resourceId: trimToUndefined(config.resourceId) ?? normalized.resourceId, + appKey: trimToUndefined(config.appKey) ?? normalized.appKey, + baseUrl: trimToUndefined(config.baseUrl) ?? normalized.baseUrl, + speedRatio: asFiniteNumber(config.speedRatio) ?? normalized.speedRatio, + emotion: trimToUndefined(config.emotion) ?? 
normalized.emotion, + }; +} + +function readVolcengineOverrides( + overrides: SpeechProviderOverrides | undefined, +): VolcengineTtsProviderOverrides { + if (!overrides) { + return {}; + } + return { + voice: trimToUndefined(overrides.voice), + speedRatio: asFiniteNumber(overrides.speedRatio), + emotion: trimToUndefined(overrides.emotion), + }; +} + +function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): { + handled: boolean; + overrides?: SpeechProviderOverrides; + warnings?: string[]; +} { + switch (ctx.key) { + case "voice": + case "volcengine_voice": + case "volcenginevoice": + if (!ctx.policy.allowVoice) { + return { handled: true }; + } + return { handled: true, overrides: { ...ctx.currentOverrides, voice: ctx.value } }; + case "speed": + case "speedratio": + case "speed_ratio": { + if (!ctx.policy.allowVoiceSettings) { + return { handled: true }; + } + const speedRatio = Number(ctx.value); + if (!Number.isFinite(speedRatio) || speedRatio < 0.2 || speedRatio > 3.0) { + return { handled: true, warnings: [`invalid Volcengine speedRatio "${ctx.value}"`] }; + } + return { handled: true, overrides: { ...ctx.currentOverrides, speedRatio } }; + } + case "emotion": + if (!ctx.policy.allowVoiceSettings) { + return { handled: true }; + } + return { handled: true, overrides: { ...ctx.currentOverrides, emotion: ctx.value } }; + default: + return { handled: false }; + } +} + +export function buildVolcengineSpeechProvider(): SpeechProviderPlugin { + return { + id: "volcengine", + label: "Volcengine", + autoSelectOrder: 90, + aliases: ["bytedance", "doubao"], + voices: VOLCENGINE_VOICES, + resolveConfig: ({ rawConfig }) => normalizeVolcengineProviderConfig(rawConfig), + parseDirectiveToken, + + listVoices: async () => + VOLCENGINE_VOICES.map((v) => ({ + id: v, + name: v.replace(/^(?:en|zh)_(female|male)_/, "").replace(/_.*$/, ""), + locale: v.startsWith("en_") ? "en-US" : "zh-CN", + gender: v.includes("_female_") ? 
"female" : "male", + })), + + isConfigured: ({ providerConfig }) => { + const cfg = readProviderConfig(providerConfig); + return Boolean( + resolveSeedSpeechApiKey(cfg.apiKey) || + ((cfg.appId || process.env.VOLCENGINE_TTS_APPID) && + (cfg.token || process.env.VOLCENGINE_TTS_TOKEN)), + ); + }, + + synthesize: async (req) => { + const cfg = readProviderConfig(req.providerConfig); + const overrides = readVolcengineOverrides(req.providerOverrides); + const apiKey = resolveSeedSpeechApiKey(cfg.apiKey); + const appId = cfg.appId || process.env.VOLCENGINE_TTS_APPID; + const token = cfg.token || process.env.VOLCENGINE_TTS_TOKEN; + + if (!apiKey && (!appId || !token)) { + throw new Error( + "Volcengine TTS credentials missing. Set VOLCENGINE_TTS_API_KEY, " + + "BYTEPLUS_SEED_SPEECH_API_KEY, or legacy VOLCENGINE_TTS_APPID and VOLCENGINE_TTS_TOKEN.", + ); + } + + const isVoiceNote = req.target === "voice-note"; + const encoding: VolcengineTtsEncoding = isVoiceNote ? "ogg_opus" : "mp3"; + + const audioBuffer = await volcengineTTS({ + text: req.text, + apiKey, + appId, + token, + voice: overrides.voice ?? cfg.voice, + cluster: cfg.cluster, + resourceId: cfg.resourceId, + appKey: cfg.appKey, + baseUrl: cfg.baseUrl, + speedRatio: overrides.speedRatio ?? cfg.speedRatio, + emotion: overrides.emotion ?? cfg.emotion, + encoding, + timeoutMs: req.timeoutMs, + }); + + return { + audioBuffer, + outputFormat: encoding === "ogg_opus" ? "opus" : "mp3", + fileExtension: encoding === "ogg_opus" ? ".opus" : ".mp3", + voiceCompatible: isVoiceNote, + }; + }, + }; +} diff --git a/extensions/volcengine/tts.live.test.ts b/extensions/volcengine/tts.live.test.ts new file mode 100644 index 00000000000..5495c475e5e --- /dev/null +++ b/extensions/volcengine/tts.live.test.ts @@ -0,0 +1,30 @@ +import { describe, expect, it } from "vitest"; +import { volcengineTTS } from "./tts.js"; + +const seedSpeechApiKey = + process.env.VOLCENGINE_TTS_API_KEY ?? 
process.env.BYTEPLUS_SEED_SPEECH_API_KEY; +const hasVolcengineTtsCredentials = Boolean( + seedSpeechApiKey || (process.env.VOLCENGINE_TTS_APPID && process.env.VOLCENGINE_TTS_TOKEN), +); +const describeLive = + process.env.OPENCLAW_LIVE_TEST === "1" && hasVolcengineTtsCredentials ? describe : describe.skip; + +describeLive("Volcengine TTS live", () => { + it("synthesizes mp3 audio with .profile credentials", async () => { + const audio = await volcengineTTS({ + text: "OpenClaw live test.", + apiKey: seedSpeechApiKey, + appId: process.env.VOLCENGINE_TTS_APPID, + token: process.env.VOLCENGINE_TTS_TOKEN, + voice: process.env.VOLCENGINE_TTS_VOICE, + cluster: process.env.VOLCENGINE_TTS_CLUSTER, + resourceId: process.env.VOLCENGINE_TTS_RESOURCE_ID, + appKey: process.env.VOLCENGINE_TTS_APP_KEY, + baseUrl: process.env.VOLCENGINE_TTS_BASE_URL, + encoding: "mp3", + timeoutMs: 30_000, + }); + + expect(audio.length).toBeGreaterThan(128); + }); +}); diff --git a/extensions/volcengine/tts.test.ts b/extensions/volcengine/tts.test.ts new file mode 100644 index 00000000000..dd08824e5b7 --- /dev/null +++ b/extensions/volcengine/tts.test.ts @@ -0,0 +1,272 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; +import { buildVolcengineSpeechProvider } from "./speech-provider.js"; +import { volcengineTTS } from "./tts.js"; + +const { fetchWithSsrFGuardMock } = vi.hoisted(() => ({ + fetchWithSsrFGuardMock: vi.fn(), +})); + +vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({ + fetchWithSsrFGuard: fetchWithSsrFGuardMock, +})); + +function makeProviderConfig(overrides?: Record) { + return { + apiKey: "test-api-key", + voice: "en_female_anna_mars_bigtts", + ...overrides, + }; +} + +function makeLegacyProviderConfig(overrides?: Record) { + return { + appId: "test-app-id", + token: "test-token", + voice: "zh_female_xiaohe_uranus_bigtts", + cluster: "volcano_tts", + ...overrides, + }; +} + +function clearTtsEnv() { + delete process.env.BYTEPLUS_API_KEY; + delete 
process.env.BYTEPLUS_SEED_SPEECH_API_KEY; + delete process.env.VOLCENGINE_TTS_API_KEY; + delete process.env.VOLCENGINE_TTS_APPID; + delete process.env.VOLCENGINE_TTS_TOKEN; +} + +describe("Volcengine speech provider", () => { + const provider = buildVolcengineSpeechProvider(); + + beforeEach(() => { + fetchWithSsrFGuardMock.mockReset(); + }); + + it("has correct id, label, and aliases", () => { + expect(provider.id).toBe("volcengine"); + expect(provider.label).toBe("Volcengine"); + expect(provider.aliases).toContain("bytedance"); + expect(provider.aliases).toContain("doubao"); + }); + + it("reports configured when an API key is present in providerConfig", () => { + expect(provider.isConfigured({ providerConfig: makeProviderConfig(), timeoutMs: 30000 })).toBe( + true, + ); + }); + + it("reports configured for legacy appId and token in providerConfig", () => { + expect( + provider.isConfigured({ providerConfig: makeLegacyProviderConfig(), timeoutMs: 30000 }), + ).toBe(true); + }); + + it("reports not configured when credentials are missing", () => { + const oldBytePlusKey = process.env.BYTEPLUS_API_KEY; + const oldSeedKey = process.env.BYTEPLUS_SEED_SPEECH_API_KEY; + const oldApiKey = process.env.VOLCENGINE_TTS_API_KEY; + const oldAppId = process.env.VOLCENGINE_TTS_APPID; + const oldToken = process.env.VOLCENGINE_TTS_TOKEN; + clearTtsEnv(); + try { + expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30000 })).toBe(false); + } finally { + if (oldBytePlusKey) { + process.env.BYTEPLUS_API_KEY = oldBytePlusKey; + } + if (oldSeedKey) { + process.env.BYTEPLUS_SEED_SPEECH_API_KEY = oldSeedKey; + } + if (oldApiKey) { + process.env.VOLCENGINE_TTS_API_KEY = oldApiKey; + } + if (oldAppId) { + process.env.VOLCENGINE_TTS_APPID = oldAppId; + } + if (oldToken) { + process.env.VOLCENGINE_TTS_TOKEN = oldToken; + } + } + }); + + it("falls back to env vars for credentials", () => { + const oldBytePlusKey = process.env.BYTEPLUS_API_KEY; + const oldSeedKey = 
process.env.BYTEPLUS_SEED_SPEECH_API_KEY; + const oldApiKey = process.env.VOLCENGINE_TTS_API_KEY; + const oldAppId = process.env.VOLCENGINE_TTS_APPID; + const oldToken = process.env.VOLCENGINE_TTS_TOKEN; + clearTtsEnv(); + process.env.BYTEPLUS_SEED_SPEECH_API_KEY = "env-api-key"; + try { + expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30000 })).toBe(true); + } finally { + if (oldBytePlusKey) { + process.env.BYTEPLUS_API_KEY = oldBytePlusKey; + } + if (oldSeedKey) { + process.env.BYTEPLUS_SEED_SPEECH_API_KEY = oldSeedKey; + } else { + delete process.env.BYTEPLUS_SEED_SPEECH_API_KEY; + } + if (oldApiKey) { + process.env.VOLCENGINE_TTS_API_KEY = oldApiKey; + } + if (oldAppId) { + process.env.VOLCENGINE_TTS_APPID = oldAppId; + } else { + delete process.env.VOLCENGINE_TTS_APPID; + } + if (oldToken) { + process.env.VOLCENGINE_TTS_TOKEN = oldToken; + } else { + delete process.env.VOLCENGINE_TTS_TOKEN; + } + } + }); + + it("lists voices with locale and gender", async () => { + const voices = await provider.listVoices!({}); + expect(voices.length).toBeGreaterThan(0); + expect(voices[0]).toMatchObject({ locale: "en-US" }); + expect(voices[0].gender).toBeDefined(); + }); + + it("sends the documented Seed Speech API key payload and returns voice-note Opus metadata", async () => { + const release = vi.fn(); + fetchWithSsrFGuardMock.mockResolvedValue({ + response: new Response( + JSON.stringify({ + code: 0, + data: Buffer.from("voice-audio").toString("base64"), + }), + ), + release, + }); + + const result = await provider.synthesize({ + text: "hello", + cfg: {}, + providerConfig: makeProviderConfig({ emotion: "happy", speedRatio: 1.2 }), + target: "voice-note", + providerOverrides: { voice: "zh_male_aojiao_mars_bigtts", speedRatio: 0.9 }, + timeoutMs: 1234, + }); + + expect(result.audioBuffer.toString()).toBe("voice-audio"); + expect(result.outputFormat).toBe("opus"); + expect(result.fileExtension).toBe(".opus"); + expect(result.voiceCompatible).toBe(true); + + 
const call = fetchWithSsrFGuardMock.mock.calls[0]?.[0]; + expect(call).toMatchObject({ + url: "https://voice.ap-southeast-1.bytepluses.com/api/v3/tts/unidirectional", + timeoutMs: 1234, + policy: { hostnameAllowlist: ["voice.ap-southeast-1.bytepluses.com"] }, + auditContext: "volcengine.tts", + }); + expect(call.init.headers["X-Api-Key"]).toBe("test-api-key"); + expect(call.init.headers["X-Api-Resource-Id"]).toBe("seed-tts-1.0"); + expect(call.init.headers["X-Api-App-Key"]).toBe("aGjiRDfUWi"); + const body = JSON.parse(call.init.body); + expect(body.req_params).toMatchObject({ + text: "hello", + speaker: "zh_male_aojiao_mars_bigtts", + speed_ratio: 0.9, + emotion: "happy", + audio_params: { + format: "ogg_opus", + sample_rate: 24000, + }, + }); + expect(release).toHaveBeenCalledTimes(1); + }); +}); + +describe("volcengineTTS", () => { + beforeEach(() => { + fetchWithSsrFGuardMock.mockReset(); + }); + + it("joins streamed Seed Speech audio frames", async () => { + const release = vi.fn(); + fetchWithSsrFGuardMock.mockResolvedValue({ + response: new Response( + [ + JSON.stringify({ code: 0, message: "" }), + JSON.stringify({ code: 0, data: Buffer.from("audio-1").toString("base64") }), + JSON.stringify({ code: 0, data: Buffer.from("audio-2").toString("base64") }), + JSON.stringify({ code: 20000000, message: "ok", data: null }), + ].join("\n"), + ), + release, + }); + + const audio = await volcengineTTS({ + text: "hello", + apiKey: "secret-api-key", + voice: "zh_female_xiaohe_uranus_bigtts", + encoding: "mp3", + timeoutMs: 1000, + }); + + expect(audio.toString()).toBe("audio-1audio-2"); + expect(release).toHaveBeenCalledTimes(1); + }); + + it("reports Seed Speech provider errors without exposing credentials", async () => { + const release = vi.fn(); + fetchWithSsrFGuardMock.mockResolvedValue({ + response: new Response( + JSON.stringify({ header: { code: 45000000, message: "speaker permission denied" } }), + { status: 403 }, + ), + release, + }); + + let error: unknown; 
+ try { + await volcengineTTS({ + text: "hello", + apiKey: "secret-api-key", + timeoutMs: 1000, + }); + } catch (err) { + error = err; + } + + expect(error).toBeInstanceOf(Error); + expect((error as Error).message).toBe( + "BytePlus Seed Speech TTS error 45000000: speaker permission denied", + ); + expect((error as Error).message).not.toContain("secret-api-key"); + expect(release).toHaveBeenCalledTimes(1); + }); + + it("reports provider errors without exposing credentials", async () => { + const release = vi.fn(); + fetchWithSsrFGuardMock.mockResolvedValue({ + response: new Response(JSON.stringify({ code: 3001, message: "load grant failed" }), { + status: 401, + }), + release, + }); + + let error: unknown; + try { + await volcengineTTS({ + text: "hello", + appId: "app-id", + token: "secret-token", + timeoutMs: 1000, + }); + } catch (err) { + error = err; + } + + expect(error).toBeInstanceOf(Error); + expect((error as Error).message).toBe("Volcengine TTS error 3001: load grant failed"); + expect((error as Error).message).not.toContain("secret-token"); + expect(release).toHaveBeenCalledTimes(1); + }); +}); diff --git a/extensions/volcengine/tts.ts b/extensions/volcengine/tts.ts new file mode 100644 index 00000000000..b66a171fe0a --- /dev/null +++ b/extensions/volcengine/tts.ts @@ -0,0 +1,266 @@ +import * as crypto from "node:crypto"; +import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime"; + +export type VolcengineTtsEncoding = "ogg_opus" | "mp3" | "pcm" | "wav"; + +export type VolcengineTTSParams = { + text: string; + apiKey?: string; + appId?: string; + token?: string; + voice?: string; + cluster?: string; + resourceId?: string; + appKey?: string; + baseUrl?: string; + speedRatio?: number; + volumeRatio?: number; + pitchRatio?: number; + emotion?: string; + encoding?: VolcengineTtsEncoding; + timeoutMs?: number; +}; + +const DEFAULT_SEED_VOICE = "en_female_anna_mars_bigtts"; +const DEFAULT_LEGACY_VOICE = "zh_female_xiaohe_uranus_bigtts"; +const 
DEFAULT_CLUSTER = "volcano_tts"; +const DEFAULT_SEED_TTS_RESOURCE_ID = "seed-tts-1.0"; +const DEFAULT_SEED_TTS_APP_KEY = "aGjiRDfUWi"; +const BYTEPLUS_SEED_TTS_URL = + "https://voice.ap-southeast-1.bytepluses.com/api/v3/tts/unidirectional"; +const VOLCENGINE_LEGACY_TTS_URL = "https://openspeech.bytedance.com/api/v1/tts"; + +type VolcengineTtsResponse = { + code?: number; + message?: string; + data?: string; +}; + +function parseJsonObject(text: string, providerName: string): Record { + try { + const parsed = JSON.parse(text) as unknown; + if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { + throw new Error("expected JSON object"); + } + return parsed as Record; + } catch (err) { + const detail = err instanceof Error ? err.message : String(err); + throw new Error(`${providerName} TTS: failed to parse response JSON: ${detail}`, { + cause: err, + }); + } +} + +function toTtsResponse(parsed: Record): VolcengineTtsResponse { + const header = + parsed.header && typeof parsed.header === "object" && !Array.isArray(parsed.header) + ? (parsed.header as Record) + : undefined; + return { + code: + typeof parsed.code === "number" + ? parsed.code + : typeof header?.code === "number" + ? header.code + : undefined, + message: + typeof parsed.message === "string" + ? parsed.message + : typeof header?.message === "string" + ? header.message + : undefined, + data: typeof parsed.data === "string" ? parsed.data : undefined, + }; +} + +function parseLegacyTtsResponse(text: string): VolcengineTtsResponse { + return toTtsResponse(parseJsonObject(text, "Volcengine")); +} + +function parseSeedTtsFrames(text: string): VolcengineTtsResponse[] { + const trimmed = text.trim(); + if (!trimmed) { + return []; + } + + try { + return [toTtsResponse(parseJsonObject(trimmed, "BytePlus Seed Speech"))]; + } catch { + // The HTTP API streams JSON frames; Response.text() preserves line breaks. 
+ } + + const frames: VolcengineTtsResponse[] = []; + for (const line of trimmed.split(/\r?\n/)) { + const item = line.trim(); + if (!item) { + continue; + } + const json = item.startsWith("data:") ? item.slice("data:".length).trim() : item; + frames.push(toTtsResponse(parseJsonObject(json, "BytePlus Seed Speech"))); + } + return frames; +} + +function hostnameAllowlist(url: string): string[] { + return [new URL(url).hostname]; +} + +function seedAudioFormat(encoding: VolcengineTtsEncoding): "ogg_opus" | "mp3" | "pcm" { + return encoding === "wav" ? "pcm" : encoding; +} + +async function seedSpeechTTS(params: VolcengineTTSParams & { apiKey: string }): Promise { + const { + text, + apiKey, + voice = DEFAULT_SEED_VOICE, + resourceId = DEFAULT_SEED_TTS_RESOURCE_ID, + appKey = DEFAULT_SEED_TTS_APP_KEY, + baseUrl = BYTEPLUS_SEED_TTS_URL, + speedRatio = 1.0, + emotion, + encoding = "ogg_opus", + timeoutMs = 30_000, + } = params; + const audioFormat = seedAudioFormat(encoding); + + const payload = JSON.stringify({ + user: { uid: "openclaw" }, + req_params: { + text, + speaker: voice, + audio_params: { + format: audioFormat, + sample_rate: 24_000, + }, + ...(speedRatio !== 1.0 ? { speed_ratio: speedRatio } : {}), + ...(emotion ? 
{ emotion } : {}), + }, + }); + + const { response, release } = await fetchWithSsrFGuard({ + url: baseUrl, + init: { + method: "POST", + headers: { + "Content-Type": "application/json", + Connection: "keep-alive", + "X-Api-Key": apiKey, + "X-Api-Resource-Id": resourceId, + "X-Api-App-Key": appKey, + }, + body: payload, + }, + timeoutMs, + policy: { hostnameAllowlist: hostnameAllowlist(baseUrl) }, + auditContext: "volcengine.tts", + }); + + try { + const frames = parseSeedTtsFrames(await response.text()); + const chunks: Buffer[] = []; + for (const frame of frames) { + if (frame.code === 0) { + if (frame.data) { + chunks.push(Buffer.from(frame.data, "base64")); + } + continue; + } + if (frame.code === 20000000) { + continue; + } + throw new Error( + `BytePlus Seed Speech TTS error ${frame.code ?? response.status}: ${ + frame.message ?? "unknown" + }`, + ); + } + + if (!response.ok || chunks.length === 0) { + throw new Error(`BytePlus Seed Speech TTS error ${response.status}: no audio data`); + } + + return Buffer.concat(chunks); + } finally { + await release(); + } +} + +async function legacyVolcengineTTS( + params: VolcengineTTSParams & { appId: string; token: string }, +): Promise { + const { + text, + appId, + token, + voice = DEFAULT_LEGACY_VOICE, + cluster = DEFAULT_CLUSTER, + baseUrl = VOLCENGINE_LEGACY_TTS_URL, + speedRatio = 1.0, + volumeRatio = 1.0, + pitchRatio = 1.0, + emotion, + encoding = "ogg_opus", + timeoutMs = 30_000, + } = params; + + const payload = JSON.stringify({ + app: { appid: appId, token, cluster }, + user: { uid: "openclaw" }, + audio: { + voice_type: voice, + encoding, + speed_ratio: speedRatio, + volume_ratio: volumeRatio, + pitch_ratio: pitchRatio, + ...(emotion ? 
{ emotion } : {}), + }, + request: { + reqid: crypto.randomUUID(), + text, + text_type: "plain", + operation: "query", + }, + }); + + const { response, release } = await fetchWithSsrFGuard({ + url: baseUrl, + init: { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer;${token}`, + }, + body: payload, + }, + timeoutMs, + policy: { hostnameAllowlist: hostnameAllowlist(baseUrl) }, + auditContext: "volcengine.tts", + }); + + try { + const body = parseLegacyTtsResponse(await response.text()); + if (!response.ok || body.code !== 3000 || !body.data) { + throw new Error( + `Volcengine TTS error ${body.code ?? response.status}: ${body.message ?? "unknown"}`, + ); + } + return Buffer.from(body.data, "base64"); + } finally { + await release(); + } +} + +export async function volcengineTTS(params: VolcengineTTSParams): Promise { + if (params.apiKey) { + return seedSpeechTTS({ ...params, apiKey: params.apiKey }); + } + + if (params.appId && params.token) { + return legacyVolcengineTTS({ ...params, appId: params.appId, token: params.token }); + } + + throw new Error( + "Volcengine TTS credentials missing. Set a BytePlus Seed Speech API key or legacy AppID/token.", + ); +}