feat(tts): add per-agent voice overrides

This commit is contained in:
Peter Steinberger
2026-04-26 02:45:45 +01:00
parent 1bc9bada65
commit 0ca952cdd5
31 changed files with 605 additions and 34 deletions

View File

@@ -9,6 +9,9 @@ Docs: https://docs.openclaw.ai
### Changes
- Plugins/tokenjuice: bump the bundled tokenjuice runtime to 0.6.3. Thanks @vincentkoc.
- TTS/agents: allow `agents.list[].tts` to override global
`messages.tts` for per-agent voices while keeping shared provider
credentials and preferences in the existing TTS config surface.
- Providers/Azure Speech: add Azure Speech as a bundled TTS provider with
Speech-resource auth, voice listing, SSML escaping, native Ogg/Opus
voice-note output, and telephony output. (#51776) Thanks @leonchui.

View File

@@ -1,4 +1,4 @@
211e9d4cdb309e7fe0c1ed91d060201240a9287f8c5cb3c893aba3f904a20d30 config-baseline.json
ffda2d2911adc03148a368f3b40b17cbdcb7af0066bccdc555e8d596cdea8cda config-baseline.core.json
3efb041739877bd5387ffc87e0ddd11be43d80d38e7779407ce8091dcb797e5e config-baseline.json
5c6e35c5846f654d717d4b20853649e0b45a746423834f539b2a2223abcd5226 config-baseline.core.json
7cd9c908f066c143eab2a201efbc9640f483ab28bba92ddeca1d18cc2b528bc3 config-baseline.channel.json
9e131d7734f8b9cc9e7f8af6cc6b6dc81c9971dc551fadbe66fb0d682173f32d config-baseline.plugin.json
a5479c182ec987bb21e814b8a4e7b3bda7190ae5c2b35fd5ca403dfa48afa115 config-baseline.plugin.json

View File

@@ -1,2 +1,2 @@
c911117176b41eebf26470618274a7e093910e9b36855bc045bc8a92f6856745 plugin-sdk-api-baseline.json
ff360635f95beb217b9dd207a87eaf331319a7671aea03acfe05911756741b21 plugin-sdk-api-baseline.jsonl
6eb33044c2a4726f1aeb2d18052643c38c8bf5244bb970f969b1583365063e8b plugin-sdk-api-baseline.json
06e70516047f98d78963c238f1671feb3eea7c7e559c6fa84f403b9562028bb2 plugin-sdk-api-baseline.jsonl

View File

@@ -915,6 +915,11 @@ scripts/sandbox-browser-setup.sh # optional browser image
fastModeDefault: false, // per-agent fast mode override
embeddedHarness: { runtime: "auto", fallback: "pi" },
params: { cacheRetention: "none" }, // overrides matching defaults.models params by key
tts: {
providers: {
elevenlabs: { voiceId: "EXAVITQu4vr4xnSDxMaL" },
},
},
skills: ["docs-search"], // replaces agents.defaults.skills when set
identity: {
name: "Samantha",
@@ -950,6 +955,7 @@ scripts/sandbox-browser-setup.sh # optional browser image
- `default`: when multiple are set, first wins (warning logged). If none set, first list entry is default.
- `model`: string form overrides `primary` only; object form `{ primary, fallbacks }` overrides both (`[]` disables global fallbacks). Cron jobs that only override `primary` still inherit default fallbacks unless you set `fallbacks: []`.
- `params`: per-agent stream params merged over the selected model entry in `agents.defaults.models`. Use this for agent-specific overrides like `cacheRetention`, `temperature`, or `maxTokens` without duplicating the whole model catalog.
- `tts`: optional per-agent text-to-speech overrides. The block deep-merges over `messages.tts`, so keep shared provider credentials and fallback policy in `messages.tts` and set only persona-specific values such as provider, voice, model, style, or auto mode here.
- `skills`: optional per-agent skill allowlist. If omitted, the agent inherits `agents.defaults.skills` when set; an explicit list replaces defaults instead of merging, and `[]` means no skills.
- `thinkingDefault`: optional per-agent default thinking level (`off | minimal | low | medium | high | xhigh | adaptive | max`). Overrides `agents.defaults.thinkingDefault` for this agent when no per-message or session override is set. The selected provider/model profile controls which values are valid; for Google Gemini, `adaptive` keeps provider-owned dynamic thinking (`thinkingLevel` omitted on Gemini 3/3.1, `thinkingBudget: -1` on Gemini 2.5).
- `reasoningDefault`: optional per-agent default reasoning visibility (`on | off | stream`). Applies when no per-message or session reasoning override is set.

View File

@@ -35,6 +35,7 @@ Scope intent:
- `models.providers.*.request.tls.passphrase`
- `skills.entries.*.apiKey`
- `agents.defaults.memorySearch.remote.apiKey`
- `agents.list[].tts.providers.*.apiKey`
- `agents.list[].memorySearch.remote.apiKey`
- `talk.providers.*.apiKey`
- `messages.tts.providers.*.apiKey`

View File

@@ -29,6 +29,13 @@
"secretShape": "secret_input",
"optIn": true
},
{
"id": "agents.list[].tts.providers.*.apiKey",
"configFile": "openclaw.json",
"path": "agents.list[].tts.providers.*.apiKey",
"secretShape": "secret_input",
"optIn": true
},
{
"id": "auth-profiles.api_key.key",
"configFile": "auth-profiles.json",

View File

@@ -109,6 +109,50 @@ Full schema is in [Gateway configuration](/gateway/configuration).
}
```
### Per-agent voice overrides
Use `agents.list[].tts` when one agent should speak with a different provider,
voice, model, style, or auto-TTS mode. The agent block deep-merges over
`messages.tts`, so provider credentials can stay in the global provider config.
```json5
{
messages: {
tts: {
auto: "always",
provider: "elevenlabs",
providers: {
elevenlabs: {
apiKey: "${ELEVENLABS_API_KEY}",
model: "eleven_multilingual_v2",
},
},
},
},
agents: {
list: [
{
id: "reader",
tts: {
providers: {
elevenlabs: {
voiceId: "EXAVITQu4vr4xnSDxMaL",
},
},
},
},
],
},
}
```
Precedence for automatic replies, from lowest to highest (each later source overrides the ones before it):
1. `messages.tts`
2. active `agents.list[].tts`
3. local `/tts` preferences for this host
4. inline `[[tts:...]]` directives when model overrides are enabled
### OpenAI primary with ElevenLabs fallback
```json5
@@ -702,7 +746,8 @@ Stored fields:
- `maxLength` (summary threshold; default 1500 chars)
- `summarize` (default `true`)
These override `messages.tts.*` for that host.
These override the effective config from `messages.tts` plus the active
`agents.list[].tts` block for that host.
## Output formats (fixed)

View File

@@ -49,7 +49,7 @@ vi.mock("../api.js", async () => {
};
});
const { _test, maybeApplyTtsToPayload } = await import("./tts.js");
const { _test, maybeApplyTtsToPayload, resolveTtsConfig } = await import("./tts.js");
const nativeVoiceNoteChannels = ["discord", "feishu", "matrix", "telegram", "whatsapp"] as const;
@@ -158,3 +158,82 @@ describe("speech-core native voice-note routing", () => {
});
});
});
// Covers resolveTtsConfig(cfg, agentId): the tts block of the matching
// agents.list[] entry is layered over messages.tts.
describe("speech-core per-agent TTS config", () => {
it("deep-merges the active agent TTS override over messages.tts", () => {
// The agent entry only changes the OpenAI voice; apiKey and speed are set
// solely in messages.tts and must survive the merge.
const cfg = {
messages: {
tts: {
enabled: true,
provider: "openai",
providers: {
openai: {
apiKey: "${OPENAI_API_KEY}",
voice: "coral",
speed: 1,
},
},
},
},
agents: {
list: [
{
id: "reader",
tts: {
provider: "openai",
providers: {
openai: {
voice: "nova",
},
},
},
},
],
},
} satisfies OpenClawConfig;
const resolved = resolveTtsConfig(cfg, "reader");
// voice comes from the agent override, everything else from messages.tts.
expect(resolved.rawConfig).toMatchObject({
enabled: true,
provider: "openai",
providers: {
openai: {
apiKey: "${OPENAI_API_KEY}",
voice: "nova",
speed: 1,
},
},
});
});
it("ignores prototype-pollution keys in agent TTS overrides", () => {
const cfg = {
messages: {
tts: {
provider: "openai",
providers: {
openai: {
voice: "coral",
},
},
},
},
agents: {
list: [
{
id: "reader",
// JSON.parse creates "__proto__" as an own enumerable key (an object
// literal would set the prototype instead), so the merge sees it as data.
tts: JSON.parse(
'{"providers":{"openai":{"voice":"nova","__proto__":{"polluted":true}}}}',
),
},
],
},
} as OpenClawConfig;
const resolved = resolveTtsConfig(cfg, "reader");
// The merged provider config must not carry the injected key ...
expect(resolved.rawConfig?.providers?.openai).toEqual({ voice: "nova" });
// ... and Object.prototype must be untouched.
expect(({} as Record<string, unknown>).polluted).toBeUndefined();
});
});

View File

@@ -62,6 +62,7 @@ const DEFAULT_TIMEOUT_MS = 30_000;
const DEFAULT_TTS_MAX_LENGTH = 1500;
const DEFAULT_TTS_SUMMARIZE = true;
const DEFAULT_MAX_TEXT_LENGTH = 4096;
const BLOCKED_MERGE_KEYS = new Set(["__proto__", "prototype", "constructor"]);
type TtsUserPrefs = {
tts?: {
@@ -240,6 +241,48 @@ function resolveRawProviderConfig(
return asProviderConfig(direct);
}
/**
 * Type guard for values that can be merged key by key.
 *
 * Accepts any non-null value whose `typeof` is "object" and that is not an
 * array. No prototype check is made, so class instances (for example `Date`)
 * also pass.
 */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
}
/**
 * Recursively layers `override` on top of `base` and returns the merged value.
 * Neither input is mutated.
 *
 * Rules:
 * - An `undefined` override (at the top level or for a single key) keeps the
 *   base value.
 * - A non-object override (primitive, `null`, array) replaces the base value.
 * - An object override is merged key by key. Object-valued keys are always
 *   rebuilt through this function, even when `base` has no matching key, so
 *   keys in `BLOCKED_MERGE_KEYS` are dropped at every depth of the override and
 *   the result never aliases the override's nested objects.
 * - Own keys of `base` are copied as they are; only the override is filtered.
 *
 * @param base - Value providing defaults (for example the global TTS config).
 * @param override - Value whose defined entries win over `base`.
 * @returns The merged value; a fresh object whenever `override` is an object.
 */
function deepMergeDefined(base: unknown, override: unknown): unknown {
  if (override === undefined) {
    return base;
  }
  if (!isPlainObject(override)) {
    return override;
  }
  // Start from a shallow copy of the base when it is mergeable; otherwise start
  // empty so the override is still copied and filtered instead of returned as-is.
  const result: Record<string, unknown> = isPlainObject(base) ? { ...base } : {};
  for (const [key, value] of Object.entries(override)) {
    if (BLOCKED_MERGE_KEYS.has(key) || value === undefined) {
      continue;
    }
    // Own-property check: inherited members such as `toString` must not be
    // treated as an existing base value.
    const existing = Object.prototype.hasOwnProperty.call(result, key) ? result[key] : undefined;
    result[key] = deepMergeDefined(existing, value);
  }
  return result;
}
/**
 * Produces the comparison form of an agent id by delegating to
 * `normalizeLowercaseStringOrEmpty`.
 *
 * NOTE(review): judging by the helper's name, missing ids become an empty
 * string and casing is folded to lowercase; confirm against the helper.
 */
function normalizeAgentConfigId(value: string | undefined | null): string {
  const comparableId = normalizeLowercaseStringOrEmpty(value);
  return comparableId;
}
/**
 * Looks up the `tts` block of the agent entry whose normalized id matches
 * `agentId`.
 *
 * @param cfg - Full configuration; only `cfg.agents.list` is read.
 * @param agentId - Id of the active agent; a falsy id means "no agent".
 * @returns The first matching entry's `tts` value, or `undefined` when there
 *   is no agent id, no agent list, or no matching entry.
 */
function resolveAgentTtsOverride(
  cfg: OpenClawConfig,
  agentId: string | undefined,
): TtsConfig | undefined {
  if (!agentId) {
    return undefined;
  }
  const agentEntries = cfg.agents?.list;
  if (!Array.isArray(agentEntries)) {
    return undefined;
  }
  const wantedId = normalizeAgentConfigId(agentId);
  // The first entry with a matching id wins.
  for (const candidate of agentEntries) {
    if (normalizeAgentConfigId(candidate.id) === wantedId) {
      return candidate.tts;
    }
  }
  return undefined;
}
/**
 * Builds the raw TTS config in effect for an agent: `messages.tts` (or an
 * empty object when unset) with the agent's `tts` override deep-merged on top.
 *
 * @param cfg - Full configuration.
 * @param agentId - Optional id of the active agent; without a matching agent
 *   override the global config is merged with an empty object.
 */
function resolveEffectiveTtsRawConfig(cfg: OpenClawConfig, agentId?: string): TtsConfig {
  const globalTts = cfg.messages?.tts ?? {};
  const agentTts = resolveAgentTtsOverride(cfg, agentId) ?? {};
  const merged = deepMergeDefined(globalTts, agentTts);
  return merged as TtsConfig;
}
function resolveLazyProviderConfig(
config: ResolvedTtsConfig,
providerId: string,
@@ -313,8 +356,8 @@ export function getResolvedSpeechProviderConfig(
return resolveLazyProviderConfig(config, canonical, cfg);
}
export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
const raw: TtsConfig = cfg.messages?.tts ?? {};
export function resolveTtsConfig(cfg: OpenClawConfig, agentId?: string): ResolvedTtsConfig {
const raw: TtsConfig = resolveEffectiveTtsRawConfig(cfg, agentId);
const providerSource = raw.provider ? "config" : "default";
const timeoutMs = raw.timeoutMs ?? DEFAULT_TIMEOUT_MS;
const auto = resolveConfiguredTtsAutoMode(raw);
@@ -367,11 +410,15 @@ export function resolveTtsAutoMode(params: {
return params.config.auto;
}
function resolveEffectiveTtsAutoState(params: { cfg: OpenClawConfig; sessionAuto?: string }): {
function resolveEffectiveTtsAutoState(params: {
cfg: OpenClawConfig;
sessionAuto?: string;
agentId?: string;
}): {
autoMode: TtsAutoMode;
prefsPath: string;
} {
const raw: TtsConfig = params.cfg.messages?.tts ?? {};
const raw: TtsConfig = resolveEffectiveTtsRawConfig(params.cfg, params.agentId);
const prefsPath = resolveTtsPrefsPathValue(raw.prefsPath);
const sessionAuto = normalizeTtsAutoMode(params.sessionAuto);
if (sessionAuto) {
@@ -387,12 +434,15 @@ function resolveEffectiveTtsAutoState(params: { cfg: OpenClawConfig; sessionAuto
};
}
export function buildTtsSystemPromptHint(cfg: OpenClawConfig): string | undefined {
const { autoMode, prefsPath } = resolveEffectiveTtsAutoState({ cfg });
export function buildTtsSystemPromptHint(
cfg: OpenClawConfig,
agentId?: string,
): string | undefined {
const { autoMode, prefsPath } = resolveEffectiveTtsAutoState({ cfg, agentId });
if (autoMode === "off") {
return undefined;
}
const _config = resolveTtsConfig(cfg);
const _config = resolveTtsConfig(cfg, agentId);
const maxLength = getTtsMaxLength(prefsPath);
const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
const autoHint =
@@ -504,11 +554,12 @@ export function resolveExplicitTtsOverrides(params: {
provider?: string;
modelId?: string;
voiceId?: string;
agentId?: string;
}): TtsDirectiveOverrides {
const providerInput = params.provider?.trim();
const modelId = params.modelId?.trim();
const voiceId = params.voiceId?.trim();
const config = resolveTtsConfig(params.cfg);
const config = resolveTtsConfig(params.cfg, params.agentId);
const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
const selectedProvider =
canonicalizeSpeechProviderId(providerInput, params.cfg) ??
@@ -741,6 +792,7 @@ function resolveTtsRequestSetup(params: {
prefsPath?: string;
providerOverride?: TtsProvider;
disableFallback?: boolean;
agentId?: string;
}):
| {
config: ResolvedTtsConfig;
@@ -749,7 +801,7 @@ function resolveTtsRequestSetup(params: {
| {
error: string;
} {
const config = resolveTtsConfig(params.cfg);
const config = resolveTtsConfig(params.cfg, params.agentId);
const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
if (params.text.length > config.maxTextLength) {
return {
@@ -774,6 +826,7 @@ export async function textToSpeech(params: {
overrides?: TtsDirectiveOverrides;
disableFallback?: boolean;
timeoutMs?: number;
agentId?: string;
}): Promise<TtsResult> {
const synthesis = await synthesizeSpeech(params);
if (!synthesis.success || !synthesis.audioBuffer || !synthesis.fileExtension) {
@@ -819,6 +872,7 @@ export async function synthesizeSpeech(params: {
overrides?: TtsDirectiveOverrides;
disableFallback?: boolean;
timeoutMs?: number;
agentId?: string;
}): Promise<TtsSynthesisResult> {
const setup = resolveTtsRequestSetup({
text: params.text,
@@ -826,6 +880,7 @@ export async function synthesizeSpeech(params: {
prefsPath: params.prefsPath,
providerOverride: params.overrides?.provider,
disableFallback: params.disableFallback,
agentId: params.agentId,
});
if ("error" in setup) {
return { success: false, error: setup.error };
@@ -1064,6 +1119,7 @@ export async function maybeApplyTtsToPayload(params: {
kind?: "tool" | "block" | "final";
inboundAudio?: boolean;
ttsAuto?: string;
agentId?: string;
}): Promise<ReplyPayload> {
if (params.payload.isCompactionNotice) {
return params.payload;
@@ -1071,11 +1127,12 @@ export async function maybeApplyTtsToPayload(params: {
const { autoMode, prefsPath } = resolveEffectiveTtsAutoState({
cfg: params.cfg,
sessionAuto: params.ttsAuto,
agentId: params.agentId,
});
if (autoMode === "off") {
return params.payload;
}
const config = resolveTtsConfig(params.cfg);
const config = resolveTtsConfig(params.cfg, params.agentId);
const activeProvider = getTtsProvider(config, prefsPath);
const reply = resolveSendableOutboundReplyParts(params.payload);
@@ -1183,6 +1240,7 @@ export async function maybeApplyTtsToPayload(params: {
prefsPath,
channel: params.channel,
overrides: directives.overrides,
agentId: params.agentId,
});
if (result.success && result.audioPath) {

View File

@@ -25,6 +25,7 @@ export type ResolvedAgentConfig = {
skills?: AgentEntry["skills"];
memorySearch?: AgentEntry["memorySearch"];
humanDelay?: AgentEntry["humanDelay"];
tts?: AgentEntry["tts"];
contextLimits?: AgentContextLimitsConfig;
heartbeat?: AgentEntry["heartbeat"];
identity?: AgentEntry["identity"];
@@ -123,6 +124,7 @@ export function resolveAgentConfig(
skills: Array.isArray(entry.skills) ? entry.skills : undefined,
memorySearch: entry.memorySearch,
humanDelay: entry.humanDelay,
tts: entry.tts,
contextLimits:
typeof entry.contextLimits === "object" && entry.contextLimits
? { ...agentDefaults?.contextLimits, ...entry.contextLimits }

View File

@@ -65,6 +65,7 @@ describe("resolveAgentConfig", () => {
groupChat: undefined,
subagents: undefined,
sandbox: undefined,
tts: undefined,
tools: undefined,
});
});

View File

@@ -99,7 +99,9 @@ export function buildSystemPrompt(params: {
shell: detectRuntimeShell(),
},
});
const ttsHint = params.config ? buildTtsSystemPromptHint(params.config) : undefined;
const ttsHint = params.config
? buildTtsSystemPromptHint(params.config, params.agentId)
: undefined;
const ownerDisplay = resolveOwnerDisplaySetting(params.config);
return buildAgentSystemPrompt({
workspaceDir: params.workspaceDir,

View File

@@ -722,7 +722,9 @@ export async function compactEmbeddedPiSessionDirect(
cwd: effectiveWorkspace,
moduleUrl: import.meta.url,
});
const ttsHint = params.config ? buildTtsSystemPromptHint(params.config) : undefined;
const ttsHint = params.config
? buildTtsSystemPromptHint(params.config, sessionAgentId)
: undefined;
const ownerDisplay = resolveOwnerDisplaySetting(params.config);
const promptContributionContext: Parameters<
AgentRuntimePlan["prompt"]["resolveSystemPromptContribution"]

View File

@@ -1065,7 +1065,9 @@ export async function runEmbeddedAttempt(
cwd: effectiveWorkspace,
moduleUrl: import.meta.url,
});
const ttsHint = params.config ? buildTtsSystemPromptHint(params.config) : undefined;
const ttsHint = params.config
? buildTtsSystemPromptHint(params.config, sessionAgentId)
: undefined;
const ownerDisplay = resolveOwnerDisplaySetting(params.config);
const heartbeatPrompt = shouldInjectHeartbeatPrompt({
config: params.config,

View File

@@ -146,7 +146,7 @@ export async function resolveCommandsSystemPromptBundle(
},
}
: { enabled: false };
const ttsHint = params.cfg ? buildTtsSystemPromptHint(params.cfg) : undefined;
const ttsHint = params.cfg ? buildTtsSystemPromptHint(params.cfg, sessionAgentId) : undefined;
const systemPrompt = buildAgentSystemPrompt({
workspaceDir,

View File

@@ -88,6 +88,7 @@ async function shouldTreatDeliveredTextAsVisible(params: {
async function maybeApplyAcpTts(params: {
payload: ReplyPayload;
cfg: OpenClawConfig;
agentId?: string;
channel?: string;
kind: ReplyDispatchKind;
inboundAudio: boolean;
@@ -100,6 +101,7 @@ async function maybeApplyAcpTts(params: {
const ttsStatus = resolveStatusTtsSnapshot({
cfg: params.cfg,
sessionAuto: params.ttsAuto,
agentId: params.agentId,
});
if (!ttsStatus) {
return params.payload;
@@ -107,7 +109,7 @@ async function maybeApplyAcpTts(params: {
if (ttsStatus.autoMode === "inbound" && !params.inboundAudio) {
return params.payload;
}
if (params.kind !== "final" && resolveConfiguredTtsMode(params.cfg) === "final") {
if (params.kind !== "final" && resolveConfiguredTtsMode(params.cfg, params.agentId) === "final") {
return params.payload;
}
const { maybeApplyTtsToPayload } = await loadDispatchAcpTtsRuntime();
@@ -118,6 +120,7 @@ async function maybeApplyAcpTts(params: {
kind: params.kind,
inboundAudio: params.inboundAudio,
ttsAuto: params.ttsAuto,
agentId: params.agentId,
});
}
@@ -153,6 +156,7 @@ export type AcpDispatchDeliveryCoordinator = {
export function createAcpDispatchDeliveryCoordinator(params: {
cfg: OpenClawConfig;
agentId?: string;
ctx: FinalizedMsgContext;
dispatcher: ReplyDispatcher;
inboundAudio: boolean;
@@ -294,6 +298,7 @@ export function createAcpDispatchDeliveryCoordinator(params: {
const ttsPayload = await maybeApplyAcpTts({
payload,
cfg: params.cfg,
agentId: params.agentId,
channel: params.ttsChannel,
kind,
inboundAudio: params.inboundAudio,

View File

@@ -186,6 +186,7 @@ async function maybeUnbindStaleBoundConversations(params: {
async function finalizeAcpTurnOutput(params: {
cfg: OpenClawConfig;
sessionKey: string;
agentId: string;
delivery: AcpDispatchDeliveryCoordinator;
inboundAudio: boolean;
sessionTtsAuto?: TtsAutoMode;
@@ -195,12 +196,13 @@ async function finalizeAcpTurnOutput(params: {
await params.delivery.settleVisibleText();
let queuedFinal =
params.delivery.hasDeliveredVisibleText() && !params.delivery.hasFailedVisibleTextDelivery();
const ttsMode = resolveConfiguredTtsMode(params.cfg);
const ttsMode = resolveConfiguredTtsMode(params.cfg, params.agentId);
const accumulatedBlockText = params.delivery.getAccumulatedBlockText();
const hasAccumulatedBlockText = accumulatedBlockText.trim().length > 0;
const ttsStatus = resolveStatusTtsSnapshot({
cfg: params.cfg,
sessionAuto: params.sessionTtsAuto,
agentId: params.agentId,
});
const canAttemptFinalTts =
ttsStatus != null && !(ttsStatus.autoMode === "inbound" && !params.inboundAudio);
@@ -216,6 +218,7 @@ async function finalizeAcpTurnOutput(params: {
kind: "final",
inboundAudio: params.inboundAudio,
ttsAuto: params.sessionTtsAuto,
agentId: params.agentId,
});
if (ttsSyntheticReply.mediaUrl) {
const delivered = await params.delivery.deliver("final", {
@@ -308,10 +311,12 @@ export async function tryDispatchAcpReply(params: {
return null;
}
const canonicalSessionKey = acpResolution.sessionKey;
const acpAgentId = resolveAgentIdFromSessionKey(canonicalSessionKey);
let queuedFinal = false;
const delivery = createAcpDispatchDeliveryCoordinator({
cfg: params.cfg,
agentId: acpAgentId,
ctx: params.ctx,
dispatcher: params.dispatcher,
inboundAudio: params.inboundAudio,
@@ -476,6 +481,7 @@ export async function tryDispatchAcpReply(params: {
(await finalizeAcpTurnOutput({
cfg: params.cfg,
sessionKey: canonicalSessionKey,
agentId: acpAgentId,
delivery,
inboundAudio: params.inboundAudio,
sessionTtsAuto: params.sessionTtsAuto,

View File

@@ -119,7 +119,9 @@ function loadReplyMediaPathsRuntime() {
async function maybeApplyTtsToReplyPayload(
params: Parameters<Awaited<ReturnType<typeof loadTtsRuntime>>["maybeApplyTtsToPayload"]>[0],
) {
if (!shouldAttemptTtsPayload({ cfg: params.cfg, ttsAuto: params.ttsAuto })) {
if (
!shouldAttemptTtsPayload({ cfg: params.cfg, ttsAuto: params.ttsAuto, agentId: params.agentId })
) {
return params.payload;
}
const { maybeApplyTtsToPayload } = await loadTtsRuntime();
@@ -729,6 +731,7 @@ export async function dispatchReplyFromConfig(
kind: "final",
inboundAudio,
ttsAuto: sessionTtsAuto,
agentId: sessionAgentId,
});
const normalizedPayload = await normalizeReplyMediaPayload(ttsPayload);
const result = await routeReplyToOriginating(normalizedPayload);
@@ -996,6 +999,7 @@ export async function dispatchReplyFromConfig(
kind: "tool",
inboundAudio,
ttsAuto: sessionTtsAuto,
agentId: sessionAgentId,
});
const normalizedPayload = await normalizeReplyMediaPayload(ttsPayload);
const deliveryPayload = resolveToolDeliveryPayload(normalizedPayload);
@@ -1097,6 +1101,7 @@ export async function dispatchReplyFromConfig(
kind: "block",
inboundAudio,
ttsAuto: sessionTtsAuto,
agentId: sessionAgentId,
});
const normalizedPayload = await normalizeReplyMediaPayload(ttsPayload);
if (shouldRouteToOriginating) {
@@ -1167,7 +1172,7 @@ export async function dispatchReplyFromConfig(
routedFinalCount += finalReply.routedFinalCount;
}
const ttsMode = resolveConfiguredTtsMode(cfg);
const ttsMode = resolveConfiguredTtsMode(cfg, sessionAgentId);
// Generate TTS-only reply after block streaming completes (when there's no final reply).
// This handles the case where block streaming succeeds and drops final payloads,
// but we still want TTS audio to be generated from the accumulated block content.
@@ -1185,6 +1190,7 @@ export async function dispatchReplyFromConfig(
kind: "final",
inboundAudio,
ttsAuto: sessionTtsAuto,
agentId: sessionAgentId,
});
// Only send if TTS was actually applied (mediaUrl exists)
if (ttsSyntheticReply.mediaUrl) {

View File

@@ -27,6 +27,7 @@ const STATIC_AGENT_RUNTIME_BASE_TARGET_IDS = [
...STATIC_MODEL_TARGET_IDS,
"agents.defaults.memorySearch.remote.apiKey",
"agents.list[].memorySearch.remote.apiKey",
"agents.list[].tts.providers.*.apiKey",
"messages.tts.providers.*.apiKey",
"skills.entries.*.apiKey",
"tools.web.search.apiKey",

View File

@@ -6531,6 +6531,177 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
},
additionalProperties: false,
},
tts: {
type: "object",
properties: {
auto: {
type: "string",
enum: ["off", "always", "inbound", "tagged"],
},
enabled: {
type: "boolean",
},
mode: {
type: "string",
enum: ["final", "all"],
},
provider: {
type: "string",
minLength: 1,
},
summaryModel: {
type: "string",
},
modelOverrides: {
type: "object",
properties: {
enabled: {
type: "boolean",
},
allowText: {
type: "boolean",
},
allowProvider: {
type: "boolean",
},
allowVoice: {
type: "boolean",
},
allowModelId: {
type: "boolean",
},
allowVoiceSettings: {
type: "boolean",
},
allowNormalization: {
type: "boolean",
},
allowSeed: {
type: "boolean",
},
},
additionalProperties: false,
},
providers: {
type: "object",
propertyNames: {
type: "string",
},
additionalProperties: {
type: "object",
properties: {
apiKey: {
anyOf: [
{
type: "string",
},
{
oneOf: [
{
type: "object",
properties: {
source: {
type: "string",
const: "env",
},
provider: {
type: "string",
pattern: "^[a-z][a-z0-9_-]{0,63}$",
},
id: {
type: "string",
pattern: "^[A-Z][A-Z0-9_]{0,127}$",
},
},
required: ["source", "provider", "id"],
additionalProperties: false,
},
{
type: "object",
properties: {
source: {
type: "string",
const: "file",
},
provider: {
type: "string",
pattern: "^[a-z][a-z0-9_-]{0,63}$",
},
id: {
type: "string",
},
},
required: ["source", "provider", "id"],
additionalProperties: false,
},
{
type: "object",
properties: {
source: {
type: "string",
const: "exec",
},
provider: {
type: "string",
pattern: "^[a-z][a-z0-9_-]{0,63}$",
},
id: {
type: "string",
},
},
required: ["source", "provider", "id"],
additionalProperties: false,
},
],
},
],
},
},
additionalProperties: {
anyOf: [
{
type: "string",
},
{
type: "number",
},
{
type: "boolean",
},
{
type: "null",
},
{
type: "array",
items: {},
},
{
type: "object",
propertyNames: {
type: "string",
},
additionalProperties: {},
},
],
},
},
},
prefsPath: {
type: "string",
},
maxTextLength: {
type: "integer",
minimum: 1,
maximum: 9007199254740991,
},
timeoutMs: {
type: "integer",
minimum: 1000,
maximum: 120000,
},
},
additionalProperties: false,
},
skillsLimits: {
type: "object",
properties: {
@@ -27586,6 +27757,10 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
sensitive: true,
tags: ["security", "auth"],
},
"agents.list[].tts.providers.*.apiKey": {
sensitive: true,
tags: ["security", "auth", "media"],
},
"agents.list[].sandbox.ssh.identityData": {
sensitive: true,
tags: ["security", "storage"],

View File

@@ -13,6 +13,7 @@ import type { DmScope, HumanDelayConfig, IdentityConfig } from "./types.base.js"
import type { GroupChatConfig } from "./types.messages.js";
import type { SkillsLimitsConfig } from "./types.skills.js";
import type { AgentToolsConfig, MemorySearchConfig } from "./types.tools.js";
import type { TtsConfig } from "./types.tts.js";
export type AgentRuntimeAcpConfig = {
/** ACP harness adapter id (for example codex, claude). */
@@ -95,6 +96,8 @@ export type AgentConfig = {
memorySearch?: MemorySearchConfig;
/** Human-like delay between block replies for this agent. */
humanDelay?: HumanDelayConfig;
/** Optional per-agent TTS overrides, deep-merged over messages.tts. */
tts?: TtsConfig;
/** Optional per-agent skills subsystem overrides. */
skillsLimits?: Pick<SkillsLimitsConfig, "maxSkillsPromptChars">;
/** Optional per-agent overrides for selected context/token-heavy limits. */

View File

@@ -140,6 +140,25 @@ describe("agent defaults schema", () => {
expect(agent.heartbeat?.timeoutSeconds).toBe(45);
});
// AgentEntrySchema must accept a `tts` block on an agent entry and hand back
// the provider and the per-provider voice unchanged.
it("accepts per-agent TTS overrides", () => {
const agent = AgentEntrySchema.parse({
id: "reader",
tts: {
provider: "openai",
auto: "always",
providers: {
openai: {
voice: "nova",
apiKey: "${OPENAI_API_KEY}",
},
},
},
});
expect(agent.tts?.provider).toBe("openai");
expect(agent.tts?.providers?.openai?.voice).toBe("nova");
});
it("rejects zero heartbeat timeoutSeconds", () => {
expect(() => AgentDefaultsSchema.parse({ heartbeat: { timeoutSeconds: 0 } })).toThrow();
expect(() => AgentEntrySchema.parse({ id: "ops", heartbeat: { timeoutSeconds: 0 } })).toThrow();

View File

@@ -13,6 +13,7 @@ import {
SecretInputSchema,
ToolsLinksSchema,
ToolsMediaSchema,
TtsConfigSchema,
} from "./zod-schema.core.js";
import { sensitive } from "./zod-schema.sensitive.js";
@@ -828,6 +829,7 @@ export const AgentEntrySchema = z
skills: z.array(z.string()).optional(),
memorySearch: MemorySearchSchema,
humanDelay: HumanDelaySchema.optional(),
tts: TtsConfigSchema,
skillsLimits: AgentSkillsLimitsSchema,
contextLimits: AgentContextLimitsSchema,
contextTokens: z.number().int().positive().optional(),

View File

@@ -62,6 +62,7 @@ export type ResolveExplicitTtsOverridesParams = {
provider?: string;
modelId?: string;
voiceId?: string;
agentId?: string;
};
export type TtsRequestParams = {
@@ -72,6 +73,7 @@ export type TtsRequestParams = {
overrides?: TtsDirectiveOverrides;
disableFallback?: boolean;
timeoutMs?: number;
agentId?: string;
};
export type TtsTelephonyRequestParams = {
@@ -95,6 +97,7 @@ export type MaybeApplyTtsToPayloadParams = {
kind?: "tool" | "block" | "final";
inboundAudio?: boolean;
ttsAuto?: string;
agentId?: string;
};
export type TtsTestFacade = {
@@ -168,7 +171,7 @@ export type ListSpeechVoices = (params: ListSpeechVoicesParams) => Promise<Speec
export type TtsRuntimeFacade = {
_test: TtsTestFacade;
buildTtsSystemPromptHint: (cfg: OpenClawConfig) => string | undefined;
buildTtsSystemPromptHint: (cfg: OpenClawConfig, agentId?: string) => string | undefined;
getLastTtsAttempt: () => TtsStatusEntry | undefined;
getResolvedSpeechProviderConfig: (
config: ResolvedTtsConfig,
@@ -188,7 +191,7 @@ export type TtsRuntimeFacade = {
maybeApplyTtsToPayload: (params: MaybeApplyTtsToPayloadParams) => Promise<ReplyPayload>;
resolveExplicitTtsOverrides: (params: ResolveExplicitTtsOverridesParams) => TtsDirectiveOverrides;
resolveTtsAutoMode: (params: ResolveTtsAutoModeParams) => TtsAutoMode;
resolveTtsConfig: (cfg: OpenClawConfig) => ResolvedTtsConfig;
resolveTtsConfig: (cfg: OpenClawConfig, agentId?: string) => ResolvedTtsConfig;
resolveTtsPrefsPath: (config: ResolvedTtsConfig) => string;
resolveTtsProviderOrder: (primary: TtsProvider, cfg?: OpenClawConfig) => TtsProvider[];
setLastTtsAttempt: (entry: TtsStatusEntry | undefined) => void;

View File

@@ -506,6 +506,29 @@ function collectMessagesTtsAssignments(params: {
});
}
/**
 * Registers TTS API-key assignments for every agent entry that has a `tts`
 * record, by calling `collectTtsApiKeyAssignments` with the path prefix
 * `agents.list.<index>.tts`.
 *
 * Does nothing when `config.agents.list` is not an array. Entries that are not
 * records, or whose `tts` is not a record, are skipped.
 */
function collectAgentTtsAssignments(params: {
  config: OpenClawConfig;
  defaults: SecretDefaults | undefined;
  context: ResolverContext;
}): void {
  const agentsSection = params.config.agents as Record<string, unknown> | undefined;
  const agentEntries = agentsSection?.list;
  if (!Array.isArray(agentEntries)) {
    return;
  }
  for (let position = 0; position < agentEntries.length; position += 1) {
    const agentEntry = agentEntries[position];
    if (isRecord(agentEntry) && isRecord(agentEntry.tts)) {
      collectTtsApiKeyAssignments({
        tts: agentEntry.tts,
        // The list index is part of the path so each agent's key gets its own entry.
        pathPrefix: `agents.list.${position}.tts`,
        defaults: params.defaults,
        context: params.context,
      });
    }
  }
}
function collectCronAssignments(params: {
config: OpenClawConfig;
defaults: SecretDefaults | undefined;
@@ -640,6 +663,7 @@ export function collectCoreConfigAssignments(params: {
collectGatewayAssignments(params);
collectSandboxSshAssignments(params);
collectMessagesTtsAssignments(params);
collectAgentTtsAssignments(params);
collectCronAssignments(params);
collectMediaRequestAssignments(params);
}

View File

@@ -204,6 +204,18 @@ const CORE_SECRET_TARGET_REGISTRY: SecretTargetRegistryEntry[] = [
includeInAudit: true,
providerIdPathSegmentIndex: 3,
},
{
id: "agents.list[].tts.providers.*.apiKey",
targetType: "agents.list[].tts.providers.*.apiKey",
configFile: "openclaw.json",
pathPattern: "agents.list[].tts.providers.*.apiKey",
secretShape: SECRET_INPUT_SHAPE,
expectedResolvedValue: "string",
includeInPlan: true,
includeInConfigure: false,
includeInAudit: true,
providerIdPathSegmentIndex: 4,
},
{
id: "models.providers.*.apiKey",
targetType: "models.providers.apiKey",

View File

@@ -451,6 +451,7 @@ const formatMediaUnderstandingLine = (decisions?: ReadonlyArray<MediaUnderstandi
const formatVoiceModeLine = (
config?: OpenClawConfig,
sessionEntry?: SessionEntry,
agentId?: string,
): string | null => {
if (!config) {
return null;
@@ -458,6 +459,7 @@ const formatVoiceModeLine = (
const snapshot = resolveStatusTtsSnapshot({
cfg: config,
sessionAuto: sessionEntry?.ttsAuto,
agentId,
});
if (!snapshot) {
return null;
@@ -890,7 +892,7 @@ export function buildStatusMessage(args: StatusArgs): string {
const usageCostLine =
usagePair && costLine ? `${usagePair} · ${costLine}` : (usagePair ?? costLine);
const mediaLine = formatMediaUnderstandingLine(args.mediaDecisions);
const voiceLine = formatVoiceModeLine(args.config, args.sessionEntry);
const voiceLine = formatVoiceModeLine(args.config, args.sessionEntry, args.agentId);
return [
versionLine,

View File

@@ -104,6 +104,40 @@ describe("resolveStatusTtsSnapshot", () => {
});
});
// With agentId "reader", the snapshot must reflect the agent's auto mode and
// provider rather than the global messages.tts values; maxLength and summarize
// are expected to come back as 1500 and true.
it("reports per-agent TTS overrides", async () => {
await withStatusTempHome(async () => {
expect(
resolveStatusTtsSnapshot({
cfg: {
messages: {
tts: {
auto: "off",
provider: "openai",
},
},
agents: {
list: [
{
id: "reader",
tts: {
auto: "always",
provider: "elevenlabs",
},
},
],
},
} as OpenClawConfig,
agentId: "reader",
}),
).toEqual({
autoMode: "always",
provider: "elevenlabs",
maxLength: 1500,
summarize: true,
});
});
});
it("derives the default prefs path from OPENCLAW_CONFIG_PATH when set", async () => {
await withStatusTempHome(async (home) => {
const stateDir = path.join(home, ".openclaw-dev");

View File

@@ -8,6 +8,7 @@ import {
} from "../shared/string-coerce.js";
import { resolveConfigDir, resolveUserPath } from "../utils.js";
import { normalizeTtsAutoMode } from "./tts-auto-mode.js";
import { resolveEffectiveTtsConfig } from "./tts-config.js";
const DEFAULT_TTS_MAX_LENGTH = 1500;
const DEFAULT_TTS_SUMMARIZE = true;
@@ -80,8 +81,9 @@ function resolveTtsAutoModeFromPrefs(prefs: TtsUserPrefs): TtsAutoMode | undefin
export function resolveStatusTtsSnapshot(params: {
cfg: OpenClawConfig;
sessionAuto?: string;
agentId?: string;
}): TtsStatusSnapshot | null {
const raw: TtsConfig = params.cfg.messages?.tts ?? {};
const raw: TtsConfig = resolveEffectiveTtsConfig(params.cfg, params.agentId);
const prefsPath = resolveTtsPrefsPathValue(raw.prefsPath);
const prefs = readPrefs(prefsPath);
const autoMode =

View File

@@ -3,7 +3,7 @@ import { tmpdir } from "node:os";
import path from "node:path";
import { afterAll, beforeAll, afterEach, beforeEach, describe, expect, it } from "vitest";
import type { OpenClawConfig } from "../config/config.js";
import { shouldAttemptTtsPayload } from "./tts-config.js";
import { resolveConfiguredTtsMode, shouldAttemptTtsPayload } from "./tts-config.js";
describe("shouldAttemptTtsPayload", () => {
let originalPrefsPath: string | undefined;
@@ -61,4 +61,31 @@ describe("shouldAttemptTtsPayload", () => {
shouldAttemptTtsPayload({ cfg: { messages: { tts: { enabled: true } } } as OpenClawConfig }),
).toBe(false);
});
// Checks that both `shouldAttemptTtsPayload` and `resolveConfiguredTtsMode` read the
// per-agent `tts` block when an agent id is supplied, and fall back to `messages.tts`
// for an agent id that has no entry in `agents.list`.
it("uses per-agent TTS auto and mode overrides", () => {
const cfg = {
messages: {
tts: {
auto: "off",
mode: "final",
},
},
agents: {
list: [
{
id: "voice",
tts: {
auto: "always",
mode: "all",
},
},
],
},
} as OpenClawConfig;
// "voice" has an override entry: its auto/mode win over the global values.
expect(shouldAttemptTtsPayload({ cfg, agentId: "voice" })).toBe(true);
expect(resolveConfiguredTtsMode(cfg, "voice")).toBe("all");
// "main" is not listed in agents.list, so the global messages.tts values apply.
expect(shouldAttemptTtsPayload({ cfg, agentId: "main" })).toBe(false);
expect(resolveConfiguredTtsMode(cfg, "main")).toBe("final");
});
});

View File

@@ -1,13 +1,54 @@
import { existsSync, readFileSync } from "node:fs";
import path from "node:path";
import type { OpenClawConfig } from "../config/types.js";
import type { TtsAutoMode, TtsMode } from "../config/types.tts.js";
import type { TtsAutoMode, TtsConfig, TtsMode } from "../config/types.tts.js";
import { normalizeAgentId } from "../routing/session-key.js";
import { resolveConfigDir, resolveUserPath } from "../utils.js";
import { normalizeTtsAutoMode } from "./tts-auto-mode.js";
export { normalizeTtsAutoMode } from "./tts-auto-mode.js";
export function resolveConfiguredTtsMode(cfg: OpenClawConfig): TtsMode {
return cfg.messages?.tts?.mode ?? "final";
/**
 * Keys that are never copied out of an override object, so that a crafted config
 * cannot introduce prototype-affecting properties into the merged result.
 */
const BLOCKED_MERGE_KEYS = new Set(["__proto__", "prototype", "constructor"]);

/**
 * Narrows `value` to a non-null, non-array object.
 *
 * This is a structural check only: any object that is not an array passes,
 * including class instances.
 */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  return Boolean(value) && typeof value === "object" && !Array.isArray(value);
}

/**
 * Recursively layers `override` on top of `base`.
 *
 * - An `undefined` override leaves `base` untouched (the same reference is returned).
 * - A non-object override (primitive, `null`, array) replaces `base` wholesale.
 * - An object override is merged key by key into a shallow copy of `base`
 *   (or into a fresh object when `base` is not an object). Keys listed in
 *   `BLOCKED_MERGE_KEYS` and keys whose value is `undefined` are skipped at
 *   every depth of the override, including subtrees that have no counterpart
 *   in `base`.
 *
 * Neither argument is mutated, and no object from `override` is aliased into
 * the result.
 *
 * @param base - The lower-priority value.
 * @param override - The higher-priority value.
 * @returns The merged value.
 */
function deepMergeDefined(base: unknown, override: unknown): unknown {
  if (!isPlainObject(override)) {
    return override === undefined ? base : override;
  }
  // Always build a new object for an object override: this is what guarantees that
  // blocked keys and undefined values are filtered even when `base` has nothing to
  // merge with (previously such subtrees were passed through by reference, unfiltered).
  const result: Record<string, unknown> = isPlainObject(base) ? { ...base } : {};
  for (const [key, value] of Object.entries(override)) {
    if (BLOCKED_MERGE_KEYS.has(key) || value === undefined) {
      continue;
    }
    // Own-property check: inherited members such as `toString` must not be treated
    // as an existing base value.
    const existing = Object.prototype.hasOwnProperty.call(result, key) ? result[key] : undefined;
    result[key] = deepMergeDefined(existing, value);
  }
  return result;
}
/**
 * Looks up the `tts` block of the agent entry in `cfg.agents.list` whose id matches
 * `agentId`. Both ids are passed through `normalizeAgentId` before being compared.
 *
 * @param cfg - The loaded configuration.
 * @param agentId - The agent to look up; a missing or empty id yields `undefined`.
 * @returns The first matching agent's `tts` config, or `undefined` when there is no
 *   agent list, no matching entry, or the entry has no `tts` block.
 */
function resolveAgentTtsOverride(
  cfg: OpenClawConfig,
  agentId: string | undefined,
): TtsConfig | undefined {
  if (!agentId) {
    return undefined;
  }
  const agents = cfg.agents?.list;
  if (!Array.isArray(agents)) {
    return undefined;
  }
  const wantedId = normalizeAgentId(agentId);
  for (const candidate of agents) {
    if (normalizeAgentId(candidate.id) === wantedId) {
      return candidate.tts;
    }
  }
  return undefined;
}
/**
 * Computes the TTS config that applies to a given agent: the global `messages.tts`
 * block with the agent's own `tts` block (if any) deep-merged on top of it.
 *
 * @param cfg - The loaded configuration.
 * @param agentId - Optional agent whose overrides should be applied.
 * @returns The merged TTS config; an empty object when nothing is configured.
 */
export function resolveEffectiveTtsConfig(cfg: OpenClawConfig, agentId?: string): TtsConfig {
  const globalTts = cfg.messages?.tts ?? {};
  const agentTts = resolveAgentTtsOverride(cfg, agentId) ?? {};
  const effective = deepMergeDefined(globalTts, agentTts);
  return effective as TtsConfig;
}
/**
 * Returns the TTS mode from the effective (global plus per-agent) TTS config.
 *
 * @param cfg - The loaded configuration.
 * @param agentId - Optional agent whose overrides should be applied.
 * @returns The configured mode, or `"final"` when none is set.
 */
export function resolveConfiguredTtsMode(cfg: OpenClawConfig, agentId?: string): TtsMode {
  const { mode } = resolveEffectiveTtsConfig(cfg, agentId);
  return mode ?? "final";
}
function resolveTtsPrefsPathValue(prefsPath: string | undefined): string {
@@ -45,13 +86,14 @@ function readTtsPrefsAutoMode(prefsPath: string): TtsAutoMode | undefined {
export function shouldAttemptTtsPayload(params: {
cfg: OpenClawConfig;
ttsAuto?: string;
agentId?: string;
}): boolean {
const sessionAuto = normalizeTtsAutoMode(params.ttsAuto);
if (sessionAuto) {
return sessionAuto !== "off";
}
const raw = params.cfg.messages?.tts;
const raw = resolveEffectiveTtsConfig(params.cfg, params.agentId);
const prefsAuto = readTtsPrefsAutoMode(resolveTtsPrefsPathValue(raw?.prefsPath));
if (prefsAuto) {
return prefsAuto !== "off";