diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md index c3f8cb72fab..e3a46d67795 100644 --- a/docs/gateway/configuration-reference.md +++ b/docs/gateway/configuration-reference.md @@ -1342,6 +1342,7 @@ Batches rapid text-only messages from the same sender into a single agent turn. - `auto` controls auto-TTS. `/tts off|always|inbound|tagged` overrides per session. - `summaryModel` overrides `agents.defaults.model.primary` for auto-summary. +- `modelOverrides` is enabled by default; `modelOverrides.allowProvider` defaults to `false` (opt-in). - API keys fall back to `ELEVENLABS_API_KEY`/`XI_API_KEY` and `OPENAI_API_KEY`. --- diff --git a/docs/tts.md b/docs/tts.md index c52a1546cbd..24ca527e13a 100644 --- a/docs/tts.md +++ b/docs/tts.md @@ -210,6 +210,7 @@ Then run: - `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`. - Accepts `provider/model` or a configured model alias. - `modelOverrides`: allow the model to emit TTS directives (on by default). + - `allowProvider` defaults to `false` (provider switching is opt-in). - `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded. - `timeoutMs`: request timeout (ms). - `prefsPath`: override the local prefs JSON path (provider/limit/summary). @@ -242,18 +243,20 @@ for a single reply, plus an optional `[[tts:text]]...[[/tts:text]]` block to provide expressive tags (laughter, singing cues, etc) that should only appear in the audio. +`provider=...` directives are ignored unless `modelOverrides.allowProvider: true`. + Example reply payload: ``` Here you go. -[[tts:provider=elevenlabs voiceId=pMsXgVXv3BLzUgSXRplE model=eleven_v3 speed=1.1]] +[[tts:voiceId=pMsXgVXv3BLzUgSXRplE model=eleven_v3 speed=1.1]] [[tts:text]](laughs) Read the song once more.[[/tts:text]] ``` Available directive keys (when enabled): -- `provider` (`openai` | `elevenlabs` | `edge`) +- `provider` (`openai` | `elevenlabs` | `edge`, requires `allowProvider: true`) - `voice` (OpenAI voice) or `voiceId` (ElevenLabs) - `model` (OpenAI TTS model or ElevenLabs model id) - `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost` @@ -275,7 +278,7 @@ Disable all model overrides: } ``` -Optional allowlist (disable specific overrides while keeping tags enabled): +Optional allowlist (enable provider switching while keeping other knobs configurable): ```json5 { @@ -283,7 +286,7 @@ Optional allowlist (disable specific overrides while keeping tags enabled): tts: { modelOverrides: { enabled: true, - allowProvider: false, + allowProvider: true, allowSeed: false, }, }, diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index 4eb4989b98c..82875d55e4a 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -9,7 +9,7 @@ export type TtsModelOverrideConfig = { enabled?: boolean; /** Allow model-provided TTS text blocks. */ allowText?: boolean; - /** Allow model-provided provider override. */ + /** Allow model-provided provider override (default: false). */ allowProvider?: boolean; /** Allow model-provided voice/voiceId override. */ allowVoice?: boolean; diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts index 1c5ecfb558d..09dc90e642c 100644 --- a/src/tts/tts.test.ts +++ b/src/tts/tts.test.ts @@ -215,7 +215,7 @@ describe("tts", () => { describe("parseTtsDirectives", () => { it("extracts overrides and strips directives when enabled", () => { - const policy = resolveModelOverridePolicy({ enabled: true }); + const policy = resolveModelOverridePolicy({ enabled: true, allowProvider: true }); const input = "Hello [[tts:provider=elevenlabs voiceId=pMsXgVXv3BLzUgSXRplE stability=0.4 speed=1.1]] world\n\n" + "[[tts:text]](laughs) Read the song once more.[[/tts:text]]"; @@ -230,13 +230,22 @@ describe("tts", () => { }); it("accepts edge as provider override", () => { - const policy = resolveModelOverridePolicy({ enabled: true }); + const policy = resolveModelOverridePolicy({ enabled: true, allowProvider: true }); const input = "Hello [[tts:provider=edge]] world"; const result = parseTtsDirectives(input, policy); expect(result.overrides.provider).toBe("edge"); }); + it("rejects provider override by default while keeping voice overrides enabled", () => { + const policy = resolveModelOverridePolicy({ enabled: true }); + const input = "Hello [[tts:provider=edge voice=alloy]] world"; + const result = parseTtsDirectives(input, policy); + + expect(result.overrides.provider).toBeUndefined(); + expect(result.overrides.openai?.voice).toBe("alloy"); + }); + it("keeps text intact when overrides are disabled", () => { const policy = resolveModelOverridePolicy({ enabled: false }); const input = "Hello [[tts:voice=alloy]] world"; diff --git a/src/tts/tts.ts b/src/tts/tts.ts index fb27eddd2d6..3130cf396b8 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -238,11 +238,12 @@ function resolveModelOverridePolicy( allowSeed: false, }; } - const allow = (value?: boolean) => value ?? true; + const allow = (value: boolean | undefined, defaultValue = true) => value ?? defaultValue; return { enabled: true, allowText: allow(overrides?.allowText), - allowProvider: allow(overrides?.allowProvider), + // Provider switching is higher-impact than voice/style tweaks; keep opt-in. + allowProvider: allow(overrides?.allowProvider, false), allowVoice: allow(overrides?.allowVoice), allowModelId: allow(overrides?.allowModelId), allowVoiceSettings: allow(overrides?.allowVoiceSettings),