feat(minimax): add native TTS speech provider (T2A v2)

Add MiniMax as a fourth TTS provider alongside OpenAI, ElevenLabs, and
Microsoft. Registers a SpeechProviderPlugin in the existing minimax
extension with config resolution, directive parsing, and Talk Mode
support. The T2A v2 API returns the audio as a hex-encoded string, which
is decoded into an MP3 buffer.

Closes #52720

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
gnuduncan
2026-03-27 16:24:56 +01:00
committed by Peter Steinberger
parent 49d962a82f
commit 7d7f5d85b4
6 changed files with 699 additions and 7 deletions

View File

@@ -26,6 +26,7 @@ import {
import type { MiniMaxRegion } from "./oauth.js";
import { applyMinimaxApiConfig, applyMinimaxApiConfigCn } from "./onboard.js";
import { buildMinimaxPortalProvider, buildMinimaxProvider } from "./provider-catalog.js";
import { buildMinimaxSpeechProvider } from "./speech-provider.js";
const API_PROVIDER_ID = "minimax";
const PORTAL_PROVIDER_ID = "minimax-portal";
@@ -303,5 +304,6 @@ export default definePluginEntry({
});
api.registerImageGenerationProvider(buildMinimaxImageGenerationProvider());
api.registerImageGenerationProvider(buildMinimaxPortalImageGenerationProvider());
api.registerSpeechProvider(buildMinimaxSpeechProvider());
},
});

View File

@@ -61,6 +61,7 @@
}
],
"contracts": {
"speechProviders": ["minimax"],
"mediaUnderstandingProviders": ["minimax", "minimax-portal"],
"imageGenerationProviders": ["minimax", "minimax-portal"]
},

View File

@@ -0,0 +1,318 @@
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { buildMinimaxSpeechProvider } from "./speech-provider.js";
describe("buildMinimaxSpeechProvider", () => {
const provider = buildMinimaxSpeechProvider();
// Static plugin metadata: identity, auto-select ordering, and the advertised
// model/voice lists.
describe("metadata", () => {
it("has correct id and label", () => {
expect(provider.id).toBe("minimax");
expect(provider.label).toBe("MiniMax");
});
it("has autoSelectOrder 40", () => {
expect(provider.autoSelectOrder).toBe(40);
});
it("exposes models and voices", () => {
expect(provider.models).toContain("speech-2.8-hd");
expect(provider.voices).toContain("English_expressive_narrator");
});
});
// isConfigured is true when an API key is available either in the provider
// config or in the MINIMAX_API_KEY environment variable.
describe("isConfigured", () => {
// Snapshot of the environment taken when the suite is defined; restored after
// every test so env mutations do not leak between tests.
const savedEnv = { ...process.env };
afterEach(() => {
process.env = { ...savedEnv };
});
it("returns true when apiKey is in provider config", () => {
expect(
provider.isConfigured({ providerConfig: { apiKey: "sk-test" }, timeoutMs: 30000 }),
).toBe(true);
});
it("returns false when no apiKey anywhere", () => {
// Remove the env fallback so neither source provides a key.
delete process.env.MINIMAX_API_KEY;
expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30000 })).toBe(false);
});
it("returns true when MINIMAX_API_KEY env var is set", () => {
process.env.MINIMAX_API_KEY = "sk-env";
expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30000 })).toBe(true);
});
});
// resolveConfig precedence: explicit `providers.minimax` values first, then
// MINIMAX_* environment variables, then built-in defaults.
describe("resolveConfig", () => {
// Snapshot of the environment taken when the suite is defined; restored after
// every test so env mutations do not leak between tests.
const savedEnv = { ...process.env };
afterEach(() => {
process.env = { ...savedEnv };
});
it("returns defaults when rawConfig is empty", () => {
// Clear every env fallback so only the built-in defaults remain.
delete process.env.MINIMAX_API_HOST;
delete process.env.MINIMAX_TTS_MODEL;
delete process.env.MINIMAX_TTS_VOICE_ID;
const config = provider.resolveConfig!({ rawConfig: {}, cfg: {} as never, timeoutMs: 30000 });
expect(config.baseUrl).toBe("https://api.minimaxi.com");
expect(config.model).toBe("speech-2.8-hd");
expect(config.voiceId).toBe("English_expressive_narrator");
});
it("reads from providers.minimax in rawConfig", () => {
const config = provider.resolveConfig!({
rawConfig: {
providers: {
minimax: {
baseUrl: "https://custom.api.com",
model: "speech-01-240228",
voiceId: "Chinese (Mandarin)_Warm_Girl",
speed: 1.5,
vol: 2.0,
pitch: 3,
},
},
},
cfg: {} as never,
timeoutMs: 30000,
});
expect(config.baseUrl).toBe("https://custom.api.com");
expect(config.model).toBe("speech-01-240228");
expect(config.voiceId).toBe("Chinese (Mandarin)_Warm_Girl");
expect(config.speed).toBe(1.5);
expect(config.vol).toBe(2.0);
expect(config.pitch).toBe(3);
});
it("reads from env vars as fallback", () => {
// With an empty rawConfig, the MINIMAX_* variables supply the values.
process.env.MINIMAX_API_HOST = "https://env.api.com";
process.env.MINIMAX_TTS_MODEL = "speech-01-240228";
process.env.MINIMAX_TTS_VOICE_ID = "Chinese (Mandarin)_Gentle_Boy";
const config = provider.resolveConfig!({ rawConfig: {}, cfg: {} as never, timeoutMs: 30000 });
expect(config.baseUrl).toBe("https://env.api.com");
expect(config.model).toBe("speech-01-240228");
expect(config.voiceId).toBe("Chinese (Mandarin)_Gentle_Boy");
});
});
// parseDirectiveToken maps inline directive keys to provider overrides,
// validating numeric ranges and honoring the policy's allow* flags.
describe("parseDirectiveToken", () => {
// Fully permissive policy; individual tests flip single flags off.
const policy = {
enabled: true,
allowText: true,
allowProvider: true,
allowVoice: true,
allowModelId: true,
allowVoiceSettings: true,
allowNormalization: true,
allowSeed: true,
};
it("handles voice key", () => {
const result = provider.parseDirectiveToken!({
key: "voice",
value: "Chinese (Mandarin)_Warm_Girl",
policy,
});
expect(result.handled).toBe(true);
expect(result.overrides?.voiceId).toBe("Chinese (Mandarin)_Warm_Girl");
});
it("handles voiceid key", () => {
const result = provider.parseDirectiveToken!({ key: "voiceid", value: "test_voice", policy });
expect(result.handled).toBe(true);
expect(result.overrides?.voiceId).toBe("test_voice");
});
it("handles model key", () => {
const result = provider.parseDirectiveToken!({
key: "model",
value: "speech-01-240228",
policy,
});
expect(result.handled).toBe(true);
expect(result.overrides?.model).toBe("speech-01-240228");
});
it("handles speed key with valid value", () => {
const result = provider.parseDirectiveToken!({ key: "speed", value: "1.5", policy });
expect(result.handled).toBe(true);
expect(result.overrides?.speed).toBe(1.5);
});
it("warns on invalid speed", () => {
// 5.0 is above the accepted 0.5-2.0 range: handled, warned, no override.
const result = provider.parseDirectiveToken!({ key: "speed", value: "5.0", policy });
expect(result.handled).toBe(true);
expect(result.warnings).toHaveLength(1);
expect(result.overrides).toBeUndefined();
});
it("handles vol key", () => {
const result = provider.parseDirectiveToken!({ key: "vol", value: "3", policy });
expect(result.handled).toBe(true);
expect(result.overrides?.vol).toBe(3);
});
it("warns on vol=0 (exclusive minimum)", () => {
const result = provider.parseDirectiveToken!({ key: "vol", value: "0", policy });
expect(result.handled).toBe(true);
expect(result.warnings).toHaveLength(1);
});
it("handles volume alias", () => {
const result = provider.parseDirectiveToken!({ key: "volume", value: "5", policy });
expect(result.handled).toBe(true);
expect(result.overrides?.vol).toBe(5);
});
it("handles pitch key", () => {
const result = provider.parseDirectiveToken!({ key: "pitch", value: "-3", policy });
expect(result.handled).toBe(true);
expect(result.overrides?.pitch).toBe(-3);
});
it("warns on out-of-range pitch", () => {
// 20 is outside the accepted -12..12 range.
const result = provider.parseDirectiveToken!({ key: "pitch", value: "20", policy });
expect(result.handled).toBe(true);
expect(result.warnings).toHaveLength(1);
});
it("returns handled=false for unknown keys", () => {
const result = provider.parseDirectiveToken!({
key: "unknown_key",
value: "whatever",
policy,
});
expect(result.handled).toBe(false);
});
it("suppresses voice when policy disallows it", () => {
// The token is still consumed (handled) but produces no override.
const result = provider.parseDirectiveToken!({
key: "voice",
value: "test",
policy: { ...policy, allowVoice: false },
});
expect(result.handled).toBe(true);
expect(result.overrides).toBeUndefined();
});
it("suppresses model when policy disallows it", () => {
// The token is still consumed (handled) but produces no override.
const result = provider.parseDirectiveToken!({
key: "model",
value: "test",
policy: { ...policy, allowModelId: false },
});
expect(result.handled).toBe(true);
expect(result.overrides).toBeUndefined();
});
});
// synthesize: request shape, hex decoding, override precedence, and error paths.
describe("synthesize", () => {
  // The provider falls back to these env vars when the provider config omits a
  // value. They are cleared for every test so the default-model/voice
  // assertions do not depend on the developer's or CI's environment.
  const isolatedEnvKeys = [
    "MINIMAX_API_KEY",
    "MINIMAX_API_HOST",
    "MINIMAX_TTS_MODEL",
    "MINIMAX_TTS_VOICE_ID",
  ] as const;
  const savedEnvValues = new Map<string, string | undefined>();
  const savedFetch = globalThis.fetch;

  beforeEach(() => {
    for (const key of isolatedEnvKeys) {
      savedEnvValues.set(key, process.env[key]);
      delete process.env[key];
    }
    vi.stubGlobal("fetch", vi.fn());
  });

  afterEach(() => {
    // Restore each variable exactly: unset ones stay unset instead of becoming
    // the string "undefined".
    for (const key of isolatedEnvKeys) {
      const previous = savedEnvValues.get(key);
      if (previous === undefined) {
        delete process.env[key];
      } else {
        process.env[key] = previous;
      }
    }
    savedEnvValues.clear();
    globalThis.fetch = savedFetch;
    vi.restoreAllMocks();
  });

  it("makes correct API call and decodes hex response", async () => {
    const hexAudio = Buffer.from("fake-audio-data").toString("hex");
    const mockFetch = vi.mocked(globalThis.fetch);
    mockFetch.mockResolvedValueOnce(
      new Response(JSON.stringify({ data: { audio: hexAudio } }), {
        status: 200,
        headers: { "Content-Type": "application/json" },
      }),
    );
    const result = await provider.synthesize({
      text: "Hello world",
      cfg: {} as never,
      providerConfig: { apiKey: "sk-test", baseUrl: "https://api.minimaxi.com" },
      target: "audio-file",
      timeoutMs: 30000,
    });
    expect(result.outputFormat).toBe("mp3");
    expect(result.fileExtension).toBe(".mp3");
    expect(result.voiceCompatible).toBe(false);
    expect(result.audioBuffer.toString()).toBe("fake-audio-data");
    expect(mockFetch).toHaveBeenCalledOnce();
    const [url, init] = mockFetch.mock.calls[0]!;
    expect(url).toBe("https://api.minimaxi.com/v1/t2a_v2");
    const body = JSON.parse(init!.body as string);
    expect(body.model).toBe("speech-2.8-hd");
    expect(body.text).toBe("Hello world");
    expect(body.voice_setting.voice_id).toBe("English_expressive_narrator");
  });

  it("applies overrides", async () => {
    const hexAudio = Buffer.from("audio").toString("hex");
    const mockFetch = vi.mocked(globalThis.fetch);
    mockFetch.mockResolvedValueOnce(
      new Response(JSON.stringify({ data: { audio: hexAudio } }), { status: 200 }),
    );
    await provider.synthesize({
      text: "Test",
      cfg: {} as never,
      providerConfig: { apiKey: "sk-test" },
      providerOverrides: { model: "speech-01-240228", voiceId: "custom_voice", speed: 1.5 },
      target: "audio-file",
      timeoutMs: 30000,
    });
    const body = JSON.parse(vi.mocked(globalThis.fetch).mock.calls[0]![1]!.body as string);
    expect(body.model).toBe("speech-01-240228");
    expect(body.voice_setting.voice_id).toBe("custom_voice");
    expect(body.voice_setting.speed).toBe(1.5);
  });

  it("throws when API key is missing", async () => {
    // MINIMAX_API_KEY is already cleared by beforeEach and restored by afterEach.
    await expect(
      provider.synthesize({
        text: "Test",
        cfg: {} as never,
        providerConfig: {},
        target: "audio-file",
        timeoutMs: 30000,
      }),
    ).rejects.toThrow("MiniMax API key missing");
  });

  it("throws on API error with response body", async () => {
    vi.mocked(globalThis.fetch).mockResolvedValueOnce(
      new Response("Unauthorized", { status: 401 }),
    );
    await expect(
      provider.synthesize({
        text: "Test",
        cfg: {} as never,
        providerConfig: { apiKey: "sk-test" },
        target: "audio-file",
        timeoutMs: 30000,
      }),
    ).rejects.toThrow("MiniMax TTS API error (401): Unauthorized");
  });
});
// listVoices returns the known voice list; the first entry is expected to be
// "English_expressive_narrator".
describe("listVoices", () => {
it("returns known voices", async () => {
const voices = await provider.listVoices!({} as never);
expect(voices.length).toBeGreaterThan(0);
expect(voices[0]!.id).toBe("English_expressive_narrator");
});
});
});

View File

@@ -0,0 +1,245 @@
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import type {
SpeechDirectiveTokenParseContext,
SpeechProviderConfig,
SpeechProviderOverrides,
SpeechProviderPlugin,
} from "openclaw/plugin-sdk/speech-core";
import {
DEFAULT_MINIMAX_TTS_BASE_URL,
MINIMAX_TTS_MODELS,
MINIMAX_TTS_VOICES,
minimaxTTS,
normalizeMinimaxTtsBaseUrl,
} from "./tts.js";
/**
 * Resolved MiniMax TTS settings used by this provider.
 *
 * `baseUrl`, `model`, and `voiceId` are always populated: the resolver in this
 * file falls back to environment variables and then to built-in defaults. The
 * remaining fields are optional.
 */
type MinimaxTtsProviderConfig = {
/** API key from config; when absent, callers in this file fall back to `process.env.MINIMAX_API_KEY`. */
apiKey?: string;
/** API origin that the TTS request path is appended to. */
baseUrl: string;
/** MiniMax speech model identifier. */
model: string;
/** MiniMax voice identifier. */
voiceId: string;
/** Optional voice speed forwarded to the TTS request. */
speed?: number;
/** Optional voice volume forwarded to the TTS request. */
vol?: number;
/** Optional voice pitch forwarded to the TTS request. */
pitch?: number;
};
/**
 * Per-request overrides for MiniMax TTS. Every field is optional; a field left
 * `undefined` means the configured value is used instead.
 */
type MinimaxTtsProviderOverrides = {
/** Overrides the configured model. */
model?: string;
/** Overrides the configured voice. */
voiceId?: string;
/** Overrides the configured speed. */
speed?: number;
/** Overrides the configured volume. */
vol?: number;
/** Overrides the configured pitch. */
pitch?: number;
};
/**
 * Returns `value` trimmed when it is a string with non-whitespace content;
 * returns `undefined` for non-strings and for empty or whitespace-only strings.
 */
function trimToUndefined(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
/**
 * Returns `value` when it is a finite number; returns `undefined` for
 * non-numbers, `NaN`, and `±Infinity`. Numeric strings are not converted.
 */
function asNumber(value: unknown): number | undefined {
  if (typeof value !== "number") {
    return undefined;
  }
  return Number.isFinite(value) ? value : undefined;
}
/**
 * Returns `value` typed as a string-keyed record when it is a non-null,
 * non-array object; otherwise returns `undefined`.
 */
function asObject(value: unknown): Record<string, unknown> | undefined {
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return undefined;
  }
  return value as Record<string, unknown>;
}
/**
 * Builds a complete MiniMax TTS config from a raw TTS config object.
 *
 * The MiniMax section is read from `rawConfig.providers.minimax`, falling back
 * to `rawConfig.minimax`. For `baseUrl`, `model`, and `voiceId` the precedence
 * is: configured value, then the matching `MINIMAX_*` environment variable,
 * then a built-in default. `speed`, `vol`, and `pitch` are kept only when they
 * are finite numbers.
 *
 * @param rawConfig - Raw TTS configuration object.
 * @returns The normalized provider config.
 */
function normalizeMinimaxProviderConfig(
  rawConfig: Record<string, unknown>,
): MinimaxTtsProviderConfig {
  const section = asObject(asObject(rawConfig.providers)?.minimax) ?? asObject(rawConfig.minimax);

  const apiKey = normalizeResolvedSecretInputString({
    value: section?.apiKey,
    path: "messages.tts.providers.minimax.apiKey",
  });

  const configuredBaseUrl =
    trimToUndefined(section?.baseUrl) ??
    trimToUndefined(process.env.MINIMAX_API_HOST) ??
    DEFAULT_MINIMAX_TTS_BASE_URL;

  const model =
    trimToUndefined(section?.model) ??
    trimToUndefined(process.env.MINIMAX_TTS_MODEL) ??
    "speech-2.8-hd";

  const voiceId =
    trimToUndefined(section?.voiceId) ??
    trimToUndefined(process.env.MINIMAX_TTS_VOICE_ID) ??
    "English_expressive_narrator";

  return {
    apiKey,
    baseUrl: normalizeMinimaxTtsBaseUrl(configuredBaseUrl),
    model,
    voiceId,
    speed: asNumber(section?.speed),
    vol: asNumber(section?.vol),
    pitch: asNumber(section?.pitch),
  };
}
/**
 * Reads an already-flattened provider config (as passed to `isConfigured` and
 * `synthesize`) into a complete MiniMax config.
 *
 * Each field uses the value in `config` when it is valid; otherwise it falls
 * back to the env/default values produced by
 * `normalizeMinimaxProviderConfig({})`.
 *
 * @param config - Provider config object for this provider.
 * @returns The resolved config, with `baseUrl` normalized.
 */
function readMinimaxProviderConfig(config: SpeechProviderConfig): MinimaxTtsProviderConfig {
  const defaults = normalizeMinimaxProviderConfig({});
  return {
    apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey,
    // Normalize here as well: a directly supplied "https://host/" would otherwise
    // produce "https://host//v1/t2a_v2" once the request path is appended.
    baseUrl: normalizeMinimaxTtsBaseUrl(trimToUndefined(config.baseUrl) ?? defaults.baseUrl),
    model: trimToUndefined(config.model) ?? defaults.model,
    voiceId: trimToUndefined(config.voiceId) ?? defaults.voiceId,
    speed: asNumber(config.speed) ?? defaults.speed,
    vol: asNumber(config.vol) ?? defaults.vol,
    pitch: asNumber(config.pitch) ?? defaults.pitch,
  };
}
/**
 * Extracts the MiniMax-specific fields from generic provider overrides.
 *
 * @param overrides - Overrides supplied with a request, if any.
 * @returns An empty object when no overrides were given; otherwise an object
 *   whose fields are `undefined` wherever the input was missing or invalid
 *   (blank strings, non-finite numbers).
 */
function readMinimaxOverrides(
  overrides: SpeechProviderOverrides | undefined,
): MinimaxTtsProviderOverrides {
  return overrides
    ? {
        model: trimToUndefined(overrides.model),
        voiceId: trimToUndefined(overrides.voiceId),
        speed: asNumber(overrides.speed),
        vol: asNumber(overrides.vol),
        pitch: asNumber(overrides.pitch),
      }
    : {};
}
/**
 * Parses the numeric value of a voice-setting directive.
 *
 * @param rawValue - Directive value as written by the user.
 * @param isAllowed - Range check applied to the parsed number.
 * @returns The parsed number, or `undefined` when the value is blank, not a
 *   finite number, or rejected by `isAllowed`.
 */
function parseMinimaxNumericDirective(
  rawValue: unknown,
  isAllowed: (candidate: number) => boolean,
): number | undefined {
  // `Number("")` and `Number("   ")` evaluate to 0, which would let a blank
  // value pass a range check that includes 0 (e.g. pitch). Reject blanks first.
  const text = typeof rawValue === "string" ? rawValue.trim() : "";
  if (text === "") {
    return undefined;
  }
  const candidate = Number(text);
  return Number.isFinite(candidate) && isAllowed(candidate) ? candidate : undefined;
}

/**
 * Interprets one inline TTS directive token for the MiniMax provider.
 *
 * Recognized keys:
 * - voice: `voice`, `voiceid`, `voice_id`, `minimax_voice`, `minimaxvoice`
 * - model: `model`, `minimax_model`, `minimaxmodel`
 * - voice settings: `speed` (0.5 to 2.0), `vol` / `volume` (greater than 0, at
 *   most 10), `pitch` (-12 to 12)
 *
 * @param ctx - Directive key, raw value, and the policy flags that gate it.
 * @returns `handled: false` for unrecognized keys. For recognized keys
 *   `handled` is `true`, with `overrides` when the value is accepted,
 *   `warnings` when a numeric value is blank or out of range, and neither
 *   when the policy disallows that kind of override.
 */
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
  handled: boolean;
  overrides?: SpeechProviderOverrides;
  warnings?: string[];
} {
  switch (ctx.key) {
    case "voice":
    case "voiceid":
    case "voice_id":
    case "minimax_voice":
    case "minimaxvoice":
      if (!ctx.policy.allowVoice) {
        return { handled: true };
      }
      return { handled: true, overrides: { voiceId: ctx.value } };
    case "model":
    case "minimax_model":
    case "minimaxmodel":
      if (!ctx.policy.allowModelId) {
        return { handled: true };
      }
      return { handled: true, overrides: { model: ctx.value } };
    case "speed": {
      if (!ctx.policy.allowVoiceSettings) {
        return { handled: true };
      }
      const speed = parseMinimaxNumericDirective(ctx.value, (n) => n >= 0.5 && n <= 2.0);
      if (speed === undefined) {
        return { handled: true, warnings: [`invalid MiniMax speed "${ctx.value}" (0.5-2.0)`] };
      }
      return { handled: true, overrides: { speed } };
    }
    case "vol":
    case "volume": {
      if (!ctx.policy.allowVoiceSettings) {
        return { handled: true };
      }
      const vol = parseMinimaxNumericDirective(ctx.value, (n) => n > 0 && n <= 10);
      if (vol === undefined) {
        return {
          handled: true,
          // 0 is excluded but 10 is accepted, so spell the bounds out.
          warnings: [`invalid MiniMax volume "${ctx.value}" (must be > 0 and <= 10)`],
        };
      }
      return { handled: true, overrides: { vol } };
    }
    case "pitch": {
      if (!ctx.policy.allowVoiceSettings) {
        return { handled: true };
      }
      const pitch = parseMinimaxNumericDirective(ctx.value, (n) => n >= -12 && n <= 12);
      if (pitch === undefined) {
        return { handled: true, warnings: [`invalid MiniMax pitch "${ctx.value}" (-12 to 12)`] };
      }
      return { handled: true, overrides: { pitch } };
    }
    default:
      return { handled: false };
  }
}
/**
 * Creates the MiniMax speech provider plugin.
 *
 * The plugin exposes static metadata (id, label, auto-select order, model and
 * voice lists) plus hooks that:
 * - resolve provider config from raw TTS config (`resolveConfig`),
 * - parse inline directives (`parseDirectiveToken`),
 * - layer Talk provider settings over the base TTS config (`resolveTalkConfig`),
 * - map Talk request params to overrides (`resolveTalkOverrides`),
 * - list the known voices (`listVoices`),
 * - report whether an API key is available (`isConfigured`), and
 * - synthesize speech through `minimaxTTS`, returning MP3 audio (`synthesize`).
 *
 * @returns The provider plugin object.
 */
export function buildMinimaxSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "minimax",
    label: "MiniMax",
    autoSelectOrder: 40,
    models: MINIMAX_TTS_MODELS,
    voices: MINIMAX_TTS_VOICES,
    resolveConfig: ({ rawConfig }) => normalizeMinimaxProviderConfig(rawConfig),
    parseDirectiveToken,
    resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
      // Start from the base TTS config; each valid Talk setting replaces its field.
      const resolved = normalizeMinimaxProviderConfig(baseTtsConfig);

      // An explicitly provided Talk apiKey always replaces the base one.
      if (talkProviderConfig.apiKey !== undefined) {
        resolved.apiKey = normalizeResolvedSecretInputString({
          value: talkProviderConfig.apiKey,
          path: "talk.providers.minimax.apiKey",
        });
      }

      const talkBaseUrl = trimToUndefined(talkProviderConfig.baseUrl);
      if (talkBaseUrl != null) {
        resolved.baseUrl = normalizeMinimaxTtsBaseUrl(talkBaseUrl);
      }

      const talkModel = trimToUndefined(talkProviderConfig.modelId);
      if (talkModel != null) {
        resolved.model = talkModel;
      }

      const talkVoiceId = trimToUndefined(talkProviderConfig.voiceId);
      if (talkVoiceId != null) {
        resolved.voiceId = talkVoiceId;
      }

      const talkSpeed = asNumber(talkProviderConfig.speed);
      if (talkSpeed != null) {
        resolved.speed = talkSpeed;
      }

      const talkVol = asNumber(talkProviderConfig.vol);
      if (talkVol != null) {
        resolved.vol = talkVol;
      }

      const talkPitch = asNumber(talkProviderConfig.pitch);
      if (talkPitch != null) {
        resolved.pitch = talkPitch;
      }

      return resolved;
    },
    resolveTalkOverrides: ({ params }) => {
      // Only valid values become keys; absent ones are omitted entirely.
      const talkOverrides: MinimaxTtsProviderOverrides = {};

      const voiceId = trimToUndefined(params.voiceId);
      if (voiceId != null) {
        talkOverrides.voiceId = voiceId;
      }

      const model = trimToUndefined(params.modelId);
      if (model != null) {
        talkOverrides.model = model;
      }

      const speed = asNumber(params.speed);
      if (speed != null) {
        talkOverrides.speed = speed;
      }

      return talkOverrides;
    },
    listVoices: async () => MINIMAX_TTS_VOICES.map((id) => ({ id, name: id })),
    isConfigured: ({ providerConfig }) => {
      const { apiKey } = readMinimaxProviderConfig(providerConfig);
      return Boolean(apiKey || process.env.MINIMAX_API_KEY);
    },
    synthesize: async (req) => {
      const settings = readMinimaxProviderConfig(req.providerConfig);
      const requested = readMinimaxOverrides(req.providerOverrides);

      const apiKey = settings.apiKey || process.env.MINIMAX_API_KEY;
      if (!apiKey) {
        throw new Error("MiniMax API key missing");
      }

      // Per-request overrides win over configured values.
      const audioBuffer = await minimaxTTS({
        text: req.text,
        apiKey,
        baseUrl: settings.baseUrl,
        model: requested.model ?? settings.model,
        voiceId: requested.voiceId ?? settings.voiceId,
        speed: requested.speed ?? settings.speed,
        vol: requested.vol ?? settings.vol,
        pitch: requested.pitch ?? settings.pitch,
        timeoutMs: req.timeoutMs,
      });

      return {
        audioBuffer,
        outputFormat: "mp3",
        fileExtension: ".mp3",
        voiceCompatible: false,
      };
    },
  };
}

90
extensions/minimax/tts.ts Normal file
View File

@@ -0,0 +1,90 @@
/** API origin used when no base URL is configured. */
export const DEFAULT_MINIMAX_TTS_BASE_URL = "https://api.minimaxi.com";
/** Model identifiers advertised by the MiniMax speech provider. */
export const MINIMAX_TTS_MODELS = ["speech-2.8-hd", "speech-01-240228"] as const;
/** Voice identifiers advertised by the MiniMax speech provider. */
export const MINIMAX_TTS_VOICES = [
"English_expressive_narrator",
"Chinese (Mandarin)_Warm_Girl",
"Chinese (Mandarin)_Lively_Girl",
"Chinese (Mandarin)_Gentle_Boy",
"Chinese (Mandarin)_Steady_Boy",
] as const;
/**
 * Normalizes a MiniMax TTS base URL.
 *
 * @param baseUrl - Candidate base URL; may be missing or blank.
 * @returns `DEFAULT_MINIMAX_TTS_BASE_URL` when the input is missing or blank;
 *   otherwise the trimmed input with every trailing `/` removed.
 */
export function normalizeMinimaxTtsBaseUrl(baseUrl?: string): string {
  const candidate = (baseUrl ?? "").trim();
  if (candidate.length === 0) {
    return DEFAULT_MINIMAX_TTS_BASE_URL;
  }
  // Walk back over trailing slashes (char code 47 is "/").
  let end = candidate.length;
  while (end > 0 && candidate.charCodeAt(end - 1) === 47) {
    end -= 1;
  }
  return candidate.slice(0, end);
}
/**
 * Synthesizes speech through the MiniMax T2A v2 endpoint (`POST {baseUrl}/v1/t2a_v2`).
 *
 * @param params.text - Text to synthesize.
 * @param params.apiKey - API key sent as a Bearer token.
 * @param params.baseUrl - API origin, without a trailing slash.
 * @param params.model - Speech model identifier.
 * @param params.voiceId - Voice identifier.
 * @param params.speed - Voice speed (default 1.0).
 * @param params.vol - Voice volume (default 1.0).
 * @param params.pitch - Voice pitch (default 0).
 * @param params.format - Requested audio format (default "mp3").
 * @param params.sampleRate - Requested sample rate (default 32000).
 * @param params.timeoutMs - Aborts the request after this many milliseconds.
 * @returns The audio bytes decoded from the hex string in `data.audio`.
 * @throws Error when the HTTP status is not OK, when the response contains no
 *   audio (the message includes `base_resp` details if present), or when the
 *   audio payload is not valid hex. An abort caused by the timeout surfaces as
 *   the rejection produced by `fetch`.
 */
export async function minimaxTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  model: string;
  voiceId: string;
  speed?: number;
  vol?: number;
  pitch?: number;
  format?: string;
  sampleRate?: number;
  timeoutMs: number;
}): Promise<Buffer> {
  const {
    text,
    apiKey,
    baseUrl,
    model,
    voiceId,
    speed = 1.0,
    vol = 1.0,
    pitch = 0,
    format = "mp3",
    sampleRate = 32000,
    timeoutMs,
  } = params;
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetch(`${baseUrl}/v1/t2a_v2`, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model,
        text,
        voice_setting: {
          voice_id: voiceId,
          speed,
          vol,
          pitch,
        },
        audio_setting: {
          format,
          sample_rate: sampleRate,
        },
      }),
      signal: controller.signal,
    });
    if (!response.ok) {
      const errBody = await response.text().catch(() => "");
      throw new Error(`MiniMax TTS API error (${response.status})${errBody ? `: ${errBody}` : ""}`);
    }
    const body = (await response.json()) as {
      data?: { audio?: string };
      base_resp?: { status_code?: number; status_msg?: string };
    } | null;
    const hexAudio = body?.data?.audio;
    if (typeof hexAudio !== "string" || hexAudio.length === 0) {
      // NOTE(review): application-level failures appear to arrive with HTTP 200
      // and a `base_resp` envelope; confirm against the MiniMax API reference.
      // Surface it so the caller sees why no audio was produced.
      const statusCode = body?.base_resp?.status_code;
      const statusMsg = body?.base_resp?.status_msg;
      const detail = [
        statusCode != null ? `status_code ${statusCode}` : "",
        statusMsg ? String(statusMsg) : "",
      ]
        .filter((part) => part !== "")
        .join(": ");
      throw new Error(`MiniMax TTS API returned no audio data${detail ? ` (${detail})` : ""}`);
    }
    const audio = Buffer.from(hexAudio, "hex");
    // Buffer.from(..., "hex") stops silently at the first invalid character, so
    // a length mismatch means the payload was malformed or truncated.
    if (audio.length * 2 !== hexAudio.length) {
      throw new Error("MiniMax TTS API returned malformed hex audio data");
    }
    return audio;
  } finally {
    clearTimeout(timeout);
  }
}