mirror of
https://github.com/moltbot/moltbot.git
synced 2026-04-26 16:06:16 +00:00
refactor(tts): move speech providers into plugins
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
|
||||
import { buildMicrosoftSpeechProvider } from "openclaw/plugin-sdk/speech";
|
||||
import { buildMicrosoftSpeechProvider } from "./speech-provider.js";
|
||||
|
||||
export default definePluginEntry({
|
||||
id: "microsoft",
|
||||
|
||||
@@ -4,6 +4,9 @@
|
||||
"private": true,
|
||||
"description": "OpenClaw Microsoft speech plugin",
|
||||
"type": "module",
|
||||
"dependencies": {
|
||||
"node-edge-tts": "^1.2.10"
|
||||
},
|
||||
"openclaw": {
|
||||
"extensions": [
|
||||
"./index.ts"
|
||||
|
||||
43
extensions/microsoft/speech-provider.test.ts
Normal file
43
extensions/microsoft/speech-provider.test.ts
Normal file
@@ -0,0 +1,43 @@
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
import { listMicrosoftVoices } from "./speech-provider.js";
|
||||
|
||||
const fetchMock = vi.fn<typeof fetch>();
|
||||
|
||||
describe("listMicrosoftVoices", () => {
|
||||
afterEach(() => {
|
||||
fetchMock.mockReset();
|
||||
vi.unstubAllGlobals();
|
||||
});
|
||||
|
||||
it("maps Microsoft voices to the shared speech voice shape", async () => {
|
||||
fetchMock.mockResolvedValueOnce({
|
||||
ok: true,
|
||||
json: async () => [
|
||||
{
|
||||
ShortName: "en-US-AvaMultilingualNeural",
|
||||
FriendlyName: "Microsoft Ava",
|
||||
Locale: "en-US",
|
||||
Gender: "Female",
|
||||
VoiceTag: {
|
||||
ContentCategories: ["General"],
|
||||
VoicePersonalities: ["Friendly", "Warm"],
|
||||
},
|
||||
},
|
||||
],
|
||||
} as Response);
|
||||
vi.stubGlobal("fetch", fetchMock);
|
||||
|
||||
await expect(listMicrosoftVoices()).resolves.toEqual([
|
||||
{
|
||||
id: "en-US-AvaMultilingualNeural",
|
||||
name: "Microsoft Ava",
|
||||
category: "General",
|
||||
description: "Friendly, Warm",
|
||||
locale: "en-US",
|
||||
gender: "Female",
|
||||
personalities: ["Friendly", "Warm"],
|
||||
},
|
||||
]);
|
||||
});
|
||||
});
|
||||
130
extensions/microsoft/speech-provider.ts
Normal file
130
extensions/microsoft/speech-provider.ts
Normal file
@@ -0,0 +1,130 @@
|
||||
import { mkdirSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
|
||||
import path from "node:path";
|
||||
import {
|
||||
CHROMIUM_FULL_VERSION,
|
||||
TRUSTED_CLIENT_TOKEN,
|
||||
generateSecMsGecToken,
|
||||
} from "node-edge-tts/dist/drm.js";
|
||||
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
|
||||
import {
|
||||
edgeTTS,
|
||||
inferEdgeExtension,
|
||||
isVoiceCompatibleAudio,
|
||||
resolvePreferredOpenClawTmpDir,
|
||||
type SpeechVoiceOption,
|
||||
} from "openclaw/plugin-sdk/speech-core";
|
||||
|
||||
// Edge TTS output format used as the retry fallback when synthesis with a
// custom format fails (see buildMicrosoftSpeechProvider below).
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";

// Shape of one entry in the Microsoft read-aloud voices list response.
// Every field is optional because the payload comes from an external API
// and is not validated against a schema before use.
type MicrosoftVoiceListEntry = {
  ShortName?: string;
  FriendlyName?: string;
  Locale?: string;
  Gender?: string;
  VoiceTag?: {
    ContentCategories?: string[];
    VoicePersonalities?: string[];
  };
};
function buildMicrosoftVoiceHeaders(): Record<string, string> {
|
||||
const major = CHROMIUM_FULL_VERSION.split(".")[0] || "0";
|
||||
return {
|
||||
Authority: "speech.platform.bing.com",
|
||||
Origin: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
|
||||
Accept: "*/*",
|
||||
"User-Agent":
|
||||
`Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ` +
|
||||
`(KHTML, like Gecko) Chrome/${major}.0.0.0 Safari/537.36 Edg/${major}.0.0.0`,
|
||||
"Sec-MS-GEC": generateSecMsGecToken(),
|
||||
"Sec-MS-GEC-Version": `1-${CHROMIUM_FULL_VERSION}`,
|
||||
};
|
||||
}
|
||||
|
||||
function formatMicrosoftVoiceDescription(entry: MicrosoftVoiceListEntry): string | undefined {
|
||||
const personalities = entry.VoiceTag?.VoicePersonalities?.filter(Boolean) ?? [];
|
||||
return personalities.length > 0 ? personalities.join(", ") : undefined;
|
||||
}
|
||||
|
||||
export async function listMicrosoftVoices(): Promise<SpeechVoiceOption[]> {
|
||||
const response = await fetch(
|
||||
"https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list" +
|
||||
`?trustedclienttoken=${TRUSTED_CLIENT_TOKEN}`,
|
||||
{
|
||||
headers: buildMicrosoftVoiceHeaders(),
|
||||
},
|
||||
);
|
||||
if (!response.ok) {
|
||||
throw new Error(`Microsoft voices API error (${response.status})`);
|
||||
}
|
||||
const voices = (await response.json()) as MicrosoftVoiceListEntry[];
|
||||
return Array.isArray(voices)
|
||||
? voices
|
||||
.map((voice) => ({
|
||||
id: voice.ShortName?.trim() ?? "",
|
||||
name: voice.FriendlyName?.trim() || voice.ShortName?.trim() || undefined,
|
||||
category: voice.VoiceTag?.ContentCategories?.find((value) => value.trim().length > 0),
|
||||
description: formatMicrosoftVoiceDescription(voice),
|
||||
locale: voice.Locale?.trim() || undefined,
|
||||
gender: voice.Gender?.trim() || undefined,
|
||||
personalities: voice.VoiceTag?.VoicePersonalities?.filter(
|
||||
(value): value is string => value.trim().length > 0,
|
||||
),
|
||||
}))
|
||||
.filter((voice) => voice.id.length > 0)
|
||||
: [];
|
||||
}
|
||||
|
||||
export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
id: "microsoft",
|
||||
label: "Microsoft",
|
||||
aliases: ["edge"],
|
||||
listVoices: async () => await listMicrosoftVoices(),
|
||||
isConfigured: ({ config }) => config.edge.enabled,
|
||||
synthesize: async (req) => {
|
||||
const tempRoot = resolvePreferredOpenClawTmpDir();
|
||||
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
|
||||
const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-"));
|
||||
let outputFormat = req.overrides?.microsoft?.outputFormat ?? req.config.edge.outputFormat;
|
||||
const fallbackOutputFormat =
|
||||
outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
|
||||
|
||||
try {
|
||||
const runEdge = async (format: string) => {
|
||||
const fileExtension = inferEdgeExtension(format);
|
||||
const outputPath = path.join(tempDir, `speech${fileExtension}`);
|
||||
await edgeTTS({
|
||||
text: req.text,
|
||||
outputPath,
|
||||
config: {
|
||||
...req.config.edge,
|
||||
voice: req.overrides?.microsoft?.voice ?? req.config.edge.voice,
|
||||
outputFormat: format,
|
||||
},
|
||||
timeoutMs: req.config.timeoutMs,
|
||||
});
|
||||
const audioBuffer = readFileSync(outputPath);
|
||||
return {
|
||||
audioBuffer,
|
||||
outputFormat: format,
|
||||
fileExtension,
|
||||
voiceCompatible: isVoiceCompatibleAudio({ fileName: outputPath }),
|
||||
};
|
||||
};
|
||||
|
||||
try {
|
||||
return await runEdge(outputFormat);
|
||||
} catch (err) {
|
||||
if (!fallbackOutputFormat || fallbackOutputFormat === outputFormat) {
|
||||
throw err;
|
||||
}
|
||||
outputFormat = fallbackOutputFormat;
|
||||
return await runEdge(outputFormat);
|
||||
}
|
||||
} finally {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
}
|
||||
},
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user