feat(tts): add read-latest voice command

2026-05-07 07:58:36 +00:00 · 2026-04-26 03:44:44 +01:00
parent 2235a13dab
commit 97ae1c7c2e
8 changed files with 351 additions and 38 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai

 ### Changes

+- TTS/WhatsApp: add `/tts latest` read-aloud support with duplicate suppression and `/tts chat on|off|default` session-scoped auto-TTS overrides, completing the on-demand voice-note UX for current-chat replies. Fixes #66032.
 - Plugins/tokenjuice: bump the bundled tokenjuice runtime to 0.6.3. Thanks @vincentkoc.
 - TTS/agents: allow `agents.list[].tts` to override global `messages.tts` for per-agent voices while keeping shared provider credentials and preferences in the existing TTS config surface.
 - TTS/agents: make `/tts audio`, `/tts status`, and the `tts` agent tool honor the active `agents.list[].tts` voice/provider override.
--- a/docs/channels/whatsapp.md
+++ b/docs/channels/whatsapp.md
@@ -365,6 +365,7 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s
    - reply payloads preserve `audioAsVoice`; TTS voice-note output for WhatsApp stays on this PTT path even when the provider returns MP3 or WebM
    - native Ogg/Opus audio is sent as `audio/ogg; codecs=opus` for voice-note compatibility
    - non-Ogg audio, including Microsoft Edge TTS MP3/WebM output, is transcoded with `ffmpeg` to 48 kHz mono Ogg/Opus before PTT delivery
+    - `/tts latest` sends the latest assistant reply as one voice note and suppresses repeat sends for the same reply; `/tts chat on|off|default` controls auto-TTS for the current WhatsApp chat
    - animated GIF playback is supported via `gifPlayback: true` on video sends
    - captions are applied to the first media item when sending multi-media reply payloads, except PTT voice notes send the audio first and visible text separately because WhatsApp clients do not render voice-note captions consistently
    - media source can be HTTP(S), `file://`, or local paths
--- a/docs/tools/slash-commands.md
+++ b/docs/tools/slash-commands.md
@@ -134,7 +134,7 @@ Built-in commands available today:
 - `/plugins list|inspect|show|get|install|enable|disable` inspects or mutates plugin state. `/plugin` is an alias. Owner-only for writes. Requires `commands.plugins: true`.
 - `/debug show|set|unset|reset` manages runtime-only config overrides. Owner-only. Requires `commands.debug: true`.
 - `/usage off|tokens|full|cost` controls the per-response usage footer or prints a local cost summary.
- `/tts on|off|status|provider|limit|summary|audio|help` controls TTS. See [/tools/tts](/tools/tts).
+- `/tts on|off|status|chat|latest|provider|limit|summary|audio|help` controls TTS. See [/tools/tts](/tools/tts).
 - `/restart` restarts OpenClaw when enabled. Default: enabled; set `commands.restart: false` to disable it.
 - `/activation mention|always` sets group activation mode.
 - `/send on|off|inherit` sets send policy. Owner-only.
--- a/docs/tools/tts.md
+++ b/docs/tools/tts.md
@@ -821,6 +821,10 @@ Discord note: `/tts` is a built-in Discord command, so OpenClaw registers
 /tts off
 /tts on
 /tts status
+/tts chat on
+/tts chat off
+/tts chat default
+/tts latest
 /tts provider openai
 /tts limit 2000
 /tts summary off
@@ -833,9 +837,11 @@ Notes:
 - `commands.text` or native command registration must be enabled.
 - Config `messages.tts.auto` accepts `off|always|inbound|tagged`.
 - `/tts on` writes the local TTS preference to `always`; `/tts off` writes it to `off`.
+- `/tts chat on|off` writes a session-scoped auto-TTS override for the current chat; `/tts chat default` clears that override, and `/tts chat` with no argument shows the current value.
 - Use config when you want `inbound` or `tagged` defaults.
 - `limit` and `summary` are stored in local prefs, not the main config.
 - `/tts audio` generates a one-off audio reply (does not toggle TTS on).
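+- `/tts latest` reads the latest assistant reply from the current session transcript and sends it as audio once. To suppress duplicate voice sends it stores only a SHA-256 hash of that reply and the time of the send on the session entry, not the reply text; repeating the command for the same reply returns a short notice instead of audio.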
 - `/tts status` includes fallback visibility for the latest attempt:
  - success fallback: `Fallback: <primary> -> <used>` plus `Attempts: ...`
  - failure: `Error: ...` plus `Attempts: ...`
--- a/src/auto-reply/reply/commands-tts.test.ts
+++ b/src/auto-reply/reply/commands-tts.test.ts
@@ -1,5 +1,9 @@
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
 import { beforeEach, describe, expect, it, vi } from "vitest";
 import type { OpenClawConfig } from "../../config/config.js";
+import type { SessionEntry } from "../../config/sessions.js";

 const ttsMocks = vi.hoisted(() => ({
  getResolvedSpeechProviderConfig: vi.fn(),
@@ -39,6 +43,7 @@ function buildTtsParams(
  commandBodyNormalized: string,
  cfg: OpenClawConfig = {},
  agentId?: string,
+  overrides: Partial<Parameters<typeof handleTtsCommands>[0]> = {},
 ): Parameters<typeof handleTtsCommands>[0] {
  return {
    cfg,
@@ -49,11 +54,14 @@ function buildTtsParams(
      senderId: "owner",
      channel: "forum",
    },
+    sessionKey: "session-key",
+    ...overrides,
  } as unknown as Parameters<typeof handleTtsCommands>[0];
 }

 describe("handleTtsCommands status fallback reporting", () => {
  beforeEach(() => {
+    vi.clearAllMocks();
    ttsMocks.resolveTtsConfig.mockReturnValue({});
    ttsMocks.resolveTtsPrefsPath.mockReturnValue("/tmp/tts-prefs.json");
    ttsMocks.isTtsEnabled.mockReturnValue(true);
@@ -225,4 +233,131 @@ describe("handleTtsCommands status fallback reporting", () => {
      }),
    );
  });
+
+  it("reads the latest assistant transcript reply once", async () => {
+    const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-tts-latest-"));
+    const sessionFile = path.join(tempDir, "session.jsonl");
+    fs.writeFileSync(
+      sessionFile,
+      [
+        JSON.stringify({ type: "session", id: "s1" }),
+        JSON.stringify({
+          type: "message",
+          message: { role: "assistant", content: [{ type: "text", text: "older reply" }] },
+        }),
+        JSON.stringify({
+          type: "message",
+          message: {
+            role: "assistant",
+            content: [
+              {
+                type: "text",
+                text: "internal note",
+                textSignature: JSON.stringify({
+                  v: 1,
+                  id: "item_commentary",
+                  phase: "commentary",
+                }),
+              },
+              {
+                type: "text",
+                text: "latest visible reply",
+                textSignature: JSON.stringify({
+                  v: 1,
+                  id: "item_final",
+                  phase: "final_answer",
+                }),
+              },
+            ],
+          },
+        }),
+      ].join("\n") + "\n",
+      "utf-8",
+    );
+    ttsMocks.textToSpeech.mockResolvedValue({
+      success: true,
+      audioPath: "/tmp/latest.ogg",
+      provider: PRIMARY_TTS_PROVIDER,
+      voiceCompatible: true,
+    });
+    const sessionEntry: SessionEntry = { sessionId: "s1", updatedAt: 1, sessionFile };
+    const sessionStore = { "session-key": sessionEntry };
+
+    const result = await handleTtsCommands(
+      buildTtsParams("/tts latest", {}, undefined, { sessionEntry, sessionStore }),
+      true,
+    );
+
+    expect(result?.shouldContinue).toBe(false);
+    expect(result?.reply).toMatchObject({
+      mediaUrl: "/tmp/latest.ogg",
+      audioAsVoice: true,
+      spokenText: "latest visible reply",
+    });
+    expect(ttsMocks.textToSpeech).toHaveBeenCalledWith(
+      expect.objectContaining({ text: "latest visible reply" }),
+    );
+    expect(sessionEntry.lastTtsReadLatestHash).toMatch(/^[a-f0-9]{64}$/);
+    expect(sessionEntry.lastTtsReadLatestAt).toEqual(expect.any(Number));
+  });
+
+  it("does not resend /tts latest for the same assistant reply", async () => {
+    const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-tts-latest-"));
+    const sessionFile = path.join(tempDir, "session.jsonl");
+    fs.writeFileSync(
+      sessionFile,
+      [
+        JSON.stringify({ type: "session", id: "s1" }),
+        JSON.stringify({
+          type: "message",
+          message: { role: "assistant", content: [{ type: "text", text: "read me once" }] },
+        }),
+      ].join("\n") + "\n",
+      "utf-8",
+    );
+    ttsMocks.textToSpeech.mockResolvedValue({
+      success: true,
+      audioPath: "/tmp/latest.ogg",
+      provider: PRIMARY_TTS_PROVIDER,
+      voiceCompatible: true,
+    });
+    const sessionEntry: SessionEntry = { sessionId: "s1", updatedAt: 1, sessionFile };
+    const sessionStore = { "session-key": sessionEntry };
+    const params = buildTtsParams("/tts latest", {}, undefined, { sessionEntry, sessionStore });
+
+    const first = await handleTtsCommands(params, true);
+    expect(first?.reply?.mediaUrl).toBe("/tmp/latest.ogg");
+    ttsMocks.textToSpeech.mockClear();
+
+    const second = await handleTtsCommands(params, true);
+
+    expect(second?.reply?.text).toContain("already sent");
+    expect(ttsMocks.textToSpeech).not.toHaveBeenCalled();
+  });
+
+  it("stores chat-scoped TTS overrides on the session entry", async () => {
+    const sessionEntry: SessionEntry = { sessionId: "s1", updatedAt: 1 };
+    const sessionStore = { "session-key": sessionEntry };
+
+    const onResult = await handleTtsCommands(
+      buildTtsParams("/tts chat on", {}, undefined, { sessionEntry, sessionStore }),
+      true,
+    );
+    expect(onResult?.reply?.text).toContain("enabled for this chat");
+    expect(sessionEntry.ttsAuto).toBe("always");
+
+    const offResult = await handleTtsCommands(
+      buildTtsParams("/tts chat off", {}, undefined, { sessionEntry, sessionStore }),
+      true,
+    );
+    expect(offResult?.reply?.text).toContain("disabled for this chat");
+    expect(sessionEntry.ttsAuto).toBe("off");
+
+    const clearResult = await handleTtsCommands(
+      buildTtsParams("/tts chat default", {}, undefined, { sessionEntry, sessionStore }),
+      true,
+    );
+    expect(clearResult?.reply?.text).toContain("override cleared");
+    expect(sessionEntry.ttsAuto).toBeUndefined();
+  });
 });
--- a/src/auto-reply/reply/commands-tts.ts
+++ b/src/auto-reply/reply/commands-tts.ts
@@ -1,3 +1,5 @@
+import crypto from "node:crypto";
+import { readLatestAssistantTextFromSessionTranscript } from "../../config/sessions.js";
 import { logVerbose } from "../../globals.js";
 import {
  normalizeOptionalLowercaseString,
@@ -25,7 +27,9 @@ import {
  setTtsProvider,
  textToSpeech,
 } from "../../tts/tts.js";
+import { isSilentReplyPayloadText } from "../tokens.js";
 import type { ReplyPayload } from "../types.js";
+import { persistSessionEntry } from "./commands-session-store.js";
 import type { CommandHandler } from "./commands-types.js";

 type ParsedTtsCommand = {
@@ -81,7 +85,9 @@ function ttsUsage(): ReplyPayload {
      `• /tts provider [name] — View/change provider\n` +
      `• /tts limit [number] — View/change text limit\n` +
      `• /tts summary [on|off] — View/change auto-summary\n` +
-      `• /tts audio <text> — Generate audio from text\n\n` +
+      `• /tts audio <text> — Generate audio from text\n` +
+      `• /tts latest — Read the latest assistant reply once\n` +
+      `• /tts chat on|off|default — Override auto-TTS for this chat\n\n` +
      `**Providers:**\n` +
      `Use /tts provider to list the registered speech providers and their status.\n\n` +
      `**Text Limit (default: 1500, max: 4096):**\n` +
@@ -91,10 +97,67 @@ function ttsUsage(): ReplyPayload {
      `**Examples:**\n` +
      `/tts provider <id>\n` +
      `/tts limit 2000\n` +
+      `/tts latest\n` +
      `/tts audio Hello, this is a test!`,
  };
 }

+function hashTtsReadLatestText(text: string): string {
+  return crypto.createHash("sha256").update(text).digest("hex");
+}
+
+async function buildTtsAudioReply(params: {
+  text: string;
+  cfg: Parameters<typeof textToSpeech>[0]["cfg"];
+  channel: string;
+  prefsPath: string;
+  agentId?: string;
+}): Promise<{ reply: ReplyPayload; provider?: string; hash?: string } | { error: string }> {
+  const start = Date.now();
+  const result = await textToSpeech({
+    text: params.text,
+    cfg: params.cfg,
+    channel: params.channel,
+    prefsPath: params.prefsPath,
+    agentId: params.agentId,
+  });
+
+  if (result.success && result.audioPath) {
+    setLastTtsAttempt({
+      timestamp: Date.now(),
+      success: true,
+      textLength: params.text.length,
+      summarized: false,
+      provider: result.provider,
+      fallbackFrom: result.fallbackFrom,
+      attemptedProviders: result.attemptedProviders,
+      attempts: result.attempts,
+      latencyMs: result.latencyMs,
+    });
+    return {
+      provider: result.provider,
+      reply: {
+        mediaUrl: result.audioPath,
+        audioAsVoice: result.voiceCompatible === true,
+        trustedLocalMedia: true,
+        spokenText: params.text,
+      },
+    };
+  }
+
+  setLastTtsAttempt({
+    timestamp: Date.now(),
+    success: false,
+    textLength: params.text.length,
+    summarized: false,
+    attemptedProviders: result.attemptedProviders,
+    attempts: result.attempts,
+    error: result.error,
+    latencyMs: Date.now() - start,
+  });
+  return { error: result.error ?? "unknown error" };
+}
+
 export const handleTtsCommands: CommandHandler = async (params, allowTextCommands) => {
  if (!allowTextCommands) {
    return null;
@@ -130,6 +193,86 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
    return { shouldContinue: false, reply: { text: "🔇 TTS disabled." } };
  }

+  if (action === "chat") {
+    const requested = normalizeOptionalLowercaseString(args) ?? "";
+    if (!params.sessionEntry || !params.sessionStore || !params.sessionKey) {
+      return {
+        shouldContinue: false,
+        reply: { text: "🔇 No active chat session is available for a chat-scoped TTS override." },
+      };
+    }
+    if (!requested || requested === "status") {
+      return {
+        shouldContinue: false,
+        reply: { text: `🔊 Chat TTS override: ${params.sessionEntry.ttsAuto ?? "default"}.` },
+      };
+    }
+    if (requested === "on") {
+      params.sessionEntry.ttsAuto = "always";
+      await persistSessionEntry(params);
+      return { shouldContinue: false, reply: { text: "🔊 TTS enabled for this chat." } };
+    }
+    if (requested === "off") {
+      params.sessionEntry.ttsAuto = "off";
+      await persistSessionEntry(params);
+      return { shouldContinue: false, reply: { text: "🔇 TTS disabled for this chat." } };
+    }
+    if (requested === "default" || requested === "inherit" || requested === "clear") {
+      delete params.sessionEntry.ttsAuto;
+      await persistSessionEntry(params);
+      return { shouldContinue: false, reply: { text: "🔊 TTS chat override cleared." } };
+    }
+    return { shouldContinue: false, reply: ttsUsage() };
+  }
+
+  if (
+    action === "latest" ||
+    (action === "read" && normalizeOptionalLowercaseString(args) === "latest")
+  ) {
+    if (!params.sessionEntry || !params.sessionStore || !params.sessionKey) {
+      return {
+        shouldContinue: false,
+        reply: { text: "🎤 No active chat session is available for `/tts latest`." },
+      };
+    }
+    const latest = await readLatestAssistantTextFromSessionTranscript(
+      params.sessionEntry.sessionFile,
+    );
+    const latestText = latest?.text.trim();
+    if (!latestText || isSilentReplyPayloadText(latestText)) {
+      return {
+        shouldContinue: false,
+        reply: { text: "🎤 No readable assistant reply was found in this chat yet." },
+      };
+    }
+    const hash = hashTtsReadLatestText(latestText);
+    if (params.sessionEntry.lastTtsReadLatestHash === hash) {
+      return {
+        shouldContinue: false,
+        reply: { text: "🔊 Latest assistant reply was already sent as audio." },
+      };
+    }
+
+    const audio = await buildTtsAudioReply({
+      text: latestText,
+      cfg: params.cfg,
+      channel: params.command.channel,
+      prefsPath,
+      agentId: params.agentId,
+    });
+    if ("error" in audio) {
+      return {
+        shouldContinue: false,
+        reply: { text: `❌ Error generating audio: ${audio.error}` },
+      };
+    }
+
+    params.sessionEntry.lastTtsReadLatestHash = hash;
+    params.sessionEntry.lastTtsReadLatestAt = Date.now();
+    await persistSessionEntry(params);
+    return { shouldContinue: false, reply: audio.reply };
+  }
+
  if (action === "audio") {
    if (!args.trim()) {
      return {
@@ -143,51 +286,19 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
      };
    }

-    const start = Date.now();
-    const result = await textToSpeech({
+    const audio = await buildTtsAudioReply({
      text: args,
      cfg: params.cfg,
      channel: params.command.channel,
      prefsPath,
      agentId: params.agentId,
    });
-
-    if (result.success && result.audioPath) {
-      // Store last attempt for `/tts status`.
-      setLastTtsAttempt({
-        timestamp: Date.now(),
-        success: true,
-        textLength: args.length,
-        summarized: false,
-        provider: result.provider,
-        fallbackFrom: result.fallbackFrom,
-        attemptedProviders: result.attemptedProviders,
-        attempts: result.attempts,
-        latencyMs: result.latencyMs,
-      });
-      const payload: ReplyPayload = {
-        mediaUrl: result.audioPath,
-        audioAsVoice: result.voiceCompatible === true,
-        trustedLocalMedia: true,
-        spokenText: args,
-      };
-      return { shouldContinue: false, reply: payload };
+    if (!("error" in audio)) {
+      return { shouldContinue: false, reply: audio.reply };
    }
-
-    // Store failure details for `/tts status`.
-    setLastTtsAttempt({
-      timestamp: Date.now(),
-      success: false,
-      textLength: args.length,
-      summarized: false,
-      attemptedProviders: result.attemptedProviders,
-      attempts: result.attempts,
-      error: result.error,
-      latencyMs: Date.now() - start,
-    });
    return {
      shouldContinue: false,
-      reply: { text: `❌ Error generating audio: ${result.error ?? "unknown error"}` },
+      reply: { text: `❌ Error generating audio: ${audio.error}` },
    };
  }

@@ -306,6 +417,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
    const lines = [
      "📊 TTS status",
      `State: ${enabled ? "✅ enabled" : "❌ disabled"}`,
+      `Chat override: ${params.sessionEntry?.ttsAuto ?? "default"}`,
      `Provider: ${provider} (${hasKey ? "✅ configured" : "❌ not configured"})`,
      `Text limit: ${maxLength} chars`,
      `Auto-summary: ${summarize ? "on" : "off"}`,
--- a/src/config/sessions/transcript.ts
+++ b/src/config/sessions/transcript.ts
@@ -3,6 +3,7 @@ import path from "node:path";
 import { CURRENT_SESSION_VERSION, SessionManager } from "@mariozechner/pi-coding-agent";
 import { formatErrorMessage } from "../../infra/errors.js";
 import { emitSessionTranscriptUpdate } from "../../sessions/transcript-events.js";
+import { extractAssistantVisibleText } from "../../shared/chat-message-content.js";
 import {
  resolveDefaultSessionStorePath,
  resolveSessionFilePath,
@@ -46,6 +47,12 @@ export type SessionTranscriptAssistantMessage = Parameters<SessionManager["appen
  role: "assistant";
 };

+export type LatestAssistantTranscriptText = {
+  id?: string;
+  text: string;
+  timestamp?: number;
+};
+
 export async function resolveSessionTranscriptFile(params: {
  sessionId: string;
  sessionKey: string;
@@ -91,6 +98,53 @@ export async function resolveSessionTranscriptFile(params: {
  };
 }

+export async function readLatestAssistantTextFromSessionTranscript(
+  sessionFile: string | undefined,
+): Promise<LatestAssistantTranscriptText | undefined> {
+  if (!sessionFile?.trim()) {
+    return undefined;
+  }
+
+  let raw: string;
+  try {
+    raw = await fs.promises.readFile(sessionFile, "utf-8");
+  } catch {
+    return undefined;
+  }
+
+  const lines = raw.split(/\r?\n/);
+  for (let index = lines.length - 1; index >= 0; index -= 1) {
+    const line = lines[index];
+    if (!line.trim()) {
+      continue;
+    }
+    try {
+      const parsed = JSON.parse(line) as {
+        id?: unknown;
+        message?: unknown;
+      };
+      const message = parsed.message as { role?: unknown; timestamp?: unknown } | undefined;
+      if (!message || message.role !== "assistant") {
+        continue;
+      }
+      const text = extractAssistantVisibleText(message)?.trim();
+      if (!text) {
+        continue;
+      }
+      return {
+        ...(typeof parsed.id === "string" && parsed.id ? { id: parsed.id } : {}),
+        text,
+        ...(typeof message.timestamp === "number" && Number.isFinite(message.timestamp)
+          ? { timestamp: message.timestamp }
+          : {}),
+      };
+    } catch {
+      continue;
+    }
+  }
+  return undefined;
+}
+
 export async function appendAssistantMessageToSessionTranscript(params: {
  agentId?: string;
  sessionKey: string;
--- a/src/config/sessions/types.ts
+++ b/src/config/sessions/types.ts
@@ -175,6 +175,10 @@ export type SessionEntry = {
  reasoningLevel?: string;
  elevatedLevel?: string;
  ttsAuto?: TtsAutoMode;
+  /** Hash of the latest assistant reply that was sent through `/tts latest`. */
+  lastTtsReadLatestHash?: string;
+  /** Timestamp (ms) when `/tts latest` last sent audio for this session. */
+  lastTtsReadLatestAt?: number;
  execHost?: string;
  execSecurity?: string;
  execAsk?: string;