feat(tts): add read-latest voice command

This commit is contained in:
Peter Steinberger
2026-04-26 03:44:44 +01:00
parent 2235a13dab
commit 97ae1c7c2e
8 changed files with 351 additions and 38 deletions

View File

@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai
### Changes
- TTS/WhatsApp: add `/tts latest` read-aloud support with duplicate suppression and `/tts chat on|off|default` session-scoped auto-TTS overrides, completing the on-demand voice-note UX for current-chat replies. Fixes #66032.
- Plugins/tokenjuice: bump the bundled tokenjuice runtime to 0.6.3. Thanks @vincentkoc.
- TTS/agents: allow `agents.list[].tts` to override global `messages.tts` for per-agent voices while keeping shared provider credentials and preferences in the existing TTS config surface.
- TTS/agents: make `/tts audio`, `/tts status`, and the `tts` agent tool honor the active `agents.list[].tts` voice/provider override.

View File

@@ -365,6 +365,7 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s
- reply payloads preserve `audioAsVoice`; TTS voice-note output for WhatsApp stays on this PTT path even when the provider returns MP3 or WebM
- native Ogg/Opus audio is sent as `audio/ogg; codecs=opus` for voice-note compatibility
- non-Ogg audio, including Microsoft Edge TTS MP3/WebM output, is transcoded with `ffmpeg` to 48 kHz mono Ogg/Opus before PTT delivery
- `/tts latest` sends the latest assistant reply as one voice note and suppresses repeat sends for the same reply; `/tts chat on|off|default` controls auto-TTS for the current WhatsApp chat
- animated GIF playback is supported via `gifPlayback: true` on video sends
- captions are applied to the first media item when sending multi-media reply payloads, except PTT voice notes send the audio first and visible text separately because WhatsApp clients do not render voice-note captions consistently
- media source can be HTTP(S), `file://`, or local paths

View File

@@ -134,7 +134,7 @@ Built-in commands available today:
- `/plugins list|inspect|show|get|install|enable|disable` inspects or mutates plugin state. `/plugin` is an alias. Owner-only for writes. Requires `commands.plugins: true`.
- `/debug show|set|unset|reset` manages runtime-only config overrides. Owner-only. Requires `commands.debug: true`.
- `/usage off|tokens|full|cost` controls the per-response usage footer or prints a local cost summary.
- `/tts on|off|status|provider|limit|summary|audio|help` controls TTS. See [/tools/tts](/tools/tts).
- `/tts on|off|status|chat|latest|provider|limit|summary|audio|help` controls TTS. See [/tools/tts](/tools/tts).
- `/restart` restarts OpenClaw when enabled. Default: enabled; set `commands.restart: false` to disable it.
- `/activation mention|always` sets group activation mode.
- `/send on|off|inherit` sets send policy. Owner-only.

View File

@@ -821,6 +821,10 @@ Discord note: `/tts` is a built-in Discord command, so OpenClaw registers
/tts off
/tts on
/tts status
/tts chat on
/tts chat off
/tts chat default
/tts latest
/tts provider openai
/tts limit 2000
/tts summary off
@@ -833,9 +837,11 @@ Notes:
- `commands.text` or native command registration must be enabled.
- Config `messages.tts.auto` accepts `off|always|inbound|tagged`.
- `/tts on` writes the local TTS preference to `always`; `/tts off` writes it to `off`.
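- `/tts chat on|off|default` writes a session-scoped auto-TTS override for the current chat (`inherit` and `clear` are accepted as aliases for `default`). `/tts chat` with no argument, or `/tts chat status`, shows the current override.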
- Use config when you want `inbound` or `tagged` defaults.
- `limit` and `summary` are stored in local prefs, not the main config.
- `/tts audio` generates a one-off audio reply (does not toggle TTS on).
- `/tts latest` reads the latest assistant reply from the current session transcript and sends it as audio once. To suppress duplicate voice sends, it stores only a SHA-256 hash of that reply and the time of the send on the session entry; the reply text itself is not stored there.
- `/tts status` includes fallback visibility for the latest attempt:
- success fallback: `Fallback: <primary> -> <used>` plus `Attempts: ...`
- failure: `Error: ...` plus `Attempts: ...`

View File

@@ -1,5 +1,9 @@
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../../config/config.js";
import type { SessionEntry } from "../../config/sessions.js";
const ttsMocks = vi.hoisted(() => ({
getResolvedSpeechProviderConfig: vi.fn(),
@@ -39,6 +43,7 @@ function buildTtsParams(
commandBodyNormalized: string,
cfg: OpenClawConfig = {},
agentId?: string,
overrides: Partial<Parameters<typeof handleTtsCommands>[0]> = {},
): Parameters<typeof handleTtsCommands>[0] {
return {
cfg,
@@ -49,11 +54,14 @@ function buildTtsParams(
senderId: "owner",
channel: "forum",
},
sessionKey: "session-key",
...overrides,
} as unknown as Parameters<typeof handleTtsCommands>[0];
}
describe("handleTtsCommands status fallback reporting", () => {
beforeEach(() => {
vi.clearAllMocks();
ttsMocks.resolveTtsConfig.mockReturnValue({});
ttsMocks.resolveTtsPrefsPath.mockReturnValue("/tmp/tts-prefs.json");
ttsMocks.isTtsEnabled.mockReturnValue(true);
@@ -225,4 +233,131 @@ describe("handleTtsCommands status fallback reporting", () => {
}),
);
});
it("reads the latest assistant transcript reply once", async () => {
  // Throwaway transcript directory; it is removed in `finally` so repeated
  // test runs do not accumulate directories under os.tmpdir().
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-tts-latest-"));
  try {
    const sessionFile = path.join(tempDir, "session.jsonl");
    // Transcript fixture: a session header, an older assistant reply, and a
    // newest assistant message that mixes a `commentary` part with a
    // `final_answer` part. The assertions below expect only the final answer
    // text to be spoken.
    fs.writeFileSync(
      sessionFile,
      [
        JSON.stringify({ type: "session", id: "s1" }),
        JSON.stringify({
          type: "message",
          message: { role: "assistant", content: [{ type: "text", text: "older reply" }] },
        }),
        JSON.stringify({
          type: "message",
          message: {
            role: "assistant",
            content: [
              {
                type: "text",
                text: "internal note",
                textSignature: JSON.stringify({
                  v: 1,
                  id: "item_commentary",
                  phase: "commentary",
                }),
              },
              {
                type: "text",
                text: "latest visible reply",
                textSignature: JSON.stringify({
                  v: 1,
                  id: "item_final",
                  phase: "final_answer",
                }),
              },
            ],
          },
        }),
      ].join("\n") + "\n",
      "utf-8",
    );
    ttsMocks.textToSpeech.mockResolvedValue({
      success: true,
      audioPath: "/tmp/latest.ogg",
      provider: PRIMARY_TTS_PROVIDER,
      voiceCompatible: true,
    });
    const sessionEntry: SessionEntry = { sessionId: "s1", updatedAt: 1, sessionFile };
    const sessionStore = { "session-key": sessionEntry };

    const result = await handleTtsCommands(
      buildTtsParams("/tts latest", {}, undefined, { sessionEntry, sessionStore }),
      true,
    );

    expect(result?.shouldContinue).toBe(false);
    expect(result?.reply).toMatchObject({
      mediaUrl: "/tmp/latest.ogg",
      audioAsVoice: true,
      spokenText: "latest visible reply",
    });
    expect(ttsMocks.textToSpeech).toHaveBeenCalledWith(
      expect.objectContaining({ text: "latest visible reply" }),
    );
    // The dedupe marker is a 64-character lowercase hex digest plus a numeric timestamp.
    expect(sessionEntry.lastTtsReadLatestHash).toMatch(/^[a-f0-9]{64}$/);
    expect(sessionEntry.lastTtsReadLatestAt).toEqual(expect.any(Number));
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
});
it("does not resend /tts latest for the same assistant reply", async () => {
  // Throwaway transcript directory; it is removed in `finally` so repeated
  // test runs do not accumulate directories under os.tmpdir().
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-tts-latest-"));
  try {
    const sessionFile = path.join(tempDir, "session.jsonl");
    fs.writeFileSync(
      sessionFile,
      [
        JSON.stringify({ type: "session", id: "s1" }),
        JSON.stringify({
          type: "message",
          message: { role: "assistant", content: [{ type: "text", text: "read me once" }] },
        }),
      ].join("\n") + "\n",
      "utf-8",
    );
    ttsMocks.textToSpeech.mockResolvedValue({
      success: true,
      audioPath: "/tmp/latest.ogg",
      provider: PRIMARY_TTS_PROVIDER,
      voiceCompatible: true,
    });
    const sessionEntry: SessionEntry = { sessionId: "s1", updatedAt: 1, sessionFile };
    const sessionStore = { "session-key": sessionEntry };
    // The same params object (and therefore the same session entry) is reused
    // for both calls so the second call sees the hash stored by the first.
    const params = buildTtsParams("/tts latest", {}, undefined, { sessionEntry, sessionStore });

    const first = await handleTtsCommands(params, true);
    expect(first?.reply?.mediaUrl).toBe("/tmp/latest.ogg");

    ttsMocks.textToSpeech.mockClear();
    const second = await handleTtsCommands(params, true);
    expect(second?.reply?.text).toContain("already sent");
    expect(ttsMocks.textToSpeech).not.toHaveBeenCalled();
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
});
it("stores chat-scoped TTS overrides on the session entry", async () => {
  const sessionEntry: SessionEntry = { sessionId: "s1", updatedAt: 1 };
  const sessionStore = { "session-key": sessionEntry };
  // Issues one `/tts chat <mode>` command against the shared session entry.
  const runChatCommand = (mode: string) =>
    handleTtsCommands(
      buildTtsParams(`/tts chat ${mode}`, {}, undefined, { sessionEntry, sessionStore }),
      true,
    );

  // `on` pins the session override to "always".
  const enabled = await runChatCommand("on");
  expect(enabled?.reply?.text).toContain("enabled for this chat");
  expect(sessionEntry.ttsAuto).toBe("always");

  // `off` pins the session override to "off".
  const disabled = await runChatCommand("off");
  expect(disabled?.reply?.text).toContain("disabled for this chat");
  expect(sessionEntry.ttsAuto).toBe("off");

  // `default` removes the override from the entry altogether.
  const cleared = await runChatCommand("default");
  expect(cleared?.reply?.text).toContain("override cleared");
  expect(sessionEntry.ttsAuto).toBeUndefined();
});
});

View File

@@ -1,3 +1,5 @@
import crypto from "node:crypto";
import { readLatestAssistantTextFromSessionTranscript } from "../../config/sessions.js";
import { logVerbose } from "../../globals.js";
import {
normalizeOptionalLowercaseString,
@@ -25,7 +27,9 @@ import {
setTtsProvider,
textToSpeech,
} from "../../tts/tts.js";
import { isSilentReplyPayloadText } from "../tokens.js";
import type { ReplyPayload } from "../types.js";
import { persistSessionEntry } from "./commands-session-store.js";
import type { CommandHandler } from "./commands-types.js";
type ParsedTtsCommand = {
@@ -81,7 +85,9 @@ function ttsUsage(): ReplyPayload {
`• /tts provider [name] — View/change provider\n` +
`• /tts limit [number] — View/change text limit\n` +
`• /tts summary [on|off] — View/change auto-summary\n` +
`• /tts audio <text> — Generate audio from text\n\n` +
`• /tts audio <text> — Generate audio from text\n` +
`• /tts latest — Read the latest assistant reply once\n` +
`• /tts chat on|off|default — Override auto-TTS for this chat\n\n` +
`**Providers:**\n` +
`Use /tts provider to list the registered speech providers and their status.\n\n` +
`**Text Limit (default: 1500, max: 4096):**\n` +
@@ -91,10 +97,67 @@ function ttsUsage(): ReplyPayload {
`**Examples:**\n` +
`/tts provider <id>\n` +
`/tts limit 2000\n` +
`/tts latest\n` +
`/tts audio Hello, this is a test!`,
};
}
/**
 * Fingerprints reply text for `/tts latest` duplicate detection.
 *
 * @param text - The reply text to fingerprint.
 * @returns The SHA-256 digest of `text` as a lowercase hex string.
 */
function hashTtsReadLatestText(text: string): string {
  const hasher = crypto.createHash("sha256");
  hasher.update(text);
  return hasher.digest("hex");
}
/**
 * Runs `textToSpeech` for `params.text` and records the outcome through
 * `setLastTtsAttempt`, whether the attempt succeeded or failed.
 *
 * @param params - Text to speak plus the config, channel, prefs path and
 *   optional agent id that are forwarded unchanged to `textToSpeech`.
 * @returns On success, the provider name and a media reply pointing at the
 *   generated audio file; otherwise `{ error }` with the reported error or
 *   `"unknown error"` when none was reported.
 */
async function buildTtsAudioReply(params: {
  text: string;
  cfg: Parameters<typeof textToSpeech>[0]["cfg"];
  channel: string;
  prefsPath: string;
  agentId?: string;
}): Promise<{ reply: ReplyPayload; provider?: string; hash?: string } | { error: string }> {
  const startedAt = Date.now();
  const { text, cfg, channel, prefsPath, agentId } = params;
  const synthesis = await textToSpeech({ text, cfg, channel, prefsPath, agentId });

  // Anything other than "success with an audio path" is treated as a failure.
  if (!synthesis.success || !synthesis.audioPath) {
    setLastTtsAttempt({
      timestamp: Date.now(),
      success: false,
      textLength: text.length,
      summarized: false,
      attemptedProviders: synthesis.attemptedProviders,
      attempts: synthesis.attempts,
      error: synthesis.error,
      // Failures carry a locally measured latency.
      latencyMs: Date.now() - startedAt,
    });
    return { error: synthesis.error ?? "unknown error" };
  }

  setLastTtsAttempt({
    timestamp: Date.now(),
    success: true,
    textLength: text.length,
    summarized: false,
    provider: synthesis.provider,
    fallbackFrom: synthesis.fallbackFrom,
    attemptedProviders: synthesis.attemptedProviders,
    attempts: synthesis.attempts,
    // Successes reuse the latency reported by the TTS result itself.
    latencyMs: synthesis.latencyMs,
  });

  const reply: ReplyPayload = {
    mediaUrl: synthesis.audioPath,
    audioAsVoice: synthesis.voiceCompatible === true,
    trustedLocalMedia: true,
    spokenText: text,
  };
  return { provider: synthesis.provider, reply };
}
export const handleTtsCommands: CommandHandler = async (params, allowTextCommands) => {
if (!allowTextCommands) {
return null;
@@ -130,6 +193,86 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
return { shouldContinue: false, reply: { text: "🔇 TTS disabled." } };
}
// `/tts chat [on|off|default|status]`: read or change the auto-TTS override
// stored on the current session entry (`sessionEntry.ttsAuto`).
if (action === "chat") {
const requested = normalizeOptionalLowercaseString(args) ?? "";
// Every sub-command needs a session entry, store and key; bail out otherwise.
if (!params.sessionEntry || !params.sessionStore || !params.sessionKey) {
return {
shouldContinue: false,
reply: { text: "🔇 No active chat session is available for a chat-scoped TTS override." },
};
}
// No argument (or `status`) only reports the current override; nothing is written.
if (!requested || requested === "status") {
return {
shouldContinue: false,
reply: { text: `🔊 Chat TTS override: ${params.sessionEntry.ttsAuto ?? "default"}.` },
};
}
// `on` maps to the "always" auto mode for this session.
if (requested === "on") {
params.sessionEntry.ttsAuto = "always";
// NOTE(review): presumably writes the mutated entry back to the session store — confirm in commands-session-store.
await persistSessionEntry(params);
return { shouldContinue: false, reply: { text: "🔊 TTS enabled for this chat." } };
}
if (requested === "off") {
params.sessionEntry.ttsAuto = "off";
await persistSessionEntry(params);
return { shouldContinue: false, reply: { text: "🔇 TTS disabled for this chat." } };
}
// `default`, `inherit` and `clear` all remove the override from the entry.
if (requested === "default" || requested === "inherit" || requested === "clear") {
delete params.sessionEntry.ttsAuto;
await persistSessionEntry(params);
return { shouldContinue: false, reply: { text: "🔊 TTS chat override cleared." } };
}
// Unrecognized sub-command: answer with the usage text.
return { shouldContinue: false, reply: ttsUsage() };
}
// `/tts latest` (also accepted as `/tts read latest`): speak the most recent
// assistant reply found in the session transcript, at most once per reply text.
if (
action === "latest" ||
(action === "read" && normalizeOptionalLowercaseString(args) === "latest")
) {
// A session entry, store and key are required; bail out otherwise.
if (!params.sessionEntry || !params.sessionStore || !params.sessionKey) {
return {
shouldContinue: false,
reply: { text: "🎤 No active chat session is available for `/tts latest`." },
};
}
const latest = await readLatestAssistantTextFromSessionTranscript(
params.sessionEntry.sessionFile,
);
const latestText = latest?.text.trim();
// Nothing to read when the transcript yields no text or only a silent-reply token.
if (!latestText || isSilentReplyPayloadText(latestText)) {
return {
shouldContinue: false,
reply: { text: "🎤 No readable assistant reply was found in this chat yet." },
};
}
// Duplicate suppression compares a hash of the trimmed text with the hash
// stored by the previous successful send.
// NOTE(review): the key is the text hash only (`latest.id` is not used), so a
// newer reply with identical text is also reported as already sent — confirm
// this is intended.
const hash = hashTtsReadLatestText(latestText);
if (params.sessionEntry.lastTtsReadLatestHash === hash) {
return {
shouldContinue: false,
reply: { text: "🔊 Latest assistant reply was already sent as audio." },
};
}
const audio = await buildTtsAudioReply({
text: latestText,
cfg: params.cfg,
channel: params.command.channel,
prefsPath,
agentId: params.agentId,
});
if ("error" in audio) {
return {
shouldContinue: false,
reply: { text: `❌ Error generating audio: ${audio.error}` },
};
}
// The marker is written only after audio generation succeeded, so a failed
// attempt does not block a retry for the same reply.
params.sessionEntry.lastTtsReadLatestHash = hash;
params.sessionEntry.lastTtsReadLatestAt = Date.now();
await persistSessionEntry(params);
return { shouldContinue: false, reply: audio.reply };
}
if (action === "audio") {
if (!args.trim()) {
return {
@@ -143,51 +286,19 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
};
}
const start = Date.now();
const result = await textToSpeech({
const audio = await buildTtsAudioReply({
text: args,
cfg: params.cfg,
channel: params.command.channel,
prefsPath,
agentId: params.agentId,
});
if (result.success && result.audioPath) {
// Store last attempt for `/tts status`.
setLastTtsAttempt({
timestamp: Date.now(),
success: true,
textLength: args.length,
summarized: false,
provider: result.provider,
fallbackFrom: result.fallbackFrom,
attemptedProviders: result.attemptedProviders,
attempts: result.attempts,
latencyMs: result.latencyMs,
});
const payload: ReplyPayload = {
mediaUrl: result.audioPath,
audioAsVoice: result.voiceCompatible === true,
trustedLocalMedia: true,
spokenText: args,
};
return { shouldContinue: false, reply: payload };
if (!("error" in audio)) {
return { shouldContinue: false, reply: audio.reply };
}
// Store failure details for `/tts status`.
setLastTtsAttempt({
timestamp: Date.now(),
success: false,
textLength: args.length,
summarized: false,
attemptedProviders: result.attemptedProviders,
attempts: result.attempts,
error: result.error,
latencyMs: Date.now() - start,
});
return {
shouldContinue: false,
reply: { text: `❌ Error generating audio: ${result.error ?? "unknown error"}` },
reply: { text: `❌ Error generating audio: ${audio.error}` },
};
}
@@ -306,6 +417,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
const lines = [
"📊 TTS status",
`State: ${enabled ? "✅ enabled" : "❌ disabled"}`,
`Chat override: ${params.sessionEntry?.ttsAuto ?? "default"}`,
`Provider: ${provider} (${hasKey ? "✅ configured" : "❌ not configured"})`,
`Text limit: ${maxLength} chars`,
`Auto-summary: ${summarize ? "on" : "off"}`,

View File

@@ -3,6 +3,7 @@ import path from "node:path";
import { CURRENT_SESSION_VERSION, SessionManager } from "@mariozechner/pi-coding-agent";
import { formatErrorMessage } from "../../infra/errors.js";
import { emitSessionTranscriptUpdate } from "../../sessions/transcript-events.js";
import { extractAssistantVisibleText } from "../../shared/chat-message-content.js";
import {
resolveDefaultSessionStorePath,
resolveSessionFilePath,
@@ -46,6 +47,12 @@ export type SessionTranscriptAssistantMessage = Parameters<SessionManager["appen
role: "assistant";
};
/**
 * Result of `readLatestAssistantTextFromSessionTranscript`: the newest
 * assistant text found in a session transcript file.
 */
export type LatestAssistantTranscriptText = {
/** `id` of the transcript entry; present only when it is a non-empty string. */
id?: string;
/** Trimmed, non-empty visible assistant text. */
text: string;
/** The message's `timestamp`; present only when it is a finite number. */
timestamp?: number;
};
export async function resolveSessionTranscriptFile(params: {
sessionId: string;
sessionKey: string;
@@ -91,6 +98,53 @@ export async function resolveSessionTranscriptFile(params: {
};
}
/**
 * Finds the newest assistant message with visible text in a JSONL session
 * transcript.
 *
 * The file is read in full and scanned from the last line backwards. Blank
 * lines, lines that fail to parse, non-assistant messages, and assistant
 * messages without visible text are skipped.
 *
 * @param sessionFile - Path of the transcript file; `undefined` or a blank
 *   string yields `undefined`.
 * @returns The trimmed text (plus entry `id` / message `timestamp` when they
 *   are well-formed), or `undefined` when the file cannot be read or holds no
 *   matching message.
 */
export async function readLatestAssistantTextFromSessionTranscript(
  sessionFile: string | undefined,
): Promise<LatestAssistantTranscriptText | undefined> {
  if (!sessionFile?.trim()) {
    return undefined;
  }

  let contents: string;
  try {
    contents = await fs.promises.readFile(sessionFile, "utf-8");
  } catch {
    // An unreadable or missing transcript is treated as "no reply available".
    return undefined;
  }

  // Walk newest-first so the first usable assistant entry wins.
  const newestFirst = contents.split(/\r?\n/).reverse();
  for (const entryLine of newestFirst) {
    if (!entryLine.trim()) {
      continue;
    }
    try {
      const entry = JSON.parse(entryLine) as {
        id?: unknown;
        message?: unknown;
      };
      const candidate = entry.message as { role?: unknown; timestamp?: unknown } | undefined;
      if (!candidate || candidate.role !== "assistant") {
        continue;
      }
      const visibleText = extractAssistantVisibleText(candidate)?.trim();
      if (!visibleText) {
        continue;
      }
      const idPart = typeof entry.id === "string" && entry.id ? { id: entry.id } : {};
      const timestampPart =
        typeof candidate.timestamp === "number" && Number.isFinite(candidate.timestamp)
          ? { timestamp: candidate.timestamp }
          : {};
      return { ...idPart, text: visibleText, ...timestampPart };
    } catch {
      // Malformed line (or an entry that cannot be inspected): keep scanning.
      continue;
    }
  }
  return undefined;
}
export async function appendAssistantMessageToSessionTranscript(params: {
agentId?: string;
sessionKey: string;

View File

@@ -175,6 +175,10 @@ export type SessionEntry = {
reasoningLevel?: string;
elevatedLevel?: string;
ttsAuto?: TtsAutoMode;
/** Hash of the latest assistant reply that was sent through `/tts latest`. */
lastTtsReadLatestHash?: string;
/** Timestamp (ms) when `/tts latest` last sent audio for this session. */
lastTtsReadLatestAt?: number;
execHost?: string;
execSecurity?: string;
execAsk?: string;