From 1b61269eeccfa3bd6d6a9f21e5a7d9a94a4c7569 Mon Sep 17 00:00:00 2001 From: AytuncYildizli Date: Mon, 2 Mar 2026 23:31:57 +0300 Subject: [PATCH] feat(audio): auto-echo transcription to chat before agent processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When echoTranscript is enabled in tools.media.audio config, the transcription text is sent back to the originating chat immediately after successful audio transcription — before the agent processes it. This lets users verify what was heard from their voice note. Changes: - config/types.tools.ts: add echoTranscript (bool) and echoFormat (string template) to MediaUnderstandingConfig - media-understanding/apply.ts: sendTranscriptEcho() helper that resolves channel/to from ctx, guards on isDeliverableMessageChannel, and calls deliverOutboundPayloads best-effort - config/schema.help.ts: help text for both new fields - config/schema.labels.ts: labels for both new fields - media-understanding/apply.echo-transcript.test.ts: 10 vitest cases covering disabled/enabled/custom-format/no-audio/failed-transcription/ non-deliverable-channel/missing-from/OriginatingTo/delivery-failure Default echoFormat: '📝 "{transcript}"' Closes #32102 --- src/config/schema.help.ts | 4 + src/config/schema.labels.ts | 2 + src/config/types.tools.ts | 10 + .../apply.echo-transcript.test.ts | 353 ++++++++++++++++++ src/media-understanding/apply.ts | 73 ++++ 5 files changed, 442 insertions(+) create mode 100644 src/media-understanding/apply.echo-transcript.test.ts diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index 702a496cddf..f7b0c32587c 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -545,6 +545,10 @@ export const FIELD_HELP: Record = { "Ordered model preferences specifically for audio understanding, used before shared media model fallback. 
Choose models optimized for transcription quality in your primary language/domain.", "tools.media.audio.scope": "Scope selector for when audio understanding runs across inbound messages and attachments. Keep focused scopes in high-volume channels to reduce cost and avoid accidental transcription.", + "tools.media.audio.echoTranscript": + "Echo the audio transcript back to the originating chat before agent processing. When enabled, users immediately see what was heard from their voice note, helping them verify transcription accuracy before the agent acts on it. Default: false.", + "tools.media.audio.echoFormat": + "Format string for the echoed transcript message. Use `{transcript}` as a placeholder for the transcribed text. Default: '📝 \"{transcript}\"'.", "tools.media.video.enabled": "Enable video understanding so clips can be summarized into text for downstream reasoning and responses. Disable when processing video is out of policy or too expensive for your deployment.", "tools.media.video.maxBytes": diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index 4dd69ff2e65..8d2334f1a78 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -137,6 +137,8 @@ export const FIELD_LABELS: Record = { "tools.media.audio.attachments": "Audio Understanding Attachment Policy", "tools.media.audio.models": "Audio Understanding Models", "tools.media.audio.scope": "Audio Understanding Scope", + "tools.media.audio.echoTranscript": "Echo Transcript to Chat", + "tools.media.audio.echoFormat": "Transcript Echo Format", "tools.media.video.enabled": "Enable Video Understanding", "tools.media.video.maxBytes": "Video Understanding Max Bytes", "tools.media.video.maxChars": "Video Understanding Max Chars", diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index 492282f2397..9d45a9d7fa4 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -92,6 +92,16 @@ export type MediaUnderstandingConfig = 
MediaProviderRequestConfig & { attachments?: MediaUnderstandingAttachmentsConfig; /** Ordered model list (fallbacks in order). */ models?: MediaUnderstandingModelConfig[]; + /** + * Echo the audio transcript back to the originating chat before agent processing. + * Lets users verify what was heard. Default: false. + */ + echoTranscript?: boolean; + /** + * Format string for the echoed transcript. Use `{transcript}` as placeholder. + * Default: '📝 "{transcript}"' + */ + echoFormat?: string; }; export type LinkModelConfig = { diff --git a/src/media-understanding/apply.echo-transcript.test.ts b/src/media-understanding/apply.echo-transcript.test.ts new file mode 100644 index 00000000000..afda260e2f3 --- /dev/null +++ b/src/media-understanding/apply.echo-transcript.test.ts @@ -0,0 +1,353 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { afterAll, beforeAll, beforeEach, describe, expect, it, vi } from "vitest"; +import type { MsgContext } from "../auto-reply/templating.js"; +import type { OpenClawConfig } from "../config/config.js"; +import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js"; + +// --------------------------------------------------------------------------- +// Module mocks +// --------------------------------------------------------------------------- + +vi.mock("../agents/model-auth.js", () => ({ + resolveApiKeyForProvider: vi.fn(async () => ({ + apiKey: "test-key", + source: "test", + mode: "api-key", + })), + requireApiKey: (auth: { apiKey?: string; mode?: string }, provider: string) => { + if (auth?.apiKey) { + return auth.apiKey; + } + throw new Error(`No API key resolved for provider "${provider}" (auth mode: ${auth?.mode}).`); + }, + resolveAwsSdkEnvVarName: vi.fn(() => undefined), + resolveEnvApiKey: vi.fn(() => null), + resolveModelAuthMode: vi.fn(() => "api-key"), + getApiKeyForModel: vi.fn(async () => ({ apiKey: "test-key", source: "test", mode: "api-key" })), + getCustomProviderApiKey: vi.fn(() 
/**
 * Creates a small placeholder `note.ogg` inside a fresh `case-*` directory
 * under the suite temp root and resolves to the file's path.
 *
 * The bytes written are arbitrary filler rather than a real Ogg stream; the
 * tests in this file pair it with stubbed `transcribeAudio` providers that
 * return fixed text.
 */
async function createTempAudioFile(): Promise<string> {
  const dir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "case-"));
  const filePath = path.join(dir, "note.ogg");
  await fs.writeFile(filePath, Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8]));
  return filePath;
}

/**
 * Builds an inbound audio message context (WhatsApp provider, sender
 * `+10000000001`, account `acc1`) pointing at `mediaPath`.
 *
 * @param mediaPath Path stored in `MediaPath`.
 * @param extra Fields spread last, so they override any of the defaults.
 */
function createAudioCtxWithProvider(mediaPath: string, extra?: Partial<MsgContext>): MsgContext {
  return {
    Body: "",
    MediaPath: mediaPath,
    MediaType: "audio/ogg",
    Provider: "whatsapp",
    From: "+10000000001",
    AccountId: "acc1",
    ...extra,
  };
}

/**
 * Builds an audio-understanding config plus a stub `groq` provider for the
 * echo-transcript tests.
 *
 * @param opts.echoTranscript Value for `tools.media.audio.echoTranscript`;
 *   defaults to `true` here (the opposite of the production default) because
 *   most cases in this suite exercise the enabled path.
 * @param opts.echoFormat Included in the config only when explicitly provided,
 *   so omitting it leaves the `echoFormat` key absent.
 * @param opts.transcribedText Text the stub provider resolves with; defaults to
 *   `"hello world"`.
 * @returns The config and a provider map whose `groq.transcribeAudio` resolves
 *   to `{ text }`.
 */
function createAudioConfigWithEcho(opts?: {
  echoTranscript?: boolean;
  echoFormat?: string;
  transcribedText?: string;
}): {
  cfg: OpenClawConfig;
  providers: Record<string, { id: string; transcribeAudio: () => Promise<{ text: string }> }>;
} {
  const cfg: OpenClawConfig = {
    tools: {
      media: {
        audio: {
          enabled: true,
          maxBytes: 1024 * 1024,
          models: [{ provider: "groq" }],
          echoTranscript: opts?.echoTranscript ?? true,
          // Only emit the key when the caller set it, so "absent" stays distinguishable.
          ...(opts?.echoFormat !== undefined ? { echoFormat: opts.echoFormat } : {}),
        },
      },
    },
  };
  const providers = {
    groq: {
      id: "groq",
      transcribeAudio: async () => ({ text: opts?.transcribedText ?? "hello world" }),
    },
  };
  return { cfg, providers };
}
const cfg: OpenClawConfig = { + tools: { + media: { + audio: { + enabled: true, + maxBytes: 1024 * 1024, + models: [{ provider: "groq" }], + // echoTranscript not set → defaults to false + }, + }, + }, + }; + const providers = { + groq: { id: "groq", transcribeAudio: async () => ({ text: "hello world" }) }, + }; + + await applyMediaUnderstanding({ ctx, cfg, providers }); + + expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled(); + }); + + it("echoes transcript with default format when echoTranscript is true", async () => { + const mediaPath = await createTempAudioFile(); + const ctx = createAudioCtxWithProvider(mediaPath); + const { cfg, providers } = createAudioConfigWithEcho({ + echoTranscript: true, + transcribedText: "hello world", + }); + + await applyMediaUnderstanding({ ctx, cfg, providers }); + + expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce(); + const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0]; + expect(callArgs).toBeDefined(); + expect(callArgs.channel).toBe("whatsapp"); + expect(callArgs.to).toBe("+10000000001"); + expect(callArgs.accountId).toBe("acc1"); + expect(callArgs.payloads).toHaveLength(1); + expect(callArgs.payloads[0].text).toBe('📝 "hello world"'); + }); + + it("uses custom echoFormat when provided", async () => { + const mediaPath = await createTempAudioFile(); + const ctx = createAudioCtxWithProvider(mediaPath); + const { cfg, providers } = createAudioConfigWithEcho({ + echoTranscript: true, + echoFormat: "🎙️ Heard: {transcript}", + transcribedText: "custom message", + }); + + await applyMediaUnderstanding({ ctx, cfg, providers }); + + expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce(); + const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0]; + expect(callArgs?.payloads[0].text).toBe("🎙️ Heard: custom message"); + }); + + it("does NOT echo when there are no audio attachments", async () => { + // Image-only context — no audio attachment + const dir = await 
fs.mkdtemp(path.join(suiteTempMediaRootDir, "img-")); + const imgPath = path.join(dir, "photo.jpg"); + await fs.writeFile(imgPath, Buffer.from([0xff, 0xd8, 0xff, 0xe0])); + + const ctx: MsgContext = { + Body: "", + MediaPath: imgPath, + MediaType: "image/jpeg", + Provider: "whatsapp", + From: "+10000000001", + }; + + const cfg: OpenClawConfig = { + tools: { + media: { + audio: { + enabled: true, + maxBytes: 1024 * 1024, + models: [{ provider: "groq" }], + echoTranscript: true, + }, + image: { enabled: false }, + }, + }, + }; + const providers = { + groq: { id: "groq", transcribeAudio: async () => ({ text: "should not appear" }) }, + }; + + await applyMediaUnderstanding({ ctx, cfg, providers }); + + // No audio outputs → Transcript not set → no echo + expect(ctx.Transcript).toBeUndefined(); + expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled(); + }); + + it("does NOT echo when transcription fails", async () => { + const mediaPath = await createTempAudioFile(); + const ctx = createAudioCtxWithProvider(mediaPath); + const cfg: OpenClawConfig = { + tools: { + media: { + audio: { + enabled: true, + maxBytes: 1024 * 1024, + models: [{ provider: "groq" }], + echoTranscript: true, + }, + }, + }, + }; + const providers = { + groq: { + id: "groq", + transcribeAudio: async () => { + throw new Error("transcription provider failure"); + }, + }, + }; + + // Should not throw; transcription failure is swallowed by runner + await applyMediaUnderstanding({ ctx, cfg, providers }); + + expect(ctx.Transcript).toBeUndefined(); + expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled(); + }); + + it("does NOT echo when channel is not deliverable", async () => { + const mediaPath = await createTempAudioFile(); + // Use an internal/non-deliverable channel + const ctx = createAudioCtxWithProvider(mediaPath, { + Provider: "internal-system", + From: "some-source", + }); + const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true }); + + await 
applyMediaUnderstanding({ ctx, cfg, providers }); + + // Transcript should be set (transcription succeeded) + expect(ctx.Transcript).toBe("hello world"); + // But echo should be skipped + expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled(); + }); + + it("does NOT echo when ctx has no From or OriginatingTo", async () => { + const mediaPath = await createTempAudioFile(); + const ctx: MsgContext = { + Body: "", + MediaPath: mediaPath, + MediaType: "audio/ogg", + Provider: "whatsapp", + // From and OriginatingTo intentionally absent + }; + const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true }); + + await applyMediaUnderstanding({ ctx, cfg, providers }); + + expect(ctx.Transcript).toBe("hello world"); + expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled(); + }); + + it("uses OriginatingTo when From is absent", async () => { + const mediaPath = await createTempAudioFile(); + const ctx: MsgContext = { + Body: "", + MediaPath: mediaPath, + MediaType: "audio/ogg", + Provider: "whatsapp", + OriginatingTo: "+19999999999", + }; + const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true }); + + await applyMediaUnderstanding({ ctx, cfg, providers }); + + expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce(); + const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0]; + expect(callArgs?.to).toBe("+19999999999"); + }); + + it("echo delivery failure does not throw or break transcription", async () => { + const mediaPath = await createTempAudioFile(); + const ctx = createAudioCtxWithProvider(mediaPath); + const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true }); + + mockDeliverOutboundPayloads.mockRejectedValueOnce(new Error("delivery timeout")); + + // Should not throw + const result = await applyMediaUnderstanding({ ctx, cfg, providers }); + + // Transcription itself succeeded + expect(result.appliedAudio).toBe(true); + expect(ctx.Transcript).toBe("hello world"); + // Deliver was attempted + 
/** Echo format used when `tools.media.audio.echoFormat` is not configured. */
const DEFAULT_ECHO_FORMAT = '📝 "{transcript}"';

/** Token inside an echo format string that stands for the transcribed text. */
const ECHO_TRANSCRIPT_PLACEHOLDER = "{transcript}";

/**
 * Renders the transcript echo message from a format string.
 *
 * Every `{transcript}` occurrence in `format` is replaced with `transcript`,
 * inserted literally. The substitution deliberately avoids
 * `String.prototype.replace` with a string replacement: that form interprets
 * `$$`, `$&`, `` $` `` and `$'` inside the replacement, which would corrupt any
 * transcript that happens to contain a dollar sign (e.g. "costs $$5").
 *
 * @param transcript Transcribed text to insert.
 * @param format Template containing zero or more `{transcript}` placeholders.
 * @returns The formatted message; `format` unchanged if it has no placeholder.
 */
function formatEchoTranscript(transcript: string, format: string): string {
  return format.split(ECHO_TRANSCRIPT_PLACEHOLDER).join(transcript);
}

/**
 * Sends the transcript echo back to the chat the audio came from.
 *
 * The channel is taken from `ctx.Provider`, falling back to `ctx.Surface`; the
 * recipient from `ctx.OriginatingTo`, falling back to `ctx.From`. The echo is
 * skipped (with a verbose log) when either is missing or when the normalized
 * channel is not accepted by `isDeliverableMessageChannel`.
 *
 * Delivery is best-effort: errors from loading or calling
 * `deliverOutboundPayloads` are caught and logged, so a failed echo never
 * interrupts the caller.
 *
 * @param params.ctx Inbound message context used to resolve channel, recipient,
 *   account and thread.
 * @param params.cfg Config forwarded to the outbound delivery call.
 * @param params.transcript Transcribed text to echo.
 * @param params.format Format string containing the `{transcript}` placeholder.
 */
async function sendTranscriptEcho(params: {
  ctx: MsgContext;
  cfg: OpenClawConfig;
  transcript: string;
  format: string;
}): Promise<void> {
  const { ctx, cfg, transcript, format } = params;
  const channel = ctx.Provider ?? ctx.Surface ?? "";
  const to = ctx.OriginatingTo ?? ctx.From ?? "";

  if (!channel || !to) {
    if (shouldLogVerbose()) {
      logVerbose("media: echo-transcript skipped (no channel/to resolved from ctx)");
    }
    return;
  }

  const normalizedChannel = channel.trim().toLowerCase();
  if (!isDeliverableMessageChannel(normalizedChannel)) {
    if (shouldLogVerbose()) {
      logVerbose(
        `media: echo-transcript skipped (channel "${String(normalizedChannel)}" is not deliverable)`,
      );
    }
    return;
  }

  const text = formatEchoTranscript(transcript, format);

  try {
    // Loaded on demand rather than at module top level.
    // NOTE(review): presumably to keep the outbound delivery module off the
    // import path unless an echo is actually sent — confirm.
    const { deliverOutboundPayloads } = await import("../infra/outbound/deliver.js");
    await deliverOutboundPayloads({
      cfg,
      channel: normalizedChannel,
      to,
      accountId: ctx.AccountId ?? undefined,
      threadId: ctx.MessageThreadId ?? undefined,
      payloads: [{ text }],
      bestEffort: true,
    });
    if (shouldLogVerbose()) {
      logVerbose(`media: echo-transcript sent to ${normalizedChannel}/${to}`);
    }
  } catch (err) {
    // Swallow on purpose: the echo is a courtesy and must not block transcription.
    logVerbose(`media: echo-transcript delivery failed: ${String(err)}`);
  }
}