From 1b95220a99e07df5baedf02d128fd9087b0a9cb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Azade=20=F0=9F=90=90?= Date: Sat, 14 Feb 2026 02:03:02 +0000 Subject: [PATCH] fix(media): recognize MP3 and M4A as voice-compatible audio (#15438) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(media): recognize MP3 and M4A as voice-compatible audio Telegram sendVoice supports OGG/Opus, MP3, and M4A, but isVoiceCompatibleAudio only recognized OGG/Opus formats. - Add MP3 and M4A extensions and MIME types - Use explicit MIME set instead of substring matching - Handle MIME parameters (e.g. 'audio/ogg; codecs=opus') - Add test coverage for all supported and unsupported formats * fix: narrow MIME allowlist per review feedback Remove audio/mp4 and audio/aac from voice MIME types — too broad. Keep only M4A-specific types (audio/x-m4a, audio/m4a). Add audio/mp4 and audio/aac as negative test cases. * fix: align voice compatibility and channel coverage (#15438) (thanks @azade-c) --------- Co-authored-by: Peter Steinberger --- CHANGELOG.md | 1 + extensions/matrix/src/matrix/send.test.ts | 68 ++++++++++++++++++- src/media/audio.test.ts | 43 ++++++++++++ src/media/audio.ts | 26 +++++-- ...send.returns-undefined-empty-input.test.ts | 39 ++++++++++- src/telegram/voice.test.ts | 21 +++++- 6 files changed, 187 insertions(+), 11 deletions(-) create mode 100644 src/media/audio.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index d1c4347f344..2d0aa6c56b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,7 @@ Docs: https://docs.openclaw.ai - Exec/Allowlist: allow multiline heredoc bodies (`<<`, `<<-`) while keeping multiline non-heredoc shell commands blocked, so exec approval parsing permits heredoc input safely without allowing general newline command chaining. (#13811) Thanks @mcaxtr. - Docs/Mermaid: remove hardcoded Mermaid init theme blocks from four docs diagrams so dark mode inherits readable theme defaults. 
(#15157) Thanks @heytulsiprasad. - Outbound/Threading: pass `replyTo` and `threadId` from `message send` tool actions through the core outbound send path to channel adapters, preserving thread/reply routing. (#14948) Thanks @mcaxtr. +- Telegram/Matrix: treat MP3 and M4A (including `audio/mp4`) as voice-compatible for `asVoice` routing, and keep WAV/AAC falling back to regular audio sends. (#15438) Thanks @azade-c. - Sessions/Agents: pass `agentId` when resolving existing transcript paths in reply runs so non-default agents and heartbeat/chat handlers no longer fail with `Session file path must be within sessions directory`. (#15141) Thanks @Goldenmonstew. - Sessions/Agents: pass `agentId` through status and usage transcript-resolution paths (auto-reply, gateway usage APIs, and session cost/log loaders) so non-default agents can resolve absolute session files without path-validation failures. (#15103) Thanks @jalehman. - Sessions: archive previous transcript files on `/new` and `/reset` session resets (including gateway `sessions.reset`) so stale transcripts do not accumulate on disk. (#14869) Thanks @mcaxtr. 
diff --git a/extensions/matrix/src/matrix/send.test.ts b/extensions/matrix/src/matrix/send.test.ts index 0ebfc826f80..d931d6f9db1 100644 --- a/extensions/matrix/src/matrix/send.test.ts +++ b/extensions/matrix/src/matrix/send.test.ts @@ -24,6 +24,8 @@ const loadWebMediaMock = vi.fn().mockResolvedValue({ contentType: "image/png", kind: "image", }); +const mediaKindFromMimeMock = vi.fn(() => "image"); +const isVoiceCompatibleAudioMock = vi.fn(() => false); const getImageMetadataMock = vi.fn().mockResolvedValue(null); const resizeToJpegMock = vi.fn(); @@ -33,8 +35,8 @@ const runtimeStub = { }, media: { loadWebMedia: (...args: unknown[]) => loadWebMediaMock(...args), - mediaKindFromMime: () => "image", - isVoiceCompatibleAudio: () => false, + mediaKindFromMime: (...args: unknown[]) => mediaKindFromMimeMock(...args), + isVoiceCompatibleAudio: (...args: unknown[]) => isVoiceCompatibleAudioMock(...args), getImageMetadata: (...args: unknown[]) => getImageMetadataMock(...args), resizeToJpeg: (...args: unknown[]) => resizeToJpegMock(...args), }, @@ -71,6 +73,8 @@ describe("sendMessageMatrix media", () => { beforeEach(() => { vi.clearAllMocks(); + mediaKindFromMimeMock.mockReturnValue("image"); + isVoiceCompatibleAudioMock.mockReturnValue(false); setMatrixRuntime(runtimeStub); }); @@ -133,6 +137,66 @@ describe("sendMessageMatrix media", () => { expect(content.url).toBeUndefined(); expect(content.file?.url).toBe("mxc://example/file"); }); + + it("marks voice metadata and sends caption follow-up when audioAsVoice is compatible", async () => { + const { client, sendMessage } = makeClient(); + mediaKindFromMimeMock.mockReturnValue("audio"); + isVoiceCompatibleAudioMock.mockReturnValue(true); + loadWebMediaMock.mockResolvedValueOnce({ + buffer: Buffer.from("audio"), + fileName: "clip.mp3", + contentType: "audio/mpeg", + kind: "audio", + }); + + await sendMessageMatrix("room:!room:example", "voice caption", { + client, + mediaUrl: "file:///tmp/clip.mp3", + audioAsVoice: true, + }); 
+ + expect(isVoiceCompatibleAudioMock).toHaveBeenCalledWith({ + contentType: "audio/mpeg", + fileName: "clip.mp3", + }); + expect(sendMessage).toHaveBeenCalledTimes(2); + const mediaContent = sendMessage.mock.calls[0]?.[1] as { + msgtype?: string; + body?: string; + "org.matrix.msc3245.voice"?: Record<string, unknown>; + }; + expect(mediaContent.msgtype).toBe("m.audio"); + expect(mediaContent.body).toBe("Voice message"); + expect(mediaContent["org.matrix.msc3245.voice"]).toEqual({}); + }); + + it("keeps regular audio payload when audioAsVoice media is incompatible", async () => { + const { client, sendMessage } = makeClient(); + mediaKindFromMimeMock.mockReturnValue("audio"); + isVoiceCompatibleAudioMock.mockReturnValue(false); + loadWebMediaMock.mockResolvedValueOnce({ + buffer: Buffer.from("audio"), + fileName: "clip.wav", + contentType: "audio/wav", + kind: "audio", + }); + + await sendMessageMatrix("room:!room:example", "voice caption", { + client, + mediaUrl: "file:///tmp/clip.wav", + audioAsVoice: true, + }); + + expect(sendMessage).toHaveBeenCalledTimes(1); + const mediaContent = sendMessage.mock.calls[0]?.[1] as { + msgtype?: string; + body?: string; + "org.matrix.msc3245.voice"?: Record<string, unknown>; + }; + expect(mediaContent.msgtype).toBe("m.audio"); + expect(mediaContent.body).toBe("voice caption"); + expect(mediaContent["org.matrix.msc3245.voice"]).toBeUndefined(); + }); }); describe("sendMessageMatrix threads", () => { diff --git a/src/media/audio.test.ts b/src/media/audio.test.ts new file mode 100644 index 00000000000..af25bb69d74 --- /dev/null +++ b/src/media/audio.test.ts @@ -0,0 +1,43 @@ +import { describe, expect, it } from "vitest"; +import { isVoiceCompatibleAudio } from "./audio.js"; + +describe("isVoiceCompatibleAudio", () => { + it.each([ + { contentType: "audio/ogg", fileName: null }, + { contentType: "audio/opus", fileName: null }, + { contentType: "audio/ogg; codecs=opus", fileName: null }, + { contentType: "audio/mpeg", fileName: null }, + { contentType: "audio/mp3", 
fileName: null }, + { contentType: "audio/mp4", fileName: null }, + { contentType: "audio/mp4; codecs=mp4a.40.2", fileName: null }, + { contentType: "audio/x-m4a", fileName: null }, + { contentType: "audio/m4a", fileName: null }, + ])("returns true for MIME type $contentType", (opts) => { + expect(isVoiceCompatibleAudio(opts)).toBe(true); + }); + + it.each([".ogg", ".oga", ".opus", ".mp3", ".m4a"])("returns true for extension %s", (ext) => { + expect(isVoiceCompatibleAudio({ fileName: `voice${ext}` })).toBe(true); + }); + + it.each([ + { contentType: "audio/wav", fileName: null }, + { contentType: "audio/flac", fileName: null }, + { contentType: "audio/aac", fileName: null }, + { contentType: "video/mp4", fileName: null }, + ])("returns false for unsupported MIME $contentType", (opts) => { + expect(isVoiceCompatibleAudio(opts)).toBe(false); + }); + + it.each([".wav", ".flac", ".webm"])("returns false for extension %s", (ext) => { + expect(isVoiceCompatibleAudio({ fileName: `audio${ext}` })).toBe(false); + }); + + it("returns false when no contentType and no fileName", () => { + expect(isVoiceCompatibleAudio({})).toBe(false); + }); + + it("prefers MIME type over extension", () => { + expect(isVoiceCompatibleAudio({ contentType: "audio/mpeg", fileName: "file.wav" })).toBe(true); + }); +}); diff --git a/src/media/audio.ts b/src/media/audio.ts index aeca2ce0b53..b632533bbb0 100644 --- a/src/media/audio.ts +++ b/src/media/audio.ts @@ -1,14 +1,32 @@ import { getFileExtension } from "./mime.js"; -const VOICE_AUDIO_EXTENSIONS = new Set([".oga", ".ogg", ".opus"]); +const VOICE_AUDIO_EXTENSIONS = new Set([".oga", ".ogg", ".opus", ".mp3", ".m4a"]); + +/** + * MIME types compatible with voice messages. + * Telegram sendVoice supports OGG/Opus, MP3, and M4A. 
+ * https://core.telegram.org/bots/api#sendvoice + */ +const VOICE_MIME_TYPES = new Set([ + "audio/ogg", + "audio/opus", + "audio/mpeg", + "audio/mp3", + "audio/mp4", + "audio/x-m4a", + "audio/m4a", +]); export function isVoiceCompatibleAudio(opts: { contentType?: string | null; fileName?: string | null; }): boolean { - const mime = opts.contentType?.toLowerCase(); - if (mime && (mime.includes("ogg") || mime.includes("opus"))) { - return true; + const mime = opts.contentType?.toLowerCase().trim(); + if (mime) { + const baseMime = mime.split(";")[0].trim(); + if (VOICE_MIME_TYPES.has(baseMime)) { + return true; + } } const fileName = opts.fileName?.trim(); if (!fileName) { diff --git a/src/telegram/send.returns-undefined-empty-input.test.ts b/src/telegram/send.returns-undefined-empty-input.test.ts index a93a1e41b66..b73a3292684 100644 --- a/src/telegram/send.returns-undefined-empty-input.test.ts +++ b/src/telegram/send.returns-undefined-empty-input.test.ts @@ -436,6 +436,41 @@ describe("sendMessageTelegram", () => { sendVoice: typeof sendVoice; }; + loadWebMedia.mockResolvedValueOnce({ + buffer: Buffer.from("audio"), + contentType: "audio/wav", + fileName: "clip.wav", + }); + + await sendMessageTelegram(chatId, "caption", { + token: "tok", + api, + mediaUrl: "https://example.com/clip.wav", + asVoice: true, + }); + + expect(sendAudio).toHaveBeenCalledWith(chatId, expect.anything(), { + caption: "caption", + parse_mode: "HTML", + }); + expect(sendVoice).not.toHaveBeenCalled(); + }); + + it("sends MP3 as voice when asVoice is true", async () => { + const chatId = "123"; + const sendAudio = vi.fn().mockResolvedValue({ + message_id: 16, + chat: { id: chatId }, + }); + const sendVoice = vi.fn().mockResolvedValue({ + message_id: 17, + chat: { id: chatId }, + }); + const api = { sendAudio, sendVoice } as unknown as { + sendAudio: typeof sendAudio; + sendVoice: typeof sendVoice; + }; + loadWebMedia.mockResolvedValueOnce({ buffer: Buffer.from("audio"), contentType: 
"audio/mpeg", @@ -449,11 +484,11 @@ describe("sendMessageTelegram", () => { asVoice: true, }); - expect(sendAudio).toHaveBeenCalledWith(chatId, expect.anything(), { + expect(sendVoice).toHaveBeenCalledWith(chatId, expect.anything(), { caption: "caption", parse_mode: "HTML", }); - expect(sendVoice).not.toHaveBeenCalled(); + expect(sendAudio).not.toHaveBeenCalled(); }); it("includes message_thread_id for forum topic messages", async () => { diff --git a/src/telegram/voice.test.ts b/src/telegram/voice.test.ts index e2d96a971bc..bcae3b0f33d 100644 --- a/src/telegram/voice.test.ts +++ b/src/telegram/voice.test.ts @@ -18,13 +18,13 @@ describe("resolveTelegramVoiceSend", () => { const logFallback = vi.fn(); const result = resolveTelegramVoiceSend({ wantsVoice: true, - contentType: "audio/mpeg", - fileName: "track.mp3", + contentType: "audio/wav", + fileName: "track.wav", logFallback, }); expect(result.useVoice).toBe(false); expect(logFallback).toHaveBeenCalledWith( - "Telegram voice requested but media is audio/mpeg (track.mp3); sending as audio file instead.", + "Telegram voice requested but media is audio/wav (track.wav); sending as audio file instead.", ); }); @@ -39,4 +39,19 @@ describe("resolveTelegramVoiceSend", () => { expect(result.useVoice).toBe(true); expect(logFallback).not.toHaveBeenCalled(); }); + + it.each([ + { contentType: "audio/mpeg", fileName: "track.mp3" }, + { contentType: "audio/mp4", fileName: "track.m4a" }, + ])("keeps voice for compatible MIME $contentType", ({ contentType, fileName }) => { + const logFallback = vi.fn(); + const result = resolveTelegramVoiceSend({ + wantsVoice: true, + contentType, + fileName, + logFallback, + }); + expect(result.useVoice).toBe(true); + expect(logFallback).not.toHaveBeenCalled(); + }); });