From 1b95220a99e07df5baedf02d128fd9087b0a9cb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Azade=20=F0=9F=90=90?= <azade@hey.com>
Date: Sat, 14 Feb 2026 02:03:02 +0000
Subject: [PATCH] fix(media): recognize MP3 and M4A as voice-compatible audio
 (#15438)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(media): recognize MP3 and M4A as voice-compatible audio

Telegram sendVoice supports OGG/Opus, MP3, and M4A, but
isVoiceCompatibleAudio only recognized OGG/Opus formats.

- Add MP3 and M4A extensions and MIME types
- Use explicit MIME set instead of substring matching
- Handle MIME parameters (e.g. 'audio/ogg; codecs=opus')
- Add test coverage for all supported and unsupported formats

* fix: narrow MIME allowlist per review feedback

Remove audio/mp4 and audio/aac from voice MIME types — too broad.
Keep only M4A-specific types (audio/x-m4a, audio/m4a).
Add audio/mp4 and audio/aac as negative test cases.

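* fix: align voice compatibility and channel coverage (#15438) (thanks @azade-c)

Restore audio/mp4 to the voice MIME allowlist so M4A files reported as
audio/mp4 are treated as voice-compatible; audio/aac stays excluded.
Add Matrix and Telegram send tests covering both the voice path (MP3/M4A)
and the fallback to a regular audio send (WAV).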

---------

Co-authored-by: Peter Steinberger <steipete@gmail.com>
---
 CHANGELOG.md                                  |  1 +
 extensions/matrix/src/matrix/send.test.ts     | 68 ++++++++++++++++++-
 src/media/audio.test.ts                       | 43 ++++++++++++
 src/media/audio.ts                            | 26 +++++--
 ...send.returns-undefined-empty-input.test.ts | 39 ++++++++++-
 src/telegram/voice.test.ts                    | 21 +++++-
 6 files changed, 187 insertions(+), 11 deletions(-)
 create mode 100644 src/media/audio.test.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d1c4347f344..2d0aa6c56b5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -71,6 +71,7 @@ Docs: https://docs.openclaw.ai
 - Exec/Allowlist: allow multiline heredoc bodies (`<<`, `<<-`) while keeping multiline non-heredoc shell commands blocked, so exec approval parsing permits heredoc input safely without allowing general newline command chaining. (#13811) Thanks @mcaxtr.
 - Docs/Mermaid: remove hardcoded Mermaid init theme blocks from four docs diagrams so dark mode inherits readable theme defaults. (#15157) Thanks @heytulsiprasad.
 - Outbound/Threading: pass `replyTo` and `threadId` from `message send` tool actions through the core outbound send path to channel adapters, preserving thread/reply routing. (#14948) Thanks @mcaxtr.
+- Telegram/Matrix: treat MP3 and M4A (including `audio/mp4`) as voice-compatible for `asVoice` routing, and keep WAV/AAC falling back to regular audio sends. (#15438) Thanks @azade-c.
 - Sessions/Agents: pass `agentId` when resolving existing transcript paths in reply runs so non-default agents and heartbeat/chat handlers no longer fail with `Session file path must be within sessions directory`. (#15141) Thanks @Goldenmonstew.
 - Sessions/Agents: pass `agentId` through status and usage transcript-resolution paths (auto-reply, gateway usage APIs, and session cost/log loaders) so non-default agents can resolve absolute session files without path-validation failures. (#15103) Thanks @jalehman.
 - Sessions: archive previous transcript files on `/new` and `/reset` session resets (including gateway `sessions.reset`) so stale transcripts do not accumulate on disk. (#14869) Thanks @mcaxtr.
diff --git a/extensions/matrix/src/matrix/send.test.ts b/extensions/matrix/src/matrix/send.test.ts
index 0ebfc826f80..d931d6f9db1 100644
--- a/extensions/matrix/src/matrix/send.test.ts
+++ b/extensions/matrix/src/matrix/send.test.ts
@@ -24,6 +24,8 @@ const loadWebMediaMock = vi.fn().mockResolvedValue({
   contentType: "image/png",
   kind: "image",
 });
+const mediaKindFromMimeMock = vi.fn(() => "image");
+const isVoiceCompatibleAudioMock = vi.fn(() => false);
 const getImageMetadataMock = vi.fn().mockResolvedValue(null);
 const resizeToJpegMock = vi.fn();
 
@@ -33,8 +35,8 @@ const runtimeStub = {
   },
   media: {
     loadWebMedia: (...args: unknown[]) => loadWebMediaMock(...args),
-    mediaKindFromMime: () => "image",
-    isVoiceCompatibleAudio: () => false,
+    mediaKindFromMime: (...args: unknown[]) => mediaKindFromMimeMock(...args),
+    isVoiceCompatibleAudio: (...args: unknown[]) => isVoiceCompatibleAudioMock(...args),
     getImageMetadata: (...args: unknown[]) => getImageMetadataMock(...args),
     resizeToJpeg: (...args: unknown[]) => resizeToJpegMock(...args),
   },
@@ -71,6 +73,8 @@ describe("sendMessageMatrix media", () => {
 
   beforeEach(() => {
     vi.clearAllMocks();
+    mediaKindFromMimeMock.mockReturnValue("image");
+    isVoiceCompatibleAudioMock.mockReturnValue(false);
     setMatrixRuntime(runtimeStub);
   });
 
@@ -133,6 +137,66 @@ describe("sendMessageMatrix media", () => {
     expect(content.url).toBeUndefined();
     expect(content.file?.url).toBe("mxc://example/file");
   });
+
+  it("marks voice metadata and sends caption follow-up when audioAsVoice is compatible", async () => {
+    const { client, sendMessage } = makeClient();
+    mediaKindFromMimeMock.mockReturnValue("audio");
+    isVoiceCompatibleAudioMock.mockReturnValue(true);
+    loadWebMediaMock.mockResolvedValueOnce({
+      buffer: Buffer.from("audio"),
+      fileName: "clip.mp3",
+      contentType: "audio/mpeg",
+      kind: "audio",
+    });
+
+    await sendMessageMatrix("room:!room:example", "voice caption", {
+      client,
+      mediaUrl: "file:///tmp/clip.mp3",
+      audioAsVoice: true,
+    });
+
+    expect(isVoiceCompatibleAudioMock).toHaveBeenCalledWith({
+      contentType: "audio/mpeg",
+      fileName: "clip.mp3",
+    });
+    expect(sendMessage).toHaveBeenCalledTimes(2);
+    const mediaContent = sendMessage.mock.calls[0]?.[1] as {
+      msgtype?: string;
+      body?: string;
+      "org.matrix.msc3245.voice"?: Record<string, never>;
+    };
+    expect(mediaContent.msgtype).toBe("m.audio");
+    expect(mediaContent.body).toBe("Voice message");
+    expect(mediaContent["org.matrix.msc3245.voice"]).toEqual({});
+  });
+
+  it("keeps regular audio payload when audioAsVoice media is incompatible", async () => {
+    const { client, sendMessage } = makeClient();
+    mediaKindFromMimeMock.mockReturnValue("audio");
+    isVoiceCompatibleAudioMock.mockReturnValue(false);
+    loadWebMediaMock.mockResolvedValueOnce({
+      buffer: Buffer.from("audio"),
+      fileName: "clip.wav",
+      contentType: "audio/wav",
+      kind: "audio",
+    });
+
+    await sendMessageMatrix("room:!room:example", "voice caption", {
+      client,
+      mediaUrl: "file:///tmp/clip.wav",
+      audioAsVoice: true,
+    });
+
+    expect(sendMessage).toHaveBeenCalledTimes(1);
+    const mediaContent = sendMessage.mock.calls[0]?.[1] as {
+      msgtype?: string;
+      body?: string;
+      "org.matrix.msc3245.voice"?: Record<string, never>;
+    };
+    expect(mediaContent.msgtype).toBe("m.audio");
+    expect(mediaContent.body).toBe("voice caption");
+    expect(mediaContent["org.matrix.msc3245.voice"]).toBeUndefined();
+  });
 });
 
 describe("sendMessageMatrix threads", () => {
diff --git a/src/media/audio.test.ts b/src/media/audio.test.ts
new file mode 100644
index 00000000000..af25bb69d74
--- /dev/null
+++ b/src/media/audio.test.ts
@@ -0,0 +1,43 @@
+import { describe, expect, it } from "vitest";
+import { isVoiceCompatibleAudio } from "./audio.js";
+
+describe("isVoiceCompatibleAudio", () => {
+  it.each([
+    { contentType: "audio/ogg", fileName: null },
+    { contentType: "audio/opus", fileName: null },
+    { contentType: "audio/ogg; codecs=opus", fileName: null },
+    { contentType: "audio/mpeg", fileName: null },
+    { contentType: "audio/mp3", fileName: null },
+    { contentType: "audio/mp4", fileName: null },
+    { contentType: "audio/mp4; codecs=mp4a.40.2", fileName: null },
+    { contentType: "audio/x-m4a", fileName: null },
+    { contentType: "audio/m4a", fileName: null },
+  ])("returns true for MIME type $contentType", (opts) => {
+    expect(isVoiceCompatibleAudio(opts)).toBe(true);
+  });
+
+  it.each([".ogg", ".oga", ".opus", ".mp3", ".m4a"])("returns true for extension %s", (ext) => {
+    expect(isVoiceCompatibleAudio({ fileName: `voice${ext}` })).toBe(true);
+  });
+
+  it.each([
+    { contentType: "audio/wav", fileName: null },
+    { contentType: "audio/flac", fileName: null },
+    { contentType: "audio/aac", fileName: null },
+    { contentType: "video/mp4", fileName: null },
+  ])("returns false for unsupported MIME $contentType", (opts) => {
+    expect(isVoiceCompatibleAudio(opts)).toBe(false);
+  });
+
+  it.each([".wav", ".flac", ".webm"])("returns false for extension %s", (ext) => {
+    expect(isVoiceCompatibleAudio({ fileName: `audio${ext}` })).toBe(false);
+  });
+
+  it("returns false when no contentType and no fileName", () => {
+    expect(isVoiceCompatibleAudio({})).toBe(false);
+  });
+
+  it("prefers MIME type over extension", () => {
+    expect(isVoiceCompatibleAudio({ contentType: "audio/mpeg", fileName: "file.wav" })).toBe(true);
+  });
+});
diff --git a/src/media/audio.ts b/src/media/audio.ts
index aeca2ce0b53..b632533bbb0 100644
--- a/src/media/audio.ts
+++ b/src/media/audio.ts
@@ -1,14 +1,32 @@
 import { getFileExtension } from "./mime.js";
 
-const VOICE_AUDIO_EXTENSIONS = new Set([".oga", ".ogg", ".opus"]);
+const VOICE_AUDIO_EXTENSIONS = new Set([".oga", ".ogg", ".opus", ".mp3", ".m4a"]);
+
+/**
+ * MIME types compatible with voice messages.
+ * Telegram sendVoice supports OGG/Opus, MP3, and M4A.
+ * https://core.telegram.org/bots/api#sendvoice
+ */
+const VOICE_MIME_TYPES = new Set([
+  "audio/ogg",
+  "audio/opus",
+  "audio/mpeg",
+  "audio/mp3",
+  "audio/mp4",
+  "audio/x-m4a",
+  "audio/m4a",
+]);
 
 export function isVoiceCompatibleAudio(opts: {
   contentType?: string | null;
   fileName?: string | null;
 }): boolean {
-  const mime = opts.contentType?.toLowerCase();
-  if (mime && (mime.includes("ogg") || mime.includes("opus"))) {
-    return true;
+  const mime = opts.contentType?.toLowerCase().trim();
+  if (mime) {
+    const baseMime = mime.split(";")[0].trim();
+    if (VOICE_MIME_TYPES.has(baseMime)) {
+      return true;
+    }
   }
   const fileName = opts.fileName?.trim();
   if (!fileName) {
diff --git a/src/telegram/send.returns-undefined-empty-input.test.ts b/src/telegram/send.returns-undefined-empty-input.test.ts
index a93a1e41b66..b73a3292684 100644
--- a/src/telegram/send.returns-undefined-empty-input.test.ts
+++ b/src/telegram/send.returns-undefined-empty-input.test.ts
@@ -436,6 +436,41 @@ describe("sendMessageTelegram", () => {
       sendVoice: typeof sendVoice;
     };
 
+    loadWebMedia.mockResolvedValueOnce({
+      buffer: Buffer.from("audio"),
+      contentType: "audio/wav",
+      fileName: "clip.wav",
+    });
+
+    await sendMessageTelegram(chatId, "caption", {
+      token: "tok",
+      api,
+      mediaUrl: "https://example.com/clip.wav",
+      asVoice: true,
+    });
+
+    expect(sendAudio).toHaveBeenCalledWith(chatId, expect.anything(), {
+      caption: "caption",
+      parse_mode: "HTML",
+    });
+    expect(sendVoice).not.toHaveBeenCalled();
+  });
+
+  it("sends MP3 as voice when asVoice is true", async () => {
+    const chatId = "123";
+    const sendAudio = vi.fn().mockResolvedValue({
+      message_id: 16,
+      chat: { id: chatId },
+    });
+    const sendVoice = vi.fn().mockResolvedValue({
+      message_id: 17,
+      chat: { id: chatId },
+    });
+    const api = { sendAudio, sendVoice } as unknown as {
+      sendAudio: typeof sendAudio;
+      sendVoice: typeof sendVoice;
+    };
+
     loadWebMedia.mockResolvedValueOnce({
       buffer: Buffer.from("audio"),
       contentType: "audio/mpeg",
@@ -449,11 +484,11 @@ describe("sendMessageTelegram", () => {
       asVoice: true,
     });
 
-    expect(sendAudio).toHaveBeenCalledWith(chatId, expect.anything(), {
+    expect(sendVoice).toHaveBeenCalledWith(chatId, expect.anything(), {
       caption: "caption",
       parse_mode: "HTML",
     });
-    expect(sendVoice).not.toHaveBeenCalled();
+    expect(sendAudio).not.toHaveBeenCalled();
   });
 
   it("includes message_thread_id for forum topic messages", async () => {
diff --git a/src/telegram/voice.test.ts b/src/telegram/voice.test.ts
index e2d96a971bc..bcae3b0f33d 100644
--- a/src/telegram/voice.test.ts
+++ b/src/telegram/voice.test.ts
@@ -18,13 +18,13 @@ describe("resolveTelegramVoiceSend", () => {
     const logFallback = vi.fn();
     const result = resolveTelegramVoiceSend({
       wantsVoice: true,
-      contentType: "audio/mpeg",
-      fileName: "track.mp3",
+      contentType: "audio/wav",
+      fileName: "track.wav",
       logFallback,
     });
     expect(result.useVoice).toBe(false);
     expect(logFallback).toHaveBeenCalledWith(
-      "Telegram voice requested but media is audio/mpeg (track.mp3); sending as audio file instead.",
+      "Telegram voice requested but media is audio/wav (track.wav); sending as audio file instead.",
     );
   });
 
@@ -39,4 +39,19 @@ describe("resolveTelegramVoiceSend", () => {
     expect(result.useVoice).toBe(true);
     expect(logFallback).not.toHaveBeenCalled();
   });
+
+  it.each([
+    { contentType: "audio/mpeg", fileName: "track.mp3" },
+    { contentType: "audio/mp4", fileName: "track.m4a" },
+  ])("keeps voice for compatible MIME $contentType", ({ contentType, fileName }) => {
+    const logFallback = vi.fn();
+    const result = resolveTelegramVoiceSend({
+      wantsVoice: true,
+      contentType,
+      fileName,
+      logFallback,
+    });
+    expect(result.useVoice).toBe(true);
+    expect(logFallback).not.toHaveBeenCalled();
+  });
 });