diff --git a/src/telegram/bot-message-context.ts b/src/telegram/bot-message-context.ts
index 7961fb469ed..b4ffd2651c9 100644
--- a/src/telegram/bot-message-context.ts
+++ b/src/telegram/bot-message-context.ts
@@ -389,11 +389,12 @@ export const buildTelegramMessageContext = async ({
   let bodyText = rawBody;
   const hasAudio = allMedia.some((media) => media.contentType?.startsWith("audio/"));
 
-  // Preflight audio transcription for mention detection in groups
-  // This allows voice notes to be checked for mentions before being dropped
+  // Audio transcription: transcribe voice notes before they reach the agent.
+  // In groups: enables mention detection in voice notes.
+  // In DMs: replaces <media:audio> placeholder with transcript text.
   let preflightTranscript: string | undefined;
   const needsPreflightTranscription =
-    isGroup && requireMention && hasAudio && !hasUserText && mentionRegexes.length > 0;
+    hasAudio && !hasUserText && (!isGroup || (requireMention && mentionRegexes.length > 0));
 
   if (needsPreflightTranscription) {
     try {
@@ -414,6 +415,53 @@ export const buildTelegramMessageContext = async ({
     } catch (err) {
       logVerbose(`telegram: audio preflight transcription failed: ${String(err)}`);
     }
+
+    // Fallback: if the media pipeline returned nothing, invoke the configured audio CLI
+    // (e.g. whisper-cli) directly. This handles cases where the pipeline's attachment normalization
+    // or model resolution silently produces no output (e.g. format mismatch, missing config fields).
+    if (!preflightTranscript && allMedia.length > 0) {
+      const audioMedia = allMedia.find((m) => m.contentType?.startsWith("audio/"));
+      if (audioMedia?.path) {
+        try {
+          const { execFile } = await import("node:child_process");
+          const { promisify } = await import("node:util");
+          const { mkdtemp, readFile, rm } = await import("node:fs/promises");
+          const { tmpdir } = await import("node:os");
+          const pathMod = await import("node:path");
+          const execFileAsync = promisify(execFile);
+
+          const audioModels = cfg.tools?.media?.audio?.models;
+          const cliEntry = audioModels?.find(
+            (m: { type?: string; command?: string }) => m.type === "cli" || m.command,
+          );
+          if (cliEntry?.command) {
+            const outputDir = await mkdtemp(pathMod.join(tmpdir(), "openclaw-audio-fallback-"));
+            const outputBase = pathMod.join(outputDir, "out");
+            const resolvedArgs = (cliEntry.args ?? []).map((a: string) =>
+              a.replace("{{MediaPath}}", audioMedia.path).replace("{{OutputBase}}", outputBase),
+            );
+            try {
+              await execFileAsync(cliEntry.command, resolvedArgs, {
+                timeout: 30_000,
+                maxBuffer: 1024 * 1024,
+              });
+              const outputFile = outputBase + ".txt";
+              const text = (await readFile(outputFile, "utf-8")).trim();
+              if (text) {
+                preflightTranscript = text;
+                logVerbose(
+                  `telegram: audio fallback transcription succeeded (${text.length} chars)`,
+                );
+              }
+            } finally {
+              await rm(outputDir, { recursive: true, force: true }).catch(() => {});
+            }
+          }
+        } catch (fallbackErr) {
+          logVerbose(`telegram: audio fallback transcription failed: ${String(fallbackErr)}`);
+        }
+      }
+    }
   }
 
   // Replace audio placeholder with transcript when preflight succeeds.