diff --git a/src/telegram/bot-message-context.ts b/src/telegram/bot-message-context.ts index 7961fb469ed..b4ffd2651c9 100644 --- a/src/telegram/bot-message-context.ts +++ b/src/telegram/bot-message-context.ts @@ -389,11 +389,12 @@ export const buildTelegramMessageContext = async ({ let bodyText = rawBody; const hasAudio = allMedia.some((media) => media.contentType?.startsWith("audio/")); - // Preflight audio transcription for mention detection in groups - // This allows voice notes to be checked for mentions before being dropped + // Audio transcription: transcribe voice notes before they reach the agent. + // In groups: enables mention detection in voice notes. + // In DMs: replaces placeholder with transcript text. let preflightTranscript: string | undefined; const needsPreflightTranscription = - isGroup && requireMention && hasAudio && !hasUserText && mentionRegexes.length > 0; + hasAudio && !hasUserText && (!isGroup || (requireMention && mentionRegexes.length > 0)); if (needsPreflightTranscription) { try { @@ -414,6 +415,53 @@ export const buildTelegramMessageContext = async ({ } catch (err) { logVerbose(`telegram: audio preflight transcription failed: ${String(err)}`); } + + // Fallback: if the media pipeline returned nothing, try calling whisper-cli directly. + // This handles cases where the pipeline's attachment normalization or model resolution + // silently produces no output (e.g. format mismatch, missing config fields). 
+    if (!preflightTranscript) {
+      // Fallback path: run a configured audio CLI command directly and read the transcript
+      // it is expected to write to `<OutputBase>.txt`. Any failure is logged and ignored,
+      // leaving `preflightTranscript` unset.
+      const audioMedia = allMedia.find((m) => m.contentType?.startsWith("audio/"));
+      // Capture the path in a const so the narrowed type is still a string inside callbacks.
+      const mediaPath = audioMedia?.path;
+      if (mediaPath) {
+        try {
+          const { execFile } = await import("node:child_process");
+          const { promisify } = await import("node:util");
+          const { mkdtemp, readFile, rm } = await import("node:fs/promises");
+          const { tmpdir } = await import("node:os");
+          const pathMod = await import("node:path");
+          const execFileAsync = promisify(execFile);
+
+          const audioModels = cfg.tools?.media?.audio?.models;
+          // Select on `command` only: an entry tagged `type: "cli"` without a command cannot
+          // be executed and must not shadow a later entry that can.
+          const cliEntry = audioModels?.find((m: { type?: string; command?: string }) =>
+            Boolean(m.command),
+          );
+          if (cliEntry?.command) {
+            const outputDir = await mkdtemp(pathMod.join(tmpdir(), "openclaw-audio-fallback-"));
+            const outputBase = pathMod.join(outputDir, "out");
+            const substitutions: Record<string, string> = {
+              MediaPath: mediaPath,
+              OutputBase: outputBase,
+            };
+            // One global pass with a function replacer: every occurrence is substituted, and
+            // `$`-sequences in file paths are inserted literally instead of being treated as
+            // replacement patterns (`$&`, `$$`, ...).
+            const resolvedArgs = (cliEntry.args ?? []).map((a: string) =>
+              a.replace(
+                /\{\{(MediaPath|OutputBase)\}\}/g,
+                (placeholder: string, key: string) => substitutions[key] ?? placeholder,
+              ),
+            );
+            try {
+              await execFileAsync(cliEntry.command, resolvedArgs, {
+                timeout: 30_000,
+                maxBuffer: 1024 * 1024,
+              });
+              const text = (await readFile(`${outputBase}.txt`, "utf-8")).trim();
+              if (text) {
+                preflightTranscript = text;
+                logVerbose(
+                  `telegram: audio fallback transcription succeeded (${text.length} chars)`,
+                );
+              }
+            } finally {
+              // Best-effort cleanup: never let a cleanup error mask the transcription outcome.
+              await rm(outputDir, { recursive: true, force: true }).catch((cleanupErr: unknown) => {
+                logVerbose(`telegram: audio fallback cleanup failed: ${String(cleanupErr)}`);
+              });
+            }
+          }
+        } catch (fallbackErr) {
+          logVerbose(`telegram: audio fallback transcription failed: ${String(fallbackErr)}`);
+        }
+      }
+    }
   }
 
   // Replace audio placeholder with transcript when preflight succeeds.