fix(telegram): enable voice-note transcription in DMs and add CLI fallback

The preflight transcription condition only triggered for group chats
(isGroup && requireMention), so voice notes sent in direct messages
were never transcribed — they arrived as raw <media:audio> placeholders.

This patch widens the condition to fire whenever there is audio and no
accompanying text: unconditionally in DMs, and in groups when mention
detection is configured (requireMention with at least one mention regex).

It also adds a fallback path: if the standard media pipeline returns no
transcript (e.g. format mismatch, missing config), OpenClaw now calls
the configured whisper CLI command directly with the audio file, using
the same {{MediaPath}}/{{OutputBase}} template variables from config.

Co-Authored-By: TH <tzhsn.huang@gmail.com>
This commit is contained in:
Winry
2026-02-16 13:00:29 -08:00
committed by Peter Steinberger
parent e8b03a8622
commit c15385fc94

View File

@@ -389,11 +389,12 @@ export const buildTelegramMessageContext = async ({
let bodyText = rawBody;
const hasAudio = allMedia.some((media) => media.contentType?.startsWith("audio/"));
// Preflight audio transcription for mention detection in groups
// This allows voice notes to be checked for mentions before being dropped
// Audio transcription: transcribe voice notes before they reach the agent.
// In groups: enables mention detection in voice notes.
// In DMs: replaces <media:audio> placeholder with transcript text.
let preflightTranscript: string | undefined;
const needsPreflightTranscription =
isGroup && requireMention && hasAudio && !hasUserText && mentionRegexes.length > 0;
hasAudio && !hasUserText && (!isGroup || (requireMention && mentionRegexes.length > 0));
if (needsPreflightTranscription) {
try {
@@ -414,6 +415,53 @@ export const buildTelegramMessageContext = async ({
} catch (err) {
logVerbose(`telegram: audio preflight transcription failed: ${String(err)}`);
}
// Fallback: if the media pipeline returned nothing, try calling whisper-cli directly.
// This handles cases where the pipeline's attachment normalization or model resolution
// silently produces no output (e.g. format mismatch, missing config fields).
if (!preflightTranscript && allMedia.length > 0) {
const audioMedia = allMedia.find((m) => m.contentType?.startsWith("audio/"));
if (audioMedia?.path) {
try {
const { execFile } = await import("node:child_process");
const { promisify } = await import("node:util");
const { mkdtemp, readFile, rm } = await import("node:fs/promises");
const { tmpdir } = await import("node:os");
const pathMod = await import("node:path");
const execFileAsync = promisify(execFile);
const audioModels = cfg.tools?.media?.audio?.models;
const cliEntry = audioModels?.find(
(m: { type?: string; command?: string }) => m.type === "cli" || m.command,
);
if (cliEntry?.command) {
const outputDir = await mkdtemp(pathMod.join(tmpdir(), "openclaw-audio-fallback-"));
const outputBase = pathMod.join(outputDir, "out");
const resolvedArgs = (cliEntry.args ?? []).map((a: string) =>
a.replace("{{MediaPath}}", audioMedia.path).replace("{{OutputBase}}", outputBase),
);
try {
await execFileAsync(cliEntry.command, resolvedArgs, {
timeout: 30_000,
maxBuffer: 1024 * 1024,
});
const outputFile = outputBase + ".txt";
const text = (await readFile(outputFile, "utf-8")).trim();
if (text) {
preflightTranscript = text;
logVerbose(
`telegram: audio fallback transcription succeeded (${text.length} chars)`,
);
}
} finally {
await rm(outputDir, { recursive: true, force: true }).catch(() => {});
}
}
} catch (fallbackErr) {
logVerbose(`telegram: audio fallback transcription failed: ${String(fallbackErr)}`);
}
}
}
}
// Replace audio placeholder with transcript when preflight succeeds.