diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts
index 67effa90b82..cac7dbf5271 100644
--- a/src/media-understanding/defaults.ts
+++ b/src/media-understanding/defaults.ts
@@ -58,3 +58,10 @@ export const DEFAULT_IMAGE_MODELS: Record = {
 };
 export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
 export const DEFAULT_MEDIA_CONCURRENCY = 2;
+
+/**
+ * Minimum audio file size in bytes below which transcription is skipped.
+ * Files smaller than this threshold are almost certainly empty or corrupt
+ * and would cause unhelpful API errors from Whisper/transcription providers.
+ */
+export const MIN_AUDIO_FILE_BYTES = 1024;
diff --git a/src/media-understanding/errors.ts b/src/media-understanding/errors.ts
index 450dd73250f..8f0b8b78aa0 100644
--- a/src/media-understanding/errors.ts
+++ b/src/media-understanding/errors.ts
@@ -1,4 +1,9 @@
-export type MediaUnderstandingSkipReason = "maxBytes" | "timeout" | "unsupported" | "empty";
+export type MediaUnderstandingSkipReason =
+  | "maxBytes"
+  | "timeout"
+  | "unsupported"
+  | "empty"
+  | "tooSmall";
 
 export class MediaUnderstandingSkipError extends Error {
   readonly reason: MediaUnderstandingSkipReason;
diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts
index 740310affcc..e7665a96e66 100644
--- a/src/media-understanding/runner.entries.ts
+++ b/src/media-understanding/runner.entries.ts
@@ -21,6 +21,7 @@ import {
   CLI_OUTPUT_MAX_BUFFER,
   DEFAULT_AUDIO_MODELS,
   DEFAULT_TIMEOUT_SECONDS,
+  MIN_AUDIO_FILE_BYTES,
 } from "./defaults.js";
 import { MediaUnderstandingSkipError } from "./errors.js";
 import { fileExists } from "./fs.js";
@@ -446,6 +447,12 @@ export async function runProviderEntry(params: {
       maxBytes,
       timeoutMs,
     });
+    if (media.size < MIN_AUDIO_FILE_BYTES) {
+      throw new MediaUnderstandingSkipError(
+        "tooSmall",
+        `Audio attachment ${params.attachmentIndex + 1} is too small (${media.size} bytes, minimum ${MIN_AUDIO_FILE_BYTES})`,
+      );
+    }
     const { apiKeys, baseUrl, headers } = await resolveProviderExecutionContext({
       providerId,
       cfg,
@@ -563,6 +570,15 @@ export async function runCliEntry(params: {
     maxBytes,
     timeoutMs,
   });
+  if (capability === "audio") {
+    const stat = await fs.stat(pathResult.path);
+    if (stat.size < MIN_AUDIO_FILE_BYTES) {
+      throw new MediaUnderstandingSkipError(
+        "tooSmall",
+        `Audio attachment ${params.attachmentIndex + 1} is too small (${stat.size} bytes, minimum ${MIN_AUDIO_FILE_BYTES})`,
+      );
+    }
+  }
   const outputDir = await fs.mkdtemp(
     path.join(resolvePreferredOpenClawTmpDir(), "openclaw-media-cli-"),
   );
diff --git a/src/media-understanding/runner.skip-tiny-audio.test.ts b/src/media-understanding/runner.skip-tiny-audio.test.ts
new file mode 100644
index 00000000000..1af70408f12
--- /dev/null
+++ b/src/media-understanding/runner.skip-tiny-audio.test.ts
@@ -0,0 +1,140 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { describe, expect, it } from "vitest";
+import type { MsgContext } from "../auto-reply/templating.js";
+import type { OpenClawConfig } from "../config/config.js";
+import { MIN_AUDIO_FILE_BYTES } from "./defaults.js";
+import {
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
+
+/** Config shared by all cases: one "openai" provider entry carrying a dummy API key. */
+const cfg = {
+  models: {
+    providers: {
+      openai: {
+        apiKey: "test-key",
+        models: [],
+      },
+    },
+  },
+} as unknown as OpenClawConfig;
+
+/**
+ * Writes a zero-filled temporary audio file of `byteLength` bytes, runs the "audio"
+ * capability against it with a stub "openai" provider, and reports the run result
+ * together with whether the stub was asked to transcribe.
+ *
+ * Every fallible setup step runs inside the `try`, so PATH is restored and the
+ * temporary file is removed even when setup itself throws.
+ */
+async function runAudioCapability(params: {
+  label: string;
+  extension: string;
+  mediaType: string;
+  byteLength: number;
+  transcript: string;
+}) {
+  const originalPath = process.env.PATH;
+  const tmpPath = path.join(
+    os.tmpdir(),
+    `openclaw-${params.label}-audio-${Date.now()}.${params.extension}`,
+  );
+  let cache: ReturnType<typeof createMediaAttachmentCache> | undefined;
+  let transcribeCalled = false;
+
+  try {
+    // NOTE(review): PATH is pinned presumably so locally installed transcription CLIs
+    // are not picked up instead of the stub provider; confirm against the runner.
+    process.env.PATH = "/usr/bin:/bin";
+    await fs.writeFile(tmpPath, Buffer.alloc(params.byteLength));
+
+    const ctx: MsgContext = { MediaPath: tmpPath, MediaType: params.mediaType };
+    const media = normalizeMediaAttachments(ctx);
+    cache = createMediaAttachmentCache(media);
+
+    const providerRegistry = buildProviderRegistry({
+      openai: {
+        id: "openai",
+        capabilities: ["audio"],
+        transcribeAudio: async (req) => {
+          transcribeCalled = true;
+          return { text: params.transcript, model: req.model };
+        },
+      },
+    });
+
+    const result = await runCapability({
+      capability: "audio",
+      cfg,
+      ctx,
+      attachments: cache,
+      media,
+      providerRegistry,
+    });
+    return { result, transcribeCalled };
+  } finally {
+    // Assigning `undefined` to process.env stores the string "undefined", so an
+    // originally unset PATH has to be deleted rather than assigned back.
+    if (originalPath === undefined) {
+      delete process.env.PATH;
+    } else {
+      process.env.PATH = originalPath;
+    }
+    await cache?.cleanup();
+    await fs.unlink(tmpPath).catch(() => {});
+  }
+}
+
+describe("runCapability skips tiny audio files", () => {
+  it("skips audio transcription when file is smaller than MIN_AUDIO_FILE_BYTES", async () => {
+    const { result, transcribeCalled } = await runAudioCapability({
+      label: "tiny",
+      extension: "wav",
+      mediaType: "audio/wav",
+      byteLength: 100, // well below MIN_AUDIO_FILE_BYTES
+      transcript: "should not happen",
+    });
+
+    // The provider should never be called
+    expect(transcribeCalled).toBe(false);
+
+    // The result should indicate the attachment was skipped
+    expect(result.outputs).toHaveLength(0);
+    expect(result.decision.outcome).toBe("skipped");
+    expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("skipped");
+    expect(result.decision.attachments[0]?.attempts[0]?.reason).toContain("tooSmall");
+  });
+
+  it("skips audio transcription for empty (0-byte) files", async () => {
+    const { result, transcribeCalled } = await runAudioCapability({
+      label: "empty",
+      extension: "ogg",
+      mediaType: "audio/ogg",
+      byteLength: 0,
+      transcript: "nope",
+    });
+
+    expect(transcribeCalled).toBe(false);
+    expect(result.outputs).toHaveLength(0);
+  });
+
+  it("proceeds with transcription when file meets minimum size", async () => {
+    const { result, transcribeCalled } = await runAudioCapability({
+      label: "ok",
+      extension: "wav",
+      mediaType: "audio/wav",
+      byteLength: MIN_AUDIO_FILE_BYTES + 100,
+      transcript: "hello world",
+    });
+
+    expect(transcribeCalled).toBe(true);
+    expect(result.outputs).toHaveLength(1);
+    expect(result.outputs[0]?.text).toBe("hello world");
+    expect(result.decision.outcome).toBe("success");
+  });
+});