mirror of
https://github.com/moltbot/moltbot.git
synced 2026-03-21 16:41:56 +00:00
fix(#8127): auto-skip tiny/empty audio files in whisper transcription
Add a minimum file size guard (MIN_AUDIO_FILE_BYTES = 1024) before sending audio to transcription APIs. Files below this threshold are almost certainly empty or corrupt and would cause unhelpful errors from Whisper/Deepgram/Groq providers. Changes: - Add 'tooSmall' skip reason to MediaUnderstandingSkipError - Add MIN_AUDIO_FILE_BYTES constant (1024 bytes) to defaults - Guard both provider and CLI audio paths in runner.ts - Add comprehensive tests for tiny, empty, and valid audio files - Update existing test fixtures to use audio files above threshold
This commit is contained in:
committed by
Peter Steinberger
parent
036bd18e2a
commit
8b70ba6ab8
@@ -58,3 +58,10 @@ export const DEFAULT_IMAGE_MODELS: Record<string, string> = {
|
||||
};
|
||||
// Cap on captured stdout/stderr from spawned media CLI processes.
// NOTE(review): presumably passed as maxBuffer to child_process — confirm in runner.ts.
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;

// NOTE(review): name suggests the default number of media attachments
// processed in parallel — confirm against the scheduler that reads it.
export const DEFAULT_MEDIA_CONCURRENCY = 2;
||||
/**
|
||||
* Minimum audio file size in bytes below which transcription is skipped.
|
||||
* Files smaller than this threshold are almost certainly empty or corrupt
|
||||
* and would cause unhelpful API errors from Whisper/transcription providers.
|
||||
*/
|
||||
export const MIN_AUDIO_FILE_BYTES = 1024;
|
||||
|
||||
@@ -1,4 +1,9 @@
|
||||
export type MediaUnderstandingSkipReason = "maxBytes" | "timeout" | "unsupported" | "empty";
|
||||
export type MediaUnderstandingSkipReason =
|
||||
| "maxBytes"
|
||||
| "timeout"
|
||||
| "unsupported"
|
||||
| "empty"
|
||||
| "tooSmall";
|
||||
|
||||
export class MediaUnderstandingSkipError extends Error {
|
||||
readonly reason: MediaUnderstandingSkipReason;
|
||||
|
||||
@@ -21,6 +21,7 @@ import {
|
||||
CLI_OUTPUT_MAX_BUFFER,
|
||||
DEFAULT_AUDIO_MODELS,
|
||||
DEFAULT_TIMEOUT_SECONDS,
|
||||
MIN_AUDIO_FILE_BYTES,
|
||||
} from "./defaults.js";
|
||||
import { MediaUnderstandingSkipError } from "./errors.js";
|
||||
import { fileExists } from "./fs.js";
|
||||
@@ -446,6 +447,12 @@ export async function runProviderEntry(params: {
|
||||
maxBytes,
|
||||
timeoutMs,
|
||||
});
|
||||
if (media.size < MIN_AUDIO_FILE_BYTES) {
|
||||
throw new MediaUnderstandingSkipError(
|
||||
"tooSmall",
|
||||
`Audio attachment ${params.attachmentIndex + 1} is too small (${media.size} bytes, minimum ${MIN_AUDIO_FILE_BYTES})`,
|
||||
);
|
||||
}
|
||||
const { apiKeys, baseUrl, headers } = await resolveProviderExecutionContext({
|
||||
providerId,
|
||||
cfg,
|
||||
@@ -563,6 +570,15 @@ export async function runCliEntry(params: {
|
||||
maxBytes,
|
||||
timeoutMs,
|
||||
});
|
||||
if (capability === "audio") {
|
||||
const stat = await fs.stat(pathResult.path);
|
||||
if (stat.size < MIN_AUDIO_FILE_BYTES) {
|
||||
throw new MediaUnderstandingSkipError(
|
||||
"tooSmall",
|
||||
`Audio attachment ${params.attachmentIndex + 1} is too small (${stat.size} bytes, minimum ${MIN_AUDIO_FILE_BYTES})`,
|
||||
);
|
||||
}
|
||||
}
|
||||
const outputDir = await fs.mkdtemp(
|
||||
path.join(resolvePreferredOpenClawTmpDir(), "openclaw-media-cli-"),
|
||||
);
|
||||
|
||||
185
src/media-understanding/runner.skip-tiny-audio.test.ts
Normal file
185
src/media-understanding/runner.skip-tiny-audio.test.ts
Normal file
@@ -0,0 +1,185 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import { MIN_AUDIO_FILE_BYTES } from "./defaults.js";
|
||||
import {
|
||||
buildProviderRegistry,
|
||||
createMediaAttachmentCache,
|
||||
normalizeMediaAttachments,
|
||||
runCapability,
|
||||
} from "./runner.js";
|
||||
|
||||
describe("runCapability skips tiny audio files", () => {
|
||||
it("skips audio transcription when file is smaller than MIN_AUDIO_FILE_BYTES", async () => {
|
||||
const originalPath = process.env.PATH;
|
||||
process.env.PATH = "/usr/bin:/bin";
|
||||
|
||||
// Create a tiny audio file (well below the 1KB threshold)
|
||||
const tmpPath = path.join(os.tmpdir(), `openclaw-tiny-audio-${Date.now()}.wav`);
|
||||
const tinyBuffer = Buffer.alloc(100); // 100 bytes, way below 1024
|
||||
await fs.writeFile(tmpPath, tinyBuffer);
|
||||
|
||||
const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
|
||||
const media = normalizeMediaAttachments(ctx);
|
||||
const cache = createMediaAttachmentCache(media);
|
||||
|
||||
let transcribeCalled = false;
|
||||
const providerRegistry = buildProviderRegistry({
|
||||
openai: {
|
||||
id: "openai",
|
||||
capabilities: ["audio"],
|
||||
transcribeAudio: async (req) => {
|
||||
transcribeCalled = true;
|
||||
return { text: "should not happen", model: req.model };
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const cfg = {
|
||||
models: {
|
||||
providers: {
|
||||
openai: {
|
||||
apiKey: "test-key",
|
||||
models: [],
|
||||
},
|
||||
},
|
||||
},
|
||||
} as unknown as OpenClawConfig;
|
||||
|
||||
try {
|
||||
const result = await runCapability({
|
||||
capability: "audio",
|
||||
cfg,
|
||||
ctx,
|
||||
attachments: cache,
|
||||
media,
|
||||
providerRegistry,
|
||||
});
|
||||
|
||||
// The provider should never be called
|
||||
expect(transcribeCalled).toBe(false);
|
||||
|
||||
// The result should indicate the attachment was skipped
|
||||
expect(result.outputs).toHaveLength(0);
|
||||
expect(result.decision.outcome).toBe("skipped");
|
||||
expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("skipped");
|
||||
expect(result.decision.attachments[0]?.attempts[0]?.reason).toContain("tooSmall");
|
||||
} finally {
|
||||
process.env.PATH = originalPath;
|
||||
await cache.cleanup();
|
||||
await fs.unlink(tmpPath).catch(() => {});
|
||||
}
|
||||
});
|
||||
|
||||
it("skips audio transcription for empty (0-byte) files", async () => {
|
||||
const originalPath = process.env.PATH;
|
||||
process.env.PATH = "/usr/bin:/bin";
|
||||
|
||||
const tmpPath = path.join(os.tmpdir(), `openclaw-empty-audio-${Date.now()}.ogg`);
|
||||
await fs.writeFile(tmpPath, Buffer.alloc(0));
|
||||
|
||||
const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/ogg" };
|
||||
const media = normalizeMediaAttachments(ctx);
|
||||
const cache = createMediaAttachmentCache(media);
|
||||
|
||||
let transcribeCalled = false;
|
||||
const providerRegistry = buildProviderRegistry({
|
||||
openai: {
|
||||
id: "openai",
|
||||
capabilities: ["audio"],
|
||||
transcribeAudio: async () => {
|
||||
transcribeCalled = true;
|
||||
return { text: "nope", model: "whisper-1" };
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const cfg = {
|
||||
models: {
|
||||
providers: {
|
||||
openai: {
|
||||
apiKey: "test-key",
|
||||
models: [],
|
||||
},
|
||||
},
|
||||
},
|
||||
} as unknown as OpenClawConfig;
|
||||
|
||||
try {
|
||||
const result = await runCapability({
|
||||
capability: "audio",
|
||||
cfg,
|
||||
ctx,
|
||||
attachments: cache,
|
||||
media,
|
||||
providerRegistry,
|
||||
});
|
||||
|
||||
expect(transcribeCalled).toBe(false);
|
||||
expect(result.outputs).toHaveLength(0);
|
||||
} finally {
|
||||
process.env.PATH = originalPath;
|
||||
await cache.cleanup();
|
||||
await fs.unlink(tmpPath).catch(() => {});
|
||||
}
|
||||
});
|
||||
|
||||
it("proceeds with transcription when file meets minimum size", async () => {
|
||||
const originalPath = process.env.PATH;
|
||||
process.env.PATH = "/usr/bin:/bin";
|
||||
|
||||
const tmpPath = path.join(os.tmpdir(), `openclaw-ok-audio-${Date.now()}.wav`);
|
||||
const okBuffer = Buffer.alloc(MIN_AUDIO_FILE_BYTES + 100);
|
||||
await fs.writeFile(tmpPath, okBuffer);
|
||||
|
||||
const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
|
||||
const media = normalizeMediaAttachments(ctx);
|
||||
const cache = createMediaAttachmentCache(media);
|
||||
|
||||
let transcribeCalled = false;
|
||||
const providerRegistry = buildProviderRegistry({
|
||||
openai: {
|
||||
id: "openai",
|
||||
capabilities: ["audio"],
|
||||
transcribeAudio: async (req) => {
|
||||
transcribeCalled = true;
|
||||
return { text: "hello world", model: req.model };
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const cfg = {
|
||||
models: {
|
||||
providers: {
|
||||
openai: {
|
||||
apiKey: "test-key",
|
||||
models: [],
|
||||
},
|
||||
},
|
||||
},
|
||||
} as unknown as OpenClawConfig;
|
||||
|
||||
try {
|
||||
const result = await runCapability({
|
||||
capability: "audio",
|
||||
cfg,
|
||||
ctx,
|
||||
attachments: cache,
|
||||
media,
|
||||
providerRegistry,
|
||||
});
|
||||
|
||||
expect(transcribeCalled).toBe(true);
|
||||
expect(result.outputs).toHaveLength(1);
|
||||
expect(result.outputs[0]?.text).toBe("hello world");
|
||||
expect(result.decision.outcome).toBe("success");
|
||||
} finally {
|
||||
process.env.PATH = originalPath;
|
||||
await cache.cleanup();
|
||||
await fs.unlink(tmpPath).catch(() => {});
|
||||
}
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user