fix(#8127): auto-skip tiny/empty audio files in whisper transcription

Add a minimum file size guard (MIN_AUDIO_FILE_BYTES = 1024) before sending audio to transcription APIs. Files below this threshold are almost certainly empty or corrupt and would cause unhelpful errors from Whisper/Deepgram/Groq providers. Changes: - Add 'tooSmall' skip reason to MediaUnderstandingSkipError - Add MIN_AUDIO_FILE_BYTES constant (1024 bytes) to defaults - Guard both provider and CLI audio paths in runner.ts - Add comprehensive tests for tiny, empty, and valid audio files - Update existing test fixtures to use audio files above threshold
2026-03-21 16:41:56 +00:00 · 2026-02-03 23:28:13 +00:00
parent 036bd18e2a
commit 8b70ba6ab8
4 changed files with 214 additions and 1 deletions
--- a/src/media-understanding/defaults.ts
+++ b/src/media-understanding/defaults.ts
@@ -58,3 +58,10 @@ export const DEFAULT_IMAGE_MODELS: Record<string, string> = {
 };
 export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
 export const DEFAULT_MEDIA_CONCURRENCY = 2;
+
+/**
+ * Minimum audio file size in bytes below which transcription is skipped.
+ * Files smaller than this threshold are almost certainly empty or corrupt
+ * and would cause unhelpful API errors from Whisper/transcription providers.
+ */
+export const MIN_AUDIO_FILE_BYTES = 1024;
--- a/src/media-understanding/errors.ts
+++ b/src/media-understanding/errors.ts
@@ -1,4 +1,9 @@
-export type MediaUnderstandingSkipReason = "maxBytes" | "timeout" | "unsupported" | "empty";
+export type MediaUnderstandingSkipReason =
+  | "maxBytes"
+  | "timeout"
+  | "unsupported"
+  | "empty"
+  | "tooSmall";

 export class MediaUnderstandingSkipError extends Error {
  readonly reason: MediaUnderstandingSkipReason;
--- a/src/media-understanding/runner.entries.ts
+++ b/src/media-understanding/runner.entries.ts
@@ -21,6 +21,7 @@ import {
  CLI_OUTPUT_MAX_BUFFER,
  DEFAULT_AUDIO_MODELS,
  DEFAULT_TIMEOUT_SECONDS,
+  MIN_AUDIO_FILE_BYTES,
 } from "./defaults.js";
 import { MediaUnderstandingSkipError } from "./errors.js";
 import { fileExists } from "./fs.js";
@@ -446,6 +447,12 @@ export async function runProviderEntry(params: {
      maxBytes,
      timeoutMs,
    });
+    if (media.size < MIN_AUDIO_FILE_BYTES) {
+      throw new MediaUnderstandingSkipError(
+        "tooSmall",
+        `Audio attachment ${params.attachmentIndex + 1} is too small (${media.size} bytes, minimum ${MIN_AUDIO_FILE_BYTES})`,
+      );
+    }
    const { apiKeys, baseUrl, headers } = await resolveProviderExecutionContext({
      providerId,
      cfg,
@@ -563,6 +570,15 @@ export async function runCliEntry(params: {
    maxBytes,
    timeoutMs,
  });
+  if (capability === "audio") {
+    const stat = await fs.stat(pathResult.path);
+    if (stat.size < MIN_AUDIO_FILE_BYTES) {
+      throw new MediaUnderstandingSkipError(
+        "tooSmall",
+        `Audio attachment ${params.attachmentIndex + 1} is too small (${stat.size} bytes, minimum ${MIN_AUDIO_FILE_BYTES})`,
+      );
+    }
+  }
  const outputDir = await fs.mkdtemp(
    path.join(resolvePreferredOpenClawTmpDir(), "openclaw-media-cli-"),
  );
--- a/src/media-understanding/runner.skip-tiny-audio.test.ts
+++ b/src/media-understanding/runner.skip-tiny-audio.test.ts
@@ -0,0 +1,185 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { describe, expect, it } from "vitest";
+import type { MsgContext } from "../auto-reply/templating.js";
+import type { OpenClawConfig } from "../config/config.js";
+import { MIN_AUDIO_FILE_BYTES } from "./defaults.js";
+import {
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
+
+describe("runCapability skips tiny audio files", () => {
+  it("skips audio transcription when file is smaller than MIN_AUDIO_FILE_BYTES", async () => {
+    const originalPath = process.env.PATH;
+    process.env.PATH = "/usr/bin:/bin";
+
+    // Create a tiny audio file (well below the 1KB threshold)
+    const tmpPath = path.join(os.tmpdir(), `openclaw-tiny-audio-${Date.now()}.wav`);
+    const tinyBuffer = Buffer.alloc(100); // 100 bytes, way below 1024
+    await fs.writeFile(tmpPath, tinyBuffer);
+
+    const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
+    const media = normalizeMediaAttachments(ctx);
+    const cache = createMediaAttachmentCache(media);
+
+    let transcribeCalled = false;
+    const providerRegistry = buildProviderRegistry({
+      openai: {
+        id: "openai",
+        capabilities: ["audio"],
+        transcribeAudio: async (req) => {
+          transcribeCalled = true;
+          return { text: "should not happen", model: req.model };
+        },
+      },
+    });
+
+    const cfg = {
+      models: {
+        providers: {
+          openai: {
+            apiKey: "test-key",
+            models: [],
+          },
+        },
+      },
+    } as unknown as OpenClawConfig;
+
+    try {
+      const result = await runCapability({
+        capability: "audio",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry,
+      });
+
+      // The provider should never be called
+      expect(transcribeCalled).toBe(false);
+
+      // The result should indicate the attachment was skipped
+      expect(result.outputs).toHaveLength(0);
+      expect(result.decision.outcome).toBe("skipped");
+      expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("skipped");
+      expect(result.decision.attachments[0]?.attempts[0]?.reason).toContain("tooSmall");
+    } finally {
+      process.env.PATH = originalPath;
+      await cache.cleanup();
+      await fs.unlink(tmpPath).catch(() => {});
+    }
+  });
+
+  it("skips audio transcription for empty (0-byte) files", async () => {
+    const originalPath = process.env.PATH;
+    process.env.PATH = "/usr/bin:/bin";
+
+    const tmpPath = path.join(os.tmpdir(), `openclaw-empty-audio-${Date.now()}.ogg`);
+    await fs.writeFile(tmpPath, Buffer.alloc(0));
+
+    const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/ogg" };
+    const media = normalizeMediaAttachments(ctx);
+    const cache = createMediaAttachmentCache(media);
+
+    let transcribeCalled = false;
+    const providerRegistry = buildProviderRegistry({
+      openai: {
+        id: "openai",
+        capabilities: ["audio"],
+        transcribeAudio: async () => {
+          transcribeCalled = true;
+          return { text: "nope", model: "whisper-1" };
+        },
+      },
+    });
+
+    const cfg = {
+      models: {
+        providers: {
+          openai: {
+            apiKey: "test-key",
+            models: [],
+          },
+        },
+      },
+    } as unknown as OpenClawConfig;
+
+    try {
+      const result = await runCapability({
+        capability: "audio",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry,
+      });
+
+      expect(transcribeCalled).toBe(false);
+      expect(result.outputs).toHaveLength(0);
+    } finally {
+      process.env.PATH = originalPath;
+      await cache.cleanup();
+      await fs.unlink(tmpPath).catch(() => {});
+    }
+  });
+
+  it("proceeds with transcription when file meets minimum size", async () => {
+    const originalPath = process.env.PATH;
+    process.env.PATH = "/usr/bin:/bin";
+
+    const tmpPath = path.join(os.tmpdir(), `openclaw-ok-audio-${Date.now()}.wav`);
+    const okBuffer = Buffer.alloc(MIN_AUDIO_FILE_BYTES + 100);
+    await fs.writeFile(tmpPath, okBuffer);
+
+    const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
+    const media = normalizeMediaAttachments(ctx);
+    const cache = createMediaAttachmentCache(media);
+
+    let transcribeCalled = false;
+    const providerRegistry = buildProviderRegistry({
+      openai: {
+        id: "openai",
+        capabilities: ["audio"],
+        transcribeAudio: async (req) => {
+          transcribeCalled = true;
+          return { text: "hello world", model: req.model };
+        },
+      },
+    });
+
+    const cfg = {
+      models: {
+        providers: {
+          openai: {
+            apiKey: "test-key",
+            models: [],
+          },
+        },
+      },
+    } as unknown as OpenClawConfig;
+
+    try {
+      const result = await runCapability({
+        capability: "audio",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry,
+      });
+
+      expect(transcribeCalled).toBe(true);
+      expect(result.outputs).toHaveLength(1);
+      expect(result.outputs[0]?.text).toBe("hello world");
+      expect(result.decision.outcome).toBe("success");
+    } finally {
+      process.env.PATH = originalPath;
+      await cache.cleanup();
+      await fs.unlink(tmpPath).catch(() => {});
+    }
+  });
+});