diff --git a/src/auto-reply/reply/commands-info.ts b/src/auto-reply/reply/commands-info.ts
index 69c80e812e9..1a525150c7c 100644
--- a/src/auto-reply/reply/commands-info.ts
+++ b/src/auto-reply/reply/commands-info.ts
@@ -68,6 +68,7 @@ export const handleStatusCommand: CommandHandler = async (params, allowTextComma
     resolveDefaultThinkingLevel: params.resolveDefaultThinkingLevel,
     isGroup: params.isGroup,
     defaultGroupActivation: params.defaultGroupActivation,
+    mediaDecisions: params.ctx.MediaUnderstandingDecisions,
   });
   return { shouldContinue: false, reply };
 };
diff --git a/src/auto-reply/reply/commands-status.ts b/src/auto-reply/reply/commands-status.ts
index e69b8203753..67081a209ea 100644
--- a/src/auto-reply/reply/commands-status.ts
+++ b/src/auto-reply/reply/commands-status.ts
@@ -24,6 +24,7 @@ import type { ElevatedLevel, ReasoningLevel, ThinkLevel, VerboseLevel } from "..
 import type { ReplyPayload } from "../types.js";
 import type { CommandContext } from "./commands-types.js";
 import { getFollowupQueueDepth, resolveQueueSettings } from "./queue.js";
+import type { MediaUnderstandingDecision } from "../../media-understanding/types.js";
 
 function formatApiKeySnippet(apiKey: string): string {
   const compact = apiKey.replace(/\s+/g, "");
@@ -105,6 +106,7 @@ export async function buildStatusReply(params: {
   resolveDefaultThinkingLevel: () => Promise<ThinkLevel | undefined>;
   isGroup: boolean;
   defaultGroupActivation: () => "always" | "mention";
+  mediaDecisions?: MediaUnderstandingDecision[];
 }): Promise<ReplyPayload | undefined> {
   const {
     cfg,
@@ -200,6 +202,7 @@ export async function buildStatusReply(params: {
       dropPolicy: queueSettings.dropPolicy,
       showDetails: queueOverrides,
     },
+    mediaDecisions: params.mediaDecisions,
     includeTranscriptUsage: false,
   });
 
diff --git a/src/auto-reply/reply/get-reply-directives-apply.ts b/src/auto-reply/reply/get-reply-directives-apply.ts
index dfc1b69233b..0201ca2870e 100644
--- a/src/auto-reply/reply/get-reply-directives-apply.ts
+++ b/src/auto-reply/reply/get-reply-directives-apply.ts
@@ -188,6 +188,7 @@ export async function applyInlineDirectiveOverrides(params: {
         resolveDefaultThinkingLevel: async () => resolvedDefaultThinkLevel,
         isGroup,
         defaultGroupActivation: defaultActivation,
+        mediaDecisions: ctx.MediaUnderstandingDecisions,
       });
     }
     typing.cleanup();
diff --git a/src/auto-reply/reply/get-reply-inline-actions.ts b/src/auto-reply/reply/get-reply-inline-actions.ts
index 2452fdd8f6a..825df34453a 100644
--- a/src/auto-reply/reply/get-reply-inline-actions.ts
+++ b/src/auto-reply/reply/get-reply-inline-actions.ts
@@ -185,6 +185,7 @@ export async function handleInlineActions(params: {
       resolveDefaultThinkingLevel,
       isGroup,
       defaultGroupActivation: defaultActivation,
+      mediaDecisions: ctx.MediaUnderstandingDecisions,
     });
     await sendInlineReply(inlineStatusReply);
     directives = { ...directives, hasStatusDirective: false };
diff --git a/src/auto-reply/status.test.ts b/src/auto-reply/status.test.ts
index 0ce0d5d308d..fd1e4d36f9d 100644
--- a/src/auto-reply/status.test.ts
+++ b/src/auto-reply/status.test.ts
@@ -90,6 +90,59 @@ describe("buildStatusMessage", () => {
     expect(text).toContain("elevated");
   });
 
+  it("includes media understanding decisions when present", () => {
+    const text = buildStatusMessage({
+      agent: { model: "anthropic/claude-opus-4-5" },
+      sessionEntry: { sessionId: "media", updatedAt: 0 },
+      sessionKey: "agent:main:main",
+      queue: { mode: "none" },
+      mediaDecisions: [ // one decision per capability; the status line keeps this order
+        {
+          capability: "image",
+          outcome: "success",
+          attachments: [
+            {
+              attachmentIndex: 0,
+              attempts: [
+                {
+                  type: "provider",
+                  outcome: "success",
+                  provider: "openai",
+                  model: "gpt-5.2",
+                },
+              ],
+              chosen: { // the "ok (provider/model)" label is built from this entry
+                type: "provider",
+                outcome: "success",
+                provider: "openai",
+                model: "gpt-5.2",
+              },
+            },
+          ],
+        },
+        {
+          capability: "audio",
+          outcome: "skipped",
+          attachments: [
+            {
+              attachmentIndex: 1,
+              attempts: [
+                {
+                  type: "provider",
+                  outcome: "skipped",
+                  reason: "maxBytes: too large", // only the text before ":" is shown
+                },
+              ],
+            },
+          ],
+        },
+      ],
+    });
+
+    const normalized = normalizeTestText(text);
+    expect(normalized).toContain("Media: image ok (openai/gpt-5.2) · audio skipped (maxBytes)");
+  });
+
   it("does not show elevated label when session explicitly disables it", () => {
     const text = buildStatusMessage({
       agent: { model: "anthropic/claude-opus-4-5", elevatedDefault: "on" },
diff --git a/src/auto-reply/status.ts b/src/auto-reply/status.ts
index 66e444456f6..a90e7031c3d 100644
--- a/src/auto-reply/status.ts
+++ b/src/auto-reply/status.ts
@@ -24,6 +24,7 @@ import { VERSION } from "../version.js";
 import { listChatCommands, listChatCommandsForConfig } from "./commands-registry.js";
 import type { SkillCommandSpec } from "../agents/skills.js";
 import type { ElevatedLevel, ReasoningLevel, ThinkLevel, VerboseLevel } from "./thinking.js";
+import type { MediaUnderstandingDecision } from "../media-understanding/types.js";
 
 type AgentConfig = Partial<NonNullable<NonNullable<ClawdbotConfig["agents"]>["defaults"]>>;
 
@@ -52,6 +53,7 @@ type StatusArgs = {
   modelAuth?: string;
   usageLine?: string;
   queue?: QueueStatus;
+  mediaDecisions?: MediaUnderstandingDecision[];
   includeTranscriptUsage?: boolean;
   now?: number;
 };
@@ -167,6 +169,42 @@ const formatUsagePair = (input?: number | null, output?: number | null) => {
   return `🧮 Tokens: ${inputLabel} in / ${outputLabel} out`;
 };
 
+/** Renders media decisions as a single "📎 Media: …" status line; null when nothing to show. */
+const formatMediaUnderstandingLine = (decisions?: MediaUnderstandingDecision[]) => {
+  if (!decisions?.length) return null;
+  const labels: string[] = [];
+  for (const { capability, outcome, attachments } of decisions) {
+    const times = attachments.length > 1 ? ` x${attachments.length}` : "";
+    switch (outcome) {
+      case "success": {
+        const picked = attachments.find((item) => item.chosen)?.chosen;
+        const providerId = picked?.provider?.trim();
+        const modelId = picked?.model?.trim();
+        const via = providerId ? ` (${modelId ? `${providerId}/${modelId}` : providerId})` : "";
+        labels.push(`${capability}${times} ok${via}`);
+        break;
+      }
+      case "no-attachment":
+        labels.push(`${capability} none`);
+        break;
+      case "disabled":
+        labels.push(`${capability} off`);
+        break;
+      case "scope-deny":
+        labels.push(`${capability} denied`);
+        break;
+      case "skipped": {
+        // Show only the text before the first ":" of the first non-empty attempt reason.
+        const tried = attachments.flatMap((item) => item.attempts);
+        const brief = tried.find((attempt) => attempt.reason)?.reason?.split(":")[0]?.trim();
+        labels.push(`${capability} skipped${brief ? ` (${brief})` : ""}`);
+        break;
+      }
+    }
+  }
+  return labels.length > 0 ? `📎 Media: ${labels.join(" · ")}` : null;
+};
+
 export function buildStatusMessage(args: StatusArgs): string {
   const now = args.now ?? Date.now();
   const entry = args.sessionEntry;
@@ -320,12 +358,14 @@ export function buildStatusMessage(args: StatusArgs): string {
   const costLine = costLabel ? `💵 Cost: ${costLabel}` : null;
   const usageCostLine =
     usagePair && costLine ? `${usagePair} · ${costLine}` : (usagePair ?? costLine);
+  const mediaLine = formatMediaUnderstandingLine(args.mediaDecisions);
 
   return [
     versionLine,
     modelLine,
     usageCostLine,
     `📚 ${contextLine}`,
+    mediaLine,
     args.usageLine,
     `🧵 ${sessionLine}`,
     `⚙️ ${optionsLine}`,
diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts
index 8a5bac74b7b..d1197ba1a8e 100644
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -1,52 +1,26 @@
 import type { ClawdbotConfig } from "../config/config.js";
 import type { MsgContext } from "../auto-reply/templating.js";
-import { applyTemplate } from "../auto-reply/templating.js";
 import { finalizeInboundContext } from "../auto-reply/reply/inbound-context.js";
-import { resolveApiKeyForProvider } from "../agents/model-auth.js";
-import { logVerbose, shouldLogVerbose } from "../globals.js";
-import { runExec } from "../process/exec.js";
-import type {
-  MediaUnderstandingConfig,
-  MediaUnderstandingModelConfig,
-} from "../config/types.tools.js";
-import { MediaAttachmentCache, normalizeAttachments, selectAttachments } from "./attachments.js";
-import {
-  CLI_OUTPUT_MAX_BUFFER,
-  DEFAULT_AUDIO_MODELS,
-  DEFAULT_TIMEOUT_SECONDS,
-} from "./defaults.js";
-import { isMediaUnderstandingSkipError, MediaUnderstandingSkipError } from "./errors.js";
 import {
   extractMediaUserText,
   formatAudioTranscripts,
   formatMediaUnderstandingBody,
 } from "./format.js";
-import {
-  buildMediaUnderstandingRegistry,
-  getMediaUnderstandingProvider,
-  normalizeMediaProviderId,
-} from "./providers/index.js";
-import { describeImageWithModel } from "./providers/image.js";
-import {
-  resolveCapabilityConfig,
-  inferProviderCapabilities,
-  resolveConcurrency,
-  resolveMaxBytes,
-  resolveMaxChars,
-  resolveModelEntries,
-  resolvePrompt,
-  resolveScopeDecision,
-  resolveTimeoutMs,
-} from "./resolve.js";
 import type {
   MediaUnderstandingCapability,
   MediaUnderstandingDecision,
-  MediaUnderstandingModelDecision,
   MediaUnderstandingOutput,
   MediaUnderstandingProvider,
 } from "./types.js";
 import { runWithConcurrency } from "./concurrency.js";
-import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js";
+import { resolveConcurrency } from "./resolve.js";
+import {
+  type ActiveMediaModel,
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
 
 export type ApplyMediaUnderstandingResult = {
   outputs: MediaUnderstandingOutput[];
@@ -58,476 +32,6 @@ export type ApplyMediaUnderstandingResult = {
 
 const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
 
-type ActiveMediaModel = {
-  provider: string;
-  model?: string;
-};
-
-function trimOutput(text: string, maxChars?: number): string {
-  const trimmed = text.trim();
-  if (!maxChars || trimmed.length <= maxChars) return trimmed;
-  return trimmed.slice(0, maxChars).trim();
-}
-
-function resolveEntriesWithActiveFallback(params: {
-  cfg: ClawdbotConfig;
-  capability: MediaUnderstandingCapability;
-  config?: MediaUnderstandingConfig;
-  activeModel?: ActiveMediaModel;
-}): MediaUnderstandingModelConfig[] {
-  const entries = resolveModelEntries({
-    cfg: params.cfg,
-    capability: params.capability,
-    config: params.config,
-  });
-  if (entries.length > 0) return entries;
-  if (params.config?.enabled !== true) return entries;
-  const activeProvider = params.activeModel?.provider?.trim();
-  if (!activeProvider) return entries;
-  const capabilities = inferProviderCapabilities(activeProvider);
-  if (!capabilities || !capabilities.includes(params.capability)) return entries;
-  return [
-    {
-      type: "provider",
-      provider: activeProvider,
-      model: params.activeModel?.model,
-    },
-  ];
-}
-
-function buildModelDecision(params: {
-  entry: MediaUnderstandingModelConfig;
-  entryType: "provider" | "cli";
-  outcome: MediaUnderstandingModelDecision["outcome"];
-  reason?: string;
-}): MediaUnderstandingModelDecision {
-  if (params.entryType === "cli") {
-    const command = params.entry.command?.trim();
-    return {
-      type: "cli",
-      provider: command ?? "cli",
-      model: params.entry.model ?? command,
-      outcome: params.outcome,
-      reason: params.reason,
-    };
-  }
-  const providerIdRaw = params.entry.provider?.trim();
-  const providerId = providerIdRaw ? normalizeMediaProviderId(providerIdRaw) : undefined;
-  return {
-    type: "provider",
-    provider: providerId ?? providerIdRaw,
-    model: params.entry.model,
-    outcome: params.outcome,
-    reason: params.reason,
-  };
-}
-
-async function runProviderEntry(params: {
-  capability: MediaUnderstandingCapability;
-  entry: MediaUnderstandingModelConfig;
-  cfg: ClawdbotConfig;
-  ctx: MsgContext;
-  attachmentIndex: number;
-  cache: MediaAttachmentCache;
-  agentDir?: string;
-  providerRegistry: Map<string, MediaUnderstandingProvider>;
-  config?: MediaUnderstandingConfig;
-}): Promise<MediaUnderstandingOutput | null> {
-  const { entry, capability, cfg } = params;
-  const providerIdRaw = entry.provider?.trim();
-  if (!providerIdRaw) {
-    throw new Error(`Provider entry missing provider for ${capability}`);
-  }
-  const providerId = normalizeMediaProviderId(providerIdRaw);
-  const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config });
-  const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config });
-  const timeoutMs = resolveTimeoutMs(
-    entry.timeoutSeconds ??
-      params.config?.timeoutSeconds ??
-      cfg.tools?.media?.[capability]?.timeoutSeconds,
-    DEFAULT_TIMEOUT_SECONDS[capability],
-  );
-  const prompt = resolvePrompt(
-    capability,
-    entry.prompt ?? params.config?.prompt ?? cfg.tools?.media?.[capability]?.prompt,
-    maxChars,
-  );
-
-  if (capability === "image") {
-    if (!params.agentDir) {
-      throw new Error("Image understanding requires agentDir");
-    }
-    const modelId = entry.model?.trim();
-    if (!modelId) {
-      throw new Error("Image understanding requires model id");
-    }
-    const media = await params.cache.getBuffer({
-      attachmentIndex: params.attachmentIndex,
-      maxBytes,
-      timeoutMs,
-    });
-    const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
-    const result = provider?.describeImage
-      ? await provider.describeImage({
-          buffer: media.buffer,
-          fileName: media.fileName,
-          mime: media.mime,
-          model: modelId,
-          provider: providerId,
-          prompt,
-          timeoutMs,
-          profile: entry.profile,
-          preferredProfile: entry.preferredProfile,
-          agentDir: params.agentDir,
-          cfg: params.cfg,
-        })
-      : await describeImageWithModel({
-          buffer: media.buffer,
-          fileName: media.fileName,
-          mime: media.mime,
-          model: modelId,
-          provider: providerId,
-          prompt,
-          timeoutMs,
-          profile: entry.profile,
-          preferredProfile: entry.preferredProfile,
-          agentDir: params.agentDir,
-          cfg: params.cfg,
-        });
-    return {
-      kind: "image.description",
-      attachmentIndex: params.attachmentIndex,
-      text: trimOutput(result.text, maxChars),
-      provider: providerId,
-      model: result.model ?? modelId,
-    };
-  }
-
-  const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
-  if (!provider) {
-    throw new Error(`Media provider not available: ${providerId}`);
-  }
-
-  if (capability === "audio") {
-    if (!provider.transcribeAudio) {
-      throw new Error(`Audio transcription provider "${providerId}" not available.`);
-    }
-    const media = await params.cache.getBuffer({
-      attachmentIndex: params.attachmentIndex,
-      maxBytes,
-      timeoutMs,
-    });
-    const key = await resolveApiKeyForProvider({
-      provider: providerId,
-      cfg,
-      profileId: entry.profile,
-      preferredProfile: entry.preferredProfile,
-      agentDir: params.agentDir,
-    });
-    const providerConfig = cfg.models?.providers?.[providerId];
-    const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
-    const result = await provider.transcribeAudio({
-      buffer: media.buffer,
-      fileName: media.fileName,
-      mime: media.mime,
-      apiKey: key.apiKey,
-      baseUrl: providerConfig?.baseUrl,
-      headers: providerConfig?.headers,
-      model,
-      language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
-      prompt,
-      timeoutMs,
-    });
-    return {
-      kind: "audio.transcription",
-      attachmentIndex: params.attachmentIndex,
-      text: trimOutput(result.text, maxChars),
-      provider: providerId,
-      model: result.model ?? model,
-    };
-  }
-
-  if (!provider.describeVideo) {
-    throw new Error(`Video understanding provider "${providerId}" not available.`);
-  }
-  const media = await params.cache.getBuffer({
-    attachmentIndex: params.attachmentIndex,
-    maxBytes,
-    timeoutMs,
-  });
-  const estimatedBase64Bytes = estimateBase64Size(media.size);
-  const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
-  if (estimatedBase64Bytes > maxBase64Bytes) {
-    throw new MediaUnderstandingSkipError(
-      "maxBytes",
-      `Video attachment ${params.attachmentIndex + 1} base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
-    );
-  }
-  const key = await resolveApiKeyForProvider({
-    provider: providerId,
-    cfg,
-    profileId: entry.profile,
-    preferredProfile: entry.preferredProfile,
-    agentDir: params.agentDir,
-  });
-  const providerConfig = cfg.models?.providers?.[providerId];
-  const result = await provider.describeVideo({
-    buffer: media.buffer,
-    fileName: media.fileName,
-    mime: media.mime,
-    apiKey: key.apiKey,
-    baseUrl: providerConfig?.baseUrl,
-    headers: providerConfig?.headers,
-    model: entry.model,
-    prompt,
-    timeoutMs,
-  });
-  return {
-    kind: "video.description",
-    attachmentIndex: params.attachmentIndex,
-    text: trimOutput(result.text, maxChars),
-    provider: providerId,
-    model: result.model ?? entry.model,
-  };
-}
-
-async function runCliEntry(params: {
-  capability: MediaUnderstandingCapability;
-  entry: MediaUnderstandingModelConfig;
-  cfg: ClawdbotConfig;
-  ctx: MsgContext;
-  attachmentIndex: number;
-  cache: MediaAttachmentCache;
-  config?: MediaUnderstandingConfig;
-}): Promise<MediaUnderstandingOutput | null> {
-  const { entry, capability, cfg, ctx } = params;
-  const command = entry.command?.trim();
-  const args = entry.args ?? [];
-  if (!command) {
-    throw new Error(`CLI entry missing command for ${capability}`);
-  }
-  const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config });
-  const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config });
-  const timeoutMs = resolveTimeoutMs(
-    entry.timeoutSeconds ??
-      params.config?.timeoutSeconds ??
-      cfg.tools?.media?.[capability]?.timeoutSeconds,
-    DEFAULT_TIMEOUT_SECONDS[capability],
-  );
-  const prompt = resolvePrompt(
-    capability,
-    entry.prompt ?? params.config?.prompt ?? cfg.tools?.media?.[capability]?.prompt,
-    maxChars,
-  );
-  const pathResult = await params.cache.getPath({
-    attachmentIndex: params.attachmentIndex,
-    maxBytes,
-    timeoutMs,
-  });
-
-  const templCtx: MsgContext = {
-    ...ctx,
-    MediaPath: pathResult.path,
-    Prompt: prompt,
-    MaxChars: maxChars,
-  };
-  const argv = [command, ...args].map((part, index) =>
-    index === 0 ? part : applyTemplate(part, templCtx),
-  );
-  if (shouldLogVerbose()) {
-    logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
-  }
-  const { stdout } = await runExec(argv[0], argv.slice(1), {
-    timeoutMs,
-    maxBuffer: CLI_OUTPUT_MAX_BUFFER,
-  });
-  const text = trimOutput(stdout, maxChars);
-  if (!text) return null;
-  return {
-    kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
-    attachmentIndex: params.attachmentIndex,
-    text,
-    provider: "cli",
-    model: command,
-  };
-}
-
-async function runAttachmentEntries(params: {
-  capability: MediaUnderstandingCapability;
-  cfg: ClawdbotConfig;
-  ctx: MsgContext;
-  attachmentIndex: number;
-  agentDir?: string;
-  providerRegistry: Map<string, MediaUnderstandingProvider>;
-  cache: MediaAttachmentCache;
-  entries: MediaUnderstandingModelConfig[];
-  config?: MediaUnderstandingConfig;
-}): Promise<{ output: MediaUnderstandingOutput | null; attempts: MediaUnderstandingModelDecision[] }> {
-  const { entries, capability } = params;
-  const attempts: MediaUnderstandingModelDecision[] = [];
-  for (const entry of entries) {
-    try {
-      const entryType = entry.type ?? (entry.command ? "cli" : "provider");
-      const result =
-        entryType === "cli"
-          ? await runCliEntry({
-              capability,
-              entry,
-              cfg: params.cfg,
-              ctx: params.ctx,
-              attachmentIndex: params.attachmentIndex,
-              cache: params.cache,
-              config: params.config,
-            })
-          : await runProviderEntry({
-              capability,
-              entry,
-              cfg: params.cfg,
-              ctx: params.ctx,
-              attachmentIndex: params.attachmentIndex,
-              cache: params.cache,
-              agentDir: params.agentDir,
-              providerRegistry: params.providerRegistry,
-              config: params.config,
-            });
-      if (result) {
-        const decision = buildModelDecision({ entry, entryType, outcome: "success" });
-        if (result.provider) decision.provider = result.provider;
-        if (result.model) decision.model = result.model;
-        attempts.push(decision);
-        return { output: result, attempts };
-      }
-      attempts.push(
-        buildModelDecision({ entry, entryType, outcome: "skipped", reason: "empty output" }),
-      );
-    } catch (err) {
-      if (isMediaUnderstandingSkipError(err)) {
-        attempts.push(
-          buildModelDecision({
-            entry,
-            entryType: entry.type ?? (entry.command ? "cli" : "provider"),
-            outcome: "skipped",
-            reason: `${err.reason}: ${err.message}`,
-          }),
-        );
-        if (shouldLogVerbose()) {
-          logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`);
-        }
-        continue;
-      }
-      attempts.push(
-        buildModelDecision({
-          entry,
-          entryType: entry.type ?? (entry.command ? "cli" : "provider"),
-          outcome: "failed",
-          reason: String(err),
-        }),
-      );
-      if (shouldLogVerbose()) {
-        logVerbose(`${capability} understanding failed: ${String(err)}`);
-      }
-    }
-  }
-
-  return { output: null, attempts };
-}
-
-async function runCapability(params: {
-  capability: MediaUnderstandingCapability;
-  cfg: ClawdbotConfig;
-  ctx: MsgContext;
-  attachments: MediaAttachmentCache;
-  media: ReturnType<typeof normalizeAttachments>;
-  agentDir?: string;
-  providerRegistry: Map<string, MediaUnderstandingProvider>;
-  config?: MediaUnderstandingConfig;
-  activeModel?: ActiveMediaModel;
-}): Promise<{ outputs: MediaUnderstandingOutput[]; decision: MediaUnderstandingDecision }> {
-  const { capability, cfg, ctx } = params;
-  const config = params.config ?? resolveCapabilityConfig(cfg, capability);
-  if (config?.enabled === false) {
-    return {
-      outputs: [],
-      decision: { capability, outcome: "disabled", attachments: [] },
-    };
-  }
-
-  const attachmentPolicy = config?.attachments;
-  const selected = selectAttachments({
-    capability,
-    attachments: params.media,
-    policy: attachmentPolicy,
-  });
-  if (selected.length === 0) {
-    return {
-      outputs: [],
-      decision: { capability, outcome: "no-attachment", attachments: [] },
-    };
-  }
-
-  const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx });
-  if (scopeDecision === "deny") {
-    if (shouldLogVerbose()) {
-      logVerbose(`${capability} understanding disabled by scope policy.`);
-    }
-    return {
-      outputs: [],
-      decision: {
-        capability,
-        outcome: "scope-deny",
-        attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
-      },
-    };
-  }
-
-  const entries = resolveEntriesWithActiveFallback({
-    cfg,
-    capability,
-    config,
-    activeModel: params.activeModel,
-  });
-  if (entries.length === 0) {
-    return {
-      outputs: [],
-      decision: {
-        capability,
-        outcome: "skipped",
-        attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
-      },
-    };
-  }
-
-  const outputs: MediaUnderstandingOutput[] = [];
-  const attachmentDecisions: MediaUnderstandingDecision["attachments"] = [];
-  for (const attachment of selected) {
-    const { output, attempts } = await runAttachmentEntries({
-      capability,
-      cfg,
-      ctx,
-      attachmentIndex: attachment.index,
-      agentDir: params.agentDir,
-      providerRegistry: params.providerRegistry,
-      cache: params.attachments,
-      entries,
-      config,
-    });
-    if (output) outputs.push(output);
-    attachmentDecisions.push({
-      attachmentIndex: attachment.index,
-      attempts,
-      chosen: attempts.find((attempt) => attempt.outcome === "success"),
-    });
-  }
-  return {
-    outputs,
-    decision: {
-      capability,
-      outcome: outputs.length > 0 ? "success" : "skipped",
-      attachments: attachmentDecisions,
-    },
-  };
-}
-
 export async function applyMediaUnderstanding(params: {
   ctx: MsgContext;
   cfg: ClawdbotConfig;
@@ -542,13 +46,13 @@ export async function applyMediaUnderstanding(params: {
       .map((value) => extractMediaUserText(value))
       .find((value) => value && value.trim()) ?? undefined;
 
-  const attachments = normalizeAttachments(ctx);
-  const providerRegistry = buildMediaUnderstandingRegistry(params.providers);
-  const cache = new MediaAttachmentCache(attachments);
+  const attachments = normalizeMediaAttachments(ctx);
+  const providerRegistry = buildProviderRegistry(params.providers);
+  const cache = createMediaAttachmentCache(attachments);
 
   try {
     const tasks = CAPABILITY_ORDER.map((capability) => async () => {
-      const config = resolveCapabilityConfig(cfg, capability);
+      const config = cfg.tools?.media?.[capability];
       return await runCapability({
         capability,
         cfg,
@@ -565,17 +69,12 @@ export async function applyMediaUnderstanding(params: {
     const results = await runWithConcurrency(tasks, resolveConcurrency(cfg));
     const outputs: MediaUnderstandingOutput[] = [];
     const decisions: MediaUnderstandingDecision[] = [];
-    for (const [index] of CAPABILITY_ORDER.entries()) {
-      const entry = results[index];
+    for (const entry of results) {
       if (!entry) continue;
-      if (Array.isArray(entry.outputs)) {
-        for (const output of entry.outputs) {
-          outputs.push(output);
-        }
-      }
-      if (entry.decision) {
-        decisions.push(entry.decision);
+      for (const output of entry.outputs) {
+        outputs.push(output);
       }
+      decisions.push(entry.decision);
     }
 
     if (decisions.length > 0) {
diff --git a/src/media-understanding/providers/anthropic/index.ts b/src/media-understanding/providers/anthropic/index.ts
index 3f9fc584c3f..35ae04a921e 100644
--- a/src/media-understanding/providers/anthropic/index.ts
+++ b/src/media-understanding/providers/anthropic/index.ts
@@ -3,5 +3,6 @@ import { describeImageWithModel } from "../image.js";
 
 export const anthropicProvider: MediaUnderstandingProvider = {
   id: "anthropic",
+  capabilities: ["image"], // describeImage is the only handler set on this provider
   describeImage: describeImageWithModel,
 };
diff --git a/src/media-understanding/providers/google/index.ts b/src/media-understanding/providers/google/index.ts
index d0f8bae3b8b..6b3d412ba84 100644
--- a/src/media-understanding/providers/google/index.ts
+++ b/src/media-understanding/providers/google/index.ts
@@ -4,6 +4,7 @@ import { describeGeminiVideo } from "./video.js";
 
 export const googleProvider: MediaUnderstandingProvider = {
   id: "google",
+  capabilities: ["image", "audio", "video"], // NOTE(review): "audio" declared, no transcribeAudio?
   describeImage: describeImageWithModel,
   describeVideo: describeGeminiVideo,
 };
diff --git a/src/media-understanding/providers/groq/index.ts b/src/media-understanding/providers/groq/index.ts
index 451799e8ef9..5f59e5702ab 100644
--- a/src/media-understanding/providers/groq/index.ts
+++ b/src/media-understanding/providers/groq/index.ts
@@ -5,6 +5,7 @@ const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1";
 
 export const groqProvider: MediaUnderstandingProvider = {
   id: "groq",
+  capabilities: ["audio"],
   transcribeAudio: (req) =>
     transcribeOpenAiCompatibleAudio({
       ...req,
diff --git a/src/media-understanding/providers/index.ts b/src/media-understanding/providers/index.ts
index 6f4387a10a5..9c560c8e72a 100644
--- a/src/media-understanding/providers/index.ts
+++ b/src/media-understanding/providers/index.ts
@@ -29,7 +29,16 @@ export function buildMediaUnderstandingRegistry(
   }
   if (overrides) {
     for (const [key, provider] of Object.entries(overrides)) {
-      registry.set(normalizeMediaProviderId(key), provider);
+      const normalizedKey = normalizeMediaProviderId(key);
+      const existing = registry.get(normalizedKey);
+      const merged = existing
+        ? {
+            ...existing,
+            ...provider,
+            capabilities: provider.capabilities ?? existing.capabilities,
+          }
+        : provider;
+      registry.set(normalizedKey, merged);
     }
   }
   return registry;
diff --git a/src/media-understanding/providers/minimax/index.ts b/src/media-understanding/providers/minimax/index.ts
index 8d500353845..6fa6ebf351a 100644
--- a/src/media-understanding/providers/minimax/index.ts
+++ b/src/media-understanding/providers/minimax/index.ts
@@ -3,5 +3,6 @@ import { describeImageWithModel } from "../image.js";
 
 export const minimaxProvider: MediaUnderstandingProvider = {
   id: "minimax",
+  capabilities: ["image"], // describeImage is the only handler set on this provider
   describeImage: describeImageWithModel,
 };
diff --git a/src/media-understanding/providers/openai/index.ts b/src/media-understanding/providers/openai/index.ts
index 0aabb275fbf..d6e735c18ef 100644
--- a/src/media-understanding/providers/openai/index.ts
+++ b/src/media-understanding/providers/openai/index.ts
@@ -4,6 +4,7 @@ import { transcribeOpenAiCompatibleAudio } from "./audio.js";
 
 export const openaiProvider: MediaUnderstandingProvider = {
   id: "openai",
+  capabilities: ["image"], // NOTE(review): transcribeAudio set, "audio" unlisted — confirm
   describeImage: describeImageWithModel,
   transcribeAudio: transcribeOpenAiCompatibleAudio,
 };
diff --git a/src/media-understanding/resolve.ts b/src/media-understanding/resolve.ts
index bc34ce7d5b3..8f260542e8d 100644
--- a/src/media-understanding/resolve.ts
+++ b/src/media-understanding/resolve.ts
@@ -77,36 +77,22 @@ export function resolveScopeDecision(params: {
   });
 }
 
-export function inferProviderCapabilities(
-  providerId?: string,
-): MediaUnderstandingCapability[] | undefined {
-  const provider = normalizeMediaProviderId(providerId ?? "");
-  if (!provider) return undefined;
-  if (provider === "openai" || provider === "anthropic" || provider === "minimax") {
-    return ["image"];
-  }
-  if (provider === "google") {
-    return ["image", "audio", "video"];
-  }
-  if (provider === "groq") {
-    return ["audio"];
-  }
-  return undefined;
-}
-
-function inferCapabilities(
-  entry: MediaUnderstandingModelConfig,
-): MediaUnderstandingCapability[] | undefined {
-  if ((entry.type ?? (entry.command ? "cli" : "provider")) === "cli") {
-    return undefined;
-  }
-  return inferProviderCapabilities(entry.provider);
+function resolveEntryCapabilities(params: {
+  entry: MediaUnderstandingModelConfig;
+  providerRegistry: Map<string, { capabilities?: MediaUnderstandingCapability[] }>;
+}): MediaUnderstandingCapability[] | undefined {
+  const { entry, providerRegistry } = params;
+  // CLI entries (explicit type or implied by `command`) have no provider to inherit from.
+  if ((entry.type ?? (entry.command ? "cli" : "provider")) === "cli") return undefined;
+  const registryKey = normalizeMediaProviderId(entry.provider ?? "");
+  return registryKey ? providerRegistry.get(registryKey)?.capabilities : undefined;
 }
 
 export function resolveModelEntries(params: {
   cfg: ClawdbotConfig;
   capability: MediaUnderstandingCapability;
   config?: MediaUnderstandingConfig;
+  providerRegistry: Map<string, { capabilities?: MediaUnderstandingCapability[] }>;
 }): MediaUnderstandingModelConfig[] {
   const { cfg, capability, config } = params;
   const sharedModels = cfg.tools?.media?.models ?? [];
@@ -122,7 +108,7 @@ export function resolveModelEntries(params: {
         entry.capabilities && entry.capabilities.length > 0
           ? entry.capabilities
           : source === "shared"
-            ? inferCapabilities(entry)
+            ? resolveEntryCapabilities({ entry, providerRegistry: params.providerRegistry })
             : undefined;
       if (!caps || caps.length === 0) {
         if (source === "shared") {
@@ -148,13 +134,32 @@ export function resolveConcurrency(cfg: ClawdbotConfig): number {
   return DEFAULT_MEDIA_CONCURRENCY;
 }
 
-export function resolveCapabilityEnabled(params: {
+export function resolveEntriesWithActiveFallback(params: {
   cfg: ClawdbotConfig;
+  capability: MediaUnderstandingCapability;
   config?: MediaUnderstandingConfig;
-}): boolean {
-  if (params.config?.enabled === false) return false;
-  const sharedModels = params.cfg.tools?.media?.models ?? [];
-  const hasModels = (params.config?.models?.length ?? 0) > 0 || sharedModels.length > 0;
-  if (!hasModels) return false;
-  return true;
+  providerRegistry: Map<string, { capabilities?: MediaUnderstandingCapability[] }>;
+  activeModel?: { provider: string; model?: string };
+}): MediaUnderstandingModelConfig[] {
+  const { cfg, capability, config, providerRegistry, activeModel } = params;
+  const configured = resolveModelEntries({ cfg, capability, config, providerRegistry });
+  // Configured entries always win, and the fallback below is opt-in: it only applies
+  // when this capability's config sets `enabled: true` explicitly.
+  if (configured.length > 0 || config?.enabled !== true) return configured;
+
+  // Normalize the active model's provider id; a blank id leaves nothing to fall back to.
+  const rawProvider = activeModel?.provider?.trim();
+  const fallbackProvider = rawProvider ? normalizeMediaProviderId(rawProvider) : "";
+  if (!fallbackProvider) return configured;
+
+  // The registry must declare that this provider supports the requested capability.
+  const declared = providerRegistry.get(fallbackProvider)?.capabilities ?? [];
+  if (!declared.includes(capability)) return configured;
+  // Synthesize a single provider entry that reuses the active model.
+  const fallbackEntry: MediaUnderstandingModelConfig = {
+    type: "provider",
+    provider: fallbackProvider,
+    model: activeModel?.model,
+  };
+  return [fallbackEntry];
 }
diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts
new file mode 100644
index 00000000000..d33ab6296ab
--- /dev/null
+++ b/src/media-understanding/runner.ts
@@ -0,0 +1,506 @@
+import type { ClawdbotConfig } from "../config/config.js";
+import type { MsgContext } from "../auto-reply/templating.js";
+import { applyTemplate } from "../auto-reply/templating.js";
+import { resolveApiKeyForProvider } from "../agents/model-auth.js";
+import { logVerbose, shouldLogVerbose } from "../globals.js";
+import { runExec } from "../process/exec.js";
+import type {
+  MediaUnderstandingConfig,
+  MediaUnderstandingModelConfig,
+} from "../config/types.tools.js";
+import { MediaAttachmentCache, normalizeAttachments, selectAttachments } from "./attachments.js";
+import {
+  CLI_OUTPUT_MAX_BUFFER,
+  DEFAULT_AUDIO_MODELS,
+  DEFAULT_TIMEOUT_SECONDS,
+} from "./defaults.js";
+import { isMediaUnderstandingSkipError, MediaUnderstandingSkipError } from "./errors.js";
+import {
+  resolveEntriesWithActiveFallback,
+  resolveMaxBytes,
+  resolveMaxChars,
+  resolvePrompt,
+  resolveScopeDecision,
+  resolveTimeoutMs,
+} from "./resolve.js";
+import type {
+  MediaAttachment,
+  MediaUnderstandingCapability,
+  MediaUnderstandingDecision,
+  MediaUnderstandingModelDecision,
+  MediaUnderstandingOutput,
+  MediaUnderstandingProvider,
+} from "./types.js";
+import {
+  buildMediaUnderstandingRegistry,
+  getMediaUnderstandingProvider,
+  normalizeMediaProviderId,
+} from "./providers/index.js";
+import { describeImageWithModel } from "./providers/image.js";
+import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js";
+
+export type ActiveMediaModel = { // shape of the `activeModel` option accepted by runCapability
+  provider: string;
+  model?: string;
+};
+
+type ProviderRegistry = Map<string, MediaUnderstandingProvider>; // queried with the normalized provider id in runProviderEntry
+
+export type RunCapabilityResult = { // return value of runCapability
+  outputs: MediaUnderstandingOutput[]; // one item per attachment that produced output
+  decision: MediaUnderstandingDecision; // capability, overall outcome and per-attachment attempts
+};
+
+export function buildProviderRegistry(
+  overrides?: Record<string, MediaUnderstandingProvider>,
+): ProviderRegistry {
+  return buildMediaUnderstandingRegistry(overrides); // thin pass-through; overrides are forwarded as-is
+}
+
+export function normalizeMediaAttachments(ctx: MsgContext): MediaAttachment[] {
+  return normalizeAttachments(ctx); // pass-through to normalizeAttachments from ./attachments.js
+}
+
+export function createMediaAttachmentCache(attachments: MediaAttachment[]): MediaAttachmentCache {
+  return new MediaAttachmentCache(attachments); // wraps the given attachments in a new cache instance
+}
+
+function trimOutput(text: string, maxChars?: number): string {
+  const trimmed = text.trim(); // maxChars counts UTF-16 code units; 0 or undefined disables the cap
+  if (!maxChars || trimmed.length <= maxChars) return trimmed;
+  return trimmed.slice(0, maxChars).replace(/[\uD800-\uDBFF]$/, "").trim(); // drop a split surrogate half
+}
+
+function buildModelDecision(params: { // builds the attempt record stored in a decision's `attempts`
+  entry: MediaUnderstandingModelConfig;
+  entryType: "provider" | "cli";
+  outcome: MediaUnderstandingModelDecision["outcome"];
+  reason?: string;
+}): MediaUnderstandingModelDecision {
+  if (params.entryType === "cli") {
+    const command = params.entry.command?.trim();
+    return {
+      type: "cli",
+      provider: command || "cli", // `||`, not `??`: a blank command must still report as "cli"
+      model: params.entry.model ?? command, // CLI entries default the model label to the command
+      outcome: params.outcome,
+      reason: params.reason,
+    };
+  }
+  const providerIdRaw = params.entry.provider?.trim();
+  const providerId = providerIdRaw ? normalizeMediaProviderId(providerIdRaw) : undefined;
+  return {
+    type: "provider",
+    provider: providerId ?? providerIdRaw, // normalized id when a provider is configured
+    model: params.entry.model,
+    outcome: params.outcome,
+    reason: params.reason,
+  };
+}
+
+async function runProviderEntry(params: { // runs one provider-backed entry for one attachment, dispatching on capability
+  capability: MediaUnderstandingCapability;
+  entry: MediaUnderstandingModelConfig;
+  cfg: ClawdbotConfig;
+  ctx: MsgContext; // accepted but not read in this function
+  attachmentIndex: number;
+  cache: MediaAttachmentCache;
+  agentDir?: string;
+  providerRegistry: ProviderRegistry;
+  config?: MediaUnderstandingConfig;
+}): Promise<MediaUnderstandingOutput | null> { // every return below yields an output object; failures throw
+  const { entry, capability, cfg } = params;
+  const providerIdRaw = entry.provider?.trim();
+  if (!providerIdRaw) {
+    throw new Error(`Provider entry missing provider for ${capability}`);
+  }
+  const providerId = normalizeMediaProviderId(providerIdRaw);
+  const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config });
+  const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config });
+  const timeoutMs = resolveTimeoutMs( // precedence: entry, then capability config, then cfg.tools.media[capability]
+    entry.timeoutSeconds ??
+      params.config?.timeoutSeconds ??
+      cfg.tools?.media?.[capability]?.timeoutSeconds,
+    DEFAULT_TIMEOUT_SECONDS[capability],
+  );
+  const prompt = resolvePrompt( // same precedence as the timeout: entry, capability config, cfg
+    capability,
+    entry.prompt ?? params.config?.prompt ?? cfg.tools?.media?.[capability]?.prompt,
+    maxChars,
+  );
+
+  if (capability === "image") { // image path: needs agentDir and a non-blank model id
+    if (!params.agentDir) {
+      throw new Error("Image understanding requires agentDir");
+    }
+    const modelId = entry.model?.trim();
+    if (!modelId) {
+      throw new Error("Image understanding requires model id");
+    }
+    const media = await params.cache.getBuffer({
+      attachmentIndex: params.attachmentIndex,
+      maxBytes,
+      timeoutMs,
+    });
+    const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry); // may be absent for images
+    const result = provider?.describeImage // no registered describeImage: fall back to describeImageWithModel
+      ? await provider.describeImage({
+          buffer: media.buffer,
+          fileName: media.fileName,
+          mime: media.mime,
+          model: modelId,
+          provider: providerId,
+          prompt,
+          timeoutMs,
+          profile: entry.profile,
+          preferredProfile: entry.preferredProfile,
+          agentDir: params.agentDir,
+          cfg: params.cfg,
+        })
+      : await describeImageWithModel({
+          buffer: media.buffer,
+          fileName: media.fileName,
+          mime: media.mime,
+          model: modelId,
+          provider: providerId,
+          prompt,
+          timeoutMs,
+          profile: entry.profile,
+          preferredProfile: entry.preferredProfile,
+          agentDir: params.agentDir,
+          cfg: params.cfg,
+        });
+    return {
+      kind: "image.description",
+      attachmentIndex: params.attachmentIndex,
+      text: trimOutput(result.text, maxChars),
+      provider: providerId,
+      model: result.model ?? modelId, // prefer the model reported by the provider
+    };
+  }
+
+  const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
+  if (!provider) { // unlike images, audio and video require a registered provider
+    throw new Error(`Media provider not available: ${providerId}`);
+  }
+
+  if (capability === "audio") {
+    if (!provider.transcribeAudio) {
+      throw new Error(`Audio transcription provider "${providerId}" not available.`);
+    }
+    const media = await params.cache.getBuffer({
+      attachmentIndex: params.attachmentIndex,
+      maxBytes,
+      timeoutMs,
+    });
+    const key = await resolveApiKeyForProvider({
+      provider: providerId,
+      cfg,
+      profileId: entry.profile,
+      preferredProfile: entry.preferredProfile,
+      agentDir: params.agentDir,
+    });
+    const providerConfig = cfg.models?.providers?.[providerId];
+    const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model; // blank model: use the provider default
+    const result = await provider.transcribeAudio({
+      buffer: media.buffer,
+      fileName: media.fileName,
+      mime: media.mime,
+      apiKey: key.apiKey,
+      baseUrl: providerConfig?.baseUrl,
+      headers: providerConfig?.headers,
+      model,
+      language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
+      prompt,
+      timeoutMs,
+    });
+    return {
+      kind: "audio.transcription",
+      attachmentIndex: params.attachmentIndex,
+      text: trimOutput(result.text, maxChars),
+      provider: providerId,
+      model: result.model ?? model,
+    };
+  }
+
+  if (!provider.describeVideo) { // anything that is neither image nor audio is handled as video
+    throw new Error(`Video understanding provider "${providerId}" not available.`);
+  }
+  const media = await params.cache.getBuffer({
+    attachmentIndex: params.attachmentIndex,
+    maxBytes,
+    timeoutMs,
+  });
+  const estimatedBase64Bytes = estimateBase64Size(media.size);
+  const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
+  if (estimatedBase64Bytes > maxBase64Bytes) { // too large once base64-encoded: raise a skip error, reason "maxBytes"
+    throw new MediaUnderstandingSkipError(
+      "maxBytes",
+      `Video attachment ${params.attachmentIndex + 1} base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
+    );
+  }
+  const key = await resolveApiKeyForProvider({ // resolved only after the size check has passed
+    provider: providerId,
+    cfg,
+    profileId: entry.profile,
+    preferredProfile: entry.preferredProfile,
+    agentDir: params.agentDir,
+  });
+  const providerConfig = cfg.models?.providers?.[providerId];
+  const result = await provider.describeVideo({
+    buffer: media.buffer,
+    fileName: media.fileName,
+    mime: media.mime,
+    apiKey: key.apiKey,
+    baseUrl: providerConfig?.baseUrl,
+    headers: providerConfig?.headers,
+    model: entry.model,
+    prompt,
+    timeoutMs,
+  });
+  return {
+    kind: "video.description",
+    attachmentIndex: params.attachmentIndex,
+    text: trimOutput(result.text, maxChars),
+    provider: providerId,
+    model: result.model ?? entry.model,
+  };
+}
+
+/**
+ * Runs one CLI-backed model entry against a single attachment and returns its stdout
+ * (trimmed and capped to the resolved max chars) as an understanding output.
+ * Resolves to `null` when the command prints nothing; throws when `entry.command` is blank.
+ * Errors from the attachment cache or the spawned process propagate to the caller.
+ */
+async function runCliEntry(params: {
+  capability: MediaUnderstandingCapability;
+  entry: MediaUnderstandingModelConfig;
+  cfg: ClawdbotConfig;
+  ctx: MsgContext;
+  attachmentIndex: number;
+  cache: MediaAttachmentCache;
+  config?: MediaUnderstandingConfig;
+}): Promise<MediaUnderstandingOutput | null> {
+  const { entry, capability, cfg, ctx, config, attachmentIndex } = params;
+  const command = entry.command?.trim();
+  if (!command) {
+    throw new Error(`CLI entry missing command for ${capability}`);
+  }
+  const capabilityDefaults = cfg.tools?.media?.[capability];
+  // Timeout and prompt fall back from the entry to the capability config to cfg.tools.media.
+  const maxBytes = resolveMaxBytes({ capability, entry, cfg, config });
+  const maxChars = resolveMaxChars({ capability, entry, cfg, config });
+  const timeoutSeconds =
+    entry.timeoutSeconds ?? config?.timeoutSeconds ?? capabilityDefaults?.timeoutSeconds;
+  const timeoutMs = resolveTimeoutMs(timeoutSeconds, DEFAULT_TIMEOUT_SECONDS[capability]);
+  const prompt = resolvePrompt(
+    capability,
+    entry.prompt ?? config?.prompt ?? capabilityDefaults?.prompt,
+    maxChars,
+  );
+  const mediaFile = await params.cache.getPath({ attachmentIndex, maxBytes, timeoutMs });
+
+  // Expose the local media path, prompt and char limit to the argument templates.
+  const templateCtx: MsgContext = {
+    ...ctx,
+    MediaPath: mediaFile.path,
+    Prompt: prompt,
+    MaxChars: maxChars,
+  };
+  // Only the arguments are templated; the command itself is used verbatim.
+  const renderedArgs = (entry.args ?? []).map((arg) => applyTemplate(arg, templateCtx));
+  if (shouldLogVerbose()) {
+    logVerbose(`Media understanding via CLI: ${[command, ...renderedArgs].join(" ")}`);
+  }
+  const { stdout } = await runExec(command, renderedArgs, {
+    timeoutMs,
+    maxBuffer: CLI_OUTPUT_MAX_BUFFER,
+  });
+  const text = trimOutput(stdout, maxChars);
+  if (!text) return null;
+  return {
+    kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
+    attachmentIndex,
+    text,
+    provider: "cli",
+    model: command,
+  };
+}
+
+/**
+ * Tries the model entries in order for a single attachment and stops at the first entry
+ * that produces output.
+ *
+ * Each entry that ran is appended to `attempts` as "success", "skipped" (empty output or
+ * a skip error) or "failed" (any other error). Errors are recorded rather than rethrown,
+ * so a broken entry only moves the loop on to the next one.
+ *
+ * @param params.entries Candidate model entries, tried in array order.
+ * @param params.capability Forwarded to the runners and used in verbose log messages.
+ * @param params.attachmentIndex Index of the attachment, forwarded to the runners.
+ * @returns The first output (or `null` if none) together with every recorded attempt.
+ */
+async function runAttachmentEntries(params: {
+  capability: MediaUnderstandingCapability;
+  cfg: ClawdbotConfig;
+  ctx: MsgContext;
+  attachmentIndex: number;
+  agentDir?: string;
+  providerRegistry: ProviderRegistry;
+  cache: MediaAttachmentCache;
+  entries: MediaUnderstandingModelConfig[];
+  config?: MediaUnderstandingConfig;
+}): Promise<{ output: MediaUnderstandingOutput | null; attempts: MediaUnderstandingModelDecision[] }> {
+  const { entries, capability, cfg, ctx, attachmentIndex, cache, config } = params;
+  const attempts: MediaUnderstandingModelDecision[] = [];
+  for (const entry of entries) {
+    // An entry is CLI-backed when typed as such or, absent a type, when it has a command.
+    const entryType = entry.type ?? (entry.command ? "cli" : "provider");
+    // Appends one attempt record for the current entry and hands it back for refinement.
+    const record = (outcome: MediaUnderstandingModelDecision["outcome"], reason?: string) => {
+      const decision = buildModelDecision({ entry, entryType, outcome, reason });
+      attempts.push(decision);
+      return decision;
+    };
+
+    try {
+      // Inputs common to both runners; the provider runner also takes agentDir and the registry.
+      const shared = { capability, entry, cfg, ctx, attachmentIndex, cache, config };
+      const result =
+        entryType === "cli"
+          ? await runCliEntry(shared)
+          : await runProviderEntry({
+              ...shared,
+              agentDir: params.agentDir,
+              providerRegistry: params.providerRegistry,
+            });
+      if (!result) {
+        // The runner finished without producing anything usable; try the next entry.
+        record("skipped", "empty output");
+        continue;
+      }
+      const chosen = record("success");
+      // Prefer the provider/model the runner reported over the configured values.
+      if (result.provider) chosen.provider = result.provider;
+      if (result.model) chosen.model = result.model;
+      return { output: result, attempts };
+    } catch (err) {
+      if (isMediaUnderstandingSkipError(err)) {
+        // Deliberate skip (e.g. a size limit): record it and move on to the next entry.
+        const why = `${err.reason}: ${err.message}`;
+        record("skipped", why);
+        if (shouldLogVerbose()) {
+          logVerbose(`Skipping ${capability} model due to ${why}`);
+        }
+      } else {
+        // Unexpected error: record the failure but keep trying the remaining entries.
+        const why = String(err);
+        record("failed", why);
+        if (shouldLogVerbose()) {
+          logVerbose(`${capability} understanding failed: ${why}`);
+        }
+      }
+    }
+  }
+
+  // Every entry was skipped or failed.
+  return { output: null, attempts };
+}
+
+/**
+ * Runs one media-understanding capability over the attachments selected for it.
+ *
+ * Exits early with no outputs when the capability is disabled, when no attachment is
+ * selected, when the scope policy denies this context, or when no model entries resolve.
+ * Otherwise the selected attachments are processed one after another and every attempt
+ * is reported in the returned decision.
+ *
+ * @param params.config Optional override for `cfg.tools.media[capability]`.
+ * @param params.media Attachments to select from.
+ * @param params.attachments Cache handed to the entry runners as `cache`.
+ * @param params.providerRegistry Registry used for entry resolution and provider lookup.
+ * @param params.agentDir Forwarded to the entry runners.
+ * @param params.activeModel Passed on to `resolveEntriesWithActiveFallback`.
+ * @returns The collected outputs and a decision whose outcome is "disabled",
+ *   "no-attachment", "scope-deny", "skipped" or "success".
+ */
+export async function runCapability(params: {
+  capability: MediaUnderstandingCapability;
+  cfg: ClawdbotConfig;
+  ctx: MsgContext;
+  attachments: MediaAttachmentCache;
+  media: MediaAttachment[];
+  agentDir?: string;
+  providerRegistry: ProviderRegistry;
+  config?: MediaUnderstandingConfig;
+  activeModel?: ActiveMediaModel;
+}): Promise<RunCapabilityResult> {
+  const { capability, cfg, ctx, providerRegistry } = params;
+  // An explicitly passed config replaces the per-capability section of cfg.tools.media.
+  const config = params.config ?? cfg.tools?.media?.[capability];
+  // Shorthand for the early exits, all of which produce no outputs.
+  const bail = (
+    outcome: MediaUnderstandingDecision["outcome"],
+    attachments: MediaUnderstandingDecision["attachments"] = [],
+  ): RunCapabilityResult => ({ outputs: [], decision: { capability, outcome, attachments } });
+
+  if (config?.enabled === false) return bail("disabled");
+
+  const selected = selectAttachments({
+    capability,
+    attachments: params.media,
+    policy: config?.attachments,
+  });
+  if (selected.length === 0) return bail("no-attachment");
+  // Exits after selection list each selected attachment with an empty attempt list.
+  const untouched = () => selected.map((item) => ({ attachmentIndex: item.index, attempts: [] }));
+
+  if (resolveScopeDecision({ scope: config?.scope, ctx }) === "deny") {
+    if (shouldLogVerbose()) {
+      logVerbose(`${capability} understanding disabled by scope policy.`);
+    }
+    return bail("scope-deny", untouched());
+  }
+
+  // Resolve the model entries to try; this may fall back to the active model.
+  const entries = resolveEntriesWithActiveFallback({
+    cfg,
+    capability,
+    config,
+    providerRegistry,
+    activeModel: params.activeModel,
+  });
+  if (entries.length === 0) return bail("skipped", untouched());
+
+  // Attachments are awaited one at a time, in selection order.
+  const outputs: MediaUnderstandingOutput[] = [];
+  const attachmentDecisions: MediaUnderstandingDecision["attachments"] = [];
+  for (const { index: attachmentIndex } of selected) {
+    const { output, attempts } = await runAttachmentEntries({
+      capability,
+      cfg,
+      ctx,
+      attachmentIndex,
+      agentDir: params.agentDir,
+      providerRegistry,
+      cache: params.attachments,
+      entries,
+      config,
+    });
+    if (output) outputs.push(output);
+    // `chosen` is the successful attempt, if there was one.
+    const chosen = attempts.find((attempt) => attempt.outcome === "success");
+    attachmentDecisions.push({ attachmentIndex, attempts, chosen });
+  }
+
+  // The run counts as a success as soon as any attachment produced output.
+  return {
+    outputs,
+    decision: {
+      capability,
+      outcome: outputs.length > 0 ? "success" : "skipped",
+      attachments: attachmentDecisions,
+    },
+  };
+}
diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts
index 078c5908211..c0aa11c40d8 100644
--- a/src/media-understanding/types.ts
+++ b/src/media-understanding/types.ts
@@ -106,6 +106,7 @@ export type ImageDescriptionResult = {
 
 export type MediaUnderstandingProvider = {
   id: string;
+  capabilities?: MediaUnderstandingCapability[];
   transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
   describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
   describeImage?: (req: ImageDescriptionRequest) => Promise<ImageDescriptionResult>;