diff --git a/src/auto-reply/reply/commands-info.ts b/src/auto-reply/reply/commands-info.ts index 69c80e812e9..1a525150c7c 100644 --- a/src/auto-reply/reply/commands-info.ts +++ b/src/auto-reply/reply/commands-info.ts @@ -68,6 +68,7 @@ export const handleStatusCommand: CommandHandler = async (params, allowTextComma resolveDefaultThinkingLevel: params.resolveDefaultThinkingLevel, isGroup: params.isGroup, defaultGroupActivation: params.defaultGroupActivation, + mediaDecisions: params.ctx.MediaUnderstandingDecisions, }); return { shouldContinue: false, reply }; }; diff --git a/src/auto-reply/reply/commands-status.ts b/src/auto-reply/reply/commands-status.ts index e69b8203753..67081a209ea 100644 --- a/src/auto-reply/reply/commands-status.ts +++ b/src/auto-reply/reply/commands-status.ts @@ -24,6 +24,7 @@ import type { ElevatedLevel, ReasoningLevel, ThinkLevel, VerboseLevel } from ".. import type { ReplyPayload } from "../types.js"; import type { CommandContext } from "./commands-types.js"; import { getFollowupQueueDepth, resolveQueueSettings } from "./queue.js"; +import type { MediaUnderstandingDecision } from "../../media-understanding/types.js"; function formatApiKeySnippet(apiKey: string): string { const compact = apiKey.replace(/\s+/g, ""); @@ -105,6 +106,7 @@ export async function buildStatusReply(params: { resolveDefaultThinkingLevel: () => Promise; isGroup: boolean; defaultGroupActivation: () => "always" | "mention"; + mediaDecisions?: MediaUnderstandingDecision[]; }): Promise { const { cfg, @@ -200,6 +202,7 @@ export async function buildStatusReply(params: { dropPolicy: queueSettings.dropPolicy, showDetails: queueOverrides, }, + mediaDecisions: params.mediaDecisions, includeTranscriptUsage: false, }); diff --git a/src/auto-reply/reply/get-reply-directives-apply.ts b/src/auto-reply/reply/get-reply-directives-apply.ts index dfc1b69233b..0201ca2870e 100644 --- a/src/auto-reply/reply/get-reply-directives-apply.ts +++ 
b/src/auto-reply/reply/get-reply-directives-apply.ts @@ -188,6 +188,7 @@ export async function applyInlineDirectiveOverrides(params: { resolveDefaultThinkingLevel: async () => resolvedDefaultThinkLevel, isGroup, defaultGroupActivation: defaultActivation, + mediaDecisions: ctx.MediaUnderstandingDecisions, }); } typing.cleanup(); diff --git a/src/auto-reply/reply/get-reply-inline-actions.ts b/src/auto-reply/reply/get-reply-inline-actions.ts index 2452fdd8f6a..825df34453a 100644 --- a/src/auto-reply/reply/get-reply-inline-actions.ts +++ b/src/auto-reply/reply/get-reply-inline-actions.ts @@ -185,6 +185,7 @@ export async function handleInlineActions(params: { resolveDefaultThinkingLevel, isGroup, defaultGroupActivation: defaultActivation, + mediaDecisions: ctx.MediaUnderstandingDecisions, }); await sendInlineReply(inlineStatusReply); directives = { ...directives, hasStatusDirective: false }; diff --git a/src/auto-reply/status.test.ts b/src/auto-reply/status.test.ts index 0ce0d5d308d..fd1e4d36f9d 100644 --- a/src/auto-reply/status.test.ts +++ b/src/auto-reply/status.test.ts @@ -90,6 +90,59 @@ describe("buildStatusMessage", () => { expect(text).toContain("elevated"); }); + it("includes media understanding decisions when present", () => { + const text = buildStatusMessage({ + agent: { model: "anthropic/claude-opus-4-5" }, + sessionEntry: { sessionId: "media", updatedAt: 0 }, + sessionKey: "agent:main:main", + queue: { mode: "none" }, + mediaDecisions: [ + { + capability: "image", + outcome: "success", + attachments: [ + { + attachmentIndex: 0, + attempts: [ + { + type: "provider", + outcome: "success", + provider: "openai", + model: "gpt-5.2", + }, + ], + chosen: { + type: "provider", + outcome: "success", + provider: "openai", + model: "gpt-5.2", + }, + }, + ], + }, + { + capability: "audio", + outcome: "skipped", + attachments: [ + { + attachmentIndex: 1, + attempts: [ + { + type: "provider", + outcome: "skipped", + reason: "maxBytes: too large", + }, + ], + }, + ], + }, 
+ ], + }); + + const normalized = normalizeTestText(text); + expect(normalized).toContain("Media: image ok (openai/gpt-5.2) · audio skipped (maxBytes)"); + }); + it("does not show elevated label when session explicitly disables it", () => { const text = buildStatusMessage({ agent: { model: "anthropic/claude-opus-4-5", elevatedDefault: "on" }, diff --git a/src/auto-reply/status.ts b/src/auto-reply/status.ts index 66e444456f6..a90e7031c3d 100644 --- a/src/auto-reply/status.ts +++ b/src/auto-reply/status.ts @@ -24,6 +24,7 @@ import { VERSION } from "../version.js"; import { listChatCommands, listChatCommandsForConfig } from "./commands-registry.js"; import type { SkillCommandSpec } from "../agents/skills.js"; import type { ElevatedLevel, ReasoningLevel, ThinkLevel, VerboseLevel } from "./thinking.js"; +import type { MediaUnderstandingDecision } from "../media-understanding/types.js"; type AgentConfig = Partial["defaults"]>>; @@ -52,6 +53,7 @@ type StatusArgs = { modelAuth?: string; usageLine?: string; queue?: QueueStatus; + mediaDecisions?: MediaUnderstandingDecision[]; includeTranscriptUsage?: boolean; now?: number; }; @@ -167,6 +169,42 @@ const formatUsagePair = (input?: number | null, output?: number | null) => { return `🧮 Tokens: ${inputLabel} in / ${outputLabel} out`; }; +const formatMediaUnderstandingLine = (decisions?: MediaUnderstandingDecision[]) => { + if (!decisions || decisions.length === 0) return null; + const parts = decisions + .map((decision) => { + const count = decision.attachments.length; + const countLabel = count > 1 ? ` x${count}` : ""; + if (decision.outcome === "success") { + const chosen = decision.attachments.find((entry) => entry.chosen)?.chosen; + const provider = chosen?.provider?.trim(); + const model = chosen?.model?.trim(); + const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : null; + return `${decision.capability}${countLabel} ok${modelLabel ? 
` (${modelLabel})` : ""}`; + } + if (decision.outcome === "no-attachment") { + return `${decision.capability} none`; + } + if (decision.outcome === "disabled") { + return `${decision.capability} off`; + } + if (decision.outcome === "scope-deny") { + return `${decision.capability} denied`; + } + if (decision.outcome === "skipped") { + const reason = decision.attachments + .flatMap((entry) => entry.attempts.map((attempt) => attempt.reason).filter(Boolean)) + .find(Boolean); + const shortReason = reason ? reason.split(":")[0]?.trim() : undefined; + return `${decision.capability} skipped${shortReason ? ` (${shortReason})` : ""}`; + } + return null; + }) + .filter(Boolean); + if (parts.length === 0) return null; + return `📎 Media: ${parts.join(" · ")}`; +}; + export function buildStatusMessage(args: StatusArgs): string { const now = args.now ?? Date.now(); const entry = args.sessionEntry; @@ -320,12 +358,14 @@ export function buildStatusMessage(args: StatusArgs): string { const costLine = costLabel ? `💵 Cost: ${costLabel}` : null; const usageCostLine = usagePair && costLine ? `${usagePair} · ${costLine}` : (usagePair ?? 
costLine); + const mediaLine = formatMediaUnderstandingLine(args.mediaDecisions); return [ versionLine, modelLine, usageCostLine, `📚 ${contextLine}`, + mediaLine, args.usageLine, `🧵 ${sessionLine}`, `⚙️ ${optionsLine}`, diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 8a5bac74b7b..d1197ba1a8e 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -1,52 +1,26 @@ import type { ClawdbotConfig } from "../config/config.js"; import type { MsgContext } from "../auto-reply/templating.js"; -import { applyTemplate } from "../auto-reply/templating.js"; import { finalizeInboundContext } from "../auto-reply/reply/inbound-context.js"; -import { resolveApiKeyForProvider } from "../agents/model-auth.js"; -import { logVerbose, shouldLogVerbose } from "../globals.js"; -import { runExec } from "../process/exec.js"; -import type { - MediaUnderstandingConfig, - MediaUnderstandingModelConfig, -} from "../config/types.tools.js"; -import { MediaAttachmentCache, normalizeAttachments, selectAttachments } from "./attachments.js"; -import { - CLI_OUTPUT_MAX_BUFFER, - DEFAULT_AUDIO_MODELS, - DEFAULT_TIMEOUT_SECONDS, -} from "./defaults.js"; -import { isMediaUnderstandingSkipError, MediaUnderstandingSkipError } from "./errors.js"; import { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody, } from "./format.js"; -import { - buildMediaUnderstandingRegistry, - getMediaUnderstandingProvider, - normalizeMediaProviderId, -} from "./providers/index.js"; -import { describeImageWithModel } from "./providers/image.js"; -import { - resolveCapabilityConfig, - inferProviderCapabilities, - resolveConcurrency, - resolveMaxBytes, - resolveMaxChars, - resolveModelEntries, - resolvePrompt, - resolveScopeDecision, - resolveTimeoutMs, -} from "./resolve.js"; import type { MediaUnderstandingCapability, MediaUnderstandingDecision, - MediaUnderstandingModelDecision, MediaUnderstandingOutput, MediaUnderstandingProvider, } 
from "./types.js"; import { runWithConcurrency } from "./concurrency.js"; -import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js"; +import { resolveConcurrency } from "./resolve.js"; +import { + type ActiveMediaModel, + buildProviderRegistry, + createMediaAttachmentCache, + normalizeMediaAttachments, + runCapability, +} from "./runner.js"; export type ApplyMediaUnderstandingResult = { outputs: MediaUnderstandingOutput[]; @@ -58,476 +32,6 @@ export type ApplyMediaUnderstandingResult = { const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"]; -type ActiveMediaModel = { - provider: string; - model?: string; -}; - -function trimOutput(text: string, maxChars?: number): string { - const trimmed = text.trim(); - if (!maxChars || trimmed.length <= maxChars) return trimmed; - return trimmed.slice(0, maxChars).trim(); -} - -function resolveEntriesWithActiveFallback(params: { - cfg: ClawdbotConfig; - capability: MediaUnderstandingCapability; - config?: MediaUnderstandingConfig; - activeModel?: ActiveMediaModel; -}): MediaUnderstandingModelConfig[] { - const entries = resolveModelEntries({ - cfg: params.cfg, - capability: params.capability, - config: params.config, - }); - if (entries.length > 0) return entries; - if (params.config?.enabled !== true) return entries; - const activeProvider = params.activeModel?.provider?.trim(); - if (!activeProvider) return entries; - const capabilities = inferProviderCapabilities(activeProvider); - if (!capabilities || !capabilities.includes(params.capability)) return entries; - return [ - { - type: "provider", - provider: activeProvider, - model: params.activeModel?.model, - }, - ]; -} - -function buildModelDecision(params: { - entry: MediaUnderstandingModelConfig; - entryType: "provider" | "cli"; - outcome: MediaUnderstandingModelDecision["outcome"]; - reason?: string; -}): MediaUnderstandingModelDecision { - if (params.entryType === "cli") { - const command = params.entry.command?.trim(); 
- return { - type: "cli", - provider: command ?? "cli", - model: params.entry.model ?? command, - outcome: params.outcome, - reason: params.reason, - }; - } - const providerIdRaw = params.entry.provider?.trim(); - const providerId = providerIdRaw ? normalizeMediaProviderId(providerIdRaw) : undefined; - return { - type: "provider", - provider: providerId ?? providerIdRaw, - model: params.entry.model, - outcome: params.outcome, - reason: params.reason, - }; -} - -async function runProviderEntry(params: { - capability: MediaUnderstandingCapability; - entry: MediaUnderstandingModelConfig; - cfg: ClawdbotConfig; - ctx: MsgContext; - attachmentIndex: number; - cache: MediaAttachmentCache; - agentDir?: string; - providerRegistry: Map; - config?: MediaUnderstandingConfig; -}): Promise { - const { entry, capability, cfg } = params; - const providerIdRaw = entry.provider?.trim(); - if (!providerIdRaw) { - throw new Error(`Provider entry missing provider for ${capability}`); - } - const providerId = normalizeMediaProviderId(providerIdRaw); - const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config }); - const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config }); - const timeoutMs = resolveTimeoutMs( - entry.timeoutSeconds ?? - params.config?.timeoutSeconds ?? - cfg.tools?.media?.[capability]?.timeoutSeconds, - DEFAULT_TIMEOUT_SECONDS[capability], - ); - const prompt = resolvePrompt( - capability, - entry.prompt ?? params.config?.prompt ?? 
cfg.tools?.media?.[capability]?.prompt, - maxChars, - ); - - if (capability === "image") { - if (!params.agentDir) { - throw new Error("Image understanding requires agentDir"); - } - const modelId = entry.model?.trim(); - if (!modelId) { - throw new Error("Image understanding requires model id"); - } - const media = await params.cache.getBuffer({ - attachmentIndex: params.attachmentIndex, - maxBytes, - timeoutMs, - }); - const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry); - const result = provider?.describeImage - ? await provider.describeImage({ - buffer: media.buffer, - fileName: media.fileName, - mime: media.mime, - model: modelId, - provider: providerId, - prompt, - timeoutMs, - profile: entry.profile, - preferredProfile: entry.preferredProfile, - agentDir: params.agentDir, - cfg: params.cfg, - }) - : await describeImageWithModel({ - buffer: media.buffer, - fileName: media.fileName, - mime: media.mime, - model: modelId, - provider: providerId, - prompt, - timeoutMs, - profile: entry.profile, - preferredProfile: entry.preferredProfile, - agentDir: params.agentDir, - cfg: params.cfg, - }); - return { - kind: "image.description", - attachmentIndex: params.attachmentIndex, - text: trimOutput(result.text, maxChars), - provider: providerId, - model: result.model ?? 
modelId, - }; - } - - const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry); - if (!provider) { - throw new Error(`Media provider not available: ${providerId}`); - } - - if (capability === "audio") { - if (!provider.transcribeAudio) { - throw new Error(`Audio transcription provider "${providerId}" not available.`); - } - const media = await params.cache.getBuffer({ - attachmentIndex: params.attachmentIndex, - maxBytes, - timeoutMs, - }); - const key = await resolveApiKeyForProvider({ - provider: providerId, - cfg, - profileId: entry.profile, - preferredProfile: entry.preferredProfile, - agentDir: params.agentDir, - }); - const providerConfig = cfg.models?.providers?.[providerId]; - const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model; - const result = await provider.transcribeAudio({ - buffer: media.buffer, - fileName: media.fileName, - mime: media.mime, - apiKey: key.apiKey, - baseUrl: providerConfig?.baseUrl, - headers: providerConfig?.headers, - model, - language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language, - prompt, - timeoutMs, - }); - return { - kind: "audio.transcription", - attachmentIndex: params.attachmentIndex, - text: trimOutput(result.text, maxChars), - provider: providerId, - model: result.model ?? 
model, - }; - } - - if (!provider.describeVideo) { - throw new Error(`Video understanding provider "${providerId}" not available.`); - } - const media = await params.cache.getBuffer({ - attachmentIndex: params.attachmentIndex, - maxBytes, - timeoutMs, - }); - const estimatedBase64Bytes = estimateBase64Size(media.size); - const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes); - if (estimatedBase64Bytes > maxBase64Bytes) { - throw new MediaUnderstandingSkipError( - "maxBytes", - `Video attachment ${params.attachmentIndex + 1} base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`, - ); - } - const key = await resolveApiKeyForProvider({ - provider: providerId, - cfg, - profileId: entry.profile, - preferredProfile: entry.preferredProfile, - agentDir: params.agentDir, - }); - const providerConfig = cfg.models?.providers?.[providerId]; - const result = await provider.describeVideo({ - buffer: media.buffer, - fileName: media.fileName, - mime: media.mime, - apiKey: key.apiKey, - baseUrl: providerConfig?.baseUrl, - headers: providerConfig?.headers, - model: entry.model, - prompt, - timeoutMs, - }); - return { - kind: "video.description", - attachmentIndex: params.attachmentIndex, - text: trimOutput(result.text, maxChars), - provider: providerId, - model: result.model ?? entry.model, - }; -} - -async function runCliEntry(params: { - capability: MediaUnderstandingCapability; - entry: MediaUnderstandingModelConfig; - cfg: ClawdbotConfig; - ctx: MsgContext; - attachmentIndex: number; - cache: MediaAttachmentCache; - config?: MediaUnderstandingConfig; -}): Promise { - const { entry, capability, cfg, ctx } = params; - const command = entry.command?.trim(); - const args = entry.args ?? 
[]; - if (!command) { - throw new Error(`CLI entry missing command for ${capability}`); - } - const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config }); - const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config }); - const timeoutMs = resolveTimeoutMs( - entry.timeoutSeconds ?? - params.config?.timeoutSeconds ?? - cfg.tools?.media?.[capability]?.timeoutSeconds, - DEFAULT_TIMEOUT_SECONDS[capability], - ); - const prompt = resolvePrompt( - capability, - entry.prompt ?? params.config?.prompt ?? cfg.tools?.media?.[capability]?.prompt, - maxChars, - ); - const pathResult = await params.cache.getPath({ - attachmentIndex: params.attachmentIndex, - maxBytes, - timeoutMs, - }); - - const templCtx: MsgContext = { - ...ctx, - MediaPath: pathResult.path, - Prompt: prompt, - MaxChars: maxChars, - }; - const argv = [command, ...args].map((part, index) => - index === 0 ? part : applyTemplate(part, templCtx), - ); - if (shouldLogVerbose()) { - logVerbose(`Media understanding via CLI: ${argv.join(" ")}`); - } - const { stdout } = await runExec(argv[0], argv.slice(1), { - timeoutMs, - maxBuffer: CLI_OUTPUT_MAX_BUFFER, - }); - const text = trimOutput(stdout, maxChars); - if (!text) return null; - return { - kind: capability === "audio" ? 
"audio.transcription" : `${capability}.description`, - attachmentIndex: params.attachmentIndex, - text, - provider: "cli", - model: command, - }; -} - -async function runAttachmentEntries(params: { - capability: MediaUnderstandingCapability; - cfg: ClawdbotConfig; - ctx: MsgContext; - attachmentIndex: number; - agentDir?: string; - providerRegistry: Map; - cache: MediaAttachmentCache; - entries: MediaUnderstandingModelConfig[]; - config?: MediaUnderstandingConfig; -}): Promise<{ output: MediaUnderstandingOutput | null; attempts: MediaUnderstandingModelDecision[] }> { - const { entries, capability } = params; - const attempts: MediaUnderstandingModelDecision[] = []; - for (const entry of entries) { - try { - const entryType = entry.type ?? (entry.command ? "cli" : "provider"); - const result = - entryType === "cli" - ? await runCliEntry({ - capability, - entry, - cfg: params.cfg, - ctx: params.ctx, - attachmentIndex: params.attachmentIndex, - cache: params.cache, - config: params.config, - }) - : await runProviderEntry({ - capability, - entry, - cfg: params.cfg, - ctx: params.ctx, - attachmentIndex: params.attachmentIndex, - cache: params.cache, - agentDir: params.agentDir, - providerRegistry: params.providerRegistry, - config: params.config, - }); - if (result) { - const decision = buildModelDecision({ entry, entryType, outcome: "success" }); - if (result.provider) decision.provider = result.provider; - if (result.model) decision.model = result.model; - attempts.push(decision); - return { output: result, attempts }; - } - attempts.push( - buildModelDecision({ entry, entryType, outcome: "skipped", reason: "empty output" }), - ); - } catch (err) { - if (isMediaUnderstandingSkipError(err)) { - attempts.push( - buildModelDecision({ - entry, - entryType: entry.type ?? (entry.command ? 
"cli" : "provider"), - outcome: "skipped", - reason: `${err.reason}: ${err.message}`, - }), - ); - if (shouldLogVerbose()) { - logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`); - } - continue; - } - attempts.push( - buildModelDecision({ - entry, - entryType: entry.type ?? (entry.command ? "cli" : "provider"), - outcome: "failed", - reason: String(err), - }), - ); - if (shouldLogVerbose()) { - logVerbose(`${capability} understanding failed: ${String(err)}`); - } - } - } - - return { output: null, attempts }; -} - -async function runCapability(params: { - capability: MediaUnderstandingCapability; - cfg: ClawdbotConfig; - ctx: MsgContext; - attachments: MediaAttachmentCache; - media: ReturnType; - agentDir?: string; - providerRegistry: Map; - config?: MediaUnderstandingConfig; - activeModel?: ActiveMediaModel; -}): Promise<{ outputs: MediaUnderstandingOutput[]; decision: MediaUnderstandingDecision }> { - const { capability, cfg, ctx } = params; - const config = params.config ?? 
resolveCapabilityConfig(cfg, capability); - if (config?.enabled === false) { - return { - outputs: [], - decision: { capability, outcome: "disabled", attachments: [] }, - }; - } - - const attachmentPolicy = config?.attachments; - const selected = selectAttachments({ - capability, - attachments: params.media, - policy: attachmentPolicy, - }); - if (selected.length === 0) { - return { - outputs: [], - decision: { capability, outcome: "no-attachment", attachments: [] }, - }; - } - - const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx }); - if (scopeDecision === "deny") { - if (shouldLogVerbose()) { - logVerbose(`${capability} understanding disabled by scope policy.`); - } - return { - outputs: [], - decision: { - capability, - outcome: "scope-deny", - attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })), - }, - }; - } - - const entries = resolveEntriesWithActiveFallback({ - cfg, - capability, - config, - activeModel: params.activeModel, - }); - if (entries.length === 0) { - return { - outputs: [], - decision: { - capability, - outcome: "skipped", - attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })), - }, - }; - } - - const outputs: MediaUnderstandingOutput[] = []; - const attachmentDecisions: MediaUnderstandingDecision["attachments"] = []; - for (const attachment of selected) { - const { output, attempts } = await runAttachmentEntries({ - capability, - cfg, - ctx, - attachmentIndex: attachment.index, - agentDir: params.agentDir, - providerRegistry: params.providerRegistry, - cache: params.attachments, - entries, - config, - }); - if (output) outputs.push(output); - attachmentDecisions.push({ - attachmentIndex: attachment.index, - attempts, - chosen: attempts.find((attempt) => attempt.outcome === "success"), - }); - } - return { - outputs, - decision: { - capability, - outcome: outputs.length > 0 ? 
"success" : "skipped", - attachments: attachmentDecisions, - }, - }; -} - export async function applyMediaUnderstanding(params: { ctx: MsgContext; cfg: ClawdbotConfig; @@ -542,13 +46,13 @@ export async function applyMediaUnderstanding(params: { .map((value) => extractMediaUserText(value)) .find((value) => value && value.trim()) ?? undefined; - const attachments = normalizeAttachments(ctx); - const providerRegistry = buildMediaUnderstandingRegistry(params.providers); - const cache = new MediaAttachmentCache(attachments); + const attachments = normalizeMediaAttachments(ctx); + const providerRegistry = buildProviderRegistry(params.providers); + const cache = createMediaAttachmentCache(attachments); try { const tasks = CAPABILITY_ORDER.map((capability) => async () => { - const config = resolveCapabilityConfig(cfg, capability); + const config = cfg.tools?.media?.[capability]; return await runCapability({ capability, cfg, @@ -565,17 +69,12 @@ export async function applyMediaUnderstanding(params: { const results = await runWithConcurrency(tasks, resolveConcurrency(cfg)); const outputs: MediaUnderstandingOutput[] = []; const decisions: MediaUnderstandingDecision[] = []; - for (const [index] of CAPABILITY_ORDER.entries()) { - const entry = results[index]; + for (const entry of results) { if (!entry) continue; - if (Array.isArray(entry.outputs)) { - for (const output of entry.outputs) { - outputs.push(output); - } - } - if (entry.decision) { - decisions.push(entry.decision); + for (const output of entry.outputs) { + outputs.push(output); } + decisions.push(entry.decision); } if (decisions.length > 0) { diff --git a/src/media-understanding/providers/anthropic/index.ts b/src/media-understanding/providers/anthropic/index.ts index 3f9fc584c3f..35ae04a921e 100644 --- a/src/media-understanding/providers/anthropic/index.ts +++ b/src/media-understanding/providers/anthropic/index.ts @@ -3,5 +3,6 @@ import { describeImageWithModel } from "../image.js"; export const anthropicProvider: 
MediaUnderstandingProvider = { id: "anthropic", + capabilities: ["image"], describeImage: describeImageWithModel, }; diff --git a/src/media-understanding/providers/google/index.ts b/src/media-understanding/providers/google/index.ts index d0f8bae3b8b..6b3d412ba84 100644 --- a/src/media-understanding/providers/google/index.ts +++ b/src/media-understanding/providers/google/index.ts @@ -4,6 +4,7 @@ import { describeGeminiVideo } from "./video.js"; export const googleProvider: MediaUnderstandingProvider = { id: "google", + capabilities: ["image", "audio", "video"], describeImage: describeImageWithModel, describeVideo: describeGeminiVideo, }; diff --git a/src/media-understanding/providers/groq/index.ts b/src/media-understanding/providers/groq/index.ts index 451799e8ef9..5f59e5702ab 100644 --- a/src/media-understanding/providers/groq/index.ts +++ b/src/media-understanding/providers/groq/index.ts @@ -5,6 +5,7 @@ const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1"; export const groqProvider: MediaUnderstandingProvider = { id: "groq", + capabilities: ["audio"], transcribeAudio: (req) => transcribeOpenAiCompatibleAudio({ ...req, diff --git a/src/media-understanding/providers/index.ts b/src/media-understanding/providers/index.ts index 6f4387a10a5..9c560c8e72a 100644 --- a/src/media-understanding/providers/index.ts +++ b/src/media-understanding/providers/index.ts @@ -29,7 +29,16 @@ export function buildMediaUnderstandingRegistry( } if (overrides) { for (const [key, provider] of Object.entries(overrides)) { - registry.set(normalizeMediaProviderId(key), provider); + const normalizedKey = normalizeMediaProviderId(key); + const existing = registry.get(normalizedKey); + const merged = existing + ? { + ...existing, + ...provider, + capabilities: provider.capabilities ?? 
existing.capabilities, + } + : provider; + registry.set(normalizedKey, merged); } } return registry; diff --git a/src/media-understanding/providers/minimax/index.ts b/src/media-understanding/providers/minimax/index.ts index 8d500353845..6fa6ebf351a 100644 --- a/src/media-understanding/providers/minimax/index.ts +++ b/src/media-understanding/providers/minimax/index.ts @@ -3,5 +3,6 @@ import { describeImageWithModel } from "../image.js"; export const minimaxProvider: MediaUnderstandingProvider = { id: "minimax", + capabilities: ["image"], describeImage: describeImageWithModel, }; diff --git a/src/media-understanding/providers/openai/index.ts b/src/media-understanding/providers/openai/index.ts index 0aabb275fbf..d6e735c18ef 100644 --- a/src/media-understanding/providers/openai/index.ts +++ b/src/media-understanding/providers/openai/index.ts @@ -4,6 +4,7 @@ import { transcribeOpenAiCompatibleAudio } from "./audio.js"; export const openaiProvider: MediaUnderstandingProvider = { id: "openai", + capabilities: ["image"], describeImage: describeImageWithModel, transcribeAudio: transcribeOpenAiCompatibleAudio, }; diff --git a/src/media-understanding/resolve.ts b/src/media-understanding/resolve.ts index bc34ce7d5b3..8f260542e8d 100644 --- a/src/media-understanding/resolve.ts +++ b/src/media-understanding/resolve.ts @@ -77,36 +77,22 @@ export function resolveScopeDecision(params: { }); } -export function inferProviderCapabilities( - providerId?: string, -): MediaUnderstandingCapability[] | undefined { - const provider = normalizeMediaProviderId(providerId ?? 
""); - if (!provider) return undefined; - if (provider === "openai" || provider === "anthropic" || provider === "minimax") { - return ["image"]; - } - if (provider === "google") { - return ["image", "audio", "video"]; - } - if (provider === "groq") { - return ["audio"]; - } - return undefined; -} - -function inferCapabilities( - entry: MediaUnderstandingModelConfig, -): MediaUnderstandingCapability[] | undefined { - if ((entry.type ?? (entry.command ? "cli" : "provider")) === "cli") { - return undefined; - } - return inferProviderCapabilities(entry.provider); +function resolveEntryCapabilities(params: { + entry: MediaUnderstandingModelConfig; + providerRegistry: Map; +}): MediaUnderstandingCapability[] | undefined { + const entryType = params.entry.type ?? (params.entry.command ? "cli" : "provider"); + if (entryType === "cli") return undefined; + const providerId = normalizeMediaProviderId(params.entry.provider ?? ""); + if (!providerId) return undefined; + return params.providerRegistry.get(providerId)?.capabilities; } export function resolveModelEntries(params: { cfg: ClawdbotConfig; capability: MediaUnderstandingCapability; config?: MediaUnderstandingConfig; + providerRegistry: Map; }): MediaUnderstandingModelConfig[] { const { cfg, capability, config } = params; const sharedModels = cfg.tools?.media?.models ?? []; @@ -122,7 +108,7 @@ export function resolveModelEntries(params: { entry.capabilities && entry.capabilities.length > 0 ? entry.capabilities : source === "shared" - ? inferCapabilities(entry) + ? 
resolveEntryCapabilities({ entry, providerRegistry: params.providerRegistry }) : undefined; if (!caps || caps.length === 0) { if (source === "shared") { @@ -148,13 +134,32 @@ export function resolveConcurrency(cfg: ClawdbotConfig): number { return DEFAULT_MEDIA_CONCURRENCY; } -export function resolveCapabilityEnabled(params: { +export function resolveEntriesWithActiveFallback(params: { cfg: ClawdbotConfig; + capability: MediaUnderstandingCapability; config?: MediaUnderstandingConfig; -}): boolean { - if (params.config?.enabled === false) return false; - const sharedModels = params.cfg.tools?.media?.models ?? []; - const hasModels = (params.config?.models?.length ?? 0) > 0 || sharedModels.length > 0; - if (!hasModels) return false; - return true; + providerRegistry: Map; + activeModel?: { provider: string; model?: string }; +}): MediaUnderstandingModelConfig[] { + const entries = resolveModelEntries({ + cfg: params.cfg, + capability: params.capability, + config: params.config, + providerRegistry: params.providerRegistry, + }); + if (entries.length > 0) return entries; + if (params.config?.enabled !== true) return entries; + const activeProviderRaw = params.activeModel?.provider?.trim(); + if (!activeProviderRaw) return entries; + const activeProvider = normalizeMediaProviderId(activeProviderRaw); + if (!activeProvider) return entries; + const capabilities = params.providerRegistry.get(activeProvider)?.capabilities; + if (!capabilities || !capabilities.includes(params.capability)) return entries; + return [ + { + type: "provider", + provider: activeProvider, + model: params.activeModel?.model, + }, + ]; } diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts new file mode 100644 index 00000000000..d33ab6296ab --- /dev/null +++ b/src/media-understanding/runner.ts @@ -0,0 +1,506 @@ +import type { ClawdbotConfig } from "../config/config.js"; +import type { MsgContext } from "../auto-reply/templating.js"; +import { applyTemplate } from 
"../auto-reply/templating.js"; +import { resolveApiKeyForProvider } from "../agents/model-auth.js"; +import { logVerbose, shouldLogVerbose } from "../globals.js"; +import { runExec } from "../process/exec.js"; +import type { + MediaUnderstandingConfig, + MediaUnderstandingModelConfig, +} from "../config/types.tools.js"; +import { MediaAttachmentCache, normalizeAttachments, selectAttachments } from "./attachments.js"; +import { + CLI_OUTPUT_MAX_BUFFER, + DEFAULT_AUDIO_MODELS, + DEFAULT_TIMEOUT_SECONDS, +} from "./defaults.js"; +import { isMediaUnderstandingSkipError, MediaUnderstandingSkipError } from "./errors.js"; +import { + resolveEntriesWithActiveFallback, + resolveMaxBytes, + resolveMaxChars, + resolvePrompt, + resolveScopeDecision, + resolveTimeoutMs, +} from "./resolve.js"; +import type { + MediaAttachment, + MediaUnderstandingCapability, + MediaUnderstandingDecision, + MediaUnderstandingModelDecision, + MediaUnderstandingOutput, + MediaUnderstandingProvider, +} from "./types.js"; +import { + buildMediaUnderstandingRegistry, + getMediaUnderstandingProvider, + normalizeMediaProviderId, +} from "./providers/index.js"; +import { describeImageWithModel } from "./providers/image.js"; +import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js"; + +/** Provider/model pair accepted by runCapability as `activeModel` and forwarded to resolveEntriesWithActiveFallback. */ export type ActiveMediaModel = { + provider: string; + model?: string; +}; + +/** Registry type returned by buildProviderRegistry and passed to getMediaUnderstandingProvider for lookups. NOTE(review): `Map` carries no type arguments in this text; they look lost in extraction, confirm against the real file. */ type ProviderRegistry = Map; + +/** Result of runCapability: the outputs that were produced plus the decision record describing what was attempted. */ export type RunCapabilityResult = { + outputs: MediaUnderstandingOutput[]; + decision: MediaUnderstandingDecision; +}; + +/** Builds the provider registry by delegating to buildMediaUnderstandingRegistry with the optional overrides. */ export function buildProviderRegistry( + overrides?: Record, +): ProviderRegistry { + return buildMediaUnderstandingRegistry(overrides); +} + +/** Thin wrapper that returns normalizeAttachments(ctx). */ export function normalizeMediaAttachments(ctx: MsgContext): MediaAttachment[] { + return normalizeAttachments(ctx); +} + +/** Wraps the given attachments in a new MediaAttachmentCache. */ export function createMediaAttachmentCache(attachments: MediaAttachment[]): MediaAttachmentCache { + return new MediaAttachmentCache(attachments); +} + +/** Trims surrounding whitespace; when maxChars is truthy and the trimmed text is longer, cuts it to maxChars characters and trims again. A maxChars of 0 or undefined applies no limit. */ function trimOutput(text: string, maxChars?: number): string {
+ const trimmed = text.trim(); + if (!maxChars || trimmed.length <= maxChars) return trimmed; + return trimmed.slice(0, maxChars).trim(); +} + +/** Builds the per-attempt decision record for one model entry. CLI entries report the trimmed command as provider (falling back to "cli" when the command is absent) and entry.model, else the command, as model. Provider entries report the normalized provider id and entry.model. outcome and reason are copied through unchanged. */ function buildModelDecision(params: { + entry: MediaUnderstandingModelConfig; + entryType: "provider" | "cli"; + outcome: MediaUnderstandingModelDecision["outcome"]; + reason?: string; +}): MediaUnderstandingModelDecision { + if (params.entryType === "cli") { + const command = params.entry.command?.trim(); + return { + type: "cli", + provider: command ?? "cli", + model: params.entry.model ?? command, + outcome: params.outcome, + reason: params.reason, + }; + } + const providerIdRaw = params.entry.provider?.trim(); + const providerId = providerIdRaw ? normalizeMediaProviderId(providerIdRaw) : undefined; + return { + type: "provider", + provider: providerId ?? providerIdRaw, + model: params.entry.model, + outcome: params.outcome, + reason: params.reason, + }; +} + +/** Runs one provider-backed entry against a single attachment. Timeout and prompt fall back from the entry, to params.config, to cfg.tools.media[capability]; size and length limits come from resolveMaxBytes / resolveMaxChars. Dispatches on capability: "image" uses the provider's describeImage or else describeImageWithModel, "audio" uses transcribeAudio, anything else is treated as video and uses describeVideo. Throws Error when the entry has no provider, when image lacks agentDir or a model id, or when the provider or its required method is unavailable; throws MediaUnderstandingSkipError when a video's estimated base64 size exceeds the limit. */ async function runProviderEntry(params: { + capability: MediaUnderstandingCapability; + entry: MediaUnderstandingModelConfig; + cfg: ClawdbotConfig; + ctx: MsgContext; + attachmentIndex: number; + cache: MediaAttachmentCache; + agentDir?: string; + providerRegistry: ProviderRegistry; + config?: MediaUnderstandingConfig; +}): Promise { + const { entry, capability, cfg } = params; + const providerIdRaw = entry.provider?.trim(); + if (!providerIdRaw) { + throw new Error(`Provider entry missing provider for ${capability}`); + } + const providerId = normalizeMediaProviderId(providerIdRaw); + const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config }); + const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config }); + const timeoutMs = resolveTimeoutMs( + entry.timeoutSeconds ?? + params.config?.timeoutSeconds ?? + cfg.tools?.media?.[capability]?.timeoutSeconds, + DEFAULT_TIMEOUT_SECONDS[capability], + ); + const prompt = resolvePrompt( + capability, + entry.prompt ?? params.config?.prompt ??
cfg.tools?.media?.[capability]?.prompt, + maxChars, + ); + + if (capability === "image") { + if (!params.agentDir) { + throw new Error("Image understanding requires agentDir"); + } + const modelId = entry.model?.trim(); + if (!modelId) { + throw new Error("Image understanding requires model id"); + } + const media = await params.cache.getBuffer({ + attachmentIndex: params.attachmentIndex, + maxBytes, + timeoutMs, + }); + const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry); + /* Use the registry provider's describeImage when it has one; otherwise fall back to describeImageWithModel with the same arguments. */ const result = provider?.describeImage + ? await provider.describeImage({ + buffer: media.buffer, + fileName: media.fileName, + mime: media.mime, + model: modelId, + provider: providerId, + prompt, + timeoutMs, + profile: entry.profile, + preferredProfile: entry.preferredProfile, + agentDir: params.agentDir, + cfg: params.cfg, + }) + : await describeImageWithModel({ + buffer: media.buffer, + fileName: media.fileName, + mime: media.mime, + model: modelId, + provider: providerId, + prompt, + timeoutMs, + profile: entry.profile, + preferredProfile: entry.preferredProfile, + agentDir: params.agentDir, + cfg: params.cfg, + }); + return { + kind: "image.description", + attachmentIndex: params.attachmentIndex, + text: trimOutput(result.text, maxChars), + provider: providerId, + model: result.model ??
modelId, + }; + } + + const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry); + if (!provider) { + throw new Error(`Media provider not available: ${providerId}`); + } + + if (capability === "audio") { + if (!provider.transcribeAudio) { + throw new Error(`Audio transcription provider "${providerId}" not available.`); + } + const media = await params.cache.getBuffer({ + attachmentIndex: params.attachmentIndex, + maxBytes, + timeoutMs, + }); + const key = await resolveApiKeyForProvider({ + provider: providerId, + cfg, + profileId: entry.profile, + preferredProfile: entry.preferredProfile, + agentDir: params.agentDir, + }); + const providerConfig = cfg.models?.providers?.[providerId]; + /* Model precedence: trimmed entry.model, then the DEFAULT_AUDIO_MODELS entry for this provider, then the raw entry.model. */ const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model; + const result = await provider.transcribeAudio({ + buffer: media.buffer, + fileName: media.fileName, + mime: media.mime, + apiKey: key.apiKey, + baseUrl: providerConfig?.baseUrl, + headers: providerConfig?.headers, + model, + language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language, + prompt, + timeoutMs, + }); + return { + kind: "audio.transcription", + attachmentIndex: params.attachmentIndex, + text: trimOutput(result.text, maxChars), + provider: providerId, + model: result.model ??
model, + }; + } + + if (!provider.describeVideo) { + throw new Error(`Video understanding provider "${providerId}" not available.`); + } + const media = await params.cache.getBuffer({ + attachmentIndex: params.attachmentIndex, + maxBytes, + timeoutMs, + }); + /* An oversized base64 payload raises a skip error, which the caller records as "skipped" rather than "failed". */ const estimatedBase64Bytes = estimateBase64Size(media.size); + const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes); + if (estimatedBase64Bytes > maxBase64Bytes) { + throw new MediaUnderstandingSkipError( + "maxBytes", + `Video attachment ${params.attachmentIndex + 1} base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`, + ); + } + const key = await resolveApiKeyForProvider({ + provider: providerId, + cfg, + profileId: entry.profile, + preferredProfile: entry.preferredProfile, + agentDir: params.agentDir, + }); + const providerConfig = cfg.models?.providers?.[providerId]; + const result = await provider.describeVideo({ + buffer: media.buffer, + fileName: media.fileName, + mime: media.mime, + apiKey: key.apiKey, + baseUrl: providerConfig?.baseUrl, + headers: providerConfig?.headers, + model: entry.model, + prompt, + timeoutMs, + }); + return { + kind: "video.description", + attachmentIndex: params.attachmentIndex, + text: trimOutput(result.text, maxChars), + provider: providerId, + model: result.model ?? entry.model, + }; +} + +/** Runs one CLI-backed entry against a single attachment: obtains a file path for the attachment from the cache, expands templates in every argument (with MediaPath, Prompt and MaxChars added to the context), executes the command through runExec, and returns the trimmed stdout as the output. Returns null when stdout is empty after trimming; throws Error when the entry has no command. */ async function runCliEntry(params: { + capability: MediaUnderstandingCapability; + entry: MediaUnderstandingModelConfig; + cfg: ClawdbotConfig; + ctx: MsgContext; + attachmentIndex: number; + cache: MediaAttachmentCache; + config?: MediaUnderstandingConfig; +}): Promise { + const { entry, capability, cfg, ctx } = params; + const command = entry.command?.trim(); + const args = entry.args ??
[]; + if (!command) { + throw new Error(`CLI entry missing command for ${capability}`); + } + const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config }); + const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config }); + const timeoutMs = resolveTimeoutMs( + entry.timeoutSeconds ?? + params.config?.timeoutSeconds ?? + cfg.tools?.media?.[capability]?.timeoutSeconds, + DEFAULT_TIMEOUT_SECONDS[capability], + ); + const prompt = resolvePrompt( + capability, + entry.prompt ?? params.config?.prompt ?? cfg.tools?.media?.[capability]?.prompt, + maxChars, + ); + const pathResult = await params.cache.getPath({ + attachmentIndex: params.attachmentIndex, + maxBytes, + timeoutMs, + }); + + const templCtx: MsgContext = { + ...ctx, + MediaPath: pathResult.path, + Prompt: prompt, + MaxChars: maxChars, + }; + /* The command itself (index 0) is passed through untouched; only its arguments are templated. */ const argv = [command, ...args].map((part, index) => + index === 0 ? part : applyTemplate(part, templCtx), + ); + if (shouldLogVerbose()) { + logVerbose(`Media understanding via CLI: ${argv.join(" ")}`); + } + const { stdout } = await runExec(argv[0], argv.slice(1), { + timeoutMs, + maxBuffer: CLI_OUTPUT_MAX_BUFFER, + }); + const text = trimOutput(stdout, maxChars); + if (!text) return null; + return { + kind: capability === "audio" ?
"audio.transcription" : `${capability}.description`, + attachmentIndex: params.attachmentIndex, + text, + provider: "cli", + model: command, + }; +} + +/** Tries each entry in order for one attachment and stops at the first one that yields output. Every attempt is recorded with an outcome: "success", "skipped" (empty output or a MediaUnderstandingSkipError) or "failed" (any other thrown error, stringified into reason). Errors are caught here and do not propagate; output is null when no entry succeeded. An entry without an explicit type is treated as "cli" when it has a command, else "provider". */ async function runAttachmentEntries(params: { + capability: MediaUnderstandingCapability; + cfg: ClawdbotConfig; + ctx: MsgContext; + attachmentIndex: number; + agentDir?: string; + providerRegistry: ProviderRegistry; + cache: MediaAttachmentCache; + entries: MediaUnderstandingModelConfig[]; + config?: MediaUnderstandingConfig; +}): Promise<{ output: MediaUnderstandingOutput | null; attempts: MediaUnderstandingModelDecision[] }> { + const { entries, capability } = params; + const attempts: MediaUnderstandingModelDecision[] = []; + for (const entry of entries) { + const entryType = entry.type ?? (entry.command ? "cli" : "provider"); + try { + const result = + entryType === "cli" + ? await runCliEntry({ + capability, + entry, + cfg: params.cfg, + ctx: params.ctx, + attachmentIndex: params.attachmentIndex, + cache: params.cache, + config: params.config, + }) + : await runProviderEntry({ + capability, + entry, + cfg: params.cfg, + ctx: params.ctx, + attachmentIndex: params.attachmentIndex, + cache: params.cache, + agentDir: params.agentDir, + providerRegistry: params.providerRegistry, + config: params.config, + }); + if (result) { + const decision = buildModelDecision({ entry, entryType, outcome: "success" }); + /* Prefer the provider/model reported by the result over the configured values. */ if (result.provider) decision.provider = result.provider; + if (result.model) decision.model = result.model; + attempts.push(decision); + return { output: result, attempts }; + } + attempts.push( + buildModelDecision({ entry, entryType, outcome: "skipped", reason: "empty output" }), + ); + } catch (err) { + if (isMediaUnderstandingSkipError(err)) { + attempts.push( + buildModelDecision({ + entry, + entryType, + outcome: "skipped", + reason: `${err.reason}: ${err.message}`, + }), + ); + if (shouldLogVerbose()) { + logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`); + } + continue; + }
+ /* Any non-skip error is recorded as "failed"; the loop then moves on to the next entry. */ attempts.push( + buildModelDecision({ + entry, + entryType, + outcome: "failed", + reason: String(err), + }), + ); + if (shouldLogVerbose()) { + logVerbose(`${capability} understanding failed: ${String(err)}`); + } + } + } + + return { output: null, attempts }; +} + +/** Runs one capability over the selected attachments and returns the outputs plus a decision record. Uses params.config when given, else cfg.tools.media[capability]. Returns early with outcome "disabled" when config.enabled is false, "no-attachment" when nothing is selected, "scope-deny" when the scope decision is "deny", and "skipped" when no model entries resolve. Otherwise attachments are processed one at a time through runAttachmentEntries, and the overall outcome is "success" if at least one output was produced, else "skipped". NOTE(review): the `Promise` return type has no type argument in this text; confirm against the real file. */ export async function runCapability(params: { + capability: MediaUnderstandingCapability; + cfg: ClawdbotConfig; + ctx: MsgContext; + attachments: MediaAttachmentCache; + media: MediaAttachment[]; + agentDir?: string; + providerRegistry: ProviderRegistry; + config?: MediaUnderstandingConfig; + activeModel?: ActiveMediaModel; +}): Promise { + const { capability, cfg, ctx } = params; + const config = params.config ?? cfg.tools?.media?.[capability]; + if (config?.enabled === false) { + return { + outputs: [], + decision: { capability, outcome: "disabled", attachments: [] }, + }; + } + + const attachmentPolicy = config?.attachments; + const selected = selectAttachments({ + capability, + attachments: params.media, + policy: attachmentPolicy, + }); + if (selected.length === 0) { + return { + outputs: [], + decision: { capability, outcome: "no-attachment", attachments: [] }, + }; + } + + const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx }); + if (scopeDecision === "deny") { + if (shouldLogVerbose()) { + logVerbose(`${capability} understanding disabled by scope policy.`); + } + return { + outputs: [], + decision: { + capability, + outcome: "scope-deny", + attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })), + }, + }; + } + + const entries = resolveEntriesWithActiveFallback({ + cfg, + capability, + config, + providerRegistry: params.providerRegistry, + activeModel: params.activeModel, + }); + if (entries.length === 0) { + return { + outputs: [], + decision: { + capability, + outcome: "skipped", + attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })), + }, + }; + } + + const outputs: MediaUnderstandingOutput[] = []; + const
attachmentDecisions: MediaUnderstandingDecision["attachments"] = []; + for (const attachment of selected) { + const { output, attempts } = await runAttachmentEntries({ + capability, + cfg, + ctx, + attachmentIndex: attachment.index, + agentDir: params.agentDir, + providerRegistry: params.providerRegistry, + cache: params.attachments, + entries, + config, + }); + if (output) outputs.push(output); + attachmentDecisions.push({ + attachmentIndex: attachment.index, + attempts, + chosen: attempts.find((attempt) => attempt.outcome === "success"), + }); + } + return { + outputs, + decision: { + capability, + outcome: outputs.length > 0 ? "success" : "skipped", + attachments: attachmentDecisions, + }, + }; +} diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts index 078c5908211..c0aa11c40d8 100644 --- a/src/media-understanding/types.ts +++ b/src/media-understanding/types.ts @@ -106,6 +106,7 @@ export type ImageDescriptionResult = { export type MediaUnderstandingProvider = { id: string; + capabilities?: MediaUnderstandingCapability[]; transcribeAudio?: (req: AudioTranscriptionRequest) => Promise; describeVideo?: (req: VideoDescriptionRequest) => Promise; describeImage?: (req: ImageDescriptionRequest) => Promise;