/**
 * Builds provider-type media understanding entries from the agent-default
 * image model setting (`cfg.agents.defaults.imageModel`).
 *
 * The setting may be either a single `"provider/model"` string or an object
 * with a `primary` ref and an optional `fallbacks` list. Entries are emitted
 * in priority order: the primary ref first, then fallbacks in listed order.
 *
 * Each ref is split on its FIRST `/`, so the model part may itself contain
 * slashes. Refs that are not strings, or that lack a non-empty provider or
 * model segment, are skipped. Whitespace around either segment is removed,
 * and a ref that repeats an earlier provider/model pair is emitted only once.
 *
 * `resolveAutoEntries` consults this for the `"image"` capability and returns
 * the result when it is non-empty.
 *
 * @param cfg - Loaded configuration; only `agents.defaults.imageModel` is read.
 * @returns One `{ type: "provider", provider, model }` entry per distinct
 *   valid ref, or an empty array when nothing usable is configured.
 */
function resolveImageModelFromAgentDefaults(cfg: OpenClawConfig): MediaUnderstandingModelConfig[] {
  // Config values come from user-authored data, so narrow at runtime instead
  // of asserting a shape: a non-string entry must be ignored, not crash.
  const imageModel: unknown = cfg.agents?.defaults?.imageModel;

  const candidates: unknown[] = [];
  if (typeof imageModel === "string") {
    candidates.push(imageModel);
  } else if (typeof imageModel === "object" && imageModel !== null) {
    const { primary, fallbacks } = imageModel as { primary?: unknown; fallbacks?: unknown };
    candidates.push(primary);
    if (Array.isArray(fallbacks)) {
      candidates.push(...fallbacks);
    }
  }

  const entries: MediaUnderstandingModelConfig[] = [];
  const seen = new Set<string>();
  for (const candidate of candidates) {
    if (typeof candidate !== "string") {
      continue;
    }
    const ref = candidate.trim();
    const slashIdx = ref.indexOf("/");
    if (slashIdx <= 0) {
      // No separator, or nothing before it: there is no provider to use.
      continue;
    }
    // Trim each side so "openai / gpt-4o" does not yield "openai " / " gpt-4o".
    const provider = ref.slice(0, slashIdx).trim();
    const model = ref.slice(slashIdx + 1).trim();
    if (!provider || !model) {
      continue;
    }
    const key = `${provider}/${model}`;
    if (seen.has(key)) {
      // Same model listed twice (e.g. as primary and as a fallback).
      continue;
    }
    seen.add(key);
    entries.push({ type: "provider", provider, model });
  }
  return entries;
}