diff --git a/CHANGELOG.md b/CHANGELOG.md index 048572ce3d2..1435c2f2021 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai - Providers/DeepSeek: backfill DeepSeek V4 `reasoning_content` on plain assistant replay messages as well as tool-call turns, so thinking sessions with prior tool use no longer fail follow-up requests with missing reasoning content. Fixes #73417; refs #71372. Thanks @34262315716 and @Bartok9. - Auto-reply: preserve voice-note media from silent turns while continuing to suppress text and non-voice media, so `NO_REPLY` TTS replies still deliver the requested audio bubble. (#73406) Thanks @zqchris. - Channels/Mattermost: stop enqueueing regular inbound posts as system events, so Mattermost user messages reach the model only as user-role inbound-envelope content instead of also appearing as `System: Mattermost message...` directives. Fixes #71795. Thanks @juan-flores077. +- Agents/media: qualify bare `agents.defaults.imageModel` and `pdfModel` refs from unique configured image-capable providers, so Ollama vision models such as `moondream` and `qwen2.5vl:7b` do not fall through to the default provider. Fixes #38816; supersedes #73396. Thanks @alainasclaw and @vincentkoc. - Agents/Anthropic: send implicit Anthropic beta headers only to direct public Anthropic endpoints, including OAuth, so custom Anthropic-compatible providers no longer mis-handle unsupported beta flags unless explicitly configured. Refs #73346. Thanks @byBrodowski. - Skills: require explicit `skills.entries.coding-agent.enabled` before exposing the bundled coding-agent skill, so installs with Codex on PATH but no OpenAI auth do not silently offer Codex delegation. Fixes #73358. Thanks @LaFleurAdvertising and @Sanjays2402. 
- Agents/subagents: preserve `sessions_yield` as a paused subagent state and ignore its wait text while freezing completion output, so parent sessions wait for the final post-compaction answer instead of receiving intermediate progress or `(no output)`. Fixes #73413. Thanks @Ask-sola. diff --git a/docs/gateway/config-agents.md b/docs/gateway/config-agents.md index 44fc5f494c5..b9c2eaf6523 100644 --- a/docs/gateway/config-agents.md +++ b/docs/gateway/config-agents.md @@ -342,6 +342,7 @@ Time format in system prompt. Default: `auto` (OS preference). - `imageModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`). - Used by the `image` tool path as its vision-model config. - Also used as fallback routing when the selected/default model cannot accept image input. + - Prefer explicit `provider/model` refs. Bare IDs are accepted for compatibility; if a bare ID uniquely matches a configured image-capable entry in `models.providers.*.models`, OpenClaw qualifies it to that provider. Ambiguous configured matches require an explicit provider prefix. - `imageGenerationModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`). - Used by the shared image-generation capability and any future tool/plugin surface that generates images. - Typical values: `google/gemini-3.1-flash-image-preview` for native Gemini image generation, `fal/fal-ai/flux/dev` for fal, `openai/gpt-image-2` for OpenAI Images, or `openai/gpt-image-1.5` for transparent-background OpenAI PNG/WebP output. diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md index df4cf5bbe67..9a685763fbb 100644 --- a/docs/nodes/media-understanding.md +++ b/docs/nodes/media-understanding.md @@ -172,6 +172,7 @@ If `tools.media.<capability>.enabled` is **not** set to `false` and you haven't `agents.defaults.imageModel` primary/fallback refs (image only). + Prefer `provider/model` refs. 
Bare refs are qualified from configured image-capable provider model entries only when the match is unique. Local CLIs (if installed): diff --git a/docs/providers/ollama.md b/docs/providers/ollama.md index 427a6108ab0..c6a243b4347 100644 --- a/docs/providers/ollama.md +++ b/docs/providers/ollama.md @@ -283,6 +283,8 @@ To make Ollama the default image-understanding model for inbound media, configur } ``` +Prefer the full `ollama/<model>` ref. If the same model is listed under `models.providers.ollama.models` with `input: ["text", "image"]` and no other configured image provider exposes that bare model ID, OpenClaw also normalizes a bare `imageModel` ref such as `qwen2.5vl:7b` to `ollama/qwen2.5vl:7b`. If more than one configured image provider has the same bare ID, use the provider prefix explicitly. + Slow local vision models can need a longer image-understanding timeout than cloud models. They can also crash or stop when Ollama tries to allocate the full advertised vision context on constrained hardware. 
Set a capability timeout, and cap `num_ctx` on the model entry when you only need a normal image-description turn: ```json5 diff --git a/src/agents/command/attempt-execution.cli.test.ts b/src/agents/command/attempt-execution.cli.test.ts index f53422e46ad..638fb0a8608 100644 --- a/src/agents/command/attempt-execution.cli.test.ts +++ b/src/agents/command/attempt-execution.cli.test.ts @@ -543,8 +543,8 @@ describe("CLI attempt execution", () => { await runAgentAttempt({ providerOverride: "anthropic", - originalProvider: "anthropic", modelOverride: "claude-opus-4-7", + originalProvider: "anthropic", cfg: { agents: { defaults: { diff --git a/src/agents/tools/image-tool.helpers.ts b/src/agents/tools/image-tool.helpers.ts index 9bba2d18cbc..7dd18ae404d 100644 --- a/src/agents/tools/image-tool.helpers.ts +++ b/src/agents/tools/image-tool.helpers.ts @@ -2,7 +2,7 @@ import type { AssistantMessage } from "@mariozechner/pi-ai"; import type { OpenClawConfig } from "../../config/types.openclaw.js"; import { estimateBase64DecodedBytes } from "../../media/base64.js"; import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js"; -import { findNormalizedProviderValue } from "../model-selection.js"; +import { findNormalizedProviderValue, normalizeProviderId } from "../model-selection.js"; import { extractAssistantText } from "../pi-embedded-utils.js"; import { coerceToolModelConfig, type ToolModelConfig } from "./model-config.helpers.js"; @@ -134,6 +134,106 @@ export function coerceImageModelConfig(cfg?: OpenClawConfig): ImageModelConfig { return coerceToolModelConfig(cfg?.agents?.defaults?.imageModel); } +function formatConfiguredImageModelRef(provider: string, modelId: string): string { + const slash = modelId.indexOf("/"); + if (slash > 0 && normalizeProviderId(modelId.slice(0, slash)) === provider) { + return modelId; + } + return `${provider}/${modelId}`; +} + +function modelIdMatchesProviderlessRef(params: { + provider: string; + modelId: string; + ref: 
string; +}): boolean { + const candidates = new Set([params.modelId]); + const slash = params.modelId.indexOf("/"); + if (slash > 0 && normalizeProviderId(params.modelId.slice(0, slash)) === params.provider) { + candidates.add(params.modelId.slice(slash + 1)); + } + const normalizedRef = normalizeLowercaseStringOrEmpty(params.ref); + for (const candidate of candidates) { + if (candidate === params.ref || normalizeLowercaseStringOrEmpty(candidate) === normalizedRef) { + return true; + } + } + return false; +} + +function findConfiguredImageModelMatches(params: { cfg?: OpenClawConfig; ref: string }): string[] { + const providers = params.cfg?.models?.providers; + if (!providers || typeof providers !== "object") { + return []; + } + + const matches = new Set(); + for (const [providerKey, providerConfig] of Object.entries(providers)) { + const provider = normalizeProviderId(providerKey); + if (!provider || !Array.isArray(providerConfig?.models)) { + continue; + } + for (const entry of providerConfig.models) { + const modelId = entry?.id?.trim(); + if (!modelId || !Array.isArray(entry?.input) || !entry.input.includes("image")) { + continue; + } + if (!modelIdMatchesProviderlessRef({ provider, modelId, ref: params.ref })) { + continue; + } + matches.add(formatConfiguredImageModelRef(provider, modelId)); + } + } + return [...matches]; +} + +function resolveProviderlessConfiguredImageModelRef(params: { + cfg?: OpenClawConfig; + ref: string; +}): string { + const ref = params.ref.trim(); + if (!ref || ref.includes("/")) { + return ref; + } + + const matches = findConfiguredImageModelMatches({ cfg: params.cfg, ref }); + if (matches.length === 0) { + return ref; + } + if (matches.length === 1) { + return matches[0]; + } + throw new Error( + `Ambiguous image model "${ref}". 
Configure a provider-prefixed ref such as ${matches + .map((match) => `"${match}"`) + .join(" or ")}.`, + ); +} + +export function resolveConfiguredImageModelRefs(params: { + cfg?: OpenClawConfig; + imageModelConfig: ImageModelConfig; +}): ImageModelConfig { + const primary = params.imageModelConfig.primary?.trim(); + const fallbacks = params.imageModelConfig.fallbacks + ?.map((ref) => resolveProviderlessConfiguredImageModelRef({ cfg: params.cfg, ref })) + .filter((ref) => ref.length > 0); + + return { + ...(params.imageModelConfig.primary !== undefined + ? { + primary: primary + ? resolveProviderlessConfiguredImageModelRef({ cfg: params.cfg, ref: primary }) + : primary, + } + : {}), + ...(fallbacks && fallbacks.length > 0 ? { fallbacks } : {}), + ...(params.imageModelConfig.timeoutMs !== undefined + ? { timeoutMs: params.imageModelConfig.timeoutMs } + : {}), + }; +} + export function resolveProviderVisionModelFromConfig(params: { cfg?: OpenClawConfig; provider: string; diff --git a/src/agents/tools/image-tool.ollama.live.test.ts b/src/agents/tools/image-tool.ollama.live.test.ts index 6cb41b81880..341deaa5095 100644 --- a/src/agents/tools/image-tool.ollama.live.test.ts +++ b/src/agents/tools/image-tool.ollama.live.test.ts @@ -39,13 +39,13 @@ async function withLiveImageWorkspace( } describe.skipIf(!LIVE)("image tool Ollama live", () => { - it("describes a local image through the explicit image tool", async () => { + it("describes a local image through a providerless configured Ollama image model", async () => { process.env.OLLAMA_API_KEY ||= "ollama-local"; await withLiveImageWorkspace(async ({ agentDir, workspaceDir, imagePath }) => { const cfg: OpenClawConfig = { agents: { defaults: { - imageModel: { primary: `ollama/${OLLAMA_IMAGE_MODEL}` }, + imageModel: { primary: OLLAMA_IMAGE_MODEL }, }, }, models: { diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts index 30117854ca6..4998a76b911 100644 --- 
a/src/agents/tools/image-tool.test.ts +++ b/src/agents/tools/image-tool.test.ts @@ -920,6 +920,124 @@ describe("image tool implicit imageModel config", () => { }); }); + it("resolves providerless explicit image models from unique configured image providers", async () => { + await withTempAgentDir(async (agentDir) => { + const cfg: OpenClawConfig = { + agents: { + defaults: { + imageModel: { + primary: "moondream", + fallbacks: ["qwen2.5vl:7b", "G-2.5-f"], + }, + }, + }, + models: { + providers: { + ollama: { + baseUrl: "http://localhost:11434", + models: [ + makeModelDefinition("moondream", ["text", "image"]), + makeModelDefinition("qwen2.5vl:7b", ["text", "image"]), + makeModelDefinition("G-2.5-f", ["text", "image"]), + ], + }, + }, + }, + }; + + expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({ + primary: "ollama/moondream", + fallbacks: ["ollama/qwen2.5vl:7b", "ollama/G-2.5-f"], + }); + }); + }); + + it("runs providerless explicit image models on the inferred provider", async () => { + await withTempAgentDir(async (agentDir) => { + const describeImage = vi.fn(async (params: ImageDescriptionRequest) => ({ + text: `ok ${params.model}`, + model: params.model, + })); + installImageUnderstandingProviderStubs({ + id: "ollama", + capabilities: ["image"], + describeImage, + }); + const cfg: OpenClawConfig = { + agents: { + defaults: { + imageModel: { primary: "moondream" }, + }, + }, + models: { + providers: { + ollama: { + baseUrl: "http://localhost:11434", + models: [makeModelDefinition("moondream", ["text", "image"])], + }, + }, + }, + }; + + const tool = requireImageTool(createImageTool({ config: cfg, agentDir })); + const result = await tool.execute("t1", { + prompt: "Describe this image in one word.", + image: `data:image/png;base64,${ONE_PIXEL_PNG_B64}`, + }); + + expect(describeImage).toHaveBeenCalledWith( + expect.objectContaining({ provider: "ollama", model: "moondream" }), + ); + expect(result.content).toEqual( + 
expect.arrayContaining([expect.objectContaining({ type: "text", text: "ok moondream" })]), + ); + }); + }); + + it("rejects ambiguous providerless explicit image models", async () => { + await withTempAgentDir(async (agentDir) => { + const cfg: OpenClawConfig = { + agents: { + defaults: { + imageModel: { primary: "moondream" }, + }, + }, + models: { + providers: { + ollama: { + baseUrl: "http://localhost:11434", + models: [makeModelDefinition("moondream", ["text", "image"])], + }, + lmstudio: { + baseUrl: "http://localhost:1234", + models: [makeModelDefinition("moondream", ["text", "image"])], + }, + }, + }, + }; + + expect(() => resolveImageModelConfigForTool({ cfg, agentDir })).toThrow( + 'Ambiguous image model "moondream"', + ); + }); + }); + + it("keeps unmatched providerless explicit image models on the legacy default-provider path", async () => { + await withTempAgentDir(async (agentDir) => { + const cfg: OpenClawConfig = { + agents: { + defaults: { + imageModel: { primary: "gpt-5.4-mini" }, + }, + }, + }; + + expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({ + primary: "gpt-5.4-mini", + }); + }); + }); + it("keeps image tool available when primary model supports images (for explicit requests)", async () => { // When the primary model supports images, we still keep the tool available // because images are auto-injected into prompts. The tool description is diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index 4dfcc521b44..8699c5a0e69 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -30,6 +30,7 @@ import { decodeDataUrl, hasImageReasoningOnlyResponse, type ImageModelConfig, + resolveConfiguredImageModelRefs, resolveProviderVisionModelFromConfig, } from "./image-tool.helpers.js"; import { @@ -123,7 +124,10 @@ export function resolveImageModelConfigForTool(params: { // The tool description is adjusted via modelHasVision to discourage redundant usage. 
const explicit = coerceImageModelConfig(params.cfg); if (hasToolModelConfig(explicit)) { - return explicit; + return resolveConfiguredImageModelRefs({ + cfg: params.cfg, + imageModelConfig: explicit, + }); } const primary = resolveDefaultModelRef(params.cfg); diff --git a/src/agents/tools/pdf-tool.model-config.ts b/src/agents/tools/pdf-tool.model-config.ts index c6ac810de81..272301d6a3a 100644 --- a/src/agents/tools/pdf-tool.model-config.ts +++ b/src/agents/tools/pdf-tool.model-config.ts @@ -7,6 +7,7 @@ import { import { coerceImageModelConfig, type ImageModelConfig, + resolveConfiguredImageModelRefs, resolveProviderVisionModelFromConfig, } from "./image-tool.helpers.js"; import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js"; @@ -42,12 +43,18 @@ export function resolvePdfModelConfigForTool(params: { }): ImageModelConfig | null { const explicitPdf = coercePdfModelConfig(params.cfg); if (explicitPdf.primary?.trim() || (explicitPdf.fallbacks?.length ?? 0) > 0) { - return explicitPdf; + return resolveConfiguredImageModelRefs({ + cfg: params.cfg, + imageModelConfig: explicitPdf, + }); } const explicitImage = coerceImageModelConfig(params.cfg); if (explicitImage.primary?.trim() || (explicitImage.fallbacks?.length ?? 0) > 0) { - return explicitImage; + return resolveConfiguredImageModelRefs({ + cfg: params.cfg, + imageModelConfig: explicitImage, + }); } const primary = resolveDefaultModelRef(params.cfg);