diff --git a/CHANGELOG.md b/CHANGELOG.md index 402dfbfda1e..49124a96676 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ Docs: https://docs.openclaw.ai ### Fixes - Telegram: handle managed select button callbacks before the raw callback fallback while preserving delimiter-containing option values such as `env|prod`. (#79816) Thanks @moeedahmed. +- CLI/media: let explicit image description model refs use bundled static provider catalogs and generic model-backed image hooks, so `openclaw infer image describe --model zai/glm-4.6v` works like direct model runs and Anthropic auth probes avoid stale Claude 3 Haiku catalog entries. - Browser: wait longer for existing-session Chrome MCP status and non-deep doctor probes so slow first attaches do not falsely report offline while keeping raw CDP status probes short. (#77473) Thanks @rubencu. - Exec approvals: keep `exec.approval.list` on the lightweight policy-summary path so listing pending approvals no longer loads the rich tree-sitter command explainer. (#76943) Thanks @rubencu. - Agents: surface concise default-visible warnings when `exec`/`bash` tool calls fail after the assistant claims success, while keeping raw stderr hidden unless verbose details are enabled. Fixes #60497. (#80003) Thanks @jbetala7. 
/**
 * Ranks a catalog model id as a probe candidate for a provider.
 *
 * The caller sorts candidates in ascending order of this value and takes the
 * first one, so a smaller number means "probe this model first".
 *
 * Only the literal provider id `"anthropic"` gets special treatment; every
 * other provider receives the neutral rank so catalog order decides. The model
 * id is trimmed and lower-cased before matching.
 *
 * Anthropic ranking, most preferred first:
 *   0   - dated Haiku 4.5 ids (`claude-haiku-4-5-` followed by exactly 8 digits)
 *   1   - the undated `claude-haiku-4-5` id
 *   2   - `claude-sonnet-4-6`, with or without a `-suffix`
 *   3   - any other `claude-sonnet-4-*` id
 *   50  - anything unrecognised (neutral)
 *   100 - `claude-3-*` ids, pushed behind everything else
 *
 * @param provider - Provider id; compared verbatim against `"anthropic"`.
 * @param modelId - Catalog model id to rank.
 * @returns The sort rank for the model; lower sorts earlier.
 */
function catalogProbePriority(provider: string, modelId: string): number {
  const neutralRank = 50;
  const normalizedId = modelId.trim().toLowerCase();

  if (provider !== "anthropic") {
    return neutralRank;
  }

  // Cases are evaluated top to bottom, so the more specific Sonnet 4.6 match
  // must stay ahead of the broader `claude-sonnet-4-` prefix.
  switch (true) {
    case /^claude-haiku-4-5-\d{8}$/.test(normalizedId):
      return 0;
    case normalizedId === "claude-haiku-4-5":
      return 1;
    case normalizedId === "claude-sonnet-4-6":
    case normalizedId.startsWith("claude-sonnet-4-6-"):
      return 2;
    case normalizedId.startsWith("claude-sonnet-4-"):
      return 3;
    case normalizedId.startsWith("claude-3-"):
      return 100;
    default:
      return neutralRank;
  }
}
import("./image.js"); @@ -126,6 +128,22 @@ describe("describeImageWithModel", () => { ({ modelRegistry, provider, modelId }: ResolveModelWithRegistryTestParams) => modelRegistry.find(provider, modelId), ); + resolveModelAsyncMock.mockImplementation( + async (provider: string, modelId: string, agentDir?: string, cfg?: unknown) => { + const authStorage = { + setRuntimeApiKey: setRuntimeApiKeyMock, + }; + const modelRegistry = discoverModelsMock(authStorage, agentDir); + const model = resolveModelWithRegistryMock({ + provider, + modelId, + modelRegistry, + cfg, + agentDir, + }); + return { authStorage, model, modelRegistry }; + }, + ); }); function getApiKeyForModelCall(index = 0): AuthRequestCall { diff --git a/src/media-understanding/image.ts b/src/media-understanding/image.ts index 35cc3f26dc8..2ea8cb3d1c9 100644 --- a/src/media-understanding/image.ts +++ b/src/media-understanding/image.ts @@ -6,16 +6,15 @@ import { requireApiKey, resolveApiKeyForProvider, } from "../agents/model-auth.js"; -import { findNormalizedProviderValue, normalizeModelRef } from "../agents/model-selection.js"; +import { normalizeModelRef } from "../agents/model-selection.js"; import { ensureOpenClawModelsJson } from "../agents/models-config.js"; -import { resolveModelWithRegistry } from "../agents/pi-embedded-runner/model.js"; +import { resolveModelAsync } from "../agents/pi-embedded-runner/model.js"; import { resolveProviderRequestCapabilities } from "../agents/provider-attribution.js"; import { registerProviderStreamForModel } from "../agents/provider-stream.js"; import { coerceImageAssistantText, hasImageReasoningOnlyResponse, } from "../agents/tools/image-tool.helpers.js"; -import { prepareProviderDynamicModel } from "../plugins/provider-runtime.js"; import type { ImageDescriptionRequest, ImageDescriptionResult, @@ -23,15 +22,6 @@ import type { ImagesDescriptionResult, } from "./types.js"; -let piModelDiscoveryRuntimePromise: Promise< - typeof 
import("../agents/pi-model-discovery-runtime.js") -> | null = null; - -function loadPiModelDiscoveryRuntime() { - piModelDiscoveryRuntimePromise ??= import("../agents/pi-model-discovery-runtime.js"); - return piModelDiscoveryRuntimePromise; -} - function resolveImageToolMaxTokens(modelMaxTokens: number | undefined, requestedMaxTokens = 4096) { if ( typeof modelMaxTokens !== "number" || @@ -143,48 +133,17 @@ async function resolveImageRuntime(params: { authStore?: ImageDescriptionRequest["authStore"]; }): Promise<{ apiKey: string; model: Model }> { await ensureOpenClawModelsJson(params.cfg, params.agentDir); - const { discoverAuthStorage, discoverModels } = await loadPiModelDiscoveryRuntime(); - const authStorage = discoverAuthStorage(params.agentDir); - const modelRegistry = discoverModels(authStorage, params.agentDir); const resolvedRef = normalizeModelRef(params.provider, params.model); - const configuredProviders = params.cfg.models?.providers; - const providerConfig = - configuredProviders?.[resolvedRef.provider] ?? - findNormalizedProviderValue(configuredProviders, resolvedRef.provider); - // Fast path: resolve without dynamic model preparation first. - // This avoids unnecessary prepare hooks (e.g. OpenRouter catalog fetch) - // for models that are already explicitly resolvable. - let model = resolveModelWithRegistry({ - provider: resolvedRef.provider, - modelId: resolvedRef.model, - modelRegistry, - cfg: params.cfg, - agentDir: params.agentDir, - }) as Model | null; - - // If the model is not in the registry yet, prepare dynamic provider models - // and retry (needed for provider-runtime-backed dynamic models). 
- if (!model) { - await prepareProviderDynamicModel({ - provider: resolvedRef.provider, - config: params.cfg, - context: { - config: params.cfg, - agentDir: params.agentDir, - provider: resolvedRef.provider, - modelId: resolvedRef.model, - modelRegistry, - providerConfig, - }, - }); - model = resolveModelWithRegistry({ - provider: resolvedRef.provider, - modelId: resolvedRef.model, - modelRegistry, - cfg: params.cfg, - agentDir: params.agentDir, - }) as Model | null; - } + const resolved = await resolveModelAsync( + resolvedRef.provider, + resolvedRef.model, + params.agentDir, + params.cfg, + { + allowBundledStaticCatalogFallback: true, + }, + ); + const { authStorage, model } = resolved; if (!model) { throw new Error(`Unknown model: ${resolvedRef.provider}/${resolvedRef.model}`); } diff --git a/src/media-understanding/provider-registry.test.ts b/src/media-understanding/provider-registry.test.ts index ec268c5742d..f7a1583a892 100644 --- a/src/media-understanding/provider-registry.test.ts +++ b/src/media-understanding/provider-registry.test.ts @@ -44,6 +44,8 @@ describe("media-understanding provider registry", () => { const registry = buildMediaUnderstandingRegistry(); expect(requireMediaProvider(registry, "groq").id).toBe("groq"); + expect(typeof requireMediaProvider(registry, "groq").describeImage).toBe("function"); + expect(typeof requireMediaProvider(registry, "groq").describeImages).toBe("function"); expect(requireMediaProvider(registry, "deepgram").id).toBe("deepgram"); expect(resolvePluginCapabilityProvidersMock).toHaveBeenCalledWith({ key: "mediaUnderstandingProviders", @@ -51,6 +53,23 @@ describe("media-understanding provider registry", () => { }); }); + it("hydrates manifest-only image providers with model-backed image hooks", () => { + resolvePluginCapabilityProvidersMock.mockReturnValue([ + createMediaProvider({ + id: "zai", + capabilities: ["image"], + defaultModels: { image: "glm-4.6v" }, + }), + ]); + + const registry = 
/**
 * Fills in missing image description hooks on an image-capable provider.
 *
 * A provider whose `capabilities` do not include `"image"`, or which already
 * supplies both `describeImage` and `describeImages`, is returned as the same
 * object. Otherwise a shallow copy is returned in which each missing hook
 * falls back to `describeImageWithModel` / `describeImagesWithModel`, while
 * any hook the provider did supply is kept.
 *
 * @param provider - Provider entry about to be stored in the registry.
 * @returns The original provider, or a copy with both image hooks populated.
 */
function hydrateModelBackedMediaProvider(
  provider: MediaUnderstandingProvider,
): MediaUnderstandingProvider {
  const supportsImages = provider.capabilities?.includes("image") === true;
  // Hooks are only inspected for image-capable providers (short-circuit).
  const needsHydration =
    supportsImages && !(provider.describeImage && provider.describeImages);
  if (!needsHydration) {
    return provider;
  }

  const describeImage = provider.describeImage ?? describeImageWithModel;
  const describeImages = provider.describeImages ?? describeImagesWithModel;
  return { ...provider, describeImage, describeImages };
}
mocks.describeImageWithModel.mockReset(); + mocks.describeImageWithModel.mockResolvedValue({ text: "generic image ok", model: "vision" }); mocks.runCapability.mockReset(); mocks.cleanup.mockReset(); mocks.cleanup.mockResolvedValue(undefined); @@ -204,6 +222,37 @@ describe("media-understanding runtime", () => { }); }); + it("uses the generic model-backed image runtime for explicit models without media hooks", async () => { + mocks.buildProviderRegistry.mockReturnValue( + new Map([["zai", { id: "zai", capabilities: ["image"] }]]), + ); + + await expect( + describeImageFileWithModel({ + filePath: "/tmp/sample.jpg", + mime: "image/jpeg", + provider: "zai", + model: "glm-4.6v", + prompt: "Describe it", + cfg: {} as OpenClawConfig, + agentDir: "/tmp/agent", + }), + ).resolves.toEqual({ text: "generic image ok", model: "vision" }); + + expect(mocks.describeImageWithModel).toHaveBeenCalledWith({ + buffer: Buffer.from("image"), + fileName: "sample.jpg", + mime: "image/jpeg", + provider: "zai", + model: "glm-4.6v", + prompt: "Describe it", + maxTokens: undefined, + timeoutMs: 30_000, + cfg: {}, + agentDir: "/tmp/agent", + }); + }); + it("surfaces the underlying provider failure when media understanding fails", async () => { mocks.normalizeMediaAttachments.mockReturnValue([ { index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" }, diff --git a/src/media-understanding/runtime.ts b/src/media-understanding/runtime.ts index 74cd4c0b052..369219e3d63 100644 --- a/src/media-understanding/runtime.ts +++ b/src/media-understanding/runtime.ts @@ -1,5 +1,6 @@ import path from "node:path"; import { readLocalFileSafely } from "../infra/fs-safe.js"; +import { describeImageWithModel } from "./image-runtime.js"; import { normalizeMediaProviderId } from "./provider-registry.js"; import { findDecisionReason, normalizeDecisionReason } from "./runner.entries.js"; import { @@ -153,11 +154,9 @@ export async function describeImageFileWithModel(params: DescribeImageFileWithMo const timeoutMs = 
params.timeoutMs ?? 30_000; const providerRegistry = buildProviderRegistry(undefined, params.cfg); const provider = providerRegistry.get(normalizeMediaProviderId(params.provider)); - if (!provider?.describeImage) { - throw new Error(`Provider does not support image analysis: ${params.provider}`); - } const buffer = (await readLocalFileSafely({ filePath: params.filePath })).buffer; - return await provider.describeImage({ + const describeImage = provider?.describeImage ?? describeImageWithModel; + return await describeImage({ buffer, fileName: path.basename(params.filePath), mime: params.mime,