From dab0e97c222c4dc3faebfd3f7fef219ea965041a Mon Sep 17 00:00:00 2001 From: Tars Date: Sun, 8 Mar 2026 04:30:53 +0800 Subject: [PATCH] fix(models): support minimax-portal coding plan vlm routing for image tool (openclaw#33953) Verified: - pnpm install --frozen-lockfile - pnpm build - pnpm check - pnpm test:macmini Co-authored-by: tars90percent <252094836+tars90percent@users.noreply.github.com> --- CHANGELOG.md | 1 + .../minimax-vlm.normalizes-api-key.test.ts | 11 ++ src/agents/minimax-vlm.ts | 8 ++ .../models-config.providers.nvidia.test.ts | 19 ++- src/agents/models-config.providers.ts | 9 +- src/agents/tools/image-tool.test.ts | 26 ++++ src/agents/tools/image-tool.ts | 8 +- src/media-understanding/defaults.test.ts | 14 ++ src/media-understanding/defaults.ts | 2 + .../providers/image.test.ts | 133 ++++++++++++++++++ src/media-understanding/providers/image.ts | 4 +- .../providers/index.test.ts | 8 ++ src/media-understanding/providers/index.ts | 3 +- .../providers/minimax/index.ts | 6 + src/telegram/sticker-cache.ts | 17 +-- 15 files changed, 246 insertions(+), 23 deletions(-) create mode 100644 src/media-understanding/providers/image.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 71a864bdd7a..89a67e187d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -134,6 +134,7 @@ Docs: https://docs.openclaw.ai - Routing/legacy route guard tightening: require legacy session-key channel hints to match the saved delivery channel before inheriting external routing metadata, preventing custom namespaced keys like `agent::work:` from inheriting stale non-webchat routes. - Gateway/internal client routing continuity: prevent webchat/TUI/UI turns from inheriting stale external reply routes by requiring explicit `deliver: true` for external delivery, keeping main-session external inheritance scoped to non-Webchat/UI clients, and honoring configured `session.mainKey` when identifying main-session continuity. (from #35321, #34635, #35356) Thanks @alexyyyander and @Octane0411. - Security/auth labels: remove token and API-key snippets from user-facing auth status labels so `/status` and `/models` do not expose credential fragments. (#33262) thanks @cu1ch3n. +- Models/MiniMax portal vision routing: add `MiniMax-VL-01` to the `minimax-portal` provider, route portal image understanding through the MiniMax VLM endpoint, and align media auto-selection plus Telegram sticker description with the shared portal image provider path. (#33953) Thanks @tars90percent. - Auth/credential semantics: align profile eligibility + probe diagnostics with SecretRef/expiry rules and harden browser download atomic writes. (#33733) thanks @joshavant. - Security/audit denyCommands guidance: suggest likely exact node command IDs for unknown `gateway.nodes.denyCommands` entries so ineffective denylist entries are easier to correct. (#29713) thanks @liquidhorizon88-bot. - Agents/overload failover handling: classify overloaded provider failures separately from rate limits/status timeouts, add short overload backoff before retry/failover, record overloaded prompt/assistant failures as transient auth-profile cooldowns (with probeable same-provider fallback) instead of treating them like persistent auth/billing failures, and keep one-shot cron retry classification aligned so overloaded fallback summaries still count as transient retries. diff --git a/src/agents/minimax-vlm.normalizes-api-key.test.ts b/src/agents/minimax-vlm.normalizes-api-key.test.ts index faa33b8682c..146f90bbb62 100644 --- a/src/agents/minimax-vlm.normalizes-api-key.test.ts +++ b/src/agents/minimax-vlm.normalizes-api-key.test.ts @@ -45,3 +45,14 @@ describe("minimaxUnderstandImage apiKey normalization", () => { await runNormalizationCase("minimax-\u0417\u2502test-key"); }); }); + +describe("isMinimaxVlmModel", () => { + it("only matches the canonical MiniMax VLM model id", async () => { + const { isMinimaxVlmModel } = await import("./minimax-vlm.js"); + + expect(isMinimaxVlmModel("minimax", "MiniMax-VL-01")).toBe(true); + expect(isMinimaxVlmModel("minimax-portal", "MiniMax-VL-01")).toBe(true); + expect(isMinimaxVlmModel("minimax-portal", "custom-vision")).toBe(false); + expect(isMinimaxVlmModel("openai", "MiniMax-VL-01")).toBe(false); + }); +}); diff --git a/src/agents/minimax-vlm.ts b/src/agents/minimax-vlm.ts index c167936189e..6a86dcc87a2 100644 --- a/src/agents/minimax-vlm.ts +++ b/src/agents/minimax-vlm.ts @@ -6,6 +6,14 @@ type MinimaxBaseResp = { status_msg?: string; }; +export function isMinimaxVlmProvider(provider: string): boolean { + return provider === "minimax" || provider === "minimax-portal"; +} + +export function isMinimaxVlmModel(provider: string, modelId: string): boolean { + return isMinimaxVlmProvider(provider) && modelId.trim() === "MiniMax-VL-01"; +} + function coerceApiHost(params: { apiHost?: string; modelBaseUrl?: string; diff --git a/src/agents/models-config.providers.nvidia.test.ts b/src/agents/models-config.providers.nvidia.test.ts index 02086283c84..fe61b343369 100644 --- a/src/agents/models-config.providers.nvidia.test.ts +++ b/src/agents/models-config.providers.nvidia.test.ts @@ -71,10 +71,9 @@ describe("MiniMax implicit provider (#15275)", () => { "minimax-portal:default": { type: "oauth", provider: "minimax-portal", - oauth: { - access: "token", - expires: Date.now() + 60_000, - }, + access: "token", + refresh: "refresh-token", + expires: Date.now() + 60_000, }, }, }, @@ -87,6 +86,18 @@ describe("MiniMax implicit provider (#15275)", () => { const providers = await resolveImplicitProviders({ agentDir }); expect(providers?.["minimax-portal"]?.authHeader).toBe(true); }); + + it("should include minimax portal provider when MINIMAX_OAUTH_TOKEN is configured", async () => { + const agentDir = mkdtempSync(join(tmpdir(), "openclaw-test-")); + await withEnvAsync({ MINIMAX_OAUTH_TOKEN: "portal-token" }, async () => { + const providers = await resolveImplicitProviders({ agentDir }); + expect(providers?.["minimax-portal"]).toBeDefined(); + expect(providers?.["minimax-portal"]?.authHeader).toBe(true); + expect(providers?.["minimax-portal"]?.models?.some((m) => m.id === "MiniMax-VL-01")).toBe( + true, + ); + }); + }); }); describe("vLLM provider", () => { diff --git a/src/agents/models-config.providers.ts b/src/agents/models-config.providers.ts index a7d42fb7696..985b82c6ef2 100644 --- a/src/agents/models-config.providers.ts +++ b/src/agents/models-config.providers.ts @@ -771,6 +771,12 @@ function buildMinimaxPortalProvider(): ProviderConfig { api: "anthropic-messages", authHeader: true, models: [ + buildMinimaxModel({ + id: MINIMAX_DEFAULT_VISION_MODEL_ID, + name: "MiniMax VL 01", + reasoning: false, + input: ["text", "image"], + }), buildMinimaxTextModel({ id: MINIMAX_DEFAULT_MODEL_ID, name: "MiniMax M2.5", @@ -1116,8 +1122,9 @@ export async function resolveImplicitProviders(params: { providers.minimax = { ...buildMinimaxProvider(), apiKey: minimaxKey }; } + const minimaxPortalEnvKey = resolveEnvApiKeyVarName("minimax-portal"); const minimaxOauthProfile = listProfilesForProvider(authStore, "minimax-portal"); - if (minimaxOauthProfile.length > 0) { + if (minimaxPortalEnvKey || minimaxOauthProfile.length > 0) { providers["minimax-portal"] = { ...buildMinimaxPortalProvider(), apiKey: MINIMAX_OAUTH_MARKER, diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts index 66f985c1cac..78a7754e84a 100644 --- a/src/agents/tools/image-tool.test.ts +++ b/src/agents/tools/image-tool.test.ts @@ -273,6 +273,32 @@ describe("image tool implicit imageModel config", () => { }); }); + it("pairs minimax-portal primary with MiniMax-VL-01 (and fallbacks) when auth exists", async () => { + await withTempAgentDir(async (agentDir) => { + await writeAuthProfiles(agentDir, { + version: 1, + profiles: { + "minimax-portal:default": { + type: "oauth", + provider: "minimax-portal", + access: "oauth-test", + refresh: "refresh-test", + expires: Date.now() + 60_000, + }, + }, + }); + vi.stubEnv("OPENAI_API_KEY", "openai-test"); + vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); + const cfg: OpenClawConfig = { + agents: { defaults: { model: { primary: "minimax-portal/MiniMax-M2.5" } } }, + }; + expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual( + createDefaultImageFallbackExpectation("minimax-portal/MiniMax-VL-01"), + ); + expect(createImageTool({ config: cfg, agentDir })).not.toBeNull(); + }); + }); + it("pairs zai primary with glm-4.6v (and fallbacks) when auth exists", async () => { await withTempAgentDir(async (agentDir) => { vi.stubEnv("ZAI_API_KEY", "zai-test"); diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index 3046098ab4f..c1e9537d8c5 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -3,7 +3,7 @@ import { Type } from "@sinclair/typebox"; import type { OpenClawConfig } from "../../config/config.js"; import { resolveUserPath } from "../../utils.js"; import { loadWebMedia } from "../../web/media.js"; -import { minimaxUnderstandImage } from "../minimax-vlm.js"; +import { isMinimaxVlmModel, isMinimaxVlmProvider, minimaxUnderstandImage } from "../minimax-vlm.js"; import { coerceImageAssistantText, coerceImageModelConfig, @@ -110,8 +110,8 @@ export function resolveImageModelConfigForTool(params: { let preferred: string | null = null; // MiniMax users: always try the canonical vision model first when auth exists. - if (primary.provider === "minimax" && providerOk) { - preferred = "minimax/MiniMax-VL-01"; + if (isMinimaxVlmProvider(primary.provider) && providerOk) { + preferred = `${primary.provider}/MiniMax-VL-01`; } else if (providerOk && providerVisionFromConfig) { preferred = providerVisionFromConfig; } else if (primary.provider === "zai" && providerOk) { @@ -229,7 +229,7 @@ async function runImagePrompt(params: { }); // MiniMax VLM only supports a single image; use the first one. - if (model.provider === "minimax") { + if (isMinimaxVlmModel(model.provider, model.id)) { const first = params.images[0]; const imageDataUrl = `data:${first.mimeType};base64,${first.base64}`; const text = await minimaxUnderstandImage({ diff --git a/src/media-understanding/defaults.test.ts b/src/media-understanding/defaults.test.ts index f7bc540b104..1670d4bdf6a 100644 --- a/src/media-understanding/defaults.test.ts +++ b/src/media-understanding/defaults.test.ts @@ -1,8 +1,10 @@ import { describe, expect, it } from "vitest"; import { AUTO_AUDIO_KEY_PROVIDERS, + AUTO_IMAGE_KEY_PROVIDERS, AUTO_VIDEO_KEY_PROVIDERS, DEFAULT_AUDIO_MODELS, + DEFAULT_IMAGE_MODELS, } from "./defaults.js"; describe("DEFAULT_AUDIO_MODELS", () => { @@ -22,3 +24,15 @@ describe("AUTO_VIDEO_KEY_PROVIDERS", () => { expect(AUTO_VIDEO_KEY_PROVIDERS).toContain("moonshot"); }); }); + +describe("AUTO_IMAGE_KEY_PROVIDERS", () => { + it("includes minimax-portal auto key resolution", () => { + expect(AUTO_IMAGE_KEY_PROVIDERS).toContain("minimax-portal"); + }); +}); + +describe("DEFAULT_IMAGE_MODELS", () => { + it("includes the MiniMax portal vision default", () => { + expect(DEFAULT_IMAGE_MODELS["minimax-portal"]).toBe("MiniMax-VL-01"); + }); +}); diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts index cac7dbf5271..a7c0d76d021 100644 --- a/src/media-understanding/defaults.ts +++ b/src/media-understanding/defaults.ts @@ -46,6 +46,7 @@ export const AUTO_IMAGE_KEY_PROVIDERS = [ "anthropic", "google", "minimax", + "minimax-portal", "zai", ] as const; export const AUTO_VIDEO_KEY_PROVIDERS = ["google", "moonshot"] as const; @@ -54,6 +55,7 @@ export const DEFAULT_IMAGE_MODELS: Record = { anthropic: "claude-opus-4-6", google: "gemini-3-flash-preview", minimax: "MiniMax-VL-01", + "minimax-portal": "MiniMax-VL-01", zai: "glm-4.6v", }; export const CLI_OUTPUT_MAX_BUFFER = 5 * MB; diff --git a/src/media-understanding/providers/image.test.ts b/src/media-understanding/providers/image.test.ts new file mode 100644 index 00000000000..948f4c74d11 --- /dev/null +++ b/src/media-understanding/providers/image.test.ts @@ -0,0 +1,133 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const completeMock = vi.fn(); +const minimaxUnderstandImageMock = vi.fn(); +const ensureOpenClawModelsJsonMock = vi.fn(async () => {}); +const getApiKeyForModelMock = vi.fn(async () => ({ + apiKey: "oauth-test", + source: "test", + mode: "oauth", +})); +const requireApiKeyMock = vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? ""); +const setRuntimeApiKeyMock = vi.fn(); +const discoverModelsMock = vi.fn(); + +vi.mock("@mariozechner/pi-ai", async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + complete: completeMock, + }; +}); + +vi.mock("../../agents/minimax-vlm.js", () => ({ + isMinimaxVlmProvider: (provider: string) => + provider === "minimax" || provider === "minimax-portal", + isMinimaxVlmModel: (provider: string, modelId: string) => + (provider === "minimax" || provider === "minimax-portal") && modelId === "MiniMax-VL-01", + minimaxUnderstandImage: minimaxUnderstandImageMock, +})); + +vi.mock("../../agents/models-config.js", () => ({ + ensureOpenClawModelsJson: ensureOpenClawModelsJsonMock, +})); + +vi.mock("../../agents/model-auth.js", () => ({ + getApiKeyForModel: getApiKeyForModelMock, + requireApiKey: requireApiKeyMock, +})); + +vi.mock("../../agents/pi-model-discovery-runtime.js", () => ({ + discoverAuthStorage: () => ({ + setRuntimeApiKey: setRuntimeApiKeyMock, + }), + discoverModels: discoverModelsMock, +})); + +describe("describeImageWithModel", () => { + beforeEach(() => { + vi.clearAllMocks(); + minimaxUnderstandImageMock.mockResolvedValue("portal ok"); + discoverModelsMock.mockReturnValue({ + find: vi.fn(() => ({ + provider: "minimax-portal", + id: "MiniMax-VL-01", + input: ["text", "image"], + baseUrl: "https://api.minimax.io/anthropic", + })), + }); + }); + + it("routes minimax-portal image models through the MiniMax VLM endpoint", async () => { + const { describeImageWithModel } = await import("./image.js"); + + const result = await describeImageWithModel({ + cfg: {}, + agentDir: "/tmp/openclaw-agent", + provider: "minimax-portal", + model: "MiniMax-VL-01", + buffer: Buffer.from("png-bytes"), + fileName: "image.png", + mime: "image/png", + prompt: "Describe the image.", + timeoutMs: 1000, + }); + + expect(result).toEqual({ + text: "portal ok", + model: "MiniMax-VL-01", + }); + expect(ensureOpenClawModelsJsonMock).toHaveBeenCalled(); + expect(getApiKeyForModelMock).toHaveBeenCalled(); + expect(requireApiKeyMock).toHaveBeenCalled(); + expect(setRuntimeApiKeyMock).toHaveBeenCalledWith("minimax-portal", "oauth-test"); + expect(minimaxUnderstandImageMock).toHaveBeenCalledWith({ + apiKey: "oauth-test", + prompt: "Describe the image.", + imageDataUrl: `data:image/png;base64,${Buffer.from("png-bytes").toString("base64")}`, + modelBaseUrl: "https://api.minimax.io/anthropic", + }); + expect(completeMock).not.toHaveBeenCalled(); + }); + + it("uses generic completion for non-canonical minimax-portal image models", async () => { + discoverModelsMock.mockReturnValue({ + find: vi.fn(() => ({ + provider: "minimax-portal", + id: "custom-vision", + input: ["text", "image"], + baseUrl: "https://api.minimax.io/anthropic", + })), + }); + completeMock.mockResolvedValue({ + role: "assistant", + api: "anthropic-messages", + provider: "minimax-portal", + model: "custom-vision", + stopReason: "stop", + timestamp: Date.now(), + content: [{ type: "text", text: "generic ok" }], + }); + + const { describeImageWithModel } = await import("./image.js"); + + const result = await describeImageWithModel({ + cfg: {}, + agentDir: "/tmp/openclaw-agent", + provider: "minimax-portal", + model: "custom-vision", + buffer: Buffer.from("png-bytes"), + fileName: "image.png", + mime: "image/png", + prompt: "Describe the image.", + timeoutMs: 1000, + }); + + expect(result).toEqual({ + text: "generic ok", + model: "custom-vision", + }); + expect(completeMock).toHaveBeenCalledOnce(); + expect(minimaxUnderstandImageMock).not.toHaveBeenCalled(); + }); +}); diff --git a/src/media-understanding/providers/image.ts b/src/media-understanding/providers/image.ts index d0dc13c0086..79c36292f0c 100644 --- a/src/media-understanding/providers/image.ts +++ b/src/media-understanding/providers/image.ts @@ -1,6 +1,6 @@ import type { Api, Context, Model } from "@mariozechner/pi-ai"; import { complete } from "@mariozechner/pi-ai"; -import { minimaxUnderstandImage } from "../../agents/minimax-vlm.js"; +import { isMinimaxVlmModel, minimaxUnderstandImage } from "../../agents/minimax-vlm.js"; import { getApiKeyForModel, requireApiKey } from "../../agents/model-auth.js"; import { ensureOpenClawModelsJson } from "../../agents/models-config.js"; import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js"; @@ -40,7 +40,7 @@ export async function describeImageWithModel( authStorage.setRuntimeApiKey(model.provider, apiKey); const base64 = params.buffer.toString("base64"); - if (model.provider === "minimax") { + if (isMinimaxVlmModel(model.provider, model.id)) { const text = await minimaxUnderstandImage({ apiKey, prompt: params.prompt ?? "Describe the image.", diff --git a/src/media-understanding/providers/index.test.ts b/src/media-understanding/providers/index.test.ts index 430e89e84a6..9294d44acd5 100644 --- a/src/media-understanding/providers/index.test.ts +++ b/src/media-understanding/providers/index.test.ts @@ -24,4 +24,12 @@ describe("media-understanding provider registry", () => { expect(provider?.id).toBe("moonshot"); expect(provider?.capabilities).toEqual(["image", "video"]); }); + + it("registers the minimax portal provider", () => { + const registry = buildMediaUnderstandingRegistry(); + const provider = getMediaUnderstandingProvider("minimax-portal", registry); + + expect(provider?.id).toBe("minimax-portal"); + expect(provider?.capabilities).toEqual(["image"]); + }); }); diff --git a/src/media-understanding/providers/index.ts b/src/media-understanding/providers/index.ts index 5aef51790a2..0ceaa78fd80 100644 --- a/src/media-understanding/providers/index.ts +++ b/src/media-understanding/providers/index.ts @@ -4,7 +4,7 @@ import { anthropicProvider } from "./anthropic/index.js"; import { deepgramProvider } from "./deepgram/index.js"; import { googleProvider } from "./google/index.js"; import { groqProvider } from "./groq/index.js"; -import { minimaxProvider } from "./minimax/index.js"; +import { minimaxPortalProvider, minimaxProvider } from "./minimax/index.js"; import { mistralProvider } from "./mistral/index.js"; import { moonshotProvider } from "./moonshot/index.js"; import { openaiProvider } from "./openai/index.js"; @@ -16,6 +16,7 @@ const PROVIDERS: MediaUnderstandingProvider[] = [ googleProvider, anthropicProvider, minimaxProvider, + minimaxPortalProvider, moonshotProvider, mistralProvider, zaiProvider, diff --git a/src/media-understanding/providers/minimax/index.ts b/src/media-understanding/providers/minimax/index.ts index 6fa6ebf351a..c9a7936f4d3 100644 --- a/src/media-understanding/providers/minimax/index.ts +++ b/src/media-understanding/providers/minimax/index.ts @@ -6,3 +6,9 @@ export const minimaxProvider: MediaUnderstandingProvider = { capabilities: ["image"], describeImage: describeImageWithModel, }; + +export const minimaxPortalProvider: MediaUnderstandingProvider = { + id: "minimax-portal", + capabilities: ["image"], + describeImage: describeImageWithModel, +}; diff --git a/src/telegram/sticker-cache.ts b/src/telegram/sticker-cache.ts index 26fb33ee538..be8966b1eb5 100644 --- a/src/telegram/sticker-cache.ts +++ b/src/telegram/sticker-cache.ts @@ -12,6 +12,7 @@ import type { OpenClawConfig } from "../config/config.js"; import { STATE_DIR } from "../config/paths.js"; import { logVerbose } from "../globals.js"; import { loadJsonFile, saveJsonFile } from "../infra/json-file.js"; +import { AUTO_IMAGE_KEY_PROVIDERS, DEFAULT_IMAGE_MODELS } from "../media-understanding/defaults.js"; import { resolveAutoImageModel } from "../media-understanding/runner.js"; const CACHE_FILE = path.join(STATE_DIR, "telegram", "sticker-cache.json"); @@ -142,7 +143,6 @@ export function getCacheStats(): { count: number; oldestAt?: string; newestAt?: const STICKER_DESCRIPTION_PROMPT = "Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective."; -const VISION_PROVIDERS = ["openai", "anthropic", "google", "minimax"] as const; let imageRuntimePromise: Promise< typeof import("../media-understanding/providers/image-runtime.js") > | null = null; @@ -198,14 +198,7 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi if (entries.length === 0) { return undefined; } - const defaultId = - provider === "openai" - ? "gpt-5-mini" - : provider === "anthropic" - ? "claude-opus-4-6" - : provider === "google" - ? "gemini-3-flash-preview" - : "MiniMax-VL-01"; + const defaultId = DEFAULT_IMAGE_MODELS[provider]; const preferred = entries.find((entry) => entry.id === defaultId); return preferred ?? entries[0]; }; @@ -213,14 +206,16 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi let resolved = null as { provider: string; model?: string } | null; if ( activeModel && - VISION_PROVIDERS.includes(activeModel.provider as (typeof VISION_PROVIDERS)[number]) && + AUTO_IMAGE_KEY_PROVIDERS.includes( + activeModel.provider as (typeof AUTO_IMAGE_KEY_PROVIDERS)[number], + ) && (await hasProviderKey(activeModel.provider)) ) { resolved = activeModel; } if (!resolved) { - for (const provider of VISION_PROVIDERS) { + for (const provider of AUTO_IMAGE_KEY_PROVIDERS) { if (!(await hasProviderKey(provider))) { continue; }