fix(agents): harden model fallback failover paths

This commit is contained in:
Peter Steinberger
2026-02-25 03:46:34 +00:00
parent 480cc4b85c
commit d2597d5ecf
10 changed files with 187 additions and 11 deletions

View File

@@ -8,6 +8,8 @@ Docs: https://docs.openclaw.ai
### Fixes
- Agents/Model fallback: keep explicit text and image fallback chains reachable even when `agents.defaults.models` allowlists are present; prefer the explicit run `agentId` over session-key parsing when resolving follow-up fallback overrides (falling back to the session key when `agentId` is absent); treat agent-level fallback overrides as configured in the embedded runner preflight; and classify `model_cooldown` / `cooling down` errors as `rate_limit` so failover continues. (#11972, #24137, #17231)
## 2026.2.24
### Changes

View File

@@ -8,7 +8,7 @@ import type { AuthProfileStore } from "./auth-profiles.js";
import { saveAuthProfileStore } from "./auth-profiles.js";
import { AUTH_STORE_VERSION } from "./auth-profiles/constants.js";
import { isAnthropicBillingError } from "./live-auth-keys.js";
import { runWithModelFallback } from "./model-fallback.js";
import { runWithImageModelFallback, runWithModelFallback } from "./model-fallback.js";
import { makeModelFallbackCfg } from "./test-helpers/model-fallback-config-fixture.js";
const makeCfg = makeModelFallbackCfg;
@@ -581,6 +581,39 @@ describe("runWithModelFallback", () => {
expect(calls).toEqual([{ provider: "anthropic", model: "claude-opus-4-5" }]);
});
// Only the primary model appears in `agents.defaults.models`; the configured
// fallbacks do not. The run must still advance to the first fallback.
it("keeps explicit fallbacks reachable when models allowlist is present", async () => {
  const cfg = makeCfg({
    agents: {
      defaults: {
        model: {
          primary: "anthropic/claude-sonnet-4",
          fallbacks: ["openai/gpt-4o", "ollama/llama-3"],
        },
        models: { "anthropic/claude-sonnet-4": {} },
      },
    },
  });

  // The first attempt rejects with a 429-tagged error; the second resolves.
  const rateLimited = Object.assign(new Error("rate limited"), { status: 429 });
  const run = vi.fn().mockRejectedValueOnce(rateLimited).mockResolvedValueOnce("ok");

  const outcome = await runWithModelFallback({
    cfg,
    provider: "anthropic",
    model: "claude-sonnet-4",
    run,
  });

  expect(outcome.result).toBe("ok");

  // Primary first, then the first explicit fallback; the second fallback is
  // never needed because the first one succeeded.
  const attempted = run.mock.calls;
  expect(attempted).toEqual([
    ["anthropic", "claude-sonnet-4"],
    ["openai", "gpt-4o"],
  ]);
});
it("defaults provider/model when missing (regression #946)", async () => {
const cfg = makeCfg({
agents: {
@@ -721,6 +754,39 @@ describe("runWithModelFallback", () => {
});
});
describe("runWithImageModelFallback", () => {
  // Only the primary image model appears in `agents.defaults.models`; the
  // configured image fallback does not, yet it must still be attempted.
  it("keeps explicit image fallbacks reachable when models allowlist is present", async () => {
    const cfg = makeCfg({
      agents: {
        defaults: {
          imageModel: {
            primary: "openai/gpt-image-1",
            fallbacks: ["google/gemini-2.5-flash-image-preview"],
          },
          models: { "openai/gpt-image-1": {} },
        },
      },
    });

    // The first attempt rejects with a plain "rate limited" error (no status
    // field); the second attempt resolves.
    const rateLimited = new Error("rate limited");
    const run = vi.fn().mockRejectedValueOnce(rateLimited).mockResolvedValueOnce("ok");

    const outcome = await runWithImageModelFallback({ cfg, run });

    expect(outcome.result).toBe("ok");

    const attempted = run.mock.calls;
    expect(attempted).toEqual([
      ["openai", "gpt-image-1"],
      ["google", "gemini-2.5-flash-image-preview"],
    ]);
  });
});
describe("isAnthropicBillingError", () => {
it("does not false-positive on plain 'a 402' prose", () => {
const samples = [

View File

@@ -164,7 +164,9 @@ function resolveImageFallbackCandidates(params: {
const imageFallbacks = resolveAgentModelFallbackValues(params.cfg?.agents?.defaults?.imageModel);
for (const raw of imageFallbacks) {
addRaw(raw, true);
// Explicitly configured image fallbacks should remain reachable even when a
// model allowlist is present.
addRaw(raw, false);
}
return candidates;
@@ -235,7 +237,9 @@ function resolveFallbackCandidates(params: {
if (!resolved) {
continue;
}
addCandidate(resolved.ref, true);
// Fallbacks are explicit user intent; do not silently filter them by the
// model allowlist.
addCandidate(resolved.ref, false);
}
if (params.fallbacksOverride === undefined && primary?.provider && primary.model) {

View File

@@ -433,6 +433,12 @@ describe("classifyFailoverReason", () => {
expect(classifyFailoverReason("Missing scopes: model.request")).toBe("auth");
expect(classifyFailoverReason("429 too many requests")).toBe("rate_limit");
expect(classifyFailoverReason("resource has been exhausted")).toBe("rate_limit");
expect(
classifyFailoverReason("model_cooldown: All credentials for model gpt-5 are cooling down"),
).toBe("rate_limit");
expect(classifyFailoverReason("all credentials for model x are cooling down")).toBe(
"rate_limit",
);
expect(
classifyFailoverReason(
'{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}',

View File

@@ -615,6 +615,8 @@ type ErrorPattern = RegExp | string;
const ERROR_PATTERNS = {
rateLimit: [
/rate[_ ]limit|too many requests|429/,
"model_cooldown",
"cooling down",
"exceeded your current quota",
"resource has been exhausted",
"quota exceeded",

View File

@@ -109,6 +109,45 @@ const makeConfig = (opts?: { fallbacks?: string[]; apiKey?: string }): OpenClawC
},
}) satisfies OpenClawConfig;
/**
 * Builds a config fixture in which model fallbacks exist only on a single
 * agent entry: `agents.defaults.model.fallbacks` is empty, while the entry in
 * `agents.list` whose id is `agentId` lists `"openai/mock-2"`.
 *
 * @param agentId - Id assigned to the one entry in `agents.list`.
 * @returns The fixture, checked against `OpenClawConfig` via `satisfies`.
 */
const makeAgentOverrideOnlyFallbackConfig = (agentId: string): OpenClawConfig =>
({
agents: {
defaults: {
model: {
// Deliberately empty: the defaults contribute no fallbacks.
fallbacks: [],
},
},
list: [
{
id: agentId,
model: {
// The only fallback in this fixture lives on the agent entry.
fallbacks: ["openai/mock-2"],
},
},
],
},
models: {
providers: {
openai: {
api: "openai-responses",
apiKey: "sk-test",
baseUrl: "https://example.com",
// Only "mock-1" is declared here; the "mock-2" fallback above has no
// matching entry in this provider's model list.
models: [
{
id: "mock-1",
name: "Mock 1",
reasoning: false,
input: ["text"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 16_000,
maxTokens: 2048,
},
],
},
},
},
}) satisfies OpenClawConfig;
const writeAuthStore = async (
agentDir: string,
opts?: {
@@ -516,6 +555,42 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
});
});
// The defaults carry no fallbacks, but the "support" agent entry does. With
// both stored openai profiles in cooldown, the run is expected to reject with
// a rate_limit FailoverError without ever invoking the embedded attempt.
it("treats agent-level fallbacks as configured when defaults have none", async () => {
  await withTimedAgentWorkspace(async ({ agentDir, workspaceDir, now }) => {
    // Both profiles share the same cooldown deadline, one hour after `now`.
    const cooldownUntil = now + 60 * 60 * 1000;
    await writeAuthStore(agentDir, {
      usageStats: {
        "openai:p1": { lastUsed: 1, cooldownUntil },
        "openai:p2": { lastUsed: 2, cooldownUntil },
      },
    });

    const sessionFile = path.join(workspaceDir, "session.jsonl");
    const pendingRun = runEmbeddedPiAgent({
      sessionId: "session:test",
      sessionKey: "agent:support:cooldown-failover",
      sessionFile,
      workspaceDir,
      agentDir,
      config: makeAgentOverrideOnlyFallbackConfig("support"),
      prompt: "hello",
      provider: "openai",
      model: "mock-1",
      authProfileIdSource: "auto",
      timeoutMs: 5_000,
      runId: "run:agent-override-fallback",
      agentId: "support",
    });

    await expect(pendingRun).rejects.toMatchObject({
      name: "FailoverError",
      reason: "rate_limit",
      provider: "openai",
      model: "mock-1",
    });

    expect(runEmbeddedAttemptMock).not.toHaveBeenCalled();
  });
});
it("fails over with disabled reason when all profiles are unavailable", async () => {
await withTimedAgentWorkspace(async ({ agentDir, workspaceDir, now }) => {
await writeAuthStore(agentDir, {

View File

@@ -8,6 +8,7 @@ import type { PluginHookBeforeAgentStartResult } from "../../plugins/types.js";
import { enqueueCommandInLane } from "../../process/command-queue.js";
import { isMarkdownCapableMessageChannel } from "../../utils/message-channel.js";
import { resolveOpenClawAgentDir } from "../agent-paths.js";
import { resolveAgentModelFallbacksOverride } from "../agent-scope.js";
import {
isProfileInCooldown,
markAuthProfileFailure,
@@ -231,8 +232,15 @@ export async function runEmbeddedPiAgent(
let provider = (params.provider ?? DEFAULT_PROVIDER).trim() || DEFAULT_PROVIDER;
let modelId = (params.model ?? DEFAULT_MODEL).trim() || DEFAULT_MODEL;
const agentDir = params.agentDir ?? resolveOpenClawAgentDir();
const agentFallbacksOverride =
params.config && params.agentId
? resolveAgentModelFallbacksOverride(params.config, params.agentId)
: undefined;
const fallbackConfigured =
resolveAgentModelFallbackValues(params.config?.agents?.defaults?.model).length > 0;
(
agentFallbacksOverride ??
resolveAgentModelFallbackValues(params.config?.agents?.defaults?.model)
).length > 0;
await ensureOpenClawModelsJson(params.config, agentDir);
// Run before_model_resolve hooks early so plugins can override the

View File

@@ -61,10 +61,10 @@ describe("agent-runner-utils", () => {
const resolved = resolveModelFallbackOptions(run);
expect(hoisted.resolveAgentIdFromSessionKeyMock).toHaveBeenCalledWith(run.sessionKey);
expect(hoisted.resolveAgentIdFromSessionKeyMock).not.toHaveBeenCalled();
expect(hoisted.resolveAgentModelFallbacksOverrideMock).toHaveBeenCalledWith(
run.config,
"agent-id",
run.agentId,
);
expect(resolved).toEqual({
cfg: run.config,
@@ -75,6 +75,21 @@ describe("agent-runner-utils", () => {
});
});
// When the run carries no agentId, the id parsed from the session key is the
// one handed to the fallback-override lookup.
it("falls back to sessionKey agent id when run.agentId is missing", () => {
  const sessionKeyAgentId = "agent-from-session-key";
  const overrideList = ["fallback-model"];
  hoisted.resolveAgentIdFromSessionKeyMock.mockReturnValue(sessionKeyAgentId);
  hoisted.resolveAgentModelFallbacksOverrideMock.mockReturnValue(overrideList);

  const run = makeRun({ agentId: undefined });
  const { fallbacksOverride } = resolveModelFallbackOptions(run);

  expect(hoisted.resolveAgentIdFromSessionKeyMock).toHaveBeenCalledWith(run.sessionKey);
  expect(hoisted.resolveAgentModelFallbacksOverrideMock).toHaveBeenCalledWith(
    run.config,
    "agent-from-session-key",
  );
  expect(fallbacksOverride).toEqual(["fallback-model"]);
});
it("builds embedded run base params with auth profile and run metadata", () => {
const run = makeRun({ enforceFinalTag: true });
const authProfile = resolveProviderScopedAuthProfile({

View File

@@ -147,15 +147,13 @@ export const resolveEnforceFinalTag = (run: FollowupRun["run"], provider: string
Boolean(run.enforceFinalTag || isReasoningTagProvider(provider));
/**
 * Assembles the model-fallback options for a follow-up run.
 *
 * The agent id used to look up fallback overrides is `run.agentId` when it is
 * set; otherwise it is derived from `run.sessionKey`.
 *
 * @param run - The `run` section of a queued follow-up.
 * @returns The run's config, provider, model and agent dir, plus the fallback
 *   override resolved for the chosen agent id.
 */
export function resolveModelFallbackOptions(run: FollowupRun["run"]) {
  const fallbackAgentId = run.agentId ?? resolveAgentIdFromSessionKey(run.sessionKey);
  return {
    cfg: run.config,
    provider: run.provider,
    model: run.model,
    agentDir: run.agentDir,
    // Single definition of this key: a second, session-key-only entry would be
    // a duplicate property and would bypass the explicit `run.agentId`.
    fallbacksOverride: resolveAgentModelFallbacksOverride(run.config, fallbackAgentId),
  };
}

View File

@@ -135,7 +135,7 @@ export function createFollowupRunner(params: {
agentDir: queued.run.agentDir,
fallbacksOverride: resolveAgentModelFallbacksOverride(
queued.run.config,
resolveAgentIdFromSessionKey(queued.run.sessionKey),
queued.run.agentId ?? resolveAgentIdFromSessionKey(queued.run.sessionKey),
),
run: (provider, model) => {
const authProfile = resolveRunAuthProfile(queued.run, provider);