From bf5a96ad63c0a946c5887f92641df416750697f1 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Wed, 25 Feb 2026 01:46:20 +0000 Subject: [PATCH] fix(agents): keep fallback chain reachable on configured fallback models (#25922) --- CHANGELOG.md | 1 + src/agents/model-fallback.test.ts | 38 +++++++++++++++++++++++++++++++ src/agents/model-fallback.ts | 22 ++++++++++++++---- 3 files changed, 56 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a528cbfcb71..4b07860dbe3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ Docs: https://docs.openclaw.ai - Models/Bedrock auth: normalize additional Bedrock provider aliases (`bedrock`, `aws-bedrock`, `aws_bedrock`, `amazon bedrock`) to canonical `amazon-bedrock`, ensuring auth-mode resolution consistently selects AWS SDK fallback. (#25756) Thanks @fwhite13. - Providers/SiliconFlow: normalize `thinking="off"` to `thinking: null` for `Pro/*` model payloads to avoid provider-side 400 loops and misleading compaction retries. (#25435) Thanks @Zjianru. - Gateway/Models: honor explicit `agents.defaults.models` allowlist refs even when bundled model catalog data is stale, synthesize missing allowlist entries in `models.list`, and allow `sessions.patch`/`/model` selection for those refs without false `model not allowed` errors. (#20291) Thanks @kensipe, @nikolasdehor, and @vincentkoc. +- Agents/Model fallback: when a run is currently on a configured fallback model, keep traversing the configured fallback chain instead of collapsing straight to primary-only, preventing dead-end failures when primary stays in cooldown. (#25922, #25912) Thanks @Taskle. - Control UI/Agents: inherit `agents.defaults.model.fallbacks` in the Overview fallback input when no per-agent model entry exists, while preserving explicit per-agent fallback overrides (including empty lists). (#25729, #25710) Thanks @Suko. - Automation/Subagent/Cron reliability: honor `ANNOUNCE_SKIP` in `sessions_spawn` completion/direct announce flows (no user-visible token leaks), add transient direct-announce retries for channel unavailability (for example WhatsApp listener reconnect windows), and include `cron` in the `coding` tool profile so `/tools/invoke` can execute cron actions when explicitly allowed by gateway policy. (#25800, #25656, #25842, #25813, #25822, #25821) Thanks @astra-fer, @aaajiao, @dwight11232-coder, @kevinWangSheng, @widingmarcus-cyber, and @stakeswky. - Discord/Proxy + reactions + model picker: thread channel proxy fetch into inbound media/sticker downloads, use proxy-aware gateway metadata fetch for WSL/corporate proxy setups, wire `messages.statusReactions.{emojis,timing}` into Discord reaction lifecycle control, and compact model-picker `custom_id` keys to stay under Discord's 100-char limit while keeping backward-compatible parsing. (#25232, #25507, #25564, #25695) Thanks @openperf, @chilu18, @Yipsh, @lbo728, and @s1korrrr. diff --git a/src/agents/model-fallback.test.ts b/src/agents/model-fallback.test.ts index 6b5128d90ea..f727ea5e925 100644 --- a/src/agents/model-fallback.test.ts +++ b/src/agents/model-fallback.test.ts @@ -237,6 +237,44 @@ describe("runWithModelFallback", () => { ]); }); + it("keeps configured fallback chain when current model is a configured fallback", async () => { + const cfg = makeCfg({ + agents: { + defaults: { + model: { + primary: "openai/gpt-4.1-mini", + fallbacks: ["anthropic/claude-haiku-3-5", "openrouter/deepseek-chat"], + }, + }, + }, + }); + + const run = vi.fn().mockImplementation(async (provider: string, model: string) => { + if (provider === "anthropic" && model === "claude-haiku-3-5") { + throw Object.assign(new Error("rate-limited"), { status: 429 }); + } + if (provider === "openrouter" && model === "openrouter/deepseek-chat") { + return "ok"; + } + throw new Error(`unexpected fallback candidate: ${provider}/${model}`); + }); + + const result = await runWithModelFallback({ + cfg, + provider: "anthropic", + model: "claude-haiku-3-5", + run, + }); + + expect(result.result).toBe("ok"); + expect(result.provider).toBe("openrouter"); + expect(result.model).toBe("openrouter/deepseek-chat"); + expect(run.mock.calls).toEqual([ + ["anthropic", "claude-haiku-3-5"], + ["openrouter", "openrouter/deepseek-chat"], + ]); + }); + it("treats normalized default refs as primary and keeps configured fallback chain", async () => { const cfg = makeCfg({ agents: { diff --git a/src/agents/model-fallback.ts b/src/agents/model-fallback.ts index b0050602590..fc44165e0b2 100644 --- a/src/agents/model-fallback.ts +++ b/src/agents/model-fallback.ts @@ -206,12 +206,24 @@ function resolveFallbackCandidates(params: { if (params.fallbacksOverride !== undefined) { return params.fallbacksOverride; } - // Skip configured fallback chain when the user runs a non-default override. - // In that case, retry should return directly to configured primary. - if (!sameModelCandidate(normalizedPrimary, configuredPrimary)) { - return []; // Override model failed → go straight to configured default + const configuredFallbacks = resolveAgentModelFallbackValues( + params.cfg?.agents?.defaults?.model, + ); + if (sameModelCandidate(normalizedPrimary, configuredPrimary)) { + return configuredFallbacks; } - return resolveAgentModelFallbackValues(params.cfg?.agents?.defaults?.model); + // Preserve resilience after failover: when current model is one of the + // configured fallback refs, keep traversing the configured fallback chain. + const isConfiguredFallback = configuredFallbacks.some((raw) => { + const resolved = resolveModelRefFromString({ + raw: String(raw ?? ""), + defaultProvider, + aliasIndex, + }); + return resolved ? sameModelCandidate(resolved.ref, normalizedPrimary) : false; + }); + // Keep legacy override behavior for ad-hoc models outside configured chain. + return isConfiguredFallback ? configuredFallbacks : []; })(); for (const raw of modelFallbacks) {