Fix: live session model switch no longer blocks failover (Resolves #58466) (#58589)

* fix: prevent infinite retry loop when live session model switch blocks failover (#58466)

* fix: remove unused resolveOllamaBaseUrlForRun import after rebase
This commit is contained in:
Han Yang
2026-04-01 09:09:41 +08:00
committed by GitHub
parent 350fe63bbf
commit 547154865b
4 changed files with 121 additions and 0 deletions

View File

@@ -9,6 +9,7 @@ import type { AuthProfileStore } from "./auth-profiles.js";
import { saveAuthProfileStore } from "./auth-profiles.js";
import { AUTH_STORE_VERSION } from "./auth-profiles/constants.js";
import { isAnthropicBillingError } from "./live-auth-keys.js";
import { LiveSessionModelSwitchError } from "./live-model-switch.js";
import { runWithImageModelFallback, runWithModelFallback } from "./model-fallback.js";
import { makeModelFallbackCfg } from "./test-helpers/model-fallback-config-fixture.js";
@@ -263,6 +264,50 @@ describe("runWithModelFallback", () => {
expect(run).toHaveBeenCalledTimes(1);
});
it("treats LiveSessionModelSwitchError as failover on last candidate (#58466)", async () => {
  const config = makeCfg();
  const requestedProvider = "anthropic";
  const requestedModel = "claude-sonnet-4-6";
  const run = vi.fn().mockRejectedValue(
    new LiveSessionModelSwitchError({
      provider: requestedProvider,
      model: requestedModel,
    }),
  );

  // An empty fallbacksOverride leaves the requested model as the only, and
  // therefore the last, candidate. Regression guard for #58466: the switch
  // error used to be re-thrown from here, which made the outer retry loop
  // restart with the overloaded model indefinitely.
  const outcome = await runWithModelFallback({
    cfg: config,
    provider: requestedProvider,
    model: requestedModel,
    run,
    fallbacksOverride: [],
  }).catch((reason: unknown) => reason);

  // The call must reject with an Error, but not with the switch error
  // itself, so the outer loop has no reason to retry the conflicting model.
  expect(outcome).toBeInstanceOf(Error);
  expect(outcome).not.toBeInstanceOf(LiveSessionModelSwitchError);
  // The single candidate is tried exactly once.
  expect(run).toHaveBeenCalledTimes(1);
});
it("continues fallback chain past LiveSessionModelSwitchError to next candidate (#58466)", async () => {
  const config = makeCfg();
  const rejection = new LiveSessionModelSwitchError({
    provider: "anthropic",
    model: "claude-sonnet-4-6",
  });
  // First invocation rejects with the switch error; the second resolves.
  const run = vi
    .fn()
    .mockRejectedValueOnce(rejection)
    .mockResolvedValueOnce("ok");

  const outcome = await runWithModelFallback({
    cfg: config,
    provider: "openai",
    model: "gpt-4.1-mini",
    run,
  });

  // The value from the second invocation is what the caller receives, which
  // shows the chain moved on instead of aborting on the switch error.
  expect(outcome.result).toBe("ok");
  expect(run).toHaveBeenCalledTimes(2);
});
it("falls back on auth errors", async () => {
await expectFallsBackToHaiku({
provider: "openai",

View File

@@ -15,6 +15,7 @@ import {
} from "./auth-profiles.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
import {
FailoverError,
coerceToFailoverError,
describeFailoverError,
isFailoverError,
@@ -25,6 +26,7 @@ import {
shouldPreserveTransientCooldownProbeSlot,
shouldUseTransientCooldownProbeSlot,
} from "./failover-policy.js";
import { LiveSessionModelSwitchError } from "./live-model-switch.js";
import { logModelFallbackDecision } from "./model-fallback-observation.js";
import type { FallbackAttempt, ModelCandidate } from "./model-fallback.types.js";
import {
@@ -781,6 +783,48 @@ export async function runWithModelFallback<T>(params: {
model: candidate.model,
}) ?? err;
// A LiveSessionModelSwitchError raised while running a candidate signals that
// the session's persisted model conflicts with that candidate. Record it as a
// regular failed attempt and move on to the next candidate; re-throwing here
// would trigger infinite retry loops in the outer runner. (#58466)
// NOTE(review): the failure is tagged "overloaded" even though the trigger is
// a model-switch conflict; confirm that is the intended failover reason.
if (err instanceof LiveSessionModelSwitchError) {
  const switchFailure = new FailoverError(err.message, {
    reason: "overloaded",
    provider: candidate.provider,
    model: candidate.model,
  });
  lastError = switchFailure;

  const failure = describeFailoverError(switchFailure);

  // Keep the per-candidate attempt history in step with the decision log.
  attempts.push({
    provider: candidate.provider,
    model: candidate.model,
    error: failure.message,
    reason: failure.reason ?? "unknown",
    status: failure.status,
    code: failure.code,
  });

  logModelFallbackDecision({
    decision: "candidate_failed",
    runId: params.runId,
    requestedProvider: params.provider,
    requestedModel: params.model,
    candidate,
    attempt: i + 1,
    total: candidates.length,
    reason: failure.reason,
    status: failure.status,
    code: failure.code,
    error: failure.message,
    nextCandidate: candidates[i + 1],
    isPrimary,
    requestedModelMatched: requestedModel,
    fallbackConfigured: hasFallbackCandidates,
  });

  // Skip the generic error handling below and try the next candidate.
  continue;
}
// Even unrecognized errors should not abort the fallback loop when
// there are remaining candidates. Only abort/context-overflow errors
// (handled above) are truly non-retryable.

View File

@@ -249,6 +249,26 @@ describe("runCronIsolatedAgentTurn — LiveSessionModelSwitchError retry (#57206
expect(callCount).toBe(2);
});
it("aborts after exceeding LiveSessionModelSwitchError retry limit (#58466)", async () => {
  const alwaysThrown = new LiveSessionModelSwitchError({
    provider: "anthropic",
    model: "claude-sonnet-4-6",
  });

  // Every invocation fails with the same switch error, so only the circuit
  // breaker can end the retry loop.
  let invocations = 0;
  runWithModelFallbackMock.mockImplementation(async () => {
    invocations += 1;
    throw alwaysThrown;
  });

  const outcome = await runCronIsolatedAgentTurn(makeParams());

  expect(outcome.status).toBe("error");
  // One initial attempt plus the maximum of two retries.
  expect(invocations).toBe(3);
  expect(logWarnMock).toHaveBeenCalledWith(expect.stringContaining("retry limit reached"));
});
it("does not retry when the thrown error is not a LiveSessionModelSwitchError", async () => {
let callCount = 0;
runWithModelFallbackMock.mockImplementation(async () => {

View File

@@ -592,12 +592,24 @@ export async function runCronIsolatedAgentTurn(params: {
// in the main agent runner (agent-runner-execution.ts). Without this, cron
// jobs that specify a model different from the agent primary always fail.
// See: https://github.com/openclaw/openclaw/issues/57206
//
// Circuit breaker: cap retries to prevent infinite loops when the live
// session model switch guard fires repeatedly during failover (#58466).
const MAX_MODEL_SWITCH_RETRIES = 2;
let modelSwitchRetries = 0;
while (true) {
try {
await runPrompt(commandBody);
break;
} catch (err) {
if (err instanceof LiveSessionModelSwitchError) {
modelSwitchRetries += 1;
if (modelSwitchRetries > MAX_MODEL_SWITCH_RETRIES) {
logWarn(
`[cron:${params.job.id}] LiveSessionModelSwitchError retry limit reached (${MAX_MODEL_SWITCH_RETRIES}); aborting`,
);
throw err;
}
liveSelection = {
provider: err.provider,
model: err.model,