Fix: live session model switch no longer blocks failover (Resolves #58466) (#58589)

* fix: prevent infinite retry loop when live session model switch blocks failover (#58466)
* fix: remove unused resolveOllamaBaseUrlForRun import after rebase
2026-04-26 16:06:16 +00:00 · 2026-04-01 09:09:41 +08:00
parent 350fe63bbf
commit 547154865b
4 changed files with 121 additions and 0 deletions
--- a/src/agents/model-fallback.test.ts
+++ b/src/agents/model-fallback.test.ts
@@ -9,6 +9,7 @@ import type { AuthProfileStore } from "./auth-profiles.js";
 import { saveAuthProfileStore } from "./auth-profiles.js";
 import { AUTH_STORE_VERSION } from "./auth-profiles/constants.js";
 import { isAnthropicBillingError } from "./live-auth-keys.js";
+import { LiveSessionModelSwitchError } from "./live-model-switch.js";
 import { runWithImageModelFallback, runWithModelFallback } from "./model-fallback.js";
 import { makeModelFallbackCfg } from "./test-helpers/model-fallback-config-fixture.js";

@@ -263,6 +264,50 @@ describe("runWithModelFallback", () => {
    expect(run).toHaveBeenCalledTimes(1);
  });

+  it("treats LiveSessionModelSwitchError as failover on last candidate (#58466)", async () => {
+    const cfg = makeCfg();
+    const switchError = new LiveSessionModelSwitchError({
+      provider: "anthropic",
+      model: "claude-sonnet-4-6",
+    });
+    const run = vi.fn().mockRejectedValue(switchError);
+
+    // With no fallbacks, the single candidate is also the last one.
+    // Previously this would re-throw LiveSessionModelSwitchError, causing
+    // the outer retry loop to restart with the overloaded model indefinitely.
+    // Now it should surface as a FailoverError instead; the assertions below only pin "an Error that is not the switch error".
+    const err = await runWithModelFallback({
+      cfg,
+      provider: "anthropic",
+      model: "claude-sonnet-4-6",
+      run,
+      fallbacksOverride: [],
+    }).catch((e: unknown) => e);
+    expect(err).toBeInstanceOf(Error);
+    // Should NOT be a LiveSessionModelSwitchError — the outer retry loop must
+    // not restart with the conflicting model.
+    expect(err).not.toBeInstanceOf(LiveSessionModelSwitchError);
+    expect(run).toHaveBeenCalledTimes(1);
+  });
+
+  it("continues fallback chain past LiveSessionModelSwitchError to next candidate (#58466)", async () => {
+    const cfg = makeCfg();
+    const switchError = new LiveSessionModelSwitchError({
+      provider: "anthropic",
+      model: "claude-sonnet-4-6",
+    });
+    const run = vi.fn().mockRejectedValueOnce(switchError).mockResolvedValueOnce("ok");
+
+    const result = await runWithModelFallback({
+      cfg,
+      provider: "openai",
+      model: "gpt-4.1-mini",
+      run,
+    });
+    expect(result.result).toBe("ok");
+    expect(run).toHaveBeenCalledTimes(2);
+  });
+
  it("falls back on auth errors", async () => {
    await expectFallsBackToHaiku({
      provider: "openai",
--- a/src/agents/model-fallback.ts
+++ b/src/agents/model-fallback.ts
@@ -15,6 +15,7 @@ import {
 } from "./auth-profiles.js";
 import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
 import {
+  FailoverError,
  coerceToFailoverError,
  describeFailoverError,
  isFailoverError,
@@ -25,6 +26,7 @@ import {
  shouldPreserveTransientCooldownProbeSlot,
  shouldUseTransientCooldownProbeSlot,
 } from "./failover-policy.js";
+import { LiveSessionModelSwitchError } from "./live-model-switch.js";
 import { logModelFallbackDecision } from "./model-fallback-observation.js";
 import type { FallbackAttempt, ModelCandidate } from "./model-fallback.types.js";
 import {
@@ -781,6 +783,48 @@ export async function runWithModelFallback<T>(params: {
          model: candidate.model,
        }) ?? err;

+      // LiveSessionModelSwitchError during fallback means the session's
+      // persisted model conflicts with this fallback candidate.  Treat it
+      // as a known failover so the chain continues to the next candidate
+      // instead of re-throwing and triggering infinite retry loops in the
+      // outer runner.  (#58466)
+      if (err instanceof LiveSessionModelSwitchError) {
+        const switchMsg = err.message;
+        const switchNormalized = new FailoverError(switchMsg, {
+          reason: "overloaded",
+          provider: candidate.provider,
+          model: candidate.model,
+        });
+        lastError = switchNormalized;
+        const described = describeFailoverError(switchNormalized);
+        attempts.push({
+          provider: candidate.provider,
+          model: candidate.model,
+          error: described.message,
+          reason: described.reason ?? "unknown",
+          status: described.status,
+          code: described.code,
+        });
+        logModelFallbackDecision({
+          decision: "candidate_failed",
+          runId: params.runId,
+          requestedProvider: params.provider,
+          requestedModel: params.model,
+          candidate,
+          attempt: i + 1,
+          total: candidates.length,
+          reason: described.reason,
+          status: described.status,
+          code: described.code,
+          error: described.message,
+          nextCandidate: candidates[i + 1],
+          isPrimary,
+          requestedModelMatched: requestedModel,
+          fallbackConfigured: hasFallbackCandidates,
+        });
+        continue;
+      }
+
      // Even unrecognized errors should not abort the fallback loop when
      // there are remaining candidates.  Only abort/context-overflow errors
      // (handled above) are truly non-retryable.
--- a/src/cron/isolated-agent/run.live-session-model-switch.test.ts
+++ b/src/cron/isolated-agent/run.live-session-model-switch.test.ts
@@ -249,6 +249,26 @@ describe("runCronIsolatedAgentTurn — LiveSessionModelSwitchError retry (#57206
    expect(callCount).toBe(2);
  });

+  it("aborts after exceeding LiveSessionModelSwitchError retry limit (#58466)", async () => {
+    const switchError = new LiveSessionModelSwitchError({
+      provider: "anthropic",
+      model: "claude-sonnet-4-6",
+    });
+
+    let callCount = 0;
+    runWithModelFallbackMock.mockImplementation(async () => {
+      callCount++;
+      throw switchError;
+    });
+
+    const result = await runCronIsolatedAgentTurn(makeParams());
+
+    expect(result.status).toBe("error");
+    // Circuit breaker: max 2 retries → 3 total attempts (initial + 2 retries)
+    expect(callCount).toBe(3);
+    expect(logWarnMock).toHaveBeenCalledWith(expect.stringContaining("retry limit reached"));
+  });
+
  it("does not retry when the thrown error is not a LiveSessionModelSwitchError", async () => {
    let callCount = 0;
    runWithModelFallbackMock.mockImplementation(async () => {
--- a/src/cron/isolated-agent/run.ts
+++ b/src/cron/isolated-agent/run.ts
@@ -592,12 +592,24 @@ export async function runCronIsolatedAgentTurn(params: {
    // in the main agent runner (agent-runner-execution.ts). Without this, cron
    // jobs that specify a model different from the agent primary always fail.
    // See: https://github.com/openclaw/openclaw/issues/57206
+    //
+    // Circuit breaker: cap retries to prevent infinite loops when the live
+    // session model switch guard fires repeatedly during failover (#58466).
+    const MAX_MODEL_SWITCH_RETRIES = 2;
+    let modelSwitchRetries = 0;
    while (true) {
      try {
        await runPrompt(commandBody);
        break;
      } catch (err) {
        if (err instanceof LiveSessionModelSwitchError) {
+          modelSwitchRetries += 1;
+          if (modelSwitchRetries > MAX_MODEL_SWITCH_RETRIES) {
+            logWarn(
+              `[cron:${params.job.id}] LiveSessionModelSwitchError retry limit reached (${MAX_MODEL_SWITCH_RETRIES}); aborting`,
+            );
+            throw err;
+          }
          liveSelection = {
            provider: err.provider,
            model: err.model,