mirror of
https://github.com/moltbot/moltbot.git
synced 2026-04-26 16:06:16 +00:00
* fix: prevent infinite retry loop when live session model switch blocks failover (#58466)
* fix: remove unused resolveOllamaBaseUrlForRun import after rebase
This commit is contained in:
@@ -9,6 +9,7 @@ import type { AuthProfileStore } from "./auth-profiles.js";
|
||||
import { saveAuthProfileStore } from "./auth-profiles.js";
|
||||
import { AUTH_STORE_VERSION } from "./auth-profiles/constants.js";
|
||||
import { isAnthropicBillingError } from "./live-auth-keys.js";
|
||||
import { LiveSessionModelSwitchError } from "./live-model-switch.js";
|
||||
import { runWithImageModelFallback, runWithModelFallback } from "./model-fallback.js";
|
||||
import { makeModelFallbackCfg } from "./test-helpers/model-fallback-config-fixture.js";
|
||||
|
||||
@@ -263,6 +264,50 @@ describe("runWithModelFallback", () => {
|
||||
expect(run).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
// Regression test for #58466: when the only candidate rejects with a
// LiveSessionModelSwitchError, runWithModelFallback must reject with some
// other error type, and `run` must have been invoked exactly once.
it("treats LiveSessionModelSwitchError as failover on last candidate (#58466)", async () => {
|
||||
const cfg = makeCfg();
|
||||
const switchError = new LiveSessionModelSwitchError({
|
||||
provider: "anthropic",
|
||||
model: "claude-sonnet-4-6",
|
||||
});
|
||||
// Every invocation of `run` rejects with the same switch error.
const run = vi.fn().mockRejectedValue(switchError);
|
||||
|
||||
// With no fallbacks, the single candidate is also the last one.
|
||||
// Previously this would re-throw LiveSessionModelSwitchError, causing
|
||||
// the outer retry loop to restart with the overloaded model indefinitely.
|
||||
// Now it should surface as a FailoverError instead.
|
||||
// The rejection is converted into a value so it can be inspected below.
const err = await runWithModelFallback({
|
||||
cfg,
|
||||
provider: "anthropic",
|
||||
model: "claude-sonnet-4-6",
|
||||
run,
|
||||
// Empty override: no fallback candidates beyond the requested model.
fallbacksOverride: [],
|
||||
}).catch((e: unknown) => e);
|
||||
// NOTE(review): this only checks for a generic Error; the comment above
// promises a FailoverError, which is not asserted here — consider tightening.
expect(err).toBeInstanceOf(Error);
|
||||
// Should NOT be a LiveSessionModelSwitchError — the outer retry loop must
|
||||
// not restart with the conflicting model.
|
||||
expect(err).not.toBeInstanceOf(LiveSessionModelSwitchError);
|
||||
expect(run).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
// Regression test for #58466: a LiveSessionModelSwitchError on the first
// attempt must not end the fallback chain; the value produced by the second
// attempt is returned as `result.result`.
it("continues fallback chain past LiveSessionModelSwitchError to next candidate (#58466)", async () => {
|
||||
const cfg = makeCfg();
|
||||
const switchError = new LiveSessionModelSwitchError({
|
||||
provider: "anthropic",
|
||||
model: "claude-sonnet-4-6",
|
||||
});
|
||||
// First call rejects with the switch error; second call resolves with "ok".
const run = vi.fn().mockRejectedValueOnce(switchError).mockResolvedValueOnce("ok");
|
||||
|
||||
// No fallbacksOverride is passed here.
// NOTE(review): presumably makeCfg() configures at least one fallback
// candidate — the two-call expectation below depends on it; confirm.
const result = await runWithModelFallback({
|
||||
cfg,
|
||||
provider: "openai",
|
||||
model: "gpt-4.1-mini",
|
||||
run,
|
||||
});
|
||||
expect(result.result).toBe("ok");
|
||||
// One failed attempt plus one successful attempt.
expect(run).toHaveBeenCalledTimes(2);
|
||||
});
|
||||
|
||||
it("falls back on auth errors", async () => {
|
||||
await expectFallsBackToHaiku({
|
||||
provider: "openai",
|
||||
|
||||
@@ -15,6 +15,7 @@ import {
|
||||
} from "./auth-profiles.js";
|
||||
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
|
||||
import {
|
||||
FailoverError,
|
||||
coerceToFailoverError,
|
||||
describeFailoverError,
|
||||
isFailoverError,
|
||||
@@ -25,6 +26,7 @@ import {
|
||||
shouldPreserveTransientCooldownProbeSlot,
|
||||
shouldUseTransientCooldownProbeSlot,
|
||||
} from "./failover-policy.js";
|
||||
import { LiveSessionModelSwitchError } from "./live-model-switch.js";
|
||||
import { logModelFallbackDecision } from "./model-fallback-observation.js";
|
||||
import type { FallbackAttempt, ModelCandidate } from "./model-fallback.types.js";
|
||||
import {
|
||||
@@ -781,6 +783,48 @@ export async function runWithModelFallback<T>(params: {
|
||||
model: candidate.model,
|
||||
}) ?? err;
|
||||
|
||||
// LiveSessionModelSwitchError during fallback means the session's
|
||||
// persisted model conflicts with this fallback candidate. Treat it
|
||||
// as a known failover so the chain continues to the next candidate
|
||||
// instead of re-throwing and triggering infinite retry loops in the
|
||||
// outer runner. (#58466)
|
||||
// Note: the check is on the raw `err`, not on any normalized form of it.
if (err instanceof LiveSessionModelSwitchError) {
|
||||
const switchMsg = err.message;
|
||||
// Wrap the switch error's message in a FailoverError attributed to the
// candidate currently being tried.
// NOTE(review): the reason is hard-coded to "overloaded"; confirm this is the
// intended classification for a model-switch conflict.
const switchNormalized = new FailoverError(switchMsg, {
|
||||
reason: "overloaded",
|
||||
provider: candidate.provider,
|
||||
model: candidate.model,
|
||||
});
|
||||
// Track the wrapped error as the most recent failure; how lastError is
// consumed lies outside this view.
lastError = switchNormalized;
|
||||
const described = describeFailoverError(switchNormalized);
|
||||
// Record the failed attempt; reason falls back to "unknown" when the
// description carries none.
attempts.push({
|
||||
provider: candidate.provider,
|
||||
model: candidate.model,
|
||||
error: described.message,
|
||||
reason: described.reason ?? "unknown",
|
||||
status: described.status,
|
||||
code: described.code,
|
||||
});
|
||||
// Report the failure as a "candidate_failed" decision. Unlike the attempts
// entry above, reason is passed through without an "unknown" fallback.
logModelFallbackDecision({
|
||||
decision: "candidate_failed",
|
||||
runId: params.runId,
|
||||
requestedProvider: params.provider,
|
||||
requestedModel: params.model,
|
||||
candidate,
|
||||
// Presumably `i` is the zero-based index of the current candidate — TODO confirm.
attempt: i + 1,
|
||||
total: candidates.length,
|
||||
reason: described.reason,
|
||||
status: described.status,
|
||||
code: described.code,
|
||||
error: described.message,
|
||||
nextCandidate: candidates[i + 1],
|
||||
isPrimary,
|
||||
requestedModelMatched: requestedModel,
|
||||
fallbackConfigured: hasFallbackCandidates,
|
||||
});
|
||||
// Skip the remaining handling for this error and proceed with the
// enclosing loop's next iteration rather than re-throwing.
continue;
|
||||
}
|
||||
|
||||
// Even unrecognized errors should not abort the fallback loop when
|
||||
// there are remaining candidates. Only abort/context-overflow errors
|
||||
// (handled above) are truly non-retryable.
|
||||
|
||||
@@ -249,6 +249,26 @@ describe("runCronIsolatedAgentTurn — LiveSessionModelSwitchError retry (#57206
|
||||
expect(callCount).toBe(2);
|
||||
});
|
||||
|
||||
// Verifies the retry cap for #58466: when the mocked runWithModelFallback
// always throws LiveSessionModelSwitchError, the turn must finish with
// status "error" after three calls and log a retry-limit warning.
it("aborts after exceeding LiveSessionModelSwitchError retry limit (#58466)", async () => {
|
||||
const switchError = new LiveSessionModelSwitchError({
|
||||
provider: "anthropic",
|
||||
model: "claude-sonnet-4-6",
|
||||
});
|
||||
|
||||
let callCount = 0;
|
||||
// Count every invocation and always fail with the same switch error.
runWithModelFallbackMock.mockImplementation(async () => {
|
||||
callCount++;
|
||||
throw switchError;
|
||||
});
|
||||
|
||||
const result = await runCronIsolatedAgentTurn(makeParams());
|
||||
|
||||
// The turn reports the failure through its result status.
expect(result.status).toBe("error");
|
||||
// Circuit breaker: max 2 retries → 3 total attempts (initial + 2 retries)
|
||||
expect(callCount).toBe(3);
|
||||
// Matches on a substring of the warning passed to the logWarn mock.
expect(logWarnMock).toHaveBeenCalledWith(expect.stringContaining("retry limit reached"));
|
||||
});
|
||||
|
||||
it("does not retry when the thrown error is not a LiveSessionModelSwitchError", async () => {
|
||||
let callCount = 0;
|
||||
runWithModelFallbackMock.mockImplementation(async () => {
|
||||
|
||||
@@ -592,12 +592,24 @@ export async function runCronIsolatedAgentTurn(params: {
|
||||
// in the main agent runner (agent-runner-execution.ts). Without this, cron
|
||||
// jobs that specify a model different from the agent primary always fail.
|
||||
// See: https://github.com/openclaw/openclaw/issues/57206
|
||||
//
|
||||
// Circuit breaker: cap retries to prevent infinite loops when the live
|
||||
// session model switch guard fires repeatedly during failover (#58466).
|
||||
const MAX_MODEL_SWITCH_RETRIES = 2;
|
||||
let modelSwitchRetries = 0;
|
||||
while (true) {
|
||||
try {
|
||||
await runPrompt(commandBody);
|
||||
break;
|
||||
} catch (err) {
|
||||
if (err instanceof LiveSessionModelSwitchError) {
|
||||
modelSwitchRetries += 1;
|
||||
if (modelSwitchRetries > MAX_MODEL_SWITCH_RETRIES) {
|
||||
logWarn(
|
||||
`[cron:${params.job.id}] LiveSessionModelSwitchError retry limit reached (${MAX_MODEL_SWITCH_RETRIES}); aborting`,
|
||||
);
|
||||
throw err;
|
||||
}
|
||||
liveSelection = {
|
||||
provider: err.provider,
|
||||
model: err.model,
|
||||
|
||||
Reference in New Issue
Block a user