fix(agents): continue fallback loop for unrecognized provider errors (#26106)

* fix(agents): continue fallback loop for unrecognized provider errors

When a provider returns an error that coerceToFailoverError cannot
classify (e.g., custom error messages without standard HTTP status
codes), the fallback loop threw immediately instead of trying the
next candidate. This caused fallback to stop after 2 models even
when 17 were configured.

Only rethrow unrecognized errors when they occur on the last
candidate. For intermediate candidates, record the error as an
attempt and continue to the next model.

Closes #25926

Co-authored-by: Cursor <cursoragent@cursor.com>

* test: cover unknown-error fallback telemetry and land #26106 (thanks @Sid-Qin)

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
Sid
2026-02-25 12:53:26 +08:00
committed by GitHub
parent f7de41ca20
commit 156f13aa64
3 changed files with 54 additions and 6 deletions

View File

@@ -178,18 +178,60 @@ describe("runWithModelFallback", () => {
expect(run).toHaveBeenCalledWith("openai-codex", "gpt-5.3-codex");
});
it("does not fall back on non-auth errors", async () => {
// Verifies that an unrecognized error from the first run does not end the
// fallback loop: the run is retried and the failure is recorded as an attempt.
// NOTE(review): presumably makeCfg() configures at least one fallback model,
// since a second run call is expected — confirm against the helper.
it("falls back on unrecognized errors when candidates remain", async () => {
const cfg = makeCfg();
// First invocation rejects with a plain Error; the second resolves with "ok".
const run = vi.fn().mockRejectedValueOnce(new Error("bad request")).mockResolvedValueOnce("ok");
const result = await runWithModelFallback({
cfg,
provider: "openai",
model: "gpt-4.1-mini",
run,
});
// The value from the second, successful invocation is what gets returned.
expect(result.result).toBe("ok");
expect(run).toHaveBeenCalledTimes(2);
// Exactly one failed attempt is reported, carrying the original message and
// the reason "unknown".
expect(result.attempts).toHaveLength(1);
expect(result.attempts[0].error).toBe("bad request");
expect(result.attempts[0].reason).toBe("unknown");
});
// Verifies the onError callback payload when the first run rejects with an
// unrecognized error and a later run succeeds: the callback must receive the
// very same error object that was thrown.
it("passes original unknown errors to onError during fallback", async () => {
const cfg = makeCfg();
// Keep a reference to the thrown error so identity can be asserted below.
const unknownError = new Error("provider misbehaved");
const run = vi.fn().mockRejectedValueOnce(unknownError).mockResolvedValueOnce("ok");
const onError = vi.fn();
await runWithModelFallback({
cfg,
provider: "openai",
model: "gpt-4.1-mini",
run,
onError,
});
// Only the single failed run triggers the callback.
expect(onError).toHaveBeenCalledTimes(1);
// The payload identifies the failing provider/model and reports attempt 1 of 2.
expect(onError.mock.calls[0]?.[0]).toMatchObject({
provider: "openai",
model: "gpt-4.1-mini",
attempt: 1,
total: 2,
});
// toBe checks reference identity: the error is the original object.
expect(onError.mock.calls[0]?.[0]?.error).toBe(unknownError);
});
it("throws unrecognized error on last candidate", async () => {
const cfg = makeCfg();
const run = vi.fn().mockRejectedValueOnce(new Error("something weird"));
await expect(
runWithModelFallback({
cfg,
provider: "openai",
model: "gpt-4.1-mini",
run,
fallbacksOverride: [],
}),
).rejects.toThrow("bad request");
).rejects.toThrow("something weird");
expect(run).toHaveBeenCalledTimes(1);
});

View File

@@ -402,24 +402,29 @@ export async function runWithModelFallback<T>(params: {
provider: candidate.provider,
model: candidate.model,
}) ?? err;
if (!isFailoverError(normalized)) {
// Even unrecognized errors should not abort the fallback loop when
// there are remaining candidates. Only abort/context-overflow errors
// (handled above) are truly non-retryable.
const isKnownFailover = isFailoverError(normalized);
if (!isKnownFailover && i === candidates.length - 1) {
throw err;
}
lastError = normalized;
lastError = isKnownFailover ? normalized : err;
const described = describeFailoverError(normalized);
attempts.push({
provider: candidate.provider,
model: candidate.model,
error: described.message,
reason: described.reason,
reason: described.reason ?? "unknown",
status: described.status,
code: described.code,
});
await params.onError?.({
provider: candidate.provider,
model: candidate.model,
error: normalized,
error: isKnownFailover ? normalized : err,
attempt: i + 1,
total: candidates.length,
});