fix(agents): continue fallback loop for unrecognized provider errors (#26106)

* fix(agents): continue fallback loop for unrecognized provider errors

  When a provider returns an error that coerceToFailoverError cannot classify (e.g., custom error messages without standard HTTP status codes), the fallback loop threw immediately instead of trying the next candidate. This caused fallback to stop after 2 models even when 17 were configured.

  Only rethrow unrecognized errors when they occur on the last candidate. For intermediate candidates, record the error as an attempt and continue to the next model.

  Closes #25926

  Co-authored-by: Cursor <cursoragent@cursor.com>

* test: cover unknown-error fallback telemetry and land #26106 (thanks @Sid-Qin)

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
2026-04-26 16:06:16 +00:00 · 2026-02-25 12:53:26 +08:00
parent f7de41ca20
commit 156f13aa64
3 changed files with 54 additions and 6 deletions
--- a/src/agents/model-fallback.test.ts
+++ b/src/agents/model-fallback.test.ts
@@ -178,18 +178,60 @@ describe("runWithModelFallback", () => {
    expect(run).toHaveBeenCalledWith("openai-codex", "gpt-5.3-codex");
  });

-  it("does not fall back on non-auth errors", async () => {
+  it("falls back on unrecognized errors when candidates remain", async () => {
    const cfg = makeCfg();
    const run = vi.fn().mockRejectedValueOnce(new Error("bad request")).mockResolvedValueOnce("ok");

+    const result = await runWithModelFallback({
+      cfg,
+      provider: "openai",
+      model: "gpt-4.1-mini",
+      run,
+    });
+    expect(result.result).toBe("ok");
+    expect(run).toHaveBeenCalledTimes(2);
+    expect(result.attempts).toHaveLength(1);
+    expect(result.attempts[0].error).toBe("bad request");
+    expect(result.attempts[0].reason).toBe("unknown");
+  });
+
+  it("passes original unknown errors to onError during fallback", async () => {
+    const cfg = makeCfg();
+    const unknownError = new Error("provider misbehaved");
+    const run = vi.fn().mockRejectedValueOnce(unknownError).mockResolvedValueOnce("ok");
+    const onError = vi.fn();
+
+    await runWithModelFallback({
+      cfg,
+      provider: "openai",
+      model: "gpt-4.1-mini",
+      run,
+      onError,
+    });
+
+    expect(onError).toHaveBeenCalledTimes(1);
+    expect(onError.mock.calls[0]?.[0]).toMatchObject({
+      provider: "openai",
+      model: "gpt-4.1-mini",
+      attempt: 1,
+      total: 2,
+    });
+    expect(onError.mock.calls[0]?.[0]?.error).toBe(unknownError);
+  });
+
+  it("throws unrecognized error on last candidate", async () => {
+    const cfg = makeCfg();
+    const run = vi.fn().mockRejectedValueOnce(new Error("something weird"));
+
    await expect(
      runWithModelFallback({
        cfg,
        provider: "openai",
        model: "gpt-4.1-mini",
        run,
+        fallbacksOverride: [],
      }),
-    ).rejects.toThrow("bad request");
+    ).rejects.toThrow("something weird");
    expect(run).toHaveBeenCalledTimes(1);
  });

--- a/src/agents/model-fallback.ts
+++ b/src/agents/model-fallback.ts
@@ -402,24 +402,29 @@ export async function runWithModelFallback<T>(params: {
          provider: candidate.provider,
          model: candidate.model,
        }) ?? err;
-      if (!isFailoverError(normalized)) {
+
+      // Even unrecognized errors should not abort the fallback loop when
+      // there are remaining candidates.  Only abort/context-overflow errors
+      // (handled above) are truly non-retryable.
+      const isKnownFailover = isFailoverError(normalized);
+      if (!isKnownFailover && i === candidates.length - 1) {
        throw err;
      }

-      lastError = normalized;
+      lastError = isKnownFailover ? normalized : err;
      const described = describeFailoverError(normalized);
      attempts.push({
        provider: candidate.provider,
        model: candidate.model,
        error: described.message,
-        reason: described.reason,
+        reason: described.reason ?? "unknown",
        status: described.status,
        code: described.code,
      });
      await params.onError?.({
        provider: candidate.provider,
        model: candidate.model,
-        error: normalized,
+        error: isKnownFailover ? normalized : err,
        attempt: i + 1,
        total: candidates.length,
      });