From 30ab9b2068cfddf92d9447fa567161daa04ddf0f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?AI=E5=8D=97=E6=9F=AF=28KingMo=29?=
Date: Tue, 3 Mar 2026 10:37:23 +0800
Subject: [PATCH] fix(agents): recognize connection errors as retryable timeout failures (#31697)

* fix(agents): recognize connection errors as retryable timeout failures

## Problem

When a model endpoint becomes unreachable (e.g., local proxy down, relay server offline), the failover system fails to switch to the next candidate model. Errors like "Connection error." are not classified as retryable, causing the session to hang on a broken endpoint instead of falling back to healthy alternatives.

## Root Cause

Connection/network errors are not recognized by the current failover classifier:

- Text patterns like "Connection error.", "fetch failed", "network error"
- Error codes like ECONNREFUSED, ENOTFOUND, EAI_AGAIN (in message text)

While `failover-error.ts` handles these as error codes (err.code), it misses them when they appear as plain text in error messages.

## Solution

Extend timeout error patterns to include connection/network failures:

**In `errors.ts` (ERROR_PATTERNS.timeout):**

- Text: "connection error", "network error", "network request failed", "fetch failed", "socket hang up"
- Regex: /\beconn(?:refused|reset|aborted)\b/i, /\benotfound\b/i, /\beai_again\b/i

**In `failover-error.ts` (TIMEOUT_HINT_RE):**

- The same phrases and error codes for non-assistant error paths, added as plain case-insensitive alternatives (without the `\b` word boundaries used in `errors.ts`)

## Testing

Added test cases covering:

- "Connection error."
- "fetch failed" - "network error: ECONNREFUSED" - "ENOTFOUND" / "EAI_AGAIN" in message text ## Impact - **Compatibility:** High - only expands retryable error detection - **Behavior:** Connection failures now trigger automatic fallback - **Risk:** Low - changes are additive and well-tested * style: fix code formatting for test file --- src/agents/failover-error.test.ts | 16 ++++++++++++++++ src/agents/failover-error.ts | 2 +- ...mbedded-helpers.isbillingerrormessage.test.ts | 8 ++++++++ src/agents/pi-embedded-helpers/errors.ts | 8 ++++++++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/agents/failover-error.test.ts b/src/agents/failover-error.test.ts index 33ffe2d2d57..fa8a4e553a6 100644 --- a/src/agents/failover-error.test.ts +++ b/src/agents/failover-error.test.ts @@ -48,6 +48,22 @@ describe("failover-error", () => { expect(resolveFailoverReasonFromError({ message: "reason: error" })).toBe("timeout"); }); + it("infers timeout from connection/network error messages", () => { + expect(resolveFailoverReasonFromError({ message: "Connection error." 
})).toBe("timeout"); + expect(resolveFailoverReasonFromError({ message: "fetch failed" })).toBe("timeout"); + expect(resolveFailoverReasonFromError({ message: "Network error: ECONNREFUSED" })).toBe( + "timeout", + ); + expect( + resolveFailoverReasonFromError({ + message: "dial tcp: lookup api.example.com: no such host (ENOTFOUND)", + }), + ).toBe("timeout"); + expect(resolveFailoverReasonFromError({ message: "temporary dns failure EAI_AGAIN" })).toBe( + "timeout", + ); + }); + it("treats AbortError reason=abort as timeout", () => { const err = Object.assign(new Error("aborted"), { name: "AbortError", diff --git a/src/agents/failover-error.ts b/src/agents/failover-error.ts index 63e5c26c7a3..47660664c8c 100644 --- a/src/agents/failover-error.ts +++ b/src/agents/failover-error.ts @@ -6,7 +6,7 @@ import { } from "./pi-embedded-helpers.js"; const TIMEOUT_HINT_RE = - /timeout|timed out|deadline exceeded|context deadline exceeded|stop reason:\s*(?:abort|error)|reason:\s*(?:abort|error)|unhandled stop reason:\s*(?:abort|error)/i; + /timeout|timed out|deadline exceeded|context deadline exceeded|connection error|network error|network request failed|fetch failed|socket hang up|econnrefused|econnreset|econnaborted|enotfound|eai_again|stop reason:\s*(?:abort|error)|reason:\s*(?:abort|error)|unhandled stop reason:\s*(?:abort|error)/i; const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i; export class FailoverError extends Error { diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index 11b29abad3a..c9d073ce8c9 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -415,6 +415,7 @@ describe("isFailoverErrorMessage", () => { "429 rate limit exceeded", "Your credit balance is too low", "request timed out", + "Connection error.", "invalid request format", ]; for (const sample of samples) { @@ 
-494,6 +495,13 @@ describe("classifyFailoverReason", () => { expect(classifyFailoverReason("credit balance too low")).toBe("billing"); expect(classifyFailoverReason("deadline exceeded")).toBe("timeout"); expect(classifyFailoverReason("request ended without sending any chunks")).toBe("timeout"); + expect(classifyFailoverReason("Connection error.")).toBe("timeout"); + expect(classifyFailoverReason("fetch failed")).toBe("timeout"); + expect(classifyFailoverReason("network error: ECONNREFUSED")).toBe("timeout"); + expect( + classifyFailoverReason("dial tcp: lookup api.example.com: no such host (ENOTFOUND)"), + ).toBe("timeout"); + expect(classifyFailoverReason("temporary dns failure EAI_AGAIN")).toBe("timeout"); expect( classifyFailoverReason( "521 Web server is downCloudflare", diff --git a/src/agents/pi-embedded-helpers/errors.ts b/src/agents/pi-embedded-helpers/errors.ts index 754bd03ba9c..cfd1460fc61 100644 --- a/src/agents/pi-embedded-helpers/errors.ts +++ b/src/agents/pi-embedded-helpers/errors.ts @@ -640,6 +640,14 @@ const ERROR_PATTERNS = { "timed out", "deadline exceeded", "context deadline exceeded", + "connection error", + "network error", + "network request failed", + "fetch failed", + "socket hang up", + /\beconn(?:refused|reset|aborted)\b/i, + /\benotfound\b/i, + /\beai_again\b/i, /without sending (?:any )?chunks?/i, /\bstop reason:\s*(?:abort|error)\b/i, /\breason:\s*(?:abort|error)\b/i,