diff --git a/CHANGELOG.md b/CHANGELOG.md index 67afe89281e..b1a46285dc4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ Docs: https://docs.openclaw.ai - Discord/doctor: migrate unsupported per-channel `agentId` entries under guild channel config into top-level `bindings[]` routes, so `openclaw doctor --fix` preserves the intended agent route instead of stripping it as an unknown key. Fixes #62455. Thanks @lobster-biscuit. - Discord/DMs: set inbound direct-message `ctx.To` to the semantic `user:` target while keeping delivery routed through the DM channel, so mirror and recovery paths do not treat DMs as channel conversations. Fixes #68126. Thanks @illuminate0623. - Discord/DMs: keep no-guild inbound messages on direct-message routing when Discord channel lookup is temporarily unavailable, preventing degraded DMs from forking into channel sessions. Fixes #59817. Thanks @DooPeePey. +- Discord: retry outbound API calls on HTTP 5xx, request-timeout, and transient transport failures instead of only Discord rate limits, reducing dropped cron and agent replies during short Discord or network outages. Fixes #52396. Thanks @sunshineo. - Gateway/config: log config health-state write failures instead of silently hiding config observe-recovery write errors. Thanks @sallyom. - Diagnostics: reset stuck-session timers on reply, tool, status, block, and ACP progress events, and back off repeated `session.stuck` diagnostics while a session remains unchanged. Supersedes #72010. Thanks @rubencu. diff --git a/docs/concepts/retry.md b/docs/concepts/retry.md index a21969811d9..da9958434f0 100644 --- a/docs/concepts/retry.md +++ b/docs/concepts/retry.md @@ -37,7 +37,9 @@ title: "Retry policy" ### Discord -- Retries only on rate-limit errors (HTTP 429). +- Retries on rate-limit errors (HTTP 429), request timeouts, HTTP 5xx responses, + and transient transport failures such as DNS lookup failures, connection + resets, socket closes, and fetch failures. 
/**
 * Decides whether a failed Discord delivery attempt should be retried by the
 * delivery-level retry policy.
 *
 * `DiscordError` instances are never retried here; the accompanying test describes
 * them as already handled by the request runner. Any other thrown value is retried
 * when its `status` (or, when that is nullish, its `statusCode`) is 429 or >= 500.
 *
 * @param err - Whatever the delivery attempt threw or rejected with. May be any
 *   value, including `null`, `undefined`, or a primitive.
 * @returns `true` when the failure should be retried, `false` otherwise.
 */
export function isRetryableDiscordDeliveryError(err: unknown): boolean {
  if (err instanceof DiscordError) {
    return false;
  }
  // A promise can reject with null/undefined. Reading a property off those would
  // throw a TypeError from inside the retry predicate and hide the real failure.
  if (err === null || err === undefined) {
    return false;
  }
  const status = (err as { status?: number }).status ?? (err as { statusCode?: number }).statusCode;
  return status === 429 || (status !== undefined && status >= 500);
}
RateLimitError; + return new RateLimitErrorCtor(response, { + message: "rate limited", + retry_after: retryAfter, + global: false, + }); +} + +describe("isRetryableDiscordTransientError", () => { + it.each([ + ["rate limit", createRateLimitError()], + ["408 status", Object.assign(new Error("request timeout"), { status: 408 })], + ["502 status", Object.assign(new Error("bad gateway"), { status: 502 })], + ["503 statusCode", Object.assign(new Error("service unavailable"), { statusCode: 503 })], + ["fetch failed", new TypeError("fetch failed")], + ["ECONNRESET", Object.assign(new Error("socket hang up"), { code: "ECONNRESET" })], + ["ETIMEDOUT cause", new Error("request failed", { cause: { code: "ETIMEDOUT" } })], + ["abort", Object.assign(new Error("aborted"), { name: "AbortError" })], + ])("retries %s", (_name, err) => { + expect(isRetryableDiscordTransientError(err)).toBe(true); + }); + + it.each([ + ["400 status", Object.assign(new Error("bad request"), { status: 400 })], + ["403 status", Object.assign(new Error("missing permissions"), { statusCode: 403 })], + ["unknown channel", new Error("Unknown Channel")], + ["plain string", "fetch failed"], + ])("does not retry %s", (_name, err) => { + expect(isRetryableDiscordTransientError(err)).toBe(false); + }); +}); + +describe("createDiscordRetryRunner", () => { + it("retries transient transport errors", async () => { + const fn = vi.fn().mockRejectedValueOnce(new TypeError("fetch failed")).mockResolvedValue("ok"); + const runner = createDiscordRetryRunner({ retry: ZERO_DELAY_RETRY }); + + await expect(runner(fn, "send")).resolves.toBe("ok"); + expect(fn).toHaveBeenCalledTimes(2); + }); + + it("stops after configured transient retry attempts", async () => { + const fn = vi.fn().mockRejectedValue(new TypeError("fetch failed")); + const runner = createDiscordRetryRunner({ retry: ZERO_DELAY_RETRY }); + + await expect(runner(fn, "send")).rejects.toThrow("fetch failed"); + expect(fn).toHaveBeenCalledTimes(2); + }); +}); + 
/** HTTP statuses below 500 that are still treated as retryable (408 and 429). */
const DISCORD_RETRYABLE_STATUS_CODES = new Set([408, 429]);

/** Error `code` values treated as transient; compared after upper-casing. */
const DISCORD_RETRYABLE_ERROR_CODES = new Set([
  "EAI_AGAIN",
  "ECONNREFUSED",
  "ECONNRESET",
  "ENETUNREACH",
  "ENOTFOUND",
  "EPIPE",
  "ETIMEDOUT",
  "UND_ERR_BODY_TIMEOUT",
  "UND_ERR_CONNECT_TIMEOUT",
  "UND_ERR_HEADERS_TIMEOUT",
  "UND_ERR_SOCKET",
]);

/** Case-insensitive match for error messages that describe a transient failure. */
const DISCORD_TRANSIENT_MESSAGE_RE =
  /\b(?:bad gateway|fetch failed|network error|networkerror|service unavailable|socket hang up|temporarily unavailable|timed out|timeout)\b|connection (?:closed|reset|refused)/i;

/**
 * Converts one raw status field into a number.
 *
 * @param raw - The value found on a `status` / `statusCode` property.
 * @returns The value itself when it is a finite number, the parsed value when it
 *   is an all-digit string, otherwise `undefined`.
 */
function parseDiscordStatusValue(raw: unknown): number | undefined {
  if (typeof raw === "number" && Number.isFinite(raw)) {
    return raw;
  }
  if (typeof raw === "string" && /^\d+$/.test(raw)) {
    return Number(raw);
  }
  return undefined;
}

/**
 * Reads an HTTP status from an error-like object.
 *
 * `status` is preferred; `statusCode` is used whenever `status` is absent or does
 * not hold a usable value (for example `null`, `NaN`, or a non-numeric string), so
 * an unusable `status` can no longer hide a valid `statusCode`.
 *
 * @param err - Any thrown value.
 * @returns The numeric status, or `undefined` when `err` is not an object or
 *   neither field holds a usable value.
 */
function readDiscordErrorStatus(err: unknown): number | undefined {
  if (!err || typeof err !== "object") {
    return undefined;
  }
  const fromStatus = "status" in err ? parseDiscordStatusValue(err.status) : undefined;
  if (fromStatus !== undefined) {
    return fromStatus;
  }
  return "statusCode" in err ? parseDiscordStatusValue(err.statusCode) : undefined;
}

/**
 * Classifies a failed Discord API call as transient (worth retrying) or not.
 *
 * A `RateLimitError` is always retryable. Otherwise every candidate yielded by
 * `collectErrorGraphCandidates` is inspected, and the error is retryable as soon
 * as one candidate has any of:
 *   - a status of 408, 429, or >= 500;
 *   - an error code listed in `DISCORD_RETRYABLE_ERROR_CODES`;
 *   - the name `AbortError`;
 *   - a formatted message matching `DISCORD_TRANSIENT_MESSAGE_RE` (only checked
 *     for `Error` instances and other non-null objects).
 *
 * @param err - Whatever the Discord call threw or rejected with.
 * @returns `true` when the failure looks transient, `false` otherwise.
 */
export function isRetryableDiscordTransientError(err: unknown): boolean {
  if (err instanceof RateLimitError) {
    return true;
  }
  // NOTE(review): presumably this yields `err` itself plus whatever is reachable
  // through the `cause` / `error` links returned below; confirm against the helper.
  for (const candidate of collectErrorGraphCandidates(err, (current) => [
    current.cause,
    current.error,
  ])) {
    const status = readDiscordErrorStatus(candidate);
    if (status !== undefined && (DISCORD_RETRYABLE_STATUS_CODES.has(status) || status >= 500)) {
      return true;
    }
    const code = extractErrorCode(candidate);
    if (code && DISCORD_RETRYABLE_ERROR_CODES.has(code.toUpperCase())) {
      return true;
    }
    if (readErrorName(candidate) === "AbortError") {
      return true;
    }
    // Primitives are excluded, so a bare string such as "fetch failed" is never
    // matched on message text alone.
    if (
      (candidate instanceof Error || (candidate !== null && typeof candidate === "object")) &&
      DISCORD_TRANSIENT_MESSAGE_RE.test(formatErrorMessage(candidate))
    ) {
      return true;
    }
  }
  return false;
}
err.retryAfter * 1000 : undefined), }); } diff --git a/extensions/discord/src/send.creates-thread.test.ts b/extensions/discord/src/send.creates-thread.test.ts index c2fc59c154e..19faac5be0f 100644 --- a/extensions/discord/src/send.creates-thread.test.ts +++ b/extensions/discord/src/send.creates-thread.test.ts @@ -547,16 +547,33 @@ describe("retry rate limits", () => { expect(postMock).toHaveBeenCalledTimes(2); }); - it("does not retry non-rate-limit errors", async () => { + it("does not retry permanent non-rate-limit errors", async () => { const { rest, postMock } = makeDiscordRest(); - postMock.mockRejectedValueOnce(new Error("network error")); + postMock.mockRejectedValueOnce(new Error("invalid request")); await expect( sendMessageDiscord("channel:789", "hello", discordClientOpts(rest)), - ).rejects.toThrow("network error"); + ).rejects.toThrow("invalid request"); expect(postMock).toHaveBeenCalledTimes(1); }); + it("retries transient network errors", async () => { + const { rest, postMock } = makeDiscordRest(); + postMock + .mockRejectedValueOnce(new TypeError("fetch failed")) + .mockResolvedValueOnce({ id: "msg1", channel_id: "789" }); + + const result = await sendMessageDiscord("channel:789", "hello", { + cfg: DISCORD_TEST_CFG, + rest, + token: "t", + retry: { attempts: 2, minDelayMs: 0, maxDelayMs: 0, jitter: 0 }, + }); + + expect(result).toEqual({ messageId: "msg1", channelId: "789" }); + expect(postMock).toHaveBeenCalledTimes(2); + }); + it("retries reactions on rate limits", async () => { const { rest, putMock } = makeDiscordRest(); const rateLimitError = createMockRateLimitError(0);