diff --git a/CHANGELOG.md b/CHANGELOG.md index ab09adc2209..2be9e346e5f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -107,6 +107,7 @@ Docs: https://docs.openclaw.ai - Telegram/Empty final replies: skip outbound send for null/undefined final text payloads without media so Telegram typing indicators do not linger on `text must be non-empty` errors. Landed from contributor PR #30969 by @haosenwang1018. Thanks @haosenwang1018. - Routing/Binding peer-kind parity: treat `peer.kind` `group` and `channel` as equivalent for binding scope matching (while keeping `direct` separate) so Slack/public channel bindings do not silently fall through. Landed from contributor PR #31135 by @Sid-Qin. Thanks @Sid-Qin. - Agents/FS workspace default: honor documented host file-tool default `tools.fs.workspaceOnly=false` when unset so host `write`/`edit` calls are not incorrectly workspace-restricted unless explicitly enabled. Landed from contributor PR #31128 by @SaucePackets. Thanks @SaucePackets. +- Cron/Timer hot-loop guard: enforce a minimum timer re-arm delay when stale past-due jobs would otherwise trigger repeated `setTimeout(0)` loops, preventing event-loop saturation and log-flood behavior. (#29853) Thanks @FlamesCN. - Gateway/CLI session recovery: handle expired CLI session IDs gracefully by clearing stale session state and retrying without crashing gateway runs. Landed from contributor PR #31090 by @frankekn. Thanks @frankekn. - Slack/Subagent completion delivery: stop forcing bound conversation IDs into `threadId` so Slack completion announces do not send invalid `thread_ts` for DMs/top-level channels. Landed from contributor PR #31105 by @stakeswky. Thanks @stakeswky. - Signal/Loop protection: evaluate own-account detection before sync-message filtering (including UUID-only `accountUuid` configs) so `sentTranscript` sync events cannot bypass loop protection and self-reply loops. Landed from contributor PR #31093 by @kevinWangSheng. Thanks @kevinWangSheng. diff --git a/src/cron/service.armtimer-tight-loop.test.ts b/src/cron/service.armtimer-tight-loop.test.ts new file mode 100644 index 00000000000..a82aa36fbb2 --- /dev/null +++ b/src/cron/service.armtimer-tight-loop.test.ts @@ -0,0 +1,184 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { createNoopLogger, createCronStoreHarness } from "./service.test-harness.js"; +import { createCronServiceState } from "./service/state.js"; +import { armTimer, onTimer } from "./service/timer.js"; +import type { CronJob } from "./types.js"; + +const noopLogger = createNoopLogger(); +const { makeStorePath } = createCronStoreHarness({ prefix: "openclaw-cron-tight-loop-" }); + +/** + * Create a cron job that is past-due AND has a stuck `runningAtMs` marker. + * This combination causes `findDueJobs` to return `[]` (blocked by + * `runningAtMs`) while `nextWakeAtMs` still returns the past-due timestamp, + * which before the fix resulted in a `setTimeout(0)` tight loop. + */ +function createStuckPastDueJob(params: { id: string; nowMs: number; pastDueMs: number }): CronJob { + const pastDueAt = params.nowMs - params.pastDueMs; + return { + id: params.id, + name: "stuck-job", + enabled: true, + deleteAfterRun: false, + createdAtMs: pastDueAt - 60_000, + updatedAtMs: pastDueAt - 60_000, + schedule: { kind: "cron", expr: "*/15 * * * *" }, + sessionTarget: "isolated", + wakeMode: "next-heartbeat", + payload: { kind: "agentTurn", message: "monitor" }, + delivery: { mode: "none" }, + state: { + nextRunAtMs: pastDueAt, + // Stuck: set from a previous execution that was interrupted. + // Not yet old enough for STUCK_RUN_MS (2 h) to clear it. + runningAtMs: pastDueAt + 1, + }, + }; +} + +describe("CronService - armTimer tight loop prevention", () => { + beforeEach(() => { + noopLogger.debug.mockClear(); + noopLogger.info.mockClear(); + noopLogger.warn.mockClear(); + noopLogger.error.mockClear(); + }); + + afterEach(() => { + vi.clearAllMocks(); + }); + + it("enforces a minimum delay when the next wake time is in the past", () => { + const timeoutSpy = vi.spyOn(globalThis, "setTimeout"); + const now = Date.parse("2026-02-28T12:32:00.000Z"); + const pastDueMs = 17 * 60 * 1000; // 17 minutes past due + + const state = createCronServiceState({ + storePath: "/tmp/test-cron/jobs.json", + cronEnabled: true, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok" }), + }); + state.store = { + version: 1, + jobs: [createStuckPastDueJob({ id: "monitor", nowMs: now, pastDueMs })], + }; + + armTimer(state); + + expect(state.timer).not.toBeNull(); + const delays = timeoutSpy.mock.calls + .map(([, delay]) => delay) + .filter((d): d is number => typeof d === "number"); + + // Before the fix, delay would be 0 (tight loop). + // After the fix, delay must be >= MIN_REFIRE_GAP_MS (2000 ms). + expect(delays.length).toBeGreaterThan(0); + for (const d of delays) { + expect(d).toBeGreaterThanOrEqual(2_000); + } + + timeoutSpy.mockRestore(); + }); + + it("does not add extra delay when the next wake time is in the future", () => { + const timeoutSpy = vi.spyOn(globalThis, "setTimeout"); + const now = Date.parse("2026-02-28T12:32:00.000Z"); + + const state = createCronServiceState({ + storePath: "/tmp/test-cron/jobs.json", + cronEnabled: true, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok" }), + }); + state.store = { + version: 1, + jobs: [ + { + id: "future-job", + name: "future-job", + enabled: true, + deleteAfterRun: false, + createdAtMs: now, + updatedAtMs: now, + schedule: { kind: "cron", expr: "*/15 * * * *" }, + sessionTarget: "isolated" as const, + wakeMode: "next-heartbeat" as const, + payload: { kind: "agentTurn" as const, message: "test" }, + delivery: { mode: "none" as const }, + state: { nextRunAtMs: now + 10_000 }, // 10 seconds in the future + }, + ], + }; + + armTimer(state); + + const delays = timeoutSpy.mock.calls + .map(([, delay]) => delay) + .filter((d): d is number => typeof d === "number"); + + // The natural delay (10 s) should be used, not the floor. + expect(delays).toContain(10_000); + + timeoutSpy.mockRestore(); + }); + + it("breaks the onTimer→armTimer hot-loop with stuck runningAtMs", async () => { + const timeoutSpy = vi.spyOn(globalThis, "setTimeout"); + const store = await makeStorePath(); + const now = Date.parse("2026-02-28T12:32:00.000Z"); + const pastDueMs = 17 * 60 * 1000; + + await fs.mkdir(path.dirname(store.storePath), { recursive: true }); + await fs.writeFile( + store.storePath, + JSON.stringify( + { + version: 1, + jobs: [createStuckPastDueJob({ id: "monitor", nowMs: now, pastDueMs })], + }, + null, + 2, + ), + "utf-8", + ); + + const state = createCronServiceState({ + storePath: store.storePath, + cronEnabled: true, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok" }), + }); + + // Simulate the onTimer path: it will find no runnable jobs (blocked by + // runningAtMs) and re-arm the timer in its finally block. + await onTimer(state); + + expect(state.running).toBe(false); + expect(state.timer).not.toBeNull(); + + // The re-armed timer must NOT use delay=0. It should use at least + // MIN_REFIRE_GAP_MS to prevent the hot-loop. + const allDelays = timeoutSpy.mock.calls + .map(([, delay]) => delay) + .filter((d): d is number => typeof d === "number"); + + // The last setTimeout call is from the finally→armTimer path. + const lastDelay = allDelays[allDelays.length - 1]; + expect(lastDelay).toBeGreaterThanOrEqual(2_000); + + timeoutSpy.mockRestore(); + await store.cleanup(); + }); +}); diff --git a/src/cron/service/timer.ts b/src/cron/service/timer.ts index 333caabbfb1..4d38e7c33f1 100644 --- a/src/cron/service/timer.ts +++ b/src/cron/service/timer.ts @@ -446,9 +446,18 @@ export function armTimer(state: CronServiceState) { } const now = state.deps.nowMs(); const delay = Math.max(nextAt - now, 0); + // Floor: when the next wake time is in the past (delay === 0), enforce a + // minimum delay to prevent a tight setTimeout(0) loop. This can happen + // when a job has a stuck runningAtMs marker and a past-due nextRunAtMs: + // findDueJobs skips the job (blocked by runningAtMs), while + // recomputeNextRunsForMaintenance intentionally does not advance the + // past-due nextRunAtMs (per #13992). The finally block in onTimer then + // re-invokes armTimer with delay === 0, creating an infinite hot-loop + // that saturates the event loop and fills the log file to its size cap. + const flooredDelay = delay === 0 ? MIN_REFIRE_GAP_MS : delay; // Wake at least once a minute to avoid schedule drift and recover quickly // when the process was paused or wall-clock time jumps. - const clampedDelay = Math.min(delay, MAX_TIMER_DELAY_MS); + const clampedDelay = Math.min(flooredDelay, MAX_TIMER_DELAY_MS); // Intentionally avoid an `async` timer callback: // Vitest's fake-timer helpers can await async callbacks, which would block // tests that simulate long-running jobs. Runtime behavior is unchanged.