fix(cron): prevent armTimer tight loop when job has stuck runningAtMs (openclaw#29853) thanks @FlamesCN

Verified:
- pnpm build
- pnpm check
- pnpm test:macmini

Co-authored-by: FlamesCN <12966659+FlamesCN@users.noreply.github.com>
Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
FlamesCN
2026-03-02 09:58:58 +08:00
committed by GitHub
parent ffe1937b92
commit aaa7de45fa
3 changed files with 195 additions and 1 deletions

View File

@@ -107,6 +107,7 @@ Docs: https://docs.openclaw.ai
- Telegram/Empty final replies: skip outbound send for null/undefined final text payloads without media so Telegram typing indicators do not linger on `text must be non-empty` errors. Landed from contributor PR #30969 by @haosenwang1018. Thanks @haosenwang1018.
- Routing/Binding peer-kind parity: treat `peer.kind` `group` and `channel` as equivalent for binding scope matching (while keeping `direct` separate) so Slack/public channel bindings do not silently fall through. Landed from contributor PR #31135 by @Sid-Qin. Thanks @Sid-Qin.
- Agents/FS workspace default: honor documented host file-tool default `tools.fs.workspaceOnly=false` when unset so host `write`/`edit` calls are not incorrectly workspace-restricted unless explicitly enabled. Landed from contributor PR #31128 by @SaucePackets. Thanks @SaucePackets.
- Cron/Timer hot-loop guard: enforce a minimum timer re-arm delay when stale past-due jobs would otherwise trigger repeated `setTimeout(0)` loops, preventing event-loop saturation and log-flood behavior. (#29853) Thanks @FlamesCN.
- Gateway/CLI session recovery: handle expired CLI session IDs gracefully by clearing stale session state and retrying without crashing gateway runs. Landed from contributor PR #31090 by @frankekn. Thanks @frankekn.
- Slack/Subagent completion delivery: stop forcing bound conversation IDs into `threadId` so Slack completion announces do not send invalid `thread_ts` for DMs/top-level channels. Landed from contributor PR #31105 by @stakeswky. Thanks @stakeswky.
- Signal/Loop protection: evaluate own-account detection before sync-message filtering (including UUID-only `accountUuid` configs) so `sentTranscript` sync events cannot bypass loop protection and self-reply loops. Landed from contributor PR #31093 by @kevinWangSheng. Thanks @kevinWangSheng.

View File

@@ -0,0 +1,184 @@
import fs from "node:fs/promises";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { createNoopLogger, createCronStoreHarness } from "./service.test-harness.js";
import { createCronServiceState } from "./service/state.js";
import { armTimer, onTimer } from "./service/timer.js";
import type { CronJob } from "./types.js";
const noopLogger = createNoopLogger();
const { makeStorePath } = createCronStoreHarness({ prefix: "openclaw-cron-tight-loop-" });
/**
* Create a cron job that is past-due AND has a stuck `runningAtMs` marker.
* This combination causes `findDueJobs` to return `[]` (blocked by
* `runningAtMs`) while `nextWakeAtMs` still returns the past-due timestamp,
* which before the fix resulted in a `setTimeout(0)` tight loop.
*/
function createStuckPastDueJob(params: { id: string; nowMs: number; pastDueMs: number }): CronJob {
const pastDueAt = params.nowMs - params.pastDueMs;
return {
id: params.id,
name: "stuck-job",
enabled: true,
deleteAfterRun: false,
createdAtMs: pastDueAt - 60_000,
updatedAtMs: pastDueAt - 60_000,
schedule: { kind: "cron", expr: "*/15 * * * *" },
sessionTarget: "isolated",
wakeMode: "next-heartbeat",
payload: { kind: "agentTurn", message: "monitor" },
delivery: { mode: "none" },
state: {
nextRunAtMs: pastDueAt,
// Stuck: set from a previous execution that was interrupted.
// Not yet old enough for STUCK_RUN_MS (2 h) to clear it.
runningAtMs: pastDueAt + 1,
},
};
}
describe("CronService - armTimer tight loop prevention", () => {
beforeEach(() => {
noopLogger.debug.mockClear();
noopLogger.info.mockClear();
noopLogger.warn.mockClear();
noopLogger.error.mockClear();
});
afterEach(() => {
vi.clearAllMocks();
});
it("enforces a minimum delay when the next wake time is in the past", () => {
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
const now = Date.parse("2026-02-28T12:32:00.000Z");
const pastDueMs = 17 * 60 * 1000; // 17 minutes past due
const state = createCronServiceState({
storePath: "/tmp/test-cron/jobs.json",
cronEnabled: true,
log: noopLogger,
nowMs: () => now,
enqueueSystemEvent: vi.fn(),
requestHeartbeatNow: vi.fn(),
runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok" }),
});
state.store = {
version: 1,
jobs: [createStuckPastDueJob({ id: "monitor", nowMs: now, pastDueMs })],
};
armTimer(state);
expect(state.timer).not.toBeNull();
const delays = timeoutSpy.mock.calls
.map(([, delay]) => delay)
.filter((d): d is number => typeof d === "number");
// Before the fix, delay would be 0 (tight loop).
// After the fix, delay must be >= MIN_REFIRE_GAP_MS (2000 ms).
expect(delays.length).toBeGreaterThan(0);
for (const d of delays) {
expect(d).toBeGreaterThanOrEqual(2_000);
}
timeoutSpy.mockRestore();
});
it("does not add extra delay when the next wake time is in the future", () => {
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
const now = Date.parse("2026-02-28T12:32:00.000Z");
const state = createCronServiceState({
storePath: "/tmp/test-cron/jobs.json",
cronEnabled: true,
log: noopLogger,
nowMs: () => now,
enqueueSystemEvent: vi.fn(),
requestHeartbeatNow: vi.fn(),
runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok" }),
});
state.store = {
version: 1,
jobs: [
{
id: "future-job",
name: "future-job",
enabled: true,
deleteAfterRun: false,
createdAtMs: now,
updatedAtMs: now,
schedule: { kind: "cron", expr: "*/15 * * * *" },
sessionTarget: "isolated" as const,
wakeMode: "next-heartbeat" as const,
payload: { kind: "agentTurn" as const, message: "test" },
delivery: { mode: "none" as const },
state: { nextRunAtMs: now + 10_000 }, // 10 seconds in the future
},
],
};
armTimer(state);
const delays = timeoutSpy.mock.calls
.map(([, delay]) => delay)
.filter((d): d is number => typeof d === "number");
// The natural delay (10 s) should be used, not the floor.
expect(delays).toContain(10_000);
timeoutSpy.mockRestore();
});
it("breaks the onTimer→armTimer hot-loop with stuck runningAtMs", async () => {
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
const store = await makeStorePath();
const now = Date.parse("2026-02-28T12:32:00.000Z");
const pastDueMs = 17 * 60 * 1000;
await fs.mkdir(path.dirname(store.storePath), { recursive: true });
await fs.writeFile(
store.storePath,
JSON.stringify(
{
version: 1,
jobs: [createStuckPastDueJob({ id: "monitor", nowMs: now, pastDueMs })],
},
null,
2,
),
"utf-8",
);
const state = createCronServiceState({
storePath: store.storePath,
cronEnabled: true,
log: noopLogger,
nowMs: () => now,
enqueueSystemEvent: vi.fn(),
requestHeartbeatNow: vi.fn(),
runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok" }),
});
// Simulate the onTimer path: it will find no runnable jobs (blocked by
// runningAtMs) and re-arm the timer in its finally block.
await onTimer(state);
expect(state.running).toBe(false);
expect(state.timer).not.toBeNull();
// The re-armed timer must NOT use delay=0. It should use at least
// MIN_REFIRE_GAP_MS to prevent the hot-loop.
const allDelays = timeoutSpy.mock.calls
.map(([, delay]) => delay)
.filter((d): d is number => typeof d === "number");
// The last setTimeout call is from the finally→armTimer path.
const lastDelay = allDelays[allDelays.length - 1];
expect(lastDelay).toBeGreaterThanOrEqual(2_000);
timeoutSpy.mockRestore();
await store.cleanup();
});
});

View File

@@ -446,9 +446,18 @@ export function armTimer(state: CronServiceState) {
}
const now = state.deps.nowMs();
const delay = Math.max(nextAt - now, 0);
// Floor: when the next wake time is in the past (delay === 0), enforce a
// minimum delay to prevent a tight setTimeout(0) loop. This can happen
// when a job has a stuck runningAtMs marker and a past-due nextRunAtMs:
// findDueJobs skips the job (blocked by runningAtMs), while
// recomputeNextRunsForMaintenance intentionally does not advance the
// past-due nextRunAtMs (per #13992). The finally block in onTimer then
// re-invokes armTimer with delay === 0, creating an infinite hot-loop
// that saturates the event loop and fills the log file to its size cap.
const flooredDelay = delay === 0 ? MIN_REFIRE_GAP_MS : delay;
// Wake at least once a minute to avoid schedule drift and recover quickly
// when the process was paused or wall-clock time jumps.
const clampedDelay = Math.min(delay, MAX_TIMER_DELAY_MS);
const clampedDelay = Math.min(flooredDelay, MAX_TIMER_DELAY_MS);
// Intentionally avoid an `async` timer callback:
// Vitest's fake-timer helpers can await async callbacks, which would block
// tests that simulate long-running jobs. Runtime behavior is unchanged.