mirror of
https://github.com/moltbot/moltbot.git
synced 2026-03-09 15:35:17 +00:00
fix(cron): prevent armTimer tight loop when job has stuck runningAtMs (openclaw#29853) thanks @FlamesCN
Verified: - pnpm build - pnpm check - pnpm test:macmini Co-authored-by: FlamesCN <12966659+FlamesCN@users.noreply.github.com> Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
@@ -107,6 +107,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Telegram/Empty final replies: skip outbound send for null/undefined final text payloads without media so Telegram typing indicators do not linger on `text must be non-empty` errors. Landed from contributor PR #30969 by @haosenwang1018. Thanks @haosenwang1018.
|
||||
- Routing/Binding peer-kind parity: treat `peer.kind` `group` and `channel` as equivalent for binding scope matching (while keeping `direct` separate) so Slack/public channel bindings do not silently fall through. Landed from contributor PR #31135 by @Sid-Qin. Thanks @Sid-Qin.
|
||||
- Agents/FS workspace default: honor documented host file-tool default `tools.fs.workspaceOnly=false` when unset so host `write`/`edit` calls are not incorrectly workspace-restricted unless explicitly enabled. Landed from contributor PR #31128 by @SaucePackets. Thanks @SaucePackets.
|
||||
- Cron/Timer hot-loop guard: enforce a minimum timer re-arm delay when stale past-due jobs would otherwise trigger repeated `setTimeout(0)` loops, preventing event-loop saturation and log-flood behavior. (#29853) Thanks @FlamesCN.
|
||||
- Gateway/CLI session recovery: handle expired CLI session IDs gracefully by clearing stale session state and retrying without crashing gateway runs. Landed from contributor PR #31090 by @frankekn. Thanks @frankekn.
|
||||
- Slack/Subagent completion delivery: stop forcing bound conversation IDs into `threadId` so Slack completion announces do not send invalid `thread_ts` for DMs/top-level channels. Landed from contributor PR #31105 by @stakeswky. Thanks @stakeswky.
|
||||
- Signal/Loop protection: evaluate own-account detection before sync-message filtering (including UUID-only `accountUuid` configs) so `sentTranscript` sync events cannot bypass loop protection and self-reply loops. Landed from contributor PR #31093 by @kevinWangSheng. Thanks @kevinWangSheng.
|
||||
|
||||
184
src/cron/service.armtimer-tight-loop.test.ts
Normal file
184
src/cron/service.armtimer-tight-loop.test.ts
Normal file
@@ -0,0 +1,184 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { createNoopLogger, createCronStoreHarness } from "./service.test-harness.js";
|
||||
import { createCronServiceState } from "./service/state.js";
|
||||
import { armTimer, onTimer } from "./service/timer.js";
|
||||
import type { CronJob } from "./types.js";
|
||||
|
||||
const noopLogger = createNoopLogger();
|
||||
const { makeStorePath } = createCronStoreHarness({ prefix: "openclaw-cron-tight-loop-" });
|
||||
|
||||
/**
|
||||
* Create a cron job that is past-due AND has a stuck `runningAtMs` marker.
|
||||
* This combination causes `findDueJobs` to return `[]` (blocked by
|
||||
* `runningAtMs`) while `nextWakeAtMs` still returns the past-due timestamp,
|
||||
* which before the fix resulted in a `setTimeout(0)` tight loop.
|
||||
*/
|
||||
function createStuckPastDueJob(params: { id: string; nowMs: number; pastDueMs: number }): CronJob {
|
||||
const pastDueAt = params.nowMs - params.pastDueMs;
|
||||
return {
|
||||
id: params.id,
|
||||
name: "stuck-job",
|
||||
enabled: true,
|
||||
deleteAfterRun: false,
|
||||
createdAtMs: pastDueAt - 60_000,
|
||||
updatedAtMs: pastDueAt - 60_000,
|
||||
schedule: { kind: "cron", expr: "*/15 * * * *" },
|
||||
sessionTarget: "isolated",
|
||||
wakeMode: "next-heartbeat",
|
||||
payload: { kind: "agentTurn", message: "monitor" },
|
||||
delivery: { mode: "none" },
|
||||
state: {
|
||||
nextRunAtMs: pastDueAt,
|
||||
// Stuck: set from a previous execution that was interrupted.
|
||||
// Not yet old enough for STUCK_RUN_MS (2 h) to clear it.
|
||||
runningAtMs: pastDueAt + 1,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
describe("CronService - armTimer tight loop prevention", () => {
|
||||
beforeEach(() => {
|
||||
noopLogger.debug.mockClear();
|
||||
noopLogger.info.mockClear();
|
||||
noopLogger.warn.mockClear();
|
||||
noopLogger.error.mockClear();
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
it("enforces a minimum delay when the next wake time is in the past", () => {
|
||||
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
|
||||
const now = Date.parse("2026-02-28T12:32:00.000Z");
|
||||
const pastDueMs = 17 * 60 * 1000; // 17 minutes past due
|
||||
|
||||
const state = createCronServiceState({
|
||||
storePath: "/tmp/test-cron/jobs.json",
|
||||
cronEnabled: true,
|
||||
log: noopLogger,
|
||||
nowMs: () => now,
|
||||
enqueueSystemEvent: vi.fn(),
|
||||
requestHeartbeatNow: vi.fn(),
|
||||
runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok" }),
|
||||
});
|
||||
state.store = {
|
||||
version: 1,
|
||||
jobs: [createStuckPastDueJob({ id: "monitor", nowMs: now, pastDueMs })],
|
||||
};
|
||||
|
||||
armTimer(state);
|
||||
|
||||
expect(state.timer).not.toBeNull();
|
||||
const delays = timeoutSpy.mock.calls
|
||||
.map(([, delay]) => delay)
|
||||
.filter((d): d is number => typeof d === "number");
|
||||
|
||||
// Before the fix, delay would be 0 (tight loop).
|
||||
// After the fix, delay must be >= MIN_REFIRE_GAP_MS (2000 ms).
|
||||
expect(delays.length).toBeGreaterThan(0);
|
||||
for (const d of delays) {
|
||||
expect(d).toBeGreaterThanOrEqual(2_000);
|
||||
}
|
||||
|
||||
timeoutSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("does not add extra delay when the next wake time is in the future", () => {
|
||||
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
|
||||
const now = Date.parse("2026-02-28T12:32:00.000Z");
|
||||
|
||||
const state = createCronServiceState({
|
||||
storePath: "/tmp/test-cron/jobs.json",
|
||||
cronEnabled: true,
|
||||
log: noopLogger,
|
||||
nowMs: () => now,
|
||||
enqueueSystemEvent: vi.fn(),
|
||||
requestHeartbeatNow: vi.fn(),
|
||||
runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok" }),
|
||||
});
|
||||
state.store = {
|
||||
version: 1,
|
||||
jobs: [
|
||||
{
|
||||
id: "future-job",
|
||||
name: "future-job",
|
||||
enabled: true,
|
||||
deleteAfterRun: false,
|
||||
createdAtMs: now,
|
||||
updatedAtMs: now,
|
||||
schedule: { kind: "cron", expr: "*/15 * * * *" },
|
||||
sessionTarget: "isolated" as const,
|
||||
wakeMode: "next-heartbeat" as const,
|
||||
payload: { kind: "agentTurn" as const, message: "test" },
|
||||
delivery: { mode: "none" as const },
|
||||
state: { nextRunAtMs: now + 10_000 }, // 10 seconds in the future
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
armTimer(state);
|
||||
|
||||
const delays = timeoutSpy.mock.calls
|
||||
.map(([, delay]) => delay)
|
||||
.filter((d): d is number => typeof d === "number");
|
||||
|
||||
// The natural delay (10 s) should be used, not the floor.
|
||||
expect(delays).toContain(10_000);
|
||||
|
||||
timeoutSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("breaks the onTimer→armTimer hot-loop with stuck runningAtMs", async () => {
|
||||
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
|
||||
const store = await makeStorePath();
|
||||
const now = Date.parse("2026-02-28T12:32:00.000Z");
|
||||
const pastDueMs = 17 * 60 * 1000;
|
||||
|
||||
await fs.mkdir(path.dirname(store.storePath), { recursive: true });
|
||||
await fs.writeFile(
|
||||
store.storePath,
|
||||
JSON.stringify(
|
||||
{
|
||||
version: 1,
|
||||
jobs: [createStuckPastDueJob({ id: "monitor", nowMs: now, pastDueMs })],
|
||||
},
|
||||
null,
|
||||
2,
|
||||
),
|
||||
"utf-8",
|
||||
);
|
||||
|
||||
const state = createCronServiceState({
|
||||
storePath: store.storePath,
|
||||
cronEnabled: true,
|
||||
log: noopLogger,
|
||||
nowMs: () => now,
|
||||
enqueueSystemEvent: vi.fn(),
|
||||
requestHeartbeatNow: vi.fn(),
|
||||
runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok" }),
|
||||
});
|
||||
|
||||
// Simulate the onTimer path: it will find no runnable jobs (blocked by
|
||||
// runningAtMs) and re-arm the timer in its finally block.
|
||||
await onTimer(state);
|
||||
|
||||
expect(state.running).toBe(false);
|
||||
expect(state.timer).not.toBeNull();
|
||||
|
||||
// The re-armed timer must NOT use delay=0. It should use at least
|
||||
// MIN_REFIRE_GAP_MS to prevent the hot-loop.
|
||||
const allDelays = timeoutSpy.mock.calls
|
||||
.map(([, delay]) => delay)
|
||||
.filter((d): d is number => typeof d === "number");
|
||||
|
||||
// The last setTimeout call is from the finally→armTimer path.
|
||||
const lastDelay = allDelays[allDelays.length - 1];
|
||||
expect(lastDelay).toBeGreaterThanOrEqual(2_000);
|
||||
|
||||
timeoutSpy.mockRestore();
|
||||
await store.cleanup();
|
||||
});
|
||||
});
|
||||
@@ -446,9 +446,18 @@ export function armTimer(state: CronServiceState) {
|
||||
}
|
||||
const now = state.deps.nowMs();
|
||||
const delay = Math.max(nextAt - now, 0);
|
||||
// Floor: when the next wake time is in the past (delay === 0), enforce a
|
||||
// minimum delay to prevent a tight setTimeout(0) loop. This can happen
|
||||
// when a job has a stuck runningAtMs marker and a past-due nextRunAtMs:
|
||||
// findDueJobs skips the job (blocked by runningAtMs), while
|
||||
// recomputeNextRunsForMaintenance intentionally does not advance the
|
||||
// past-due nextRunAtMs (per #13992). The finally block in onTimer then
|
||||
// re-invokes armTimer with delay === 0, creating an infinite hot-loop
|
||||
// that saturates the event loop and fills the log file to its size cap.
|
||||
const flooredDelay = delay === 0 ? MIN_REFIRE_GAP_MS : delay;
|
||||
// Wake at least once a minute to avoid schedule drift and recover quickly
|
||||
// when the process was paused or wall-clock time jumps.
|
||||
const clampedDelay = Math.min(delay, MAX_TIMER_DELAY_MS);
|
||||
const clampedDelay = Math.min(flooredDelay, MAX_TIMER_DELAY_MS);
|
||||
// Intentionally avoid an `async` timer callback:
|
||||
// Vitest's fake-timer helpers can await async callbacks, which would block
|
||||
// tests that simulate long-running jobs. Runtime behavior is unchanged.
|
||||
|
||||
Reference in New Issue
Block a user