fix(cron): isolate maintenance schedule recompute errors

2026-03-20 16:15:15 +00:00 · 2026-02-16 08:22:54 -05:00
parent aed8b90a0a
commit 1ffe6a45af
3 changed files with 88 additions and 21 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -65,6 +65,7 @@ Docs: https://docs.openclaw.ai
 - Auto-reply/WhatsApp/TUI/Web: when a final assistant message is `NO_REPLY` and a messaging tool send succeeded, mirror the delivered messaging-tool text into session-visible assistant output so TUI/Web no longer show `NO_REPLY` placeholders. (#7010) Thanks @Morrowind-Xie.
 - Auto-reply/TTS: keep tool-result media delivery enabled in group chats and native command sessions (while still suppressing tool summary text) so `NO_REPLY` follow-ups do not drop successful TTS audio. (#17991) Thanks @zerone0x.
 - Cron: infer `payload.kind="agentTurn"` for model-only `cron.update` payload patches, so partial agent-turn updates do not fail validation when `kind` is omitted. (#15664) Thanks @rodrigouroz.
+- Cron: preserve per-job schedule-error isolation in post-run maintenance recompute so malformed sibling jobs no longer abort persistence of successful runs. (#17852) Thanks @pierreeurope.
 - TUI: make searchable-select filtering and highlight rendering ANSI-aware so queries ignore hidden escape codes and no longer corrupt ANSI styling sequences during match highlighting. (#4519) Thanks @bee4come.
 - TUI/Windows: coalesce rapid single-line submit bursts in Git Bash into one multiline message as a fallback when bracketed paste is unavailable, preventing pasted multiline text from being split into multiple sends. (#4986) Thanks @adamkane.
 - TUI: suppress false `(no output)` placeholders for non-local empty final events during concurrent runs, preventing external-channel replies from showing empty assistant bubbles while a local run is still streaming. (#5782) Thanks @LagWizard and @vignesh07.
--- a/src/cron/service.issue-13992-regression.test.ts
+++ b/src/cron/service.issue-13992-regression.test.ts
@@ -127,4 +127,45 @@ describe("issue #13992 regression - cron jobs skip execution", () => {
    // But should NOT have changed nextRunAtMs (it's still future)
    expect(job.state.nextRunAtMs).toBe(futureTime);
  });
+
+  it("isolates schedule errors while filling missing nextRunAtMs", () => {
+    const now = Date.now();
+    const pastDue = now - 1_000;
+
+    const dueJob: CronJob = {
+      id: "due-job",
+      name: "due job",
+      enabled: true,
+      schedule: { kind: "cron", expr: "0 8 * * *", tz: "UTC" },
+      payload: { kind: "systemEvent", text: "due" },
+      sessionTarget: "main",
+      createdAtMs: now - 3600_000,
+      updatedAtMs: now - 3600_000,
+      state: {
+        nextRunAtMs: pastDue,
+      },
+    };
+
+    const malformedJob: CronJob = {
+      id: "bad-job",
+      name: "bad job",
+      enabled: true,
+      schedule: { kind: "cron", expr: "not a valid cron", tz: "UTC" },
+      payload: { kind: "systemEvent", text: "bad" },
+      sessionTarget: "main",
+      createdAtMs: now - 3600_000,
+      updatedAtMs: now - 3600_000,
+      state: {
+        // missing nextRunAtMs
+      },
+    };
+
+    const state = createMockState([dueJob, malformedJob]);
+
+    expect(() => recomputeNextRunsForMaintenance(state)).not.toThrow();
+    expect(dueJob.state.nextRunAtMs).toBe(pastDue);
+    expect(malformedJob.state.nextRunAtMs).toBeUndefined();
+    expect(malformedJob.state.scheduleErrorCount).toBe(1);
+    expect(malformedJob.state.lastError).toMatch(/^schedule error:/);
+  });
 });
--- a/src/cron/service/jobs.ts
+++ b/src/cron/service/jobs.ts
@@ -102,6 +102,35 @@ export function computeJobNextRunAtMs(job: CronJob, nowMs: number): number | und
 /** Maximum consecutive schedule errors before auto-disabling a job. */
 const MAX_SCHEDULE_ERRORS = 3;

+function recordScheduleComputeError(params: {
+  state: CronServiceState;
+  job: CronJob;
+  err: unknown;
+}): boolean {
+  const { state, job, err } = params;
+  const errorCount = (job.state.scheduleErrorCount ?? 0) + 1;
+  const errText = String(err);
+
+  job.state.scheduleErrorCount = errorCount;
+  job.state.nextRunAtMs = undefined;
+  job.state.lastError = `schedule error: ${errText}`;
+
+  if (errorCount >= MAX_SCHEDULE_ERRORS) {
+    job.enabled = false;
+    state.deps.log.error(
+      { jobId: job.id, name: job.name, errorCount, err: errText },
+      "cron: auto-disabled job after repeated schedule errors",
+    );
+  } else {
+    state.deps.log.warn(
+      { jobId: job.id, name: job.name, errorCount, err: errText },
+      "cron: failed to compute next run for job (skipping)",
+    );
+  }
+
+  return true;
+}
+
 function normalizeJobTickState(params: { state: CronServiceState; job: CronJob; nowMs: number }): {
  changed: boolean;
  skip: boolean;
@@ -184,23 +213,8 @@ export function recomputeNextRuns(state: CronServiceState): boolean {
          changed = true;
        }
      } catch (err) {
-        const errorCount = (job.state.scheduleErrorCount ?? 0) + 1;
-        job.state.scheduleErrorCount = errorCount;
-        job.state.nextRunAtMs = undefined;
-        job.state.lastError = `schedule error: ${String(err)}`;
-        changed = true;
-
-        if (errorCount >= MAX_SCHEDULE_ERRORS) {
-          job.enabled = false;
-          state.deps.log.error(
-            { jobId: job.id, name: job.name, errorCount, err: String(err) },
-            "cron: auto-disabled job after repeated schedule errors",
-          );
-        } else {
-          state.deps.log.warn(
-            { jobId: job.id, name: job.name, errorCount, err: String(err) },
-            "cron: failed to compute next run for job (skipping)",
-          );
+        if (recordScheduleComputeError({ state, job, err })) {
+          changed = true;
        }
      }
    }
@@ -222,10 +236,21 @@ export function recomputeNextRunsForMaintenance(state: CronServiceState): boolea
    // If a job was past-due but not found by findDueJobs, recomputing would
    // cause it to be silently skipped.
    if (job.state.nextRunAtMs === undefined) {
-      const newNext = computeJobNextRunAtMs(job, now);
-      if (newNext !== undefined) {
-        job.state.nextRunAtMs = newNext;
-        changed = true;
+      try {
+        const newNext = computeJobNextRunAtMs(job, now);
+        if (job.state.nextRunAtMs !== newNext) {
+          job.state.nextRunAtMs = newNext;
+          changed = true;
+        }
+        // Clear schedule error count on successful computation.
+        if (job.state.scheduleErrorCount) {
+          job.state.scheduleErrorCount = undefined;
+          changed = true;
+        }
+      } catch (err) {
+        if (recordScheduleComputeError({ state, job, err })) {
+          changed = true;
+        }
      }
    }
    return changed;