From 85cd41e68ee9544d57fbea5e8f7efbc71804407b Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 8 May 2026 16:36:08 +0100 Subject: [PATCH] refactor: move restart handoff state to sqlite --- docs/refactor/database-first.md | 3 + .../check-database-first-legacy-stores.mjs | 5 + src/infra/restart-handoff.test.ts | 28 +++-- src/infra/restart-handoff.ts | 110 +++++------------- src/infra/restart-intent.test.ts | 13 ++- src/infra/restart.ts | 107 +++++++---------- 6 files changed, 111 insertions(+), 155 deletions(-) diff --git a/docs/refactor/database-first.md b/docs/refactor/database-first.md index 314d83c007c..a8e288d6ff0 100644 --- a/docs/refactor/database-first.md +++ b/docs/refactor/database-first.md @@ -376,6 +376,9 @@ The remaining cleanup is mostly consolidation and deletion: - Gateway restart sentinel state now uses shared SQLite KV instead of `restart-sentinel.json`; the old path resolver remains only for legacy cleanup/status compatibility. +- Gateway restart intent and supervisor handoff state now use shared SQLite KV + instead of `gateway-restart-intent.json` and + `gateway-supervisor-restart-handoff.json` sidecars. - Gateway singleton coordination now uses SQLite KV rows under `gateway_locks` instead of writing `gateway..lock` files. The lock still records pid, config path, process start time, and stale-owner metadata, diff --git a/scripts/check-database-first-legacy-stores.mjs b/scripts/check-database-first-legacy-stores.mjs index ec5b90c9cb1..5e7fad0e2df 100644 --- a/scripts/check-database-first-legacy-stores.mjs +++ b/scripts/check-database-first-legacy-stores.mjs @@ -39,6 +39,11 @@ const legacyStoreMarkers = [ { label: "ACPX process leases JSON", pattern: /\bprocess-leases\.json\b/u }, { label: "ACPX gateway instance id file", pattern: /\bgateway-instance-id\b/u }, { label: "gateway restart sentinel JSON", pattern: /\brestart-sentinel\.json\b/u }, + { label: "gateway restart intent JSON", pattern: /\bgateway-restart-intent\.json\b/u }, + { + label: "gateway supervisor restart handoff JSON", + pattern: /\bgateway-supervisor-restart-handoff\.json\b/u, + }, { label: "gateway singleton lock file", pattern: /\bgateway\.[A-Za-z0-9._-]+\.lock\b/u }, { label: "QMD embed lock file", pattern: /\bqmd[/\\]embed\.lock\b/u }, { diff --git a/src/infra/restart-handoff.test.ts b/src/infra/restart-handoff.test.ts index 854176bac4d..7c54dd2c51e 100644 --- a/src/infra/restart-handoff.test.ts +++ b/src/infra/restart-handoff.test.ts @@ -2,10 +2,10 @@ import fs from "node:fs"; import os from "node:os"; import path from "node:path"; import { afterEach, describe, expect, it } from "vitest"; +import { closeOpenClawStateDatabaseForTest } from "../state/openclaw-state-db.js"; import { consumeGatewayRestartHandoffForExitedProcessSync, formatGatewayRestartHandoffDiagnostic, - GATEWAY_SUPERVISOR_RESTART_HANDOFF_FILENAME, GATEWAY_SUPERVISOR_RESTART_HANDOFF_KIND, readGatewayRestartHandoffSync, writeGatewayRestartHandoffSync, @@ -13,6 +13,8 @@ import { import type { GatewayRestartHandoff } from "./restart-handoff.js"; const tempDirs: string[] = []; +const LEGACY_GATEWAY_SUPERVISOR_RESTART_HANDOFF_FILENAME = + "gateway-supervisor-restart-handoff.json"; function createHandoffEnv(): NodeJS.ProcessEnv { const dir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-restart-handoff-")); @@ -24,7 +26,10 @@ function createHandoffEnv(): NodeJS.ProcessEnv { } function handoffPath(env: NodeJS.ProcessEnv): string { - return path.join(env.OPENCLAW_STATE_DIR ?? "", GATEWAY_SUPERVISOR_RESTART_HANDOFF_FILENAME); + return path.join( + env.OPENCLAW_STATE_DIR ?? "", + LEGACY_GATEWAY_SUPERVISOR_RESTART_HANDOFF_FILENAME, + ); } function expectWrittenHandoff( @@ -39,6 +44,7 @@ function expectWrittenHandoff( describe("gateway restart handoff", () => { afterEach(() => { + closeOpenClawStateDatabaseForTest(); for (const dir of tempDirs.splice(0)) { fs.rmSync(dir, { force: true, recursive: true }); } @@ -69,7 +75,7 @@ describe("gateway restart handoff", () => { createdAt: 1_000, expiresAt: 61_000, }); - expect(fs.statSync(handoffPath(env)).mode & 0o777).toBe(0o600); + expect(fs.existsSync(handoffPath(env))).toBe(false); expect(readGatewayRestartHandoffSync(env, 1_500)).toMatchObject({ pid: 12_345, reason: "plugin source changed", @@ -101,6 +107,7 @@ describe("gateway restart handoff", () => { supervisorMode: "systemd", }); expect(fs.existsSync(handoffPath(env))).toBe(false); + expect(readGatewayRestartHandoffSync(env, 2_001)).toBeNull(); }); it("rejects handoffs for a different exited pid and clears them", () => { @@ -147,7 +154,7 @@ describe("gateway restart handoff", () => { expect(fs.existsSync(handoffPath(env))).toBe(false); }); - it("rejects malformed handoff payloads", () => { + it("ignores malformed legacy handoff files", () => { const env = createHandoffEnv(); fs.writeFileSync( @@ -168,9 +175,10 @@ describe("gateway restart handoff", () => { ); expect(readGatewayRestartHandoffSync(env, 1_001)).toBeNull(); + expect(fs.existsSync(handoffPath(env))).toBe(true); }); - it("rejects expired and oversized handoff files", () => { + it("rejects expired SQLite handoffs and ignores oversized legacy files", () => { const env = createHandoffEnv(); expectWrittenHandoff({ @@ -191,10 +199,10 @@ describe("gateway restart handoff", () => { now: 2_001, }), ).toBeNull(); - expect(fs.existsSync(handoffPath(env))).toBe(false); + expect(fs.existsSync(handoffPath(env))).toBe(true); }); - it("rejects persisted handoffs with a ttl longer than the supported window", () => { + it("ignores legacy persisted handoffs with a ttl longer than the supported window", () => { const env = createHandoffEnv(); fs.writeFileSync( @@ -221,10 +229,10 @@ describe("gateway restart handoff", () => { now: 1_001, }), ).toBeNull(); - expect(fs.existsSync(handoffPath(env))).toBe(false); + expect(fs.existsSync(handoffPath(env))).toBe(true); }); - it("does not follow an existing handoff-path symlink when writing", () => { + it("does not touch an existing legacy handoff-path symlink when writing", () => { const env = createHandoffEnv(); const targetPath = path.join(env.OPENCLAW_STATE_DIR ?? "", "attacker-target.txt"); fs.writeFileSync(targetPath, "keep", "utf8"); @@ -242,7 +250,7 @@ describe("gateway restart handoff", () => { }); expect(fs.readFileSync(targetPath, "utf8")).toBe("keep"); - expect(fs.lstatSync(handoffPath(env)).isSymbolicLink()).toBe(false); + expect(fs.lstatSync(handoffPath(env)).isSymbolicLink()).toBe(true); expect( consumeGatewayRestartHandoffForExitedProcessSync({ env, diff --git a/src/infra/restart-handoff.ts b/src/infra/restart-handoff.ts index af0c31bb754..0aa54b2654b 100644 --- a/src/infra/restart-handoff.ts +++ b/src/infra/restart-handoff.ts @@ -1,14 +1,16 @@ import { randomUUID } from "node:crypto"; -import fs from "node:fs"; -import path from "node:path"; -import { resolveStateDir } from "../config/paths.js"; import { createSubsystemLogger } from "../logging/subsystem.js"; +import { + deleteOpenClawStateKvJson, + readOpenClawStateKvJson, + writeOpenClawStateKvJson, + type OpenClawStateJsonValue, +} from "../state/openclaw-state-kv.js"; -export const GATEWAY_SUPERVISOR_RESTART_HANDOFF_FILENAME = - "gateway-supervisor-restart-handoff.json"; export const GATEWAY_SUPERVISOR_RESTART_HANDOFF_KIND = "gateway-supervisor-restart-handoff"; const GATEWAY_RESTART_HANDOFF_TTL_MS = 60_000; -const GATEWAY_RESTART_HANDOFF_MAX_BYTES = 4096; +const GATEWAY_RESTART_HANDOFF_KV_SCOPE = "gateway.restart-handoff"; +const GATEWAY_RESTART_HANDOFF_KV_KEY = "current"; const MAX_INTENT_ID_LENGTH = 120; const MAX_PROCESS_INSTANCE_ID_LENGTH = 120; const MAX_REASON_LENGTH = 200; @@ -87,25 +89,10 @@ export function formatGatewayRestartHandoffDiagnostic( return `Recent restart handoff: ${detail.join("; ")}`; } -function resolveGatewayRestartHandoffPath(env: NodeJS.ProcessEnv = process.env): string { - return path.join(resolveStateDir(env), GATEWAY_SUPERVISOR_RESTART_HANDOFF_FILENAME); -} - -function unlinkRegularFileSync(filePath: string): boolean { - try { - const stat = fs.lstatSync(filePath); - if (!stat.isFile() || stat.nlink > 1) { - return false; - } - fs.unlinkSync(filePath); - return true; - } catch { - return false; - } -} - export function clearGatewayRestartHandoffSync(env: NodeJS.ProcessEnv = process.env): void { - unlinkRegularFileSync(resolveGatewayRestartHandoffPath(env)); + deleteOpenClawStateKvJson(GATEWAY_RESTART_HANDOFF_KV_SCOPE, GATEWAY_RESTART_HANDOFF_KV_KEY, { + env, + }); } function normalizePid(pid: number | undefined): number | null { @@ -182,13 +169,7 @@ function isRecord(value: unknown): value is Record { return typeof value === "object" && value !== null && !Array.isArray(value); } -function parseGatewayRestartHandoff(raw: string): GatewayRestartHandoff | null { - let parsed: unknown; - try { - parsed = JSON.parse(raw); - } catch { - return null; - } +function parseGatewayRestartHandoff(parsed: unknown): GatewayRestartHandoff | null { if (!isRecord(parsed)) { return null; } @@ -236,19 +217,6 @@ function parseGatewayRestartHandoff(raw: string): GatewayRestartHandoff | null { }; } -function readGatewayRestartHandoffRawSync(env: NodeJS.ProcessEnv): string | null { - const handoffPath = resolveGatewayRestartHandoffPath(env); - try { - const stat = fs.lstatSync(handoffPath); - if (!stat.isFile() || stat.nlink > 1 || stat.size > GATEWAY_RESTART_HANDOFF_MAX_BYTES) { - return null; - } - return fs.readFileSync(handoffPath, "utf8"); - } catch { - return null; - } -} - export function writeGatewayRestartHandoffSync(opts: { env?: NodeJS.ProcessEnv; pid?: number; @@ -291,29 +259,15 @@ export function writeGatewayRestartHandoffSync(opts: { supervisorMode, }; - let tmpPath: string | undefined; try { - const handoffPath = resolveGatewayRestartHandoffPath(env); - fs.mkdirSync(path.dirname(handoffPath), { recursive: true }); - tmpPath = path.join( - path.dirname(handoffPath), - `.${path.basename(handoffPath)}.${process.pid}.${Date.now()}.${randomUUID()}.tmp`, + writeOpenClawStateKvJson( + GATEWAY_RESTART_HANDOFF_KV_SCOPE, + GATEWAY_RESTART_HANDOFF_KV_KEY, + payload as unknown as OpenClawStateJsonValue, + { env }, ); - let fd: number | undefined; - try { - fd = fs.openSync(tmpPath, "wx", 0o600); - fs.writeFileSync(fd, `${JSON.stringify(payload)}\n`, "utf8"); - } finally { - if (fd !== undefined) { - fs.closeSync(fd); - } - } - fs.renameSync(tmpPath, handoffPath); return payload; } catch (err) { - if (tmpPath) { - unlinkRegularFileSync(tmpPath); - } handoffLog.warn(`failed to write gateway restart handoff: ${String(err)}`); return null; } @@ -323,10 +277,13 @@ export function readGatewayRestartHandoffSync( env: NodeJS.ProcessEnv = process.env, now = Date.now(), ): GatewayRestartHandoff | null { - const raw = readGatewayRestartHandoffRawSync(env); - if (!raw) { - return null; - } + const raw = readOpenClawStateKvJson( + GATEWAY_RESTART_HANDOFF_KV_SCOPE, + GATEWAY_RESTART_HANDOFF_KV_KEY, + { + env, + }, + ); const payload = parseGatewayRestartHandoff(raw); if (!payload || now < payload.createdAt || now > payload.expiresAt) { return null; @@ -341,21 +298,18 @@ export function consumeGatewayRestartHandoffForExitedProcessSync(opts: { now?: number; }): GatewayRestartHandoff | null { const env = opts.env ?? process.env; - const handoffPath = resolveGatewayRestartHandoffPath(env); - let raw: string | null = null; + const raw = readOpenClawStateKvJson( + GATEWAY_RESTART_HANDOFF_KV_SCOPE, + GATEWAY_RESTART_HANDOFF_KV_KEY, + { env }, + ); try { - const stat = fs.lstatSync(handoffPath); - if (!stat.isFile() || stat.nlink > 1 || stat.size > GATEWAY_RESTART_HANDOFF_MAX_BYTES) { - return null; - } - raw = fs.readFileSync(handoffPath, "utf8"); - } catch { - return null; - } finally { clearGatewayRestartHandoffSync(env); + } catch { + // best-effort cleanup } - const payload = raw ? parseGatewayRestartHandoff(raw) : null; + const payload = parseGatewayRestartHandoff(raw); const exitedPid = normalizePid(opts.exitedPid); if (!payload || exitedPid === null || payload.pid !== exitedPid) { return null; diff --git a/src/infra/restart-intent.test.ts b/src/infra/restart-intent.test.ts index 7abccce03f7..622a1f04385 100644 --- a/src/infra/restart-intent.test.ts +++ b/src/infra/restart-intent.test.ts @@ -2,6 +2,7 @@ import fs from "node:fs"; import os from "node:os"; import path from "node:path"; import { afterEach, describe, expect, it } from "vitest"; +import { closeOpenClawStateDatabaseForTest } from "../state/openclaw-state-db.js"; import { consumeGatewayRestartIntentPayloadSync, consumeGatewayRestartIntentSync, @@ -25,6 +26,7 @@ function intentPath(env: NodeJS.ProcessEnv): string { describe("gateway restart intent", () => { afterEach(() => { + closeOpenClawStateDatabaseForTest(); for (const dir of tempDirs.splice(0)) { fs.rmSync(dir, { force: true, recursive: true }); } @@ -53,15 +55,16 @@ describe("gateway restart intent", () => { fs.writeFileSync(intentPath(env), "x".repeat(2048), { encoding: "utf8", mode: 0o600 }); expect(consumeGatewayRestartIntentSync(env)).toBe(false); - expect(fs.existsSync(intentPath(env))).toBe(false); + expect(fs.existsSync(intentPath(env))).toBe(true); }); - it("writes intent files with owner-only permissions", () => { + it("stores intents in SQLite instead of a legacy JSON file", () => { const env = createIntentEnv(); expect(writeGatewayRestartIntentSync({ env, targetPid: process.pid })).toBe(true); - expect(fs.statSync(intentPath(env)).mode & 0o777).toBe(0o600); + expect(fs.existsSync(intentPath(env))).toBe(false); + expect(consumeGatewayRestartIntentSync(env)).toBe(true); }); it("round-trips restart force and wait options", () => { @@ -82,7 +85,7 @@ describe("gateway restart intent", () => { expect(fs.existsSync(intentPath(env))).toBe(false); }); - it("does not follow an existing intent-path symlink when writing", () => { + it("does not touch an existing legacy intent-path symlink when writing", () => { const env = createIntentEnv(); const targetPath = path.join(env.OPENCLAW_STATE_DIR ?? "", "attacker-target.txt"); fs.writeFileSync(targetPath, "keep", "utf8"); @@ -95,7 +98,7 @@ describe("gateway restart intent", () => { expect(writeGatewayRestartIntentSync({ env, targetPid: process.pid })).toBe(true); expect(fs.readFileSync(targetPath, "utf8")).toBe("keep"); - expect(fs.lstatSync(intentPath(env)).isSymbolicLink()).toBe(false); + expect(fs.lstatSync(intentPath(env)).isSymbolicLink()).toBe(true); expect(consumeGatewayRestartIntentSync(env)).toBe(true); }); }); diff --git a/src/infra/restart.ts b/src/infra/restart.ts index f6144080da5..790e5648248 100644 --- a/src/infra/restart.ts +++ b/src/infra/restart.ts @@ -1,15 +1,18 @@ import { spawnSync } from "node:child_process"; -import fs from "node:fs"; import os from "node:os"; import path from "node:path"; import { getRuntimeConfig } from "../config/config.js"; -import { resolveStateDir } from "../config/paths.js"; import { resolveGatewayLaunchAgentLabel, resolveGatewaySystemdServiceName, } from "../daemon/constants.js"; import { createSubsystemLogger } from "../logging/subsystem.js"; -import { replaceFileAtomicSync } from "./replace-file.js"; +import { + deleteOpenClawStateKvJson, + readOpenClawStateKvJson, + writeOpenClawStateKvJson, + type OpenClawStateJsonValue, +} from "../state/openclaw-state-kv.js"; import { cleanStaleGatewayProcessesSync, findGatewayPidsOnPortSync } from "./restart-stale-pids.js"; import type { RestartAttempt } from "./restart.types.js"; import { relaunchGatewayScheduledTask } from "./windows-task-restart.js"; @@ -23,9 +26,9 @@ const DEFAULT_DEFERRAL_STILL_PENDING_WARN_MS = 30_000; export const DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS = 300_000; const RESTART_COOLDOWN_MS = 30_000; const LAUNCHCTL_ALREADY_LOADED_EXIT_CODE = 37; -const GATEWAY_RESTART_INTENT_FILENAME = "gateway-restart-intent.json"; +const GATEWAY_RESTART_INTENT_KV_SCOPE = "gateway.restart-intent"; +const GATEWAY_RESTART_INTENT_KV_KEY = "current"; const GATEWAY_RESTART_INTENT_TTL_MS = 60_000; -const GATEWAY_RESTART_INTENT_MAX_BYTES = 1024; const restartLog = createSubsystemLogger("restart"); @@ -100,23 +103,6 @@ export type GatewayRestartIntent = { waitMs?: number; }; -function resolveGatewayRestartIntentPath(env: NodeJS.ProcessEnv = process.env): string { - return path.join(resolveStateDir(env), GATEWAY_RESTART_INTENT_FILENAME); -} - -function unlinkGatewayRestartIntentFileSync(intentPath: string): boolean { - try { - const stat = fs.lstatSync(intentPath); - if (!stat.isFile() || stat.nlink > 1) { - return false; - } - fs.unlinkSync(intentPath); - return true; - } catch { - return false; - } -} - function normalizeRestartIntentPid(pid: number | undefined): number | null { return typeof pid === "number" && Number.isSafeInteger(pid) && pid > 0 ? pid : null; } @@ -132,7 +118,6 @@ export function writeGatewayRestartIntentSync(opts: { } const env = opts.env ?? process.env; try { - const intentPath = resolveGatewayRestartIntentPath(env); const payload: GatewayRestartIntentPayload = { kind: "gateway-restart", pid: targetPid, @@ -144,12 +129,12 @@ export function writeGatewayRestartIntentSync(opts: { ? { waitMs: Math.floor(opts.intent.waitMs) } : {}), }; - replaceFileAtomicSync({ - filePath: intentPath, - content: `${JSON.stringify(payload)}\n`, - mode: 0o600, - tempPrefix: ".gateway-restart-intent", - }); + writeOpenClawStateKvJson( + GATEWAY_RESTART_INTENT_KV_SCOPE, + GATEWAY_RESTART_INTENT_KV_KEY, + payload as unknown as OpenClawStateJsonValue, + { env }, + ); return true; } catch (err) { restartLog.warn(`failed to write gateway restart intent: ${String(err)}`); @@ -158,33 +143,34 @@ export function writeGatewayRestartIntentSync(opts: { } export function clearGatewayRestartIntentSync(env: NodeJS.ProcessEnv = process.env): void { - unlinkGatewayRestartIntentFileSync(resolveGatewayRestartIntentPath(env)); + deleteOpenClawStateKvJson(GATEWAY_RESTART_INTENT_KV_SCOPE, GATEWAY_RESTART_INTENT_KV_KEY, { + env, + }); } -function parseGatewayRestartIntent(raw: string): GatewayRestartIntentPayload | null { - try { - const parsed = JSON.parse(raw) as Partial; - if ( - parsed.kind === "gateway-restart" && - typeof parsed.pid === "number" && - Number.isFinite(parsed.pid) && - typeof parsed.createdAt === "number" && - Number.isFinite(parsed.createdAt) && - (parsed.force === undefined || typeof parsed.force === "boolean") && - (parsed.waitMs === undefined || - (typeof parsed.waitMs === "number" && Number.isFinite(parsed.waitMs) && parsed.waitMs >= 0)) - ) { - return { - kind: "gateway-restart", - pid: parsed.pid, - createdAt: parsed.createdAt, - ...(parsed.force ? { force: true } : {}), - ...(typeof parsed.waitMs === "number" ? { waitMs: Math.floor(parsed.waitMs) } : {}), - }; - } - } catch { +function parseGatewayRestartIntent(parsed: unknown): GatewayRestartIntentPayload | null { + if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { return null; } + const value = parsed as Partial; + if ( + value.kind === "gateway-restart" && + typeof value.pid === "number" && + Number.isFinite(value.pid) && + typeof value.createdAt === "number" && + Number.isFinite(value.createdAt) && + (value.force === undefined || typeof value.force === "boolean") && + (value.waitMs === undefined || + (typeof value.waitMs === "number" && Number.isFinite(value.waitMs) && value.waitMs >= 0)) + ) { + return { + kind: "gateway-restart", + pid: value.pid, + createdAt: value.createdAt, + ...(value.force ? { force: true } : {}), + ...(typeof value.waitMs === "number" ? { waitMs: Math.floor(value.waitMs) } : {}), + }; + } return null; } @@ -192,18 +178,15 @@ export function consumeGatewayRestartIntentPayloadSync( env: NodeJS.ProcessEnv = process.env, now = Date.now(), ): GatewayRestartIntent | null { - const intentPath = resolveGatewayRestartIntentPath(env); - let raw: string; + const raw = readOpenClawStateKvJson( + GATEWAY_RESTART_INTENT_KV_SCOPE, + GATEWAY_RESTART_INTENT_KV_KEY, + { env }, + ); try { - const stat = fs.lstatSync(intentPath); - if (!stat.isFile() || stat.size > GATEWAY_RESTART_INTENT_MAX_BYTES) { - return null; - } - raw = fs.readFileSync(intentPath, "utf8"); - } catch { - return null; - } finally { clearGatewayRestartIntentSync(env); + } catch { + // best-effort cleanup } const payload = parseGatewayRestartIntent(raw); if (!payload) {