refactor: move restart handoff state to sqlite

This commit is contained in:
Peter Steinberger
2026-05-08 16:36:08 +01:00
parent 6bed3ac6cc
commit 85cd41e68e
6 changed files with 111 additions and 155 deletions

View File

@@ -376,6 +376,9 @@ The remaining cleanup is mostly consolidation and deletion:
- Gateway restart sentinel state now uses shared SQLite KV instead of
`restart-sentinel.json`; the old path resolver remains only for legacy
cleanup/status compatibility.
- Gateway restart intent and supervisor handoff state now use shared SQLite KV
instead of `gateway-restart-intent.json` and
`gateway-supervisor-restart-handoff.json` sidecars.
- Gateway singleton coordination now uses SQLite KV rows under
`gateway_locks` instead of writing `gateway.<hash>.lock` files. The lock
still records pid, config path, process start time, and stale-owner metadata,

View File

@@ -39,6 +39,11 @@ const legacyStoreMarkers = [
{ label: "ACPX process leases JSON", pattern: /\bprocess-leases\.json\b/u },
{ label: "ACPX gateway instance id file", pattern: /\bgateway-instance-id\b/u },
{ label: "gateway restart sentinel JSON", pattern: /\brestart-sentinel\.json\b/u },
{ label: "gateway restart intent JSON", pattern: /\bgateway-restart-intent\.json\b/u },
{
label: "gateway supervisor restart handoff JSON",
pattern: /\bgateway-supervisor-restart-handoff\.json\b/u,
},
{ label: "gateway singleton lock file", pattern: /\bgateway\.[A-Za-z0-9._-]+\.lock\b/u },
{ label: "QMD embed lock file", pattern: /\bqmd[/\\]embed\.lock\b/u },
{

View File

@@ -2,10 +2,10 @@ import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it } from "vitest";
import { closeOpenClawStateDatabaseForTest } from "../state/openclaw-state-db.js";
import {
consumeGatewayRestartHandoffForExitedProcessSync,
formatGatewayRestartHandoffDiagnostic,
GATEWAY_SUPERVISOR_RESTART_HANDOFF_FILENAME,
GATEWAY_SUPERVISOR_RESTART_HANDOFF_KIND,
readGatewayRestartHandoffSync,
writeGatewayRestartHandoffSync,
@@ -13,6 +13,8 @@ import {
import type { GatewayRestartHandoff } from "./restart-handoff.js";
const tempDirs: string[] = [];
const LEGACY_GATEWAY_SUPERVISOR_RESTART_HANDOFF_FILENAME =
"gateway-supervisor-restart-handoff.json";
function createHandoffEnv(): NodeJS.ProcessEnv {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-restart-handoff-"));
@@ -24,7 +26,10 @@ function createHandoffEnv(): NodeJS.ProcessEnv {
}
function handoffPath(env: NodeJS.ProcessEnv): string {
return path.join(env.OPENCLAW_STATE_DIR ?? "", GATEWAY_SUPERVISOR_RESTART_HANDOFF_FILENAME);
return path.join(
env.OPENCLAW_STATE_DIR ?? "",
LEGACY_GATEWAY_SUPERVISOR_RESTART_HANDOFF_FILENAME,
);
}
function expectWrittenHandoff(
@@ -39,6 +44,7 @@ function expectWrittenHandoff(
describe("gateway restart handoff", () => {
afterEach(() => {
closeOpenClawStateDatabaseForTest();
for (const dir of tempDirs.splice(0)) {
fs.rmSync(dir, { force: true, recursive: true });
}
@@ -69,7 +75,7 @@ describe("gateway restart handoff", () => {
createdAt: 1_000,
expiresAt: 61_000,
});
expect(fs.statSync(handoffPath(env)).mode & 0o777).toBe(0o600);
expect(fs.existsSync(handoffPath(env))).toBe(false);
expect(readGatewayRestartHandoffSync(env, 1_500)).toMatchObject({
pid: 12_345,
reason: "plugin source changed",
@@ -101,6 +107,7 @@ describe("gateway restart handoff", () => {
supervisorMode: "systemd",
});
expect(fs.existsSync(handoffPath(env))).toBe(false);
expect(readGatewayRestartHandoffSync(env, 2_001)).toBeNull();
});
it("rejects handoffs for a different exited pid and clears them", () => {
@@ -147,7 +154,7 @@ describe("gateway restart handoff", () => {
expect(fs.existsSync(handoffPath(env))).toBe(false);
});
it("rejects malformed handoff payloads", () => {
it("ignores malformed legacy handoff files", () => {
const env = createHandoffEnv();
fs.writeFileSync(
@@ -168,9 +175,10 @@ describe("gateway restart handoff", () => {
);
expect(readGatewayRestartHandoffSync(env, 1_001)).toBeNull();
expect(fs.existsSync(handoffPath(env))).toBe(true);
});
it("rejects expired and oversized handoff files", () => {
it("rejects expired SQLite handoffs and ignores oversized legacy files", () => {
const env = createHandoffEnv();
expectWrittenHandoff({
@@ -191,10 +199,10 @@ describe("gateway restart handoff", () => {
now: 2_001,
}),
).toBeNull();
expect(fs.existsSync(handoffPath(env))).toBe(false);
expect(fs.existsSync(handoffPath(env))).toBe(true);
});
it("rejects persisted handoffs with a ttl longer than the supported window", () => {
it("ignores legacy persisted handoffs with a ttl longer than the supported window", () => {
const env = createHandoffEnv();
fs.writeFileSync(
@@ -221,10 +229,10 @@ describe("gateway restart handoff", () => {
now: 1_001,
}),
).toBeNull();
expect(fs.existsSync(handoffPath(env))).toBe(false);
expect(fs.existsSync(handoffPath(env))).toBe(true);
});
it("does not follow an existing handoff-path symlink when writing", () => {
it("does not touch an existing legacy handoff-path symlink when writing", () => {
const env = createHandoffEnv();
const targetPath = path.join(env.OPENCLAW_STATE_DIR ?? "", "attacker-target.txt");
fs.writeFileSync(targetPath, "keep", "utf8");
@@ -242,7 +250,7 @@ describe("gateway restart handoff", () => {
});
expect(fs.readFileSync(targetPath, "utf8")).toBe("keep");
expect(fs.lstatSync(handoffPath(env)).isSymbolicLink()).toBe(false);
expect(fs.lstatSync(handoffPath(env)).isSymbolicLink()).toBe(true);
expect(
consumeGatewayRestartHandoffForExitedProcessSync({
env,

View File

@@ -1,14 +1,16 @@
import { randomUUID } from "node:crypto";
import fs from "node:fs";
import path from "node:path";
import { resolveStateDir } from "../config/paths.js";
import { createSubsystemLogger } from "../logging/subsystem.js";
import {
deleteOpenClawStateKvJson,
readOpenClawStateKvJson,
writeOpenClawStateKvJson,
type OpenClawStateJsonValue,
} from "../state/openclaw-state-kv.js";
export const GATEWAY_SUPERVISOR_RESTART_HANDOFF_FILENAME =
"gateway-supervisor-restart-handoff.json";
export const GATEWAY_SUPERVISOR_RESTART_HANDOFF_KIND = "gateway-supervisor-restart-handoff";
const GATEWAY_RESTART_HANDOFF_TTL_MS = 60_000;
const GATEWAY_RESTART_HANDOFF_MAX_BYTES = 4096;
const GATEWAY_RESTART_HANDOFF_KV_SCOPE = "gateway.restart-handoff";
const GATEWAY_RESTART_HANDOFF_KV_KEY = "current";
const MAX_INTENT_ID_LENGTH = 120;
const MAX_PROCESS_INSTANCE_ID_LENGTH = 120;
const MAX_REASON_LENGTH = 200;
@@ -87,25 +89,10 @@ export function formatGatewayRestartHandoffDiagnostic(
return `Recent restart handoff: ${detail.join("; ")}`;
}
function resolveGatewayRestartHandoffPath(env: NodeJS.ProcessEnv = process.env): string {
return path.join(resolveStateDir(env), GATEWAY_SUPERVISOR_RESTART_HANDOFF_FILENAME);
}
function unlinkRegularFileSync(filePath: string): boolean {
try {
const stat = fs.lstatSync(filePath);
if (!stat.isFile() || stat.nlink > 1) {
return false;
}
fs.unlinkSync(filePath);
return true;
} catch {
return false;
}
}
export function clearGatewayRestartHandoffSync(env: NodeJS.ProcessEnv = process.env): void {
unlinkRegularFileSync(resolveGatewayRestartHandoffPath(env));
deleteOpenClawStateKvJson(GATEWAY_RESTART_HANDOFF_KV_SCOPE, GATEWAY_RESTART_HANDOFF_KV_KEY, {
env,
});
}
function normalizePid(pid: number | undefined): number | null {
@@ -182,13 +169,7 @@ function isRecord(value: unknown): value is Record<string, unknown> {
return typeof value === "object" && value !== null && !Array.isArray(value);
}
function parseGatewayRestartHandoff(raw: string): GatewayRestartHandoff | null {
let parsed: unknown;
try {
parsed = JSON.parse(raw);
} catch {
return null;
}
function parseGatewayRestartHandoff(parsed: unknown): GatewayRestartHandoff | null {
if (!isRecord(parsed)) {
return null;
}
@@ -236,19 +217,6 @@ function parseGatewayRestartHandoff(raw: string): GatewayRestartHandoff | null {
};
}
function readGatewayRestartHandoffRawSync(env: NodeJS.ProcessEnv): string | null {
const handoffPath = resolveGatewayRestartHandoffPath(env);
try {
const stat = fs.lstatSync(handoffPath);
if (!stat.isFile() || stat.nlink > 1 || stat.size > GATEWAY_RESTART_HANDOFF_MAX_BYTES) {
return null;
}
return fs.readFileSync(handoffPath, "utf8");
} catch {
return null;
}
}
export function writeGatewayRestartHandoffSync(opts: {
env?: NodeJS.ProcessEnv;
pid?: number;
@@ -291,29 +259,15 @@ export function writeGatewayRestartHandoffSync(opts: {
supervisorMode,
};
let tmpPath: string | undefined;
try {
const handoffPath = resolveGatewayRestartHandoffPath(env);
fs.mkdirSync(path.dirname(handoffPath), { recursive: true });
tmpPath = path.join(
path.dirname(handoffPath),
`.${path.basename(handoffPath)}.${process.pid}.${Date.now()}.${randomUUID()}.tmp`,
writeOpenClawStateKvJson<OpenClawStateJsonValue>(
GATEWAY_RESTART_HANDOFF_KV_SCOPE,
GATEWAY_RESTART_HANDOFF_KV_KEY,
payload as unknown as OpenClawStateJsonValue,
{ env },
);
let fd: number | undefined;
try {
fd = fs.openSync(tmpPath, "wx", 0o600);
fs.writeFileSync(fd, `${JSON.stringify(payload)}\n`, "utf8");
} finally {
if (fd !== undefined) {
fs.closeSync(fd);
}
}
fs.renameSync(tmpPath, handoffPath);
return payload;
} catch (err) {
if (tmpPath) {
unlinkRegularFileSync(tmpPath);
}
handoffLog.warn(`failed to write gateway restart handoff: ${String(err)}`);
return null;
}
@@ -323,10 +277,13 @@ export function readGatewayRestartHandoffSync(
env: NodeJS.ProcessEnv = process.env,
now = Date.now(),
): GatewayRestartHandoff | null {
const raw = readGatewayRestartHandoffRawSync(env);
if (!raw) {
return null;
}
const raw = readOpenClawStateKvJson(
GATEWAY_RESTART_HANDOFF_KV_SCOPE,
GATEWAY_RESTART_HANDOFF_KV_KEY,
{
env,
},
);
const payload = parseGatewayRestartHandoff(raw);
if (!payload || now < payload.createdAt || now > payload.expiresAt) {
return null;
@@ -341,21 +298,18 @@ export function consumeGatewayRestartHandoffForExitedProcessSync(opts: {
now?: number;
}): GatewayRestartHandoff | null {
const env = opts.env ?? process.env;
const handoffPath = resolveGatewayRestartHandoffPath(env);
let raw: string | null = null;
const raw = readOpenClawStateKvJson(
GATEWAY_RESTART_HANDOFF_KV_SCOPE,
GATEWAY_RESTART_HANDOFF_KV_KEY,
{ env },
);
try {
const stat = fs.lstatSync(handoffPath);
if (!stat.isFile() || stat.nlink > 1 || stat.size > GATEWAY_RESTART_HANDOFF_MAX_BYTES) {
return null;
}
raw = fs.readFileSync(handoffPath, "utf8");
} catch {
return null;
} finally {
clearGatewayRestartHandoffSync(env);
} catch {
// best-effort cleanup
}
const payload = raw ? parseGatewayRestartHandoff(raw) : null;
const payload = parseGatewayRestartHandoff(raw);
const exitedPid = normalizePid(opts.exitedPid);
if (!payload || exitedPid === null || payload.pid !== exitedPid) {
return null;

View File

@@ -2,6 +2,7 @@ import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it } from "vitest";
import { closeOpenClawStateDatabaseForTest } from "../state/openclaw-state-db.js";
import {
consumeGatewayRestartIntentPayloadSync,
consumeGatewayRestartIntentSync,
@@ -25,6 +26,7 @@ function intentPath(env: NodeJS.ProcessEnv): string {
describe("gateway restart intent", () => {
afterEach(() => {
closeOpenClawStateDatabaseForTest();
for (const dir of tempDirs.splice(0)) {
fs.rmSync(dir, { force: true, recursive: true });
}
@@ -53,15 +55,16 @@ describe("gateway restart intent", () => {
fs.writeFileSync(intentPath(env), "x".repeat(2048), { encoding: "utf8", mode: 0o600 });
expect(consumeGatewayRestartIntentSync(env)).toBe(false);
expect(fs.existsSync(intentPath(env))).toBe(false);
expect(fs.existsSync(intentPath(env))).toBe(true);
});
it("writes intent files with owner-only permissions", () => {
it("stores intents in SQLite instead of a legacy JSON file", () => {
const env = createIntentEnv();
expect(writeGatewayRestartIntentSync({ env, targetPid: process.pid })).toBe(true);
expect(fs.statSync(intentPath(env)).mode & 0o777).toBe(0o600);
expect(fs.existsSync(intentPath(env))).toBe(false);
expect(consumeGatewayRestartIntentSync(env)).toBe(true);
});
it("round-trips restart force and wait options", () => {
@@ -82,7 +85,7 @@ describe("gateway restart intent", () => {
expect(fs.existsSync(intentPath(env))).toBe(false);
});
it("does not follow an existing intent-path symlink when writing", () => {
it("does not touch an existing legacy intent-path symlink when writing", () => {
const env = createIntentEnv();
const targetPath = path.join(env.OPENCLAW_STATE_DIR ?? "", "attacker-target.txt");
fs.writeFileSync(targetPath, "keep", "utf8");
@@ -95,7 +98,7 @@ describe("gateway restart intent", () => {
expect(writeGatewayRestartIntentSync({ env, targetPid: process.pid })).toBe(true);
expect(fs.readFileSync(targetPath, "utf8")).toBe("keep");
expect(fs.lstatSync(intentPath(env)).isSymbolicLink()).toBe(false);
expect(fs.lstatSync(intentPath(env)).isSymbolicLink()).toBe(true);
expect(consumeGatewayRestartIntentSync(env)).toBe(true);
});
});

View File

@@ -1,15 +1,18 @@
import { spawnSync } from "node:child_process";
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { getRuntimeConfig } from "../config/config.js";
import { resolveStateDir } from "../config/paths.js";
import {
resolveGatewayLaunchAgentLabel,
resolveGatewaySystemdServiceName,
} from "../daemon/constants.js";
import { createSubsystemLogger } from "../logging/subsystem.js";
import { replaceFileAtomicSync } from "./replace-file.js";
import {
deleteOpenClawStateKvJson,
readOpenClawStateKvJson,
writeOpenClawStateKvJson,
type OpenClawStateJsonValue,
} from "../state/openclaw-state-kv.js";
import { cleanStaleGatewayProcessesSync, findGatewayPidsOnPortSync } from "./restart-stale-pids.js";
import type { RestartAttempt } from "./restart.types.js";
import { relaunchGatewayScheduledTask } from "./windows-task-restart.js";
@@ -23,9 +26,9 @@ const DEFAULT_DEFERRAL_STILL_PENDING_WARN_MS = 30_000;
export const DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS = 300_000;
const RESTART_COOLDOWN_MS = 30_000;
const LAUNCHCTL_ALREADY_LOADED_EXIT_CODE = 37;
const GATEWAY_RESTART_INTENT_FILENAME = "gateway-restart-intent.json";
const GATEWAY_RESTART_INTENT_KV_SCOPE = "gateway.restart-intent";
const GATEWAY_RESTART_INTENT_KV_KEY = "current";
const GATEWAY_RESTART_INTENT_TTL_MS = 60_000;
const GATEWAY_RESTART_INTENT_MAX_BYTES = 1024;
const restartLog = createSubsystemLogger("restart");
@@ -100,23 +103,6 @@ export type GatewayRestartIntent = {
waitMs?: number;
};
function resolveGatewayRestartIntentPath(env: NodeJS.ProcessEnv = process.env): string {
return path.join(resolveStateDir(env), GATEWAY_RESTART_INTENT_FILENAME);
}
function unlinkGatewayRestartIntentFileSync(intentPath: string): boolean {
try {
const stat = fs.lstatSync(intentPath);
if (!stat.isFile() || stat.nlink > 1) {
return false;
}
fs.unlinkSync(intentPath);
return true;
} catch {
return false;
}
}
function normalizeRestartIntentPid(pid: number | undefined): number | null {
return typeof pid === "number" && Number.isSafeInteger(pid) && pid > 0 ? pid : null;
}
@@ -132,7 +118,6 @@ export function writeGatewayRestartIntentSync(opts: {
}
const env = opts.env ?? process.env;
try {
const intentPath = resolveGatewayRestartIntentPath(env);
const payload: GatewayRestartIntentPayload = {
kind: "gateway-restart",
pid: targetPid,
@@ -144,12 +129,12 @@ export function writeGatewayRestartIntentSync(opts: {
? { waitMs: Math.floor(opts.intent.waitMs) }
: {}),
};
replaceFileAtomicSync({
filePath: intentPath,
content: `${JSON.stringify(payload)}\n`,
mode: 0o600,
tempPrefix: ".gateway-restart-intent",
});
writeOpenClawStateKvJson<OpenClawStateJsonValue>(
GATEWAY_RESTART_INTENT_KV_SCOPE,
GATEWAY_RESTART_INTENT_KV_KEY,
payload as unknown as OpenClawStateJsonValue,
{ env },
);
return true;
} catch (err) {
restartLog.warn(`failed to write gateway restart intent: ${String(err)}`);
@@ -158,33 +143,34 @@ export function writeGatewayRestartIntentSync(opts: {
}
export function clearGatewayRestartIntentSync(env: NodeJS.ProcessEnv = process.env): void {
unlinkGatewayRestartIntentFileSync(resolveGatewayRestartIntentPath(env));
deleteOpenClawStateKvJson(GATEWAY_RESTART_INTENT_KV_SCOPE, GATEWAY_RESTART_INTENT_KV_KEY, {
env,
});
}
function parseGatewayRestartIntent(raw: string): GatewayRestartIntentPayload | null {
try {
const parsed = JSON.parse(raw) as Partial<GatewayRestartIntentPayload>;
if (
parsed.kind === "gateway-restart" &&
typeof parsed.pid === "number" &&
Number.isFinite(parsed.pid) &&
typeof parsed.createdAt === "number" &&
Number.isFinite(parsed.createdAt) &&
(parsed.force === undefined || typeof parsed.force === "boolean") &&
(parsed.waitMs === undefined ||
(typeof parsed.waitMs === "number" && Number.isFinite(parsed.waitMs) && parsed.waitMs >= 0))
) {
return {
kind: "gateway-restart",
pid: parsed.pid,
createdAt: parsed.createdAt,
...(parsed.force ? { force: true } : {}),
...(typeof parsed.waitMs === "number" ? { waitMs: Math.floor(parsed.waitMs) } : {}),
};
}
} catch {
function parseGatewayRestartIntent(parsed: unknown): GatewayRestartIntentPayload | null {
if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
return null;
}
const value = parsed as Partial<GatewayRestartIntentPayload>;
if (
value.kind === "gateway-restart" &&
typeof value.pid === "number" &&
Number.isFinite(value.pid) &&
typeof value.createdAt === "number" &&
Number.isFinite(value.createdAt) &&
(value.force === undefined || typeof value.force === "boolean") &&
(value.waitMs === undefined ||
(typeof value.waitMs === "number" && Number.isFinite(value.waitMs) && value.waitMs >= 0))
) {
return {
kind: "gateway-restart",
pid: value.pid,
createdAt: value.createdAt,
...(value.force ? { force: true } : {}),
...(typeof value.waitMs === "number" ? { waitMs: Math.floor(value.waitMs) } : {}),
};
}
return null;
}
@@ -192,18 +178,15 @@ export function consumeGatewayRestartIntentPayloadSync(
env: NodeJS.ProcessEnv = process.env,
now = Date.now(),
): GatewayRestartIntent | null {
const intentPath = resolveGatewayRestartIntentPath(env);
let raw: string;
const raw = readOpenClawStateKvJson(
GATEWAY_RESTART_INTENT_KV_SCOPE,
GATEWAY_RESTART_INTENT_KV_KEY,
{ env },
);
try {
const stat = fs.lstatSync(intentPath);
if (!stat.isFile() || stat.size > GATEWAY_RESTART_INTENT_MAX_BYTES) {
return null;
}
raw = fs.readFileSync(intentPath, "utf8");
} catch {
return null;
} finally {
clearGatewayRestartIntentSync(env);
} catch {
// best-effort cleanup
}
const payload = parseGatewayRestartIntent(raw);
if (!payload) {