From 1be39d4250e3d124da12d0723705d0281b113c94 Mon Sep 17 00:00:00 2001 From: Tak Hoffman <781889+Takhoffman@users.noreply.github.com> Date: Tue, 3 Mar 2026 21:31:12 -0600 Subject: [PATCH] fix(gateway): synthesize lifecycle robustness for restart and startup probes (#33831) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(gateway): correct launchctl command sequence for gateway restart (closes #20030) * fix(restart): expand HOME and escape label in launchctl plist path * fix(restart): poll port free after SIGKILL to prevent EADDRINUSE restart loop When cleanStaleGatewayProcessesSync() kills a stale gateway process, the kernel may not immediately release the TCP port. Previously the function returned after a fixed 500ms sleep (300ms SIGTERM + 200ms SIGKILL), allowing triggerOpenClawRestart() to hand off to systemd before the port was actually free. The new systemd process then raced the dying socket for port 18789, hit EADDRINUSE, and exited with status 1, causing systemd to retry indefinitely — the zombie restart loop reported in #33103. Fix: add waitForPortFreeSync() that polls lsof at 50ms intervals for up to 2 seconds after SIGKILL. cleanStaleGatewayProcessesSync() now blocks until the port is confirmed free (or the budget expires with a warning) before returning. The increased SIGTERM/SIGKILL wait budgets (600ms / 400ms) also give slow processes more time to exit cleanly. Fixes #33103 Related: #28134 * fix: add EADDRINUSE retry and TIME_WAIT port-bind checks for gateway startup * fix(ports): treat EADDRNOTAVAIL as non-retryable and fix flaky test * fix(gateway): hot-reload agents.defaults.models allowlist changes The reload plan had a rule for `agents.defaults.model` (singular) but not `agents.defaults.models` (plural — the allowlist array). Because `agents.defaults.models` does not prefix-match `agents.defaults.model.`, it fell through to the catch-all `agents` tail rule (kind=none), so allowlist edits in openclaw.json were silently ignored at runtime. Add a dedicated reload rule so changes to the models allowlist trigger a heartbeat restart, which re-reads the config and serves the updated list to clients. Fixes #33600 Co-authored-by: HCL Signed-off-by: HCL * test(restart): 100% branch coverage — audit round 2 Audit findings fixed: - remove dead guard: terminateStaleProcessesSync pids.length===0 check was unreachable (only caller cleanStaleGatewayProcessesSync already guards) - expose __testing.callSleepSyncRaw so sleepSync's real Atomics.wait path can be unit-tested directly without going through the override - fix broken sleepSync Atomics.wait test: previous test set override=null but cleanStaleGatewayProcessesSync returned before calling sleepSync — replaced with direct callSleepSyncRaw calls that actually exercise L36/L42-47 - fix pid collision: two tests used process.pid+304 (EPERM + dead-at-SIGTERM); EPERM test changed to process.pid+305 - fix misindented tests: 'deduplicates pids' and 'lsof status 1 container edge case' were outside their intended describe blocks; moved to correct scopes (findGatewayPidsOnPortSync and pollPortOnce respectively) - add missing branch tests: - status 1 + non-empty stdout with zero openclaw pids → free:true (L145) - mid-loop non-openclaw cmd in &&-chain (L67) - consecutive p-lines without c-line between them (L67) - invalid PID in p-line (p0 / pNaN) — ternary false branch (L67) - unknown lsof output line (else-if false branch L69) Coverage: 100% stmts / 100% branch / 100% funcs / 100% lines (36 tests) * test(restart): fix stale-pid test typing for tsgo * fix(gateway): address lifecycle review findings * test(update): make restart-helper path assertions windows-safe --------- Signed-off-by: HCL Co-authored-by: Glucksberg Co-authored-by: Efe Büken Co-authored-by: Riccardo Marino Co-authored-by: HCL --- .../gateway-cli/run.option-collisions.test.ts | 7 + src/cli/gateway-cli/run.ts | 48 +- src/cli/ports.test.ts | 124 +++ src/cli/ports.ts | 61 ++ src/cli/update-cli/restart-helper.test.ts | 41 + src/cli/update-cli/restart-helper.ts | 12 +- src/gateway/config-reload-plan.ts | 5 + src/gateway/config-reload.test.ts | 8 + src/gateway/server/http-listen.test.ts | 100 +++ src/gateway/server/http-listen.ts | 68 +- src/infra/restart-stale-pids.test.ts | 810 ++++++++++++++++++ src/infra/restart-stale-pids.ts | 205 ++++- src/infra/restart.ts | 36 +- 13 files changed, 1462 insertions(+), 63 deletions(-) create mode 100644 src/cli/ports.test.ts create mode 100644 src/gateway/server/http-listen.test.ts create mode 100644 src/infra/restart-stale-pids.test.ts diff --git a/src/cli/gateway-cli/run.option-collisions.test.ts b/src/cli/gateway-cli/run.option-collisions.test.ts index 95245a91989..b26b4c86e47 100644 --- a/src/cli/gateway-cli/run.option-collisions.test.ts +++ b/src/cli/gateway-cli/run.option-collisions.test.ts @@ -12,6 +12,7 @@ const forceFreePortAndWait = vi.fn(async (_port: number, _opts: unknown) => ({ waitedMs: 0, escalatedToSigkill: false, })); +const waitForPortBindable = vi.fn(async (_port: number, _opts?: unknown) => 0); const ensureDevGatewayConfig = vi.fn(async (_opts?: unknown) => {}); const runGatewayLoop = vi.fn(async ({ start }: { start: () => Promise }) => { await start(); @@ -80,6 +81,7 @@ vi.mock("../command-format.js", () => ({ vi.mock("../ports.js", () => ({ forceFreePortAndWait: (port: number, opts: unknown) => forceFreePortAndWait(port, opts), + waitForPortBindable: (port: number, opts?: unknown) => waitForPortBindable(port, opts), })); vi.mock("./dev.js", () => ({ @@ -108,6 +110,7 @@ describe("gateway run option collisions", () => { setGatewayWsLogStyle.mockClear(); setVerbose.mockClear(); forceFreePortAndWait.mockClear(); + waitForPortBindable.mockClear(); ensureDevGatewayConfig.mockClear(); runGatewayLoop.mockClear(); }); @@ -140,6 +143,10 @@ describe("gateway run option collisions", () => { ]); expect(forceFreePortAndWait).toHaveBeenCalledWith(18789, expect.anything()); + expect(waitForPortBindable).toHaveBeenCalledWith( + 18789, + expect.objectContaining({ host: "127.0.0.1" }), + ); expect(setGatewayWsLogStyle).toHaveBeenCalledWith("full"); expect(startGatewayServer).toHaveBeenCalledWith( 18789, diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts index 291328273e3..666adc289a6 100644 --- a/src/cli/gateway-cli/run.ts +++ b/src/cli/gateway-cli/run.ts @@ -21,7 +21,7 @@ import { createSubsystemLogger } from "../../logging/subsystem.js"; import { defaultRuntime } from "../../runtime.js"; import { formatCliCommand } from "../command-format.js"; import { inheritOptionFromParent } from "../command-options.js"; -import { forceFreePortAndWait } from "../ports.js"; +import { forceFreePortAndWait, waitForPortBindable } from "../ports.js"; import { ensureDevGatewayConfig } from "./dev.js"; import { runGatewayLoop } from "./run-loop.js"; import { @@ -186,6 +186,20 @@ async function runGatewayCommand(opts: GatewayRunOpts) { defaultRuntime.error("Invalid port"); defaultRuntime.exit(1); } + const bindRaw = toOptionString(opts.bind) ?? cfg.gateway?.bind ?? "loopback"; + const bind = + bindRaw === "loopback" || + bindRaw === "lan" || + bindRaw === "auto" || + bindRaw === "custom" || + bindRaw === "tailnet" + ? bindRaw + : null; + if (!bind) { + defaultRuntime.error('Invalid --bind (use "loopback", "lan", "tailnet", "auto", or "custom")'); + defaultRuntime.exit(1); + return; + } if (opts.force) { try { const { killed, waitedMs, escalatedToSigkill } = await forceFreePortAndWait(port, { @@ -208,6 +222,23 @@ async function runGatewayCommand(opts: GatewayRunOpts) { gatewayLog.info(`force: waited ${waitedMs}ms for port ${port} to free`); } } + // After killing, verify the port is actually bindable (handles TIME_WAIT). + const bindProbeHost = + bind === "loopback" + ? "127.0.0.1" + : bind === "lan" + ? "0.0.0.0" + : bind === "custom" + ? toOptionString(cfg.gateway?.customBindHost) + : undefined; + const bindWaitMs = await waitForPortBindable(port, { + timeoutMs: 3000, + intervalMs: 150, + host: bindProbeHost, + }); + if (bindWaitMs > 0) { + gatewayLog.info(`force: waited ${bindWaitMs}ms for port ${port} to become bindable`); + } } catch (err) { defaultRuntime.error(`Force: ${String(err)}`); defaultRuntime.exit(1); @@ -257,21 +288,6 @@ async function runGatewayCommand(opts: GatewayRunOpts) { defaultRuntime.exit(1); return; } - const bindRaw = toOptionString(opts.bind) ?? cfg.gateway?.bind ?? "loopback"; - const bind = - bindRaw === "loopback" || - bindRaw === "lan" || - bindRaw === "auto" || - bindRaw === "custom" || - bindRaw === "tailnet" - ? bindRaw - : null; - if (!bind) { - defaultRuntime.error('Invalid --bind (use "loopback", "lan", "tailnet", "auto", or "custom")'); - defaultRuntime.exit(1); - return; - } - const miskeys = extractGatewayMiskeys(snapshot?.parsed); const authOverride = authMode || passwordRaw || tokenRaw || authModeRaw diff --git a/src/cli/ports.test.ts b/src/cli/ports.test.ts new file mode 100644 index 00000000000..082dfa09a12 --- /dev/null +++ b/src/cli/ports.test.ts @@ -0,0 +1,124 @@ +import { EventEmitter } from "node:events"; +import net from "node:net"; +import { describe, expect, it, vi } from "vitest"; + +// Hoist the factory so vi.mock can access it. +const mockCreateServer = vi.hoisted(() => vi.fn()); + +vi.mock("node:net", async (importOriginal) => { + const actual = await importOriginal(); + return { ...actual, createServer: mockCreateServer }; +}); + +import { probePortFree, waitForPortBindable } from "./ports.js"; + +/** Build a minimal fake net.Server that emits a given error code on listen(). */ +function makeErrServer(code: string): net.Server { + const err = Object.assign(new Error(`bind error: ${code}`), { + code, + }) as NodeJS.ErrnoException; + + const fake = new EventEmitter() as unknown as net.Server; + (fake as unknown as { close: (cb?: () => void) => net.Server }).close = (cb?: () => void) => { + cb?.(); + return fake; + }; + (fake as unknown as { unref: () => net.Server }).unref = () => fake; + (fake as unknown as { listen: (...args: unknown[]) => net.Server }).listen = ( + ..._args: unknown[] + ) => { + setImmediate(() => fake.emit("error", err)); + return fake; + }; + return fake; +} + +describe("probePortFree", () => { + it("resolves false (not rejects) when bind returns EADDRINUSE", async () => { + mockCreateServer.mockReturnValue(makeErrServer("EADDRINUSE")); + await expect(probePortFree(9999, "127.0.0.1")).resolves.toBe(false); + }); + + it("rejects immediately for EADDRNOTAVAIL (non-retryable: host address not on any interface)", async () => { + mockCreateServer.mockReturnValue(makeErrServer("EADDRNOTAVAIL")); + await expect(probePortFree(9999, "192.0.2.1")).rejects.toMatchObject({ code: "EADDRNOTAVAIL" }); + }); + + it("rejects immediately for EACCES (non-retryable bind error)", async () => { + mockCreateServer.mockReturnValue(makeErrServer("EACCES")); + await expect(probePortFree(80, "0.0.0.0")).rejects.toMatchObject({ code: "EACCES" }); + }); + + it("rejects immediately for other non-retryable errors", async () => { + mockCreateServer.mockReturnValue(makeErrServer("EINVAL")); + await expect(probePortFree(9999, "0.0.0.0")).rejects.toMatchObject({ code: "EINVAL" }); + }); + + it("resolves true when the port is free", async () => { + // Mock a successful bind: the "listening" event fires immediately without + // acquiring a real socket, making this deterministic and avoiding TOCTOU races. + // (A real-socket approach would bind to :0, release, then reprobe — the OS can + // reassign the ephemeral port in between, causing a flaky EADDRINUSE failure.) + const fakeServer = new EventEmitter() as unknown as net.Server; + (fakeServer as unknown as { close: (cb?: () => void) => net.Server }).close = ( + cb?: () => void, + ) => { + cb?.(); + return fakeServer; + }; + (fakeServer as unknown as { unref: () => net.Server }).unref = () => fakeServer; + (fakeServer as unknown as { listen: (...args: unknown[]) => net.Server }).listen = ( + ..._args: unknown[] + ) => { + // Simulate a successful bind by firing the "listening" callback. + const callback = _args.find((a) => typeof a === "function") as (() => void) | undefined; + setImmediate(() => callback?.()); + return fakeServer; + }; + mockCreateServer.mockReturnValue(fakeServer); + + const result = await probePortFree(9999, "127.0.0.1"); + expect(result).toBe(true); + }); +}); + +describe("waitForPortBindable", () => { + it("probes the provided host when waiting for bindability", async () => { + const listenCalls: Array<{ port: number; host: string }> = []; + const fakeServer = new EventEmitter() as unknown as net.Server; + (fakeServer as unknown as { close: (cb?: () => void) => net.Server }).close = ( + cb?: () => void, + ) => { + cb?.(); + return fakeServer; + }; + (fakeServer as unknown as { unref: () => net.Server }).unref = () => fakeServer; + (fakeServer as unknown as { listen: (...args: unknown[]) => net.Server }).listen = ( + ...args: unknown[] + ) => { + const [port, host] = args as [number, string]; + listenCalls.push({ port, host }); + const callback = args.find((a) => typeof a === "function") as (() => void) | undefined; + setImmediate(() => callback?.()); + return fakeServer; + }; + mockCreateServer.mockReturnValue(fakeServer); + + await expect( + waitForPortBindable(9999, { timeoutMs: 100, intervalMs: 10, host: "127.0.0.1" }), + ).resolves.toBe(0); + expect(listenCalls[0]).toEqual({ port: 9999, host: "127.0.0.1" }); + }); + + it("propagates EACCES rejection immediately without retrying", async () => { + // Every call to createServer will emit EACCES — so if waitForPortBindable retried, + // mockCreateServer would be called many times. We assert it's called exactly once. + mockCreateServer.mockClear(); + mockCreateServer.mockReturnValue(makeErrServer("EACCES")); + await expect( + waitForPortBindable(80, { timeoutMs: 5000, intervalMs: 50 }), + ).rejects.toMatchObject({ code: "EACCES" }); + // Only one probe should have been attempted — no spinning through the retry loop. + expect(mockCreateServer).toHaveBeenCalledTimes(1); + }); +}); diff --git a/src/cli/ports.ts b/src/cli/ports.ts index e2bfa67aad9..fd829bf21db 100644 --- a/src/cli/ports.ts +++ b/src/cli/ports.ts @@ -1,4 +1,5 @@ import { execFileSync } from "node:child_process"; +import { createServer } from "node:net"; import { resolveLsofCommandSync } from "../infra/ports-lsof.js"; import { tryListenOnPort } from "../infra/ports-probe.js"; import { sleep } from "../utils.js"; @@ -324,3 +325,63 @@ export async function forceFreePortAndWait( `port ${port} still has listeners after --force: ${still.map((p) => p.pid).join(", ")}`, ); } + +/** + * Attempt a real TCP bind to verify the port is available at the OS level. + * Catches TIME_WAIT / kernel-level holds that lsof won't show. + * + * Resolves false only for EADDRINUSE — a genuinely transient condition + * (port still in TIME_WAIT after a --force kill) that the caller should retry. + * + * All other errors are non-retryable and are rejected immediately: + * - EADDRNOTAVAIL: the host address doesn't exist on any local interface + * (hard misconfiguration, not a transient kernel hold). + * - EACCES: bind to a privileged port as non-root. + * - EINVAL, etc.: other unrecoverable OS errors. + */ +export function probePortFree(port: number, host = "0.0.0.0"): Promise { + return new Promise((resolve, reject) => { + const srv = createServer(); + srv.unref(); + srv.once("error", (err: NodeJS.ErrnoException) => { + srv.close(); + if (err.code === "EADDRINUSE") { + // Genuinely transient — port still in use or TIME_WAIT after a --force kill. + resolve(false); + } else { + // Non-retryable: EADDRNOTAVAIL (bad host address), EACCES (privileged port), + // EINVAL, and any other OS errors. Surface immediately; no retry loop. + reject(err); + } + }); + srv.listen(port, host, () => { + srv.close(() => resolve(true)); + }); + }); +} + +/** + * Poll until a real test-bind succeeds, up to `timeoutMs`. + * Returns the number of ms waited, or throws if the port never freed. + */ +export async function waitForPortBindable( + port: number, + opts: { timeoutMs?: number; intervalMs?: number; host?: string } = {}, +): Promise { + const timeoutMs = Math.max(opts.timeoutMs ?? 3000, 0); + const intervalMs = Math.max(opts.intervalMs ?? 150, 1); + const host = opts.host; + let waited = 0; + while (waited < timeoutMs) { + if (await probePortFree(port, host)) { + return waited; + } + await sleep(intervalMs); + waited += intervalMs; + } + // Final attempt + if (await probePortFree(port, host)) { + return waited; + } + throw new Error(`port ${port} still not bindable after ${waited}ms (TIME_WAIT or kernel hold)`); +} diff --git a/src/cli/update-cli/restart-helper.test.ts b/src/cli/update-cli/restart-helper.test.ts index a152f3fdb48..18888c27f53 100644 --- a/src/cli/update-cli/restart-helper.test.ts +++ b/src/cli/update-cli/restart-helper.test.ts @@ -98,6 +98,8 @@ describe("restart-helper", () => { expect(scriptPath.endsWith(".sh")).toBe(true); expect(content).toContain("#!/bin/sh"); expect(content).toContain("launchctl kickstart -k 'gui/501/ai.openclaw.gateway'"); + // Should fall back to bootstrap when kickstart fails (service deregistered after bootout) + expect(content).toContain("launchctl bootstrap 'gui/501'"); expect(content).toContain('rm -f "$0"'); await cleanupScript(scriptPath); }); @@ -223,6 +225,45 @@ describe("restart-helper", () => { await cleanupScript(scriptPath); }); + it("expands HOME in plist path instead of leaving literal $HOME", async () => { + Object.defineProperty(process, "platform", { value: "darwin" }); + process.getuid = () => 501; + + const { scriptPath, content } = await prepareAndReadScript({ + HOME: "/Users/testuser", + OPENCLAW_PROFILE: "default", + }); + // The plist path must contain the resolved home dir, not literal $HOME + expect(content).toMatch(/[\\/]Users[\\/]testuser[\\/]Library[\\/]LaunchAgents[\\/]/); + expect(content).not.toContain("$HOME"); + await cleanupScript(scriptPath); + }); + + it("prefers env parameter HOME over process.env.HOME for plist path", async () => { + Object.defineProperty(process, "platform", { value: "darwin" }); + process.getuid = () => 502; + + const { scriptPath, content } = await prepareAndReadScript({ + HOME: "/Users/envhome", + OPENCLAW_PROFILE: "default", + }); + expect(content).toMatch(/[\\/]Users[\\/]envhome[\\/]Library[\\/]LaunchAgents[\\/]/); + await cleanupScript(scriptPath); + }); + + it("shell-escapes the label in the plist path on macOS", async () => { + Object.defineProperty(process, "platform", { value: "darwin" }); + process.getuid = () => 501; + + const { scriptPath, content } = await prepareAndReadScript({ + HOME: "/Users/testuser", + OPENCLAW_LAUNCHD_LABEL: "ai.openclaw.it's-a-test", + }); + // The plist path must also shell-escape the label to prevent injection + expect(content).toContain("ai.openclaw.it'\\''s-a-test.plist"); + await cleanupScript(scriptPath); + }); + it("rejects unsafe batch profile names on Windows", async () => { Object.defineProperty(process, "platform", { value: "win32" }); const scriptPath = await prepareRestartScript({ diff --git a/src/cli/update-cli/restart-helper.ts b/src/cli/update-cli/restart-helper.ts index cef4e25418b..4f7d45aab0c 100644 --- a/src/cli/update-cli/restart-helper.ts +++ b/src/cli/update-cli/restart-helper.ts @@ -83,12 +83,22 @@ rm -f "$0" const escaped = shellEscape(label); // Fallback to 501 if getuid is not available (though it should be on macOS) const uid = process.getuid ? process.getuid() : 501; + // Resolve HOME at generation time via env/process.env to match launchd.ts, + // and shell-escape the label in the plist filename to prevent injection. + const home = env.HOME?.trim() || process.env.HOME || os.homedir(); + const plistPath = path.join(home, "Library", "LaunchAgents", `${label}.plist`); + const escapedPlistPath = shellEscape(plistPath); filename = `openclaw-restart-${timestamp}.sh`; scriptContent = `#!/bin/sh # Standalone restart script — survives parent process termination. # Wait briefly to ensure file locks are released after update. sleep 1 -launchctl kickstart -k 'gui/${uid}/${escaped}' +# Try kickstart first (works when the service is still registered). +# If it fails (e.g. after bootout), re-register via bootstrap then kickstart. +if ! launchctl kickstart -k 'gui/${uid}/${escaped}' 2>/dev/null; then + launchctl bootstrap 'gui/${uid}' '${escapedPlistPath}' 2>/dev/null + launchctl kickstart -k 'gui/${uid}/${escaped}' 2>/dev/null || true +fi # Self-cleanup rm -f "$0" `; diff --git a/src/gateway/config-reload-plan.ts b/src/gateway/config-reload-plan.ts index 1af87d25020..4ca1fcea7f0 100644 --- a/src/gateway/config-reload-plan.ts +++ b/src/gateway/config-reload-plan.ts @@ -50,6 +50,11 @@ const BASE_RELOAD_RULES: ReloadRule[] = [ kind: "hot", actions: ["restart-heartbeat"], }, + { + prefix: "agents.defaults.models", + kind: "hot", + actions: ["restart-heartbeat"], + }, { prefix: "agents.defaults.model", kind: "hot", diff --git a/src/gateway/config-reload.test.ts b/src/gateway/config-reload.test.ts index e45347b0040..9c4994541e9 100644 --- a/src/gateway/config-reload.test.ts +++ b/src/gateway/config-reload.test.ts @@ -159,6 +159,14 @@ describe("buildGatewayReloadPlan", () => { ); }); + it("restarts heartbeat when agents.defaults.models allowlist changes", () => { + const plan = buildGatewayReloadPlan(["agents.defaults.models"]); + expect(plan.restartGateway).toBe(false); + expect(plan.restartHeartbeat).toBe(true); + expect(plan.hotReasons).toContain("agents.defaults.models"); + expect(plan.noopPaths).toEqual([]); + }); + it("hot-reloads health monitor when channelHealthCheckMinutes changes", () => { const plan = buildGatewayReloadPlan(["gateway.channelHealthCheckMinutes"]); expect(plan.restartGateway).toBe(false); diff --git a/src/gateway/server/http-listen.test.ts b/src/gateway/server/http-listen.test.ts new file mode 100644 index 00000000000..12358713faf --- /dev/null +++ b/src/gateway/server/http-listen.test.ts @@ -0,0 +1,100 @@ +import { EventEmitter } from "node:events"; +import type { Server as HttpServer } from "node:http"; +import { describe, expect, it, vi } from "vitest"; +import { GatewayLockError } from "../../infra/gateway-lock.js"; +import { listenGatewayHttpServer } from "./http-listen.js"; + +const sleepMock = vi.hoisted(() => vi.fn(async (_ms: number) => {})); + +vi.mock("../../utils.js", () => ({ + sleep: (ms: number) => sleepMock(ms), +})); + +type ListenOutcome = { kind: "error"; code: string } | { kind: "listening" }; + +function createFakeHttpServer(outcomes: ListenOutcome[]) { + class FakeHttpServer extends EventEmitter { + public closeCalls = 0; + private attempt = 0; + + listen(_port: number, _host: string) { + const outcome = outcomes[this.attempt] ?? { kind: "listening" }; + this.attempt += 1; + setImmediate(() => { + if (outcome.kind === "error") { + const err = Object.assign(new Error(outcome.code), { code: outcome.code }); + this.emit("error", err); + } else { + this.emit("listening"); + } + }); + return this; + } + + close(cb?: () => void) { + this.closeCalls += 1; + setImmediate(() => cb?.()); + return this; + } + } + + return new FakeHttpServer(); +} + +describe("listenGatewayHttpServer", () => { + it("retries EADDRINUSE and closes server handle before retry", async () => { + sleepMock.mockClear(); + const fake = createFakeHttpServer([ + { kind: "error", code: "EADDRINUSE" }, + { kind: "listening" }, + ]); + + await expect( + listenGatewayHttpServer({ + httpServer: fake as unknown as HttpServer, + bindHost: "127.0.0.1", + port: 18789, + }), + ).resolves.toBeUndefined(); + + expect(fake.closeCalls).toBe(1); + expect(sleepMock).toHaveBeenCalledTimes(1); + }); + + it("throws GatewayLockError after EADDRINUSE retries are exhausted", async () => { + sleepMock.mockClear(); + const fake = createFakeHttpServer([ + { kind: "error", code: "EADDRINUSE" }, + { kind: "error", code: "EADDRINUSE" }, + { kind: "error", code: "EADDRINUSE" }, + { kind: "error", code: "EADDRINUSE" }, + { kind: "error", code: "EADDRINUSE" }, + { kind: "error", code: "EADDRINUSE" }, + ]); + + await expect( + listenGatewayHttpServer({ + httpServer: fake as unknown as HttpServer, + bindHost: "127.0.0.1", + port: 18789, + }), + ).rejects.toBeInstanceOf(GatewayLockError); + + expect(fake.closeCalls).toBe(4); + }); + + it("wraps non-EADDRINUSE errors as GatewayLockError", async () => { + sleepMock.mockClear(); + const fake = createFakeHttpServer([{ kind: "error", code: "EACCES" }]); + + await expect( + listenGatewayHttpServer({ + httpServer: fake as unknown as HttpServer, + bindHost: "127.0.0.1", + port: 18789, + }), + ).rejects.toBeInstanceOf(GatewayLockError); + + expect(fake.closeCalls).toBe(0); + }); +}); diff --git a/src/gateway/server/http-listen.ts b/src/gateway/server/http-listen.ts index c2ae20a879f..0aa9f7b399f 100644 --- a/src/gateway/server/http-listen.ts +++ b/src/gateway/server/http-listen.ts @@ -1,5 +1,19 @@ import type { Server as HttpServer } from "node:http"; import { GatewayLockError } from "../../infra/gateway-lock.js"; +import { sleep } from "../../utils.js"; + +const EADDRINUSE_MAX_RETRIES = 4; +const EADDRINUSE_RETRY_INTERVAL_MS = 500; + +async function closeServerQuietly(httpServer: HttpServer): Promise { + await new Promise((resolve) => { + try { + httpServer.close(() => resolve()); + } catch { + resolve(); + } + }); +} export async function listenGatewayHttpServer(params: { httpServer: HttpServer; @@ -7,31 +21,41 @@ export async function listenGatewayHttpServer(params: { port: number; }) { const { httpServer, bindHost, port } = params; - try { - await new Promise((resolve, reject) => { - const onError = (err: NodeJS.ErrnoException) => { - httpServer.off("listening", onListening); - reject(err); - }; - const onListening = () => { - httpServer.off("error", onError); - resolve(); - }; - httpServer.once("error", onError); - httpServer.once("listening", onListening); - httpServer.listen(port, bindHost); - }); - } catch (err) { - const code = (err as NodeJS.ErrnoException).code; - if (code === "EADDRINUSE") { + + for (let attempt = 0; ; attempt++) { + try { + await new Promise((resolve, reject) => { + const onError = (err: NodeJS.ErrnoException) => { + httpServer.off("listening", onListening); + reject(err); + }; + const onListening = () => { + httpServer.off("error", onError); + resolve(); + }; + httpServer.once("error", onError); + httpServer.once("listening", onListening); + httpServer.listen(port, bindHost); + }); + return; // bound successfully + } catch (err) { + const code = (err as NodeJS.ErrnoException).code; + if (code === "EADDRINUSE" && attempt < EADDRINUSE_MAX_RETRIES) { + // Port may still be in TIME_WAIT after a recent process exit; retry. + await closeServerQuietly(httpServer); + await sleep(EADDRINUSE_RETRY_INTERVAL_MS); + continue; + } + if (code === "EADDRINUSE") { + throw new GatewayLockError( + `another gateway instance is already listening on ws://${bindHost}:${port}`, + err, + ); + } throw new GatewayLockError( - `another gateway instance is already listening on ws://${bindHost}:${port}`, + `failed to bind gateway socket on ws://${bindHost}:${port}: ${String(err)}`, err, ); } - throw new GatewayLockError( - `failed to bind gateway socket on ws://${bindHost}:${port}: ${String(err)}`, - err, - ); } } diff --git a/src/infra/restart-stale-pids.test.ts b/src/infra/restart-stale-pids.test.ts new file mode 100644 index 00000000000..f7bf0709d9f --- /dev/null +++ b/src/infra/restart-stale-pids.test.ts @@ -0,0 +1,810 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +// This entire file tests lsof-based Unix port polling. The feature is a deliberate +// no-op on Windows (findGatewayPidsOnPortSync returns [] immediately). Running these +// tests on a Windows CI runner would require lsof which does not exist there, so we +// skip the suite entirely and rely on the Linux/macOS runners for coverage. +const isWindows = process.platform === "win32"; + +const mockSpawnSync = vi.hoisted(() => vi.fn()); +const mockResolveGatewayPort = vi.hoisted(() => vi.fn(() => 18789)); +const mockRestartWarn = vi.hoisted(() => vi.fn()); + +vi.mock("node:child_process", () => ({ + spawnSync: (...args: unknown[]) => mockSpawnSync(...args), + execFileSync: vi.fn(), +})); + +vi.mock("../config/paths.js", () => ({ + resolveGatewayPort: () => mockResolveGatewayPort(), +})); + +vi.mock("./ports-lsof.js", () => ({ + resolveLsofCommandSync: vi.fn(() => "lsof"), +})); + +vi.mock("../logging/subsystem.js", () => ({ + createSubsystemLogger: vi.fn(() => ({ + warn: (...args: unknown[]) => mockRestartWarn(...args), + info: vi.fn(), + error: vi.fn(), + })), +})); + +import { resolveLsofCommandSync } from "./ports-lsof.js"; +import { + __testing, + cleanStaleGatewayProcessesSync, + findGatewayPidsOnPortSync, +} from "./restart-stale-pids.js"; + +function lsofOutput(entries: Array<{ pid: number; cmd: string }>): string { + return entries.map(({ pid, cmd }) => `p${pid}\nc${cmd}`).join("\n") + "\n"; +} + +describe.skipIf(isWindows)("restart-stale-pids", () => { + beforeEach(() => { + mockSpawnSync.mockReset(); + mockResolveGatewayPort.mockReset(); + mockRestartWarn.mockReset(); + mockResolveGatewayPort.mockReturnValue(18789); + __testing.setSleepSyncOverride(() => {}); + }); + + afterEach(() => { + __testing.setSleepSyncOverride(null); + __testing.setDateNowOverride(null); + vi.restoreAllMocks(); + }); + + // ------------------------------------------------------------------------- + // findGatewayPidsOnPortSync + // ------------------------------------------------------------------------- + describe("findGatewayPidsOnPortSync", () => { + it("returns [] when lsof exits with non-zero status", () => { + mockSpawnSync.mockReturnValue({ error: null, status: 1, stdout: "", stderr: "" }); + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + }); + + it("logs warning when initial lsof scan exits with status > 1", () => { + mockSpawnSync.mockReturnValue({ error: null, status: 2, stdout: "", stderr: "lsof error" }); + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + expect(mockRestartWarn).toHaveBeenCalledWith( + expect.stringContaining("lsof exited with status 2"), + ); + }); + + it("returns [] when lsof returns an error object (e.g. ENOENT)", () => { + mockSpawnSync.mockReturnValue({ + error: new Error("ENOENT"), + status: null, + stdout: "", + stderr: "", + }); + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + expect(mockRestartWarn).toHaveBeenCalledWith( + expect.stringContaining("lsof failed during initial stale-pid scan"), + ); + }); + + it("parses openclaw-gateway pids and excludes the current process", () => { + const stalePid = process.pid + 1; + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: lsofOutput([ + { pid: stalePid, cmd: "openclaw-gateway" }, + { pid: process.pid, cmd: "openclaw-gateway" }, + ]), + stderr: "", + }); + const pids = findGatewayPidsOnPortSync(18789); + expect(pids).toContain(stalePid); + expect(pids).not.toContain(process.pid); + }); + + it("excludes pids whose command does not include 'openclaw'", () => { + const otherPid = process.pid + 2; + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: lsofOutput([{ pid: otherPid, cmd: "nginx" }]), + stderr: "", + }); + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + }); + + it("forwards the spawnTimeoutMs argument to spawnSync", () => { + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout: "", stderr: "" }); + findGatewayPidsOnPortSync(18789, 400); + expect(mockSpawnSync).toHaveBeenCalledWith( + "lsof", + expect.any(Array), + expect.objectContaining({ timeout: 400 }), + ); + }); + + it("deduplicates pids from dual-stack listeners (IPv4+IPv6 emit same pid twice)", () => { + // Dual-stack listeners cause lsof to emit the same PID twice in -Fpc output + // (once for the IPv4 socket, once for IPv6). Without dedup, terminateStaleProcessesSync + // sends SIGTERM twice and returns killed=[pid, pid], corrupting the count. + const stalePid = process.pid + 600; + const stdout = `p${stalePid}\ncopenclaw-gateway\np${stalePid}\ncopenclaw-gateway\n`; + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" }); + const result = findGatewayPidsOnPortSync(18789); + expect(result).toEqual([stalePid]); // deduped — not [pid, pid] + }); + + it("returns [] and skips lsof on win32", () => { + // The entire describe block is skipped on Windows (isWindows guard at top), + // so this test only runs on Linux/macOS. It mocks platform to win32 for the + // single assertion without needing to restore — the suite-level skipIf means + // this will never run on an actual Windows runner where the mock could leak. + const origDescriptor = Object.getOwnPropertyDescriptor(process, "platform"); + Object.defineProperty(process, "platform", { value: "win32", configurable: true }); + try { + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + expect(mockSpawnSync).not.toHaveBeenCalled(); + } finally { + if (origDescriptor) { + Object.defineProperty(process, "platform", origDescriptor); + } + } + }); + }); + + // ------------------------------------------------------------------------- + // parsePidsFromLsofOutput — pure unit tests (no I/O, driven via spawnSync mock) + // ------------------------------------------------------------------------- + describe("parsePidsFromLsofOutput (via findGatewayPidsOnPortSync stdout path)", () => { + it("returns [] for empty lsof stdout (status 0, nothing listening)", () => { + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout: "", stderr: "" }); + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + }); + + it("parses multiple openclaw pids from a single lsof output block", () => { + const pid1 = process.pid + 10; + const pid2 = process.pid + 11; + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: lsofOutput([ + { pid: pid1, cmd: "openclaw-gateway" }, + { pid: pid2, cmd: "openclaw-gateway" }, + ]), + stderr: "", + }); + const result = findGatewayPidsOnPortSync(18789); + expect(result).toContain(pid1); + expect(result).toContain(pid2); + }); + + it("returns [] when status 0 but only non-openclaw pids present", () => { + // Port may be bound by an unrelated process. findGatewayPidsOnPortSync + // only tracks openclaw processes — non-openclaw listeners are ignored. + const otherPid = process.pid + 50; + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: lsofOutput([{ pid: otherPid, cmd: "caddy" }]), + stderr: "", + }); + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + }); + }); + + // ------------------------------------------------------------------------- + // pollPortOnce (via cleanStaleGatewayProcessesSync) — Codex P1 regression + // ------------------------------------------------------------------------- + describe("pollPortOnce — no second lsof spawn (Codex P1 regression)", () => { + it("treats lsof exit status 1 as port-free (no listeners)", () => { + // lsof exits with status 1 when no matching processes are found — this is + // the canonical "port is free" signal, not an error. + const stalePid = process.pid + 500; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + // Poll returns status 1 — no listeners + return { error: null, status: 1, stdout: "", stderr: "" }; + }); + vi.spyOn(process, "kill").mockReturnValue(true); + // Should complete cleanly (port reported free on status 1) + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + }); + + it("treats lsof exit status >1 as inconclusive, not port-free — Codex P2 regression", () => { + // Codex P2: non-zero lsof exits other than status 1 (e.g. permission denied, + // bad flag, runtime error) must not be mapped to free:true. They are + // inconclusive and should keep the polling loop running until budget expires. + const stalePid = process.pid + 501; + let call = 0; + const events: string[] = []; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + events.push("initial-find"); + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + if (call === 2) { + // Permission/runtime error — status 2, should NOT be treated as free + events.push("error-poll"); + return { error: null, status: 2, stdout: "", stderr: "lsof: permission denied" }; + } + // Eventually port is free + events.push("free-poll"); + return { error: null, status: 1, stdout: "", stderr: "" }; + }); + vi.spyOn(process, "kill").mockReturnValue(true); + cleanStaleGatewayProcessesSync(); + + // Must have continued polling after the status-2 error, not exited early + expect(events).toContain("free-poll"); + }); + + it("does not make a second lsof call when the first returns status 0", () => { + // The bug: pollPortOnce previously called findGatewayPidsOnPortSync as a + // second probe after getting status===0 from the first lsof. That second + // call collapses any error/timeout back into [], which maps to free:true — + // silently misclassifying an inconclusive result as "port is free". + // + // The fix: pollPortOnce now parses res.stdout directly from the first + // spawnSync call. Exactly ONE lsof invocation per poll cycle. + const stalePid = process.pid + 400; + let spawnCount = 0; + mockSpawnSync.mockImplementation(() => { + spawnCount++; + if (spawnCount === 1) { + // Initial findGatewayPidsOnPortSync — returns stale pid + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + if (spawnCount === 2) { + // First waitForPortFreeSync poll — status 0, port busy (should parse inline, not spawn again) + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + // Port free on third call + return { error: null, status: 0, stdout: "", stderr: "" }; + }); + + vi.spyOn(process, "kill").mockReturnValue(true); + cleanStaleGatewayProcessesSync(); + + // If pollPortOnce made a second lsof call internally, spawnCount would + // be at least 4 (initial + 2 polls each doubled). With the fix, each poll + // is exactly one spawn: initial(1) + busy-poll(1) + free-poll(1) = 3. + expect(spawnCount).toBe(3); + }); + + it("lsof status 1 with non-empty openclaw stdout is treated as busy, not free (Linux container edge case)", () => { + // On Linux containers with restricted /proc (AppArmor, seccomp, user namespaces), + // lsof can exit 1 AND still emit output for processes it could read. + // status 1 + non-empty openclaw stdout must not be treated as port-free. + const stalePid = process.pid + 601; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + // Initial scan: finds stale pid + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + if (call === 2) { + // status 1 + openclaw pid in stdout — container-restricted lsof reports partial results + return { + error: null, + status: 1, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "lsof: WARNING: can't stat() fuse", + }; + } + // Third poll: port is genuinely free + return { error: null, status: 1, stdout: "", stderr: "" }; + }); + vi.spyOn(process, "kill").mockReturnValue(true); + cleanStaleGatewayProcessesSync(); + // Poll 2 returned busy (not free), so we must have polled at least 3 times + expect(call).toBeGreaterThanOrEqual(3); + }); + + it("pollPortOnce outer catch returns { free: null, permanent: false } when resolveLsofCommandSync throws", () => { + // If resolveLsofCommandSync throws (e.g. lsof resolution fails at runtime), + // pollPortOnce must catch it and return the transient-inconclusive result + // rather than propagating the exception. + const stalePid = process.pid + 402; + const mockedResolveLsof = vi.mocked(resolveLsofCommandSync); + + mockedResolveLsof.mockImplementationOnce(() => { + // First call: initial findGatewayPidsOnPortSync — succeed normally + return "lsof"; + }); + + mockSpawnSync.mockImplementationOnce(() => { + // Initial scan: finds stale pid + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + }); + + // Second call: poll — resolveLsofCommandSync throws + mockedResolveLsof.mockImplementationOnce(() => { + throw new Error("lsof binary resolution failed"); + }); + + // Third call: poll — port is free + mockedResolveLsof.mockImplementation(() => "lsof"); + mockSpawnSync.mockImplementation(() => ({ error: null, status: 1, stdout: "", stderr: "" })); + + vi.spyOn(process, "kill").mockReturnValue(true); + // Must not throw — the catch path returns transient inconclusive, loop continues + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + }); + }); + + // ------------------------------------------------------------------------- + // cleanStaleGatewayProcessesSync + // ------------------------------------------------------------------------- + describe("cleanStaleGatewayProcessesSync", () => { + it("returns [] and does not call process.kill when port has no listeners", () => { + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout: "", stderr: "" }); + const killSpy = vi.spyOn(process, "kill").mockReturnValue(true); + expect(cleanStaleGatewayProcessesSync()).toEqual([]); + expect(killSpy).not.toHaveBeenCalled(); + }); + + it("sends SIGTERM to stale pids and returns them", () => { + const stalePid = process.pid + 100; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + // waitForPortFreeSync polls: port free immediately + return { error: null, status: 0, stdout: "", stderr: "" }; + }); + + const killSpy = vi.spyOn(process, "kill").mockReturnValue(true); + const result = cleanStaleGatewayProcessesSync(); + + expect(result).toContain(stalePid); + expect(killSpy).toHaveBeenCalledWith(stalePid, "SIGTERM"); + }); + + it("escalates to SIGKILL when process survives the SIGTERM window", () => { + const stalePid = process.pid + 101; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call <= 5) { + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + return { error: null, status: 0, stdout: "", stderr: "" }; + }); + + const killSpy = vi.spyOn(process, "kill").mockReturnValue(true); + cleanStaleGatewayProcessesSync(); + + expect(killSpy).toHaveBeenCalledWith(stalePid, "SIGTERM"); + expect(killSpy).toHaveBeenCalledWith(stalePid, "SIGKILL"); + }); + + it("polls until port is confirmed free before returning — regression for #33103", () => { + // Core regression: cleanStaleGatewayProcessesSync must not return while + // the port is still bound. Previously it returned after a fixed 500ms + // sleep regardless of port state, causing systemd's new process to hit + // EADDRINUSE and enter an unbounded restart loop. + const stalePid = process.pid + 200; + const events: string[] = []; + let call = 0; + + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + events.push("initial-find"); + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + if (call <= 4) { + events.push(`busy-poll-${call}`); + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + events.push("port-free"); + return { error: null, status: 0, stdout: "", stderr: "" }; + }); + + vi.spyOn(process, "kill").mockReturnValue(true); + cleanStaleGatewayProcessesSync(); + + expect(events).toContain("port-free"); + expect(events.filter((e) => e.startsWith("busy-poll")).length).toBeGreaterThan(0); + }); + + it("bails immediately when lsof is permanently unavailable (ENOENT) — Greptile edge case", () => { + // Regression for the edge case identified in PR review: lsof returning an + // error must not be treated as "port free". ENOENT means lsof is not + // installed — a permanent condition. The polling loop should bail + // immediately on ENOENT rather than spinning the full 2-second budget. + const stalePid = process.pid + 300; + const events: string[] = []; + let call = 0; + + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + events.push("initial-find"); + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + // Permanent ENOENT — lsof is not installed + events.push(`enoent-poll-${call}`); + const err = new Error("lsof not found") as NodeJS.ErrnoException; + err.code = "ENOENT"; + return { error: err, status: null, stdout: "", stderr: "" }; + }); + + vi.spyOn(process, "kill").mockReturnValue(true); + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + + // Must bail after first ENOENT poll — no point retrying a missing binary + const enoentPolls = events.filter((e) => e.startsWith("enoent-poll")); + expect(enoentPolls.length).toBe(1); + }); + + it("bails immediately when lsof is permanently unavailable (EPERM) — SELinux/AppArmor", () => { + // EPERM occurs when lsof exists but a MAC policy (SELinux/AppArmor) blocks + // execution. Like ENOENT/EACCES, this is permanent — retrying is pointless. + const stalePid = process.pid + 305; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + const err = new Error("lsof eperm") as NodeJS.ErrnoException; + err.code = "EPERM"; + return { error: err, status: null, stdout: "", stderr: "" }; + }); + vi.spyOn(process, "kill").mockReturnValue(true); + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + // Must bail after exactly 1 EPERM poll — same as ENOENT/EACCES + expect(call).toBe(2); // 1 initial find + 1 EPERM poll + }); + + it("bails immediately when lsof is permanently unavailable (EACCES) — same as ENOENT", () => { + // EACCES and EPERM are also permanent conditions — lsof exists but the + // process has no permission to run it. No point retrying. + const stalePid = process.pid + 302; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + const err = new Error("lsof permission denied") as NodeJS.ErrnoException; + err.code = "EACCES"; + return { error: err, status: null, stdout: "", stderr: "" }; + }); + vi.spyOn(process, "kill").mockReturnValue(true); + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + // Should have bailed after exactly 1 poll call (the EACCES one) + expect(call).toBe(2); // 1 initial find + 1 EACCES poll + }); + + it("proceeds with warning when polling budget is exhausted — fake clock, no real 2s wait", () => { + // Sub-agent audit HIGH finding: the original test relied on real wall-clock + // time (Date.now() + 2000ms deadline), burning 2 full seconds of CI time + // every run. Fix: expose dateNowOverride in __testing so the deadline can + // be synthesised instantly, keeping the test under 10ms. + const stalePid = process.pid + 303; + let fakeNow = 0; + __testing.setDateNowOverride(() => fakeNow); + + mockSpawnSync.mockImplementation(() => { + // Advance clock by PORT_FREE_TIMEOUT_MS + 1ms on first poll to trip the deadline. + fakeNow += 2001; + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + }); + + vi.spyOn(process, "kill").mockReturnValue(true); + // Must return without throwing (proceeds with warning after budget expires) + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + }); + + it("still polls for port-free when all stale pids were already dead at SIGTERM time", () => { + // Sub-agent audit MEDIUM finding: if all pids from the initial scan are + // already dead before SIGTERM runs (race), terminateStaleProcessesSync + // returns killed=[] — but cleanStaleGatewayProcessesSync MUST still call + // waitForPortFreeSync. The process may have exited on its own while + // leaving its socket in TIME_WAIT / FIN_WAIT. Skipping the poll would + // silently recreate the EADDRINUSE race we are fixing. + const stalePid = process.pid + 304; + let call = 0; + const events: string[] = []; + + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + // Initial scan: finds stale pid + events.push("initial-find"); + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + // Port is already free on first poll — pid was dead before SIGTERM + events.push("poll-free"); + return { error: null, status: 1, stdout: "", stderr: "" }; + }); + + // All SIGTERMs throw ESRCH — pid already gone + vi.spyOn(process, "kill").mockImplementation(() => { + throw Object.assign(new Error("ESRCH"), { code: "ESRCH" }); + }); + + cleanStaleGatewayProcessesSync(); + + // waitForPortFreeSync must still have fired even though killed=[] + expect(events).toContain("poll-free"); + }); + + it("continues polling on transient lsof errors (not ENOENT) — Codex P1 fix", () => { + // A transient lsof error (spawnSync timeout, status 2, etc.) must NOT abort + // the polling loop. The loop should keep retrying until the budget expires + // or a definitive result is returned. Bailing on the first transient error + // would recreate the EADDRINUSE race this PR is designed to prevent. + const stalePid = process.pid + 301; + const events: string[] = []; + let call = 0; + + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + events.push("initial-find"); + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + if (call === 2) { + // Transient: spawnSync timeout (no ENOENT code) + events.push("transient-error"); + return { error: new Error("timeout"), status: null, stdout: "", stderr: "" }; + } + // Port free on the next poll + events.push("port-free"); + return { error: null, status: 1, stdout: "", stderr: "" }; + }); + + vi.spyOn(process, "kill").mockReturnValue(true); + cleanStaleGatewayProcessesSync(); + + // Must have kept polling after the transient error and reached port-free + expect(events).toContain("transient-error"); + expect(events).toContain("port-free"); + }); + + it("returns gracefully when resolveGatewayPort throws", () => { + mockResolveGatewayPort.mockImplementationOnce(() => { + throw new Error("config read error"); + }); + expect(cleanStaleGatewayProcessesSync()).toEqual([]); + }); + + it("returns gracefully when lsof is unavailable from the start", () => { + mockSpawnSync.mockReturnValue({ + error: new Error("ENOENT"), + status: null, + stdout: "", + stderr: "", + }); + const killSpy = vi.spyOn(process, "kill").mockReturnValue(true); + expect(cleanStaleGatewayProcessesSync()).toEqual([]); + expect(killSpy).not.toHaveBeenCalled(); + }); + }); + + // ------------------------------------------------------------------------- + // parsePidsFromLsofOutput — branch-coverage for mid-loop && short-circuits + // ------------------------------------------------------------------------- + describe("parsePidsFromLsofOutput — branch coverage (lines 67-69)", () => { + it("skips a mid-loop entry when the command does not include 'openclaw'", () => { + // Exercises the false branch of currentCmd.toLowerCase().includes("openclaw") + // inside the mid-loop flush: a non-openclaw cmd between two entries must not + // be pushed, but the following openclaw entry still must be. + const stalePid = process.pid + 700; + // Mixed output: non-openclaw entry first, then openclaw entry + const stdout = `p${process.pid + 699}\ncnginx\np${stalePid}\ncopenclaw-gateway\n`; + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" }); + const result = findGatewayPidsOnPortSync(18789); + expect(result).toContain(stalePid); + expect(result).not.toContain(process.pid + 699); + }); + + it("skips a mid-loop entry when currentCmd is missing (two consecutive p-lines)", () => { + // Exercises currentCmd falsy branch mid-loop: two 'p' lines in a row + // (no 'c' line between them) — the first PID must be skipped, the second handled. + const stalePid = process.pid + 701; + // Two consecutive p-lines: first has no c-line before the next p-line + const stdout = `p${process.pid + 702}\np${stalePid}\ncopenclaw-gateway\n`; + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" }); + const result = findGatewayPidsOnPortSync(18789); + expect(result).toContain(stalePid); + }); + + it("ignores a p-line with an invalid (non-positive) PID — ternary false branch", () => { + // Exercises the `Number.isFinite(parsed) && parsed > 0 ? parsed : undefined` + // false branch: a malformed 'p' line (e.g. 'p0' or 'pNaN') must not corrupt + // currentPid and must not end up in the returned pids array. + const stalePid = process.pid + 703; + // p0 is invalid (not > 0); the following valid openclaw entry must still be found. + const stdout = `p0\ncopenclaw-gateway\np${stalePid}\ncopenclaw-gateway\n`; + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" }); + const result = findGatewayPidsOnPortSync(18789); + expect(result).toContain(stalePid); + expect(result).not.toContain(0); + }); + + it("silently skips lines that start with neither 'p' nor 'c' — else-if false branch", () => { + // lsof -Fpc only emits 'p' and 'c' lines, but defensive handling of + // unexpected output (e.g. 'f' for file descriptor in other lsof formats) + // must not throw or corrupt the pid list. Unknown lines are just skipped. + const stalePid = process.pid + 704; + // Intersperse an 'f' line (file descriptor marker) — not a 'p' or 'c' line + const stdout = `p${stalePid}\nf8\ncopenclaw-gateway\n`; + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" }); + const result = findGatewayPidsOnPortSync(18789); + // The 'f' line must not corrupt parsing; stalePid must still be found + // (the 'c' line after 'f' correctly sets currentCmd) + expect(result).toContain(stalePid); + }); + }); + + // ------------------------------------------------------------------------- + // pollPortOnce branch — status 1 + non-empty stdout with zero openclaw pids + // ------------------------------------------------------------------------- + describe("pollPortOnce — status 1 + non-empty non-openclaw stdout (line 145)", () => { + it("treats status 1 + non-openclaw stdout as port-free (not an openclaw process)", () => { + // status 1 + non-empty stdout where no openclaw pids are present: + // the port may be held by an unrelated process. From our perspective + // (we only kill openclaw pids) it is effectively free. + const stalePid = process.pid + 800; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + // status 1 + non-openclaw output — should be treated as free:true for our purposes + return { + error: null, + status: 1, + stdout: lsofOutput([{ pid: process.pid + 801, cmd: "caddy" }]), + stderr: "", + }; + }); + vi.spyOn(process, "kill").mockReturnValue(true); + // Should complete cleanly — no openclaw pids in status-1 output → free + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + // Completed in exactly 2 calls (initial find + 1 free poll) + expect(call).toBe(2); + }); + }); + + // ------------------------------------------------------------------------- + // sleepSync — direct unit tests via __testing.callSleepSyncRaw + // ------------------------------------------------------------------------- + describe("sleepSync — Atomics.wait paths", () => { + it("returns immediately when called with 0ms (timeoutMs <= 0 early return)", () => { + // sleepSync(0) must short-circuit before touching Atomics.wait. + // Verify it does not throw and returns synchronously. + __testing.setSleepSyncOverride(null); // bypass override so real path runs + expect(() => __testing.callSleepSyncRaw(0)).not.toThrow(); + }); + + it("returns immediately when called with a negative value (Math.max(0,...) clamp)", () => { + __testing.setSleepSyncOverride(null); + expect(() => __testing.callSleepSyncRaw(-1)).not.toThrow(); + }); + + it("executes the Atomics.wait path successfully when called with a positive timeout", () => { + // Verify the real Atomics.wait code path runs without error. + // Use 1ms to keep the test fast; Atomics.wait resolves immediately + // because the timeout expires in 1ms. + __testing.setSleepSyncOverride(null); + expect(() => __testing.callSleepSyncRaw(1)).not.toThrow(); + }); + + it("falls back to busy-wait when Atomics.wait throws (Worker / sandboxed env)", () => { + // Atomics.wait throws in Worker threads and some sandboxed runtimes. + // The catch branch must handle this without propagating the exception. + const origWait = Atomics.wait; + Atomics.wait = () => { + throw new Error("not on main thread"); + }; + __testing.setSleepSyncOverride(null); + try { + // 1ms is enough to exercise the busy-wait loop without slowing CI. + expect(() => __testing.callSleepSyncRaw(1)).not.toThrow(); + } finally { + Atomics.wait = origWait; + __testing.setSleepSyncOverride(() => {}); + } + }); + }); +}); diff --git a/src/infra/restart-stale-pids.ts b/src/infra/restart-stale-pids.ts index bbab76f8374..c6c9535c737 100644 --- a/src/infra/restart-stale-pids.ts +++ b/src/infra/restart-stale-pids.ts @@ -4,11 +4,31 @@ import { createSubsystemLogger } from "../logging/subsystem.js"; import { resolveLsofCommandSync } from "./ports-lsof.js"; const SPAWN_TIMEOUT_MS = 2000; -const STALE_SIGTERM_WAIT_MS = 300; -const STALE_SIGKILL_WAIT_MS = 200; +const STALE_SIGTERM_WAIT_MS = 600; +const STALE_SIGKILL_WAIT_MS = 400; +/** + * After SIGKILL, the kernel may not release the TCP port immediately. + * Poll until the port is confirmed free (or until the budget expires) before + * returning control to the caller (typically `triggerOpenClawRestart` → + * `systemctl restart`). Without this wait the new process races the dying + * process for the port and systemd enters an EADDRINUSE restart loop. + * + * POLL_SPAWN_TIMEOUT_MS is intentionally much shorter than SPAWN_TIMEOUT_MS + * so that a single slow or hung lsof invocation does not consume the entire + * polling budget. At 400 ms per call, up to five independent lsof attempts + * fit within PORT_FREE_TIMEOUT_MS = 2000 ms, each with a definitive outcome. + */ +const PORT_FREE_POLL_INTERVAL_MS = 50; +const PORT_FREE_TIMEOUT_MS = 2000; +const POLL_SPAWN_TIMEOUT_MS = 400; const restartLog = createSubsystemLogger("restart"); let sleepSyncOverride: ((ms: number) => void) | null = null; +let dateNowOverride: (() => number) | null = null; + +function getTimeMs(): number { + return dateNowOverride ? dateNowOverride() : Date.now(); +} function sleepSync(ms: number): void { const timeoutMs = Math.max(0, Math.floor(ms)); @@ -31,25 +51,14 @@ function sleepSync(ms: number): void { } /** - * Find PIDs of gateway processes listening on the given port using synchronous lsof. - * Returns only PIDs that belong to openclaw gateway processes (not the current process). + * Parse openclaw gateway PIDs from lsof -Fpc stdout. + * Pure function — no I/O. Excludes the current process. */ -export function findGatewayPidsOnPortSync(port: number): number[] { - if (process.platform === "win32") { - return []; - } - const lsof = resolveLsofCommandSync(); - const res = spawnSync(lsof, ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-Fpc"], { - encoding: "utf8", - timeout: SPAWN_TIMEOUT_MS, - }); - if (res.error || res.status !== 0) { - return []; - } +function parsePidsFromLsofOutput(stdout: string): number[] { const pids: number[] = []; let currentPid: number | undefined; let currentCmd: string | undefined; - for (const line of res.stdout.split(/\r?\n/).filter(Boolean)) { + for (const line of stdout.split(/\r?\n/).filter(Boolean)) { if (line.startsWith("p")) { if (currentPid != null && currentCmd && currentCmd.toLowerCase().includes("openclaw")) { pids.push(currentPid); @@ -64,17 +73,117 @@ export function findGatewayPidsOnPortSync(port: number): number[] { if (currentPid != null && currentCmd && currentCmd.toLowerCase().includes("openclaw")) { pids.push(currentPid); } - return pids.filter((pid) => pid !== process.pid); + // Deduplicate: dual-stack listeners (IPv4 + IPv6) cause lsof to emit the + // same PID twice. Return each PID at most once to avoid double-killing. + return [...new Set(pids)].filter((pid) => pid !== process.pid); +} + +/** + * Find PIDs of gateway processes listening on the given port using synchronous lsof. + * Returns only PIDs that belong to openclaw gateway processes (not the current process). + */ +export function findGatewayPidsOnPortSync( + port: number, + spawnTimeoutMs = SPAWN_TIMEOUT_MS, +): number[] { + if (process.platform === "win32") { + return []; + } + const lsof = resolveLsofCommandSync(); + const res = spawnSync(lsof, ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-Fpc"], { + encoding: "utf8", + timeout: spawnTimeoutMs, + }); + if (res.error) { + const code = (res.error as NodeJS.ErrnoException).code; + const detail = + code && code.trim().length > 0 + ? code + : res.error instanceof Error + ? res.error.message + : "unknown error"; + restartLog.warn(`lsof failed during initial stale-pid scan for port ${port}: ${detail}`); + return []; + } + if (res.status === 1) { + return []; + } + if (res.status !== 0) { + restartLog.warn( + `lsof exited with status ${res.status} during initial stale-pid scan for port ${port}; skipping stale pid check`, + ); + return []; + } + return parsePidsFromLsofOutput(res.stdout); +} + +/** + * Attempt a single lsof poll for the given port. + * + * Returns a discriminated union with four possible states: + * + * { free: true } — port confirmed free + * { free: false } — port confirmed busy + * { free: null; permanent: false } — transient error, keep retrying + * { free: null; permanent: true } — lsof unavailable (ENOENT / EACCES), + * no point retrying + * + * Separating transient from permanent errors is critical so that: + * 1. A slow/timed-out lsof call (transient) does not abort the polling loop — + * the caller retries until the wall-clock budget expires. + * 2. Non-zero lsof exits from runtime/permission failures (status > 1) are + * not misclassified as "port free" — they are inconclusive and retried. + * 3. A missing lsof binary (permanent) short-circuits cleanly rather than + * spinning the full budget pointlessly. + */ +type PollResult = { free: true } | { free: false } | { free: null; permanent: boolean }; + +function pollPortOnce(port: number): PollResult { + try { + const lsof = resolveLsofCommandSync(); + const res = spawnSync(lsof, ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-Fpc"], { + encoding: "utf8", + timeout: POLL_SPAWN_TIMEOUT_MS, + }); + if (res.error) { + // Spawn-level failure. ENOENT / EACCES means lsof is permanently + // unavailable on this system; other errors (e.g. timeout) are transient. + const code = (res.error as NodeJS.ErrnoException).code; + const permanent = code === "ENOENT" || code === "EACCES" || code === "EPERM"; + return { free: null, permanent }; + } + if (res.status === 1) { + // lsof canonical "no matching processes" exit — port is genuinely free. + // Guard: on Linux containers with restricted /proc (AppArmor, seccomp, + // user namespaces), lsof can exit 1 AND still emit some output for the + // processes it could read. Parse stdout when non-empty to avoid false-free. + if (res.stdout) { + const pids = parsePidsFromLsofOutput(res.stdout); + return pids.length === 0 ? { free: true } : { free: false }; + } + return { free: true }; + } + if (res.status !== 0) { + // status > 1: runtime/permission/flag error. Cannot confirm port state — + // treat as a transient failure and keep polling rather than falsely + // reporting the port as free (which would recreate the EADDRINUSE race). + return { free: null, permanent: false }; + } + // status === 0: lsof found listeners. Parse pids from the stdout we + // already hold — no second lsof spawn, no new failure surface. + const pids = parsePidsFromLsofOutput(res.stdout); + return pids.length === 0 ? { free: true } : { free: false }; + } catch { + return { free: null, permanent: false }; + } } /** * Synchronously terminate stale gateway processes. + * Callers must pass a non-empty pids array. * Sends SIGTERM, waits briefly, then SIGKILL for survivors. */ function terminateStaleProcessesSync(pids: number[]): number[] { - if (pids.length === 0) { - return []; - } const killed: number[] = []; for (const pid of pids) { try { @@ -100,8 +209,48 @@ function terminateStaleProcessesSync(pids: number[]): number[] { return killed; } +/** + * Poll the given port until it is confirmed free, lsof is confirmed unavailable, + * or the wall-clock budget expires. + * + * Each poll invocation uses POLL_SPAWN_TIMEOUT_MS (400 ms), which is + * significantly shorter than PORT_FREE_TIMEOUT_MS (2000 ms). This ensures + * that a single slow or hung lsof call cannot consume the entire polling + * budget and cause the function to exit prematurely with an inconclusive + * result. Up to five independent lsof attempts fit within the budget. + * + * Exit conditions: + * - `pollPortOnce` returns `{ free: true }` → port confirmed free + * - `pollPortOnce` returns `{ free: null, permanent: true }` → lsof unavailable, bail + * - `pollPortOnce` returns `{ free: false }` → port busy, sleep + retry + * - `pollPortOnce` returns `{ free: null, permanent: false }` → transient error, sleep + retry + * - Wall-clock deadline exceeded → log warning, proceed anyway + */ +function waitForPortFreeSync(port: number): void { + const deadline = getTimeMs() + PORT_FREE_TIMEOUT_MS; + while (getTimeMs() < deadline) { + const result = pollPortOnce(port); + if (result.free === true) { + return; + } + if (result.free === null && result.permanent) { + // lsof is permanently unavailable (ENOENT / EACCES) — bail immediately, + // no point spinning the remaining budget. + return; + } + // result.free === false: port still bound. + // result.free === null && !permanent: transient lsof error — keep polling. + sleepSync(PORT_FREE_POLL_INTERVAL_MS); + } + restartLog.warn(`port ${port} still in use after ${PORT_FREE_TIMEOUT_MS}ms; proceeding anyway`); +} + /** * Inspect the gateway port and kill any stale gateway processes holding it. + * Blocks until the port is confirmed free (or the poll budget expires) so + * the supervisor (systemd / launchctl) does not race a zombie process for + * the port and enter an EADDRINUSE restart loop. + * * Called before service restart commands to prevent port conflicts. */ export function cleanStaleGatewayProcessesSync(): number[] { @@ -114,7 +263,14 @@ export function cleanStaleGatewayProcessesSync(): number[] { restartLog.warn( `killing ${stalePids.length} stale gateway process(es) before restart: ${stalePids.join(", ")}`, ); - return terminateStaleProcessesSync(stalePids); + const killed = terminateStaleProcessesSync(stalePids); + // Wait for the port to be released before returning — called unconditionally + // even when `killed` is empty (all pids were already dead before SIGTERM). + // A process can exit before our signal arrives yet still leave its socket + // in TIME_WAIT / FIN_WAIT; polling is the only reliable way to confirm the + // kernel has fully released the port before systemd fires the new process. + waitForPortFreeSync(port); + return killed; } catch { return []; } @@ -124,4 +280,9 @@ export const __testing = { setSleepSyncOverride(fn: ((ms: number) => void) | null) { sleepSyncOverride = fn; }, + setDateNowOverride(fn: (() => number) | null) { + dateNowOverride = fn; + }, + /** Invoke sleepSync directly (bypasses the override) for unit-testing the real Atomics path. */ + callSleepSyncRaw: sleepSync, }; diff --git a/src/infra/restart.ts b/src/infra/restart.ts index c84dfc6f7ac..3f65cfc1614 100644 --- a/src/infra/restart.ts +++ b/src/infra/restart.ts @@ -1,4 +1,6 @@ import { spawnSync } from "node:child_process"; +import os from "node:os"; +import path from "node:path"; import { resolveGatewayLaunchAgentLabel, resolveGatewaySystemdServiceName, @@ -335,7 +337,8 @@ export function triggerOpenClawRestart(): RestartAttempt { process.env.OPENCLAW_LAUNCHD_LABEL || resolveGatewayLaunchAgentLabel(process.env.OPENCLAW_PROFILE); const uid = typeof process.getuid === "function" ? process.getuid() : undefined; - const target = uid !== undefined ? `gui/${uid}/${label}` : label; + const domain = uid !== undefined ? `gui/${uid}` : "gui/501"; + const target = `${domain}/${label}`; const args = ["kickstart", "-k", target]; tried.push(`launchctl ${args.join(" ")}`); const res = spawnSync("launchctl", args, { @@ -345,10 +348,39 @@ export function triggerOpenClawRestart(): RestartAttempt { if (!res.error && res.status === 0) { return { ok: true, method: "launchctl", tried }; } + + // kickstart fails when the service was previously booted out (deregistered from launchd). + // Fall back to bootstrap (re-register from plist) + kickstart. + // Use env HOME to match how launchd.ts resolves the plist install path. + const home = process.env.HOME?.trim() || os.homedir(); + const plistPath = path.join(home, "Library", "LaunchAgents", `${label}.plist`); + const bootstrapArgs = ["bootstrap", domain, plistPath]; + tried.push(`launchctl ${bootstrapArgs.join(" ")}`); + const boot = spawnSync("launchctl", bootstrapArgs, { + encoding: "utf8", + timeout: SPAWN_TIMEOUT_MS, + }); + if (boot.error || (boot.status !== 0 && boot.status !== null)) { + return { + ok: false, + method: "launchctl", + detail: formatSpawnDetail(boot), + tried, + }; + } + const retryArgs = ["kickstart", "-k", target]; + tried.push(`launchctl ${retryArgs.join(" ")}`); + const retry = spawnSync("launchctl", retryArgs, { + encoding: "utf8", + timeout: SPAWN_TIMEOUT_MS, + }); + if (!retry.error && retry.status === 0) { + return { ok: true, method: "launchctl", tried }; + } return { ok: false, method: "launchctl", - detail: formatSpawnDetail(res), + detail: formatSpawnDetail(retry), tried, }; }