diff --git a/src/cli/gateway-cli/run.option-collisions.test.ts b/src/cli/gateway-cli/run.option-collisions.test.ts index 95245a91989..b26b4c86e47 100644 --- a/src/cli/gateway-cli/run.option-collisions.test.ts +++ b/src/cli/gateway-cli/run.option-collisions.test.ts @@ -12,6 +12,7 @@ const forceFreePortAndWait = vi.fn(async (_port: number, _opts: unknown) => ({ waitedMs: 0, escalatedToSigkill: false, })); +const waitForPortBindable = vi.fn(async (_port: number, _opts?: unknown) => 0); const ensureDevGatewayConfig = vi.fn(async (_opts?: unknown) => {}); const runGatewayLoop = vi.fn(async ({ start }: { start: () => Promise }) => { await start(); @@ -80,6 +81,7 @@ vi.mock("../command-format.js", () => ({ vi.mock("../ports.js", () => ({ forceFreePortAndWait: (port: number, opts: unknown) => forceFreePortAndWait(port, opts), + waitForPortBindable: (port: number, opts?: unknown) => waitForPortBindable(port, opts), })); vi.mock("./dev.js", () => ({ @@ -108,6 +110,7 @@ describe("gateway run option collisions", () => { setGatewayWsLogStyle.mockClear(); setVerbose.mockClear(); forceFreePortAndWait.mockClear(); + waitForPortBindable.mockClear(); ensureDevGatewayConfig.mockClear(); runGatewayLoop.mockClear(); }); @@ -140,6 +143,10 @@ describe("gateway run option collisions", () => { ]); expect(forceFreePortAndWait).toHaveBeenCalledWith(18789, expect.anything()); + expect(waitForPortBindable).toHaveBeenCalledWith( + 18789, + expect.objectContaining({ host: "127.0.0.1" }), + ); expect(setGatewayWsLogStyle).toHaveBeenCalledWith("full"); expect(startGatewayServer).toHaveBeenCalledWith( 18789, diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts index 291328273e3..666adc289a6 100644 --- a/src/cli/gateway-cli/run.ts +++ b/src/cli/gateway-cli/run.ts @@ -21,7 +21,7 @@ import { createSubsystemLogger } from "../../logging/subsystem.js"; import { defaultRuntime } from "../../runtime.js"; import { formatCliCommand } from "../command-format.js"; import { inheritOptionFromParent } from "../command-options.js"; -import { forceFreePortAndWait } from "../ports.js"; +import { forceFreePortAndWait, waitForPortBindable } from "../ports.js"; import { ensureDevGatewayConfig } from "./dev.js"; import { runGatewayLoop } from "./run-loop.js"; import { @@ -186,6 +186,20 @@ async function runGatewayCommand(opts: GatewayRunOpts) { defaultRuntime.error("Invalid port"); defaultRuntime.exit(1); } + const bindRaw = toOptionString(opts.bind) ?? cfg.gateway?.bind ?? "loopback"; + const bind = + bindRaw === "loopback" || + bindRaw === "lan" || + bindRaw === "auto" || + bindRaw === "custom" || + bindRaw === "tailnet" + ? bindRaw + : null; + if (!bind) { + defaultRuntime.error('Invalid --bind (use "loopback", "lan", "tailnet", "auto", or "custom")'); + defaultRuntime.exit(1); + return; + } if (opts.force) { try { const { killed, waitedMs, escalatedToSigkill } = await forceFreePortAndWait(port, { @@ -208,6 +222,23 @@ async function runGatewayCommand(opts: GatewayRunOpts) { gatewayLog.info(`force: waited ${waitedMs}ms for port ${port} to free`); } } + // After killing, verify the port is actually bindable (handles TIME_WAIT). + const bindProbeHost = + bind === "loopback" + ? "127.0.0.1" + : bind === "lan" + ? "0.0.0.0" + : bind === "custom" + ? toOptionString(cfg.gateway?.customBindHost) + : undefined; + const bindWaitMs = await waitForPortBindable(port, { + timeoutMs: 3000, + intervalMs: 150, + host: bindProbeHost, + }); + if (bindWaitMs > 0) { + gatewayLog.info(`force: waited ${bindWaitMs}ms for port ${port} to become bindable`); + } } catch (err) { defaultRuntime.error(`Force: ${String(err)}`); defaultRuntime.exit(1); @@ -257,21 +288,6 @@ async function runGatewayCommand(opts: GatewayRunOpts) { defaultRuntime.exit(1); return; } - const bindRaw = toOptionString(opts.bind) ?? cfg.gateway?.bind ?? "loopback"; - const bind = - bindRaw === "loopback" || - bindRaw === "lan" || - bindRaw === "auto" || - bindRaw === "custom" || - bindRaw === "tailnet" - ? bindRaw - : null; - if (!bind) { - defaultRuntime.error('Invalid --bind (use "loopback", "lan", "tailnet", "auto", or "custom")'); - defaultRuntime.exit(1); - return; - } - const miskeys = extractGatewayMiskeys(snapshot?.parsed); const authOverride = authMode || passwordRaw || tokenRaw || authModeRaw diff --git a/src/cli/ports.test.ts b/src/cli/ports.test.ts new file mode 100644 index 00000000000..082dfa09a12 --- /dev/null +++ b/src/cli/ports.test.ts @@ -0,0 +1,124 @@ +import { EventEmitter } from "node:events"; +import net from "node:net"; +import { describe, expect, it, vi } from "vitest"; + +// Hoist the factory so vi.mock can access it. +const mockCreateServer = vi.hoisted(() => vi.fn()); + +vi.mock("node:net", async (importOriginal) => { + const actual = await importOriginal(); + return { ...actual, createServer: mockCreateServer }; +}); + +import { probePortFree, waitForPortBindable } from "./ports.js"; + +/** Build a minimal fake net.Server that emits a given error code on listen(). */ +function makeErrServer(code: string): net.Server { + const err = Object.assign(new Error(`bind error: ${code}`), { + code, + }) as NodeJS.ErrnoException; + + const fake = new EventEmitter() as unknown as net.Server; + (fake as unknown as { close: (cb?: () => void) => net.Server }).close = (cb?: () => void) => { + cb?.(); + return fake; + }; + (fake as unknown as { unref: () => net.Server }).unref = () => fake; + (fake as unknown as { listen: (...args: unknown[]) => net.Server }).listen = ( + ..._args: unknown[] + ) => { + setImmediate(() => fake.emit("error", err)); + return fake; + }; + return fake; +} + +describe("probePortFree", () => { + it("resolves false (not rejects) when bind returns EADDRINUSE", async () => { + mockCreateServer.mockReturnValue(makeErrServer("EADDRINUSE")); + await expect(probePortFree(9999, "127.0.0.1")).resolves.toBe(false); + }); + + it("rejects immediately for EADDRNOTAVAIL (non-retryable: host address not on any interface)", async () => { + mockCreateServer.mockReturnValue(makeErrServer("EADDRNOTAVAIL")); + await expect(probePortFree(9999, "192.0.2.1")).rejects.toMatchObject({ code: "EADDRNOTAVAIL" }); + }); + + it("rejects immediately for EACCES (non-retryable bind error)", async () => { + mockCreateServer.mockReturnValue(makeErrServer("EACCES")); + await expect(probePortFree(80, "0.0.0.0")).rejects.toMatchObject({ code: "EACCES" }); + }); + + it("rejects immediately for other non-retryable errors", async () => { + mockCreateServer.mockReturnValue(makeErrServer("EINVAL")); + await expect(probePortFree(9999, "0.0.0.0")).rejects.toMatchObject({ code: "EINVAL" }); + }); + + it("resolves true when the port is free", async () => { + // Mock a successful bind: the "listening" event fires immediately without + // acquiring a real socket, making this deterministic and avoiding TOCTOU races. + // (A real-socket approach would bind to :0, release, then reprobe — the OS can + // reassign the ephemeral port in between, causing a flaky EADDRINUSE failure.) + const fakeServer = new EventEmitter() as unknown as net.Server; + (fakeServer as unknown as { close: (cb?: () => void) => net.Server }).close = ( + cb?: () => void, + ) => { + cb?.(); + return fakeServer; + }; + (fakeServer as unknown as { unref: () => net.Server }).unref = () => fakeServer; + (fakeServer as unknown as { listen: (...args: unknown[]) => net.Server }).listen = ( + ..._args: unknown[] + ) => { + // Simulate a successful bind by firing the "listening" callback. + const callback = _args.find((a) => typeof a === "function") as (() => void) | undefined; + setImmediate(() => callback?.()); + return fakeServer; + }; + mockCreateServer.mockReturnValue(fakeServer); + + const result = await probePortFree(9999, "127.0.0.1"); + expect(result).toBe(true); + }); +}); + +describe("waitForPortBindable", () => { + it("probes the provided host when waiting for bindability", async () => { + const listenCalls: Array<{ port: number; host: string }> = []; + const fakeServer = new EventEmitter() as unknown as net.Server; + (fakeServer as unknown as { close: (cb?: () => void) => net.Server }).close = ( + cb?: () => void, + ) => { + cb?.(); + return fakeServer; + }; + (fakeServer as unknown as { unref: () => net.Server }).unref = () => fakeServer; + (fakeServer as unknown as { listen: (...args: unknown[]) => net.Server }).listen = ( + ...args: unknown[] + ) => { + const [port, host] = args as [number, string]; + listenCalls.push({ port, host }); + const callback = args.find((a) => typeof a === "function") as (() => void) | undefined; + setImmediate(() => callback?.()); + return fakeServer; + }; + mockCreateServer.mockReturnValue(fakeServer); + + await expect( + waitForPortBindable(9999, { timeoutMs: 100, intervalMs: 10, host: "127.0.0.1" }), + ).resolves.toBe(0); + expect(listenCalls[0]).toEqual({ port: 9999, host: "127.0.0.1" }); + }); + + it("propagates EACCES rejection immediately without retrying", async () => { + // Every call to createServer will emit EACCES — so if waitForPortBindable retried, + // mockCreateServer would be called many times. We assert it's called exactly once. + mockCreateServer.mockClear(); + mockCreateServer.mockReturnValue(makeErrServer("EACCES")); + await expect( + waitForPortBindable(80, { timeoutMs: 5000, intervalMs: 50 }), + ).rejects.toMatchObject({ code: "EACCES" }); + // Only one probe should have been attempted — no spinning through the retry loop. + expect(mockCreateServer).toHaveBeenCalledTimes(1); + }); +}); diff --git a/src/cli/ports.ts b/src/cli/ports.ts index e2bfa67aad9..fd829bf21db 100644 --- a/src/cli/ports.ts +++ b/src/cli/ports.ts @@ -1,4 +1,5 @@ import { execFileSync } from "node:child_process"; +import { createServer } from "node:net"; import { resolveLsofCommandSync } from "../infra/ports-lsof.js"; import { tryListenOnPort } from "../infra/ports-probe.js"; import { sleep } from "../utils.js"; @@ -324,3 +325,63 @@ export async function forceFreePortAndWait( `port ${port} still has listeners after --force: ${still.map((p) => p.pid).join(", ")}`, ); } + +/** + * Attempt a real TCP bind to verify the port is available at the OS level. + * Catches TIME_WAIT / kernel-level holds that lsof won't show. + * + * Resolves false only for EADDRINUSE — a genuinely transient condition + * (port still in TIME_WAIT after a --force kill) that the caller should retry. + * + * All other errors are non-retryable and are rejected immediately: + * - EADDRNOTAVAIL: the host address doesn't exist on any local interface + * (hard misconfiguration, not a transient kernel hold). + * - EACCES: bind to a privileged port as non-root. + * - EINVAL, etc.: other unrecoverable OS errors. + */ +export function probePortFree(port: number, host = "0.0.0.0"): Promise { + return new Promise((resolve, reject) => { + const srv = createServer(); + srv.unref(); + srv.once("error", (err: NodeJS.ErrnoException) => { + srv.close(); + if (err.code === "EADDRINUSE") { + // Genuinely transient — port still in use or TIME_WAIT after a --force kill. + resolve(false); + } else { + // Non-retryable: EADDRNOTAVAIL (bad host address), EACCES (privileged port), + // EINVAL, and any other OS errors. Surface immediately; no retry loop. + reject(err); + } + }); + srv.listen(port, host, () => { + srv.close(() => resolve(true)); + }); + }); +} + +/** + * Poll until a real test-bind succeeds, up to `timeoutMs`. + * Returns the number of ms waited, or throws if the port never freed. + */ +export async function waitForPortBindable( + port: number, + opts: { timeoutMs?: number; intervalMs?: number; host?: string } = {}, +): Promise { + const timeoutMs = Math.max(opts.timeoutMs ?? 3000, 0); + const intervalMs = Math.max(opts.intervalMs ?? 150, 1); + const host = opts.host; + let waited = 0; + while (waited < timeoutMs) { + if (await probePortFree(port, host)) { + return waited; + } + await sleep(intervalMs); + waited += intervalMs; + } + // Final attempt + if (await probePortFree(port, host)) { + return waited; + } + throw new Error(`port ${port} still not bindable after ${waited}ms (TIME_WAIT or kernel hold)`); +} diff --git a/src/cli/update-cli/restart-helper.test.ts b/src/cli/update-cli/restart-helper.test.ts index a152f3fdb48..18888c27f53 100644 --- a/src/cli/update-cli/restart-helper.test.ts +++ b/src/cli/update-cli/restart-helper.test.ts @@ -98,6 +98,8 @@ describe("restart-helper", () => { expect(scriptPath.endsWith(".sh")).toBe(true); expect(content).toContain("#!/bin/sh"); expect(content).toContain("launchctl kickstart -k 'gui/501/ai.openclaw.gateway'"); + // Should fall back to bootstrap when kickstart fails (service deregistered after bootout) + expect(content).toContain("launchctl bootstrap 'gui/501'"); expect(content).toContain('rm -f "$0"'); await cleanupScript(scriptPath); }); @@ -223,6 +225,45 @@ describe("restart-helper", () => { await cleanupScript(scriptPath); }); + it("expands HOME in plist path instead of leaving literal $HOME", async () => { + Object.defineProperty(process, "platform", { value: "darwin" }); + process.getuid = () => 501; + + const { scriptPath, content } = await prepareAndReadScript({ + HOME: "/Users/testuser", + OPENCLAW_PROFILE: "default", + }); + // The plist path must contain the resolved home dir, not literal $HOME + expect(content).toMatch(/[\\/]Users[\\/]testuser[\\/]Library[\\/]LaunchAgents[\\/]/); + expect(content).not.toContain("$HOME"); + await cleanupScript(scriptPath); + }); + + it("prefers env parameter HOME over process.env.HOME for plist path", async () => { + Object.defineProperty(process, "platform", { value: "darwin" }); + process.getuid = () => 502; + + const { scriptPath, content } = await prepareAndReadScript({ + HOME: "/Users/envhome", + OPENCLAW_PROFILE: "default", + }); + expect(content).toMatch(/[\\/]Users[\\/]envhome[\\/]Library[\\/]LaunchAgents[\\/]/); + await cleanupScript(scriptPath); + }); + + it("shell-escapes the label in the plist path on macOS", async () => { + Object.defineProperty(process, "platform", { value: "darwin" }); + process.getuid = () => 501; + + const { scriptPath, content } = await prepareAndReadScript({ + HOME: "/Users/testuser", + OPENCLAW_LAUNCHD_LABEL: "ai.openclaw.it's-a-test", + }); + // The plist path must also shell-escape the label to prevent injection + expect(content).toContain("ai.openclaw.it'\\''s-a-test.plist"); + await cleanupScript(scriptPath); + }); + it("rejects unsafe batch profile names on Windows", async () => { Object.defineProperty(process, "platform", { value: "win32" }); const scriptPath = await prepareRestartScript({ diff --git a/src/cli/update-cli/restart-helper.ts b/src/cli/update-cli/restart-helper.ts index cef4e25418b..4f7d45aab0c 100644 --- a/src/cli/update-cli/restart-helper.ts +++ b/src/cli/update-cli/restart-helper.ts @@ -83,12 +83,22 @@ rm -f "$0" const escaped = shellEscape(label); // Fallback to 501 if getuid is not available (though it should be on macOS) const uid = process.getuid ? process.getuid() : 501; + // Resolve HOME at generation time via env/process.env to match launchd.ts, + // and shell-escape the label in the plist filename to prevent injection. + const home = env.HOME?.trim() || process.env.HOME || os.homedir(); + const plistPath = path.join(home, "Library", "LaunchAgents", `${label}.plist`); + const escapedPlistPath = shellEscape(plistPath); filename = `openclaw-restart-${timestamp}.sh`; scriptContent = `#!/bin/sh # Standalone restart script — survives parent process termination. # Wait briefly to ensure file locks are released after update. sleep 1 -launchctl kickstart -k 'gui/${uid}/${escaped}' +# Try kickstart first (works when the service is still registered). +# If it fails (e.g. after bootout), re-register via bootstrap then kickstart. +if ! launchctl kickstart -k 'gui/${uid}/${escaped}' 2>/dev/null; then + launchctl bootstrap 'gui/${uid}' '${escapedPlistPath}' 2>/dev/null + launchctl kickstart -k 'gui/${uid}/${escaped}' 2>/dev/null || true +fi # Self-cleanup rm -f "$0" `; diff --git a/src/gateway/config-reload-plan.ts b/src/gateway/config-reload-plan.ts index 1af87d25020..4ca1fcea7f0 100644 --- a/src/gateway/config-reload-plan.ts +++ b/src/gateway/config-reload-plan.ts @@ -50,6 +50,11 @@ const BASE_RELOAD_RULES: ReloadRule[] = [ kind: "hot", actions: ["restart-heartbeat"], }, + { + prefix: "agents.defaults.models", + kind: "hot", + actions: ["restart-heartbeat"], + }, { prefix: "agents.defaults.model", kind: "hot", diff --git a/src/gateway/config-reload.test.ts b/src/gateway/config-reload.test.ts index e45347b0040..9c4994541e9 100644 --- a/src/gateway/config-reload.test.ts +++ b/src/gateway/config-reload.test.ts @@ -159,6 +159,14 @@ describe("buildGatewayReloadPlan", () => { ); }); + it("restarts heartbeat when agents.defaults.models allowlist changes", () => { + const plan = buildGatewayReloadPlan(["agents.defaults.models"]); + expect(plan.restartGateway).toBe(false); + expect(plan.restartHeartbeat).toBe(true); + expect(plan.hotReasons).toContain("agents.defaults.models"); + expect(plan.noopPaths).toEqual([]); + }); + it("hot-reloads health monitor when channelHealthCheckMinutes changes", () => { const plan = buildGatewayReloadPlan(["gateway.channelHealthCheckMinutes"]); expect(plan.restartGateway).toBe(false); diff --git a/src/gateway/server/http-listen.test.ts b/src/gateway/server/http-listen.test.ts new file mode 100644 index 00000000000..12358713faf --- /dev/null +++ b/src/gateway/server/http-listen.test.ts @@ -0,0 +1,100 @@ +import { EventEmitter } from "node:events"; +import type { Server as HttpServer } from "node:http"; +import { describe, expect, it, vi } from "vitest"; +import { GatewayLockError } from "../../infra/gateway-lock.js"; +import { listenGatewayHttpServer } from "./http-listen.js"; + +const sleepMock = vi.hoisted(() => vi.fn(async (_ms: number) => {})); + +vi.mock("../../utils.js", () => ({ + sleep: (ms: number) => sleepMock(ms), +})); + +type ListenOutcome = { kind: "error"; code: string } | { kind: "listening" }; + +function createFakeHttpServer(outcomes: ListenOutcome[]) { + class FakeHttpServer extends EventEmitter { + public closeCalls = 0; + private attempt = 0; + + listen(_port: number, _host: string) { + const outcome = outcomes[this.attempt] ?? { kind: "listening" }; + this.attempt += 1; + setImmediate(() => { + if (outcome.kind === "error") { + const err = Object.assign(new Error(outcome.code), { code: outcome.code }); + this.emit("error", err); + } else { + this.emit("listening"); + } + }); + return this; + } + + close(cb?: () => void) { + this.closeCalls += 1; + setImmediate(() => cb?.()); + return this; + } + } + + return new FakeHttpServer(); +} + +describe("listenGatewayHttpServer", () => { + it("retries EADDRINUSE and closes server handle before retry", async () => { + sleepMock.mockClear(); + const fake = createFakeHttpServer([ + { kind: "error", code: "EADDRINUSE" }, + { kind: "listening" }, + ]); + + await expect( + listenGatewayHttpServer({ + httpServer: fake as unknown as HttpServer, + bindHost: "127.0.0.1", + port: 18789, + }), + ).resolves.toBeUndefined(); + + expect(fake.closeCalls).toBe(1); + expect(sleepMock).toHaveBeenCalledTimes(1); + }); + + it("throws GatewayLockError after EADDRINUSE retries are exhausted", async () => { + sleepMock.mockClear(); + const fake = createFakeHttpServer([ + { kind: "error", code: "EADDRINUSE" }, + { kind: "error", code: "EADDRINUSE" }, + { kind: "error", code: "EADDRINUSE" }, + { kind: "error", code: "EADDRINUSE" }, + { kind: "error", code: "EADDRINUSE" }, + { kind: "error", code: "EADDRINUSE" }, + ]); + + await expect( + listenGatewayHttpServer({ + httpServer: fake as unknown as HttpServer, + bindHost: "127.0.0.1", + port: 18789, + }), + ).rejects.toBeInstanceOf(GatewayLockError); + + expect(fake.closeCalls).toBe(4); + }); + + it("wraps non-EADDRINUSE errors as GatewayLockError", async () => { + sleepMock.mockClear(); + const fake = createFakeHttpServer([{ kind: "error", code: "EACCES" }]); + + await expect( + listenGatewayHttpServer({ + httpServer: fake as unknown as HttpServer, + bindHost: "127.0.0.1", + port: 18789, + }), + ).rejects.toBeInstanceOf(GatewayLockError); + + expect(fake.closeCalls).toBe(0); + }); +}); diff --git a/src/gateway/server/http-listen.ts b/src/gateway/server/http-listen.ts index c2ae20a879f..0aa9f7b399f 100644 --- a/src/gateway/server/http-listen.ts +++ b/src/gateway/server/http-listen.ts @@ -1,5 +1,19 @@ import type { Server as HttpServer } from "node:http"; import { GatewayLockError } from "../../infra/gateway-lock.js"; +import { sleep } from "../../utils.js"; + +const EADDRINUSE_MAX_RETRIES = 4; +const EADDRINUSE_RETRY_INTERVAL_MS = 500; + +async function closeServerQuietly(httpServer: HttpServer): Promise { + await new Promise((resolve) => { + try { + httpServer.close(() => resolve()); + } catch { + resolve(); + } + }); +} export async function listenGatewayHttpServer(params: { httpServer: HttpServer; @@ -7,31 +21,41 @@ export async function listenGatewayHttpServer(params: { port: number; }) { const { httpServer, bindHost, port } = params; - try { - await new Promise((resolve, reject) => { - const onError = (err: NodeJS.ErrnoException) => { - httpServer.off("listening", onListening); - reject(err); - }; - const onListening = () => { - httpServer.off("error", onError); - resolve(); - }; - httpServer.once("error", onError); - httpServer.once("listening", onListening); - httpServer.listen(port, bindHost); - }); - } catch (err) { - const code = (err as NodeJS.ErrnoException).code; - if (code === "EADDRINUSE") { + + for (let attempt = 0; ; attempt++) { + try { + await new Promise((resolve, reject) => { + const onError = (err: NodeJS.ErrnoException) => { + httpServer.off("listening", onListening); + reject(err); + }; + const onListening = () => { + httpServer.off("error", onError); + resolve(); + }; + httpServer.once("error", onError); + httpServer.once("listening", onListening); + httpServer.listen(port, bindHost); + }); + return; // bound successfully + } catch (err) { + const code = (err as NodeJS.ErrnoException).code; + if (code === "EADDRINUSE" && attempt < EADDRINUSE_MAX_RETRIES) { + // Port may still be in TIME_WAIT after a recent process exit; retry. + await closeServerQuietly(httpServer); + await sleep(EADDRINUSE_RETRY_INTERVAL_MS); + continue; + } + if (code === "EADDRINUSE") { + throw new GatewayLockError( + `another gateway instance is already listening on ws://${bindHost}:${port}`, + err, + ); + } throw new GatewayLockError( - `another gateway instance is already listening on ws://${bindHost}:${port}`, + `failed to bind gateway socket on ws://${bindHost}:${port}: ${String(err)}`, err, ); } - throw new GatewayLockError( - `failed to bind gateway socket on ws://${bindHost}:${port}: ${String(err)}`, - err, - ); } } diff --git a/src/infra/restart-stale-pids.test.ts b/src/infra/restart-stale-pids.test.ts new file mode 100644 index 00000000000..f7bf0709d9f --- /dev/null +++ b/src/infra/restart-stale-pids.test.ts @@ -0,0 +1,810 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +// This entire file tests lsof-based Unix port polling. The feature is a deliberate +// no-op on Windows (findGatewayPidsOnPortSync returns [] immediately). Running these +// tests on a Windows CI runner would require lsof which does not exist there, so we +// skip the suite entirely and rely on the Linux/macOS runners for coverage. +const isWindows = process.platform === "win32"; + +const mockSpawnSync = vi.hoisted(() => vi.fn()); +const mockResolveGatewayPort = vi.hoisted(() => vi.fn(() => 18789)); +const mockRestartWarn = vi.hoisted(() => vi.fn()); + +vi.mock("node:child_process", () => ({ + spawnSync: (...args: unknown[]) => mockSpawnSync(...args), + execFileSync: vi.fn(), +})); + +vi.mock("../config/paths.js", () => ({ + resolveGatewayPort: () => mockResolveGatewayPort(), +})); + +vi.mock("./ports-lsof.js", () => ({ + resolveLsofCommandSync: vi.fn(() => "lsof"), +})); + +vi.mock("../logging/subsystem.js", () => ({ + createSubsystemLogger: vi.fn(() => ({ + warn: (...args: unknown[]) => mockRestartWarn(...args), + info: vi.fn(), + error: vi.fn(), + })), +})); + +import { resolveLsofCommandSync } from "./ports-lsof.js"; +import { + __testing, + cleanStaleGatewayProcessesSync, + findGatewayPidsOnPortSync, +} from "./restart-stale-pids.js"; + +function lsofOutput(entries: Array<{ pid: number; cmd: string }>): string { + return entries.map(({ pid, cmd }) => `p${pid}\nc${cmd}`).join("\n") + "\n"; +} + +describe.skipIf(isWindows)("restart-stale-pids", () => { + beforeEach(() => { + mockSpawnSync.mockReset(); + mockResolveGatewayPort.mockReset(); + mockRestartWarn.mockReset(); + mockResolveGatewayPort.mockReturnValue(18789); + __testing.setSleepSyncOverride(() => {}); + }); + + afterEach(() => { + __testing.setSleepSyncOverride(null); + __testing.setDateNowOverride(null); + vi.restoreAllMocks(); + }); + + // ------------------------------------------------------------------------- + // findGatewayPidsOnPortSync + // ------------------------------------------------------------------------- + describe("findGatewayPidsOnPortSync", () => { + it("returns [] when lsof exits with non-zero status", () => { + mockSpawnSync.mockReturnValue({ error: null, status: 1, stdout: "", stderr: "" }); + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + }); + + it("logs warning when initial lsof scan exits with status > 1", () => { + mockSpawnSync.mockReturnValue({ error: null, status: 2, stdout: "", stderr: "lsof error" }); + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + expect(mockRestartWarn).toHaveBeenCalledWith( + expect.stringContaining("lsof exited with status 2"), + ); + }); + + it("returns [] when lsof returns an error object (e.g. ENOENT)", () => { + mockSpawnSync.mockReturnValue({ + error: new Error("ENOENT"), + status: null, + stdout: "", + stderr: "", + }); + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + expect(mockRestartWarn).toHaveBeenCalledWith( + expect.stringContaining("lsof failed during initial stale-pid scan"), + ); + }); + + it("parses openclaw-gateway pids and excludes the current process", () => { + const stalePid = process.pid + 1; + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: lsofOutput([ + { pid: stalePid, cmd: "openclaw-gateway" }, + { pid: process.pid, cmd: "openclaw-gateway" }, + ]), + stderr: "", + }); + const pids = findGatewayPidsOnPortSync(18789); + expect(pids).toContain(stalePid); + expect(pids).not.toContain(process.pid); + }); + + it("excludes pids whose command does not include 'openclaw'", () => { + const otherPid = process.pid + 2; + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: lsofOutput([{ pid: otherPid, cmd: "nginx" }]), + stderr: "", + }); + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + }); + + it("forwards the spawnTimeoutMs argument to spawnSync", () => { + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout: "", stderr: "" }); + findGatewayPidsOnPortSync(18789, 400); + expect(mockSpawnSync).toHaveBeenCalledWith( + "lsof", + expect.any(Array), + expect.objectContaining({ timeout: 400 }), + ); + }); + + it("deduplicates pids from dual-stack listeners (IPv4+IPv6 emit same pid twice)", () => { + // Dual-stack listeners cause lsof to emit the same PID twice in -Fpc output + // (once for the IPv4 socket, once for IPv6). Without dedup, terminateStaleProcessesSync + // sends SIGTERM twice and returns killed=[pid, pid], corrupting the count. + const stalePid = process.pid + 600; + const stdout = `p${stalePid}\ncopenclaw-gateway\np${stalePid}\ncopenclaw-gateway\n`; + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" }); + const result = findGatewayPidsOnPortSync(18789); + expect(result).toEqual([stalePid]); // deduped — not [pid, pid] + }); + + it("returns [] and skips lsof on win32", () => { + // The entire describe block is skipped on Windows (isWindows guard at top), + // so this test only runs on Linux/macOS. It mocks platform to win32 for the + // single assertion without needing to restore — the suite-level skipIf means + // this will never run on an actual Windows runner where the mock could leak. + const origDescriptor = Object.getOwnPropertyDescriptor(process, "platform"); + Object.defineProperty(process, "platform", { value: "win32", configurable: true }); + try { + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + expect(mockSpawnSync).not.toHaveBeenCalled(); + } finally { + if (origDescriptor) { + Object.defineProperty(process, "platform", origDescriptor); + } + } + }); + }); + + // ------------------------------------------------------------------------- + // parsePidsFromLsofOutput — pure unit tests (no I/O, driven via spawnSync mock) + // ------------------------------------------------------------------------- + describe("parsePidsFromLsofOutput (via findGatewayPidsOnPortSync stdout path)", () => { + it("returns [] for empty lsof stdout (status 0, nothing listening)", () => { + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout: "", stderr: "" }); + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + }); + + it("parses multiple openclaw pids from a single lsof output block", () => { + const pid1 = process.pid + 10; + const pid2 = process.pid + 11; + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: lsofOutput([ + { pid: pid1, cmd: "openclaw-gateway" }, + { pid: pid2, cmd: "openclaw-gateway" }, + ]), + stderr: "", + }); + const result = findGatewayPidsOnPortSync(18789); + expect(result).toContain(pid1); + expect(result).toContain(pid2); + }); + + it("returns [] when status 0 but only non-openclaw pids present", () => { + // Port may be bound by an unrelated process. findGatewayPidsOnPortSync + // only tracks openclaw processes — non-openclaw listeners are ignored. + const otherPid = process.pid + 50; + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: lsofOutput([{ pid: otherPid, cmd: "caddy" }]), + stderr: "", + }); + expect(findGatewayPidsOnPortSync(18789)).toEqual([]); + }); + }); + + // ------------------------------------------------------------------------- + // pollPortOnce (via cleanStaleGatewayProcessesSync) — Codex P1 regression + // ------------------------------------------------------------------------- + describe("pollPortOnce — no second lsof spawn (Codex P1 regression)", () => { + it("treats lsof exit status 1 as port-free (no listeners)", () => { + // lsof exits with status 1 when no matching processes are found — this is + // the canonical "port is free" signal, not an error. + const stalePid = process.pid + 500; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + // Poll returns status 1 — no listeners + return { error: null, status: 1, stdout: "", stderr: "" }; + }); + vi.spyOn(process, "kill").mockReturnValue(true); + // Should complete cleanly (port reported free on status 1) + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + }); + + it("treats lsof exit status >1 as inconclusive, not port-free — Codex P2 regression", () => { + // Codex P2: non-zero lsof exits other than status 1 (e.g. permission denied, + // bad flag, runtime error) must not be mapped to free:true. They are + // inconclusive and should keep the polling loop running until budget expires. + const stalePid = process.pid + 501; + let call = 0; + const events: string[] = []; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + events.push("initial-find"); + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + if (call === 2) { + // Permission/runtime error — status 2, should NOT be treated as free + events.push("error-poll"); + return { error: null, status: 2, stdout: "", stderr: "lsof: permission denied" }; + } + // Eventually port is free + events.push("free-poll"); + return { error: null, status: 1, stdout: "", stderr: "" }; + }); + vi.spyOn(process, "kill").mockReturnValue(true); + cleanStaleGatewayProcessesSync(); + + // Must have continued polling after the status-2 error, not exited early + expect(events).toContain("free-poll"); + }); + + it("does not make a second lsof call when the first returns status 0", () => { + // The bug: pollPortOnce previously called findGatewayPidsOnPortSync as a + // second probe after getting status===0 from the first lsof. That second + // call collapses any error/timeout back into [], which maps to free:true — + // silently misclassifying an inconclusive result as "port is free". + // + // The fix: pollPortOnce now parses res.stdout directly from the first + // spawnSync call. Exactly ONE lsof invocation per poll cycle. + const stalePid = process.pid + 400; + let spawnCount = 0; + mockSpawnSync.mockImplementation(() => { + spawnCount++; + if (spawnCount === 1) { + // Initial findGatewayPidsOnPortSync — returns stale pid + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + if (spawnCount === 2) { + // First waitForPortFreeSync poll — status 0, port busy (should parse inline, not spawn again) + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + // Port free on third call + return { error: null, status: 0, stdout: "", stderr: "" }; + }); + + vi.spyOn(process, "kill").mockReturnValue(true); + cleanStaleGatewayProcessesSync(); + + // If pollPortOnce made a second lsof call internally, spawnCount would + // be at least 4 (initial + 2 polls each doubled). With the fix, each poll + // is exactly one spawn: initial(1) + busy-poll(1) + free-poll(1) = 3. + expect(spawnCount).toBe(3); + }); + + it("lsof status 1 with non-empty openclaw stdout is treated as busy, not free (Linux container edge case)", () => { + // On Linux containers with restricted /proc (AppArmor, seccomp, user namespaces), + // lsof can exit 1 AND still emit output for processes it could read. + // status 1 + non-empty openclaw stdout must not be treated as port-free. + const stalePid = process.pid + 601; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + // Initial scan: finds stale pid + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + if (call === 2) { + // status 1 + openclaw pid in stdout — container-restricted lsof reports partial results + return { + error: null, + status: 1, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "lsof: WARNING: can't stat() fuse", + }; + } + // Third poll: port is genuinely free + return { error: null, status: 1, stdout: "", stderr: "" }; + }); + vi.spyOn(process, "kill").mockReturnValue(true); + cleanStaleGatewayProcessesSync(); + // Poll 2 returned busy (not free), so we must have polled at least 3 times + expect(call).toBeGreaterThanOrEqual(3); + }); + + it("pollPortOnce outer catch returns { free: null, permanent: false } when resolveLsofCommandSync throws", () => { + // If resolveLsofCommandSync throws (e.g. lsof resolution fails at runtime), + // pollPortOnce must catch it and return the transient-inconclusive result + // rather than propagating the exception. + const stalePid = process.pid + 402; + const mockedResolveLsof = vi.mocked(resolveLsofCommandSync); + + mockedResolveLsof.mockImplementationOnce(() => { + // First call: initial findGatewayPidsOnPortSync — succeed normally + return "lsof"; + }); + + mockSpawnSync.mockImplementationOnce(() => { + // Initial scan: finds stale pid + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + }); + + // Second call: poll — resolveLsofCommandSync throws + mockedResolveLsof.mockImplementationOnce(() => { + throw new Error("lsof binary resolution failed"); + }); + + // Third call: poll — port is free + mockedResolveLsof.mockImplementation(() => "lsof"); + mockSpawnSync.mockImplementation(() => ({ error: null, status: 1, stdout: "", stderr: "" })); + + vi.spyOn(process, "kill").mockReturnValue(true); + // Must not throw — the catch path returns transient inconclusive, loop continues + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + }); + }); + + // ------------------------------------------------------------------------- + // cleanStaleGatewayProcessesSync + // ------------------------------------------------------------------------- + describe("cleanStaleGatewayProcessesSync", () => { + it("returns [] and does not call process.kill when port has no listeners", () => { + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout: "", stderr: "" }); + const killSpy = vi.spyOn(process, "kill").mockReturnValue(true); + expect(cleanStaleGatewayProcessesSync()).toEqual([]); + expect(killSpy).not.toHaveBeenCalled(); + }); + + it("sends SIGTERM to stale pids and returns them", () => { + const stalePid = process.pid + 100; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + // waitForPortFreeSync polls: port free immediately + return { error: null, status: 0, stdout: "", stderr: "" }; + }); + + const killSpy = vi.spyOn(process, "kill").mockReturnValue(true); + const result = cleanStaleGatewayProcessesSync(); + + expect(result).toContain(stalePid); + expect(killSpy).toHaveBeenCalledWith(stalePid, "SIGTERM"); + }); + + it("escalates to SIGKILL when process survives the SIGTERM window", () => { + const stalePid = process.pid + 101; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call <= 5) { + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + return { error: null, status: 0, stdout: "", stderr: "" }; + }); + + const killSpy = vi.spyOn(process, "kill").mockReturnValue(true); + cleanStaleGatewayProcessesSync(); + + expect(killSpy).toHaveBeenCalledWith(stalePid, "SIGTERM"); + expect(killSpy).toHaveBeenCalledWith(stalePid, "SIGKILL"); + }); + + it("polls until port is confirmed free before returning — regression for #33103", () => { + // Core regression: cleanStaleGatewayProcessesSync must not return while + // the port is still bound. Previously it returned after a fixed 500ms + // sleep regardless of port state, causing systemd's new process to hit + // EADDRINUSE and enter an unbounded restart loop. + const stalePid = process.pid + 200; + const events: string[] = []; + let call = 0; + + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + events.push("initial-find"); + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + if (call <= 4) { + events.push(`busy-poll-${call}`); + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + events.push("port-free"); + return { error: null, status: 0, stdout: "", stderr: "" }; + }); + + vi.spyOn(process, "kill").mockReturnValue(true); + cleanStaleGatewayProcessesSync(); + + expect(events).toContain("port-free"); + expect(events.filter((e) => e.startsWith("busy-poll")).length).toBeGreaterThan(0); + }); + + it("bails immediately when lsof is permanently unavailable (ENOENT) — Greptile edge case", () => { + // Regression for the edge case identified in PR review: lsof returning an + // error must not be treated as "port free". ENOENT means lsof is not + // installed — a permanent condition. The polling loop should bail + // immediately on ENOENT rather than spinning the full 2-second budget. + const stalePid = process.pid + 300; + const events: string[] = []; + let call = 0; + + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + events.push("initial-find"); + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + // Permanent ENOENT — lsof is not installed + events.push(`enoent-poll-${call}`); + const err = new Error("lsof not found") as NodeJS.ErrnoException; + err.code = "ENOENT"; + return { error: err, status: null, stdout: "", stderr: "" }; + }); + + vi.spyOn(process, "kill").mockReturnValue(true); + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + + // Must bail after first ENOENT poll — no point retrying a missing binary + const enoentPolls = events.filter((e) => e.startsWith("enoent-poll")); + expect(enoentPolls.length).toBe(1); + }); + + it("bails immediately when lsof is permanently unavailable (EPERM) — SELinux/AppArmor", () => { + // EPERM occurs when lsof exists but a MAC policy (SELinux/AppArmor) blocks + // execution. Like ENOENT/EACCES, this is permanent — retrying is pointless. + const stalePid = process.pid + 305; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + const err = new Error("lsof eperm") as NodeJS.ErrnoException; + err.code = "EPERM"; + return { error: err, status: null, stdout: "", stderr: "" }; + }); + vi.spyOn(process, "kill").mockReturnValue(true); + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + // Must bail after exactly 1 EPERM poll — same as ENOENT/EACCES + expect(call).toBe(2); // 1 initial find + 1 EPERM poll + }); + + it("bails immediately when lsof is permanently unavailable (EACCES) — same as ENOENT", () => { + // EACCES and EPERM are also permanent conditions — lsof exists but the + // process has no permission to run it. No point retrying. + const stalePid = process.pid + 302; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + const err = new Error("lsof permission denied") as NodeJS.ErrnoException; + err.code = "EACCES"; + return { error: err, status: null, stdout: "", stderr: "" }; + }); + vi.spyOn(process, "kill").mockReturnValue(true); + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + // Should have bailed after exactly 1 poll call (the EACCES one) + expect(call).toBe(2); // 1 initial find + 1 EACCES poll + }); + + it("proceeds with warning when polling budget is exhausted — fake clock, no real 2s wait", () => { + // Sub-agent audit HIGH finding: the original test relied on real wall-clock + // time (Date.now() + 2000ms deadline), burning 2 full seconds of CI time + // every run. Fix: expose dateNowOverride in __testing so the deadline can + // be synthesised instantly, keeping the test under 10ms. + const stalePid = process.pid + 303; + let fakeNow = 0; + __testing.setDateNowOverride(() => fakeNow); + + mockSpawnSync.mockImplementation(() => { + // Advance clock by PORT_FREE_TIMEOUT_MS + 1ms on first poll to trip the deadline. + fakeNow += 2001; + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + }); + + vi.spyOn(process, "kill").mockReturnValue(true); + // Must return without throwing (proceeds with warning after budget expires) + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + }); + + it("still polls for port-free when all stale pids were already dead at SIGTERM time", () => { + // Sub-agent audit MEDIUM finding: if all pids from the initial scan are + // already dead before SIGTERM runs (race), terminateStaleProcessesSync + // returns killed=[] — but cleanStaleGatewayProcessesSync MUST still call + // waitForPortFreeSync. The process may have exited on its own while + // leaving its socket in TIME_WAIT / FIN_WAIT. Skipping the poll would + // silently recreate the EADDRINUSE race we are fixing. + const stalePid = process.pid + 304; + let call = 0; + const events: string[] = []; + + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + // Initial scan: finds stale pid + events.push("initial-find"); + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + // Port is already free on first poll — pid was dead before SIGTERM + events.push("poll-free"); + return { error: null, status: 1, stdout: "", stderr: "" }; + }); + + // All SIGTERMs throw ESRCH — pid already gone + vi.spyOn(process, "kill").mockImplementation(() => { + throw Object.assign(new Error("ESRCH"), { code: "ESRCH" }); + }); + + cleanStaleGatewayProcessesSync(); + + // waitForPortFreeSync must still have fired even though killed=[] + expect(events).toContain("poll-free"); + }); + + it("continues polling on transient lsof errors (not ENOENT) — Codex P1 fix", () => { + // A transient lsof error (spawnSync timeout, status 2, etc.) must NOT abort + // the polling loop. The loop should keep retrying until the budget expires + // or a definitive result is returned. Bailing on the first transient error + // would recreate the EADDRINUSE race this PR is designed to prevent. + const stalePid = process.pid + 301; + const events: string[] = []; + let call = 0; + + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + events.push("initial-find"); + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + if (call === 2) { + // Transient: spawnSync timeout (no ENOENT code) + events.push("transient-error"); + return { error: new Error("timeout"), status: null, stdout: "", stderr: "" }; + } + // Port free on the next poll + events.push("port-free"); + return { error: null, status: 1, stdout: "", stderr: "" }; + }); + + vi.spyOn(process, "kill").mockReturnValue(true); + cleanStaleGatewayProcessesSync(); + + // Must have kept polling after the transient error and reached port-free + expect(events).toContain("transient-error"); + expect(events).toContain("port-free"); + }); + + it("returns gracefully when resolveGatewayPort throws", () => { + mockResolveGatewayPort.mockImplementationOnce(() => { + throw new Error("config read error"); + }); + expect(cleanStaleGatewayProcessesSync()).toEqual([]); + }); + + it("returns gracefully when lsof is unavailable from the start", () => { + mockSpawnSync.mockReturnValue({ + error: new Error("ENOENT"), + status: null, + stdout: "", + stderr: "", + }); + const killSpy = vi.spyOn(process, "kill").mockReturnValue(true); + expect(cleanStaleGatewayProcessesSync()).toEqual([]); + expect(killSpy).not.toHaveBeenCalled(); + }); + }); + + // ------------------------------------------------------------------------- + // parsePidsFromLsofOutput — branch-coverage for mid-loop && short-circuits + // ------------------------------------------------------------------------- + describe("parsePidsFromLsofOutput — branch coverage (lines 67-69)", () => { + it("skips a mid-loop entry when the command does not include 'openclaw'", () => { + // Exercises the false branch of currentCmd.toLowerCase().includes("openclaw") + // inside the mid-loop flush: a non-openclaw cmd between two entries must not + // be pushed, but the following openclaw entry still must be. + const stalePid = process.pid + 700; + // Mixed output: non-openclaw entry first, then openclaw entry + const stdout = `p${process.pid + 699}\ncnginx\np${stalePid}\ncopenclaw-gateway\n`; + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" }); + const result = findGatewayPidsOnPortSync(18789); + expect(result).toContain(stalePid); + expect(result).not.toContain(process.pid + 699); + }); + + it("skips a mid-loop entry when currentCmd is missing (two consecutive p-lines)", () => { + // Exercises currentCmd falsy branch mid-loop: two 'p' lines in a row + // (no 'c' line between them) — the first PID must be skipped, the second handled. + const stalePid = process.pid + 701; + // Two consecutive p-lines: first has no c-line before the next p-line + const stdout = `p${process.pid + 702}\np${stalePid}\ncopenclaw-gateway\n`; + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" }); + const result = findGatewayPidsOnPortSync(18789); + expect(result).toContain(stalePid); + }); + + it("ignores a p-line with an invalid (non-positive) PID — ternary false branch", () => { + // Exercises the `Number.isFinite(parsed) && parsed > 0 ? parsed : undefined` + // false branch: a malformed 'p' line (e.g. 'p0' or 'pNaN') must not corrupt + // currentPid and must not end up in the returned pids array. + const stalePid = process.pid + 703; + // p0 is invalid (not > 0); the following valid openclaw entry must still be found. + const stdout = `p0\ncopenclaw-gateway\np${stalePid}\ncopenclaw-gateway\n`; + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" }); + const result = findGatewayPidsOnPortSync(18789); + expect(result).toContain(stalePid); + expect(result).not.toContain(0); + }); + + it("silently skips lines that start with neither 'p' nor 'c' — else-if false branch", () => { + // lsof -Fpc only emits 'p' and 'c' lines, but defensive handling of + // unexpected output (e.g. 'f' for file descriptor in other lsof formats) + // must not throw or corrupt the pid list. Unknown lines are just skipped. + const stalePid = process.pid + 704; + // Intersperse an 'f' line (file descriptor marker) — not a 'p' or 'c' line + const stdout = `p${stalePid}\nf8\ncopenclaw-gateway\n`; + mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" }); + const result = findGatewayPidsOnPortSync(18789); + // The 'f' line must not corrupt parsing; stalePid must still be found + // (the 'c' line after 'f' correctly sets currentCmd) + expect(result).toContain(stalePid); + }); + }); + + // ------------------------------------------------------------------------- + // pollPortOnce branch — status 1 + non-empty stdout with zero openclaw pids + // ------------------------------------------------------------------------- + describe("pollPortOnce — status 1 + non-empty non-openclaw stdout (line 145)", () => { + it("treats status 1 + non-openclaw stdout as port-free (not an openclaw process)", () => { + // status 1 + non-empty stdout where no openclaw pids are present: + // the port may be held by an unrelated process. From our perspective + // (we only kill openclaw pids) it is effectively free. + const stalePid = process.pid + 800; + let call = 0; + mockSpawnSync.mockImplementation(() => { + call++; + if (call === 1) { + return { + error: null, + status: 0, + stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), + stderr: "", + }; + } + // status 1 + non-openclaw output — should be treated as free:true for our purposes + return { + error: null, + status: 1, + stdout: lsofOutput([{ pid: process.pid + 801, cmd: "caddy" }]), + stderr: "", + }; + }); + vi.spyOn(process, "kill").mockReturnValue(true); + // Should complete cleanly — no openclaw pids in status-1 output → free + expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); + // Completed in exactly 2 calls (initial find + 1 free poll) + expect(call).toBe(2); + }); + }); + + // ------------------------------------------------------------------------- + // sleepSync — direct unit tests via __testing.callSleepSyncRaw + // ------------------------------------------------------------------------- + describe("sleepSync — Atomics.wait paths", () => { + it("returns immediately when called with 0ms (timeoutMs <= 0 early return)", () => { + // sleepSync(0) must short-circuit before touching Atomics.wait. + // Verify it does not throw and returns synchronously. + __testing.setSleepSyncOverride(null); // bypass override so real path runs + expect(() => __testing.callSleepSyncRaw(0)).not.toThrow(); + }); + + it("returns immediately when called with a negative value (Math.max(0,...) clamp)", () => { + __testing.setSleepSyncOverride(null); + expect(() => __testing.callSleepSyncRaw(-1)).not.toThrow(); + }); + + it("executes the Atomics.wait path successfully when called with a positive timeout", () => { + // Verify the real Atomics.wait code path runs without error. + // Use 1ms to keep the test fast; Atomics.wait resolves immediately + // because the timeout expires in 1ms. + __testing.setSleepSyncOverride(null); + expect(() => __testing.callSleepSyncRaw(1)).not.toThrow(); + }); + + it("falls back to busy-wait when Atomics.wait throws (Worker / sandboxed env)", () => { + // Atomics.wait throws in Worker threads and some sandboxed runtimes. + // The catch branch must handle this without propagating the exception. + const origWait = Atomics.wait; + Atomics.wait = () => { + throw new Error("not on main thread"); + }; + __testing.setSleepSyncOverride(null); + try { + // 1ms is enough to exercise the busy-wait loop without slowing CI. + expect(() => __testing.callSleepSyncRaw(1)).not.toThrow(); + } finally { + Atomics.wait = origWait; + __testing.setSleepSyncOverride(() => {}); + } + }); + }); +}); diff --git a/src/infra/restart-stale-pids.ts b/src/infra/restart-stale-pids.ts index bbab76f8374..c6c9535c737 100644 --- a/src/infra/restart-stale-pids.ts +++ b/src/infra/restart-stale-pids.ts @@ -4,11 +4,31 @@ import { createSubsystemLogger } from "../logging/subsystem.js"; import { resolveLsofCommandSync } from "./ports-lsof.js"; const SPAWN_TIMEOUT_MS = 2000; -const STALE_SIGTERM_WAIT_MS = 300; -const STALE_SIGKILL_WAIT_MS = 200; +const STALE_SIGTERM_WAIT_MS = 600; +const STALE_SIGKILL_WAIT_MS = 400; +/** + * After SIGKILL, the kernel may not release the TCP port immediately. + * Poll until the port is confirmed free (or until the budget expires) before + * returning control to the caller (typically `triggerOpenClawRestart` → + * `systemctl restart`). Without this wait the new process races the dying + * process for the port and systemd enters an EADDRINUSE restart loop. + * + * POLL_SPAWN_TIMEOUT_MS is intentionally much shorter than SPAWN_TIMEOUT_MS + * so that a single slow or hung lsof invocation does not consume the entire + * polling budget. At 400 ms per call, up to five independent lsof attempts + * fit within PORT_FREE_TIMEOUT_MS = 2000 ms, each with a definitive outcome. + */ +const PORT_FREE_POLL_INTERVAL_MS = 50; +const PORT_FREE_TIMEOUT_MS = 2000; +const POLL_SPAWN_TIMEOUT_MS = 400; const restartLog = createSubsystemLogger("restart"); let sleepSyncOverride: ((ms: number) => void) | null = null; +let dateNowOverride: (() => number) | null = null; + +function getTimeMs(): number { + return dateNowOverride ? dateNowOverride() : Date.now(); +} function sleepSync(ms: number): void { const timeoutMs = Math.max(0, Math.floor(ms)); @@ -31,25 +51,14 @@ function sleepSync(ms: number): void { } /** - * Find PIDs of gateway processes listening on the given port using synchronous lsof. - * Returns only PIDs that belong to openclaw gateway processes (not the current process). + * Parse openclaw gateway PIDs from lsof -Fpc stdout. + * Pure function — no I/O. Excludes the current process. */ -export function findGatewayPidsOnPortSync(port: number): number[] { - if (process.platform === "win32") { - return []; - } - const lsof = resolveLsofCommandSync(); - const res = spawnSync(lsof, ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-Fpc"], { - encoding: "utf8", - timeout: SPAWN_TIMEOUT_MS, - }); - if (res.error || res.status !== 0) { - return []; - } +function parsePidsFromLsofOutput(stdout: string): number[] { const pids: number[] = []; let currentPid: number | undefined; let currentCmd: string | undefined; - for (const line of res.stdout.split(/\r?\n/).filter(Boolean)) { + for (const line of stdout.split(/\r?\n/).filter(Boolean)) { if (line.startsWith("p")) { if (currentPid != null && currentCmd && currentCmd.toLowerCase().includes("openclaw")) { pids.push(currentPid); @@ -64,17 +73,117 @@ export function findGatewayPidsOnPortSync(port: number): number[] { if (currentPid != null && currentCmd && currentCmd.toLowerCase().includes("openclaw")) { pids.push(currentPid); } - return pids.filter((pid) => pid !== process.pid); + // Deduplicate: dual-stack listeners (IPv4 + IPv6) cause lsof to emit the + // same PID twice. Return each PID at most once to avoid double-killing. + return [...new Set(pids)].filter((pid) => pid !== process.pid); +} + +/** + * Find PIDs of gateway processes listening on the given port using synchronous lsof. + * Returns only PIDs that belong to openclaw gateway processes (not the current process). + */ +export function findGatewayPidsOnPortSync( + port: number, + spawnTimeoutMs = SPAWN_TIMEOUT_MS, +): number[] { + if (process.platform === "win32") { + return []; + } + const lsof = resolveLsofCommandSync(); + const res = spawnSync(lsof, ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-Fpc"], { + encoding: "utf8", + timeout: spawnTimeoutMs, + }); + if (res.error) { + const code = (res.error as NodeJS.ErrnoException).code; + const detail = + code && code.trim().length > 0 + ? code + : res.error instanceof Error + ? res.error.message + : "unknown error"; + restartLog.warn(`lsof failed during initial stale-pid scan for port ${port}: ${detail}`); + return []; + } + if (res.status === 1) { + return []; + } + if (res.status !== 0) { + restartLog.warn( + `lsof exited with status ${res.status} during initial stale-pid scan for port ${port}; skipping stale pid check`, + ); + return []; + } + return parsePidsFromLsofOutput(res.stdout); +} + +/** + * Attempt a single lsof poll for the given port. + * + * Returns a discriminated union with four possible states: + * + * { free: true } — port confirmed free + * { free: false } — port confirmed busy + * { free: null; permanent: false } — transient error, keep retrying + * { free: null; permanent: true } — lsof unavailable (ENOENT / EACCES), + * no point retrying + * + * Separating transient from permanent errors is critical so that: + * 1. A slow/timed-out lsof call (transient) does not abort the polling loop — + * the caller retries until the wall-clock budget expires. + * 2. Non-zero lsof exits from runtime/permission failures (status > 1) are + * not misclassified as "port free" — they are inconclusive and retried. + * 3. A missing lsof binary (permanent) short-circuits cleanly rather than + * spinning the full budget pointlessly. + */ +type PollResult = { free: true } | { free: false } | { free: null; permanent: boolean }; + +function pollPortOnce(port: number): PollResult { + try { + const lsof = resolveLsofCommandSync(); + const res = spawnSync(lsof, ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-Fpc"], { + encoding: "utf8", + timeout: POLL_SPAWN_TIMEOUT_MS, + }); + if (res.error) { + // Spawn-level failure. ENOENT / EACCES means lsof is permanently + // unavailable on this system; other errors (e.g. timeout) are transient. + const code = (res.error as NodeJS.ErrnoException).code; + const permanent = code === "ENOENT" || code === "EACCES" || code === "EPERM"; + return { free: null, permanent }; + } + if (res.status === 1) { + // lsof canonical "no matching processes" exit — port is genuinely free. + // Guard: on Linux containers with restricted /proc (AppArmor, seccomp, + // user namespaces), lsof can exit 1 AND still emit some output for the + // processes it could read. Parse stdout when non-empty to avoid false-free. + if (res.stdout) { + const pids = parsePidsFromLsofOutput(res.stdout); + return pids.length === 0 ? { free: true } : { free: false }; + } + return { free: true }; + } + if (res.status !== 0) { + // status > 1: runtime/permission/flag error. Cannot confirm port state — + // treat as a transient failure and keep polling rather than falsely + // reporting the port as free (which would recreate the EADDRINUSE race). + return { free: null, permanent: false }; + } + // status === 0: lsof found listeners. Parse pids from the stdout we + // already hold — no second lsof spawn, no new failure surface. + const pids = parsePidsFromLsofOutput(res.stdout); + return pids.length === 0 ? { free: true } : { free: false }; + } catch { + return { free: null, permanent: false }; + } } /** * Synchronously terminate stale gateway processes. + * Callers must pass a non-empty pids array. * Sends SIGTERM, waits briefly, then SIGKILL for survivors. */ function terminateStaleProcessesSync(pids: number[]): number[] { - if (pids.length === 0) { - return []; - } const killed: number[] = []; for (const pid of pids) { try { @@ -100,8 +209,48 @@ function terminateStaleProcessesSync(pids: number[]): number[] { return killed; } +/** + * Poll the given port until it is confirmed free, lsof is confirmed unavailable, + * or the wall-clock budget expires. + * + * Each poll invocation uses POLL_SPAWN_TIMEOUT_MS (400 ms), which is + * significantly shorter than PORT_FREE_TIMEOUT_MS (2000 ms). This ensures + * that a single slow or hung lsof call cannot consume the entire polling + * budget and cause the function to exit prematurely with an inconclusive + * result. Up to five independent lsof attempts fit within the budget. + * + * Exit conditions: + * - `pollPortOnce` returns `{ free: true }` → port confirmed free + * - `pollPortOnce` returns `{ free: null, permanent: true }` → lsof unavailable, bail + * - `pollPortOnce` returns `{ free: false }` → port busy, sleep + retry + * - `pollPortOnce` returns `{ free: null, permanent: false }` → transient error, sleep + retry + * - Wall-clock deadline exceeded → log warning, proceed anyway + */ +function waitForPortFreeSync(port: number): void { + const deadline = getTimeMs() + PORT_FREE_TIMEOUT_MS; + while (getTimeMs() < deadline) { + const result = pollPortOnce(port); + if (result.free === true) { + return; + } + if (result.free === null && result.permanent) { + // lsof is permanently unavailable (ENOENT / EACCES) — bail immediately, + // no point spinning the remaining budget. + return; + } + // result.free === false: port still bound. + // result.free === null && !permanent: transient lsof error — keep polling. + sleepSync(PORT_FREE_POLL_INTERVAL_MS); + } + restartLog.warn(`port ${port} still in use after ${PORT_FREE_TIMEOUT_MS}ms; proceeding anyway`); +} + /** * Inspect the gateway port and kill any stale gateway processes holding it. + * Blocks until the port is confirmed free (or the poll budget expires) so + * the supervisor (systemd / launchctl) does not race a zombie process for + * the port and enter an EADDRINUSE restart loop. + * * Called before service restart commands to prevent port conflicts. */ export function cleanStaleGatewayProcessesSync(): number[] { @@ -114,7 +263,14 @@ export function cleanStaleGatewayProcessesSync(): number[] { restartLog.warn( `killing ${stalePids.length} stale gateway process(es) before restart: ${stalePids.join(", ")}`, ); - return terminateStaleProcessesSync(stalePids); + const killed = terminateStaleProcessesSync(stalePids); + // Wait for the port to be released before returning — called unconditionally + // even when `killed` is empty (all pids were already dead before SIGTERM). + // A process can exit before our signal arrives yet still leave its socket + // in TIME_WAIT / FIN_WAIT; polling is the only reliable way to confirm the + // kernel has fully released the port before systemd fires the new process. + waitForPortFreeSync(port); + return killed; } catch { return []; } @@ -124,4 +280,9 @@ export const __testing = { setSleepSyncOverride(fn: ((ms: number) => void) | null) { sleepSyncOverride = fn; }, + setDateNowOverride(fn: (() => number) | null) { + dateNowOverride = fn; + }, + /** Invoke sleepSync directly (bypasses the override) for unit-testing the real Atomics path. */ + callSleepSyncRaw: sleepSync, }; diff --git a/src/infra/restart.ts b/src/infra/restart.ts index c84dfc6f7ac..3f65cfc1614 100644 --- a/src/infra/restart.ts +++ b/src/infra/restart.ts @@ -1,4 +1,6 @@ import { spawnSync } from "node:child_process"; +import os from "node:os"; +import path from "node:path"; import { resolveGatewayLaunchAgentLabel, resolveGatewaySystemdServiceName, @@ -335,7 +337,8 @@ export function triggerOpenClawRestart(): RestartAttempt { process.env.OPENCLAW_LAUNCHD_LABEL || resolveGatewayLaunchAgentLabel(process.env.OPENCLAW_PROFILE); const uid = typeof process.getuid === "function" ? process.getuid() : undefined; - const target = uid !== undefined ? `gui/${uid}/${label}` : label; + const domain = uid !== undefined ? `gui/${uid}` : "gui/501"; + const target = `${domain}/${label}`; const args = ["kickstart", "-k", target]; tried.push(`launchctl ${args.join(" ")}`); const res = spawnSync("launchctl", args, { @@ -345,10 +348,39 @@ export function triggerOpenClawRestart(): RestartAttempt { if (!res.error && res.status === 0) { return { ok: true, method: "launchctl", tried }; } + + // kickstart fails when the service was previously booted out (deregistered from launchd). + // Fall back to bootstrap (re-register from plist) + kickstart. + // Use env HOME to match how launchd.ts resolves the plist install path. + const home = process.env.HOME?.trim() || os.homedir(); + const plistPath = path.join(home, "Library", "LaunchAgents", `${label}.plist`); + const bootstrapArgs = ["bootstrap", domain, plistPath]; + tried.push(`launchctl ${bootstrapArgs.join(" ")}`); + const boot = spawnSync("launchctl", bootstrapArgs, { + encoding: "utf8", + timeout: SPAWN_TIMEOUT_MS, + }); + if (boot.error || (boot.status !== 0 && boot.status !== null)) { + return { + ok: false, + method: "launchctl", + detail: formatSpawnDetail(boot), + tried, + }; + } + const retryArgs = ["kickstart", "-k", target]; + tried.push(`launchctl ${retryArgs.join(" ")}`); + const retry = spawnSync("launchctl", retryArgs, { + encoding: "utf8", + timeout: SPAWN_TIMEOUT_MS, + }); + if (!retry.error && retry.status === 0) { + return { ok: true, method: "launchctl", tried }; + } return { ok: false, method: "launchctl", - detail: formatSpawnDetail(res), + detail: formatSpawnDetail(retry), tried, }; }