From cc7e61612aa83e057db683dedc0f2ce170225d55 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 7 Mar 2026 21:36:00 +0000 Subject: [PATCH] fix(gateway): harden service-mode stale process cleanup (#38463, thanks @spirittechie) Co-authored-by: Jesse Paul --- src/cli/gateway-cli/run.ts | 9 +++++++++ src/daemon/systemd-unit.test.ts | 3 +++ src/daemon/systemd-unit.ts | 3 +++ src/infra/restart-stale-pids.ts | 7 +++++-- src/infra/restart.test.ts | 21 +++++++++++++++++++++ 5 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts index ece545e3d5d..7d456992f0e 100644 --- a/src/cli/gateway-cli/run.ts +++ b/src/cli/gateway-cli/run.ts @@ -17,6 +17,7 @@ import { setGatewayWsLogStyle } from "../../gateway/ws-logging.js"; import { setVerbose } from "../../globals.js"; import { GatewayLockError } from "../../infra/gateway-lock.js"; import { formatPortDiagnostics, inspectPortUsage } from "../../infra/ports.js"; +import { cleanStaleGatewayProcessesSync } from "../../infra/restart-stale-pids.js"; import { setConsoleSubsystemFilter, setConsoleTimestampPrefix } from "../../logging/console.js"; import { createSubsystemLogger } from "../../logging/subsystem.js"; import { defaultRuntime } from "../../runtime.js"; @@ -201,6 +202,14 @@ async function runGatewayCommand(opts: GatewayRunOpts) { defaultRuntime.exit(1); return; } + if (process.env.OPENCLAW_SERVICE_MARKER?.trim()) { + const stale = cleanStaleGatewayProcessesSync(port); + if (stale.length > 0) { + gatewayLog.info( + `service-mode: cleared ${stale.length} stale gateway pid(s) before bind on port ${port}`, + ); + } + } if (opts.force) { try { const { killed, waitedMs, escalatedToSigkill } = await forceFreePortAndWait(port, { diff --git a/src/daemon/systemd-unit.test.ts b/src/daemon/systemd-unit.test.ts index 5c5562b25e6..0a94a1c6b4b 100644 --- a/src/daemon/systemd-unit.test.ts +++ b/src/daemon/systemd-unit.test.ts @@ -19,6 +19,9 @@ describe("buildSystemdUnit", () => { environment: {}, }); expect(unit).toContain("KillMode=control-group"); + expect(unit).toContain("TimeoutStopSec=30"); + expect(unit).toContain("TimeoutStartSec=30"); + expect(unit).toContain("SuccessExitStatus=0 143"); }); it("rejects environment values with line breaks", () => { diff --git a/src/daemon/systemd-unit.ts b/src/daemon/systemd-unit.ts index 9cddbee24d1..0d2d44715f4 100644 --- a/src/daemon/systemd-unit.ts +++ b/src/daemon/systemd-unit.ts @@ -59,6 +59,9 @@ export function buildSystemdUnit({ `ExecStart=${execStart}`, "Restart=always", "RestartSec=5", + "TimeoutStopSec=30", + "TimeoutStartSec=30", + "SuccessExitStatus=0 143", // Keep service children in the same lifecycle so restarts do not leave // orphan ACP/runtime workers behind. "KillMode=control-group", diff --git a/src/infra/restart-stale-pids.ts b/src/infra/restart-stale-pids.ts index c6c9535c737..1d66cc385c9 100644 --- a/src/infra/restart-stale-pids.ts +++ b/src/infra/restart-stale-pids.ts @@ -253,9 +253,12 @@ function waitForPortFreeSync(port: number): void { * * Called before service restart commands to prevent port conflicts. */ -export function cleanStaleGatewayProcessesSync(): number[] { +export function cleanStaleGatewayProcessesSync(portOverride?: number): number[] { try { - const port = resolveGatewayPort(undefined, process.env); + const port = + typeof portOverride === "number" && Number.isFinite(portOverride) && portOverride > 0 + ? Math.floor(portOverride) + : resolveGatewayPort(undefined, process.env); const stalePids = findGatewayPidsOnPortSync(port); if (stalePids.length === 0) { return []; diff --git a/src/infra/restart.test.ts b/src/infra/restart.test.ts index 23795e46f8e..e21225be37b 100644 --- a/src/infra/restart.test.ts +++ b/src/infra/restart.test.ts @@ -95,6 +95,27 @@ describe.runIf(process.platform !== "win32")("cleanStaleGatewayProcessesSync", ( expect(killSpy).toHaveBeenCalledWith(6002, "SIGKILL"); }); + it("uses explicit port override when provided", () => { + spawnSyncMock.mockReturnValue({ + error: undefined, + status: 0, + stdout: ["p7001", "copenclaw"].join("\n"), + }); + const killSpy = vi.spyOn(process, "kill").mockImplementation(() => true); + + const killed = cleanStaleGatewayProcessesSync(19999); + + expect(killed).toEqual([7001]); + expect(resolveGatewayPortMock).not.toHaveBeenCalled(); + expect(spawnSyncMock).toHaveBeenCalledWith( + "/usr/sbin/lsof", + ["-nP", "-iTCP:19999", "-sTCP:LISTEN", "-Fpc"], + expect.objectContaining({ encoding: "utf8", timeout: 2000 }), + ); + expect(killSpy).toHaveBeenCalledWith(7001, "SIGTERM"); + expect(killSpy).toHaveBeenCalledWith(7001, "SIGKILL"); + }); + it("returns empty when no stale listeners are found", () => { spawnSyncMock.mockReturnValue({ error: undefined,