fix(gateway): harden service-mode stale process cleanup (#38463, thanks @spirittechie)

Co-authored-by: Jesse Paul <drzin69@gmail.com>
This commit is contained in:
Peter Steinberger
2026-03-07 21:36:00 +00:00
parent 1835d5808f
commit cc7e61612a
5 changed files with 41 additions and 2 deletions

View File

@@ -17,6 +17,7 @@ import { setGatewayWsLogStyle } from "../../gateway/ws-logging.js";
import { setVerbose } from "../../globals.js";
import { GatewayLockError } from "../../infra/gateway-lock.js";
import { formatPortDiagnostics, inspectPortUsage } from "../../infra/ports.js";
import { cleanStaleGatewayProcessesSync } from "../../infra/restart-stale-pids.js";
import { setConsoleSubsystemFilter, setConsoleTimestampPrefix } from "../../logging/console.js";
import { createSubsystemLogger } from "../../logging/subsystem.js";
import { defaultRuntime } from "../../runtime.js";
@@ -201,6 +202,14 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
defaultRuntime.exit(1);
return;
}
// Service mode: when OPENCLAW_SERVICE_MARKER is set to a non-blank value,
// synchronously clear stale gateway processes for this port before binding.
// NOTE(review): presumably the marker is exported by the service unit — confirm.
if (process.env.OPENCLAW_SERVICE_MARKER?.trim()) {
// The helper returns a list of pids; it is passed the port explicitly.
const stale = cleanStaleGatewayProcessesSync(port);
// Log only when at least one pid was reported, so clean starts stay quiet.
if (stale.length > 0) {
gatewayLog.info(
`service-mode: cleared ${stale.length} stale gateway pid(s) before bind on port ${port}`,
);
}
}
if (opts.force) {
try {
const { killed, waitedMs, escalatedToSigkill } = await forceFreePortAndWait(port, {

View File

@@ -19,6 +19,9 @@ describe("buildSystemdUnit", () => {
environment: {},
});
expect(unit).toContain("KillMode=control-group");
expect(unit).toContain("TimeoutStopSec=30");
expect(unit).toContain("TimeoutStartSec=30");
expect(unit).toContain("SuccessExitStatus=0 143");
});
it("rejects environment values with line breaks", () => {

View File

@@ -59,6 +59,9 @@ export function buildSystemdUnit({
`ExecStart=${execStart}`,
"Restart=always",
"RestartSec=5",
"TimeoutStopSec=30",
"TimeoutStartSec=30",
"SuccessExitStatus=0 143",
// Keep service children in the same lifecycle so restarts do not leave
// orphan ACP/runtime workers behind.
"KillMode=control-group",

View File

@@ -253,9 +253,12 @@ function waitForPortFreeSync(port: number): void {
*
* Called before service restart commands, and at service-mode gateway startup
* before binding, to prevent port conflicts.
*/
export function cleanStaleGatewayProcessesSync(): number[] {
export function cleanStaleGatewayProcessesSync(portOverride?: number): number[] {
try {
const port = resolveGatewayPort(undefined, process.env);
const port =
typeof portOverride === "number" && Number.isFinite(portOverride) && portOverride > 0
? Math.floor(portOverride)
: resolveGatewayPort(undefined, process.env);
const stalePids = findGatewayPidsOnPortSync(port);
if (stalePids.length === 0) {
return [];

View File

@@ -95,6 +95,27 @@ describe.runIf(process.platform !== "win32")("cleanStaleGatewayProcessesSync", (
expect(killSpy).toHaveBeenCalledWith(6002, "SIGKILL");
});
it("uses explicit port override when provided", () => {
  // Arrange: the mocked lsof invocation reports a single listener entry
  // (fields "p7001" and "copenclaw"), and process.kill is stubbed out.
  spawnSyncMock.mockReturnValue({
    error: undefined,
    status: 0,
    stdout: ["p7001", "copenclaw"].join("\n"),
  });
  const processKillSpy = vi.spyOn(process, "kill").mockImplementation(() => true);

  // Act: hand the port in explicitly instead of relying on the resolver.
  const clearedPids = cleanStaleGatewayProcessesSync(19999);

  // Assert: the reported pid comes back and the port resolver was never consulted.
  expect(clearedPids).toEqual([7001]);
  expect(resolveGatewayPortMock).not.toHaveBeenCalled();

  // The lsof query must target the overridden port.
  const expectedLsofArgs = ["-nP", "-iTCP:19999", "-sTCP:LISTEN", "-Fpc"];
  const expectedSpawnOptions = expect.objectContaining({ encoding: "utf8", timeout: 2000 });
  expect(spawnSyncMock).toHaveBeenCalledWith("/usr/sbin/lsof", expectedLsofArgs, expectedSpawnOptions);

  // Both signals are expected to have been sent to the reported pid.
  expect(processKillSpy).toHaveBeenCalledWith(7001, "SIGTERM");
  expect(processKillSpy).toHaveBeenCalledWith(7001, "SIGKILL");
});
it("returns empty when no stale listeners are found", () => {
spawnSyncMock.mockReturnValue({
error: undefined,