diff --git a/src/cli/gateway-cli/run.option-collisions.test.ts b/src/cli/gateway-cli/run.option-collisions.test.ts
index 95245a91989..b26b4c86e47 100644
--- a/src/cli/gateway-cli/run.option-collisions.test.ts
+++ b/src/cli/gateway-cli/run.option-collisions.test.ts
@@ -12,6 +12,7 @@ const forceFreePortAndWait = vi.fn(async (_port: number, _opts: unknown) => ({
   waitedMs: 0,
   escalatedToSigkill: false,
 }));
+const waitForPortBindable = vi.fn(async (_port: number, _opts?: unknown) => 0);
 const ensureDevGatewayConfig = vi.fn(async (_opts?: unknown) => {});
 const runGatewayLoop = vi.fn(async ({ start }: { start: () => Promise<unknown> }) => {
   await start();
@@ -80,6 +81,7 @@ vi.mock("../command-format.js", () => ({
 
 vi.mock("../ports.js", () => ({
   forceFreePortAndWait: (port: number, opts: unknown) => forceFreePortAndWait(port, opts),
+  waitForPortBindable: (port: number, opts?: unknown) => waitForPortBindable(port, opts),
 }));
 
 vi.mock("./dev.js", () => ({
@@ -108,6 +110,7 @@ describe("gateway run option collisions", () => {
     setGatewayWsLogStyle.mockClear();
     setVerbose.mockClear();
     forceFreePortAndWait.mockClear();
+    waitForPortBindable.mockClear();
     ensureDevGatewayConfig.mockClear();
     runGatewayLoop.mockClear();
   });
@@ -140,6 +143,10 @@ describe("gateway run option collisions", () => {
     ]);
 
     expect(forceFreePortAndWait).toHaveBeenCalledWith(18789, expect.anything());
+    expect(waitForPortBindable).toHaveBeenCalledWith(
+      18789,
+      expect.objectContaining({ host: "127.0.0.1" }),
+    );
     expect(setGatewayWsLogStyle).toHaveBeenCalledWith("full");
     expect(startGatewayServer).toHaveBeenCalledWith(
       18789,
diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts
index 291328273e3..666adc289a6 100644
--- a/src/cli/gateway-cli/run.ts
+++ b/src/cli/gateway-cli/run.ts
@@ -21,7 +21,7 @@ import { createSubsystemLogger } from "../../logging/subsystem.js";
 import { defaultRuntime } from "../../runtime.js";
 import { formatCliCommand } from "../command-format.js";
 import { inheritOptionFromParent } from "../command-options.js";
-import { forceFreePortAndWait } from "../ports.js";
+import { forceFreePortAndWait, waitForPortBindable } from "../ports.js";
 import { ensureDevGatewayConfig } from "./dev.js";
 import { runGatewayLoop } from "./run-loop.js";
 import {
@@ -186,6 +186,20 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
     defaultRuntime.error("Invalid port");
     defaultRuntime.exit(1);
   }
+  const bindRaw = toOptionString(opts.bind) ?? cfg.gateway?.bind ?? "loopback";
+  const bind =
+    bindRaw === "loopback" ||
+    bindRaw === "lan" ||
+    bindRaw === "auto" ||
+    bindRaw === "custom" ||
+    bindRaw === "tailnet"
+      ? bindRaw
+      : null;
+  if (!bind) {
+    defaultRuntime.error('Invalid --bind (use "loopback", "lan", "tailnet", "auto", or "custom")');
+    defaultRuntime.exit(1);
+    return;
+  }
   if (opts.force) {
     try {
       const { killed, waitedMs, escalatedToSigkill } = await forceFreePortAndWait(port, {
@@ -208,6 +222,23 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
           gatewayLog.info(`force: waited ${waitedMs}ms for port ${port} to free`);
         }
       }
+      // After killing, verify the port is actually bindable (handles TIME_WAIT).
+      const bindProbeHost =
+        bind === "loopback"
+          ? "127.0.0.1"
+          : bind === "lan"
+            ? "0.0.0.0"
+            : bind === "custom"
+              ? toOptionString(cfg.gateway?.customBindHost)
+              : undefined;
+      const bindWaitMs = await waitForPortBindable(port, {
+        timeoutMs: 3000,
+        intervalMs: 150,
+        host: bindProbeHost,
+      });
+      if (bindWaitMs > 0) {
+        gatewayLog.info(`force: waited ${bindWaitMs}ms for port ${port} to become bindable`);
+      }
     } catch (err) {
       defaultRuntime.error(`Force: ${String(err)}`);
       defaultRuntime.exit(1);
@@ -257,21 +288,6 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
     defaultRuntime.exit(1);
     return;
   }
-  const bindRaw = toOptionString(opts.bind) ?? cfg.gateway?.bind ?? "loopback";
-  const bind =
-    bindRaw === "loopback" ||
-    bindRaw === "lan" ||
-    bindRaw === "auto" ||
-    bindRaw === "custom" ||
-    bindRaw === "tailnet"
-      ? bindRaw
-      : null;
-  if (!bind) {
-    defaultRuntime.error('Invalid --bind (use "loopback", "lan", "tailnet", "auto", or "custom")');
-    defaultRuntime.exit(1);
-    return;
-  }
-
   const miskeys = extractGatewayMiskeys(snapshot?.parsed);
   const authOverride =
     authMode || passwordRaw || tokenRaw || authModeRaw
diff --git a/src/cli/ports.test.ts b/src/cli/ports.test.ts
new file mode 100644
index 00000000000..082dfa09a12
--- /dev/null
+++ b/src/cli/ports.test.ts
@@ -0,0 +1,124 @@
+import { EventEmitter } from "node:events";
+import net from "node:net";
+import { describe, expect, it, vi } from "vitest";
+
+// Hoist the factory so vi.mock can access it.
+const mockCreateServer = vi.hoisted(() => vi.fn());
+
+vi.mock("node:net", async (importOriginal) => {
+  const actual = await importOriginal<typeof import("node:net")>();
+  return { ...actual, createServer: mockCreateServer };
+});
+
+import { probePortFree, waitForPortBindable } from "./ports.js";
+
+/** Build a minimal fake net.Server that emits a given error code on listen(). */
+function makeErrServer(code: string): net.Server {
+  const err = Object.assign(new Error(`bind error: ${code}`), {
+    code,
+  }) as NodeJS.ErrnoException;
+
+  const fake = new EventEmitter() as unknown as net.Server;
+  (fake as unknown as { close: (cb?: () => void) => net.Server }).close = (cb?: () => void) => {
+    cb?.();
+    return fake;
+  };
+  (fake as unknown as { unref: () => net.Server }).unref = () => fake;
+  (fake as unknown as { listen: (...args: unknown[]) => net.Server }).listen = (
+    ..._args: unknown[]
+  ) => {
+    setImmediate(() => fake.emit("error", err));
+    return fake;
+  };
+  return fake;
+}
+
+describe("probePortFree", () => {
+  it("resolves false (not rejects) when bind returns EADDRINUSE", async () => {
+    mockCreateServer.mockReturnValue(makeErrServer("EADDRINUSE"));
+    await expect(probePortFree(9999, "127.0.0.1")).resolves.toBe(false);
+  });
+
+  it("rejects immediately for EADDRNOTAVAIL (non-retryable: host address not on any interface)", async () => {
+    mockCreateServer.mockReturnValue(makeErrServer("EADDRNOTAVAIL"));
+    await expect(probePortFree(9999, "192.0.2.1")).rejects.toMatchObject({ code: "EADDRNOTAVAIL" });
+  });
+
+  it("rejects immediately for EACCES (non-retryable bind error)", async () => {
+    mockCreateServer.mockReturnValue(makeErrServer("EACCES"));
+    await expect(probePortFree(80, "0.0.0.0")).rejects.toMatchObject({ code: "EACCES" });
+  });
+
+  it("rejects immediately for other non-retryable errors", async () => {
+    mockCreateServer.mockReturnValue(makeErrServer("EINVAL"));
+    await expect(probePortFree(9999, "0.0.0.0")).rejects.toMatchObject({ code: "EINVAL" });
+  });
+
+  it("resolves true when the port is free", async () => {
+    // Mock a successful bind: the "listening" event fires immediately without
+    // acquiring a real socket, making this deterministic and avoiding TOCTOU races.
+    // (A real-socket approach would bind to :0, release, then reprobe — the OS can
+    // reassign the ephemeral port in between, causing a flaky EADDRINUSE failure.)
+    const fakeServer = new EventEmitter() as unknown as net.Server;
+    (fakeServer as unknown as { close: (cb?: () => void) => net.Server }).close = (
+      cb?: () => void,
+    ) => {
+      cb?.();
+      return fakeServer;
+    };
+    (fakeServer as unknown as { unref: () => net.Server }).unref = () => fakeServer;
+    (fakeServer as unknown as { listen: (...args: unknown[]) => net.Server }).listen = (
+      ..._args: unknown[]
+    ) => {
+      // Simulate a successful bind by firing the "listening" callback.
+      const callback = _args.find((a) => typeof a === "function") as (() => void) | undefined;
+      setImmediate(() => callback?.());
+      return fakeServer;
+    };
+    mockCreateServer.mockReturnValue(fakeServer);
+
+    const result = await probePortFree(9999, "127.0.0.1");
+    expect(result).toBe(true);
+  });
+});
+
+describe("waitForPortBindable", () => {
+  it("probes the provided host when waiting for bindability", async () => {
+    const listenCalls: Array<{ port: number; host: string }> = [];
+    const fakeServer = new EventEmitter() as unknown as net.Server;
+    (fakeServer as unknown as { close: (cb?: () => void) => net.Server }).close = (
+      cb?: () => void,
+    ) => {
+      cb?.();
+      return fakeServer;
+    };
+    (fakeServer as unknown as { unref: () => net.Server }).unref = () => fakeServer;
+    (fakeServer as unknown as { listen: (...args: unknown[]) => net.Server }).listen = (
+      ...args: unknown[]
+    ) => {
+      const [port, host] = args as [number, string];
+      listenCalls.push({ port, host });
+      const callback = args.find((a) => typeof a === "function") as (() => void) | undefined;
+      setImmediate(() => callback?.());
+      return fakeServer;
+    };
+    mockCreateServer.mockReturnValue(fakeServer);
+
+    await expect(
+      waitForPortBindable(9999, { timeoutMs: 100, intervalMs: 10, host: "127.0.0.1" }),
+    ).resolves.toBe(0);
+    expect(listenCalls[0]).toEqual({ port: 9999, host: "127.0.0.1" });
+  });
+
+  it("propagates EACCES rejection immediately without retrying", async () => {
+    // Every call to createServer will emit EACCES — so if waitForPortBindable retried,
+    // mockCreateServer would be called many times. We assert it's called exactly once.
+    mockCreateServer.mockClear();
+    mockCreateServer.mockReturnValue(makeErrServer("EACCES"));
+    await expect(
+      waitForPortBindable(80, { timeoutMs: 5000, intervalMs: 50 }),
+    ).rejects.toMatchObject({ code: "EACCES" });
+    // Only one probe should have been attempted — no spinning through the retry loop.
+    expect(mockCreateServer).toHaveBeenCalledTimes(1);
+  });
+});
diff --git a/src/cli/ports.ts b/src/cli/ports.ts
index e2bfa67aad9..fd829bf21db 100644
--- a/src/cli/ports.ts
+++ b/src/cli/ports.ts
@@ -1,4 +1,5 @@
 import { execFileSync } from "node:child_process";
+import { createServer } from "node:net";
 import { resolveLsofCommandSync } from "../infra/ports-lsof.js";
 import { tryListenOnPort } from "../infra/ports-probe.js";
 import { sleep } from "../utils.js";
@@ -324,3 +325,63 @@ export async function forceFreePortAndWait(
     `port ${port} still has listeners after --force: ${still.map((p) => p.pid).join(", ")}`,
   );
 }
+
+/**
+ * Attempt a real TCP bind to verify the port is available at the OS level.
+ * Catches TIME_WAIT / kernel-level holds that lsof won't show.
+ *
+ * Resolves false only for EADDRINUSE — a genuinely transient condition
+ * (port still in TIME_WAIT after a --force kill) that the caller should retry.
+ *
+ * All other errors are non-retryable and are rejected immediately:
+ * - EADDRNOTAVAIL: the host address doesn't exist on any local interface
+ *   (hard misconfiguration, not a transient kernel hold).
+ * - EACCES: bind to a privileged port as non-root.
+ * - EINVAL, etc.: other unrecoverable OS errors.
+ */
+export function probePortFree(port: number, host = "0.0.0.0"): Promise<boolean> {
+  return new Promise((resolve, reject) => {
+    const srv = createServer();
+    srv.unref();
+    srv.once("error", (err: NodeJS.ErrnoException) => {
+      srv.close();
+      if (err.code === "EADDRINUSE") {
+        // Genuinely transient — port still in use or TIME_WAIT after a --force kill.
+        resolve(false);
+      } else {
+        // Non-retryable: EADDRNOTAVAIL (bad host address), EACCES (privileged port),
+        // EINVAL, and any other OS errors. Surface immediately; no retry loop.
+        reject(err);
+      }
+    });
+    srv.listen(port, host, () => {
+      srv.close(() => resolve(true));
+    });
+  });
+}
+
+/**
+ * Poll until a real test-bind succeeds, up to `timeoutMs`.
+ * Returns the number of ms waited, or throws if the port never freed.
+ */
+export async function waitForPortBindable(
+  port: number,
+  opts: { timeoutMs?: number; intervalMs?: number; host?: string } = {},
+): Promise<number> {
+  const timeoutMs = Math.max(opts.timeoutMs ?? 3000, 0);
+  const intervalMs = Math.max(opts.intervalMs ?? 150, 1);
+  const host = opts.host;
+  let waited = 0;
+  while (waited < timeoutMs) {
+    if (await probePortFree(port, host)) {
+      return waited;
+    }
+    await sleep(intervalMs);
+    waited += intervalMs;
+  }
+  // Final attempt
+  if (await probePortFree(port, host)) {
+    return waited;
+  }
+  throw new Error(`port ${port} still not bindable after ${waited}ms (TIME_WAIT or kernel hold)`);
+}
diff --git a/src/cli/update-cli/restart-helper.test.ts b/src/cli/update-cli/restart-helper.test.ts
index a152f3fdb48..18888c27f53 100644
--- a/src/cli/update-cli/restart-helper.test.ts
+++ b/src/cli/update-cli/restart-helper.test.ts
@@ -98,6 +98,8 @@ describe("restart-helper", () => {
       expect(scriptPath.endsWith(".sh")).toBe(true);
       expect(content).toContain("#!/bin/sh");
       expect(content).toContain("launchctl kickstart -k 'gui/501/ai.openclaw.gateway'");
+      // Should fall back to bootstrap when kickstart fails (service deregistered after bootout)
+      expect(content).toContain("launchctl bootstrap 'gui/501'");
       expect(content).toContain('rm -f "$0"');
       await cleanupScript(scriptPath);
     });
@@ -223,6 +225,45 @@ describe("restart-helper", () => {
       await cleanupScript(scriptPath);
     });
 
+    it("expands HOME in plist path instead of leaving literal $HOME", async () => {
+      Object.defineProperty(process, "platform", { value: "darwin" });
+      process.getuid = () => 501;
+
+      const { scriptPath, content } = await prepareAndReadScript({
+        HOME: "/Users/testuser",
+        OPENCLAW_PROFILE: "default",
+      });
+      // The plist path must contain the resolved home dir, not literal $HOME
+      expect(content).toMatch(/[\\/]Users[\\/]testuser[\\/]Library[\\/]LaunchAgents[\\/]/);
+      expect(content).not.toContain("$HOME");
+      await cleanupScript(scriptPath);
+    });
+
+    it("prefers env parameter HOME over process.env.HOME for plist path", async () => {
+      Object.defineProperty(process, "platform", { value: "darwin" });
+      process.getuid = () => 502;
+
+      const { scriptPath, content } = await prepareAndReadScript({
+        HOME: "/Users/envhome",
+        OPENCLAW_PROFILE: "default",
+      });
+      expect(content).toMatch(/[\\/]Users[\\/]envhome[\\/]Library[\\/]LaunchAgents[\\/]/);
+      await cleanupScript(scriptPath);
+    });
+
+    it("shell-escapes the label in the plist path on macOS", async () => {
+      Object.defineProperty(process, "platform", { value: "darwin" });
+      process.getuid = () => 501;
+
+      const { scriptPath, content } = await prepareAndReadScript({
+        HOME: "/Users/testuser",
+        OPENCLAW_LAUNCHD_LABEL: "ai.openclaw.it's-a-test",
+      });
+      // The plist path must also shell-escape the label to prevent injection
+      expect(content).toContain("ai.openclaw.it'\\''s-a-test.plist");
+      await cleanupScript(scriptPath);
+    });
+
     it("rejects unsafe batch profile names on Windows", async () => {
       Object.defineProperty(process, "platform", { value: "win32" });
       const scriptPath = await prepareRestartScript({
diff --git a/src/cli/update-cli/restart-helper.ts b/src/cli/update-cli/restart-helper.ts
index cef4e25418b..4f7d45aab0c 100644
--- a/src/cli/update-cli/restart-helper.ts
+++ b/src/cli/update-cli/restart-helper.ts
@@ -83,12 +83,22 @@ rm -f "$0"
       const escaped = shellEscape(label);
       // Fallback to 501 if getuid is not available (though it should be on macOS)
       const uid = process.getuid ? process.getuid() : 501;
+      // Resolve HOME at generation time via env/process.env to match launchd.ts,
+      // and shell-escape the label in the plist filename to prevent injection.
+      const home = env.HOME?.trim() || process.env.HOME || os.homedir();
+      const plistPath = path.join(home, "Library", "LaunchAgents", `${label}.plist`);
+      const escapedPlistPath = shellEscape(plistPath);
       filename = `openclaw-restart-${timestamp}.sh`;
       scriptContent = `#!/bin/sh
 # Standalone restart script — survives parent process termination.
 # Wait briefly to ensure file locks are released after update.
 sleep 1
-launchctl kickstart -k 'gui/${uid}/${escaped}'
+# Try kickstart first (works when the service is still registered).
+# If it fails (e.g. after bootout), re-register via bootstrap then kickstart.
+if ! launchctl kickstart -k 'gui/${uid}/${escaped}' 2>/dev/null; then
+  launchctl bootstrap 'gui/${uid}' '${escapedPlistPath}' 2>/dev/null
+  launchctl kickstart -k 'gui/${uid}/${escaped}' 2>/dev/null || true
+fi
 # Self-cleanup
 rm -f "$0"
 `;
diff --git a/src/gateway/config-reload-plan.ts b/src/gateway/config-reload-plan.ts
index 1af87d25020..4ca1fcea7f0 100644
--- a/src/gateway/config-reload-plan.ts
+++ b/src/gateway/config-reload-plan.ts
@@ -50,6 +50,11 @@ const BASE_RELOAD_RULES: ReloadRule[] = [
     kind: "hot",
     actions: ["restart-heartbeat"],
   },
+  {
+    prefix: "agents.defaults.models",
+    kind: "hot",
+    actions: ["restart-heartbeat"],
+  },
   {
     prefix: "agents.defaults.model",
     kind: "hot",
diff --git a/src/gateway/config-reload.test.ts b/src/gateway/config-reload.test.ts
index e45347b0040..9c4994541e9 100644
--- a/src/gateway/config-reload.test.ts
+++ b/src/gateway/config-reload.test.ts
@@ -159,6 +159,14 @@ describe("buildGatewayReloadPlan", () => {
     );
   });
 
+  it("restarts heartbeat when agents.defaults.models allowlist changes", () => {
+    const plan = buildGatewayReloadPlan(["agents.defaults.models"]);
+    expect(plan.restartGateway).toBe(false);
+    expect(plan.restartHeartbeat).toBe(true);
+    expect(plan.hotReasons).toContain("agents.defaults.models");
+    expect(plan.noopPaths).toEqual([]);
+  });
+
   it("hot-reloads health monitor when channelHealthCheckMinutes changes", () => {
     const plan = buildGatewayReloadPlan(["gateway.channelHealthCheckMinutes"]);
     expect(plan.restartGateway).toBe(false);
diff --git a/src/gateway/server/http-listen.test.ts b/src/gateway/server/http-listen.test.ts
new file mode 100644
index 00000000000..12358713faf
--- /dev/null
+++ b/src/gateway/server/http-listen.test.ts
@@ -0,0 +1,100 @@
+import { EventEmitter } from "node:events";
+import type { Server as HttpServer } from "node:http";
+import { describe, expect, it, vi } from "vitest";
+import { GatewayLockError } from "../../infra/gateway-lock.js";
+import { listenGatewayHttpServer } from "./http-listen.js";
+
+const sleepMock = vi.hoisted(() => vi.fn(async (_ms: number) => {}));
+
+vi.mock("../../utils.js", () => ({
+  sleep: (ms: number) => sleepMock(ms),
+}));
+
+type ListenOutcome = { kind: "error"; code: string } | { kind: "listening" };
+
+function createFakeHttpServer(outcomes: ListenOutcome[]) {
+  class FakeHttpServer extends EventEmitter {
+    public closeCalls = 0;
+    private attempt = 0;
+
+    listen(_port: number, _host: string) {
+      const outcome = outcomes[this.attempt] ?? { kind: "listening" };
+      this.attempt += 1;
+      setImmediate(() => {
+        if (outcome.kind === "error") {
+          const err = Object.assign(new Error(outcome.code), { code: outcome.code });
+          this.emit("error", err);
+        } else {
+          this.emit("listening");
+        }
+      });
+      return this;
+    }
+
+    close(cb?: () => void) {
+      this.closeCalls += 1;
+      setImmediate(() => cb?.());
+      return this;
+    }
+  }
+
+  return new FakeHttpServer();
+}
+
+describe("listenGatewayHttpServer", () => {
+  it("retries EADDRINUSE and closes server handle before retry", async () => {
+    sleepMock.mockClear();
+    const fake = createFakeHttpServer([
+      { kind: "error", code: "EADDRINUSE" },
+      { kind: "listening" },
+    ]);
+
+    await expect(
+      listenGatewayHttpServer({
+        httpServer: fake as unknown as HttpServer,
+        bindHost: "127.0.0.1",
+        port: 18789,
+      }),
+    ).resolves.toBeUndefined();
+
+    expect(fake.closeCalls).toBe(1);
+    expect(sleepMock).toHaveBeenCalledTimes(1);
+  });
+
+  it("throws GatewayLockError after EADDRINUSE retries are exhausted", async () => {
+    sleepMock.mockClear();
+    const fake = createFakeHttpServer([
+      { kind: "error", code: "EADDRINUSE" },
+      { kind: "error", code: "EADDRINUSE" },
+      { kind: "error", code: "EADDRINUSE" },
+      { kind: "error", code: "EADDRINUSE" },
+      { kind: "error", code: "EADDRINUSE" },
+      { kind: "error", code: "EADDRINUSE" },
+    ]);
+
+    await expect(
+      listenGatewayHttpServer({
+        httpServer: fake as unknown as HttpServer,
+        bindHost: "127.0.0.1",
+        port: 18789,
+      }),
+    ).rejects.toBeInstanceOf(GatewayLockError);
+
+    expect(fake.closeCalls).toBe(4);
+  });
+
+  it("wraps non-EADDRINUSE errors as GatewayLockError", async () => {
+    sleepMock.mockClear();
+    const fake = createFakeHttpServer([{ kind: "error", code: "EACCES" }]);
+
+    await expect(
+      listenGatewayHttpServer({
+        httpServer: fake as unknown as HttpServer,
+        bindHost: "127.0.0.1",
+        port: 18789,
+      }),
+    ).rejects.toBeInstanceOf(GatewayLockError);
+
+    expect(fake.closeCalls).toBe(0);
+  });
+});
diff --git a/src/gateway/server/http-listen.ts b/src/gateway/server/http-listen.ts
index c2ae20a879f..0aa9f7b399f 100644
--- a/src/gateway/server/http-listen.ts
+++ b/src/gateway/server/http-listen.ts
@@ -1,5 +1,19 @@
 import type { Server as HttpServer } from "node:http";
 import { GatewayLockError } from "../../infra/gateway-lock.js";
+import { sleep } from "../../utils.js";
+
+const EADDRINUSE_MAX_RETRIES = 4;
+const EADDRINUSE_RETRY_INTERVAL_MS = 500;
+
+async function closeServerQuietly(httpServer: HttpServer): Promise<void> {
+  await new Promise<void>((resolve) => {
+    try {
+      httpServer.close(() => resolve());
+    } catch {
+      resolve();
+    }
+  });
+}
 
 export async function listenGatewayHttpServer(params: {
   httpServer: HttpServer;
@@ -7,31 +21,41 @@ export async function listenGatewayHttpServer(params: {
   port: number;
 }) {
   const { httpServer, bindHost, port } = params;
-  try {
-    await new Promise<void>((resolve, reject) => {
-      const onError = (err: NodeJS.ErrnoException) => {
-        httpServer.off("listening", onListening);
-        reject(err);
-      };
-      const onListening = () => {
-        httpServer.off("error", onError);
-        resolve();
-      };
-      httpServer.once("error", onError);
-      httpServer.once("listening", onListening);
-      httpServer.listen(port, bindHost);
-    });
-  } catch (err) {
-    const code = (err as NodeJS.ErrnoException).code;
-    if (code === "EADDRINUSE") {
+
+  for (let attempt = 0; ; attempt++) {
+    try {
+      await new Promise<void>((resolve, reject) => {
+        const onError = (err: NodeJS.ErrnoException) => {
+          httpServer.off("listening", onListening);
+          reject(err);
+        };
+        const onListening = () => {
+          httpServer.off("error", onError);
+          resolve();
+        };
+        httpServer.once("error", onError);
+        httpServer.once("listening", onListening);
+        httpServer.listen(port, bindHost);
+      });
+      return; // bound successfully
+    } catch (err) {
+      const code = (err as NodeJS.ErrnoException).code;
+      if (code === "EADDRINUSE" && attempt < EADDRINUSE_MAX_RETRIES) {
+        // Port may still be in TIME_WAIT after a recent process exit; retry.
+        await closeServerQuietly(httpServer);
+        await sleep(EADDRINUSE_RETRY_INTERVAL_MS);
+        continue;
+      }
+      if (code === "EADDRINUSE") {
+        throw new GatewayLockError(
+          `another gateway instance is already listening on ws://${bindHost}:${port}`,
+          err,
+        );
+      }
       throw new GatewayLockError(
-        `another gateway instance is already listening on ws://${bindHost}:${port}`,
+        `failed to bind gateway socket on ws://${bindHost}:${port}: ${String(err)}`,
         err,
       );
     }
-    throw new GatewayLockError(
-      `failed to bind gateway socket on ws://${bindHost}:${port}: ${String(err)}`,
-      err,
-    );
   }
 }
diff --git a/src/infra/restart-stale-pids.test.ts b/src/infra/restart-stale-pids.test.ts
new file mode 100644
index 00000000000..f7bf0709d9f
--- /dev/null
+++ b/src/infra/restart-stale-pids.test.ts
@@ -0,0 +1,810 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+
+// This entire file tests lsof-based Unix port polling. The feature is a deliberate
+// no-op on Windows (findGatewayPidsOnPortSync returns [] immediately). Running these
+// tests on a Windows CI runner would require lsof which does not exist there, so we
+// skip the suite entirely and rely on the Linux/macOS runners for coverage.
+const isWindows = process.platform === "win32";
+
+const mockSpawnSync = vi.hoisted(() => vi.fn());
+const mockResolveGatewayPort = vi.hoisted(() => vi.fn(() => 18789));
+const mockRestartWarn = vi.hoisted(() => vi.fn());
+
+vi.mock("node:child_process", () => ({
+  spawnSync: (...args: unknown[]) => mockSpawnSync(...args),
+  execFileSync: vi.fn(),
+}));
+
+vi.mock("../config/paths.js", () => ({
+  resolveGatewayPort: () => mockResolveGatewayPort(),
+}));
+
+vi.mock("./ports-lsof.js", () => ({
+  resolveLsofCommandSync: vi.fn(() => "lsof"),
+}));
+
+vi.mock("../logging/subsystem.js", () => ({
+  createSubsystemLogger: vi.fn(() => ({
+    warn: (...args: unknown[]) => mockRestartWarn(...args),
+    info: vi.fn(),
+    error: vi.fn(),
+  })),
+}));
+
+import { resolveLsofCommandSync } from "./ports-lsof.js";
+import {
+  __testing,
+  cleanStaleGatewayProcessesSync,
+  findGatewayPidsOnPortSync,
+} from "./restart-stale-pids.js";
+
+function lsofOutput(entries: Array<{ pid: number; cmd: string }>): string {
+  return entries.map(({ pid, cmd }) => `p${pid}\nc${cmd}`).join("\n") + "\n";
+}
+
+describe.skipIf(isWindows)("restart-stale-pids", () => {
+  beforeEach(() => {
+    mockSpawnSync.mockReset();
+    mockResolveGatewayPort.mockReset();
+    mockRestartWarn.mockReset();
+    mockResolveGatewayPort.mockReturnValue(18789);
+    __testing.setSleepSyncOverride(() => {});
+  });
+
+  afterEach(() => {
+    __testing.setSleepSyncOverride(null);
+    __testing.setDateNowOverride(null);
+    vi.restoreAllMocks();
+  });
+
+  // -------------------------------------------------------------------------
+  // findGatewayPidsOnPortSync
+  // -------------------------------------------------------------------------
+  describe("findGatewayPidsOnPortSync", () => {
+    it("returns [] when lsof exits with non-zero status", () => {
+      mockSpawnSync.mockReturnValue({ error: null, status: 1, stdout: "", stderr: "" });
+      expect(findGatewayPidsOnPortSync(18789)).toEqual([]);
+    });
+
+    it("logs warning when initial lsof scan exits with status > 1", () => {
+      mockSpawnSync.mockReturnValue({ error: null, status: 2, stdout: "", stderr: "lsof error" });
+      expect(findGatewayPidsOnPortSync(18789)).toEqual([]);
+      expect(mockRestartWarn).toHaveBeenCalledWith(
+        expect.stringContaining("lsof exited with status 2"),
+      );
+    });
+
+    it("returns [] when lsof returns an error object (e.g. ENOENT)", () => {
+      mockSpawnSync.mockReturnValue({
+        error: new Error("ENOENT"),
+        status: null,
+        stdout: "",
+        stderr: "",
+      });
+      expect(findGatewayPidsOnPortSync(18789)).toEqual([]);
+      expect(mockRestartWarn).toHaveBeenCalledWith(
+        expect.stringContaining("lsof failed during initial stale-pid scan"),
+      );
+    });
+
+    it("parses openclaw-gateway pids and excludes the current process", () => {
+      const stalePid = process.pid + 1;
+      mockSpawnSync.mockReturnValue({
+        error: null,
+        status: 0,
+        stdout: lsofOutput([
+          { pid: stalePid, cmd: "openclaw-gateway" },
+          { pid: process.pid, cmd: "openclaw-gateway" },
+        ]),
+        stderr: "",
+      });
+      const pids = findGatewayPidsOnPortSync(18789);
+      expect(pids).toContain(stalePid);
+      expect(pids).not.toContain(process.pid);
+    });
+
+    it("excludes pids whose command does not include 'openclaw'", () => {
+      const otherPid = process.pid + 2;
+      mockSpawnSync.mockReturnValue({
+        error: null,
+        status: 0,
+        stdout: lsofOutput([{ pid: otherPid, cmd: "nginx" }]),
+        stderr: "",
+      });
+      expect(findGatewayPidsOnPortSync(18789)).toEqual([]);
+    });
+
+    it("forwards the spawnTimeoutMs argument to spawnSync", () => {
+      mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout: "", stderr: "" });
+      findGatewayPidsOnPortSync(18789, 400);
+      expect(mockSpawnSync).toHaveBeenCalledWith(
+        "lsof",
+        expect.any(Array),
+        expect.objectContaining({ timeout: 400 }),
+      );
+    });
+
+    it("deduplicates pids from dual-stack listeners (IPv4+IPv6 emit same pid twice)", () => {
+      // Dual-stack listeners cause lsof to emit the same PID twice in -Fpc output
+      // (once for the IPv4 socket, once for IPv6). Without dedup, terminateStaleProcessesSync
+      // sends SIGTERM twice and returns killed=[pid, pid], corrupting the count.
+      const stalePid = process.pid + 600;
+      const stdout = `p${stalePid}\ncopenclaw-gateway\np${stalePid}\ncopenclaw-gateway\n`;
+      mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" });
+      const result = findGatewayPidsOnPortSync(18789);
+      expect(result).toEqual([stalePid]); // deduped — not [pid, pid]
+    });
+
+    it("returns [] and skips lsof on win32", () => {
+      // The entire describe block is skipped on Windows (isWindows guard at top),
+      // so this test only runs on Linux/macOS. It mocks platform to win32 for the
+      // single assertion without needing to restore — the suite-level skipIf means
+      // this will never run on an actual Windows runner where the mock could leak.
+      const origDescriptor = Object.getOwnPropertyDescriptor(process, "platform");
+      Object.defineProperty(process, "platform", { value: "win32", configurable: true });
+      try {
+        expect(findGatewayPidsOnPortSync(18789)).toEqual([]);
+        expect(mockSpawnSync).not.toHaveBeenCalled();
+      } finally {
+        if (origDescriptor) {
+          Object.defineProperty(process, "platform", origDescriptor);
+        }
+      }
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // parsePidsFromLsofOutput — pure unit tests (no I/O, driven via spawnSync mock)
+  // -------------------------------------------------------------------------
+  describe("parsePidsFromLsofOutput (via findGatewayPidsOnPortSync stdout path)", () => {
+    it("returns [] for empty lsof stdout (status 0, nothing listening)", () => {
+      mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout: "", stderr: "" });
+      expect(findGatewayPidsOnPortSync(18789)).toEqual([]);
+    });
+
+    it("parses multiple openclaw pids from a single lsof output block", () => {
+      const pid1 = process.pid + 10;
+      const pid2 = process.pid + 11;
+      mockSpawnSync.mockReturnValue({
+        error: null,
+        status: 0,
+        stdout: lsofOutput([
+          { pid: pid1, cmd: "openclaw-gateway" },
+          { pid: pid2, cmd: "openclaw-gateway" },
+        ]),
+        stderr: "",
+      });
+      const result = findGatewayPidsOnPortSync(18789);
+      expect(result).toContain(pid1);
+      expect(result).toContain(pid2);
+    });
+
+    it("returns [] when status 0 but only non-openclaw pids present", () => {
+      // Port may be bound by an unrelated process. findGatewayPidsOnPortSync
+      // only tracks openclaw processes — non-openclaw listeners are ignored.
+      const otherPid = process.pid + 50;
+      mockSpawnSync.mockReturnValue({
+        error: null,
+        status: 0,
+        stdout: lsofOutput([{ pid: otherPid, cmd: "caddy" }]),
+        stderr: "",
+      });
+      expect(findGatewayPidsOnPortSync(18789)).toEqual([]);
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // pollPortOnce (via cleanStaleGatewayProcessesSync) — Codex P1 regression
+  // -------------------------------------------------------------------------
+  describe("pollPortOnce — no second lsof spawn (Codex P1 regression)", () => {
+    it("treats lsof exit status 1 as port-free (no listeners)", () => {
+      // lsof exits with status 1 when no matching processes are found — this is
+      // the canonical "port is free" signal, not an error.
+      const stalePid = process.pid + 500;
+      let call = 0;
+      mockSpawnSync.mockImplementation(() => {
+        call++;
+        if (call === 1) {
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        // Poll returns status 1 — no listeners
+        return { error: null, status: 1, stdout: "", stderr: "" };
+      });
+      vi.spyOn(process, "kill").mockReturnValue(true);
+      // Should complete cleanly (port reported free on status 1)
+      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
+    });
+
+    it("treats lsof exit status >1 as inconclusive, not port-free — Codex P2 regression", () => {
+      // Codex P2: non-zero lsof exits other than status 1 (e.g. permission denied,
+      // bad flag, runtime error) must not be mapped to free:true. They are
+      // inconclusive and should keep the polling loop running until budget expires.
+      const stalePid = process.pid + 501;
+      let call = 0;
+      const events: string[] = [];
+      mockSpawnSync.mockImplementation(() => {
+        call++;
+        if (call === 1) {
+          events.push("initial-find");
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        if (call === 2) {
+          // Permission/runtime error — status 2, should NOT be treated as free
+          events.push("error-poll");
+          return { error: null, status: 2, stdout: "", stderr: "lsof: permission denied" };
+        }
+        // Eventually port is free
+        events.push("free-poll");
+        return { error: null, status: 1, stdout: "", stderr: "" };
+      });
+      vi.spyOn(process, "kill").mockReturnValue(true);
+      cleanStaleGatewayProcessesSync();
+
+      // Must have continued polling after the status-2 error, not exited early
+      expect(events).toContain("free-poll");
+    });
+
+    it("does not make a second lsof call when the first returns status 0", () => {
+      // The bug: pollPortOnce previously called findGatewayPidsOnPortSync as a
+      // second probe after getting status===0 from the first lsof. That second
+      // call collapses any error/timeout back into [], which maps to free:true —
+      // silently misclassifying an inconclusive result as "port is free".
+      //
+      // The fix: pollPortOnce now parses res.stdout directly from the first
+      // spawnSync call. Exactly ONE lsof invocation per poll cycle.
+      const stalePid = process.pid + 400;
+      let spawnCount = 0;
+      mockSpawnSync.mockImplementation(() => {
+        spawnCount++;
+        if (spawnCount === 1) {
+          // Initial findGatewayPidsOnPortSync — returns stale pid
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        if (spawnCount === 2) {
+          // First waitForPortFreeSync poll — status 0, port busy (should parse inline, not spawn again)
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        // Port free on third call
+        return { error: null, status: 0, stdout: "", stderr: "" };
+      });
+
+      vi.spyOn(process, "kill").mockReturnValue(true);
+      cleanStaleGatewayProcessesSync();
+
+      // If pollPortOnce made a second lsof call internally, spawnCount would
+      // be at least 4 (initial + 2 polls each doubled). With the fix, each poll
+      // is exactly one spawn: initial(1) + busy-poll(1) + free-poll(1) = 3.
+      expect(spawnCount).toBe(3);
+    });
+
+    it("lsof status 1 with non-empty openclaw stdout is treated as busy, not free (Linux container edge case)", () => {
+      // On Linux containers with restricted /proc (AppArmor, seccomp, user namespaces),
+      // lsof can exit 1 AND still emit output for processes it could read.
+      // status 1 + non-empty openclaw stdout must not be treated as port-free.
+      const stalePid = process.pid + 601;
+      let call = 0;
+      mockSpawnSync.mockImplementation(() => {
+        call++;
+        if (call === 1) {
+          // Initial scan: finds stale pid
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        if (call === 2) {
+          // status 1 + openclaw pid in stdout — container-restricted lsof reports partial results
+          return {
+            error: null,
+            status: 1,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "lsof: WARNING: can't stat() fuse",
+          };
+        }
+        // Third poll: port is genuinely free
+        return { error: null, status: 1, stdout: "", stderr: "" };
+      });
+      vi.spyOn(process, "kill").mockReturnValue(true);
+      cleanStaleGatewayProcessesSync();
+      // Poll 2 returned busy (not free), so we must have polled at least 3 times
+      expect(call).toBeGreaterThanOrEqual(3);
+    });
+
+    it("pollPortOnce outer catch returns { free: null, permanent: false } when resolveLsofCommandSync throws", () => {
+      // If resolveLsofCommandSync throws (e.g. lsof resolution fails at runtime),
+      // pollPortOnce must catch it and return the transient-inconclusive result
+      // rather than propagating the exception.
+      const stalePid = process.pid + 402;
+      const mockedResolveLsof = vi.mocked(resolveLsofCommandSync);
+
+      mockedResolveLsof.mockImplementationOnce(() => {
+        // First call: initial findGatewayPidsOnPortSync — succeed normally
+        return "lsof";
+      });
+
+      mockSpawnSync.mockImplementationOnce(() => {
+        // Initial scan: finds stale pid
+        return {
+          error: null,
+          status: 0,
+          stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+          stderr: "",
+        };
+      });
+
+      // Second call: poll — resolveLsofCommandSync throws
+      mockedResolveLsof.mockImplementationOnce(() => {
+        throw new Error("lsof binary resolution failed");
+      });
+
+      // Third call: poll — port is free
+      mockedResolveLsof.mockImplementation(() => "lsof");
+      mockSpawnSync.mockImplementation(() => ({ error: null, status: 1, stdout: "", stderr: "" }));
+
+      vi.spyOn(process, "kill").mockReturnValue(true);
+      // Must not throw — the catch path returns transient inconclusive, loop continues
+      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // cleanStaleGatewayProcessesSync
+  // -------------------------------------------------------------------------
+  describe("cleanStaleGatewayProcessesSync", () => {
+    it("returns [] and does not call process.kill when port has no listeners", () => {
+      mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout: "", stderr: "" });
+      const killSpy = vi.spyOn(process, "kill").mockReturnValue(true);
+      expect(cleanStaleGatewayProcessesSync()).toEqual([]);
+      expect(killSpy).not.toHaveBeenCalled();
+    });
+
+    it("sends SIGTERM to stale pids and returns them", () => {
+      const stalePid = process.pid + 100;
+      let call = 0;
+      mockSpawnSync.mockImplementation(() => {
+        call++;
+        if (call === 1) {
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        // waitForPortFreeSync polls: port free immediately
+        return { error: null, status: 0, stdout: "", stderr: "" };
+      });
+
+      const killSpy = vi.spyOn(process, "kill").mockReturnValue(true);
+      const result = cleanStaleGatewayProcessesSync();
+
+      expect(result).toContain(stalePid);
+      expect(killSpy).toHaveBeenCalledWith(stalePid, "SIGTERM");
+    });
+
+    it("escalates to SIGKILL when process survives the SIGTERM window", () => {
+      const stalePid = process.pid + 101;
+      let call = 0;
+      mockSpawnSync.mockImplementation(() => {
+        call++;
+        if (call <= 5) {
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        return { error: null, status: 0, stdout: "", stderr: "" };
+      });
+
+      const killSpy = vi.spyOn(process, "kill").mockReturnValue(true);
+      cleanStaleGatewayProcessesSync();
+
+      expect(killSpy).toHaveBeenCalledWith(stalePid, "SIGTERM");
+      expect(killSpy).toHaveBeenCalledWith(stalePid, "SIGKILL");
+    });
+
+    it("polls until port is confirmed free before returning — regression for #33103", () => {
+      // Core regression: cleanStaleGatewayProcessesSync must not return while
+      // the port is still bound. Previously it returned after a fixed 500ms
+      // sleep regardless of port state, causing systemd's new process to hit
+      // EADDRINUSE and enter an unbounded restart loop.
+      const stalePid = process.pid + 200;
+      const events: string[] = [];
+      let call = 0;
+
+      mockSpawnSync.mockImplementation(() => {
+        call++;
+        if (call === 1) {
+          events.push("initial-find");
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        if (call <= 4) {
+          events.push(`busy-poll-${call}`);
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        events.push("port-free");
+        return { error: null, status: 0, stdout: "", stderr: "" };
+      });
+
+      vi.spyOn(process, "kill").mockReturnValue(true);
+      cleanStaleGatewayProcessesSync();
+
+      expect(events).toContain("port-free");
+      expect(events.filter((e) => e.startsWith("busy-poll")).length).toBeGreaterThan(0);
+    });
+
+    it("bails immediately when lsof is permanently unavailable (ENOENT) — Greptile edge case", () => {
+      // Regression for the edge case identified in PR review: lsof returning an
+      // error must not be treated as "port free". ENOENT means lsof is not
+      // installed — a permanent condition. The polling loop should bail
+      // immediately on ENOENT rather than spinning the full 2-second budget.
+      const stalePid = process.pid + 300;
+      const events: string[] = [];
+      let call = 0;
+
+      mockSpawnSync.mockImplementation(() => {
+        call++;
+        if (call === 1) {
+          events.push("initial-find");
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        // Permanent ENOENT — lsof is not installed
+        events.push(`enoent-poll-${call}`);
+        const err = new Error("lsof not found") as NodeJS.ErrnoException;
+        err.code = "ENOENT";
+        return { error: err, status: null, stdout: "", stderr: "" };
+      });
+
+      vi.spyOn(process, "kill").mockReturnValue(true);
+      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
+
+      // Must bail after first ENOENT poll — no point retrying a missing binary
+      const enoentPolls = events.filter((e) => e.startsWith("enoent-poll"));
+      expect(enoentPolls.length).toBe(1);
+    });
+
+    it("bails immediately when lsof is permanently unavailable (EPERM) — SELinux/AppArmor", () => {
+      // EPERM occurs when lsof exists but a MAC policy (SELinux/AppArmor) blocks
+      // execution. Like ENOENT/EACCES, this is permanent — retrying is pointless.
+      const stalePid = process.pid + 305;
+      let call = 0;
+      mockSpawnSync.mockImplementation(() => {
+        call++;
+        if (call === 1) {
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        const err = new Error("lsof eperm") as NodeJS.ErrnoException;
+        err.code = "EPERM";
+        return { error: err, status: null, stdout: "", stderr: "" };
+      });
+      vi.spyOn(process, "kill").mockReturnValue(true);
+      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
+      // Must bail after exactly 1 EPERM poll — same as ENOENT/EACCES
+      expect(call).toBe(2); // 1 initial find + 1 EPERM poll
+    });
+
+    it("bails immediately when lsof is permanently unavailable (EACCES) — same as ENOENT", () => {
+      // EACCES and EPERM are also permanent conditions — lsof exists but the
+      // process has no permission to run it. No point retrying.
+      const stalePid = process.pid + 302;
+      let call = 0;
+      mockSpawnSync.mockImplementation(() => {
+        call++;
+        if (call === 1) {
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        const err = new Error("lsof permission denied") as NodeJS.ErrnoException;
+        err.code = "EACCES";
+        return { error: err, status: null, stdout: "", stderr: "" };
+      });
+      vi.spyOn(process, "kill").mockReturnValue(true);
+      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
+      // Should have bailed after exactly 1 poll call (the EACCES one)
+      expect(call).toBe(2); // 1 initial find + 1 EACCES poll
+    });
+
+    it("proceeds with warning when polling budget is exhausted — fake clock, no real 2s wait", () => {
+      // Sub-agent audit HIGH finding: the original test relied on real wall-clock
+      // time (Date.now() + 2000ms deadline), burning 2 full seconds of CI time
+      // every run. Fix: expose dateNowOverride in __testing so the deadline can
+      // be synthesised instantly, keeping the test under 10ms.
+      const stalePid = process.pid + 303;
+      let fakeNow = 0;
+      __testing.setDateNowOverride(() => fakeNow);
+
+      mockSpawnSync.mockImplementation(() => {
+        // Advance clock by PORT_FREE_TIMEOUT_MS + 1ms on first poll to trip the deadline.
+        fakeNow += 2001;
+        return {
+          error: null,
+          status: 0,
+          stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+          stderr: "",
+        };
+      });
+
+      vi.spyOn(process, "kill").mockReturnValue(true);
+      // Must return without throwing (proceeds with warning after budget expires)
+      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
+    });
+
+    it("still polls for port-free when all stale pids were already dead at SIGTERM time", () => {
+      // Sub-agent audit MEDIUM finding: if all pids from the initial scan are
+      // already dead before SIGTERM runs (race), terminateStaleProcessesSync
+      // returns killed=[] — but cleanStaleGatewayProcessesSync MUST still call
+      // waitForPortFreeSync. The process may have exited on its own while
+      // leaving its socket in TIME_WAIT / FIN_WAIT. Skipping the poll would
+      // silently recreate the EADDRINUSE race we are fixing.
+      const stalePid = process.pid + 304;
+      let call = 0;
+      const events: string[] = [];
+
+      mockSpawnSync.mockImplementation(() => {
+        call++;
+        if (call === 1) {
+          // Initial scan: finds stale pid
+          events.push("initial-find");
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        // Port is already free on first poll — pid was dead before SIGTERM
+        events.push("poll-free");
+        return { error: null, status: 1, stdout: "", stderr: "" };
+      });
+
+      // All SIGTERMs throw ESRCH — pid already gone
+      vi.spyOn(process, "kill").mockImplementation(() => {
+        throw Object.assign(new Error("ESRCH"), { code: "ESRCH" });
+      });
+
+      cleanStaleGatewayProcessesSync();
+
+      // waitForPortFreeSync must still have fired even though killed=[]
+      expect(events).toContain("poll-free");
+    });
+
+    it("continues polling on transient lsof errors (not ENOENT) — Codex P1 fix", () => {
+      // A transient lsof error (spawnSync timeout, status 2, etc.) must NOT abort
+      // the polling loop. The loop should keep retrying until the budget expires
+      // or a definitive result is returned. Bailing on the first transient error
+      // would recreate the EADDRINUSE race this PR is designed to prevent.
+      const stalePid = process.pid + 301;
+      const events: string[] = [];
+      let call = 0;
+
+      mockSpawnSync.mockImplementation(() => {
+        call++;
+        if (call === 1) {
+          events.push("initial-find");
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        if (call === 2) {
+          // Transient: spawnSync timeout (no ENOENT code)
+          events.push("transient-error");
+          return { error: new Error("timeout"), status: null, stdout: "", stderr: "" };
+        }
+        // Port free on the next poll
+        events.push("port-free");
+        return { error: null, status: 1, stdout: "", stderr: "" };
+      });
+
+      vi.spyOn(process, "kill").mockReturnValue(true);
+      cleanStaleGatewayProcessesSync();
+
+      // Must have kept polling after the transient error and reached port-free
+      expect(events).toContain("transient-error");
+      expect(events).toContain("port-free");
+    });
+
+    it("returns gracefully when resolveGatewayPort throws", () => {
+      mockResolveGatewayPort.mockImplementationOnce(() => {
+        throw new Error("config read error");
+      });
+      expect(cleanStaleGatewayProcessesSync()).toEqual([]);
+    });
+
+    it("returns gracefully when lsof is unavailable from the start", () => {
+      mockSpawnSync.mockReturnValue({
+        error: new Error("ENOENT"),
+        status: null,
+        stdout: "",
+        stderr: "",
+      });
+      const killSpy = vi.spyOn(process, "kill").mockReturnValue(true);
+      expect(cleanStaleGatewayProcessesSync()).toEqual([]);
+      expect(killSpy).not.toHaveBeenCalled();
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // parsePidsFromLsofOutput — branch-coverage for mid-loop && short-circuits
+  // -------------------------------------------------------------------------
+  describe("parsePidsFromLsofOutput — branch coverage (lines 67-69)", () => {
+    it("skips a mid-loop entry when the command does not include 'openclaw'", () => {
+      // Exercises the false branch of currentCmd.toLowerCase().includes("openclaw")
+      // inside the mid-loop flush: a non-openclaw cmd between two entries must not
+      // be pushed, but the following openclaw entry still must be.
+      const stalePid = process.pid + 700;
+      // Mixed output: non-openclaw entry first, then openclaw entry
+      const stdout = `p${process.pid + 699}\ncnginx\np${stalePid}\ncopenclaw-gateway\n`;
+      mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" });
+      const result = findGatewayPidsOnPortSync(18789);
+      expect(result).toContain(stalePid);
+      expect(result).not.toContain(process.pid + 699);
+    });
+
+    it("skips a mid-loop entry when currentCmd is missing (two consecutive p-lines)", () => {
+      // Exercises currentCmd falsy branch mid-loop: two 'p' lines in a row
+      // (no 'c' line between them) — the first PID must be skipped, the second handled.
+      const stalePid = process.pid + 701;
+      // Two consecutive p-lines: first has no c-line before the next p-line
+      const stdout = `p${process.pid + 702}\np${stalePid}\ncopenclaw-gateway\n`;
+      mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" });
+      const result = findGatewayPidsOnPortSync(18789);
+      expect(result).toContain(stalePid);
+    });
+
+    it("ignores a p-line with an invalid (non-positive) PID — ternary false branch", () => {
+      // Exercises the `Number.isFinite(parsed) && parsed > 0 ? parsed : undefined`
+      // false branch: a malformed 'p' line (e.g. 'p0' or 'pNaN') must not corrupt
+      // currentPid and must not end up in the returned pids array.
+      const stalePid = process.pid + 703;
+      // p0 is invalid (not > 0); the following valid openclaw entry must still be found.
+      const stdout = `p0\ncopenclaw-gateway\np${stalePid}\ncopenclaw-gateway\n`;
+      mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" });
+      const result = findGatewayPidsOnPortSync(18789);
+      expect(result).toContain(stalePid);
+      expect(result).not.toContain(0);
+    });
+
+    it("silently skips lines that start with neither 'p' nor 'c' — else-if false branch", () => {
+      // lsof -Fpc only emits 'p' and 'c' lines, but defensive handling of
+      // unexpected output (e.g. 'f' for file descriptor in other lsof formats)
+      // must not throw or corrupt the pid list. Unknown lines are just skipped.
+      const stalePid = process.pid + 704;
+      // Intersperse an 'f' line (file descriptor marker) — not a 'p' or 'c' line
+      const stdout = `p${stalePid}\nf8\ncopenclaw-gateway\n`;
+      mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" });
+      const result = findGatewayPidsOnPortSync(18789);
+      // The 'f' line must not corrupt parsing; stalePid must still be found
+      // (the 'c' line after 'f' correctly sets currentCmd)
+      expect(result).toContain(stalePid);
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // pollPortOnce branch — status 1 + non-empty stdout with zero openclaw pids
+  // -------------------------------------------------------------------------
+  describe("pollPortOnce — status 1 + non-empty non-openclaw stdout (line 145)", () => {
+    it("treats status 1 + non-openclaw stdout as port-free (not an openclaw process)", () => {
+      // status 1 + non-empty stdout where no openclaw pids are present:
+      // the port may be held by an unrelated process. From our perspective
+      // (we only kill openclaw pids) it is effectively free.
+      const stalePid = process.pid + 800;
+      let call = 0;
+      mockSpawnSync.mockImplementation(() => {
+        call++;
+        if (call === 1) {
+          return {
+            error: null,
+            status: 0,
+            stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
+            stderr: "",
+          };
+        }
+        // status 1 + non-openclaw output — should be treated as free:true for our purposes
+        return {
+          error: null,
+          status: 1,
+          stdout: lsofOutput([{ pid: process.pid + 801, cmd: "caddy" }]),
+          stderr: "",
+        };
+      });
+      vi.spyOn(process, "kill").mockReturnValue(true);
+      // Should complete cleanly — no openclaw pids in status-1 output → free
+      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
+      // Completed in exactly 2 calls (initial find + 1 free poll)
+      expect(call).toBe(2);
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // sleepSync — direct unit tests via __testing.callSleepSyncRaw
+  // -------------------------------------------------------------------------
+  describe("sleepSync — Atomics.wait paths", () => {
+    it("returns immediately when called with 0ms (timeoutMs <= 0 early return)", () => {
+      // sleepSync(0) must short-circuit before touching Atomics.wait.
+      // Verify it does not throw and returns synchronously.
+      __testing.setSleepSyncOverride(null); // bypass override so real path runs
+      expect(() => __testing.callSleepSyncRaw(0)).not.toThrow();
+    });
+
+    it("returns immediately when called with a negative value (Math.max(0,...) clamp)", () => {
+      __testing.setSleepSyncOverride(null);
+      expect(() => __testing.callSleepSyncRaw(-1)).not.toThrow();
+    });
+
+    it("executes the Atomics.wait path successfully when called with a positive timeout", () => {
+      // Verify the real Atomics.wait code path runs without error.
+      // Use 1ms to keep the test fast; Atomics.wait resolves immediately
+      // because the timeout expires in 1ms.
+      __testing.setSleepSyncOverride(null);
+      expect(() => __testing.callSleepSyncRaw(1)).not.toThrow();
+    });
+
+    it("falls back to busy-wait when Atomics.wait throws (Worker / sandboxed env)", () => {
+      // Atomics.wait throws in Worker threads and some sandboxed runtimes.
+      // The catch branch must handle this without propagating the exception.
+      const origWait = Atomics.wait;
+      Atomics.wait = () => {
+        throw new Error("not on main thread");
+      };
+      __testing.setSleepSyncOverride(null);
+      try {
+        // 1ms is enough to exercise the busy-wait loop without slowing CI.
+        expect(() => __testing.callSleepSyncRaw(1)).not.toThrow();
+      } finally {
+        Atomics.wait = origWait;
+        __testing.setSleepSyncOverride(() => {});
+      }
+    });
+  });
+});
diff --git a/src/infra/restart-stale-pids.ts b/src/infra/restart-stale-pids.ts
index bbab76f8374..c6c9535c737 100644
--- a/src/infra/restart-stale-pids.ts
+++ b/src/infra/restart-stale-pids.ts
@@ -4,11 +4,31 @@ import { createSubsystemLogger } from "../logging/subsystem.js";
 import { resolveLsofCommandSync } from "./ports-lsof.js";
 
 const SPAWN_TIMEOUT_MS = 2000;
-const STALE_SIGTERM_WAIT_MS = 300;
-const STALE_SIGKILL_WAIT_MS = 200;
+const STALE_SIGTERM_WAIT_MS = 600;
+const STALE_SIGKILL_WAIT_MS = 400;
+/**
+ * After SIGKILL, the kernel may not release the TCP port immediately.
+ * Poll until the port is confirmed free (or until the budget expires) before
+ * returning control to the caller (typically `triggerOpenClawRestart` →
+ * `systemctl restart`). Without this wait the new process races the dying
+ * process for the port and systemd enters an EADDRINUSE restart loop.
+ *
+ * POLL_SPAWN_TIMEOUT_MS is intentionally much shorter than SPAWN_TIMEOUT_MS
+ * so that a single slow or hung lsof invocation does not consume the entire
+ * polling budget. At 400 ms per call, up to five independent lsof attempts
+ * fit within PORT_FREE_TIMEOUT_MS = 2000 ms, each with a definitive outcome.
+ */
+const PORT_FREE_POLL_INTERVAL_MS = 50;
+const PORT_FREE_TIMEOUT_MS = 2000;
+const POLL_SPAWN_TIMEOUT_MS = 400;
 
 const restartLog = createSubsystemLogger("restart");
 let sleepSyncOverride: ((ms: number) => void) | null = null;
+let dateNowOverride: (() => number) | null = null;
+
+function getTimeMs(): number {
+  return dateNowOverride ? dateNowOverride() : Date.now();
+}
 
 function sleepSync(ms: number): void {
   const timeoutMs = Math.max(0, Math.floor(ms));
@@ -31,25 +51,14 @@ function sleepSync(ms: number): void {
 }
 
 /**
- * Find PIDs of gateway processes listening on the given port using synchronous lsof.
- * Returns only PIDs that belong to openclaw gateway processes (not the current process).
+ * Parse openclaw gateway PIDs from lsof -Fpc stdout.
+ * Pure function — no I/O. Excludes the current process.
  */
-export function findGatewayPidsOnPortSync(port: number): number[] {
-  if (process.platform === "win32") {
-    return [];
-  }
-  const lsof = resolveLsofCommandSync();
-  const res = spawnSync(lsof, ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-Fpc"], {
-    encoding: "utf8",
-    timeout: SPAWN_TIMEOUT_MS,
-  });
-  if (res.error || res.status !== 0) {
-    return [];
-  }
+function parsePidsFromLsofOutput(stdout: string): number[] {
   const pids: number[] = [];
   let currentPid: number | undefined;
   let currentCmd: string | undefined;
-  for (const line of res.stdout.split(/\r?\n/).filter(Boolean)) {
+  for (const line of stdout.split(/\r?\n/).filter(Boolean)) {
     if (line.startsWith("p")) {
       if (currentPid != null && currentCmd && currentCmd.toLowerCase().includes("openclaw")) {
         pids.push(currentPid);
@@ -64,17 +73,117 @@ export function findGatewayPidsOnPortSync(port: number): number[] {
   if (currentPid != null && currentCmd && currentCmd.toLowerCase().includes("openclaw")) {
     pids.push(currentPid);
   }
-  return pids.filter((pid) => pid !== process.pid);
+  // Deduplicate: dual-stack listeners (IPv4 + IPv6) cause lsof to emit the
+  // same PID twice. Return each PID at most once to avoid double-killing.
+  return [...new Set(pids)].filter((pid) => pid !== process.pid);
+}
+
+/**
+ * Find PIDs of gateway processes listening on the given port using synchronous lsof.
+ * Returns only PIDs that belong to openclaw gateway processes (not the current process).
+ */
+export function findGatewayPidsOnPortSync(
+  port: number,
+  spawnTimeoutMs = SPAWN_TIMEOUT_MS,
+): number[] {
+  if (process.platform === "win32") {
+    return [];
+  }
+  const lsof = resolveLsofCommandSync();
+  const res = spawnSync(lsof, ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-Fpc"], {
+    encoding: "utf8",
+    timeout: spawnTimeoutMs,
+  });
+  if (res.error) {
+    const code = (res.error as NodeJS.ErrnoException).code;
+    const detail =
+      code && code.trim().length > 0
+        ? code
+        : res.error instanceof Error
+          ? res.error.message
+          : "unknown error";
+    restartLog.warn(`lsof failed during initial stale-pid scan for port ${port}: ${detail}`);
+    return [];
+  }
+  if (res.status === 1) {
+    return [];
+  }
+  if (res.status !== 0) {
+    restartLog.warn(
+      `lsof exited with status ${res.status} during initial stale-pid scan for port ${port}; skipping stale pid check`,
+    );
+    return [];
+  }
+  return parsePidsFromLsofOutput(res.stdout);
+}
+
+/**
+ * Attempt a single lsof poll for the given port.
+ *
+ * Returns a discriminated union with four possible states:
+ *
+ *   { free: true }                      — port confirmed free
+ *   { free: false }                     — port confirmed busy
+ *   { free: null; permanent: false }    — transient error, keep retrying
+ *   { free: null; permanent: true }     — lsof unavailable (ENOENT / EACCES),
+ *                                         no point retrying
+ *
+ * Separating transient from permanent errors is critical so that:
+ *  1. A slow/timed-out lsof call (transient) does not abort the polling loop —
+ *     the caller retries until the wall-clock budget expires.
+ *  2. Non-zero lsof exits from runtime/permission failures (status > 1) are
+ *     not misclassified as "port free" — they are inconclusive and retried.
+ *  3. A missing lsof binary (permanent) short-circuits cleanly rather than
+ *     spinning the full budget pointlessly.
+ */
+type PollResult = { free: true } | { free: false } | { free: null; permanent: boolean };
+
+function pollPortOnce(port: number): PollResult {
+  try {
+    const lsof = resolveLsofCommandSync();
+    const res = spawnSync(lsof, ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-Fpc"], {
+      encoding: "utf8",
+      timeout: POLL_SPAWN_TIMEOUT_MS,
+    });
+    if (res.error) {
+      // Spawn-level failure. ENOENT / EACCES means lsof is permanently
+      // unavailable on this system; other errors (e.g. timeout) are transient.
+      const code = (res.error as NodeJS.ErrnoException).code;
+      const permanent = code === "ENOENT" || code === "EACCES" || code === "EPERM";
+      return { free: null, permanent };
+    }
+    if (res.status === 1) {
+      // lsof canonical "no matching processes" exit — port is genuinely free.
+      // Guard: on Linux containers with restricted /proc (AppArmor, seccomp,
+      // user namespaces), lsof can exit 1 AND still emit some output for the
+      // processes it could read. Parse stdout when non-empty to avoid false-free.
+      if (res.stdout) {
+        const pids = parsePidsFromLsofOutput(res.stdout);
+        return pids.length === 0 ? { free: true } : { free: false };
+      }
+      return { free: true };
+    }
+    if (res.status !== 0) {
+      // status > 1: runtime/permission/flag error. Cannot confirm port state —
+      // treat as a transient failure and keep polling rather than falsely
+      // reporting the port as free (which would recreate the EADDRINUSE race).
+      return { free: null, permanent: false };
+    }
+    // status === 0: lsof found listeners. Parse pids from the stdout we
+    // already hold — no second lsof spawn, no new failure surface.
+    const pids = parsePidsFromLsofOutput(res.stdout);
+    return pids.length === 0 ? { free: true } : { free: false };
+  } catch {
+    return { free: null, permanent: false };
+  }
 }
 
 /**
  * Synchronously terminate stale gateway processes.
+ * Callers must pass a non-empty pids array.
  * Sends SIGTERM, waits briefly, then SIGKILL for survivors.
  */
 function terminateStaleProcessesSync(pids: number[]): number[] {
-  if (pids.length === 0) {
-    return [];
-  }
   const killed: number[] = [];
   for (const pid of pids) {
     try {
@@ -100,8 +209,48 @@ function terminateStaleProcessesSync(pids: number[]): number[] {
   return killed;
 }
 
+/**
+ * Poll the given port until it is confirmed free, lsof is confirmed unavailable,
+ * or the wall-clock budget expires.
+ *
+ * Each poll invocation uses POLL_SPAWN_TIMEOUT_MS (400 ms), which is
+ * significantly shorter than PORT_FREE_TIMEOUT_MS (2000 ms). This ensures
+ * that a single slow or hung lsof call cannot consume the entire polling
+ * budget and cause the function to exit prematurely with an inconclusive
+ * result. Up to five independent lsof attempts fit within the budget.
+ *
+ * Exit conditions:
+ *   - `pollPortOnce` returns `{ free: true }`                    → port confirmed free
+ *   - `pollPortOnce` returns `{ free: null, permanent: true }`   → lsof unavailable, bail
+ *   - `pollPortOnce` returns `{ free: false }`                   → port busy, sleep + retry
+ *   - `pollPortOnce` returns `{ free: null, permanent: false }`  → transient error, sleep + retry
+ *   - Wall-clock deadline exceeded                               → log warning, proceed anyway
+ */
+function waitForPortFreeSync(port: number): void {
+  const deadline = getTimeMs() + PORT_FREE_TIMEOUT_MS;
+  while (getTimeMs() < deadline) {
+    const result = pollPortOnce(port);
+    if (result.free === true) {
+      return;
+    }
+    if (result.free === null && result.permanent) {
+      // lsof is permanently unavailable (ENOENT / EACCES) — bail immediately,
+      // no point spinning the remaining budget.
+      return;
+    }
+    // result.free === false: port still bound.
+    // result.free === null && !permanent: transient lsof error — keep polling.
+    sleepSync(PORT_FREE_POLL_INTERVAL_MS);
+  }
+  restartLog.warn(`port ${port} still in use after ${PORT_FREE_TIMEOUT_MS}ms; proceeding anyway`);
+}
+
 /**
  * Inspect the gateway port and kill any stale gateway processes holding it.
+ * Blocks until the port is confirmed free (or the poll budget expires) so
+ * the supervisor (systemd / launchctl) does not race a zombie process for
+ * the port and enter an EADDRINUSE restart loop.
+ *
  * Called before service restart commands to prevent port conflicts.
  */
 export function cleanStaleGatewayProcessesSync(): number[] {
@@ -114,7 +263,14 @@ export function cleanStaleGatewayProcessesSync(): number[] {
     restartLog.warn(
       `killing ${stalePids.length} stale gateway process(es) before restart: ${stalePids.join(", ")}`,
     );
-    return terminateStaleProcessesSync(stalePids);
+    const killed = terminateStaleProcessesSync(stalePids);
+    // Wait for the port to be released before returning — called unconditionally
+    // even when `killed` is empty (all pids were already dead before SIGTERM).
+    // A process can exit before our signal arrives yet still leave its socket
+    // in TIME_WAIT / FIN_WAIT; polling is the only reliable way to confirm the
+    // kernel has fully released the port before systemd fires the new process.
+    waitForPortFreeSync(port);
+    return killed;
   } catch {
     return [];
   }
@@ -124,4 +280,9 @@ export const __testing = {
   setSleepSyncOverride(fn: ((ms: number) => void) | null) {
     sleepSyncOverride = fn;
   },
+  setDateNowOverride(fn: (() => number) | null) {
+    dateNowOverride = fn;
+  },
+  /** Invoke sleepSync directly (bypasses the override) for unit-testing the real Atomics path. */
+  callSleepSyncRaw: sleepSync,
 };
diff --git a/src/infra/restart.ts b/src/infra/restart.ts
index c84dfc6f7ac..3f65cfc1614 100644
--- a/src/infra/restart.ts
+++ b/src/infra/restart.ts
@@ -1,4 +1,6 @@
 import { spawnSync } from "node:child_process";
+import os from "node:os";
+import path from "node:path";
 import {
   resolveGatewayLaunchAgentLabel,
   resolveGatewaySystemdServiceName,
@@ -335,7 +337,8 @@ export function triggerOpenClawRestart(): RestartAttempt {
     process.env.OPENCLAW_LAUNCHD_LABEL ||
     resolveGatewayLaunchAgentLabel(process.env.OPENCLAW_PROFILE);
   const uid = typeof process.getuid === "function" ? process.getuid() : undefined;
-  const target = uid !== undefined ? `gui/${uid}/${label}` : label;
+  const domain = uid !== undefined ? `gui/${uid}` : "gui/501";
+  const target = `${domain}/${label}`;
   const args = ["kickstart", "-k", target];
   tried.push(`launchctl ${args.join(" ")}`);
   const res = spawnSync("launchctl", args, {
@@ -345,10 +348,39 @@ export function triggerOpenClawRestart(): RestartAttempt {
   if (!res.error && res.status === 0) {
     return { ok: true, method: "launchctl", tried };
   }
+
+  // kickstart fails when the service was previously booted out (deregistered from launchd).
+  // Fall back to bootstrap (re-register from plist) + kickstart.
+  // Use env HOME to match how launchd.ts resolves the plist install path.
+  const home = process.env.HOME?.trim() || os.homedir();
+  const plistPath = path.join(home, "Library", "LaunchAgents", `${label}.plist`);
+  const bootstrapArgs = ["bootstrap", domain, plistPath];
+  tried.push(`launchctl ${bootstrapArgs.join(" ")}`);
+  const boot = spawnSync("launchctl", bootstrapArgs, {
+    encoding: "utf8",
+    timeout: SPAWN_TIMEOUT_MS,
+  });
+  if (boot.error || (boot.status !== 0 && boot.status !== null)) {
+    return {
+      ok: false,
+      method: "launchctl",
+      detail: formatSpawnDetail(boot),
+      tried,
+    };
+  }
+  const retryArgs = ["kickstart", "-k", target];
+  tried.push(`launchctl ${retryArgs.join(" ")}`);
+  const retry = spawnSync("launchctl", retryArgs, {
+    encoding: "utf8",
+    timeout: SPAWN_TIMEOUT_MS,
+  });
+  if (!retry.error && retry.status === 0) {
+    return { ok: true, method: "launchctl", tried };
+  }
   return {
     ok: false,
     method: "launchctl",
-    detail: formatSpawnDetail(res),
+    detail: formatSpawnDetail(retry),
     tried,
   };
 }