fix(agents): clean false-live session locks (#76854)

Summary:
- Clean false-live session locks whose live PID resolves to a non-OpenClaw owner.
- Tighten owner argv detection so generic JS entrypoints require an OpenClaw command token.
- Add regression coverage for generic non-OpenClaw entrypoints and preserve real gateway argv.

Verification:
- pnpm test src/agents/session-write-lock.test.ts src/commands/doctor-session-locks.test.ts
- pnpm test src/infra/gateway-process-argv.test.ts
- pnpm exec oxfmt --check --threads=1 src/agents/session-write-lock.ts src/agents/session-write-lock.test.ts src/commands/doctor-session-locks.ts src/commands/doctor-session-locks.test.ts docs/gateway/doctor.md CHANGELOG.md
- node scripts/run-oxlint.mjs src/agents/session-write-lock.ts src/agents/session-write-lock.test.ts src/commands/doctor-session-locks.ts src/commands/doctor-session-locks.test.ts
- CI workflow success on d42f9c46ef

Co-authored-by: mkdev11 <MkDev11@users.noreply.github.com>
This commit is contained in:
MkDev11
2026-05-09 23:13:55 -07:00
committed by GitHub
parent c919702b8f
commit e37a3050d9
6 changed files with 304 additions and 14 deletions

View File

@@ -1040,6 +1040,7 @@ Docs: https://docs.openclaw.ai
- Slack: collapse routine Socket Mode pong-timeout reconnects into one OpenClaw reconnect line and suppress the duplicate Slack SDK pong warning.
- Gateway/diagnostics: abort-drain embedded runs after an extended no-progress stall so a single dead session no longer leaves queued Discord/channel turns blocked behind repeated `recovery=none` liveness warnings.
- Plugins/ClawHub: accept the live artifact resolver `kind`/`sha256` field names alongside the typed `artifactKind`/`artifactSha256` form so `clawhub:` installs of npm-pack and legacy ZIP packages no longer miss downloadable artifacts. Thanks @romneyda.
- Doctor/session locks: remove fresh session write-lock files when their live PID can be read and proven to belong to a non-OpenClaw process, while preserving active or unknown owners. Fixes #76823. Thanks @renatomaluhy.
- Control UI/Sessions: avoid full `sessions.list` reloads for chat-turn `sessions.changed` payloads, so large session stores no longer add multi-second delays while chat responses are being delivered. (#76676) Thanks @VACInc.
- Gateway/watch: run `doctor --fix --non-interactive` once and retry when the dev Gateway child exits during startup, so stale local plugin install/config state does not leave the tmux watch session disappearing without a repair attempt.
- Doctor/Telegram: warn when selected Telegram quote replies can suppress `streaming.preview.toolProgress`, and document the `replyToMode` trade-off without changing runtime delivery. Fixes #73487. Thanks @GodsBoy.

View File

@@ -319,7 +319,7 @@ That stages grounded durable candidates into the short-term dreaming store while
</Accordion>
<Accordion title="3c. Session lock cleanup">
Doctor scans every agent session directory for stale write-lock files — files left behind when a session exited abnormally. For each lock file found it reports: the path, PID, whether the PID is still alive, lock age, and whether it is considered stale (dead PID or older than 30 minutes). In `--fix` / `--repair` mode it removes stale lock files automatically; otherwise it prints a note and instructs you to rerun with `--fix`.
Doctor scans every agent session directory for stale write-lock files — files left behind when a session exited abnormally. For each lock file found it reports: the path, PID, whether the PID is still alive, lock age, and whether it is considered stale (dead PID, older than 30 minutes, or a live PID that can be proven to belong to a non-OpenClaw process). If the owning process's command line cannot be read, the lock is not treated as stale on that basis. In `--fix` / `--repair` mode it removes stale lock files automatically; otherwise it prints a note and instructs you to rerun with `--fix`.
</Accordion>
<Accordion title="3d. Session transcript branch repair">
Doctor scans agent session JSONL files for the duplicated branch shape created by the 2026.4.24 prompt transcript rewrite bug: an abandoned user turn with OpenClaw internal runtime context plus an active sibling containing the same visible user prompt. In `--fix` / `--repair` mode, doctor backs up each affected file next to the original and rewrites the transcript to the active branch so gateway history and memory readers no longer see duplicate turns.

View File

@@ -407,6 +407,7 @@ describe("acquireSessionWriteLock", () => {
staleMs: 30_000,
nowMs,
removeStale: true,
readOwnerProcessArgs: () => ["node", "/opt/openclaw/openclaw.mjs", "agent"],
});
expect(result.locks).toHaveLength(3);
@@ -424,6 +425,153 @@ describe("acquireSessionWriteLock", () => {
}
});
it("cleans fresh live .jsonl lock files owned by a non-OpenClaw process", async () => {
  // The lock is brand new (createdAt === nowMs) and names this live test PID,
  // so the injected owner argv is the only thing that can make it stale.
  const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
  const sessionsDir = path.join(tmpRoot, "sessions");
  await fs.mkdir(sessionsDir, { recursive: true });
  const nowMs = Date.now();
  const lockName = "false-live.jsonl.lock";
  const lockPath = path.join(sessionsDir, lockName);
  try {
    const payload = { pid: process.pid, createdAt: new Date(nowMs).toISOString() };
    await fs.writeFile(lockPath, JSON.stringify(payload), "utf8");

    const { locks, cleaned } = await cleanStaleLockFiles({
      sessionsDir,
      staleMs: 30_000,
      nowMs,
      removeStale: true,
      // Owner argv with no OpenClaw marker at all.
      readOwnerProcessArgs: () => ["python", "worker.py"],
    });

    expect(locks).toHaveLength(1);
    const cleanedNames = cleaned.map((entry) => path.basename(entry.lockPath));
    expect(cleanedNames).toEqual([lockName]);
    expect(cleaned[0]?.staleReasons).toContain("non-openclaw-owner");
    // fs.access rejects once the lock file has been removed from disk.
    await expect(fs.access(lockPath)).rejects.toThrow();
  } finally {
    await fs.rm(tmpRoot, { recursive: true, force: true });
  }
});
it("cleans fresh live .jsonl lock files owned by generic non-OpenClaw entrypoints", async () => {
  // Regression case: the owner argv ends in a generic `dist/index.js` entry
  // but carries no `gateway`/`agent` token.
  const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
  const sessionsDir = path.join(tmpRoot, "sessions");
  await fs.mkdir(sessionsDir, { recursive: true });
  const nowMs = Date.now();
  const lockName = "false-live-generic-entry.jsonl.lock";
  const lockPath = path.join(sessionsDir, lockName);
  try {
    const payload = { pid: process.pid, createdAt: new Date(nowMs).toISOString() };
    await fs.writeFile(lockPath, JSON.stringify(payload), "utf8");

    const { cleaned } = await cleanStaleLockFiles({
      sessionsDir,
      staleMs: 30_000,
      nowMs,
      removeStale: true,
      readOwnerProcessArgs: () => ["node", "/srv/app/dist/index.js"],
    });

    const cleanedNames = cleaned.map((entry) => path.basename(entry.lockPath));
    expect(cleanedNames).toEqual([lockName]);
    expect(cleaned[0]?.staleReasons).toContain("non-openclaw-owner");
    // fs.access rejects once the lock file has been removed from disk.
    await expect(fs.access(lockPath)).rejects.toThrow();
  } finally {
    await fs.rm(tmpRoot, { recursive: true, force: true });
  }
});
it("keeps fresh live .jsonl lock files with OpenClaw or unknown owners", async () => {
  const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
  const sessionsDir = path.join(tmpRoot, "sessions");
  await fs.mkdir(sessionsDir, { recursive: true });
  const nowMs = Date.now();

  // Writes a fresh lock for this PID, runs the cleanup with the given owner
  // argv, asserts the lock was neither reported as cleaned nor deleted, and
  // returns the lock path so the caller can clear it before the next case.
  const expectLockKept = async (
    lockName: string,
    ownerArgs: string[] | null,
  ): Promise<string> => {
    const lockPath = path.join(sessionsDir, lockName);
    const payload = { pid: process.pid, createdAt: new Date(nowMs).toISOString() };
    await fs.writeFile(lockPath, JSON.stringify(payload), "utf8");
    const outcome = await cleanStaleLockFiles({
      sessionsDir,
      staleMs: 30_000,
      nowMs,
      removeStale: true,
      readOwnerProcessArgs: () => ownerArgs,
    });
    expect(outcome.cleaned).toEqual([]);
    await expect(fs.access(lockPath)).resolves.toBeUndefined();
    return lockPath;
  };

  try {
    // Owner argv names the OpenClaw launcher script.
    const openclawLock = await expectLockKept("openclaw-live.jsonl.lock", [
      "node",
      "/opt/openclaw/openclaw.mjs",
      "agent",
    ]);
    await fs.rm(openclawLock, { force: true });

    // Generic entrypoint, but accompanied by the `gateway` command token.
    const gatewayLock = await expectLockKept("gateway-live.jsonl.lock", [
      "node",
      "dist/index.js",
      "gateway",
      "run",
    ]);
    await fs.rm(gatewayLock, { force: true });

    // Reader cannot determine the owner argv at all.
    await expectLockKept("unknown-live.jsonl.lock", null);
  } finally {
    await fs.rm(tmpRoot, { recursive: true, force: true });
  }
});
it("cleans untracked current-process .jsonl lock files with matching starttime", async () => {
pinCurrentProcessStartTimeForTest();
const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));

View File

@@ -3,6 +3,7 @@ import type fsSync from "node:fs";
import fs from "node:fs/promises";
import path from "node:path";
import { createFileLockManager } from "../infra/file-lock-manager.js";
import { readGatewayProcessArgsSync as readProcessArgsSync } from "../infra/gateway-processes.js";
import { getProcessStartTime, isPidAlive } from "../shared/pid-alive.js";
import { SessionWriteLockTimeoutError } from "./session-write-lock-error.js";
@@ -28,6 +29,8 @@ export type SessionLockInspection = {
removed: boolean;
};
export type SessionLockOwnerProcessArgsReader = (pid: number) => string[] | null;
const CLEANUP_SIGNALS = ["SIGINT", "SIGTERM", "SIGQUIT", "SIGABRT"] as const;
type CleanupSignal = (typeof CLEANUP_SIGNALS)[number];
const CLEANUP_STATE_KEY = Symbol.for("openclaw.sessionWriteLockCleanupState");
@@ -295,6 +298,56 @@ async function resolveNormalizedSessionFile(sessionFile: string): Promise<string
}
}
/**
 * Canonicalizes a single argv entry for owner matching: trims surrounding
 * whitespace, converts backslashes to forward slashes, and lowercases.
 */
function normalizeOwnerProcessArg(arg: string): string {
  return arg.trim().replace(/\\/g, "/").toLowerCase();
}

/**
 * Decides whether a process command line looks like an OpenClaw process.
 *
 * Matching is done on normalized argv entries (see normalizeOwnerProcessArg),
 * with blank entries dropped. The argv is accepted when any of these hold:
 * - the executable (argv[0], minus a trailing `.bat`/`.cmd`/`.exe`) is
 *   `openclaw` or `openclaw-gateway`, either bare or as the last path segment;
 * - any entry is `openclaw` / `openclaw.mjs`, bare or as the last path segment;
 * - any entry ends with a known generic entrypoint path AND the argv also
 *   contains a `gateway` or `agent` token.
 *
 * @param args Raw argv of the candidate owner process.
 * @returns `true` when the argv matches one of the OpenClaw shapes above;
 *   `false` otherwise, including for an empty or all-blank argv.
 */
function isOpenClawSessionOwnerArgv(args: string[]): boolean {
  const normalized = args.map(normalizeOwnerProcessArg).filter(Boolean);
  if (normalized.length === 0) {
    return false;
  }

  const exe = (normalized[0] ?? "").replace(/\.(bat|cmd|exe)$/i, "");
  // The bare `openclaw-gateway` form is accepted alongside the path-qualified
  // one, mirroring how `openclaw` is handled. A false negative here marks a
  // live lock as stale, so the check errs toward recognizing the owner.
  if (
    exe === "openclaw" ||
    exe === "openclaw-gateway" ||
    exe.endsWith("/openclaw") ||
    exe.endsWith("/openclaw-gateway")
  ) {
    return true;
  }

  const mentionsOpenClawLauncher = normalized.some(
    (arg) =>
      arg === "openclaw" ||
      arg.endsWith("/openclaw") ||
      arg === "openclaw.mjs" ||
      arg.endsWith("/openclaw.mjs"),
  );
  if (mentionsOpenClawLauncher) {
    return true;
  }

  // Generic JS/TS entrypoints are shared with unrelated apps, so they only
  // count when an OpenClaw command token is present too. The token check is
  // argv-wide, so it is evaluated once instead of once per entry.
  const hasOpenClawCommandToken = normalized.some((arg) => arg === "gateway" || arg === "agent");
  if (!hasOpenClawCommandToken) {
    return false;
  }
  const entryCandidates = [
    "dist/index.js",
    "dist/entry.js",
    "scripts/run-node.mjs",
    "src/entry.ts",
    "src/index.ts",
  ];
  return normalized.some((arg) => entryCandidates.some((entry) => arg.endsWith(entry)));
}
/**
 * Calls the injected argv reader defensively.
 *
 * @param reader Callback that returns the argv for a PID, or `null`.
 * @param pid PID whose argv should be read.
 * @returns The reader's array result unchanged, or `null` when the reader
 *   throws or returns anything that is not an array.
 */
function readOwnerProcessArgs(
  reader: SessionLockOwnerProcessArgsReader,
  pid: number,
): string[] | null {
  let result: string[] | null;
  try {
    result = reader(pid);
  } catch {
    // A failing reader means the owner is unknown, not that the lookup failed loudly.
    return null;
  }
  if (!Array.isArray(result)) {
    return null;
  }
  return result;
}
function inspectLockPayload(
payload: LockFilePayload | null,
staleMs: number,
@@ -342,6 +395,29 @@ function inspectLockPayload(
};
}
/**
 * Reports whether a lock that otherwise looks fresh and live can be shown to
 * belong to a process that is not OpenClaw.
 *
 * Returns `false` (leave the lock alone) whenever the answer is not certain:
 * the lock is already stale, has no PID or a dead PID, is held by this
 * process, carries an invalid or non-positive payload PID, or the owner argv
 * is unavailable or entirely blank.
 *
 * @param params.payload Parsed lock file contents, or `null`.
 * @param params.inspected Inspection result already computed for the payload.
 * @param params.heldByThisProcess Whether this process holds the lock.
 * @param params.readOwnerProcessArgs Reader used to fetch the owner's argv.
 * @returns `true` only when the owner argv was read and does not match an
 *   OpenClaw command line.
 */
function shouldTreatAsNonOpenClawOwner(params: {
  payload: LockFilePayload | null;
  inspected: LockInspectionDetails;
  heldByThisProcess: boolean;
  readOwnerProcessArgs: SessionLockOwnerProcessArgsReader;
}): boolean {
  const { payload, inspected, heldByThisProcess } = params;

  // Only fresh locks with a live PID are candidates for the owner check.
  const freshAndAlive = !inspected.stale && inspected.pid !== null && inspected.pidAlive;
  if (!freshAndAlive) {
    return false;
  }

  // A lock this process holds is never reclassified.
  const ownedByCurrentProcess = inspected.pid === process.pid && heldByThisProcess;
  if (ownedByCurrentProcess) {
    return false;
  }

  const ownerPid = payload?.pid;
  if (!isValidLockNumber(ownerPid) || ownerPid <= 0) {
    return false;
  }

  // An unreadable or blank argv means the owner is unknown, so keep the lock.
  const ownerArgs = readOwnerProcessArgs(params.readOwnerProcessArgs, ownerPid);
  if (ownerArgs === null || !ownerArgs.some((arg) => arg.trim().length > 0)) {
    return false;
  }

  return !isOpenClawSessionOwnerArgv(ownerArgs);
}
function lockInspectionNeedsMtimeStaleFallback(details: LockInspectionDetails): boolean {
return (
details.stale &&
@@ -383,6 +459,7 @@ async function removeReportedStaleLockIfStillStale(params: {
lockPath: string;
normalizedSessionFile: string;
staleMs: number;
readOwnerProcessArgs?: SessionLockOwnerProcessArgsReader;
}): Promise<boolean> {
const nowMs = Date.now();
const payload = await readLockPayload(params.lockPath);
@@ -392,6 +469,7 @@ async function removeReportedStaleLockIfStillStale(params: {
nowMs,
heldByThisProcess: sessionLockHeldByThisProcess(params.normalizedSessionFile),
reclaimLockWithoutStarttime: true,
readOwnerProcessArgs: params.readOwnerProcessArgs ?? readProcessArgsSync,
});
if (!(await shouldReclaimContendedLockFile(params.lockPath, inspected, params.staleMs, nowMs))) {
return false;
@@ -430,24 +508,41 @@ function inspectLockPayloadForSession(params: {
nowMs: number;
heldByThisProcess: boolean;
reclaimLockWithoutStarttime: boolean;
readOwnerProcessArgs: SessionLockOwnerProcessArgsReader;
}): LockInspectionDetails {
const inspected = inspectLockPayload(params.payload, params.staleMs, params.nowMs);
if (
!shouldTreatAsOrphanSelfLock({
shouldTreatAsOrphanSelfLock({
payload: params.payload,
heldByThisProcess: params.heldByThisProcess,
reclaimLockWithoutStarttime: params.reclaimLockWithoutStarttime,
})
) {
return inspected;
return {
...inspected,
stale: true,
staleReasons: inspected.staleReasons.includes("orphan-self-pid")
? inspected.staleReasons
: [...inspected.staleReasons, "orphan-self-pid"],
};
}
return {
...inspected,
stale: true,
staleReasons: inspected.staleReasons.includes("orphan-self-pid")
? inspected.staleReasons
: [...inspected.staleReasons, "orphan-self-pid"],
};
if (
shouldTreatAsNonOpenClawOwner({
payload: params.payload,
inspected,
heldByThisProcess: params.heldByThisProcess,
readOwnerProcessArgs: params.readOwnerProcessArgs,
})
) {
return {
...inspected,
stale: true,
staleReasons: [...inspected.staleReasons, "non-openclaw-owner"],
};
}
return inspected;
}
export async function cleanStaleLockFiles(params: {
@@ -455,6 +550,7 @@ export async function cleanStaleLockFiles(params: {
staleMs?: number;
removeStale?: boolean;
nowMs?: number;
readOwnerProcessArgs?: SessionLockOwnerProcessArgsReader;
log?: {
warn?: (message: string) => void;
info?: (message: string) => void;
@@ -464,6 +560,7 @@ export async function cleanStaleLockFiles(params: {
const staleMs = resolvePositiveMs(params.staleMs, DEFAULT_STALE_MS);
const removeStale = params.removeStale !== false;
const nowMs = params.nowMs ?? Date.now();
const ownerProcessArgsReader = params.readOwnerProcessArgs ?? readProcessArgsSync;
let entries: fsSync.Dirent[] = [];
try {
@@ -491,6 +588,7 @@ export async function cleanStaleLockFiles(params: {
nowMs,
heldByThisProcess: false,
reclaimLockWithoutStarttime: false,
readOwnerProcessArgs: ownerProcessArgsReader,
});
const lockInfo: SessionLockInspection = {
lockPath,
@@ -558,6 +656,7 @@ export async function acquireSessionWriteLock(params: {
nowMs,
heldByThisProcess,
reclaimLockWithoutStarttime: true,
readOwnerProcessArgs: readProcessArgsSync,
});
return await shouldReclaimContendedLockFile(lockPath, inspected, staleMs, nowMs);
},

View File

@@ -48,7 +48,11 @@ describe("noteSessionLockHealth", () => {
"utf8",
);
await noteSessionLockHealth({ shouldRepair: false, staleMs: 60_000 });
await noteSessionLockHealth({
shouldRepair: false,
staleMs: 60_000,
readOwnerProcessArgs: () => ["node", "/opt/openclaw/openclaw.mjs", "doctor"],
});
expect(note).toHaveBeenCalledTimes(1);
const [message, title] = note.mock.calls[0] as [string, string];
@@ -77,7 +81,11 @@ describe("noteSessionLockHealth", () => {
"utf8",
);
await noteSessionLockHealth({ shouldRepair: true, staleMs: 30_000 });
await noteSessionLockHealth({
shouldRepair: true,
staleMs: 30_000,
readOwnerProcessArgs: () => ["node", "/opt/openclaw/openclaw.mjs", "doctor"],
});
expect(note).toHaveBeenCalledTimes(1);
const [message] = note.mock.calls[0] as [string, string];
@@ -87,4 +95,29 @@ describe("noteSessionLockHealth", () => {
await expectPathMissing(staleLock);
await expect(fs.access(freshLock)).resolves.toBeUndefined();
});
it("removes fresh live locks when the owner is not an OpenClaw process", async () => {
  const sessionsDir = state.sessionsDir();
  await fs.mkdir(sessionsDir, { recursive: true });

  // Fresh lock recorded under this live test PID.
  const lockPath = path.join(sessionsDir, "false-live.jsonl.lock");
  const payload = { pid: process.pid, createdAt: new Date().toISOString() };
  await fs.writeFile(lockPath, JSON.stringify(payload), "utf8");

  await noteSessionLockHealth({
    shouldRepair: true,
    staleMs: 60_000,
    // Owner argv with no OpenClaw marker.
    readOwnerProcessArgs: () => ["python", "worker.py"],
  });

  expect(note).toHaveBeenCalledTimes(1);
  const [message] = note.mock.calls[0] as [string, string];
  const expectedFragments = [
    "stale=yes (non-openclaw-owner)",
    "[removed]",
    "Removed 1 stale session lock file",
  ];
  for (const fragment of expectedFragments) {
    expect(message).toContain(fragment);
  }
  // fs.access rejects once the lock file has been removed from disk.
  await expect(fs.access(lockPath)).rejects.toThrow();
});
});

View File

@@ -1,5 +1,9 @@
import { resolveAgentSessionDirs } from "../agents/session-dirs.js";
import { cleanStaleLockFiles, type SessionLockInspection } from "../agents/session-write-lock.js";
import {
cleanStaleLockFiles,
type SessionLockInspection,
type SessionLockOwnerProcessArgsReader,
} from "../agents/session-write-lock.js";
import { resolveStateDir } from "../config/paths.js";
import { note } from "../terminal/note.js";
import { shortenHomePath } from "../utils.js";
@@ -35,7 +39,11 @@ function formatLockLine(lock: SessionLockInspection): string {
return `- ${shortenHomePath(lock.lockPath)} ${pidStatus} ${ageStatus} ${staleStatus}${removedStatus}`;
}
export async function noteSessionLockHealth(params?: { shouldRepair?: boolean; staleMs?: number }) {
export async function noteSessionLockHealth(params?: {
shouldRepair?: boolean;
staleMs?: number;
readOwnerProcessArgs?: SessionLockOwnerProcessArgsReader;
}) {
const shouldRepair = params?.shouldRepair === true;
const staleMs = params?.staleMs ?? DEFAULT_STALE_MS;
let sessionDirs: string[] = [];
@@ -56,6 +64,7 @@ export async function noteSessionLockHealth(params?: { shouldRepair?: boolean; s
sessionsDir,
staleMs,
removeStale: shouldRepair,
readOwnerProcessArgs: params?.readOwnerProcessArgs,
});
allLocks.push(...result.locks);
}