diff --git a/CHANGELOG.md b/CHANGELOG.md index 57f3ffab55a..7073292f293 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1040,6 +1040,7 @@ Docs: https://docs.openclaw.ai - Slack: collapse routine Socket Mode pong-timeout reconnects into one OpenClaw reconnect line and suppress the duplicate Slack SDK pong warning. - Gateway/diagnostics: abort-drain embedded runs after an extended no-progress stall so a single dead session no longer leaves queued Discord/channel turns blocked behind repeated `recovery=none` liveness warnings. - Plugins/ClawHub: accept the live artifact resolver `kind`/`sha256` field names alongside the typed `artifactKind`/`artifactSha256` form so `clawhub:` installs of npm-pack and legacy ZIP packages no longer miss downloadable artifacts. Thanks @romneyda. +- Doctor/session locks: remove fresh session write-lock files when their live PID can be read and proven to belong to a non-OpenClaw process, while preserving active or unknown owners. Fixes #76823. Thanks @renatomaluhy. - Control UI/Sessions: avoid full `sessions.list` reloads for chat-turn `sessions.changed` payloads, so large session stores no longer add multi-second delays while chat responses are being delivered. (#76676) Thanks @VACInc. - Gateway/watch: run `doctor --fix --non-interactive` once and retry when the dev Gateway child exits during startup, so stale local plugin install/config state does not leave the tmux watch session disappearing without a repair attempt. - Doctor/Telegram: warn when selected Telegram quote replies can suppress `streaming.preview.toolProgress`, and document the `replyToMode` trade-off without changing runtime delivery. Fixes #73487. Thanks @GodsBoy. diff --git a/docs/gateway/doctor.md b/docs/gateway/doctor.md index 001c9a39559..24c46ec5d5f 100644 --- a/docs/gateway/doctor.md +++ b/docs/gateway/doctor.md @@ -319,7 +319,7 @@ That stages grounded durable candidates into the short-term dreaming store while - Doctor scans every agent session directory for stale write-lock files — files left behind when a session exited abnormally. For each lock file found it reports: the path, PID, whether the PID is still alive, lock age, and whether it is considered stale (dead PID or older than 30 minutes). In `--fix` / `--repair` mode it removes stale lock files automatically; otherwise it prints a note and instructs you to rerun with `--fix`. + Doctor scans every agent session directory for stale write-lock files — files left behind when a session exited abnormally. For each lock file found it reports: the path, PID, whether the PID is still alive, lock age, and whether it is considered stale (dead PID, older than 30 minutes, or a live PID that can be proven to belong to a non-OpenClaw process). In `--fix` / `--repair` mode it removes stale lock files automatically; otherwise it prints a note and instructs you to rerun with `--fix`. Doctor scans agent session JSONL files for the duplicated branch shape created by the 2026.4.24 prompt transcript rewrite bug: an abandoned user turn with OpenClaw internal runtime context plus an active sibling containing the same visible user prompt. In `--fix` / `--repair` mode, doctor backs up each affected file next to the original and rewrites the transcript to the active branch so gateway history and memory readers no longer see duplicate turns. diff --git a/src/agents/session-write-lock.test.ts b/src/agents/session-write-lock.test.ts index 45e86edce5b..bc4e73d9884 100644 --- a/src/agents/session-write-lock.test.ts +++ b/src/agents/session-write-lock.test.ts @@ -407,6 +407,7 @@ describe("acquireSessionWriteLock", () => { staleMs: 30_000, nowMs, removeStale: true, + readOwnerProcessArgs: () => ["node", "/opt/openclaw/openclaw.mjs", "agent"], }); expect(result.locks).toHaveLength(3); @@ -424,6 +425,153 @@ describe("acquireSessionWriteLock", () => { } }); + it("cleans fresh live .jsonl lock files owned by a non-OpenClaw process", async () => { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-")); + const sessionsDir = path.join(root, "sessions"); + await fs.mkdir(sessionsDir, { recursive: true }); + + const nowMs = Date.now(); + const falseLiveLock = path.join(sessionsDir, "false-live.jsonl.lock"); + + try { + await fs.writeFile( + falseLiveLock, + JSON.stringify({ + pid: process.pid, + createdAt: new Date(nowMs).toISOString(), + }), + "utf8", + ); + + const result = await cleanStaleLockFiles({ + sessionsDir, + staleMs: 30_000, + nowMs, + removeStale: true, + readOwnerProcessArgs: () => ["python", "worker.py"], + }); + + expect(result.locks).toHaveLength(1); + expect(result.cleaned.map((entry) => path.basename(entry.lockPath))).toEqual([ + "false-live.jsonl.lock", + ]); + expect(result.cleaned[0]?.staleReasons).toContain("non-openclaw-owner"); + await expect(fs.access(falseLiveLock)).rejects.toThrow(); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } + }); + + it("cleans fresh live .jsonl lock files owned by generic non-OpenClaw entrypoints", async () => { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-")); + const sessionsDir = path.join(root, "sessions"); + await fs.mkdir(sessionsDir, { recursive: true }); + + const nowMs = Date.now(); + const falseLiveLock = path.join(sessionsDir, "false-live-generic-entry.jsonl.lock"); + + try { + await fs.writeFile( + falseLiveLock, + JSON.stringify({ + pid: process.pid, + createdAt: new Date(nowMs).toISOString(), + }), + "utf8", + ); + + const result = await cleanStaleLockFiles({ + sessionsDir, + staleMs: 30_000, + nowMs, + removeStale: true, + readOwnerProcessArgs: () => ["node", "/srv/app/dist/index.js"], + }); + + expect(result.cleaned.map((entry) => path.basename(entry.lockPath))).toEqual([ + "false-live-generic-entry.jsonl.lock", + ]); + expect(result.cleaned[0]?.staleReasons).toContain("non-openclaw-owner"); + await expect(fs.access(falseLiveLock)).rejects.toThrow(); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } + }); + + it("keeps fresh live .jsonl lock files with OpenClaw or unknown owners", async () => { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-")); + const sessionsDir = path.join(root, "sessions"); + await fs.mkdir(sessionsDir, { recursive: true }); + + const nowMs = Date.now(); + const openclawLock = path.join(sessionsDir, "openclaw-live.jsonl.lock"); + const gatewayLock = path.join(sessionsDir, "gateway-live.jsonl.lock"); + const unknownLock = path.join(sessionsDir, "unknown-live.jsonl.lock"); + + try { + await fs.writeFile( + openclawLock, + JSON.stringify({ + pid: process.pid, + createdAt: new Date(nowMs).toISOString(), + }), + "utf8", + ); + const openclawResult = await cleanStaleLockFiles({ + sessionsDir, + staleMs: 30_000, + nowMs, + removeStale: true, + readOwnerProcessArgs: () => ["node", "/opt/openclaw/openclaw.mjs", "agent"], + }); + + expect(openclawResult.cleaned).toEqual([]); + await expect(fs.access(openclawLock)).resolves.toBeUndefined(); + + await fs.rm(openclawLock, { force: true }); + await fs.writeFile( + gatewayLock, + JSON.stringify({ + pid: process.pid, + createdAt: new Date(nowMs).toISOString(), + }), + "utf8", + ); + const gatewayResult = await cleanStaleLockFiles({ + sessionsDir, + staleMs: 30_000, + nowMs, + removeStale: true, + readOwnerProcessArgs: () => ["node", "dist/index.js", "gateway", "run"], + }); + + expect(gatewayResult.cleaned).toEqual([]); + await expect(fs.access(gatewayLock)).resolves.toBeUndefined(); + + await fs.rm(gatewayLock, { force: true }); + await fs.writeFile( + unknownLock, + JSON.stringify({ + pid: process.pid, + createdAt: new Date(nowMs).toISOString(), + }), + "utf8", + ); + const unknownResult = await cleanStaleLockFiles({ + sessionsDir, + staleMs: 30_000, + nowMs, + removeStale: true, + readOwnerProcessArgs: () => null, + }); + + expect(unknownResult.cleaned).toEqual([]); + await expect(fs.access(unknownLock)).resolves.toBeUndefined(); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } + }); + it("cleans untracked current-process .jsonl lock files with matching starttime", async () => { pinCurrentProcessStartTimeForTest(); const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-")); diff --git a/src/agents/session-write-lock.ts b/src/agents/session-write-lock.ts index ae20cde9268..a548be75929 100644 --- a/src/agents/session-write-lock.ts +++ b/src/agents/session-write-lock.ts @@ -3,6 +3,7 @@ import type fsSync from "node:fs"; import fs from "node:fs/promises"; import path from "node:path"; import { createFileLockManager } from "../infra/file-lock-manager.js"; +import { readGatewayProcessArgsSync as readProcessArgsSync } from "../infra/gateway-processes.js"; import { getProcessStartTime, isPidAlive } from "../shared/pid-alive.js"; import { SessionWriteLockTimeoutError } from "./session-write-lock-error.js"; @@ -28,6 +29,8 @@ export type SessionLockInspection = { removed: boolean; }; +export type SessionLockOwnerProcessArgsReader = (pid: number) => string[] | null; + const CLEANUP_SIGNALS = ["SIGINT", "SIGTERM", "SIGQUIT", "SIGABRT"] as const; type CleanupSignal = (typeof CLEANUP_SIGNALS)[number]; const CLEANUP_STATE_KEY = Symbol.for("openclaw.sessionWriteLockCleanupState"); @@ -295,6 +298,56 @@ async function resolveNormalizedSessionFile(sessionFile: string): Promise + arg === "openclaw" || + arg.endsWith("/openclaw") || + arg === "openclaw.mjs" || + arg.endsWith("/openclaw.mjs"), + ) + ) { + return true; + } + + const entryCandidates = [ + "dist/index.js", + "dist/entry.js", + "scripts/run-node.mjs", + "src/entry.ts", + "src/index.ts", + ]; + const hasOpenClawCommandToken = normalized.some((arg) => arg === "gateway" || arg === "agent"); + return normalized.some( + (arg) => entryCandidates.some((entry) => arg.endsWith(entry)) && hasOpenClawCommandToken, + ); +} + +function readOwnerProcessArgs( + reader: SessionLockOwnerProcessArgsReader, + pid: number, +): string[] | null { + try { + const args = reader(pid); + return Array.isArray(args) ? args : null; + } catch { + return null; + } +} + function inspectLockPayload( payload: LockFilePayload | null, staleMs: number, @@ -342,6 +395,29 @@ function inspectLockPayload( }; } +function shouldTreatAsNonOpenClawOwner(params: { + payload: LockFilePayload | null; + inspected: LockInspectionDetails; + heldByThisProcess: boolean; + readOwnerProcessArgs: SessionLockOwnerProcessArgsReader; +}): boolean { + if (params.inspected.stale || params.inspected.pid === null || !params.inspected.pidAlive) { + return false; + } + if (params.inspected.pid === process.pid && params.heldByThisProcess) { + return false; + } + if (!isValidLockNumber(params.payload?.pid) || params.payload.pid <= 0) { + return false; + } + + const args = readOwnerProcessArgs(params.readOwnerProcessArgs, params.payload.pid); + if (!args || args.every((arg) => !arg.trim())) { + return false; + } + return !isOpenClawSessionOwnerArgv(args); +} + function lockInspectionNeedsMtimeStaleFallback(details: LockInspectionDetails): boolean { return ( details.stale && @@ -383,6 +459,7 @@ async function removeReportedStaleLockIfStillStale(params: { lockPath: string; normalizedSessionFile: string; staleMs: number; + readOwnerProcessArgs?: SessionLockOwnerProcessArgsReader; }): Promise { const nowMs = Date.now(); const payload = await readLockPayload(params.lockPath); @@ -392,6 +469,7 @@ async function removeReportedStaleLockIfStillStale(params: { nowMs, heldByThisProcess: sessionLockHeldByThisProcess(params.normalizedSessionFile), reclaimLockWithoutStarttime: true, + readOwnerProcessArgs: params.readOwnerProcessArgs ?? readProcessArgsSync, }); if (!(await shouldReclaimContendedLockFile(params.lockPath, inspected, params.staleMs, nowMs))) { return false; @@ -430,24 +508,41 @@ function inspectLockPayloadForSession(params: { nowMs: number; heldByThisProcess: boolean; reclaimLockWithoutStarttime: boolean; + readOwnerProcessArgs: SessionLockOwnerProcessArgsReader; }): LockInspectionDetails { const inspected = inspectLockPayload(params.payload, params.staleMs, params.nowMs); if ( - !shouldTreatAsOrphanSelfLock({ + shouldTreatAsOrphanSelfLock({ payload: params.payload, heldByThisProcess: params.heldByThisProcess, reclaimLockWithoutStarttime: params.reclaimLockWithoutStarttime, }) ) { - return inspected; + return { + ...inspected, + stale: true, + staleReasons: inspected.staleReasons.includes("orphan-self-pid") + ? inspected.staleReasons + : [...inspected.staleReasons, "orphan-self-pid"], + }; } - return { - ...inspected, - stale: true, - staleReasons: inspected.staleReasons.includes("orphan-self-pid") - ? inspected.staleReasons - : [...inspected.staleReasons, "orphan-self-pid"], - }; + + if ( + shouldTreatAsNonOpenClawOwner({ + payload: params.payload, + inspected, + heldByThisProcess: params.heldByThisProcess, + readOwnerProcessArgs: params.readOwnerProcessArgs, + }) + ) { + return { + ...inspected, + stale: true, + staleReasons: [...inspected.staleReasons, "non-openclaw-owner"], + }; + } + + return inspected; } export async function cleanStaleLockFiles(params: { @@ -455,6 +550,7 @@ export async function cleanStaleLockFiles(params: { staleMs?: number; removeStale?: boolean; nowMs?: number; + readOwnerProcessArgs?: SessionLockOwnerProcessArgsReader; log?: { warn?: (message: string) => void; info?: (message: string) => void; @@ -464,6 +560,7 @@ export async function cleanStaleLockFiles(params: { const staleMs = resolvePositiveMs(params.staleMs, DEFAULT_STALE_MS); const removeStale = params.removeStale !== false; const nowMs = params.nowMs ?? Date.now(); + const ownerProcessArgsReader = params.readOwnerProcessArgs ?? readProcessArgsSync; let entries: fsSync.Dirent[] = []; try { @@ -491,6 +588,7 @@ export async function cleanStaleLockFiles(params: { nowMs, heldByThisProcess: false, reclaimLockWithoutStarttime: false, + readOwnerProcessArgs: ownerProcessArgsReader, }); const lockInfo: SessionLockInspection = { lockPath, @@ -558,6 +656,7 @@ export async function acquireSessionWriteLock(params: { nowMs, heldByThisProcess, reclaimLockWithoutStarttime: true, + readOwnerProcessArgs: readProcessArgsSync, }); return await shouldReclaimContendedLockFile(lockPath, inspected, staleMs, nowMs); }, diff --git a/src/commands/doctor-session-locks.test.ts b/src/commands/doctor-session-locks.test.ts index 5254703713c..4dd56a5b8c5 100644 --- a/src/commands/doctor-session-locks.test.ts +++ b/src/commands/doctor-session-locks.test.ts @@ -48,7 +48,11 @@ describe("noteSessionLockHealth", () => { "utf8", ); - await noteSessionLockHealth({ shouldRepair: false, staleMs: 60_000 }); + await noteSessionLockHealth({ + shouldRepair: false, + staleMs: 60_000, + readOwnerProcessArgs: () => ["node", "/opt/openclaw/openclaw.mjs", "doctor"], + }); expect(note).toHaveBeenCalledTimes(1); const [message, title] = note.mock.calls[0] as [string, string]; @@ -77,7 +81,11 @@ describe("noteSessionLockHealth", () => { "utf8", ); - await noteSessionLockHealth({ shouldRepair: true, staleMs: 30_000 }); + await noteSessionLockHealth({ + shouldRepair: true, + staleMs: 30_000, + readOwnerProcessArgs: () => ["node", "/opt/openclaw/openclaw.mjs", "doctor"], + }); expect(note).toHaveBeenCalledTimes(1); const [message] = note.mock.calls[0] as [string, string]; @@ -87,4 +95,29 @@ describe("noteSessionLockHealth", () => { await expectPathMissing(staleLock); await expect(fs.access(freshLock)).resolves.toBeUndefined(); }); + + it("removes fresh live locks when the owner is not an OpenClaw process", async () => { + const sessionsDir = state.sessionsDir(); + await fs.mkdir(sessionsDir, { recursive: true }); + + const falseLiveLock = path.join(sessionsDir, "false-live.jsonl.lock"); + await fs.writeFile( + falseLiveLock, + JSON.stringify({ pid: process.pid, createdAt: new Date().toISOString() }), + "utf8", + ); + + await noteSessionLockHealth({ + shouldRepair: true, + staleMs: 60_000, + readOwnerProcessArgs: () => ["python", "worker.py"], + }); + + expect(note).toHaveBeenCalledTimes(1); + const [message] = note.mock.calls[0] as [string, string]; + expect(message).toContain("stale=yes (non-openclaw-owner)"); + expect(message).toContain("[removed]"); + expect(message).toContain("Removed 1 stale session lock file"); + await expect(fs.access(falseLiveLock)).rejects.toThrow(); + }); }); diff --git a/src/commands/doctor-session-locks.ts b/src/commands/doctor-session-locks.ts index d7331c6cf0b..c7377783c3f 100644 --- a/src/commands/doctor-session-locks.ts +++ b/src/commands/doctor-session-locks.ts @@ -1,5 +1,9 @@ import { resolveAgentSessionDirs } from "../agents/session-dirs.js"; -import { cleanStaleLockFiles, type SessionLockInspection } from "../agents/session-write-lock.js"; +import { + cleanStaleLockFiles, + type SessionLockInspection, + type SessionLockOwnerProcessArgsReader, +} from "../agents/session-write-lock.js"; import { resolveStateDir } from "../config/paths.js"; import { note } from "../terminal/note.js"; import { shortenHomePath } from "../utils.js"; @@ -35,7 +39,11 @@ function formatLockLine(lock: SessionLockInspection): string { return `- ${shortenHomePath(lock.lockPath)} ${pidStatus} ${ageStatus} ${staleStatus}${removedStatus}`; } -export async function noteSessionLockHealth(params?: { shouldRepair?: boolean; staleMs?: number }) { +export async function noteSessionLockHealth(params?: { + shouldRepair?: boolean; + staleMs?: number; + readOwnerProcessArgs?: SessionLockOwnerProcessArgsReader; +}) { const shouldRepair = params?.shouldRepair === true; const staleMs = params?.staleMs ?? DEFAULT_STALE_MS; let sessionDirs: string[] = []; @@ -56,6 +64,7 @@ export async function noteSessionLockHealth(params?: { shouldRepair?: boolean; s sessionsDir, staleMs, removeStale: shouldRepair, + readOwnerProcessArgs: params?.readOwnerProcessArgs, }); allLocks.push(...result.locks); }