diff --git a/src/gateway/gateway-models.profiles.live.test.ts b/src/gateway/gateway-models.profiles.live.test.ts
index 09c4226c3ac..51d30f89143 100644
--- a/src/gateway/gateway-models.profiles.live.test.ts
+++ b/src/gateway/gateway-models.profiles.live.test.ts
@@ -28,7 +28,12 @@ import { DEFAULT_AGENT_ID } from "../routing/session-key.js";
 import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../utils/message-channel.js";
 import { GatewayClient } from "./client.js";
 import { renderCatNoncePngBase64 } from "./live-image-probe.js";
-import { hasExpectedToolNonce, shouldRetryToolReadProbe } from "./live-tool-probe-utils.js";
+import {
+  hasExpectedSingleNonce,
+  hasExpectedToolNonce,
+  shouldRetryExecReadProbe,
+  shouldRetryToolReadProbe,
+} from "./live-tool-probe-utils.js";
 import { startGatewayServer } from "./server.js";
 import { extractPayloadText } from "./test-helpers.agent-results.js";
 
@@ -862,41 +867,86 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
             logProgress(`${progressLabel}: tool-exec`);
             const nonceC = randomUUID();
             const toolWritePath = path.join(tempDir, `write-${runIdTool}.txt`);
-
-            const execReadProbe = await client.request(
-              "agent",
-              {
-                sessionKey,
-                idempotencyKey: `idem-${runIdTool}-exec-read`,
-                message:
-                  "OpenClaw live tool probe (local, safe): " +
-                  "use the tool named `exec` (or `Exec`) to run this command: " +
-                  `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
-                  `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
-                  "Finally reply including the nonce text you read back.",
-                thinking: params.thinkingLevel,
-                deliver: false,
-              },
-              { expectFinal: true },
-            );
-            if (execReadProbe?.status !== "ok") {
-              throw new Error(`exec+read probe failed: status=${String(execReadProbe?.status)}`);
-            }
-            const execReadText = extractPayloadText(execReadProbe?.result);
-            if (
-              isEmptyStreamText(execReadText) &&
-              (model.provider === "minimax" || model.provider === "openai-codex")
+            const maxExecReadAttempts = 3;
+            let execReadText = "";
+            // Set when minimax/openai-codex return an empty response. A `break` inside the retry
+            // loop only ends the retries, so the skip is re-issued once the loop has exited.
+            let skipEmptyExecRead = false;
+            for (
+              let execReadAttempt = 0;
+              execReadAttempt < maxExecReadAttempts;
+              execReadAttempt += 1
             ) {
-              logProgress(`${progressLabel}: skip (${model.provider} empty response)`);
-              break;
+              // Attempts after the first use the stricter "reply with exactly" prompt.
+              const strictReply = execReadAttempt > 0;
+              const execReadProbe = await client.request(
+                "agent",
+                {
+                  sessionKey,
+                  idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`,
+                  message: strictReply
+                    ? "OpenClaw live tool probe (local, safe): " +
+                      "use the tool named `exec` (or `Exec`) to run this command: " +
+                      `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
+                      `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
+                      `Then reply with exactly: ${nonceC}. No extra text.`
+                    : "OpenClaw live tool probe (local, safe): " +
+                      "use the tool named `exec` (or `Exec`) to run this command: " +
+                      `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
+                      `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
+                      "Finally reply including the nonce text you read back.",
+                  thinking: params.thinkingLevel,
+                  deliver: false,
+                },
+                { expectFinal: true },
+              );
+              if (execReadProbe?.status !== "ok") {
+                if (execReadAttempt + 1 < maxExecReadAttempts) {
+                  logProgress(
+                    `${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) status=${String(execReadProbe?.status)}`,
+                  );
+                  continue;
+                }
+                throw new Error(`exec+read probe failed: status=${String(execReadProbe?.status)}`);
+              }
+              execReadText = extractPayloadText(execReadProbe?.result);
+              if (
+                isEmptyStreamText(execReadText) &&
+                (model.provider === "minimax" || model.provider === "openai-codex")
+              ) {
+                logProgress(`${progressLabel}: skip (${model.provider} empty response)`);
+                skipEmptyExecRead = true;
+                break;
+              }
+              assertNoReasoningTags({
+                text: execReadText,
+                model: modelKey,
+                phase: "tool-exec",
+                label: params.label,
+              });
+              if (hasExpectedSingleNonce(execReadText, nonceC)) {
+                break;
+              }
+              if (
+                shouldRetryExecReadProbe({
+                  text: execReadText,
+                  nonce: nonceC,
+                  attempt: execReadAttempt,
+                  maxAttempts: maxExecReadAttempts,
+                })
+              ) {
+                logProgress(
+                  `${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) malformed tool output`,
+                );
+                continue;
+              }
+              throw new Error(`exec+read probe missing nonce: ${execReadText}`);
             }
-            assertNoReasoningTags({
-              text: execReadText,
-              model: modelKey,
-              phase: "tool-exec",
-              label: params.label,
-            });
-            if (!execReadText.includes(nonceC)) {
+            // Skip (rather than fail) when the provider returned an empty response.
+            if (skipEmptyExecRead) {
+              break;
+            }
+            if (!hasExpectedSingleNonce(execReadText, nonceC)) {
               throw new Error(`exec+read probe missing nonce: ${execReadText}`);
             }
 
diff --git a/src/gateway/live-tool-probe-utils.test.ts b/src/gateway/live-tool-probe-utils.test.ts
index ff2468ece53..044bf6b7ede 100644
--- a/src/gateway/live-tool-probe-utils.test.ts
+++ b/src/gateway/live-tool-probe-utils.test.ts
@@ -1,5 +1,10 @@
 import { describe, expect, it } from "vitest";
-import { hasExpectedToolNonce, shouldRetryToolReadProbe } from "./live-tool-probe-utils.js";
+import {
+  hasExpectedSingleNonce,
+  hasExpectedToolNonce,
+  shouldRetryExecReadProbe,
+  shouldRetryToolReadProbe,
+} from "./live-tool-probe-utils.js";
 
 describe("live tool probe utils", () => {
   it("matches nonce pair when both are present", () => {
@@ -7,6 +12,11 @@ describe("live tool probe utils", () => {
     expect(hasExpectedToolNonce("value a-1 only", "a-1", "b-2")).toBe(false);
   });
 
+  it("matches single nonce when present", () => {
+    expect(hasExpectedSingleNonce("value nonce-1", "nonce-1")).toBe(true);
+    expect(hasExpectedSingleNonce("value nonce-2", "nonce-1")).toBe(false);
+  });
+
   it("retries malformed tool output when attempts remain", () => {
     expect(
       shouldRetryToolReadProbe({
@@ -97,4 +107,37 @@ describe("live tool probe utils", () => {
       }),
     ).toBe(false);
   });
+
+  it("retries malformed exec+read output when attempts remain", () => {
+    expect(
+      shouldRetryExecReadProbe({
+        text: "read[object Object]",
+        nonce: "nonce-c",
+        attempt: 0,
+        maxAttempts: 3,
+      }),
+    ).toBe(true);
+  });
+
+  it("does not retry exec+read once max attempts are exhausted", () => {
+    expect(
+      shouldRetryExecReadProbe({
+        text: "read[object Object]",
+        nonce: "nonce-c",
+        attempt: 2,
+        maxAttempts: 3,
+      }),
+    ).toBe(false);
+  });
+
+  it("does not retry exec+read when nonce is present", () => {
+    expect(
+      shouldRetryExecReadProbe({
+        text: "nonce-c",
+        nonce: "nonce-c",
+        attempt: 0,
+        maxAttempts: 3,
+      }),
+    ).toBe(false);
+  });
 });
diff --git a/src/gateway/live-tool-probe-utils.ts b/src/gateway/live-tool-probe-utils.ts
index f38a08724b4..3e450ef530d 100644
--- a/src/gateway/live-tool-probe-utils.ts
+++ b/src/gateway/live-tool-probe-utils.ts
@@ -2,6 +2,30 @@ export function hasExpectedToolNonce(text: string, nonceA: string, nonceB: strin
   return text.includes(nonceA) && text.includes(nonceB);
 }
 
+/** Returns true when `text` contains `nonce` as a substring. */
+export function hasExpectedSingleNonce(text: string, nonce: string): boolean {
+  return text.includes(nonce);
+}
+
+/**
+ * Heuristic used by the retry helpers to flag a reply as malformed: blank text, a literal
+ * "[object Object]", a `read[` fragment, or the word "tool"/"function" (case-insensitive).
+ */
+function hasMalformedToolOutput(text: string): boolean {
+  const trimmed = text.trim();
+  if (!trimmed) {
+    return true;
+  }
+  const lower = trimmed.toLowerCase();
+  if (trimmed.includes("[object Object]")) {
+    return true;
+  }
+  if (/\bread\s*\[/.test(lower) || /\btool\b/.test(lower) || /\bfunction\b/.test(lower)) {
+    return true;
+  }
+  return false;
+}
+
 export function shouldRetryToolReadProbe(params: {
   text: string;
   nonceA: string;
@@ -16,19 +40,31 @@ export function shouldRetryToolReadProbe(params: {
   if (hasExpectedToolNonce(params.text, params.nonceA, params.nonceB)) {
     return false;
   }
-  const trimmed = params.text.trim();
-  if (!trimmed) {
-    return true;
-  }
-  const lower = trimmed.toLowerCase();
-  if (trimmed.includes("[object Object]")) {
-    return true;
-  }
-  if (/\bread\s*\[/.test(lower) || /\btool\b/.test(lower) || /\bfunction\b/.test(lower)) {
+  if (hasMalformedToolOutput(params.text)) {
     return true;
   }
+  const lower = params.text.trim().toLowerCase();
   if (params.provider === "mistral" && (lower.includes("noncea=") || lower.includes("nonceb="))) {
     return true;
   }
   return false;
 }
+
+/**
+ * Decides whether the exec+read probe should be re-sent: only while another attempt remains,
+ * the nonce is absent from `text`, and `text` looks malformed.
+ */
+export function shouldRetryExecReadProbe(params: {
+  text: string;
+  nonce: string;
+  attempt: number;
+  maxAttempts: number;
+}): boolean {
+  if (params.attempt + 1 >= params.maxAttempts) {
+    return false;
+  }
+  if (hasExpectedSingleNonce(params.text, params.nonce)) {
+    return false;
+  }
+  return hasMalformedToolOutput(params.text);
+}