fix(gateway): retry exec-read live tool probe

This commit is contained in:
Peter Steinberger
2026-03-03 03:36:37 +00:00
parent 70ab91500a
commit 92c4a2a29e
3 changed files with 155 additions and 44 deletions

View File

@@ -28,7 +28,12 @@ import { DEFAULT_AGENT_ID } from "../routing/session-key.js";
import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../utils/message-channel.js";
import { GatewayClient } from "./client.js";
import { renderCatNoncePngBase64 } from "./live-image-probe.js";
import { hasExpectedToolNonce, shouldRetryToolReadProbe } from "./live-tool-probe-utils.js";
import {
hasExpectedSingleNonce,
hasExpectedToolNonce,
shouldRetryExecReadProbe,
shouldRetryToolReadProbe,
} from "./live-tool-probe-utils.js";
import { startGatewayServer } from "./server.js";
import { extractPayloadText } from "./test-helpers.agent-results.js";
@@ -862,41 +867,77 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
logProgress(`${progressLabel}: tool-exec`);
const nonceC = randomUUID();
const toolWritePath = path.join(tempDir, `write-${runIdTool}.txt`);
const execReadProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-exec-read`,
message:
"OpenClaw live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
"Finally reply including the nonce text you read back.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (execReadProbe?.status !== "ok") {
throw new Error(`exec+read probe failed: status=${String(execReadProbe?.status)}`);
}
const execReadText = extractPayloadText(execReadProbe?.result);
if (
isEmptyStreamText(execReadText) &&
(model.provider === "minimax" || model.provider === "openai-codex")
const maxExecReadAttempts = 3;
let execReadText = "";
for (
let execReadAttempt = 0;
execReadAttempt < maxExecReadAttempts;
execReadAttempt += 1
) {
logProgress(`${progressLabel}: skip (${model.provider} empty response)`);
break;
const strictReply = execReadAttempt > 0;
const execReadProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`,
message: strictReply
? "OpenClaw live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
`Then reply with exactly: ${nonceC}. No extra text.`
: "OpenClaw live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
"Finally reply including the nonce text you read back.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (execReadProbe?.status !== "ok") {
if (execReadAttempt + 1 < maxExecReadAttempts) {
logProgress(
`${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) status=${String(execReadProbe?.status)}`,
);
continue;
}
throw new Error(`exec+read probe failed: status=${String(execReadProbe?.status)}`);
}
execReadText = extractPayloadText(execReadProbe?.result);
if (
isEmptyStreamText(execReadText) &&
(model.provider === "minimax" || model.provider === "openai-codex")
) {
logProgress(`${progressLabel}: skip (${model.provider} empty response)`);
break;
}
assertNoReasoningTags({
text: execReadText,
model: modelKey,
phase: "tool-exec",
label: params.label,
});
if (hasExpectedSingleNonce(execReadText, nonceC)) {
break;
}
if (
shouldRetryExecReadProbe({
text: execReadText,
nonce: nonceC,
attempt: execReadAttempt,
maxAttempts: maxExecReadAttempts,
})
) {
logProgress(
`${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) malformed tool output`,
);
continue;
}
throw new Error(`exec+read probe missing nonce: ${execReadText}`);
}
assertNoReasoningTags({
text: execReadText,
model: modelKey,
phase: "tool-exec",
label: params.label,
});
if (!execReadText.includes(nonceC)) {
if (!hasExpectedSingleNonce(execReadText, nonceC)) {
throw new Error(`exec+read probe missing nonce: ${execReadText}`);
}

View File

@@ -1,5 +1,10 @@
import { describe, expect, it } from "vitest";
import { hasExpectedToolNonce, shouldRetryToolReadProbe } from "./live-tool-probe-utils.js";
import {
hasExpectedSingleNonce,
hasExpectedToolNonce,
shouldRetryExecReadProbe,
shouldRetryToolReadProbe,
} from "./live-tool-probe-utils.js";
describe("live tool probe utils", () => {
it("matches nonce pair when both are present", () => {
@@ -7,6 +12,11 @@ describe("live tool probe utils", () => {
expect(hasExpectedToolNonce("value a-1 only", "a-1", "b-2")).toBe(false);
});
it("matches single nonce when present", () => {
  // One hit (the expected nonce is a substring) and one miss (only a different nonce appears).
  const expectedNonce = "nonce-1";
  const hit = hasExpectedSingleNonce("value nonce-1", expectedNonce);
  const miss = hasExpectedSingleNonce("value nonce-2", expectedNonce);
  expect(hit).toBe(true);
  expect(miss).toBe(false);
});
it("retries malformed tool output when attempts remain", () => {
expect(
shouldRetryToolReadProbe({
@@ -97,4 +107,37 @@ describe("live tool probe utils", () => {
}),
).toBe(false);
});
it("retries malformed exec+read output when attempts remain", () => {
  // First attempt of three, reply lacks the nonce and contains "[object Object]".
  const probeState = {
    text: "read[object Object]",
    nonce: "nonce-c",
    attempt: 0,
    maxAttempts: 3,
  };
  const retry = shouldRetryExecReadProbe(probeState);
  expect(retry).toBe(true);
});
it("does not retry exec+read once max attempts are exhausted", () => {
  // Same malformed reply, but attempt index 2 is the last of three, so no retry is allowed.
  const probeState = {
    text: "read[object Object]",
    nonce: "nonce-c",
    attempt: 2,
    maxAttempts: 3,
  };
  const retry = shouldRetryExecReadProbe(probeState);
  expect(retry).toBe(false);
});
it("does not retry exec+read when nonce is present", () => {
  // The reply already contains the expected nonce, so a retry is never requested.
  const probeState = {
    text: "nonce-c",
    nonce: "nonce-c",
    attempt: 0,
    maxAttempts: 3,
  };
  const retry = shouldRetryExecReadProbe(probeState);
  expect(retry).toBe(false);
});
});

View File

@@ -2,6 +2,25 @@ export function hasExpectedToolNonce(text: string, nonceA: string, nonceB: strin
return text.includes(nonceA) && text.includes(nonceB);
}
/**
 * Reports whether `nonce` occurs anywhere inside `text`.
 *
 * This is a plain, case-sensitive substring check; the surrounding text is
 * not inspected in any other way.
 *
 * @param text - Reply text to search.
 * @param nonce - Token expected to appear in the reply.
 * @returns `true` when `text` contains `nonce` as a substring.
 */
export function hasExpectedSingleNonce(text: string, nonce: string): boolean {
  const position = text.indexOf(nonce);
  return position !== -1;
}
/**
 * Heuristic check for reply text that looks like broken tool output.
 *
 * The text is treated as malformed when, after trimming, it is empty,
 * contains the literal "[object Object]", or (compared in lower case)
 * matches `read` followed by `[`, or the standalone words `tool` or
 * `function`.
 *
 * @param text - Reply text to inspect.
 * @returns `true` when any of the malformed-output markers is found.
 */
function hasMalformedToolOutput(text: string): boolean {
  const body = text.trim();
  if (body.length === 0) {
    return true;
  }
  // Checked against the original casing: this exact spelling is what a stringified object produces.
  if (body.includes("[object Object]")) {
    return true;
  }
  const normalized = body.toLowerCase();
  const markers = [/\bread\s*\[/, /\btool\b/, /\bfunction\b/];
  return markers.some((marker) => marker.test(normalized));
}
export function shouldRetryToolReadProbe(params: {
text: string;
nonceA: string;
@@ -16,19 +35,27 @@ export function shouldRetryToolReadProbe(params: {
if (hasExpectedToolNonce(params.text, params.nonceA, params.nonceB)) {
return false;
}
const trimmed = params.text.trim();
if (!trimmed) {
return true;
}
const lower = trimmed.toLowerCase();
if (trimmed.includes("[object Object]")) {
return true;
}
if (/\bread\s*\[/.test(lower) || /\btool\b/.test(lower) || /\bfunction\b/.test(lower)) {
if (hasMalformedToolOutput(params.text)) {
return true;
}
const lower = params.text.trim().toLowerCase();
if (params.provider === "mistral" && (lower.includes("noncea=") || lower.includes("nonceb="))) {
return true;
}
return false;
}
/**
 * Decides whether the exec+read probe should be attempted again.
 *
 * A retry is requested only when all of the following hold:
 * - another attempt is still available (`attempt + 1 < maxAttempts`),
 * - the reply does not already contain the expected nonce, and
 * - the reply looks like malformed tool output.
 *
 * @param params.text - Reply text from the current attempt.
 * @param params.nonce - Nonce the reply is expected to contain.
 * @param params.attempt - Zero-based index of the current attempt.
 * @param params.maxAttempts - Total number of attempts allowed.
 * @returns `true` when the caller should retry the probe.
 */
export function shouldRetryExecReadProbe(params: {
  text: string;
  nonce: string;
  attempt: number;
  maxAttempts: number;
}): boolean {
  const { text, nonce, attempt, maxAttempts } = params;
  const attemptsExhausted = attempt + 1 >= maxAttempts;
  if (attemptsExhausted) {
    return false;
  }
  // A reply that already carries the nonce is a success, never a retry.
  return !hasExpectedSingleNonce(text, nonce) && hasMalformedToolOutput(text);
}