fix(gateway): retry exec-read live tool probe

2026-03-07 22:44:16 +00:00 · 2026-03-03 03:36:37 +00:00
parent 70ab91500a
commit 92c4a2a29e
3 changed files with 155 additions and 44 deletions
--- a/src/gateway/gateway-models.profiles.live.test.ts
+++ b/src/gateway/gateway-models.profiles.live.test.ts
@@ -28,7 +28,12 @@ import { DEFAULT_AGENT_ID } from "../routing/session-key.js";
 import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../utils/message-channel.js";
 import { GatewayClient } from "./client.js";
 import { renderCatNoncePngBase64 } from "./live-image-probe.js";
-import { hasExpectedToolNonce, shouldRetryToolReadProbe } from "./live-tool-probe-utils.js";
+import {
+  hasExpectedSingleNonce,
+  hasExpectedToolNonce,
+  shouldRetryExecReadProbe,
+  shouldRetryToolReadProbe,
+} from "./live-tool-probe-utils.js";
 import { startGatewayServer } from "./server.js";
 import { extractPayloadText } from "./test-helpers.agent-results.js";

@@ -862,41 +867,85 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
             logProgress(`${progressLabel}: tool-exec`);
             const nonceC = randomUUID();
             const toolWritePath = path.join(tempDir, `write-${runIdTool}.txt`);
-
-            const execReadProbe = await client.request<AgentFinalPayload>(
-              "agent",
-              {
-                sessionKey,
-                idempotencyKey: `idem-${runIdTool}-exec-read`,
-                message:
-                  "OpenClaw live tool probe (local, safe): " +
-                  "use the tool named `exec` (or `Exec`) to run this command: " +
-                  `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
-                  `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
-                  "Finally reply including the nonce text you read back.",
-                thinking: params.thinkingLevel,
-                deliver: false,
-              },
-              { expectFinal: true },
-            );
-            if (execReadProbe?.status !== "ok") {
-              throw new Error(`exec+read probe failed: status=${String(execReadProbe?.status)}`);
-            }
-            const execReadText = extractPayloadText(execReadProbe?.result);
-            if (
-              isEmptyStreamText(execReadText) &&
-              (model.provider === "minimax" || model.provider === "openai-codex")
+            const maxExecReadAttempts = 3;
+            let execReadText = "";
+            // Set when a provider known to emit empty streams does so; the whole probe is
+            // skipped then, exactly as the pre-retry code did with its top-level `break`.
+            let skipExecRead = false;
+            for (
+              let execReadAttempt = 0;
+              execReadAttempt < maxExecReadAttempts;
+              execReadAttempt += 1
             ) {
-              logProgress(`${progressLabel}: skip (${model.provider} empty response)`);
-              break;
+              const strictReply = execReadAttempt > 0;
+              const execReadProbe = await client.request<AgentFinalPayload>(
+                "agent",
+                {
+                  sessionKey,
+                  idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`,
+                  message: strictReply
+                    ? "OpenClaw live tool probe (local, safe): " +
+                      "use the tool named `exec` (or `Exec`) to run this command: " +
+                      `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
+                      `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
+                      `Then reply with exactly: ${nonceC}. No extra text.`
+                    : "OpenClaw live tool probe (local, safe): " +
+                      "use the tool named `exec` (or `Exec`) to run this command: " +
+                      `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
+                      `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
+                      "Finally reply including the nonce text you read back.",
+                  thinking: params.thinkingLevel,
+                  deliver: false,
+                },
+                { expectFinal: true },
+              );
+              if (execReadProbe?.status !== "ok") {
+                if (execReadAttempt + 1 < maxExecReadAttempts) {
+                  logProgress(
+                    `${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) status=${String(execReadProbe?.status)}`,
+                  );
+                  continue;
+                }
+                throw new Error(`exec+read probe failed: status=${String(execReadProbe?.status)}`);
+              }
+              execReadText = extractPayloadText(execReadProbe?.result);
+              if (
+                isEmptyStreamText(execReadText) &&
+                (model.provider === "minimax" || model.provider === "openai-codex")
+              ) {
+                logProgress(`${progressLabel}: skip (${model.provider} empty response)`);
+                skipExecRead = true;
+                break;
+              }
+              assertNoReasoningTags({
+                text: execReadText,
+                model: modelKey,
+                phase: "tool-exec",
+                label: params.label,
+              });
+              if (hasExpectedSingleNonce(execReadText, nonceC)) {
+                break;
+              }
+              if (
+                shouldRetryExecReadProbe({
+                  text: execReadText,
+                  nonce: nonceC,
+                  attempt: execReadAttempt,
+                  maxAttempts: maxExecReadAttempts,
+                })
+              ) {
+                logProgress(
+                  `${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) malformed tool output`,
+                );
+                continue;
+              }
+              throw new Error(`exec+read probe missing nonce: ${execReadText}`);
             }
-            assertNoReasoningTags({
-              text: execReadText,
-              model: modelKey,
-              phase: "tool-exec",
-              label: params.label,
-            });
-            if (!execReadText.includes(nonceC)) {
+            if (skipExecRead) {
+              // The in-loop `break` only leaves the retry loop; propagate the skip outward.
+              break;
+            }
+            if (!hasExpectedSingleNonce(execReadText, nonceC)) {
               throw new Error(`exec+read probe missing nonce: ${execReadText}`);
             }

--- a/src/gateway/live-tool-probe-utils.test.ts
+++ b/src/gateway/live-tool-probe-utils.test.ts
@@ -1,5 +1,10 @@
 import { describe, expect, it } from "vitest";
-import { hasExpectedToolNonce, shouldRetryToolReadProbe } from "./live-tool-probe-utils.js";
+import {
+  hasExpectedSingleNonce,
+  hasExpectedToolNonce,
+  shouldRetryExecReadProbe,
+  shouldRetryToolReadProbe,
+} from "./live-tool-probe-utils.js";

 describe("live tool probe utils", () => {
  it("matches nonce pair when both are present", () => {
@@ -7,6 +12,11 @@ describe("live tool probe utils", () => {
    expect(hasExpectedToolNonce("value a-1 only", "a-1", "b-2")).toBe(false);
  });

+  it("matches single nonce when present", () => {
+    expect(hasExpectedSingleNonce("value nonce-1", "nonce-1")).toBe(true);
+    expect(hasExpectedSingleNonce("value nonce-2", "nonce-1")).toBe(false);
+  });
+
  it("retries malformed tool output when attempts remain", () => {
    expect(
      shouldRetryToolReadProbe({
@@ -97,4 +107,37 @@ describe("live tool probe utils", () => {
      }),
    ).toBe(false);
  });
+
+  it("retries malformed exec+read output when attempts remain", () => {
+    expect(
+      shouldRetryExecReadProbe({
+        text: "read[object Object]",
+        nonce: "nonce-c",
+        attempt: 0,
+        maxAttempts: 3,
+      }),
+    ).toBe(true);
+  });
+
+  it("does not retry exec+read once max attempts are exhausted", () => {
+    expect(
+      shouldRetryExecReadProbe({
+        text: "read[object Object]",
+        nonce: "nonce-c",
+        attempt: 2,
+        maxAttempts: 3,
+      }),
+    ).toBe(false);
+  });
+
+  it("does not retry exec+read when nonce is present", () => {
+    expect(
+      shouldRetryExecReadProbe({
+        text: "nonce-c",
+        nonce: "nonce-c",
+        attempt: 0,
+        maxAttempts: 3,
+      }),
+    ).toBe(false);
+  });
 });
--- a/src/gateway/live-tool-probe-utils.ts
+++ b/src/gateway/live-tool-probe-utils.ts
@@ -2,6 +2,25 @@ export function hasExpectedToolNonce(text: string, nonceA: string, nonceB: strin
  return text.includes(nonceA) && text.includes(nonceB);
 }

+export function hasExpectedSingleNonce(text: string, nonce: string): boolean {
+  return text.includes(nonce);
+}
+
+function hasMalformedToolOutput(text: string): boolean {
+  const trimmed = text.trim();
+  if (!trimmed) {
+    return true;
+  }
+  const lower = trimmed.toLowerCase();
+  if (trimmed.includes("[object Object]")) {
+    return true;
+  }
+  if (/\bread\s*\[/.test(lower) || /\btool\b/.test(lower) || /\bfunction\b/.test(lower)) {
+    return true;
+  }
+  return false;
+}
+
 export function shouldRetryToolReadProbe(params: {
  text: string;
  nonceA: string;
@@ -16,19 +35,27 @@ export function shouldRetryToolReadProbe(params: {
  if (hasExpectedToolNonce(params.text, params.nonceA, params.nonceB)) {
    return false;
  }
-  const trimmed = params.text.trim();
-  if (!trimmed) {
-    return true;
-  }
-  const lower = trimmed.toLowerCase();
-  if (trimmed.includes("[object Object]")) {
-    return true;
-  }
-  if (/\bread\s*\[/.test(lower) || /\btool\b/.test(lower) || /\bfunction\b/.test(lower)) {
+  if (hasMalformedToolOutput(params.text)) {
    return true;
  }
+  const lower = params.text.trim().toLowerCase();
  if (params.provider === "mistral" && (lower.includes("noncea=") || lower.includes("nonceb="))) {
    return true;
  }
  return false;
 }
+
+export function shouldRetryExecReadProbe(params: {
+  text: string;
+  nonce: string;
+  attempt: number;
+  maxAttempts: number;
+}): boolean {
+  if (params.attempt + 1 >= params.maxAttempts) {
+    return false;
+  }
+  if (hasExpectedSingleNonce(params.text, params.nonce)) {
+    return false;
+  }
+  return hasMalformedToolOutput(params.text);
+}