fix: stabilize docker test suite

This commit is contained in:
Peter Steinberger
2026-03-17 03:01:11 +00:00
parent ed248c76c7
commit 1ffe8fde84
17 changed files with 450 additions and 382 deletions

View File

@@ -51,4 +51,40 @@ describe("syncExternalCliCredentials", () => {
});
expect(store.profiles[CODEX_CLI_PROFILE_ID]).toBeUndefined();
});
it("refreshes stored Codex expiry from external CLI even when the cached profile looks fresh", () => {
  // Cached profile still has ~30 minutes left, so it would pass a naive
  // freshness check; the external CLI holds a much later (~5 day) expiry.
  const cachedExpiry = Date.now() + 30 * 60_000;
  const cliExpiry = Date.now() + 5 * 24 * 60 * 60_000;
  mocks.readCodexCliCredentialsCached.mockReturnValue({
    type: "oauth",
    provider: "openai-codex",
    access: "new-access-token",
    refresh: "new-refresh-token",
    expires: cliExpiry,
    accountId: "acct_456",
  });
  const store: AuthProfileStore = {
    version: 1,
    profiles: {
      [OPENAI_CODEX_DEFAULT_PROFILE_ID]: {
        type: "oauth",
        provider: "openai-codex",
        access: "old-access-token",
        refresh: "old-refresh-token",
        expires: cachedExpiry,
        accountId: "acct_456",
      },
    },
  };

  // The sync must still overwrite the cached tokens with the CLI's newer ones.
  const didMutate = syncExternalCliCredentials(store);
  expect(didMutate).toBe(true);
  expect(store.profiles[OPENAI_CODEX_DEFAULT_PROFILE_ID]).toMatchObject({
    access: "new-access-token",
    refresh: "new-refresh-token",
    expires: cliExpiry,
  });
});
});

View File

@@ -4,13 +4,12 @@ import {
readMiniMaxCliCredentialsCached,
} from "../cli-credentials.js";
import {
EXTERNAL_CLI_NEAR_EXPIRY_MS,
EXTERNAL_CLI_SYNC_TTL_MS,
QWEN_CLI_PROFILE_ID,
MINIMAX_CLI_PROFILE_ID,
log,
} from "./constants.js";
import type { AuthProfileCredential, AuthProfileStore, OAuthCredential } from "./types.js";
import type { AuthProfileStore, OAuthCredential } from "./types.js";
const OPENAI_CODEX_DEFAULT_PROFILE_ID = "openai-codex:default";
@@ -37,62 +36,33 @@ function shallowEqualOAuthCredentials(a: OAuthCredential | undefined, b: OAuthCr
);
}
function isExternalProfileFresh(cred: AuthProfileCredential | undefined, now: number): boolean {
if (!cred) {
return false;
}
if (cred.type !== "oauth" && cred.type !== "token") {
return false;
}
if (
cred.provider !== "qwen-portal" &&
cred.provider !== "minimax-portal" &&
cred.provider !== "openai-codex"
) {
return false;
}
if (typeof cred.expires !== "number") {
return true;
}
return cred.expires > now + EXTERNAL_CLI_NEAR_EXPIRY_MS;
}
/** Sync external CLI credentials into the store for a given provider. */
function syncExternalCliCredentialsForProvider(
store: AuthProfileStore,
profileId: string,
provider: string,
readCredentials: () => OAuthCredential | null,
now: number,
options: ExternalCliSyncOptions,
): boolean {
const existing = store.profiles[profileId];
const shouldSync =
!existing || existing.provider !== provider || !isExternalProfileFresh(existing, now);
const creds = shouldSync ? readCredentials() : null;
const creds = readCredentials();
if (!creds) {
return false;
}
const existingOAuth = existing?.type === "oauth" ? existing : undefined;
const shouldUpdate =
!existingOAuth ||
existingOAuth.provider !== provider ||
existingOAuth.expires <= now ||
creds.expires > existingOAuth.expires;
if (shouldUpdate && !shallowEqualOAuthCredentials(existingOAuth, creds)) {
store.profiles[profileId] = creds;
if (options.log !== false) {
log.info(`synced ${provider} credentials from external cli`, {
profileId,
expires: new Date(creds.expires).toISOString(),
});
}
return true;
if (shallowEqualOAuthCredentials(existingOAuth, creds)) {
return false;
}
return false;
store.profiles[profileId] = creds;
if (options.log !== false) {
log.info(`synced ${provider} credentials from external cli`, {
profileId,
expires: new Date(creds.expires).toISOString(),
});
}
return true;
}
/**
@@ -106,46 +76,24 @@ export function syncExternalCliCredentials(
options: ExternalCliSyncOptions = {},
): boolean {
let mutated = false;
const now = Date.now();
// Sync from Qwen Code CLI
const existingQwen = store.profiles[QWEN_CLI_PROFILE_ID];
const shouldSyncQwen =
!existingQwen ||
existingQwen.provider !== "qwen-portal" ||
!isExternalProfileFresh(existingQwen, now);
const qwenCreds = shouldSyncQwen
? readQwenCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS })
: null;
if (qwenCreds) {
const existing = store.profiles[QWEN_CLI_PROFILE_ID];
const existingOAuth = existing?.type === "oauth" ? existing : undefined;
const shouldUpdate =
!existingOAuth ||
existingOAuth.provider !== "qwen-portal" ||
existingOAuth.expires <= now ||
qwenCreds.expires > existingOAuth.expires;
if (shouldUpdate && !shallowEqualOAuthCredentials(existingOAuth, qwenCreds)) {
store.profiles[QWEN_CLI_PROFILE_ID] = qwenCreds;
mutated = true;
if (options.log !== false) {
log.info("synced qwen credentials from qwen cli", {
profileId: QWEN_CLI_PROFILE_ID,
expires: new Date(qwenCreds.expires).toISOString(),
});
}
}
if (
syncExternalCliCredentialsForProvider(
store,
QWEN_CLI_PROFILE_ID,
"qwen-portal",
() => readQwenCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS }),
options,
)
) {
mutated = true;
}
// Sync from MiniMax Portal CLI
if (
syncExternalCliCredentialsForProvider(
store,
MINIMAX_CLI_PROFILE_ID,
"minimax-portal",
() => readMiniMaxCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS }),
now,
options,
)
) {
@@ -157,7 +105,6 @@ export function syncExternalCliCredentials(
OPENAI_CODEX_DEFAULT_PROFILE_ID,
"openai-codex",
() => readCodexCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS }),
now,
options,
)
) {

View File

@@ -46,6 +46,12 @@ async function readCachedClaudeCliCredentials(allowKeychainPrompt: boolean) {
});
}
/** Build an unsigned three-part fake JWT whose payload carries the given `exp` claim (seconds). */
function createJwtWithExp(expSeconds: number): string {
  const toBase64Url = (obj: Record<string, unknown>): string =>
    Buffer.from(JSON.stringify(obj)).toString("base64url");
  const header = toBase64Url({ alg: "RS256", typ: "JWT" });
  const payload = toBase64Url({ exp: expSeconds });
  return [header, payload, "signature"].join(".");
}
describe("cli credentials", () => {
beforeAll(async () => {
({
@@ -229,6 +235,7 @@ describe("cli credentials", () => {
it("reads Codex credentials from keychain when available", async () => {
const tempHome = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-codex-"));
process.env.CODEX_HOME = tempHome;
const expSeconds = Math.floor(Date.parse("2026-03-23T00:48:49Z") / 1000);
const accountHash = "cli|";
@@ -238,7 +245,7 @@ describe("cli credentials", () => {
expect(cmd).toContain(accountHash);
return JSON.stringify({
tokens: {
access_token: "keychain-access",
access_token: createJwtWithExp(expSeconds),
refresh_token: "keychain-refresh",
},
last_refresh: "2026-01-01T00:00:00Z",
@@ -248,15 +255,17 @@ describe("cli credentials", () => {
const creds = readCodexCliCredentials({ platform: "darwin", execSync: execSyncMock });
expect(creds).toMatchObject({
access: "keychain-access",
access: createJwtWithExp(expSeconds),
refresh: "keychain-refresh",
provider: "openai-codex",
expires: expSeconds * 1000,
});
});
it("falls back to Codex auth.json when keychain is unavailable", async () => {
const tempHome = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-codex-"));
process.env.CODEX_HOME = tempHome;
const expSeconds = Math.floor(Date.parse("2026-03-24T12:34:56Z") / 1000);
execSyncMock.mockImplementation(() => {
throw new Error("not found");
});
@@ -267,7 +276,7 @@ describe("cli credentials", () => {
authPath,
JSON.stringify({
tokens: {
access_token: "file-access",
access_token: createJwtWithExp(expSeconds),
refresh_token: "file-refresh",
},
}),
@@ -277,9 +286,10 @@ describe("cli credentials", () => {
const creds = readCodexCliCredentials({ execSync: execSyncMock });
expect(creds).toMatchObject({
access: "file-access",
access: createJwtWithExp(expSeconds),
refresh: "file-refresh",
provider: "openai-codex",
expires: expSeconds * 1000,
});
});
});

View File

@@ -153,6 +153,22 @@ function computeCodexKeychainAccount(codexHome: string) {
return `cli|${hash.slice(0, 16)}`;
}
/**
 * Extract the `exp` claim from a JWT access token and convert it to epoch
 * milliseconds. Returns null for anything that is not a decodable JWT with a
 * positive, finite numeric expiry.
 */
function decodeJwtExpiryMs(token: string): number | null {
  // A JWT needs at least header.payload; anything shorter cannot carry claims.
  const segments = token.split(".");
  if (segments.length < 2) {
    return null;
  }
  try {
    const claims = JSON.parse(
      Buffer.from(segments[1], "base64url").toString("utf8"),
    ) as { exp?: unknown };
    const exp = claims.exp;
    // Only a positive, finite numeric `exp` (seconds) is usable; scale to ms.
    if (typeof exp !== "number" || !Number.isFinite(exp) || exp <= 0) {
      return null;
    }
    return exp * 1000;
  } catch {
    // Malformed base64url or non-JSON payloads are treated as "no expiry".
    return null;
  }
}
function readCodexKeychainCredentials(options?: {
platform?: NodeJS.Platform;
execSync?: ExecSyncFn;
@@ -193,9 +209,10 @@ function readCodexKeychainCredentials(options?: {
typeof lastRefreshRaw === "string" || typeof lastRefreshRaw === "number"
? new Date(lastRefreshRaw).getTime()
: Date.now();
const expires = Number.isFinite(lastRefresh)
const fallbackExpiry = Number.isFinite(lastRefresh)
? lastRefresh + 60 * 60 * 1000
: Date.now() + 60 * 60 * 1000;
const expires = decodeJwtExpiryMs(accessToken) ?? fallbackExpiry;
const accountId = typeof tokens?.account_id === "string" ? tokens.account_id : undefined;
log.info("read codex credentials from keychain", {
@@ -483,13 +500,14 @@ export function readCodexCliCredentials(options?: {
return null;
}
let expires: number;
let fallbackExpiry: number;
try {
const stat = fs.statSync(authPath);
expires = stat.mtimeMs + 60 * 60 * 1000;
fallbackExpiry = stat.mtimeMs + 60 * 60 * 1000;
} catch {
expires = Date.now() + 60 * 60 * 1000;
fallbackExpiry = Date.now() + 60 * 60 * 1000;
}
const expires = decodeJwtExpiryMs(accessToken) ?? fallbackExpiry;
return {
type: "oauth",

View File

@@ -117,6 +117,10 @@ function isChatGPTUsageLimitErrorMessage(raw: string): boolean {
return msg.includes("hit your chatgpt usage limit") && msg.includes("try again in");
}
/** True when the raw error text mentions the Codex "refresh_token_reused" failure (case-insensitive). */
function isRefreshTokenReused(raw: string): boolean {
  return raw.toLowerCase().includes("refresh_token_reused");
}
/** True when the raw error text contains the "instructions are required" message (case-insensitive). */
function isInstructionsRequiredError(raw: string): boolean {
  return raw.toLowerCase().includes("instructions are required");
}
@@ -643,6 +647,15 @@ describeLive("live models (profile keys)", () => {
logProgress(`${progressLabel}: skip (rate limit)`);
break;
}
if (
allowNotFoundSkip &&
model.provider === "openai-codex" &&
isRefreshTokenReused(message)
) {
skipped.push({ model: id, reason: message });
logProgress(`${progressLabel}: skip (codex refresh token reused)`);
break;
}
if (
allowNotFoundSkip &&
model.provider === "openai-codex" &&

View File

@@ -24,7 +24,7 @@ import { shouldSuppressBuiltInModel } from "../agents/model-suppression.js";
import { ensureOpenClawModelsJson } from "../agents/models-config.js";
import { isRateLimitErrorMessage } from "../agents/pi-embedded-helpers/errors.js";
import { discoverAuthStorage, discoverModels } from "../agents/pi-model-discovery.js";
import { loadConfig } from "../config/config.js";
import { clearRuntimeConfigSnapshot, loadConfig } from "../config/config.js";
import type { ModelsConfig, OpenClawConfig, ModelProviderConfig } from "../config/types.js";
import { isTruthyEnvValue } from "../infra/env.js";
import { DEFAULT_AGENT_ID } from "../routing/session-key.js";
@@ -38,7 +38,7 @@ import {
shouldRetryToolReadProbe,
} from "./live-tool-probe-utils.js";
import { startGatewayServer } from "./server.js";
import { extractPayloadText } from "./test-helpers.agent-results.js";
import { loadSessionEntry, readSessionMessages } from "./session-utils.js";
const LIVE = isTruthyEnvValue(process.env.LIVE) || isTruthyEnvValue(process.env.OPENCLAW_LIVE_TEST);
const GATEWAY_LIVE = isTruthyEnvValue(process.env.OPENCLAW_LIVE_GATEWAY);
@@ -171,6 +171,32 @@ function logProgress(message: string): void {
console.log(`[live] ${message}`);
}
/**
 * Make the process look like production for a live run: hide the vitest
 * marker and force NODE_ENV. Returns the prior values so the caller can
 * restore them with restoreProductionEnvForLiveRun().
 */
function enterProductionEnvForLiveRun() {
  // Snapshot the env vars we are about to clobber.
  const saved = {
    vitest: process.env.VITEST,
    nodeEnv: process.env.NODE_ENV,
  };
  delete process.env.VITEST;
  process.env.NODE_ENV = "production";
  return saved;
}
/** Undo enterProductionEnvForLiveRun(): put VITEST and NODE_ENV back exactly as captured. */
function restoreProductionEnvForLiveRun(previous: {
  vitest: string | undefined;
  nodeEnv: string | undefined;
}) {
  // Assigning `undefined` to process.env would store the string "undefined",
  // so genuinely-absent values must be deleted instead of assigned.
  const put = (key: "VITEST" | "NODE_ENV", value: string | undefined) => {
    if (value === undefined) {
      delete process.env[key];
    } else {
      process.env[key] = value;
    }
  };
  put("VITEST", previous.vitest);
  put("NODE_ENV", previous.nodeEnv);
}
function formatFailurePreview(
failures: Array<{ model: string; error: string }>,
maxItems: number,
@@ -319,25 +345,14 @@ async function runAnthropicRefusalProbe(params: {
}): Promise<void> {
logProgress(`${params.label}: refusal-probe`);
const magic = buildAnthropicRefusalToken();
const runId = randomUUID();
const probe = await withGatewayLiveProbeTimeout(
params.client.request<AgentFinalPayload>(
"agent",
{
sessionKey: params.sessionKey,
idempotencyKey: `idem-${runId}-refusal`,
message: `Reply with the single word ok. Test token: ${magic}`,
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${params.label}: refusal-probe`,
);
if (probe?.status !== "ok") {
throw new Error(`refusal probe failed: status=${String(probe?.status)}`);
}
const probeText = extractPayloadText(probe?.result);
const probeText = await requestGatewayAgentText({
client: params.client,
sessionKey: params.sessionKey,
idempotencyKey: `idem-${randomUUID()}-refusal`,
message: `Reply with the single word ok. Test token: ${magic}`,
thinkingLevel: params.thinkingLevel,
context: `${params.label}: refusal-probe`,
});
assertNoReasoningTags({
text: probeText,
model: params.modelKey,
@@ -348,25 +363,14 @@ async function runAnthropicRefusalProbe(params: {
throw new Error(`refusal probe missing ok: ${probeText}`);
}
const followupId = randomUUID();
const followup = await withGatewayLiveProbeTimeout(
params.client.request<AgentFinalPayload>(
"agent",
{
sessionKey: params.sessionKey,
idempotencyKey: `idem-${followupId}-refusal-followup`,
message: "Now reply with exactly: still ok.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${params.label}: refusal-followup`,
);
if (followup?.status !== "ok") {
throw new Error(`refusal followup failed: status=${String(followup?.status)}`);
}
const followupText = extractPayloadText(followup?.result);
const followupText = await requestGatewayAgentText({
client: params.client,
sessionKey: params.sessionKey,
idempotencyKey: `idem-${randomUUID()}-refusal-followup`,
message: "Now reply with exactly: still ok.",
thinkingLevel: params.thinkingLevel,
context: `${params.label}: refusal-followup`,
});
assertNoReasoningTags({
text: followupText,
model: params.modelKey,
@@ -475,11 +479,6 @@ async function getFreeGatewayPort(): Promise<number> {
throw new Error("failed to acquire a free gateway port block");
}
type AgentFinalPayload = {
status?: unknown;
result?: unknown;
};
async function connectClient(params: { url: string; token: string }) {
return await new Promise<GatewayClient>((resolve, reject) => {
let settled = false;
@@ -513,6 +512,115 @@ async function connectClient(params: { url: string; token: string }) {
});
}
/**
 * Pull human-readable text from a transcript message. Messages may carry text
 * directly (`text` or `content` as a string) or as an array of content parts,
 * each with its own `text` field; non-string/empty parts are dropped.
 */
function extractTranscriptMessageText(message: unknown): string {
  if (!message || typeof message !== "object") {
    return "";
  }
  const { text, content } = message as { text?: unknown; content?: unknown };
  if (typeof text === "string" && text.trim()) {
    return text.trim();
  }
  if (typeof content === "string" && content.trim()) {
    return content.trim();
  }
  if (!Array.isArray(content)) {
    return "";
  }
  // Collect the non-empty trimmed text of each well-formed part.
  const parts: string[] = [];
  for (const entry of content) {
    if (!entry || typeof entry !== "object") {
      continue;
    }
    const partText = (entry as { text?: unknown }).text;
    if (typeof partText === "string" && partText.trim()) {
      parts.push(partText.trim());
    }
  }
  return parts.join("\n").trim();
}
/**
 * Read the on-disk transcript for a session and return the text of every
 * assistant-role message, in transcript order. Returns [] when the session
 * entry has no sessionId.
 */
function readSessionAssistantTexts(sessionKey: string): string[] {
  const { storePath, entry } = loadSessionEntry(sessionKey);
  if (!entry?.sessionId) {
    return [];
  }
  const messages = readSessionMessages(entry.sessionId, storePath, entry.sessionFile);
  return messages
    .filter(
      (message) =>
        !!message &&
        typeof message === "object" &&
        (message as { role?: unknown }).role === "assistant",
    )
    .map((message) => extractTranscriptMessageText(message));
}
/**
 * Poll the session transcript until an assistant message beyond the baseline
 * count appears with non-empty text, backing off from 50ms up to 250ms.
 * Throws on timeout (GATEWAY_LIVE_PROBE_TIMEOUT_MS).
 */
async function waitForSessionAssistantText(params: {
  sessionKey: string;
  baselineAssistantCount: number;
  context: string;
}) {
  const deadline = Date.now() + GATEWAY_LIVE_PROBE_TIMEOUT_MS;
  let pollDelayMs = 50;
  while (Date.now() < deadline) {
    // Only messages added after the baseline snapshot count as the reply;
    // take the most recent one that actually has content.
    const fresh = readSessionAssistantTexts(params.sessionKey).slice(
      params.baselineAssistantCount,
    );
    for (let i = fresh.length - 1; i >= 0; i -= 1) {
      const candidate = fresh[i].trim();
      if (candidate.length > 0) {
        return candidate;
      }
    }
    await new Promise((resolve) => setTimeout(resolve, pollDelayMs));
    pollDelayMs = Math.min(pollDelayMs * 2, 250);
  }
  throw new Error(`probe timeout after ${GATEWAY_LIVE_PROBE_TIMEOUT_MS}ms (${params.context})`);
}
/**
 * Fire an "agent" request at the gateway and wait for the assistant's reply
 * text to land in the session transcript. The RPC only acknowledges the run
 * ("accepted"); the actual reply is read back from disk.
 */
async function requestGatewayAgentText(params: {
  client: GatewayClient;
  sessionKey: string;
  message: string;
  thinkingLevel: string;
  context: string;
  idempotencyKey: string;
  attachments?: Array<{
    mimeType: string;
    fileName: string;
    content: string;
  }>;
}) {
  const { client, sessionKey, idempotencyKey, message, thinkingLevel, attachments, context } =
    params;
  // Snapshot the assistant-message count first so we can tell which
  // transcript entries are new after this request.
  const baselineAssistantCount = readSessionAssistantTexts(sessionKey).length;
  const ack = await withGatewayLiveProbeTimeout(
    client.request<{ runId?: unknown; status?: unknown }>("agent", {
      sessionKey,
      idempotencyKey,
      message,
      thinking: thinkingLevel,
      deliver: false,
      attachments,
    }),
    `${context}: agent-accept`,
  );
  if (ack?.status !== "accepted") {
    throw new Error(`agent status=${String(ack?.status)}`);
  }
  return await waitForSessionAssistantText({
    sessionKey,
    baselineAssistantCount,
    context: `${context}: transcript-final`,
  });
}
type GatewayModelSuiteParams = {
label: string;
cfg: OpenClawConfig;
@@ -636,6 +744,8 @@ function buildMinimaxProviderOverride(params: {
}
async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
clearRuntimeConfigSnapshot();
const runtimeEnv = enterProductionEnvForLiveRun();
const previous = {
configPath: process.env.OPENCLAW_CONFIG_PATH,
token: process.env.OPENCLAW_GATEWAY_TOKEN,
@@ -793,48 +903,26 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
);
logProgress(`${progressLabel}: prompt`);
const runId = randomUUID();
const payload = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId}`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${progressLabel}: prompt`,
);
if (payload?.status !== "ok") {
throw new Error(`agent status=${String(payload?.status)}`);
}
let text = extractPayloadText(payload?.result);
let text = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${randomUUID()}`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: prompt`,
});
if (!text) {
logProgress(`${progressLabel}: empty response, retrying`);
const retry = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${randomUUID()}-retry`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${progressLabel}: prompt-retry`,
);
if (retry?.status !== "ok") {
throw new Error(`agent status=${String(retry?.status)}`);
}
text = extractPayloadText(retry?.result);
text = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${randomUUID()}-retry`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: prompt-retry`,
});
}
if (!text && isGoogleishProvider(model.provider)) {
logProgress(`${progressLabel}: skip (google empty response)`);
@@ -881,36 +969,20 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
toolReadAttempt += 1
) {
const strictReply = toolReadAttempt > 0;
const toolProbe = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`,
message: strictReply
? "OpenClaw live tool probe (local, safe): " +
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`
: "OpenClaw live tool probe (local, safe): " +
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
"Then reply with the two nonce values you read (include both).",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${progressLabel}: tool-read`,
);
if (toolProbe?.status !== "ok") {
if (toolReadAttempt + 1 < maxToolReadAttempts) {
logProgress(
`${progressLabel}: tool-read retry (${toolReadAttempt + 2}/${maxToolReadAttempts}) status=${String(toolProbe?.status)}`,
);
continue;
}
throw new Error(`tool probe failed: status=${String(toolProbe?.status)}`);
}
toolText = extractPayloadText(toolProbe?.result);
toolText = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`,
message: strictReply
? "OpenClaw live tool probe (local, safe): " +
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`
: "OpenClaw live tool probe (local, safe): " +
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
"Then reply with the two nonce values you read (include both).",
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: tool-read`,
});
if (
isEmptyStreamText(toolText) &&
(model.provider === "minimax" || model.provider === "openai-codex")
@@ -960,40 +1032,24 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
execReadAttempt += 1
) {
const strictReply = execReadAttempt > 0;
const execReadProbe = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`,
message: strictReply
? "OpenClaw live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
`Then reply with exactly: ${nonceC}. No extra text.`
: "OpenClaw live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
"Finally reply including the nonce text you read back.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${progressLabel}: tool-exec`,
);
if (execReadProbe?.status !== "ok") {
if (execReadAttempt + 1 < maxExecReadAttempts) {
logProgress(
`${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) status=${String(execReadProbe?.status)}`,
);
continue;
}
throw new Error(`exec+read probe failed: status=${String(execReadProbe?.status)}`);
}
execReadText = extractPayloadText(execReadProbe?.result);
execReadText = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`,
message: strictReply
? "OpenClaw live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
`Then reply with exactly: ${nonceC}. No extra text.`
: "OpenClaw live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
"Finally reply including the nonce text you read back.",
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: tool-exec`,
});
if (
isEmptyStreamText(execReadText) &&
(model.provider === "minimax" || model.provider === "openai-codex")
@@ -1040,62 +1096,51 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
const imageBase64 = renderCatNoncePngBase64(imageCode);
const runIdImage = randomUUID();
const imageProbe = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
const imageText = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${runIdImage}-image`,
message:
"Look at the attached image. Reply with exactly two tokens separated by a single space: " +
"(1) the animal shown or written in the image, lowercase; " +
"(2) the code printed in the image, uppercase. No extra text.",
attachments: [
{
sessionKey,
idempotencyKey: `idem-${runIdImage}-image`,
message:
"Look at the attached image. Reply with exactly two tokens separated by a single space: " +
"(1) the animal shown or written in the image, lowercase; " +
"(2) the code printed in the image, uppercase. No extra text.",
attachments: [
{
mimeType: "image/png",
fileName: `probe-${runIdImage}.png`,
content: imageBase64,
},
],
thinking: params.thinkingLevel,
deliver: false,
mimeType: "image/png",
fileName: `probe-${runIdImage}.png`,
content: imageBase64,
},
{ expectFinal: true },
),
`${progressLabel}: image`,
);
],
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: image`,
});
// Best-effort: do not fail the whole live suite on flaky image handling.
// (We still keep prompt + tool probes as hard checks.)
if (imageProbe?.status !== "ok") {
logProgress(`${progressLabel}: image skip (status=${String(imageProbe?.status)})`);
if (
isEmptyStreamText(imageText) &&
(model.provider === "minimax" || model.provider === "openai-codex")
) {
logProgress(`${progressLabel}: image skip (${model.provider} empty response)`);
} else {
const imageText = extractPayloadText(imageProbe?.result);
if (
isEmptyStreamText(imageText) &&
(model.provider === "minimax" || model.provider === "openai-codex")
) {
logProgress(`${progressLabel}: image skip (${model.provider} empty response)`);
assertNoReasoningTags({
text: imageText,
model: modelKey,
phase: "image",
label: params.label,
});
if (!/\bcat\b/i.test(imageText)) {
logProgress(`${progressLabel}: image skip (missing 'cat')`);
} else {
assertNoReasoningTags({
text: imageText,
model: modelKey,
phase: "image",
label: params.label,
});
if (!/\bcat\b/i.test(imageText)) {
logProgress(`${progressLabel}: image skip (missing 'cat')`);
} else {
const candidates = imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
const bestDistance = candidates.reduce((best, cand) => {
if (Math.abs(cand.length - imageCode.length) > 2) {
return best;
}
return Math.min(best, editDistance(cand, imageCode));
}, Number.POSITIVE_INFINITY);
// OCR / image-read flake: allow a small edit distance, but still require the "cat" token above.
if (!(bestDistance <= 3)) {
logProgress(`${progressLabel}: image skip (code mismatch)`);
const candidates = imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
const bestDistance = candidates.reduce((best, cand) => {
if (Math.abs(cand.length - imageCode.length) > 2) {
return best;
}
return Math.min(best, editDistance(cand, imageCode));
}, Number.POSITIVE_INFINITY);
// OCR / image-read flake: allow a small edit distance, but still require the "cat" token above.
if (!(bestDistance <= 3)) {
logProgress(`${progressLabel}: image skip (code mismatch)`);
}
}
}
@@ -1108,24 +1153,14 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
) {
logProgress(`${progressLabel}: tool-only regression`);
const runId2 = randomUUID();
const first = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-1`,
message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${progressLabel}: tool-only-regression-first`,
);
if (first?.status !== "ok") {
throw new Error(`tool-only turn failed: status=${String(first?.status)}`);
}
const firstText = extractPayloadText(first?.result);
const firstText = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${runId2}-1`,
message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: tool-only-regression-first`,
});
assertNoReasoningTags({
text: firstText,
model: modelKey,
@@ -1133,24 +1168,14 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
label: params.label,
});
const second = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-2`,
message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${progressLabel}: tool-only-regression-second`,
);
if (second?.status !== "ok") {
throw new Error(`post-tool message failed: status=${String(second?.status)}`);
}
const reply = extractPayloadText(second?.result);
const reply = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${runId2}-2`,
message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: tool-only-regression-second`,
});
assertNoReasoningTags({
text: reply,
model: modelKey,
@@ -1290,6 +1315,8 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
logProgress(`[${params.label}] skipped all models (missing profiles)`);
}
} finally {
clearRuntimeConfigSnapshot();
restoreProductionEnvForLiveRun(runtimeEnv);
client.stop();
await server.close({ reason: "live test complete" });
await fs.rm(toolProbePath, { force: true });
@@ -1317,6 +1344,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
it(
"runs meaningful prompts across models with available keys",
async () => {
clearRuntimeConfigSnapshot();
const cfg = loadConfig();
await ensureOpenClawModelsJson(cfg);
@@ -1422,6 +1450,8 @@ describeLive("gateway live (dev agent, profile keys)", () => {
if (!ZAI_FALLBACK) {
return;
}
clearRuntimeConfigSnapshot();
const runtimeEnv = enterProductionEnvForLiveRun();
const previous = {
configPath: process.env.OPENCLAW_CONFIG_PATH,
token: process.env.OPENCLAW_GATEWAY_TOKEN,
@@ -1520,27 +1550,16 @@ describeLive("gateway live (dev agent, profile keys)", () => {
"zai-fallback: sessions-reset",
);
const runId = randomUUID();
const toolProbe = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId}-tool`,
message:
`Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
thinking: THINKING_LEVEL,
deliver: false,
},
{ expectFinal: true },
),
"zai-fallback: tool-probe",
);
if (toolProbe?.status !== "ok") {
throw new Error(`anthropic tool probe failed: status=${String(toolProbe?.status)}`);
}
const toolText = extractPayloadText(toolProbe?.result);
const toolText = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${randomUUID()}-tool`,
message:
`Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
thinkingLevel: THINKING_LEVEL,
context: "zai-fallback: tool-probe",
});
assertNoReasoningTags({
text: toolText,
model: "anthropic/claude-opus-4-5",
@@ -1559,27 +1578,16 @@ describeLive("gateway live (dev agent, profile keys)", () => {
"zai-fallback: sessions-patch-zai",
);
const followupId = randomUUID();
const followup = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${followupId}-followup`,
message:
`What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
`Reply with exactly: ${nonceA} ${nonceB}.`,
thinking: THINKING_LEVEL,
deliver: false,
},
{ expectFinal: true },
),
"zai-fallback: followup",
);
if (followup?.status !== "ok") {
throw new Error(`zai followup failed: status=${String(followup?.status)}`);
}
const followupText = extractPayloadText(followup?.result);
const followupText = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${randomUUID()}-followup`,
message:
`What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
`Reply with exactly: ${nonceA} ${nonceB}.`,
thinkingLevel: THINKING_LEVEL,
context: "zai-fallback: followup",
});
assertNoReasoningTags({
text: followupText,
model: "zai/glm-4.7",
@@ -1590,6 +1598,8 @@ describeLive("gateway live (dev agent, profile keys)", () => {
throw new Error(`zai followup missing nonce: ${followupText}`);
}
} finally {
clearRuntimeConfigSnapshot();
restoreProductionEnvForLiveRun(runtimeEnv);
client.stop();
await server.close({ reason: "live test complete" });
await fs.rm(toolProbePath, { force: true });