From 1ffe8fde84d1c558a23d3ae985800c7bcfaf06a6 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Tue, 17 Mar 2026 03:01:11 +0000
Subject: [PATCH] fix: stabilize docker test suite

---
 CHANGELOG.md                                  |   1 +
 docs/help/testing.md                          |   8 +-
 package.json                                  |   2 +-
 pnpm-lock.yaml                                |   2 +-
 scripts/docker/cleanup-smoke/Dockerfile       |   2 +
 scripts/e2e/Dockerfile                        |   7 +-
 scripts/e2e/doctor-install-switch-docker.sh   |   2 +-
 scripts/e2e/onboard-docker.sh                 |  33 +-
 scripts/e2e/plugins-docker.sh                 |   2 +-
 scripts/test-live-gateway-models-docker.sh    |   9 +-
 scripts/test-live-models-docker.sh            |  12 +-
 .../auth-profiles.external-cli-sync.test.ts   |  36 ++
 src/agents/auth-profiles/external-cli-sync.ts |  97 +--
 src/agents/cli-credentials.test.ts            |  18 +-
 src/agents/cli-credentials.ts                 |  26 +-
 src/agents/models.profiles.live.test.ts       |  13 +
 .../gateway-models.profiles.live.test.ts      | 562 +++++++++---------
 17 files changed, 450 insertions(+), 382 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d948e2b59ee..24335d41a91 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -78,6 +78,7 @@ Docs: https://docs.openclaw.ai
 - Z.AI/onboarding: add `glm-5-turbo` to the default Z.AI provider catalog so onboarding-generated configs expose the new model alongside the existing GLM defaults. (#46670) Thanks @tomsun28.
 - Zalo Personal/group gating: stop reapplying `dmPolicy.allowFrom` as a sender gate for already-allowlisted groups when `groupAllowFrom` is unset, so any member of an allowed group can trigger replies while DMs stay restricted. (#46663) Fixes #40146. Thanks @Takhoffman.
 - Zalo/plugin runtime: export `resolveClientIp` from `openclaw/plugin-sdk/zalo` so installed builds no longer crash on startup when the webhook monitor loads from the packaged extension instead of the monorepo source tree. (#46549) Thanks @No898.
+- Docker/live tests: mount external CLI auth homes read-only and copy them into a writable container home, derive Codex OAuth expiry from the JWT `exp` claim, re-sync external CLI creds instead of trusting a stale cached expiry, and make gateway live probes wait on transcript output so `pnpm test:docker:all` stays green on Linux.
 - Plugins/install precedence: keep bundled plugins ahead of auto-discovered globals by default, but let an explicitly installed plugin record win its own duplicate-id tie so installed channel plugins load from `~/.openclaw/extensions` after `openclaw plugins install`. (#46722) Thanks @Takhoffman.
 - Control UI/logging: make browser-safe logger imports avoid eager temp-dir resolution so the bundled Control UI no longer crashes to a blank screen when logging reaches `tmp-openclaw-dir`. (#48469) Fixes #48062. Thanks @7inspire.
 - Plugins/scoped ids: preserve scoped plugin ids during install and config keying, and keep bundled plugins ahead of discovered duplicate ids by default so `@scope/name` plugins no longer collide with unscoped installs. (#47413) Thanks @vincentkoc.
diff --git a/docs/help/testing.md b/docs/help/testing.md
index 09388dd769e..ab63db23670 100644
--- a/docs/help/testing.md
+++ b/docs/help/testing.md
@@ -362,7 +362,7 @@ If you want to rely on env keys (e.g. exported in your `~/.profile`), run local
 
 ## Docker runners (optional “works in Linux” checks)
 
-These run `pnpm test:live` inside the repo Docker image, mounting your local config dir and workspace (and sourcing `~/.profile` if mounted). They also bind-mount CLI auth homes like `~/.codex`, `~/.claude`, `~/.qwen`, and `~/.minimax` when present so external-CLI OAuth stays available in-container:
+These run `pnpm test:live` inside the repo Docker image, mounting your local config dir and workspace (and sourcing `~/.profile` if mounted). They also bind-mount CLI auth homes like `~/.codex`, `~/.claude`, `~/.qwen`, and `~/.minimax` read-only under `/host-auth` when present, then copy them into the container home before the run so external-CLI OAuth can refresh tokens without mutating the host auth store:
 
 - Direct models: `pnpm test:docker:live-models` (script: `scripts/test-live-models-docker.sh`)
 - Gateway + dev agent: `pnpm test:docker:live-gateway` (script: `scripts/test-live-gateway-models-docker.sh`)
@@ -373,6 +373,9 @@ These run `pnpm test:live` inside the repo Docker image, mounting your local con
 The live-model Docker runners also bind-mount the current checkout read-only and
 stage it into a temporary workdir inside the container. This keeps the runtime
 image slim while still running Vitest against your exact local source/config.
+`test:docker:live-models` runs the full `pnpm test:live` suite, gateway live
+tests included, so also set the forwarded `OPENCLAW_LIVE_GATEWAY_*` vars when
+you need to narrow or exclude gateway coverage in that Docker lane.
 
 Manual ACP plain-language thread smoke (not CI):
 
@@ -384,8 +387,9 @@ Useful env vars:
 - `OPENCLAW_CONFIG_DIR=...` (default: `~/.openclaw`) mounted to `/home/node/.openclaw`
 - `OPENCLAW_WORKSPACE_DIR=...` (default: `~/.openclaw/workspace`) mounted to `/home/node/.openclaw/workspace`
 - `OPENCLAW_PROFILE_FILE=...` (default: `~/.profile`) mounted to `/home/node/.profile` and sourced before running tests
-- External CLI auth dirs under `$HOME` (`.codex`, `.claude`, `.qwen`, `.minimax`) are mounted read-only to the matching `/home/node/...` paths when present
+- External CLI auth dirs under `$HOME` (`.codex`, `.claude`, `.qwen`, `.minimax`) are mounted read-only under `/host-auth/...`, then copied into `/home/node/...` before tests start
 - `OPENCLAW_LIVE_GATEWAY_MODELS=...` / `OPENCLAW_LIVE_MODELS=...` to narrow the run
+- `OPENCLAW_LIVE_GATEWAY_PROVIDERS=...` / `OPENCLAW_LIVE_PROVIDERS=...` to filter providers in-container
 - `OPENCLAW_LIVE_REQUIRE_PROFILE_KEYS=1` to ensure creds come from the profile store (not env)
 
 ## Docs sanity
diff --git a/package.json b/package.json
index f0904418919..eaae91d6a40 100644
--- a/package.json
+++ b/package.json
@@ -401,7 +401,7 @@
     "dotenv": "^17.3.1",
     "express": "^5.2.1",
     "file-type": "^21.3.2",
-    "gaxios": "^7.1.3",
+    "gaxios": "7.1.3",
     "grammy": "^1.41.1",
     "hono": "4.12.7",
     "https-proxy-agent": "^8.0.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 90ebda912b0..e05340832b6 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -126,7 +126,7 @@ importers:
         specifier: 21.3.2
         version: 21.3.2
       gaxios:
-        specifier: ^7.1.3
+        specifier: 7.1.3
         version: 7.1.3
       grammy:
         specifier: ^1.41.1
diff --git a/scripts/docker/cleanup-smoke/Dockerfile b/scripts/docker/cleanup-smoke/Dockerfile
index 07a2334aa41..f214ffbabf4 100644
--- a/scripts/docker/cleanup-smoke/Dockerfile
+++ b/scripts/docker/cleanup-smoke/Dockerfile
@@ -2,6 +2,8 @@
 
 FROM node:24-bookworm-slim@sha256:b4687aef2571c632a1953695ce4d61d6462a7eda471fe6e272eebf0418f276ba
 
+ENV COREPACK_ENABLE_DOWNLOAD_PROMPT=0
+
 RUN --mount=type=cache,id=openclaw-cleanup-smoke-apt-cache,target=/var/cache/apt,sharing=locked \
   --mount=type=cache,id=openclaw-cleanup-smoke-apt-lists,target=/var/lib/apt,sharing=locked \
   apt-get update \
diff --git a/scripts/e2e/Dockerfile b/scripts/e2e/Dockerfile
index 4669e762c4a..2c23c9ef1b8 100644
--- a/scripts/e2e/Dockerfile
+++ b/scripts/e2e/Dockerfile
@@ -20,7 +20,7 @@ WORKDIR /app
 
 COPY --chown=appuser:appuser package.json pnpm-lock.yaml pnpm-workspace.yaml ./
 COPY --chown=appuser:appuser ui/package.json ./ui/package.json
-COPY --chown=appuser:appuser extensions/memory-core/package.json ./extensions/memory-core/package.json
+COPY --chown=appuser:appuser extensions ./extensions
 COPY --chown=appuser:appuser patches ./patches
 
 RUN --mount=type=cache,id=openclaw-pnpm-store,target=/home/appuser/.local/share/pnpm/store,sharing=locked \
@@ -39,6 +39,9 @@ COPY --chown=appuser:appuser apps/shared/OpenClawKit/Sources/OpenClawKit/Resourc
 COPY --chown=appuser:appuser apps/shared/OpenClawKit/Tools/CanvasA2UI ./apps/shared/OpenClawKit/Tools/CanvasA2UI
 
 RUN pnpm build
-RUN pnpm ui:build
+# Stub the Control UI bundle instead of running `pnpm ui:build`: onboard Docker E2E does not exercise
+# the Control UI itself; it only needs the asset-existence check to pass so configure/onboard can continue.
+RUN mkdir -p dist/control-ui \
+  && printf '%s\n' '<!doctype html><title>OpenClaw Control UI</title>' > dist/control-ui/index.html
 
 CMD ["bash"]
diff --git a/scripts/e2e/doctor-install-switch-docker.sh b/scripts/e2e/doctor-install-switch-docker.sh
index ca91619ef5a..4ca742a362b 100755
--- a/scripts/e2e/doctor-install-switch-docker.sh
+++ b/scripts/e2e/doctor-install-switch-docker.sh
@@ -75,7 +75,7 @@ LOGINCTL
 
   # Install the npm-global variant from the local /app source.
   # `npm pack` can emit script output; keep only the tarball name.
-  pkg_tgz="$(npm pack --silent /app | tail -n 1 | tr -d '\r')"
+  pkg_tgz="$(npm pack --ignore-scripts --silent /app | tail -n 1 | tr -d '\r')"
   if [ ! -f "/app/$pkg_tgz" ]; then
     echo "npm pack failed (expected /app/$pkg_tgz)"
     exit 1
diff --git a/scripts/e2e/onboard-docker.sh b/scripts/e2e/onboard-docker.sh
index 49b08dcc2ca..70cbd6f0c51 100755
--- a/scripts/e2e/onboard-docker.sh
+++ b/scripts/e2e/onboard-docker.sh
@@ -74,8 +74,14 @@ TRASH
           try { text = fs.readFileSync(file, \"utf8\"); } catch { process.exit(1); }
           // Clack/script output can include lots of control sequences; keep a larger tail and strip ANSI more robustly.
           if (text.length > 120000) text = text.slice(-120000);
-          const stripAnsi = (value) =>
+          const normalizeScriptOutput = (value) =>
             value
+              // util-linux script can emit each byte on its own CRLF-delimited line.
+              // Collapse those first so ANSI/control stripping works on real sequences.
+              .replace(/\\r?\\n/g, \"\")
+              .replace(/\\r/g, \"\");
+          const stripAnsi = (value) =>
+            normalizeScriptOutput(value)
               // OSC: ESC ] ... BEL or ESC \\
               .replace(/\\x1b\\][^\\x07]*(?:\\x07|\\x1b\\\\)/g, \"\")
               // CSI: ESC [ ... cmd
@@ -269,23 +275,24 @@ TRASH
   }
 
   send_channels_flow() {
-    # Configure channels via configure wizard.
-    # Prompts are interactive; notes are not. Use conservative delays to stay in sync.
-    # Where will the Gateway run? -> Local (default)
-    send $'"'"'\r'"'"' 1.2
-    # Channels mode -> Configure/link (default)
-    send $'"'"'\r'"'"' 1.5
+    # Configure channels via configure wizard. Sync on prompt text so
+    # keystrokes do not drift into the wrong screen when render timing changes.
+    wait_for_log "Where will the Gateway run?" 120
+    send $'"'"'\r'"'"' 0.6
+    wait_for_log "Channels" 120
+    send $'"'"'\r'"'"' 0.6
     # Select a channel -> Finished (last option; clack wraps on Up)
-    send $'"'"'\e[A\r'"'"' 2.0
+    wait_for_log "Select a channel" 120
+    send $'"'"'\e[A\r'"'"' 0.8
     # Keep stdin open until wizard exits.
-    send "" 2.5
+    send "" 2.0
   }
 
   send_skills_flow() {
-    # configure --section skills still runs the configure wizard; the first prompt is gateway location.
-    # Avoid log-based synchronization here; clack output can fragment ANSI sequences and break matching.
-    send $'"'"'\r'"'"' 3.0
-    wait_for_log "Configure skills now?" 120 true || true
+    # configure --section skills still runs the configure wizard.
+    wait_for_log "Where will the Gateway run?" 120
+    send $'"'"'\r'"'"' 0.6
+    wait_for_log "Configure skills now?" 120
     send $'"'"'n\r'"'"' 0.8
     send "" 2.0
   }
diff --git a/scripts/e2e/plugins-docker.sh b/scripts/e2e/plugins-docker.sh
index 587840ec93a..632d6924099 100755
--- a/scripts/e2e/plugins-docker.sh
+++ b/scripts/e2e/plugins-docker.sh
@@ -8,7 +8,7 @@ echo "Building Docker image..."
 docker build -t "$IMAGE_NAME" -f "$ROOT_DIR/scripts/e2e/Dockerfile" "$ROOT_DIR"
 
 echo "Running plugins Docker E2E..."
-docker run --rm -i "$IMAGE_NAME" bash -s <<'EOF'
+docker run --rm -e COREPACK_ENABLE_DOWNLOAD_PROMPT=0 -i "$IMAGE_NAME" bash -s <<'EOF'
 set -euo pipefail
 
 if [ -f dist/index.mjs ]; then
diff --git a/scripts/test-live-gateway-models-docker.sh b/scripts/test-live-gateway-models-docker.sh
index f40e064910b..a3e1036171f 100755
--- a/scripts/test-live-gateway-models-docker.sh
+++ b/scripts/test-live-gateway-models-docker.sh
@@ -17,13 +17,20 @@ EXTERNAL_AUTH_MOUNTS=()
 for auth_dir in .claude .codex .minimax .qwen; do
   host_path="$HOME/$auth_dir"
   if [[ -d "$host_path" ]]; then
-    EXTERNAL_AUTH_MOUNTS+=(-v "$host_path":/home/node/"$auth_dir":ro)
+    EXTERNAL_AUTH_MOUNTS+=(-v "$host_path":/host-auth/"$auth_dir":ro)
   fi
 done
 
 read -r -d '' LIVE_TEST_CMD <<'EOF' || true
 set -euo pipefail
 [ -f "$HOME/.profile" ] && source "$HOME/.profile" || true
+for auth_dir in .claude .codex .minimax .qwen; do
+  if [ -d "/host-auth/$auth_dir" ]; then
+    mkdir -p "$HOME/$auth_dir"
+    cp -R "/host-auth/$auth_dir/." "$HOME/$auth_dir"
+    chmod -R u+rwX "$HOME/$auth_dir" || true
+  fi
+done
 tmp_dir="$(mktemp -d)"
 cleanup() {
   rm -rf "$tmp_dir"
diff --git a/scripts/test-live-models-docker.sh b/scripts/test-live-models-docker.sh
index 52257cd3230..c1cec5b2740 100755
--- a/scripts/test-live-models-docker.sh
+++ b/scripts/test-live-models-docker.sh
@@ -17,13 +17,20 @@ EXTERNAL_AUTH_MOUNTS=()
 for auth_dir in .claude .codex .minimax .qwen; do
   host_path="$HOME/$auth_dir"
   if [[ -d "$host_path" ]]; then
-    EXTERNAL_AUTH_MOUNTS+=(-v "$host_path":/home/node/"$auth_dir":ro)
+    EXTERNAL_AUTH_MOUNTS+=(-v "$host_path":/host-auth/"$auth_dir":ro)
   fi
 done
 
 read -r -d '' LIVE_TEST_CMD <<'EOF' || true
 set -euo pipefail
 [ -f "$HOME/.profile" ] && source "$HOME/.profile" || true
+for auth_dir in .claude .codex .minimax .qwen; do
+  if [ -d "/host-auth/$auth_dir" ]; then
+    mkdir -p "$HOME/$auth_dir"
+    cp -R "/host-auth/$auth_dir/." "$HOME/$auth_dir"
+    chmod -R u+rwX "$HOME/$auth_dir" || true
+  fi
+done
 tmp_dir="$(mktemp -d)"
 cleanup() {
   rm -rf "$tmp_dir"
@@ -57,6 +64,9 @@ docker run --rm -t \
   -e OPENCLAW_LIVE_MAX_MODELS="${OPENCLAW_LIVE_MAX_MODELS:-${CLAWDBOT_LIVE_MAX_MODELS:-48}}" \
   -e OPENCLAW_LIVE_MODEL_TIMEOUT_MS="${OPENCLAW_LIVE_MODEL_TIMEOUT_MS:-${CLAWDBOT_LIVE_MODEL_TIMEOUT_MS:-}}" \
   -e OPENCLAW_LIVE_REQUIRE_PROFILE_KEYS="${OPENCLAW_LIVE_REQUIRE_PROFILE_KEYS:-${CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS:-}}" \
+  -e OPENCLAW_LIVE_GATEWAY_MODELS="${OPENCLAW_LIVE_GATEWAY_MODELS:-${CLAWDBOT_LIVE_GATEWAY_MODELS:-}}" \
+  -e OPENCLAW_LIVE_GATEWAY_PROVIDERS="${OPENCLAW_LIVE_GATEWAY_PROVIDERS:-${CLAWDBOT_LIVE_GATEWAY_PROVIDERS:-}}" \
+  -e OPENCLAW_LIVE_GATEWAY_MAX_MODELS="${OPENCLAW_LIVE_GATEWAY_MAX_MODELS:-${CLAWDBOT_LIVE_GATEWAY_MAX_MODELS:-}}" \
   -v "$ROOT_DIR":/src:ro \
   -v "$CONFIG_DIR":/home/node/.openclaw \
   -v "$WORKSPACE_DIR":/home/node/.openclaw/workspace \
diff --git a/src/agents/auth-profiles.external-cli-sync.test.ts b/src/agents/auth-profiles.external-cli-sync.test.ts
index 303b85b72d2..eae0fab70af 100644
--- a/src/agents/auth-profiles.external-cli-sync.test.ts
+++ b/src/agents/auth-profiles.external-cli-sync.test.ts
@@ -51,4 +51,40 @@ describe("syncExternalCliCredentials", () => {
     });
     expect(store.profiles[CODEX_CLI_PROFILE_ID]).toBeUndefined();
   });
+
+  it("refreshes stored Codex expiry from external CLI even when the cached profile looks fresh", () => {
+    const staleExpiry = Date.now() + 30 * 60_000;
+    const freshExpiry = Date.now() + 5 * 24 * 60 * 60_000;
+    mocks.readCodexCliCredentialsCached.mockReturnValue({
+      type: "oauth",
+      provider: "openai-codex",
+      access: "new-access-token",
+      refresh: "new-refresh-token",
+      expires: freshExpiry,
+      accountId: "acct_456",
+    });
+
+    const store: AuthProfileStore = {
+      version: 1,
+      profiles: {
+        [OPENAI_CODEX_DEFAULT_PROFILE_ID]: {
+          type: "oauth",
+          provider: "openai-codex",
+          access: "old-access-token",
+          refresh: "old-refresh-token",
+          expires: staleExpiry,
+          accountId: "acct_456",
+        },
+      },
+    };
+
+    const mutated = syncExternalCliCredentials(store);
+
+    expect(mutated).toBe(true);
+    expect(store.profiles[OPENAI_CODEX_DEFAULT_PROFILE_ID]).toMatchObject({
+      access: "new-access-token",
+      refresh: "new-refresh-token",
+      expires: freshExpiry,
+    });
+  });
 });
diff --git a/src/agents/auth-profiles/external-cli-sync.ts b/src/agents/auth-profiles/external-cli-sync.ts
index 7e490c97c94..ff43b586b48 100644
--- a/src/agents/auth-profiles/external-cli-sync.ts
+++ b/src/agents/auth-profiles/external-cli-sync.ts
@@ -4,13 +4,12 @@ import {
   readMiniMaxCliCredentialsCached,
 } from "../cli-credentials.js";
 import {
-  EXTERNAL_CLI_NEAR_EXPIRY_MS,
   EXTERNAL_CLI_SYNC_TTL_MS,
   QWEN_CLI_PROFILE_ID,
   MINIMAX_CLI_PROFILE_ID,
   log,
 } from "./constants.js";
-import type { AuthProfileCredential, AuthProfileStore, OAuthCredential } from "./types.js";
+import type { AuthProfileStore, OAuthCredential } from "./types.js";
 
 const OPENAI_CODEX_DEFAULT_PROFILE_ID = "openai-codex:default";
 
@@ -37,62 +36,33 @@ function shallowEqualOAuthCredentials(a: OAuthCredential | undefined, b: OAuthCr
   );
 }
 
-function isExternalProfileFresh(cred: AuthProfileCredential | undefined, now: number): boolean {
-  if (!cred) {
-    return false;
-  }
-  if (cred.type !== "oauth" && cred.type !== "token") {
-    return false;
-  }
-  if (
-    cred.provider !== "qwen-portal" &&
-    cred.provider !== "minimax-portal" &&
-    cred.provider !== "openai-codex"
-  ) {
-    return false;
-  }
-  if (typeof cred.expires !== "number") {
-    return true;
-  }
-  return cred.expires > now + EXTERNAL_CLI_NEAR_EXPIRY_MS;
-}
-
 /** Sync external CLI credentials into the store for a given provider. */
 function syncExternalCliCredentialsForProvider(
   store: AuthProfileStore,
   profileId: string,
   provider: string,
   readCredentials: () => OAuthCredential | null,
-  now: number,
   options: ExternalCliSyncOptions,
 ): boolean {
   const existing = store.profiles[profileId];
-  const shouldSync =
-    !existing || existing.provider !== provider || !isExternalProfileFresh(existing, now);
-  const creds = shouldSync ? readCredentials() : null;
+  const creds = readCredentials();
   if (!creds) {
     return false;
   }
 
   const existingOAuth = existing?.type === "oauth" ? existing : undefined;
-  const shouldUpdate =
-    !existingOAuth ||
-    existingOAuth.provider !== provider ||
-    existingOAuth.expires <= now ||
-    creds.expires > existingOAuth.expires;
-
-  if (shouldUpdate && !shallowEqualOAuthCredentials(existingOAuth, creds)) {
-    store.profiles[profileId] = creds;
-    if (options.log !== false) {
-      log.info(`synced ${provider} credentials from external cli`, {
-        profileId,
-        expires: new Date(creds.expires).toISOString(),
-      });
-    }
-    return true;
+  if (shallowEqualOAuthCredentials(existingOAuth, creds)) {
+    return false;
   }
 
-  return false;
+  store.profiles[profileId] = creds;
+  if (options.log !== false) {
+    log.info(`synced ${provider} credentials from external cli`, {
+      profileId,
+      expires: new Date(creds.expires).toISOString(),
+    });
+  }
+  return true;
 }
 
 /**
@@ -106,46 +76,24 @@ export function syncExternalCliCredentials(
   options: ExternalCliSyncOptions = {},
 ): boolean {
   let mutated = false;
-  const now = Date.now();
 
-  // Sync from Qwen Code CLI
-  const existingQwen = store.profiles[QWEN_CLI_PROFILE_ID];
-  const shouldSyncQwen =
-    !existingQwen ||
-    existingQwen.provider !== "qwen-portal" ||
-    !isExternalProfileFresh(existingQwen, now);
-  const qwenCreds = shouldSyncQwen
-    ? readQwenCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS })
-    : null;
-  if (qwenCreds) {
-    const existing = store.profiles[QWEN_CLI_PROFILE_ID];
-    const existingOAuth = existing?.type === "oauth" ? existing : undefined;
-    const shouldUpdate =
-      !existingOAuth ||
-      existingOAuth.provider !== "qwen-portal" ||
-      existingOAuth.expires <= now ||
-      qwenCreds.expires > existingOAuth.expires;
-
-    if (shouldUpdate && !shallowEqualOAuthCredentials(existingOAuth, qwenCreds)) {
-      store.profiles[QWEN_CLI_PROFILE_ID] = qwenCreds;
-      mutated = true;
-      if (options.log !== false) {
-        log.info("synced qwen credentials from qwen cli", {
-          profileId: QWEN_CLI_PROFILE_ID,
-          expires: new Date(qwenCreds.expires).toISOString(),
-        });
-      }
-    }
+  if (
+    syncExternalCliCredentialsForProvider(
+      store,
+      QWEN_CLI_PROFILE_ID,
+      "qwen-portal",
+      () => readQwenCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS }),
+      options,
+    )
+  ) {
+    mutated = true;
   }
-
-  // Sync from MiniMax Portal CLI
   if (
     syncExternalCliCredentialsForProvider(
       store,
       MINIMAX_CLI_PROFILE_ID,
       "minimax-portal",
       () => readMiniMaxCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS }),
-      now,
       options,
     )
   ) {
@@ -157,7 +105,6 @@ export function syncExternalCliCredentials(
       OPENAI_CODEX_DEFAULT_PROFILE_ID,
       "openai-codex",
       () => readCodexCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS }),
-      now,
       options,
     )
   ) {
diff --git a/src/agents/cli-credentials.test.ts b/src/agents/cli-credentials.test.ts
index fcfaf21450d..53be1581b13 100644
--- a/src/agents/cli-credentials.test.ts
+++ b/src/agents/cli-credentials.test.ts
@@ -46,6 +46,12 @@ async function readCachedClaudeCliCredentials(allowKeychainPrompt: boolean) {
   });
 }
 
+function createJwtWithExp(expSeconds: number): string { // Test helper: builds a JWT-shaped string whose payload holds only `exp` (in seconds).
+  const encode = (value: Record<string, unknown>) => // JSON -> base64url, the encoding used for JWT segments.
+    Buffer.from(JSON.stringify(value)).toString("base64url");
+  return `${encode({ alg: "RS256", typ: "JWT" })}.${encode({ exp: expSeconds })}.signature`; // Third segment is a literal placeholder, not a real signature.
+}
+
 describe("cli credentials", () => {
   beforeAll(async () => {
     ({
@@ -229,6 +235,7 @@ describe("cli credentials", () => {
   it("reads Codex credentials from keychain when available", async () => {
     const tempHome = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-codex-"));
     process.env.CODEX_HOME = tempHome;
+    const expSeconds = Math.floor(Date.parse("2026-03-23T00:48:49Z") / 1000);
 
     const accountHash = "cli|";
 
@@ -238,7 +245,7 @@ describe("cli credentials", () => {
       expect(cmd).toContain(accountHash);
       return JSON.stringify({
         tokens: {
-          access_token: "keychain-access",
+          access_token: createJwtWithExp(expSeconds),
           refresh_token: "keychain-refresh",
         },
         last_refresh: "2026-01-01T00:00:00Z",
@@ -248,15 +255,17 @@ describe("cli credentials", () => {
     const creds = readCodexCliCredentials({ platform: "darwin", execSync: execSyncMock });
 
     expect(creds).toMatchObject({
-      access: "keychain-access",
+      access: createJwtWithExp(expSeconds),
       refresh: "keychain-refresh",
       provider: "openai-codex",
+      expires: expSeconds * 1000,
     });
   });
 
   it("falls back to Codex auth.json when keychain is unavailable", async () => {
     const tempHome = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-codex-"));
     process.env.CODEX_HOME = tempHome;
+    const expSeconds = Math.floor(Date.parse("2026-03-24T12:34:56Z") / 1000);
     execSyncMock.mockImplementation(() => {
       throw new Error("not found");
     });
@@ -267,7 +276,7 @@ describe("cli credentials", () => {
       authPath,
       JSON.stringify({
         tokens: {
-          access_token: "file-access",
+          access_token: createJwtWithExp(expSeconds),
           refresh_token: "file-refresh",
         },
       }),
@@ -277,9 +286,10 @@ describe("cli credentials", () => {
     const creds = readCodexCliCredentials({ execSync: execSyncMock });
 
     expect(creds).toMatchObject({
-      access: "file-access",
+      access: createJwtWithExp(expSeconds),
       refresh: "file-refresh",
       provider: "openai-codex",
+      expires: expSeconds * 1000,
     });
   });
 });
diff --git a/src/agents/cli-credentials.ts b/src/agents/cli-credentials.ts
index 0d6d7c28c84..8ded765346a 100644
--- a/src/agents/cli-credentials.ts
+++ b/src/agents/cli-credentials.ts
@@ -153,6 +153,22 @@ function computeCodexKeychainAccount(codexHome: string) {
   return `cli|${hash.slice(0, 16)}`;
 }
 
+function decodeJwtExpiryMs(token: string): number | null { // Returns the JWT payload `exp` claim (seconds) as epoch milliseconds, or null when unavailable; the signature is not checked.
+  const parts = token.split(".");
+  if (parts.length < 2) { // No second dot-separated segment, so there is no payload to decode.
+    return null;
+  }
+  try {
+    const payloadRaw = Buffer.from(parts[1], "base64url").toString("utf8"); // The payload is the second segment, base64url-encoded JSON.
+    const payload = JSON.parse(payloadRaw) as { exp?: unknown };
+    return typeof payload.exp === "number" && Number.isFinite(payload.exp) && payload.exp > 0 // Accept only a positive, finite numeric `exp`.
+      ? payload.exp * 1000
+      : null;
+  } catch { // Any decode/parse failure means no usable expiry; callers fall back to their own estimate.
+    return null;
+  }
+}
+
 function readCodexKeychainCredentials(options?: {
   platform?: NodeJS.Platform;
   execSync?: ExecSyncFn;
@@ -193,9 +209,10 @@ function readCodexKeychainCredentials(options?: {
       typeof lastRefreshRaw === "string" || typeof lastRefreshRaw === "number"
         ? new Date(lastRefreshRaw).getTime()
         : Date.now();
-    const expires = Number.isFinite(lastRefresh)
+    const fallbackExpiry = Number.isFinite(lastRefresh)
       ? lastRefresh + 60 * 60 * 1000
       : Date.now() + 60 * 60 * 1000;
+    const expires = decodeJwtExpiryMs(accessToken) ?? fallbackExpiry;
     const accountId = typeof tokens?.account_id === "string" ? tokens.account_id : undefined;
 
     log.info("read codex credentials from keychain", {
@@ -483,13 +500,14 @@ export function readCodexCliCredentials(options?: {
     return null;
   }
 
-  let expires: number;
+  let fallbackExpiry: number;
   try {
     const stat = fs.statSync(authPath);
-    expires = stat.mtimeMs + 60 * 60 * 1000;
+    fallbackExpiry = stat.mtimeMs + 60 * 60 * 1000;
   } catch {
-    expires = Date.now() + 60 * 60 * 1000;
+    fallbackExpiry = Date.now() + 60 * 60 * 1000;
   }
+  const expires = decodeJwtExpiryMs(accessToken) ?? fallbackExpiry;
 
   return {
     type: "oauth",
diff --git a/src/agents/models.profiles.live.test.ts b/src/agents/models.profiles.live.test.ts
index 515d2b48ce6..87cbbb6a203 100644
--- a/src/agents/models.profiles.live.test.ts
+++ b/src/agents/models.profiles.live.test.ts
@@ -117,6 +117,10 @@ function isChatGPTUsageLimitErrorMessage(raw: string): boolean {
   return msg.includes("hit your chatgpt usage limit") && msg.includes("try again in");
 }
 
+function isRefreshTokenReused(raw: string): boolean { // True when the error text contains "refresh_token_reused".
+  return /refresh_token_reused/i.test(raw); // Case-insensitive substring match anywhere in the message.
+}
+
 function isInstructionsRequiredError(raw: string): boolean {
   return /instructions are required/i.test(raw);
 }
@@ -643,6 +647,15 @@ describeLive("live models (profile keys)", () => {
               logProgress(`${progressLabel}: skip (rate limit)`);
               break;
             }
+            if (
+              allowNotFoundSkip &&
+              model.provider === "openai-codex" &&
+              isRefreshTokenReused(message)
+            ) {
+              skipped.push({ model: id, reason: message });
+              logProgress(`${progressLabel}: skip (codex refresh token reused)`);
+              break;
+            }
             if (
               allowNotFoundSkip &&
               model.provider === "openai-codex" &&
diff --git a/src/gateway/gateway-models.profiles.live.test.ts b/src/gateway/gateway-models.profiles.live.test.ts
index 6a74c98da3b..973cf952d16 100644
--- a/src/gateway/gateway-models.profiles.live.test.ts
+++ b/src/gateway/gateway-models.profiles.live.test.ts
@@ -24,7 +24,7 @@ import { shouldSuppressBuiltInModel } from "../agents/model-suppression.js";
 import { ensureOpenClawModelsJson } from "../agents/models-config.js";
 import { isRateLimitErrorMessage } from "../agents/pi-embedded-helpers/errors.js";
 import { discoverAuthStorage, discoverModels } from "../agents/pi-model-discovery.js";
-import { loadConfig } from "../config/config.js";
+import { clearRuntimeConfigSnapshot, loadConfig } from "../config/config.js";
 import type { ModelsConfig, OpenClawConfig, ModelProviderConfig } from "../config/types.js";
 import { isTruthyEnvValue } from "../infra/env.js";
 import { DEFAULT_AGENT_ID } from "../routing/session-key.js";
@@ -38,7 +38,7 @@ import {
   shouldRetryToolReadProbe,
 } from "./live-tool-probe-utils.js";
 import { startGatewayServer } from "./server.js";
-import { extractPayloadText } from "./test-helpers.agent-results.js";
+import { loadSessionEntry, readSessionMessages } from "./session-utils.js";
 
 const LIVE = isTruthyEnvValue(process.env.LIVE) || isTruthyEnvValue(process.env.OPENCLAW_LIVE_TEST);
 const GATEWAY_LIVE = isTruthyEnvValue(process.env.OPENCLAW_LIVE_GATEWAY);
@@ -171,6 +171,32 @@ function logProgress(message: string): void {
   console.log(`[live] ${message}`);
 }
 
+function enterProductionEnvForLiveRun() { // Unsets VITEST and forces NODE_ENV=production; returns the prior values so they can be restored.
+  const previous = { // Snapshot taken before process.env is mutated below.
+    vitest: process.env.VITEST,
+    nodeEnv: process.env.NODE_ENV,
+  };
+  delete process.env.VITEST;
+  process.env.NODE_ENV = "production";
+  return previous; // Hand this to restoreProductionEnvForLiveRun when the live run is done.
+}
+
+function restoreProductionEnvForLiveRun(previous: { // Puts VITEST and NODE_ENV back to the values captured by enterProductionEnvForLiveRun.
+  vitest: string | undefined;
+  nodeEnv: string | undefined;
+}) {
+  if (previous.vitest === undefined) { // Was unset before: delete it, because assigning undefined to process.env stores the string "undefined".
+    delete process.env.VITEST;
+  } else {
+    process.env.VITEST = previous.vitest;
+  }
+  if (previous.nodeEnv === undefined) { // Same unset-vs-assign handling for NODE_ENV.
+    delete process.env.NODE_ENV;
+  } else {
+    process.env.NODE_ENV = previous.nodeEnv;
+  }
+}
+
 function formatFailurePreview(
   failures: Array<{ model: string; error: string }>,
   maxItems: number,
@@ -319,25 +345,14 @@ async function runAnthropicRefusalProbe(params: {
 }): Promise<void> {
   logProgress(`${params.label}: refusal-probe`);
   const magic = buildAnthropicRefusalToken();
-  const runId = randomUUID();
-  const probe = await withGatewayLiveProbeTimeout(
-    params.client.request<AgentFinalPayload>(
-      "agent",
-      {
-        sessionKey: params.sessionKey,
-        idempotencyKey: `idem-${runId}-refusal`,
-        message: `Reply with the single word ok. Test token: ${magic}`,
-        thinking: params.thinkingLevel,
-        deliver: false,
-      },
-      { expectFinal: true },
-    ),
-    `${params.label}: refusal-probe`,
-  );
-  if (probe?.status !== "ok") {
-    throw new Error(`refusal probe failed: status=${String(probe?.status)}`);
-  }
-  const probeText = extractPayloadText(probe?.result);
+  const probeText = await requestGatewayAgentText({
+    client: params.client,
+    sessionKey: params.sessionKey,
+    idempotencyKey: `idem-${randomUUID()}-refusal`,
+    message: `Reply with the single word ok. Test token: ${magic}`,
+    thinkingLevel: params.thinkingLevel,
+    context: `${params.label}: refusal-probe`,
+  });
   assertNoReasoningTags({
     text: probeText,
     model: params.modelKey,
@@ -348,25 +363,14 @@ async function runAnthropicRefusalProbe(params: {
     throw new Error(`refusal probe missing ok: ${probeText}`);
   }
 
-  const followupId = randomUUID();
-  const followup = await withGatewayLiveProbeTimeout(
-    params.client.request<AgentFinalPayload>(
-      "agent",
-      {
-        sessionKey: params.sessionKey,
-        idempotencyKey: `idem-${followupId}-refusal-followup`,
-        message: "Now reply with exactly: still ok.",
-        thinking: params.thinkingLevel,
-        deliver: false,
-      },
-      { expectFinal: true },
-    ),
-    `${params.label}: refusal-followup`,
-  );
-  if (followup?.status !== "ok") {
-    throw new Error(`refusal followup failed: status=${String(followup?.status)}`);
-  }
-  const followupText = extractPayloadText(followup?.result);
+  const followupText = await requestGatewayAgentText({
+    client: params.client,
+    sessionKey: params.sessionKey,
+    idempotencyKey: `idem-${randomUUID()}-refusal-followup`,
+    message: "Now reply with exactly: still ok.",
+    thinkingLevel: params.thinkingLevel,
+    context: `${params.label}: refusal-followup`,
+  });
   assertNoReasoningTags({
     text: followupText,
     model: params.modelKey,
@@ -475,11 +479,6 @@ async function getFreeGatewayPort(): Promise<number> {
   throw new Error("failed to acquire a free gateway port block");
 }
 
-type AgentFinalPayload = {
-  status?: unknown;
-  result?: unknown;
-};
-
 async function connectClient(params: { url: string; token: string }) {
   return await new Promise<GatewayClient>((resolve, reject) => {
     let settled = false;
@@ -513,6 +512,115 @@ async function connectClient(params: { url: string; token: string }) {
   });
 }
 
+function extractTranscriptMessageText(message: unknown): string {
+  if (!message || typeof message !== "object") {
+    return "";
+  }
+  const record = message as {
+    text?: unknown;
+    content?: unknown;
+  };
+  if (typeof record.text === "string" && record.text.trim()) {
+    return record.text.trim();
+  }
+  if (typeof record.content === "string" && record.content.trim()) {
+    return record.content.trim();
+  }
+  if (!Array.isArray(record.content)) {
+    return "";
+  }
+  return record.content
+    .map((entry) => {
+      if (!entry || typeof entry !== "object") {
+        return "";
+      }
+      const text = (entry as { text?: unknown }).text;
+      return typeof text === "string" && text.trim() ? text.trim() : "";
+    })
+    .filter(Boolean)
+    .join("\n")
+    .trim();
+}
+
+function readSessionAssistantTexts(sessionKey: string): string[] {
+  const { storePath, entry } = loadSessionEntry(sessionKey);
+  if (!entry?.sessionId) {
+    return [];
+  }
+  const messages = readSessionMessages(entry.sessionId, storePath, entry.sessionFile);
+  const assistantTexts: string[] = [];
+  for (const message of messages) {
+    if (!message || typeof message !== "object") {
+      continue;
+    }
+    const role = (message as { role?: unknown }).role;
+    if (role !== "assistant") {
+      continue;
+    }
+    assistantTexts.push(extractTranscriptMessageText(message));
+  }
+  return assistantTexts;
+}
+
+async function waitForSessionAssistantText(params: {
+  sessionKey: string;
+  baselineAssistantCount: number;
+  context: string;
+}) {
+  const startedAt = Date.now();
+  let delayMs = 50;
+  while (Date.now() - startedAt < GATEWAY_LIVE_PROBE_TIMEOUT_MS) {
+    const assistantTexts = readSessionAssistantTexts(params.sessionKey);
+    if (assistantTexts.length > params.baselineAssistantCount) {
+      const freshText = assistantTexts
+        .slice(params.baselineAssistantCount)
+        .map((text) => text.trim())
+        .findLast((text) => text.length > 0);
+      if (freshText) {
+        return freshText;
+      }
+    }
+    await new Promise((resolve) => setTimeout(resolve, delayMs));
+    delayMs = Math.min(delayMs * 2, 250);
+  }
+  throw new Error(`probe timeout after ${GATEWAY_LIVE_PROBE_TIMEOUT_MS}ms (${params.context})`);
+}
+
+async function requestGatewayAgentText(params: {
+  client: GatewayClient;
+  sessionKey: string;
+  message: string;
+  thinkingLevel: string;
+  context: string;
+  idempotencyKey: string;
+  attachments?: Array<{
+    mimeType: string;
+    fileName: string;
+    content: string;
+  }>;
+}) {
+  const baselineAssistantCount = readSessionAssistantTexts(params.sessionKey).length;
+  const accepted = await withGatewayLiveProbeTimeout(
+    params.client.request<{ runId?: unknown; status?: unknown }>("agent", {
+      sessionKey: params.sessionKey,
+      idempotencyKey: params.idempotencyKey,
+      message: params.message,
+      thinking: params.thinkingLevel,
+      deliver: false,
+      attachments: params.attachments,
+    }),
+    `${params.context}: agent-accept`,
+  );
+  if (accepted?.status !== "accepted") {
+    throw new Error(`agent status=${String(accepted?.status)}`);
+  }
+  return await waitForSessionAssistantText({
+    sessionKey: params.sessionKey,
+    baselineAssistantCount,
+    context: `${params.context}: transcript-final`,
+  });
+}
+
 type GatewayModelSuiteParams = {
   label: string;
   cfg: OpenClawConfig;
@@ -636,6 +744,8 @@ function buildMinimaxProviderOverride(params: {
 }
 
 async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
+  clearRuntimeConfigSnapshot();
+  const runtimeEnv = enterProductionEnvForLiveRun();
   const previous = {
     configPath: process.env.OPENCLAW_CONFIG_PATH,
     token: process.env.OPENCLAW_GATEWAY_TOKEN,
@@ -793,48 +903,26 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
           );
 
           logProgress(`${progressLabel}: prompt`);
-          const runId = randomUUID();
-          const payload = await withGatewayLiveProbeTimeout(
-            client.request<AgentFinalPayload>(
-              "agent",
-              {
-                sessionKey,
-                idempotencyKey: `idem-${runId}`,
-                message:
-                  "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
-                thinking: params.thinkingLevel,
-                deliver: false,
-              },
-              { expectFinal: true },
-            ),
-            `${progressLabel}: prompt`,
-          );
-
-          if (payload?.status !== "ok") {
-            throw new Error(`agent status=${String(payload?.status)}`);
-          }
-          let text = extractPayloadText(payload?.result);
+          let text = await requestGatewayAgentText({
+            client,
+            sessionKey,
+            idempotencyKey: `idem-${randomUUID()}`,
+            message:
+              "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
+            thinkingLevel: params.thinkingLevel,
+            context: `${progressLabel}: prompt`,
+          });
           if (!text) {
             logProgress(`${progressLabel}: empty response, retrying`);
-            const retry = await withGatewayLiveProbeTimeout(
-              client.request<AgentFinalPayload>(
-                "agent",
-                {
-                  sessionKey,
-                  idempotencyKey: `idem-${randomUUID()}-retry`,
-                  message:
-                    "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
-                  thinking: params.thinkingLevel,
-                  deliver: false,
-                },
-                { expectFinal: true },
-              ),
-              `${progressLabel}: prompt-retry`,
-            );
-            if (retry?.status !== "ok") {
-              throw new Error(`agent status=${String(retry?.status)}`);
-            }
-            text = extractPayloadText(retry?.result);
+            text = await requestGatewayAgentText({
+              client,
+              sessionKey,
+              idempotencyKey: `idem-${randomUUID()}-retry`,
+              message:
+                "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
+              thinkingLevel: params.thinkingLevel,
+              context: `${progressLabel}: prompt-retry`,
+            });
           }
           if (!text && isGoogleishProvider(model.provider)) {
             logProgress(`${progressLabel}: skip (google empty response)`);
@@ -881,36 +969,20 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
             toolReadAttempt += 1
           ) {
             const strictReply = toolReadAttempt > 0;
-            const toolProbe = await withGatewayLiveProbeTimeout(
-              client.request<AgentFinalPayload>(
-                "agent",
-                {
-                  sessionKey,
-                  idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`,
-                  message: strictReply
-                    ? "OpenClaw live tool probe (local, safe): " +
-                      `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
-                      `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`
-                    : "OpenClaw live tool probe (local, safe): " +
-                      `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
-                      "Then reply with the two nonce values you read (include both).",
-                  thinking: params.thinkingLevel,
-                  deliver: false,
-                },
-                { expectFinal: true },
-              ),
-              `${progressLabel}: tool-read`,
-            );
-            if (toolProbe?.status !== "ok") {
-              if (toolReadAttempt + 1 < maxToolReadAttempts) {
-                logProgress(
-                  `${progressLabel}: tool-read retry (${toolReadAttempt + 2}/${maxToolReadAttempts}) status=${String(toolProbe?.status)}`,
-                );
-                continue;
-              }
-              throw new Error(`tool probe failed: status=${String(toolProbe?.status)}`);
-            }
-            toolText = extractPayloadText(toolProbe?.result);
+            toolText = await requestGatewayAgentText({
+              client,
+              sessionKey,
+              idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`,
+              message: strictReply
+                ? "OpenClaw live tool probe (local, safe): " +
+                  `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
+                  `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`
+                : "OpenClaw live tool probe (local, safe): " +
+                  `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
+                  "Then reply with the two nonce values you read (include both).",
+              thinkingLevel: params.thinkingLevel,
+              context: `${progressLabel}: tool-read`,
+            });
             if (
               isEmptyStreamText(toolText) &&
               (model.provider === "minimax" || model.provider === "openai-codex")
@@ -960,40 +1032,24 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
               execReadAttempt += 1
             ) {
               const strictReply = execReadAttempt > 0;
-              const execReadProbe = await withGatewayLiveProbeTimeout(
-                client.request<AgentFinalPayload>(
-                  "agent",
-                  {
-                    sessionKey,
-                    idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`,
-                    message: strictReply
-                      ? "OpenClaw live tool probe (local, safe): " +
-                        "use the tool named `exec` (or `Exec`) to run this command: " +
-                        `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
-                        `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
-                        `Then reply with exactly: ${nonceC}. No extra text.`
-                      : "OpenClaw live tool probe (local, safe): " +
-                        "use the tool named `exec` (or `Exec`) to run this command: " +
-                        `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
-                        `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
-                        "Finally reply including the nonce text you read back.",
-                    thinking: params.thinkingLevel,
-                    deliver: false,
-                  },
-                  { expectFinal: true },
-                ),
-                `${progressLabel}: tool-exec`,
-              );
-              if (execReadProbe?.status !== "ok") {
-                if (execReadAttempt + 1 < maxExecReadAttempts) {
-                  logProgress(
-                    `${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) status=${String(execReadProbe?.status)}`,
-                  );
-                  continue;
-                }
-                throw new Error(`exec+read probe failed: status=${String(execReadProbe?.status)}`);
-              }
-              execReadText = extractPayloadText(execReadProbe?.result);
+              execReadText = await requestGatewayAgentText({
+                client,
+                sessionKey,
+                idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`,
+                message: strictReply
+                  ? "OpenClaw live tool probe (local, safe): " +
+                    "use the tool named `exec` (or `Exec`) to run this command: " +
+                    `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
+                    `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
+                    `Then reply with exactly: ${nonceC}. No extra text.`
+                  : "OpenClaw live tool probe (local, safe): " +
+                    "use the tool named `exec` (or `Exec`) to run this command: " +
+                    `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
+                    `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
+                    "Finally reply including the nonce text you read back.",
+                thinkingLevel: params.thinkingLevel,
+                context: `${progressLabel}: tool-exec`,
+              });
               if (
                 isEmptyStreamText(execReadText) &&
                 (model.provider === "minimax" || model.provider === "openai-codex")
@@ -1040,62 +1096,51 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
             const imageBase64 = renderCatNoncePngBase64(imageCode);
             const runIdImage = randomUUID();
 
-            const imageProbe = await withGatewayLiveProbeTimeout(
-              client.request<AgentFinalPayload>(
-                "agent",
+            const imageText = await requestGatewayAgentText({
+              client,
+              sessionKey,
+              idempotencyKey: `idem-${runIdImage}-image`,
+              message:
+                "Look at the attached image. Reply with exactly two tokens separated by a single space: " +
+                "(1) the animal shown or written in the image, lowercase; " +
+                "(2) the code printed in the image, uppercase. No extra text.",
+              attachments: [
                 {
-                  sessionKey,
-                  idempotencyKey: `idem-${runIdImage}-image`,
-                  message:
-                    "Look at the attached image. Reply with exactly two tokens separated by a single space: " +
-                    "(1) the animal shown or written in the image, lowercase; " +
-                    "(2) the code printed in the image, uppercase. No extra text.",
-                  attachments: [
-                    {
-                      mimeType: "image/png",
-                      fileName: `probe-${runIdImage}.png`,
-                      content: imageBase64,
-                    },
-                  ],
-                  thinking: params.thinkingLevel,
-                  deliver: false,
+                  mimeType: "image/png",
+                  fileName: `probe-${runIdImage}.png`,
+                  content: imageBase64,
                 },
-                { expectFinal: true },
-              ),
-              `${progressLabel}: image`,
-            );
+              ],
+              thinkingLevel: params.thinkingLevel,
+              context: `${progressLabel}: image`,
+            });
             // Best-effort: do not fail the whole live suite on flaky image handling.
             // (We still keep prompt + tool probes as hard checks.)
-            if (imageProbe?.status !== "ok") {
-              logProgress(`${progressLabel}: image skip (status=${String(imageProbe?.status)})`);
+            if (
+              isEmptyStreamText(imageText) &&
+              (model.provider === "minimax" || model.provider === "openai-codex")
+            ) {
+              logProgress(`${progressLabel}: image skip (${model.provider} empty response)`);
             } else {
-              const imageText = extractPayloadText(imageProbe?.result);
-              if (
-                isEmptyStreamText(imageText) &&
-                (model.provider === "minimax" || model.provider === "openai-codex")
-              ) {
-                logProgress(`${progressLabel}: image skip (${model.provider} empty response)`);
+              assertNoReasoningTags({
+                text: imageText,
+                model: modelKey,
+                phase: "image",
+                label: params.label,
+              });
+              if (!/\bcat\b/i.test(imageText)) {
+                logProgress(`${progressLabel}: image skip (missing 'cat')`);
               } else {
-                assertNoReasoningTags({
-                  text: imageText,
-                  model: modelKey,
-                  phase: "image",
-                  label: params.label,
-                });
-                if (!/\bcat\b/i.test(imageText)) {
-                  logProgress(`${progressLabel}: image skip (missing 'cat')`);
-                } else {
-                  const candidates = imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
-                  const bestDistance = candidates.reduce((best, cand) => {
-                    if (Math.abs(cand.length - imageCode.length) > 2) {
-                      return best;
-                    }
-                    return Math.min(best, editDistance(cand, imageCode));
-                  }, Number.POSITIVE_INFINITY);
-                  // OCR / image-read flake: allow a small edit distance, but still require the "cat" token above.
-                  if (!(bestDistance <= 3)) {
-                    logProgress(`${progressLabel}: image skip (code mismatch)`);
+                const candidates = imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
+                const bestDistance = candidates.reduce((best, cand) => {
+                  if (Math.abs(cand.length - imageCode.length) > 2) {
+                    return best;
                   }
+                  return Math.min(best, editDistance(cand, imageCode));
+                }, Number.POSITIVE_INFINITY);
+                // OCR / image-read flake: allow a small edit distance, but still require the "cat" token above.
+                if (!(bestDistance <= 3)) {
+                  logProgress(`${progressLabel}: image skip (code mismatch)`);
                 }
               }
             }
@@ -1108,24 +1153,14 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
           ) {
             logProgress(`${progressLabel}: tool-only regression`);
             const runId2 = randomUUID();
-            const first = await withGatewayLiveProbeTimeout(
-              client.request<AgentFinalPayload>(
-                "agent",
-                {
-                  sessionKey,
-                  idempotencyKey: `idem-${runId2}-1`,
-                  message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
-                  thinking: params.thinkingLevel,
-                  deliver: false,
-                },
-                { expectFinal: true },
-              ),
-              `${progressLabel}: tool-only-regression-first`,
-            );
-            if (first?.status !== "ok") {
-              throw new Error(`tool-only turn failed: status=${String(first?.status)}`);
-            }
-            const firstText = extractPayloadText(first?.result);
+            const firstText = await requestGatewayAgentText({
+              client,
+              sessionKey,
+              idempotencyKey: `idem-${runId2}-1`,
+              message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
+              thinkingLevel: params.thinkingLevel,
+              context: `${progressLabel}: tool-only-regression-first`,
+            });
             assertNoReasoningTags({
               text: firstText,
               model: modelKey,
@@ -1133,24 +1168,14 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
               label: params.label,
             });
 
-            const second = await withGatewayLiveProbeTimeout(
-              client.request<AgentFinalPayload>(
-                "agent",
-                {
-                  sessionKey,
-                  idempotencyKey: `idem-${runId2}-2`,
-                  message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
-                  thinking: params.thinkingLevel,
-                  deliver: false,
-                },
-                { expectFinal: true },
-              ),
-              `${progressLabel}: tool-only-regression-second`,
-            );
-            if (second?.status !== "ok") {
-              throw new Error(`post-tool message failed: status=${String(second?.status)}`);
-            }
-            const reply = extractPayloadText(second?.result);
+            const reply = await requestGatewayAgentText({
+              client,
+              sessionKey,
+              idempotencyKey: `idem-${runId2}-2`,
+              message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
+              thinkingLevel: params.thinkingLevel,
+              context: `${progressLabel}: tool-only-regression-second`,
+            });
             assertNoReasoningTags({
               text: reply,
               model: modelKey,
@@ -1290,6 +1315,8 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
       logProgress(`[${params.label}] skipped all models (missing profiles)`);
     }
   } finally {
+    clearRuntimeConfigSnapshot();
+    restoreProductionEnvForLiveRun(runtimeEnv);
     client.stop();
     await server.close({ reason: "live test complete" });
     await fs.rm(toolProbePath, { force: true });
@@ -1317,6 +1344,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
   it(
     "runs meaningful prompts across models with available keys",
     async () => {
+      clearRuntimeConfigSnapshot();
       const cfg = loadConfig();
       await ensureOpenClawModelsJson(cfg);
 
@@ -1422,6 +1450,8 @@ describeLive("gateway live (dev agent, profile keys)", () => {
     if (!ZAI_FALLBACK) {
       return;
     }
+    clearRuntimeConfigSnapshot();
+    const runtimeEnv = enterProductionEnvForLiveRun();
     const previous = {
       configPath: process.env.OPENCLAW_CONFIG_PATH,
       token: process.env.OPENCLAW_GATEWAY_TOKEN,
@@ -1520,27 +1550,16 @@ describeLive("gateway live (dev agent, profile keys)", () => {
         "zai-fallback: sessions-reset",
       );
 
-      const runId = randomUUID();
-      const toolProbe = await withGatewayLiveProbeTimeout(
-        client.request<AgentFinalPayload>(
-          "agent",
-          {
-            sessionKey,
-            idempotencyKey: `idem-${runId}-tool`,
-            message:
-              `Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
-              `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
-            thinking: THINKING_LEVEL,
-            deliver: false,
-          },
-          { expectFinal: true },
-        ),
-        "zai-fallback: tool-probe",
-      );
-      if (toolProbe?.status !== "ok") {
-        throw new Error(`anthropic tool probe failed: status=${String(toolProbe?.status)}`);
-      }
-      const toolText = extractPayloadText(toolProbe?.result);
+      const toolText = await requestGatewayAgentText({
+        client,
+        sessionKey,
+        idempotencyKey: `idem-${randomUUID()}-tool`,
+        message:
+          `Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
+          `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
+        thinkingLevel: THINKING_LEVEL,
+        context: "zai-fallback: tool-probe",
+      });
       assertNoReasoningTags({
         text: toolText,
         model: "anthropic/claude-opus-4-5",
@@ -1559,27 +1578,16 @@ describeLive("gateway live (dev agent, profile keys)", () => {
         "zai-fallback: sessions-patch-zai",
       );
 
-      const followupId = randomUUID();
-      const followup = await withGatewayLiveProbeTimeout(
-        client.request<AgentFinalPayload>(
-          "agent",
-          {
-            sessionKey,
-            idempotencyKey: `idem-${followupId}-followup`,
-            message:
-              `What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
-              `Reply with exactly: ${nonceA} ${nonceB}.`,
-            thinking: THINKING_LEVEL,
-            deliver: false,
-          },
-          { expectFinal: true },
-        ),
-        "zai-fallback: followup",
-      );
-      if (followup?.status !== "ok") {
-        throw new Error(`zai followup failed: status=${String(followup?.status)}`);
-      }
-      const followupText = extractPayloadText(followup?.result);
+      const followupText = await requestGatewayAgentText({
+        client,
+        sessionKey,
+        idempotencyKey: `idem-${randomUUID()}-followup`,
+        message:
+          `What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
+          `Reply with exactly: ${nonceA} ${nonceB}.`,
+        thinkingLevel: THINKING_LEVEL,
+        context: "zai-fallback: followup",
+      });
       assertNoReasoningTags({
         text: followupText,
         model: "zai/glm-4.7",
@@ -1590,6 +1598,8 @@ describeLive("gateway live (dev agent, profile keys)", () => {
         throw new Error(`zai followup missing nonce: ${followupText}`);
       }
     } finally {
+      clearRuntimeConfigSnapshot();
+      restoreProductionEnvForLiveRun(runtimeEnv);
       client.stop();
       await server.close({ reason: "live test complete" });
       await fs.rm(toolProbePath, { force: true });