From 1ffe8fde84d1c558a23d3ae985800c7bcfaf06a6 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 17 Mar 2026 03:01:11 +0000 Subject: [PATCH] fix: stabilize docker test suite --- CHANGELOG.md | 1 + docs/help/testing.md | 8 +- package.json | 2 +- pnpm-lock.yaml | 2 +- scripts/docker/cleanup-smoke/Dockerfile | 2 + scripts/e2e/Dockerfile | 7 +- scripts/e2e/doctor-install-switch-docker.sh | 2 +- scripts/e2e/onboard-docker.sh | 33 +- scripts/e2e/plugins-docker.sh | 2 +- scripts/test-live-gateway-models-docker.sh | 9 +- scripts/test-live-models-docker.sh | 12 +- .../auth-profiles.external-cli-sync.test.ts | 36 ++ src/agents/auth-profiles/external-cli-sync.ts | 97 +-- src/agents/cli-credentials.test.ts | 18 +- src/agents/cli-credentials.ts | 26 +- src/agents/models.profiles.live.test.ts | 13 + .../gateway-models.profiles.live.test.ts | 562 +++++++++--------- 17 files changed, 450 insertions(+), 382 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d948e2b59ee..24335d41a91 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -78,6 +78,7 @@ Docs: https://docs.openclaw.ai - Z.AI/onboarding: add `glm-5-turbo` to the default Z.AI provider catalog so onboarding-generated configs expose the new model alongside the existing GLM defaults. (#46670) Thanks @tomsun28. - Zalo Personal/group gating: stop reapplying `dmPolicy.allowFrom` as a sender gate for already-allowlisted groups when `groupAllowFrom` is unset, so any member of an allowed group can trigger replies while DMs stay restricted. (#46663) Fixes #40146. Thanks @Takhoffman. - Zalo/plugin runtime: export `resolveClientIp` from `openclaw/plugin-sdk/zalo` so installed builds no longer crash on startup when the webhook monitor loads from the packaged extension instead of the monorepo source tree. (#46549) Thanks @No898. 
+- Docker/live tests: mount external CLI auth homes into writable container copies, derive Codex OAuth expiry from JWT `exp`, refresh synced CLI creds instead of trusting stale cached expiry, and make gateway live probes wait on transcript output so `pnpm test:docker:all` stays green in Linux. - Plugins/install precedence: keep bundled plugins ahead of auto-discovered globals by default, but let an explicitly installed plugin record win its own duplicate-id tie so installed channel plugins load from `~/.openclaw/extensions` after `openclaw plugins install`. (#46722) Thanks @Takhoffman. - Control UI/logging: make browser-safe logger imports avoid eager temp-dir resolution so the bundled Control UI no longer crashes to a blank screen when logging reaches `tmp-openclaw-dir`. (#48469) Fixes #48062. Thanks @7inspire. - Plugins/scoped ids: preserve scoped plugin ids during install and config keying, and keep bundled plugins ahead of discovered duplicate ids by default so `@scope/name` plugins no longer collide with unscoped installs. (#47413) Thanks @vincentkoc. diff --git a/docs/help/testing.md b/docs/help/testing.md index 09388dd769e..ab63db23670 100644 --- a/docs/help/testing.md +++ b/docs/help/testing.md @@ -362,7 +362,7 @@ If you want to rely on env keys (e.g. exported in your `~/.profile`), run local ## Docker runners (optional “works in Linux” checks) -These run `pnpm test:live` inside the repo Docker image, mounting your local config dir and workspace (and sourcing `~/.profile` if mounted). They also bind-mount CLI auth homes like `~/.codex`, `~/.claude`, `~/.qwen`, and `~/.minimax` when present so external-CLI OAuth stays available in-container: +These run `pnpm test:live` inside the repo Docker image, mounting your local config dir and workspace (and sourcing `~/.profile` if mounted). 
They also bind-mount CLI auth homes like `~/.codex`, `~/.claude`, `~/.qwen`, and `~/.minimax` when present, then copy them into the container home before the run so external-CLI OAuth can refresh tokens without mutating the host auth store: - Direct models: `pnpm test:docker:live-models` (script: `scripts/test-live-models-docker.sh`) - Gateway + dev agent: `pnpm test:docker:live-gateway` (script: `scripts/test-live-gateway-models-docker.sh`) @@ -373,6 +373,9 @@ These run `pnpm test:live` inside the repo Docker image, mounting your local con The live-model Docker runners also bind-mount the current checkout read-only and stage it into a temporary workdir inside the container. This keeps the runtime image slim while still running Vitest against your exact local source/config. +`test:docker:live-models` still runs `pnpm test:live`, so pass through +`OPENCLAW_LIVE_GATEWAY_*` as well when you need to narrow or exclude gateway +live coverage from that Docker lane. Manual ACP plain-language thread smoke (not CI): @@ -384,8 +387,9 @@ Useful env vars: - `OPENCLAW_CONFIG_DIR=...` (default: `~/.openclaw`) mounted to `/home/node/.openclaw` - `OPENCLAW_WORKSPACE_DIR=...` (default: `~/.openclaw/workspace`) mounted to `/home/node/.openclaw/workspace` - `OPENCLAW_PROFILE_FILE=...` (default: `~/.profile`) mounted to `/home/node/.profile` and sourced before running tests -- External CLI auth dirs under `$HOME` (`.codex`, `.claude`, `.qwen`, `.minimax`) are mounted read-only to the matching `/home/node/...` paths when present +- External CLI auth dirs under `$HOME` (`.codex`, `.claude`, `.qwen`, `.minimax`) are mounted read-only under `/host-auth/...`, then copied into `/home/node/...` before tests start - `OPENCLAW_LIVE_GATEWAY_MODELS=...` / `OPENCLAW_LIVE_MODELS=...` to narrow the run +- `OPENCLAW_LIVE_GATEWAY_PROVIDERS=...` / `OPENCLAW_LIVE_PROVIDERS=...` to filter providers in-container - `OPENCLAW_LIVE_REQUIRE_PROFILE_KEYS=1` to ensure creds come from the profile store (not env) 
## Docs sanity diff --git a/package.json b/package.json index f0904418919..eaae91d6a40 100644 --- a/package.json +++ b/package.json @@ -401,7 +401,7 @@ "dotenv": "^17.3.1", "express": "^5.2.1", "file-type": "^21.3.2", - "gaxios": "^7.1.3", + "gaxios": "7.1.3", "grammy": "^1.41.1", "hono": "4.12.7", "https-proxy-agent": "^8.0.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 90ebda912b0..e05340832b6 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -126,7 +126,7 @@ importers: specifier: 21.3.2 version: 21.3.2 gaxios: - specifier: ^7.1.3 + specifier: 7.1.3 version: 7.1.3 grammy: specifier: ^1.41.1 diff --git a/scripts/docker/cleanup-smoke/Dockerfile b/scripts/docker/cleanup-smoke/Dockerfile index 07a2334aa41..f214ffbabf4 100644 --- a/scripts/docker/cleanup-smoke/Dockerfile +++ b/scripts/docker/cleanup-smoke/Dockerfile @@ -2,6 +2,8 @@ FROM node:24-bookworm-slim@sha256:b4687aef2571c632a1953695ce4d61d6462a7eda471fe6e272eebf0418f276ba +ENV COREPACK_ENABLE_DOWNLOAD_PROMPT=0 + RUN --mount=type=cache,id=openclaw-cleanup-smoke-apt-cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,id=openclaw-cleanup-smoke-apt-lists,target=/var/lib/apt,sharing=locked \ apt-get update \ diff --git a/scripts/e2e/Dockerfile b/scripts/e2e/Dockerfile index 4669e762c4a..2c23c9ef1b8 100644 --- a/scripts/e2e/Dockerfile +++ b/scripts/e2e/Dockerfile @@ -20,7 +20,7 @@ WORKDIR /app COPY --chown=appuser:appuser package.json pnpm-lock.yaml pnpm-workspace.yaml ./ COPY --chown=appuser:appuser ui/package.json ./ui/package.json -COPY --chown=appuser:appuser extensions/memory-core/package.json ./extensions/memory-core/package.json +COPY --chown=appuser:appuser extensions ./extensions COPY --chown=appuser:appuser patches ./patches RUN --mount=type=cache,id=openclaw-pnpm-store,target=/home/appuser/.local/share/pnpm/store,sharing=locked \ @@ -39,6 +39,9 @@ COPY --chown=appuser:appuser apps/shared/OpenClawKit/Sources/OpenClawKit/Resourc COPY --chown=appuser:appuser 
apps/shared/OpenClawKit/Tools/CanvasA2UI ./apps/shared/OpenClawKit/Tools/CanvasA2UI RUN pnpm build -RUN pnpm ui:build +# Onboard Docker E2E does not exercise the Control UI itself; it only needs the +# asset-existence check to pass so configure/onboard can continue. +RUN mkdir -p dist/control-ui \ + && printf '%s\n' 'OpenClaw Control UI' > dist/control-ui/index.html CMD ["bash"] diff --git a/scripts/e2e/doctor-install-switch-docker.sh b/scripts/e2e/doctor-install-switch-docker.sh index ca91619ef5a..4ca742a362b 100755 --- a/scripts/e2e/doctor-install-switch-docker.sh +++ b/scripts/e2e/doctor-install-switch-docker.sh @@ -75,7 +75,7 @@ LOGINCTL # Install the npm-global variant from the local /app source. # `npm pack` can emit script output; keep only the tarball name. - pkg_tgz="$(npm pack --silent /app | tail -n 1 | tr -d '\r')" + pkg_tgz="$(npm pack --ignore-scripts --silent /app | tail -n 1 | tr -d '\r')" if [ ! -f "/app/$pkg_tgz" ]; then echo "npm pack failed (expected /app/$pkg_tgz)" exit 1 diff --git a/scripts/e2e/onboard-docker.sh b/scripts/e2e/onboard-docker.sh index 49b08dcc2ca..70cbd6f0c51 100755 --- a/scripts/e2e/onboard-docker.sh +++ b/scripts/e2e/onboard-docker.sh @@ -74,8 +74,14 @@ TRASH try { text = fs.readFileSync(file, \"utf8\"); } catch { process.exit(1); } // Clack/script output can include lots of control sequences; keep a larger tail and strip ANSI more robustly. if (text.length > 120000) text = text.slice(-120000); - const stripAnsi = (value) => + const normalizeScriptOutput = (value) => value + // util-linux script can emit each byte on its own CRLF-delimited line. + // Collapse those first so ANSI/control stripping works on real sequences. + .replace(/\\r?\\n/g, \"\") + .replace(/\\r/g, \"\"); + const stripAnsi = (value) => + normalizeScriptOutput(value) // OSC: ESC ] ... BEL or ESC \\ .replace(/\\x1b\\][^\\x07]*(?:\\x07|\\x1b\\\\)/g, \"\") // CSI: ESC [ ... 
cmd @@ -269,23 +275,24 @@ TRASH } send_channels_flow() { - # Configure channels via configure wizard. - # Prompts are interactive; notes are not. Use conservative delays to stay in sync. - # Where will the Gateway run? -> Local (default) - send $'"'"'\r'"'"' 1.2 - # Channels mode -> Configure/link (default) - send $'"'"'\r'"'"' 1.5 + # Configure channels via configure wizard. Sync on prompt text so + # keystrokes do not drift into the wrong screen when render timing changes. + wait_for_log "Where will the Gateway run?" 120 + send $'"'"'\r'"'"' 0.6 + wait_for_log "Channels" 120 + send $'"'"'\r'"'"' 0.6 # Select a channel -> Finished (last option; clack wraps on Up) - send $'"'"'\e[A\r'"'"' 2.0 + wait_for_log "Select a channel" 120 + send $'"'"'\e[A\r'"'"' 0.8 # Keep stdin open until wizard exits. - send "" 2.5 + send "" 2.0 } send_skills_flow() { - # configure --section skills still runs the configure wizard; the first prompt is gateway location. - # Avoid log-based synchronization here; clack output can fragment ANSI sequences and break matching. - send $'"'"'\r'"'"' 3.0 - wait_for_log "Configure skills now?" 120 true || true + # configure --section skills still runs the configure wizard. + wait_for_log "Where will the Gateway run?" 120 + send $'"'"'\r'"'"' 0.6 + wait_for_log "Configure skills now?" 120 send $'"'"'n\r'"'"' 0.8 send "" 2.0 } diff --git a/scripts/e2e/plugins-docker.sh b/scripts/e2e/plugins-docker.sh index 587840ec93a..632d6924099 100755 --- a/scripts/e2e/plugins-docker.sh +++ b/scripts/e2e/plugins-docker.sh @@ -8,7 +8,7 @@ echo "Building Docker image..." docker build -t "$IMAGE_NAME" -f "$ROOT_DIR/scripts/e2e/Dockerfile" "$ROOT_DIR" echo "Running plugins Docker E2E..." 
-docker run --rm -i "$IMAGE_NAME" bash -s <<'EOF' +docker run --rm -e COREPACK_ENABLE_DOWNLOAD_PROMPT=0 -i "$IMAGE_NAME" bash -s <<'EOF' set -euo pipefail if [ -f dist/index.mjs ]; then diff --git a/scripts/test-live-gateway-models-docker.sh b/scripts/test-live-gateway-models-docker.sh index f40e064910b..a3e1036171f 100755 --- a/scripts/test-live-gateway-models-docker.sh +++ b/scripts/test-live-gateway-models-docker.sh @@ -17,13 +17,20 @@ EXTERNAL_AUTH_MOUNTS=() for auth_dir in .claude .codex .minimax .qwen; do host_path="$HOME/$auth_dir" if [[ -d "$host_path" ]]; then - EXTERNAL_AUTH_MOUNTS+=(-v "$host_path":/home/node/"$auth_dir":ro) + EXTERNAL_AUTH_MOUNTS+=(-v "$host_path":/host-auth/"$auth_dir":ro) fi done read -r -d '' LIVE_TEST_CMD <<'EOF' || true set -euo pipefail [ -f "$HOME/.profile" ] && source "$HOME/.profile" || true +for auth_dir in .claude .codex .minimax .qwen; do + if [ -d "/host-auth/$auth_dir" ]; then + mkdir -p "$HOME/$auth_dir" + cp -R "/host-auth/$auth_dir/." "$HOME/$auth_dir" + chmod -R u+rwX "$HOME/$auth_dir" || true + fi +done tmp_dir="$(mktemp -d)" cleanup() { rm -rf "$tmp_dir" diff --git a/scripts/test-live-models-docker.sh b/scripts/test-live-models-docker.sh index 52257cd3230..c1cec5b2740 100755 --- a/scripts/test-live-models-docker.sh +++ b/scripts/test-live-models-docker.sh @@ -17,13 +17,20 @@ EXTERNAL_AUTH_MOUNTS=() for auth_dir in .claude .codex .minimax .qwen; do host_path="$HOME/$auth_dir" if [[ -d "$host_path" ]]; then - EXTERNAL_AUTH_MOUNTS+=(-v "$host_path":/home/node/"$auth_dir":ro) + EXTERNAL_AUTH_MOUNTS+=(-v "$host_path":/host-auth/"$auth_dir":ro) fi done read -r -d '' LIVE_TEST_CMD <<'EOF' || true set -euo pipefail [ -f "$HOME/.profile" ] && source "$HOME/.profile" || true +for auth_dir in .claude .codex .minimax .qwen; do + if [ -d "/host-auth/$auth_dir" ]; then + mkdir -p "$HOME/$auth_dir" + cp -R "/host-auth/$auth_dir/." 
"$HOME/$auth_dir" + chmod -R u+rwX "$HOME/$auth_dir" || true + fi +done tmp_dir="$(mktemp -d)" cleanup() { rm -rf "$tmp_dir" @@ -57,6 +64,9 @@ docker run --rm -t \ -e OPENCLAW_LIVE_MAX_MODELS="${OPENCLAW_LIVE_MAX_MODELS:-${CLAWDBOT_LIVE_MAX_MODELS:-48}}" \ -e OPENCLAW_LIVE_MODEL_TIMEOUT_MS="${OPENCLAW_LIVE_MODEL_TIMEOUT_MS:-${CLAWDBOT_LIVE_MODEL_TIMEOUT_MS:-}}" \ -e OPENCLAW_LIVE_REQUIRE_PROFILE_KEYS="${OPENCLAW_LIVE_REQUIRE_PROFILE_KEYS:-${CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS:-}}" \ + -e OPENCLAW_LIVE_GATEWAY_MODELS="${OPENCLAW_LIVE_GATEWAY_MODELS:-${CLAWDBOT_LIVE_GATEWAY_MODELS:-}}" \ + -e OPENCLAW_LIVE_GATEWAY_PROVIDERS="${OPENCLAW_LIVE_GATEWAY_PROVIDERS:-${CLAWDBOT_LIVE_GATEWAY_PROVIDERS:-}}" \ + -e OPENCLAW_LIVE_GATEWAY_MAX_MODELS="${OPENCLAW_LIVE_GATEWAY_MAX_MODELS:-${CLAWDBOT_LIVE_GATEWAY_MAX_MODELS:-}}" \ -v "$ROOT_DIR":/src:ro \ -v "$CONFIG_DIR":/home/node/.openclaw \ -v "$WORKSPACE_DIR":/home/node/.openclaw/workspace \ diff --git a/src/agents/auth-profiles.external-cli-sync.test.ts b/src/agents/auth-profiles.external-cli-sync.test.ts index 303b85b72d2..eae0fab70af 100644 --- a/src/agents/auth-profiles.external-cli-sync.test.ts +++ b/src/agents/auth-profiles.external-cli-sync.test.ts @@ -51,4 +51,40 @@ describe("syncExternalCliCredentials", () => { }); expect(store.profiles[CODEX_CLI_PROFILE_ID]).toBeUndefined(); }); + + it("refreshes stored Codex expiry from external CLI even when the cached profile looks fresh", () => { + const staleExpiry = Date.now() + 30 * 60_000; + const freshExpiry = Date.now() + 5 * 24 * 60 * 60_000; + mocks.readCodexCliCredentialsCached.mockReturnValue({ + type: "oauth", + provider: "openai-codex", + access: "new-access-token", + refresh: "new-refresh-token", + expires: freshExpiry, + accountId: "acct_456", + }); + + const store: AuthProfileStore = { + version: 1, + profiles: { + [OPENAI_CODEX_DEFAULT_PROFILE_ID]: { + type: "oauth", + provider: "openai-codex", + access: "old-access-token", + refresh: "old-refresh-token", + expires: 
staleExpiry, + accountId: "acct_456", + }, + }, + }; + + const mutated = syncExternalCliCredentials(store); + + expect(mutated).toBe(true); + expect(store.profiles[OPENAI_CODEX_DEFAULT_PROFILE_ID]).toMatchObject({ + access: "new-access-token", + refresh: "new-refresh-token", + expires: freshExpiry, + }); + }); }); diff --git a/src/agents/auth-profiles/external-cli-sync.ts b/src/agents/auth-profiles/external-cli-sync.ts index 7e490c97c94..ff43b586b48 100644 --- a/src/agents/auth-profiles/external-cli-sync.ts +++ b/src/agents/auth-profiles/external-cli-sync.ts @@ -4,13 +4,12 @@ import { readMiniMaxCliCredentialsCached, } from "../cli-credentials.js"; import { - EXTERNAL_CLI_NEAR_EXPIRY_MS, EXTERNAL_CLI_SYNC_TTL_MS, QWEN_CLI_PROFILE_ID, MINIMAX_CLI_PROFILE_ID, log, } from "./constants.js"; -import type { AuthProfileCredential, AuthProfileStore, OAuthCredential } from "./types.js"; +import type { AuthProfileStore, OAuthCredential } from "./types.js"; const OPENAI_CODEX_DEFAULT_PROFILE_ID = "openai-codex:default"; @@ -37,62 +36,33 @@ function shallowEqualOAuthCredentials(a: OAuthCredential | undefined, b: OAuthCr ); } -function isExternalProfileFresh(cred: AuthProfileCredential | undefined, now: number): boolean { - if (!cred) { - return false; - } - if (cred.type !== "oauth" && cred.type !== "token") { - return false; - } - if ( - cred.provider !== "qwen-portal" && - cred.provider !== "minimax-portal" && - cred.provider !== "openai-codex" - ) { - return false; - } - if (typeof cred.expires !== "number") { - return true; - } - return cred.expires > now + EXTERNAL_CLI_NEAR_EXPIRY_MS; -} - /** Sync external CLI credentials into the store for a given provider. 
*/ function syncExternalCliCredentialsForProvider( store: AuthProfileStore, profileId: string, provider: string, readCredentials: () => OAuthCredential | null, - now: number, options: ExternalCliSyncOptions, ): boolean { const existing = store.profiles[profileId]; - const shouldSync = - !existing || existing.provider !== provider || !isExternalProfileFresh(existing, now); - const creds = shouldSync ? readCredentials() : null; + const creds = readCredentials(); if (!creds) { return false; } const existingOAuth = existing?.type === "oauth" ? existing : undefined; - const shouldUpdate = - !existingOAuth || - existingOAuth.provider !== provider || - existingOAuth.expires <= now || - creds.expires > existingOAuth.expires; - - if (shouldUpdate && !shallowEqualOAuthCredentials(existingOAuth, creds)) { - store.profiles[profileId] = creds; - if (options.log !== false) { - log.info(`synced ${provider} credentials from external cli`, { - profileId, - expires: new Date(creds.expires).toISOString(), - }); - } - return true; + if (shallowEqualOAuthCredentials(existingOAuth, creds)) { + return false; } - return false; + store.profiles[profileId] = creds; + if (options.log !== false) { + log.info(`synced ${provider} credentials from external cli`, { + profileId, + expires: new Date(creds.expires).toISOString(), + }); + } + return true; } /** @@ -106,46 +76,24 @@ export function syncExternalCliCredentials( options: ExternalCliSyncOptions = {}, ): boolean { let mutated = false; - const now = Date.now(); - // Sync from Qwen Code CLI - const existingQwen = store.profiles[QWEN_CLI_PROFILE_ID]; - const shouldSyncQwen = - !existingQwen || - existingQwen.provider !== "qwen-portal" || - !isExternalProfileFresh(existingQwen, now); - const qwenCreds = shouldSyncQwen - ? readQwenCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS }) - : null; - if (qwenCreds) { - const existing = store.profiles[QWEN_CLI_PROFILE_ID]; - const existingOAuth = existing?.type === "oauth" ? 
existing : undefined; - const shouldUpdate = - !existingOAuth || - existingOAuth.provider !== "qwen-portal" || - existingOAuth.expires <= now || - qwenCreds.expires > existingOAuth.expires; - - if (shouldUpdate && !shallowEqualOAuthCredentials(existingOAuth, qwenCreds)) { - store.profiles[QWEN_CLI_PROFILE_ID] = qwenCreds; - mutated = true; - if (options.log !== false) { - log.info("synced qwen credentials from qwen cli", { - profileId: QWEN_CLI_PROFILE_ID, - expires: new Date(qwenCreds.expires).toISOString(), - }); - } - } + if ( + syncExternalCliCredentialsForProvider( + store, + QWEN_CLI_PROFILE_ID, + "qwen-portal", + () => readQwenCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS }), + options, + ) + ) { + mutated = true; } - - // Sync from MiniMax Portal CLI if ( syncExternalCliCredentialsForProvider( store, MINIMAX_CLI_PROFILE_ID, "minimax-portal", () => readMiniMaxCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS }), - now, options, ) ) { @@ -157,7 +105,6 @@ export function syncExternalCliCredentials( OPENAI_CODEX_DEFAULT_PROFILE_ID, "openai-codex", () => readCodexCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS }), - now, options, ) ) { diff --git a/src/agents/cli-credentials.test.ts b/src/agents/cli-credentials.test.ts index fcfaf21450d..53be1581b13 100644 --- a/src/agents/cli-credentials.test.ts +++ b/src/agents/cli-credentials.test.ts @@ -46,6 +46,12 @@ async function readCachedClaudeCliCredentials(allowKeychainPrompt: boolean) { }); } +function createJwtWithExp(expSeconds: number): string { + const encode = (value: Record<string, unknown>) => + Buffer.from(JSON.stringify(value)).toString("base64url"); + return `${encode({ alg: "RS256", typ: "JWT" })}.${encode({ exp: expSeconds })}.signature`; +} + describe("cli credentials", () => { beforeAll(async () => { ({ @@ -229,6 +235,7 @@ describe("cli credentials", () => { it("reads Codex credentials from keychain when available", async () => { const tempHome = fs.mkdtempSync(path.join(os.tmpdir(), 
"openclaw-codex-")); process.env.CODEX_HOME = tempHome; + const expSeconds = Math.floor(Date.parse("2026-03-23T00:48:49Z") / 1000); const accountHash = "cli|"; @@ -238,7 +245,7 @@ describe("cli credentials", () => { expect(cmd).toContain(accountHash); return JSON.stringify({ tokens: { - access_token: "keychain-access", + access_token: createJwtWithExp(expSeconds), refresh_token: "keychain-refresh", }, last_refresh: "2026-01-01T00:00:00Z", @@ -248,15 +255,17 @@ describe("cli credentials", () => { const creds = readCodexCliCredentials({ platform: "darwin", execSync: execSyncMock }); expect(creds).toMatchObject({ - access: "keychain-access", + access: createJwtWithExp(expSeconds), refresh: "keychain-refresh", provider: "openai-codex", + expires: expSeconds * 1000, }); }); it("falls back to Codex auth.json when keychain is unavailable", async () => { const tempHome = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-codex-")); process.env.CODEX_HOME = tempHome; + const expSeconds = Math.floor(Date.parse("2026-03-24T12:34:56Z") / 1000); execSyncMock.mockImplementation(() => { throw new Error("not found"); }); @@ -267,7 +276,7 @@ describe("cli credentials", () => { authPath, JSON.stringify({ tokens: { - access_token: "file-access", + access_token: createJwtWithExp(expSeconds), refresh_token: "file-refresh", }, }), @@ -277,9 +286,10 @@ describe("cli credentials", () => { const creds = readCodexCliCredentials({ execSync: execSyncMock }); expect(creds).toMatchObject({ - access: "file-access", + access: createJwtWithExp(expSeconds), refresh: "file-refresh", provider: "openai-codex", + expires: expSeconds * 1000, }); }); }); diff --git a/src/agents/cli-credentials.ts b/src/agents/cli-credentials.ts index 0d6d7c28c84..8ded765346a 100644 --- a/src/agents/cli-credentials.ts +++ b/src/agents/cli-credentials.ts @@ -153,6 +153,22 @@ function computeCodexKeychainAccount(codexHome: string) { return `cli|${hash.slice(0, 16)}`; } +function decodeJwtExpiryMs(token: string): number | null 
{ + const parts = token.split("."); + if (parts.length < 2) { + return null; + } + try { + const payloadRaw = Buffer.from(parts[1], "base64url").toString("utf8"); + const payload = JSON.parse(payloadRaw) as { exp?: unknown }; + return typeof payload.exp === "number" && Number.isFinite(payload.exp) && payload.exp > 0 + ? payload.exp * 1000 + : null; + } catch { + return null; + } +} + function readCodexKeychainCredentials(options?: { platform?: NodeJS.Platform; execSync?: ExecSyncFn; @@ -193,9 +209,10 @@ function readCodexKeychainCredentials(options?: { typeof lastRefreshRaw === "string" || typeof lastRefreshRaw === "number" ? new Date(lastRefreshRaw).getTime() : Date.now(); - const expires = Number.isFinite(lastRefresh) + const fallbackExpiry = Number.isFinite(lastRefresh) ? lastRefresh + 60 * 60 * 1000 : Date.now() + 60 * 60 * 1000; + const expires = decodeJwtExpiryMs(accessToken) ?? fallbackExpiry; const accountId = typeof tokens?.account_id === "string" ? tokens.account_id : undefined; log.info("read codex credentials from keychain", { @@ -483,13 +500,14 @@ export function readCodexCliCredentials(options?: { return null; } - let expires: number; + let fallbackExpiry: number; try { const stat = fs.statSync(authPath); - expires = stat.mtimeMs + 60 * 60 * 1000; + fallbackExpiry = stat.mtimeMs + 60 * 60 * 1000; } catch { - expires = Date.now() + 60 * 60 * 1000; + fallbackExpiry = Date.now() + 60 * 60 * 1000; } + const expires = decodeJwtExpiryMs(accessToken) ?? 
fallbackExpiry; return { type: "oauth", diff --git a/src/agents/models.profiles.live.test.ts b/src/agents/models.profiles.live.test.ts index 515d2b48ce6..87cbbb6a203 100644 --- a/src/agents/models.profiles.live.test.ts +++ b/src/agents/models.profiles.live.test.ts @@ -117,6 +117,10 @@ function isChatGPTUsageLimitErrorMessage(raw: string): boolean { return msg.includes("hit your chatgpt usage limit") && msg.includes("try again in"); } +function isRefreshTokenReused(raw: string): boolean { + return /refresh_token_reused/i.test(raw); +} + function isInstructionsRequiredError(raw: string): boolean { return /instructions are required/i.test(raw); } @@ -643,6 +647,15 @@ describeLive("live models (profile keys)", () => { logProgress(`${progressLabel}: skip (rate limit)`); break; } + if ( + allowNotFoundSkip && + model.provider === "openai-codex" && + isRefreshTokenReused(message) + ) { + skipped.push({ model: id, reason: message }); + logProgress(`${progressLabel}: skip (codex refresh token reused)`); + break; + } if ( allowNotFoundSkip && model.provider === "openai-codex" && diff --git a/src/gateway/gateway-models.profiles.live.test.ts b/src/gateway/gateway-models.profiles.live.test.ts index 6a74c98da3b..973cf952d16 100644 --- a/src/gateway/gateway-models.profiles.live.test.ts +++ b/src/gateway/gateway-models.profiles.live.test.ts @@ -24,7 +24,7 @@ import { shouldSuppressBuiltInModel } from "../agents/model-suppression.js"; import { ensureOpenClawModelsJson } from "../agents/models-config.js"; import { isRateLimitErrorMessage } from "../agents/pi-embedded-helpers/errors.js"; import { discoverAuthStorage, discoverModels } from "../agents/pi-model-discovery.js"; -import { loadConfig } from "../config/config.js"; +import { clearRuntimeConfigSnapshot, loadConfig } from "../config/config.js"; import type { ModelsConfig, OpenClawConfig, ModelProviderConfig } from "../config/types.js"; import { isTruthyEnvValue } from "../infra/env.js"; import { DEFAULT_AGENT_ID } from 
"../routing/session-key.js"; @@ -38,7 +38,7 @@ import { shouldRetryToolReadProbe, } from "./live-tool-probe-utils.js"; import { startGatewayServer } from "./server.js"; -import { extractPayloadText } from "./test-helpers.agent-results.js"; +import { loadSessionEntry, readSessionMessages } from "./session-utils.js"; const LIVE = isTruthyEnvValue(process.env.LIVE) || isTruthyEnvValue(process.env.OPENCLAW_LIVE_TEST); const GATEWAY_LIVE = isTruthyEnvValue(process.env.OPENCLAW_LIVE_GATEWAY); @@ -171,6 +171,32 @@ function logProgress(message: string): void { console.log(`[live] ${message}`); } +function enterProductionEnvForLiveRun() { + const previous = { + vitest: process.env.VITEST, + nodeEnv: process.env.NODE_ENV, + }; + delete process.env.VITEST; + process.env.NODE_ENV = "production"; + return previous; +} + +function restoreProductionEnvForLiveRun(previous: { + vitest: string | undefined; + nodeEnv: string | undefined; +}) { + if (previous.vitest === undefined) { + delete process.env.VITEST; + } else { + process.env.VITEST = previous.vitest; + } + if (previous.nodeEnv === undefined) { + delete process.env.NODE_ENV; + } else { + process.env.NODE_ENV = previous.nodeEnv; + } +} + function formatFailurePreview( failures: Array<{ model: string; error: string }>, maxItems: number, @@ -319,25 +345,14 @@ async function runAnthropicRefusalProbe(params: { }): Promise { logProgress(`${params.label}: refusal-probe`); const magic = buildAnthropicRefusalToken(); - const runId = randomUUID(); - const probe = await withGatewayLiveProbeTimeout( - params.client.request( - "agent", - { - sessionKey: params.sessionKey, - idempotencyKey: `idem-${runId}-refusal`, - message: `Reply with the single word ok. 
Test token: ${magic}`, - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, - ), - `${params.label}: refusal-probe`, - ); - if (probe?.status !== "ok") { - throw new Error(`refusal probe failed: status=${String(probe?.status)}`); - } - const probeText = extractPayloadText(probe?.result); + const probeText = await requestGatewayAgentText({ + client: params.client, + sessionKey: params.sessionKey, + idempotencyKey: `idem-${randomUUID()}-refusal`, + message: `Reply with the single word ok. Test token: ${magic}`, + thinkingLevel: params.thinkingLevel, + context: `${params.label}: refusal-probe`, + }); assertNoReasoningTags({ text: probeText, model: params.modelKey, @@ -348,25 +363,14 @@ async function runAnthropicRefusalProbe(params: { throw new Error(`refusal probe missing ok: ${probeText}`); } - const followupId = randomUUID(); - const followup = await withGatewayLiveProbeTimeout( - params.client.request( - "agent", - { - sessionKey: params.sessionKey, - idempotencyKey: `idem-${followupId}-refusal-followup`, - message: "Now reply with exactly: still ok.", - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, - ), - `${params.label}: refusal-followup`, - ); - if (followup?.status !== "ok") { - throw new Error(`refusal followup failed: status=${String(followup?.status)}`); - } - const followupText = extractPayloadText(followup?.result); + const followupText = await requestGatewayAgentText({ + client: params.client, + sessionKey: params.sessionKey, + idempotencyKey: `idem-${randomUUID()}-refusal-followup`, + message: "Now reply with exactly: still ok.", + thinkingLevel: params.thinkingLevel, + context: `${params.label}: refusal-followup`, + }); assertNoReasoningTags({ text: followupText, model: params.modelKey, @@ -475,11 +479,6 @@ async function getFreeGatewayPort(): Promise { throw new Error("failed to acquire a free gateway port block"); } -type AgentFinalPayload = { - status?: unknown; - result?: unknown; -}; 
- async function connectClient(params: { url: string; token: string }) { return await new Promise((resolve, reject) => { let settled = false; @@ -513,6 +512,115 @@ async function connectClient(params: { url: string; token: string }) { }); } +function extractTranscriptMessageText(message: unknown): string { + if (!message || typeof message !== "object") { + return ""; + } + const record = message as { + text?: unknown; + content?: unknown; + }; + if (typeof record.text === "string" && record.text.trim()) { + return record.text.trim(); + } + if (typeof record.content === "string" && record.content.trim()) { + return record.content.trim(); + } + if (!Array.isArray(record.content)) { + return ""; + } + return record.content + .map((entry) => { + if (!entry || typeof entry !== "object") { + return ""; + } + const text = (entry as { text?: unknown }).text; + return typeof text === "string" && text.trim() ? text.trim() : ""; + }) + .filter(Boolean) + .join("\n") + .trim(); +} + +function readSessionAssistantTexts(sessionKey: string): string[] { + const { storePath, entry } = loadSessionEntry(sessionKey); + if (!entry?.sessionId) { + return []; + } + const messages = readSessionMessages(entry.sessionId, storePath, entry.sessionFile); + const assistantTexts: string[] = []; + for (const message of messages) { + if (!message || typeof message !== "object") { + continue; + } + const role = (message as { role?: unknown }).role; + if (role !== "assistant") { + continue; + } + assistantTexts.push(extractTranscriptMessageText(message)); + } + return assistantTexts; +} + +async function waitForSessionAssistantText(params: { + sessionKey: string; + baselineAssistantCount: number; + context: string; +}) { + const startedAt = Date.now(); + let delayMs = 50; + while (Date.now() - startedAt < GATEWAY_LIVE_PROBE_TIMEOUT_MS) { + const assistantTexts = readSessionAssistantTexts(params.sessionKey); + if (assistantTexts.length > params.baselineAssistantCount) { + const freshText = 
assistantTexts + .slice(params.baselineAssistantCount) + .map((text) => text.trim()) + .findLast((text) => text.length > 0); + if (freshText) { + return freshText; + } + } + await new Promise((resolve) => setTimeout(resolve, delayMs)); + delayMs = Math.min(delayMs * 2, 250); + } + throw new Error(`probe timeout after ${GATEWAY_LIVE_PROBE_TIMEOUT_MS}ms (${params.context})`); +} + +async function requestGatewayAgentText(params: { + client: GatewayClient; + sessionKey: string; + message: string; + thinkingLevel: string; + context: string; + idempotencyKey: string; + attachments?: Array<{ + mimeType: string; + fileName: string; + content: string; + }>; +}) { + const baselineAssistantCount = readSessionAssistantTexts(params.sessionKey).length; + const accepted = await withGatewayLiveProbeTimeout( + params.client.request<{ runId?: unknown; status?: unknown }>("agent", { + sessionKey: params.sessionKey, + idempotencyKey: params.idempotencyKey, + message: params.message, + thinking: params.thinkingLevel, + deliver: false, + attachments: params.attachments, + }), + `${params.context}: agent-accept`, + ); + if (accepted?.status !== "accepted") { + throw new Error(`agent status=${String(accepted?.status)}`); + } + return await waitForSessionAssistantText({ + sessionKey: params.sessionKey, + baselineAssistantCount, + context: `${params.context}: transcript-final`, + }); +} + type GatewayModelSuiteParams = { label: string; cfg: OpenClawConfig; @@ -636,6 +744,8 @@ function buildMinimaxProviderOverride(params: { } async function runGatewayModelSuite(params: GatewayModelSuiteParams) { + clearRuntimeConfigSnapshot(); + const runtimeEnv = enterProductionEnvForLiveRun(); const previous = { configPath: process.env.OPENCLAW_CONFIG_PATH, token: process.env.OPENCLAW_GATEWAY_TOKEN, @@ -793,48 +903,26 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { ); logProgress(`${progressLabel}: prompt`); - const runId = randomUUID(); - const payload = await 
withGatewayLiveProbeTimeout( - client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runId}`, - message: - "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, - ), - `${progressLabel}: prompt`, - ); - - if (payload?.status !== "ok") { - throw new Error(`agent status=${String(payload?.status)}`); - } - let text = extractPayloadText(payload?.result); + let text = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${randomUUID()}`, + message: + "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", + thinkingLevel: params.thinkingLevel, + context: `${progressLabel}: prompt`, + }); if (!text) { logProgress(`${progressLabel}: empty response, retrying`); - const retry = await withGatewayLiveProbeTimeout( - client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${randomUUID()}-retry`, - message: - "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.", - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, - ), - `${progressLabel}: prompt-retry`, - ); - if (retry?.status !== "ok") { - throw new Error(`agent status=${String(retry?.status)}`); - } - text = extractPayloadText(retry?.result); + text = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${randomUUID()}-retry`, + message: + "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. 
Must mention both words: microtask and macrotask.", + thinkingLevel: params.thinkingLevel, + context: `${progressLabel}: prompt-retry`, + }); } if (!text && isGoogleishProvider(model.provider)) { logProgress(`${progressLabel}: skip (google empty response)`); @@ -881,36 +969,20 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { toolReadAttempt += 1 ) { const strictReply = toolReadAttempt > 0; - const toolProbe = await withGatewayLiveProbeTimeout( - client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`, - message: strictReply - ? "OpenClaw live tool probe (local, safe): " + - `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + - `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.` - : "OpenClaw live tool probe (local, safe): " + - `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + - "Then reply with the two nonce values you read (include both).", - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, - ), - `${progressLabel}: tool-read`, - ); - if (toolProbe?.status !== "ok") { - if (toolReadAttempt + 1 < maxToolReadAttempts) { - logProgress( - `${progressLabel}: tool-read retry (${toolReadAttempt + 2}/${maxToolReadAttempts}) status=${String(toolProbe?.status)}`, - ); - continue; - } - throw new Error(`tool probe failed: status=${String(toolProbe?.status)}`); - } - toolText = extractPayloadText(toolProbe?.result); + toolText = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`, + message: strictReply + ? "OpenClaw live tool probe (local, safe): " + + `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + + `Then reply with exactly: ${nonceA} ${nonceB}. 
No extra text.` + : "OpenClaw live tool probe (local, safe): " + + `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` + + "Then reply with the two nonce values you read (include both).", + thinkingLevel: params.thinkingLevel, + context: `${progressLabel}: tool-read`, + }); if ( isEmptyStreamText(toolText) && (model.provider === "minimax" || model.provider === "openai-codex") @@ -960,40 +1032,24 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { execReadAttempt += 1 ) { const strictReply = execReadAttempt > 0; - const execReadProbe = await withGatewayLiveProbeTimeout( - client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`, - message: strictReply - ? "OpenClaw live tool probe (local, safe): " + - "use the tool named `exec` (or `Exec`) to run this command: " + - `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + - `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` + - `Then reply with exactly: ${nonceC}. No extra text.` - : "OpenClaw live tool probe (local, safe): " + - "use the tool named `exec` (or `Exec`) to run this command: " + - `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + - `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. 
` + - "Finally reply including the nonce text you read back.", - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, - ), - `${progressLabel}: tool-exec`, - ); - if (execReadProbe?.status !== "ok") { - if (execReadAttempt + 1 < maxExecReadAttempts) { - logProgress( - `${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) status=${String(execReadProbe?.status)}`, - ); - continue; - } - throw new Error(`exec+read probe failed: status=${String(execReadProbe?.status)}`); - } - execReadText = extractPayloadText(execReadProbe?.result); + execReadText = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`, + message: strictReply + ? "OpenClaw live tool probe (local, safe): " + + "use the tool named `exec` (or `Exec`) to run this command: " + + `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + + `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` + + `Then reply with exactly: ${nonceC}. No extra text.` + : "OpenClaw live tool probe (local, safe): " + + "use the tool named `exec` (or `Exec`) to run this command: " + + `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` + + `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. 
` + + "Finally reply including the nonce text you read back.", + thinkingLevel: params.thinkingLevel, + context: `${progressLabel}: tool-exec`, + }); if ( isEmptyStreamText(execReadText) && (model.provider === "minimax" || model.provider === "openai-codex") @@ -1040,62 +1096,51 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { const imageBase64 = renderCatNoncePngBase64(imageCode); const runIdImage = randomUUID(); - const imageProbe = await withGatewayLiveProbeTimeout( - client.request( - "agent", + const imageText = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${runIdImage}-image`, + message: + "Look at the attached image. Reply with exactly two tokens separated by a single space: " + + "(1) the animal shown or written in the image, lowercase; " + + "(2) the code printed in the image, uppercase. No extra text.", + attachments: [ { - sessionKey, - idempotencyKey: `idem-${runIdImage}-image`, - message: - "Look at the attached image. Reply with exactly two tokens separated by a single space: " + - "(1) the animal shown or written in the image, lowercase; " + - "(2) the code printed in the image, uppercase. No extra text.", - attachments: [ - { - mimeType: "image/png", - fileName: `probe-${runIdImage}.png`, - content: imageBase64, - }, - ], - thinking: params.thinkingLevel, - deliver: false, + mimeType: "image/png", + fileName: `probe-${runIdImage}.png`, + content: imageBase64, }, - { expectFinal: true }, - ), - `${progressLabel}: image`, - ); + ], + thinkingLevel: params.thinkingLevel, + context: `${progressLabel}: image`, + }); // Best-effort: do not fail the whole live suite on flaky image handling. // (We still keep prompt + tool probes as hard checks.) 
- if (imageProbe?.status !== "ok") { - logProgress(`${progressLabel}: image skip (status=${String(imageProbe?.status)})`); + if ( + isEmptyStreamText(imageText) && + (model.provider === "minimax" || model.provider === "openai-codex") + ) { + logProgress(`${progressLabel}: image skip (${model.provider} empty response)`); } else { - const imageText = extractPayloadText(imageProbe?.result); - if ( - isEmptyStreamText(imageText) && - (model.provider === "minimax" || model.provider === "openai-codex") - ) { - logProgress(`${progressLabel}: image skip (${model.provider} empty response)`); + assertNoReasoningTags({ + text: imageText, + model: modelKey, + phase: "image", + label: params.label, + }); + if (!/\bcat\b/i.test(imageText)) { + logProgress(`${progressLabel}: image skip (missing 'cat')`); } else { - assertNoReasoningTags({ - text: imageText, - model: modelKey, - phase: "image", - label: params.label, - }); - if (!/\bcat\b/i.test(imageText)) { - logProgress(`${progressLabel}: image skip (missing 'cat')`); - } else { - const candidates = imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? []; - const bestDistance = candidates.reduce((best, cand) => { - if (Math.abs(cand.length - imageCode.length) > 2) { - return best; - } - return Math.min(best, editDistance(cand, imageCode)); - }, Number.POSITIVE_INFINITY); - // OCR / image-read flake: allow a small edit distance, but still require the "cat" token above. - if (!(bestDistance <= 3)) { - logProgress(`${progressLabel}: image skip (code mismatch)`); + const candidates = imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? []; + const bestDistance = candidates.reduce((best, cand) => { + if (Math.abs(cand.length - imageCode.length) > 2) { + return best; } + return Math.min(best, editDistance(cand, imageCode)); + }, Number.POSITIVE_INFINITY); + // OCR / image-read flake: allow a small edit distance, but still require the "cat" token above. 
+ if (!(bestDistance <= 3)) { + logProgress(`${progressLabel}: image skip (code mismatch)`); } } } @@ -1108,24 +1153,14 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { ) { logProgress(`${progressLabel}: tool-only regression`); const runId2 = randomUUID(); - const first = await withGatewayLiveProbeTimeout( - client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runId2}-1`, - message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`, - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, - ), - `${progressLabel}: tool-only-regression-first`, - ); - if (first?.status !== "ok") { - throw new Error(`tool-only turn failed: status=${String(first?.status)}`); - } - const firstText = extractPayloadText(first?.result); + const firstText = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${runId2}-1`, + message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`, + thinkingLevel: params.thinkingLevel, + context: `${progressLabel}: tool-only-regression-first`, + }); assertNoReasoningTags({ text: firstText, model: modelKey, @@ -1133,24 +1168,14 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { label: params.label, }); - const second = await withGatewayLiveProbeTimeout( - client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runId2}-2`, - message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? 
Reply with exactly: ${nonceA} ${nonceB}.`, - thinking: params.thinkingLevel, - deliver: false, - }, - { expectFinal: true }, - ), - `${progressLabel}: tool-only-regression-second`, - ); - if (second?.status !== "ok") { - throw new Error(`post-tool message failed: status=${String(second?.status)}`); - } - const reply = extractPayloadText(second?.result); + const reply = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${runId2}-2`, + message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`, + thinkingLevel: params.thinkingLevel, + context: `${progressLabel}: tool-only-regression-second`, + }); assertNoReasoningTags({ text: reply, model: modelKey, @@ -1290,6 +1315,8 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) { logProgress(`[${params.label}] skipped all models (missing profiles)`); } } finally { + clearRuntimeConfigSnapshot(); + restoreProductionEnvForLiveRun(runtimeEnv); client.stop(); await server.close({ reason: "live test complete" }); await fs.rm(toolProbePath, { force: true }); @@ -1317,6 +1344,7 @@ describeLive("gateway live (dev agent, profile keys)", () => { it( "runs meaningful prompts across models with available keys", async () => { + clearRuntimeConfigSnapshot(); const cfg = loadConfig(); await ensureOpenClawModelsJson(cfg); @@ -1422,6 +1450,8 @@ describeLive("gateway live (dev agent, profile keys)", () => { if (!ZAI_FALLBACK) { return; } + clearRuntimeConfigSnapshot(); + const runtimeEnv = enterProductionEnvForLiveRun(); const previous = { configPath: process.env.OPENCLAW_CONFIG_PATH, token: process.env.OPENCLAW_GATEWAY_TOKEN, @@ -1520,27 +1550,16 @@ describeLive("gateway live (dev agent, profile keys)", () => { "zai-fallback: sessions-reset", ); - const runId = randomUUID(); - const toolProbe = await withGatewayLiveProbeTimeout( - client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${runId}-tool`, - message: 
- `Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` + - `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`, - thinking: THINKING_LEVEL, - deliver: false, - }, - { expectFinal: true }, - ), - "zai-fallback: tool-probe", - ); - if (toolProbe?.status !== "ok") { - throw new Error(`anthropic tool probe failed: status=${String(toolProbe?.status)}`); - } - const toolText = extractPayloadText(toolProbe?.result); + const toolText = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${randomUUID()}-tool`, + message: + `Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` + + `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`, + thinkingLevel: THINKING_LEVEL, + context: "zai-fallback: tool-probe", + }); assertNoReasoningTags({ text: toolText, model: "anthropic/claude-opus-4-5", @@ -1559,27 +1578,16 @@ describeLive("gateway live (dev agent, profile keys)", () => { "zai-fallback: sessions-patch-zai", ); - const followupId = randomUUID(); - const followup = await withGatewayLiveProbeTimeout( - client.request( - "agent", - { - sessionKey, - idempotencyKey: `idem-${followupId}-followup`, - message: - `What are the values of nonceA and nonceB in "${toolProbePath}"? ` + - `Reply with exactly: ${nonceA} ${nonceB}.`, - thinking: THINKING_LEVEL, - deliver: false, - }, - { expectFinal: true }, - ), - "zai-fallback: followup", - ); - if (followup?.status !== "ok") { - throw new Error(`zai followup failed: status=${String(followup?.status)}`); - } - const followupText = extractPayloadText(followup?.result); + const followupText = await requestGatewayAgentText({ + client, + sessionKey, + idempotencyKey: `idem-${randomUUID()}-followup`, + message: + `What are the values of nonceA and nonceB in "${toolProbePath}"? 
` + + `Reply with exactly: ${nonceA} ${nonceB}.`, + thinkingLevel: THINKING_LEVEL, + context: "zai-fallback: followup", + }); assertNoReasoningTags({ text: followupText, model: "zai/glm-4.7", @@ -1590,6 +1598,8 @@ describeLive("gateway live (dev agent, profile keys)", () => { throw new Error(`zai followup missing nonce: ${followupText}`); } } finally { + clearRuntimeConfigSnapshot(); + restoreProductionEnvForLiveRun(runtimeEnv); client.stop(); await server.close({ reason: "live test complete" }); await fs.rm(toolProbePath, { force: true });