fix: stabilize docker test suite

This commit is contained in:
Peter Steinberger
2026-03-17 03:01:11 +00:00
parent ed248c76c7
commit 1ffe8fde84
17 changed files with 450 additions and 382 deletions

View File

@@ -51,4 +51,40 @@ describe("syncExternalCliCredentials", () => {
});
expect(store.profiles[CODEX_CLI_PROFILE_ID]).toBeUndefined();
});
it("refreshes stored Codex expiry from external CLI even when the cached profile looks fresh", () => {
  // Cached profile still has ~30 minutes left, so it would pass a naive
  // freshness check; the external CLI holds a much later (~5 day) expiry.
  const cachedExpiry = Date.now() + 30 * 60_000;
  const cliExpiry = Date.now() + 5 * 24 * 60 * 60_000;
  mocks.readCodexCliCredentialsCached.mockReturnValue({
    type: "oauth",
    provider: "openai-codex",
    access: "new-access-token",
    refresh: "new-refresh-token",
    expires: cliExpiry,
    accountId: "acct_456",
  });
  const store: AuthProfileStore = {
    version: 1,
    profiles: {
      [OPENAI_CODEX_DEFAULT_PROFILE_ID]: {
        type: "oauth",
        provider: "openai-codex",
        access: "old-access-token",
        refresh: "old-refresh-token",
        expires: cachedExpiry,
        accountId: "acct_456",
      },
    },
  };

  // The sync must still overwrite the cached tokens with the CLI's newer ones.
  const didMutate = syncExternalCliCredentials(store);
  expect(didMutate).toBe(true);
  expect(store.profiles[OPENAI_CODEX_DEFAULT_PROFILE_ID]).toMatchObject({
    access: "new-access-token",
    refresh: "new-refresh-token",
    expires: cliExpiry,
  });
});
});

View File

@@ -4,13 +4,12 @@ import {
readMiniMaxCliCredentialsCached,
} from "../cli-credentials.js";
import {
EXTERNAL_CLI_NEAR_EXPIRY_MS,
EXTERNAL_CLI_SYNC_TTL_MS,
QWEN_CLI_PROFILE_ID,
MINIMAX_CLI_PROFILE_ID,
log,
} from "./constants.js";
import type { AuthProfileCredential, AuthProfileStore, OAuthCredential } from "./types.js";
import type { AuthProfileStore, OAuthCredential } from "./types.js";
const OPENAI_CODEX_DEFAULT_PROFILE_ID = "openai-codex:default";
@@ -37,62 +36,33 @@ function shallowEqualOAuthCredentials(a: OAuthCredential | undefined, b: OAuthCr
);
}
function isExternalProfileFresh(cred: AuthProfileCredential | undefined, now: number): boolean {
if (!cred) {
return false;
}
if (cred.type !== "oauth" && cred.type !== "token") {
return false;
}
if (
cred.provider !== "qwen-portal" &&
cred.provider !== "minimax-portal" &&
cred.provider !== "openai-codex"
) {
return false;
}
if (typeof cred.expires !== "number") {
return true;
}
return cred.expires > now + EXTERNAL_CLI_NEAR_EXPIRY_MS;
}
/** Sync external CLI credentials into the store for a given provider. */
function syncExternalCliCredentialsForProvider(
store: AuthProfileStore,
profileId: string,
provider: string,
readCredentials: () => OAuthCredential | null,
now: number,
options: ExternalCliSyncOptions,
): boolean {
const existing = store.profiles[profileId];
const shouldSync =
!existing || existing.provider !== provider || !isExternalProfileFresh(existing, now);
const creds = shouldSync ? readCredentials() : null;
const creds = readCredentials();
if (!creds) {
return false;
}
const existingOAuth = existing?.type === "oauth" ? existing : undefined;
const shouldUpdate =
!existingOAuth ||
existingOAuth.provider !== provider ||
existingOAuth.expires <= now ||
creds.expires > existingOAuth.expires;
if (shouldUpdate && !shallowEqualOAuthCredentials(existingOAuth, creds)) {
store.profiles[profileId] = creds;
if (options.log !== false) {
log.info(`synced ${provider} credentials from external cli`, {
profileId,
expires: new Date(creds.expires).toISOString(),
});
}
return true;
if (shallowEqualOAuthCredentials(existingOAuth, creds)) {
return false;
}
return false;
store.profiles[profileId] = creds;
if (options.log !== false) {
log.info(`synced ${provider} credentials from external cli`, {
profileId,
expires: new Date(creds.expires).toISOString(),
});
}
return true;
}
/**
@@ -106,46 +76,24 @@ export function syncExternalCliCredentials(
options: ExternalCliSyncOptions = {},
): boolean {
let mutated = false;
const now = Date.now();
// Sync from Qwen Code CLI
const existingQwen = store.profiles[QWEN_CLI_PROFILE_ID];
const shouldSyncQwen =
!existingQwen ||
existingQwen.provider !== "qwen-portal" ||
!isExternalProfileFresh(existingQwen, now);
const qwenCreds = shouldSyncQwen
? readQwenCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS })
: null;
if (qwenCreds) {
const existing = store.profiles[QWEN_CLI_PROFILE_ID];
const existingOAuth = existing?.type === "oauth" ? existing : undefined;
const shouldUpdate =
!existingOAuth ||
existingOAuth.provider !== "qwen-portal" ||
existingOAuth.expires <= now ||
qwenCreds.expires > existingOAuth.expires;
if (shouldUpdate && !shallowEqualOAuthCredentials(existingOAuth, qwenCreds)) {
store.profiles[QWEN_CLI_PROFILE_ID] = qwenCreds;
mutated = true;
if (options.log !== false) {
log.info("synced qwen credentials from qwen cli", {
profileId: QWEN_CLI_PROFILE_ID,
expires: new Date(qwenCreds.expires).toISOString(),
});
}
}
if (
syncExternalCliCredentialsForProvider(
store,
QWEN_CLI_PROFILE_ID,
"qwen-portal",
() => readQwenCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS }),
options,
)
) {
mutated = true;
}
// Sync from MiniMax Portal CLI
if (
syncExternalCliCredentialsForProvider(
store,
MINIMAX_CLI_PROFILE_ID,
"minimax-portal",
() => readMiniMaxCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS }),
now,
options,
)
) {
@@ -157,7 +105,6 @@ export function syncExternalCliCredentials(
OPENAI_CODEX_DEFAULT_PROFILE_ID,
"openai-codex",
() => readCodexCliCredentialsCached({ ttlMs: EXTERNAL_CLI_SYNC_TTL_MS }),
now,
options,
)
) {

View File

@@ -46,6 +46,12 @@ async function readCachedClaudeCliCredentials(allowKeychainPrompt: boolean) {
});
}
/** Build an unsigned three-part fake JWT whose payload carries the given `exp` claim (seconds). */
function createJwtWithExp(expSeconds: number): string {
  const toBase64Url = (obj: Record<string, unknown>): string =>
    Buffer.from(JSON.stringify(obj)).toString("base64url");
  const header = toBase64Url({ alg: "RS256", typ: "JWT" });
  const payload = toBase64Url({ exp: expSeconds });
  return [header, payload, "signature"].join(".");
}
describe("cli credentials", () => {
beforeAll(async () => {
({
@@ -229,6 +235,7 @@ describe("cli credentials", () => {
it("reads Codex credentials from keychain when available", async () => {
const tempHome = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-codex-"));
process.env.CODEX_HOME = tempHome;
const expSeconds = Math.floor(Date.parse("2026-03-23T00:48:49Z") / 1000);
const accountHash = "cli|";
@@ -238,7 +245,7 @@ describe("cli credentials", () => {
expect(cmd).toContain(accountHash);
return JSON.stringify({
tokens: {
access_token: "keychain-access",
access_token: createJwtWithExp(expSeconds),
refresh_token: "keychain-refresh",
},
last_refresh: "2026-01-01T00:00:00Z",
@@ -248,15 +255,17 @@ describe("cli credentials", () => {
const creds = readCodexCliCredentials({ platform: "darwin", execSync: execSyncMock });
expect(creds).toMatchObject({
access: "keychain-access",
access: createJwtWithExp(expSeconds),
refresh: "keychain-refresh",
provider: "openai-codex",
expires: expSeconds * 1000,
});
});
it("falls back to Codex auth.json when keychain is unavailable", async () => {
const tempHome = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-codex-"));
process.env.CODEX_HOME = tempHome;
const expSeconds = Math.floor(Date.parse("2026-03-24T12:34:56Z") / 1000);
execSyncMock.mockImplementation(() => {
throw new Error("not found");
});
@@ -267,7 +276,7 @@ describe("cli credentials", () => {
authPath,
JSON.stringify({
tokens: {
access_token: "file-access",
access_token: createJwtWithExp(expSeconds),
refresh_token: "file-refresh",
},
}),
@@ -277,9 +286,10 @@ describe("cli credentials", () => {
const creds = readCodexCliCredentials({ execSync: execSyncMock });
expect(creds).toMatchObject({
access: "file-access",
access: createJwtWithExp(expSeconds),
refresh: "file-refresh",
provider: "openai-codex",
expires: expSeconds * 1000,
});
});
});

View File

@@ -153,6 +153,22 @@ function computeCodexKeychainAccount(codexHome: string) {
return `cli|${hash.slice(0, 16)}`;
}
/**
 * Extract the `exp` claim from a JWT access token and convert it to epoch
 * milliseconds. Returns null for anything that is not a decodable JWT with a
 * positive, finite numeric expiry.
 */
function decodeJwtExpiryMs(token: string): number | null {
  // A JWT needs at least header.payload; anything shorter cannot carry claims.
  const segments = token.split(".");
  if (segments.length < 2) {
    return null;
  }
  try {
    const claims = JSON.parse(
      Buffer.from(segments[1], "base64url").toString("utf8"),
    ) as { exp?: unknown };
    const exp = claims.exp;
    // Only a positive, finite numeric `exp` (seconds) is usable; scale to ms.
    if (typeof exp !== "number" || !Number.isFinite(exp) || exp <= 0) {
      return null;
    }
    return exp * 1000;
  } catch {
    // Malformed base64url or non-JSON payloads are treated as "no expiry".
    return null;
  }
}
function readCodexKeychainCredentials(options?: {
platform?: NodeJS.Platform;
execSync?: ExecSyncFn;
@@ -193,9 +209,10 @@ function readCodexKeychainCredentials(options?: {
typeof lastRefreshRaw === "string" || typeof lastRefreshRaw === "number"
? new Date(lastRefreshRaw).getTime()
: Date.now();
const expires = Number.isFinite(lastRefresh)
const fallbackExpiry = Number.isFinite(lastRefresh)
? lastRefresh + 60 * 60 * 1000
: Date.now() + 60 * 60 * 1000;
const expires = decodeJwtExpiryMs(accessToken) ?? fallbackExpiry;
const accountId = typeof tokens?.account_id === "string" ? tokens.account_id : undefined;
log.info("read codex credentials from keychain", {
@@ -483,13 +500,14 @@ export function readCodexCliCredentials(options?: {
return null;
}
let expires: number;
let fallbackExpiry: number;
try {
const stat = fs.statSync(authPath);
expires = stat.mtimeMs + 60 * 60 * 1000;
fallbackExpiry = stat.mtimeMs + 60 * 60 * 1000;
} catch {
expires = Date.now() + 60 * 60 * 1000;
fallbackExpiry = Date.now() + 60 * 60 * 1000;
}
const expires = decodeJwtExpiryMs(accessToken) ?? fallbackExpiry;
return {
type: "oauth",

View File

@@ -117,6 +117,10 @@ function isChatGPTUsageLimitErrorMessage(raw: string): boolean {
return msg.includes("hit your chatgpt usage limit") && msg.includes("try again in");
}
/** True when the raw error text mentions the Codex "refresh_token_reused" failure (case-insensitive). */
function isRefreshTokenReused(raw: string): boolean {
  return raw.toLowerCase().includes("refresh_token_reused");
}
/** True when the raw error text contains the "instructions are required" message (case-insensitive). */
function isInstructionsRequiredError(raw: string): boolean {
  return raw.toLowerCase().includes("instructions are required");
}
@@ -643,6 +647,15 @@ describeLive("live models (profile keys)", () => {
logProgress(`${progressLabel}: skip (rate limit)`);
break;
}
if (
allowNotFoundSkip &&
model.provider === "openai-codex" &&
isRefreshTokenReused(message)
) {
skipped.push({ model: id, reason: message });
logProgress(`${progressLabel}: skip (codex refresh token reused)`);
break;
}
if (
allowNotFoundSkip &&
model.provider === "openai-codex" &&

View File

@@ -24,7 +24,7 @@ import { shouldSuppressBuiltInModel } from "../agents/model-suppression.js";
import { ensureOpenClawModelsJson } from "../agents/models-config.js";
import { isRateLimitErrorMessage } from "../agents/pi-embedded-helpers/errors.js";
import { discoverAuthStorage, discoverModels } from "../agents/pi-model-discovery.js";
import { loadConfig } from "../config/config.js";
import { clearRuntimeConfigSnapshot, loadConfig } from "../config/config.js";
import type { ModelsConfig, OpenClawConfig, ModelProviderConfig } from "../config/types.js";
import { isTruthyEnvValue } from "../infra/env.js";
import { DEFAULT_AGENT_ID } from "../routing/session-key.js";
@@ -38,7 +38,7 @@ import {
shouldRetryToolReadProbe,
} from "./live-tool-probe-utils.js";
import { startGatewayServer } from "./server.js";
import { extractPayloadText } from "./test-helpers.agent-results.js";
import { loadSessionEntry, readSessionMessages } from "./session-utils.js";
const LIVE = isTruthyEnvValue(process.env.LIVE) || isTruthyEnvValue(process.env.OPENCLAW_LIVE_TEST);
const GATEWAY_LIVE = isTruthyEnvValue(process.env.OPENCLAW_LIVE_GATEWAY);
@@ -171,6 +171,32 @@ function logProgress(message: string): void {
console.log(`[live] ${message}`);
}
/**
 * Make the process look like production for a live run: hide the vitest
 * marker and force NODE_ENV. Returns the prior values so the caller can
 * restore them with restoreProductionEnvForLiveRun().
 */
function enterProductionEnvForLiveRun() {
  // Snapshot the env vars we are about to clobber.
  const saved = {
    vitest: process.env.VITEST,
    nodeEnv: process.env.NODE_ENV,
  };
  delete process.env.VITEST;
  process.env.NODE_ENV = "production";
  return saved;
}
/** Undo enterProductionEnvForLiveRun(): put VITEST and NODE_ENV back exactly as captured. */
function restoreProductionEnvForLiveRun(previous: {
  vitest: string | undefined;
  nodeEnv: string | undefined;
}) {
  // Assigning `undefined` to process.env would store the string "undefined",
  // so genuinely-absent values must be deleted instead of assigned.
  const put = (key: "VITEST" | "NODE_ENV", value: string | undefined) => {
    if (value === undefined) {
      delete process.env[key];
    } else {
      process.env[key] = value;
    }
  };
  put("VITEST", previous.vitest);
  put("NODE_ENV", previous.nodeEnv);
}
function formatFailurePreview(
failures: Array<{ model: string; error: string }>,
maxItems: number,
@@ -319,25 +345,14 @@ async function runAnthropicRefusalProbe(params: {
}): Promise<void> {
logProgress(`${params.label}: refusal-probe`);
const magic = buildAnthropicRefusalToken();
const runId = randomUUID();
const probe = await withGatewayLiveProbeTimeout(
params.client.request<AgentFinalPayload>(
"agent",
{
sessionKey: params.sessionKey,
idempotencyKey: `idem-${runId}-refusal`,
message: `Reply with the single word ok. Test token: ${magic}`,
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${params.label}: refusal-probe`,
);
if (probe?.status !== "ok") {
throw new Error(`refusal probe failed: status=${String(probe?.status)}`);
}
const probeText = extractPayloadText(probe?.result);
const probeText = await requestGatewayAgentText({
client: params.client,
sessionKey: params.sessionKey,
idempotencyKey: `idem-${randomUUID()}-refusal`,
message: `Reply with the single word ok. Test token: ${magic}`,
thinkingLevel: params.thinkingLevel,
context: `${params.label}: refusal-probe`,
});
assertNoReasoningTags({
text: probeText,
model: params.modelKey,
@@ -348,25 +363,14 @@ async function runAnthropicRefusalProbe(params: {
throw new Error(`refusal probe missing ok: ${probeText}`);
}
const followupId = randomUUID();
const followup = await withGatewayLiveProbeTimeout(
params.client.request<AgentFinalPayload>(
"agent",
{
sessionKey: params.sessionKey,
idempotencyKey: `idem-${followupId}-refusal-followup`,
message: "Now reply with exactly: still ok.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${params.label}: refusal-followup`,
);
if (followup?.status !== "ok") {
throw new Error(`refusal followup failed: status=${String(followup?.status)}`);
}
const followupText = extractPayloadText(followup?.result);
const followupText = await requestGatewayAgentText({
client: params.client,
sessionKey: params.sessionKey,
idempotencyKey: `idem-${randomUUID()}-refusal-followup`,
message: "Now reply with exactly: still ok.",
thinkingLevel: params.thinkingLevel,
context: `${params.label}: refusal-followup`,
});
assertNoReasoningTags({
text: followupText,
model: params.modelKey,
@@ -475,11 +479,6 @@ async function getFreeGatewayPort(): Promise<number> {
throw new Error("failed to acquire a free gateway port block");
}
type AgentFinalPayload = {
status?: unknown;
result?: unknown;
};
async function connectClient(params: { url: string; token: string }) {
return await new Promise<GatewayClient>((resolve, reject) => {
let settled = false;
@@ -513,6 +512,115 @@ async function connectClient(params: { url: string; token: string }) {
});
}
/**
 * Pull human-readable text from a transcript message. Messages may carry text
 * directly (`text` or `content` as a string) or as an array of content parts,
 * each with its own `text` field; non-string/empty parts are dropped.
 */
function extractTranscriptMessageText(message: unknown): string {
  if (!message || typeof message !== "object") {
    return "";
  }
  const { text, content } = message as { text?: unknown; content?: unknown };
  if (typeof text === "string" && text.trim()) {
    return text.trim();
  }
  if (typeof content === "string" && content.trim()) {
    return content.trim();
  }
  if (!Array.isArray(content)) {
    return "";
  }
  // Collect the non-empty trimmed text of each well-formed part.
  const parts: string[] = [];
  for (const entry of content) {
    if (!entry || typeof entry !== "object") {
      continue;
    }
    const partText = (entry as { text?: unknown }).text;
    if (typeof partText === "string" && partText.trim()) {
      parts.push(partText.trim());
    }
  }
  return parts.join("\n").trim();
}
/**
 * Read the on-disk transcript for a session and return the text of every
 * assistant-role message, in transcript order. Returns [] when the session
 * entry has no sessionId.
 */
function readSessionAssistantTexts(sessionKey: string): string[] {
  const { storePath, entry } = loadSessionEntry(sessionKey);
  if (!entry?.sessionId) {
    return [];
  }
  const messages = readSessionMessages(entry.sessionId, storePath, entry.sessionFile);
  return messages
    .filter(
      (message) =>
        !!message &&
        typeof message === "object" &&
        (message as { role?: unknown }).role === "assistant",
    )
    .map((message) => extractTranscriptMessageText(message));
}
/**
 * Poll the session transcript until an assistant message beyond the baseline
 * count appears with non-empty text, backing off from 50ms up to 250ms.
 * Throws on timeout (GATEWAY_LIVE_PROBE_TIMEOUT_MS).
 */
async function waitForSessionAssistantText(params: {
  sessionKey: string;
  baselineAssistantCount: number;
  context: string;
}) {
  const deadline = Date.now() + GATEWAY_LIVE_PROBE_TIMEOUT_MS;
  let pollDelayMs = 50;
  while (Date.now() < deadline) {
    // Only messages added after the baseline snapshot count as the reply;
    // take the most recent one that actually has content.
    const fresh = readSessionAssistantTexts(params.sessionKey).slice(
      params.baselineAssistantCount,
    );
    for (let i = fresh.length - 1; i >= 0; i -= 1) {
      const candidate = fresh[i].trim();
      if (candidate.length > 0) {
        return candidate;
      }
    }
    await new Promise((resolve) => setTimeout(resolve, pollDelayMs));
    pollDelayMs = Math.min(pollDelayMs * 2, 250);
  }
  throw new Error(`probe timeout after ${GATEWAY_LIVE_PROBE_TIMEOUT_MS}ms (${params.context})`);
}
/**
 * Fire an "agent" request at the gateway and wait for the assistant's reply
 * text to land in the session transcript. The RPC only acknowledges the run
 * ("accepted"); the actual reply is read back from disk.
 */
async function requestGatewayAgentText(params: {
  client: GatewayClient;
  sessionKey: string;
  message: string;
  thinkingLevel: string;
  context: string;
  idempotencyKey: string;
  attachments?: Array<{
    mimeType: string;
    fileName: string;
    content: string;
  }>;
}) {
  const { client, sessionKey, idempotencyKey, message, thinkingLevel, attachments, context } =
    params;
  // Snapshot the assistant-message count first so we can tell which
  // transcript entries are new after this request.
  const baselineAssistantCount = readSessionAssistantTexts(sessionKey).length;
  const ack = await withGatewayLiveProbeTimeout(
    client.request<{ runId?: unknown; status?: unknown }>("agent", {
      sessionKey,
      idempotencyKey,
      message,
      thinking: thinkingLevel,
      deliver: false,
      attachments,
    }),
    `${context}: agent-accept`,
  );
  if (ack?.status !== "accepted") {
    throw new Error(`agent status=${String(ack?.status)}`);
  }
  return await waitForSessionAssistantText({
    sessionKey,
    baselineAssistantCount,
    context: `${context}: transcript-final`,
  });
}
type GatewayModelSuiteParams = {
label: string;
cfg: OpenClawConfig;
@@ -636,6 +744,8 @@ function buildMinimaxProviderOverride(params: {
}
async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
clearRuntimeConfigSnapshot();
const runtimeEnv = enterProductionEnvForLiveRun();
const previous = {
configPath: process.env.OPENCLAW_CONFIG_PATH,
token: process.env.OPENCLAW_GATEWAY_TOKEN,
@@ -793,48 +903,26 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
);
logProgress(`${progressLabel}: prompt`);
const runId = randomUUID();
const payload = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId}`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${progressLabel}: prompt`,
);
if (payload?.status !== "ok") {
throw new Error(`agent status=${String(payload?.status)}`);
}
let text = extractPayloadText(payload?.result);
let text = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${randomUUID()}`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: prompt`,
});
if (!text) {
logProgress(`${progressLabel}: empty response, retrying`);
const retry = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${randomUUID()}-retry`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${progressLabel}: prompt-retry`,
);
if (retry?.status !== "ok") {
throw new Error(`agent status=${String(retry?.status)}`);
}
text = extractPayloadText(retry?.result);
text = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${randomUUID()}-retry`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: prompt-retry`,
});
}
if (!text && isGoogleishProvider(model.provider)) {
logProgress(`${progressLabel}: skip (google empty response)`);
@@ -881,36 +969,20 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
toolReadAttempt += 1
) {
const strictReply = toolReadAttempt > 0;
const toolProbe = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`,
message: strictReply
? "OpenClaw live tool probe (local, safe): " +
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`
: "OpenClaw live tool probe (local, safe): " +
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
"Then reply with the two nonce values you read (include both).",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${progressLabel}: tool-read`,
);
if (toolProbe?.status !== "ok") {
if (toolReadAttempt + 1 < maxToolReadAttempts) {
logProgress(
`${progressLabel}: tool-read retry (${toolReadAttempt + 2}/${maxToolReadAttempts}) status=${String(toolProbe?.status)}`,
);
continue;
}
throw new Error(`tool probe failed: status=${String(toolProbe?.status)}`);
}
toolText = extractPayloadText(toolProbe?.result);
toolText = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`,
message: strictReply
? "OpenClaw live tool probe (local, safe): " +
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`
: "OpenClaw live tool probe (local, safe): " +
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
"Then reply with the two nonce values you read (include both).",
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: tool-read`,
});
if (
isEmptyStreamText(toolText) &&
(model.provider === "minimax" || model.provider === "openai-codex")
@@ -960,40 +1032,24 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
execReadAttempt += 1
) {
const strictReply = execReadAttempt > 0;
const execReadProbe = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`,
message: strictReply
? "OpenClaw live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
`Then reply with exactly: ${nonceC}. No extra text.`
: "OpenClaw live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
"Finally reply including the nonce text you read back.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${progressLabel}: tool-exec`,
);
if (execReadProbe?.status !== "ok") {
if (execReadAttempt + 1 < maxExecReadAttempts) {
logProgress(
`${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) status=${String(execReadProbe?.status)}`,
);
continue;
}
throw new Error(`exec+read probe failed: status=${String(execReadProbe?.status)}`);
}
execReadText = extractPayloadText(execReadProbe?.result);
execReadText = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`,
message: strictReply
? "OpenClaw live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
`Then reply with exactly: ${nonceC}. No extra text.`
: "OpenClaw live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
"Finally reply including the nonce text you read back.",
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: tool-exec`,
});
if (
isEmptyStreamText(execReadText) &&
(model.provider === "minimax" || model.provider === "openai-codex")
@@ -1040,62 +1096,51 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
const imageBase64 = renderCatNoncePngBase64(imageCode);
const runIdImage = randomUUID();
const imageProbe = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
const imageText = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${runIdImage}-image`,
message:
"Look at the attached image. Reply with exactly two tokens separated by a single space: " +
"(1) the animal shown or written in the image, lowercase; " +
"(2) the code printed in the image, uppercase. No extra text.",
attachments: [
{
sessionKey,
idempotencyKey: `idem-${runIdImage}-image`,
message:
"Look at the attached image. Reply with exactly two tokens separated by a single space: " +
"(1) the animal shown or written in the image, lowercase; " +
"(2) the code printed in the image, uppercase. No extra text.",
attachments: [
{
mimeType: "image/png",
fileName: `probe-${runIdImage}.png`,
content: imageBase64,
},
],
thinking: params.thinkingLevel,
deliver: false,
mimeType: "image/png",
fileName: `probe-${runIdImage}.png`,
content: imageBase64,
},
{ expectFinal: true },
),
`${progressLabel}: image`,
);
],
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: image`,
});
// Best-effort: do not fail the whole live suite on flaky image handling.
// (We still keep prompt + tool probes as hard checks.)
if (imageProbe?.status !== "ok") {
logProgress(`${progressLabel}: image skip (status=${String(imageProbe?.status)})`);
if (
isEmptyStreamText(imageText) &&
(model.provider === "minimax" || model.provider === "openai-codex")
) {
logProgress(`${progressLabel}: image skip (${model.provider} empty response)`);
} else {
const imageText = extractPayloadText(imageProbe?.result);
if (
isEmptyStreamText(imageText) &&
(model.provider === "minimax" || model.provider === "openai-codex")
) {
logProgress(`${progressLabel}: image skip (${model.provider} empty response)`);
assertNoReasoningTags({
text: imageText,
model: modelKey,
phase: "image",
label: params.label,
});
if (!/\bcat\b/i.test(imageText)) {
logProgress(`${progressLabel}: image skip (missing 'cat')`);
} else {
assertNoReasoningTags({
text: imageText,
model: modelKey,
phase: "image",
label: params.label,
});
if (!/\bcat\b/i.test(imageText)) {
logProgress(`${progressLabel}: image skip (missing 'cat')`);
} else {
const candidates = imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
const bestDistance = candidates.reduce((best, cand) => {
if (Math.abs(cand.length - imageCode.length) > 2) {
return best;
}
return Math.min(best, editDistance(cand, imageCode));
}, Number.POSITIVE_INFINITY);
// OCR / image-read flake: allow a small edit distance, but still require the "cat" token above.
if (!(bestDistance <= 3)) {
logProgress(`${progressLabel}: image skip (code mismatch)`);
const candidates = imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
const bestDistance = candidates.reduce((best, cand) => {
if (Math.abs(cand.length - imageCode.length) > 2) {
return best;
}
return Math.min(best, editDistance(cand, imageCode));
}, Number.POSITIVE_INFINITY);
// OCR / image-read flake: allow a small edit distance, but still require the "cat" token above.
if (!(bestDistance <= 3)) {
logProgress(`${progressLabel}: image skip (code mismatch)`);
}
}
}
@@ -1108,24 +1153,14 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
) {
logProgress(`${progressLabel}: tool-only regression`);
const runId2 = randomUUID();
const first = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-1`,
message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${progressLabel}: tool-only-regression-first`,
);
if (first?.status !== "ok") {
throw new Error(`tool-only turn failed: status=${String(first?.status)}`);
}
const firstText = extractPayloadText(first?.result);
const firstText = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${runId2}-1`,
message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: tool-only-regression-first`,
});
assertNoReasoningTags({
text: firstText,
model: modelKey,
@@ -1133,24 +1168,14 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
label: params.label,
});
const second = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-2`,
message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
),
`${progressLabel}: tool-only-regression-second`,
);
if (second?.status !== "ok") {
throw new Error(`post-tool message failed: status=${String(second?.status)}`);
}
const reply = extractPayloadText(second?.result);
const reply = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${runId2}-2`,
message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
thinkingLevel: params.thinkingLevel,
context: `${progressLabel}: tool-only-regression-second`,
});
assertNoReasoningTags({
text: reply,
model: modelKey,
@@ -1290,6 +1315,8 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
logProgress(`[${params.label}] skipped all models (missing profiles)`);
}
} finally {
clearRuntimeConfigSnapshot();
restoreProductionEnvForLiveRun(runtimeEnv);
client.stop();
await server.close({ reason: "live test complete" });
await fs.rm(toolProbePath, { force: true });
@@ -1317,6 +1344,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
it(
"runs meaningful prompts across models with available keys",
async () => {
clearRuntimeConfigSnapshot();
const cfg = loadConfig();
await ensureOpenClawModelsJson(cfg);
@@ -1422,6 +1450,8 @@ describeLive("gateway live (dev agent, profile keys)", () => {
if (!ZAI_FALLBACK) {
return;
}
clearRuntimeConfigSnapshot();
const runtimeEnv = enterProductionEnvForLiveRun();
const previous = {
configPath: process.env.OPENCLAW_CONFIG_PATH,
token: process.env.OPENCLAW_GATEWAY_TOKEN,
@@ -1520,27 +1550,16 @@ describeLive("gateway live (dev agent, profile keys)", () => {
"zai-fallback: sessions-reset",
);
const runId = randomUUID();
const toolProbe = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId}-tool`,
message:
`Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
thinking: THINKING_LEVEL,
deliver: false,
},
{ expectFinal: true },
),
"zai-fallback: tool-probe",
);
if (toolProbe?.status !== "ok") {
throw new Error(`anthropic tool probe failed: status=${String(toolProbe?.status)}`);
}
const toolText = extractPayloadText(toolProbe?.result);
const toolText = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${randomUUID()}-tool`,
message:
`Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
thinkingLevel: THINKING_LEVEL,
context: "zai-fallback: tool-probe",
});
assertNoReasoningTags({
text: toolText,
model: "anthropic/claude-opus-4-5",
@@ -1559,27 +1578,16 @@ describeLive("gateway live (dev agent, profile keys)", () => {
"zai-fallback: sessions-patch-zai",
);
const followupId = randomUUID();
const followup = await withGatewayLiveProbeTimeout(
client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${followupId}-followup`,
message:
`What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
`Reply with exactly: ${nonceA} ${nonceB}.`,
thinking: THINKING_LEVEL,
deliver: false,
},
{ expectFinal: true },
),
"zai-fallback: followup",
);
if (followup?.status !== "ok") {
throw new Error(`zai followup failed: status=${String(followup?.status)}`);
}
const followupText = extractPayloadText(followup?.result);
const followupText = await requestGatewayAgentText({
client,
sessionKey,
idempotencyKey: `idem-${randomUUID()}-followup`,
message:
`What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
`Reply with exactly: ${nonceA} ${nonceB}.`,
thinkingLevel: THINKING_LEVEL,
context: "zai-fallback: followup",
});
assertNoReasoningTags({
text: followupText,
model: "zai/glm-4.7",
@@ -1590,6 +1598,8 @@ describeLive("gateway live (dev agent, profile keys)", () => {
throw new Error(`zai followup missing nonce: ${followupText}`);
}
} finally {
clearRuntimeConfigSnapshot();
restoreProductionEnvForLiveRun(runtimeEnv);
client.stop();
await server.close({ reason: "live test complete" });
await fs.rm(toolProbePath, { force: true });