test: isolate MCP live cache probe

This commit is contained in:
Peter Steinberger
2026-04-04 14:39:41 +09:00
parent 0ebc7b6077
commit 30ba837a7b
3 changed files with 467 additions and 10 deletions

View File

@@ -24,12 +24,54 @@ const env = {
OPENCLAW_LIVE_TEST_QUIET: quietOverride ?? process.env.OPENCLAW_LIVE_TEST_QUIET ?? "1",
};
// Parse a strictly positive base-10 integer from an env-style string.
// Empty/missing input or anything that is not a finite integer > 0 yields the fallback.
function parsePositiveInt(value, fallback) {
  if (!value) {
    return fallback;
  }
  const candidate = Number.parseInt(value, 10);
  if (Number.isFinite(candidate) && candidate > 0) {
    return candidate;
  }
  return fallback;
}
const heartbeatMs = parsePositiveInt(process.env.OPENCLAW_LIVE_WRAPPER_HEARTBEAT_MS, 20_000);
const startedAt = Date.now();
let lastOutputAt = startedAt;
const child = spawnPnpmRunner({
stdio: ["inherit", "pipe", "pipe"],
pnpmArgs: ["exec", "vitest", "run", "--config", "vitest.live.config.ts", ...forwardedArgs],
env,
});
const noteOutput = () => {
lastOutputAt = Date.now();
};
child.stdout?.on("data", (chunk) => {
noteOutput();
process.stdout.write(chunk);
});
child.stderr?.on("data", (chunk) => {
noteOutput();
process.stderr.write(chunk);
});
const heartbeat = setInterval(() => {
const now = Date.now();
if (now - lastOutputAt < heartbeatMs) {
return;
}
const elapsedSec = Math.max(1, Math.round((now - startedAt) / 1_000));
const quietSec = Math.max(1, Math.round((now - lastOutputAt) / 1_000));
process.stderr.write(
`[test:live] still running (${elapsedSec}s elapsed, ${quietSec}s since last output)\n`,
);
lastOutputAt = now;
}, heartbeatMs);
heartbeat.unref?.();
child.on("exit", (code, signal) => {
clearInterval(heartbeat);
if (signal) {
process.kill(process.pid, signal);
return;
@@ -38,6 +80,7 @@ child.on("exit", (code, signal) => {
});
child.on("error", (error) => {
clearInterval(heartbeat);
console.error(error);
process.exit(1);
});

View File

@@ -1,3 +1,4 @@
import fs from "node:fs/promises";
import type { AssistantMessage, Message, Tool } from "@mariozechner/pi-ai";
import { Type } from "@sinclair/typebox";
import { beforeAll, describe, expect, it } from "vitest";
@@ -19,6 +20,10 @@ const OPENAI_SESSION_ID = "live-cache-openai-stable-session";
const ANTHROPIC_SESSION_ID = "live-cache-anthropic-stable-session";
const OPENAI_PREFIX = buildStableCachePrefix("openai");
const ANTHROPIC_PREFIX = buildStableCachePrefix("anthropic");
const LIVE_TEST_PNG_URL = new URL(
"../../apps/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png",
import.meta.url,
);
type CacheRun = {
hitRate: number;
@@ -32,17 +37,50 @@ const NOOP_TOOL: Tool = {
description: "Return ok.",
parameters: Type.Object({}, { additionalProperties: false }),
};
let liveTestPngBase64 = "";
type UserContent = Extract<Message, { role: "user" }>["content"];
/** Wrap plain text in an assistant-role history message stamped with the current time. */
function makeAssistantHistoryTurn(text: string): Message {
  const turn: Message = {
    content: [{ text, type: "text" }],
    role: "assistant",
    timestamp: Date.now(),
  };
  return turn;
}
/** Wrap arbitrary user content (text or content blocks) in a user-role history message. */
function makeUserHistoryTurn(content: UserContent): Message {
  const turn: Message = {
    content,
    role: "user",
    timestamp: Date.now(),
  };
  return turn;
}
function makeImageUserTurn(text: string): Message {
if (!liveTestPngBase64) {
throw new Error("live test PNG not loaded");
}
return makeUserHistoryTurn([
{ type: "text", text },
{ type: "image", mimeType: "image/png", data: liveTestPngBase64 },
]);
}
/** Return the first toolCall content block of an assistant message, or undefined. */
function extractFirstToolCall(message: AssistantMessage) {
  for (const block of message.content) {
    if (block.type === "toolCall") {
      return block;
    }
  }
  return undefined;
}
function buildToolResultMessage(toolCallId: string): Extract<Message, { role: "toolResult" }> {
function buildToolResultMessage(
toolCallId: string,
toolName = "noop",
text = "ok",
): Extract<Message, { role: "toolResult" }> {
return {
role: "toolResult",
toolCallId,
toolName: "noop",
content: [{ type: "text", text: "ok" }],
toolName,
content: [{ type: "text", text }],
isError: false,
timestamp: Date.now(),
};
@@ -55,9 +93,9 @@ async function runToolOnlyTurn(params: {
providerTag: "anthropic" | "openai";
sessionId: string;
systemPrompt: string;
tool: Tool;
}) {
let prompt =
"Call the tool `noop` with {}. IMPORTANT: respond ONLY with the tool call and no other text.";
let prompt = `Call the tool \`${params.tool.name}\` with {}. IMPORTANT: respond ONLY with the tool call and no other text.`;
let response = await completeSimpleWithLiveTimeout(
params.model,
{
@@ -69,7 +107,7 @@ async function runToolOnlyTurn(params: {
timestamp: Date.now(),
},
],
tools: [NOOP_TOOL],
tools: [params.tool],
},
{
apiKey: params.apiKey,
@@ -77,6 +115,7 @@ async function runToolOnlyTurn(params: {
sessionId: params.sessionId,
maxTokens: 128,
temperature: 0,
...(params.providerTag === "openai" ? { reasoning: "none" as unknown as never } : {}),
},
`${params.providerTag} tool-only turn`,
params.providerTag === "openai" ? OPENAI_TIMEOUT_MS : ANTHROPIC_TIMEOUT_MS,
@@ -85,7 +124,7 @@ async function runToolOnlyTurn(params: {
let toolCall = extractFirstToolCall(response);
let text = extractAssistantText(response);
for (let attempt = 0; attempt < 2 && (!toolCall || text.length > 0); attempt += 1) {
prompt = "Return only a tool call for `noop` with {}. No text.";
prompt = `Return only a tool call for \`${params.tool.name}\` with {}. No text.`;
response = await completeSimpleWithLiveTimeout(
params.model,
{
@@ -97,7 +136,7 @@ async function runToolOnlyTurn(params: {
timestamp: Date.now(),
},
],
tools: [NOOP_TOOL],
tools: [params.tool],
},
{
apiKey: params.apiKey,
@@ -105,6 +144,7 @@ async function runToolOnlyTurn(params: {
sessionId: params.sessionId,
maxTokens: 128,
temperature: 0,
...(params.providerTag === "openai" ? { reasoning: "none" as unknown as never } : {}),
},
`${params.providerTag} tool-only retry ${attempt + 1}`,
params.providerTag === "openai" ? OPENAI_TIMEOUT_MS : ANTHROPIC_TIMEOUT_MS,
@@ -139,6 +179,7 @@ async function runOpenAiToolCacheProbe(params: {
providerTag: "openai",
sessionId: params.sessionId,
systemPrompt: OPENAI_PREFIX,
tool: NOOP_TOOL,
});
const response = await completeSimpleWithLiveTimeout(
params.model,
@@ -151,7 +192,10 @@ async function runOpenAiToolCacheProbe(params: {
timestamp: Date.now(),
},
toolTurn.response,
buildToolResultMessage(toolTurn.toolCall.id),
buildToolResultMessage(toolTurn.toolCall.id, NOOP_TOOL.name, "ok"),
makeAssistantHistoryTurn("TOOL HISTORY ACKNOWLEDGED"),
makeUserHistoryTurn("Keep the tool output stable in history."),
makeAssistantHistoryTurn("TOOL HISTORY PRESERVED"),
{
role: "user",
content: `Reply with exactly CACHE-OK ${params.suffix}.`,
@@ -166,6 +210,7 @@ async function runOpenAiToolCacheProbe(params: {
sessionId: params.sessionId,
maxTokens: 64,
temperature: 0,
reasoning: "none" as unknown as never,
},
`openai cache probe ${params.suffix}`,
OPENAI_TIMEOUT_MS,
@@ -218,6 +263,47 @@ async function runOpenAiCacheProbe(params: {
};
}
/**
 * One OpenAI cache probe whose history contains an image turn.
 * Sends a fixed (image + filler) prefix plus a suffix-specific final user turn,
 * asserts the reply echoes the suffix, and reports the usage-derived cache hit rate.
 * NOTE(review): performs a live API call via completeSimpleWithLiveTimeout.
 */
async function runOpenAiImageCacheProbe(params: {
  apiKey: string;
  model: Awaited<ReturnType<typeof resolveLiveDirectModel>>["model"];
  sessionId: string;
  suffix: string;
}): Promise<CacheRun> {
  const reply = await completeSimpleWithLiveTimeout(
    params.model,
    {
      systemPrompt: OPENAI_PREFIX,
      messages: [
        // The image bytes plus these filler turns form the stable, cacheable prefix.
        makeImageUserTurn(
          "An image is attached. Ignore image semantics but keep the bytes in history.",
        ),
        makeAssistantHistoryTurn("IMAGE HISTORY ACKNOWLEDGED"),
        makeUserHistoryTurn("Keep the earlier image turn stable in context."),
        makeAssistantHistoryTurn("IMAGE HISTORY PRESERVED"),
        makeUserHistoryTurn(`Reply with exactly CACHE-OK ${params.suffix}.`),
      ],
    },
    {
      apiKey: params.apiKey,
      cacheRetention: "short",
      sessionId: params.sessionId,
      maxTokens: 64,
      temperature: 0,
      reasoning: "none" as unknown as never,
    },
    `openai image cache probe ${params.suffix}`,
    OPENAI_TIMEOUT_MS,
  );
  const replyText = extractAssistantText(reply);
  expect(replyText.toLowerCase()).toContain(params.suffix.toLowerCase());
  return {
    hitRate: computeCacheHitRate(reply.usage),
    suffix: params.suffix,
    text: replyText,
    usage: reply.usage,
  };
}
async function runAnthropicCacheProbe(params: {
apiKey: string;
model: Awaited<ReturnType<typeof resolveLiveDirectModel>>["model"];
@@ -271,6 +357,7 @@ async function runAnthropicToolCacheProbe(params: {
providerTag: "anthropic",
sessionId: params.sessionId,
systemPrompt: ANTHROPIC_PREFIX,
tool: NOOP_TOOL,
});
const response = await completeSimpleWithLiveTimeout(
params.model,
@@ -283,7 +370,10 @@ async function runAnthropicToolCacheProbe(params: {
timestamp: Date.now(),
},
toolTurn.response,
buildToolResultMessage(toolTurn.toolCall.id),
buildToolResultMessage(toolTurn.toolCall.id, NOOP_TOOL.name, "ok"),
makeAssistantHistoryTurn("TOOL HISTORY ACKNOWLEDGED"),
makeUserHistoryTurn("Keep the tool output stable in history."),
makeAssistantHistoryTurn("TOOL HISTORY PRESERVED"),
{
role: "user",
content: `Reply with exactly CACHE-OK ${params.suffix}.`,
@@ -312,7 +402,52 @@ async function runAnthropicToolCacheProbe(params: {
};
}
/**
 * One Anthropic cache probe whose history contains an image turn.
 * Same shape as the OpenAI variant, but the caller chooses the cacheRetention
 * mode so the "retention disabled" case can reuse this helper.
 * NOTE(review): performs a live API call via completeSimpleWithLiveTimeout.
 */
async function runAnthropicImageCacheProbe(params: {
  apiKey: string;
  model: Awaited<ReturnType<typeof resolveLiveDirectModel>>["model"];
  sessionId: string;
  suffix: string;
  cacheRetention: "none" | "short" | "long";
}): Promise<CacheRun> {
  const reply = await completeSimpleWithLiveTimeout(
    params.model,
    {
      systemPrompt: ANTHROPIC_PREFIX,
      messages: [
        // The image bytes plus these filler turns form the stable, cacheable prefix.
        makeImageUserTurn(
          "An image is attached. Ignore image semantics but keep the bytes in history.",
        ),
        makeAssistantHistoryTurn("IMAGE HISTORY ACKNOWLEDGED"),
        makeUserHistoryTurn("Keep the earlier image turn stable in context."),
        makeAssistantHistoryTurn("IMAGE HISTORY PRESERVED"),
        makeUserHistoryTurn(`Reply with exactly CACHE-OK ${params.suffix}.`),
      ],
    },
    {
      apiKey: params.apiKey,
      cacheRetention: params.cacheRetention,
      sessionId: params.sessionId,
      maxTokens: 64,
      temperature: 0,
    },
    `anthropic image cache probe ${params.suffix} (${params.cacheRetention})`,
    ANTHROPIC_TIMEOUT_MS,
  );
  const replyText = extractAssistantText(reply);
  expect(replyText.toLowerCase()).toContain(params.suffix.toLowerCase());
  return {
    hitRate: computeCacheHitRate(reply.usage),
    suffix: params.suffix,
    text: replyText,
    usage: reply.usage,
  };
}
describeCacheLive("pi embedded runner prompt caching (live)", () => {
// Load the shared probe image once per suite; makeImageUserTurn requires these bytes.
beforeAll(async () => {
liveTestPngBase64 = (await fs.readFile(LIVE_TEST_PNG_URL)).toString("base64");
}, 120_000);
describe("openai", () => {
let fixture: Awaited<ReturnType<typeof resolveLiveDirectModel>>;
@@ -396,6 +531,39 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
},
8 * 60_000,
);
// Warm the provider cache with one image-bearing probe, then run two follow-up
// probes in the same session and judge only the better of the two, so a single
// cache miss does not fail the suite.
it(
"keeps high cache-read rates across image-heavy followup turns",
async () => {
const warmup = await runOpenAiImageCacheProbe({
...fixture,
sessionId: `${OPENAI_SESSION_ID}-image`,
suffix: "image-warmup",
});
logLiveCache(
`openai image warmup cacheRead=${warmup.usage.cacheRead} input=${warmup.usage.input} rate=${warmup.hitRate.toFixed(3)}`,
);
const hitA = await runOpenAiImageCacheProbe({
...fixture,
sessionId: `${OPENAI_SESSION_ID}-image`,
suffix: "image-hit-a",
});
const hitB = await runOpenAiImageCacheProbe({
...fixture,
sessionId: `${OPENAI_SESSION_ID}-image`,
suffix: "image-hit-b",
});
// Pick the probe with more cache-read tokens; assertions apply to it only.
const bestHit = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
logLiveCache(
`openai image best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
);
// Require a non-trivial absolute read count and at least a 60% hit rate.
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.6);
},
6 * 60_000,
);
});
describe("anthropic", () => {
@@ -490,6 +658,42 @@ describeCacheLive("pi embedded runner prompt caching (live)", () => {
8 * 60_000,
);
// Anthropic twin of the OpenAI image-cache test: warmup, two follow-ups,
// best-of-two assertion. Uses "short" retention throughout.
it(
"keeps high cache-read rates across image-heavy followup turns",
async () => {
const warmup = await runAnthropicImageCacheProbe({
...fixture,
sessionId: `${ANTHROPIC_SESSION_ID}-image`,
suffix: "image-warmup",
cacheRetention: "short",
});
logLiveCache(
`anthropic image warmup cacheWrite=${warmup.usage.cacheWrite} cacheRead=${warmup.usage.cacheRead} input=${warmup.usage.input} rate=${warmup.hitRate.toFixed(3)}`,
);
const hitA = await runAnthropicImageCacheProbe({
...fixture,
sessionId: `${ANTHROPIC_SESSION_ID}-image`,
suffix: "image-hit-a",
cacheRetention: "short",
});
const hitB = await runAnthropicImageCacheProbe({
...fixture,
sessionId: `${ANTHROPIC_SESSION_ID}-image`,
suffix: "image-hit-b",
cacheRetention: "short",
});
// Pick the probe with more cache-read tokens; assertions apply to it only.
const bestHit = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
logLiveCache(
`anthropic image best-hit suffix=${bestHit.suffix} cacheWrite=${bestHit.usage.cacheWrite} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
);
// Require a non-trivial absolute read count and at least a 60% hit rate.
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.6);
},
6 * 60_000,
);
it(
"does not report meaningful cache activity when retention is disabled",
async () => {

View File

@@ -0,0 +1,210 @@
import type { AssistantMessage, Tool } from "@mariozechner/pi-ai";
import { Type } from "@sinclair/typebox";
import { describe, expect, it } from "vitest";
import {
buildStableCachePrefix,
completeSimpleWithLiveTimeout,
computeCacheHitRate,
extractAssistantText,
LIVE_CACHE_TEST_ENABLED,
logLiveCache,
resolveLiveDirectModel,
} from "./live-cache-test-support.js";
// Gate: run against live providers only when the env flag enables cache tests.
const describeCacheLive = LIVE_CACHE_TEST_ENABLED ? describe : describe.skip;
const OPENAI_TIMEOUT_MS = 120_000;
// Stable session id and system prefix so repeated probes share one cache entry.
const OPENAI_SESSION_ID = "live-cache-openai-mcp-style-session";
const OPENAI_PREFIX = buildStableCachePrefix("openai-mcp-style");
// Minimal probe tool with an empty, closed schema; the double-underscore name
// mimics how MCP bundle tools are named (server__tool).
const MCP_TOOL: Tool = {
name: "bundleProbe__bundle_probe",
description: "Return bundle MCP probe text.",
parameters: Type.Object({}, { additionalProperties: false }),
};
// Result of one probe: the reply text plus usage-derived cache hit rate.
type CacheRun = {
hitRate: number;
suffix: string;
text: string;
usage: AssistantMessage["usage"];
};
/** Return the first toolCall content block of an assistant message, or undefined. */
function extractFirstToolCall(message: AssistantMessage) {
  for (const block of message.content) {
    if (block.type === "toolCall") {
      return block;
    }
  }
  return undefined;
}
/**
 * Build a synthetic toolResult turn answering the given tool call with the
 * fixed payload "FROM-BUNDLE", so the replayed history stays byte-stable.
 */
function buildToolResultMessage(toolCallId: string) {
  const timestamp = Date.now();
  return {
    content: [{ text: "FROM-BUNDLE", type: "text" as const }],
    isError: false,
    role: "toolResult" as const,
    timestamp,
    toolCallId,
    toolName: MCP_TOOL.name,
  };
}
/**
 * Coax the model into emitting a bare tool call for MCP_TOOL with no prose,
 * retrying up to twice when the reply contains text or omits the call.
 * Returns the final prompt used, the raw assistant response, and the tool call
 * so callers can replay the exchange verbatim as cacheable history.
 * NOTE(review): performs live API calls via completeSimpleWithLiveTimeout.
 */
async function runToolOnlyTurn(params: {
apiKey: string;
model: Awaited<ReturnType<typeof resolveLiveDirectModel>>["model"];
sessionId: string;
}) {
let prompt = `Call the tool \`${MCP_TOOL.name}\` with {}. IMPORTANT: respond ONLY with the tool call and no other text.`;
let response = await completeSimpleWithLiveTimeout(
params.model,
{
systemPrompt: OPENAI_PREFIX,
messages: [{ role: "user", content: prompt, timestamp: Date.now() }],
tools: [MCP_TOOL],
},
{
apiKey: params.apiKey,
cacheRetention: "short",
sessionId: params.sessionId,
maxTokens: 128,
// temperature 0 keeps the turn as deterministic as the provider allows.
temperature: 0,
reasoning: "none" as unknown as never,
},
"openai mcp-style tool-only turn",
OPENAI_TIMEOUT_MS,
);
let toolCall = extractFirstToolCall(response);
let text = extractAssistantText(response);
// Retry (at most twice) while the model skipped the call or added prose.
// Note: `prompt` is reassigned so the returned value matches the last attempt.
for (let attempt = 0; attempt < 2 && (!toolCall || text.length > 0); attempt += 1) {
prompt = `Return only a tool call for \`${MCP_TOOL.name}\` with {}. No text.`;
response = await completeSimpleWithLiveTimeout(
params.model,
{
systemPrompt: OPENAI_PREFIX,
messages: [{ role: "user", content: prompt, timestamp: Date.now() }],
tools: [MCP_TOOL],
},
{
apiKey: params.apiKey,
cacheRetention: "short",
sessionId: params.sessionId,
maxTokens: 128,
temperature: 0,
reasoning: "none" as unknown as never,
},
`openai mcp-style tool-only retry ${attempt + 1}`,
OPENAI_TIMEOUT_MS,
);
toolCall = extractFirstToolCall(response);
text = extractAssistantText(response);
}
// The probe requires a pure tool call: call present, zero assistant text.
expect(toolCall).toBeTruthy();
expect(text.length).toBe(0);
// Narrow for TypeScript; the expect() above already failed the test otherwise.
if (!toolCall || toolCall.type !== "toolCall") {
throw new Error("expected tool call");
}
return {
prompt,
response,
toolCall,
};
}
/**
 * One MCP-style cache probe: obtain a real tool call, replay it with a fixed
 * tool-result and filler history, then send a suffix-specific final turn.
 * Asserts the reply echoes the suffix and returns the usage-derived hit rate.
 * NOTE(review): performs live API calls via completeSimpleWithLiveTimeout.
 */
async function runOpenAiMcpStyleCacheProbe(params: {
  apiKey: string;
  model: Awaited<ReturnType<typeof resolveLiveDirectModel>>["model"];
  sessionId: string;
  suffix: string;
}): Promise<CacheRun> {
  // First get a genuine tool call so the replayed history matches what the
  // model actually produced.
  const toolTurn = await runToolOnlyTurn(params);
  const reply = await completeSimpleWithLiveTimeout(
    params.model,
    {
      systemPrompt: OPENAI_PREFIX,
      messages: [
        // Stable prefix: original prompt, tool call, tool result, and fixed
        // filler turns — identical across probes so it stays cacheable.
        { role: "user", content: toolTurn.prompt, timestamp: Date.now() },
        toolTurn.response,
        buildToolResultMessage(toolTurn.toolCall.id),
        {
          role: "assistant",
          content: [{ type: "text", text: "MCP TOOL HISTORY ACKNOWLEDGED" }],
          timestamp: Date.now(),
        },
        {
          role: "user",
          content: "Keep the MCP tool output stable in history.",
          timestamp: Date.now(),
        },
        {
          role: "assistant",
          content: [{ type: "text", text: "MCP TOOL HISTORY PRESERVED" }],
          timestamp: Date.now(),
        },
        {
          role: "user",
          content: `Reply with exactly CACHE-OK ${params.suffix}.`,
          timestamp: Date.now(),
        },
      ],
      tools: [MCP_TOOL],
    },
    {
      apiKey: params.apiKey,
      cacheRetention: "short",
      sessionId: params.sessionId,
      maxTokens: 64,
      temperature: 0,
      reasoning: "none" as unknown as never,
    },
    `openai mcp-style cache probe ${params.suffix}`,
    OPENAI_TIMEOUT_MS,
  );
  const replyText = extractAssistantText(reply);
  expect(replyText.toLowerCase()).toContain(params.suffix.toLowerCase());
  return {
    hitRate: computeCacheHitRate(reply.usage),
    suffix: params.suffix,
    text: replyText,
    usage: reply.usage,
  };
}
// MCP-style suite: warm the cache once, then require the better of two
// follow-up probes to show substantial cache reads.
describeCacheLive("MCP-style prompt caching (live)", () => {
it(
"keeps high cache-read rates across MCP-style followup turns",
async () => {
// Resolve the live model: env override first, then the preferred ids.
const fixture = await resolveLiveDirectModel({
provider: "openai",
api: "openai-responses",
envVar: "OPENCLAW_LIVE_OPENAI_CACHE_MODEL",
preferredModelIds: ["gpt-5.4-mini", "gpt-5.4", "gpt-5.2"],
});
logLiveCache(`openai mcp-style model=${fixture.model.provider}/${fixture.model.id}`);
// Warmup populates the provider-side cache for the shared session.
const warmup = await runOpenAiMcpStyleCacheProbe({
...fixture,
sessionId: OPENAI_SESSION_ID,
suffix: "mcp-warmup",
});
logLiveCache(
`openai mcp-style warmup cacheRead=${warmup.usage.cacheRead} input=${warmup.usage.input} rate=${warmup.hitRate.toFixed(3)}`,
);
const hitA = await runOpenAiMcpStyleCacheProbe({
...fixture,
sessionId: OPENAI_SESSION_ID,
suffix: "mcp-hit-a",
});
const hitB = await runOpenAiMcpStyleCacheProbe({
...fixture,
sessionId: OPENAI_SESSION_ID,
suffix: "mcp-hit-b",
});
// Judge only the better probe so one cache miss does not fail the suite.
const bestHit = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
logLiveCache(
`openai mcp-style best-hit suffix=${bestHit.suffix} cacheRead=${bestHit.usage.cacheRead} input=${bestHit.usage.input} rate=${bestHit.hitRate.toFixed(3)}`,
);
// Require a non-trivial absolute read count and at least a 60% hit rate.
expect(bestHit.usage.cacheRead ?? 0).toBeGreaterThan(1_024);
expect(bestHit.hitRate).toBeGreaterThanOrEqual(0.6);
},
10 * 60_000,
);
});