From aa5a0a36f8a7540042e8e351388de47cb199326b Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Fri, 1 May 2026 16:32:52 +0530 Subject: [PATCH] test(rtt): expose warm sample metrics --- scripts/lib/rtt-harness.ts | 47 ++++++++++++++++++++-- scripts/rtt.ts | 18 ++++++++- test/fixtures/telegram-qa-summary-rtt.json | 33 ++++++++++++++- test/scripts/rtt-harness.test.ts | 29 ++++++++++++- 4 files changed, 119 insertions(+), 8 deletions(-) diff --git a/scripts/lib/rtt-harness.ts b/scripts/lib/rtt-harness.ts index 237bc8bf57f..ddb81715919 100644 --- a/scripts/lib/rtt-harness.ts +++ b/scripts/lib/rtt-harness.ts @@ -10,6 +10,8 @@ export type RttProviderMode = "mock-openai" | "live-frontier"; export type RttCliOptions = { providerMode: RttProviderMode; runs: number; + samples: number; + sampleTimeoutMs: number; harnessRoot: string; output: string; scenarios: string[]; @@ -35,6 +37,12 @@ export type RttResult = { rtt: { canaryMs?: number; mentionReplyMs?: number; + warmSamples?: number[]; + avgMs?: number; + p50Ms?: number; + p95Ms?: number; + maxMs?: number; + failedSamples?: number; }; artifacts: { rawSummaryPath: string; @@ -49,6 +57,20 @@ export type TelegramQaSummary = { id?: string; rttMs?: number; status?: string; + samples?: Array<{ + index?: number; + status?: string; + rttMs?: number; + }>; + stats?: { + total?: number; + passed?: number; + failed?: number; + avgMs?: number; + p50Ms?: number; + p95Ms?: number; + maxMs?: number; + }; }>; }; @@ -82,11 +104,26 @@ export function buildRunId(params: { now: Date; spec: string; index?: number }) export function extractRtt(summary: TelegramQaSummary) { const scenarios = summary.scenarios ?? []; - return { + const mention = scenarios.find((scenario) => scenario.id === "telegram-mentioned-message-reply"); + const warmSamples = mention?.samples + ?.filter((sample) => sample.status === "pass" && sample.rttMs !== undefined) + .sort((left, right) => (left.index ?? 0) - (right.index ?? 0)) + .flatMap((sample) => (sample.rttMs === undefined ? [] : [sample.rttMs])); + const rtt: RttResult["rtt"] = { canaryMs: scenarios.find((scenario) => scenario.id === "telegram-canary")?.rttMs, - mentionReplyMs: scenarios.find((scenario) => scenario.id === "telegram-mentioned-message-reply") - ?.rttMs, + mentionReplyMs: mention?.stats?.p50Ms ?? mention?.rttMs, }; + if (warmSamples?.length) { + rtt.warmSamples = warmSamples; + } + if (mention?.stats) { + rtt.avgMs = mention.stats.avgMs; + rtt.p50Ms = mention.stats.p50Ms; + rtt.p95Ms = mention.stats.p95Ms; + rtt.maxMs = mention.stats.maxMs; + rtt.failedSamples = mention.stats.failed; + } + return rtt; } export function createHarnessEnv(params: { @@ -96,6 +133,8 @@ export function createHarnessEnv(params: { spec: string; version: string; rawOutputDir: string; + samples: number; + sampleTimeoutMs: number; timeoutMs: number; }) { return { @@ -106,6 +145,8 @@ export function createHarnessEnv(params: { OPENCLAW_NPM_TELEGRAM_SCENARIOS: params.scenarios.join(","), OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR: params.rawOutputDir, OPENCLAW_NPM_TELEGRAM_FAST: params.baseEnv.OPENCLAW_NPM_TELEGRAM_FAST ?? "1", + OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES: String(params.samples), + OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS: String(params.sampleTimeoutMs), OPENCLAW_QA_TELEGRAM_CANARY_TIMEOUT_MS: String(params.timeoutMs), OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS: String(params.timeoutMs), }; diff --git a/scripts/rtt.ts b/scripts/rtt.ts index 5ae096ab3b8..c35e5a9e929 100644 --- a/scripts/rtt.ts +++ b/scripts/rtt.ts @@ -20,10 +20,12 @@ import { const DEFAULT_SCENARIOS = ["telegram-mentioned-message-reply"]; const DEFAULT_PROVIDER_MODE = "mock-openai" satisfies RttProviderMode; const DEFAULT_TIMEOUT_MS = 180_000; +const DEFAULT_SAMPLES = 20; +const DEFAULT_SAMPLE_TIMEOUT_MS = 30_000; function usage() { return [ - "Usage: pnpm rtt [--provider mock-openai|live-frontier] [--runs N] [--timeout-ms N] [--harness-root PATH] [--output PATH]", + "Usage: pnpm rtt [--provider mock-openai|live-frontier] [--runs N] [--samples N] [--sample-timeout-ms N] [--timeout-ms N] [--harness-root PATH] [--output PATH]", "", "Examples:", " pnpm rtt openclaw@beta", @@ -61,6 +63,8 @@ function parseArgs(argv: string[]) { let spec: string | undefined; let providerMode = DEFAULT_PROVIDER_MODE; let runs = 1; + let samples = DEFAULT_SAMPLES; + let sampleTimeoutMs = DEFAULT_SAMPLE_TIMEOUT_MS; let harnessRoot = "~/Developer/clawdbot"; let output = "runs"; let timeoutMs = DEFAULT_TIMEOUT_MS; @@ -79,6 +83,14 @@ function parseArgs(argv: string[]) { runs = parsePositiveInt("--runs", argv[++index] ?? ""); continue; } + if (arg === "--samples") { + samples = parsePositiveInt("--samples", argv[++index] ?? ""); + continue; + } + if (arg === "--sample-timeout-ms") { + sampleTimeoutMs = parsePositiveInt("--sample-timeout-ms", argv[++index] ?? ""); + continue; + } if (arg === "--harness-root") { harnessRoot = argv[++index] ?? ""; if (!harnessRoot.trim()) { @@ -115,6 +127,8 @@ function parseArgs(argv: string[]) { options: { providerMode, runs, + samples, + sampleTimeoutMs, harnessRoot: path.resolve(resolveHome(harnessRoot)), output: path.resolve(resolveHome(output)), scenarios: DEFAULT_SCENARIOS, @@ -140,6 +154,8 @@ async function runOne(params: { baseEnv: process.env, providerMode: params.options.providerMode, rawOutputDir, + samples: params.options.samples, + sampleTimeoutMs: params.options.sampleTimeoutMs, scenarios: params.options.scenarios, spec: params.spec, timeoutMs: params.options.timeoutMs, diff --git a/test/fixtures/telegram-qa-summary-rtt.json b/test/fixtures/telegram-qa-summary-rtt.json index 36796aabb08..11f67ba30f5 100644 --- a/test/fixtures/telegram-qa-summary-rtt.json +++ b/test/fixtures/telegram-qa-summary-rtt.json @@ -24,8 +24,37 @@ "id": "telegram-mentioned-message-reply", "title": "Telegram mentioned message gets a reply", "status": "pass", - "details": "reply matched in 5678ms", - "rttMs": 5678 + "details": "3/3 warm samples passed", + "rttMs": 5000, + "samples": [ + { + "index": 1, + "status": "pass", + "details": "observed SUT message 101", + "rttMs": 4000 + }, + { + "index": 2, + "status": "pass", + "details": "observed SUT message 102", + "rttMs": 5000 + }, + { + "index": 3, + "status": "pass", + "details": "observed SUT message 103", + "rttMs": 7000 + } + ], + "stats": { + "total": 3, + "passed": 3, + "failed": 0, + "avgMs": 5333, + "p50Ms": 5000, + "p95Ms": 7000, + "maxMs": 7000 + } } ] } diff --git a/test/scripts/rtt-harness.test.ts b/test/scripts/rtt-harness.test.ts index f74805672c1..1f269213f55 100644 --- a/test/scripts/rtt-harness.test.ts +++ b/test/scripts/rtt-harness.test.ts @@ -52,6 +52,8 @@ describe("RTT harness", () => { }, providerMode: "mock-openai", rawOutputDir: ".artifacts/rtt/run/raw", + samples: 20, + sampleTimeoutMs: 30_000, scenarios: ["telegram-mentioned-message-reply"], spec: "openclaw@beta", timeoutMs: 180_000, @@ -65,6 +67,8 @@ describe("RTT harness", () => { expect(env.OPENCLAW_NPM_TELEGRAM_SCENARIOS).toBe("telegram-mentioned-message-reply"); expect(env.OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR).toBe(".artifacts/rtt/run/raw"); expect(env.OPENCLAW_NPM_TELEGRAM_FAST).toBe("0"); + expect(env.OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES).toBe("20"); + expect(env.OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS).toBe("30000"); expect(env.OPENCLAW_QA_TELEGRAM_CANARY_TIMEOUT_MS).toBe("180000"); expect(env.OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS).toBe("180000"); }); @@ -73,7 +77,13 @@ describe("RTT harness", () => { const summary = await readTelegramSummary(FIXTURE_PATH); expect(extractRtt(summary)).toEqual({ canaryMs: 1234, - mentionReplyMs: 5678, + mentionReplyMs: 5000, + warmSamples: [4000, 5000, 7000], + avgMs: 5333, + p50Ms: 5000, + p95Ms: 7000, + maxMs: 7000, + failedSamples: 0, }); }); @@ -103,8 +113,17 @@ describe("RTT harness", () => { providerMode: "mock-openai", scenarios: ["telegram-mentioned-message-reply"], }, - rtt: { canaryMs: 1234, mentionReplyMs: 5678 }, + rtt: { + canaryMs: 1234, + mentionReplyMs: 5000, + avgMs: 5333, + p50Ms: 5000, + p95Ms: 7000, + maxMs: 7000, + failedSamples: 0, + }, }); + expect(result.rtt.warmSamples).toEqual([4000, 5000, 7000]); }); it("marks failed scenario summaries as failed results", () => { @@ -150,6 +169,10 @@ describe("RTT harness", () => { "live-frontier", "--runs", "3", + "--samples", + "5", + "--sample-timeout-ms", + "30000", "--timeout-ms", "240000", "--harness-root", @@ -162,6 +185,8 @@ describe("RTT harness", () => { expect(parsed.options).toMatchObject({ providerMode: "live-frontier", runs: 3, + samples: 5, + sampleTimeoutMs: 30_000, harnessRoot: "/tmp/openclaw", output: "/tmp/runs", scenarios: ["telegram-mentioned-message-reply"],