From aa5a0a36f8a7540042e8e351388de47cb199326b Mon Sep 17 00:00:00 2001
From: Ayaan Zaidi <hi@obviy.us>
Date: Fri, 1 May 2026 16:32:52 +0530
Subject: [PATCH] test(rtt): expose warm sample metrics

---
 scripts/lib/rtt-harness.ts                 | 47 ++++++++++++++++++++--
 scripts/rtt.ts                             | 18 ++++++++-
 test/fixtures/telegram-qa-summary-rtt.json | 33 ++++++++++++++-
 test/scripts/rtt-harness.test.ts           | 29 ++++++++++++-
 4 files changed, 119 insertions(+), 8 deletions(-)

diff --git a/scripts/lib/rtt-harness.ts b/scripts/lib/rtt-harness.ts
index 237bc8bf57f..ddb81715919 100644
--- a/scripts/lib/rtt-harness.ts
+++ b/scripts/lib/rtt-harness.ts
@@ -10,6 +10,8 @@ export type RttProviderMode = "mock-openai" | "live-frontier";
 export type RttCliOptions = {
   providerMode: RttProviderMode;
   runs: number;
+  samples: number;
+  sampleTimeoutMs: number;
   harnessRoot: string;
   output: string;
   scenarios: string[];
@@ -35,6 +37,12 @@ export type RttResult = {
   rtt: {
     canaryMs?: number;
     mentionReplyMs?: number;
+    warmSamples?: number[];
+    avgMs?: number;
+    p50Ms?: number;
+    p95Ms?: number;
+    maxMs?: number;
+    failedSamples?: number;
   };
   artifacts: {
     rawSummaryPath: string;
@@ -49,6 +57,20 @@ export type TelegramQaSummary = {
     id?: string;
     rttMs?: number;
     status?: string;
+    samples?: Array<{
+      index?: number;
+      status?: string;
+      rttMs?: number;
+    }>;
+    stats?: {
+      total?: number;
+      passed?: number;
+      failed?: number;
+      avgMs?: number;
+      p50Ms?: number;
+      p95Ms?: number;
+      maxMs?: number;
+    };
   }>;
 };
 
@@ -82,11 +104,47 @@ export function buildRunId(params: { now: Date; spec: string; index?: number })
 
+/**
+ * Extracts round-trip-time metrics from a Telegram QA summary.
+ *
+ * - `canaryMs`: `rttMs` of the `telegram-canary` scenario.
+ * - `mentionReplyMs`: `stats.p50Ms` of the `telegram-mentioned-message-reply`
+ *   scenario, falling back to that scenario's own `rttMs`.
+ * - `warmSamples`: RTTs of its passing samples in ascending `index` order; the
+ *   key is omitted when no sample qualifies.
+ * - `avgMs`, `p50Ms`, `p95Ms`, `maxMs`, `failedSamples`: copied from the
+ *   scenario's `stats` when that object is present.
+ *
+ * @param summary - QA summary to read; a missing `scenarios` list is treated as empty.
+ * @returns The collected metrics; fields without a source value are `undefined`.
+ */
 export function extractRtt(summary: TelegramQaSummary) {
   const scenarios = summary.scenarios ?? [];
-  return {
+  const mention = scenarios.find((scenario) => scenario.id === "telegram-mentioned-message-reply");
+  // JSON summaries may carry `null` for a missing measurement, so test for an
+  // actual number instead of comparing against `undefined`.
+  const warmSamples = (mention?.samples ?? [])
+    .flatMap((sample) =>
+      sample.status === "pass" && typeof sample.rttMs === "number"
+        ? [{ index: sample.index ?? 0, rttMs: sample.rttMs }]
+        : [],
+    )
+    .sort((left, right) => left.index - right.index)
+    .map((sample) => sample.rttMs);
+  const rtt: RttResult["rtt"] = {
     canaryMs: scenarios.find((scenario) => scenario.id === "telegram-canary")?.rttMs,
-    mentionReplyMs: scenarios.find((scenario) => scenario.id === "telegram-mentioned-message-reply")
-      ?.rttMs,
+    mentionReplyMs: mention?.stats?.p50Ms ?? mention?.rttMs,
   };
+  // Omit the key entirely when there is nothing to report.
+  if (warmSamples.length > 0) {
+    rtt.warmSamples = warmSamples;
+  }
+  if (mention?.stats) {
+    rtt.avgMs = mention.stats.avgMs;
+    rtt.p50Ms = mention.stats.p50Ms;
+    rtt.p95Ms = mention.stats.p95Ms;
+    rtt.maxMs = mention.stats.maxMs;
+    rtt.failedSamples = mention.stats.failed;
+  }
+  return rtt;
 }
 
 export function createHarnessEnv(params: {
@@ -96,6 +133,8 @@ export function createHarnessEnv(params: {
   spec: string;
   version: string;
   rawOutputDir: string;
+  samples: number;
+  sampleTimeoutMs: number;
   timeoutMs: number;
 }) {
   return {
@@ -106,6 +145,8 @@ export function createHarnessEnv(params: {
     OPENCLAW_NPM_TELEGRAM_SCENARIOS: params.scenarios.join(","),
     OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR: params.rawOutputDir,
     OPENCLAW_NPM_TELEGRAM_FAST: params.baseEnv.OPENCLAW_NPM_TELEGRAM_FAST ?? "1",
+    OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES: String(params.samples),
+    OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS: String(params.sampleTimeoutMs),
     OPENCLAW_QA_TELEGRAM_CANARY_TIMEOUT_MS: String(params.timeoutMs),
     OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS: String(params.timeoutMs),
   };
diff --git a/scripts/rtt.ts b/scripts/rtt.ts
index 5ae096ab3b8..c35e5a9e929 100644
--- a/scripts/rtt.ts
+++ b/scripts/rtt.ts
@@ -20,10 +20,12 @@ import {
 const DEFAULT_SCENARIOS = ["telegram-mentioned-message-reply"];
 const DEFAULT_PROVIDER_MODE = "mock-openai" satisfies RttProviderMode;
 const DEFAULT_TIMEOUT_MS = 180_000;
+const DEFAULT_SAMPLES = 20;
+const DEFAULT_SAMPLE_TIMEOUT_MS = 30_000;
 
 function usage() {
   return [
-    "Usage: pnpm rtt <openclaw@spec> [--provider mock-openai|live-frontier] [--runs N] [--timeout-ms N] [--harness-root PATH] [--output PATH]",
+    "Usage: pnpm rtt <openclaw@spec> [--provider mock-openai|live-frontier] [--runs N] [--samples N] [--sample-timeout-ms N] [--timeout-ms N] [--harness-root PATH] [--output PATH]",
     "",
     "Examples:",
     "  pnpm rtt openclaw@beta",
@@ -61,6 +63,8 @@ function parseArgs(argv: string[]) {
   let spec: string | undefined;
   let providerMode = DEFAULT_PROVIDER_MODE;
   let runs = 1;
+  let samples = DEFAULT_SAMPLES;
+  let sampleTimeoutMs = DEFAULT_SAMPLE_TIMEOUT_MS;
   let harnessRoot = "~/Developer/clawdbot";
   let output = "runs";
   let timeoutMs = DEFAULT_TIMEOUT_MS;
@@ -79,6 +83,14 @@ function parseArgs(argv: string[]) {
       runs = parsePositiveInt("--runs", argv[++index] ?? "");
       continue;
     }
+    if (arg === "--samples") {
+      samples = parsePositiveInt("--samples", argv[++index] ?? "");
+      continue;
+    }
+    if (arg === "--sample-timeout-ms") {
+      sampleTimeoutMs = parsePositiveInt("--sample-timeout-ms", argv[++index] ?? "");
+      continue;
+    }
     if (arg === "--harness-root") {
       harnessRoot = argv[++index] ?? "";
       if (!harnessRoot.trim()) {
@@ -115,6 +127,8 @@ function parseArgs(argv: string[]) {
     options: {
       providerMode,
       runs,
+      samples,
+      sampleTimeoutMs,
       harnessRoot: path.resolve(resolveHome(harnessRoot)),
       output: path.resolve(resolveHome(output)),
       scenarios: DEFAULT_SCENARIOS,
@@ -140,6 +154,8 @@ async function runOne(params: {
     baseEnv: process.env,
     providerMode: params.options.providerMode,
     rawOutputDir,
+    samples: params.options.samples,
+    sampleTimeoutMs: params.options.sampleTimeoutMs,
     scenarios: params.options.scenarios,
     spec: params.spec,
     timeoutMs: params.options.timeoutMs,
diff --git a/test/fixtures/telegram-qa-summary-rtt.json b/test/fixtures/telegram-qa-summary-rtt.json
index 36796aabb08..11f67ba30f5 100644
--- a/test/fixtures/telegram-qa-summary-rtt.json
+++ b/test/fixtures/telegram-qa-summary-rtt.json
@@ -24,8 +24,37 @@
       "id": "telegram-mentioned-message-reply",
       "title": "Telegram mentioned message gets a reply",
       "status": "pass",
-      "details": "reply matched in 5678ms",
-      "rttMs": 5678
+      "details": "3/3 warm samples passed",
+      "rttMs": 5000,
+      "samples": [
+        {
+          "index": 1,
+          "status": "pass",
+          "details": "observed SUT message 101",
+          "rttMs": 4000
+        },
+        {
+          "index": 2,
+          "status": "pass",
+          "details": "observed SUT message 102",
+          "rttMs": 5000
+        },
+        {
+          "index": 3,
+          "status": "pass",
+          "details": "observed SUT message 103",
+          "rttMs": 7000
+        }
+      ],
+      "stats": {
+        "total": 3,
+        "passed": 3,
+        "failed": 0,
+        "avgMs": 5333,
+        "p50Ms": 5000,
+        "p95Ms": 7000,
+        "maxMs": 7000
+      }
     }
   ]
 }
diff --git a/test/scripts/rtt-harness.test.ts b/test/scripts/rtt-harness.test.ts
index f74805672c1..1f269213f55 100644
--- a/test/scripts/rtt-harness.test.ts
+++ b/test/scripts/rtt-harness.test.ts
@@ -52,6 +52,8 @@ describe("RTT harness", () => {
       },
       providerMode: "mock-openai",
       rawOutputDir: ".artifacts/rtt/run/raw",
+      samples: 20,
+      sampleTimeoutMs: 30_000,
       scenarios: ["telegram-mentioned-message-reply"],
       spec: "openclaw@beta",
       timeoutMs: 180_000,
@@ -65,6 +67,8 @@ describe("RTT harness", () => {
     expect(env.OPENCLAW_NPM_TELEGRAM_SCENARIOS).toBe("telegram-mentioned-message-reply");
     expect(env.OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR).toBe(".artifacts/rtt/run/raw");
     expect(env.OPENCLAW_NPM_TELEGRAM_FAST).toBe("0");
+    expect(env.OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES).toBe("20");
+    expect(env.OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS).toBe("30000");
     expect(env.OPENCLAW_QA_TELEGRAM_CANARY_TIMEOUT_MS).toBe("180000");
     expect(env.OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS).toBe("180000");
   });
@@ -73,7 +77,13 @@ describe("RTT harness", () => {
     const summary = await readTelegramSummary(FIXTURE_PATH);
     expect(extractRtt(summary)).toEqual({
       canaryMs: 1234,
-      mentionReplyMs: 5678,
+      mentionReplyMs: 5000,
+      warmSamples: [4000, 5000, 7000],
+      avgMs: 5333,
+      p50Ms: 5000,
+      p95Ms: 7000,
+      maxMs: 7000,
+      failedSamples: 0,
     });
   });
 
@@ -103,8 +113,17 @@ describe("RTT harness", () => {
         providerMode: "mock-openai",
         scenarios: ["telegram-mentioned-message-reply"],
       },
-      rtt: { canaryMs: 1234, mentionReplyMs: 5678 },
+      rtt: {
+        canaryMs: 1234,
+        mentionReplyMs: 5000,
+        avgMs: 5333,
+        p50Ms: 5000,
+        p95Ms: 7000,
+        maxMs: 7000,
+        failedSamples: 0,
+      },
     });
+    expect(result.rtt.warmSamples).toEqual([4000, 5000, 7000]);
   });
 
   it("marks failed scenario summaries as failed results", () => {
@@ -150,6 +169,10 @@ describe("RTT harness", () => {
       "live-frontier",
       "--runs",
       "3",
+      "--samples",
+      "5",
+      "--sample-timeout-ms",
+      "30000",
       "--timeout-ms",
       "240000",
       "--harness-root",
@@ -162,6 +185,8 @@ describe("RTT harness", () => {
     expect(parsed.options).toMatchObject({
       providerMode: "live-frontier",
       runs: 3,
+      samples: 5,
+      sampleTimeoutMs: 30_000,
       harnessRoot: "/tmp/openclaw",
       output: "/tmp/runs",
       scenarios: ["telegram-mentioned-message-reply"],