chore: add positive proof labels (#78117)

2026-05-06 15:18:58 +00:00 · 2026-05-05 16:10:17 -07:00
parent a4c860a70c
commit 33c42c8d3b
5 changed files with 195 additions and 11 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai

 ### Changes

+- PR triage: mark external pull requests with `proof: supplied` when Barnacle finds structured real behavior proof, clear stale negative proof labels even when PR bodies are edited with CRLF line endings, and let ClawSweeper own the stronger `proof: sufficient` judgement.
 - Google Meet/Voice Call: make Twilio dial-in joins speak through the realtime Gemini voice bridge with paced audio streaming, backpressure-aware buffering, barge-in queue clearing, same-session agent consult routing, duplicate-consult coalescing, and no TwiML fallback during realtime speech, giving Meet participants a much snappier OpenClaw voice agent. (#77064) Thanks @scoootscooob.
 - Voice Call/realtime: add opt-in OpenClaw agent voice context capsules and consult-cadence guidance so Gemini/OpenAI realtime calls can sound like the configured agent without consulting the full agent on every ordinary turn. Thanks @scoootscooob.
 - Docker/Gateway: harden the gateway container by dropping `NET_RAW` and `NET_ADMIN` capabilities and enabling `no-new-privileges` in the bundled `docker-compose.yml`. Thanks @VintageAyu.
--- a/scripts/github/barnacle-auto-response.mjs
+++ b/scripts/github/barnacle-auto-response.mjs
@@ -4,6 +4,8 @@ import {
  MOCK_ONLY_PROOF_LABEL,
  NEEDS_REAL_BEHAVIOR_PROOF_LABEL,
  PROOF_OVERRIDE_LABEL,
+  PROOF_SUFFICIENT_LABEL,
+  PROOF_SUPPLIED_LABEL,
  evaluateRealBehaviorProof,
  labelsForRealBehaviorProof,
 } from "./real-behavior-proof-policy.mjs";
@@ -150,6 +152,14 @@ export const managedLabelSpecs = {
    color: "C5DEF5",
    description: "Candidate: PR proof only shows tests, mocks, snapshots, lint, typecheck, or CI.",
  },
+  [PROOF_SUPPLIED_LABEL]: {
+    color: "C2E0C6",
+    description: "External PR includes structured after-fix real behavior proof.",
+  },
+  [PROOF_SUFFICIENT_LABEL]: {
+    color: "0E8A16",
+    description: "ClawSweeper judged the real behavior proof convincing.",
+  },
  [PROOF_OVERRIDE_LABEL]: {
    color: "C2E0C6",
    description: "Maintainer override for the external PR real behavior proof gate.",
@@ -218,7 +228,11 @@ const maintainerAuthorLabel = "maintainer";
 const privilegedAuthorAssociations = new Set(["OWNER", "MEMBER", "COLLABORATOR"]);
 const privilegedRepositoryRoles = new Set(["admin", "maintain", "write"]);
 const candidateLabelValues = Object.values(candidateLabels);
-const proofCandidateLabelValues = [NEEDS_REAL_BEHAVIOR_PROOF_LABEL, MOCK_ONLY_PROOF_LABEL];
+const structuralProofLabelValues = [
+  NEEDS_REAL_BEHAVIOR_PROOF_LABEL,
+  MOCK_ONLY_PROOF_LABEL,
+  PROOF_SUPPLIED_LABEL,
+];
 const noisyPrMessage =
  "Closing this PR because it looks dirty (too many unrelated or unexpected changes). This usually happens when a branch picks up unrelated commits or a merge went sideways. Please recreate the PR from a clean branch.";

@@ -759,8 +773,21 @@ async function addMissingLabels(github, context, core, issueNumber, labels, labe
  core.info(`Added candidate labels to #${issueNumber}: ${missingLabels.join(", ")}`);
 }

+function shouldRemoveProofSufficientLabel(context, proofEvaluation) { // Returns true when an existing PROOF_SUFFICIENT_LABEL should be treated as stale.
+  if (proofEvaluation.status !== "passed") { // any evaluation status other than "passed" always removes the label
+    return true;
+  }
+  return ["edited", "synchronize"].includes(context.payload.action); // status is "passed": remove only on "edited" / "synchronize" event actions
+}
+
 async function applyPullRequestCandidateLabels(github, context, core, pullRequest, labelSet) {
  const files = await listPullRequestFiles(github, context, pullRequest);
+  const proofEvaluation = evaluateRealBehaviorProof({
+    pullRequest: {
+      ...pullRequest,
+      labels: [...labelSet].map((name) => ({ name })),
+    },
+  });
  const classifiedLabels = classifyPullRequestCandidateLabels(
    {
      ...pullRequest,
@@ -768,9 +795,15 @@ async function applyPullRequestCandidateLabels(github, context, core, pullReques
    },
    files,
  );
-  const staleProofLabels = proofCandidateLabelValues.filter(
+  const staleProofLabels = structuralProofLabelValues.filter(
    (label) => labelSet.has(label) && !classifiedLabels.includes(label),
  );
+  if (
+    labelSet.has(PROOF_SUFFICIENT_LABEL) &&
+    shouldRemoveProofSufficientLabel(context, proofEvaluation)
+  ) {
+    staleProofLabels.push(PROOF_SUFFICIENT_LABEL);
+  }
  await removeLabels(github, context, pullRequest.number, staleProofLabels, labelSet);
  await addMissingLabels(github, context, core, pullRequest.number, classifiedLabels, labelSet);
 }
--- a/scripts/github/real-behavior-proof-policy.mjs
+++ b/scripts/github/real-behavior-proof-policy.mjs
@@ -1,4 +1,6 @@
 export const PROOF_OVERRIDE_LABEL = "proof: override";
+export const PROOF_SUPPLIED_LABEL = "proof: supplied";
+export const PROOF_SUFFICIENT_LABEL = "proof: sufficient";
 export const NEEDS_REAL_BEHAVIOR_PROOF_LABEL = "triage: needs-real-behavior-proof";
 export const MOCK_ONLY_PROOF_LABEL = "triage: mock-only-proof";

@@ -75,6 +77,10 @@ function escapeRegex(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
 }

+function normalizeLineEndings(text = "") { // Rewrites CRLF and lone CR line endings to LF; returns the new string.
+  return text.replace(/\r\n?/g, "\n"); // NOTE(review): the `= ""` default applies only to undefined; a null `text` would throw here — confirm callers never pass null
+}
+
 function labelNames(labels) {
  return new Set(
    (labels ?? [])
@@ -106,13 +112,14 @@ export function hasProofOverride(labels) {
 }

 export function extractRealBehaviorProofSection(body = "") {
+  const normalizedBody = normalizeLineEndings(body);
  const headingRegex = /^#{2,6}\s+real behavior proof\b[^\n]*$/gim;
-  const match = headingRegex.exec(body);
+  const match = headingRegex.exec(normalizedBody);
  if (!match) {
    return "";
  }
  const sectionStart = match.index + match[0].length;
-  const rest = body.slice(sectionStart);
+  const rest = normalizedBody.slice(sectionStart);
  const nextHeading = rest.match(/\n#{1,6}\s+\S/);
  return (nextHeading ? rest.slice(0, nextHeading.index) : rest).trim();
 }
@@ -129,7 +136,7 @@ function isAnyProofFieldLine(line) {
 }

 function extractFieldValue(section, field) {
-  const lines = section.split("\n");
+  const lines = normalizeLineEndings(section).split("\n");
  for (let index = 0; index < lines.length; index += 1) {
    const matchingName = field.names.find((name) => fieldLineRegex(name).test(lines[index]));
    if (!matchingName) {
@@ -151,7 +158,7 @@ function extractFieldValue(section, field) {
 }

 function stripProofFieldLabels(section) {
-  return section
+  return normalizeLineEndings(section)
    .split("\n")
    .map((line) => {
      if (!isAnyProofFieldLine(line)) {
@@ -274,6 +281,9 @@ export function evaluateRealBehaviorProof({ pullRequest, labels } = {}) {
 }

 export function labelsForRealBehaviorProof(evaluation) {
+  if (evaluation.status === "passed") {
+    return [PROOF_SUPPLIED_LABEL];
+  }
  if (evaluation.status === "mock_only") {
    return [MOCK_ONLY_PROOF_LABEL];
  }
--- a/test/scripts/barnacle-auto-response.test.ts
+++ b/test/scripts/barnacle-auto-response.test.ts
@@ -5,6 +5,10 @@ import {
  managedLabelSpecs,
  runBarnacleAutoResponse,
 } from "../../scripts/github/barnacle-auto-response.mjs";
+import {
+  PROOF_SUFFICIENT_LABEL,
+  PROOF_SUPPLIED_LABEL,
+} from "../../scripts/github/real-behavior-proof-policy.mjs";

 const blankTemplateBody = [
  "## Summary",
@@ -227,6 +231,8 @@ describe("barnacle-auto-response", () => {
    expect(managedLabelSpecs["r: false-positive"].description).toContain("false positive");
    expect(managedLabelSpecs["r: third-party-extension"].description).toContain("ClawHub");
    expect(managedLabelSpecs["r: too-many-prs"].description).toContain("twenty active PRs");
+    expect(managedLabelSpecs[PROOF_SUPPLIED_LABEL].color).toBe("C2E0C6");
+    expect(managedLabelSpecs[PROOF_SUFFICIENT_LABEL].color).toBe("0E8A16");

    for (const label of Object.values(candidateLabels)) {
      expect(managedLabelSpecs[label]).toBeDefined();
@@ -283,7 +289,7 @@ describe("barnacle-auto-response", () => {
    expect(labels).not.toContain(candidateLabels.needsRealBehaviorProof);
  });

-  it("does not label external PRs that include real behavior proof", () => {
+  it("labels external PRs that include real behavior proof as supplied", () => {
    const labels = classifyPullRequestCandidateLabels(
      pr(
        "Fix gateway startup",
@@ -292,6 +298,23 @@ describe("barnacle-auto-response", () => {
      [file("src/gateway/server.ts")],
    );

+    expect(labels).toContain(PROOF_SUPPLIED_LABEL);
+    expect(labels).not.toContain(candidateLabels.needsRealBehaviorProof);
+    expect(labels).not.toContain(candidateLabels.mockOnlyProof);
+  });
+
+  it("labels CRLF-formatted external PRs with screenshot proof as supplied", () => {
+    const labels = classifyPullRequestCandidateLabels(
+      pr(
+        "Fix gateway startup",
+        realBehaviorProofBody(
+          "![after](https://github.com/user-attachments/assets/gateway-ready)",
+        ).replace(/\n/g, "\r\n"),
+      ),
+      [file("src/gateway/server.ts")],
+    );
+
+    expect(labels).toContain(PROOF_SUPPLIED_LABEL);
    expect(labels).not.toContain(candidateLabels.needsRealBehaviorProof);
    expect(labels).not.toContain(candidateLabels.mockOnlyProof);
  });
@@ -662,18 +685,115 @@ describe("barnacle-auto-response", () => {

    await runBarnacleAutoResponse({
      github,
-      context: barnacleContext({}, [candidateLabels.needsRealBehaviorProof, "proof: override"]),
+      context: barnacleContext({}, [
+        candidateLabels.needsRealBehaviorProof,
+        candidateLabels.mockOnlyProof,
+        PROOF_SUPPLIED_LABEL,
+        PROOF_SUFFICIENT_LABEL,
+        "proof: override",
+      ]),
      core: {
        info: () => undefined,
      },
    });

-    expect(calls.removeLabel).toContainEqual(
-      expect.objectContaining({ name: candidateLabels.needsRealBehaviorProof }),
+    expect(calls.removeLabel.map((call) => call.name)).toEqual(
+      expect.arrayContaining([
+        candidateLabels.needsRealBehaviorProof,
+        candidateLabels.mockOnlyProof,
+        PROOF_SUPPLIED_LABEL,
+        PROOF_SUFFICIENT_LABEL,
+      ]),
    );
    expect(calls.update).toEqual([]);
  });

+  it("removes stale negative proof labels and adds supplied when proof is present", async () => {
+    const { calls, github } = barnacleGithub([file("src/gateway/server.ts")]);
+
+    await runBarnacleAutoResponse({
+      github,
+      context: barnacleContext(
+        {
+          body: realBehaviorProofBody(
+            "![after](https://github.com/user-attachments/assets/gateway-ready)",
+          ),
+        },
+        [candidateLabels.needsRealBehaviorProof, candidateLabels.mockOnlyProof],
+      ),
+      core: {
+        info: () => undefined,
+      },
+    });
+
+    expect(calls.removeLabel.map((call) => call.name)).toEqual(
+      expect.arrayContaining([
+        candidateLabels.needsRealBehaviorProof,
+        candidateLabels.mockOnlyProof,
+      ]),
+    );
+    expect(calls.addLabels).toContainEqual(
+      expect.objectContaining({
+        labels: expect.arrayContaining([PROOF_SUPPLIED_LABEL]),
+      }),
+    );
+  });
+
+  it.each(["edited", "synchronize"])(
+    "removes stale sufficient proof label after PR %s events",
+    async (action) => {
+      const { calls, github } = barnacleGithub([file("src/gateway/server.ts")]);
+
+      await runBarnacleAutoResponse({
+        github,
+        context: barnacleContext(
+          {
+            body: realBehaviorProofBody(
+              "![after](https://github.com/user-attachments/assets/gateway-ready)",
+            ),
+          },
+          [PROOF_SUPPLIED_LABEL, PROOF_SUFFICIENT_LABEL],
+          { action },
+        ),
+        core: {
+          info: () => undefined,
+        },
+      });
+
+      expect(calls.removeLabel).toContainEqual(
+        expect.objectContaining({ name: PROOF_SUFFICIENT_LABEL }),
+      );
+    },
+  );
+
+  it("preserves ClawSweeper's sufficient proof label on ordinary label events", async () => {
+    const { calls, github } = barnacleGithub([file("src/gateway/server.ts")]);
+
+    await runBarnacleAutoResponse({
+      github,
+      context: barnacleContext(
+        {
+          body: realBehaviorProofBody(
+            "![after](https://github.com/user-attachments/assets/gateway-ready)",
+          ),
+        },
+        [PROOF_SUPPLIED_LABEL, PROOF_SUFFICIENT_LABEL],
+        {
+          action: "labeled",
+          label: { name: PROOF_SUFFICIENT_LABEL },
+          sender: { login: "openclaw-clawsweeper[bot]", type: "Bot" },
+        },
+      ),
+      core: {
+        info: () => undefined,
+      },
+    });
+
+    expect(calls.removeLabel).not.toContainEqual(
+      expect.objectContaining({ name: PROOF_SUFFICIENT_LABEL }),
+    );
+  });
+
  it("actions manually applied candidate labels", async () => {
    const { calls, github } = barnacleGithub([file("extensions/example/openclaw.plugin.json")]);

--- a/test/scripts/real-behavior-proof-policy.test.ts
+++ b/test/scripts/real-behavior-proof-policy.test.ts
@@ -3,6 +3,7 @@ import {
  MOCK_ONLY_PROOF_LABEL,
  NEEDS_REAL_BEHAVIOR_PROOF_LABEL,
  PROOF_OVERRIDE_LABEL,
+  PROOF_SUPPLIED_LABEL,
  evaluateRealBehaviorProof,
  labelsForRealBehaviorProof,
 } from "../../scripts/github/real-behavior-proof-policy.mjs";
@@ -56,7 +57,26 @@ describe("real-behavior-proof-policy", () => {
    });

    expect(evaluation.status).toBe("passed");
-    expect(labelsForRealBehaviorProof(evaluation)).toEqual([]);
+    expect(labelsForRealBehaviorProof(evaluation)).toEqual([PROOF_SUPPLIED_LABEL]);
+  });
+
+  it("passes CRLF-formatted external PRs with screenshot proof", () => {
+    const evaluation = evaluateRealBehaviorProof({
+      pullRequest: externalPr(
+        proofBody("![after](https://github.com/user-attachments/assets/gateway-ready)").replace(
+          /\n/g,
+          "\r\n",
+        ),
+      ),
+    });
+
+    expect(evaluation.status).toBe("passed");
+    expect(evaluation.fields).toMatchObject({
+      behavior: "Gateway startup no longer drops the configured Discord channel.",
+      evidence: "![after](https://github.com/user-attachments/assets/gateway-ready)",
+      observedResult: "The gateway stayed connected and the Discord channel showed ready.",
+    });
+    expect(labelsForRealBehaviorProof(evaluation)).toEqual([PROOF_SUPPLIED_LABEL]);
  });

  it("fails external PRs without a real behavior proof section", () => {