test: tighten qa live scenarios

This commit is contained in:
Peter Steinberger
2026-04-11 00:56:50 +01:00
parent 85ee6f2967
commit 58531530d9
3 changed files with 13 additions and 4 deletions

View File

@@ -57,6 +57,9 @@ steps:
- set: wakeMarker
value:
expr: "`QA-RESTART-${randomUUID().slice(0, 8)}`"
- set: wakeStartIndex
value:
expr: "state.getSnapshot().messages.length"
- call: applyConfig
args:
- env:
@@ -65,6 +68,8 @@ steps:
ref: nextConfig
sessionKey:
ref: sessionKey
deliveryContext:
expr: "({ channel: 'qa-channel', to: `channel:${config.channelId}` })"
note:
ref: wakeMarker
- try:
@@ -99,6 +104,8 @@ steps:
params: [candidate]
expr: "candidate.text.includes(wakeMarker)"
- 60000
- sinceIndex:
ref: wakeStartIndex
catchAs: wakeError
catch:
- throw:

View File

@@ -8,7 +8,7 @@ objective: Verify the agent degrades gracefully when memory tools are unavailabl
successCriteria:
- Memory tools are absent from the effective tool inventory.
- Agent does not hallucinate the hidden fact.
- Agent says it could not confirm and surfaces the limitation.
- Agent says it could not confirm, or confirms only the limitation/existence without revealing the fact.
docsRefs:
- docs/concepts/memory.md
- docs/tools/index.md
@@ -39,6 +39,8 @@ execution:
- won't reveal
- wont reveal
- will not reveal
- confirmed: the hidden fact is present
- hidden fact is present
```
```yaml qa-flow

View File

@@ -19,8 +19,8 @@ execution:
kind: flow
summary: Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior.
config:
initialPrompt: "Read QA_KICKOFF_TASK.md and summarize the QA mission in one clause before any model switch."
followupPrompt: "Switch models now. Tool continuity check: reread QA_KICKOFF_TASK.md and mention the handoff in one short sentence."
initialPrompt: "Read repo/qa/scenarios/index.md and summarize the QA scenario pack mission in one clause before any model switch."
followupPrompt: "The harness has already requested the alternate model for this turn. Do not call session_status or change models yourself. Tool continuity check: use the read tool to reread repo/qa/scenarios/index.md, then mention the model handoff and QA mission in one short sentence."
promptSnippet: "Tool continuity check"
```
@@ -64,7 +64,7 @@ steps:
args:
- lambda:
expr: "state.getSnapshot().messages.slice(beforeSwitchCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && hasModelSwitchContinuityEvidence(candidate.text)).at(-1)"
- 10000
- expr: resolveQaLiveTurnTimeoutMs(env, 20000, env.alternateModel)
- assert:
expr: hasModelSwitchContinuityEvidence(outbound.text)
message: