fix(openai): accept realtime event aliases

Commit: 40aa57ba8f
Parent: ff045ea9ca
Author: Peter Steinberger
Date: 2026-05-10 05:20:40 +01:00

4 changed files with 90 additions and 10 deletions


@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
 ### Fixes
+- OpenAI/realtime voice: accept Codex-compatible legacy audio and transcript event aliases so provider protocol drift does not drop assistant audio or captions.
 - Discord/voice: keep default agent-proxy realtime sessions from auto-speaking filler before the forced OpenClaw consult answer, finish Discord playback on realtime response completion, and queue later exact-speech answers until playback idles to avoid mid-sentence replacement.
 - Gateway: return deterministic `400 invalid_request_error` responses for malformed encoded session-kill HTTP paths instead of letting route-shaped requests fall through to later Gateway handlers. (#72439) Thanks @rubencu.
 - OpenAI/realtime voice: honor disabled input-audio interruption locally so server VAD speech-start events do not clear Discord playback after operators set `interruptResponseOnInputAudio: false`.


@@ -1203,6 +1203,7 @@ Notes:
 - `agent-proxy` routes speech through `discord-voice`, which preserves normal owner/tool authorization for the speaker and target session but hides the agent `tts` tool because Discord voice owns playback. By default, `agent-proxy` gives the consult full owner-equivalent tool access for owner speakers (`voice.realtime.toolPolicy: "owner"`) and strongly prefers consulting the OpenClaw agent before substantive answers (`voice.realtime.consultPolicy: "always"`). In that default `always` mode, the realtime layer does not auto-speak filler before the consult answer; it captures and transcribes speech, then speaks the routed OpenClaw answer. If multiple forced consult answers finish while Discord is still playing the first answer, later exact-speech answers are queued until playback idles instead of replacing speech mid-sentence.
 - In `stt-tts` mode, STT uses `tools.media.audio`; `voice.model` does not affect transcription.
 - In realtime modes, `voice.realtime.provider`, `voice.realtime.model`, and `voice.realtime.voice` configure the realtime audio session. For OpenAI Realtime 2 plus the Codex brain, use `voice.realtime.model: "gpt-realtime-2"` and `voice.model: "openai-codex/gpt-5.5"`.
+- The OpenAI realtime provider accepts current Realtime 2 event names and legacy Codex-compatible aliases for output audio and transcript events, so compatible provider snapshots can drift without dropping assistant audio.
 - `voice.realtime.bargeIn` controls whether Discord speaker-start events interrupt active realtime playback. If unset, it follows the realtime provider's input-audio interruption setting.
 - `voice.realtime.minBargeInAudioEndMs` controls the minimum assistant playback duration before an OpenAI realtime barge-in truncates audio. Default: `250`. Set `0` for immediate interruption in low-echo rooms, or raise it for echo-heavy speaker setups.
 - For an OpenAI voice on Discord playback, set `voice.tts.provider: "openai"` and choose a text-to-speech voice under `voice.tts.openai.voice` or `voice.tts.providers.openai.voice`. `cedar` is a good masculine-sounding choice on the current OpenAI TTS model.
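
Taken together, these notes describe one coherent voice setup. The sketch below collects the keys mentioned above into a single `voice` section, written as a TypeScript object for illustration; the enclosing config file format, the exact nesting, and where `interruptResponseOnInputAudio` actually lives are assumptions, not something this diff confirms.

```typescript
// Illustrative sketch only: key names come from the notes above, but the
// config format and exact nesting are assumptions.
const voiceConfig = {
  voice: {
    model: "openai-codex/gpt-5.5", // Codex brain; ignored for stt-tts transcription
    realtime: {
      provider: "openai",
      model: "gpt-realtime-2",
      toolPolicy: "owner", // default: owner-equivalent tool access for owner speakers
      consultPolicy: "always", // default: always consult the OpenClaw agent first
      bargeIn: true, // left unset, it follows the provider's input-audio interruption setting
      minBargeInAudioEndMs: 250, // default; raise for echo-heavy speaker setups
      interruptResponseOnInputAudio: false, // assumed nesting for this operator setting
    },
    tts: {
      provider: "openai",
      openai: { voice: "cedar" }, // masculine-sounding voice on the current OpenAI TTS model
    },
  },
};
```

With `interruptResponseOnInputAudio: false`, server VAD speech-start events leave Discord playback alone, while `bargeIn` still governs whether Discord speaker-start events interrupt active playback.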


@@ -1059,6 +1059,73 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
     );
   });
+  it("forwards Codex-compatible legacy realtime audio and transcript events", async () => {
+    const provider = buildOpenAIRealtimeVoiceProvider();
+    const onAudio = vi.fn();
+    const onTranscript = vi.fn();
+    const bridge = provider.createBridge({
+      providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
+      onAudio,
+      onClearAudio: vi.fn(),
+      onTranscript,
+    });
+    const connecting = bridge.connect();
+    const socket = FakeWebSocket.instances[0];
+    if (!socket) {
+      throw new Error("expected bridge to create a websocket");
+    }
+    socket.readyState = FakeWebSocket.OPEN;
+    socket.emit("open");
+    socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
+    await connecting;
+    const audio = Buffer.from("legacy assistant audio");
+    socket.emit(
+      "message",
+      Buffer.from(
+        JSON.stringify({
+          type: "conversation.output_audio.delta",
+          data: audio.toString("base64"),
+          sample_rate: 24000,
+          channels: 1,
+        }),
+      ),
+    );
+    socket.emit(
+      "message",
+      Buffer.from(
+        JSON.stringify({
+          type: "conversation.input_transcript.delta",
+          delta: "partial user",
+        }),
+      ),
+    );
+    socket.emit(
+      "message",
+      Buffer.from(
+        JSON.stringify({
+          type: "conversation.output_transcript.delta",
+          delta: "partial assistant",
+        }),
+      ),
+    );
+    socket.emit(
+      "message",
+      Buffer.from(
+        JSON.stringify({
+          type: "response.output_text.done",
+          text: "final assistant text",
+        }),
+      ),
+    );
+    expect(onAudio).toHaveBeenCalledWith(audio);
+    expect(onTranscript).toHaveBeenCalledWith("user", "partial user", false);
+    expect(onTranscript).toHaveBeenCalledWith("assistant", "partial assistant", false);
+    expect(onTranscript).toHaveBeenCalledWith("assistant", "final assistant text", true);
+  });
   it("emits tool calls from realtime conversation item done events", async () => {
     const provider = buildOpenAIRealtimeVoiceProvider();
     const onToolCall = vi.fn();


@@ -117,6 +117,8 @@ function normalizeOpenAIRealtimeVoice(value: unknown): OpenAIRealtimeVoice | undefined {
 type RealtimeEvent = {
   type: string;
   delta?: string;
+  data?: string;
+  text?: string;
   transcript?: string;
   item_id?: string;
   call_id?: string;
@@ -854,12 +856,14 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
         this.responseCreateInFlight = false;
         return;
+      case "conversation.output_audio.delta":
       case "response.audio.delta":
       case "response.output_audio.delta": {
-        if (!event.delta) {
+        const audioDelta = event.delta ?? event.data;
+        if (!audioDelta) {
           return;
         }
-        const audio = base64ToBuffer(event.delta);
+        const audio = base64ToBuffer(audioDelta);
         this.config.onAudio(audio);
         if (event.item_id && event.item_id !== this.lastAssistantItemId) {
           this.lastAssistantItemId = event.item_id;
@@ -878,6 +882,8 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
         }
         return;
+      case "conversation.output_transcript.delta":
+      case "response.output_text.delta":
       case "response.audio_transcript.delta":
       case "response.output_audio_transcript.delta":
         if (event.delta) {
@@ -885,10 +891,21 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
         }
         return;
+      case "response.output_text.done":
       case "response.audio_transcript.done":
       case "response.output_audio_transcript.done":
-        if (event.transcript) {
-          this.config.onTranscript?.("assistant", event.transcript, true);
+        {
+          const transcript = event.transcript ?? event.text;
+          if (transcript) {
+            this.config.onTranscript?.("assistant", transcript, true);
+          }
         }
         return;
+      case "conversation.input_transcript.delta":
+      case "conversation.item.input_audio_transcription.delta":
+        if (event.delta) {
+          this.config.onTranscript?.("user", event.delta, false);
+        }
+        return;
@@ -898,12 +915,6 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
         }
         return;
-      case "conversation.item.input_audio_transcription.delta":
-        if (event.delta) {
-          this.config.onTranscript?.("user", event.delta, false);
-        }
-        return;
       case "response.cancelled":
       case "response.done":
         this.responseActive = false;
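
Viewed as a whole, the fix fans each legacy Codex-compatible event name into the existing `case` arm of its current Realtime 2 counterpart and falls back to the legacy payload fields (`data` for base64 audio, `text` for final transcripts). A table-driven normalization pass is an equivalent way to express the same mapping; the sketch below is illustrative, and the map and helper names are hypothetical rather than code from this repository.

```typescript
// Hypothetical sketch of the alias handling this commit adds via extra
// `case` labels; the map and helper are illustrative, not code from this repo.
type RealtimeEvent = {
  type: string;
  delta?: string;      // audio/transcript payload on current Realtime 2 events
  data?: string;       // base64 audio on the legacy conversation.* audio alias
  text?: string;       // final text on the legacy response.output_text.done alias
  transcript?: string; // final transcript on current *.done events
};

// Legacy Codex-compatible alias -> current Realtime 2 event name,
// mirroring the case labels added in the diff above.
const EVENT_ALIASES: Record<string, string> = {
  "conversation.output_audio.delta": "response.output_audio.delta",
  "conversation.output_transcript.delta": "response.output_audio_transcript.delta",
  "response.output_text.delta": "response.output_audio_transcript.delta",
  "response.output_text.done": "response.output_audio_transcript.done",
  "conversation.input_transcript.delta": "conversation.item.input_audio_transcription.delta",
};

// Normalize before dispatch so downstream handlers only see canonical shapes.
function canonicalize(event: RealtimeEvent): RealtimeEvent {
  return {
    ...event,
    type: EVENT_ALIASES[event.type] ?? event.type,
    delta: event.delta ?? event.data,           // legacy audio arrives in `data`
    transcript: event.transcript ?? event.text, // legacy finals arrive in `text`
  };
}

// Example: a legacy audio delta normalizes to the canonical event shape.
const normalized = canonicalize({
  type: "conversation.output_audio.delta",
  data: "bGVnYWN5IGF1ZGlv", // base64 payload
});
// normalized.type === "response.output_audio.delta"; normalized.delta holds the audio.
```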