diff --git a/CHANGELOG.md b/CHANGELOG.md
index 76f50be0e21..d26f7638bb7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
 
 ### Fixes
 
+- OpenAI/realtime voice: accept Codex-compatible legacy audio and transcript event aliases so provider protocol drift does not drop assistant audio or captions.
 - Discord/voice: keep default agent-proxy realtime sessions from auto-speaking filler before the forced OpenClaw consult answer, finish Discord playback on realtime response completion, and queue later exact-speech answers until playback idles to avoid mid-sentence replacement.
 - Gateway: return deterministic `400 invalid_request_error` responses for malformed encoded session-kill HTTP paths instead of letting route-shaped requests fall through to later Gateway handlers. (#72439) Thanks @rubencu.
 - OpenAI/realtime voice: honor disabled input-audio interruption locally so server VAD speech-start events do not clear Discord playback after operators set `interruptResponseOnInputAudio: false`.
diff --git a/docs/channels/discord.md b/docs/channels/discord.md
index c41fe988502..e1ef0edf4f8 100644
--- a/docs/channels/discord.md
+++ b/docs/channels/discord.md
@@ -1203,6 +1203,7 @@ Notes:
 - `agent-proxy` routes speech through `discord-voice`, which preserves normal owner/tool authorization for the speaker and target session but hides the agent `tts` tool because Discord voice owns playback. By default, `agent-proxy` gives the consult full owner-equivalent tool access for owner speakers (`voice.realtime.toolPolicy: "owner"`) and strongly prefers consulting the OpenClaw agent before substantive answers (`voice.realtime.consultPolicy: "always"`). In that default `always` mode, the realtime layer does not auto-speak filler before the consult answer; it captures and transcribes speech, then speaks the routed OpenClaw answer. If multiple forced consult answers finish while Discord is still playing the first answer, later exact-speech answers are queued until playback idles instead of replacing speech mid-sentence.
 - In `stt-tts` mode, STT uses `tools.media.audio`; `voice.model` does not affect transcription.
 - In realtime modes, `voice.realtime.provider`, `voice.realtime.model`, and `voice.realtime.voice` configure the realtime audio session. For OpenAI Realtime 2 plus the Codex brain, use `voice.realtime.model: "gpt-realtime-2"` and `voice.model: "openai-codex/gpt-5.5"`.
+- The OpenAI realtime provider accepts current Realtime 2 event names and legacy Codex-compatible aliases for output audio and transcript events, so compatible provider snapshots can drift without dropping assistant audio.
 - `voice.realtime.bargeIn` controls whether Discord speaker-start events interrupt active realtime playback. If unset, it follows the realtime provider's input-audio interruption setting.
 - `voice.realtime.minBargeInAudioEndMs` controls the minimum assistant playback duration before an OpenAI realtime barge-in truncates audio. Default: `250`. Set `0` for immediate interruption in low-echo rooms, or raise it for echo-heavy speaker setups.
 - For an OpenAI voice on Discord playback, set `voice.tts.provider: "openai"` and choose a Text-to-speech voice under `voice.tts.openai.voice` or `voice.tts.providers.openai.voice`. `cedar` is a good masculine-sounding choice on the current OpenAI TTS model.
diff --git a/extensions/openai/realtime-voice-provider.test.ts b/extensions/openai/realtime-voice-provider.test.ts
index ea7bfb457b2..f67fa7f0bb8 100644
--- a/extensions/openai/realtime-voice-provider.test.ts
+++ b/extensions/openai/realtime-voice-provider.test.ts
@@ -1059,6 +1059,73 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
     );
   });
 
+  it("forwards Codex-compatible legacy realtime audio and transcript events", async () => {
+    const provider = buildOpenAIRealtimeVoiceProvider();
+    const onAudio = vi.fn();
+    const onTranscript = vi.fn();
+    const bridge = provider.createBridge({
+      providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
+      onAudio,
+      onClearAudio: vi.fn(),
+      onTranscript,
+    });
+    const connecting = bridge.connect();
+    const socket = FakeWebSocket.instances[0];
+    if (!socket) {
+      throw new Error("expected bridge to create a websocket");
+    }
+
+    socket.readyState = FakeWebSocket.OPEN;
+    socket.emit("open");
+    socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
+    await connecting;
+
+    const audio = Buffer.from("legacy assistant audio");
+    socket.emit(
+      "message",
+      Buffer.from(
+        JSON.stringify({
+          type: "conversation.output_audio.delta",
+          data: audio.toString("base64"),
+          sample_rate: 24000,
+          channels: 1,
+        }),
+      ),
+    );
+    socket.emit(
+      "message",
+      Buffer.from(
+        JSON.stringify({
+          type: "conversation.input_transcript.delta",
+          delta: "partial user",
+        }),
+      ),
+    );
+    socket.emit(
+      "message",
+      Buffer.from(
+        JSON.stringify({
+          type: "conversation.output_transcript.delta",
+          delta: "partial assistant",
+        }),
+      ),
+    );
+    socket.emit(
+      "message",
+      Buffer.from(
+        JSON.stringify({
+          type: "response.output_text.done",
+          text: "final assistant text",
+        }),
+      ),
+    );
+
+    expect(onAudio).toHaveBeenCalledWith(audio);
+    expect(onTranscript).toHaveBeenCalledWith("user", "partial user", false);
+    expect(onTranscript).toHaveBeenCalledWith("assistant", "partial assistant", false);
+    expect(onTranscript).toHaveBeenCalledWith("assistant", "final assistant text", true);
+  });
+
   it("emits tool calls from realtime conversation item done events", async () => {
     const provider = buildOpenAIRealtimeVoiceProvider();
     const onToolCall = vi.fn();
diff --git a/extensions/openai/realtime-voice-provider.ts b/extensions/openai/realtime-voice-provider.ts
index 471b87e8dd4..4d6df6cff2c 100644
--- a/extensions/openai/realtime-voice-provider.ts
+++ b/extensions/openai/realtime-voice-provider.ts
@@ -117,6 +117,8 @@ function normalizeOpenAIRealtimeVoice(value: unknown): OpenAIRealtimeVoice | und
 type RealtimeEvent = {
   type: string;
   delta?: string;
+  data?: string;
+  text?: string;
   transcript?: string;
   item_id?: string;
   call_id?: string;
@@ -854,12 +856,14 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
         this.responseCreateInFlight = false;
         return;
 
+      case "conversation.output_audio.delta":
       case "response.audio.delta":
      case "response.output_audio.delta": {
-        if (!event.delta) {
+        const audioDelta = event.delta ?? event.data;
+        if (!audioDelta) {
           return;
         }
-        const audio = base64ToBuffer(event.delta);
+        const audio = base64ToBuffer(audioDelta);
         this.config.onAudio(audio);
         if (event.item_id && event.item_id !== this.lastAssistantItemId) {
           this.lastAssistantItemId = event.item_id;
@@ -878,6 +882,8 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
         }
         return;
 
+      case "conversation.output_transcript.delta":
+      case "response.output_text.delta":
       case "response.audio_transcript.delta":
       case "response.output_audio_transcript.delta":
         if (event.delta) {
@@ -885,10 +891,21 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
         }
         return;
 
+      case "response.output_text.done":
       case "response.audio_transcript.done":
       case "response.output_audio_transcript.done":
-        if (event.transcript) {
-          this.config.onTranscript?.("assistant", event.transcript, true);
+        {
+          const transcript = event.transcript ?? event.text;
+          if (transcript) {
+            this.config.onTranscript?.("assistant", transcript, true);
+          }
+        }
+        return;
+
+      case "conversation.input_transcript.delta":
+      case "conversation.item.input_audio_transcription.delta":
+        if (event.delta) {
+          this.config.onTranscript?.("user", event.delta, false);
         }
         return;
 
@@ -898,12 +915,6 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
         }
         return;
 
-      case "conversation.item.input_audio_transcription.delta":
-        if (event.delta) {
-          this.config.onTranscript?.("user", event.delta, false);
-        }
-        return;
-
       case "response.cancelled":
       case "response.done":
         this.responseActive = false;
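
For readers who want the alias mapping without the surrounding bridge state, here is a condensed, standalone sketch of the dispatch this patch adds. The event names and the field coalescing (`delta` vs `data`, `transcript` vs `text`) come straight from the patched switch; the `dispatchRealtimeEvent` helper and the trimmed `RealtimeEvent`/`Callbacks` types are illustrative scaffolding only, not symbols in the codebase.

```ts
// Minimal sketch: each legacy Codex-compatible alias shares a case body with
// its current Realtime 2 counterpart, so payloads arriving under either name
// follow the identical downstream path.
type RealtimeEvent = {
  type: string;
  delta?: string;
  data?: string;
  text?: string;
  transcript?: string;
};

type Callbacks = {
  onAudio: (audio: Buffer) => void;
  onTranscript?: (role: "user" | "assistant", text: string, final: boolean) => void;
};

function dispatchRealtimeEvent(event: RealtimeEvent, cb: Callbacks): void {
  switch (event.type) {
    // Output audio: the legacy alias carries base64 audio under `data`,
    // the current events carry it under `delta`.
    case "conversation.output_audio.delta": // legacy alias
    case "response.audio.delta":
    case "response.output_audio.delta": {
      const audioDelta = event.delta ?? event.data;
      if (audioDelta) {
        cb.onAudio(Buffer.from(audioDelta, "base64"));
      }
      return;
    }

    // Streaming assistant captions: legacy and current names, one body.
    case "conversation.output_transcript.delta": // legacy alias
    case "response.output_text.delta": // legacy alias
    case "response.audio_transcript.delta":
    case "response.output_audio_transcript.delta":
      if (event.delta) {
        cb.onTranscript?.("assistant", event.delta, false);
      }
      return;

    // Final assistant text: `transcript` (current) or `text` (legacy).
    case "response.output_text.done": // legacy alias
    case "response.audio_transcript.done":
    case "response.output_audio_transcript.done": {
      const transcript = event.transcript ?? event.text;
      if (transcript) {
        cb.onTranscript?.("assistant", transcript, true);
      }
      return;
    }

    // Streaming user captions.
    case "conversation.input_transcript.delta": // legacy alias
    case "conversation.item.input_audio_transcription.delta":
      if (event.delta) {
        cb.onTranscript?.("user", event.delta, false);
      }
      return;
  }
}
```

The design choice, visible in the patch itself, is to add aliases as extra `case` labels on the existing bodies rather than separate legacy branches, so drift in event naming never forks the audio or transcript handling paths.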