mirror of
https://github.com/moltbot/moltbot.git
synced 2026-05-11 04:48:05 +00:00
fix(openai): accept realtime event aliases
This commit is contained in:
@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- OpenAI/realtime voice: accept Codex-compatible legacy audio and transcript event aliases so provider protocol drift does not drop assistant audio or captions.
|
||||
- Discord/voice: keep default agent-proxy realtime sessions from auto-speaking filler before the forced OpenClaw consult answer, finish Discord playback on realtime response completion, and queue later exact-speech answers until playback idles to avoid mid-sentence replacement.
|
||||
- Gateway: return deterministic `400 invalid_request_error` responses for malformed encoded session-kill HTTP paths instead of letting route-shaped requests fall through to later Gateway handlers. (#72439) Thanks @rubencu.
|
||||
- OpenAI/realtime voice: honor disabled input-audio interruption locally so server VAD speech-start events do not clear Discord playback after operators set `interruptResponseOnInputAudio: false`.
|
||||
|
||||
@@ -1203,6 +1203,7 @@ Notes:
|
||||
- `agent-proxy` routes speech through `discord-voice`, which preserves normal owner/tool authorization for the speaker and target session but hides the agent `tts` tool because Discord voice owns playback. By default, `agent-proxy` gives the consult full owner-equivalent tool access for owner speakers (`voice.realtime.toolPolicy: "owner"`) and strongly prefers consulting the OpenClaw agent before substantive answers (`voice.realtime.consultPolicy: "always"`). In that default `always` mode, the realtime layer does not auto-speak filler before the consult answer; it captures and transcribes speech, then speaks the routed OpenClaw answer. If multiple forced consult answers finish while Discord is still playing the first answer, later exact-speech answers are queued until playback idles instead of replacing speech mid-sentence.
|
||||
- In `stt-tts` mode, STT uses `tools.media.audio`; `voice.model` does not affect transcription.
|
||||
- In realtime modes, `voice.realtime.provider`, `voice.realtime.model`, and `voice.realtime.voice` configure the realtime audio session. For OpenAI Realtime 2 plus the Codex brain, use `voice.realtime.model: "gpt-realtime-2"` and `voice.model: "openai-codex/gpt-5.5"`.
|
||||
- The OpenAI realtime provider accepts current Realtime 2 event names and legacy Codex-compatible aliases for output audio and transcript events, so compatible provider snapshots can drift without dropping assistant audio.
|
||||
- `voice.realtime.bargeIn` controls whether Discord speaker-start events interrupt active realtime playback. If unset, it follows the realtime provider's input-audio interruption setting.
|
||||
- `voice.realtime.minBargeInAudioEndMs` controls the minimum assistant playback duration before an OpenAI realtime barge-in truncates audio. Default: `250`. Set `0` for immediate interruption in low-echo rooms, or raise it for echo-heavy speaker setups.
|
||||
- For an OpenAI voice on Discord playback, set `voice.tts.provider: "openai"` and choose a text-to-speech voice under `voice.tts.openai.voice` or `voice.tts.providers.openai.voice`. `cedar` is a good masculine-sounding choice on the current OpenAI TTS model.
|
||||
|
||||
@@ -1059,6 +1059,73 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("forwards Codex-compatible legacy realtime audio and transcript events", async () => {
|
||||
const provider = buildOpenAIRealtimeVoiceProvider();
|
||||
const onAudio = vi.fn();
|
||||
const onTranscript = vi.fn();
|
||||
const bridge = provider.createBridge({
|
||||
providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
|
||||
onAudio,
|
||||
onClearAudio: vi.fn(),
|
||||
onTranscript,
|
||||
});
|
||||
const connecting = bridge.connect();
|
||||
const socket = FakeWebSocket.instances[0];
|
||||
if (!socket) {
|
||||
throw new Error("expected bridge to create a websocket");
|
||||
}
|
||||
|
||||
socket.readyState = FakeWebSocket.OPEN;
|
||||
socket.emit("open");
|
||||
socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
|
||||
await connecting;
|
||||
|
||||
const audio = Buffer.from("legacy assistant audio");
|
||||
socket.emit(
|
||||
"message",
|
||||
Buffer.from(
|
||||
JSON.stringify({
|
||||
type: "conversation.output_audio.delta",
|
||||
data: audio.toString("base64"),
|
||||
sample_rate: 24000,
|
||||
channels: 1,
|
||||
}),
|
||||
),
|
||||
);
|
||||
socket.emit(
|
||||
"message",
|
||||
Buffer.from(
|
||||
JSON.stringify({
|
||||
type: "conversation.input_transcript.delta",
|
||||
delta: "partial user",
|
||||
}),
|
||||
),
|
||||
);
|
||||
socket.emit(
|
||||
"message",
|
||||
Buffer.from(
|
||||
JSON.stringify({
|
||||
type: "conversation.output_transcript.delta",
|
||||
delta: "partial assistant",
|
||||
}),
|
||||
),
|
||||
);
|
||||
socket.emit(
|
||||
"message",
|
||||
Buffer.from(
|
||||
JSON.stringify({
|
||||
type: "response.output_text.done",
|
||||
text: "final assistant text",
|
||||
}),
|
||||
),
|
||||
);
|
||||
|
||||
expect(onAudio).toHaveBeenCalledWith(audio);
|
||||
expect(onTranscript).toHaveBeenCalledWith("user", "partial user", false);
|
||||
expect(onTranscript).toHaveBeenCalledWith("assistant", "partial assistant", false);
|
||||
expect(onTranscript).toHaveBeenCalledWith("assistant", "final assistant text", true);
|
||||
});
|
||||
|
||||
it("emits tool calls from realtime conversation item done events", async () => {
|
||||
const provider = buildOpenAIRealtimeVoiceProvider();
|
||||
const onToolCall = vi.fn();
|
||||
|
||||
@@ -117,6 +117,8 @@ function normalizeOpenAIRealtimeVoice(value: unknown): OpenAIRealtimeVoice | und
|
||||
type RealtimeEvent = {
|
||||
type: string;
|
||||
delta?: string;
|
||||
data?: string;
|
||||
text?: string;
|
||||
transcript?: string;
|
||||
item_id?: string;
|
||||
call_id?: string;
|
||||
@@ -854,12 +856,14 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
this.responseCreateInFlight = false;
|
||||
return;
|
||||
|
||||
case "conversation.output_audio.delta":
|
||||
case "response.audio.delta":
|
||||
case "response.output_audio.delta": {
|
||||
if (!event.delta) {
|
||||
const audioDelta = event.delta ?? event.data;
|
||||
if (!audioDelta) {
|
||||
return;
|
||||
}
|
||||
const audio = base64ToBuffer(event.delta);
|
||||
const audio = base64ToBuffer(audioDelta);
|
||||
this.config.onAudio(audio);
|
||||
if (event.item_id && event.item_id !== this.lastAssistantItemId) {
|
||||
this.lastAssistantItemId = event.item_id;
|
||||
@@ -878,6 +882,8 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
}
|
||||
return;
|
||||
|
||||
case "conversation.output_transcript.delta":
|
||||
case "response.output_text.delta":
|
||||
case "response.audio_transcript.delta":
|
||||
case "response.output_audio_transcript.delta":
|
||||
if (event.delta) {
|
||||
@@ -885,10 +891,21 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
}
|
||||
return;
|
||||
|
||||
case "response.output_text.done":
|
||||
case "response.audio_transcript.done":
|
||||
case "response.output_audio_transcript.done":
|
||||
if (event.transcript) {
|
||||
this.config.onTranscript?.("assistant", event.transcript, true);
|
||||
{
|
||||
const transcript = event.transcript ?? event.text;
|
||||
if (transcript) {
|
||||
this.config.onTranscript?.("assistant", transcript, true);
|
||||
}
|
||||
}
|
||||
return;
|
||||
|
||||
case "conversation.input_transcript.delta":
|
||||
case "conversation.item.input_audio_transcription.delta":
|
||||
if (event.delta) {
|
||||
this.config.onTranscript?.("user", event.delta, false);
|
||||
}
|
||||
return;
|
||||
|
||||
@@ -898,12 +915,6 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
}
|
||||
return;
|
||||
|
||||
case "conversation.item.input_audio_transcription.delta":
|
||||
if (event.delta) {
|
||||
this.config.onTranscript?.("user", event.delta, false);
|
||||
}
|
||||
return;
|
||||
|
||||
case "response.cancelled":
|
||||
case "response.done":
|
||||
this.responseActive = false;
|
||||
|
||||
Reference in New Issue
Block a user