feat(voice-call): pre-cache inbound greeting for instant playback

Pre-generates TTS audio for the configured inboundGreeting at startup
and serves it instantly when an inbound call connects, eliminating the
500ms+ TTS synthesis delay on the first ring.

Changes:
- twilio.ts: Add cachedGreetingAudio storage with getter/setter
- runtime.ts: Pre-synthesize greeting TTS after provider initialization
- webhook.ts: Play cached audio directly via media stream on inbound
  connect, falling back to the original TTS path (with its setup delay
  reduced from 500 ms to 100 ms) for outbound calls or when no cached
  audio is available

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
JayMishra-github
2026-02-16 10:18:44 -08:00
committed by Peter Steinberger
parent 27a4868c2d
commit 2c6db57554
3 changed files with 75 additions and 7 deletions

View File

@@ -62,6 +62,17 @@ export class TwilioProvider implements VoiceCallProvider {
/** Map of call SID to stream SID for media streams */
private callStreamMap = new Map<string, string>();
/** Pre-generated greeting audio for instant inbound playback */
private cachedGreetingAudio: Buffer | null = null;
/**
 * Stores pre-synthesized greeting audio so it can be played back without
 * running TTS at call time.
 *
 * Empty audio is rejected: a zero-length Buffer is still truthy, so a caller
 * that only checks the getter's result for truthiness would treat it as
 * playable audio and end up sending nothing. In that case the cache is reset
 * to `null` so such callers fall through to their non-cached path.
 *
 * @param audio - Greeting audio bytes to cache.
 */
setCachedGreetingAudio(audio: Buffer): void {
  if (audio.length === 0) {
    this.cachedGreetingAudio = null;
    console.warn(`[voice-call] Ignoring empty greeting audio; cache cleared`);
    return;
  }
  this.cachedGreetingAudio = audio;
  console.log(`[voice-call] Cached greeting audio: ${audio.length} bytes`);
}
/**
 * Returns the greeting audio previously stored on this provider.
 *
 * @returns The cached audio Buffer, or `null` when nothing has been cached.
 */
getCachedGreetingAudio(): Buffer | null {
  const { cachedGreetingAudio } = this;
  return cachedGreetingAudio;
}
/** Per-call tokens for media stream authentication */
private streamAuthTokens = new Map<string, string>();

View File

@@ -187,6 +187,35 @@ export async function createVoiceCallRuntime(params: {
twilioProvider.setMediaStreamHandler(mediaHandler);
log.info("[voice-call] Media stream handler wired to provider");
}
// Pre-cache inbound greeting TTS for instant playback on connect
// Runs only when an inbound greeting is configured and the TTS runtime
// exposes textToSpeechTelephony.
if (config.inboundGreeting && ttsRuntime?.textToSpeechTelephony) {
// The outer try/catch handles synchronous throws from creating the TTS
// provider or from starting the synthesis call.
try {
const greetingTts = createTelephonyTtsProvider({
coreConfig,
ttsOverride: config.tts,
runtime: ttsRuntime,
});
// Fire-and-forget: the promise is not awaited, so the code after this
// block runs without waiting for synthesis. Until the promise resolves,
// no greeting audio has been handed to the provider.
greetingTts
.synthesizeForTelephony(config.inboundGreeting)
.then((audio) => {
twilioProvider.setCachedGreetingAudio(audio);
})
// Best-effort: a synthesis failure is only logged as a warning and
// nothing is cached.
.catch((err) => {
log.warn(
`[voice-call] Failed to pre-cache greeting: ${
err instanceof Error ? err.message : String(err)
}`,
);
});
} catch (err) {
// Also best-effort: log the failure and continue.
log.warn(
`[voice-call] Failed to init greeting TTS: ${
err instanceof Error ? err.message : String(err)
}`,
);
}
}
}
manager.initialize(provider, webhookUrl);

View File

@@ -141,13 +141,41 @@ export class VoiceCallWebhookServer {
(this.provider as TwilioProvider).registerCallStream(callId, streamSid);
}
// Speak initial message if one was provided when call was initiated
// Use setTimeout to allow stream setup to complete
// NOTE(review): this 500 ms timer overlaps with the 100 ms fallback timer
// further down in the same handler. In this flattened diff view it looks
// like the pre-change version of that code — confirm it is not present in
// the final file, otherwise the initial message could be requested twice.
setTimeout(() => {
// Rejections are logged as warnings rather than propagated.
this.manager.speakInitialMessage(callId).catch((err) => {
console.warn(`[voice-call] Failed to speak initial message:`, err);
});
}, 500);
// Try instant cached greeting for inbound calls (pre-generated at startup)
// Cached audio is only looked up when the provider is named "twilio";
// any other provider yields null and takes the fallback branch.
const cachedAudio =
this.provider.name === "twilio"
? (this.provider as TwilioProvider).getCachedGreetingAudio()
: null;
const call = this.manager.getCallByProviderCallId(callId);
// The cached path requires all three: cached audio, a pending initialMessage
// on the call metadata, and an inbound call.
// NOTE(review): a zero-length Buffer is still truthy, so empty cached audio
// would take this branch and send nothing — confirm the cache can never
// hold an empty Buffer.
// NOTE(review): the cached audio is played without comparing it to the text
// in initialMessage — presumably both come from the same configured
// greeting; verify.
if (cachedAudio && call?.metadata?.initialMessage && call.direction === "inbound") {
console.log(`[voice-call] Playing cached greeting (${cachedAudio.length} bytes)`);
// NOTE(review): initialMessage is removed before playback starts; if the
// playback below fails, the failure is only logged and the greeting is not
// retried through speakInitialMessage.
delete call.metadata.initialMessage; // prevent re-speaking via fallback
const handler = this.mediaStreamHandler!;
// Audio is sent in 160-byte chunks with a 20 ms pause after each one.
// Presumably 160 bytes is 20 ms of 8 kHz 8-bit telephony audio, giving
// roughly real-time pacing — TODO confirm the encoding.
const CHUNK_SIZE = 160;
const CHUNK_DELAY_MS = 20;
// Fire-and-forget playback; errors surface in the .catch below.
void (async () => {
const { chunkAudio } = await import("./telephony-audio.js");
await handler.queueTts(streamSid, async (signal) => {
for (const chunk of chunkAudio(cachedAudio, CHUNK_SIZE)) {
// Stop sending as soon as the queued playback is aborted.
if (signal.aborted) break;
handler.sendAudio(streamSid, chunk);
await new Promise((r) => setTimeout(r, CHUNK_DELAY_MS));
}
// The mark is only sent when playback was not aborted.
if (!signal.aborted) {
handler.sendMark(streamSid, `greeting-${Date.now()}`);
}
});
})().catch((err) =>
console.warn("[voice-call] Cached greeting playback failed:", err),
);
} else {
// Fallback: original path with reduced delay
// Taken for non-inbound calls, calls without a pending initialMessage, and
// whenever no cached audio is available.
setTimeout(() => {
this.manager.speakInitialMessage(callId).catch((err) => {
console.warn(`[voice-call] Failed to speak initial message:`, err);
});
}, 100);
}
},
onDisconnect: (callId) => {
console.log(`[voice-call] Media stream disconnected: ${callId}`);