Merge branch 'main' into fix/voice-call-env-var-validation

Shakker (committed by GitHub)
2026-01-26 13:10:58 +00:00
416 changed files with 26012 additions and 8724 deletions

View File

@@ -82,31 +82,82 @@ export const SttConfigSchema = z
.default({ provider: "openai", model: "whisper-1" });
export type SttConfig = z.infer<typeof SttConfigSchema>;
export const TtsProviderSchema = z.enum(["openai", "elevenlabs", "edge"]);
export const TtsModeSchema = z.enum(["final", "all"]);
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
export const TtsConfigSchema = z
.object({
/** TTS provider (currently only OpenAI supported) */
provider: z.literal("openai").default("openai"),
/**
* TTS model to use:
* - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
* - tts-1: lower latency
* - tts-1-hd: higher quality
*/
model: z.string().min(1).default("gpt-4o-mini-tts"),
/**
* Voice ID. For best quality, use marin or cedar.
* All voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
*/
voice: z.string().min(1).default("coral"),
/**
* Instructions for speech style (only works with gpt-4o-mini-tts).
* Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
*/
instructions: z.string().optional(),
auto: TtsAutoSchema.optional(),
enabled: z.boolean().optional(),
mode: TtsModeSchema.optional(),
provider: TtsProviderSchema.optional(),
summaryModel: z.string().optional(),
modelOverrides: z
.object({
enabled: z.boolean().optional(),
allowText: z.boolean().optional(),
allowProvider: z.boolean().optional(),
allowVoice: z.boolean().optional(),
allowModelId: z.boolean().optional(),
allowVoiceSettings: z.boolean().optional(),
allowNormalization: z.boolean().optional(),
allowSeed: z.boolean().optional(),
})
.strict()
.optional(),
elevenlabs: z
.object({
apiKey: z.string().optional(),
baseUrl: z.string().optional(),
voiceId: z.string().optional(),
modelId: z.string().optional(),
seed: z.number().int().min(0).max(4294967295).optional(),
applyTextNormalization: z.enum(["auto", "on", "off"]).optional(),
languageCode: z.string().optional(),
voiceSettings: z
.object({
stability: z.number().min(0).max(1).optional(),
similarityBoost: z.number().min(0).max(1).optional(),
style: z.number().min(0).max(1).optional(),
useSpeakerBoost: z.boolean().optional(),
speed: z.number().min(0.5).max(2).optional(),
})
.strict()
.optional(),
})
.strict()
.optional(),
openai: z
.object({
apiKey: z.string().optional(),
model: z.string().optional(),
voice: z.string().optional(),
})
.strict()
.optional(),
edge: z
.object({
enabled: z.boolean().optional(),
voice: z.string().optional(),
lang: z.string().optional(),
outputFormat: z.string().optional(),
pitch: z.string().optional(),
rate: z.string().optional(),
volume: z.string().optional(),
saveSubtitles: z.boolean().optional(),
proxy: z.string().optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),
})
.strict()
.optional(),
prefsPath: z.string().optional(),
maxTextLength: z.number().int().min(1).optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),
})
.strict()
.default({ provider: "openai", model: "gpt-4o-mini-tts", voice: "coral" });
export type TtsConfig = z.infer<typeof TtsConfigSchema>;
.optional();
export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;
// -----------------------------------------------------------------------------
// Webhook Server Configuration
@@ -307,7 +358,7 @@ export const VoiceCallConfigSchema = z
/** STT configuration */
stt: SttConfigSchema,
/** TTS configuration */
/** TTS override (deep-merges with core messages.tts) */
tts: TtsConfigSchema,
/** Store path for call logs */

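For illustration only, not part of this diff: with the schema now optional and multi-provider, a plugin-level override could look like the sketch below. The voice ID is a hypothetical placeholder; omitted fields fall through to the core messages.tts settings.

import type { VoiceCallTtsConfig } from "./config.js";

const ttsOverride: VoiceCallTtsConfig = {
  provider: "elevenlabs",
  mode: "final", // speak only final responses
  elevenlabs: {
    voiceId: "your-voice-id", // hypothetical placeholder
    voiceSettings: { stability: 0.4, similarityBoost: 0.8 },
  },
};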
View File

@@ -2,10 +2,16 @@ import fs from "node:fs";
import path from "node:path";
import { fileURLToPath, pathToFileURL } from "node:url";
import type { VoiceCallTtsConfig } from "./config.js";
export type CoreConfig = {
session?: {
store?: string;
};
messages?: {
tts?: VoiceCallTtsConfig;
};
[key: string]: unknown;
};
type CoreAgentDeps = {

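A sketch of a host config satisfying this type (values are illustrative):

const coreConfig: CoreConfig = {
  session: { store: "/tmp/voice-call-store" }, // illustrative path
  messages: { tts: { provider: "openai", openai: { voice: "alloy" } } },
};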
View File

@@ -143,7 +143,7 @@ export class CallManager {
// For notify mode with a message, use inline TwiML with <Say>
let inlineTwiml: string | undefined;
if (mode === "notify" && initialMessage) {
const pollyVoice = mapVoiceToPolly(this.config.tts.voice);
const pollyVoice = mapVoiceToPolly(this.config.tts?.openai?.voice);
inlineTwiml = this.generateNotifyTwiml(initialMessage, pollyVoice);
console.log(
`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`,
@@ -210,11 +210,13 @@ export class CallManager {
this.addTranscriptEntry(call, "bot", text);
// Play TTS
const voice =
this.provider?.name === "twilio" ? this.config.tts?.openai?.voice : undefined;
await this.provider.playTts({
callId,
providerCallId: call.providerCallId,
text,
voice: this.config.tts.voice,
voice,
});
return { success: true };

View File

@@ -19,4 +19,3 @@ export type CallManagerContext = {
transcriptWaiters: Map<CallId, TranscriptWaiter>;
maxDurationTimers: Map<CallId, NodeJS.Timeout>;
};

View File

@@ -175,4 +175,3 @@ export function processEvent(ctx: CallManagerContext, event: NormalizedEvent): v
persistCallRecord(ctx.storePath, call);
}

View File

@@ -31,4 +31,3 @@ export function findCall(params: {
providerCallId: params.callIdOrProviderCallId,
});
}

View File

@@ -68,7 +68,7 @@ export async function initiateCall(
// For notify mode with a message, use inline TwiML with <Say>.
let inlineTwiml: string | undefined;
if (mode === "notify" && initialMessage) {
const pollyVoice = mapVoiceToPolly(ctx.config.tts.voice);
const pollyVoice = mapVoiceToPolly(ctx.config.tts?.openai?.voice);
inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
}
@@ -120,11 +120,13 @@ export async function speak(
addTranscriptEntry(call, "bot", text);
const voice =
ctx.provider?.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined;
await ctx.provider.playTts({
callId,
providerCallId: call.providerCallId,
text,
voice: ctx.config.tts.voice,
voice,
});
return { success: true };
@@ -244,4 +246,3 @@ export async function endCall(
return { success: false, error: err instanceof Error ? err.message : String(err) };
}
}

View File

@@ -48,4 +48,3 @@ export function addTranscriptEntry(
};
call.transcript.push(entry);
}

View File

@@ -86,4 +86,3 @@ export async function getCallHistoryFromStore(
return calls;
}

View File

@@ -84,4 +84,3 @@ export function waitForFinalTranscript(
ctx.transcriptWaiters.set(callId, { resolve, reject, timeout });
});
}

View File

@@ -7,4 +7,3 @@ export function generateNotifyTwiml(message: string, voice: string): string {
<Hangup/>
</Response>`;
}

View File

@@ -0,0 +1,97 @@
import { describe, expect, it } from "vitest";
import type {
OpenAIRealtimeSTTProvider,
RealtimeSTTSession,
} from "./providers/stt-openai-realtime.js";
import { MediaStreamHandler } from "./media-stream.js";
const createStubSession = (): RealtimeSTTSession => ({
connect: async () => {},
sendAudio: () => {},
waitForTranscript: async () => "",
onPartial: () => {},
onTranscript: () => {},
onSpeechStart: () => {},
close: () => {},
isConnected: () => true,
});
const createStubSttProvider = (): OpenAIRealtimeSTTProvider =>
({
createSession: () => createStubSession(),
}) as unknown as OpenAIRealtimeSTTProvider;
const flush = async (): Promise<void> => {
await new Promise((resolve) => setTimeout(resolve, 0));
};
const waitForAbort = (signal: AbortSignal): Promise<void> =>
new Promise((resolve) => {
if (signal.aborted) {
resolve();
return;
}
signal.addEventListener("abort", () => resolve(), { once: true });
});
describe("MediaStreamHandler TTS queue", () => {
it("serializes TTS playback and resolves in order", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
});
const started: number[] = [];
const finished: number[] = [];
let resolveFirst!: () => void;
const firstGate = new Promise<void>((resolve) => {
resolveFirst = resolve;
});
const first = handler.queueTts("stream-1", async () => {
started.push(1);
await firstGate;
finished.push(1);
});
const second = handler.queueTts("stream-1", async () => {
started.push(2);
finished.push(2);
});
await flush();
expect(started).toEqual([1]);
resolveFirst();
await first;
await second;
expect(started).toEqual([1, 2]);
expect(finished).toEqual([1, 2]);
});
it("cancels active playback and clears queued items", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
});
let queuedRan = false;
const started: string[] = [];
const active = handler.queueTts("stream-1", async (signal) => {
started.push("active");
await waitForAbort(signal);
});
void handler.queueTts("stream-1", async () => {
queuedRan = true;
});
await flush();
expect(started).toEqual(["active"]);
handler.clearTtsQueue("stream-1");
await active;
await flush();
expect(queuedRan).toBe(false);
});
});

View File

@@ -29,6 +29,8 @@ export interface MediaStreamConfig {
onPartialTranscript?: (callId: string, partial: string) => void;
/** Callback when stream connects */
onConnect?: (callId: string, streamSid: string) => void;
/** Callback when speech starts (barge-in) */
onSpeechStart?: (callId: string) => void;
/** Callback when stream disconnects */
onDisconnect?: (callId: string) => void;
}
@@ -43,6 +45,13 @@ interface StreamSession {
sttSession: RealtimeSTTSession;
}
type TtsQueueEntry = {
playFn: (signal: AbortSignal) => Promise<void>;
controller: AbortController;
resolve: () => void;
reject: (error: unknown) => void;
};
/**
* Manages WebSocket connections for Twilio media streams.
*/
@@ -50,6 +59,12 @@ export class MediaStreamHandler {
private wss: WebSocketServer | null = null;
private sessions = new Map<string, StreamSession>();
private config: MediaStreamConfig;
/** TTS playback queues per stream (serialize audio to prevent overlap) */
private ttsQueues = new Map<string, TtsQueueEntry[]>();
/** Whether TTS is currently playing per stream */
private ttsPlaying = new Map<string, boolean>();
/** Active TTS playback controllers per stream */
private ttsActiveControllers = new Map<string, AbortController>();
constructor(config: MediaStreamConfig) {
this.config = config;
@@ -148,6 +163,10 @@ export class MediaStreamHandler {
this.config.onTranscript?.(callSid, transcript);
});
sttSession.onSpeechStart(() => {
this.config.onSpeechStart?.(callSid);
});
const session: StreamSession = {
callId: callSid,
streamSid,
@@ -177,6 +196,7 @@ export class MediaStreamHandler {
private handleStop(session: StreamSession): void {
console.log(`[MediaStream] Stream stopped: ${session.streamSid}`);
this.clearTtsState(session.streamSid);
session.sttSession.close();
this.sessions.delete(session.streamSid);
this.config.onDisconnect?.(session.callId);
@@ -228,6 +248,46 @@ export class MediaStreamHandler {
this.sendToStream(streamSid, { event: "clear", streamSid });
}
/**
* Queue a TTS operation for sequential playback.
* Only one TTS operation plays at a time per stream to prevent overlap.
*/
async queueTts(
streamSid: string,
playFn: (signal: AbortSignal) => Promise<void>,
): Promise<void> {
const queue = this.getTtsQueue(streamSid);
let resolveEntry: () => void;
let rejectEntry: (error: unknown) => void;
const promise = new Promise<void>((resolve, reject) => {
resolveEntry = resolve;
rejectEntry = reject;
});
queue.push({
playFn,
controller: new AbortController(),
resolve: resolveEntry!,
reject: rejectEntry!,
});
if (!this.ttsPlaying.get(streamSid)) {
void this.processQueue(streamSid);
}
return promise;
}
/**
* Clear TTS queue and interrupt current playback (barge-in).
*/
clearTtsQueue(streamSid: string): void {
const queue = this.getTtsQueue(streamSid);
queue.length = 0;
this.ttsActiveControllers.get(streamSid)?.abort();
this.clearAudio(streamSid);
}
/**
* Get active session by call ID.
*/
@@ -242,11 +302,65 @@ export class MediaStreamHandler {
*/
closeAll(): void {
for (const session of this.sessions.values()) {
this.clearTtsState(session.streamSid);
session.sttSession.close();
session.ws.close();
}
this.sessions.clear();
}
private getTtsQueue(streamSid: string): TtsQueueEntry[] {
const existing = this.ttsQueues.get(streamSid);
if (existing) return existing;
const queue: TtsQueueEntry[] = [];
this.ttsQueues.set(streamSid, queue);
return queue;
}
/**
* Process the TTS queue for a stream.
* Uses an iterative approach to avoid stack accumulation from recursion.
*/
private async processQueue(streamSid: string): Promise<void> {
this.ttsPlaying.set(streamSid, true);
while (true) {
const queue = this.ttsQueues.get(streamSid);
if (!queue || queue.length === 0) {
this.ttsPlaying.set(streamSid, false);
this.ttsActiveControllers.delete(streamSid);
return;
}
const entry = queue.shift()!;
this.ttsActiveControllers.set(streamSid, entry.controller);
try {
await entry.playFn(entry.controller.signal);
entry.resolve();
} catch (error) {
if (entry.controller.signal.aborted) {
entry.resolve();
} else {
console.error("[MediaStream] TTS playback error:", error);
entry.reject(error);
}
} finally {
if (this.ttsActiveControllers.get(streamSid) === entry.controller) {
this.ttsActiveControllers.delete(streamSid);
}
}
}
}
private clearTtsState(streamSid: string): void {
const queue = this.ttsQueues.get(streamSid);
if (queue) queue.length = 0;
this.ttsActiveControllers.get(streamSid)?.abort();
this.ttsActiveControllers.delete(streamSid);
this.ttsPlaying.delete(streamSid);
this.ttsQueues.delete(streamSid);
}
}
/**

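A hedged usage sketch for the queue API above; handler, streamSid, and the utterance frame arrays are assumed from surrounding context, mirroring the test file:

const play = (frames: Buffer[]) => async (signal: AbortSignal) => {
  for (const frame of frames) {
    if (signal.aborted) return; // barge-in aborts mid-playback
    handler.sendAudio(streamSid, frame);
    await new Promise((resolve) => setTimeout(resolve, 20)); // pace at 20 ms
  }
};
// The second play starts only after the first resolves or is aborted.
await handler.queueTts(streamSid, play(firstUtterance));
await handler.queueTts(streamSid, play(secondUtterance));
// On VAD speech start, drop queued entries and abort the active play.
handler.clearTtsQueue(streamSid);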
View File

@@ -26,4 +26,3 @@ describe("PlivoProvider", () => {
expect(result.providerResponseBody).toContain('length="300"');
});
});

View File

@@ -38,6 +38,8 @@ export interface RealtimeSTTSession {
onPartial(callback: (partial: string) => void): void;
/** Set callback for final transcripts */
onTranscript(callback: (transcript: string) => void): void;
/** Set callback when speech starts (VAD) */
onSpeechStart(callback: () => void): void;
/** Close the session */
close(): void;
/** Check if session is connected */
@@ -91,6 +93,7 @@ class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
private pendingTranscript = "";
private onTranscriptCallback: ((transcript: string) => void) | null = null;
private onPartialCallback: ((partial: string) => void) | null = null;
private onSpeechStartCallback: (() => void) | null = null;
constructor(
private readonly apiKey: string,
@@ -243,6 +246,7 @@ class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
case "input_audio_buffer.speech_started":
console.log("[RealtimeSTT] Speech started");
this.pendingTranscript = "";
this.onSpeechStartCallback?.();
break;
case "error":
@@ -273,6 +277,10 @@ class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
this.onTranscriptCallback = callback;
}
onSpeechStart(callback: () => void): void {
this.onSpeechStartCallback = callback;
}
async waitForTranscript(timeoutMs = 30000): Promise<string> {
return new Promise((resolve, reject) => {
const timeout = setTimeout(() => {

View File

@@ -15,9 +15,9 @@ import type {
WebhookVerificationResult,
} from "../types.js";
import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
import { chunkAudio } from "../telephony-audio.js";
import type { TelephonyTtsProvider } from "../telephony-tts.js";
import type { VoiceCallProvider } from "./base.js";
import type { OpenAITTSProvider } from "./tts-openai.js";
import { chunkAudio } from "./tts-openai.js";
import { twilioApiRequest } from "./twilio/api.js";
import { verifyTwilioProviderWebhook } from "./twilio/webhook.js";
@@ -53,8 +53,8 @@ export class TwilioProvider implements VoiceCallProvider {
/** Current public webhook URL (set when tunnel starts or from config) */
private currentPublicUrl: string | null = null;
/** Optional OpenAI TTS provider for streaming TTS */
private ttsProvider: OpenAITTSProvider | null = null;
/** Optional telephony TTS provider for streaming TTS */
private ttsProvider: TelephonyTtsProvider | null = null;
/** Optional media stream handler for sending audio */
private mediaStreamHandler: MediaStreamHandler | null = null;
@@ -119,7 +119,7 @@ export class TwilioProvider implements VoiceCallProvider {
return this.currentPublicUrl;
}
setTTSProvider(provider: OpenAITTSProvider): void {
setTTSProvider(provider: TelephonyTtsProvider): void {
this.ttsProvider = provider;
}
@@ -135,6 +135,17 @@ export class TwilioProvider implements VoiceCallProvider {
this.callStreamMap.delete(callSid);
}
/**
* Clear TTS queue for a call (barge-in).
* Called when the user starts speaking, to interrupt current TTS playback.
*/
clearTtsQueue(callSid: string): void {
const streamSid = this.callStreamMap.get(callSid);
if (streamSid && this.mediaStreamHandler) {
this.mediaStreamHandler.clearTtsQueue(streamSid);
}
}
/**
* Make an authenticated request to the Twilio API.
*/
@@ -454,13 +465,13 @@ export class TwilioProvider implements VoiceCallProvider {
* Play TTS audio via Twilio.
*
* Two modes:
* 1. OpenAI TTS + Media Streams: If TTS provider and media stream are available,
* generates audio via OpenAI and streams it through WebSocket (preferred).
* 1. Core TTS + Media Streams: If TTS provider and media stream are available,
* generates audio via core TTS and streams it through WebSocket (preferred).
* 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
* Note: This may not work on all Twilio accounts.
*/
async playTts(input: PlayTtsInput): Promise<void> {
// Try OpenAI TTS via media stream first (if configured)
// Try telephony TTS via media stream first (if configured)
const streamSid = this.callStreamMap.get(input.providerCallId);
if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
try {
@@ -468,7 +479,7 @@ export class TwilioProvider implements VoiceCallProvider {
return;
} catch (err) {
console.warn(
`[voice-call] OpenAI TTS failed, falling back to Twilio <Say>:`,
`[voice-call] Telephony TTS failed, falling back to Twilio <Say>:`,
err instanceof Error ? err.message : err,
);
// Fall through to TwiML <Say> fallback
@@ -484,7 +495,7 @@ export class TwilioProvider implements VoiceCallProvider {
}
console.warn(
"[voice-call] Using TwiML <Say> fallback - OpenAI TTS not configured or media stream not active",
"[voice-call] Using TwiML <Say> fallback - telephony TTS not configured or media stream not active",
);
const pollyVoice = mapVoiceToPolly(input.voice);
@@ -502,9 +513,9 @@ export class TwilioProvider implements VoiceCallProvider {
}
/**
* Play TTS via OpenAI and Twilio Media Streams.
* Generates audio with OpenAI TTS, converts to mu-law, and streams via WebSocket.
* Uses a jitter buffer to smooth out timing variations.
* Play TTS via core TTS and Twilio Media Streams.
* Generates audio with core TTS, converts to mu-law, and streams via WebSocket.
* Uses a queue to serialize playback and prevent overlapping audio.
*/
private async playTtsViaStream(
text: string,
@@ -514,22 +525,29 @@ export class TwilioProvider implements VoiceCallProvider {
throw new Error("TTS provider and media stream handler required");
}
// Generate audio with OpenAI TTS (returns mu-law at 8kHz)
const muLawAudio = await this.ttsProvider.synthesizeForTwilio(text);
// Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
const CHUNK_SIZE = 160;
const CHUNK_DELAY_MS = 20;
for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
this.mediaStreamHandler.sendAudio(streamSid, chunk);
const handler = this.mediaStreamHandler;
const ttsProvider = this.ttsProvider;
await handler.queueTts(streamSid, async (signal) => {
// Generate audio with core TTS (returns mu-law at 8kHz)
const muLawAudio = await ttsProvider.synthesizeForTelephony(text);
for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
if (signal.aborted) break;
handler.sendAudio(streamSid, chunk);
// Pace the audio to match real-time playback
await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
}
// Pace the audio to match real-time playback
await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
if (signal.aborted) break;
}
// Send a mark to track when audio finishes
this.mediaStreamHandler.sendMark(streamSid, `tts-${Date.now()}`);
if (!signal.aborted) {
// Send a mark to track when audio finishes
handler.sendMark(streamSid, `tts-${Date.now()}`);
}
});
}
/**

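A worked aside on the pacing constants above: G.711 mu-law carries one byte per sample at 8 kHz, so a 20 ms frame is exactly 160 bytes, and sleeping 20 ms between sends matches real-time playback.

const SAMPLE_RATE_HZ = 8000; // mu-law telephony rate
const FRAME_MS = 20;
const CHUNK_SIZE = (SAMPLE_RATE_HZ * FRAME_MS) / 1000; // 160 bytes per frame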
View File

@@ -27,4 +27,3 @@ export function verifyTwilioProviderWebhook(params: {
reason: result.reason,
};
}

View File

@@ -6,8 +6,9 @@ import type { VoiceCallProvider } from "./providers/base.js";
import { MockProvider } from "./providers/mock.js";
import { PlivoProvider } from "./providers/plivo.js";
import { TelnyxProvider } from "./providers/telnyx.js";
import { OpenAITTSProvider } from "./providers/tts-openai.js";
import { TwilioProvider } from "./providers/twilio.js";
import type { TelephonyTtsRuntime } from "./telephony-tts.js";
import { createTelephonyTtsProvider } from "./telephony-tts.js";
import { startTunnel, type TunnelResult } from "./tunnel.js";
import {
cleanupTailscaleExposure,
@@ -81,9 +82,10 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
export async function createVoiceCallRuntime(params: {
config: VoiceCallConfig;
coreConfig: CoreConfig;
ttsRuntime?: TelephonyTtsRuntime;
logger?: Logger;
}): Promise<VoiceCallRuntime> {
const { config, coreConfig, logger } = params;
const { config, coreConfig, ttsRuntime, logger } = params;
const log = logger ?? {
info: console.log,
warn: console.warn,
@@ -149,27 +151,24 @@ export async function createVoiceCallRuntime(params: {
if (provider.name === "twilio" && config.streaming?.enabled) {
const twilioProvider = provider as TwilioProvider;
const openaiApiKey =
config.streaming.openaiApiKey || process.env.OPENAI_API_KEY;
if (openaiApiKey) {
if (ttsRuntime?.textToSpeechTelephony) {
try {
const ttsProvider = new OpenAITTSProvider({
apiKey: openaiApiKey,
voice: config.tts.voice,
model: config.tts.model,
instructions: config.tts.instructions,
const ttsProvider = createTelephonyTtsProvider({
coreConfig,
ttsOverride: config.tts,
runtime: ttsRuntime,
});
twilioProvider.setTTSProvider(ttsProvider);
log.info("[voice-call] OpenAI TTS provider configured");
log.info("[voice-call] Telephony TTS provider configured");
} catch (err) {
log.warn(
`[voice-call] Failed to initialize OpenAI TTS: ${
`[voice-call] Failed to initialize telephony TTS: ${
err instanceof Error ? err.message : String(err)
}`,
);
}
} else {
log.warn("[voice-call] OpenAI TTS key missing; streaming TTS disabled");
log.warn("[voice-call] Telephony TTS unavailable; streaming TTS disabled");
}
const mediaHandler = webhookServer.getMediaStreamHandler();

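A sketch of the host-side wiring this expects; coreTextToSpeech is a hypothetical stand-in for whatever core function implements textToSpeechTelephony:

const runtime = await createVoiceCallRuntime({
  config,
  coreConfig,
  ttsRuntime: {
    textToSpeechTelephony: async ({ text, cfg }) => {
      const out = await coreTextToSpeech(text, cfg); // hypothetical core helper
      return { success: true, audioBuffer: out.pcm, sampleRate: out.sampleRate };
    },
  },
});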
View File

@@ -0,0 +1,88 @@
const TELEPHONY_SAMPLE_RATE = 8000;
function clamp16(value: number): number {
return Math.max(-32768, Math.min(32767, value));
}
/**
* Resample 16-bit PCM (little-endian mono) to 8kHz using linear interpolation.
*/
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
if (inputSampleRate === TELEPHONY_SAMPLE_RATE) return input;
const inputSamples = Math.floor(input.length / 2);
if (inputSamples === 0) return Buffer.alloc(0);
const ratio = inputSampleRate / TELEPHONY_SAMPLE_RATE;
const outputSamples = Math.floor(inputSamples / ratio);
const output = Buffer.alloc(outputSamples * 2);
for (let i = 0; i < outputSamples; i++) {
const srcPos = i * ratio;
const srcIndex = Math.floor(srcPos);
const frac = srcPos - srcIndex;
const s0 = input.readInt16LE(srcIndex * 2);
const s1Index = Math.min(srcIndex + 1, inputSamples - 1);
const s1 = input.readInt16LE(s1Index * 2);
const sample = Math.round(s0 + frac * (s1 - s0));
output.writeInt16LE(clamp16(sample), i * 2);
}
return output;
}
/**
* Convert 16-bit PCM to 8-bit mu-law (G.711).
*/
export function pcmToMulaw(pcm: Buffer): Buffer {
const samples = Math.floor(pcm.length / 2);
const mulaw = Buffer.alloc(samples);
for (let i = 0; i < samples; i++) {
const sample = pcm.readInt16LE(i * 2);
mulaw[i] = linearToMulaw(sample);
}
return mulaw;
}
export function convertPcmToMulaw8k(
pcm: Buffer,
inputSampleRate: number,
): Buffer {
const pcm8k = resamplePcmTo8k(pcm, inputSampleRate);
return pcmToMulaw(pcm8k);
}
/**
* Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law).
*/
export function* chunkAudio(
audio: Buffer,
chunkSize = 160,
): Generator<Buffer, void, unknown> {
for (let i = 0; i < audio.length; i += chunkSize) {
yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
}
}
/**
* Encode a single 16-bit linear PCM sample as 8-bit G.711 mu-law.
*/
function linearToMulaw(sample: number): number {
const BIAS = 132; // standard G.711 bias (0x84)
const CLIP = 32635; // clamp magnitude so adding the bias cannot overflow
const sign = sample < 0 ? 0x80 : 0;
if (sample < 0) sample = -sample;
if (sample > CLIP) sample = CLIP;
sample += BIAS;
// Exponent = segment number, from the position of the highest set bit.
let exponent = 7;
for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--) {
expMask >>= 1;
}
const mantissa = (sample >> (exponent + 3)) & 0x0f;
// G.711 transmits the byte bit-inverted.
return ~(sign | (exponent << 4) | mantissa) & 0xff;
}

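An illustrative end-to-end use of the new helpers (sample values are hypothetical):

import { chunkAudio, convertPcmToMulaw8k } from "./telephony-audio.js";

const pcm24k = Buffer.alloc(24000 * 2); // one second of 24 kHz silence
const muLaw = convertPcmToMulaw8k(pcm24k, 24000); // 8000 bytes at 8 kHz
let frames = 0;
for (const _frame of chunkAudio(muLaw)) frames++; // 50 frames of 160 bytes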
View File

@@ -0,0 +1,95 @@
import type { CoreConfig } from "./core-bridge.js";
import type { VoiceCallTtsConfig } from "./config.js";
import { convertPcmToMulaw8k } from "./telephony-audio.js";
export type TelephonyTtsRuntime = {
textToSpeechTelephony: (params: {
text: string;
cfg: CoreConfig;
prefsPath?: string;
}) => Promise<{
success: boolean;
audioBuffer?: Buffer;
sampleRate?: number;
provider?: string;
error?: string;
}>;
};
export type TelephonyTtsProvider = {
synthesizeForTelephony: (text: string) => Promise<Buffer>;
};
export function createTelephonyTtsProvider(params: {
coreConfig: CoreConfig;
ttsOverride?: VoiceCallTtsConfig;
runtime: TelephonyTtsRuntime;
}): TelephonyTtsProvider {
const { coreConfig, ttsOverride, runtime } = params;
const mergedConfig = applyTtsOverride(coreConfig, ttsOverride);
return {
synthesizeForTelephony: async (text: string) => {
const result = await runtime.textToSpeechTelephony({
text,
cfg: mergedConfig,
});
if (!result.success || !result.audioBuffer || !result.sampleRate) {
throw new Error(result.error ?? "TTS conversion failed");
}
return convertPcmToMulaw8k(result.audioBuffer, result.sampleRate);
},
};
}
function applyTtsOverride(
coreConfig: CoreConfig,
override?: VoiceCallTtsConfig,
): CoreConfig {
if (!override) return coreConfig;
const base = coreConfig.messages?.tts;
const merged = mergeTtsConfig(base, override);
if (!merged) return coreConfig;
return {
...coreConfig,
messages: {
...(coreConfig.messages ?? {}),
tts: merged,
},
};
}
function mergeTtsConfig(
base?: VoiceCallTtsConfig,
override?: VoiceCallTtsConfig,
): VoiceCallTtsConfig | undefined {
if (!base && !override) return undefined;
if (!override) return base;
if (!base) return override;
return deepMerge(base, override);
}
function deepMerge<T>(base: T, override: T): T {
if (!isPlainObject(base) || !isPlainObject(override)) {
return override;
}
const result: Record<string, unknown> = { ...base };
for (const [key, value] of Object.entries(override)) {
if (value === undefined) continue;
const existing = (base as Record<string, unknown>)[key];
if (isPlainObject(existing) && isPlainObject(value)) {
result[key] = deepMerge(existing, value);
} else {
result[key] = value;
}
}
return result as T;
}
function isPlainObject(value: unknown): value is Record<string, unknown> {
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
}

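To make the merge semantics concrete (internal helpers, shown for illustration): nested objects merge key-by-key, scalar overrides win, and undefined override values leave the base untouched.

const base: VoiceCallTtsConfig = {
  provider: "openai",
  openai: { voice: "alloy", model: "tts-1" },
};
const override: VoiceCallTtsConfig = {
  openai: { voice: "marin" },
};
// mergeTtsConfig(base, override) yields:
// { provider: "openai", openai: { voice: "marin", model: "tts-1" } }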
View File

@@ -78,6 +78,11 @@ export class VoiceCallWebhookServer {
`[voice-call] Transcript for ${providerCallId}: ${transcript}`,
);
// Clear the TTS queue on barge-in (the user started speaking; interrupt current playback)
if (this.provider.name === "twilio") {
(this.provider as TwilioProvider).clearTtsQueue(providerCallId);
}
// Look up our internal call ID from the provider call ID
const call = this.manager.getCallByProviderCallId(providerCallId);
if (!call) {
@@ -109,6 +114,11 @@ export class VoiceCallWebhookServer {
});
}
},
onSpeechStart: (providerCallId) => {
if (this.provider.name === "twilio") {
(this.provider as TwilioProvider).clearTtsQueue(providerCallId);
}
},
onPartialTranscript: (callId, partial) => {
console.log(`[voice-call] Partial for ${callId}: ${partial}`);
},