feat: restore voice-call plugin parity

This commit is contained in:
Peter Steinberger
2026-01-12 21:40:22 +00:00
parent 3467b0ba07
commit 42c17adb5e
27 changed files with 6036 additions and 516 deletions

View File

@@ -0,0 +1,67 @@
import type {
HangupCallInput,
InitiateCallInput,
InitiateCallResult,
PlayTtsInput,
ProviderName,
ProviderWebhookParseResult,
StartListeningInput,
StopListeningInput,
WebhookContext,
WebhookVerificationResult,
} from "../types.js";
/**
* Abstract base interface for voice call providers.
*
* Each provider (Telnyx, Twilio, etc.) implements this interface to provide
* a consistent API for the call manager.
*
* Responsibilities:
* - Webhook verification and event parsing
* - Outbound call initiation and hangup
* - Media control (TTS playback, STT listening)
*
* The two webhook methods are synchronous; every call-control and media
* method returns a promise.
*/
export interface VoiceCallProvider {
/** Provider identifier */
readonly name: ProviderName;
/**
* Verify webhook signature/HMAC before processing.
* Must be called before parseWebhookEvent.
*
* @param ctx - The incoming webhook request.
* @returns The verification outcome for this request.
*/
verifyWebhook(ctx: WebhookContext): WebhookVerificationResult;
/**
* Parse provider-specific webhook payload into normalized events.
* Returns events and optional response to send back to provider.
*
* @param ctx - The incoming webhook request (same object passed to verifyWebhook).
*/
parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult;
/**
* Initiate an outbound call.
* @param input - Call parameters.
* @returns Provider call ID and status
*/
initiateCall(input: InitiateCallInput): Promise<InitiateCallResult>;
/**
* Hang up an active call.
* @param input - Identifies the call to terminate.
*/
hangupCall(input: HangupCallInput): Promise<void>;
/**
* Play TTS audio to the caller.
* The provider should handle streaming if supported.
* @param input - Identifies the call and carries the text to speak.
*/
playTts(input: PlayTtsInput): Promise<void>;
/**
* Start listening for user speech (activate STT).
* @param input - Identifies the call to start transcribing.
*/
startListening(input: StartListeningInput): Promise<void>;
/**
* Stop listening for user speech (deactivate STT).
* @param input - Identifies the call to stop transcribing.
*/
stopListening(input: StopListeningInput): Promise<void>;
}

View File

@@ -0,0 +1,9 @@
export type { VoiceCallProvider } from "./base.js";
export { MockProvider } from "./mock.js";
export {
OpenAIRealtimeSTTProvider,
type RealtimeSTTConfig,
type RealtimeSTTSession,
} from "./stt-openai-realtime.js";
export { TelnyxProvider } from "./telnyx.js";
export { TwilioProvider } from "./twilio.js";

View File

@@ -0,0 +1,168 @@
import crypto from "node:crypto";
import type {
EndReason,
HangupCallInput,
InitiateCallInput,
InitiateCallResult,
NormalizedEvent,
PlayTtsInput,
ProviderWebhookParseResult,
StartListeningInput,
StopListeningInput,
WebhookContext,
WebhookVerificationResult,
} from "../types.js";
import type { VoiceCallProvider } from "./base.js";
/**
 * In-memory voice call provider for local testing; it never contacts a
 * telephony service.
 *
 * Call events are injected by POSTing a JSON body to the webhook:
 * - `{ events: NormalizedEvent[] }` delivers several events at once
 * - `{ event: NormalizedEvent }` delivers a single event
 */
export class MockProvider implements VoiceCallProvider {
  readonly name = "mock" as const;

  /** Accepts every webhook: the mock has no signature to check. */
  verifyWebhook(_ctx: WebhookContext): WebhookVerificationResult {
    return { ok: true };
  }

  /**
   * Decode the JSON body and normalize each event it carries.
   *
   * Entries that fail normalization are dropped silently. Any exception
   * (malformed JSON, a `null` body, a `null` entry) yields an empty 400 result.
   */
  parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult {
    try {
      const body = JSON.parse(ctx.rawBody);
      // `events` (when it is an array) wins over `event`.
      const candidates: Partial<NormalizedEvent>[] = Array.isArray(body.events)
        ? body.events
        : body.event
          ? [body.event]
          : [];
      const events = candidates
        .map((candidate) => this.normalizeEvent(candidate))
        .filter((item): item is NormalizedEvent => item !== null);
      return { events, statusCode: 200 };
    } catch {
      return { events: [], statusCode: 400 };
    }
  }

  /**
   * Fill in defaults for a partially specified event.
   *
   * @returns The completed event, or `null` when `type`/`callId` is missing or
   *   the type is not one the mock understands.
   */
  private normalizeEvent(
    evt: Partial<NormalizedEvent>,
  ): NormalizedEvent | null {
    const { type, callId } = evt;
    if (!type || !callId) return null;

    // Fields shared by every event type; id and timestamp are generated when absent.
    const common = {
      id: evt.id || crypto.randomUUID(),
      callId,
      providerCallId: evt.providerCallId,
      timestamp: evt.timestamp || Date.now(),
    };
    // Loose view over the type-specific fields an incoming event may carry.
    const extra = evt as {
      text?: string;
      transcript?: string;
      isFinal?: boolean;
      confidence?: number;
      durationMs?: number;
      digits?: string;
      reason?: EndReason;
      error?: string;
      retryable?: boolean;
    };

    switch (type) {
      case "call.initiated":
      case "call.ringing":
      case "call.answered":
      case "call.active":
        return { ...common, type };
      case "call.speaking":
        return { ...common, type, text: extra.text || "" };
      case "call.speech":
        return {
          ...common,
          type,
          transcript: extra.transcript || "",
          // Treated as a final transcript unless the sender says otherwise.
          isFinal: extra.isFinal ?? true,
          confidence: extra.confidence,
        };
      case "call.silence":
        return { ...common, type, durationMs: extra.durationMs || 0 };
      case "call.dtmf":
        return { ...common, type, digits: extra.digits || "" };
      case "call.ended":
        return { ...common, type, reason: extra.reason || "completed" };
      case "call.error":
        return {
          ...common,
          type,
          error: extra.error || "unknown error",
          retryable: extra.retryable,
        };
      default:
        return null;
    }
  }

  /** Pretends to place a call; the provider call ID is `mock-<callId>`. */
  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
    const providerCallId = `mock-${input.callId}`;
    return { providerCallId, status: "initiated" };
  }

  /** Does nothing: the mock has no live call to hang up. */
  async hangupCall(_input: HangupCallInput): Promise<void> {}

  /** Does nothing: the mock plays no audio. */
  async playTts(_input: PlayTtsInput): Promise<void> {}

  /** Does nothing: speech arrives only through injected webhook events. */
  async startListening(_input: StartListeningInput): Promise<void> {}

  /** Does nothing: see startListening. */
  async stopListening(_input: StopListeningInput): Promise<void> {}
}

View File

@@ -0,0 +1,303 @@
/**
* OpenAI Realtime STT Provider
*
* Uses the OpenAI Realtime API for streaming transcription with:
* - Direct mu-law audio support (no conversion needed)
* - Built-in server-side VAD for turn detection
* - Low-latency streaming transcription
* - Partial transcript callbacks for real-time UI updates
*/
import WebSocket from "ws";
/**
* Configuration for OpenAI Realtime STT.
*
* Only `apiKey` is required; OpenAIRealtimeSTTProvider throws when it is
* missing and fills in the documented defaults for everything else.
*/
export interface RealtimeSTTConfig {
/** OpenAI API key */
apiKey: string;
/** Model to use (default: gpt-4o-transcribe) */
model?: string;
/** Silence duration in ms before considering speech ended (default: 800) */
silenceDurationMs?: number;
/** VAD threshold 0-1 (default: 0.5) */
vadThreshold?: number;
}
/**
* Session for streaming audio and receiving transcripts.
*
* Obtain one from OpenAIRealtimeSTTProvider.createSession(), then call
* connect() before sending audio.
*/
export interface RealtimeSTTSession {
/** Connect to the transcription service */
connect(): Promise<void>;
/** Send mu-law audio data (8kHz mono) */
sendAudio(audio: Buffer): void;
/**
* Wait for next complete transcript (after VAD detects end of speech).
* @param timeoutMs - How long to wait before the promise rejects.
*/
waitForTranscript(timeoutMs?: number): Promise<string>;
/** Set callback for partial transcripts (streaming) */
onPartial(callback: (partial: string) => void): void;
/** Set callback for final transcripts */
onTranscript(callback: (transcript: string) => void): void;
/** Close the session */
close(): void;
/** Check if session is connected */
isConnected(): boolean;
}
/**
 * Provider factory for OpenAI Realtime STT sessions.
 *
 * Holds the resolved configuration and hands it to every session it creates.
 */
export class OpenAIRealtimeSTTProvider {
  readonly name = "openai-realtime";
  private apiKey: string;
  private model: string;
  private silenceDurationMs: number;
  private vadThreshold: number;

  /**
   * @param config - Connection and VAD settings; see RealtimeSTTConfig for defaults.
   * @throws Error when `config.apiKey` is empty or missing.
   */
  constructor(config: RealtimeSTTConfig) {
    if (!config.apiKey) {
      throw new Error("OpenAI API key required for Realtime STT");
    }
    this.apiKey = config.apiKey;
    // An empty model string is never meaningful, so `||` is intentional here.
    this.model = config.model || "gpt-4o-transcribe";
    // `??` rather than `||`: an explicit 0 is a legitimate numeric setting
    // (the threshold range is documented as 0-1) and must not be replaced.
    this.silenceDurationMs = config.silenceDurationMs ?? 800;
    this.vadThreshold = config.vadThreshold ?? 0.5;
  }

  /**
   * Create a new realtime transcription session.
   *
   * The session is not connected yet; the caller must invoke `connect()`.
   */
  createSession(): RealtimeSTTSession {
    return new OpenAIRealtimeSTTSession(
      this.apiKey,
      this.model,
      this.silenceDurationMs,
      this.vadThreshold,
    );
  }
}
/**
 * WebSocket-based session for real-time speech-to-text.
 *
 * Lifecycle:
 * - `connect()` opens the socket and sends the transcription session config.
 * - If the socket closes while the session has not been `close()`d, the
 *   session reconnects with exponential backoff (1s, 2s, 4s, ...) up to
 *   MAX_RECONNECT_ATTEMPTS consecutive attempts.
 * - `close()` stops reconnection and closes the socket.
 */
class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
  private static readonly MAX_RECONNECT_ATTEMPTS = 5;
  private static readonly RECONNECT_DELAY_MS = 1000;
  /** How long a single connection attempt may take before it is abandoned. */
  private static readonly CONNECT_TIMEOUT_MS = 10000;

  private ws: WebSocket | null = null;
  private connected = false;
  /** True once close() was called; suppresses automatic reconnection. */
  private closed = false;
  private reconnectAttempts = 0;
  /** Accumulated partial transcript for the utterance in progress. */
  private pendingTranscript = "";
  private onTranscriptCallback: ((transcript: string) => void) | null = null;
  private onPartialCallback: ((partial: string) => void) | null = null;

  constructor(
    private readonly apiKey: string,
    private readonly model: string,
    private readonly silenceDurationMs: number,
    private readonly vadThreshold: number,
  ) {}

  /**
   * Open the connection, resetting the closed flag and the reconnect counter.
   * Rejects when the socket errors before opening or the attempt times out.
   */
  async connect(): Promise<void> {
    this.closed = false;
    this.reconnectAttempts = 0;
    return this.doConnect();
  }

  /**
   * Perform one connection attempt and wire up the socket handlers.
   *
   * Resolves once the socket is open and the session configuration was sent.
   */
  private async doConnect(): Promise<void> {
    return new Promise((resolve, reject) => {
      const url = "wss://api.openai.com/v1/realtime?intent=transcription";
      const socket = new WebSocket(url, {
        headers: {
          Authorization: `Bearer ${this.apiKey}`,
          "OpenAI-Beta": "realtime=v1",
        },
      });
      this.ws = socket;

      // Abandon the attempt if the handshake takes too long. The timer is
      // cleared by whichever of open/error/close fires first, so it never
      // outlives the attempt it guards.
      const connectTimer = setTimeout(() => {
        if (this.connected) return;
        reject(new Error("Realtime STT connection timeout"));
        // Drop the half-open socket; otherwise it could connect later, after
        // the caller was already told the attempt failed. The resulting
        // "close" event feeds the normal reconnect path.
        socket.terminate();
      }, OpenAIRealtimeSTTSession.CONNECT_TIMEOUT_MS);

      socket.on("open", () => {
        clearTimeout(connectTimer);
        console.log("[RealtimeSTT] WebSocket connected");
        this.connected = true;
        this.reconnectAttempts = 0;
        // Configure the transcription session
        this.sendEvent({
          type: "transcription_session.update",
          session: {
            input_audio_format: "g711_ulaw",
            input_audio_transcription: {
              model: this.model,
            },
            turn_detection: {
              type: "server_vad",
              threshold: this.vadThreshold,
              prefix_padding_ms: 300,
              silence_duration_ms: this.silenceDurationMs,
            },
          },
        });
        resolve();
      });

      socket.on("message", (data: Buffer) => {
        try {
          const event = JSON.parse(data.toString());
          this.handleEvent(event);
        } catch (e) {
          console.error("[RealtimeSTT] Failed to parse event:", e);
        }
      });

      socket.on("error", (error) => {
        clearTimeout(connectTimer);
        console.error("[RealtimeSTT] WebSocket error:", error);
        // Only a pre-open error fails the connect promise; later errors are
        // followed by "close", which drives reconnection.
        if (!this.connected) reject(error);
      });

      socket.on("close", (code, reason) => {
        clearTimeout(connectTimer);
        console.log(
          `[RealtimeSTT] WebSocket closed (code: ${code}, reason: ${reason?.toString() || "none"})`,
        );
        this.connected = false;
        // Attempt reconnection if not intentionally closed
        if (!this.closed) {
          void this.attemptReconnect();
        }
      });
    });
  }

  /**
   * Schedule one reconnect attempt with exponential backoff.
   * Gives up (logging an error) after MAX_RECONNECT_ATTEMPTS consecutive tries.
   */
  private async attemptReconnect(): Promise<void> {
    if (this.closed) {
      return;
    }
    if (
      this.reconnectAttempts >= OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS
    ) {
      console.error(
        `[RealtimeSTT] Max reconnect attempts (${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS}) reached`,
      );
      return;
    }
    this.reconnectAttempts++;
    const delay =
      OpenAIRealtimeSTTSession.RECONNECT_DELAY_MS *
      2 ** (this.reconnectAttempts - 1);
    console.log(
      `[RealtimeSTT] Reconnecting ${this.reconnectAttempts}/${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS} in ${delay}ms...`,
    );
    await new Promise((resolve) => setTimeout(resolve, delay));
    // close() may have been called while we were waiting.
    if (this.closed) {
      return;
    }
    try {
      await this.doConnect();
      console.log("[RealtimeSTT] Reconnected successfully");
    } catch (error) {
      // The failed socket's "close" event schedules the next attempt.
      console.error("[RealtimeSTT] Reconnect failed:", error);
    }
  }

  /** Dispatch one decoded server event. Unknown event types are ignored. */
  private handleEvent(event: {
    type: string;
    delta?: string;
    transcript?: string;
    error?: unknown;
  }): void {
    switch (event.type) {
      case "transcription_session.created":
      case "transcription_session.updated":
      case "input_audio_buffer.speech_stopped":
      case "input_audio_buffer.committed":
        console.log(`[RealtimeSTT] ${event.type}`);
        break;
      case "conversation.item.input_audio_transcription.delta":
        if (event.delta) {
          // Partial callback receives the whole utterance so far, not the delta.
          this.pendingTranscript += event.delta;
          this.onPartialCallback?.(this.pendingTranscript);
        }
        break;
      case "conversation.item.input_audio_transcription.completed":
        if (event.transcript) {
          console.log(`[RealtimeSTT] Transcript: ${event.transcript}`);
          this.onTranscriptCallback?.(event.transcript);
        }
        this.pendingTranscript = "";
        break;
      case "input_audio_buffer.speech_started":
        console.log("[RealtimeSTT] Speech started");
        this.pendingTranscript = "";
        break;
      case "error":
        console.error("[RealtimeSTT] Error:", event.error);
        break;
    }
  }

  /** Serialize and send an event; silently dropped unless the socket is OPEN. */
  private sendEvent(event: unknown): void {
    if (this.ws?.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify(event));
    }
  }

  /** Append mu-law audio to the server-side input buffer. Dropped while disconnected. */
  sendAudio(muLawData: Buffer): void {
    if (!this.connected) return;
    this.sendEvent({
      type: "input_audio_buffer.append",
      audio: muLawData.toString("base64"),
    });
  }

  /** Register the callback for partial transcripts (replaces any previous one). */
  onPartial(callback: (partial: string) => void): void {
    this.onPartialCallback = callback;
  }

  /** Register the callback for final transcripts (replaces any previous one). */
  onTranscript(callback: (transcript: string) => void): void {
    this.onTranscriptCallback = callback;
  }

  /**
   * Resolve with the next final transcript.
   *
   * While the wait is pending, the transcript is delivered to this promise
   * instead of the callback registered via onTranscript(); that callback is
   * put back as soon as the wait resolves or times out, so it is not lost.
   *
   * @param timeoutMs - Maximum wait in milliseconds (default 30000).
   * @throws Error("Transcript timeout") via rejection when no transcript arrives in time.
   */
  async waitForTranscript(timeoutMs = 30000): Promise<string> {
    return new Promise((resolve, reject) => {
      const previous = this.onTranscriptCallback;
      const waiter = (transcript: string): void => {
        clearTimeout(timeout);
        restorePrevious();
        resolve(transcript);
      };
      // Only restore if nobody replaced our one-shot handler in the meantime.
      const restorePrevious = (): void => {
        if (this.onTranscriptCallback === waiter) {
          this.onTranscriptCallback = previous;
        }
      };
      const timeout = setTimeout(() => {
        restorePrevious();
        reject(new Error("Transcript timeout"));
      }, timeoutMs);
      this.onTranscriptCallback = waiter;
    });
  }

  /** Close the socket and disable automatic reconnection. */
  close(): void {
    this.closed = true;
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
    this.connected = false;
  }

  /** True between a successful open and the next close. */
  isConnected(): boolean {
    return this.connected;
  }
}

View File

@@ -0,0 +1,364 @@
import crypto from "node:crypto";
import type { TelnyxConfig } from "../config.js";
import type {
EndReason,
HangupCallInput,
InitiateCallInput,
InitiateCallResult,
NormalizedEvent,
PlayTtsInput,
ProviderWebhookParseResult,
StartListeningInput,
StopListeningInput,
WebhookContext,
WebhookVerificationResult,
} from "../types.js";
import type { VoiceCallProvider } from "./base.js";
/**
 * Telnyx Voice API provider implementation.
 *
 * Uses Telnyx Call Control API v2 for managing calls.
 * @see https://developers.telnyx.com/docs/api/v2/call-control
 */
export class TelnyxProvider implements VoiceCallProvider {
  /**
   * Fixed DER header of an Ed25519 SubjectPublicKeyInfo structure. Prepending
   * it to a raw 32-byte public key yields the 44-byte SPKI encoding that
   * `crypto.verify` accepts.
   */
  private static readonly ED25519_SPKI_PREFIX = Buffer.from(
    "302a300506032b6570032100",
    "hex",
  );
  /** Length in bytes of a raw (unwrapped) Ed25519 public key. */
  private static readonly ED25519_RAW_KEY_LENGTH = 32;

  readonly name = "telnyx" as const;
  private readonly apiKey: string;
  private readonly connectionId: string;
  private readonly publicKey: string | undefined;
  private readonly baseUrl = "https://api.telnyx.com/v2";

  /**
   * @param config - Telnyx credentials. `publicKey` is optional; without it
   *   webhook signatures are not checked.
   * @throws Error when `apiKey` or `connectionId` is missing.
   */
  constructor(config: TelnyxConfig) {
    if (!config.apiKey) {
      throw new Error("Telnyx API key is required");
    }
    if (!config.connectionId) {
      throw new Error("Telnyx connection ID is required");
    }
    this.apiKey = config.apiKey;
    this.connectionId = config.connectionId;
    this.publicKey = config.publicKey;
  }

  /**
   * Make an authenticated request to the Telnyx API.
   *
   * @param endpoint - Path appended to the v2 base URL.
   * @param body - JSON request body.
   * @param options - `allowNotFound` turns a 404 into an `undefined` result.
   * @returns The parsed JSON response, or `undefined` for an empty body.
   * @throws Error for any other non-2xx response (status and body included).
   */
  private async apiRequest<T = unknown>(
    endpoint: string,
    body: Record<string, unknown>,
    options?: { allowNotFound?: boolean },
  ): Promise<T> {
    const response = await fetch(`${this.baseUrl}${endpoint}`, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${this.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(body),
    });
    if (!response.ok) {
      if (options?.allowNotFound && response.status === 404) {
        return undefined as T;
      }
      const errorText = await response.text();
      throw new Error(`Telnyx API error: ${response.status} ${errorText}`);
    }
    const text = await response.text();
    return text ? (JSON.parse(text) as T) : (undefined as T);
  }

  /**
   * Verify Telnyx webhook signature using Ed25519.
   *
   * The signed message is `<timestamp>|<rawBody>`. The configured public key
   * may be Base64 of either the raw 32-byte key or its DER/SPKI encoding.
   * After the signature checks out, the timestamp must be numeric and within
   * five minutes of the local clock.
   *
   * @returns `{ ok: true }` on success (or when no public key is configured),
   *   otherwise `{ ok: false, reason }`.
   */
  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
    if (!this.publicKey) {
      // No public key configured, skip verification (not recommended for production)
      return { ok: true };
    }
    const signature = ctx.headers["telnyx-signature-ed25519"];
    const timestamp = ctx.headers["telnyx-timestamp"];
    if (!signature || !timestamp) {
      return { ok: false, reason: "Missing signature or timestamp header" };
    }
    // Header values may arrive as arrays when the header is repeated.
    const signatureStr = Array.isArray(signature) ? signature[0] : signature;
    const timestampStr = Array.isArray(timestamp) ? timestamp[0] : timestamp;
    if (!signatureStr || !timestampStr) {
      return { ok: false, reason: "Empty signature or timestamp" };
    }
    try {
      const signedPayload = `${timestampStr}|${ctx.rawBody}`;
      const signatureBuffer = Buffer.from(signatureStr, "base64");
      const decodedKey = Buffer.from(this.publicKey, "base64");
      // A raw key cannot be parsed as SPKI, so wrap it first. A 32-byte
      // buffer can never be a valid SPKI document, so this is unambiguous.
      const publicKeyBuffer =
        decodedKey.length === TelnyxProvider.ED25519_RAW_KEY_LENGTH
          ? Buffer.concat([TelnyxProvider.ED25519_SPKI_PREFIX, decodedKey])
          : decodedKey;
      const isValid = crypto.verify(
        null, // Ed25519 doesn't use a digest
        Buffer.from(signedPayload),
        {
          key: publicKeyBuffer,
          format: "der",
          type: "spki",
        },
        signatureBuffer,
      );
      if (!isValid) {
        return { ok: false, reason: "Invalid signature" };
      }
      // Check timestamp is within 5 minutes
      const eventTime = Number.parseInt(timestampStr, 10) * 1000;
      if (!Number.isFinite(eventTime)) {
        // NaN would make the comparison below false and slip through.
        return { ok: false, reason: "Invalid timestamp" };
      }
      const now = Date.now();
      if (Math.abs(now - eventTime) > 5 * 60 * 1000) {
        return { ok: false, reason: "Timestamp too old" };
      }
      return { ok: true };
    } catch (err) {
      return {
        ok: false,
        reason: `Verification error: ${err instanceof Error ? err.message : String(err)}`,
      };
    }
  }

  /**
   * Parse Telnyx webhook event into normalized format.
   *
   * A body without `data.event_type`, or with an event type this provider does
   * not map, is acknowledged with 200 and no events. Malformed JSON yields 400.
   */
  parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult {
    try {
      const payload = JSON.parse(ctx.rawBody);
      const data = payload.data;
      if (!data || !data.event_type) {
        return { events: [], statusCode: 200 };
      }
      const event = this.normalizeEvent(data);
      return {
        events: event ? [event] : [],
        statusCode: 200,
      };
    } catch {
      return { events: [], statusCode: 400 };
    }
  }

  /**
   * Convert Telnyx event to normalized event format.
   *
   * The internal call ID is recovered from `client_state` (set by
   * initiateCall); when that is absent the Telnyx call control ID is used.
   *
   * @returns The normalized event, or `null` for unmapped event types.
   */
  private normalizeEvent(data: TelnyxEvent): NormalizedEvent | null {
    // Decode client_state from Base64 (we encode it in initiateCall)
    let callId = "";
    if (data.payload?.client_state) {
      try {
        callId = Buffer.from(data.payload.client_state, "base64").toString(
          "utf8",
        );
      } catch {
        // Fallback if not valid Base64
        callId = data.payload.client_state;
      }
    }
    if (!callId) {
      callId = data.payload?.call_control_id || "";
    }
    const baseEvent = {
      id: data.id || crypto.randomUUID(),
      callId,
      providerCallId: data.payload?.call_control_id,
      // Receipt time, not a timestamp taken from the Telnyx payload.
      timestamp: Date.now(),
    };
    switch (data.event_type) {
      case "call.initiated":
        return { ...baseEvent, type: "call.initiated" };
      case "call.ringing":
        return { ...baseEvent, type: "call.ringing" };
      case "call.answered":
        return { ...baseEvent, type: "call.answered" };
      case "call.bridged":
        return { ...baseEvent, type: "call.active" };
      case "call.speak.started":
        return {
          ...baseEvent,
          type: "call.speaking",
          text: data.payload?.text || "",
        };
      case "call.transcription":
        return {
          ...baseEvent,
          type: "call.speech",
          transcript: data.payload?.transcription || "",
          isFinal: data.payload?.is_final ?? true,
          confidence: data.payload?.confidence,
        };
      case "call.hangup":
        return {
          ...baseEvent,
          type: "call.ended",
          reason: this.mapHangupCause(data.payload?.hangup_cause),
        };
      case "call.dtmf.received":
        return {
          ...baseEvent,
          type: "call.dtmf",
          digits: data.payload?.digit || "",
        };
      default:
        return null;
    }
  }

  /**
   * Map Telnyx hangup cause to normalized end reason.
   * Unknown causes are logged and reported as "completed".
   * @see https://developers.telnyx.com/docs/api/v2/call-control/Call-Commands#hangup-causes
   */
  private mapHangupCause(cause?: string): EndReason {
    switch (cause) {
      case "normal_clearing":
      case "normal_unspecified":
        return "completed";
      case "originator_cancel":
        return "hangup-bot";
      case "call_rejected":
      case "user_busy":
        return "busy";
      case "no_answer":
      case "no_user_response":
        return "no-answer";
      case "destination_out_of_order":
      case "network_out_of_order":
      case "service_unavailable":
      case "recovery_on_timer_expire":
        return "failed";
      case "machine_detected":
      case "fax_detected":
        return "voicemail";
      case "user_hangup":
      case "subscriber_absent":
        return "hangup-user";
      default:
        // Unknown cause - log it for debugging and return completed
        if (cause) {
          console.warn(`[telnyx] Unknown hangup cause: ${cause}`);
        }
        return "completed";
    }
  }

  /**
   * Initiate an outbound call via Telnyx API.
   *
   * The internal call ID travels as Base64 `client_state` so that later
   * webhooks can be matched back to it.
   */
  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
    const result = await this.apiRequest<TelnyxCallResponse>("/calls", {
      connection_id: this.connectionId,
      to: input.to,
      from: input.from,
      webhook_url: input.webhookUrl,
      webhook_url_method: "POST",
      client_state: Buffer.from(input.callId).toString("base64"),
      timeout_secs: 30,
    });
    return {
      providerCallId: result.data.call_control_id,
      status: "initiated",
    };
  }

  /**
   * Hang up a call via Telnyx API. A 404 from Telnyx is ignored.
   */
  async hangupCall(input: HangupCallInput): Promise<void> {
    await this.apiRequest(
      `/calls/${input.providerCallId}/actions/hangup`,
      { command_id: crypto.randomUUID() },
      { allowNotFound: true },
    );
  }

  /**
   * Play TTS audio via Telnyx speak action.
   * Voice defaults to "female" and language to "en-US".
   */
  async playTts(input: PlayTtsInput): Promise<void> {
    await this.apiRequest(`/calls/${input.providerCallId}/actions/speak`, {
      command_id: crypto.randomUUID(),
      payload: input.text,
      voice: input.voice || "female",
      language: input.locale || "en-US",
    });
  }

  /**
   * Start transcription (STT) via Telnyx. Language defaults to "en".
   */
  async startListening(input: StartListeningInput): Promise<void> {
    await this.apiRequest(
      `/calls/${input.providerCallId}/actions/transcription_start`,
      {
        command_id: crypto.randomUUID(),
        language: input.language || "en",
      },
    );
  }

  /**
   * Stop transcription via Telnyx. A 404 from Telnyx is ignored.
   */
  async stopListening(input: StopListeningInput): Promise<void> {
    await this.apiRequest(
      `/calls/${input.providerCallId}/actions/transcription_stop`,
      { command_id: crypto.randomUUID() },
      { allowNotFound: true },
    );
  }
}
// -----------------------------------------------------------------------------
// Telnyx-specific types
// -----------------------------------------------------------------------------
/**
* The `data` object of a Telnyx webhook body, limited to the fields this
* provider reads. Any other payload key is admitted by the index signature.
*/
interface TelnyxEvent {
/** Event ID; a random UUID is substituted when absent. */
id?: string;
/** Telnyx event name, e.g. "call.answered" or "call.hangup". */
event_type: string;
payload?: {
/** Telnyx identifier of the call; reported as providerCallId. */
call_control_id?: string;
/** Base64 of the internal call ID, as sent by initiateCall. */
client_state?: string;
/** Text being spoken (read for call.speak.started). */
text?: string;
/** Recognized speech (read for call.transcription). */
transcription?: string;
/** Whether the transcription is final; treated as true when absent. */
is_final?: boolean;
confidence?: number;
/** Raw hangup cause, mapped to an EndReason by mapHangupCause. */
hangup_cause?: string;
/** DTMF digit (read for call.dtmf.received). */
digit?: string;
[key: string]: unknown;
};
}
/**
* Response body of the "create call" request issued by initiateCall.
* Only `data.call_control_id` is read in this file.
*/
interface TelnyxCallResponse {
data: {
/** Identifier used in all subsequent call-control requests. */
call_control_id: string;
call_leg_id: string;
call_session_id: string;
is_alive: boolean;
record_type: string;
};
}

View File

@@ -0,0 +1,264 @@
/**
* OpenAI TTS Provider
*
* Generates speech audio using OpenAI's text-to-speech API.
* Handles audio format conversion for telephony (mu-law 8kHz).
*
* Best practices from OpenAI docs:
* - Use gpt-4o-mini-tts for intelligent realtime applications (supports instructions)
* - Use tts-1 for lower latency, tts-1-hd for higher quality
* - Use marin or cedar voices for best quality
* - Use pcm or wav format for fastest response times
*
* @see https://platform.openai.com/docs/guides/text-to-speech
*/
/**
* OpenAI TTS configuration.
*
* Every field is optional; OpenAITTSProvider supplies the defaults noted below.
*/
export interface OpenAITTSConfig {
/** OpenAI API key (uses OPENAI_API_KEY env if not set) */
apiKey?: string;
/**
* TTS model:
* - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
* - tts-1: lower latency
* - tts-1-hd: higher quality
*
* Defaults to gpt-4o-mini-tts.
*/
model?: string;
/**
* Voice to use. For best quality, use marin or cedar.
* All 13 voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
* Note: tts-1/tts-1-hd only support: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer
*
* Defaults to coral. The value is not validated against the voice list.
*/
voice?: string;
/** Speed multiplier (0.25 to 4.0). Defaults to 1.0. */
speed?: number;
/**
* Instructions for speech style (only works with gpt-4o-mini-tts model).
* Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
*/
instructions?: string;
}
/**
* Supported OpenAI TTS voices (all 13 built-in voices).
* For best quality, use marin or cedar.
* Note: tts-1 and tts-1-hd support a smaller set.
*
* Declared `as const` so the voice union type below can be derived from it.
*/
export const OPENAI_TTS_VOICES = [
"alloy",
"ash",
"ballad",
"coral",
"echo",
"fable",
"nova",
"onyx",
"sage",
"shimmer",
"verse",
"marin",
"cedar",
] as const;
/** Union of the voice names listed in OPENAI_TTS_VOICES. */
export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
/**
 * Speech synthesis backed by OpenAI's `/v1/audio/speech` endpoint.
 *
 * Produces raw PCM, and can additionally convert it to the 8kHz mu-law
 * format used for Twilio media streams.
 */
export class OpenAITTSProvider {
  private apiKey: string;
  private model: string;
  private voice: OpenAITTSVoice;
  private speed: number;
  private instructions?: string;

  /**
   * @param config - Optional overrides; unset fields fall back to
   *   gpt-4o-mini-tts, the coral voice and speed 1.0.
   * @throws Error when no API key is passed and OPENAI_API_KEY is unset.
   */
  constructor(config: OpenAITTSConfig = {}) {
    const resolvedKey = config.apiKey || process.env.OPENAI_API_KEY || "";
    if (!resolvedKey) {
      throw new Error(
        "OpenAI API key required (set OPENAI_API_KEY or pass apiKey)",
      );
    }
    this.apiKey = resolvedKey;
    // gpt-4o-mini-tts is the default because it honours style instructions.
    this.model = config.model || "gpt-4o-mini-tts";
    // coral: good balance of quality and natural tone. The cast is unchecked.
    this.voice = (config.voice as OpenAITTSVoice) || "coral";
    this.speed = config.speed || 1.0;
    this.instructions = config.instructions;
  }

  /**
   * Synthesize `text` and return the raw response bytes.
   * The request asks for PCM (24kHz, mono, 16-bit signed little-endian).
   *
   * @param text - Text to speak.
   * @param instructions - Per-call style instructions; falls back to the
   *   configured ones. Sent only when the model name contains "gpt-4o-mini-tts".
   * @throws Error carrying the status and response body on a non-2xx reply.
   */
  async synthesize(text: string, instructions?: string): Promise<Buffer> {
    const style = instructions || this.instructions;
    const sendStyle = Boolean(style) && this.model.includes("gpt-4o-mini-tts");
    const payload: Record<string, unknown> = {
      model: this.model,
      input: text,
      voice: this.voice,
      response_format: "pcm",
      speed: this.speed,
      ...(sendStyle ? { instructions: style } : {}),
    };

    const reply = await fetch("https://api.openai.com/v1/audio/speech", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${this.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(payload),
    });
    if (reply.ok) {
      return Buffer.from(await reply.arrayBuffer());
    }
    const detail = await reply.text();
    throw new Error(`OpenAI TTS failed: ${reply.status} - ${detail}`);
  }

  /**
   * Synthesize `text` and convert it for Twilio Media Streams, which expect
   * 8kHz mono mu-law: 24kHz PCM from OpenAI is resampled to 8kHz and then
   * mu-law encoded. Uses the configured instructions only.
   */
  async synthesizeForTwilio(text: string): Promise<Buffer> {
    const pcmAt24k = await this.synthesize(text);
    return pcmToMulaw(resample24kTo8k(pcmAt24k));
  }
}
/**
 * Downsample 24kHz PCM to 8kHz by keeping every third sample (decimation).
 * Input/output: 16-bit signed little-endian mono.
 *
 * No interpolation or low-pass filtering is performed: with an exact 3:1
 * ratio every output position lands on an input sample, so the output is a
 * straight copy of input samples 0, 3, 6, ... Trailing input samples that do
 * not fill a whole group of three are dropped.
 *
 * @param input - PCM audio at 24kHz.
 * @returns PCM audio at 8kHz holding floor(inputSamples / 3) samples.
 */
function resample24kTo8k(input: Buffer): Buffer {
  const BYTES_PER_SAMPLE = 2;
  const DECIMATION = 3;
  const inputSamples = Math.floor(input.length / BYTES_PER_SAMPLE);
  const outputSamples = Math.floor(inputSamples / DECIMATION);
  const output = Buffer.alloc(outputSamples * BYTES_PER_SAMPLE);
  for (let i = 0; i < outputSamples; i++) {
    const sample = input.readInt16LE(i * DECIMATION * BYTES_PER_SAMPLE);
    // A value read as int16 is already in range; the clamp is kept as a
    // guard so the write stays safe if filtering is ever added above.
    output.writeInt16LE(clamp16(sample), i * BYTES_PER_SAMPLE);
  }
  return output;
}
/**
 * Clamp value to 16-bit signed integer range.
 *
 * @param value - Any number.
 * @returns `value` limited to [-32768, 32767].
 */
function clamp16(value: number): number {
  return Math.max(-32768, Math.min(32767, value));
}
/**
 * Convert 16-bit PCM to 8-bit mu-law.
 * Standard G.711 mu-law encoding for telephony.
 *
 * @param pcm - 16-bit signed little-endian samples. A trailing odd byte
 *   (half a sample) is ignored.
 * @returns One mu-law byte per complete input sample.
 */
function pcmToMulaw(pcm: Buffer): Buffer {
  // Floor so an odd-length buffer does not produce a read past the end.
  const samples = Math.floor(pcm.length / 2);
  const mulaw = Buffer.alloc(samples);
  for (let i = 0; i < samples; i++) {
    mulaw[i] = linearToMulaw(pcm.readInt16LE(i * 2));
  }
  return mulaw;
}
/**
 * Convert a single 16-bit linear sample to 8-bit mu-law.
 * Implements ITU-T G.711 mu-law encoding.
 *
 * @param sample - Signed 16-bit linear value.
 * @returns The mu-law byte (0-255), bit-inverted as transmitted on the wire.
 */
function linearToMulaw(sample: number): number {
  const BIAS = 132;
  const CLIP = 32635;
  const sign = sample < 0 ? 0x80 : 0;
  // Work on the magnitude, clipped so that adding the bias cannot overflow
  // 15 bits.
  const magnitude = Math.min(Math.abs(sample), CLIP) + BIAS;
  // The segment (exponent) is the position of the highest set bit, counted
  // down from bit 14; it bottoms out at 0.
  let exponent = 7;
  for (
    let expMask = 0x4000;
    exponent > 0 && (magnitude & expMask) === 0;
    expMask >>= 1
  ) {
    exponent--;
  }
  // The four bits just below the segment's leading bit.
  const mantissa = (magnitude >> (exponent + 3)) & 0x0f;
  // Combine into mu-law byte (inverted for transmission)
  return ~(sign | (exponent << 4) | mantissa) & 0xff;
}
/**
 * Decode one mu-law byte into a 16-bit linear PCM sample.
 * Intended for decoding incoming telephony audio.
 *
 * @param mulaw - The mu-law byte as received (bit-inverted on the wire);
 *   only the low 8 bits are used.
 * @returns The signed linear sample.
 */
export function mulawToLinear(mulaw: number): number {
  const BIAS = 132;
  // Undo the transmission inversion before splitting the byte into fields.
  const decoded = ~mulaw & 0xff;
  const signBit = decoded & 0x80;
  const exponent = (decoded >> 4) & 0x07;
  const mantissa = decoded & 0x0f;
  const magnitude = (((mantissa << 3) + BIAS) << exponent) - BIAS;
  return signBit ? -magnitude : magnitude;
}
/**
 * Chunk audio buffer into 20ms frames for streaming.
 * At 8kHz mono, 20ms = 160 samples = 160 bytes (mu-law).
 *
 * Chunks are views onto `audio` (no copying); the final chunk may be shorter
 * than `chunkSize`. An empty buffer yields nothing.
 *
 * @param audio - Audio bytes to split.
 * @param chunkSize - Bytes per chunk; must be a positive integer (default 160).
 * @returns A generator over consecutive chunks.
 * @throws RangeError when `chunkSize` is not a positive integer. This is
 *   checked up front because a zero or negative size would never advance
 *   and loop forever.
 */
export function chunkAudio(
  audio: Buffer,
  chunkSize = 160,
): Generator<Buffer, void, unknown> {
  if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
    throw new RangeError(
      `chunkSize must be a positive integer, got ${chunkSize}`,
    );
  }
  return (function* () {
    for (let start = 0; start < audio.length; start += chunkSize) {
      // subarray clamps the end index to the buffer length.
      yield audio.subarray(start, start + chunkSize);
    }
  })();
}

View File

@@ -0,0 +1,537 @@
import crypto from "node:crypto";
import type { TwilioConfig } from "../config.js";
import type { MediaStreamHandler } from "../media-stream.js";
import type {
HangupCallInput,
InitiateCallInput,
InitiateCallResult,
NormalizedEvent,
PlayTtsInput,
ProviderWebhookParseResult,
StartListeningInput,
StopListeningInput,
WebhookContext,
WebhookVerificationResult,
} from "../types.js";
import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
import { verifyTwilioWebhook } from "../webhook-security.js";
import type { VoiceCallProvider } from "./base.js";
import type { OpenAITTSProvider } from "./tts-openai.js";
import { chunkAudio } from "./tts-openai.js";
/**
* Twilio Voice API provider implementation.
*
* Uses Twilio Programmable Voice API with Media Streams for real-time
* bidirectional audio streaming.
*
* (This overview describes the TwilioProvider class declared below the
* options interface.)
*
* @see https://www.twilio.com/docs/voice
* @see https://www.twilio.com/docs/voice/media-streams
*/
/**
* Construction-time options for TwilioProvider. All fields are optional.
*/
export interface TwilioProviderOptions {
/**
* Allow ngrok free tier compatibility mode (less secure).
* When omitted, TwilioProvider.verifyWebhook passes `true` for this flag.
*/
allowNgrokFreeTier?: boolean;
/**
* Override public URL for signature verification.
* Also used as the provider's initial public webhook URL.
*/
publicUrl?: string;
/** Path for media stream WebSocket (e.g., /voice/stream) */
streamPath?: string;
/** Skip webhook signature verification (development only) */
skipVerification?: boolean;
}
export class TwilioProvider implements VoiceCallProvider {
readonly name = "twilio" as const;
private readonly accountSid: string;
private readonly authToken: string;
private readonly baseUrl: string;
private readonly callWebhookUrls = new Map<string, string>();
private readonly options: TwilioProviderOptions;
/** Current public webhook URL (set when tunnel starts or from config) */
private currentPublicUrl: string | null = null;
/** Optional OpenAI TTS provider for streaming TTS */
private ttsProvider: OpenAITTSProvider | null = null;
/** Optional media stream handler for sending audio */
private mediaStreamHandler: MediaStreamHandler | null = null;
/** Map of call SID to stream SID for media streams */
private callStreamMap = new Map<string, string>();
constructor(config: TwilioConfig, options: TwilioProviderOptions = {}) {
  // Both credentials are mandatory: they authenticate every REST request,
  // and the auth token is also handed to webhook verification.
  const { accountSid, authToken } = config;
  if (!accountSid) {
    throw new Error("Twilio Account SID is required");
  }
  if (!authToken) {
    throw new Error("Twilio Auth Token is required");
  }

  this.accountSid = accountSid;
  this.authToken = authToken;
  // Every REST endpoint used by this provider lives under the account resource.
  this.baseUrl = `https://api.twilio.com/2010-04-01/Accounts/${accountSid}`;
  this.options = options;

  // Seed the public URL from options; setPublicUrl() can replace it later.
  const { publicUrl } = options;
  if (publicUrl) {
    this.currentPublicUrl = publicUrl;
  }
}
/**
* Set the current public webhook URL (called when tunnel starts).
*/
setPublicUrl(url: string): void {
// Replaces any URL seeded from options.publicUrl in the constructor.
this.currentPublicUrl = url;
}
/**
* Get the current public webhook URL.
*/
getPublicUrl(): string | null {
// null until a URL is supplied via options.publicUrl or setPublicUrl().
return this.currentPublicUrl;
}
/**
* Set the OpenAI TTS provider for streaming TTS.
* When set, playTts will use OpenAI audio via media streams.
*/
setTTSProvider(provider: OpenAITTSProvider): void {
// Only stores the reference; replaces any previously set provider.
this.ttsProvider = provider;
}
/**
* Set the media stream handler for sending audio.
*/
setMediaStreamHandler(handler: MediaStreamHandler): void {
// Only stores the reference; replaces any previously set handler.
this.mediaStreamHandler = handler;
}
/**
* Register a call's stream SID for audio routing.
*/
registerCallStream(callSid: string, streamSid: string): void {
// Registering the same call SID again overwrites the earlier stream SID.
this.callStreamMap.set(callSid, streamSid);
}
/**
* Unregister a call's stream SID.
*/
unregisterCallStream(callSid: string): void {
// No effect when the call SID was never registered.
this.callStreamMap.delete(callSid);
}
/**
* Make an authenticated request to the Twilio API.
*/
private async apiRequest<T = unknown>(
  endpoint: string,
  params: Record<string, string>,
  options?: { allowNotFound?: boolean },
): Promise<T> {
  // POSTs form-encoded `params` to `<baseUrl><endpoint>` with HTTP Basic
  // auth (account SID as user name, auth token as password).
  const basicCredentials = Buffer.from(
    `${this.accountSid}:${this.authToken}`,
  ).toString("base64");
  const response = await fetch(`${this.baseUrl}${endpoint}`, {
    method: "POST",
    headers: {
      Authorization: `Basic ${basicCredentials}`,
      "Content-Type": "application/x-www-form-urlencoded",
    },
    body: new URLSearchParams(params),
  });

  if (response.ok) {
    // An empty body is reported as undefined rather than a JSON parse failure.
    const payload = await response.text();
    return payload ? (JSON.parse(payload) as T) : (undefined as T);
  }

  // Callers may opt in to treating "resource not found" as a non-error.
  const notFoundTolerated = options?.allowNotFound && response.status === 404;
  if (notFoundTolerated) {
    return undefined as T;
  }
  const errorText = await response.text();
  throw new Error(`Twilio API error: ${response.status} ${errorText}`);
}
/**
 * Check the signature of an incoming Twilio webhook.
 *
 * Delegates to `verifyTwilioWebhook`, passing the current public URL (if any)
 * plus the `allowNgrokFreeTier` (defaults to `true`) and `skipVerification`
 * provider options. Failures are logged together with the URL that was used
 * for verification when the helper reports one.
 *
 * @param ctx - Incoming webhook request context.
 * @returns `ok` and `reason` as reported by the verification helper.
 * @see https://www.twilio.com/docs/usage/webhooks/webhooks-security
 */
verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
  const outcome = verifyTwilioWebhook(ctx, this.authToken, {
    publicUrl: this.currentPublicUrl || undefined,
    allowNgrokFreeTier: this.options.allowNgrokFreeTier ?? true,
    skipVerification: this.options.skipVerification,
  });

  if (!outcome.ok) {
    console.warn(`[twilio] Webhook verification failed: ${outcome.reason}`);
    if (outcome.verificationUrl) {
      console.warn(`[twilio] Verification URL: ${outcome.verificationUrl}`);
    }
  }

  // Expose only the normalized fields; the verification URL stays internal.
  const { ok, reason } = outcome;
  return { ok, reason };
}
/**
 * Parse a Twilio webhook request into normalized events plus the TwiML reply.
 *
 * The form-encoded body is converted into at most one normalized event. A
 * non-empty `callId` query parameter (added to the webhook URL by
 * `initiateCall`) overrides the call ID derived from `CallSid`.
 *
 * @param ctx - Incoming webhook request context (`rawBody`, `query`).
 * @returns Events with an XML TwiML response and status 200, or an empty
 *   event list with status 400 if parsing or TwiML generation throws.
 */
parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult {
  try {
    const params = new URLSearchParams(ctx.rawBody);
    const callIdFromQuery =
      typeof ctx.query?.callId === "string" && ctx.query.callId.trim()
        ? ctx.query.callId.trim()
        : undefined;
    const event = this.normalizeEvent(params, callIdFromQuery);
    // For Twilio, we must return TwiML. Most actions are driven by Calls API updates,
    // so the webhook response is typically a pause to keep the call alive.
    const twiml = this.generateTwimlResponse(ctx);
    return {
      events: event ? [event] : [],
      providerResponseBody: twiml,
      providerResponseHeaders: { "Content-Type": "application/xml" },
      statusCode: 200,
    };
  } catch (err) {
    // Surface the cause (e.g. an unparsable public URL while building the
    // stream URL) instead of silently answering 400.
    console.warn(
      `[twilio] Failed to parse webhook event: ${
        err instanceof Error ? err.message : String(err)
      }`,
    );
    return { events: [], statusCode: 400 };
  }
}
/**
 * Map Twilio's `Direction` webhook value onto the normalized direction.
 *
 * @param direction - Raw `Direction` parameter, or `null` when absent.
 * @returns `"inbound"` for `inbound`, `"outbound"` for `outbound-api` and
 *   `outbound-dial`, and `undefined` for anything else.
 */
private static parseDirection(
  direction: string | null,
): "inbound" | "outbound" | undefined {
  switch (direction) {
    case "inbound":
      return "inbound";
    case "outbound-api":
    case "outbound-dial":
      return "outbound";
    default:
      return undefined;
  }
}
/**
 * Convert Twilio webhook params to normalized event format.
 *
 * Precedence: a non-empty `SpeechResult` yields `call.speech`, then non-empty
 * `Digits` yields `call.dtmf`, otherwise `CallStatus` is mapped to a call
 * lifecycle event.
 *
 * @param params - Parsed form body of the webhook request.
 * @param callIdOverride - Call ID to use instead of `CallSid` when provided.
 * @returns The normalized event, or `null` when the request carries no
 *   speech, digits, or recognized call status.
 */
private normalizeEvent(
  params: URLSearchParams,
  callIdOverride?: string,
): NormalizedEvent | null {
  const callSid = params.get("CallSid") || "";
  const baseEvent = {
    id: crypto.randomUUID(),
    callId: callIdOverride || callSid,
    providerCallId: callSid,
    timestamp: Date.now(),
    direction: TwilioProvider.parseDirection(params.get("Direction")),
    from: params.get("From") || undefined,
    to: params.get("To") || undefined,
  };
  // Handle speech result (from <Gather>)
  const speechResult = params.get("SpeechResult");
  if (speechResult) {
    // A missing or malformed Confidence must not leak NaN into the event;
    // fall back to the same 0.9 default used when the field is absent.
    const parsedConfidence = parseFloat(params.get("Confidence") || "0.9");
    return {
      ...baseEvent,
      type: "call.speech",
      transcript: speechResult,
      isFinal: true,
      confidence: Number.isFinite(parsedConfidence) ? parsedConfidence : 0.9,
    };
  }
  // Handle DTMF
  const digits = params.get("Digits");
  if (digits) {
    return { ...baseEvent, type: "call.dtmf", digits };
  }
  // Handle call status changes
  const callStatus = params.get("CallStatus");
  switch (callStatus) {
    case "initiated":
      return { ...baseEvent, type: "call.initiated" };
    case "ringing":
      return { ...baseEvent, type: "call.ringing" };
    case "in-progress":
      return { ...baseEvent, type: "call.answered" };
    case "completed":
    case "busy":
    case "no-answer":
    case "failed":
      // These Twilio statuses are reused verbatim as the end reason.
      return { ...baseEvent, type: "call.ended", reason: callStatus };
    case "canceled":
      return { ...baseEvent, type: "call.ended", reason: "hangup-bot" };
    default:
      return null;
  }
}
/**
 * TwiML document with an empty `<Response>`. Returned by
 * `generateTwimlResponse` when there is no request context or when an
 * outbound call is not yet in progress.
 */
private static readonly EMPTY_TWIML =
'<?xml version="1.0" encoding="UTF-8"?><Response></Response>';
/**
 * TwiML document containing a single `<Pause length="30"/>`. Returned by
 * `generateTwimlResponse` when a stream should be connected but no stream
 * URL can be derived (missing public URL or stream path).
 */
private static readonly PAUSE_TWIML = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Pause length="30"/>
</Response>`;
/**
 * Build the TwiML body returned for a webhook request.
 *
 * - No context: empty response.
 * - Inbound call, or any call whose status is `in-progress`: connect to the
 *   media stream, or pause when no stream URL is available.
 * - Anything else (outbound call not yet answered): empty response.
 *
 * @param ctx - Webhook request context; its raw body supplies `CallStatus`
 *   and `Direction`.
 * @returns TwiML document as a string.
 */
private generateTwimlResponse(ctx?: WebhookContext): string {
  if (!ctx) return TwilioProvider.EMPTY_TWIML;

  const form = new URLSearchParams(ctx.rawBody);
  const callStatus = form.get("CallStatus");
  const direction = form.get("Direction");
  console.log(
    `[voice-call] generateTwimlResponse: status=${callStatus} direction=${direction}`,
  );

  // Inbound calls are answered with a stream immediately; outbound calls
  // only once Twilio reports them as in-progress.
  const shouldConnectStream =
    direction === "inbound" || callStatus === "in-progress";
  if (!shouldConnectStream) {
    return TwilioProvider.EMPTY_TWIML;
  }

  const streamUrl = this.getStreamUrl();
  if (!streamUrl) {
    return TwilioProvider.PAUSE_TWIML;
  }
  return this.getStreamConnectXml(streamUrl);
}
/**
 * Derive the media stream WebSocket URL from the public URL and stream path.
 *
 * Only the origin of the public URL is kept (any path is dropped), the scheme
 * is switched from `https`/`http` to `wss`/`ws`, and the configured stream
 * path is appended with a leading slash.
 *
 * @returns The WebSocket URL, or `null` when either the public URL or the
 *   `streamPath` option is not set.
 */
private getStreamUrl(): string | null {
  const publicUrl = this.currentPublicUrl;
  const streamPath = this.options.streamPath;
  if (!publicUrl || !streamPath) {
    return null;
  }

  const { origin } = new URL(publicUrl);
  const wsOrigin = origin
    .replace(/^https:\/\//, "wss://")
    .replace(/^http:\/\//, "ws://");
  const normalizedPath = streamPath.startsWith("/")
    ? streamPath
    : `/${streamPath}`;

  return `${wsOrigin}${normalizedPath}`;
}
/**
 * Build the TwiML document that connects a call to a WebSocket media stream
 * via `<Connect><Stream>`.
 *
 * @param streamUrl - WebSocket URL (wss://...) for the media stream; it is
 *   XML-escaped before being placed in the `url` attribute.
 * @returns TwiML document as a string.
 */
getStreamConnectXml(streamUrl: string): string {
  const safeStreamUrl = escapeXml(streamUrl);
  return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Connect>
<Stream url="${safeStreamUrl}" />
</Connect>
</Response>`;
}
/**
 * Place an outbound call through the Twilio Calls API.
 *
 * The webhook URL is tagged with a `callId` query parameter and used as the
 * status callback. Call instructions come from `input.inlineTwiml` when it is
 * provided (notify mode); otherwise Twilio fetches TwiML from the same
 * webhook URL. The tagged URL is remembered per call SID for later
 * `playTts` / `startListening` requests.
 *
 * @param input - Destination, caller ID, webhook URL, internal call ID and
 *   optional inline TwiML.
 * @returns The Twilio call SID and `"queued"` or `"initiated"` status.
 */
async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
  const callbackUrl = new URL(input.webhookUrl);
  callbackUrl.searchParams.set("callId", input.callId);
  const callbackHref = callbackUrl.toString();

  // Inline TwiML (notify mode) needs no TwiML webhook; otherwise point Twilio
  // at the webhook for dynamic TwiML.
  const instructions: Record<string, string> = input.inlineTwiml
    ? { Twiml: input.inlineTwiml }
    : { Url: callbackHref };

  const params: Record<string, string> = {
    To: input.to,
    From: input.from,
    StatusCallback: callbackHref,
    StatusCallbackEvent: "initiated ringing answered completed",
    Timeout: "30",
    ...instructions,
  };

  const created = await this.apiRequest<TwilioCallResponse>(
    "/Calls.json",
    params,
  );
  this.callWebhookUrls.set(created.sid, callbackHref);

  return {
    providerCallId: created.sid,
    status: created.status === "queued" ? "queued" : "initiated",
  };
}
/**
 * End a call by setting its status to `completed` through the Twilio API.
 *
 * The remembered webhook URL for the call is dropped first. A 404 from
 * Twilio is tolerated rather than raised.
 *
 * @param input - Identifies the call via `providerCallId`.
 */
async hangupCall(input: HangupCallInput): Promise<void> {
  const { providerCallId } = input;
  this.callWebhookUrls.delete(providerCallId);

  const endpoint = `/Calls/${providerCallId}.json`;
  await this.apiRequest(
    endpoint,
    { Status: "completed" },
    { allowNotFound: true },
  );
}
/**
 * Play TTS audio via Twilio.
 *
 * Two modes:
 * 1. OpenAI TTS + Media Streams: used when a TTS provider, a media stream
 *    handler, and a registered stream SID for the call are all available.
 * 2. TwiML `<Say>`: fallback that updates the live call with a `<Say>`
 *    followed by a speech `<Gather>` posting back to the call's webhook URL.
 *    Also used when mode 1 throws. Note: this may not work on all Twilio
 *    accounts.
 *
 * @param input - Call identifier, text to speak, and optional voice/locale.
 * @throws Error when the fallback is needed but no webhook URL is known for
 *   the call, or when the Twilio API request fails.
 */
async playTts(input: PlayTtsInput): Promise<void> {
  // Try OpenAI TTS via media stream first (if configured)
  const streamSid = this.callStreamMap.get(input.providerCallId);
  if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
    try {
      await this.playTtsViaStream(input.text, streamSid);
      return;
    } catch (err) {
      console.warn(
        `[voice-call] OpenAI TTS failed, falling back to Twilio <Say>:`,
        err instanceof Error ? err.message : err,
      );
      // Fall through to TwiML <Say> fallback
    }
  }
  // Fall back to TwiML <Say> (may not work on all accounts)
  const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
  if (!webhookUrl) {
    throw new Error(
      "Missing webhook URL for this call (provider state not initialized)",
    );
  }
  console.warn(
    "[voice-call] Using TwiML <Say> fallback - OpenAI TTS not configured or media stream not active",
  );
  const pollyVoice = mapVoiceToPolly(input.voice);
  // The locale is caller-supplied, so escape it like every other
  // interpolated value to keep the TwiML well-formed.
  const sayLanguage = escapeXml(input.locale || "en-US");
  const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Say voice="${pollyVoice}" language="${sayLanguage}">${escapeXml(input.text)}</Say>
<Gather input="speech" speechTimeout="auto" action="${escapeXml(webhookUrl)}" method="POST">
<Say>.</Say>
</Gather>
</Response>`;
  await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
    Twiml: twiml,
  });
}
/**
 * Play TTS via OpenAI and Twilio Media Streams.
 *
 * Synthesizes audio with the TTS provider, splits it into 160-byte chunks
 * (20 ms each at 8 kHz mu-law) and sends them over the media stream, paced
 * against a wall-clock schedule. A mark is sent after the last chunk so the
 * end of playback can be tracked.
 *
 * @param text - Text to synthesize.
 * @param streamSid - Media stream SID to send the audio to.
 * @throws Error when the TTS provider or media stream handler is not set.
 */
private async playTtsViaStream(
  text: string,
  streamSid: string,
): Promise<void> {
  if (!this.ttsProvider || !this.mediaStreamHandler) {
    throw new Error("TTS provider and media stream handler required");
  }
  // Generate audio with OpenAI TTS (returns mu-law at 8kHz)
  const muLawAudio = await this.ttsProvider.synthesizeForTwilio(text);
  // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
  const CHUNK_SIZE = 160;
  const CHUNK_DELAY_MS = 20;
  // Pace against the start time rather than sleeping a fixed 20 ms per chunk:
  // timers only guarantee a minimum delay, so fixed sleeps accumulate drift
  // and the audio would be delivered slower than it plays back.
  const startedAt = Date.now();
  let chunksSent = 0;
  for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
    this.mediaStreamHandler.sendAudio(streamSid, chunk);
    chunksSent += 1;
    const nextSendAt = startedAt + chunksSent * CHUNK_DELAY_MS;
    // Cap the wait at one chunk so a wall-clock adjustment cannot stall playback.
    const waitMs = Math.min(CHUNK_DELAY_MS, nextSendAt - Date.now());
    if (waitMs > 0) {
      await new Promise((resolve) => setTimeout(resolve, waitMs));
    }
  }
  // Send a mark to track when audio finishes
  this.mediaStreamHandler.sendMark(streamSid, `tts-${Date.now()}`);
}
/**
 * Start listening for speech via Twilio `<Gather>`.
 *
 * Updates the live call with TwiML containing a speech `<Gather>` whose
 * result is posted back to the webhook URL remembered for the call.
 *
 * @param input - Call identifier and optional recognition language
 *   (defaults to `en-US`).
 * @throws Error when no webhook URL is known for the call, or when the
 *   Twilio API request fails.
 */
async startListening(input: StartListeningInput): Promise<void> {
  const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
  if (!webhookUrl) {
    throw new Error(
      "Missing webhook URL for this call (provider state not initialized)",
    );
  }
  // The language is caller-supplied, so escape it like the action URL to
  // keep the TwiML well-formed.
  const gatherLanguage = escapeXml(input.language || "en-US");
  const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Gather input="speech" speechTimeout="auto" language="${gatherLanguage}" action="${escapeXml(webhookUrl)}" method="POST">
</Gather>
</Response>`;
  await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
    Twiml: twiml,
  });
}
/**
 * Stop listening - for Twilio this is a no-op as <Gather> auto-ends.
 *
 * Present to satisfy the provider interface; it makes no API call and
 * resolves immediately.
 *
 * @param _input - Unused.
 */
async stopListening(_input: StopListeningInput): Promise<void> {
// Twilio's <Gather> automatically stops on speech end
// (startListening issues it with speechTimeout="auto").
// No explicit action needed
}
}
// -----------------------------------------------------------------------------
// Twilio-specific types
// -----------------------------------------------------------------------------
/**
 * Shape of the JSON returned by the Twilio Calls API when a call is created,
 * as consumed by `initiateCall`. Only `sid` and `status` are read in this
 * part of the file.
 */
interface TwilioCallResponse {
/** Twilio call SID; returned to callers as `providerCallId`. */
sid: string;
/** Call status; `"queued"` is passed through, any other value is reported as `"initiated"`. */
status: string;
/** NOTE(review): presumably Twilio's call direction string; not read in this part of the file. */
direction: string;
/** NOTE(review): presumably the caller number; not read in this part of the file. */
from: string;
/** NOTE(review): presumably the callee number; not read in this part of the file. */
to: string;
/** NOTE(review): presumably the resource URI of the call; not read in this part of the file. */
uri: string;
}