// Listing metadata (from the source viewer): 569 lines, 18 KiB, TypeScript.
import crypto from "node:crypto";
|
|
|
|
import type { TwilioConfig } from "../config.js";
|
|
import type { MediaStreamHandler } from "../media-stream.js";
|
|
import type {
|
|
HangupCallInput,
|
|
InitiateCallInput,
|
|
InitiateCallResult,
|
|
NormalizedEvent,
|
|
PlayTtsInput,
|
|
ProviderWebhookParseResult,
|
|
StartListeningInput,
|
|
StopListeningInput,
|
|
WebhookContext,
|
|
WebhookVerificationResult,
|
|
} from "../types.js";
|
|
import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
|
|
import type { VoiceCallProvider } from "./base.js";
|
|
import type { OpenAITTSProvider } from "./tts-openai.js";
|
|
import { chunkAudio } from "./tts-openai.js";
|
|
import { twilioApiRequest } from "./twilio/api.js";
|
|
import { verifyTwilioProviderWebhook } from "./twilio/webhook.js";
|
|
|
|
/**
 * Twilio Voice API provider implementation.
 *
 * Uses Twilio Programmable Voice API with Media Streams for real-time
 * bidirectional audio streaming.
 *
 * @see https://www.twilio.com/docs/voice
 * @see https://www.twilio.com/docs/voice/media-streams
 */

/**
 * Optional settings accepted by the `TwilioProvider` constructor.
 *
 * The whole object is also handed to the webhook verifier, so the
 * verification-related flags are interpreted there rather than in the
 * provider itself.
 */
export interface TwilioProviderOptions {
  /** Allow ngrok free tier compatibility mode (less secure) */
  allowNgrokFreeTier?: boolean;

  /**
   * Override public URL for signature verification.
   * When set, it is also the initial public URL of the provider, from which
   * the media stream WebSocket origin is derived.
   */
  publicUrl?: string;

  /**
   * Path for media stream WebSocket (e.g., /voice/stream).
   * A leading "/" is added when missing.
   */
  streamPath?: string;

  /** Skip webhook signature verification (development only) */
  skipVerification?: boolean;
}
|
|
|
|
/**
 * Voice-call provider backed by the Twilio Programmable Voice REST API.
 *
 * Responsibilities visible in this class:
 * - verifying and parsing Twilio webhooks into normalized events,
 * - answering webhooks with TwiML (stored notify-mode TwiML, a media stream
 *   `<Connect>`, a `<Pause>`, or an empty response),
 * - driving live calls through the Calls API (initiate, hang up, `<Say>`,
 *   `<Gather>`),
 * - streaming synthesized audio over a media stream when a TTS provider and
 *   a media stream handler have been attached.
 *
 * All per-call state is kept in in-memory maps on the instance.
 */
export class TwilioProvider implements VoiceCallProvider {
  readonly name = "twilio" as const;

  /** TwiML that gives Twilio no instructions. */
  private static readonly EMPTY_TWIML =
    '<?xml version="1.0" encoding="UTF-8"?><Response></Response>';

  /** TwiML that holds the call for 30 seconds when no stream URL is available. */
  private static readonly PAUSE_TWIML = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Pause length="30"/>
</Response>`;

  /** Confidence reported when Twilio sends none, or sends an unparsable value. */
  private static readonly DEFAULT_SPEECH_CONFIDENCE = 0.9;

  /** Language used for `<Say>` / `<Gather>` when the caller supplies none. */
  private static readonly DEFAULT_LOCALE = "en-US";

  private readonly accountSid: string;
  private readonly authToken: string;
  private readonly baseUrl: string;

  /** Map of Twilio call SID to the TwiML webhook URL used when the call was initiated. */
  private readonly callWebhookUrls = new Map<string, string>();
  private readonly options: TwilioProviderOptions;

  /** Current public webhook URL (set when tunnel starts or from config) */
  private currentPublicUrl: string | null = null;

  /** Optional OpenAI TTS provider for streaming TTS */
  private ttsProvider: OpenAITTSProvider | null = null;

  /** Optional media stream handler for sending audio */
  private mediaStreamHandler: MediaStreamHandler | null = null;

  /** Map of call SID to stream SID for media streams */
  private callStreamMap = new Map<string, string>();

  /** Storage for TwiML content (for notify mode with URL-based TwiML) */
  private readonly twimlStorage = new Map<string, string>();

  /** Track notify-mode calls to avoid streaming on follow-up callbacks */
  private readonly notifyCalls = new Set<string>();

  /**
   * @param config - Twilio credentials; `accountSid` and `authToken` are required.
   * @param options - Optional behavior switches; `publicUrl` seeds the current public URL.
   * @throws Error when the account SID or auth token is missing.
   */
  constructor(config: TwilioConfig, options: TwilioProviderOptions = {}) {
    if (!config.accountSid) {
      throw new Error("Twilio Account SID is required");
    }
    if (!config.authToken) {
      throw new Error("Twilio Auth Token is required");
    }

    this.accountSid = config.accountSid;
    this.authToken = config.authToken;
    this.baseUrl = `https://api.twilio.com/2010-04-01/Accounts/${this.accountSid}`;
    this.options = options;

    if (options.publicUrl) {
      this.currentPublicUrl = options.publicUrl;
    }
  }

  /** Replace the public URL used for signature verification and stream URLs. */
  setPublicUrl(url: string): void {
    this.currentPublicUrl = url;
  }

  /** Current public URL, or `null` when none has been configured yet. */
  getPublicUrl(): string | null {
    return this.currentPublicUrl;
  }

  /** Attach the TTS provider used for streaming playback. */
  setTTSProvider(provider: OpenAITTSProvider): void {
    this.ttsProvider = provider;
  }

  /** Attach the handler used to push audio and marks onto media streams. */
  setMediaStreamHandler(handler: MediaStreamHandler): void {
    this.mediaStreamHandler = handler;
  }

  /** Remember which media stream belongs to a call. */
  registerCallStream(callSid: string, streamSid: string): void {
    this.callStreamMap.set(callSid, streamSid);
  }

  /** Forget the media stream associated with a call. */
  unregisterCallStream(callSid: string): void {
    this.callStreamMap.delete(callSid);
  }

  /**
   * Read the trimmed, non-empty `callId` query parameter of a webhook request.
   */
  private static readCallId(ctx: WebhookContext): string | undefined {
    const raw = ctx.query?.callId;
    if (typeof raw !== "string") return undefined;
    return raw.trim() || undefined;
  }

  /**
   * Delete stored TwiML for a given `callId`.
   *
   * We keep TwiML in-memory only long enough to satisfy the initial Twilio
   * webhook request (notify mode). Subsequent webhooks should not reuse it.
   */
  private deleteStoredTwiml(callId: string): void {
    this.twimlStorage.delete(callId);
    this.notifyCalls.delete(callId);
  }

  /**
   * Delete stored TwiML for a call, addressed by Twilio's provider call SID.
   *
   * This is used when we only have `providerCallId` (e.g. hangup). The
   * internal call id is recovered from the `callId` query parameter of the
   * webhook URL recorded at initiation time.
   */
  private deleteStoredTwimlForProviderCall(providerCallId: string): void {
    const webhookUrl = this.callWebhookUrls.get(providerCallId);
    if (!webhookUrl) return;

    // Parse the URL instead of pattern-matching it: `searchParams` decodes the
    // value (it was encoded by `searchParams.set` in `initiateCall`) and only
    // matches the exact `callId` parameter name.
    let callId: string | null;
    try {
      callId = new URL(webhookUrl).searchParams.get("callId");
    } catch {
      return;
    }
    if (!callId) return;

    this.deleteStoredTwiml(callId);
  }

  /**
   * Drop all per-call bookkeeping once Twilio reports a terminal status, so
   * calls that end without `hangupCall` do not leak map entries.
   */
  private releaseEndedCall(providerCallId: string, callId?: string): void {
    if (callId) {
      this.deleteStoredTwiml(callId);
    } else if (providerCallId) {
      this.deleteStoredTwimlForProviderCall(providerCallId);
    }
    if (providerCallId) {
      this.callWebhookUrls.delete(providerCallId);
    }
  }

  /**
   * Make an authenticated request to the Twilio API.
   *
   * @param endpoint - Path appended to the account base URL (e.g. `/Calls.json`).
   * @param params - Form parameters for the request body.
   * @param options - `allowNotFound` is forwarded to the underlying request helper.
   */
  private async apiRequest<T = unknown>(
    endpoint: string,
    params: Record<string, string | string[]>,
    options?: { allowNotFound?: boolean },
  ): Promise<T> {
    return await twilioApiRequest<T>({
      baseUrl: this.baseUrl,
      accountSid: this.accountSid,
      authToken: this.authToken,
      endpoint,
      body: params,
      allowNotFound: options?.allowNotFound,
    });
  }

  /**
   * Verify Twilio webhook signature using HMAC-SHA1.
   *
   * Handles reverse proxy scenarios (Tailscale, nginx, ngrok) by reconstructing
   * the public URL from forwarding headers.
   *
   * @see https://www.twilio.com/docs/usage/webhooks/webhooks-security
   */
  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
    return verifyTwilioProviderWebhook({
      ctx,
      authToken: this.authToken,
      currentPublicUrl: this.currentPublicUrl,
      options: this.options,
    });
  }

  /**
   * Parse Twilio webhook event into normalized format.
   *
   * @returns The normalized events (zero or one) together with the TwiML body
   *   Twilio expects in response; status 400 with no events when anything
   *   throws while building the response.
   */
  parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult {
    try {
      const params = new URLSearchParams(ctx.rawBody);
      const event = this.normalizeEvent(params, TwilioProvider.readCallId(ctx));

      // For Twilio, we must return TwiML. Most actions are driven by Calls API updates,
      // so the webhook response is typically a pause to keep the call alive.
      const twiml = this.generateTwimlResponse(ctx);

      return {
        events: event ? [event] : [],
        providerResponseBody: twiml,
        providerResponseHeaders: { "Content-Type": "application/xml" },
        statusCode: 200,
      };
    } catch (err) {
      // Log only the error message: webhook params/TwiML may contain PII.
      console.warn(
        "[voice-call] Failed to process Twilio webhook:",
        err instanceof Error ? err.message : err,
      );
      return { events: [], statusCode: 400 };
    }
  }

  /**
   * Parse Twilio direction to normalized format.
   */
  private static parseDirection(
    direction: string | null,
  ): "inbound" | "outbound" | undefined {
    if (direction === "inbound") return "inbound";
    if (direction === "outbound-api" || direction === "outbound-dial") {
      return "outbound";
    }
    return undefined;
  }

  /**
   * Convert Twilio webhook params to normalized event format.
   *
   * Precedence: speech result, then DTMF digits, then call status. Terminal
   * statuses also release the in-memory state kept for the call.
   *
   * @param params - Form-encoded webhook body.
   * @param callIdOverride - Internal call id from the webhook query string;
   *   the Twilio call SID is used as the call id when absent.
   * @returns The normalized event, or `null` when the webhook carries nothing
   *   this provider maps to an event.
   */
  private normalizeEvent(
    params: URLSearchParams,
    callIdOverride?: string,
  ): NormalizedEvent | null {
    const callSid = params.get("CallSid") || "";

    const baseEvent = {
      id: crypto.randomUUID(),
      callId: callIdOverride || callSid,
      providerCallId: callSid,
      timestamp: Date.now(),
      direction: TwilioProvider.parseDirection(params.get("Direction")),
      from: params.get("From") || undefined,
      to: params.get("To") || undefined,
    };

    // Handle speech result (from <Gather>)
    const speechResult = params.get("SpeechResult");
    if (speechResult) {
      // A missing, empty or non-numeric Confidence falls back to the default
      // instead of propagating NaN to consumers.
      const reported = Number.parseFloat(params.get("Confidence") ?? "");
      return {
        ...baseEvent,
        type: "call.speech",
        transcript: speechResult,
        isFinal: true,
        confidence: Number.isFinite(reported)
          ? reported
          : TwilioProvider.DEFAULT_SPEECH_CONFIDENCE,
      };
    }

    // Handle DTMF
    const digits = params.get("Digits");
    if (digits) {
      return { ...baseEvent, type: "call.dtmf", digits };
    }

    // Handle call status changes
    const callStatus = params.get("CallStatus");
    switch (callStatus) {
      case "initiated":
        return { ...baseEvent, type: "call.initiated" };
      case "ringing":
        return { ...baseEvent, type: "call.ringing" };
      case "in-progress":
        return { ...baseEvent, type: "call.answered" };
      case "completed":
      case "busy":
      case "no-answer":
      case "failed":
        this.releaseEndedCall(callSid, callIdOverride);
        return { ...baseEvent, type: "call.ended", reason: callStatus };
      case "canceled":
        this.releaseEndedCall(callSid, callIdOverride);
        return { ...baseEvent, type: "call.ended", reason: "hangup-bot" };
      default:
        return null;
    }
  }

  /**
   * Generate TwiML response for webhook.
   * When a call is answered, connects to media stream for bidirectional audio.
   *
   * Decision order:
   * 1. stored notify-mode TwiML for the call id (served once, then deleted);
   * 2. empty TwiML for later requests of a notify-mode call;
   * 3. empty TwiML for status callbacks (`type=status` in the query);
   * 4. stream `<Connect>` for inbound calls and for calls that are
   *    `in-progress`, or a `<Pause>` when no stream URL can be derived;
   * 5. empty TwiML otherwise.
   */
  private generateTwimlResponse(ctx?: WebhookContext): string {
    if (!ctx) return TwilioProvider.EMPTY_TWIML;

    const params = new URLSearchParams(ctx.rawBody);
    const type =
      typeof ctx.query?.type === "string" ? ctx.query.type.trim() : undefined;
    const isStatusCallback = type === "status";
    const callStatus = params.get("CallStatus");
    const direction = params.get("Direction");
    const callIdFromQuery = TwilioProvider.readCallId(ctx);

    // Avoid logging webhook params/TwiML (may contain PII).

    // Handle initial TwiML request (when Twilio first initiates the call)
    // Check if we have stored TwiML for this call (notify mode)
    if (callIdFromQuery && !isStatusCallback) {
      const storedTwiml = this.twimlStorage.get(callIdFromQuery);
      if (storedTwiml) {
        // Clean up after serving (one-time use)
        this.deleteStoredTwiml(callIdFromQuery);
        return storedTwiml;
      }
      if (this.notifyCalls.has(callIdFromQuery)) {
        return TwilioProvider.EMPTY_TWIML;
      }
    }

    // Status callbacks should not receive TwiML.
    if (isStatusCallback) {
      return TwilioProvider.EMPTY_TWIML;
    }

    // Inbound calls are answered immediately with a stream; any other call is
    // only connected once it is in progress.
    if (direction !== "inbound" && callStatus !== "in-progress") {
      return TwilioProvider.EMPTY_TWIML;
    }

    const streamUrl = this.getStreamUrl();
    return streamUrl
      ? this.getStreamConnectXml(streamUrl)
      : TwilioProvider.PAUSE_TWIML;
  }

  /**
   * Get the WebSocket URL for media streaming.
   * Derives from the public URL origin + stream path.
   *
   * @returns The `ws(s)://` URL, or `null` when the public URL or the stream
   *   path is not configured.
   * @throws TypeError when the configured public URL cannot be parsed.
   */
  private getStreamUrl(): string | null {
    if (!this.currentPublicUrl || !this.options.streamPath) {
      return null;
    }

    // Extract just the origin (host) from the public URL, ignoring any path
    const origin = new URL(this.currentPublicUrl).origin;

    // Convert https:// to wss:// (and http:// to ws://) for WebSocket
    const wsOrigin = origin
      .replace(/^https:\/\//, "wss://")
      .replace(/^http:\/\//, "ws://");

    // Append the stream path
    const configuredPath = this.options.streamPath;
    const path = configuredPath.startsWith("/")
      ? configuredPath
      : `/${configuredPath}`;

    return `${wsOrigin}${path}`;
  }

  /**
   * Generate TwiML to connect a call to a WebSocket media stream.
   * This enables bidirectional audio streaming for real-time STT/TTS.
   *
   * @param streamUrl - WebSocket URL (wss://...) for the media stream
   */
  getStreamConnectXml(streamUrl: string): string {
    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Connect>
    <Stream url="${escapeXml(streamUrl)}" />
  </Connect>
</Response>`;
  }

  /**
   * Initiate an outbound call via Twilio API.
   *
   * The call always uses URL-based TwiML: when `inlineTwiml` is provided
   * (notify mode) it is stored in memory and served from the webhook on
   * Twilio's first request instead of being sent inline.
   *
   * @throws Whatever the API request throws; TwiML stored for the call is
   *   removed again in that case.
   */
  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
    const url = new URL(input.webhookUrl);
    url.searchParams.set("callId", input.callId);

    // Create separate URL for status callbacks (required by Twilio)
    const statusUrl = new URL(input.webhookUrl);
    statusUrl.searchParams.set("callId", input.callId);
    statusUrl.searchParams.set("type", "status"); // Differentiate from TwiML requests

    // Store TwiML content if provided (for notify mode)
    // We now serve it from the webhook endpoint instead of sending inline
    if (input.inlineTwiml) {
      this.twimlStorage.set(input.callId, input.inlineTwiml);
      this.notifyCalls.add(input.callId);
    }

    // Build request params - always use URL-based TwiML.
    // Twilio silently ignores `StatusCallback` when using the inline `Twiml` parameter.
    const params: Record<string, string | string[]> = {
      To: input.to,
      From: input.from,
      Url: url.toString(), // TwiML serving endpoint
      StatusCallback: statusUrl.toString(), // Separate status callback endpoint
      StatusCallbackEvent: ["initiated", "ringing", "answered", "completed"],
      Timeout: "30",
    };

    let result: TwilioCallResponse;
    try {
      result = await this.apiRequest<TwilioCallResponse>("/Calls.json", params);
    } catch (err) {
      // The call was not created from our point of view: do not keep its
      // TwiML (which may contain message text) around indefinitely.
      this.deleteStoredTwiml(input.callId);
      throw err;
    }

    this.callWebhookUrls.set(result.sid, url.toString());

    return {
      providerCallId: result.sid,
      status: result.status === "queued" ? "queued" : "initiated",
    };
  }

  /**
   * Hang up a call via Twilio API.
   *
   * Local state for the call is released before the request is sent; the
   * request is made with `allowNotFound` set.
   */
  async hangupCall(input: HangupCallInput): Promise<void> {
    this.deleteStoredTwimlForProviderCall(input.providerCallId);

    this.callWebhookUrls.delete(input.providerCallId);

    await this.apiRequest(
      `/Calls/${input.providerCallId}.json`,
      { Status: "completed" },
      { allowNotFound: true },
    );
  }

  /**
   * Play TTS audio via Twilio.
   *
   * Two modes:
   * 1. OpenAI TTS + Media Streams: If TTS provider and media stream are available,
   *    generates audio via OpenAI and streams it through WebSocket (preferred).
   * 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
   *    Note: This may not work on all Twilio accounts.
   *
   * @throws Error when the fallback is needed but no webhook URL is known for
   *   the call.
   */
  async playTts(input: PlayTtsInput): Promise<void> {
    // Try OpenAI TTS via media stream first (if configured)
    const streamSid = this.callStreamMap.get(input.providerCallId);
    if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
      try {
        await this.playTtsViaStream(input.text, streamSid);
        return;
      } catch (err) {
        console.warn(
          `[voice-call] OpenAI TTS failed, falling back to Twilio <Say>:`,
          err instanceof Error ? err.message : err,
        );
        // Fall through to TwiML <Say> fallback
      }
    }

    // Fall back to TwiML <Say> (may not work on all accounts)
    const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
    if (!webhookUrl) {
      throw new Error(
        "Missing webhook URL for this call (provider state not initialized)",
      );
    }

    console.warn(
      "[voice-call] Using TwiML <Say> fallback - OpenAI TTS not configured or media stream not active",
    );

    // Every interpolated value is escaped, attributes included, so that
    // caller-supplied voice/locale strings cannot break out of the markup.
    const pollyVoice = escapeXml(mapVoiceToPolly(input.voice));
    const language = escapeXml(input.locale || TwilioProvider.DEFAULT_LOCALE);
    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Say voice="${pollyVoice}" language="${language}">${escapeXml(input.text)}</Say>
  <Gather input="speech" speechTimeout="auto" action="${escapeXml(webhookUrl)}" method="POST">
    <Say>.</Say>
  </Gather>
</Response>`;

    await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
      Twiml: twiml,
    });
  }

  /**
   * Play TTS via OpenAI and Twilio Media Streams.
   *
   * Synthesizes the text, sends the audio in 160-byte chunks with a fixed
   * 20 ms delay after each chunk, then sends a mark named `tts-<timestamp>`.
   *
   * @throws Error when no TTS provider or media stream handler is attached.
   */
  private async playTtsViaStream(
    text: string,
    streamSid: string,
  ): Promise<void> {
    const tts = this.ttsProvider;
    const stream = this.mediaStreamHandler;
    if (!tts || !stream) {
      throw new Error("TTS provider and media stream handler required");
    }

    // Generate audio with OpenAI TTS (expected to be mu-law at 8kHz)
    const muLawAudio = await tts.synthesizeForTwilio(text);

    // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
    const CHUNK_SIZE = 160;
    const CHUNK_DELAY_MS = 20;

    for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
      stream.sendAudio(streamSid, chunk);

      // Pace the audio to match real-time playback
      await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
    }

    // Send a mark to track when audio finishes
    stream.sendMark(streamSid, `tts-${Date.now()}`);
  }

  /**
   * Start listening for speech via Twilio <Gather>.
   *
   * @throws Error when no webhook URL is known for the call.
   */
  async startListening(input: StartListeningInput): Promise<void> {
    const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
    if (!webhookUrl) {
      throw new Error(
        "Missing webhook URL for this call (provider state not initialized)",
      );
    }

    const language = escapeXml(input.language || TwilioProvider.DEFAULT_LOCALE);
    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Gather input="speech" speechTimeout="auto" language="${language}" action="${escapeXml(webhookUrl)}" method="POST">
  </Gather>
</Response>`;

    await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
      Twiml: twiml,
    });
  }

  /**
   * Stop listening - for Twilio this is a no-op as <Gather> auto-ends.
   */
  async stopListening(_input: StopListeningInput): Promise<void> {
    // Twilio's <Gather> automatically stops on speech end
    // No explicit action needed
  }
}
|
|
|
|
// -----------------------------------------------------------------------------
// Twilio-specific types
// -----------------------------------------------------------------------------

/**
 * Shape this file expects from the Twilio API when a call is created
 * (`/Calls.json`). Only `sid` and `status` are read by the provider; the
 * remaining fields are declared but not used in this file.
 */
interface TwilioCallResponse {
  /** Twilio call SID; used as the provider call id. */
  sid: string;
  /** Call status as reported by Twilio; `"queued"` is the only value checked explicitly. */
  status: string;
  direction: string;
  from: string;
  to: string;
  uri: string;
}
|