feat: restore voice-call plugin parity
This commit is contained in:
537
extensions/voice-call/src/providers/twilio.ts
Normal file
537
extensions/voice-call/src/providers/twilio.ts
Normal file
@@ -0,0 +1,537 @@
|
||||
import crypto from "node:crypto";
|
||||
|
||||
import type { TwilioConfig } from "../config.js";
|
||||
import type { MediaStreamHandler } from "../media-stream.js";
|
||||
import type {
|
||||
HangupCallInput,
|
||||
InitiateCallInput,
|
||||
InitiateCallResult,
|
||||
NormalizedEvent,
|
||||
PlayTtsInput,
|
||||
ProviderWebhookParseResult,
|
||||
StartListeningInput,
|
||||
StopListeningInput,
|
||||
WebhookContext,
|
||||
WebhookVerificationResult,
|
||||
} from "../types.js";
|
||||
import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
|
||||
import { verifyTwilioWebhook } from "../webhook-security.js";
|
||||
import type { VoiceCallProvider } from "./base.js";
|
||||
import type { OpenAITTSProvider } from "./tts-openai.js";
|
||||
import { chunkAudio } from "./tts-openai.js";
|
||||
|
||||
/**
|
||||
* Twilio Voice API provider implementation.
|
||||
*
|
||||
* Uses Twilio Programmable Voice API with Media Streams for real-time
|
||||
* bidirectional audio streaming.
|
||||
*
|
||||
* @see https://www.twilio.com/docs/voice
|
||||
* @see https://www.twilio.com/docs/voice/media-streams
|
||||
*/
|
||||
export interface TwilioProviderOptions {
|
||||
/** Allow ngrok free tier compatibility mode (less secure) */
|
||||
allowNgrokFreeTier?: boolean;
|
||||
/** Override public URL for signature verification */
|
||||
publicUrl?: string;
|
||||
/** Path for media stream WebSocket (e.g., /voice/stream) */
|
||||
streamPath?: string;
|
||||
/** Skip webhook signature verification (development only) */
|
||||
skipVerification?: boolean;
|
||||
}
|
||||
|
||||
export class TwilioProvider implements VoiceCallProvider {
|
||||
readonly name = "twilio" as const;
|
||||
|
||||
private readonly accountSid: string;
|
||||
private readonly authToken: string;
|
||||
private readonly baseUrl: string;
|
||||
private readonly callWebhookUrls = new Map<string, string>();
|
||||
private readonly options: TwilioProviderOptions;
|
||||
|
||||
/** Current public webhook URL (set when tunnel starts or from config) */
|
||||
private currentPublicUrl: string | null = null;
|
||||
|
||||
/** Optional OpenAI TTS provider for streaming TTS */
|
||||
private ttsProvider: OpenAITTSProvider | null = null;
|
||||
|
||||
/** Optional media stream handler for sending audio */
|
||||
private mediaStreamHandler: MediaStreamHandler | null = null;
|
||||
|
||||
/** Map of call SID to stream SID for media streams */
|
||||
private callStreamMap = new Map<string, string>();
|
||||
|
||||
constructor(config: TwilioConfig, options: TwilioProviderOptions = {}) {
|
||||
if (!config.accountSid) {
|
||||
throw new Error("Twilio Account SID is required");
|
||||
}
|
||||
if (!config.authToken) {
|
||||
throw new Error("Twilio Auth Token is required");
|
||||
}
|
||||
|
||||
this.accountSid = config.accountSid;
|
||||
this.authToken = config.authToken;
|
||||
this.baseUrl = `https://api.twilio.com/2010-04-01/Accounts/${this.accountSid}`;
|
||||
this.options = options;
|
||||
|
||||
if (options.publicUrl) {
|
||||
this.currentPublicUrl = options.publicUrl;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the current public webhook URL (called when tunnel starts).
|
||||
*/
|
||||
setPublicUrl(url: string): void {
|
||||
this.currentPublicUrl = url;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the current public webhook URL.
|
||||
*/
|
||||
getPublicUrl(): string | null {
|
||||
return this.currentPublicUrl;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the OpenAI TTS provider for streaming TTS.
|
||||
* When set, playTts will use OpenAI audio via media streams.
|
||||
*/
|
||||
setTTSProvider(provider: OpenAITTSProvider): void {
|
||||
this.ttsProvider = provider;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the media stream handler for sending audio.
|
||||
*/
|
||||
setMediaStreamHandler(handler: MediaStreamHandler): void {
|
||||
this.mediaStreamHandler = handler;
|
||||
}
|
||||
|
||||
/**
|
||||
* Register a call's stream SID for audio routing.
|
||||
*/
|
||||
registerCallStream(callSid: string, streamSid: string): void {
|
||||
this.callStreamMap.set(callSid, streamSid);
|
||||
}
|
||||
|
||||
/**
|
||||
* Unregister a call's stream SID.
|
||||
*/
|
||||
unregisterCallStream(callSid: string): void {
|
||||
this.callStreamMap.delete(callSid);
|
||||
}
|
||||
|
||||
/**
|
||||
* Make an authenticated request to the Twilio API.
|
||||
*/
|
||||
private async apiRequest<T = unknown>(
|
||||
endpoint: string,
|
||||
params: Record<string, string>,
|
||||
options?: { allowNotFound?: boolean },
|
||||
): Promise<T> {
|
||||
const response = await fetch(`${this.baseUrl}${endpoint}`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Basic ${Buffer.from(`${this.accountSid}:${this.authToken}`).toString("base64")}`,
|
||||
"Content-Type": "application/x-www-form-urlencoded",
|
||||
},
|
||||
body: new URLSearchParams(params),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
if (options?.allowNotFound && response.status === 404) {
|
||||
return undefined as T;
|
||||
}
|
||||
const errorText = await response.text();
|
||||
throw new Error(`Twilio API error: ${response.status} ${errorText}`);
|
||||
}
|
||||
|
||||
const text = await response.text();
|
||||
return text ? (JSON.parse(text) as T) : (undefined as T);
|
||||
}
|
||||
|
||||
/**
|
||||
* Verify Twilio webhook signature using HMAC-SHA1.
|
||||
*
|
||||
* Handles reverse proxy scenarios (Tailscale, nginx, ngrok) by reconstructing
|
||||
* the public URL from forwarding headers.
|
||||
*
|
||||
* @see https://www.twilio.com/docs/usage/webhooks/webhooks-security
|
||||
*/
|
||||
verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
|
||||
const result = verifyTwilioWebhook(ctx, this.authToken, {
|
||||
publicUrl: this.currentPublicUrl || undefined,
|
||||
allowNgrokFreeTier: this.options.allowNgrokFreeTier ?? true,
|
||||
skipVerification: this.options.skipVerification,
|
||||
});
|
||||
|
||||
if (!result.ok) {
|
||||
console.warn(`[twilio] Webhook verification failed: ${result.reason}`);
|
||||
if (result.verificationUrl) {
|
||||
console.warn(`[twilio] Verification URL: ${result.verificationUrl}`);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
ok: result.ok,
|
||||
reason: result.reason,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse Twilio webhook event into normalized format.
|
||||
*/
|
||||
parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult {
|
||||
try {
|
||||
const params = new URLSearchParams(ctx.rawBody);
|
||||
const callIdFromQuery =
|
||||
typeof ctx.query?.callId === "string" && ctx.query.callId.trim()
|
||||
? ctx.query.callId.trim()
|
||||
: undefined;
|
||||
const event = this.normalizeEvent(params, callIdFromQuery);
|
||||
|
||||
// For Twilio, we must return TwiML. Most actions are driven by Calls API updates,
|
||||
// so the webhook response is typically a pause to keep the call alive.
|
||||
const twiml = this.generateTwimlResponse(ctx);
|
||||
|
||||
return {
|
||||
events: event ? [event] : [],
|
||||
providerResponseBody: twiml,
|
||||
providerResponseHeaders: { "Content-Type": "application/xml" },
|
||||
statusCode: 200,
|
||||
};
|
||||
} catch {
|
||||
return { events: [], statusCode: 400 };
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse Twilio direction to normalized format.
|
||||
*/
|
||||
private static parseDirection(
|
||||
direction: string | null,
|
||||
): "inbound" | "outbound" | undefined {
|
||||
if (direction === "inbound") return "inbound";
|
||||
if (direction === "outbound-api" || direction === "outbound-dial")
|
||||
return "outbound";
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert Twilio webhook params to normalized event format.
|
||||
*/
|
||||
private normalizeEvent(
|
||||
params: URLSearchParams,
|
||||
callIdOverride?: string,
|
||||
): NormalizedEvent | null {
|
||||
const callSid = params.get("CallSid") || "";
|
||||
|
||||
const baseEvent = {
|
||||
id: crypto.randomUUID(),
|
||||
callId: callIdOverride || callSid,
|
||||
providerCallId: callSid,
|
||||
timestamp: Date.now(),
|
||||
direction: TwilioProvider.parseDirection(params.get("Direction")),
|
||||
from: params.get("From") || undefined,
|
||||
to: params.get("To") || undefined,
|
||||
};
|
||||
|
||||
// Handle speech result (from <Gather>)
|
||||
const speechResult = params.get("SpeechResult");
|
||||
if (speechResult) {
|
||||
return {
|
||||
...baseEvent,
|
||||
type: "call.speech",
|
||||
transcript: speechResult,
|
||||
isFinal: true,
|
||||
confidence: parseFloat(params.get("Confidence") || "0.9"),
|
||||
};
|
||||
}
|
||||
|
||||
// Handle DTMF
|
||||
const digits = params.get("Digits");
|
||||
if (digits) {
|
||||
return { ...baseEvent, type: "call.dtmf", digits };
|
||||
}
|
||||
|
||||
// Handle call status changes
|
||||
const callStatus = params.get("CallStatus");
|
||||
switch (callStatus) {
|
||||
case "initiated":
|
||||
return { ...baseEvent, type: "call.initiated" };
|
||||
case "ringing":
|
||||
return { ...baseEvent, type: "call.ringing" };
|
||||
case "in-progress":
|
||||
return { ...baseEvent, type: "call.answered" };
|
||||
case "completed":
|
||||
case "busy":
|
||||
case "no-answer":
|
||||
case "failed":
|
||||
return { ...baseEvent, type: "call.ended", reason: callStatus };
|
||||
case "canceled":
|
||||
return { ...baseEvent, type: "call.ended", reason: "hangup-bot" };
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static readonly EMPTY_TWIML =
|
||||
'<?xml version="1.0" encoding="UTF-8"?><Response></Response>';
|
||||
|
||||
private static readonly PAUSE_TWIML = `<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Response>
|
||||
<Pause length="30"/>
|
||||
</Response>`;
|
||||
|
||||
/**
|
||||
* Generate TwiML response for webhook.
|
||||
* When a call is answered, connects to media stream for bidirectional audio.
|
||||
*/
|
||||
private generateTwimlResponse(ctx?: WebhookContext): string {
|
||||
if (!ctx) return TwilioProvider.EMPTY_TWIML;
|
||||
|
||||
const params = new URLSearchParams(ctx.rawBody);
|
||||
const callStatus = params.get("CallStatus");
|
||||
const direction = params.get("Direction");
|
||||
|
||||
console.log(
|
||||
`[voice-call] generateTwimlResponse: status=${callStatus} direction=${direction}`,
|
||||
);
|
||||
|
||||
// For inbound calls, answer immediately with stream
|
||||
if (direction === "inbound") {
|
||||
const streamUrl = this.getStreamUrl();
|
||||
return streamUrl
|
||||
? this.getStreamConnectXml(streamUrl)
|
||||
: TwilioProvider.PAUSE_TWIML;
|
||||
}
|
||||
|
||||
// For outbound calls, only connect to stream when call is in-progress
|
||||
if (callStatus !== "in-progress") {
|
||||
return TwilioProvider.EMPTY_TWIML;
|
||||
}
|
||||
|
||||
const streamUrl = this.getStreamUrl();
|
||||
return streamUrl
|
||||
? this.getStreamConnectXml(streamUrl)
|
||||
: TwilioProvider.PAUSE_TWIML;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the WebSocket URL for media streaming.
|
||||
* Derives from the public URL origin + stream path.
|
||||
*/
|
||||
private getStreamUrl(): string | null {
|
||||
if (!this.currentPublicUrl || !this.options.streamPath) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Extract just the origin (host) from the public URL, ignoring any path
|
||||
const url = new URL(this.currentPublicUrl);
|
||||
const origin = url.origin;
|
||||
|
||||
// Convert https:// to wss:// for WebSocket
|
||||
const wsOrigin = origin
|
||||
.replace(/^https:\/\//, "wss://")
|
||||
.replace(/^http:\/\//, "ws://");
|
||||
|
||||
// Append the stream path
|
||||
const path = this.options.streamPath.startsWith("/")
|
||||
? this.options.streamPath
|
||||
: `/${this.options.streamPath}`;
|
||||
|
||||
return `${wsOrigin}${path}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate TwiML to connect a call to a WebSocket media stream.
|
||||
* This enables bidirectional audio streaming for real-time STT/TTS.
|
||||
*
|
||||
* @param streamUrl - WebSocket URL (wss://...) for the media stream
|
||||
*/
|
||||
getStreamConnectXml(streamUrl: string): string {
|
||||
return `<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Response>
|
||||
<Connect>
|
||||
<Stream url="${escapeXml(streamUrl)}" />
|
||||
</Connect>
|
||||
</Response>`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiate an outbound call via Twilio API.
|
||||
* If inlineTwiml is provided, uses that directly (for notify mode).
|
||||
* Otherwise, uses webhook URL for dynamic TwiML.
|
||||
*/
|
||||
async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
|
||||
const url = new URL(input.webhookUrl);
|
||||
url.searchParams.set("callId", input.callId);
|
||||
|
||||
// Build request params
|
||||
const params: Record<string, string> = {
|
||||
To: input.to,
|
||||
From: input.from,
|
||||
StatusCallback: url.toString(),
|
||||
StatusCallbackEvent: "initiated ringing answered completed",
|
||||
Timeout: "30",
|
||||
};
|
||||
|
||||
// Use inline TwiML for notify mode (simpler, no webhook needed)
|
||||
if (input.inlineTwiml) {
|
||||
params.Twiml = input.inlineTwiml;
|
||||
} else {
|
||||
params.Url = url.toString();
|
||||
}
|
||||
|
||||
const result = await this.apiRequest<TwilioCallResponse>(
|
||||
"/Calls.json",
|
||||
params,
|
||||
);
|
||||
|
||||
this.callWebhookUrls.set(result.sid, url.toString());
|
||||
|
||||
return {
|
||||
providerCallId: result.sid,
|
||||
status: result.status === "queued" ? "queued" : "initiated",
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Hang up a call via Twilio API.
|
||||
*/
|
||||
async hangupCall(input: HangupCallInput): Promise<void> {
|
||||
this.callWebhookUrls.delete(input.providerCallId);
|
||||
|
||||
await this.apiRequest(
|
||||
`/Calls/${input.providerCallId}.json`,
|
||||
{ Status: "completed" },
|
||||
{ allowNotFound: true },
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Play TTS audio via Twilio.
|
||||
*
|
||||
* Two modes:
|
||||
* 1. OpenAI TTS + Media Streams: If TTS provider and media stream are available,
|
||||
* generates audio via OpenAI and streams it through WebSocket (preferred).
|
||||
* 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
|
||||
* Note: This may not work on all Twilio accounts.
|
||||
*/
|
||||
async playTts(input: PlayTtsInput): Promise<void> {
|
||||
// Try OpenAI TTS via media stream first (if configured)
|
||||
const streamSid = this.callStreamMap.get(input.providerCallId);
|
||||
if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
|
||||
try {
|
||||
await this.playTtsViaStream(input.text, streamSid);
|
||||
return;
|
||||
} catch (err) {
|
||||
console.warn(
|
||||
`[voice-call] OpenAI TTS failed, falling back to Twilio <Say>:`,
|
||||
err instanceof Error ? err.message : err,
|
||||
);
|
||||
// Fall through to TwiML <Say> fallback
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to TwiML <Say> (may not work on all accounts)
|
||||
const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
|
||||
if (!webhookUrl) {
|
||||
throw new Error(
|
||||
"Missing webhook URL for this call (provider state not initialized)",
|
||||
);
|
||||
}
|
||||
|
||||
console.warn(
|
||||
"[voice-call] Using TwiML <Say> fallback - OpenAI TTS not configured or media stream not active",
|
||||
);
|
||||
|
||||
const pollyVoice = mapVoiceToPolly(input.voice);
|
||||
const twiml = `<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Response>
|
||||
<Say voice="${pollyVoice}" language="${input.locale || "en-US"}">${escapeXml(input.text)}</Say>
|
||||
<Gather input="speech" speechTimeout="auto" action="${escapeXml(webhookUrl)}" method="POST">
|
||||
<Say>.</Say>
|
||||
</Gather>
|
||||
</Response>`;
|
||||
|
||||
await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
|
||||
Twiml: twiml,
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Play TTS via OpenAI and Twilio Media Streams.
|
||||
* Generates audio with OpenAI TTS, converts to mu-law, and streams via WebSocket.
|
||||
* Uses a jitter buffer to smooth out timing variations.
|
||||
*/
|
||||
private async playTtsViaStream(
|
||||
text: string,
|
||||
streamSid: string,
|
||||
): Promise<void> {
|
||||
if (!this.ttsProvider || !this.mediaStreamHandler) {
|
||||
throw new Error("TTS provider and media stream handler required");
|
||||
}
|
||||
|
||||
// Generate audio with OpenAI TTS (returns mu-law at 8kHz)
|
||||
const muLawAudio = await this.ttsProvider.synthesizeForTwilio(text);
|
||||
|
||||
// Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
|
||||
const CHUNK_SIZE = 160;
|
||||
const CHUNK_DELAY_MS = 20;
|
||||
|
||||
for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
|
||||
this.mediaStreamHandler.sendAudio(streamSid, chunk);
|
||||
|
||||
// Pace the audio to match real-time playback
|
||||
await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
|
||||
}
|
||||
|
||||
// Send a mark to track when audio finishes
|
||||
this.mediaStreamHandler.sendMark(streamSid, `tts-${Date.now()}`);
|
||||
}
|
||||
|
||||
/**
|
||||
* Start listening for speech via Twilio <Gather>.
|
||||
*/
|
||||
async startListening(input: StartListeningInput): Promise<void> {
|
||||
const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
|
||||
if (!webhookUrl) {
|
||||
throw new Error(
|
||||
"Missing webhook URL for this call (provider state not initialized)",
|
||||
);
|
||||
}
|
||||
|
||||
const twiml = `<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Response>
|
||||
<Gather input="speech" speechTimeout="auto" language="${input.language || "en-US"}" action="${escapeXml(webhookUrl)}" method="POST">
|
||||
</Gather>
|
||||
</Response>`;
|
||||
|
||||
await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
|
||||
Twiml: twiml,
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Stop listening - for Twilio this is a no-op as <Gather> auto-ends.
|
||||
*/
|
||||
async stopListening(_input: StopListeningInput): Promise<void> {
|
||||
// Twilio's <Gather> automatically stops on speech end
|
||||
// No explicit action needed
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Twilio-specific types
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
interface TwilioCallResponse {
|
||||
sid: string;
|
||||
status: string;
|
||||
direction: string;
|
||||
from: string;
|
||||
to: string;
|
||||
uri: string;
|
||||
}
|
||||
Reference in New Issue
Block a user