refactor: align voice-call TTS with core config
This commit is contained in:
@@ -1,5 +1,12 @@
|
||||
# Changelog
|
||||
|
||||
## 2026.1.24
|
||||
|
||||
### Changes
|
||||
- Breaking: voice-call TTS now uses core `messages.tts` (plugin TTS config deep‑merges with core).
|
||||
- Telephony TTS supports OpenAI + ElevenLabs; Edge TTS is ignored for calls.
|
||||
- Removed legacy `tts.model`/`tts.voice`/`tts.instructions` plugin fields.
|
||||
|
||||
## 2026.1.23
|
||||
|
||||
### Changes
|
||||
|
||||
@@ -75,6 +75,27 @@ Notes:
|
||||
- Twilio/Telnyx/Plivo require a **publicly reachable** webhook URL.
|
||||
- `mock` is a local dev provider (no network calls).
|
||||
|
||||
## TTS for calls
|
||||
|
||||
Voice Call uses the core `messages.tts` configuration (OpenAI or ElevenLabs) for
|
||||
streaming speech on calls. You can override it under the plugin config with the
|
||||
same shape — overrides deep-merge with `messages.tts`.
|
||||
|
||||
```json5
|
||||
{
|
||||
tts: {
|
||||
provider: "openai",
|
||||
openai: {
|
||||
voice: "alloy"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Notes:
|
||||
- Edge TTS is ignored for voice calls (telephony audio needs PCM; Edge output is unreliable).
|
||||
- Core TTS is used when Twilio media streaming is enabled; otherwise calls fall back to provider native voices.
|
||||
|
||||
## CLI
|
||||
|
||||
```bash
|
||||
|
||||
@@ -99,16 +99,39 @@
|
||||
"label": "Media Stream Path",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.model": {
|
||||
"label": "TTS Model",
|
||||
"tts.provider": {
|
||||
"label": "TTS Provider Override",
|
||||
"help": "Deep-merges with messages.tts (Edge is ignored for calls).",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.voice": {
|
||||
"label": "TTS Voice",
|
||||
"tts.openai.model": {
|
||||
"label": "OpenAI TTS Model",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.instructions": {
|
||||
"label": "TTS Instructions",
|
||||
"tts.openai.voice": {
|
||||
"label": "OpenAI TTS Voice",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.openai.apiKey": {
|
||||
"label": "OpenAI API Key",
|
||||
"sensitive": true,
|
||||
"advanced": true
|
||||
},
|
||||
"tts.elevenlabs.modelId": {
|
||||
"label": "ElevenLabs Model ID",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.elevenlabs.voiceId": {
|
||||
"label": "ElevenLabs Voice ID",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.elevenlabs.apiKey": {
|
||||
"label": "ElevenLabs API Key",
|
||||
"sensitive": true,
|
||||
"advanced": true
|
||||
},
|
||||
"tts.elevenlabs.baseUrl": {
|
||||
"label": "ElevenLabs Base URL",
|
||||
"advanced": true
|
||||
},
|
||||
"publicUrl": {
|
||||
@@ -370,20 +393,193 @@
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"auto": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"off",
|
||||
"always",
|
||||
"inbound",
|
||||
"tagged"
|
||||
]
|
||||
},
|
||||
"enabled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"final",
|
||||
"all"
|
||||
]
|
||||
},
|
||||
"provider": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"openai"
|
||||
"openai",
|
||||
"elevenlabs",
|
||||
"edge"
|
||||
]
|
||||
},
|
||||
"model": {
|
||||
"summaryModel": {
|
||||
"type": "string"
|
||||
},
|
||||
"voice": {
|
||||
"modelOverrides": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"enabled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowText": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowProvider": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowVoice": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowModelId": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowVoiceSettings": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowNormalization": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowSeed": {
|
||||
"type": "boolean"
|
||||
}
|
||||
}
|
||||
},
|
||||
"elevenlabs": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"apiKey": {
|
||||
"type": "string"
|
||||
},
|
||||
"baseUrl": {
|
||||
"type": "string"
|
||||
},
|
||||
"voiceId": {
|
||||
"type": "string"
|
||||
},
|
||||
"modelId": {
|
||||
"type": "string"
|
||||
},
|
||||
"seed": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"maximum": 4294967295
|
||||
},
|
||||
"applyTextNormalization": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"auto",
|
||||
"on",
|
||||
"off"
|
||||
]
|
||||
},
|
||||
"languageCode": {
|
||||
"type": "string"
|
||||
},
|
||||
"voiceSettings": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"stability": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
},
|
||||
"similarityBoost": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
},
|
||||
"style": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
},
|
||||
"useSpeakerBoost": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"speed": {
|
||||
"type": "number",
|
||||
"minimum": 0.5,
|
||||
"maximum": 2
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"openai": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"apiKey": {
|
||||
"type": "string"
|
||||
},
|
||||
"model": {
|
||||
"type": "string"
|
||||
},
|
||||
"voice": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"edge": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"enabled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"voice": {
|
||||
"type": "string"
|
||||
},
|
||||
"lang": {
|
||||
"type": "string"
|
||||
},
|
||||
"outputFormat": {
|
||||
"type": "string"
|
||||
},
|
||||
"pitch": {
|
||||
"type": "string"
|
||||
},
|
||||
"rate": {
|
||||
"type": "string"
|
||||
},
|
||||
"volume": {
|
||||
"type": "string"
|
||||
},
|
||||
"saveSubtitles": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"proxy": {
|
||||
"type": "string"
|
||||
},
|
||||
"timeoutMs": {
|
||||
"type": "integer",
|
||||
"minimum": 1000,
|
||||
"maximum": 120000
|
||||
}
|
||||
}
|
||||
},
|
||||
"prefsPath": {
|
||||
"type": "string"
|
||||
},
|
||||
"instructions": {
|
||||
"type": "string"
|
||||
"maxTextLength": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
},
|
||||
"timeoutMs": {
|
||||
"type": "integer",
|
||||
"minimum": 1000,
|
||||
"maximum": 120000
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -74,9 +74,26 @@ const voiceCallConfigSchema = {
|
||||
},
|
||||
"streaming.sttModel": { label: "Realtime STT Model", advanced: true },
|
||||
"streaming.streamPath": { label: "Media Stream Path", advanced: true },
|
||||
"tts.model": { label: "TTS Model", advanced: true },
|
||||
"tts.voice": { label: "TTS Voice", advanced: true },
|
||||
"tts.instructions": { label: "TTS Instructions", advanced: true },
|
||||
"tts.provider": {
|
||||
label: "TTS Provider Override",
|
||||
help: "Deep-merges with messages.tts (Edge is ignored for calls).",
|
||||
advanced: true,
|
||||
},
|
||||
"tts.openai.model": { label: "OpenAI TTS Model", advanced: true },
|
||||
"tts.openai.voice": { label: "OpenAI TTS Voice", advanced: true },
|
||||
"tts.openai.apiKey": {
|
||||
label: "OpenAI API Key",
|
||||
sensitive: true,
|
||||
advanced: true,
|
||||
},
|
||||
"tts.elevenlabs.modelId": { label: "ElevenLabs Model ID", advanced: true },
|
||||
"tts.elevenlabs.voiceId": { label: "ElevenLabs Voice ID", advanced: true },
|
||||
"tts.elevenlabs.apiKey": {
|
||||
label: "ElevenLabs API Key",
|
||||
sensitive: true,
|
||||
advanced: true,
|
||||
},
|
||||
"tts.elevenlabs.baseUrl": { label: "ElevenLabs Base URL", advanced: true },
|
||||
publicUrl: { label: "Public Webhook URL", advanced: true },
|
||||
skipSignatureVerification: {
|
||||
label: "Skip Signature Verification",
|
||||
@@ -161,6 +178,7 @@ const voiceCallPlugin = {
|
||||
runtimePromise = createVoiceCallRuntime({
|
||||
config: cfg,
|
||||
coreConfig: api.config as CoreConfig,
|
||||
ttsRuntime: api.runtime.tts,
|
||||
logger: api.logger,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -82,31 +82,82 @@ export const SttConfigSchema = z
|
||||
.default({ provider: "openai", model: "whisper-1" });
|
||||
export type SttConfig = z.infer<typeof SttConfigSchema>;
|
||||
|
||||
export const TtsProviderSchema = z.enum(["openai", "elevenlabs", "edge"]);
|
||||
export const TtsModeSchema = z.enum(["final", "all"]);
|
||||
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
|
||||
|
||||
export const TtsConfigSchema = z
|
||||
.object({
|
||||
/** TTS provider (currently only OpenAI supported) */
|
||||
provider: z.literal("openai").default("openai"),
|
||||
/**
|
||||
* TTS model to use:
|
||||
* - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
|
||||
* - tts-1: lower latency
|
||||
* - tts-1-hd: higher quality
|
||||
*/
|
||||
model: z.string().min(1).default("gpt-4o-mini-tts"),
|
||||
/**
|
||||
* Voice ID. For best quality, use marin or cedar.
|
||||
* All voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
|
||||
*/
|
||||
voice: z.string().min(1).default("coral"),
|
||||
/**
|
||||
* Instructions for speech style (only works with gpt-4o-mini-tts).
|
||||
* Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
|
||||
*/
|
||||
instructions: z.string().optional(),
|
||||
auto: TtsAutoSchema.optional(),
|
||||
enabled: z.boolean().optional(),
|
||||
mode: TtsModeSchema.optional(),
|
||||
provider: TtsProviderSchema.optional(),
|
||||
summaryModel: z.string().optional(),
|
||||
modelOverrides: z
|
||||
.object({
|
||||
enabled: z.boolean().optional(),
|
||||
allowText: z.boolean().optional(),
|
||||
allowProvider: z.boolean().optional(),
|
||||
allowVoice: z.boolean().optional(),
|
||||
allowModelId: z.boolean().optional(),
|
||||
allowVoiceSettings: z.boolean().optional(),
|
||||
allowNormalization: z.boolean().optional(),
|
||||
allowSeed: z.boolean().optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
elevenlabs: z
|
||||
.object({
|
||||
apiKey: z.string().optional(),
|
||||
baseUrl: z.string().optional(),
|
||||
voiceId: z.string().optional(),
|
||||
modelId: z.string().optional(),
|
||||
seed: z.number().int().min(0).max(4294967295).optional(),
|
||||
applyTextNormalization: z.enum(["auto", "on", "off"]).optional(),
|
||||
languageCode: z.string().optional(),
|
||||
voiceSettings: z
|
||||
.object({
|
||||
stability: z.number().min(0).max(1).optional(),
|
||||
similarityBoost: z.number().min(0).max(1).optional(),
|
||||
style: z.number().min(0).max(1).optional(),
|
||||
useSpeakerBoost: z.boolean().optional(),
|
||||
speed: z.number().min(0.5).max(2).optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
openai: z
|
||||
.object({
|
||||
apiKey: z.string().optional(),
|
||||
model: z.string().optional(),
|
||||
voice: z.string().optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
edge: z
|
||||
.object({
|
||||
enabled: z.boolean().optional(),
|
||||
voice: z.string().optional(),
|
||||
lang: z.string().optional(),
|
||||
outputFormat: z.string().optional(),
|
||||
pitch: z.string().optional(),
|
||||
rate: z.string().optional(),
|
||||
volume: z.string().optional(),
|
||||
saveSubtitles: z.boolean().optional(),
|
||||
proxy: z.string().optional(),
|
||||
timeoutMs: z.number().int().min(1000).max(120000).optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
prefsPath: z.string().optional(),
|
||||
maxTextLength: z.number().int().min(1).optional(),
|
||||
timeoutMs: z.number().int().min(1000).max(120000).optional(),
|
||||
})
|
||||
.strict()
|
||||
.default({ provider: "openai", model: "gpt-4o-mini-tts", voice: "coral" });
|
||||
export type TtsConfig = z.infer<typeof TtsConfigSchema>;
|
||||
.optional();
|
||||
export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Webhook Server Configuration
|
||||
@@ -307,7 +358,7 @@ export const VoiceCallConfigSchema = z
|
||||
/** STT configuration */
|
||||
stt: SttConfigSchema,
|
||||
|
||||
/** TTS configuration */
|
||||
/** TTS override (deep-merges with core messages.tts) */
|
||||
tts: TtsConfigSchema,
|
||||
|
||||
/** Store path for call logs */
|
||||
|
||||
@@ -2,10 +2,16 @@ import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import { fileURLToPath, pathToFileURL } from "node:url";
|
||||
|
||||
import type { VoiceCallTtsConfig } from "./config.js";
|
||||
|
||||
export type CoreConfig = {
|
||||
session?: {
|
||||
store?: string;
|
||||
};
|
||||
messages?: {
|
||||
tts?: VoiceCallTtsConfig;
|
||||
};
|
||||
[key: string]: unknown;
|
||||
};
|
||||
|
||||
type CoreAgentDeps = {
|
||||
|
||||
@@ -143,7 +143,7 @@ export class CallManager {
|
||||
// For notify mode with a message, use inline TwiML with <Say>
|
||||
let inlineTwiml: string | undefined;
|
||||
if (mode === "notify" && initialMessage) {
|
||||
const pollyVoice = mapVoiceToPolly(this.config.tts.voice);
|
||||
const pollyVoice = mapVoiceToPolly(this.config.tts?.openai?.voice);
|
||||
inlineTwiml = this.generateNotifyTwiml(initialMessage, pollyVoice);
|
||||
console.log(
|
||||
`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`,
|
||||
@@ -210,11 +210,13 @@ export class CallManager {
|
||||
this.addTranscriptEntry(call, "bot", text);
|
||||
|
||||
// Play TTS
|
||||
const voice =
|
||||
this.provider?.name === "twilio" ? this.config.tts?.openai?.voice : undefined;
|
||||
await this.provider.playTts({
|
||||
callId,
|
||||
providerCallId: call.providerCallId,
|
||||
text,
|
||||
voice: this.config.tts.voice,
|
||||
voice,
|
||||
});
|
||||
|
||||
return { success: true };
|
||||
|
||||
@@ -68,7 +68,7 @@ export async function initiateCall(
|
||||
// For notify mode with a message, use inline TwiML with <Say>.
|
||||
let inlineTwiml: string | undefined;
|
||||
if (mode === "notify" && initialMessage) {
|
||||
const pollyVoice = mapVoiceToPolly(ctx.config.tts.voice);
|
||||
const pollyVoice = mapVoiceToPolly(ctx.config.tts?.openai?.voice);
|
||||
inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
|
||||
console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
|
||||
}
|
||||
@@ -120,11 +120,13 @@ export async function speak(
|
||||
|
||||
addTranscriptEntry(call, "bot", text);
|
||||
|
||||
const voice =
|
||||
ctx.provider?.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined;
|
||||
await ctx.provider.playTts({
|
||||
callId,
|
||||
providerCallId: call.providerCallId,
|
||||
text,
|
||||
voice: ctx.config.tts.voice,
|
||||
voice,
|
||||
});
|
||||
|
||||
return { success: true };
|
||||
@@ -244,4 +246,3 @@ export async function endCall(
|
||||
return { success: false, error: err instanceof Error ? err.message : String(err) };
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -15,9 +15,9 @@ import type {
|
||||
WebhookVerificationResult,
|
||||
} from "../types.js";
|
||||
import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
|
||||
import { chunkAudio } from "../telephony-audio.js";
|
||||
import type { TelephonyTtsProvider } from "../telephony-tts.js";
|
||||
import type { VoiceCallProvider } from "./base.js";
|
||||
import type { OpenAITTSProvider } from "./tts-openai.js";
|
||||
import { chunkAudio } from "./tts-openai.js";
|
||||
import { twilioApiRequest } from "./twilio/api.js";
|
||||
import { verifyTwilioProviderWebhook } from "./twilio/webhook.js";
|
||||
|
||||
@@ -53,8 +53,8 @@ export class TwilioProvider implements VoiceCallProvider {
|
||||
/** Current public webhook URL (set when tunnel starts or from config) */
|
||||
private currentPublicUrl: string | null = null;
|
||||
|
||||
/** Optional OpenAI TTS provider for streaming TTS */
|
||||
private ttsProvider: OpenAITTSProvider | null = null;
|
||||
/** Optional telephony TTS provider for streaming TTS */
|
||||
private ttsProvider: TelephonyTtsProvider | null = null;
|
||||
|
||||
/** Optional media stream handler for sending audio */
|
||||
private mediaStreamHandler: MediaStreamHandler | null = null;
|
||||
@@ -119,7 +119,7 @@ export class TwilioProvider implements VoiceCallProvider {
|
||||
return this.currentPublicUrl;
|
||||
}
|
||||
|
||||
setTTSProvider(provider: OpenAITTSProvider): void {
|
||||
setTTSProvider(provider: TelephonyTtsProvider): void {
|
||||
this.ttsProvider = provider;
|
||||
}
|
||||
|
||||
@@ -454,13 +454,13 @@ export class TwilioProvider implements VoiceCallProvider {
|
||||
* Play TTS audio via Twilio.
|
||||
*
|
||||
* Two modes:
|
||||
* 1. OpenAI TTS + Media Streams: If TTS provider and media stream are available,
|
||||
* generates audio via OpenAI and streams it through WebSocket (preferred).
|
||||
* 1. Core TTS + Media Streams: If TTS provider and media stream are available,
|
||||
* generates audio via core TTS and streams it through WebSocket (preferred).
|
||||
* 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
|
||||
* Note: This may not work on all Twilio accounts.
|
||||
*/
|
||||
async playTts(input: PlayTtsInput): Promise<void> {
|
||||
// Try OpenAI TTS via media stream first (if configured)
|
||||
// Try telephony TTS via media stream first (if configured)
|
||||
const streamSid = this.callStreamMap.get(input.providerCallId);
|
||||
if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
|
||||
try {
|
||||
@@ -468,7 +468,7 @@ export class TwilioProvider implements VoiceCallProvider {
|
||||
return;
|
||||
} catch (err) {
|
||||
console.warn(
|
||||
`[voice-call] OpenAI TTS failed, falling back to Twilio <Say>:`,
|
||||
`[voice-call] Telephony TTS failed, falling back to Twilio <Say>:`,
|
||||
err instanceof Error ? err.message : err,
|
||||
);
|
||||
// Fall through to TwiML <Say> fallback
|
||||
@@ -484,7 +484,7 @@ export class TwilioProvider implements VoiceCallProvider {
|
||||
}
|
||||
|
||||
console.warn(
|
||||
"[voice-call] Using TwiML <Say> fallback - OpenAI TTS not configured or media stream not active",
|
||||
"[voice-call] Using TwiML <Say> fallback - telephony TTS not configured or media stream not active",
|
||||
);
|
||||
|
||||
const pollyVoice = mapVoiceToPolly(input.voice);
|
||||
@@ -502,8 +502,8 @@ export class TwilioProvider implements VoiceCallProvider {
|
||||
}
|
||||
|
||||
/**
|
||||
* Play TTS via OpenAI and Twilio Media Streams.
|
||||
* Generates audio with OpenAI TTS, converts to mu-law, and streams via WebSocket.
|
||||
* Play TTS via core TTS and Twilio Media Streams.
|
||||
* Generates audio with core TTS, converts to mu-law, and streams via WebSocket.
|
||||
* Uses a jitter buffer to smooth out timing variations.
|
||||
*/
|
||||
private async playTtsViaStream(
|
||||
@@ -514,8 +514,8 @@ export class TwilioProvider implements VoiceCallProvider {
|
||||
throw new Error("TTS provider and media stream handler required");
|
||||
}
|
||||
|
||||
// Generate audio with OpenAI TTS (returns mu-law at 8kHz)
|
||||
const muLawAudio = await this.ttsProvider.synthesizeForTwilio(text);
|
||||
// Generate audio with core TTS (returns mu-law at 8kHz)
|
||||
const muLawAudio = await this.ttsProvider.synthesizeForTelephony(text);
|
||||
|
||||
// Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
|
||||
const CHUNK_SIZE = 160;
|
||||
|
||||
@@ -6,8 +6,9 @@ import type { VoiceCallProvider } from "./providers/base.js";
|
||||
import { MockProvider } from "./providers/mock.js";
|
||||
import { PlivoProvider } from "./providers/plivo.js";
|
||||
import { TelnyxProvider } from "./providers/telnyx.js";
|
||||
import { OpenAITTSProvider } from "./providers/tts-openai.js";
|
||||
import { TwilioProvider } from "./providers/twilio.js";
|
||||
import type { TelephonyTtsRuntime } from "./telephony-tts.js";
|
||||
import { createTelephonyTtsProvider } from "./telephony-tts.js";
|
||||
import { startTunnel, type TunnelResult } from "./tunnel.js";
|
||||
import {
|
||||
cleanupTailscaleExposure,
|
||||
@@ -81,9 +82,10 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
|
||||
export async function createVoiceCallRuntime(params: {
|
||||
config: VoiceCallConfig;
|
||||
coreConfig: CoreConfig;
|
||||
ttsRuntime?: TelephonyTtsRuntime;
|
||||
logger?: Logger;
|
||||
}): Promise<VoiceCallRuntime> {
|
||||
const { config, coreConfig, logger } = params;
|
||||
const { config, coreConfig, ttsRuntime, logger } = params;
|
||||
const log = logger ?? {
|
||||
info: console.log,
|
||||
warn: console.warn,
|
||||
@@ -149,27 +151,24 @@ export async function createVoiceCallRuntime(params: {
|
||||
|
||||
if (provider.name === "twilio" && config.streaming?.enabled) {
|
||||
const twilioProvider = provider as TwilioProvider;
|
||||
const openaiApiKey =
|
||||
config.streaming.openaiApiKey || process.env.OPENAI_API_KEY;
|
||||
if (openaiApiKey) {
|
||||
if (ttsRuntime?.textToSpeechTelephony) {
|
||||
try {
|
||||
const ttsProvider = new OpenAITTSProvider({
|
||||
apiKey: openaiApiKey,
|
||||
voice: config.tts.voice,
|
||||
model: config.tts.model,
|
||||
instructions: config.tts.instructions,
|
||||
const ttsProvider = createTelephonyTtsProvider({
|
||||
coreConfig,
|
||||
ttsOverride: config.tts,
|
||||
runtime: ttsRuntime,
|
||||
});
|
||||
twilioProvider.setTTSProvider(ttsProvider);
|
||||
log.info("[voice-call] OpenAI TTS provider configured");
|
||||
log.info("[voice-call] Telephony TTS provider configured");
|
||||
} catch (err) {
|
||||
log.warn(
|
||||
`[voice-call] Failed to initialize OpenAI TTS: ${
|
||||
`[voice-call] Failed to initialize telephony TTS: ${
|
||||
err instanceof Error ? err.message : String(err)
|
||||
}`,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
log.warn("[voice-call] OpenAI TTS key missing; streaming TTS disabled");
|
||||
log.warn("[voice-call] Telephony TTS unavailable; streaming TTS disabled");
|
||||
}
|
||||
|
||||
const mediaHandler = webhookServer.getMediaStreamHandler();
|
||||
|
||||
88
extensions/voice-call/src/telephony-audio.ts
Normal file
88
extensions/voice-call/src/telephony-audio.ts
Normal file
@@ -0,0 +1,88 @@
|
||||
const TELEPHONY_SAMPLE_RATE = 8000;
|
||||
|
||||
function clamp16(value: number): number {
|
||||
return Math.max(-32768, Math.min(32767, value));
|
||||
}
|
||||
|
||||
/**
|
||||
* Resample 16-bit PCM (little-endian mono) to 8kHz using linear interpolation.
|
||||
*/
|
||||
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
|
||||
if (inputSampleRate === TELEPHONY_SAMPLE_RATE) return input;
|
||||
const inputSamples = Math.floor(input.length / 2);
|
||||
if (inputSamples === 0) return Buffer.alloc(0);
|
||||
|
||||
const ratio = inputSampleRate / TELEPHONY_SAMPLE_RATE;
|
||||
const outputSamples = Math.floor(inputSamples / ratio);
|
||||
const output = Buffer.alloc(outputSamples * 2);
|
||||
|
||||
for (let i = 0; i < outputSamples; i++) {
|
||||
const srcPos = i * ratio;
|
||||
const srcIndex = Math.floor(srcPos);
|
||||
const frac = srcPos - srcIndex;
|
||||
|
||||
const s0 = input.readInt16LE(srcIndex * 2);
|
||||
const s1Index = Math.min(srcIndex + 1, inputSamples - 1);
|
||||
const s1 = input.readInt16LE(s1Index * 2);
|
||||
|
||||
const sample = Math.round(s0 + frac * (s1 - s0));
|
||||
output.writeInt16LE(clamp16(sample), i * 2);
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert 16-bit PCM to 8-bit mu-law (G.711).
|
||||
*/
|
||||
export function pcmToMulaw(pcm: Buffer): Buffer {
|
||||
const samples = Math.floor(pcm.length / 2);
|
||||
const mulaw = Buffer.alloc(samples);
|
||||
|
||||
for (let i = 0; i < samples; i++) {
|
||||
const sample = pcm.readInt16LE(i * 2);
|
||||
mulaw[i] = linearToMulaw(sample);
|
||||
}
|
||||
|
||||
return mulaw;
|
||||
}
|
||||
|
||||
export function convertPcmToMulaw8k(
|
||||
pcm: Buffer,
|
||||
inputSampleRate: number,
|
||||
): Buffer {
|
||||
const pcm8k = resamplePcmTo8k(pcm, inputSampleRate);
|
||||
return pcmToMulaw(pcm8k);
|
||||
}
|
||||
|
||||
/**
|
||||
* Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law).
|
||||
*/
|
||||
export function chunkAudio(
|
||||
audio: Buffer,
|
||||
chunkSize = 160,
|
||||
): Generator<Buffer, void, unknown> {
|
||||
return (function* () {
|
||||
for (let i = 0; i < audio.length; i += chunkSize) {
|
||||
yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
|
||||
}
|
||||
})();
|
||||
}
|
||||
|
||||
function linearToMulaw(sample: number): number {
|
||||
const BIAS = 132;
|
||||
const CLIP = 32635;
|
||||
|
||||
const sign = sample < 0 ? 0x80 : 0;
|
||||
if (sample < 0) sample = -sample;
|
||||
if (sample > CLIP) sample = CLIP;
|
||||
|
||||
sample += BIAS;
|
||||
let exponent = 7;
|
||||
for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--) {
|
||||
expMask >>= 1;
|
||||
}
|
||||
|
||||
const mantissa = (sample >> (exponent + 3)) & 0x0f;
|
||||
return ~(sign | (exponent << 4) | mantissa) & 0xff;
|
||||
}
|
||||
95
extensions/voice-call/src/telephony-tts.ts
Normal file
95
extensions/voice-call/src/telephony-tts.ts
Normal file
@@ -0,0 +1,95 @@
|
||||
import type { CoreConfig } from "./core-bridge.js";
|
||||
import type { VoiceCallTtsConfig } from "./config.js";
|
||||
import { convertPcmToMulaw8k } from "./telephony-audio.js";
|
||||
|
||||
export type TelephonyTtsRuntime = {
|
||||
textToSpeechTelephony: (params: {
|
||||
text: string;
|
||||
cfg: CoreConfig;
|
||||
prefsPath?: string;
|
||||
}) => Promise<{
|
||||
success: boolean;
|
||||
audioBuffer?: Buffer;
|
||||
sampleRate?: number;
|
||||
provider?: string;
|
||||
error?: string;
|
||||
}>;
|
||||
};
|
||||
|
||||
export type TelephonyTtsProvider = {
|
||||
synthesizeForTelephony: (text: string) => Promise<Buffer>;
|
||||
};
|
||||
|
||||
export function createTelephonyTtsProvider(params: {
|
||||
coreConfig: CoreConfig;
|
||||
ttsOverride?: VoiceCallTtsConfig;
|
||||
runtime: TelephonyTtsRuntime;
|
||||
}): TelephonyTtsProvider {
|
||||
const { coreConfig, ttsOverride, runtime } = params;
|
||||
const mergedConfig = applyTtsOverride(coreConfig, ttsOverride);
|
||||
|
||||
return {
|
||||
synthesizeForTelephony: async (text: string) => {
|
||||
const result = await runtime.textToSpeechTelephony({
|
||||
text,
|
||||
cfg: mergedConfig,
|
||||
});
|
||||
|
||||
if (!result.success || !result.audioBuffer || !result.sampleRate) {
|
||||
throw new Error(result.error ?? "TTS conversion failed");
|
||||
}
|
||||
|
||||
return convertPcmToMulaw8k(result.audioBuffer, result.sampleRate);
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
function applyTtsOverride(
|
||||
coreConfig: CoreConfig,
|
||||
override?: VoiceCallTtsConfig,
|
||||
): CoreConfig {
|
||||
if (!override) return coreConfig;
|
||||
|
||||
const base = coreConfig.messages?.tts;
|
||||
const merged = mergeTtsConfig(base, override);
|
||||
if (!merged) return coreConfig;
|
||||
|
||||
return {
|
||||
...coreConfig,
|
||||
messages: {
|
||||
...(coreConfig.messages ?? {}),
|
||||
tts: merged,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
function mergeTtsConfig(
|
||||
base?: VoiceCallTtsConfig,
|
||||
override?: VoiceCallTtsConfig,
|
||||
): VoiceCallTtsConfig | undefined {
|
||||
if (!base && !override) return undefined;
|
||||
if (!override) return base;
|
||||
if (!base) return override;
|
||||
return deepMerge(base, override);
|
||||
}
|
||||
|
||||
function deepMerge<T>(base: T, override: T): T {
|
||||
if (!isPlainObject(base) || !isPlainObject(override)) {
|
||||
return override;
|
||||
}
|
||||
const result: Record<string, unknown> = { ...base };
|
||||
for (const [key, value] of Object.entries(override)) {
|
||||
if (value === undefined) continue;
|
||||
const existing = (base as Record<string, unknown>)[key];
|
||||
if (isPlainObject(existing) && isPlainObject(value)) {
|
||||
result[key] = deepMerge(existing, value);
|
||||
} else {
|
||||
result[key] = value;
|
||||
}
|
||||
}
|
||||
return result as T;
|
||||
}
|
||||
|
||||
function isPlainObject(value: unknown): value is Record<string, unknown> {
|
||||
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
|
||||
}
|
||||
Reference in New Issue
Block a user