refactor: align voice-call TTS with core config
@@ -67,6 +67,22 @@ Plugins can register:
Plugins run **in‑process** with the Gateway, so treat them as trusted code.

Tool authoring guide: [Plugin agent tools](/plugins/agent-tools).

## Runtime helpers

Plugins can access selected core helpers via `api.runtime`. For telephony TTS:

```ts
const result = await api.runtime.tts.textToSpeechTelephony({
  text: "Hello from Clawdbot",
  cfg: api.config,
});
```

Notes:
- Uses core `messages.tts` configuration (OpenAI or ElevenLabs).
- Returns PCM audio buffer + sample rate. Plugins must resample/encode for providers.
- Edge TTS is not supported for telephony.
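
A minimal sketch of consuming the result (hypothetical plugin code; `sendToProvider` is a placeholder, not a real helper):

```ts
// Placeholder for whatever transport your plugin uses (hypothetical).
declare function sendToProvider(pcm: Buffer, sampleRate: number): Promise<void>;

const res = await api.runtime.tts.textToSpeechTelephony({
  text: "Hello from Clawdbot",
  cfg: api.config,
});
if (!res.success || !res.audioBuffer || !res.sampleRate) {
  throw new Error(res.error ?? "telephony TTS failed");
}
// res.audioBuffer is 16-bit mono PCM at res.sampleRate.
// Resample/encode for your provider, e.g. 8kHz mu-law for Twilio media streams.
await sendToProvider(res.audioBuffer, res.sampleRate);
```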

## Discovery & precedence

Clawdbot scans, in order:

@@ -104,6 +104,87 @@ Notes:
- `mock` is a local dev provider (no network calls).
- `skipSignatureVerification` is for local testing only.

## TTS for calls

Voice Call uses the core `messages.tts` configuration (OpenAI or ElevenLabs) for
streaming speech on calls. You can override it under the plugin config with the
**same shape** — it deep‑merges with `messages.tts`.

```json5
{
  tts: {
    provider: "elevenlabs",
    elevenlabs: {
      voiceId: "pMsXgVXv3BLzUgSXRplE",
      modelId: "eleven_multilingual_v2"
    }
  }
}
```

Notes:
- **Edge TTS is ignored for voice calls** (telephony audio needs PCM; Edge output is unreliable).
- Core TTS is used when Twilio media streaming is enabled; otherwise calls fall back to the provider's native voices.
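
For example, enabling Twilio media streaming (which is what routes calls through core TTS) might look like this; a sketch, with `streaming.enabled` being the flag the plugin runtime checks:

```json5
{
  plugins: {
    entries: {
      "voice-call": {
        config: {
          provider: "twilio",
          streaming: { enabled: true }
        }
      }
    }
  }
}
```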

### More examples

Use core TTS only (no override):

```json5
{
  messages: {
    tts: {
      provider: "openai",
      openai: { voice: "alloy" }
    }
  }
}
```

Override to ElevenLabs just for calls (keep core default elsewhere):

```json5
{
  plugins: {
    entries: {
      "voice-call": {
        config: {
          tts: {
            provider: "elevenlabs",
            elevenlabs: {
              apiKey: "elevenlabs_key",
              voiceId: "pMsXgVXv3BLzUgSXRplE",
              modelId: "eleven_multilingual_v2"
            }
          }
        }
      }
    }
  }
}
```

Override only the OpenAI model for calls (deep‑merge example):

```json5
{
  plugins: {
    entries: {
      "voice-call": {
        config: {
          tts: {
            openai: {
              model: "gpt-4o-mini-tts",
              voice: "marin"
            }
          }
        }
      }
    }
  }
}
```
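
With a core config of `messages.tts: { provider: "openai", openai: { voice: "alloy" } }`, that last override deep-merges into an effective call-time config like this (illustrative):

```json5
{
  provider: "openai",
  openai: {
    model: "gpt-4o-mini-tts",
    voice: "marin"
  }
}
```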

## Inbound calls

Inbound policy defaults to `disabled`. To enable inbound calls, set:

@@ -1,5 +1,12 @@
# Changelog

## 2026.1.24

### Changes
- Breaking: voice-call TTS now uses core `messages.tts` (plugin TTS config deep‑merges with core).
- Telephony TTS supports OpenAI + ElevenLabs; Edge TTS is ignored for calls.
- Removed legacy `tts.model`/`tts.voice`/`tts.instructions` plugin fields.

## 2026.1.23

### Changes

@@ -75,6 +75,27 @@ Notes:
- Twilio/Telnyx/Plivo require a **publicly reachable** webhook URL.
- `mock` is a local dev provider (no network calls).

## TTS for calls

Voice Call uses the core `messages.tts` configuration (OpenAI or ElevenLabs) for
streaming speech on calls. You can override it under the plugin config with the
same shape — overrides deep-merge with `messages.tts`.

```json5
{
  tts: {
    provider: "openai",
    openai: {
      voice: "alloy"
    }
  }
}
```

Notes:
- Edge TTS is ignored for voice calls (telephony audio needs PCM; Edge output is unreliable).
- Core TTS is used when Twilio media streaming is enabled; otherwise calls fall back to the provider's native voices.

## CLI

```bash

@@ -99,16 +99,39 @@
    "label": "Media Stream Path",
    "advanced": true
  },
- "tts.model": {
-   "label": "TTS Model",
  "tts.provider": {
    "label": "TTS Provider Override",
    "help": "Deep-merges with messages.tts (Edge is ignored for calls).",
    "advanced": true
  },
- "tts.voice": {
-   "label": "TTS Voice",
  "tts.openai.model": {
    "label": "OpenAI TTS Model",
    "advanced": true
  },
- "tts.instructions": {
-   "label": "TTS Instructions",
  "tts.openai.voice": {
    "label": "OpenAI TTS Voice",
    "advanced": true
  },
  "tts.openai.apiKey": {
    "label": "OpenAI API Key",
    "sensitive": true,
    "advanced": true
  },
  "tts.elevenlabs.modelId": {
    "label": "ElevenLabs Model ID",
    "advanced": true
  },
  "tts.elevenlabs.voiceId": {
    "label": "ElevenLabs Voice ID",
    "advanced": true
  },
  "tts.elevenlabs.apiKey": {
    "label": "ElevenLabs API Key",
    "sensitive": true,
    "advanced": true
  },
  "tts.elevenlabs.baseUrl": {
    "label": "ElevenLabs Base URL",
    "advanced": true
  },
  "publicUrl": {

@@ -370,20 +393,193 @@
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "auto": {
          "type": "string",
          "enum": [
            "off",
            "always",
            "inbound",
            "tagged"
          ]
        },
        "enabled": {
          "type": "boolean"
        },
        "mode": {
          "type": "string",
          "enum": [
            "final",
            "all"
          ]
        },
        "provider": {
          "type": "string",
          "enum": [
-           "openai"
            "openai",
            "elevenlabs",
            "edge"
          ]
        },
-       "model": {
        "summaryModel": {
          "type": "string"
        },
-       "voice": {
        "modelOverrides": {
          "type": "object",
          "additionalProperties": false,
          "properties": {
            "enabled": {
              "type": "boolean"
            },
            "allowText": {
              "type": "boolean"
            },
            "allowProvider": {
              "type": "boolean"
            },
            "allowVoice": {
              "type": "boolean"
            },
            "allowModelId": {
              "type": "boolean"
            },
            "allowVoiceSettings": {
              "type": "boolean"
            },
            "allowNormalization": {
              "type": "boolean"
            },
            "allowSeed": {
              "type": "boolean"
            }
          }
        },
        "elevenlabs": {
          "type": "object",
          "additionalProperties": false,
          "properties": {
            "apiKey": {
              "type": "string"
            },
            "baseUrl": {
              "type": "string"
            },
            "voiceId": {
              "type": "string"
            },
            "modelId": {
              "type": "string"
            },
            "seed": {
              "type": "integer",
              "minimum": 0,
              "maximum": 4294967295
            },
            "applyTextNormalization": {
              "type": "string",
              "enum": [
                "auto",
                "on",
                "off"
              ]
            },
            "languageCode": {
              "type": "string"
            },
            "voiceSettings": {
              "type": "object",
              "additionalProperties": false,
              "properties": {
                "stability": {
                  "type": "number",
                  "minimum": 0,
                  "maximum": 1
                },
                "similarityBoost": {
                  "type": "number",
                  "minimum": 0,
                  "maximum": 1
                },
                "style": {
                  "type": "number",
                  "minimum": 0,
                  "maximum": 1
                },
                "useSpeakerBoost": {
                  "type": "boolean"
                },
                "speed": {
                  "type": "number",
                  "minimum": 0.5,
                  "maximum": 2
                }
              }
            }
          }
        },
        "openai": {
          "type": "object",
          "additionalProperties": false,
          "properties": {
            "apiKey": {
              "type": "string"
            },
            "model": {
              "type": "string"
            },
            "voice": {
              "type": "string"
            }
          }
        },
        "edge": {
          "type": "object",
          "additionalProperties": false,
          "properties": {
            "enabled": {
              "type": "boolean"
            },
            "voice": {
              "type": "string"
            },
            "lang": {
              "type": "string"
            },
            "outputFormat": {
              "type": "string"
            },
            "pitch": {
              "type": "string"
            },
            "rate": {
              "type": "string"
            },
            "volume": {
              "type": "string"
            },
            "saveSubtitles": {
              "type": "boolean"
            },
            "proxy": {
              "type": "string"
            },
            "timeoutMs": {
              "type": "integer",
              "minimum": 1000,
              "maximum": 120000
            }
          }
        },
        "prefsPath": {
          "type": "string"
        },
-       "instructions": {
-         "type": "string"
-       },
        "maxTextLength": {
          "type": "integer",
          "minimum": 1
        },
        "timeoutMs": {
          "type": "integer",
          "minimum": 1000,
          "maximum": 120000
        }
      }
    },

@@ -74,9 +74,26 @@ const voiceCallConfigSchema = {
  },
  "streaming.sttModel": { label: "Realtime STT Model", advanced: true },
  "streaming.streamPath": { label: "Media Stream Path", advanced: true },
- "tts.model": { label: "TTS Model", advanced: true },
- "tts.voice": { label: "TTS Voice", advanced: true },
- "tts.instructions": { label: "TTS Instructions", advanced: true },
  "tts.provider": {
    label: "TTS Provider Override",
    help: "Deep-merges with messages.tts (Edge is ignored for calls).",
    advanced: true,
  },
  "tts.openai.model": { label: "OpenAI TTS Model", advanced: true },
  "tts.openai.voice": { label: "OpenAI TTS Voice", advanced: true },
  "tts.openai.apiKey": {
    label: "OpenAI API Key",
    sensitive: true,
    advanced: true,
  },
  "tts.elevenlabs.modelId": { label: "ElevenLabs Model ID", advanced: true },
  "tts.elevenlabs.voiceId": { label: "ElevenLabs Voice ID", advanced: true },
  "tts.elevenlabs.apiKey": {
    label: "ElevenLabs API Key",
    sensitive: true,
    advanced: true,
  },
  "tts.elevenlabs.baseUrl": { label: "ElevenLabs Base URL", advanced: true },
  publicUrl: { label: "Public Webhook URL", advanced: true },
  skipSignatureVerification: {
    label: "Skip Signature Verification",

@@ -161,6 +178,7 @@ const voiceCallPlugin = {
      runtimePromise = createVoiceCallRuntime({
        config: cfg,
        coreConfig: api.config as CoreConfig,
        ttsRuntime: api.runtime.tts,
        logger: api.logger,
      });
    }

@@ -82,31 +82,82 @@ export const SttConfigSchema = z
  .default({ provider: "openai", model: "whisper-1" });
export type SttConfig = z.infer<typeof SttConfigSchema>;

export const TtsProviderSchema = z.enum(["openai", "elevenlabs", "edge"]);
export const TtsModeSchema = z.enum(["final", "all"]);
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);

export const TtsConfigSchema = z
  .object({
-   /** TTS provider (currently only OpenAI supported) */
-   provider: z.literal("openai").default("openai"),
-   /**
-    * TTS model to use:
-    * - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
-    * - tts-1: lower latency
-    * - tts-1-hd: higher quality
-    */
-   model: z.string().min(1).default("gpt-4o-mini-tts"),
-   /**
-    * Voice ID. For best quality, use marin or cedar.
-    * All voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
-    */
-   voice: z.string().min(1).default("coral"),
-   /**
-    * Instructions for speech style (only works with gpt-4o-mini-tts).
-    * Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
-    */
-   instructions: z.string().optional(),
    auto: TtsAutoSchema.optional(),
    enabled: z.boolean().optional(),
    mode: TtsModeSchema.optional(),
    provider: TtsProviderSchema.optional(),
    summaryModel: z.string().optional(),
    modelOverrides: z
      .object({
        enabled: z.boolean().optional(),
        allowText: z.boolean().optional(),
        allowProvider: z.boolean().optional(),
        allowVoice: z.boolean().optional(),
        allowModelId: z.boolean().optional(),
        allowVoiceSettings: z.boolean().optional(),
        allowNormalization: z.boolean().optional(),
        allowSeed: z.boolean().optional(),
      })
      .strict()
      .optional(),
    elevenlabs: z
      .object({
        apiKey: z.string().optional(),
        baseUrl: z.string().optional(),
        voiceId: z.string().optional(),
        modelId: z.string().optional(),
        seed: z.number().int().min(0).max(4294967295).optional(),
        applyTextNormalization: z.enum(["auto", "on", "off"]).optional(),
        languageCode: z.string().optional(),
        voiceSettings: z
          .object({
            stability: z.number().min(0).max(1).optional(),
            similarityBoost: z.number().min(0).max(1).optional(),
            style: z.number().min(0).max(1).optional(),
            useSpeakerBoost: z.boolean().optional(),
            speed: z.number().min(0.5).max(2).optional(),
          })
          .strict()
          .optional(),
      })
      .strict()
      .optional(),
    openai: z
      .object({
        apiKey: z.string().optional(),
        model: z.string().optional(),
        voice: z.string().optional(),
      })
      .strict()
      .optional(),
    edge: z
      .object({
        enabled: z.boolean().optional(),
        voice: z.string().optional(),
        lang: z.string().optional(),
        outputFormat: z.string().optional(),
        pitch: z.string().optional(),
        rate: z.string().optional(),
        volume: z.string().optional(),
        saveSubtitles: z.boolean().optional(),
        proxy: z.string().optional(),
        timeoutMs: z.number().int().min(1000).max(120000).optional(),
      })
      .strict()
      .optional(),
    prefsPath: z.string().optional(),
    maxTextLength: z.number().int().min(1).optional(),
    timeoutMs: z.number().int().min(1000).max(120000).optional(),
  })
  .strict()
- .default({ provider: "openai", model: "gpt-4o-mini-tts", voice: "coral" });
-export type TtsConfig = z.infer<typeof TtsConfigSchema>;
  .optional();
export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;

// -----------------------------------------------------------------------------
// Webhook Server Configuration
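
As a quick sanity check, the plugin-level override shown in the docs above parses against this schema (a sketch):

```ts
const override = TtsConfigSchema.parse({
  provider: "elevenlabs",
  elevenlabs: { voiceId: "pMsXgVXv3BLzUgSXRplE", modelId: "eleven_multilingual_v2" },
});
```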

@@ -307,7 +358,7 @@ export const VoiceCallConfigSchema = z
    /** STT configuration */
    stt: SttConfigSchema,

-   /** TTS configuration */
    /** TTS override (deep-merges with core messages.tts) */
    tts: TtsConfigSchema,

    /** Store path for call logs */

@@ -2,10 +2,16 @@ import fs from "node:fs";
import path from "node:path";
import { fileURLToPath, pathToFileURL } from "node:url";

import type { VoiceCallTtsConfig } from "./config.js";

export type CoreConfig = {
  session?: {
    store?: string;
  };
  messages?: {
    tts?: VoiceCallTtsConfig;
  };
  [key: string]: unknown;
};

type CoreAgentDeps = {

@@ -143,7 +143,7 @@ export class CallManager {
    // For notify mode with a message, use inline TwiML with <Say>
    let inlineTwiml: string | undefined;
    if (mode === "notify" && initialMessage) {
-     const pollyVoice = mapVoiceToPolly(this.config.tts.voice);
      const pollyVoice = mapVoiceToPolly(this.config.tts?.openai?.voice);
      inlineTwiml = this.generateNotifyTwiml(initialMessage, pollyVoice);
      console.log(
        `[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`,

@@ -210,11 +210,13 @@ export class CallManager {
    this.addTranscriptEntry(call, "bot", text);

    // Play TTS
    const voice =
      this.provider?.name === "twilio" ? this.config.tts?.openai?.voice : undefined;
    await this.provider.playTts({
      callId,
      providerCallId: call.providerCallId,
      text,
-     voice: this.config.tts.voice,
      voice,
    });

    return { success: true };

@@ -68,7 +68,7 @@ export async function initiateCall(
  // For notify mode with a message, use inline TwiML with <Say>.
  let inlineTwiml: string | undefined;
  if (mode === "notify" && initialMessage) {
-   const pollyVoice = mapVoiceToPolly(ctx.config.tts.voice);
    const pollyVoice = mapVoiceToPolly(ctx.config.tts?.openai?.voice);
    inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
    console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
  }

@@ -120,11 +120,13 @@ export async function speak(
  addTranscriptEntry(call, "bot", text);

  const voice =
    ctx.provider?.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined;
  await ctx.provider.playTts({
    callId,
    providerCallId: call.providerCallId,
    text,
-   voice: ctx.config.tts.voice,
    voice,
  });

  return { success: true };

@@ -244,4 +246,3 @@ export async function endCall(
    return { success: false, error: err instanceof Error ? err.message : String(err) };
  }
}

@@ -15,9 +15,9 @@ import type {
  WebhookVerificationResult,
} from "../types.js";
import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
import { chunkAudio } from "../telephony-audio.js";
import type { TelephonyTtsProvider } from "../telephony-tts.js";
import type { VoiceCallProvider } from "./base.js";
-import type { OpenAITTSProvider } from "./tts-openai.js";
-import { chunkAudio } from "./tts-openai.js";
import { twilioApiRequest } from "./twilio/api.js";
import { verifyTwilioProviderWebhook } from "./twilio/webhook.js";

@@ -53,8 +53,8 @@ export class TwilioProvider implements VoiceCallProvider {
  /** Current public webhook URL (set when tunnel starts or from config) */
  private currentPublicUrl: string | null = null;

- /** Optional OpenAI TTS provider for streaming TTS */
- private ttsProvider: OpenAITTSProvider | null = null;
  /** Optional telephony TTS provider for streaming TTS */
  private ttsProvider: TelephonyTtsProvider | null = null;

  /** Optional media stream handler for sending audio */
  private mediaStreamHandler: MediaStreamHandler | null = null;

@@ -119,7 +119,7 @@ export class TwilioProvider implements VoiceCallProvider {
    return this.currentPublicUrl;
  }

- setTTSProvider(provider: OpenAITTSProvider): void {
  setTTSProvider(provider: TelephonyTtsProvider): void {
    this.ttsProvider = provider;
  }

@@ -454,13 +454,13 @@ export class TwilioProvider implements VoiceCallProvider {
   * Play TTS audio via Twilio.
   *
   * Two modes:
-  * 1. OpenAI TTS + Media Streams: If TTS provider and media stream are available,
-  *    generates audio via OpenAI and streams it through WebSocket (preferred).
   * 1. Core TTS + Media Streams: If TTS provider and media stream are available,
   *    generates audio via core TTS and streams it through WebSocket (preferred).
   * 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
   *    Note: This may not work on all Twilio accounts.
   */
  async playTts(input: PlayTtsInput): Promise<void> {
-   // Try OpenAI TTS via media stream first (if configured)
    // Try telephony TTS via media stream first (if configured)
    const streamSid = this.callStreamMap.get(input.providerCallId);
    if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
      try {

@@ -468,7 +468,7 @@ export class TwilioProvider implements VoiceCallProvider {
        return;
      } catch (err) {
        console.warn(
-         `[voice-call] OpenAI TTS failed, falling back to Twilio <Say>:`,
          `[voice-call] Telephony TTS failed, falling back to Twilio <Say>:`,
          err instanceof Error ? err.message : err,
        );
        // Fall through to TwiML <Say> fallback

@@ -484,7 +484,7 @@ export class TwilioProvider implements VoiceCallProvider {
    }

    console.warn(
-     "[voice-call] Using TwiML <Say> fallback - OpenAI TTS not configured or media stream not active",
      "[voice-call] Using TwiML <Say> fallback - telephony TTS not configured or media stream not active",
    );

    const pollyVoice = mapVoiceToPolly(input.voice);

@@ -502,8 +502,8 @@ export class TwilioProvider implements VoiceCallProvider {
  }

  /**
-  * Play TTS via OpenAI and Twilio Media Streams.
-  * Generates audio with OpenAI TTS, converts to mu-law, and streams via WebSocket.
   * Play TTS via core TTS and Twilio Media Streams.
   * Generates audio with core TTS, converts to mu-law, and streams via WebSocket.
   * Uses a jitter buffer to smooth out timing variations.
   */
  private async playTtsViaStream(

@@ -514,8 +514,8 @@ export class TwilioProvider implements VoiceCallProvider {
      throw new Error("TTS provider and media stream handler required");
    }

-   // Generate audio with OpenAI TTS (returns mu-law at 8kHz)
-   const muLawAudio = await this.ttsProvider.synthesizeForTwilio(text);
    // Generate audio with core TTS (returns mu-law at 8kHz)
    const muLawAudio = await this.ttsProvider.synthesizeForTelephony(text);

    // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
    const CHUNK_SIZE = 160;

@@ -6,8 +6,9 @@ import type { VoiceCallProvider } from "./providers/base.js";
import { MockProvider } from "./providers/mock.js";
import { PlivoProvider } from "./providers/plivo.js";
import { TelnyxProvider } from "./providers/telnyx.js";
-import { OpenAITTSProvider } from "./providers/tts-openai.js";
import { TwilioProvider } from "./providers/twilio.js";
import type { TelephonyTtsRuntime } from "./telephony-tts.js";
import { createTelephonyTtsProvider } from "./telephony-tts.js";
import { startTunnel, type TunnelResult } from "./tunnel.js";
import {
  cleanupTailscaleExposure,

@@ -81,9 +82,10 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
export async function createVoiceCallRuntime(params: {
  config: VoiceCallConfig;
  coreConfig: CoreConfig;
  ttsRuntime?: TelephonyTtsRuntime;
  logger?: Logger;
}): Promise<VoiceCallRuntime> {
- const { config, coreConfig, logger } = params;
  const { config, coreConfig, ttsRuntime, logger } = params;
  const log = logger ?? {
    info: console.log,
    warn: console.warn,

@@ -149,27 +151,24 @@ export async function createVoiceCallRuntime(params: {

  if (provider.name === "twilio" && config.streaming?.enabled) {
    const twilioProvider = provider as TwilioProvider;
-   const openaiApiKey =
-     config.streaming.openaiApiKey || process.env.OPENAI_API_KEY;
-   if (openaiApiKey) {
    if (ttsRuntime?.textToSpeechTelephony) {
      try {
-       const ttsProvider = new OpenAITTSProvider({
-         apiKey: openaiApiKey,
-         voice: config.tts.voice,
-         model: config.tts.model,
-         instructions: config.tts.instructions,
        const ttsProvider = createTelephonyTtsProvider({
          coreConfig,
          ttsOverride: config.tts,
          runtime: ttsRuntime,
        });
        twilioProvider.setTTSProvider(ttsProvider);
-       log.info("[voice-call] OpenAI TTS provider configured");
        log.info("[voice-call] Telephony TTS provider configured");
      } catch (err) {
        log.warn(
-         `[voice-call] Failed to initialize OpenAI TTS: ${
          `[voice-call] Failed to initialize telephony TTS: ${
            err instanceof Error ? err.message : String(err)
          }`,
        );
      }
    } else {
-     log.warn("[voice-call] OpenAI TTS key missing; streaming TTS disabled");
      log.warn("[voice-call] Telephony TTS unavailable; streaming TTS disabled");
    }

    const mediaHandler = webhookServer.getMediaStreamHandler();

extensions/voice-call/src/telephony-audio.ts (new file, 88 lines)
@@ -0,0 +1,88 @@
const TELEPHONY_SAMPLE_RATE = 8000;

function clamp16(value: number): number {
  return Math.max(-32768, Math.min(32767, value));
}

/**
 * Resample 16-bit PCM (little-endian mono) to 8kHz using linear interpolation.
 */
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
  if (inputSampleRate === TELEPHONY_SAMPLE_RATE) return input;
  const inputSamples = Math.floor(input.length / 2);
  if (inputSamples === 0) return Buffer.alloc(0);

  const ratio = inputSampleRate / TELEPHONY_SAMPLE_RATE;
  const outputSamples = Math.floor(inputSamples / ratio);
  const output = Buffer.alloc(outputSamples * 2);

  for (let i = 0; i < outputSamples; i++) {
    const srcPos = i * ratio;
    const srcIndex = Math.floor(srcPos);
    const frac = srcPos - srcIndex;

    const s0 = input.readInt16LE(srcIndex * 2);
    const s1Index = Math.min(srcIndex + 1, inputSamples - 1);
    const s1 = input.readInt16LE(s1Index * 2);

    const sample = Math.round(s0 + frac * (s1 - s0));
    output.writeInt16LE(clamp16(sample), i * 2);
  }

  return output;
}

/**
 * Convert 16-bit PCM to 8-bit mu-law (G.711).
 */
export function pcmToMulaw(pcm: Buffer): Buffer {
  const samples = Math.floor(pcm.length / 2);
  const mulaw = Buffer.alloc(samples);

  for (let i = 0; i < samples; i++) {
    const sample = pcm.readInt16LE(i * 2);
    mulaw[i] = linearToMulaw(sample);
  }

  return mulaw;
}

export function convertPcmToMulaw8k(
  pcm: Buffer,
  inputSampleRate: number,
): Buffer {
  const pcm8k = resamplePcmTo8k(pcm, inputSampleRate);
  return pcmToMulaw(pcm8k);
}

/**
 * Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law).
 */
export function chunkAudio(
  audio: Buffer,
  chunkSize = 160,
): Generator<Buffer, void, unknown> {
  return (function* () {
    for (let i = 0; i < audio.length; i += chunkSize) {
      yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
    }
  })();
}

function linearToMulaw(sample: number): number {
  const BIAS = 132;
  const CLIP = 32635;

  const sign = sample < 0 ? 0x80 : 0;
  if (sample < 0) sample = -sample;
  if (sample > CLIP) sample = CLIP;

  sample += BIAS;
  let exponent = 7;
  for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--) {
    expMask >>= 1;
  }

  const mantissa = (sample >> (exponent + 3)) & 0x0f;
  return ~(sign | (exponent << 4) | mantissa) & 0xff;
}
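
A usage sketch for these helpers, assuming 24kHz PCM input as produced by the OpenAI telephony path later in this commit:

```ts
import { chunkAudio, convertPcmToMulaw8k } from "./telephony-audio.js";

// Placeholder: 16-bit little-endian mono PCM, e.g. from textToSpeechTelephony.
declare const pcm: Buffer;

// Resample to 8kHz and encode as G.711 mu-law for Twilio media streams.
const muLaw = convertPcmToMulaw8k(pcm, 24000);

// Send as 20ms frames (160 bytes each at 8kHz mu-law).
for (const frame of chunkAudio(muLaw)) {
  // hand `frame` to the provider's media-stream transport here
}
```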

extensions/voice-call/src/telephony-tts.ts (new file, 95 lines)
@@ -0,0 +1,95 @@
import type { CoreConfig } from "./core-bridge.js";
import type { VoiceCallTtsConfig } from "./config.js";
import { convertPcmToMulaw8k } from "./telephony-audio.js";

export type TelephonyTtsRuntime = {
  textToSpeechTelephony: (params: {
    text: string;
    cfg: CoreConfig;
    prefsPath?: string;
  }) => Promise<{
    success: boolean;
    audioBuffer?: Buffer;
    sampleRate?: number;
    provider?: string;
    error?: string;
  }>;
};

export type TelephonyTtsProvider = {
  synthesizeForTelephony: (text: string) => Promise<Buffer>;
};

export function createTelephonyTtsProvider(params: {
  coreConfig: CoreConfig;
  ttsOverride?: VoiceCallTtsConfig;
  runtime: TelephonyTtsRuntime;
}): TelephonyTtsProvider {
  const { coreConfig, ttsOverride, runtime } = params;
  const mergedConfig = applyTtsOverride(coreConfig, ttsOverride);

  return {
    synthesizeForTelephony: async (text: string) => {
      const result = await runtime.textToSpeechTelephony({
        text,
        cfg: mergedConfig,
      });

      if (!result.success || !result.audioBuffer || !result.sampleRate) {
        throw new Error(result.error ?? "TTS conversion failed");
      }

      return convertPcmToMulaw8k(result.audioBuffer, result.sampleRate);
    },
  };
}

function applyTtsOverride(
  coreConfig: CoreConfig,
  override?: VoiceCallTtsConfig,
): CoreConfig {
  if (!override) return coreConfig;

  const base = coreConfig.messages?.tts;
  const merged = mergeTtsConfig(base, override);
  if (!merged) return coreConfig;

  return {
    ...coreConfig,
    messages: {
      ...(coreConfig.messages ?? {}),
      tts: merged,
    },
  };
}

function mergeTtsConfig(
  base?: VoiceCallTtsConfig,
  override?: VoiceCallTtsConfig,
): VoiceCallTtsConfig | undefined {
  if (!base && !override) return undefined;
  if (!override) return base;
  if (!base) return override;
  return deepMerge(base, override);
}

function deepMerge<T>(base: T, override: T): T {
  if (!isPlainObject(base) || !isPlainObject(override)) {
    return override;
  }
  const result: Record<string, unknown> = { ...base };
  for (const [key, value] of Object.entries(override)) {
    if (value === undefined) continue;
    const existing = (base as Record<string, unknown>)[key];
    if (isPlainObject(existing) && isPlainObject(value)) {
      result[key] = deepMerge(existing, value);
    } else {
      result[key] = value;
    }
  }
  return result as T;
}

function isPlainObject(value: unknown): value is Record<string, unknown> {
  return Boolean(value) && typeof value === "object" && !Array.isArray(value);
}
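
Merge semantics in brief (illustrative values): override keys win, nested objects merge per key, and `undefined` values in the override are ignored.

```ts
const base = { provider: "openai", openai: { model: "tts-1", voice: "alloy" } };
const override = { openai: { voice: "marin" } };
deepMerge(base, override);
// => { provider: "openai", openai: { model: "tts-1", voice: "marin" } }
```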

@@ -124,6 +124,7 @@ import { startWebLoginWithQr, waitForWebLogin } from "../../web/login-qr.js";
import { sendMessageWhatsApp, sendPollWhatsApp } from "../../web/outbound.js";
import { registerMemoryCli } from "../../cli/memory-cli.js";
import { formatNativeDependencyHint } from "./native-deps.js";
import { textToSpeechTelephony } from "../../tts/tts.js";

import type { PluginRuntime } from "./types.js";

@@ -162,6 +163,9 @@ export function createPluginRuntime(): PluginRuntime {
      getImageMetadata,
      resizeToJpeg,
    },
    tts: {
      textToSpeechTelephony,
    },
    tools: {
      createMemoryGetTool,
      createMemorySearchTool,

@@ -16,6 +16,7 @@ type UpsertChannelPairingRequest =
  typeof import("../../pairing/pairing-store.js").upsertChannelPairingRequest;
type FetchRemoteMedia = typeof import("../../media/fetch.js").fetchRemoteMedia;
type SaveMediaBuffer = typeof import("../../media/store.js").saveMediaBuffer;
type TextToSpeechTelephony = typeof import("../../tts/tts.js").textToSpeechTelephony;
type BuildMentionRegexes = typeof import("../../auto-reply/reply/mentions.js").buildMentionRegexes;
type MatchesMentionPatterns =
  typeof import("../../auto-reply/reply/mentions.js").matchesMentionPatterns;

@@ -173,6 +174,9 @@ export type PluginRuntime = {
    getImageMetadata: GetImageMetadata;
    resizeToJpeg: ResizeToJpeg;
  };
  tts: {
    textToSpeechTelephony: TextToSpeechTelephony;
  };
  tools: {
    createMemoryGetTool: CreateMemoryGetTool;
    createMemorySearchTool: CreateMemorySearchTool;

@@ -43,6 +43,7 @@ function setup(config: Record<string, unknown>): Registered {
    source: "test",
    config: {},
    pluginConfig: config,
    runtime: { tts: { textToSpeechTelephony: vi.fn() } },
    logger: noopLogger,
    registerGatewayMethod: (method, handler) => methods.set(method, handler),
    registerTool: (tool) => tools.push(tool),

@@ -142,6 +143,7 @@ describe("voice-call plugin", () => {
    source: "test",
    config: {},
    pluginConfig: { provider: "mock" },
    runtime: { tts: { textToSpeechTelephony: vi.fn() } },
    logger: noopLogger,
    registerGatewayMethod: () => {},
    registerTool: () => {},

src/tts/tts.ts
@@ -76,6 +76,11 @@ const DEFAULT_OUTPUT = {
  voiceCompatible: false,
};

const TELEPHONY_OUTPUT = {
  openai: { format: "pcm" as const, sampleRate: 24000 },
  elevenlabs: { format: "pcm_22050", sampleRate: 22050 },
};

const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);

export type ResolvedTtsConfig = {

@@ -180,6 +185,16 @@ export type TtsResult = {
  voiceCompatible?: boolean;
};

export type TtsTelephonyResult = {
  success: boolean;
  audioBuffer?: Buffer;
  error?: string;
  latencyMs?: number;
  provider?: string;
  outputFormat?: string;
  sampleRate?: number;
};

type TtsStatusEntry = {
  timestamp: number;
  success: boolean;

@@ -980,7 +995,7 @@ async function openaiTTS(params: {
  apiKey: string;
  model: string;
  voice: string;
- responseFormat: "mp3" | "opus";
  responseFormat: "mp3" | "opus" | "pcm";
  timeoutMs: number;
}): Promise<Buffer> {
  const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;

@@ -1224,6 +1239,100 @@ export async function textToSpeech(params: {
  };
}

export async function textToSpeechTelephony(params: {
  text: string;
  cfg: ClawdbotConfig;
  prefsPath?: string;
}): Promise<TtsTelephonyResult> {
  const config = resolveTtsConfig(params.cfg);
  const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);

  if (params.text.length > config.maxTextLength) {
    return {
      success: false,
      error: `Text too long (${params.text.length} chars, max ${config.maxTextLength})`,
    };
  }

  const userProvider = getTtsProvider(config, prefsPath);
  const providers = resolveTtsProviderOrder(userProvider);

  let lastError: string | undefined;

  for (const provider of providers) {
    const providerStart = Date.now();
    try {
      if (provider === "edge") {
        lastError = "edge: unsupported for telephony";
        continue;
      }

      const apiKey = resolveTtsApiKey(config, provider);
      if (!apiKey) {
        lastError = `No API key for ${provider}`;
        continue;
      }

      if (provider === "elevenlabs") {
        const output = TELEPHONY_OUTPUT.elevenlabs;
        const audioBuffer = await elevenLabsTTS({
          text: params.text,
          apiKey,
          baseUrl: config.elevenlabs.baseUrl,
          voiceId: config.elevenlabs.voiceId,
          modelId: config.elevenlabs.modelId,
          outputFormat: output.format,
          seed: config.elevenlabs.seed,
          applyTextNormalization: config.elevenlabs.applyTextNormalization,
          languageCode: config.elevenlabs.languageCode,
          voiceSettings: config.elevenlabs.voiceSettings,
          timeoutMs: config.timeoutMs,
        });

        return {
          success: true,
          audioBuffer,
          latencyMs: Date.now() - providerStart,
          provider,
          outputFormat: output.format,
          sampleRate: output.sampleRate,
        };
      }

      const output = TELEPHONY_OUTPUT.openai;
      const audioBuffer = await openaiTTS({
        text: params.text,
        apiKey,
        model: config.openai.model,
        voice: config.openai.voice,
        responseFormat: output.format,
        timeoutMs: config.timeoutMs,
      });

      return {
        success: true,
        audioBuffer,
        latencyMs: Date.now() - providerStart,
        provider,
        outputFormat: output.format,
        sampleRate: output.sampleRate,
      };
    } catch (err) {
      const error = err as Error;
      if (error.name === "AbortError") {
        lastError = `${provider}: request timed out`;
      } else {
        lastError = `${provider}: ${error.message}`;
      }
    }
  }

  return {
    success: false,
    error: `TTS conversion failed: ${lastError || "no providers available"}`,
  };
}

export async function maybeApplyTtsToPayload(params: {
  payload: ReplyPayload;
  cfg: ClawdbotConfig;