refactor: align voice-call TTS with core config

Peter Steinberger
2026-01-25 09:29:50 +00:00
parent 9366cbc7db
commit 83f92e34af
18 changed files with 769 additions and 69 deletions

View File

@@ -67,6 +67,22 @@ Plugins can register:
Plugins run **in-process** with the Gateway, so treat them as trusted code.
Tool authoring guide: [Plugin agent tools](/plugins/agent-tools).
## Runtime helpers
Plugins can access selected core helpers via `api.runtime`. For telephony TTS:
```ts
const result = await api.runtime.tts.textToSpeechTelephony({
text: "Hello from Clawdbot",
cfg: api.config,
});
```
Notes:
- Uses the core `messages.tts` configuration (OpenAI or ElevenLabs).
- Returns a PCM audio buffer plus its sample rate; plugins must resample/encode it for their telephony provider (see the sketch below).
- Edge TTS is not supported for telephony.
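For telephony providers that expect 8 kHz mu-law (Twilio media streams, for example), a plugin could condition the returned PCM roughly as follows. This is a sketch only: it uses nearest-sample decimation for brevity, whereas the voice-call plugin in this commit uses linear interpolation (see `telephony-audio.ts`).
```ts
// Sketch: convert the helper's 16-bit PCM output to 8 kHz G.711 mu-law.
function linearToMulaw(sample: number): number {
  const BIAS = 132;
  const CLIP = 32635;
  const sign = sample < 0 ? 0x80 : 0;
  if (sample < 0) sample = -sample;
  if (sample > CLIP) sample = CLIP;
  sample += BIAS;
  let exponent = 7;
  for (let mask = 0x4000; (sample & mask) === 0 && exponent > 0; exponent--) {
    mask >>= 1;
  }
  const mantissa = (sample >> (exponent + 3)) & 0x0f;
  return ~(sign | (exponent << 4) | mantissa) & 0xff;
}

function pcm16ToMulaw8k(pcm: Buffer, inputSampleRate: number): Buffer {
  const step = inputSampleRate / 8000; // nearest-sample decimation, for brevity only
  const outSamples = Math.floor(pcm.length / 2 / step);
  const out = Buffer.alloc(outSamples);
  for (let i = 0; i < outSamples; i++) {
    out[i] = linearToMulaw(pcm.readInt16LE(Math.floor(i * step) * 2));
  }
  return out;
}

const result = await api.runtime.tts.textToSpeechTelephony({
  text: "Hello from Clawdbot",
  cfg: api.config,
});
if (result.success && result.audioBuffer && result.sampleRate) {
  const muLaw = pcm16ToMulaw8k(result.audioBuffer, result.sampleRate);
  // Stream muLaw to the provider in 20 ms frames (160 bytes at 8 kHz).
}
```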
## Discovery & precedence
Clawdbot scans, in order:

View File

@@ -104,6 +104,87 @@ Notes:
- `mock` is a local dev provider (no network calls).
- `skipSignatureVerification` is for local testing only.
## TTS for calls
Voice Call uses the core `messages.tts` configuration (OpenAI or ElevenLabs) for
streaming speech on calls. You can override it under the plugin config with the
**same shape**; overrides deep-merge with `messages.tts`.
```json5
{
tts: {
provider: "elevenlabs",
elevenlabs: {
voiceId: "pMsXgVXv3BLzUgSXRplE",
modelId: "eleven_multilingual_v2"
}
}
}
```
Notes:
- **Edge TTS is ignored for voice calls** (telephony audio needs PCM; Edge output is unreliable).
- Core TTS is used when Twilio media streaming is enabled; otherwise calls fall back to the provider's native voices.
### More examples
Use core TTS only (no override):
```json5
{
messages: {
tts: {
provider: "openai",
openai: { voice: "alloy" }
}
}
}
```
Override to ElevenLabs just for calls (keep core default elsewhere):
```json5
{
plugins: {
entries: {
"voice-call": {
config: {
tts: {
provider: "elevenlabs",
elevenlabs: {
apiKey: "elevenlabs_key",
voiceId: "pMsXgVXv3BLzUgSXRplE",
modelId: "eleven_multilingual_v2"
}
}
}
}
}
}
}
```
Override only the OpenAI model and voice for calls (deep-merge example):
```json5
{
plugins: {
entries: {
"voice-call": {
config: {
tts: {
openai: {
model: "gpt-4o-mini-tts",
voice: "marin"
}
}
}
}
}
}
}
```
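Assuming the core config sets something like `provider: "openai"` with `voice: "alloy"` under `messages.tts` (illustrative values), the effective per-call TTS config from the last example is the key-by-key deep merge of the two:
```ts
// Illustrative only: nested objects merge key by key; the plugin override wins on conflicts.
const coreTts = { provider: "openai", openai: { voice: "alloy" } }; // messages.tts (assumed)
const callOverride = { openai: { model: "gpt-4o-mini-tts", voice: "marin" } }; // voice-call tts override

// Effective config used for calls:
// { provider: "openai", openai: { model: "gpt-4o-mini-tts", voice: "marin" } }
```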
## Inbound calls
Inbound policy defaults to `disabled`. To enable inbound calls, set:

View File

@@ -1,5 +1,12 @@
# Changelog
## 2026.1.24
### Changes
- Breaking: voice-call TTS now uses core `messages.tts` (plugin TTS config deep-merges with core).
- Telephony TTS supports OpenAI + ElevenLabs; Edge TTS is ignored for calls.
- Removed legacy `tts.model`/`tts.voice`/`tts.instructions` plugin fields.
## 2026.1.23
### Changes

View File

@@ -75,6 +75,27 @@ Notes:
- Twilio/Telnyx/Plivo require a **publicly reachable** webhook URL.
- `mock` is a local dev provider (no network calls).
## TTS for calls
Voice Call uses the core `messages.tts` configuration (OpenAI or ElevenLabs) for
streaming speech on calls. You can override it under the plugin config with the
same shape — overrides deep-merge with `messages.tts`.
```json5
{
tts: {
provider: "openai",
openai: {
voice: "alloy"
}
}
}
```
Notes:
- Edge TTS is ignored for voice calls (telephony audio needs PCM; Edge output is unreliable).
- Core TTS is used when Twilio media streaming is enabled; otherwise calls fall back to the provider's native voices (see the sketch below).
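To keep core TTS active on calls, enable Twilio media streaming in the plugin config. A minimal sketch (field names as used elsewhere in this commit; values illustrative):
```ts
// Sketch of a voice-call plugin config: media streaming enabled so calls use
// core TTS instead of falling back to the provider's native <Say> voices.
const voiceCallConfig = {
  provider: "twilio",
  streaming: { enabled: true },
  // Optional per-call override; deep-merges with messages.tts.
  tts: { provider: "openai", openai: { voice: "alloy" } },
};
```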
## CLI
```bash

View File

@@ -99,16 +99,39 @@
"label": "Media Stream Path",
"advanced": true
},
"tts.model": {
"label": "TTS Model",
"tts.provider": {
"label": "TTS Provider Override",
"help": "Deep-merges with messages.tts (Edge is ignored for calls).",
"advanced": true
},
"tts.voice": {
"label": "TTS Voice",
"tts.openai.model": {
"label": "OpenAI TTS Model",
"advanced": true
},
"tts.instructions": {
"label": "TTS Instructions",
"tts.openai.voice": {
"label": "OpenAI TTS Voice",
"advanced": true
},
"tts.openai.apiKey": {
"label": "OpenAI API Key",
"sensitive": true,
"advanced": true
},
"tts.elevenlabs.modelId": {
"label": "ElevenLabs Model ID",
"advanced": true
},
"tts.elevenlabs.voiceId": {
"label": "ElevenLabs Voice ID",
"advanced": true
},
"tts.elevenlabs.apiKey": {
"label": "ElevenLabs API Key",
"sensitive": true,
"advanced": true
},
"tts.elevenlabs.baseUrl": {
"label": "ElevenLabs Base URL",
"advanced": true
},
"publicUrl": {
@@ -370,20 +393,193 @@
"type": "object",
"additionalProperties": false,
"properties": {
"auto": {
"type": "string",
"enum": [
"off",
"always",
"inbound",
"tagged"
]
},
"enabled": {
"type": "boolean"
},
"mode": {
"type": "string",
"enum": [
"final",
"all"
]
},
"provider": {
"type": "string",
"enum": [
"openai"
"openai",
"elevenlabs",
"edge"
]
},
"model": {
"summaryModel": {
"type": "string"
},
"voice": {
"modelOverrides": {
"type": "object",
"additionalProperties": false,
"properties": {
"enabled": {
"type": "boolean"
},
"allowText": {
"type": "boolean"
},
"allowProvider": {
"type": "boolean"
},
"allowVoice": {
"type": "boolean"
},
"allowModelId": {
"type": "boolean"
},
"allowVoiceSettings": {
"type": "boolean"
},
"allowNormalization": {
"type": "boolean"
},
"allowSeed": {
"type": "boolean"
}
}
},
"elevenlabs": {
"type": "object",
"additionalProperties": false,
"properties": {
"apiKey": {
"type": "string"
},
"baseUrl": {
"type": "string"
},
"voiceId": {
"type": "string"
},
"modelId": {
"type": "string"
},
"seed": {
"type": "integer",
"minimum": 0,
"maximum": 4294967295
},
"applyTextNormalization": {
"type": "string",
"enum": [
"auto",
"on",
"off"
]
},
"languageCode": {
"type": "string"
},
"voiceSettings": {
"type": "object",
"additionalProperties": false,
"properties": {
"stability": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"similarityBoost": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"style": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"useSpeakerBoost": {
"type": "boolean"
},
"speed": {
"type": "number",
"minimum": 0.5,
"maximum": 2
}
}
}
}
},
"openai": {
"type": "object",
"additionalProperties": false,
"properties": {
"apiKey": {
"type": "string"
},
"model": {
"type": "string"
},
"voice": {
"type": "string"
}
}
},
"edge": {
"type": "object",
"additionalProperties": false,
"properties": {
"enabled": {
"type": "boolean"
},
"voice": {
"type": "string"
},
"lang": {
"type": "string"
},
"outputFormat": {
"type": "string"
},
"pitch": {
"type": "string"
},
"rate": {
"type": "string"
},
"volume": {
"type": "string"
},
"saveSubtitles": {
"type": "boolean"
},
"proxy": {
"type": "string"
},
"timeoutMs": {
"type": "integer",
"minimum": 1000,
"maximum": 120000
}
}
},
"prefsPath": {
"type": "string"
},
"instructions": {
"type": "string"
"maxTextLength": {
"type": "integer",
"minimum": 1
},
"timeoutMs": {
"type": "integer",
"minimum": 1000,
"maximum": 120000
}
}
},

View File

@@ -74,9 +74,26 @@ const voiceCallConfigSchema = {
},
"streaming.sttModel": { label: "Realtime STT Model", advanced: true },
"streaming.streamPath": { label: "Media Stream Path", advanced: true },
"tts.model": { label: "TTS Model", advanced: true },
"tts.voice": { label: "TTS Voice", advanced: true },
"tts.instructions": { label: "TTS Instructions", advanced: true },
"tts.provider": {
label: "TTS Provider Override",
help: "Deep-merges with messages.tts (Edge is ignored for calls).",
advanced: true,
},
"tts.openai.model": { label: "OpenAI TTS Model", advanced: true },
"tts.openai.voice": { label: "OpenAI TTS Voice", advanced: true },
"tts.openai.apiKey": {
label: "OpenAI API Key",
sensitive: true,
advanced: true,
},
"tts.elevenlabs.modelId": { label: "ElevenLabs Model ID", advanced: true },
"tts.elevenlabs.voiceId": { label: "ElevenLabs Voice ID", advanced: true },
"tts.elevenlabs.apiKey": {
label: "ElevenLabs API Key",
sensitive: true,
advanced: true,
},
"tts.elevenlabs.baseUrl": { label: "ElevenLabs Base URL", advanced: true },
publicUrl: { label: "Public Webhook URL", advanced: true },
skipSignatureVerification: {
label: "Skip Signature Verification",
@@ -161,6 +178,7 @@ const voiceCallPlugin = {
runtimePromise = createVoiceCallRuntime({
config: cfg,
coreConfig: api.config as CoreConfig,
ttsRuntime: api.runtime.tts,
logger: api.logger,
});
}

View File

@@ -82,31 +82,82 @@ export const SttConfigSchema = z
.default({ provider: "openai", model: "whisper-1" });
export type SttConfig = z.infer<typeof SttConfigSchema>;
export const TtsProviderSchema = z.enum(["openai", "elevenlabs", "edge"]);
export const TtsModeSchema = z.enum(["final", "all"]);
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
export const TtsConfigSchema = z
.object({
/** TTS provider (currently only OpenAI supported) */
provider: z.literal("openai").default("openai"),
/**
* TTS model to use:
* - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
* - tts-1: lower latency
* - tts-1-hd: higher quality
*/
model: z.string().min(1).default("gpt-4o-mini-tts"),
/**
* Voice ID. For best quality, use marin or cedar.
* All voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
*/
voice: z.string().min(1).default("coral"),
/**
* Instructions for speech style (only works with gpt-4o-mini-tts).
* Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
*/
instructions: z.string().optional(),
auto: TtsAutoSchema.optional(),
enabled: z.boolean().optional(),
mode: TtsModeSchema.optional(),
provider: TtsProviderSchema.optional(),
summaryModel: z.string().optional(),
modelOverrides: z
.object({
enabled: z.boolean().optional(),
allowText: z.boolean().optional(),
allowProvider: z.boolean().optional(),
allowVoice: z.boolean().optional(),
allowModelId: z.boolean().optional(),
allowVoiceSettings: z.boolean().optional(),
allowNormalization: z.boolean().optional(),
allowSeed: z.boolean().optional(),
})
.strict()
.optional(),
elevenlabs: z
.object({
apiKey: z.string().optional(),
baseUrl: z.string().optional(),
voiceId: z.string().optional(),
modelId: z.string().optional(),
seed: z.number().int().min(0).max(4294967295).optional(),
applyTextNormalization: z.enum(["auto", "on", "off"]).optional(),
languageCode: z.string().optional(),
voiceSettings: z
.object({
stability: z.number().min(0).max(1).optional(),
similarityBoost: z.number().min(0).max(1).optional(),
style: z.number().min(0).max(1).optional(),
useSpeakerBoost: z.boolean().optional(),
speed: z.number().min(0.5).max(2).optional(),
})
.strict()
.optional(),
})
.strict()
.optional(),
openai: z
.object({
apiKey: z.string().optional(),
model: z.string().optional(),
voice: z.string().optional(),
})
.strict()
.optional(),
edge: z
.object({
enabled: z.boolean().optional(),
voice: z.string().optional(),
lang: z.string().optional(),
outputFormat: z.string().optional(),
pitch: z.string().optional(),
rate: z.string().optional(),
volume: z.string().optional(),
saveSubtitles: z.boolean().optional(),
proxy: z.string().optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),
})
.strict()
.optional(),
prefsPath: z.string().optional(),
maxTextLength: z.number().int().min(1).optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),
})
.strict()
.default({ provider: "openai", model: "gpt-4o-mini-tts", voice: "coral" });
export type TtsConfig = z.infer<typeof TtsConfigSchema>;
.optional();
export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;
// -----------------------------------------------------------------------------
// Webhook Server Configuration
@@ -307,7 +358,7 @@ export const VoiceCallConfigSchema = z
/** STT configuration */
stt: SttConfigSchema,
/** TTS configuration */
/** TTS override (deep-merges with core messages.tts) */
tts: TtsConfigSchema,
/** Store path for call logs */

View File

@@ -2,10 +2,16 @@ import fs from "node:fs";
import path from "node:path";
import { fileURLToPath, pathToFileURL } from "node:url";
import type { VoiceCallTtsConfig } from "./config.js";
export type CoreConfig = {
session?: {
store?: string;
};
messages?: {
tts?: VoiceCallTtsConfig;
};
[key: string]: unknown;
};
type CoreAgentDeps = {

View File

@@ -143,7 +143,7 @@ export class CallManager {
// For notify mode with a message, use inline TwiML with <Say>
let inlineTwiml: string | undefined;
if (mode === "notify" && initialMessage) {
const pollyVoice = mapVoiceToPolly(this.config.tts.voice);
const pollyVoice = mapVoiceToPolly(this.config.tts?.openai?.voice);
inlineTwiml = this.generateNotifyTwiml(initialMessage, pollyVoice);
console.log(
`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`,
@@ -210,11 +210,13 @@ export class CallManager {
this.addTranscriptEntry(call, "bot", text);
// Play TTS
const voice =
this.provider?.name === "twilio" ? this.config.tts?.openai?.voice : undefined;
await this.provider.playTts({
callId,
providerCallId: call.providerCallId,
text,
voice: this.config.tts.voice,
voice,
});
return { success: true };

View File

@@ -68,7 +68,7 @@ export async function initiateCall(
// For notify mode with a message, use inline TwiML with <Say>.
let inlineTwiml: string | undefined;
if (mode === "notify" && initialMessage) {
const pollyVoice = mapVoiceToPolly(ctx.config.tts.voice);
const pollyVoice = mapVoiceToPolly(ctx.config.tts?.openai?.voice);
inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
}
@@ -120,11 +120,13 @@ export async function speak(
addTranscriptEntry(call, "bot", text);
const voice =
ctx.provider?.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined;
await ctx.provider.playTts({
callId,
providerCallId: call.providerCallId,
text,
voice: ctx.config.tts.voice,
voice,
});
return { success: true };
@@ -244,4 +246,3 @@ export async function endCall(
return { success: false, error: err instanceof Error ? err.message : String(err) };
}
}

View File

@@ -15,9 +15,9 @@ import type {
WebhookVerificationResult,
} from "../types.js";
import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
import { chunkAudio } from "../telephony-audio.js";
import type { TelephonyTtsProvider } from "../telephony-tts.js";
import type { VoiceCallProvider } from "./base.js";
import type { OpenAITTSProvider } from "./tts-openai.js";
import { chunkAudio } from "./tts-openai.js";
import { twilioApiRequest } from "./twilio/api.js";
import { verifyTwilioProviderWebhook } from "./twilio/webhook.js";
@@ -53,8 +53,8 @@ export class TwilioProvider implements VoiceCallProvider {
/** Current public webhook URL (set when tunnel starts or from config) */
private currentPublicUrl: string | null = null;
/** Optional OpenAI TTS provider for streaming TTS */
private ttsProvider: OpenAITTSProvider | null = null;
/** Optional telephony TTS provider for streaming TTS */
private ttsProvider: TelephonyTtsProvider | null = null;
/** Optional media stream handler for sending audio */
private mediaStreamHandler: MediaStreamHandler | null = null;
@@ -119,7 +119,7 @@ export class TwilioProvider implements VoiceCallProvider {
return this.currentPublicUrl;
}
setTTSProvider(provider: OpenAITTSProvider): void {
setTTSProvider(provider: TelephonyTtsProvider): void {
this.ttsProvider = provider;
}
@@ -454,13 +454,13 @@ export class TwilioProvider implements VoiceCallProvider {
* Play TTS audio via Twilio.
*
* Two modes:
* 1. OpenAI TTS + Media Streams: If TTS provider and media stream are available,
* generates audio via OpenAI and streams it through WebSocket (preferred).
* 1. Core TTS + Media Streams: If TTS provider and media stream are available,
* generates audio via core TTS and streams it through WebSocket (preferred).
* 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
* Note: This may not work on all Twilio accounts.
*/
async playTts(input: PlayTtsInput): Promise<void> {
// Try OpenAI TTS via media stream first (if configured)
// Try telephony TTS via media stream first (if configured)
const streamSid = this.callStreamMap.get(input.providerCallId);
if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
try {
@@ -468,7 +468,7 @@ export class TwilioProvider implements VoiceCallProvider {
return;
} catch (err) {
console.warn(
`[voice-call] OpenAI TTS failed, falling back to Twilio <Say>:`,
`[voice-call] Telephony TTS failed, falling back to Twilio <Say>:`,
err instanceof Error ? err.message : err,
);
// Fall through to TwiML <Say> fallback
@@ -484,7 +484,7 @@ export class TwilioProvider implements VoiceCallProvider {
}
console.warn(
"[voice-call] Using TwiML <Say> fallback - OpenAI TTS not configured or media stream not active",
"[voice-call] Using TwiML <Say> fallback - telephony TTS not configured or media stream not active",
);
const pollyVoice = mapVoiceToPolly(input.voice);
@@ -502,8 +502,8 @@ export class TwilioProvider implements VoiceCallProvider {
}
/**
* Play TTS via OpenAI and Twilio Media Streams.
* Generates audio with OpenAI TTS, converts to mu-law, and streams via WebSocket.
* Play TTS via core TTS and Twilio Media Streams.
* Generates audio with core TTS, converts to mu-law, and streams via WebSocket.
* Uses a jitter buffer to smooth out timing variations.
*/
private async playTtsViaStream(
@@ -514,8 +514,8 @@ export class TwilioProvider implements VoiceCallProvider {
throw new Error("TTS provider and media stream handler required");
}
// Generate audio with OpenAI TTS (returns mu-law at 8kHz)
const muLawAudio = await this.ttsProvider.synthesizeForTwilio(text);
// Generate audio with core TTS (returns mu-law at 8kHz)
const muLawAudio = await this.ttsProvider.synthesizeForTelephony(text);
// Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
const CHUNK_SIZE = 160;

View File

@@ -6,8 +6,9 @@ import type { VoiceCallProvider } from "./providers/base.js";
import { MockProvider } from "./providers/mock.js";
import { PlivoProvider } from "./providers/plivo.js";
import { TelnyxProvider } from "./providers/telnyx.js";
import { OpenAITTSProvider } from "./providers/tts-openai.js";
import { TwilioProvider } from "./providers/twilio.js";
import type { TelephonyTtsRuntime } from "./telephony-tts.js";
import { createTelephonyTtsProvider } from "./telephony-tts.js";
import { startTunnel, type TunnelResult } from "./tunnel.js";
import {
cleanupTailscaleExposure,
@@ -81,9 +82,10 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
export async function createVoiceCallRuntime(params: {
config: VoiceCallConfig;
coreConfig: CoreConfig;
ttsRuntime?: TelephonyTtsRuntime;
logger?: Logger;
}): Promise<VoiceCallRuntime> {
const { config, coreConfig, logger } = params;
const { config, coreConfig, ttsRuntime, logger } = params;
const log = logger ?? {
info: console.log,
warn: console.warn,
@@ -149,27 +151,24 @@ export async function createVoiceCallRuntime(params: {
if (provider.name === "twilio" && config.streaming?.enabled) {
const twilioProvider = provider as TwilioProvider;
const openaiApiKey =
config.streaming.openaiApiKey || process.env.OPENAI_API_KEY;
if (openaiApiKey) {
if (ttsRuntime?.textToSpeechTelephony) {
try {
const ttsProvider = new OpenAITTSProvider({
apiKey: openaiApiKey,
voice: config.tts.voice,
model: config.tts.model,
instructions: config.tts.instructions,
const ttsProvider = createTelephonyTtsProvider({
coreConfig,
ttsOverride: config.tts,
runtime: ttsRuntime,
});
twilioProvider.setTTSProvider(ttsProvider);
log.info("[voice-call] OpenAI TTS provider configured");
log.info("[voice-call] Telephony TTS provider configured");
} catch (err) {
log.warn(
`[voice-call] Failed to initialize OpenAI TTS: ${
`[voice-call] Failed to initialize telephony TTS: ${
err instanceof Error ? err.message : String(err)
}`,
);
}
} else {
log.warn("[voice-call] OpenAI TTS key missing; streaming TTS disabled");
log.warn("[voice-call] Telephony TTS unavailable; streaming TTS disabled");
}
const mediaHandler = webhookServer.getMediaStreamHandler();

View File

@@ -0,0 +1,88 @@
const TELEPHONY_SAMPLE_RATE = 8000;
function clamp16(value: number): number {
return Math.max(-32768, Math.min(32767, value));
}
/**
* Resample 16-bit PCM (little-endian mono) to 8kHz using linear interpolation.
*/
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
if (inputSampleRate === TELEPHONY_SAMPLE_RATE) return input;
const inputSamples = Math.floor(input.length / 2);
if (inputSamples === 0) return Buffer.alloc(0);
const ratio = inputSampleRate / TELEPHONY_SAMPLE_RATE;
const outputSamples = Math.floor(inputSamples / ratio);
const output = Buffer.alloc(outputSamples * 2);
for (let i = 0; i < outputSamples; i++) {
const srcPos = i * ratio;
const srcIndex = Math.floor(srcPos);
const frac = srcPos - srcIndex;
const s0 = input.readInt16LE(srcIndex * 2);
const s1Index = Math.min(srcIndex + 1, inputSamples - 1);
const s1 = input.readInt16LE(s1Index * 2);
const sample = Math.round(s0 + frac * (s1 - s0));
output.writeInt16LE(clamp16(sample), i * 2);
}
return output;
}
/**
* Convert 16-bit PCM to 8-bit mu-law (G.711).
*/
export function pcmToMulaw(pcm: Buffer): Buffer {
const samples = Math.floor(pcm.length / 2);
const mulaw = Buffer.alloc(samples);
for (let i = 0; i < samples; i++) {
const sample = pcm.readInt16LE(i * 2);
mulaw[i] = linearToMulaw(sample);
}
return mulaw;
}
export function convertPcmToMulaw8k(
pcm: Buffer,
inputSampleRate: number,
): Buffer {
const pcm8k = resamplePcmTo8k(pcm, inputSampleRate);
return pcmToMulaw(pcm8k);
}
/**
* Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law).
*/
export function chunkAudio(
audio: Buffer,
chunkSize = 160,
): Generator<Buffer, void, unknown> {
return (function* () {
for (let i = 0; i < audio.length; i += chunkSize) {
yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
}
})();
}
function linearToMulaw(sample: number): number {
const BIAS = 132;
const CLIP = 32635;
const sign = sample < 0 ? 0x80 : 0;
if (sample < 0) sample = -sample;
if (sample > CLIP) sample = CLIP;
sample += BIAS;
let exponent = 7;
for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--) {
expMask >>= 1;
}
const mantissa = (sample >> (exponent + 3)) & 0x0f;
return ~(sign | (exponent << 4) | mantissa) & 0xff;
}

View File

@@ -0,0 +1,95 @@
import type { CoreConfig } from "./core-bridge.js";
import type { VoiceCallTtsConfig } from "./config.js";
import { convertPcmToMulaw8k } from "./telephony-audio.js";
export type TelephonyTtsRuntime = {
textToSpeechTelephony: (params: {
text: string;
cfg: CoreConfig;
prefsPath?: string;
}) => Promise<{
success: boolean;
audioBuffer?: Buffer;
sampleRate?: number;
provider?: string;
error?: string;
}>;
};
export type TelephonyTtsProvider = {
synthesizeForTelephony: (text: string) => Promise<Buffer>;
};
export function createTelephonyTtsProvider(params: {
coreConfig: CoreConfig;
ttsOverride?: VoiceCallTtsConfig;
runtime: TelephonyTtsRuntime;
}): TelephonyTtsProvider {
const { coreConfig, ttsOverride, runtime } = params;
const mergedConfig = applyTtsOverride(coreConfig, ttsOverride);
return {
synthesizeForTelephony: async (text: string) => {
const result = await runtime.textToSpeechTelephony({
text,
cfg: mergedConfig,
});
if (!result.success || !result.audioBuffer || !result.sampleRate) {
throw new Error(result.error ?? "TTS conversion failed");
}
return convertPcmToMulaw8k(result.audioBuffer, result.sampleRate);
},
};
}
function applyTtsOverride(
coreConfig: CoreConfig,
override?: VoiceCallTtsConfig,
): CoreConfig {
if (!override) return coreConfig;
const base = coreConfig.messages?.tts;
const merged = mergeTtsConfig(base, override);
if (!merged) return coreConfig;
return {
...coreConfig,
messages: {
...(coreConfig.messages ?? {}),
tts: merged,
},
};
}
function mergeTtsConfig(
base?: VoiceCallTtsConfig,
override?: VoiceCallTtsConfig,
): VoiceCallTtsConfig | undefined {
if (!base && !override) return undefined;
if (!override) return base;
if (!base) return override;
return deepMerge(base, override);
}
function deepMerge<T>(base: T, override: T): T {
if (!isPlainObject(base) || !isPlainObject(override)) {
return override;
}
const result: Record<string, unknown> = { ...base };
for (const [key, value] of Object.entries(override)) {
if (value === undefined) continue;
const existing = (base as Record<string, unknown>)[key];
if (isPlainObject(existing) && isPlainObject(value)) {
result[key] = deepMerge(existing, value);
} else {
result[key] = value;
}
}
return result as T;
}
function isPlainObject(value: unknown): value is Record<string, unknown> {
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
}

View File

@@ -124,6 +124,7 @@ import { startWebLoginWithQr, waitForWebLogin } from "../../web/login-qr.js";
import { sendMessageWhatsApp, sendPollWhatsApp } from "../../web/outbound.js";
import { registerMemoryCli } from "../../cli/memory-cli.js";
import { formatNativeDependencyHint } from "./native-deps.js";
import { textToSpeechTelephony } from "../../tts/tts.js";
import type { PluginRuntime } from "./types.js";
@@ -162,6 +163,9 @@ export function createPluginRuntime(): PluginRuntime {
getImageMetadata,
resizeToJpeg,
},
tts: {
textToSpeechTelephony,
},
tools: {
createMemoryGetTool,
createMemorySearchTool,

View File

@@ -16,6 +16,7 @@ type UpsertChannelPairingRequest =
typeof import("../../pairing/pairing-store.js").upsertChannelPairingRequest;
type FetchRemoteMedia = typeof import("../../media/fetch.js").fetchRemoteMedia;
type SaveMediaBuffer = typeof import("../../media/store.js").saveMediaBuffer;
type TextToSpeechTelephony = typeof import("../../tts/tts.js").textToSpeechTelephony;
type BuildMentionRegexes = typeof import("../../auto-reply/reply/mentions.js").buildMentionRegexes;
type MatchesMentionPatterns =
typeof import("../../auto-reply/reply/mentions.js").matchesMentionPatterns;
@@ -173,6 +174,9 @@ export type PluginRuntime = {
getImageMetadata: GetImageMetadata;
resizeToJpeg: ResizeToJpeg;
};
tts: {
textToSpeechTelephony: TextToSpeechTelephony;
};
tools: {
createMemoryGetTool: CreateMemoryGetTool;
createMemorySearchTool: CreateMemorySearchTool;

View File

@@ -43,6 +43,7 @@ function setup(config: Record<string, unknown>): Registered {
source: "test",
config: {},
pluginConfig: config,
runtime: { tts: { textToSpeechTelephony: vi.fn() } },
logger: noopLogger,
registerGatewayMethod: (method, handler) => methods.set(method, handler),
registerTool: (tool) => tools.push(tool),
@@ -142,6 +143,7 @@ describe("voice-call plugin", () => {
source: "test",
config: {},
pluginConfig: { provider: "mock" },
runtime: { tts: { textToSpeechTelephony: vi.fn() } },
logger: noopLogger,
registerGatewayMethod: () => {},
registerTool: () => {},

View File

@@ -76,6 +76,11 @@ const DEFAULT_OUTPUT = {
voiceCompatible: false,
};
const TELEPHONY_OUTPUT = {
openai: { format: "pcm" as const, sampleRate: 24000 },
elevenlabs: { format: "pcm_22050", sampleRate: 22050 },
};
const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);
export type ResolvedTtsConfig = {
@@ -180,6 +185,16 @@ export type TtsResult = {
voiceCompatible?: boolean;
};
export type TtsTelephonyResult = {
success: boolean;
audioBuffer?: Buffer;
error?: string;
latencyMs?: number;
provider?: string;
outputFormat?: string;
sampleRate?: number;
};
type TtsStatusEntry = {
timestamp: number;
success: boolean;
@@ -980,7 +995,7 @@ async function openaiTTS(params: {
apiKey: string;
model: string;
voice: string;
responseFormat: "mp3" | "opus";
responseFormat: "mp3" | "opus" | "pcm";
timeoutMs: number;
}): Promise<Buffer> {
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
@@ -1224,6 +1239,100 @@ export async function textToSpeech(params: {
};
}
export async function textToSpeechTelephony(params: {
text: string;
cfg: ClawdbotConfig;
prefsPath?: string;
}): Promise<TtsTelephonyResult> {
const config = resolveTtsConfig(params.cfg);
const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
if (params.text.length > config.maxTextLength) {
return {
success: false,
error: `Text too long (${params.text.length} chars, max ${config.maxTextLength})`,
};
}
const userProvider = getTtsProvider(config, prefsPath);
const providers = resolveTtsProviderOrder(userProvider);
let lastError: string | undefined;
for (const provider of providers) {
const providerStart = Date.now();
try {
if (provider === "edge") {
lastError = "edge: unsupported for telephony";
continue;
}
const apiKey = resolveTtsApiKey(config, provider);
if (!apiKey) {
lastError = `No API key for ${provider}`;
continue;
}
if (provider === "elevenlabs") {
const output = TELEPHONY_OUTPUT.elevenlabs;
const audioBuffer = await elevenLabsTTS({
text: params.text,
apiKey,
baseUrl: config.elevenlabs.baseUrl,
voiceId: config.elevenlabs.voiceId,
modelId: config.elevenlabs.modelId,
outputFormat: output.format,
seed: config.elevenlabs.seed,
applyTextNormalization: config.elevenlabs.applyTextNormalization,
languageCode: config.elevenlabs.languageCode,
voiceSettings: config.elevenlabs.voiceSettings,
timeoutMs: config.timeoutMs,
});
return {
success: true,
audioBuffer,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: output.format,
sampleRate: output.sampleRate,
};
}
const output = TELEPHONY_OUTPUT.openai;
const audioBuffer = await openaiTTS({
text: params.text,
apiKey,
model: config.openai.model,
voice: config.openai.voice,
responseFormat: output.format,
timeoutMs: config.timeoutMs,
});
return {
success: true,
audioBuffer,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: output.format,
sampleRate: output.sampleRate,
};
} catch (err) {
const error = err as Error;
if (error.name === "AbortError") {
lastError = `${provider}: request timed out`;
} else {
lastError = `${provider}: ${error.message}`;
}
}
}
return {
success: false,
error: `TTS conversion failed: ${lastError || "no providers available"}`,
};
}
export async function maybeApplyTtsToPayload(params: {
payload: ReplyPayload;
cfg: ClawdbotConfig;