From 83f92e34afe5f8a9932005b2cec862fbabc29212 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 25 Jan 2026 09:29:50 +0000 Subject: [PATCH] refactor: align voice-call TTS with core config --- docs/plugin.md | 16 ++ docs/plugins/voice-call.md | 81 +++++++ extensions/voice-call/CHANGELOG.md | 7 + extensions/voice-call/README.md | 21 ++ extensions/voice-call/clawdbot.plugin.json | 218 +++++++++++++++++- extensions/voice-call/index.ts | 24 +- extensions/voice-call/src/config.ts | 95 ++++++-- extensions/voice-call/src/core-bridge.ts | 6 + extensions/voice-call/src/manager.ts | 6 +- extensions/voice-call/src/manager/outbound.ts | 7 +- extensions/voice-call/src/providers/twilio.ts | 28 +-- extensions/voice-call/src/runtime.ts | 25 +- extensions/voice-call/src/telephony-audio.ts | 88 +++++++ extensions/voice-call/src/telephony-tts.ts | 95 ++++++++ src/plugins/runtime/index.ts | 4 + src/plugins/runtime/types.ts | 4 + src/plugins/voice-call.plugin.test.ts | 2 + src/tts/tts.ts | 111 ++++++++- 18 files changed, 769 insertions(+), 69 deletions(-) create mode 100644 extensions/voice-call/src/telephony-audio.ts create mode 100644 extensions/voice-call/src/telephony-tts.ts diff --git a/docs/plugin.md b/docs/plugin.md index ee9dfd8b0..c57a024f2 100644 --- a/docs/plugin.md +++ b/docs/plugin.md @@ -67,6 +67,22 @@ Plugins can register: Plugins run **in‑process** with the Gateway, so treat them as trusted code. Tool authoring guide: [Plugin agent tools](/plugins/agent-tools). +## Runtime helpers + +Plugins can access selected core helpers via `api.runtime`. For telephony TTS: + +```ts +const result = await api.runtime.tts.textToSpeechTelephony({ + text: "Hello from Clawdbot", + cfg: api.config, +}); +``` + +Notes: +- Uses core `messages.tts` configuration (OpenAI or ElevenLabs). +- Returns PCM audio buffer + sample rate. Plugins must resample/encode for providers. +- Edge TTS is not supported for telephony. + ## Discovery & precedence Clawdbot scans, in order: diff --git a/docs/plugins/voice-call.md b/docs/plugins/voice-call.md index 5c55cec88..eecb80133 100644 --- a/docs/plugins/voice-call.md +++ b/docs/plugins/voice-call.md @@ -104,6 +104,87 @@ Notes: - `mock` is a local dev provider (no network calls). - `skipSignatureVerification` is for local testing only. +## TTS for calls + +Voice Call uses the core `messages.tts` configuration (OpenAI or ElevenLabs) for +streaming speech on calls. You can override it under the plugin config with the +**same shape** — it deep‑merges with `messages.tts`. + +```json5 +{ + tts: { + provider: "elevenlabs", + elevenlabs: { + voiceId: "pMsXgVXv3BLzUgSXRplE", + modelId: "eleven_multilingual_v2" + } + } +} +``` + +Notes: +- **Edge TTS is ignored for voice calls** (telephony audio needs PCM; Edge output is unreliable). +- Core TTS is used when Twilio media streaming is enabled; otherwise calls fall back to provider native voices. + +### More examples + +Use core TTS only (no override): + +```json5 +{ + messages: { + tts: { + provider: "openai", + openai: { voice: "alloy" } + } + } +} +``` + +Override to ElevenLabs just for calls (keep core default elsewhere): + +```json5 +{ + plugins: { + entries: { + "voice-call": { + config: { + tts: { + provider: "elevenlabs", + elevenlabs: { + apiKey: "elevenlabs_key", + voiceId: "pMsXgVXv3BLzUgSXRplE", + modelId: "eleven_multilingual_v2" + } + } + } + } + } + } +} +``` + +Override only the OpenAI model for calls (deep‑merge example): + +```json5 +{ + plugins: { + entries: { + "voice-call": { + config: { + tts: { + openai: { + model: "gpt-4o-mini-tts", + voice: "marin" + } + } + } + } + } + } +} +``` + ## Inbound calls Inbound policy defaults to `disabled`. To enable inbound calls, set: diff --git a/extensions/voice-call/CHANGELOG.md b/extensions/voice-call/CHANGELOG.md index 0edc0dcb8..6123a7315 100644 --- a/extensions/voice-call/CHANGELOG.md +++ b/extensions/voice-call/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## 2026.1.24 + +### Changes +- Breaking: voice-call TTS now uses core `messages.tts` (plugin TTS config deep‑merges with core). +- Telephony TTS supports OpenAI + ElevenLabs; Edge TTS is ignored for calls. +- Removed legacy `tts.model`/`tts.voice`/`tts.instructions` plugin fields. + ## 2026.1.23 ### Changes diff --git a/extensions/voice-call/README.md b/extensions/voice-call/README.md index 11ff8324a..d96f90392 100644 --- a/extensions/voice-call/README.md +++ b/extensions/voice-call/README.md @@ -75,6 +75,27 @@ Notes: - Twilio/Telnyx/Plivo require a **publicly reachable** webhook URL. - `mock` is a local dev provider (no network calls). +## TTS for calls + +Voice Call uses the core `messages.tts` configuration (OpenAI or ElevenLabs) for +streaming speech on calls. You can override it under the plugin config with the +same shape — overrides deep-merge with `messages.tts`. + +```json5 +{ + tts: { + provider: "openai", + openai: { + voice: "alloy" + } + } +} +``` + +Notes: +- Edge TTS is ignored for voice calls (telephony audio needs PCM; Edge output is unreliable). +- Core TTS is used when Twilio media streaming is enabled; otherwise calls fall back to provider native voices. + ## CLI ```bash diff --git a/extensions/voice-call/clawdbot.plugin.json b/extensions/voice-call/clawdbot.plugin.json index fca4a1ea0..2a4f04466 100644 --- a/extensions/voice-call/clawdbot.plugin.json +++ b/extensions/voice-call/clawdbot.plugin.json @@ -99,16 +99,39 @@ "label": "Media Stream Path", "advanced": true }, - "tts.model": { - "label": "TTS Model", + "tts.provider": { + "label": "TTS Provider Override", + "help": "Deep-merges with messages.tts (Edge is ignored for calls).", "advanced": true }, - "tts.voice": { - "label": "TTS Voice", + "tts.openai.model": { + "label": "OpenAI TTS Model", "advanced": true }, - "tts.instructions": { - "label": "TTS Instructions", + "tts.openai.voice": { + "label": "OpenAI TTS Voice", + "advanced": true + }, + "tts.openai.apiKey": { + "label": "OpenAI API Key", + "sensitive": true, + "advanced": true + }, + "tts.elevenlabs.modelId": { + "label": "ElevenLabs Model ID", + "advanced": true + }, + "tts.elevenlabs.voiceId": { + "label": "ElevenLabs Voice ID", + "advanced": true + }, + "tts.elevenlabs.apiKey": { + "label": "ElevenLabs API Key", + "sensitive": true, + "advanced": true + }, + "tts.elevenlabs.baseUrl": { + "label": "ElevenLabs Base URL", "advanced": true }, "publicUrl": { @@ -370,20 +393,193 @@ "type": "object", "additionalProperties": false, "properties": { + "auto": { + "type": "string", + "enum": [ + "off", + "always", + "inbound", + "tagged" + ] + }, + "enabled": { + "type": "boolean" + }, + "mode": { + "type": "string", + "enum": [ + "final", + "all" + ] + }, "provider": { "type": "string", "enum": [ - "openai" + "openai", + "elevenlabs", + "edge" ] }, - "model": { + "summaryModel": { "type": "string" }, - "voice": { + "modelOverrides": { + "type": "object", + "additionalProperties": false, + "properties": { + "enabled": { + "type": "boolean" + }, + "allowText": { + "type": "boolean" + }, + "allowProvider": { + "type": "boolean" + }, + "allowVoice": { + "type": "boolean" + }, + "allowModelId": { + "type": "boolean" + }, + "allowVoiceSettings": { + "type": "boolean" + }, + "allowNormalization": { + "type": "boolean" + }, + "allowSeed": { + "type": "boolean" + } + } + }, + "elevenlabs": { + "type": "object", + "additionalProperties": false, + "properties": { + "apiKey": { + "type": "string" + }, + "baseUrl": { + "type": "string" + }, + "voiceId": { + "type": "string" + }, + "modelId": { + "type": "string" + }, + "seed": { + "type": "integer", + "minimum": 0, + "maximum": 4294967295 + }, + "applyTextNormalization": { + "type": "string", + "enum": [ + "auto", + "on", + "off" + ] + }, + "languageCode": { + "type": "string" + }, + "voiceSettings": { + "type": "object", + "additionalProperties": false, + "properties": { + "stability": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "similarityBoost": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "style": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "useSpeakerBoost": { + "type": "boolean" + }, + "speed": { + "type": "number", + "minimum": 0.5, + "maximum": 2 + } + } + } + } + }, + "openai": { + "type": "object", + "additionalProperties": false, + "properties": { + "apiKey": { + "type": "string" + }, + "model": { + "type": "string" + }, + "voice": { + "type": "string" + } + } + }, + "edge": { + "type": "object", + "additionalProperties": false, + "properties": { + "enabled": { + "type": "boolean" + }, + "voice": { + "type": "string" + }, + "lang": { + "type": "string" + }, + "outputFormat": { + "type": "string" + }, + "pitch": { + "type": "string" + }, + "rate": { + "type": "string" + }, + "volume": { + "type": "string" + }, + "saveSubtitles": { + "type": "boolean" + }, + "proxy": { + "type": "string" + }, + "timeoutMs": { + "type": "integer", + "minimum": 1000, + "maximum": 120000 + } + } + }, + "prefsPath": { "type": "string" }, - "instructions": { - "type": "string" + "maxTextLength": { + "type": "integer", + "minimum": 1 + }, + "timeoutMs": { + "type": "integer", + "minimum": 1000, + "maximum": 120000 } } }, diff --git a/extensions/voice-call/index.ts b/extensions/voice-call/index.ts index f0fc8e3ad..760726faa 100644 --- a/extensions/voice-call/index.ts +++ b/extensions/voice-call/index.ts @@ -74,9 +74,26 @@ const voiceCallConfigSchema = { }, "streaming.sttModel": { label: "Realtime STT Model", advanced: true }, "streaming.streamPath": { label: "Media Stream Path", advanced: true }, - "tts.model": { label: "TTS Model", advanced: true }, - "tts.voice": { label: "TTS Voice", advanced: true }, - "tts.instructions": { label: "TTS Instructions", advanced: true }, + "tts.provider": { + label: "TTS Provider Override", + help: "Deep-merges with messages.tts (Edge is ignored for calls).", + advanced: true, + }, + "tts.openai.model": { label: "OpenAI TTS Model", advanced: true }, + "tts.openai.voice": { label: "OpenAI TTS Voice", advanced: true }, + "tts.openai.apiKey": { + label: "OpenAI API Key", + sensitive: true, + advanced: true, + }, + "tts.elevenlabs.modelId": { label: "ElevenLabs Model ID", advanced: true }, + "tts.elevenlabs.voiceId": { label: "ElevenLabs Voice ID", advanced: true }, + "tts.elevenlabs.apiKey": { + label: "ElevenLabs API Key", + sensitive: true, + advanced: true, + }, + "tts.elevenlabs.baseUrl": { label: "ElevenLabs Base URL", advanced: true }, publicUrl: { label: "Public Webhook URL", advanced: true }, skipSignatureVerification: { label: "Skip Signature Verification", @@ -161,6 +178,7 @@ const voiceCallPlugin = { runtimePromise = createVoiceCallRuntime({ config: cfg, coreConfig: api.config as CoreConfig, + ttsRuntime: api.runtime.tts, logger: api.logger, }); } diff --git a/extensions/voice-call/src/config.ts b/extensions/voice-call/src/config.ts index 832e692ca..48f4691fe 100644 --- a/extensions/voice-call/src/config.ts +++ b/extensions/voice-call/src/config.ts @@ -82,31 +82,82 @@ export const SttConfigSchema = z .default({ provider: "openai", model: "whisper-1" }); export type SttConfig = z.infer; +export const TtsProviderSchema = z.enum(["openai", "elevenlabs", "edge"]); +export const TtsModeSchema = z.enum(["final", "all"]); +export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]); + export const TtsConfigSchema = z .object({ - /** TTS provider (currently only OpenAI supported) */ - provider: z.literal("openai").default("openai"), - /** - * TTS model to use: - * - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended) - * - tts-1: lower latency - * - tts-1-hd: higher quality - */ - model: z.string().min(1).default("gpt-4o-mini-tts"), - /** - * Voice ID. For best quality, use marin or cedar. - * All voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar - */ - voice: z.string().min(1).default("coral"), - /** - * Instructions for speech style (only works with gpt-4o-mini-tts). - * Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent" - */ - instructions: z.string().optional(), + auto: TtsAutoSchema.optional(), + enabled: z.boolean().optional(), + mode: TtsModeSchema.optional(), + provider: TtsProviderSchema.optional(), + summaryModel: z.string().optional(), + modelOverrides: z + .object({ + enabled: z.boolean().optional(), + allowText: z.boolean().optional(), + allowProvider: z.boolean().optional(), + allowVoice: z.boolean().optional(), + allowModelId: z.boolean().optional(), + allowVoiceSettings: z.boolean().optional(), + allowNormalization: z.boolean().optional(), + allowSeed: z.boolean().optional(), + }) + .strict() + .optional(), + elevenlabs: z + .object({ + apiKey: z.string().optional(), + baseUrl: z.string().optional(), + voiceId: z.string().optional(), + modelId: z.string().optional(), + seed: z.number().int().min(0).max(4294967295).optional(), + applyTextNormalization: z.enum(["auto", "on", "off"]).optional(), + languageCode: z.string().optional(), + voiceSettings: z + .object({ + stability: z.number().min(0).max(1).optional(), + similarityBoost: z.number().min(0).max(1).optional(), + style: z.number().min(0).max(1).optional(), + useSpeakerBoost: z.boolean().optional(), + speed: z.number().min(0.5).max(2).optional(), + }) + .strict() + .optional(), + }) + .strict() + .optional(), + openai: z + .object({ + apiKey: z.string().optional(), + model: z.string().optional(), + voice: z.string().optional(), + }) + .strict() + .optional(), + edge: z + .object({ + enabled: z.boolean().optional(), + voice: z.string().optional(), + lang: z.string().optional(), + outputFormat: z.string().optional(), + pitch: z.string().optional(), + rate: z.string().optional(), + volume: z.string().optional(), + saveSubtitles: z.boolean().optional(), + proxy: z.string().optional(), + timeoutMs: z.number().int().min(1000).max(120000).optional(), + }) + .strict() + .optional(), + prefsPath: z.string().optional(), + maxTextLength: z.number().int().min(1).optional(), + timeoutMs: z.number().int().min(1000).max(120000).optional(), }) .strict() - .default({ provider: "openai", model: "gpt-4o-mini-tts", voice: "coral" }); -export type TtsConfig = z.infer; + .optional(); +export type VoiceCallTtsConfig = z.infer; // ----------------------------------------------------------------------------- // Webhook Server Configuration @@ -307,7 +358,7 @@ export const VoiceCallConfigSchema = z /** STT configuration */ stt: SttConfigSchema, - /** TTS configuration */ + /** TTS override (deep-merges with core messages.tts) */ tts: TtsConfigSchema, /** Store path for call logs */ diff --git a/extensions/voice-call/src/core-bridge.ts b/extensions/voice-call/src/core-bridge.ts index 23f3f7250..a1d01e10f 100644 --- a/extensions/voice-call/src/core-bridge.ts +++ b/extensions/voice-call/src/core-bridge.ts @@ -2,10 +2,16 @@ import fs from "node:fs"; import path from "node:path"; import { fileURLToPath, pathToFileURL } from "node:url"; +import type { VoiceCallTtsConfig } from "./config.js"; + export type CoreConfig = { session?: { store?: string; }; + messages?: { + tts?: VoiceCallTtsConfig; + }; + [key: string]: unknown; }; type CoreAgentDeps = { diff --git a/extensions/voice-call/src/manager.ts b/extensions/voice-call/src/manager.ts index 49d690053..2e2e4661b 100644 --- a/extensions/voice-call/src/manager.ts +++ b/extensions/voice-call/src/manager.ts @@ -143,7 +143,7 @@ export class CallManager { // For notify mode with a message, use inline TwiML with let inlineTwiml: string | undefined; if (mode === "notify" && initialMessage) { - const pollyVoice = mapVoiceToPolly(this.config.tts.voice); + const pollyVoice = mapVoiceToPolly(this.config.tts?.openai?.voice); inlineTwiml = this.generateNotifyTwiml(initialMessage, pollyVoice); console.log( `[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`, @@ -210,11 +210,13 @@ export class CallManager { this.addTranscriptEntry(call, "bot", text); // Play TTS + const voice = + this.provider?.name === "twilio" ? this.config.tts?.openai?.voice : undefined; await this.provider.playTts({ callId, providerCallId: call.providerCallId, text, - voice: this.config.tts.voice, + voice, }); return { success: true }; diff --git a/extensions/voice-call/src/manager/outbound.ts b/extensions/voice-call/src/manager/outbound.ts index 6cb037252..76bdc5a1a 100644 --- a/extensions/voice-call/src/manager/outbound.ts +++ b/extensions/voice-call/src/manager/outbound.ts @@ -68,7 +68,7 @@ export async function initiateCall( // For notify mode with a message, use inline TwiML with . let inlineTwiml: string | undefined; if (mode === "notify" && initialMessage) { - const pollyVoice = mapVoiceToPolly(ctx.config.tts.voice); + const pollyVoice = mapVoiceToPolly(ctx.config.tts?.openai?.voice); inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice); console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`); } @@ -120,11 +120,13 @@ export async function speak( addTranscriptEntry(call, "bot", text); + const voice = + ctx.provider?.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined; await ctx.provider.playTts({ callId, providerCallId: call.providerCallId, text, - voice: ctx.config.tts.voice, + voice, }); return { success: true }; @@ -244,4 +246,3 @@ export async function endCall( return { success: false, error: err instanceof Error ? err.message : String(err) }; } } - diff --git a/extensions/voice-call/src/providers/twilio.ts b/extensions/voice-call/src/providers/twilio.ts index 17102b732..8e400f82f 100644 --- a/extensions/voice-call/src/providers/twilio.ts +++ b/extensions/voice-call/src/providers/twilio.ts @@ -15,9 +15,9 @@ import type { WebhookVerificationResult, } from "../types.js"; import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js"; +import { chunkAudio } from "../telephony-audio.js"; +import type { TelephonyTtsProvider } from "../telephony-tts.js"; import type { VoiceCallProvider } from "./base.js"; -import type { OpenAITTSProvider } from "./tts-openai.js"; -import { chunkAudio } from "./tts-openai.js"; import { twilioApiRequest } from "./twilio/api.js"; import { verifyTwilioProviderWebhook } from "./twilio/webhook.js"; @@ -53,8 +53,8 @@ export class TwilioProvider implements VoiceCallProvider { /** Current public webhook URL (set when tunnel starts or from config) */ private currentPublicUrl: string | null = null; - /** Optional OpenAI TTS provider for streaming TTS */ - private ttsProvider: OpenAITTSProvider | null = null; + /** Optional telephony TTS provider for streaming TTS */ + private ttsProvider: TelephonyTtsProvider | null = null; /** Optional media stream handler for sending audio */ private mediaStreamHandler: MediaStreamHandler | null = null; @@ -119,7 +119,7 @@ export class TwilioProvider implements VoiceCallProvider { return this.currentPublicUrl; } - setTTSProvider(provider: OpenAITTSProvider): void { + setTTSProvider(provider: TelephonyTtsProvider): void { this.ttsProvider = provider; } @@ -454,13 +454,13 @@ export class TwilioProvider implements VoiceCallProvider { * Play TTS audio via Twilio. * * Two modes: - * 1. OpenAI TTS + Media Streams: If TTS provider and media stream are available, - * generates audio via OpenAI and streams it through WebSocket (preferred). + * 1. Core TTS + Media Streams: If TTS provider and media stream are available, + * generates audio via core TTS and streams it through WebSocket (preferred). * 2. TwiML : Falls back to Twilio's native TTS with Polly voices. * Note: This may not work on all Twilio accounts. */ async playTts(input: PlayTtsInput): Promise { - // Try OpenAI TTS via media stream first (if configured) + // Try telephony TTS via media stream first (if configured) const streamSid = this.callStreamMap.get(input.providerCallId); if (this.ttsProvider && this.mediaStreamHandler && streamSid) { try { @@ -468,7 +468,7 @@ export class TwilioProvider implements VoiceCallProvider { return; } catch (err) { console.warn( - `[voice-call] OpenAI TTS failed, falling back to Twilio :`, + `[voice-call] Telephony TTS failed, falling back to Twilio :`, err instanceof Error ? err.message : err, ); // Fall through to TwiML fallback @@ -484,7 +484,7 @@ export class TwilioProvider implements VoiceCallProvider { } console.warn( - "[voice-call] Using TwiML fallback - OpenAI TTS not configured or media stream not active", + "[voice-call] Using TwiML fallback - telephony TTS not configured or media stream not active", ); const pollyVoice = mapVoiceToPolly(input.voice); @@ -502,8 +502,8 @@ export class TwilioProvider implements VoiceCallProvider { } /** - * Play TTS via OpenAI and Twilio Media Streams. - * Generates audio with OpenAI TTS, converts to mu-law, and streams via WebSocket. + * Play TTS via core TTS and Twilio Media Streams. + * Generates audio with core TTS, converts to mu-law, and streams via WebSocket. * Uses a jitter buffer to smooth out timing variations. */ private async playTtsViaStream( @@ -514,8 +514,8 @@ export class TwilioProvider implements VoiceCallProvider { throw new Error("TTS provider and media stream handler required"); } - // Generate audio with OpenAI TTS (returns mu-law at 8kHz) - const muLawAudio = await this.ttsProvider.synthesizeForTwilio(text); + // Generate audio with core TTS (returns mu-law at 8kHz) + const muLawAudio = await this.ttsProvider.synthesizeForTelephony(text); // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law) const CHUNK_SIZE = 160; diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts index 08e7e5de2..0770333cd 100644 --- a/extensions/voice-call/src/runtime.ts +++ b/extensions/voice-call/src/runtime.ts @@ -6,8 +6,9 @@ import type { VoiceCallProvider } from "./providers/base.js"; import { MockProvider } from "./providers/mock.js"; import { PlivoProvider } from "./providers/plivo.js"; import { TelnyxProvider } from "./providers/telnyx.js"; -import { OpenAITTSProvider } from "./providers/tts-openai.js"; import { TwilioProvider } from "./providers/twilio.js"; +import type { TelephonyTtsRuntime } from "./telephony-tts.js"; +import { createTelephonyTtsProvider } from "./telephony-tts.js"; import { startTunnel, type TunnelResult } from "./tunnel.js"; import { cleanupTailscaleExposure, @@ -81,9 +82,10 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider { export async function createVoiceCallRuntime(params: { config: VoiceCallConfig; coreConfig: CoreConfig; + ttsRuntime?: TelephonyTtsRuntime; logger?: Logger; }): Promise { - const { config, coreConfig, logger } = params; + const { config, coreConfig, ttsRuntime, logger } = params; const log = logger ?? { info: console.log, warn: console.warn, @@ -149,27 +151,24 @@ export async function createVoiceCallRuntime(params: { if (provider.name === "twilio" && config.streaming?.enabled) { const twilioProvider = provider as TwilioProvider; - const openaiApiKey = - config.streaming.openaiApiKey || process.env.OPENAI_API_KEY; - if (openaiApiKey) { + if (ttsRuntime?.textToSpeechTelephony) { try { - const ttsProvider = new OpenAITTSProvider({ - apiKey: openaiApiKey, - voice: config.tts.voice, - model: config.tts.model, - instructions: config.tts.instructions, + const ttsProvider = createTelephonyTtsProvider({ + coreConfig, + ttsOverride: config.tts, + runtime: ttsRuntime, }); twilioProvider.setTTSProvider(ttsProvider); - log.info("[voice-call] OpenAI TTS provider configured"); + log.info("[voice-call] Telephony TTS provider configured"); } catch (err) { log.warn( - `[voice-call] Failed to initialize OpenAI TTS: ${ + `[voice-call] Failed to initialize telephony TTS: ${ err instanceof Error ? err.message : String(err) }`, ); } } else { - log.warn("[voice-call] OpenAI TTS key missing; streaming TTS disabled"); + log.warn("[voice-call] Telephony TTS unavailable; streaming TTS disabled"); } const mediaHandler = webhookServer.getMediaStreamHandler(); diff --git a/extensions/voice-call/src/telephony-audio.ts b/extensions/voice-call/src/telephony-audio.ts new file mode 100644 index 000000000..6a9a1d222 --- /dev/null +++ b/extensions/voice-call/src/telephony-audio.ts @@ -0,0 +1,88 @@ +const TELEPHONY_SAMPLE_RATE = 8000; + +function clamp16(value: number): number { + return Math.max(-32768, Math.min(32767, value)); +} + +/** + * Resample 16-bit PCM (little-endian mono) to 8kHz using linear interpolation. + */ +export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer { + if (inputSampleRate === TELEPHONY_SAMPLE_RATE) return input; + const inputSamples = Math.floor(input.length / 2); + if (inputSamples === 0) return Buffer.alloc(0); + + const ratio = inputSampleRate / TELEPHONY_SAMPLE_RATE; + const outputSamples = Math.floor(inputSamples / ratio); + const output = Buffer.alloc(outputSamples * 2); + + for (let i = 0; i < outputSamples; i++) { + const srcPos = i * ratio; + const srcIndex = Math.floor(srcPos); + const frac = srcPos - srcIndex; + + const s0 = input.readInt16LE(srcIndex * 2); + const s1Index = Math.min(srcIndex + 1, inputSamples - 1); + const s1 = input.readInt16LE(s1Index * 2); + + const sample = Math.round(s0 + frac * (s1 - s0)); + output.writeInt16LE(clamp16(sample), i * 2); + } + + return output; +} + +/** + * Convert 16-bit PCM to 8-bit mu-law (G.711). + */ +export function pcmToMulaw(pcm: Buffer): Buffer { + const samples = Math.floor(pcm.length / 2); + const mulaw = Buffer.alloc(samples); + + for (let i = 0; i < samples; i++) { + const sample = pcm.readInt16LE(i * 2); + mulaw[i] = linearToMulaw(sample); + } + + return mulaw; +} + +export function convertPcmToMulaw8k( + pcm: Buffer, + inputSampleRate: number, +): Buffer { + const pcm8k = resamplePcmTo8k(pcm, inputSampleRate); + return pcmToMulaw(pcm8k); +} + +/** + * Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law). + */ +export function chunkAudio( + audio: Buffer, + chunkSize = 160, +): Generator { + return (function* () { + for (let i = 0; i < audio.length; i += chunkSize) { + yield audio.subarray(i, Math.min(i + chunkSize, audio.length)); + } + })(); +} + +function linearToMulaw(sample: number): number { + const BIAS = 132; + const CLIP = 32635; + + const sign = sample < 0 ? 0x80 : 0; + if (sample < 0) sample = -sample; + if (sample > CLIP) sample = CLIP; + + sample += BIAS; + let exponent = 7; + for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--) { + expMask >>= 1; + } + + const mantissa = (sample >> (exponent + 3)) & 0x0f; + return ~(sign | (exponent << 4) | mantissa) & 0xff; +} diff --git a/extensions/voice-call/src/telephony-tts.ts b/extensions/voice-call/src/telephony-tts.ts new file mode 100644 index 000000000..147501e85 --- /dev/null +++ b/extensions/voice-call/src/telephony-tts.ts @@ -0,0 +1,95 @@ +import type { CoreConfig } from "./core-bridge.js"; +import type { VoiceCallTtsConfig } from "./config.js"; +import { convertPcmToMulaw8k } from "./telephony-audio.js"; + +export type TelephonyTtsRuntime = { + textToSpeechTelephony: (params: { + text: string; + cfg: CoreConfig; + prefsPath?: string; + }) => Promise<{ + success: boolean; + audioBuffer?: Buffer; + sampleRate?: number; + provider?: string; + error?: string; + }>; +}; + +export type TelephonyTtsProvider = { + synthesizeForTelephony: (text: string) => Promise; +}; + +export function createTelephonyTtsProvider(params: { + coreConfig: CoreConfig; + ttsOverride?: VoiceCallTtsConfig; + runtime: TelephonyTtsRuntime; +}): TelephonyTtsProvider { + const { coreConfig, ttsOverride, runtime } = params; + const mergedConfig = applyTtsOverride(coreConfig, ttsOverride); + + return { + synthesizeForTelephony: async (text: string) => { + const result = await runtime.textToSpeechTelephony({ + text, + cfg: mergedConfig, + }); + + if (!result.success || !result.audioBuffer || !result.sampleRate) { + throw new Error(result.error ?? "TTS conversion failed"); + } + + return convertPcmToMulaw8k(result.audioBuffer, result.sampleRate); + }, + }; +} + +function applyTtsOverride( + coreConfig: CoreConfig, + override?: VoiceCallTtsConfig, +): CoreConfig { + if (!override) return coreConfig; + + const base = coreConfig.messages?.tts; + const merged = mergeTtsConfig(base, override); + if (!merged) return coreConfig; + + return { + ...coreConfig, + messages: { + ...(coreConfig.messages ?? {}), + tts: merged, + }, + }; +} + +function mergeTtsConfig( + base?: VoiceCallTtsConfig, + override?: VoiceCallTtsConfig, +): VoiceCallTtsConfig | undefined { + if (!base && !override) return undefined; + if (!override) return base; + if (!base) return override; + return deepMerge(base, override); +} + +function deepMerge(base: T, override: T): T { + if (!isPlainObject(base) || !isPlainObject(override)) { + return override; + } + const result: Record = { ...base }; + for (const [key, value] of Object.entries(override)) { + if (value === undefined) continue; + const existing = (base as Record)[key]; + if (isPlainObject(existing) && isPlainObject(value)) { + result[key] = deepMerge(existing, value); + } else { + result[key] = value; + } + } + return result as T; +} + +function isPlainObject(value: unknown): value is Record { + return Boolean(value) && typeof value === "object" && !Array.isArray(value); +} diff --git a/src/plugins/runtime/index.ts b/src/plugins/runtime/index.ts index a807bcacf..2edf22513 100644 --- a/src/plugins/runtime/index.ts +++ b/src/plugins/runtime/index.ts @@ -124,6 +124,7 @@ import { startWebLoginWithQr, waitForWebLogin } from "../../web/login-qr.js"; import { sendMessageWhatsApp, sendPollWhatsApp } from "../../web/outbound.js"; import { registerMemoryCli } from "../../cli/memory-cli.js"; import { formatNativeDependencyHint } from "./native-deps.js"; +import { textToSpeechTelephony } from "../../tts/tts.js"; import type { PluginRuntime } from "./types.js"; @@ -162,6 +163,9 @@ export function createPluginRuntime(): PluginRuntime { getImageMetadata, resizeToJpeg, }, + tts: { + textToSpeechTelephony, + }, tools: { createMemoryGetTool, createMemorySearchTool, diff --git a/src/plugins/runtime/types.ts b/src/plugins/runtime/types.ts index 027558681..b9589f4b7 100644 --- a/src/plugins/runtime/types.ts +++ b/src/plugins/runtime/types.ts @@ -16,6 +16,7 @@ type UpsertChannelPairingRequest = typeof import("../../pairing/pairing-store.js").upsertChannelPairingRequest; type FetchRemoteMedia = typeof import("../../media/fetch.js").fetchRemoteMedia; type SaveMediaBuffer = typeof import("../../media/store.js").saveMediaBuffer; +type TextToSpeechTelephony = typeof import("../../tts/tts.js").textToSpeechTelephony; type BuildMentionRegexes = typeof import("../../auto-reply/reply/mentions.js").buildMentionRegexes; type MatchesMentionPatterns = typeof import("../../auto-reply/reply/mentions.js").matchesMentionPatterns; @@ -173,6 +174,9 @@ export type PluginRuntime = { getImageMetadata: GetImageMetadata; resizeToJpeg: ResizeToJpeg; }; + tts: { + textToSpeechTelephony: TextToSpeechTelephony; + }; tools: { createMemoryGetTool: CreateMemoryGetTool; createMemorySearchTool: CreateMemorySearchTool; diff --git a/src/plugins/voice-call.plugin.test.ts b/src/plugins/voice-call.plugin.test.ts index 55dae874f..c29adce14 100644 --- a/src/plugins/voice-call.plugin.test.ts +++ b/src/plugins/voice-call.plugin.test.ts @@ -43,6 +43,7 @@ function setup(config: Record): Registered { source: "test", config: {}, pluginConfig: config, + runtime: { tts: { textToSpeechTelephony: vi.fn() } }, logger: noopLogger, registerGatewayMethod: (method, handler) => methods.set(method, handler), registerTool: (tool) => tools.push(tool), @@ -142,6 +143,7 @@ describe("voice-call plugin", () => { source: "test", config: {}, pluginConfig: { provider: "mock" }, + runtime: { tts: { textToSpeechTelephony: vi.fn() } }, logger: noopLogger, registerGatewayMethod: () => {}, registerTool: () => {}, diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 5f911ec14..847876d04 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -76,6 +76,11 @@ const DEFAULT_OUTPUT = { voiceCompatible: false, }; +const TELEPHONY_OUTPUT = { + openai: { format: "pcm" as const, sampleRate: 24000 }, + elevenlabs: { format: "pcm_22050", sampleRate: 22050 }, +}; + const TTS_AUTO_MODES = new Set(["off", "always", "inbound", "tagged"]); export type ResolvedTtsConfig = { @@ -180,6 +185,16 @@ export type TtsResult = { voiceCompatible?: boolean; }; +export type TtsTelephonyResult = { + success: boolean; + audioBuffer?: Buffer; + error?: string; + latencyMs?: number; + provider?: string; + outputFormat?: string; + sampleRate?: number; +}; + type TtsStatusEntry = { timestamp: number; success: boolean; @@ -980,7 +995,7 @@ async function openaiTTS(params: { apiKey: string; model: string; voice: string; - responseFormat: "mp3" | "opus"; + responseFormat: "mp3" | "opus" | "pcm"; timeoutMs: number; }): Promise { const { text, apiKey, model, voice, responseFormat, timeoutMs } = params; @@ -1224,6 +1239,100 @@ export async function textToSpeech(params: { }; } +export async function textToSpeechTelephony(params: { + text: string; + cfg: ClawdbotConfig; + prefsPath?: string; +}): Promise { + const config = resolveTtsConfig(params.cfg); + const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config); + + if (params.text.length > config.maxTextLength) { + return { + success: false, + error: `Text too long (${params.text.length} chars, max ${config.maxTextLength})`, + }; + } + + const userProvider = getTtsProvider(config, prefsPath); + const providers = resolveTtsProviderOrder(userProvider); + + let lastError: string | undefined; + + for (const provider of providers) { + const providerStart = Date.now(); + try { + if (provider === "edge") { + lastError = "edge: unsupported for telephony"; + continue; + } + + const apiKey = resolveTtsApiKey(config, provider); + if (!apiKey) { + lastError = `No API key for ${provider}`; + continue; + } + + if (provider === "elevenlabs") { + const output = TELEPHONY_OUTPUT.elevenlabs; + const audioBuffer = await elevenLabsTTS({ + text: params.text, + apiKey, + baseUrl: config.elevenlabs.baseUrl, + voiceId: config.elevenlabs.voiceId, + modelId: config.elevenlabs.modelId, + outputFormat: output.format, + seed: config.elevenlabs.seed, + applyTextNormalization: config.elevenlabs.applyTextNormalization, + languageCode: config.elevenlabs.languageCode, + voiceSettings: config.elevenlabs.voiceSettings, + timeoutMs: config.timeoutMs, + }); + + return { + success: true, + audioBuffer, + latencyMs: Date.now() - providerStart, + provider, + outputFormat: output.format, + sampleRate: output.sampleRate, + }; + } + + const output = TELEPHONY_OUTPUT.openai; + const audioBuffer = await openaiTTS({ + text: params.text, + apiKey, + model: config.openai.model, + voice: config.openai.voice, + responseFormat: output.format, + timeoutMs: config.timeoutMs, + }); + + return { + success: true, + audioBuffer, + latencyMs: Date.now() - providerStart, + provider, + outputFormat: output.format, + sampleRate: output.sampleRate, + }; + } catch (err) { + const error = err as Error; + if (error.name === "AbortError") { + lastError = `${provider}: request timed out`; + } else { + lastError = `${provider}: ${error.message}`; + } + } + } + + return { + success: false, + error: `TTS conversion failed: ${lastError || "no providers available"}`, + }; +} + export async function maybeApplyTtsToPayload(params: { payload: ReplyPayload; cfg: ClawdbotConfig;