refactor: align voice-call TTS with core config

2026-01-25 09:29:50 +00:00
parent 9366cbc7db
commit 83f92e34af
18 changed files with 769 additions and 69 deletions
--- a/extensions/voice-call/CHANGELOG.md
+++ b/extensions/voice-call/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog

+## 2026.1.24
+
+### Changes
+- Breaking: voice-call TTS now uses core `messages.tts` (plugin TTS config deep-merges with core).
+- Telephony TTS supports OpenAI + ElevenLabs; Edge TTS is ignored for calls.
+- Removed legacy `tts.model`/`tts.voice`/`tts.instructions` plugin fields.
+
 ## 2026.1.23

 ### Changes
--- a/extensions/voice-call/README.md
+++ b/extensions/voice-call/README.md
@@ -75,6 +75,27 @@ Notes:
 - Twilio/Telnyx/Plivo require a **publicly reachable** webhook URL.
 - `mock` is a local dev provider (no network calls).

+## TTS for calls
+
+Voice Call uses the core `messages.tts` configuration (OpenAI or ElevenLabs) for
+streaming speech on calls. You can override it under the plugin config with the
+same shape — overrides deep-merge with `messages.tts`.
+
+```json5
+{
+  tts: {
+    provider: "openai",
+    openai: {
+      voice: "alloy"
+    }
+  }
+}
+```
+
+Notes:
+- Edge TTS is ignored for voice calls (telephony audio needs PCM; Edge output is unreliable).
+- Core TTS is used when Twilio media streaming is enabled; otherwise calls fall back to the provider's native voices.
+
 ## CLI

 ```bash
--- a/extensions/voice-call/clawdbot.plugin.json
+++ b/extensions/voice-call/clawdbot.plugin.json
@@ -99,16 +99,39 @@
      "label": "Media Stream Path",
      "advanced": true
    },
-    "tts.model": {
-      "label": "TTS Model",
+    "tts.provider": {
+      "label": "TTS Provider Override",
+      "help": "Deep-merges with messages.tts (Edge is ignored for calls).",
      "advanced": true
    },
-    "tts.voice": {
-      "label": "TTS Voice",
+    "tts.openai.model": {
+      "label": "OpenAI TTS Model",
      "advanced": true
    },
-    "tts.instructions": {
-      "label": "TTS Instructions",
+    "tts.openai.voice": {
+      "label": "OpenAI TTS Voice",
+      "advanced": true
+    },
+    "tts.openai.apiKey": {
+      "label": "OpenAI API Key",
+      "sensitive": true,
+      "advanced": true
+    },
+    "tts.elevenlabs.modelId": {
+      "label": "ElevenLabs Model ID",
+      "advanced": true
+    },
+    "tts.elevenlabs.voiceId": {
+      "label": "ElevenLabs Voice ID",
+      "advanced": true
+    },
+    "tts.elevenlabs.apiKey": {
+      "label": "ElevenLabs API Key",
+      "sensitive": true,
+      "advanced": true
+    },
+    "tts.elevenlabs.baseUrl": {
+      "label": "ElevenLabs Base URL",
      "advanced": true
    },
    "publicUrl": {
@@ -370,20 +393,193 @@
        "type": "object",
        "additionalProperties": false,
        "properties": {
+          "auto": {
+            "type": "string",
+            "enum": [
+              "off",
+              "always",
+              "inbound",
+              "tagged"
+            ]
+          },
+          "enabled": {
+            "type": "boolean"
+          },
+          "mode": {
+            "type": "string",
+            "enum": [
+              "final",
+              "all"
+            ]
+          },
          "provider": {
            "type": "string",
            "enum": [
-              "openai"
+              "openai",
+              "elevenlabs",
+              "edge"
            ]
          },
-          "model": {
+          "summaryModel": {
            "type": "string"
          },
-          "voice": {
+          "modelOverrides": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+              "enabled": {
+                "type": "boolean"
+              },
+              "allowText": {
+                "type": "boolean"
+              },
+              "allowProvider": {
+                "type": "boolean"
+              },
+              "allowVoice": {
+                "type": "boolean"
+              },
+              "allowModelId": {
+                "type": "boolean"
+              },
+              "allowVoiceSettings": {
+                "type": "boolean"
+              },
+              "allowNormalization": {
+                "type": "boolean"
+              },
+              "allowSeed": {
+                "type": "boolean"
+              }
+            }
+          },
+          "elevenlabs": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+              "apiKey": {
+                "type": "string"
+              },
+              "baseUrl": {
+                "type": "string"
+              },
+              "voiceId": {
+                "type": "string"
+              },
+              "modelId": {
+                "type": "string"
+              },
+              "seed": {
+                "type": "integer",
+                "minimum": 0,
+                "maximum": 4294967295
+              },
+              "applyTextNormalization": {
+                "type": "string",
+                "enum": [
+                  "auto",
+                  "on",
+                  "off"
+                ]
+              },
+              "languageCode": {
+                "type": "string"
+              },
+              "voiceSettings": {
+                "type": "object",
+                "additionalProperties": false,
+                "properties": {
+                  "stability": {
+                    "type": "number",
+                    "minimum": 0,
+                    "maximum": 1
+                  },
+                  "similarityBoost": {
+                    "type": "number",
+                    "minimum": 0,
+                    "maximum": 1
+                  },
+                  "style": {
+                    "type": "number",
+                    "minimum": 0,
+                    "maximum": 1
+                  },
+                  "useSpeakerBoost": {
+                    "type": "boolean"
+                  },
+                  "speed": {
+                    "type": "number",
+                    "minimum": 0.5,
+                    "maximum": 2
+                  }
+                }
+              }
+            }
+          },
+          "openai": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+              "apiKey": {
+                "type": "string"
+              },
+              "model": {
+                "type": "string"
+              },
+              "voice": {
+                "type": "string"
+              }
+            }
+          },
+          "edge": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+              "enabled": {
+                "type": "boolean"
+              },
+              "voice": {
+                "type": "string"
+              },
+              "lang": {
+                "type": "string"
+              },
+              "outputFormat": {
+                "type": "string"
+              },
+              "pitch": {
+                "type": "string"
+              },
+              "rate": {
+                "type": "string"
+              },
+              "volume": {
+                "type": "string"
+              },
+              "saveSubtitles": {
+                "type": "boolean"
+              },
+              "proxy": {
+                "type": "string"
+              },
+              "timeoutMs": {
+                "type": "integer",
+                "minimum": 1000,
+                "maximum": 120000
+              }
+            }
+          },
+          "prefsPath": {
            "type": "string"
          },
-          "instructions": {
-            "type": "string"
+          "maxTextLength": {
+            "type": "integer",
+            "minimum": 1
+          },
+          "timeoutMs": {
+            "type": "integer",
+            "minimum": 1000,
+            "maximum": 120000
          }
        }
      },
--- a/extensions/voice-call/index.ts
+++ b/extensions/voice-call/index.ts
@@ -74,9 +74,26 @@ const voiceCallConfigSchema = {
    },
    "streaming.sttModel": { label: "Realtime STT Model", advanced: true },
    "streaming.streamPath": { label: "Media Stream Path", advanced: true },
-    "tts.model": { label: "TTS Model", advanced: true },
-    "tts.voice": { label: "TTS Voice", advanced: true },
-    "tts.instructions": { label: "TTS Instructions", advanced: true },
+    "tts.provider": {
+      label: "TTS Provider Override",
+      help: "Deep-merges with messages.tts (Edge is ignored for calls).",
+      advanced: true,
+    },
+    "tts.openai.model": { label: "OpenAI TTS Model", advanced: true },
+    "tts.openai.voice": { label: "OpenAI TTS Voice", advanced: true },
+    "tts.openai.apiKey": {
+      label: "OpenAI API Key",
+      sensitive: true,
+      advanced: true,
+    },
+    "tts.elevenlabs.modelId": { label: "ElevenLabs Model ID", advanced: true },
+    "tts.elevenlabs.voiceId": { label: "ElevenLabs Voice ID", advanced: true },
+    "tts.elevenlabs.apiKey": {
+      label: "ElevenLabs API Key",
+      sensitive: true,
+      advanced: true,
+    },
+    "tts.elevenlabs.baseUrl": { label: "ElevenLabs Base URL", advanced: true },
    publicUrl: { label: "Public Webhook URL", advanced: true },
    skipSignatureVerification: {
      label: "Skip Signature Verification",
@@ -161,6 +178,7 @@ const voiceCallPlugin = {
        runtimePromise = createVoiceCallRuntime({
          config: cfg,
          coreConfig: api.config as CoreConfig,
+          ttsRuntime: api.runtime.tts,
          logger: api.logger,
        });
      }
--- a/extensions/voice-call/src/config.ts
+++ b/extensions/voice-call/src/config.ts
@@ -82,31 +82,82 @@ export const SttConfigSchema = z
  .default({ provider: "openai", model: "whisper-1" });
 export type SttConfig = z.infer<typeof SttConfigSchema>;

+export const TtsProviderSchema = z.enum(["openai", "elevenlabs", "edge"]);
+export const TtsModeSchema = z.enum(["final", "all"]);
+export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
+
 export const TtsConfigSchema = z
  .object({
-    /** TTS provider (currently only OpenAI supported) */
-    provider: z.literal("openai").default("openai"),
-    /**
-     * TTS model to use:
-     * - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
-     * - tts-1: lower latency
-     * - tts-1-hd: higher quality
-     */
-    model: z.string().min(1).default("gpt-4o-mini-tts"),
-    /**
-     * Voice ID. For best quality, use marin or cedar.
-     * All voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
-     */
-    voice: z.string().min(1).default("coral"),
-    /**
-     * Instructions for speech style (only works with gpt-4o-mini-tts).
-     * Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
-     */
-    instructions: z.string().optional(),
+    auto: TtsAutoSchema.optional(),
+    enabled: z.boolean().optional(),
+    mode: TtsModeSchema.optional(),
+    provider: TtsProviderSchema.optional(),
+    summaryModel: z.string().optional(),
+    modelOverrides: z
+      .object({
+        enabled: z.boolean().optional(),
+        allowText: z.boolean().optional(),
+        allowProvider: z.boolean().optional(),
+        allowVoice: z.boolean().optional(),
+        allowModelId: z.boolean().optional(),
+        allowVoiceSettings: z.boolean().optional(),
+        allowNormalization: z.boolean().optional(),
+        allowSeed: z.boolean().optional(),
+      })
+      .strict()
+      .optional(),
+    elevenlabs: z
+      .object({
+        apiKey: z.string().optional(),
+        baseUrl: z.string().optional(),
+        voiceId: z.string().optional(),
+        modelId: z.string().optional(),
+        seed: z.number().int().min(0).max(4294967295).optional(),
+        applyTextNormalization: z.enum(["auto", "on", "off"]).optional(),
+        languageCode: z.string().optional(),
+        voiceSettings: z
+          .object({
+            stability: z.number().min(0).max(1).optional(),
+            similarityBoost: z.number().min(0).max(1).optional(),
+            style: z.number().min(0).max(1).optional(),
+            useSpeakerBoost: z.boolean().optional(),
+            speed: z.number().min(0.5).max(2).optional(),
+          })
+          .strict()
+          .optional(),
+      })
+      .strict()
+      .optional(),
+    openai: z
+      .object({
+        apiKey: z.string().optional(),
+        model: z.string().optional(),
+        voice: z.string().optional(),
+      })
+      .strict()
+      .optional(),
+    edge: z
+      .object({
+        enabled: z.boolean().optional(),
+        voice: z.string().optional(),
+        lang: z.string().optional(),
+        outputFormat: z.string().optional(),
+        pitch: z.string().optional(),
+        rate: z.string().optional(),
+        volume: z.string().optional(),
+        saveSubtitles: z.boolean().optional(),
+        proxy: z.string().optional(),
+        timeoutMs: z.number().int().min(1000).max(120000).optional(),
+      })
+      .strict()
+      .optional(),
+    prefsPath: z.string().optional(),
+    maxTextLength: z.number().int().min(1).optional(),
+    timeoutMs: z.number().int().min(1000).max(120000).optional(),
  })
  .strict()
-  .default({ provider: "openai", model: "gpt-4o-mini-tts", voice: "coral" });
-export type TtsConfig = z.infer<typeof TtsConfigSchema>;
+  .optional();
+export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;

 // -----------------------------------------------------------------------------
 // Webhook Server Configuration
@@ -307,7 +358,7 @@ export const VoiceCallConfigSchema = z
  /** STT configuration */
  stt: SttConfigSchema,

-  /** TTS configuration */
+  /** TTS override (deep-merges with core messages.tts) */
  tts: TtsConfigSchema,

  /** Store path for call logs */
--- a/extensions/voice-call/src/core-bridge.ts
+++ b/extensions/voice-call/src/core-bridge.ts
@@ -2,10 +2,16 @@ import fs from "node:fs";
 import path from "node:path";
 import { fileURLToPath, pathToFileURL } from "node:url";

+import type { VoiceCallTtsConfig } from "./config.js";
+
 export type CoreConfig = {
  session?: {
    store?: string;
  };
+  messages?: {
+    tts?: VoiceCallTtsConfig;
+  };
+  [key: string]: unknown;
 };

 type CoreAgentDeps = {
--- a/extensions/voice-call/src/manager.ts
+++ b/extensions/voice-call/src/manager.ts
@@ -143,7 +143,7 @@ export class CallManager {
      // For notify mode with a message, use inline TwiML with <Say>
      let inlineTwiml: string | undefined;
      if (mode === "notify" && initialMessage) {
-        const pollyVoice = mapVoiceToPolly(this.config.tts.voice);
+        const pollyVoice = mapVoiceToPolly(this.config.tts?.openai?.voice);
        inlineTwiml = this.generateNotifyTwiml(initialMessage, pollyVoice);
        console.log(
          `[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`,
@@ -210,11 +210,13 @@ export class CallManager {
      this.addTranscriptEntry(call, "bot", text);

      // Play TTS
+      const voice =
+        this.provider?.name === "twilio" ? this.config.tts?.openai?.voice : undefined;
      await this.provider.playTts({
        callId,
        providerCallId: call.providerCallId,
        text,
-        voice: this.config.tts.voice,
+        voice,
      });

      return { success: true };
--- a/extensions/voice-call/src/manager/outbound.ts
+++ b/extensions/voice-call/src/manager/outbound.ts
@@ -68,7 +68,7 @@ export async function initiateCall(
    // For notify mode with a message, use inline TwiML with <Say>.
    let inlineTwiml: string | undefined;
    if (mode === "notify" && initialMessage) {
-      const pollyVoice = mapVoiceToPolly(ctx.config.tts.voice);
+      const pollyVoice = mapVoiceToPolly(ctx.config.tts?.openai?.voice);
      inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
      console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
    }
@@ -120,11 +120,13 @@ export async function speak(

    addTranscriptEntry(call, "bot", text);

+    const voice =
+      ctx.provider?.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined;
    await ctx.provider.playTts({
      callId,
      providerCallId: call.providerCallId,
      text,
-      voice: ctx.config.tts.voice,
+      voice,
    });

    return { success: true };
@@ -244,4 +246,3 @@ export async function endCall(
    return { success: false, error: err instanceof Error ? err.message : String(err) };
  }
 }
-
--- a/extensions/voice-call/src/providers/twilio.ts
+++ b/extensions/voice-call/src/providers/twilio.ts
@@ -15,9 +15,9 @@ import type {
  WebhookVerificationResult,
 } from "../types.js";
 import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
+import { chunkAudio } from "../telephony-audio.js";
+import type { TelephonyTtsProvider } from "../telephony-tts.js";
 import type { VoiceCallProvider } from "./base.js";
-import type { OpenAITTSProvider } from "./tts-openai.js";
-import { chunkAudio } from "./tts-openai.js";
 import { twilioApiRequest } from "./twilio/api.js";
 import { verifyTwilioProviderWebhook } from "./twilio/webhook.js";

@@ -53,8 +53,8 @@ export class TwilioProvider implements VoiceCallProvider {
  /** Current public webhook URL (set when tunnel starts or from config) */
  private currentPublicUrl: string | null = null;

-  /** Optional OpenAI TTS provider for streaming TTS */
-  private ttsProvider: OpenAITTSProvider | null = null;
+  /** Optional telephony TTS provider for streaming TTS */
+  private ttsProvider: TelephonyTtsProvider | null = null;

  /** Optional media stream handler for sending audio */
  private mediaStreamHandler: MediaStreamHandler | null = null;
@@ -119,7 +119,7 @@ export class TwilioProvider implements VoiceCallProvider {
    return this.currentPublicUrl;
  }

-  setTTSProvider(provider: OpenAITTSProvider): void {
+  setTTSProvider(provider: TelephonyTtsProvider): void {
    this.ttsProvider = provider;
  }

@@ -454,13 +454,13 @@ export class TwilioProvider implements VoiceCallProvider {
   * Play TTS audio via Twilio.
   *
   * Two modes:
-   * 1. OpenAI TTS + Media Streams: If TTS provider and media stream are available,
-   *    generates audio via OpenAI and streams it through WebSocket (preferred).
+   * 1. Core TTS + Media Streams: If TTS provider and media stream are available,
+   *    generates audio via core TTS and streams it through WebSocket (preferred).
   * 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
   *    Note: This may not work on all Twilio accounts.
   */
  async playTts(input: PlayTtsInput): Promise<void> {
-    // Try OpenAI TTS via media stream first (if configured)
+    // Try telephony TTS via media stream first (if configured)
    const streamSid = this.callStreamMap.get(input.providerCallId);
    if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
      try {
@@ -468,7 +468,7 @@ export class TwilioProvider implements VoiceCallProvider {
        return;
      } catch (err) {
        console.warn(
-          `[voice-call] OpenAI TTS failed, falling back to Twilio <Say>:`,
+          `[voice-call] Telephony TTS failed, falling back to Twilio <Say>:`,
          err instanceof Error ? err.message : err,
        );
        // Fall through to TwiML <Say> fallback
@@ -484,7 +484,7 @@ export class TwilioProvider implements VoiceCallProvider {
    }

    console.warn(
-      "[voice-call] Using TwiML <Say> fallback - OpenAI TTS not configured or media stream not active",
+      "[voice-call] Using TwiML <Say> fallback - telephony TTS not configured or media stream not active",
    );

    const pollyVoice = mapVoiceToPolly(input.voice);
@@ -502,8 +502,8 @@ export class TwilioProvider implements VoiceCallProvider {
  }

  /**
-   * Play TTS via OpenAI and Twilio Media Streams.
-   * Generates audio with OpenAI TTS, converts to mu-law, and streams via WebSocket.
+   * Play TTS via core TTS and Twilio Media Streams.
+   * Generates audio with core TTS, converts to mu-law, and streams via WebSocket.
   * Uses a jitter buffer to smooth out timing variations.
   */
  private async playTtsViaStream(
@@ -514,8 +514,8 @@ export class TwilioProvider implements VoiceCallProvider {
      throw new Error("TTS provider and media stream handler required");
    }

-    // Generate audio with OpenAI TTS (returns mu-law at 8kHz)
-    const muLawAudio = await this.ttsProvider.synthesizeForTwilio(text);
+    // Generate audio with core TTS (returns mu-law at 8kHz)
+    const muLawAudio = await this.ttsProvider.synthesizeForTelephony(text);

    // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
    const CHUNK_SIZE = 160;
--- a/extensions/voice-call/src/runtime.ts
+++ b/extensions/voice-call/src/runtime.ts
@@ -6,8 +6,9 @@ import type { VoiceCallProvider } from "./providers/base.js";
 import { MockProvider } from "./providers/mock.js";
 import { PlivoProvider } from "./providers/plivo.js";
 import { TelnyxProvider } from "./providers/telnyx.js";
-import { OpenAITTSProvider } from "./providers/tts-openai.js";
 import { TwilioProvider } from "./providers/twilio.js";
+import type { TelephonyTtsRuntime } from "./telephony-tts.js";
+import { createTelephonyTtsProvider } from "./telephony-tts.js";
 import { startTunnel, type TunnelResult } from "./tunnel.js";
 import {
  cleanupTailscaleExposure,
@@ -81,9 +82,10 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
 export async function createVoiceCallRuntime(params: {
  config: VoiceCallConfig;
  coreConfig: CoreConfig;
+  ttsRuntime?: TelephonyTtsRuntime;
  logger?: Logger;
 }): Promise<VoiceCallRuntime> {
-  const { config, coreConfig, logger } = params;
+  const { config, coreConfig, ttsRuntime, logger } = params;
  const log = logger ?? {
    info: console.log,
    warn: console.warn,
@@ -149,27 +151,24 @@ export async function createVoiceCallRuntime(params: {

  if (provider.name === "twilio" && config.streaming?.enabled) {
    const twilioProvider = provider as TwilioProvider;
-    const openaiApiKey =
-      config.streaming.openaiApiKey || process.env.OPENAI_API_KEY;
-    if (openaiApiKey) {
+    if (ttsRuntime?.textToSpeechTelephony) {
      try {
-        const ttsProvider = new OpenAITTSProvider({
-          apiKey: openaiApiKey,
-          voice: config.tts.voice,
-          model: config.tts.model,
-          instructions: config.tts.instructions,
+        const ttsProvider = createTelephonyTtsProvider({
+          coreConfig,
+          ttsOverride: config.tts,
+          runtime: ttsRuntime,
        });
        twilioProvider.setTTSProvider(ttsProvider);
-        log.info("[voice-call] OpenAI TTS provider configured");
+        log.info("[voice-call] Telephony TTS provider configured");
      } catch (err) {
        log.warn(
-          `[voice-call] Failed to initialize OpenAI TTS: ${
+          `[voice-call] Failed to initialize telephony TTS: ${
            err instanceof Error ? err.message : String(err)
          }`,
        );
      }
    } else {
-      log.warn("[voice-call] OpenAI TTS key missing; streaming TTS disabled");
+      log.warn("[voice-call] Telephony TTS unavailable; streaming TTS disabled");
    }

    const mediaHandler = webhookServer.getMediaStreamHandler();
--- a/extensions/voice-call/src/telephony-audio.ts
+++ b/extensions/voice-call/src/telephony-audio.ts
@@ -0,0 +1,88 @@
+/** Target sample rate in Hz; resamplePcmTo8k converts PCM to this rate. */
+const TELEPHONY_SAMPLE_RATE = 8000;
+
+/**
+ * Limit a value to the signed 16-bit range [-32768, 32767].
+ */
+function clamp16(value: number): number {
+  if (value > 32767) return 32767;
+  if (value < -32768) return -32768;
+  return value;
+}
+
+/**
+ * Convert 16-bit little-endian mono PCM to the 8kHz telephony rate.
+ *
+ * Each output sample is linearly interpolated between the two nearest input
+ * samples. When the input is already at 8kHz the same buffer is returned
+ * (no copy). A trailing odd byte in `input` is ignored.
+ */
+export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
+  if (inputSampleRate === TELEPHONY_SAMPLE_RATE) return input;
+
+  const sourceCount = Math.floor(input.length / 2);
+  if (sourceCount === 0) return Buffer.alloc(0);
+
+  // Distance, in input samples, between two consecutive output samples.
+  const step = inputSampleRate / TELEPHONY_SAMPLE_RATE;
+  const targetCount = Math.floor(sourceCount / step);
+  const resampled = Buffer.alloc(targetCount * 2);
+  const lastSource = sourceCount - 1;
+
+  let outIndex = 0;
+  while (outIndex < targetCount) {
+    const position = outIndex * step;
+    const left = Math.floor(position);
+    const weight = position - left;
+    // Reuse the final sample as the right neighbour at the end of the input.
+    const right = left + 1 > lastSource ? lastSource : left + 1;
+
+    const a = input.readInt16LE(left * 2);
+    const b = input.readInt16LE(right * 2);
+    const blended = Math.round(a + weight * (b - a));
+
+    resampled.writeInt16LE(clamp16(blended), outIndex * 2);
+    outIndex += 1;
+  }
+
+  return resampled;
+}
+
+/**
+ * Encode 16-bit little-endian PCM as 8-bit mu-law (G.711), producing one
+ * output byte per input sample. A trailing odd byte in `pcm` is ignored.
+ */
+export function pcmToMulaw(pcm: Buffer): Buffer {
+  const encoded = Buffer.alloc(Math.floor(pcm.length / 2));
+
+  for (let out = 0, offset = 0; out < encoded.length; out += 1, offset += 2) {
+    encoded[out] = linearToMulaw(pcm.readInt16LE(offset));
+  }
+
+  return encoded;
+}
+
+/**
+ * Resample 16-bit PCM to 8kHz and encode the result as mu-law.
+ */
+export function convertPcmToMulaw8k(
+  pcm: Buffer,
+  inputSampleRate: number,
+): Buffer {
+  return pcmToMulaw(resamplePcmTo8k(pcm, inputSampleRate));
+}
+
+/**
+ * Split an audio buffer into fixed-size frames for streaming.
+ *
+ * The default of 160 bytes corresponds to 20ms of 8kHz mono mu-law audio.
+ * Frames are views onto `audio` (no copy); the last frame may be shorter.
+ *
+ * @param audio Buffer to split.
+ * @param chunkSize Frame size in bytes; must be greater than zero.
+ * @returns A generator yielding the frames in order.
+ * @throws RangeError if `chunkSize` is zero, negative, or NaN.
+ */
+export function chunkAudio(
+  audio: Buffer,
+  chunkSize = 160,
+): Generator<Buffer, void, unknown> {
+  // Checked eagerly: a non-positive step never advances the loop index, so
+  // iterating the generator would otherwise never terminate.
+  if (!(chunkSize > 0)) {
+    throw new RangeError(`chunkSize must be greater than 0 (got ${chunkSize})`);
+  }
+  return (function* () {
+    for (let i = 0; i < audio.length; i += chunkSize) {
+      yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
+    }
+  })();
+}
+
+/**
+ * Encode one signed 16-bit linear sample as a single mu-law byte.
+ *
+ * The magnitude is clipped, biased, and split into a 3-bit exponent (position
+ * of the highest set bit) and a 4-bit mantissa; the assembled byte is
+ * bit-inverted before being returned.
+ */
+function linearToMulaw(sample: number): number {
+  const BIAS = 132;
+  const CLIP = 32635;
+
+  const signBit = sample < 0 ? 0x80 : 0;
+  const magnitude = sample < 0 ? -sample : sample;
+  const biased = Math.min(magnitude, CLIP) + BIAS;
+
+  // Walk a probe bit down from bit 14 until it hits the highest set bit.
+  let exponent = 7;
+  let probe = 0x4000;
+  while ((biased & probe) === 0 && exponent > 0) {
+    probe >>= 1;
+    exponent -= 1;
+  }
+
+  const mantissa = (biased >> (exponent + 3)) & 0x0f;
+  return ~(signBit | (exponent << 4) | mantissa) & 0xff;
+}
--- a/extensions/voice-call/src/telephony-tts.ts
+++ b/extensions/voice-call/src/telephony-tts.ts
@@ -0,0 +1,95 @@
+import type { CoreConfig } from "./core-bridge.js";
+import type { VoiceCallTtsConfig } from "./config.js";
+import { convertPcmToMulaw8k } from "./telephony-audio.js";
+
+/**
+ * The part of the TTS runtime that the telephony TTS provider calls.
+ */
+export type TelephonyTtsRuntime = {
+  /**
+   * Synthesize `text` using the configuration in `cfg`.
+   *
+   * Callers in this file treat the result as usable only when `success` is
+   * true and both `audioBuffer` and `sampleRate` are present; otherwise
+   * `error` is used as the failure message.
+   * NOTE(review): implemented elsewhere — audioBuffer is presumably 16-bit
+   * little-endian mono PCM at `sampleRate` Hz; confirm against the runtime.
+   */
+  textToSpeechTelephony: (params: {
+    text: string;
+    cfg: CoreConfig;
+    prefsPath?: string;
+  }) => Promise<{
+    success: boolean;
+    audioBuffer?: Buffer;
+    sampleRate?: number;
+    provider?: string;
+    error?: string;
+  }>;
+};
+
+/**
+ * Text-to-speech provider whose output is ready for a telephony stream.
+ */
+export type TelephonyTtsProvider = {
+  /** Synthesize `text` and resolve with the audio as 8kHz mu-law bytes. */
+  synthesizeForTelephony: (text: string) => Promise<Buffer>;
+};
+
+/**
+ * Build a telephony TTS provider on top of the supplied TTS runtime.
+ *
+ * The optional override is merged into `coreConfig.messages.tts` once, when
+ * the provider is created. Each synthesis call hands that merged config to
+ * `runtime.textToSpeechTelephony` and converts the returned audio to 8kHz
+ * mu-law.
+ *
+ * The returned `synthesizeForTelephony` rejects with an Error when the runtime
+ * reports failure or returns no audio buffer or sample rate.
+ */
+export function createTelephonyTtsProvider(params: {
+  coreConfig: CoreConfig;
+  ttsOverride?: VoiceCallTtsConfig;
+  runtime: TelephonyTtsRuntime;
+}): TelephonyTtsProvider {
+  const { runtime } = params;
+  const cfg = applyTtsOverride(params.coreConfig, params.ttsOverride);
+
+  async function synthesizeForTelephony(text: string): Promise<Buffer> {
+    const { success, audioBuffer, sampleRate, error } =
+      await runtime.textToSpeechTelephony({ text, cfg });
+
+    if (!success || !audioBuffer || !sampleRate) {
+      throw new Error(error ?? "TTS conversion failed");
+    }
+
+    return convertPcmToMulaw8k(audioBuffer, sampleRate);
+  }
+
+  return { synthesizeForTelephony };
+}
+
+/**
+ * Return a core config whose `messages.tts` has the override merged in.
+ *
+ * With no override the original config object is returned unchanged;
+ * otherwise a shallow copy is returned and the input is not mutated.
+ */
+function applyTtsOverride(
+  coreConfig: CoreConfig,
+  override?: VoiceCallTtsConfig,
+): CoreConfig {
+  const tts = override
+    ? mergeTtsConfig(coreConfig.messages?.tts, override)
+    : undefined;
+  if (!tts) return coreConfig;
+
+  const messages = { ...coreConfig.messages, tts };
+  return { ...coreConfig, messages };
+}
+
+/**
+ * Combine a base TTS config with an override.
+ *
+ * When both are present they are deep-merged with the override winning;
+ * when only one is present it is returned as-is; with neither, undefined.
+ */
+function mergeTtsConfig(
+  base?: VoiceCallTtsConfig,
+  override?: VoiceCallTtsConfig,
+): VoiceCallTtsConfig | undefined {
+  if (base && override) return deepMerge(base, override);
+  return override || base || undefined;
+}
+
+/**
+ * Recursively merge `override` into a copy of `base`.
+ *
+ * Non-array objects are merged key by key; any other value in `override`
+ * (including arrays) replaces the base value outright. Keys whose override
+ * value is `undefined` keep the base value. Neither input is mutated.
+ */
+function deepMerge<T>(base: T, override: T): T {
+  const bothObjects = isPlainObject(base) && isPlainObject(override);
+  if (!bothObjects) return override;
+
+  const baseRecord = base as Record<string, unknown>;
+  const overrideRecord = override as Record<string, unknown>;
+  const merged: Record<string, unknown> = { ...baseRecord };
+
+  for (const key of Object.keys(overrideRecord)) {
+    const incoming = overrideRecord[key];
+    if (incoming === undefined) continue;
+
+    const current = baseRecord[key];
+    merged[key] =
+      isPlainObject(current) && isPlainObject(incoming)
+        ? deepMerge(current, incoming)
+        : incoming;
+  }
+
+  return merged as T;
+}
+
+/**
+ * Type guard: true for any non-null, non-array value whose typeof is "object".
+ */
+function isPlainObject(value: unknown): value is Record<string, unknown> {
+  if (value === null || typeof value !== "object") return false;
+  return !Array.isArray(value);
+}