Merge branch 'main' into fix/voice-call-env-var-validation

Shakker (committed by GitHub)
2026-01-26 13:10:58 +00:00
416 changed files with 26012 additions and 8724 deletions

View File

@@ -82,31 +82,82 @@ export const SttConfigSchema = z
.default({ provider: "openai", model: "whisper-1" });
export type SttConfig = z.infer<typeof SttConfigSchema>;
export const TtsProviderSchema = z.enum(["openai", "elevenlabs", "edge"]);
export const TtsModeSchema = z.enum(["final", "all"]);
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
export const TtsConfigSchema = z
.object({
/** TTS provider (currently only OpenAI supported) */
provider: z.literal("openai").default("openai"),
/**
* TTS model to use:
* - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
* - tts-1: lower latency
* - tts-1-hd: higher quality
*/
model: z.string().min(1).default("gpt-4o-mini-tts"),
/**
* Voice ID. For best quality, use marin or cedar.
* All voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
*/
voice: z.string().min(1).default("coral"),
/**
* Instructions for speech style (only works with gpt-4o-mini-tts).
* Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
*/
instructions: z.string().optional(),
auto: TtsAutoSchema.optional(),
enabled: z.boolean().optional(),
mode: TtsModeSchema.optional(),
provider: TtsProviderSchema.optional(),
summaryModel: z.string().optional(),
modelOverrides: z
.object({
enabled: z.boolean().optional(),
allowText: z.boolean().optional(),
allowProvider: z.boolean().optional(),
allowVoice: z.boolean().optional(),
allowModelId: z.boolean().optional(),
allowVoiceSettings: z.boolean().optional(),
allowNormalization: z.boolean().optional(),
allowSeed: z.boolean().optional(),
})
.strict()
.optional(),
elevenlabs: z
.object({
apiKey: z.string().optional(),
baseUrl: z.string().optional(),
voiceId: z.string().optional(),
modelId: z.string().optional(),
seed: z.number().int().min(0).max(4294967295).optional(),
applyTextNormalization: z.enum(["auto", "on", "off"]).optional(),
languageCode: z.string().optional(),
voiceSettings: z
.object({
stability: z.number().min(0).max(1).optional(),
similarityBoost: z.number().min(0).max(1).optional(),
style: z.number().min(0).max(1).optional(),
useSpeakerBoost: z.boolean().optional(),
speed: z.number().min(0.5).max(2).optional(),
})
.strict()
.optional(),
})
.strict()
.optional(),
openai: z
.object({
apiKey: z.string().optional(),
model: z.string().optional(),
voice: z.string().optional(),
})
.strict()
.optional(),
edge: z
.object({
enabled: z.boolean().optional(),
voice: z.string().optional(),
lang: z.string().optional(),
outputFormat: z.string().optional(),
pitch: z.string().optional(),
rate: z.string().optional(),
volume: z.string().optional(),
saveSubtitles: z.boolean().optional(),
proxy: z.string().optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),
})
.strict()
.optional(),
prefsPath: z.string().optional(),
maxTextLength: z.number().int().min(1).optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),
})
.strict()
.default({ provider: "openai", model: "gpt-4o-mini-tts", voice: "coral" });
export type TtsConfig = z.infer<typeof TtsConfigSchema>;
.optional();
export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;
// -----------------------------------------------------------------------------
// Webhook Server Configuration
@@ -307,7 +358,7 @@ export const VoiceCallConfigSchema = z
/** STT configuration */
stt: SttConfigSchema,
/** TTS configuration */
/** TTS override (deep-merges with core messages.tts) */
tts: TtsConfigSchema,
/** Store path for call logs */

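For illustration only, not part of this diff: with the schema now optional and multi-provider, a plugin-level override could look like the sketch below. The voice ID is a hypothetical placeholder; omitted fields fall through to the core messages.tts settings.

import type { VoiceCallTtsConfig } from "./config.js";

const ttsOverride: VoiceCallTtsConfig = {
  provider: "elevenlabs",
  mode: "final", // speak only final responses
  elevenlabs: {
    voiceId: "your-voice-id", // hypothetical placeholder
    voiceSettings: { stability: 0.4, similarityBoost: 0.8 },
  },
};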
View File

@@ -2,10 +2,16 @@ import fs from "node:fs";
import path from "node:path";
import { fileURLToPath, pathToFileURL } from "node:url";
import type { VoiceCallTtsConfig } from "./config.js";
export type CoreConfig = {
session?: {
store?: string;
};
messages?: {
tts?: VoiceCallTtsConfig;
};
[key: string]: unknown;
};
type CoreAgentDeps = {

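A sketch of a host config satisfying this type (values are illustrative):

const coreConfig: CoreConfig = {
  session: { store: "/tmp/voice-call-store" }, // illustrative path
  messages: { tts: { provider: "openai", openai: { voice: "alloy" } } },
};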
View File

@@ -143,7 +143,7 @@ export class CallManager {
// For notify mode with a message, use inline TwiML with <Say>
let inlineTwiml: string | undefined;
if (mode === "notify" && initialMessage) {
const pollyVoice = mapVoiceToPolly(this.config.tts.voice);
const pollyVoice = mapVoiceToPolly(this.config.tts?.openai?.voice);
inlineTwiml = this.generateNotifyTwiml(initialMessage, pollyVoice);
console.log(
`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`,
@@ -210,11 +210,13 @@ export class CallManager {
this.addTranscriptEntry(call, "bot", text);
// Play TTS
const voice =
this.provider?.name === "twilio" ? this.config.tts?.openai?.voice : undefined;
await this.provider.playTts({
callId,
providerCallId: call.providerCallId,
text,
voice: this.config.tts.voice,
voice,
});
return { success: true };

View File

@@ -19,4 +19,3 @@ export type CallManagerContext = {
transcriptWaiters: Map<CallId, TranscriptWaiter>;
maxDurationTimers: Map<CallId, NodeJS.Timeout>;
};

View File

@@ -175,4 +175,3 @@ export function processEvent(ctx: CallManagerContext, event: NormalizedEvent): v
persistCallRecord(ctx.storePath, call);
}

View File

@@ -31,4 +31,3 @@ export function findCall(params: {
providerCallId: params.callIdOrProviderCallId,
});
}

View File

@@ -68,7 +68,7 @@ export async function initiateCall(
// For notify mode with a message, use inline TwiML with <Say>.
let inlineTwiml: string | undefined;
if (mode === "notify" && initialMessage) {
const pollyVoice = mapVoiceToPolly(ctx.config.tts.voice);
const pollyVoice = mapVoiceToPolly(ctx.config.tts?.openai?.voice);
inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
}
@@ -120,11 +120,13 @@ export async function speak(
addTranscriptEntry(call, "bot", text);
const voice =
ctx.provider?.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined;
await ctx.provider.playTts({
callId,
providerCallId: call.providerCallId,
text,
voice: ctx.config.tts.voice,
voice,
});
return { success: true };
@@ -244,4 +246,3 @@ export async function endCall(
return { success: false, error: err instanceof Error ? err.message : String(err) };
}
}

View File

@@ -48,4 +48,3 @@ export function addTranscriptEntry(
};
call.transcript.push(entry);
}

View File

@@ -86,4 +86,3 @@ export async function getCallHistoryFromStore(
return calls;
}

View File

@@ -84,4 +84,3 @@ export function waitForFinalTranscript(
ctx.transcriptWaiters.set(callId, { resolve, reject, timeout });
});
}

View File

@@ -7,4 +7,3 @@ export function generateNotifyTwiml(message: string, voice: string): string {
<Hangup/>
</Response>`;
}

View File

@@ -0,0 +1,97 @@
import { describe, expect, it } from "vitest";
import type {
OpenAIRealtimeSTTProvider,
RealtimeSTTSession,
} from "./providers/stt-openai-realtime.js";
import { MediaStreamHandler } from "./media-stream.js";
const createStubSession = (): RealtimeSTTSession => ({
connect: async () => {},
sendAudio: () => {},
waitForTranscript: async () => "",
onPartial: () => {},
onTranscript: () => {},
onSpeechStart: () => {},
close: () => {},
isConnected: () => true,
});
const createStubSttProvider = (): OpenAIRealtimeSTTProvider =>
({
createSession: () => createStubSession(),
}) as unknown as OpenAIRealtimeSTTProvider;
const flush = async (): Promise<void> => {
await new Promise((resolve) => setTimeout(resolve, 0));
};
const waitForAbort = (signal: AbortSignal): Promise<void> =>
new Promise((resolve) => {
if (signal.aborted) {
resolve();
return;
}
signal.addEventListener("abort", () => resolve(), { once: true });
});
describe("MediaStreamHandler TTS queue", () => {
it("serializes TTS playback and resolves in order", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
});
const started: number[] = [];
const finished: number[] = [];
let resolveFirst!: () => void;
const firstGate = new Promise<void>((resolve) => {
resolveFirst = resolve;
});
const first = handler.queueTts("stream-1", async () => {
started.push(1);
await firstGate;
finished.push(1);
});
const second = handler.queueTts("stream-1", async () => {
started.push(2);
finished.push(2);
});
await flush();
expect(started).toEqual([1]);
resolveFirst();
await first;
await second;
expect(started).toEqual([1, 2]);
expect(finished).toEqual([1, 2]);
});
it("cancels active playback and clears queued items", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
});
let queuedRan = false;
const started: string[] = [];
const active = handler.queueTts("stream-1", async (signal) => {
started.push("active");
await waitForAbort(signal);
});
void handler.queueTts("stream-1", async () => {
queuedRan = true;
});
await flush();
expect(started).toEqual(["active"]);
handler.clearTtsQueue("stream-1");
await active;
await flush();
expect(queuedRan).toBe(false);
});
});

View File

@@ -29,6 +29,8 @@ export interface MediaStreamConfig {
onPartialTranscript?: (callId: string, partial: string) => void;
/** Callback when stream connects */
onConnect?: (callId: string, streamSid: string) => void;
/** Callback when speech starts (barge-in) */
onSpeechStart?: (callId: string) => void;
/** Callback when stream disconnects */
onDisconnect?: (callId: string) => void;
}
@@ -43,6 +45,13 @@ interface StreamSession {
sttSession: RealtimeSTTSession;
}
type TtsQueueEntry = {
playFn: (signal: AbortSignal) => Promise<void>;
controller: AbortController;
resolve: () => void;
reject: (error: unknown) => void;
};
/**
* Manages WebSocket connections for Twilio media streams.
*/
@@ -50,6 +59,12 @@ export class MediaStreamHandler {
private wss: WebSocketServer | null = null;
private sessions = new Map<string, StreamSession>();
private config: MediaStreamConfig;
/** TTS playback queues per stream (serialize audio to prevent overlap) */
private ttsQueues = new Map<string, TtsQueueEntry[]>();
/** Whether TTS is currently playing per stream */
private ttsPlaying = new Map<string, boolean>();
/** Active TTS playback controllers per stream */
private ttsActiveControllers = new Map<string, AbortController>();
constructor(config: MediaStreamConfig) {
this.config = config;
@@ -148,6 +163,10 @@ export class MediaStreamHandler {
this.config.onTranscript?.(callSid, transcript);
});
sttSession.onSpeechStart(() => {
this.config.onSpeechStart?.(callSid);
});
const session: StreamSession = {
callId: callSid,
streamSid,
@@ -177,6 +196,7 @@ export class MediaStreamHandler {
private handleStop(session: StreamSession): void {
console.log(`[MediaStream] Stream stopped: ${session.streamSid}`);
this.clearTtsState(session.streamSid);
session.sttSession.close();
this.sessions.delete(session.streamSid);
this.config.onDisconnect?.(session.callId);
@@ -228,6 +248,46 @@ export class MediaStreamHandler {
this.sendToStream(streamSid, { event: "clear", streamSid });
}
/**
* Queue a TTS operation for sequential playback.
* Only one TTS operation plays at a time per stream to prevent overlap.
*/
async queueTts(
streamSid: string,
playFn: (signal: AbortSignal) => Promise<void>,
): Promise<void> {
const queue = this.getTtsQueue(streamSid);
let resolveEntry: () => void;
let rejectEntry: (error: unknown) => void;
const promise = new Promise<void>((resolve, reject) => {
resolveEntry = resolve;
rejectEntry = reject;
});
queue.push({
playFn,
controller: new AbortController(),
resolve: resolveEntry!,
reject: rejectEntry!,
});
if (!this.ttsPlaying.get(streamSid)) {
void this.processQueue(streamSid);
}
return promise;
}
/**
* Clear TTS queue and interrupt current playback (barge-in).
*/
clearTtsQueue(streamSid: string): void {
const queue = this.getTtsQueue(streamSid);
queue.length = 0;
this.ttsActiveControllers.get(streamSid)?.abort();
this.clearAudio(streamSid);
}
/**
* Get active session by call ID.
*/
@@ -242,11 +302,65 @@ export class MediaStreamHandler {
*/
closeAll(): void {
for (const session of this.sessions.values()) {
this.clearTtsState(session.streamSid);
session.sttSession.close();
session.ws.close();
}
this.sessions.clear();
}
private getTtsQueue(streamSid: string): TtsQueueEntry[] {
const existing = this.ttsQueues.get(streamSid);
if (existing) return existing;
const queue: TtsQueueEntry[] = [];
this.ttsQueues.set(streamSid, queue);
return queue;
}
/**
* Process the TTS queue for a stream.
* Uses an iterative approach to avoid stack accumulation from recursion.
*/
private async processQueue(streamSid: string): Promise<void> {
this.ttsPlaying.set(streamSid, true);
while (true) {
const queue = this.ttsQueues.get(streamSid);
if (!queue || queue.length === 0) {
this.ttsPlaying.set(streamSid, false);
this.ttsActiveControllers.delete(streamSid);
return;
}
const entry = queue.shift()!;
this.ttsActiveControllers.set(streamSid, entry.controller);
try {
await entry.playFn(entry.controller.signal);
entry.resolve();
} catch (error) {
if (entry.controller.signal.aborted) {
entry.resolve();
} else {
console.error("[MediaStream] TTS playback error:", error);
entry.reject(error);
}
} finally {
if (this.ttsActiveControllers.get(streamSid) === entry.controller) {
this.ttsActiveControllers.delete(streamSid);
}
}
}
}
private clearTtsState(streamSid: string): void {
const queue = this.ttsQueues.get(streamSid);
if (queue) queue.length = 0;
this.ttsActiveControllers.get(streamSid)?.abort();
this.ttsActiveControllers.delete(streamSid);
this.ttsPlaying.delete(streamSid);
this.ttsQueues.delete(streamSid);
}
}
/**

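A hedged usage sketch for the queue API above; handler, streamSid, and the utterance frame arrays are assumed from surrounding context, mirroring the test file:

const play = (frames: Buffer[]) => async (signal: AbortSignal) => {
  for (const frame of frames) {
    if (signal.aborted) return; // barge-in aborts mid-playback
    handler.sendAudio(streamSid, frame);
    await new Promise((resolve) => setTimeout(resolve, 20)); // pace at 20 ms
  }
};
// The second play starts only after the first resolves or is aborted.
await handler.queueTts(streamSid, play(firstUtterance));
await handler.queueTts(streamSid, play(secondUtterance));
// On VAD speech start, drop queued entries and abort the active play.
handler.clearTtsQueue(streamSid);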
View File

@@ -26,4 +26,3 @@ describe("PlivoProvider", () => {
expect(result.providerResponseBody).toContain('length="300"');
});
});

View File

@@ -38,6 +38,8 @@ export interface RealtimeSTTSession {
onPartial(callback: (partial: string) => void): void;
/** Set callback for final transcripts */
onTranscript(callback: (transcript: string) => void): void;
/** Set callback when speech starts (VAD) */
onSpeechStart(callback: () => void): void;
/** Close the session */
close(): void;
/** Check if session is connected */
@@ -91,6 +93,7 @@ class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
private pendingTranscript = "";
private onTranscriptCallback: ((transcript: string) => void) | null = null;
private onPartialCallback: ((partial: string) => void) | null = null;
private onSpeechStartCallback: (() => void) | null = null;
constructor(
private readonly apiKey: string,
@@ -243,6 +246,7 @@ class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
case "input_audio_buffer.speech_started":
console.log("[RealtimeSTT] Speech started");
this.pendingTranscript = "";
this.onSpeechStartCallback?.();
break;
case "error":
@@ -273,6 +277,10 @@ class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
this.onTranscriptCallback = callback;
}
onSpeechStart(callback: () => void): void {
this.onSpeechStartCallback = callback;
}
async waitForTranscript(timeoutMs = 30000): Promise<string> {
return new Promise((resolve, reject) => {
const timeout = setTimeout(() => {

View File

@@ -15,9 +15,9 @@ import type {
WebhookVerificationResult,
} from "../types.js";
import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
import { chunkAudio } from "../telephony-audio.js";
import type { TelephonyTtsProvider } from "../telephony-tts.js";
import type { VoiceCallProvider } from "./base.js";
import type { OpenAITTSProvider } from "./tts-openai.js";
import { chunkAudio } from "./tts-openai.js";
import { twilioApiRequest } from "./twilio/api.js";
import { verifyTwilioProviderWebhook } from "./twilio/webhook.js";
@@ -53,8 +53,8 @@ export class TwilioProvider implements VoiceCallProvider {
/** Current public webhook URL (set when tunnel starts or from config) */
private currentPublicUrl: string | null = null;
/** Optional OpenAI TTS provider for streaming TTS */
private ttsProvider: OpenAITTSProvider | null = null;
/** Optional telephony TTS provider for streaming TTS */
private ttsProvider: TelephonyTtsProvider | null = null;
/** Optional media stream handler for sending audio */
private mediaStreamHandler: MediaStreamHandler | null = null;
@@ -119,7 +119,7 @@ export class TwilioProvider implements VoiceCallProvider {
return this.currentPublicUrl;
}
setTTSProvider(provider: OpenAITTSProvider): void {
setTTSProvider(provider: TelephonyTtsProvider): void {
this.ttsProvider = provider;
}
@@ -135,6 +135,17 @@ export class TwilioProvider implements VoiceCallProvider {
this.callStreamMap.delete(callSid);
}
/**
* Clear TTS queue for a call (barge-in).
* Called when the user starts speaking, to interrupt current TTS playback.
*/
clearTtsQueue(callSid: string): void {
const streamSid = this.callStreamMap.get(callSid);
if (streamSid && this.mediaStreamHandler) {
this.mediaStreamHandler.clearTtsQueue(streamSid);
}
}
/**
* Make an authenticated request to the Twilio API.
*/
@@ -454,13 +465,13 @@ export class TwilioProvider implements VoiceCallProvider {
* Play TTS audio via Twilio.
*
* Two modes:
* 1. OpenAI TTS + Media Streams: If TTS provider and media stream are available,
* generates audio via OpenAI and streams it through WebSocket (preferred).
* 1. Core TTS + Media Streams: If TTS provider and media stream are available,
* generates audio via core TTS and streams it through WebSocket (preferred).
* 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
* Note: This may not work on all Twilio accounts.
*/
async playTts(input: PlayTtsInput): Promise<void> {
// Try OpenAI TTS via media stream first (if configured)
// Try telephony TTS via media stream first (if configured)
const streamSid = this.callStreamMap.get(input.providerCallId);
if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
try {
@@ -468,7 +479,7 @@ export class TwilioProvider implements VoiceCallProvider {
return;
} catch (err) {
console.warn(
`[voice-call] OpenAI TTS failed, falling back to Twilio <Say>:`,
`[voice-call] Telephony TTS failed, falling back to Twilio <Say>:`,
err instanceof Error ? err.message : err,
);
// Fall through to TwiML <Say> fallback
@@ -484,7 +495,7 @@ export class TwilioProvider implements VoiceCallProvider {
}
console.warn(
"[voice-call] Using TwiML <Say> fallback - OpenAI TTS not configured or media stream not active",
"[voice-call] Using TwiML <Say> fallback - telephony TTS not configured or media stream not active",
);
const pollyVoice = mapVoiceToPolly(input.voice);
@@ -502,9 +513,9 @@ export class TwilioProvider implements VoiceCallProvider {
}
/**
* Play TTS via OpenAI and Twilio Media Streams.
* Generates audio with OpenAI TTS, converts to mu-law, and streams via WebSocket.
* Uses a jitter buffer to smooth out timing variations.
* Play TTS via core TTS and Twilio Media Streams.
* Generates audio with core TTS, converts to mu-law, and streams via WebSocket.
* Uses a queue to serialize playback and prevent overlapping audio.
*/
private async playTtsViaStream(
text: string,
@@ -514,22 +525,29 @@ export class TwilioProvider implements VoiceCallProvider {
throw new Error("TTS provider and media stream handler required");
}
// Generate audio with OpenAI TTS (returns mu-law at 8kHz)
const muLawAudio = await this.ttsProvider.synthesizeForTwilio(text);
// Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
const CHUNK_SIZE = 160;
const CHUNK_DELAY_MS = 20;
for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
this.mediaStreamHandler.sendAudio(streamSid, chunk);
const handler = this.mediaStreamHandler;
const ttsProvider = this.ttsProvider;
await handler.queueTts(streamSid, async (signal) => {
// Generate audio with core TTS (returns mu-law at 8kHz)
const muLawAudio = await ttsProvider.synthesizeForTelephony(text);
for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
if (signal.aborted) break;
handler.sendAudio(streamSid, chunk);
// Pace the audio to match real-time playback
await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
}
// Pace the audio to match real-time playback
await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
if (signal.aborted) break;
}
// Send a mark to track when audio finishes
this.mediaStreamHandler.sendMark(streamSid, `tts-${Date.now()}`);
if (!signal.aborted) {
// Send a mark to track when audio finishes
handler.sendMark(streamSid, `tts-${Date.now()}`);
}
});
}
/**

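A worked aside on the pacing constants above: G.711 mu-law carries one byte per sample at 8 kHz, so a 20 ms frame is exactly 160 bytes, and sleeping 20 ms between sends matches real-time playback.

const SAMPLE_RATE_HZ = 8000; // mu-law telephony rate
const FRAME_MS = 20;
const CHUNK_SIZE = (SAMPLE_RATE_HZ * FRAME_MS) / 1000; // 160 bytes per frame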
View File

@@ -27,4 +27,3 @@ export function verifyTwilioProviderWebhook(params: {
reason: result.reason,
};
}

View File

@@ -6,8 +6,9 @@ import type { VoiceCallProvider } from "./providers/base.js";
import { MockProvider } from "./providers/mock.js";
import { PlivoProvider } from "./providers/plivo.js";
import { TelnyxProvider } from "./providers/telnyx.js";
import { OpenAITTSProvider } from "./providers/tts-openai.js";
import { TwilioProvider } from "./providers/twilio.js";
import type { TelephonyTtsRuntime } from "./telephony-tts.js";
import { createTelephonyTtsProvider } from "./telephony-tts.js";
import { startTunnel, type TunnelResult } from "./tunnel.js";
import {
cleanupTailscaleExposure,
@@ -81,9 +82,10 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
export async function createVoiceCallRuntime(params: {
config: VoiceCallConfig;
coreConfig: CoreConfig;
ttsRuntime?: TelephonyTtsRuntime;
logger?: Logger;
}): Promise<VoiceCallRuntime> {
const { config, coreConfig, logger } = params;
const { config, coreConfig, ttsRuntime, logger } = params;
const log = logger ?? {
info: console.log,
warn: console.warn,
@@ -149,27 +151,24 @@ export async function createVoiceCallRuntime(params: {
if (provider.name === "twilio" && config.streaming?.enabled) {
const twilioProvider = provider as TwilioProvider;
const openaiApiKey =
config.streaming.openaiApiKey || process.env.OPENAI_API_KEY;
if (openaiApiKey) {
if (ttsRuntime?.textToSpeechTelephony) {
try {
const ttsProvider = new OpenAITTSProvider({
apiKey: openaiApiKey,
voice: config.tts.voice,
model: config.tts.model,
instructions: config.tts.instructions,
const ttsProvider = createTelephonyTtsProvider({
coreConfig,
ttsOverride: config.tts,
runtime: ttsRuntime,
});
twilioProvider.setTTSProvider(ttsProvider);
log.info("[voice-call] OpenAI TTS provider configured");
log.info("[voice-call] Telephony TTS provider configured");
} catch (err) {
log.warn(
`[voice-call] Failed to initialize OpenAI TTS: ${
`[voice-call] Failed to initialize telephony TTS: ${
err instanceof Error ? err.message : String(err)
}`,
);
}
} else {
log.warn("[voice-call] OpenAI TTS key missing; streaming TTS disabled");
log.warn("[voice-call] Telephony TTS unavailable; streaming TTS disabled");
}
const mediaHandler = webhookServer.getMediaStreamHandler();

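A sketch of the host-side wiring this expects; coreTextToSpeech is a hypothetical stand-in for whatever core function implements textToSpeechTelephony:

const runtime = await createVoiceCallRuntime({
  config,
  coreConfig,
  ttsRuntime: {
    textToSpeechTelephony: async ({ text, cfg }) => {
      const out = await coreTextToSpeech(text, cfg); // hypothetical core helper
      return { success: true, audioBuffer: out.pcm, sampleRate: out.sampleRate };
    },
  },
});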
View File

@@ -0,0 +1,88 @@
const TELEPHONY_SAMPLE_RATE = 8000;
function clamp16(value: number): number {
return Math.max(-32768, Math.min(32767, value));
}
/**
* Resample 16-bit PCM (little-endian mono) to 8kHz using linear interpolation.
*/
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
if (inputSampleRate === TELEPHONY_SAMPLE_RATE) return input;
const inputSamples = Math.floor(input.length / 2);
if (inputSamples === 0) return Buffer.alloc(0);
const ratio = inputSampleRate / TELEPHONY_SAMPLE_RATE;
const outputSamples = Math.floor(inputSamples / ratio);
const output = Buffer.alloc(outputSamples * 2);
for (let i = 0; i < outputSamples; i++) {
const srcPos = i * ratio;
const srcIndex = Math.floor(srcPos);
const frac = srcPos - srcIndex;
const s0 = input.readInt16LE(srcIndex * 2);
const s1Index = Math.min(srcIndex + 1, inputSamples - 1);
const s1 = input.readInt16LE(s1Index * 2);
const sample = Math.round(s0 + frac * (s1 - s0));
output.writeInt16LE(clamp16(sample), i * 2);
}
return output;
}
/**
* Convert 16-bit PCM to 8-bit mu-law (G.711).
*/
export function pcmToMulaw(pcm: Buffer): Buffer {
const samples = Math.floor(pcm.length / 2);
const mulaw = Buffer.alloc(samples);
for (let i = 0; i < samples; i++) {
const sample = pcm.readInt16LE(i * 2);
mulaw[i] = linearToMulaw(sample);
}
return mulaw;
}
export function convertPcmToMulaw8k(
pcm: Buffer,
inputSampleRate: number,
): Buffer {
const pcm8k = resamplePcmTo8k(pcm, inputSampleRate);
return pcmToMulaw(pcm8k);
}
/**
* Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law).
*/
export function* chunkAudio(
audio: Buffer,
chunkSize = 160,
): Generator<Buffer, void, unknown> {
for (let i = 0; i < audio.length; i += chunkSize) {
yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
}
}
/**
* Encode a single 16-bit linear PCM sample as 8-bit G.711 mu-law.
*/
function linearToMulaw(sample: number): number {
const BIAS = 132; // standard G.711 bias (0x84)
const CLIP = 32635; // clamp magnitude so adding the bias cannot overflow
const sign = sample < 0 ? 0x80 : 0;
if (sample < 0) sample = -sample;
if (sample > CLIP) sample = CLIP;
sample += BIAS;
// Exponent = segment number, from the position of the highest set bit.
let exponent = 7;
for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--) {
expMask >>= 1;
}
const mantissa = (sample >> (exponent + 3)) & 0x0f;
// G.711 transmits the byte bit-inverted.
return ~(sign | (exponent << 4) | mantissa) & 0xff;
}

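An illustrative end-to-end use of the new helpers (sample values are hypothetical):

import { chunkAudio, convertPcmToMulaw8k } from "./telephony-audio.js";

const pcm24k = Buffer.alloc(24000 * 2); // one second of 24 kHz silence
const muLaw = convertPcmToMulaw8k(pcm24k, 24000); // 8000 bytes at 8 kHz
let frames = 0;
for (const _frame of chunkAudio(muLaw)) frames++; // 50 frames of 160 bytes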
View File

@@ -0,0 +1,95 @@
import type { CoreConfig } from "./core-bridge.js";
import type { VoiceCallTtsConfig } from "./config.js";
import { convertPcmToMulaw8k } from "./telephony-audio.js";
export type TelephonyTtsRuntime = {
textToSpeechTelephony: (params: {
text: string;
cfg: CoreConfig;
prefsPath?: string;
}) => Promise<{
success: boolean;
audioBuffer?: Buffer;
sampleRate?: number;
provider?: string;
error?: string;
}>;
};
export type TelephonyTtsProvider = {
synthesizeForTelephony: (text: string) => Promise<Buffer>;
};
export function createTelephonyTtsProvider(params: {
coreConfig: CoreConfig;
ttsOverride?: VoiceCallTtsConfig;
runtime: TelephonyTtsRuntime;
}): TelephonyTtsProvider {
const { coreConfig, ttsOverride, runtime } = params;
const mergedConfig = applyTtsOverride(coreConfig, ttsOverride);
return {
synthesizeForTelephony: async (text: string) => {
const result = await runtime.textToSpeechTelephony({
text,
cfg: mergedConfig,
});
if (!result.success || !result.audioBuffer || !result.sampleRate) {
throw new Error(result.error ?? "TTS conversion failed");
}
return convertPcmToMulaw8k(result.audioBuffer, result.sampleRate);
},
};
}
function applyTtsOverride(
coreConfig: CoreConfig,
override?: VoiceCallTtsConfig,
): CoreConfig {
if (!override) return coreConfig;
const base = coreConfig.messages?.tts;
const merged = mergeTtsConfig(base, override);
if (!merged) return coreConfig;
return {
...coreConfig,
messages: {
...(coreConfig.messages ?? {}),
tts: merged,
},
};
}
function mergeTtsConfig(
base?: VoiceCallTtsConfig,
override?: VoiceCallTtsConfig,
): VoiceCallTtsConfig | undefined {
if (!base && !override) return undefined;
if (!override) return base;
if (!base) return override;
return deepMerge(base, override);
}
function deepMerge<T>(base: T, override: T): T {
if (!isPlainObject(base) || !isPlainObject(override)) {
return override;
}
const result: Record<string, unknown> = { ...base };
for (const [key, value] of Object.entries(override)) {
if (value === undefined) continue;
const existing = (base as Record<string, unknown>)[key];
if (isPlainObject(existing) && isPlainObject(value)) {
result[key] = deepMerge(existing, value);
} else {
result[key] = value;
}
}
return result as T;
}
function isPlainObject(value: unknown): value is Record<string, unknown> {
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
}

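To make the merge semantics concrete (internal helpers, shown for illustration): nested objects merge key-by-key, scalar overrides win, and undefined override values leave the base untouched.

const base: VoiceCallTtsConfig = {
  provider: "openai",
  openai: { voice: "alloy", model: "tts-1" },
};
const override: VoiceCallTtsConfig = {
  openai: { voice: "marin" },
};
// mergeTtsConfig(base, override) yields:
// { provider: "openai", openai: { voice: "marin", model: "tts-1" } }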
View File

@@ -78,6 +78,11 @@ export class VoiceCallWebhookServer {
`[voice-call] Transcript for ${providerCallId}: ${transcript}`,
);
// Clear the TTS queue on barge-in (the user started speaking; interrupt current playback)
if (this.provider.name === "twilio") {
(this.provider as TwilioProvider).clearTtsQueue(providerCallId);
}
// Look up our internal call ID from the provider call ID
const call = this.manager.getCallByProviderCallId(providerCallId);
if (!call) {
@@ -109,6 +114,11 @@ export class VoiceCallWebhookServer {
});
}
},
onSpeechStart: (providerCallId) => {
if (this.provider.name === "twilio") {
(this.provider as TwilioProvider).clearTtsQueue(providerCallId);
}
},
onPartialTranscript: (callId, partial) => {
console.log(`[voice-call] Partial for ${callId}: ${partial}`);
},