fix(voice-call): prevent audio overlap with TTS queue (#1713)

* fix(voice-call): prevent audio overlap with TTS queue Add a TTS queue to serialize audio playback and prevent overlapping speech during voice calls. Previously, concurrent speak() calls could send audio chunks simultaneously, causing garbled/choppy output. Changes: - Add queueTts() to MediaStreamHandler for sequential TTS playback - Wrap playTtsViaStream() audio sending in the queue - Clear queue on barge-in (when user starts speaking) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix(voice-call): use iterative queue processing to prevent heap exhaustion The recursive processQueue() pattern accumulated stack frames, causing JavaScript heap out of memory errors on macOS CI. Convert to while loop for constant stack usage regardless of queue depth. * fix: prevent voice-call TTS overlap (#1713) (thanks @dguido) --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com> Co-authored-by: Peter Steinberger <steipete@gmail.com>
2026-01-25 07:02:17 -05:00
parent 875b018ea1
commit 101d0f451f
6 changed files with 259 additions and 11 deletions
--- a/extensions/voice-call/src/providers/stt-openai-realtime.ts
+++ b/extensions/voice-call/src/providers/stt-openai-realtime.ts
@@ -38,6 +38,8 @@ export interface RealtimeSTTSession {
  onPartial(callback: (partial: string) => void): void;
  /** Set callback for final transcripts */
  onTranscript(callback: (transcript: string) => void): void;
+  /** Set callback when speech starts (VAD) */
+  onSpeechStart(callback: () => void): void;
  /** Close the session */
  close(): void;
  /** Check if session is connected */
@@ -91,6 +93,7 @@ class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
  private pendingTranscript = "";
  private onTranscriptCallback: ((transcript: string) => void) | null = null;
  private onPartialCallback: ((partial: string) => void) | null = null;
+  private onSpeechStartCallback: (() => void) | null = null;

  constructor(
    private readonly apiKey: string,
@@ -243,6 +246,7 @@ class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
      case "input_audio_buffer.speech_started":
        console.log("[RealtimeSTT] Speech started");
        this.pendingTranscript = "";
+        this.onSpeechStartCallback?.();
        break;

      case "error":
@@ -273,6 +277,10 @@ class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
    this.onTranscriptCallback = callback;
  }

+  onSpeechStart(callback: () => void): void {
+    this.onSpeechStartCallback = callback;
+  }
+
  async waitForTranscript(timeoutMs = 30000): Promise<string> {
    return new Promise((resolve, reject) => {
      const timeout = setTimeout(() => {
--- a/extensions/voice-call/src/providers/twilio.ts
+++ b/extensions/voice-call/src/providers/twilio.ts
@@ -135,6 +135,17 @@ export class TwilioProvider implements VoiceCallProvider {
    this.callStreamMap.delete(callSid);
  }

+  /**
+   * Clear TTS queue for a call (barge-in).
+   * Used when user starts speaking to interrupt current TTS playback.
+   */
+  clearTtsQueue(callSid: string): void {
+    const streamSid = this.callStreamMap.get(callSid);
+    if (streamSid && this.mediaStreamHandler) {
+      this.mediaStreamHandler.clearTtsQueue(streamSid);
+    }
+  }
+
  /**
   * Make an authenticated request to the Twilio API.
   */
@@ -504,7 +515,7 @@ export class TwilioProvider implements VoiceCallProvider {
  /**
   * Play TTS via core TTS and Twilio Media Streams.
   * Generates audio with core TTS, converts to mu-law, and streams via WebSocket.
-   * Uses a jitter buffer to smooth out timing variations.
+   * Uses a queue to serialize playback and prevent overlapping audio.
   */
  private async playTtsViaStream(
    text: string,
@@ -514,22 +525,29 @@ export class TwilioProvider implements VoiceCallProvider {
      throw new Error("TTS provider and media stream handler required");
    }

-    // Generate audio with core TTS (returns mu-law at 8kHz)
-    const muLawAudio = await this.ttsProvider.synthesizeForTelephony(text);
-
    // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
    const CHUNK_SIZE = 160;
    const CHUNK_DELAY_MS = 20;

-    for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
-      this.mediaStreamHandler.sendAudio(streamSid, chunk);
+    const handler = this.mediaStreamHandler;
+    const ttsProvider = this.ttsProvider;
+    await handler.queueTts(streamSid, async (signal) => {
+      // Generate audio with core TTS (returns mu-law at 8kHz)
+      const muLawAudio = await ttsProvider.synthesizeForTelephony(text);
+      for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
+        if (signal.aborted) break;
+        handler.sendAudio(streamSid, chunk);

-      // Pace the audio to match real-time playback
-      await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
-    }
+        // Pace the audio to match real-time playback
+        await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
+        if (signal.aborted) break;
+      }

-    // Send a mark to track when audio finishes
-    this.mediaStreamHandler.sendMark(streamSid, `tts-${Date.now()}`);
+      if (!signal.aborted) {
+        // Send a mark to track when audio finishes
+        handler.sendMark(streamSid, `tts-${Date.now()}`);
+      }
+    });
  }

  /**