refactor: align voice-call TTS with core config

2026-01-25 09:29:50 +00:00
parent 9366cbc7db
commit 83f92e34af
18 changed files with 769 additions and 69 deletions
--- a/src/plugins/runtime/index.ts
+++ b/src/plugins/runtime/index.ts
@@ -124,6 +124,7 @@ import { startWebLoginWithQr, waitForWebLogin } from "../../web/login-qr.js";
 import { sendMessageWhatsApp, sendPollWhatsApp } from "../../web/outbound.js";
 import { registerMemoryCli } from "../../cli/memory-cli.js";
 import { formatNativeDependencyHint } from "./native-deps.js";
+import { textToSpeechTelephony } from "../../tts/tts.js";

 import type { PluginRuntime } from "./types.js";

@@ -162,6 +163,9 @@ export function createPluginRuntime(): PluginRuntime {
      getImageMetadata,
      resizeToJpeg,
    },
+    tts: {
+      textToSpeechTelephony,
+    },
    tools: {
      createMemoryGetTool,
      createMemorySearchTool,
--- a/src/plugins/runtime/types.ts
+++ b/src/plugins/runtime/types.ts
@@ -16,6 +16,7 @@ type UpsertChannelPairingRequest =
  typeof import("../../pairing/pairing-store.js").upsertChannelPairingRequest;
 type FetchRemoteMedia = typeof import("../../media/fetch.js").fetchRemoteMedia;
 type SaveMediaBuffer = typeof import("../../media/store.js").saveMediaBuffer;
+type TextToSpeechTelephony = typeof import("../../tts/tts.js").textToSpeechTelephony; // function type taken directly from the tts module export
 type BuildMentionRegexes = typeof import("../../auto-reply/reply/mentions.js").buildMentionRegexes;
 type MatchesMentionPatterns =
  typeof import("../../auto-reply/reply/mentions.js").matchesMentionPatterns;
@@ -173,6 +174,9 @@ export type PluginRuntime = {
    getImageMetadata: GetImageMetadata;
    resizeToJpeg: ResizeToJpeg;
  };
+  tts: {
+    textToSpeechTelephony: TextToSpeechTelephony;
+  };
  tools: {
    createMemoryGetTool: CreateMemoryGetTool;
    createMemorySearchTool: CreateMemorySearchTool;
--- a/src/plugins/voice-call.plugin.test.ts
+++ b/src/plugins/voice-call.plugin.test.ts
@@ -43,6 +43,7 @@ function setup(config: Record<string, unknown>): Registered {
    source: "test",
    config: {},
    pluginConfig: config,
+    runtime: { tts: { textToSpeechTelephony: vi.fn() } },
    logger: noopLogger,
    registerGatewayMethod: (method, handler) => methods.set(method, handler),
    registerTool: (tool) => tools.push(tool),
@@ -142,6 +143,7 @@ describe("voice-call plugin", () => {
      source: "test",
      config: {},
      pluginConfig: { provider: "mock" },
+      runtime: { tts: { textToSpeechTelephony: vi.fn() } },
      logger: noopLogger,
      registerGatewayMethod: () => {},
      registerTool: () => {},
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@@ -76,6 +76,11 @@ const DEFAULT_OUTPUT = {
  voiceCompatible: false,
 };

+const TELEPHONY_OUTPUT = { // per-provider audio output settings read by textToSpeechTelephony
+  openai: { format: "pcm" as const, sampleRate: 24000 }, // format is passed to openaiTTS as responseFormat, whose type is a literal union (hence `as const`)
+  elevenlabs: { format: "pcm_22050", sampleRate: 22050 }, // format is passed to elevenLabsTTS as outputFormat
+}; // sampleRate is echoed back in the result; presumably Hz matching each provider's format — TODO confirm against provider docs
+
 const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);

 export type ResolvedTtsConfig = {
@@ -180,6 +185,16 @@ export type TtsResult = {
  voiceCompatible?: boolean;
 };

+export type TtsTelephonyResult = { // outcome of textToSpeechTelephony; only `success` is always present
+  success: boolean; // true when a provider returned audio
+  audioBuffer?: Buffer; // audio returned by the provider call (set on success)
+  error?: string; // failure description (set when success is false)
+  latencyMs?: number; // milliseconds spent on the successful provider attempt only (set on success)
+  provider?: string; // provider that produced the audio (set on success)
+  outputFormat?: string; // format identifier that was requested from the provider (set on success)
+  sampleRate?: number; // sample rate paired with outputFormat; presumably Hz — TODO confirm
+};
+
 type TtsStatusEntry = {
  timestamp: number;
  success: boolean;
@@ -980,7 +995,7 @@ async function openaiTTS(params: {
  apiKey: string;
  model: string;
  voice: string;
-  responseFormat: "mp3" | "opus";
+  responseFormat: "mp3" | "opus" | "pcm";
  timeoutMs: number;
 }): Promise<Buffer> {
  const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
@@ -1224,6 +1239,100 @@ export async function textToSpeech(params: {
  };
 }

+export async function textToSpeechTelephony(params: { // synthesizes params.text with the first usable provider, using the TELEPHONY_OUTPUT formats
+  text: string; // text to synthesize; rejected when longer than config.maxTextLength
+  cfg: ClawdbotConfig; // input to resolveTtsConfig
+  prefsPath?: string; // optional override; defaults to resolveTtsPrefsPath(config)
+}): Promise<TtsTelephonyResult> { // provider failures are reported as { success: false, error } rather than thrown
+  const config = resolveTtsConfig(params.cfg);
+  const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
+
+  if (params.text.length > config.maxTextLength) { // length guard runs before any provider is contacted
+    return {
+      success: false,
+      error: `Text too long (${params.text.length} chars, max ${config.maxTextLength})`,
+    };
+  }
+
+  const userProvider = getTtsProvider(config, prefsPath);
+  const providers = resolveTtsProviderOrder(userProvider); // presumably the preferred provider first, then fallbacks — confirm in resolveTtsProviderOrder
+
+  let lastError: string | undefined; // only the most recent failure survives into the final error message
+
+  for (const provider of providers) {
+    const providerStart = Date.now(); // latency is measured per attempt, not across earlier failed providers
+    try {
+      if (provider === "edge") { // edge is never attempted for telephony
+        lastError = "edge: unsupported for telephony";
+        continue;
+      }
+
+      const apiKey = resolveTtsApiKey(config, provider);
+      if (!apiKey) { // no key: record the reason and move on to the next provider
+        lastError = `No API key for ${provider}`;
+        continue;
+      }
+
+      if (provider === "elevenlabs") {
+        const output = TELEPHONY_OUTPUT.elevenlabs;
+        const audioBuffer = await elevenLabsTTS({
+          text: params.text,
+          apiKey,
+          baseUrl: config.elevenlabs.baseUrl,
+          voiceId: config.elevenlabs.voiceId,
+          modelId: config.elevenlabs.modelId,
+          outputFormat: output.format, // telephony format replaces whatever format regular TTS would use
+          seed: config.elevenlabs.seed,
+          applyTextNormalization: config.elevenlabs.applyTextNormalization,
+          languageCode: config.elevenlabs.languageCode,
+          voiceSettings: config.elevenlabs.voiceSettings,
+          timeoutMs: config.timeoutMs,
+        });
+
+        return { // first success wins; remaining providers are not tried
+          success: true,
+          audioBuffer,
+          latencyMs: Date.now() - providerStart,
+          provider,
+          outputFormat: output.format,
+          sampleRate: output.sampleRate,
+        };
+      }
+
+      const output = TELEPHONY_OUTPUT.openai; // any provider other than edge/elevenlabs takes this OpenAI path
+      const audioBuffer = await openaiTTS({
+        text: params.text,
+        apiKey,
+        model: config.openai.model,
+        voice: config.openai.voice,
+        responseFormat: output.format,
+        timeoutMs: config.timeoutMs,
+      });
+
+      return { // first success wins; remaining providers are not tried
+        success: true,
+        audioBuffer,
+        latencyMs: Date.now() - providerStart,
+        provider,
+        outputFormat: output.format,
+        sampleRate: output.sampleRate,
+      };
+    } catch (err) { // a failed attempt is recorded and the loop continues with the next provider
+      const error = err as Error; // NOTE(review): unchecked cast — a non-Error throw would make the message read "<provider>: undefined"
+      if (error.name === "AbortError") { // presumably raised when the timeoutMs abort fires — confirm in the provider helpers
+        lastError = `${provider}: request timed out`;
+      } else {
+        lastError = `${provider}: ${error.message}`;
+      }
+    }
+  }
+
+  return { // reached only when every provider was skipped or threw
+    success: false,
+    error: `TTS conversion failed: ${lastError || "no providers available"}`,
+  };
+}
+
 export async function maybeApplyTtsToPayload(params: {
  payload: ReplyPayload;
  cfg: ClawdbotConfig;