refactor: align voice-call TTS with core config

This commit is contained in:
Peter Steinberger
2026-01-25 09:29:50 +00:00
parent 9366cbc7db
commit 83f92e34af
18 changed files with 769 additions and 69 deletions

View File

@@ -124,6 +124,7 @@ import { startWebLoginWithQr, waitForWebLogin } from "../../web/login-qr.js";
import { sendMessageWhatsApp, sendPollWhatsApp } from "../../web/outbound.js";
import { registerMemoryCli } from "../../cli/memory-cli.js";
import { formatNativeDependencyHint } from "./native-deps.js";
import { textToSpeechTelephony } from "../../tts/tts.js";
import type { PluginRuntime } from "./types.js";
@@ -162,6 +163,9 @@ export function createPluginRuntime(): PluginRuntime {
getImageMetadata,
resizeToJpeg,
},
tts: {
textToSpeechTelephony,
},
tools: {
createMemoryGetTool,
createMemorySearchTool,

View File

@@ -16,6 +16,7 @@ type UpsertChannelPairingRequest =
typeof import("../../pairing/pairing-store.js").upsertChannelPairingRequest;
type FetchRemoteMedia = typeof import("../../media/fetch.js").fetchRemoteMedia;
type SaveMediaBuffer = typeof import("../../media/store.js").saveMediaBuffer;
type TextToSpeechTelephony = typeof import("../../tts/tts.js").textToSpeechTelephony;
type BuildMentionRegexes = typeof import("../../auto-reply/reply/mentions.js").buildMentionRegexes;
type MatchesMentionPatterns =
typeof import("../../auto-reply/reply/mentions.js").matchesMentionPatterns;
@@ -173,6 +174,9 @@ export type PluginRuntime = {
getImageMetadata: GetImageMetadata;
resizeToJpeg: ResizeToJpeg;
};
tts: {
textToSpeechTelephony: TextToSpeechTelephony;
};
tools: {
createMemoryGetTool: CreateMemoryGetTool;
createMemorySearchTool: CreateMemorySearchTool;

View File

@@ -43,6 +43,7 @@ function setup(config: Record<string, unknown>): Registered {
source: "test",
config: {},
pluginConfig: config,
runtime: { tts: { textToSpeechTelephony: vi.fn() } },
logger: noopLogger,
registerGatewayMethod: (method, handler) => methods.set(method, handler),
registerTool: (tool) => tools.push(tool),
@@ -142,6 +143,7 @@ describe("voice-call plugin", () => {
source: "test",
config: {},
pluginConfig: { provider: "mock" },
runtime: { tts: { textToSpeechTelephony: vi.fn() } },
logger: noopLogger,
registerGatewayMethod: () => {},
registerTool: () => {},

View File

@@ -76,6 +76,11 @@ const DEFAULT_OUTPUT = {
voiceCompatible: false,
};
const TELEPHONY_OUTPUT = {
openai: { format: "pcm" as const, sampleRate: 24000 },
elevenlabs: { format: "pcm_22050", sampleRate: 22050 },
};
const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);
export type ResolvedTtsConfig = {
@@ -180,6 +185,16 @@ export type TtsResult = {
voiceCompatible?: boolean;
};
export type TtsTelephonyResult = {
success: boolean;
audioBuffer?: Buffer;
error?: string;
latencyMs?: number;
provider?: string;
outputFormat?: string;
sampleRate?: number;
};
type TtsStatusEntry = {
timestamp: number;
success: boolean;
@@ -980,7 +995,7 @@ async function openaiTTS(params: {
apiKey: string;
model: string;
voice: string;
responseFormat: "mp3" | "opus";
responseFormat: "mp3" | "opus" | "pcm";
timeoutMs: number;
}): Promise<Buffer> {
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
@@ -1224,6 +1239,100 @@ export async function textToSpeech(params: {
};
}
export async function textToSpeechTelephony(params: {
text: string;
cfg: ClawdbotConfig;
prefsPath?: string;
}): Promise<TtsTelephonyResult> {
const config = resolveTtsConfig(params.cfg);
const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
if (params.text.length > config.maxTextLength) {
return {
success: false,
error: `Text too long (${params.text.length} chars, max ${config.maxTextLength})`,
};
}
const userProvider = getTtsProvider(config, prefsPath);
const providers = resolveTtsProviderOrder(userProvider);
let lastError: string | undefined;
for (const provider of providers) {
const providerStart = Date.now();
try {
if (provider === "edge") {
lastError = "edge: unsupported for telephony";
continue;
}
const apiKey = resolveTtsApiKey(config, provider);
if (!apiKey) {
lastError = `No API key for ${provider}`;
continue;
}
if (provider === "elevenlabs") {
const output = TELEPHONY_OUTPUT.elevenlabs;
const audioBuffer = await elevenLabsTTS({
text: params.text,
apiKey,
baseUrl: config.elevenlabs.baseUrl,
voiceId: config.elevenlabs.voiceId,
modelId: config.elevenlabs.modelId,
outputFormat: output.format,
seed: config.elevenlabs.seed,
applyTextNormalization: config.elevenlabs.applyTextNormalization,
languageCode: config.elevenlabs.languageCode,
voiceSettings: config.elevenlabs.voiceSettings,
timeoutMs: config.timeoutMs,
});
return {
success: true,
audioBuffer,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: output.format,
sampleRate: output.sampleRate,
};
}
const output = TELEPHONY_OUTPUT.openai;
const audioBuffer = await openaiTTS({
text: params.text,
apiKey,
model: config.openai.model,
voice: config.openai.voice,
responseFormat: output.format,
timeoutMs: config.timeoutMs,
});
return {
success: true,
audioBuffer,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: output.format,
sampleRate: output.sampleRate,
};
} catch (err) {
const error = err as Error;
if (error.name === "AbortError") {
lastError = `${provider}: request timed out`;
} else {
lastError = `${provider}: ${error.message}`;
}
}
}
return {
success: false,
error: `TTS conversion failed: ${lastError || "no providers available"}`,
};
}
export async function maybeApplyTtsToPayload(params: {
payload: ReplyPayload;
cfg: ClawdbotConfig;