refactor: align voice-call TTS with core config
This commit is contained in:
@@ -124,6 +124,7 @@ import { startWebLoginWithQr, waitForWebLogin } from "../../web/login-qr.js";
|
||||
import { sendMessageWhatsApp, sendPollWhatsApp } from "../../web/outbound.js";
|
||||
import { registerMemoryCli } from "../../cli/memory-cli.js";
|
||||
import { formatNativeDependencyHint } from "./native-deps.js";
|
||||
import { textToSpeechTelephony } from "../../tts/tts.js";
|
||||
|
||||
import type { PluginRuntime } from "./types.js";
|
||||
|
||||
@@ -162,6 +163,9 @@ export function createPluginRuntime(): PluginRuntime {
|
||||
getImageMetadata,
|
||||
resizeToJpeg,
|
||||
},
|
||||
tts: {
|
||||
textToSpeechTelephony,
|
||||
},
|
||||
tools: {
|
||||
createMemoryGetTool,
|
||||
createMemorySearchTool,
|
||||
|
||||
@@ -16,6 +16,7 @@ type UpsertChannelPairingRequest =
|
||||
typeof import("../../pairing/pairing-store.js").upsertChannelPairingRequest;
|
||||
/** Function type of `fetchRemoteMedia` as exported by `../../media/fetch.js`, derived via `typeof import(...)`. */
type FetchRemoteMedia = typeof import("../../media/fetch.js").fetchRemoteMedia;
|
||||
/** Function type of `saveMediaBuffer` as exported by `../../media/store.js`, derived via `typeof import(...)`. */
type SaveMediaBuffer = typeof import("../../media/store.js").saveMediaBuffer;
|
||||
/**
 * Function type of `textToSpeechTelephony` as exported by `../../tts/tts.js`.
 * Derived with `typeof import(...)` so the alias tracks the implementation's signature.
 */
type TextToSpeechTelephony = typeof import("../../tts/tts.js").textToSpeechTelephony;
|
||||
/** Function type of `buildMentionRegexes` as exported by `../../auto-reply/reply/mentions.js`. */
type BuildMentionRegexes = typeof import("../../auto-reply/reply/mentions.js").buildMentionRegexes;
|
||||
/** Function type of `matchesMentionPatterns` as exported by `../../auto-reply/reply/mentions.js`. */
type MatchesMentionPatterns =
|
||||
typeof import("../../auto-reply/reply/mentions.js").matchesMentionPatterns;
|
||||
@@ -173,6 +174,9 @@ export type PluginRuntime = {
|
||||
getImageMetadata: GetImageMetadata;
|
||||
resizeToJpeg: ResizeToJpeg;
|
||||
};
|
||||
tts: {
|
||||
textToSpeechTelephony: TextToSpeechTelephony;
|
||||
};
|
||||
tools: {
|
||||
createMemoryGetTool: CreateMemoryGetTool;
|
||||
createMemorySearchTool: CreateMemorySearchTool;
|
||||
|
||||
@@ -43,6 +43,7 @@ function setup(config: Record<string, unknown>): Registered {
|
||||
source: "test",
|
||||
config: {},
|
||||
pluginConfig: config,
|
||||
runtime: { tts: { textToSpeechTelephony: vi.fn() } },
|
||||
logger: noopLogger,
|
||||
registerGatewayMethod: (method, handler) => methods.set(method, handler),
|
||||
registerTool: (tool) => tools.push(tool),
|
||||
@@ -142,6 +143,7 @@ describe("voice-call plugin", () => {
|
||||
source: "test",
|
||||
config: {},
|
||||
pluginConfig: { provider: "mock" },
|
||||
runtime: { tts: { textToSpeechTelephony: vi.fn() } },
|
||||
logger: noopLogger,
|
||||
registerGatewayMethod: () => {},
|
||||
registerTool: () => {},
|
||||
|
||||
111
src/tts/tts.ts
111
src/tts/tts.ts
@@ -76,6 +76,11 @@ const DEFAULT_OUTPUT = {
|
||||
voiceCompatible: false,
|
||||
};
|
||||
|
||||
/**
 * Per-provider output settings used by the telephony TTS path.
 *
 * - `format` is the value handed to the provider request (`responseFormat`
 *   for OpenAI, `outputFormat` for ElevenLabs).
 * - `sampleRate` is the rate (Hz) reported back to callers alongside the
 *   audio buffer so they can resample for the phone line.
 *
 * The whole table is frozen with `as const` so both entries keep their
 * literal types; previously only `openai.format` was a literal.
 */
const TELEPHONY_OUTPUT = {
  openai: { format: "pcm", sampleRate: 24000 },
  elevenlabs: { format: "pcm_22050", sampleRate: 22050 },
} as const;
|
||||
|
||||
const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);
|
||||
|
||||
export type ResolvedTtsConfig = {
|
||||
@@ -180,6 +185,16 @@ export type TtsResult = {
|
||||
voiceCompatible?: boolean;
|
||||
};
|
||||
|
||||
/**
 * Outcome of a telephony text-to-speech request.
 *
 * On success the audio-related fields are populated; on failure only
 * `success` and `error` are set.
 */
export type TtsTelephonyResult = {
  /** `true` when a provider produced audio, `false` otherwise. */
  success: boolean;
  /** Synthesized audio bytes; present only on success. */
  audioBuffer?: Buffer;
  /** Human-readable failure reason; present only on failure. */
  error?: string;
  /** Milliseconds spent on the provider call that succeeded. */
  latencyMs?: number;
  /** Name of the provider that produced the audio. */
  provider?: string;
  /** Format identifier that was requested from the provider. */
  outputFormat?: string;
  /** Sample rate (Hz) associated with the requested output format. */
  sampleRate?: number;
};
|
||||
|
||||
type TtsStatusEntry = {
|
||||
timestamp: number;
|
||||
success: boolean;
|
||||
@@ -980,7 +995,7 @@ async function openaiTTS(params: {
|
||||
apiKey: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
responseFormat: "mp3" | "opus";
|
||||
responseFormat: "mp3" | "opus" | "pcm";
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
|
||||
@@ -1224,6 +1239,100 @@ export async function textToSpeech(params: {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Turns an arbitrary thrown value into a short failure description.
 *
 * Values whose `name` is `"AbortError"` are reported as a timeout; otherwise
 * a string `message` is used when present, falling back to `String(err)`.
 * Never throws, even for `null`/`undefined` or primitive throwables.
 */
function describeTelephonyTtsError(err: unknown): string {
  if (typeof err === "object" && err !== null) {
    const { name, message } = err as { name?: unknown; message?: unknown };
    if (name === "AbortError") {
      return "request timed out";
    }
    if (typeof message === "string") {
      return message;
    }
  }
  return String(err);
}

/**
 * Synthesizes `text` for telephony playback using the core TTS configuration.
 *
 * Providers are tried in the order returned by `resolveTtsProviderOrder` for
 * the user's preferred provider. The first provider that returns audio wins;
 * a failing provider records its error and the next one is attempted.
 *
 * @param params.text - Text to synthesize. Rejected (without calling any
 *   provider) when longer than `config.maxTextLength`.
 * @param params.cfg - Application config from which the TTS config is resolved.
 * @param params.prefsPath - Optional override for the TTS prefs file path;
 *   defaults to `resolveTtsPrefsPath(config)`.
 * @returns A result object. On success it carries the audio buffer plus the
 *   provider name, requested output format, sample rate and call latency.
 *   On failure it carries `success: false` and an `error` describing the last
 *   provider failure (or "no providers available"). Provider failures are
 *   reported through the result rather than thrown.
 */
export async function textToSpeechTelephony(params: {
  text: string;
  cfg: ClawdbotConfig;
  prefsPath?: string;
}): Promise<TtsTelephonyResult> {
  const config = resolveTtsConfig(params.cfg);
  const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);

  if (params.text.length > config.maxTextLength) {
    return {
      success: false,
      error: `Text too long (${params.text.length} chars, max ${config.maxTextLength})`,
    };
  }

  const userProvider = getTtsProvider(config, prefsPath);
  const providers = resolveTtsProviderOrder(userProvider);

  let lastError: string | undefined;

  for (const provider of providers) {
    const providerStart = Date.now();
    try {
      // Edge is skipped outright: this path has no telephony output entry for it.
      if (provider === "edge") {
        lastError = "edge: unsupported for telephony";
        continue;
      }

      const apiKey = resolveTtsApiKey(config, provider);
      if (!apiKey) {
        lastError = `No API key for ${provider}`;
        continue;
      }

      if (provider === "elevenlabs") {
        const output = TELEPHONY_OUTPUT.elevenlabs;
        const audioBuffer = await elevenLabsTTS({
          text: params.text,
          apiKey,
          baseUrl: config.elevenlabs.baseUrl,
          voiceId: config.elevenlabs.voiceId,
          modelId: config.elevenlabs.modelId,
          outputFormat: output.format,
          seed: config.elevenlabs.seed,
          applyTextNormalization: config.elevenlabs.applyTextNormalization,
          languageCode: config.elevenlabs.languageCode,
          voiceSettings: config.elevenlabs.voiceSettings,
          timeoutMs: config.timeoutMs,
        });

        return {
          success: true,
          audioBuffer,
          latencyMs: Date.now() - providerStart,
          provider,
          outputFormat: output.format,
          sampleRate: output.sampleRate,
        };
      }

      // Any remaining provider is handled through the OpenAI request path.
      const output = TELEPHONY_OUTPUT.openai;
      const audioBuffer = await openaiTTS({
        text: params.text,
        apiKey,
        model: config.openai.model,
        voice: config.openai.voice,
        responseFormat: output.format,
        timeoutMs: config.timeoutMs,
      });

      return {
        success: true,
        audioBuffer,
        latencyMs: Date.now() - providerStart,
        provider,
        outputFormat: output.format,
        sampleRate: output.sampleRate,
      };
    } catch (err: unknown) {
      // Narrow the unknown throwable safely so a non-Error value (or null)
      // cannot make the catch block itself throw and abort the fallback loop.
      lastError = `${provider}: ${describeTelephonyTtsError(err)}`;
    }
  }

  return {
    success: false,
    error: `TTS conversion failed: ${lastError || "no providers available"}`,
  };
}
|
||||
|
||||
export async function maybeApplyTtsToPayload(params: {
|
||||
payload: ReplyPayload;
|
||||
cfg: ClawdbotConfig;
|
||||
|
||||
Reference in New Issue
Block a user