feat: move TTS into core (#1559) (thanks @Glucksberg)

Author: Peter Steinberger
Date: 2026-01-24 07:57:46 +00:00
Parent: aef88cd9f1
Commit: d9a467fe3b
26 changed files with 1522 additions and 1649 deletions

View File

@@ -17,6 +17,7 @@ import { createSessionsListTool } from "./tools/sessions-list-tool.js";
import { createSessionsSendTool } from "./tools/sessions-send-tool.js";
import { createSessionsSpawnTool } from "./tools/sessions-spawn-tool.js";
import { createWebFetchTool, createWebSearchTool } from "./tools/web-tools.js";
import { createTtsTool } from "./tools/tts-tool.js";
export function createClawdbotTools(options?: {
browserControlUrl?: string;
@@ -96,6 +97,10 @@ export function createClawdbotTools(options?: {
replyToMode: options?.replyToMode,
hasRepliedRef: options?.hasRepliedRef,
}),
createTtsTool({
agentChannel: options?.agentChannel,
config: options?.config,
}),
createGatewayTool({
agentSessionKey: options?.agentSessionKey,
config: options?.config,

View File

@@ -0,0 +1,60 @@
import { Type } from "@sinclair/typebox";
import { loadConfig } from "../../config/config.js";
import type { ClawdbotConfig } from "../../config/config.js";
import type { GatewayMessageChannel } from "../../utils/message-channel.js";
import { textToSpeech } from "../../tts/tts.js";
import type { AnyAgentTool } from "./common.js";
import { readStringParam } from "./common.js";
const TtsToolSchema = Type.Object({
text: Type.String({ description: "Text to convert to speech." }),
channel: Type.Optional(
Type.String({ description: "Optional channel id to pick output format (e.g. telegram)." }),
),
});
export function createTtsTool(opts?: {
config?: ClawdbotConfig;
agentChannel?: GatewayMessageChannel;
}): AnyAgentTool {
return {
label: "TTS",
name: "tts",
description:
"Convert text to speech and return a MEDIA: path. Use when the user requests audio or TTS is enabled. Copy the MEDIA line exactly.",
parameters: TtsToolSchema,
execute: async (_toolCallId, args) => {
const params = args as Record<string, unknown>;
const text = readStringParam(params, "text", { required: true });
const channel = readStringParam(params, "channel");
const cfg = opts?.config ?? loadConfig();
const result = await textToSpeech({
text,
cfg,
channel: channel ?? opts?.agentChannel,
});
if (result.success && result.audioPath) {
const lines: string[] = [];
// Tag Telegram Opus output as a voice bubble instead of a file attachment.
if (result.voiceCompatible) lines.push("[[audio_as_voice]]");
lines.push(`MEDIA:${result.audioPath}`);
return {
content: [{ type: "text", text: lines.join("\n") }],
details: { audioPath: result.audioPath, provider: result.provider },
};
}
return {
content: [
{
type: "text",
text: result.error ?? "TTS conversion failed",
},
],
details: { error: result.error },
};
},
};
}
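
For reference, a successful call returns a text block that downstream delivery parses verbatim; a sketch of the shape (the temp path is generated at runtime and shown here illustratively):

// Illustrative only: what the tool returns for a Telegram request.
const exampleResult = {
  content: [
    {
      type: "text",
      text: "[[audio_as_voice]]\nMEDIA:/tmp/tts-abc123/voice-1706083200000.opus",
    },
  ],
  details: { audioPath: "/tmp/tts-abc123/voice-1706083200000.opus", provider: "openai" },
};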

View File

@@ -272,6 +272,81 @@ function buildChatCommands(): ChatCommandDefinition[] {
],
argsMenu: "auto",
}),
defineChatCommand({
key: "audio",
nativeName: "audio",
description: "Convert text to a TTS audio reply.",
textAlias: "/audio",
args: [
{
name: "text",
description: "Text to speak",
type: "string",
captureRemaining: true,
},
],
}),
defineChatCommand({
key: "tts_on",
nativeName: "tts_on",
description: "Enable text-to-speech for replies.",
textAlias: "/tts_on",
}),
defineChatCommand({
key: "tts_off",
nativeName: "tts_off",
description: "Disable text-to-speech for replies.",
textAlias: "/tts_off",
}),
defineChatCommand({
key: "tts_provider",
nativeName: "tts_provider",
description: "Set or show the TTS provider.",
textAlias: "/tts_provider",
args: [
{
name: "provider",
description: "openai or elevenlabs",
type: "string",
choices: ["openai", "elevenlabs"],
},
],
argsMenu: "auto",
}),
defineChatCommand({
key: "tts_limit",
nativeName: "tts_limit",
description: "Set or show the max TTS text length.",
textAlias: "/tts_limit",
args: [
{
name: "maxLength",
description: "Max chars before summarizing",
type: "number",
},
],
}),
defineChatCommand({
key: "tts_summary",
nativeName: "tts_summary",
description: "Enable or disable TTS auto-summary.",
textAlias: "/tts_summary",
args: [
{
name: "mode",
description: "on or off",
type: "string",
choices: ["on", "off"],
},
],
argsMenu: "auto",
}),
defineChatCommand({
key: "tts_status",
nativeName: "tts_status",
description: "Show TTS status and last attempt.",
textAlias: "/tts_status",
}),
defineChatCommand({
key: "stop",
nativeName: "stop",

View File

@@ -16,6 +16,7 @@ import {
import { handleAllowlistCommand } from "./commands-allowlist.js";
import { handleSubagentsCommand } from "./commands-subagents.js";
import { handleModelsCommand } from "./commands-models.js";
import { handleTtsCommands } from "./commands-tts.js";
import {
handleAbortTrigger,
handleActivationCommand,
@@ -39,6 +40,7 @@ const HANDLERS: CommandHandler[] = [
handleSendPolicyCommand,
handleUsageCommand,
handleRestartCommand,
handleTtsCommands,
handleHelpCommand,
handleCommandsListCommand,
handleStatusCommand,

View File

@@ -0,0 +1,214 @@
import { logVerbose } from "../../globals.js";
import type { ReplyPayload } from "../types.js";
import type { CommandHandler } from "./commands-types.js";
import {
getLastTtsAttempt,
getTtsMaxLength,
getTtsProvider,
isSummarizationEnabled,
isTtsEnabled,
resolveTtsApiKey,
resolveTtsConfig,
resolveTtsPrefsPath,
setLastTtsAttempt,
setSummarizationEnabled,
setTtsEnabled,
setTtsMaxLength,
setTtsProvider,
textToSpeech,
} from "../../tts/tts.js";
function parseCommandArg(normalized: string, command: string): string | null {
if (normalized === command) return "";
if (normalized.startsWith(`${command} `)) return normalized.slice(command.length).trim();
return null;
}
export const handleTtsCommands: CommandHandler = async (params, allowTextCommands) => {
if (!allowTextCommands) return null;
const normalized = params.command.commandBodyNormalized;
if (
!normalized.startsWith("/tts_") &&
normalized !== "/audio" &&
!normalized.startsWith("/audio ")
) {
return null;
}
if (!params.command.isAuthorizedSender) {
logVerbose(
`Ignoring TTS command from unauthorized sender: ${params.command.senderId || "<unknown>"}`,
);
return { shouldContinue: false };
}
const config = resolveTtsConfig(params.cfg);
const prefsPath = resolveTtsPrefsPath(config);
if (normalized === "/tts_on") {
setTtsEnabled(prefsPath, true);
return { shouldContinue: false, reply: { text: "🔊 TTS enabled." } };
}
if (normalized === "/tts_off") {
setTtsEnabled(prefsPath, false);
return { shouldContinue: false, reply: { text: "🔇 TTS disabled." } };
}
const audioArg = parseCommandArg(normalized, "/audio");
if (audioArg !== null) {
if (!audioArg.trim()) {
return { shouldContinue: false, reply: { text: "⚙️ Usage: /audio <text>" } };
}
const start = Date.now();
const result = await textToSpeech({
text: audioArg,
cfg: params.cfg,
channel: params.command.channel,
prefsPath,
});
if (result.success && result.audioPath) {
setLastTtsAttempt({
timestamp: Date.now(),
success: true,
textLength: audioArg.length,
summarized: false,
provider: result.provider,
latencyMs: result.latencyMs,
});
const payload: ReplyPayload = {
mediaUrl: result.audioPath,
audioAsVoice: result.voiceCompatible === true,
};
return { shouldContinue: false, reply: payload };
}
setLastTtsAttempt({
timestamp: Date.now(),
success: false,
textLength: audioArg.length,
summarized: false,
error: result.error,
latencyMs: Date.now() - start,
});
return {
shouldContinue: false,
reply: { text: `❌ Error generating audio: ${result.error ?? "unknown error"}` },
};
}
const providerArg = parseCommandArg(normalized, "/tts_provider");
if (providerArg !== null) {
const currentProvider = getTtsProvider(config, prefsPath);
if (!providerArg.trim()) {
const fallback = currentProvider === "openai" ? "elevenlabs" : "openai";
const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai"));
const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs"));
return {
shouldContinue: false,
reply: {
text:
`🎙️ TTS provider\n` +
`Primary: ${currentProvider}\n` +
`Fallback: ${fallback}\n` +
`OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` +
`ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` +
`Usage: /tts_provider openai | elevenlabs`,
},
};
}
const requested = providerArg.trim().toLowerCase();
if (requested !== "openai" && requested !== "elevenlabs") {
return {
shouldContinue: false,
reply: { text: "⚙️ Usage: /tts_provider openai | elevenlabs" },
};
}
setTtsProvider(prefsPath, requested);
const fallback = requested === "openai" ? "elevenlabs" : "openai";
return {
shouldContinue: false,
reply: { text: `✅ TTS provider set to ${requested} (fallback: ${fallback}).` },
};
}
const limitArg = parseCommandArg(normalized, "/tts_limit");
if (limitArg !== null) {
if (!limitArg.trim()) {
const currentLimit = getTtsMaxLength(prefsPath);
return {
shouldContinue: false,
reply: { text: `📏 TTS limit: ${currentLimit} characters.` },
};
}
const next = Number.parseInt(limitArg.trim(), 10);
if (!Number.isFinite(next) || next < 100 || next > 10_000) {
return {
shouldContinue: false,
reply: { text: "⚙️ Usage: /tts_limit <100-10000>" },
};
}
setTtsMaxLength(prefsPath, next);
return {
shouldContinue: false,
reply: { text: `✅ TTS limit set to ${next} characters.` },
};
}
const summaryArg = parseCommandArg(normalized, "/tts_summary");
if (summaryArg !== null) {
if (!summaryArg.trim()) {
const enabled = isSummarizationEnabled(prefsPath);
return {
shouldContinue: false,
reply: { text: `📝 TTS auto-summary: ${enabled ? "on" : "off"}.` },
};
}
const requested = summaryArg.trim().toLowerCase();
if (requested !== "on" && requested !== "off") {
return { shouldContinue: false, reply: { text: "⚙️ Usage: /tts_summary on|off" } };
}
setSummarizationEnabled(prefsPath, requested === "on");
return {
shouldContinue: false,
reply: {
text: requested === "on" ? "✅ TTS auto-summary enabled." : "❌ TTS auto-summary disabled.",
},
};
}
if (normalized === "/tts_status") {
const enabled = isTtsEnabled(config, prefsPath);
const provider = getTtsProvider(config, prefsPath);
const hasKey = Boolean(resolveTtsApiKey(config, provider));
const maxLength = getTtsMaxLength(prefsPath);
const summarize = isSummarizationEnabled(prefsPath);
const last = getLastTtsAttempt();
const lines = [
"📊 TTS status",
`State: ${enabled ? "✅ enabled" : "❌ disabled"}`,
`Provider: ${provider} (${hasKey ? "✅ key" : "❌ no key"})`,
`Text limit: ${maxLength} chars`,
`Auto-summary: ${summarize ? "on" : "off"}`,
];
if (last) {
const timeAgo = Math.round((Date.now() - last.timestamp) / 1000);
lines.push("");
lines.push(`Last attempt (${timeAgo}s ago): ${last.success ? "✅" : "❌"}`);
lines.push(`Text: ${last.textLength} chars${last.summarized ? " (summarized)" : ""}`);
if (last.success) {
lines.push(`Provider: ${last.provider ?? "unknown"}`);
lines.push(`Latency: ${last.latencyMs ?? 0}ms`);
} else if (last.error) {
lines.push(`Error: ${last.error}`);
}
}
return { shouldContinue: false, reply: { text: lines.join("\n") } };
}
return null;
};
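
The parseCommandArg helper above distinguishes a bare command from one carrying arguments; its behavior, sketched (comments illustrative):

// parseCommandArg returns "" for a bare command, the trimmed argument
// string when present, and null when the command does not match.
parseCommandArg("/tts_limit", "/tts_limit");      // "" -> handler reports the current limit
parseCommandArg("/tts_limit 2000", "/tts_limit"); // "2000" -> handler sets a new limit
parseCommandArg("/tts_limits", "/tts_limit");     // null -> not this command; falls through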

View File

@@ -13,6 +13,7 @@ import { formatAbortReplyText, tryFastAbortFromMessage } from "./abort.js";
import { shouldSkipDuplicateInbound } from "./inbound-dedupe.js";
import type { ReplyDispatcher, ReplyDispatchKind } from "./reply-dispatcher.js";
import { isRoutableChannel, routeReply } from "./route-reply.js";
import { maybeApplyTtsToPayload } from "../../tts/tts.js";
export type DispatchFromConfigResult = {
queuedFinal: boolean;
@@ -91,6 +92,7 @@ export async function dispatchReplyFromConfig(params: {
const currentSurface = (ctx.Surface ?? ctx.Provider)?.toLowerCase();
const shouldRouteToOriginating =
isRoutableChannel(originatingChannel) && originatingTo && originatingChannel !== currentSurface;
const ttsChannel = shouldRouteToOriginating ? originatingChannel : currentSurface;
/**
* Helper to send a payload via route-reply (async).
@@ -164,22 +166,36 @@ export async function dispatchReplyFromConfig(params: {
{
...params.replyOptions,
onToolResult: (payload: ReplyPayload) => {
if (shouldRouteToOriginating) {
// Fire-and-forget for streaming tool results when routing.
void sendPayloadAsync(payload);
} else {
// Synchronous dispatch to preserve callback timing.
dispatcher.sendToolResult(payload);
}
const run = async () => {
const ttsPayload = await maybeApplyTtsToPayload({
payload,
cfg,
channel: ttsChannel,
kind: "tool",
});
if (shouldRouteToOriginating) {
await sendPayloadAsync(ttsPayload);
} else {
dispatcher.sendToolResult(ttsPayload);
}
};
return run();
},
onBlockReply: (payload: ReplyPayload, context) => {
if (shouldRouteToOriginating) {
// Await routed sends so upstream can enforce ordering/timeouts.
return sendPayloadAsync(payload, context?.abortSignal);
} else {
// Synchronous dispatch to preserve callback timing.
dispatcher.sendBlockReply(payload);
}
const run = async () => {
const ttsPayload = await maybeApplyTtsToPayload({
payload,
cfg,
channel: ttsChannel,
kind: "block",
});
if (shouldRouteToOriginating) {
await sendPayloadAsync(ttsPayload, context?.abortSignal);
} else {
dispatcher.sendBlockReply(ttsPayload);
}
};
return run();
},
},
cfg,
@@ -190,10 +206,16 @@ export async function dispatchReplyFromConfig(params: {
let queuedFinal = false;
let routedFinalCount = 0;
for (const reply of replies) {
const ttsReply = await maybeApplyTtsToPayload({
payload: reply,
cfg,
channel: ttsChannel,
kind: "final",
});
if (shouldRouteToOriginating && originatingChannel && originatingTo) {
// Route final reply to originating channel.
const result = await routeReply({
payload: reply,
payload: ttsReply,
channel: originatingChannel,
to: originatingTo,
sessionKey: ctx.SessionKey,
@@ -209,7 +231,7 @@ export async function dispatchReplyFromConfig(params: {
queuedFinal = result.ok || queuedFinal;
if (result.ok) routedFinalCount += 1;
} else {
queuedFinal = dispatcher.sendFinalReply(reply) || queuedFinal;
queuedFinal = dispatcher.sendFinalReply(ttsReply) || queuedFinal;
}
}
await dispatcher.waitForIdle();
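
maybeApplyTtsToPayload (defined in src/tts/tts.ts below) is a no-op unless TTS is enabled and the payload qualifies: text-only, at least 10 characters, no media attached, and no MEDIA: marker. When it applies, it attaches the generated audio; a sketch, assuming TTS is enabled in cfg:

// Illustrative: the audio path and voice flag depend on runtime config.
const payload = { text: "Here is the summary you asked for." };
const withAudio = await maybeApplyTtsToPayload({ payload, cfg, channel: "telegram", kind: "final" });
// withAudio: { text: "...", mediaUrl: "/tmp/tts-.../voice-...opus", audioAsVoice: true }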

View File

@@ -10,7 +10,6 @@
import { resolveSessionAgentId } from "../../agents/agent-scope.js";
import { resolveEffectiveMessagesConfig } from "../../agents/identity.js";
import { normalizeChannelId } from "../../channels/plugins/index.js";
import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js";
import type { ClawdbotConfig } from "../../config/config.js";
import { INTERNAL_MESSAGE_CHANNEL } from "../../utils/message-channel.js";
import type { OriginatingChannelType } from "../templating.js";
@@ -81,48 +80,6 @@ export async function routeReply(params: RouteReplyParams): Promise<RouteReplyRe
: [];
const replyToId = normalized.replyToId;
// Run message_sending hook (allows plugins to modify or cancel)
const hookRunner = getGlobalHookRunner();
const normalizedChannel = normalizeChannelId(channel);
if (hookRunner && text.trim() && normalizedChannel) {
try {
const hookResult = await hookRunner.runMessageSending(
{
to,
content: text,
metadata: { channel, accountId, threadId },
},
{
channelId: normalizedChannel,
accountId: accountId ?? undefined,
conversationId: to,
},
);
// Check if hook wants to cancel the message
if (hookResult?.cancel) {
return { ok: true }; // Silently cancel
}
// Check if hook modified the content
if (hookResult?.content !== undefined) {
// Check if the modified content contains MEDIA: directive
const mediaMatch = hookResult.content.match(/^MEDIA:(.+)$/m);
if (mediaMatch) {
// Extract media path and add to mediaUrls
const mediaPath = mediaMatch[1].trim();
mediaUrls = [mediaPath];
// Remove MEDIA: directive from text (send audio only)
text = hookResult.content.replace(/^MEDIA:.+$/m, "").trim();
} else {
text = hookResult.content;
}
}
} catch {
// Hook errors shouldn't block message sending
}
}
// Skip empty replies.
if (!text.trim() && mediaUrls.length === 0) {
return { ok: true };

View File

@@ -1,4 +1,5 @@
import type { QueueDropPolicy, QueueMode, QueueModeByProvider } from "./types.queue.js";
import type { TtsConfig } from "./types.tts.js";
export type GroupChatConfig = {
mentionPatterns?: string[];
@@ -81,6 +82,8 @@ export type MessagesConfig = {
ackReactionScope?: "group-mentions" | "group-all" | "direct" | "all";
/** Remove ack reaction after reply is sent (default: false). */
removeAckAfterReply?: boolean;
/** Text-to-speech settings for outbound replies. */
tts?: TtsConfig;
};
export type NativeCommandsSetting = boolean | "auto";

View File

@@ -23,5 +23,6 @@ export * from "./types.signal.js";
export * from "./types.skills.js";
export * from "./types.slack.js";
export * from "./types.telegram.js";
export * from "./types.tts.js";
export * from "./types.tools.js";
export * from "./types.whatsapp.js";

src/config/types.tts.ts (new file, 30 lines)
View File

@@ -0,0 +1,30 @@
export type TtsProvider = "elevenlabs" | "openai";
export type TtsMode = "final" | "all";
export type TtsConfig = {
/** Enable auto-TTS (can be overridden by local prefs). */
enabled?: boolean;
/** Apply TTS to final replies only or to all replies (tool/block/final). */
mode?: TtsMode;
/** Primary TTS provider (fallbacks are automatic). */
provider?: TtsProvider;
/** ElevenLabs configuration. */
elevenlabs?: {
apiKey?: string;
voiceId?: string;
modelId?: string;
};
/** OpenAI configuration. */
openai?: {
apiKey?: string;
model?: string;
voice?: string;
};
/** Optional path for local TTS user preferences JSON. */
prefsPath?: string;
/** Hard cap for text sent to TTS (chars). */
maxTextLength?: number;
/** API request timeout (ms). */
timeoutMs?: number;
};
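
A minimal config sketch for this shape (values illustrative; API keys may also be resolved from environment variables at runtime):

import type { TtsConfig } from "./types.tts.js";

const example: TtsConfig = {
  enabled: true,
  mode: "final",
  provider: "openai",
  openai: { model: "gpt-4o-mini-tts", voice: "alloy" }, // apiKey may come from OPENAI_API_KEY
  maxTextLength: 4000,
  timeoutMs: 30_000,
};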

View File

@@ -155,6 +155,36 @@ export const MarkdownConfigSchema = z
.strict()
.optional();
export const TtsProviderSchema = z.enum(["elevenlabs", "openai"]);
export const TtsModeSchema = z.enum(["final", "all"]);
export const TtsConfigSchema = z
.object({
enabled: z.boolean().optional(),
mode: TtsModeSchema.optional(),
provider: TtsProviderSchema.optional(),
elevenlabs: z
.object({
apiKey: z.string().optional(),
voiceId: z.string().optional(),
modelId: z.string().optional(),
})
.strict()
.optional(),
openai: z
.object({
apiKey: z.string().optional(),
model: z.string().optional(),
voice: z.string().optional(),
})
.strict()
.optional(),
prefsPath: z.string().optional(),
maxTextLength: z.number().int().min(1).optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),
})
.strict()
.optional();
export const HumanDelaySchema = z
.object({
mode: z.union([z.literal("off"), z.literal("natural"), z.literal("custom")]).optional(),

View File

@@ -5,6 +5,7 @@ import {
InboundDebounceSchema,
NativeCommandsSettingSchema,
QueueSchema,
TtsConfigSchema,
} from "./zod-schema.core.js";
const SessionResetConfigSchema = z
@@ -90,6 +91,7 @@ export const MessagesSchema = z
ackReaction: z.string().optional(),
ackReactionScope: z.enum(["group-mentions", "group-all", "direct", "all"]).optional(),
removeAckAfterReply: z.boolean().optional(),
tts: TtsConfigSchema,
})
.strict()
.optional();

View File

@@ -8,6 +8,12 @@ const BASE_METHODS = [
"status",
"usage.status",
"usage.cost",
"tts.status",
"tts.providers",
"tts.enable",
"tts.disable",
"tts.convert",
"tts.setProvider",
"config.get",
"config.set",
"config.apply",

View File

@@ -17,6 +17,7 @@ import { sessionsHandlers } from "./server-methods/sessions.js";
import { skillsHandlers } from "./server-methods/skills.js";
import { systemHandlers } from "./server-methods/system.js";
import { talkHandlers } from "./server-methods/talk.js";
import { ttsHandlers } from "./server-methods/tts.js";
import type { GatewayRequestHandlers, GatewayRequestOptions } from "./server-methods/types.js";
import { updateHandlers } from "./server-methods/update.js";
import { usageHandlers } from "./server-methods/usage.js";
@@ -53,6 +54,8 @@ const READ_METHODS = new Set([
"status",
"usage.status",
"usage.cost",
"tts.status",
"tts.providers",
"models.list",
"agents.list",
"agent.identity.get",
@@ -75,6 +78,10 @@ const WRITE_METHODS = new Set([
"agent.wait",
"wake",
"talk.mode",
"tts.enable",
"tts.disable",
"tts.convert",
"tts.setProvider",
"voicewake.set",
"node.invoke",
"chat.send",
@@ -151,6 +158,7 @@ export const coreGatewayHandlers: GatewayRequestHandlers = {
...configHandlers,
...wizardHandlers,
...talkHandlers,
...ttsHandlers,
...skillsHandlers,
...sessionsHandlers,
...systemHandlers,

View File

@@ -0,0 +1,138 @@
import { loadConfig } from "../../config/config.js";
import {
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
getTtsProvider,
isTtsEnabled,
resolveTtsApiKey,
resolveTtsConfig,
resolveTtsPrefsPath,
setTtsEnabled,
setTtsProvider,
textToSpeech,
} from "../../tts/tts.js";
import { ErrorCodes, errorShape } from "../protocol/index.js";
import { formatForLog } from "../ws-log.js";
import type { GatewayRequestHandlers } from "./types.js";
export const ttsHandlers: GatewayRequestHandlers = {
"tts.status": async ({ respond }) => {
try {
const cfg = loadConfig();
const config = resolveTtsConfig(cfg);
const prefsPath = resolveTtsPrefsPath(config);
const provider = getTtsProvider(config, prefsPath);
respond(true, {
enabled: isTtsEnabled(config, prefsPath),
provider,
fallbackProvider: provider === "openai" ? "elevenlabs" : "openai",
prefsPath,
hasOpenAIKey: Boolean(resolveTtsApiKey(config, "openai")),
hasElevenLabsKey: Boolean(resolveTtsApiKey(config, "elevenlabs")),
});
} catch (err) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
"tts.enable": async ({ respond }) => {
try {
const cfg = loadConfig();
const config = resolveTtsConfig(cfg);
const prefsPath = resolveTtsPrefsPath(config);
setTtsEnabled(prefsPath, true);
respond(true, { enabled: true });
} catch (err) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
"tts.disable": async ({ respond }) => {
try {
const cfg = loadConfig();
const config = resolveTtsConfig(cfg);
const prefsPath = resolveTtsPrefsPath(config);
setTtsEnabled(prefsPath, false);
respond(true, { enabled: false });
} catch (err) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
"tts.convert": async ({ params, respond }) => {
const text = typeof params.text === "string" ? params.text.trim() : "";
if (!text) {
respond(
false,
undefined,
errorShape(ErrorCodes.INVALID_REQUEST, "tts.convert requires text"),
);
return;
}
try {
const cfg = loadConfig();
const channel = typeof params.channel === "string" ? params.channel.trim() : undefined;
const result = await textToSpeech({ text, cfg, channel });
if (result.success && result.audioPath) {
respond(true, {
audioPath: result.audioPath,
provider: result.provider,
outputFormat: result.outputFormat,
voiceCompatible: result.voiceCompatible,
});
return;
}
respond(
false,
undefined,
errorShape(ErrorCodes.UNAVAILABLE, result.error ?? "TTS conversion failed"),
);
} catch (err) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
"tts.setProvider": async ({ params, respond }) => {
const provider = typeof params.provider === "string" ? params.provider.trim() : "";
if (provider !== "openai" && provider !== "elevenlabs") {
respond(
false,
undefined,
errorShape(ErrorCodes.INVALID_REQUEST, "Invalid provider. Use openai or elevenlabs."),
);
return;
}
try {
const cfg = loadConfig();
const config = resolveTtsConfig(cfg);
const prefsPath = resolveTtsPrefsPath(config);
setTtsProvider(prefsPath, provider);
respond(true, { provider });
} catch (err) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
"tts.providers": async ({ respond }) => {
try {
const cfg = loadConfig();
const config = resolveTtsConfig(cfg);
const prefsPath = resolveTtsPrefsPath(config);
respond(true, {
providers: [
{
id: "openai",
name: "OpenAI",
configured: Boolean(resolveTtsApiKey(config, "openai")),
models: [...OPENAI_TTS_MODELS],
voices: [...OPENAI_TTS_VOICES],
},
{
id: "elevenlabs",
name: "ElevenLabs",
configured: Boolean(resolveTtsApiKey(config, "elevenlabs")),
models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"],
},
],
active: getTtsProvider(config, prefsPath),
});
} catch (err) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
};
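
Assuming a gateway client that exposes a generic request(method, params) helper (hypothetical; the client API is not part of this diff), a conversion call would look like:

// Hypothetical client; only the method name and params mirror this commit.
const res = await client.request("tts.convert", { text: "Build finished.", channel: "telegram" });
// success payload: { audioPath, provider, outputFormat, voiceCompatible }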

View File

@@ -14,7 +14,6 @@ import { mediaKindFromMime } from "../../media/constants.js";
import { fetchRemoteMedia } from "../../media/fetch.js";
import { isGifMedia } from "../../media/mime.js";
import { saveMediaBuffer } from "../../media/store.js";
import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js";
import type { RuntimeEnv } from "../../runtime.js";
import { loadWebMedia } from "../../web/media.js";
import { resolveTelegramVoiceSend } from "../voice.js";
@@ -40,45 +39,6 @@ export async function deliverReplies(params: {
const threadParams = buildTelegramThreadParams(messageThreadId);
let hasReplied = false;
for (const reply of replies) {
// Track if hook wants to send audio after text
let audioToSendAfter: string | undefined;
// Run message_sending hook (allows plugins like TTS to generate audio)
const hookRunner = getGlobalHookRunner();
if (hookRunner && reply?.text?.trim()) {
try {
const hookResult = await hookRunner.runMessageSending(
{
to: chatId,
content: reply.text,
metadata: { channel: "telegram", threadId: messageThreadId },
},
{
channelId: "telegram",
accountId: undefined,
conversationId: chatId,
},
);
// Check if hook wants to cancel the message
if (hookResult?.cancel) {
continue; // Skip this reply
}
// Check if hook returned a MEDIA directive (TTS audio)
if (hookResult?.content !== undefined) {
const mediaMatch = hookResult.content.match(/^MEDIA:(.+)$/m);
if (mediaMatch) {
// Save audio path to send AFTER the text message
audioToSendAfter = mediaMatch[1].trim();
}
}
} catch (err) {
// Hook errors shouldn't block message sending
logVerbose(`[telegram delivery] hook error: ${String(err)}`);
}
}
const hasMedia = Boolean(reply?.mediaUrl) || (reply?.mediaUrls?.length ?? 0) > 0;
if (!reply?.text && !hasMedia) {
if (reply?.audioAsVoice) {
@@ -110,25 +70,6 @@ export async function deliverReplies(params: {
hasReplied = true;
}
}
// Send TTS audio after text (if hook generated one)
if (audioToSendAfter) {
try {
const audioMedia = await loadWebMedia(audioToSendAfter);
const audioFile = new InputFile(audioMedia.buffer, "voice.mp3");
// Switch typing indicator to record_voice before sending
await params.onVoiceRecording?.();
const audioParams: Record<string, unknown> = {};
if (threadParams) {
audioParams.message_thread_id = threadParams.message_thread_id;
}
await bot.api.sendVoice(chatId, audioFile, audioParams);
logVerbose(`[telegram delivery] TTS audio sent: ${audioToSendAfter}`);
} catch (err) {
logVerbose(`[telegram delivery] TTS audio send failed: ${String(err)}`);
}
}
continue;
}
// media with optional caption on first item

src/tts/tts.test.ts (new file, 234 lines)
View File

@@ -0,0 +1,234 @@
import { describe, expect, it, vi, beforeEach, afterEach } from "vitest";
import { _test } from "./tts.js";
const {
isValidVoiceId,
isValidOpenAIVoice,
isValidOpenAIModel,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
summarizeText,
resolveOutputFormat,
} = _test;
describe("tts", () => {
describe("isValidVoiceId", () => {
it("accepts valid ElevenLabs voice IDs", () => {
expect(isValidVoiceId("pMsXgVXv3BLzUgSXRplE")).toBe(true);
expect(isValidVoiceId("21m00Tcm4TlvDq8ikWAM")).toBe(true);
expect(isValidVoiceId("EXAVITQu4vr4xnSDxMaL")).toBe(true);
});
it("accepts voice IDs of varying valid lengths", () => {
expect(isValidVoiceId("a1b2c3d4e5")).toBe(true);
expect(isValidVoiceId("a".repeat(40))).toBe(true);
});
it("rejects too short voice IDs", () => {
expect(isValidVoiceId("")).toBe(false);
expect(isValidVoiceId("abc")).toBe(false);
expect(isValidVoiceId("123456789")).toBe(false);
});
it("rejects too long voice IDs", () => {
expect(isValidVoiceId("a".repeat(41))).toBe(false);
expect(isValidVoiceId("a".repeat(100))).toBe(false);
});
it("rejects voice IDs with invalid characters", () => {
expect(isValidVoiceId("pMsXgVXv3BLz-gSXRplE")).toBe(false);
expect(isValidVoiceId("pMsXgVXv3BLz_gSXRplE")).toBe(false);
expect(isValidVoiceId("pMsXgVXv3BLz gSXRplE")).toBe(false);
expect(isValidVoiceId("../../../etc/passwd")).toBe(false);
expect(isValidVoiceId("voice?param=value")).toBe(false);
});
});
describe("isValidOpenAIVoice", () => {
it("accepts all valid OpenAI voices", () => {
for (const voice of OPENAI_TTS_VOICES) {
expect(isValidOpenAIVoice(voice)).toBe(true);
}
});
it("rejects invalid voice names", () => {
expect(isValidOpenAIVoice("invalid")).toBe(false);
expect(isValidOpenAIVoice("")).toBe(false);
expect(isValidOpenAIVoice("ALLOY")).toBe(false);
expect(isValidOpenAIVoice("alloy ")).toBe(false);
expect(isValidOpenAIVoice(" alloy")).toBe(false);
});
});
describe("isValidOpenAIModel", () => {
it("accepts gpt-4o-mini-tts model", () => {
expect(isValidOpenAIModel("gpt-4o-mini-tts")).toBe(true);
});
it("rejects other models", () => {
expect(isValidOpenAIModel("tts-1")).toBe(false);
expect(isValidOpenAIModel("tts-1-hd")).toBe(false);
expect(isValidOpenAIModel("invalid")).toBe(false);
expect(isValidOpenAIModel("")).toBe(false);
expect(isValidOpenAIModel("gpt-4")).toBe(false);
});
});
describe("OPENAI_TTS_MODELS", () => {
it("contains only gpt-4o-mini-tts", () => {
expect(OPENAI_TTS_MODELS).toContain("gpt-4o-mini-tts");
expect(OPENAI_TTS_MODELS).toHaveLength(1);
});
it("is a non-empty array", () => {
expect(Array.isArray(OPENAI_TTS_MODELS)).toBe(true);
expect(OPENAI_TTS_MODELS.length).toBeGreaterThan(0);
});
});
describe("resolveOutputFormat", () => {
it("uses Opus for Telegram", () => {
const output = resolveOutputFormat("telegram");
expect(output.openai).toBe("opus");
expect(output.elevenlabs).toBe("opus_48000_64");
expect(output.extension).toBe(".opus");
expect(output.voiceCompatible).toBe(true);
});
it("uses MP3 for other channels", () => {
const output = resolveOutputFormat("discord");
expect(output.openai).toBe("mp3");
expect(output.elevenlabs).toBe("mp3_44100_128");
expect(output.extension).toBe(".mp3");
expect(output.voiceCompatible).toBe(false);
});
});
describe("summarizeText", () => {
const mockApiKey = "test-api-key";
const originalFetch = globalThis.fetch;
beforeEach(() => {
vi.useFakeTimers({ shouldAdvanceTime: true });
});
afterEach(() => {
globalThis.fetch = originalFetch;
vi.useRealTimers();
});
it("summarizes text and returns result with metrics", async () => {
const mockSummary = "This is a summarized version of the text.";
globalThis.fetch = vi.fn().mockResolvedValue({
ok: true,
json: () =>
Promise.resolve({
choices: [{ message: { content: mockSummary } }],
}),
});
const longText = "A".repeat(2000);
const result = await summarizeText(longText, 1500, mockApiKey, 30_000);
expect(result.summary).toBe(mockSummary);
expect(result.inputLength).toBe(2000);
expect(result.outputLength).toBe(mockSummary.length);
expect(result.latencyMs).toBeGreaterThanOrEqual(0);
expect(globalThis.fetch).toHaveBeenCalledTimes(1);
});
it("calls OpenAI API with correct parameters", async () => {
globalThis.fetch = vi.fn().mockResolvedValue({
ok: true,
json: () =>
Promise.resolve({
choices: [{ message: { content: "Summary" } }],
}),
});
await summarizeText("Long text to summarize", 500, mockApiKey, 30_000);
expect(globalThis.fetch).toHaveBeenCalledWith(
"https://api.openai.com/v1/chat/completions",
expect.objectContaining({
method: "POST",
headers: {
Authorization: `Bearer ${mockApiKey}`,
"Content-Type": "application/json",
},
}),
);
const callArgs = (globalThis.fetch as ReturnType<typeof vi.fn>).mock.calls[0];
const body = JSON.parse(callArgs[1].body);
expect(body.model).toBe("gpt-4o-mini");
expect(body.temperature).toBe(0.3);
expect(body.max_tokens).toBe(250);
});
it("rejects targetLength below minimum (100)", async () => {
await expect(summarizeText("text", 99, mockApiKey, 30_000)).rejects.toThrow(
"Invalid targetLength: 99",
);
});
it("rejects targetLength above maximum (10000)", async () => {
await expect(summarizeText("text", 10001, mockApiKey, 30_000)).rejects.toThrow(
"Invalid targetLength: 10001",
);
});
it("accepts targetLength at boundaries", async () => {
globalThis.fetch = vi.fn().mockResolvedValue({
ok: true,
json: () =>
Promise.resolve({
choices: [{ message: { content: "Summary" } }],
}),
});
await expect(summarizeText("text", 100, mockApiKey, 30_000)).resolves.toBeDefined();
await expect(summarizeText("text", 10000, mockApiKey, 30_000)).resolves.toBeDefined();
});
it("throws error when API returns non-ok response", async () => {
globalThis.fetch = vi.fn().mockResolvedValue({
ok: false,
status: 500,
});
await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow(
"Summarization service unavailable",
);
});
it("throws error when no summary is returned", async () => {
globalThis.fetch = vi.fn().mockResolvedValue({
ok: true,
json: () =>
Promise.resolve({
choices: [],
}),
});
await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow(
"No summary returned",
);
});
it("throws error when summary content is empty", async () => {
globalThis.fetch = vi.fn().mockResolvedValue({
ok: true,
json: () =>
Promise.resolve({
choices: [{ message: { content: " " } }],
}),
});
await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow(
"No summary returned",
);
});
});
});

src/tts/tts.ts (new file, 630 lines)
View File

@@ -0,0 +1,630 @@
import {
existsSync,
mkdirSync,
readFileSync,
writeFileSync,
mkdtempSync,
rmSync,
renameSync,
unlinkSync,
} from "node:fs";
import { tmpdir } from "node:os";
import path from "node:path";
import type { ReplyPayload } from "../auto-reply/types.js";
import { normalizeChannelId } from "../channels/plugins/index.js";
import type { ChannelId } from "../channels/plugins/types.js";
import type { ClawdbotConfig } from "../config/config.js";
import type { TtsConfig, TtsMode, TtsProvider } from "../config/types.tts.js";
import { logVerbose } from "../globals.js";
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
const DEFAULT_TIMEOUT_MS = 30_000;
const DEFAULT_TTS_MAX_LENGTH = 1500;
const DEFAULT_TTS_SUMMARIZE = true;
const DEFAULT_MAX_TEXT_LENGTH = 4000;
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts";
const DEFAULT_OPENAI_VOICE = "alloy";
const TELEGRAM_OUTPUT = {
openai: "opus" as const,
// ElevenLabs output formats use codec_sample_rate_bitrate naming.
// Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram.
elevenlabs: "opus_48000_64",
extension: ".opus",
voiceCompatible: true,
};
const DEFAULT_OUTPUT = {
openai: "mp3" as const,
elevenlabs: "mp3_44100_128",
extension: ".mp3",
voiceCompatible: false,
};
export type ResolvedTtsConfig = {
enabled: boolean;
mode: TtsMode;
provider: TtsProvider;
elevenlabs: {
apiKey?: string;
voiceId: string;
modelId: string;
};
openai: {
apiKey?: string;
model: string;
voice: string;
};
prefsPath?: string;
maxTextLength: number;
timeoutMs: number;
};
type TtsUserPrefs = {
tts?: {
enabled?: boolean;
provider?: TtsProvider;
maxLength?: number;
summarize?: boolean;
};
};
export type TtsResult = {
success: boolean;
audioPath?: string;
error?: string;
latencyMs?: number;
provider?: string;
outputFormat?: string;
voiceCompatible?: boolean;
};
type TtsStatusEntry = {
timestamp: number;
success: boolean;
textLength: number;
summarized: boolean;
provider?: string;
latencyMs?: number;
error?: string;
};
let lastTtsAttempt: TtsStatusEntry | undefined;
export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
const raw: TtsConfig = cfg.messages?.tts ?? {};
return {
enabled: raw.enabled ?? false,
mode: raw.mode ?? "final",
provider: raw.provider ?? "elevenlabs",
elevenlabs: {
apiKey: raw.elevenlabs?.apiKey,
voiceId: raw.elevenlabs?.voiceId ?? DEFAULT_ELEVENLABS_VOICE_ID,
modelId: raw.elevenlabs?.modelId ?? DEFAULT_ELEVENLABS_MODEL_ID,
},
openai: {
apiKey: raw.openai?.apiKey,
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
},
prefsPath: raw.prefsPath,
maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
};
}
export function resolveTtsPrefsPath(config: ResolvedTtsConfig): string {
if (config.prefsPath?.trim()) return resolveUserPath(config.prefsPath.trim());
const envPath = process.env.CLAWDBOT_TTS_PREFS?.trim();
if (envPath) return resolveUserPath(envPath);
return path.join(CONFIG_DIR, "settings", "tts.json");
}
function readPrefs(prefsPath: string): TtsUserPrefs {
try {
if (!existsSync(prefsPath)) return {};
return JSON.parse(readFileSync(prefsPath, "utf8")) as TtsUserPrefs;
} catch {
return {};
}
}
function atomicWriteFileSync(filePath: string, content: string): void {
const tmpPath = `${filePath}.tmp.${Date.now()}.${Math.random().toString(36).slice(2)}`;
writeFileSync(tmpPath, content);
try {
renameSync(tmpPath, filePath);
} catch (err) {
try {
unlinkSync(tmpPath);
} catch {
// ignore
}
throw err;
}
}
function updatePrefs(prefsPath: string, update: (prefs: TtsUserPrefs) => void): void {
const prefs = readPrefs(prefsPath);
update(prefs);
mkdirSync(path.dirname(prefsPath), { recursive: true });
atomicWriteFileSync(prefsPath, JSON.stringify(prefs, null, 2));
}
export function isTtsEnabled(config: ResolvedTtsConfig, prefsPath: string): boolean {
const prefs = readPrefs(prefsPath);
if (prefs.tts?.enabled !== undefined) return prefs.tts.enabled === true;
return config.enabled;
}
export function setTtsEnabled(prefsPath: string, enabled: boolean): void {
updatePrefs(prefsPath, (prefs) => {
prefs.tts = { ...prefs.tts, enabled };
});
}
export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider {
const prefs = readPrefs(prefsPath);
return prefs.tts?.provider ?? config.provider;
}
export function setTtsProvider(prefsPath: string, provider: TtsProvider): void {
updatePrefs(prefsPath, (prefs) => {
prefs.tts = { ...prefs.tts, provider };
});
}
export function getTtsMaxLength(prefsPath: string): number {
const prefs = readPrefs(prefsPath);
return prefs.tts?.maxLength ?? DEFAULT_TTS_MAX_LENGTH;
}
export function setTtsMaxLength(prefsPath: string, maxLength: number): void {
updatePrefs(prefsPath, (prefs) => {
prefs.tts = { ...prefs.tts, maxLength };
});
}
export function isSummarizationEnabled(prefsPath: string): boolean {
const prefs = readPrefs(prefsPath);
return prefs.tts?.summarize ?? DEFAULT_TTS_SUMMARIZE;
}
export function setSummarizationEnabled(prefsPath: string, enabled: boolean): void {
updatePrefs(prefsPath, (prefs) => {
prefs.tts = { ...prefs.tts, summarize: enabled };
});
}
export function getLastTtsAttempt(): TtsStatusEntry | undefined {
return lastTtsAttempt;
}
export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
lastTtsAttempt = entry;
}
function resolveOutputFormat(channelId?: string | null) {
if (channelId === "telegram") return TELEGRAM_OUTPUT;
return DEFAULT_OUTPUT;
}
function resolveChannelId(channel: string | undefined): ChannelId | null {
return channel ? normalizeChannelId(channel) : null;
}
export function resolveTtsApiKey(
config: ResolvedTtsConfig,
provider: TtsProvider,
): string | undefined {
if (provider === "elevenlabs") {
return config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
}
if (provider === "openai") {
return config.openai.apiKey || process.env.OPENAI_API_KEY;
}
return undefined;
}
function isValidVoiceId(voiceId: string): boolean {
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
}
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"] as const;
export const OPENAI_TTS_VOICES = [
"alloy",
"ash",
"coral",
"echo",
"fable",
"onyx",
"nova",
"sage",
"shimmer",
] as const;
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
function isValidOpenAIModel(model: string): boolean {
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
}
function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
}
type SummarizeResult = {
summary: string;
latencyMs: number;
inputLength: number;
outputLength: number;
};
async function summarizeText(
text: string,
targetLength: number,
apiKey: string,
timeoutMs: number,
): Promise<SummarizeResult> {
if (targetLength < 100 || targetLength > 10_000) {
throw new Error(`Invalid targetLength: ${targetLength}`);
}
const startTime = Date.now();
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
try {
const response = await fetch("https://api.openai.com/v1/chat/completions", {
method: "POST",
headers: {
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model: "gpt-4o-mini",
messages: [
{
role: "system",
content: `You are an assistant that summarizes texts concisely while keeping the most important information. Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. Reply only with the summary, without additional explanations.`,
},
{
role: "user",
content: `<text_to_summarize>\n${text}\n</text_to_summarize>`,
},
],
max_tokens: Math.ceil(targetLength / 2),
temperature: 0.3,
}),
signal: controller.signal,
});
if (!response.ok) {
throw new Error("Summarization service unavailable");
}
const data = (await response.json()) as {
choices?: Array<{ message?: { content?: string } }>;
};
const summary = data.choices?.[0]?.message?.content?.trim();
if (!summary) {
throw new Error("No summary returned");
}
return {
summary,
latencyMs: Date.now() - startTime,
inputLength: text.length,
outputLength: summary.length,
};
} finally {
clearTimeout(timeout);
}
}
function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS): void {
const timer = setTimeout(() => {
try {
rmSync(tempDir, { recursive: true, force: true });
} catch {
// ignore cleanup errors
}
}, delayMs);
timer.unref();
}
async function elevenLabsTTS(params: {
text: string;
apiKey: string;
voiceId: string;
modelId: string;
outputFormat: string;
timeoutMs: number;
}): Promise<Buffer> {
const { text, apiKey, voiceId, modelId, outputFormat, timeoutMs } = params;
if (!isValidVoiceId(voiceId)) {
throw new Error("Invalid voiceId format");
}
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
try {
const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`);
if (outputFormat) {
url.searchParams.set("output_format", outputFormat);
}
const response = await fetch(url.toString(), {
method: "POST",
headers: {
"xi-api-key": apiKey,
"Content-Type": "application/json",
Accept: "audio/mpeg",
},
body: JSON.stringify({
text,
model_id: modelId,
voice_settings: {
stability: 0.5,
similarity_boost: 0.75,
style: 0.0,
use_speaker_boost: true,
},
}),
signal: controller.signal,
});
if (!response.ok) {
throw new Error(`ElevenLabs API error (${response.status})`);
}
return Buffer.from(await response.arrayBuffer());
} finally {
clearTimeout(timeout);
}
}
async function openaiTTS(params: {
text: string;
apiKey: string;
model: string;
voice: string;
responseFormat: "mp3" | "opus";
timeoutMs: number;
}): Promise<Buffer> {
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
if (!isValidOpenAIModel(model)) {
throw new Error(`Invalid model: ${model}`);
}
if (!isValidOpenAIVoice(voice)) {
throw new Error(`Invalid voice: ${voice}`);
}
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
try {
const response = await fetch("https://api.openai.com/v1/audio/speech", {
method: "POST",
headers: {
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model,
input: text,
voice,
response_format: responseFormat,
}),
signal: controller.signal,
});
if (!response.ok) {
throw new Error(`OpenAI TTS API error (${response.status})`);
}
return Buffer.from(await response.arrayBuffer());
} finally {
clearTimeout(timeout);
}
}
export async function textToSpeech(params: {
text: string;
cfg: ClawdbotConfig;
prefsPath?: string;
channel?: string;
}): Promise<TtsResult> {
const config = resolveTtsConfig(params.cfg);
const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
const channelId = resolveChannelId(params.channel);
const output = resolveOutputFormat(channelId);
if (params.text.length > config.maxTextLength) {
return {
success: false,
error: `Text too long (${params.text.length} chars, max ${config.maxTextLength})`,
};
}
const userProvider = getTtsProvider(config, prefsPath);
const providers: TtsProvider[] = [
userProvider,
userProvider === "openai" ? "elevenlabs" : "openai",
];
let lastError: string | undefined;
for (const provider of providers) {
const apiKey = resolveTtsApiKey(config, provider);
if (!apiKey) {
lastError = `No API key for ${provider}`;
continue;
}
const providerStart = Date.now();
try {
let audioBuffer: Buffer;
if (provider === "elevenlabs") {
audioBuffer = await elevenLabsTTS({
text: params.text,
apiKey,
voiceId: config.elevenlabs.voiceId,
modelId: config.elevenlabs.modelId,
outputFormat: output.elevenlabs,
timeoutMs: config.timeoutMs,
});
} else {
audioBuffer = await openaiTTS({
text: params.text,
apiKey,
model: config.openai.model,
voice: config.openai.voice,
responseFormat: output.openai,
timeoutMs: config.timeoutMs,
});
}
const latencyMs = Date.now() - providerStart;
const tempDir = mkdtempSync(path.join(tmpdir(), "tts-"));
const audioPath = path.join(tempDir, `voice-${Date.now()}${output.extension}`);
writeFileSync(audioPath, audioBuffer);
scheduleCleanup(tempDir);
return {
success: true,
audioPath,
latencyMs,
provider,
outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
voiceCompatible: output.voiceCompatible,
};
} catch (err) {
const error = err as Error;
if (error.name === "AbortError") {
lastError = `${provider}: request timed out`;
} else {
lastError = `${provider}: ${error.message}`;
}
}
}
return {
success: false,
error: `TTS conversion failed: ${lastError || "no providers available"}`,
};
}
export async function maybeApplyTtsToPayload(params: {
payload: ReplyPayload;
cfg: ClawdbotConfig;
channel?: string;
kind?: "tool" | "block" | "final";
}): Promise<ReplyPayload> {
const config = resolveTtsConfig(params.cfg);
const prefsPath = resolveTtsPrefsPath(config);
if (!isTtsEnabled(config, prefsPath)) return params.payload;
const mode = config.mode ?? "final";
if (mode === "final" && params.kind && params.kind !== "final") return params.payload;
const text = params.payload.text ?? "";
if (!text.trim()) return params.payload;
if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) return params.payload;
if (text.includes("MEDIA:")) return params.payload;
if (text.trim().length < 10) return params.payload;
const maxLength = getTtsMaxLength(prefsPath);
let textForAudio = text.trim();
let wasSummarized = false;
if (textForAudio.length > maxLength) {
if (!isSummarizationEnabled(prefsPath)) {
logVerbose(
`TTS: skipping long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
);
return params.payload;
}
const openaiKey = resolveTtsApiKey(config, "openai");
if (!openaiKey) {
logVerbose("TTS: skipping summarization - OpenAI key missing.");
return params.payload;
}
try {
const summary = await summarizeText(textForAudio, maxLength, openaiKey, config.timeoutMs);
textForAudio = summary.summary;
wasSummarized = true;
if (textForAudio.length > config.maxTextLength) {
logVerbose(
`TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
);
textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`;
}
} catch (err) {
const error = err as Error;
logVerbose(`TTS: summarization failed: ${error.message}`);
return params.payload;
}
}
const ttsStart = Date.now();
const result = await textToSpeech({
text: textForAudio,
cfg: params.cfg,
prefsPath,
channel: params.channel,
});
if (result.success && result.audioPath) {
lastTtsAttempt = {
timestamp: Date.now(),
success: true,
textLength: text.length,
summarized: wasSummarized,
provider: result.provider,
latencyMs: result.latencyMs,
};
const channelId = resolveChannelId(params.channel);
const shouldVoice = channelId === "telegram" && result.voiceCompatible === true;
return {
...params.payload,
mediaUrl: result.audioPath,
audioAsVoice: shouldVoice || params.payload.audioAsVoice,
};
}
lastTtsAttempt = {
timestamp: Date.now(),
success: false,
textLength: text.length,
summarized: wasSummarized,
error: result.error,
};
const latency = Date.now() - ttsStart;
logVerbose(`TTS: conversion failed after ${latency}ms (${result.error ?? "unknown"}).`);
return params.payload;
}
export const _test = {
isValidVoiceId,
isValidOpenAIVoice,
isValidOpenAIModel,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
summarizeText,
resolveOutputFormat,
};
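
And a direct usage sketch for the core entry point (import paths relative to src/tts; provider fallback and temp-file cleanup happen inside textToSpeech):

import { loadConfig } from "../config/config.js";
import { textToSpeech } from "./tts.js";

const cfg = loadConfig();
const result = await textToSpeech({ text: "Deploy complete.", cfg, channel: "telegram" });
if (result.success) {
  // e.g. /tmp/tts-XXXXXX/voice-1706083200000.opus, "openai", "opus"
  console.log(result.audioPath, result.provider, result.outputFormat);
}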