Files
clawdbot/extensions/telegram-tts/index.ts
Glucksberg df09e583aa feat(telegram-tts): add auto-TTS hook and provider switching
- Integrate message_sending hook into Telegram delivery path
- Send text first, then audio as voice message after
- Add /tts_provider command to switch between OpenAI and ElevenLabs
- Implement automatic fallback when primary provider fails
- Use gpt-4o-mini-tts as default OpenAI model
- Add hook integration to route-reply.ts for other channels

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 08:00:44 +00:00

696 lines
21 KiB
TypeScript

/**
* telegram-tts - Automatic TTS for chat responses
*
* Self-contained TTS extension that calls ElevenLabs/OpenAI APIs directly.
* No external CLI dependencies.
*
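* Features:
* - speak tool for programmatic TTS
* - Multi-provider support (ElevenLabs, OpenAI) with automatic fallback
* - RPC methods for status and control
* - Slash commands (/tts_on, /tts_off, /audio, /tts_provider) registered by this plugin
* - message_sending hook that converts outgoing text to audio while TTS is enabled
*
* Note: The slash commands are handled by this plugin via api.registerCommand.
* Listing them in Telegram customCommands may still be required for them to
* appear in the Telegram command menu; check the host configuration.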
*/
import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "fs";
import { dirname, join } from "path";
import { tmpdir } from "os";
import type { PluginApi } from "clawdbot";
/** Plugin identifier; used as the log-line prefix and exported as `meta.id`. */
const PLUGIN_ID = "telegram-tts";
/** Default timeout, in milliseconds, applied to a single TTS provider request. */
const DEFAULT_TIMEOUT_MS = 30000;
/** Default delay, in milliseconds, before a generated audio temp directory is removed. */
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
// =============================================================================
// Types
// =============================================================================
/**
* Plugin configuration. register() builds it from built-in defaults overlaid
* with api.pluginConfig.
*/
interface TtsConfig {
// Defaults to false in register(); not read anywhere else in this file.
enabled?: boolean;
// Provider used when the preferences file does not name one.
provider?: "elevenlabs" | "openai";
elevenlabs?: {
// Falls back to the ELEVENLABS_API_KEY, then XI_API_KEY environment variables.
apiKey?: string;
// Must be 10-40 alphanumeric characters; elevenLabsTTS supplies a default when unset.
voiceId?: string;
// elevenLabsTTS defaults this to "eleven_multilingual_v2" when unset.
modelId?: string;
};
openai?: {
// Falls back to the OPENAI_API_KEY environment variable.
apiKey?: string;
// Defaults to "gpt-4o-mini-tts" when unset; checked by isValidOpenAIModel.
model?: string;
// Defaults to "alloy" when unset; checked by isValidOpenAIVoice.
voice?: string;
};
// Location of the JSON preferences file; see getPrefsPath for the fallbacks.
prefsPath?: string;
// Maximum accepted input length in characters (4000 when unset or 0).
maxTextLength?: number;
// Per-request timeout in milliseconds (DEFAULT_TIMEOUT_MS when unset or 0).
timeoutMs?: number;
}
/**
* Shape of the JSON preferences file as far as this plugin reads and writes it.
* Other top-level keys in the file are preserved by the setters.
*/
interface UserPreferences {
tts?: {
// Auto-TTS is considered on only when this is exactly `true`.
enabled?: boolean;
// User-selected primary provider; takes precedence over TtsConfig.provider.
provider?: "openai" | "elevenlabs";
};
}
/**
* Outcome of textToSpeech.
*/
interface TtsResult {
// True when an audio file was written.
success: boolean;
// Path of the generated MP3 file; set on success.
audioPath?: string;
// Human-readable failure description; set on failure.
error?: string;
}
// =============================================================================
// Validation
// =============================================================================
/**
 * Checks that an ElevenLabs voice ID is safe to interpolate into a request URL.
 *
 * Only ASCII letters and digits are accepted, with a length of 10 to 40
 * characters, so path separators and query characters can never pass.
 *
 * @param voiceId - Candidate voice ID.
 * @returns `true` when the ID matches the accepted format.
 */
function isValidVoiceId(voiceId: string): boolean {
  const voiceIdPattern = /^[a-zA-Z0-9]{10,40}$/;
  const matchesPattern = voiceIdPattern.test(voiceId);
  return matchesPattern;
}
/**
 * Reports whether `voice` is one of the OpenAI voice names this plugin accepts.
 * The comparison is exact and case-sensitive.
 *
 * @param voice - Candidate voice name.
 * @returns `true` for an accepted voice, `false` otherwise.
 */
function isValidOpenAIVoice(voice: string): boolean {
  switch (voice) {
    case "alloy":
    case "ash":
    case "coral":
    case "echo":
    case "fable":
    case "onyx":
    case "nova":
    case "sage":
    case "shimmer":
      return true;
    default:
      return false;
  }
}
/**
 * OpenAI TTS model names that are accepted verbatim.
 */
const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"];

/**
 * Reports whether `model` is an accepted OpenAI TTS model name.
 *
 * A name is accepted when it is listed in OPENAI_TTS_MODELS or when it begins
 * with the "gpt-4o-mini-tts-" prefix (for example a suffixed variant).
 *
 * @param model - Candidate model name.
 * @returns `true` when the model name is accepted.
 */
function isValidOpenAIModel(model: string): boolean {
  if (model.startsWith("gpt-4o-mini-tts-")) {
    return true;
  }
  return OPENAI_TTS_MODELS.some((known) => known === model);
}
// =============================================================================
// Configuration & Preferences
// =============================================================================
/**
 * Resolves the location of the JSON preferences file.
 *
 * Precedence: `config.prefsPath`, then the CLAWDBOT_TTS_PREFS environment
 * variable, then `<HOME>/clawd/.user-preferences.json` (with "/home/dev"
 * standing in for HOME when that variable is unset or empty).
 *
 * @param config - Plugin configuration.
 * @returns The preferences file path.
 */
function getPrefsPath(config: TtsConfig): string {
  if (config.prefsPath) {
    return config.prefsPath;
  }
  const fromEnv = process.env.CLAWDBOT_TTS_PREFS;
  if (fromEnv) {
    return fromEnv;
  }
  const homeDir = process.env.HOME || "/home/dev";
  return join(homeDir, "clawd", ".user-preferences.json");
}
/**
 * Reads the preferences file and reports whether TTS is switched on.
 *
 * @param prefsPath - Path to the JSON preferences file.
 * @returns `true` only when the file exists, parses, and `tts.enabled` is
 *   exactly `true`; `false` in every other case, including read or parse errors.
 */
function isTtsEnabled(prefsPath: string): boolean {
  let enabled = false;
  try {
    if (existsSync(prefsPath)) {
      const raw = readFileSync(prefsPath, "utf8");
      const parsed: UserPreferences = JSON.parse(raw);
      enabled = parsed?.tts?.enabled === true;
    }
  } catch {
    // Unreadable or malformed preferences count as "disabled".
    enabled = false;
  }
  return enabled;
}
/**
 * Persists the TTS on/off flag to the preferences file.
 *
 * Other preferences already in the file are preserved. A file that is
 * unreadable, malformed, or does not contain a JSON object is treated as
 * empty. The parent directory is created when it does not exist yet.
 *
 * @param prefsPath - Path to the JSON preferences file.
 * @param enabled - New value for `tts.enabled`.
 * @throws If the directory or the file cannot be written.
 */
function setTtsEnabled(prefsPath: string, enabled: boolean): void {
  let prefs: UserPreferences = {};
  try {
    if (existsSync(prefsPath)) {
      const parsed: unknown = JSON.parse(readFileSync(prefsPath, "utf8"));
      // `null`, primitives and arrays are valid JSON but cannot carry a `tts`
      // key: assigning to them either throws or is dropped on serialization.
      if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
        prefs = parsed as UserPreferences;
      }
    }
  } catch {
    // Unreadable or malformed file: start over from empty preferences.
  }
  prefs.tts = { ...prefs.tts, enabled };
  // The default location lives in a sub-directory that may not exist yet.
  mkdirSync(dirname(prefsPath), { recursive: true });
  writeFileSync(prefsPath, JSON.stringify(prefs, null, 2));
}
/**
 * Reads the user's preferred TTS provider from the preferences file.
 *
 * The stored value is validated: anything other than "openai" or
 * "elevenlabs" (a typo, a wrong type, a hand-edited file) is treated as
 * "no preference" so callers fall back to the configured default.
 *
 * @param prefsPath - Path to the JSON preferences file.
 * @returns The stored provider, or `undefined` when the file is missing,
 *   unreadable, malformed, or holds an unrecognized value.
 */
function getTtsProvider(prefsPath: string): "openai" | "elevenlabs" | undefined {
  try {
    if (!existsSync(prefsPath)) return undefined;
    const prefs: UserPreferences | null = JSON.parse(readFileSync(prefsPath, "utf8"));
    // The file is external input; do not trust it to match the declared type.
    const provider: unknown = prefs?.tts?.provider;
    return provider === "openai" || provider === "elevenlabs" ? provider : undefined;
  } catch {
    return undefined;
  }
}
/**
 * Persists the user's preferred TTS provider to the preferences file.
 *
 * Other preferences already in the file are preserved. A file that is
 * unreadable, malformed, or does not contain a JSON object is treated as
 * empty. The parent directory is created when it does not exist yet.
 *
 * @param prefsPath - Path to the JSON preferences file.
 * @param provider - Provider to store as `tts.provider`.
 * @throws If the directory or the file cannot be written.
 */
function setTtsProvider(prefsPath: string, provider: "openai" | "elevenlabs"): void {
  let prefs: UserPreferences = {};
  try {
    if (existsSync(prefsPath)) {
      const parsed: unknown = JSON.parse(readFileSync(prefsPath, "utf8"));
      // `null`, primitives and arrays are valid JSON but cannot carry a `tts`
      // key: assigning to them either throws or is dropped on serialization.
      if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
        prefs = parsed as UserPreferences;
      }
    }
  } catch {
    // Unreadable or malformed file: start over from empty preferences.
  }
  prefs.tts = { ...prefs.tts, provider };
  // The default location lives in a sub-directory that may not exist yet.
  mkdirSync(dirname(prefsPath), { recursive: true });
  writeFileSync(prefsPath, JSON.stringify(prefs, null, 2));
}
/**
 * Looks up the API key for a TTS provider.
 *
 * The key from the plugin configuration wins; otherwise environment variables
 * are consulted: ELEVENLABS_API_KEY then XI_API_KEY for "elevenlabs",
 * OPENAI_API_KEY for "openai".
 *
 * @param config - Plugin configuration.
 * @param provider - Provider identifier.
 * @returns The key, or `undefined` when none is set or the provider is unknown.
 */
function getApiKey(config: TtsConfig, provider: string): string | undefined {
  switch (provider) {
    case "elevenlabs": {
      const configured = config.elevenlabs?.apiKey;
      return configured || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
    }
    case "openai": {
      const configured = config.openai?.apiKey;
      return configured || process.env.OPENAI_API_KEY;
    }
    default:
      return undefined;
  }
}
// =============================================================================
// Temp File Cleanup
// =============================================================================
/**
 * Arranges for a temp directory to be removed once `delayMs` has elapsed.
 * The delay gives the consumer of the generated file time to read it.
 * Removal is best-effort: failures are ignored.
 *
 * @param tempDir - Directory to delete recursively.
 * @param delayMs - Milliseconds to wait before deleting.
 */
function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS): void {
  const removeTempDir = (): void => {
    try {
      rmSync(tempDir, { recursive: true, force: true });
    } catch {
      // Ignore cleanup errors
    }
  };
  setTimeout(removeTempDir, delayMs);
}
// =============================================================================
// TTS Providers
// =============================================================================
/**
 * Synthesizes speech through the ElevenLabs text-to-speech endpoint.
 *
 * The timeout covers both the request and reading the response body; when it
 * fires the request is aborted through an AbortController.
 *
 * @param text - Text to synthesize.
 * @param apiKey - ElevenLabs API key, sent in the `xi-api-key` header.
 * @param voiceId - Voice to use; validated before it is placed in the URL.
 * @param modelId - ElevenLabs model identifier.
 * @param timeoutMs - Milliseconds before the request is aborted.
 * @returns The response body (requested as audio/mpeg) as a Buffer.
 * @throws If `voiceId` is malformed, the request is aborted or fails, or the
 *   API answers with a non-2xx status (only the status code is reported).
 */
async function elevenLabsTTS(
  text: string,
  apiKey: string,
  voiceId: string = "pMsXgVXv3BLzUgSXRplE",
  modelId: string = "eleven_multilingual_v2",
  timeoutMs: number = DEFAULT_TIMEOUT_MS
): Promise<Buffer> {
  // The voice ID becomes part of the URL path, so reject anything unexpected.
  if (!isValidVoiceId(voiceId)) {
    throw new Error(`Invalid voiceId format`);
  }

  const aborter = new AbortController();
  const abortTimer = setTimeout(() => aborter.abort(), timeoutMs);

  try {
    const endpoint = `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`;
    const requestHeaders = {
      "xi-api-key": apiKey,
      "Content-Type": "application/json",
      Accept: "audio/mpeg",
    };
    const requestBody = JSON.stringify({
      text,
      model_id: modelId,
      voice_settings: {
        stability: 0.5,
        similarity_boost: 0.75,
        style: 0.0,
        use_speaker_boost: true,
      },
    });

    const response = await fetch(endpoint, {
      method: "POST",
      headers: requestHeaders,
      body: requestBody,
      signal: aborter.signal,
    });

    if (!response.ok) {
      // Report the status only; the API's error body is deliberately not surfaced.
      throw new Error(`ElevenLabs API error (${response.status})`);
    }

    const audioBytes = await response.arrayBuffer();
    return Buffer.from(audioBytes);
  } finally {
    clearTimeout(abortTimer);
  }
}
/**
 * Synthesizes speech through the OpenAI audio/speech endpoint.
 *
 * The timeout covers both the request and reading the response body; when it
 * fires the request is aborted through an AbortController.
 *
 * @param text - Text to synthesize.
 * @param apiKey - OpenAI API key, sent as a Bearer token.
 * @param model - TTS model name; must pass isValidOpenAIModel.
 * @param voice - Voice name; must pass isValidOpenAIVoice.
 * @param timeoutMs - Milliseconds before the request is aborted.
 * @returns The response body (requested as mp3) as a Buffer.
 * @throws If the model or voice is rejected, the request is aborted or fails,
 *   or the API answers with a non-2xx status (only the status code is reported).
 */
async function openaiTTS(
  text: string,
  apiKey: string,
  model: string = "gpt-4o-mini-tts",
  voice: string = "alloy",
  timeoutMs: number = DEFAULT_TIMEOUT_MS
): Promise<Buffer> {
  // Model is checked before voice, so a bad model is reported first.
  if (!isValidOpenAIModel(model)) {
    throw new Error(`Invalid model: ${model}`);
  }
  if (!isValidOpenAIVoice(voice)) {
    throw new Error(`Invalid voice: ${voice}`);
  }

  const aborter = new AbortController();
  const abortTimer = setTimeout(() => aborter.abort(), timeoutMs);

  try {
    const requestHeaders = {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    };
    const requestBody = JSON.stringify({
      model,
      input: text,
      voice,
      response_format: "mp3",
    });

    const response = await fetch("https://api.openai.com/v1/audio/speech", {
      method: "POST",
      headers: requestHeaders,
      body: requestBody,
      signal: aborter.signal,
    });

    if (!response.ok) {
      // Report the status only; the API's error body is deliberately not surfaced.
      throw new Error(`OpenAI TTS API error (${response.status})`);
    }

    const audioBytes = await response.arrayBuffer();
    return Buffer.from(audioBytes);
  } finally {
    clearTimeout(abortTimer);
  }
}
// =============================================================================
// Core TTS Function
// =============================================================================
/**
 * Converts text to an MP3 file, trying the preferred provider first and the
 * other provider as a fallback.
 *
 * The preferred provider is the one stored in the preferences file (when
 * `prefsPath` is given), else `config.provider`, else "openai". The audio is
 * written to a fresh temp directory that is scheduled for later removal.
 *
 * This function does not throw for provider failures: every failure is
 * collected and reported in the returned `error`, so the reason the primary
 * provider failed stays visible even when the fallback fails too.
 *
 * @param text - Text to synthesize.
 * @param config - Plugin configuration (keys, voices, models, limits).
 * @param prefsPath - Optional preferences file consulted for the user's provider.
 * @returns `{ success: true, audioPath }` on success, otherwise
 *   `{ success: false, error }`.
 */
async function textToSpeech(text: string, config: TtsConfig, prefsPath?: string): Promise<TtsResult> {
  // User preference beats plugin configuration, which beats the built-in default.
  const userProvider = prefsPath ? getTtsProvider(prefsPath) : undefined;
  const primaryProvider = userProvider || config.provider || "openai";
  const fallbackProvider = primaryProvider === "openai" ? "elevenlabs" : "openai";
  const timeoutMs = config.timeoutMs || DEFAULT_TIMEOUT_MS;
  const maxLen = config.maxTextLength || 4000;

  if (text.length > maxLen) {
    return {
      success: false,
      error: `Text too long (${text.length} chars, max ${maxLen})`,
    };
  }

  // One entry per provider that could not produce audio, in the order tried.
  const failures: string[] = [];

  for (const provider of [primaryProvider, fallbackProvider]) {
    const apiKey = getApiKey(config, provider);
    if (!apiKey) {
      failures.push(`No API key for ${provider}`);
      continue;
    }

    try {
      let audioBuffer: Buffer;
      if (provider === "elevenlabs") {
        audioBuffer = await elevenLabsTTS(
          text,
          apiKey,
          config.elevenlabs?.voiceId,
          config.elevenlabs?.modelId,
          timeoutMs
        );
      } else if (provider === "openai") {
        audioBuffer = await openaiTTS(
          text,
          apiKey,
          config.openai?.model || "gpt-4o-mini-tts",
          config.openai?.voice,
          timeoutMs
        );
      } else {
        failures.push(`Unknown provider: ${provider}`);
        continue;
      }

      const tempDir = mkdtempSync(join(tmpdir(), "tts-"));
      // Schedule removal before writing so the directory is reclaimed even
      // when the write below throws.
      scheduleCleanup(tempDir);
      const audioPath = join(tempDir, `voice-${Date.now()}.mp3`);
      writeFileSync(audioPath, audioBuffer);

      return { success: true, audioPath };
    } catch (err) {
      // An aborted fetch surfaces as an error named "AbortError".
      const timedOut = err instanceof Error && err.name === "AbortError";
      const detail = timedOut
        ? "request timed out"
        : err instanceof Error
          ? err.message
          : String(err);
      failures.push(`${provider}: ${detail}`);
      // Fall through to the next provider, if any.
    }
  }

  return {
    success: false,
    error: `TTS conversion failed: ${failures.join("; ") || "no providers available"}`,
  };
}
// =============================================================================
// Plugin Registration
// =============================================================================
/**
* Plugin entry point: registers the speak tool, the tts.* gateway methods, the
* slash commands and the message_sending hook on the given plugin API.
*
* Configuration is the built-in defaults below overlaid with api.pluginConfig;
* the preferences file path is resolved once, at registration time.
*
* @param api - Host plugin API (logger, pluginConfig and the register/on calls used below).
*/
export default function register(api: PluginApi) {
const log = api.logger;
// Defaults first, then whatever the host supplied. Note that provider defaults
// to "elevenlabs" here, so the `config.provider || "openai"` fallbacks further
// down only apply when pluginConfig supplies a falsy provider.
const config: TtsConfig = {
enabled: false,
provider: "elevenlabs",
maxTextLength: 4000,
timeoutMs: DEFAULT_TIMEOUT_MS,
...(api.pluginConfig || {}),
};
const prefsPath = getPrefsPath(config);
log.info(`[${PLUGIN_ID}] Registering plugin...`);
log.info(`[${PLUGIN_ID}] Provider: ${config.provider}`);
log.info(`[${PLUGIN_ID}] Preferences: ${prefsPath}`);
// ===========================================================================
// Tool: speak
// ===========================================================================
// Returns a single text item: "MEDIA:<path>" on success, or the error text on
// failure. The tool description instructs the model to echo that directive.
api.registerTool({
name: "speak",
description: `Convert text to speech and generate voice message.
Use this tool when TTS mode is enabled or user requests audio.
IMPORTANT: After calling this tool, you MUST output the result exactly as returned.
The tool returns "MEDIA:/path/to/audio.mp3" - copy this EXACTLY to your response.
This MEDIA: directive tells the system to send the audio file.
Example flow:
1. User asks a question with TTS enabled
2. You call speak({text: "Your answer here"})
3. Tool returns: MEDIA:/tmp/tts-xxx/voice-123.mp3
4. You output: MEDIA:/tmp/tts-xxx/voice-123.mp3
Do NOT add extra text around the MEDIA directive.`,
parameters: {
type: "object",
properties: {
text: {
type: "string",
description: "The text to convert to speech",
},
},
required: ["text"],
},
execute: async (_id: string, params: { text?: unknown }) => {
// Reject anything that is not a non-empty string.
if (typeof params?.text !== "string" || params.text.length === 0) {
return { content: [{ type: "text", text: "Error: Invalid or missing text parameter" }] };
}
const text = params.text;
log.info(`[${PLUGIN_ID}] speak() called, length: ${text.length}`);
const result = await textToSpeech(text, config, prefsPath);
if (result.success && result.audioPath) {
log.info(`[${PLUGIN_ID}] Audio generated: ${result.audioPath}`);
// Return with MEDIA directive for clawdbot to send
return {
content: [
{
type: "text",
text: `MEDIA:${result.audioPath}`,
},
],
};
}
log.error(`[${PLUGIN_ID}] TTS failed: ${result.error}`);
return {
content: [
{
type: "text",
text: result.error || "TTS conversion failed",
},
],
};
},
});
// ===========================================================================
// RPC Methods
// ===========================================================================
// tts.status - Report enabled flag, active/fallback provider and key availability
api.registerGatewayMethod("tts.status", async () => {
const userProvider = getTtsProvider(prefsPath);
const activeProvider = userProvider || config.provider || "openai";
return {
enabled: isTtsEnabled(prefsPath),
provider: activeProvider,
fallbackProvider: activeProvider === "openai" ? "elevenlabs" : "openai",
prefsPath,
hasOpenAIKey: !!getApiKey(config, "openai"),
hasElevenLabsKey: !!getApiKey(config, "elevenlabs"),
};
});
// tts.enable - Enable TTS mode (persisted to the preferences file)
api.registerGatewayMethod("tts.enable", async () => {
setTtsEnabled(prefsPath, true);
log.info(`[${PLUGIN_ID}] TTS enabled via RPC`);
return { ok: true, enabled: true };
});
// tts.disable - Disable TTS mode (persisted to the preferences file)
api.registerGatewayMethod("tts.disable", async () => {
setTtsEnabled(prefsPath, false);
log.info(`[${PLUGIN_ID}] TTS disabled via RPC`);
return { ok: true, enabled: false };
});
// tts.convert - Convert text to audio (returns path)
api.registerGatewayMethod("tts.convert", async (params: { text?: unknown }) => {
// Reject anything that is not a non-empty string.
if (typeof params?.text !== "string" || params.text.length === 0) {
return { ok: false, error: "Invalid or missing 'text' parameter" };
}
const result = await textToSpeech(params.text, config, prefsPath);
if (result.success) {
return { ok: true, audioPath: result.audioPath };
}
return { ok: false, error: result.error };
});
// tts.setProvider - Set primary TTS provider (persisted to the preferences file)
api.registerGatewayMethod("tts.setProvider", async (params: { provider?: unknown }) => {
if (params?.provider !== "openai" && params?.provider !== "elevenlabs") {
return { ok: false, error: "Invalid provider. Use 'openai' or 'elevenlabs'" };
}
setTtsProvider(prefsPath, params.provider);
log.info(`[${PLUGIN_ID}] Provider set to ${params.provider} via RPC`);
return { ok: true, provider: params.provider };
});
// tts.providers - List available providers and their status.
// The model and voice lists below are literals written out here; they are not
// derived from OPENAI_TTS_MODELS or isValidOpenAIVoice.
api.registerGatewayMethod("tts.providers", async () => {
const userProvider = getTtsProvider(prefsPath);
return {
providers: [
{
id: "openai",
name: "OpenAI",
configured: !!getApiKey(config, "openai"),
models: ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"],
voices: ["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"],
},
{
id: "elevenlabs",
name: "ElevenLabs",
configured: !!getApiKey(config, "elevenlabs"),
models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"],
},
],
active: userProvider || config.provider || "openai",
};
});
// ===========================================================================
// Plugin Commands (LLM-free, intercepted automatically)
// ===========================================================================
// The user-facing replies below are written in Portuguese.
// /tts_on - Enable TTS mode
api.registerCommand({
name: "tts_on",
description: "Enable text-to-speech for responses",
handler: () => {
setTtsEnabled(prefsPath, true);
log.info(`[${PLUGIN_ID}] TTS enabled via /tts_on command`);
return { text: "🔊 TTS ativado! Agora vou responder em áudio." };
},
});
// /tts_off - Disable TTS mode
api.registerCommand({
name: "tts_off",
description: "Disable text-to-speech for responses",
handler: () => {
setTtsEnabled(prefsPath, false);
log.info(`[${PLUGIN_ID}] TTS disabled via /tts_off command`);
return { text: "🔇 TTS desativado. Voltando ao modo texto." };
},
});
// /audio <text> - Convert text to audio immediately
api.registerCommand({
name: "audio",
description: "Convert text to audio message",
acceptsArgs: true,
handler: async (ctx) => {
const text = ctx.args?.trim();
if (!text) {
return { text: "❌ Uso: /audio <texto para converter em áudio>" };
}
log.info(`[${PLUGIN_ID}] /audio command, text length: ${text.length}`);
const result = await textToSpeech(text, config, prefsPath);
if (result.success && result.audioPath) {
log.info(`[${PLUGIN_ID}] Audio generated: ${result.audioPath}`);
return { text: `MEDIA:${result.audioPath}` };
}
log.error(`[${PLUGIN_ID}] /audio failed: ${result.error}`);
return { text: `❌ Erro ao gerar áudio: ${result.error}` };
},
});
// /tts_provider [openai|elevenlabs] - Set or show TTS provider
api.registerCommand({
name: "tts_provider",
description: "Set or show TTS provider (openai or elevenlabs)",
acceptsArgs: true,
handler: (ctx) => {
const arg = ctx.args?.trim().toLowerCase();
const currentProvider = getTtsProvider(prefsPath) || config.provider || "openai";
if (!arg) {
// Show current provider.
// The model names in parentheses are fixed strings, not the configured models.
const fallback = currentProvider === "openai" ? "elevenlabs" : "openai";
const hasOpenAI = !!getApiKey(config, "openai");
const hasElevenLabs = !!getApiKey(config, "elevenlabs");
return {
text: `🎙️ **TTS Provider**\n\n` +
`Primário: **${currentProvider}** ${currentProvider === "openai" ? "(gpt-4o-mini-tts)" : "(eleven_multilingual_v2)"}\n` +
`Fallback: ${fallback}\n\n` +
`OpenAI: ${hasOpenAI ? "✅ configurado" : "❌ sem API key"}\n` +
`ElevenLabs: ${hasElevenLabs ? "✅ configurado" : "❌ sem API key"}\n\n` +
`Uso: /tts_provider openai ou /tts_provider elevenlabs`,
};
}
if (arg !== "openai" && arg !== "elevenlabs") {
return { text: "❌ Provedor inválido. Use: /tts_provider openai ou /tts_provider elevenlabs" };
}
setTtsProvider(prefsPath, arg);
const fallback = arg === "openai" ? "elevenlabs" : "openai";
log.info(`[${PLUGIN_ID}] Provider set to ${arg} via /tts_provider command`);
return {
text: `✅ Provedor TTS alterado!\n\n` +
`Primário: **${arg}** ${arg === "openai" ? "(gpt-4o-mini-tts)" : "(eleven_multilingual_v2)"}\n` +
`Fallback: ${fallback}`,
};
},
});
// ===========================================================================
// Auto-TTS Hook (message_sending)
// ===========================================================================
// Automatically convert text responses to audio when TTS is enabled.
// Returning nothing leaves the outgoing message untouched.
api.on("message_sending", async (event) => {
// Check if TTS is enabled (the preferences file is re-read on every message)
if (!isTtsEnabled(prefsPath)) {
return; // TTS disabled, don't modify message
}
const content = event.content?.trim();
if (!content) {
return; // Empty content, skip
}
// Skip if already contains MEDIA directive (avoid double conversion)
if (content.includes("MEDIA:")) {
return;
}
// Skip very short messages (likely errors or status)
if (content.length < 10) {
return;
}
log.info(`[${PLUGIN_ID}] Auto-TTS: Converting ${content.length} chars`);
try {
const result = await textToSpeech(content, config, prefsPath);
if (result.success && result.audioPath) {
log.info(`[${PLUGIN_ID}] Auto-TTS: Audio generated: ${result.audioPath}`);
// Return content consisting solely of the MEDIA directive; the original
// text is not part of the returned string.
// NOTE(review): whether the text is still delivered depends on how the host
// applies this hook's result - confirm against the delivery path.
return {
content: `MEDIA:${result.audioPath}`,
};
} else {
log.warn(`[${PLUGIN_ID}] Auto-TTS: Failed - ${result.error}`);
// On failure, send original text without audio
return;
}
} catch (err) {
const error = err as Error;
log.error(`[${PLUGIN_ID}] Auto-TTS error: ${error.message}`);
// On error, send original text
return;
}
});
// ===========================================================================
// Startup
// ===========================================================================
// Only the active provider's key is checked here, so the warning below can
// fire even when the fallback provider has a key configured.
const ttsEnabled = isTtsEnabled(prefsPath);
const userProvider = getTtsProvider(prefsPath);
const activeProvider = userProvider || config.provider || "openai";
const hasKey = !!getApiKey(config, activeProvider);
log.info(`[${PLUGIN_ID}] Ready. TTS: ${ttsEnabled ? "ON" : "OFF"}, Provider: ${activeProvider}, API Key: ${hasKey ? "OK" : "MISSING"}`);
if (!hasKey) {
log.warn(
`[${PLUGIN_ID}] No API key configured. Set ELEVENLABS_API_KEY or OPENAI_API_KEY.`
);
}
}
// =============================================================================
// Plugin Metadata
// =============================================================================
/**
* Static plugin metadata exported alongside the default register function.
* `id` reuses PLUGIN_ID, the same value used as the log prefix.
*/
export const meta = {
id: PLUGIN_ID,
name: "Telegram TTS",
description: "Text-to-speech for chat responses using ElevenLabs or OpenAI",
version: "0.3.0",
};