/** * telegram-tts - Automatic TTS for chat responses * * Self-contained TTS extension that calls ElevenLabs/OpenAI APIs directly. * No external CLI dependencies. * * Features: * - speak tool for programmatic TTS * - Multi-provider support (ElevenLabs, OpenAI) * - RPC methods for status and control * * Note: Slash commands (/tts_on, /tts_off, /audio) should be configured * via Telegram customCommands and handled by the agent workspace. */ import { existsSync, readFileSync, writeFileSync, mkdtempSync, rmSync } from "fs"; import { join } from "path"; import { tmpdir } from "os"; import type { PluginApi } from "clawdbot"; const PLUGIN_ID = "telegram-tts"; const DEFAULT_TIMEOUT_MS = 30000; const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes // ============================================================================= // Types // ============================================================================= interface TtsConfig { enabled?: boolean; provider?: "elevenlabs" | "openai"; elevenlabs?: { apiKey?: string; voiceId?: string; modelId?: string; }; openai?: { apiKey?: string; model?: string; voice?: string; }; prefsPath?: string; maxTextLength?: number; timeoutMs?: number; } interface UserPreferences { tts?: { enabled?: boolean; }; } interface TtsResult { success: boolean; audioPath?: string; error?: string; } // ============================================================================= // Validation // ============================================================================= /** * Validates ElevenLabs voiceId format to prevent URL injection. * Voice IDs are alphanumeric strings, typically 20 characters. */ function isValidVoiceId(voiceId: string): boolean { return /^[a-zA-Z0-9]{10,40}$/.test(voiceId); } /** * Validates OpenAI voice name. */ function isValidOpenAIVoice(voice: string): boolean { const validVoices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]; return validVoices.includes(voice); } // ============================================================================= // Configuration & Preferences // ============================================================================= function getPrefsPath(config: TtsConfig): string { return ( config.prefsPath || process.env.CLAWDBOT_TTS_PREFS || join(process.env.HOME || "/home/dev", "clawd", ".user-preferences.json") ); } function isTtsEnabled(prefsPath: string): boolean { try { if (!existsSync(prefsPath)) return false; const prefs: UserPreferences = JSON.parse(readFileSync(prefsPath, "utf8")); return prefs?.tts?.enabled === true; } catch { return false; } } function setTtsEnabled(prefsPath: string, enabled: boolean): void { let prefs: UserPreferences = {}; try { if (existsSync(prefsPath)) { prefs = JSON.parse(readFileSync(prefsPath, "utf8")); } } catch { // ignore } prefs.tts = { enabled }; writeFileSync(prefsPath, JSON.stringify(prefs, null, 2)); } function getApiKey(config: TtsConfig, provider: string): string | undefined { if (provider === "elevenlabs") { return ( config.elevenlabs?.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY ); } if (provider === "openai") { return config.openai?.apiKey || process.env.OPENAI_API_KEY; } return undefined; } // ============================================================================= // Temp File Cleanup // ============================================================================= /** * Schedules cleanup of a temp directory after a delay. * This ensures the file is consumed before deletion. */ function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS): void { setTimeout(() => { try { rmSync(tempDir, { recursive: true, force: true }); } catch { // Ignore cleanup errors } }, delayMs); } // ============================================================================= // TTS Providers // ============================================================================= async function elevenLabsTTS( text: string, apiKey: string, voiceId: string = "pMsXgVXv3BLzUgSXRplE", modelId: string = "eleven_multilingual_v2", timeoutMs: number = DEFAULT_TIMEOUT_MS ): Promise { // Validate voiceId to prevent URL injection if (!isValidVoiceId(voiceId)) { throw new Error(`Invalid voiceId format`); } const controller = new AbortController(); const timeout = setTimeout(() => controller.abort(), timeoutMs); try { const response = await fetch( `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`, { method: "POST", headers: { "xi-api-key": apiKey, "Content-Type": "application/json", Accept: "audio/mpeg", }, body: JSON.stringify({ text, model_id: modelId, voice_settings: { stability: 0.5, similarity_boost: 0.75, style: 0.0, use_speaker_boost: true, }, }), signal: controller.signal, } ); if (!response.ok) { // Don't leak API error details to users throw new Error(`ElevenLabs API error (${response.status})`); } return Buffer.from(await response.arrayBuffer()); } finally { clearTimeout(timeout); } } async function openaiTTS( text: string, apiKey: string, model: string = "tts-1", voice: string = "alloy", timeoutMs: number = DEFAULT_TIMEOUT_MS ): Promise { // Validate voice if (!isValidOpenAIVoice(voice)) { throw new Error(`Invalid voice: ${voice}`); } const controller = new AbortController(); const timeout = setTimeout(() => controller.abort(), timeoutMs); try { const response = await fetch("https://api.openai.com/v1/audio/speech", { method: "POST", headers: { Authorization: `Bearer ${apiKey}`, "Content-Type": "application/json", }, body: JSON.stringify({ model, input: text, voice, response_format: "mp3", }), signal: controller.signal, }); if (!response.ok) { // Don't leak API error details to users throw new Error(`OpenAI TTS API error (${response.status})`); } return Buffer.from(await response.arrayBuffer()); } finally { clearTimeout(timeout); } } // ============================================================================= // Core TTS Function // ============================================================================= async function textToSpeech(text: string, config: TtsConfig): Promise { const provider = config.provider || "elevenlabs"; const apiKey = getApiKey(config, provider); const timeoutMs = config.timeoutMs || DEFAULT_TIMEOUT_MS; if (!apiKey) { return { success: false, error: `No API key configured for ${provider}`, }; } const maxLen = config.maxTextLength || 4000; if (text.length > maxLen) { return { success: false, error: `Text too long (${text.length} chars, max ${maxLen})`, }; } try { let audioBuffer: Buffer; if (provider === "elevenlabs") { audioBuffer = await elevenLabsTTS( text, apiKey, config.elevenlabs?.voiceId, config.elevenlabs?.modelId, timeoutMs ); } else if (provider === "openai") { audioBuffer = await openaiTTS( text, apiKey, config.openai?.model, config.openai?.voice, timeoutMs ); } else { return { success: false, error: `Unknown provider: ${provider}` }; } // Save to temp file const tempDir = mkdtempSync(join(tmpdir(), "tts-")); const audioPath = join(tempDir, `voice-${Date.now()}.mp3`); writeFileSync(audioPath, audioBuffer); // Schedule cleanup after delay (file should be consumed by then) scheduleCleanup(tempDir); return { success: true, audioPath }; } catch (err) { const error = err as Error; if (error.name === "AbortError") { return { success: false, error: "TTS request timed out" }; } return { success: false, error: `TTS conversion failed: ${error.message}`, }; } } // ============================================================================= // Plugin Registration // ============================================================================= export default function register(api: PluginApi) { const log = api.logger; const config: TtsConfig = { enabled: false, provider: "elevenlabs", maxTextLength: 4000, timeoutMs: DEFAULT_TIMEOUT_MS, ...(api.pluginConfig || {}), }; const prefsPath = getPrefsPath(config); log.info(`[${PLUGIN_ID}] Registering plugin...`); log.info(`[${PLUGIN_ID}] Provider: ${config.provider}`); log.info(`[${PLUGIN_ID}] Preferences: ${prefsPath}`); // =========================================================================== // Tool: speak // =========================================================================== api.registerTool({ name: "speak", description: `Convert text to speech and generate voice message. Use this tool when TTS mode is enabled or user requests audio. IMPORTANT: After calling this tool, you MUST output the result exactly as returned. The tool returns "MEDIA:/path/to/audio.mp3" - copy this EXACTLY to your response. This MEDIA: directive tells the system to send the audio file. Example flow: 1. User asks a question with TTS enabled 2. You call speak({text: "Your answer here"}) 3. Tool returns: MEDIA:/tmp/tts-xxx/voice-123.mp3 4. You output: MEDIA:/tmp/tts-xxx/voice-123.mp3 Do NOT add extra text around the MEDIA directive.`, parameters: { type: "object", properties: { text: { type: "string", description: "The text to convert to speech", }, }, required: ["text"], }, execute: async (_id: string, params: { text?: unknown }) => { // Validate text parameter if (typeof params?.text !== "string" || params.text.length === 0) { return { content: [{ type: "text", text: "Error: Invalid or missing text parameter" }] }; } const text = params.text; log.info(`[${PLUGIN_ID}] speak() called, length: ${text.length}`); const result = await textToSpeech(text, config); if (result.success && result.audioPath) { log.info(`[${PLUGIN_ID}] Audio generated: ${result.audioPath}`); // Return with MEDIA directive for clawdbot to send return { content: [ { type: "text", text: `MEDIA:${result.audioPath}`, }, ], }; } log.error(`[${PLUGIN_ID}] TTS failed: ${result.error}`); return { content: [ { type: "text", text: result.error || "TTS conversion failed", }, ], }; }, }); // =========================================================================== // RPC Methods // =========================================================================== // tts.status - Check if TTS is enabled api.registerGatewayMethod("tts.status", async () => ({ enabled: isTtsEnabled(prefsPath), provider: config.provider, prefsPath, hasApiKey: !!getApiKey(config, config.provider || "elevenlabs"), })); // tts.enable - Enable TTS mode api.registerGatewayMethod("tts.enable", async () => { setTtsEnabled(prefsPath, true); log.info(`[${PLUGIN_ID}] TTS enabled via RPC`); return { ok: true, enabled: true }; }); // tts.disable - Disable TTS mode api.registerGatewayMethod("tts.disable", async () => { setTtsEnabled(prefsPath, false); log.info(`[${PLUGIN_ID}] TTS disabled via RPC`); return { ok: true, enabled: false }; }); // tts.convert - Convert text to audio (returns path) api.registerGatewayMethod("tts.convert", async (params: { text?: unknown }) => { // Validate text parameter if (typeof params?.text !== "string" || params.text.length === 0) { return { ok: false, error: "Invalid or missing 'text' parameter" }; } const result = await textToSpeech(params.text, config); if (result.success) { return { ok: true, audioPath: result.audioPath }; } return { ok: false, error: result.error }; }); // tts.providers - List available providers and their status api.registerGatewayMethod("tts.providers", async () => ({ providers: [ { id: "elevenlabs", name: "ElevenLabs", configured: !!getApiKey(config, "elevenlabs"), models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"], }, { id: "openai", name: "OpenAI", configured: !!getApiKey(config, "openai"), models: ["tts-1", "tts-1-hd"], voices: ["alloy", "echo", "fable", "onyx", "nova", "shimmer"], }, ], active: config.provider, })); // =========================================================================== // Startup // =========================================================================== const ttsEnabled = isTtsEnabled(prefsPath); const hasKey = !!getApiKey(config, config.provider || "elevenlabs"); log.info(`[${PLUGIN_ID}] Ready. TTS: ${ttsEnabled ? "ON" : "OFF"}, API Key: ${hasKey ? "OK" : "MISSING"}`); if (!hasKey) { log.warn( `[${PLUGIN_ID}] No API key configured. Set ELEVENLABS_API_KEY or OPENAI_API_KEY.` ); } } // ============================================================================= // Plugin Metadata // ============================================================================= export const meta = { id: PLUGIN_ID, name: "Telegram TTS", description: "Text-to-speech for chat responses using ElevenLabs or OpenAI", version: "0.3.0", };