diff --git a/extensions/telegram-tts/README.md b/extensions/telegram-tts/README.md index 168365d40..0ea774bab 100644 --- a/extensions/telegram-tts/README.md +++ b/extensions/telegram-tts/README.md @@ -4,15 +4,18 @@ Automatic text-to-speech for chat responses using ElevenLabs or OpenAI. ## Features +- **Auto-TTS**: Automatically converts all text responses to voice when enabled - **`speak` Tool**: Converts text to speech and sends as voice message - **RPC Methods**: Control TTS via Gateway (`tts.status`, `tts.enable`, `tts.disable`, `tts.convert`, `tts.providers`) -- **User Preferences**: Persistent TTS state via JSON file -- **Multi-provider**: ElevenLabs and OpenAI TTS support +- **User Commands**: `/tts_on`, `/tts_off`, `/tts_provider`, `/tts_limit`, `/tts_summary`, `/tts_status` +- **Auto-Summarization**: Long texts are automatically summarized before TTS conversion +- **Multi-provider**: ElevenLabs and OpenAI TTS with automatic fallback - **Self-contained**: No external CLI dependencies - calls APIs directly ## Requirements -- ElevenLabs API key OR OpenAI API key +- **For TTS**: ElevenLabs API key OR OpenAI API key +- **For Auto-Summarization**: OpenAI API key (uses gpt-4o-mini to summarize long texts) ## Installation @@ -70,19 +73,20 @@ export OPENAI_API_KEY=your-api-key | Option | Type | Default | Description | |--------|------|---------|-------------| | `enabled` | boolean | `false` | Enable the plugin | -| `provider` | string | `"elevenlabs"` | TTS provider (`elevenlabs` or `openai`) | +| `provider` | string | `"openai"` | TTS provider (`elevenlabs` or `openai`) | | `elevenlabs.apiKey` | string | - | ElevenLabs API key | | `elevenlabs.voiceId` | string | `"pMsXgVXv3BLzUgSXRplE"` | ElevenLabs Voice ID | | `elevenlabs.modelId` | string | `"eleven_multilingual_v2"` | ElevenLabs Model ID | | `openai.apiKey` | string | - | OpenAI API key | -| `openai.model` | string | `"tts-1"` | OpenAI model (`tts-1` or `tts-1-hd`) | +| `openai.model` | string | 
`"gpt-4o-mini-tts"` | OpenAI model (`gpt-4o-mini-tts`, `tts-1`, or `tts-1-hd`) | | `openai.voice` | string | `"alloy"` | OpenAI voice | | `prefsPath` | string | `~/clawd/.user-preferences.json` | User preferences file | | `maxTextLength` | number | `4000` | Max characters for TTS | +| `timeoutMs` | number | `30000` | API request timeout in milliseconds | ### OpenAI Voices -Available voices: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer` +Available voices: `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer` ## Usage @@ -114,23 +118,28 @@ clawdbot gateway call tts.providers ### Telegram Commands -Add custom commands to toggle TTS mode: +The plugin registers the following commands automatically: -```json -{ - "channels": { - "telegram": { - "customCommands": [ - {"command": "tts_on", "description": "Enable voice responses"}, - {"command": "tts_off", "description": "Disable voice responses"}, - {"command": "audio", "description": "Send response as voice message"} - ] - } - } -} -``` +| Command | Description | +|---------|-------------| +| `/tts_on` | Enable auto-TTS for all responses | +| `/tts_off` | Disable auto-TTS | +| `/tts_provider [openai\|elevenlabs]` | Switch TTS provider (with fallback) | +| `/tts_limit [chars]` | Set max text length before summarization (default: 1500) | +| `/tts_summary [on\|off]` | Enable/disable auto-summarization for long texts | +| `/tts_status` | Show TTS status, config, and last attempt result | -Then add handling instructions to your agent workspace (CLAUDE.md or TOOLS.md). +## Auto-Summarization + +When enabled (default), texts exceeding the configured limit are automatically summarized using OpenAI's gpt-4o-mini before TTS conversion. This ensures long responses can still be converted to audio. + +**Requirements**: OpenAI API key must be configured for summarization to work, even if using ElevenLabs for TTS. 
+ +**Behavior**: +- Texts under the limit are converted directly +- Texts over the limit are summarized first, then converted +- If summarization is disabled (`/tts_summary off`), long texts are skipped (no audio) +- After summarization, a hard limit is applied to prevent oversized TTS requests ## License diff --git a/extensions/telegram-tts/index.test.ts b/extensions/telegram-tts/index.test.ts new file mode 100644 index 000000000..c396b1f24 --- /dev/null +++ b/extensions/telegram-tts/index.test.ts @@ -0,0 +1,101 @@ +/** + * Unit tests for telegram-tts extension + */ + +import { describe, expect, it } from "vitest"; +import { _test, meta } from "./index.js"; + +const { isValidVoiceId, isValidOpenAIVoice, isValidOpenAIModel, OPENAI_TTS_MODELS } = _test; + +describe("telegram-tts", () => { + describe("meta", () => { + it("should have correct plugin metadata", () => { + expect(meta.id).toBe("telegram-tts"); + expect(meta.name).toBe("Telegram TTS"); + expect(meta.version).toMatch(/^\d+\.\d+\.\d+$/); + }); + }); + + describe("isValidVoiceId", () => { + it("should accept valid ElevenLabs voice IDs", () => { + // Real ElevenLabs voice ID format (20 alphanumeric chars) + expect(isValidVoiceId("pMsXgVXv3BLzUgSXRplE")).toBe(true); + expect(isValidVoiceId("21m00Tcm4TlvDq8ikWAM")).toBe(true); + expect(isValidVoiceId("EXAVITQu4vr4xnSDxMaL")).toBe(true); + }); + + it("should accept voice IDs of varying valid lengths", () => { + expect(isValidVoiceId("a1b2c3d4e5")).toBe(true); // 10 chars (min) + expect(isValidVoiceId("a".repeat(40))).toBe(true); // 40 chars (max) + }); + + it("should reject too short voice IDs", () => { + expect(isValidVoiceId("")).toBe(false); + expect(isValidVoiceId("abc")).toBe(false); + expect(isValidVoiceId("123456789")).toBe(false); // 9 chars + }); + + it("should reject too long voice IDs", () => { + expect(isValidVoiceId("a".repeat(41))).toBe(false); + expect(isValidVoiceId("a".repeat(100))).toBe(false); + }); + + it("should reject voice IDs with invalid 
characters", () => { + expect(isValidVoiceId("pMsXgVXv3BLz-gSXRplE")).toBe(false); // hyphen + expect(isValidVoiceId("pMsXgVXv3BLz_gSXRplE")).toBe(false); // underscore + expect(isValidVoiceId("pMsXgVXv3BLz gSXRplE")).toBe(false); // space + expect(isValidVoiceId("../../../etc/passwd")).toBe(false); // path traversal + expect(isValidVoiceId("voice?param=value")).toBe(false); // query string + }); + }); + + describe("isValidOpenAIVoice", () => { + it("should accept all valid OpenAI voices", () => { + const validVoices = ["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"]; + for (const voice of validVoices) { + expect(isValidOpenAIVoice(voice)).toBe(true); + } + }); + + it("should reject invalid voice names", () => { + expect(isValidOpenAIVoice("invalid")).toBe(false); + expect(isValidOpenAIVoice("")).toBe(false); + expect(isValidOpenAIVoice("ALLOY")).toBe(false); // case sensitive + expect(isValidOpenAIVoice("alloy ")).toBe(false); // trailing space + expect(isValidOpenAIVoice(" alloy")).toBe(false); // leading space + }); + }); + + describe("isValidOpenAIModel", () => { + it("should accept standard OpenAI TTS models", () => { + expect(isValidOpenAIModel("gpt-4o-mini-tts")).toBe(true); + expect(isValidOpenAIModel("tts-1")).toBe(true); + expect(isValidOpenAIModel("tts-1-hd")).toBe(true); + }); + + it("should accept gpt-4o-mini-tts variants", () => { + expect(isValidOpenAIModel("gpt-4o-mini-tts-2025-12-15")).toBe(true); + expect(isValidOpenAIModel("gpt-4o-mini-tts-preview")).toBe(true); + }); + + it("should reject invalid model names", () => { + expect(isValidOpenAIModel("invalid")).toBe(false); + expect(isValidOpenAIModel("")).toBe(false); + expect(isValidOpenAIModel("tts-2")).toBe(false); + expect(isValidOpenAIModel("gpt-4")).toBe(false); + }); + }); + + describe("OPENAI_TTS_MODELS", () => { + it("should contain the expected models", () => { + expect(OPENAI_TTS_MODELS).toContain("gpt-4o-mini-tts"); + 
expect(OPENAI_TTS_MODELS).toContain("tts-1"); + expect(OPENAI_TTS_MODELS).toContain("tts-1-hd"); + }); + + it("should be a non-empty array", () => { + expect(Array.isArray(OPENAI_TTS_MODELS)).toBe(true); + expect(OPENAI_TTS_MODELS.length).toBeGreaterThan(0); + }); + }); +}); diff --git a/extensions/telegram-tts/index.ts b/extensions/telegram-tts/index.ts index 5004889df..0774ec85f 100644 --- a/extensions/telegram-tts/index.ts +++ b/extensions/telegram-tts/index.ts @@ -13,7 +13,7 @@ * via Telegram customCommands and handled by the agent workspace. */ -import { existsSync, readFileSync, writeFileSync, mkdtempSync, rmSync } from "fs"; +import { existsSync, readFileSync, writeFileSync, mkdtempSync, rmSync, renameSync, unlinkSync } from "fs"; import { join } from "path"; import { tmpdir } from "os"; import type { PluginApi } from "clawdbot"; @@ -49,17 +49,35 @@ interface UserPreferences { enabled?: boolean; provider?: "openai" | "elevenlabs"; maxLength?: number; // Max chars before summarizing (default 1500) + summarize?: boolean; // Enable auto-summarization (default true) }; } const DEFAULT_TTS_MAX_LENGTH = 1500; +const DEFAULT_TTS_SUMMARIZE = true; interface TtsResult { success: boolean; audioPath?: string; error?: string; + latencyMs?: number; + provider?: string; } +interface TtsStatusEntry { + timestamp: number; + success: boolean; + textLength: number; + summarized: boolean; + provider?: string; + latencyMs?: number; + error?: string; +} + +// Track last TTS attempt for diagnostics (global, not per-user) +// Note: This shows the most recent TTS attempt system-wide, not user-specific +let lastTtsAttempt: TtsStatusEntry | undefined; + // ============================================================================= // Validation // ============================================================================= @@ -118,7 +136,27 @@ function isTtsEnabled(prefsPath: string): boolean { } } -function setTtsEnabled(prefsPath: string, enabled: boolean): void { +/** + * 
Atomically writes to a file using temp file + rename pattern. + * Readers never observe a partially written file; concurrent read-modify-write updates are not serialized and may still overwrite each other. + */ +function atomicWriteFileSync(filePath: string, content: string): void { + const tmpPath = `${filePath}.tmp.${Date.now()}.${Math.random().toString(36).slice(2)}`; + writeFileSync(tmpPath, content); + try { + renameSync(tmpPath, filePath); + } catch (err) { + // Clean up temp file on rename failure + try { + unlinkSync(tmpPath); + } catch { + // Ignore cleanup errors + } + throw err; + } +} + +function updatePrefs(prefsPath: string, update: (prefs: UserPreferences) => void): void { let prefs: UserPreferences = {}; try { if (existsSync(prefsPath)) { @@ -127,8 +165,14 @@ function setTtsEnabled(prefsPath: string, enabled: boolean): void { } catch { // ignore } - prefs.tts = { ...prefs.tts, enabled }; - writeFileSync(prefsPath, JSON.stringify(prefs, null, 2)); + update(prefs); + atomicWriteFileSync(prefsPath, JSON.stringify(prefs, null, 2)); +} + +function setTtsEnabled(prefsPath: string, enabled: boolean): void { + updatePrefs(prefsPath, (prefs) => { + prefs.tts = { ...prefs.tts, enabled }; + }); } function getTtsProvider(prefsPath: string): "openai" | "elevenlabs" | undefined { @@ -142,16 +186,9 @@ function getTtsProvider(prefsPath: string): "openai" | "elevenlabs" | undefined } function setTtsProvider(prefsPath: string, provider: "openai" | "elevenlabs"): void { - let prefs: UserPreferences = {}; - try { - if (existsSync(prefsPath)) { - prefs = JSON.parse(readFileSync(prefsPath, "utf8")); - } - } catch { - // ignore - } - prefs.tts = { ...prefs.tts, provider }; - writeFileSync(prefsPath, JSON.stringify(prefs, null, 2)); + updatePrefs(prefsPath, (prefs) => { + prefs.tts = { ...prefs.tts, provider }; + }); } function getTtsMaxLength(prefsPath: string): number { @@ -165,33 +202,50 @@ function getTtsMaxLength(prefsPath: string): number { } function setTtsMaxLength(prefsPath: string, maxLength: number): void { - let prefs: 
UserPreferences = {}; + updatePrefs(prefsPath, (prefs) => { + prefs.tts = { ...prefs.tts, maxLength }; + }); +} + +function isSummarizationEnabled(prefsPath: string): boolean { try { - if (existsSync(prefsPath)) { - prefs = JSON.parse(readFileSync(prefsPath, "utf8")); - } + if (!existsSync(prefsPath)) return DEFAULT_TTS_SUMMARIZE; + const prefs: UserPreferences = JSON.parse(readFileSync(prefsPath, "utf8")); + return prefs?.tts?.summarize ?? DEFAULT_TTS_SUMMARIZE; } catch { - // ignore + return DEFAULT_TTS_SUMMARIZE; } - prefs.tts = { ...prefs.tts, maxLength }; - writeFileSync(prefsPath, JSON.stringify(prefs, null, 2)); +} + +function setSummarizationEnabled(prefsPath: string, enabled: boolean): void { + updatePrefs(prefsPath, (prefs) => { + prefs.tts = { ...prefs.tts, summarize: enabled }; + }); } // ============================================================================= // Text Summarization (for long texts) // ============================================================================= +interface SummarizeResult { + summary: string; + latencyMs: number; + inputLength: number; + outputLength: number; +} + async function summarizeText( text: string, targetLength: number, apiKey: string, timeoutMs: number = 30000 -): Promise<string> { +): Promise<SummarizeResult> { // Validate targetLength if (targetLength < 100 || targetLength > 10000) { throw new Error(`Invalid targetLength: ${targetLength}`); } + const startTime = Date.now(); const controller = new AbortController(); const timeout = setTimeout(() => controller.abort(), timeoutMs); @@ -233,7 +287,13 @@ async function summarizeText( throw new Error("No summary returned"); } - return summary; + const latencyMs = Date.now() - startTime; + return { + summary, + latencyMs, + inputLength: text.length, + outputLength: summary.length, + }; } finally { clearTimeout(timeout); } @@ -262,13 +322,14 @@ function getApiKey(config: TtsConfig, provider: string): string | undefined { * This ensures the file is consumed before deletion. 
*/ function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS): void { - setTimeout(() => { + const timer = setTimeout(() => { try { rmSync(tempDir, { recursive: true, force: true }); } catch { // Ignore cleanup errors } }, delayMs); + timer.unref(); // Allow process to exit without waiting for cleanup } // ============================================================================= @@ -401,6 +462,7 @@ async function textToSpeech(text: string, config: TtsConfig, prefsPath?: string) continue; } + const providerStartTime = Date.now(); try { let audioBuffer: Buffer; @@ -425,6 +487,8 @@ async function textToSpeech(text: string, config: TtsConfig, prefsPath?: string) continue; } + const latencyMs = Date.now() - providerStartTime; + // Save to temp file const tempDir = mkdtempSync(join(tmpdir(), "tts-")); const audioPath = join(tempDir, `voice-${Date.now()}.mp3`); @@ -433,7 +497,7 @@ async function textToSpeech(text: string, config: TtsConfig, prefsPath?: string) // Schedule cleanup after delay (file should be consumed by then) scheduleCleanup(tempDir); - return { success: true, audioPath }; + return { success: true, audioPath, latencyMs, provider }; } catch (err) { const error = err as Error; if (error.name === "AbortError") { @@ -735,6 +799,84 @@ Do NOT add extra text around the MEDIA directive.`, }, }); + // /tts_summary [on|off] - Enable/disable auto-summarization + api.registerCommand({ + name: "tts_summary", + description: "Enable or disable auto-summarization for long texts", + acceptsArgs: true, + handler: (ctx) => { + const arg = ctx.args?.trim().toLowerCase(); + const currentEnabled = isSummarizationEnabled(prefsPath); + const maxLength = getTtsMaxLength(prefsPath); + + if (!arg) { + // Show current status + return { + text: `📝 **Auto-Resumo TTS**\n\n` + + `Status: ${currentEnabled ? 
"✅ Ativado" : "❌ Desativado"}\n` + + `Limite: ${maxLength} caracteres\n\n` + + `Quando ativado, textos maiores que ${maxLength} chars são resumidos com gpt-4o-mini antes de virar áudio.\n\n` + + `Uso: /tts_summary on ou /tts_summary off`, + }; + } + + if (arg !== "on" && arg !== "off") { + return { text: "❌ Use: /tts_summary on ou /tts_summary off" }; + } + + const newEnabled = arg === "on"; + setSummarizationEnabled(prefsPath, newEnabled); + log.info(`[${PLUGIN_ID}] Summarization ${newEnabled ? "enabled" : "disabled"} via /tts_summary command`); + return { + text: newEnabled + ? `✅ Auto-resumo **ativado**!\n\nTextos longos serão resumidos antes de virar áudio.` + : `❌ Auto-resumo **desativado**!\n\nTextos longos serão ignorados (sem áudio).`, + }; + }, + }); + + // /tts_status - Show TTS status and last attempt result + api.registerCommand({ + name: "tts_status", + description: "Show TTS status, configuration, and last attempt result", + acceptsArgs: false, + handler: () => { + const enabled = isTtsEnabled(prefsPath); + const userProvider = getTtsProvider(prefsPath); + const activeProvider = userProvider || config.provider || "openai"; + const maxLength = getTtsMaxLength(prefsPath); + const summarizationEnabled = isSummarizationEnabled(prefsPath); + const hasKey = !!getApiKey(config, activeProvider); + + let statusLines = [ + `📊 **Status TTS**\n`, + `Estado: ${enabled ? "✅ Ativado" : "❌ Desativado"}`, + `Provedor: ${activeProvider} (API Key: ${hasKey ? "✅" : "❌"})`, + `Limite de texto: ${maxLength} caracteres`, + `Auto-resumo: ${summarizationEnabled ? "✅ Ativado" : "❌ Desativado"}`, + ]; + + if (lastTtsAttempt) { + const timeAgo = Math.round((Date.now() - lastTtsAttempt.timestamp) / 1000); + statusLines.push(``); + statusLines.push(`**Última tentativa** (há ${timeAgo}s):`); + statusLines.push(`Resultado: ${lastTtsAttempt.success ? "✅ Sucesso" : "❌ Falha"}`); + statusLines.push(`Texto: ${lastTtsAttempt.textLength} chars${lastTtsAttempt.summarized ? 
" (resumido)" : ""}`); + if (lastTtsAttempt.success) { + statusLines.push(`Provedor: ${lastTtsAttempt.provider}`); + statusLines.push(`Latência: ${lastTtsAttempt.latencyMs}ms`); + } else if (lastTtsAttempt.error) { + statusLines.push(`Erro: ${lastTtsAttempt.error}`); + } + } else { + statusLines.push(``); + statusLines.push(`_Nenhuma tentativa de TTS registrada nesta sessão._`); + } + + return { text: statusLines.join("\n") }; + }, + }); + // =========================================================================== // Auto-TTS Hook (message_sending) // =========================================================================== @@ -763,9 +905,15 @@ Do NOT add extra text around the MEDIA directive.`, const maxLength = getTtsMaxLength(prefsPath); let textForAudio = content; + const summarizationEnabled = isSummarizationEnabled(prefsPath); - // If text exceeds limit, summarize it first + // If text exceeds limit, summarize it first (if enabled) if (content.length > maxLength) { + if (!summarizationEnabled) { + log.info(`[${PLUGIN_ID}] Auto-TTS: Text too long (${content.length} > ${maxLength}), summarization disabled, skipping audio`); + return; // User disabled summarization, skip audio for long texts + } + log.info(`[${PLUGIN_ID}] Auto-TTS: Text too long (${content.length} > ${maxLength}), summarizing...`); const openaiKey = getApiKey(config, "openai"); @@ -775,8 +923,11 @@ Do NOT add extra text around the MEDIA directive.`, } try { - textForAudio = await summarizeText(content, maxLength, openaiKey, config.timeoutMs); - log.info(`[${PLUGIN_ID}] Auto-TTS: Summarized to ${textForAudio.length} chars`); + const summarizeResult = await summarizeText(content, maxLength, openaiKey, config.timeoutMs); + textForAudio = summarizeResult.summary; + log.info( + `[${PLUGIN_ID}] Auto-TTS: Summarized ${summarizeResult.inputLength} → ${summarizeResult.outputLength} chars in ${summarizeResult.latencyMs}ms` + ); // Safeguard: if summary still exceeds hard limit, truncate const 
hardLimit = config.maxTextLength || 4000; @@ -793,24 +944,61 @@ Do NOT add extra text around the MEDIA directive.`, log.info(`[${PLUGIN_ID}] Auto-TTS: Converting ${content.length} chars`); } + const wasSummarized = textForAudio !== content; + try { + const ttsStartTime = Date.now(); const result = await textToSpeech(textForAudio, config, prefsPath); if (result.success && result.audioPath) { - log.info(`[${PLUGIN_ID}] Auto-TTS: Audio generated: ${result.audioPath}`); + const totalLatency = Date.now() - ttsStartTime; + log.info( + `[${PLUGIN_ID}] Auto-TTS: Generated via ${result.provider} in ${result.latencyMs}ms (total: ${totalLatency}ms)` + ); + + // Track successful attempt + lastTtsAttempt = { + timestamp: Date.now(), + success: true, + textLength: content.length, + summarized: wasSummarized, + provider: result.provider, + latencyMs: result.latencyMs, + }; + // Return modified content with MEDIA directive // The text is kept for accessibility, audio is appended return { content: `MEDIA:${result.audioPath}`, }; } else { - log.warn(`[${PLUGIN_ID}] Auto-TTS: Failed - ${result.error}`); + log.warn(`[${PLUGIN_ID}] Auto-TTS: TTS conversion failed - ${result.error}`); + + // Track failed attempt + lastTtsAttempt = { + timestamp: Date.now(), + success: false, + textLength: content.length, + summarized: wasSummarized, + error: result.error, + }; + // On failure, send original text without audio return; } } catch (err) { const error = err as Error; - log.error(`[${PLUGIN_ID}] Auto-TTS error: ${error.message}`); + log.error(`[${PLUGIN_ID}] Auto-TTS: Unexpected error - ${error.message}`); + + // Track error + lastTtsAttempt = { + timestamp: Date.now(), + success: false, + textLength: content.length, + summarized: wasSummarized, + error: error.message, + }; + // On error, send original text return; } @@ -844,3 +1032,14 @@ export const meta = { description: "Text-to-speech for chat responses using ElevenLabs or OpenAI", version: "0.3.0", }; + +// 
============================================================================= +// Test Exports (for unit testing) +// ============================================================================= + +export const _test = { + isValidVoiceId, + isValidOpenAIVoice, + isValidOpenAIModel, + OPENAI_TTS_MODELS, +};