From d9a467fe3b315e76a355491b1d3cfb58ead2145b Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 24 Jan 2026 07:57:46 +0000 Subject: [PATCH] feat: move TTS into core (#1559) (thanks @Glucksberg) --- CHANGELOG.md | 1 + docs/gateway/configuration.md | 38 + docs/tools/slash-commands.md | 7 + extensions/telegram-tts/README.md | 146 --- extensions/telegram-tts/clawdbot.plugin.json | 117 -- extensions/telegram-tts/index.test.ts | 218 ---- extensions/telegram-tts/index.ts | 1042 ------------------ extensions/telegram-tts/package.json | 8 - src/agents/clawdbot-tools.ts | 5 + src/agents/tools/tts-tool.ts | 60 + src/auto-reply/commands-registry.data.ts | 75 ++ src/auto-reply/reply/commands-core.ts | 2 + src/auto-reply/reply/commands-tts.ts | 214 ++++ src/auto-reply/reply/dispatch-from-config.ts | 54 +- src/auto-reply/reply/route-reply.ts | 43 - src/config/types.messages.ts | 3 + src/config/types.ts | 1 + src/config/types.tts.ts | 30 + src/config/zod-schema.core.ts | 30 + src/config/zod-schema.session.ts | 2 + src/gateway/server-methods-list.ts | 6 + src/gateway/server-methods.ts | 8 + src/gateway/server-methods/tts.ts | 138 +++ src/telegram/bot/delivery.ts | 59 - src/tts/tts.test.ts | 234 ++++ src/tts/tts.ts | 630 +++++++++++ 26 files changed, 1522 insertions(+), 1649 deletions(-) delete mode 100644 extensions/telegram-tts/README.md delete mode 100644 extensions/telegram-tts/clawdbot.plugin.json delete mode 100644 extensions/telegram-tts/index.test.ts delete mode 100644 extensions/telegram-tts/index.ts delete mode 100644 extensions/telegram-tts/package.json create mode 100644 src/agents/tools/tts-tool.ts create mode 100644 src/auto-reply/reply/commands-tts.ts create mode 100644 src/config/types.tts.ts create mode 100644 src/gateway/server-methods/tts.ts create mode 100644 src/tts/tts.test.ts create mode 100644 src/tts/tts.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 965d5ea07..a0c945eea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ Docs: https://docs.clawd.bot - Markdown: add per-channel table conversion (bullets for Signal/WhatsApp, code blocks elsewhere). (#1495) Thanks @odysseus0. - Tlon: add Urbit channel plugin (DMs, group mentions, thread replies). (#1544) Thanks @wca4a. - Channels: allow per-group tool allow/deny policies across built-in + plugin channels. (#1546) Thanks @adam91holt. +- TTS: move Telegram TTS into core with auto-replies, commands, and gateway methods. (#1559) Thanks @Glucksberg. ### Fixes - Skills: gate bird Homebrew install to macOS. (#1569) Thanks @bradleypriest. diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index ab41221a7..59d332190 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1446,6 +1446,44 @@ active agent’s `identity.emoji` when set, otherwise `"πŸ‘€"`. Set it to `""` t `removeAckAfterReply` removes the bot’s ack reaction after a reply is sent (Slack/Discord/Telegram only). Default: `false`. +#### `messages.tts` + +Enable text-to-speech for outbound replies. When on, Clawdbot generates audio +using ElevenLabs or OpenAI and attaches it to responses. Telegram uses Opus +voice notes; other channels send MP3 audio. 
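+
+Once enabled, you can sanity-check TTS over the gateway. The method names
+below are carried over from the old `telegram-tts` plugin's RPC surface and
+are assumed unchanged by this move into core; treat them as illustrative:
+
+```bash
+clawdbot gateway call tts.status
+clawdbot gateway call tts.convert '{"text": "Hello world"}'
+```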
+
+```json5
+{
+  messages: {
+    tts: {
+      enabled: true,
+      mode: "final", // final | all (include tool/block replies)
+      provider: "elevenlabs",
+      maxTextLength: 4000,
+      timeoutMs: 30000,
+      prefsPath: "~/.clawdbot/settings/tts.json",
+      elevenlabs: {
+        apiKey: "elevenlabs_api_key",
+        voiceId: "voice_id",
+        modelId: "eleven_multilingual_v2"
+      },
+      openai: {
+        apiKey: "openai_api_key",
+        model: "gpt-4o-mini-tts",
+        voice: "alloy"
+      }
+    }
+  }
+}
+```
+
+Notes:
+- `messages.tts.enabled` can be overridden by local user prefs (see `/tts_on`, `/tts_off`).
+- `prefsPath` stores local overrides (enabled/provider/limit/summarize).
+- `maxTextLength` is a hard cap for TTS input; summaries are truncated to fit.
+- `/tts_limit` and `/tts_summary` control per-user summarization settings.
+- `apiKey` values fall back to `ELEVENLABS_API_KEY`/`XI_API_KEY` and `OPENAI_API_KEY`.
+
 ### `talk`
 
 Defaults for Talk mode (macOS/iOS/Android). Voice IDs fall back to `ELEVENLABS_VOICE_ID` or `SAG_VOICE_ID` when unset.
diff --git a/docs/tools/slash-commands.md b/docs/tools/slash-commands.md
index 6ab3c87aa..b8ccb7c83 100644
--- a/docs/tools/slash-commands.md
+++ b/docs/tools/slash-commands.md
@@ -67,6 +67,13 @@ Text + native (when enabled):
 - `/config show|get|set|unset` (persist config to disk, owner-only; requires `commands.config: true`)
 - `/debug show|set|unset|reset` (runtime overrides, owner-only; requires `commands.debug: true`)
 - `/usage off|tokens|full|cost` (per-response usage footer or local cost summary)
+- `/tts_on` (enable TTS replies)
+- `/tts_off` (disable TTS replies)
+- `/tts_provider [openai|elevenlabs]` (set or show TTS provider)
+- `/tts_limit <chars>` (max chars before TTS summarization)
+- `/tts_summary on|off` (toggle TTS auto-summary)
+- `/tts_status` (show TTS status)
+- `/audio <text>` (convert text to a TTS audio reply)
 - `/stop`
 - `/restart`
 - `/dock-telegram` (alias: `/dock_telegram`) (switch replies to Telegram)
diff --git a/extensions/telegram-tts/README.md b/extensions/telegram-tts/README.md
deleted file mode 100644
index 0ea774bab..000000000
--- a/extensions/telegram-tts/README.md
+++ /dev/null
@@ -1,146 +0,0 @@
-# Telegram TTS Extension
-
-Automatic text-to-speech for chat responses using ElevenLabs or OpenAI.
-
-## Features
-
-- **Auto-TTS**: Automatically converts all text responses to voice when enabled
-- **`speak` Tool**: Converts text to speech and sends as voice message
-- **RPC Methods**: Control TTS via Gateway (`tts.status`, `tts.enable`, `tts.disable`, `tts.convert`, `tts.providers`)
-- **User Commands**: `/tts_on`, `/tts_off`, `/tts_provider`, `/tts_limit`, `/tts_summary`, `/tts_status`
-- **Auto-Summarization**: Long texts are automatically summarized before TTS conversion
-- **Multi-provider**: ElevenLabs and OpenAI TTS with automatic fallback
-- **Self-contained**: No external CLI dependencies - calls APIs directly
-
-## Requirements
-
-- **For TTS**: ElevenLabs API key OR OpenAI API key
-- **For Auto-Summarization**: OpenAI API key (uses gpt-4o-mini to summarize long texts)
-
-## Installation
-
-The extension is bundled with Clawdbot.
Enable it in your config: - -```json -{ - "plugins": { - "entries": { - "telegram-tts": { - "enabled": true, - "provider": "elevenlabs", - "elevenlabs": { - "apiKey": "your-api-key" - } - } - } - } -} -``` - -Or use OpenAI: - -```json -{ - "plugins": { - "entries": { - "telegram-tts": { - "enabled": true, - "provider": "openai", - "openai": { - "apiKey": "your-api-key", - "voice": "nova" - } - } - } - } -} -``` - -Or set API keys via environment variables: - -```bash -# For ElevenLabs -export ELEVENLABS_API_KEY=your-api-key -# or -export XI_API_KEY=your-api-key - -# For OpenAI -export OPENAI_API_KEY=your-api-key -``` - -## Configuration - -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `enabled` | boolean | `false` | Enable the plugin | -| `provider` | string | `"openai"` | TTS provider (`elevenlabs` or `openai`) | -| `elevenlabs.apiKey` | string | - | ElevenLabs API key | -| `elevenlabs.voiceId` | string | `"pMsXgVXv3BLzUgSXRplE"` | ElevenLabs Voice ID | -| `elevenlabs.modelId` | string | `"eleven_multilingual_v2"` | ElevenLabs Model ID | -| `openai.apiKey` | string | - | OpenAI API key | -| `openai.model` | string | `"gpt-4o-mini-tts"` | OpenAI model (`gpt-4o-mini-tts`, `tts-1`, or `tts-1-hd`) | -| `openai.voice` | string | `"alloy"` | OpenAI voice | -| `prefsPath` | string | `~/clawd/.user-preferences.json` | User preferences file | -| `maxTextLength` | number | `4000` | Max characters for TTS | -| `timeoutMs` | number | `30000` | API request timeout in milliseconds | - -### OpenAI Voices - -Available voices: `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer` - -## Usage - -### Agent Tool - -The agent can use the `speak` tool to send voice messages: - -``` -User: Send me a voice message saying hello -Agent: [calls speak({ text: "Hello! How can I help you today?" })] -``` - -### RPC Methods - -```bash -# Check TTS status -clawdbot gateway call tts.status - -# Enable/disable TTS -clawdbot gateway call tts.enable -clawdbot gateway call tts.disable - -# Convert text to audio -clawdbot gateway call tts.convert '{"text": "Hello world"}' - -# List available providers -clawdbot gateway call tts.providers -``` - -### Telegram Commands - -The plugin registers the following commands automatically: - -| Command | Description | -|---------|-------------| -| `/tts_on` | Enable auto-TTS for all responses | -| `/tts_off` | Disable auto-TTS | -| `/tts_provider [openai\|elevenlabs]` | Switch TTS provider (with fallback) | -| `/tts_limit [chars]` | Set max text length before summarization (default: 1500) | -| `/tts_summary [on\|off]` | Enable/disable auto-summarization for long texts | -| `/tts_status` | Show TTS status, config, and last attempt result | - -## Auto-Summarization - -When enabled (default), texts exceeding the configured limit are automatically summarized using OpenAI's gpt-4o-mini before TTS conversion. This ensures long responses can still be converted to audio. - -**Requirements**: OpenAI API key must be configured for summarization to work, even if using ElevenLabs for TTS. 
- -**Behavior**: -- Texts under the limit are converted directly -- Texts over the limit are summarized first, then converted -- If summarization is disabled (`/tts_summary off`), long texts are skipped (no audio) -- After summarization, a hard limit is applied to prevent oversized TTS requests - -## License - -MIT diff --git a/extensions/telegram-tts/clawdbot.plugin.json b/extensions/telegram-tts/clawdbot.plugin.json deleted file mode 100644 index c92258cd0..000000000 --- a/extensions/telegram-tts/clawdbot.plugin.json +++ /dev/null @@ -1,117 +0,0 @@ -{ - "id": "telegram-tts", - "uiHints": { - "enabled": { - "label": "Enable TTS", - "help": "Automatically convert text responses to voice messages" - }, - "provider": { - "label": "TTS Provider", - "help": "Choose between ElevenLabs or OpenAI for voice synthesis" - }, - "elevenlabs.apiKey": { - "label": "ElevenLabs API Key", - "sensitive": true - }, - "elevenlabs.voiceId": { - "label": "ElevenLabs Voice ID", - "help": "Default: pMsXgVXv3BLzUgSXRplE (Borislav)" - }, - "elevenlabs.modelId": { - "label": "ElevenLabs Model ID", - "help": "Default: eleven_multilingual_v2" - }, - "openai.apiKey": { - "label": "OpenAI API Key", - "sensitive": true - }, - "openai.model": { - "label": "OpenAI TTS Model", - "help": "gpt-4o-mini-tts (recommended)" - }, - "openai.voice": { - "label": "OpenAI Voice", - "help": "alloy, echo, fable, onyx, nova, or shimmer" - }, - "prefsPath": { - "label": "User Preferences File", - "help": "Path to JSON file storing TTS state", - "advanced": true - }, - "maxTextLength": { - "label": "Max Text Length", - "help": "Maximum characters to convert to speech", - "advanced": true - }, - "timeoutMs": { - "label": "Request Timeout (ms)", - "help": "Maximum time to wait for TTS API response (default: 30000)", - "advanced": true - } - }, - "configSchema": { - "type": "object", - "additionalProperties": false, - "properties": { - "enabled": { - "type": "boolean", - "default": false - }, - "provider": { - "type": "string", - "enum": ["elevenlabs", "openai"], - "default": "elevenlabs" - }, - "elevenlabs": { - "type": "object", - "additionalProperties": false, - "properties": { - "apiKey": { - "type": "string" - }, - "voiceId": { - "type": "string", - "default": "pMsXgVXv3BLzUgSXRplE" - }, - "modelId": { - "type": "string", - "default": "eleven_multilingual_v2" - } - } - }, - "openai": { - "type": "object", - "additionalProperties": false, - "properties": { - "apiKey": { - "type": "string" - }, - "model": { - "type": "string", - "enum": ["gpt-4o-mini-tts"], - "default": "gpt-4o-mini-tts" - }, - "voice": { - "type": "string", - "enum": ["alloy", "echo", "fable", "onyx", "nova", "shimmer"], - "default": "alloy" - } - } - }, - "prefsPath": { - "type": "string" - }, - "maxTextLength": { - "type": "integer", - "minimum": 1, - "default": 4000 - }, - "timeoutMs": { - "type": "integer", - "minimum": 1000, - "maximum": 120000, - "default": 30000 - } - } - } -} diff --git a/extensions/telegram-tts/index.test.ts b/extensions/telegram-tts/index.test.ts deleted file mode 100644 index add0d38c1..000000000 --- a/extensions/telegram-tts/index.test.ts +++ /dev/null @@ -1,218 +0,0 @@ -/** - * Unit tests for telegram-tts extension - */ - -import { describe, expect, it, vi, beforeEach, afterEach } from "vitest"; -import { _test, meta } from "./index.js"; - -const { isValidVoiceId, isValidOpenAIVoice, isValidOpenAIModel, OPENAI_TTS_MODELS, summarizeText } = _test; - -describe("telegram-tts", () => { - describe("meta", () => { - it("should have correct plugin 
metadata", () => { - expect(meta.id).toBe("telegram-tts"); - expect(meta.name).toBe("Telegram TTS"); - expect(meta.version).toMatch(/^\d+\.\d+\.\d+$/); - }); - }); - - describe("isValidVoiceId", () => { - it("should accept valid ElevenLabs voice IDs", () => { - // Real ElevenLabs voice ID format (20 alphanumeric chars) - expect(isValidVoiceId("pMsXgVXv3BLzUgSXRplE")).toBe(true); - expect(isValidVoiceId("21m00Tcm4TlvDq8ikWAM")).toBe(true); - expect(isValidVoiceId("EXAVITQu4vr4xnSDxMaL")).toBe(true); - }); - - it("should accept voice IDs of varying valid lengths", () => { - expect(isValidVoiceId("a1b2c3d4e5")).toBe(true); // 10 chars (min) - expect(isValidVoiceId("a".repeat(40))).toBe(true); // 40 chars (max) - }); - - it("should reject too short voice IDs", () => { - expect(isValidVoiceId("")).toBe(false); - expect(isValidVoiceId("abc")).toBe(false); - expect(isValidVoiceId("123456789")).toBe(false); // 9 chars - }); - - it("should reject too long voice IDs", () => { - expect(isValidVoiceId("a".repeat(41))).toBe(false); - expect(isValidVoiceId("a".repeat(100))).toBe(false); - }); - - it("should reject voice IDs with invalid characters", () => { - expect(isValidVoiceId("pMsXgVXv3BLz-gSXRplE")).toBe(false); // hyphen - expect(isValidVoiceId("pMsXgVXv3BLz_gSXRplE")).toBe(false); // underscore - expect(isValidVoiceId("pMsXgVXv3BLz gSXRplE")).toBe(false); // space - expect(isValidVoiceId("../../../etc/passwd")).toBe(false); // path traversal - expect(isValidVoiceId("voice?param=value")).toBe(false); // query string - }); - }); - - describe("isValidOpenAIVoice", () => { - it("should accept all valid OpenAI voices", () => { - const validVoices = ["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"]; - for (const voice of validVoices) { - expect(isValidOpenAIVoice(voice)).toBe(true); - } - }); - - it("should reject invalid voice names", () => { - expect(isValidOpenAIVoice("invalid")).toBe(false); - expect(isValidOpenAIVoice("")).toBe(false); - expect(isValidOpenAIVoice("ALLOY")).toBe(false); // case sensitive - expect(isValidOpenAIVoice("alloy ")).toBe(false); // trailing space - expect(isValidOpenAIVoice(" alloy")).toBe(false); // leading space - }); - }); - - describe("isValidOpenAIModel", () => { - it("should accept gpt-4o-mini-tts model", () => { - expect(isValidOpenAIModel("gpt-4o-mini-tts")).toBe(true); - }); - - it("should reject other models", () => { - expect(isValidOpenAIModel("tts-1")).toBe(false); - expect(isValidOpenAIModel("tts-1-hd")).toBe(false); - expect(isValidOpenAIModel("invalid")).toBe(false); - expect(isValidOpenAIModel("")).toBe(false); - expect(isValidOpenAIModel("gpt-4")).toBe(false); - }); - }); - - describe("OPENAI_TTS_MODELS", () => { - it("should contain only gpt-4o-mini-tts", () => { - expect(OPENAI_TTS_MODELS).toContain("gpt-4o-mini-tts"); - expect(OPENAI_TTS_MODELS).toHaveLength(1); - }); - - it("should be a non-empty array", () => { - expect(Array.isArray(OPENAI_TTS_MODELS)).toBe(true); - expect(OPENAI_TTS_MODELS.length).toBeGreaterThan(0); - }); - }); - - describe("summarizeText", () => { - const mockApiKey = "test-api-key"; - const originalFetch = globalThis.fetch; - - beforeEach(() => { - vi.useFakeTimers({ shouldAdvanceTime: true }); - }); - - afterEach(() => { - globalThis.fetch = originalFetch; - vi.useRealTimers(); - }); - - it("should summarize text and return result with metrics", async () => { - const mockSummary = "This is a summarized version of the text."; - globalThis.fetch = vi.fn().mockResolvedValue({ - ok: true, - json: () => 
Promise.resolve({ - choices: [{ message: { content: mockSummary } }], - }), - }); - - const longText = "A".repeat(2000); // Text longer than default limit - const result = await summarizeText(longText, 1500, mockApiKey); - - expect(result.summary).toBe(mockSummary); - expect(result.inputLength).toBe(2000); - expect(result.outputLength).toBe(mockSummary.length); - expect(result.latencyMs).toBeGreaterThanOrEqual(0); - expect(globalThis.fetch).toHaveBeenCalledTimes(1); - }); - - it("should call OpenAI API with correct parameters", async () => { - globalThis.fetch = vi.fn().mockResolvedValue({ - ok: true, - json: () => Promise.resolve({ - choices: [{ message: { content: "Summary" } }], - }), - }); - - await summarizeText("Long text to summarize", 500, mockApiKey); - - expect(globalThis.fetch).toHaveBeenCalledWith( - "https://api.openai.com/v1/chat/completions", - expect.objectContaining({ - method: "POST", - headers: { - Authorization: `Bearer ${mockApiKey}`, - "Content-Type": "application/json", - }, - }) - ); - - const callArgs = (globalThis.fetch as ReturnType).mock.calls[0]; - const body = JSON.parse(callArgs[1].body); - expect(body.model).toBe("gpt-4o-mini"); - expect(body.temperature).toBe(0.3); - expect(body.max_tokens).toBe(250); // Math.ceil(500 / 2) - }); - - it("should reject targetLength below minimum (100)", async () => { - await expect(summarizeText("text", 99, mockApiKey)).rejects.toThrow( - "Invalid targetLength: 99" - ); - }); - - it("should reject targetLength above maximum (10000)", async () => { - await expect(summarizeText("text", 10001, mockApiKey)).rejects.toThrow( - "Invalid targetLength: 10001" - ); - }); - - it("should accept targetLength at boundaries", async () => { - globalThis.fetch = vi.fn().mockResolvedValue({ - ok: true, - json: () => Promise.resolve({ - choices: [{ message: { content: "Summary" } }], - }), - }); - - // Min boundary - await expect(summarizeText("text", 100, mockApiKey)).resolves.toBeDefined(); - // Max boundary - await expect(summarizeText("text", 10000, mockApiKey)).resolves.toBeDefined(); - }); - - it("should throw error when API returns non-ok response", async () => { - globalThis.fetch = vi.fn().mockResolvedValue({ - ok: false, - status: 500, - }); - - await expect(summarizeText("text", 500, mockApiKey)).rejects.toThrow( - "Summarization service unavailable" - ); - }); - - it("should throw error when no summary is returned", async () => { - globalThis.fetch = vi.fn().mockResolvedValue({ - ok: true, - json: () => Promise.resolve({ - choices: [], - }), - }); - - await expect(summarizeText("text", 500, mockApiKey)).rejects.toThrow( - "No summary returned" - ); - }); - - it("should throw error when summary content is empty", async () => { - globalThis.fetch = vi.fn().mockResolvedValue({ - ok: true, - json: () => Promise.resolve({ - choices: [{ message: { content: " " } }], // whitespace only - }), - }); - - await expect(summarizeText("text", 500, mockApiKey)).rejects.toThrow( - "No summary returned" - ); - }); - }); -}); diff --git a/extensions/telegram-tts/index.ts b/extensions/telegram-tts/index.ts deleted file mode 100644 index 984bb1abd..000000000 --- a/extensions/telegram-tts/index.ts +++ /dev/null @@ -1,1042 +0,0 @@ -/** - * telegram-tts - Automatic TTS for chat responses - * - * Self-contained TTS extension that calls ElevenLabs/OpenAI APIs directly. - * No external CLI dependencies. 
- * - * Features: - * - speak tool for programmatic TTS - * - Multi-provider support (ElevenLabs, OpenAI) - * - RPC methods for status and control - * - * Note: Slash commands (/tts_on, /tts_off, /audio) should be configured - * via Telegram customCommands and handled by the agent workspace. - */ - -import { existsSync, readFileSync, writeFileSync, mkdtempSync, rmSync, renameSync, unlinkSync } from "fs"; -import { join } from "path"; -import { tmpdir } from "os"; -import type { PluginApi } from "clawdbot"; - -const PLUGIN_ID = "telegram-tts"; -const DEFAULT_TIMEOUT_MS = 30000; -const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes - -// ============================================================================= -// Types -// ============================================================================= - -interface TtsConfig { - enabled?: boolean; - provider?: "elevenlabs" | "openai"; - elevenlabs?: { - apiKey?: string; - voiceId?: string; - modelId?: string; - }; - openai?: { - apiKey?: string; - model?: string; - voice?: string; - }; - prefsPath?: string; - maxTextLength?: number; - timeoutMs?: number; -} - -interface UserPreferences { - tts?: { - enabled?: boolean; - provider?: "openai" | "elevenlabs"; - maxLength?: number; // Max chars before summarizing (default 1500) - summarize?: boolean; // Enable auto-summarization (default true) - }; -} - -const DEFAULT_TTS_MAX_LENGTH = 1500; -const DEFAULT_TTS_SUMMARIZE = true; - -interface TtsResult { - success: boolean; - audioPath?: string; - error?: string; - latencyMs?: number; - provider?: string; -} - -interface TtsStatusEntry { - timestamp: number; - success: boolean; - textLength: number; - summarized: boolean; - provider?: string; - latencyMs?: number; - error?: string; -} - -// Track last TTS attempt for diagnostics (global, not per-user) -// Note: This shows the most recent TTS attempt system-wide, not user-specific -let lastTtsAttempt: TtsStatusEntry | undefined; - -// ============================================================================= -// Validation -// ============================================================================= - -/** - * Validates ElevenLabs voiceId format to prevent URL injection. - * Voice IDs are alphanumeric strings, typically 20 characters. - */ -function isValidVoiceId(voiceId: string): boolean { - return /^[a-zA-Z0-9]{10,40}$/.test(voiceId); -} - -/** - * Validates OpenAI voice name. - */ -function isValidOpenAIVoice(voice: string): boolean { - const validVoices = ["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"]; - return validVoices.includes(voice); -} - -/** - * Available OpenAI TTS models. - */ -const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"]; - -/** - * Validates OpenAI TTS model name. 
- */ -function isValidOpenAIModel(model: string): boolean { - return OPENAI_TTS_MODELS.includes(model); -} - -// ============================================================================= -// Configuration & Preferences -// ============================================================================= - -function getPrefsPath(config: TtsConfig): string { - return ( - config.prefsPath || - process.env.CLAWDBOT_TTS_PREFS || - join(process.env.HOME || "/home/dev", "clawd", ".user-preferences.json") - ); -} - -function isTtsEnabled(prefsPath: string): boolean { - try { - if (!existsSync(prefsPath)) return false; - const prefs: UserPreferences = JSON.parse(readFileSync(prefsPath, "utf8")); - return prefs?.tts?.enabled === true; - } catch { - return false; - } -} - -/** - * Atomically writes to a file using temp file + rename pattern. - * Prevents race conditions when multiple processes write simultaneously. - */ -function atomicWriteFileSync(filePath: string, content: string): void { - const tmpPath = `${filePath}.tmp.${Date.now()}.${Math.random().toString(36).slice(2)}`; - writeFileSync(tmpPath, content); - try { - renameSync(tmpPath, filePath); - } catch (err) { - // Clean up temp file on rename failure - try { - unlinkSync(tmpPath); - } catch { - // Ignore cleanup errors - } - throw err; - } -} - -function updatePrefs(prefsPath: string, update: (prefs: UserPreferences) => void): void { - let prefs: UserPreferences = {}; - try { - if (existsSync(prefsPath)) { - prefs = JSON.parse(readFileSync(prefsPath, "utf8")); - } - } catch { - // ignore - } - update(prefs); - atomicWriteFileSync(prefsPath, JSON.stringify(prefs, null, 2)); -} - -function setTtsEnabled(prefsPath: string, enabled: boolean): void { - updatePrefs(prefsPath, (prefs) => { - prefs.tts = { ...prefs.tts, enabled }; - }); -} - -function getTtsProvider(prefsPath: string): "openai" | "elevenlabs" | undefined { - try { - if (!existsSync(prefsPath)) return undefined; - const prefs: UserPreferences = JSON.parse(readFileSync(prefsPath, "utf8")); - return prefs?.tts?.provider; - } catch { - return undefined; - } -} - -function setTtsProvider(prefsPath: string, provider: "openai" | "elevenlabs"): void { - updatePrefs(prefsPath, (prefs) => { - prefs.tts = { ...prefs.tts, provider }; - }); -} - -function getTtsMaxLength(prefsPath: string): number { - try { - if (!existsSync(prefsPath)) return DEFAULT_TTS_MAX_LENGTH; - const prefs: UserPreferences = JSON.parse(readFileSync(prefsPath, "utf8")); - return prefs?.tts?.maxLength ?? DEFAULT_TTS_MAX_LENGTH; - } catch { - return DEFAULT_TTS_MAX_LENGTH; - } -} - -function setTtsMaxLength(prefsPath: string, maxLength: number): void { - updatePrefs(prefsPath, (prefs) => { - prefs.tts = { ...prefs.tts, maxLength }; - }); -} - -function isSummarizationEnabled(prefsPath: string): boolean { - try { - if (!existsSync(prefsPath)) return DEFAULT_TTS_SUMMARIZE; - const prefs: UserPreferences = JSON.parse(readFileSync(prefsPath, "utf8")); - return prefs?.tts?.summarize ?? 
DEFAULT_TTS_SUMMARIZE; - } catch { - return DEFAULT_TTS_SUMMARIZE; - } -} - -function setSummarizationEnabled(prefsPath: string, enabled: boolean): void { - updatePrefs(prefsPath, (prefs) => { - prefs.tts = { ...prefs.tts, summarize: enabled }; - }); -} - -// ============================================================================= -// Text Summarization (for long texts) -// ============================================================================= - -interface SummarizeResult { - summary: string; - latencyMs: number; - inputLength: number; - outputLength: number; -} - -async function summarizeText( - text: string, - targetLength: number, - apiKey: string, - timeoutMs: number = 30000 -): Promise { - // Validate targetLength - if (targetLength < 100 || targetLength > 10000) { - throw new Error(`Invalid targetLength: ${targetLength}`); - } - - const startTime = Date.now(); - const controller = new AbortController(); - const timeout = setTimeout(() => controller.abort(), timeoutMs); - - try { - const response = await fetch("https://api.openai.com/v1/chat/completions", { - method: "POST", - headers: { - Authorization: `Bearer ${apiKey}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - model: "gpt-4o-mini", - messages: [ - { - role: "system", - content: `You are an assistant that summarizes texts concisely while keeping the most important information. Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. Reply only with the summary, without additional explanations.`, - }, - { - role: "user", - content: `\n${text}\n`, - }, - ], - max_tokens: Math.ceil(targetLength / 2), // Conservative estimate for multilingual text - temperature: 0.3, - }), - signal: controller.signal, - }); - - if (!response.ok) { - throw new Error("Summarization service unavailable"); - } - - const data = await response.json() as { - choices?: Array<{ message?: { content?: string } }>; - }; - const summary = data.choices?.[0]?.message?.content?.trim(); - - if (!summary) { - throw new Error("No summary returned"); - } - - const latencyMs = Date.now() - startTime; - return { - summary, - latencyMs, - inputLength: text.length, - outputLength: summary.length, - }; - } finally { - clearTimeout(timeout); - } -} - -function getApiKey(config: TtsConfig, provider: string): string | undefined { - if (provider === "elevenlabs") { - return ( - config.elevenlabs?.apiKey || - process.env.ELEVENLABS_API_KEY || - process.env.XI_API_KEY - ); - } - if (provider === "openai") { - return config.openai?.apiKey || process.env.OPENAI_API_KEY; - } - return undefined; -} - -// ============================================================================= -// Temp File Cleanup -// ============================================================================= - -/** - * Schedules cleanup of a temp directory after a delay. - * This ensures the file is consumed before deletion. 
- */ -function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS): void { - const timer = setTimeout(() => { - try { - rmSync(tempDir, { recursive: true, force: true }); - } catch { - // Ignore cleanup errors - } - }, delayMs); - timer.unref(); // Allow process to exit without waiting for cleanup -} - -// ============================================================================= -// TTS Providers -// ============================================================================= - -async function elevenLabsTTS( - text: string, - apiKey: string, - voiceId: string = "pMsXgVXv3BLzUgSXRplE", - modelId: string = "eleven_multilingual_v2", - timeoutMs: number = DEFAULT_TIMEOUT_MS -): Promise { - // Validate voiceId to prevent URL injection - if (!isValidVoiceId(voiceId)) { - throw new Error(`Invalid voiceId format`); - } - - const controller = new AbortController(); - const timeout = setTimeout(() => controller.abort(), timeoutMs); - - try { - const response = await fetch( - `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`, - { - method: "POST", - headers: { - "xi-api-key": apiKey, - "Content-Type": "application/json", - Accept: "audio/mpeg", - }, - body: JSON.stringify({ - text, - model_id: modelId, - voice_settings: { - stability: 0.5, - similarity_boost: 0.75, - style: 0.0, - use_speaker_boost: true, - }, - }), - signal: controller.signal, - } - ); - - if (!response.ok) { - // Don't leak API error details to users - throw new Error(`ElevenLabs API error (${response.status})`); - } - - return Buffer.from(await response.arrayBuffer()); - } finally { - clearTimeout(timeout); - } -} - -async function openaiTTS( - text: string, - apiKey: string, - model: string = "gpt-4o-mini-tts", - voice: string = "alloy", - timeoutMs: number = DEFAULT_TIMEOUT_MS -): Promise { - // Validate model - if (!isValidOpenAIModel(model)) { - throw new Error(`Invalid model: ${model}`); - } - // Validate voice - if (!isValidOpenAIVoice(voice)) { - throw new Error(`Invalid voice: ${voice}`); - } - - const controller = new AbortController(); - const timeout = setTimeout(() => controller.abort(), timeoutMs); - - try { - const response = await fetch("https://api.openai.com/v1/audio/speech", { - method: "POST", - headers: { - Authorization: `Bearer ${apiKey}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - model, - input: text, - voice, - response_format: "mp3", - }), - signal: controller.signal, - }); - - if (!response.ok) { - // Don't leak API error details to users - throw new Error(`OpenAI TTS API error (${response.status})`); - } - - return Buffer.from(await response.arrayBuffer()); - } finally { - clearTimeout(timeout); - } -} - -// ============================================================================= -// Core TTS Function -// ============================================================================= - -async function textToSpeech(text: string, config: TtsConfig, prefsPath?: string): Promise { - // Get user's preferred provider (from prefs) or fall back to config - const userProvider = prefsPath ? getTtsProvider(prefsPath) : undefined; - const primaryProvider = userProvider || config.provider || "elevenlabs"; - const fallbackProvider = primaryProvider === "openai" ? 
"elevenlabs" : "openai"; - const timeoutMs = config.timeoutMs || DEFAULT_TIMEOUT_MS; - - const maxLen = config.maxTextLength || 4000; - if (text.length > maxLen) { - return { - success: false, - error: `Text too long (${text.length} chars, max ${maxLen})`, - }; - } - - // Try primary provider first, then fallback - const providers = [primaryProvider, fallbackProvider]; - let lastError: string | undefined; - - for (const provider of providers) { - const apiKey = getApiKey(config, provider); - if (!apiKey) { - lastError = `No API key for ${provider}`; - continue; - } - - const providerStartTime = Date.now(); - try { - let audioBuffer: Buffer; - - if (provider === "elevenlabs") { - audioBuffer = await elevenLabsTTS( - text, - apiKey, - config.elevenlabs?.voiceId, - config.elevenlabs?.modelId, - timeoutMs - ); - } else if (provider === "openai") { - audioBuffer = await openaiTTS( - text, - apiKey, - config.openai?.model || "gpt-4o-mini-tts", - config.openai?.voice, - timeoutMs - ); - } else { - lastError = `Unknown provider: ${provider}`; - continue; - } - - const latencyMs = Date.now() - providerStartTime; - - // Save to temp file - const tempDir = mkdtempSync(join(tmpdir(), "tts-")); - const audioPath = join(tempDir, `voice-${Date.now()}.mp3`); - writeFileSync(audioPath, audioBuffer); - - // Schedule cleanup after delay (file should be consumed by then) - scheduleCleanup(tempDir); - - return { success: true, audioPath, latencyMs, provider }; - } catch (err) { - const error = err as Error; - if (error.name === "AbortError") { - lastError = `${provider}: request timed out`; - } else { - lastError = `${provider}: ${error.message}`; - } - // Continue to try fallback provider - } - } - - return { - success: false, - error: `TTS conversion failed: ${lastError || "no providers available"}`, - }; -} - -// ============================================================================= -// Plugin Registration -// ============================================================================= - -export default function register(api: PluginApi) { - const log = api.logger; - const config: TtsConfig = { - enabled: false, - provider: "elevenlabs", - maxTextLength: 4000, - timeoutMs: DEFAULT_TIMEOUT_MS, - ...(api.pluginConfig || {}), - }; - const prefsPath = getPrefsPath(config); - - log.info(`[${PLUGIN_ID}] Registering plugin...`); - log.info(`[${PLUGIN_ID}] Provider: ${config.provider}`); - log.info(`[${PLUGIN_ID}] Preferences: ${prefsPath}`); - - // =========================================================================== - // Tool: speak - // =========================================================================== - api.registerTool({ - name: "speak", - description: `Convert text to speech and generate voice message. -Use this tool when TTS mode is enabled or user requests audio. - -IMPORTANT: After calling this tool, you MUST output the result exactly as returned. -The tool returns "MEDIA:/path/to/audio.mp3" - copy this EXACTLY to your response. -This MEDIA: directive tells the system to send the audio file. - -Example flow: -1. User asks a question with TTS enabled -2. You call speak({text: "Your answer here"}) -3. Tool returns: MEDIA:/tmp/tts-xxx/voice-123.mp3 -4. 
You output: MEDIA:/tmp/tts-xxx/voice-123.mp3 - -Do NOT add extra text around the MEDIA directive.`, - parameters: { - type: "object", - properties: { - text: { - type: "string", - description: "The text to convert to speech", - }, - }, - required: ["text"], - }, - execute: async (_id: string, params: { text?: unknown }) => { - // Validate text parameter - if (typeof params?.text !== "string" || params.text.length === 0) { - return { content: [{ type: "text", text: "Error: Invalid or missing text parameter" }] }; - } - - const text = params.text; - log.info(`[${PLUGIN_ID}] speak() called, length: ${text.length}`); - - const result = await textToSpeech(text, config, prefsPath); - - if (result.success && result.audioPath) { - log.info(`[${PLUGIN_ID}] Audio generated: ${result.audioPath}`); - // Return with MEDIA directive for clawdbot to send - return { - content: [ - { - type: "text", - text: `MEDIA:${result.audioPath}`, - }, - ], - }; - } - - log.error(`[${PLUGIN_ID}] TTS failed: ${result.error}`); - return { - content: [ - { - type: "text", - text: result.error || "TTS conversion failed", - }, - ], - }; - }, - }); - - // =========================================================================== - // RPC Methods - // =========================================================================== - - // tts.status - Check if TTS is enabled - api.registerGatewayMethod("tts.status", async () => { - const userProvider = getTtsProvider(prefsPath); - const activeProvider = userProvider || config.provider || "elevenlabs"; - return { - enabled: isTtsEnabled(prefsPath), - provider: activeProvider, - fallbackProvider: activeProvider === "openai" ? "elevenlabs" : "openai", - prefsPath, - hasOpenAIKey: !!getApiKey(config, "openai"), - hasElevenLabsKey: !!getApiKey(config, "elevenlabs"), - }; - }); - - // tts.enable - Enable TTS mode - api.registerGatewayMethod("tts.enable", async () => { - setTtsEnabled(prefsPath, true); - log.info(`[${PLUGIN_ID}] TTS enabled via RPC`); - return { ok: true, enabled: true }; - }); - - // tts.disable - Disable TTS mode - api.registerGatewayMethod("tts.disable", async () => { - setTtsEnabled(prefsPath, false); - log.info(`[${PLUGIN_ID}] TTS disabled via RPC`); - return { ok: true, enabled: false }; - }); - - // tts.convert - Convert text to audio (returns path) - api.registerGatewayMethod("tts.convert", async (params: { text?: unknown }) => { - // Validate text parameter - if (typeof params?.text !== "string" || params.text.length === 0) { - return { ok: false, error: "Invalid or missing 'text' parameter" }; - } - const result = await textToSpeech(params.text, config, prefsPath); - if (result.success) { - return { ok: true, audioPath: result.audioPath }; - } - return { ok: false, error: result.error }; - }); - - // tts.setProvider - Set primary TTS provider - api.registerGatewayMethod("tts.setProvider", async (params: { provider?: unknown }) => { - if (params?.provider !== "openai" && params?.provider !== "elevenlabs") { - return { ok: false, error: "Invalid provider. 
Use 'openai' or 'elevenlabs'" }; - } - setTtsProvider(prefsPath, params.provider); - log.info(`[${PLUGIN_ID}] Provider set to ${params.provider} via RPC`); - return { ok: true, provider: params.provider }; - }); - - // tts.providers - List available providers and their status - api.registerGatewayMethod("tts.providers", async () => { - const userProvider = getTtsProvider(prefsPath); - return { - providers: [ - { - id: "openai", - name: "OpenAI", - configured: !!getApiKey(config, "openai"), - models: ["gpt-4o-mini-tts"], - voices: ["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"], - }, - { - id: "elevenlabs", - name: "ElevenLabs", - configured: !!getApiKey(config, "elevenlabs"), - models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"], - }, - ], - active: userProvider || config.provider || "elevenlabs", - }; - }); - - // =========================================================================== - // Plugin Commands (LLM-free, intercepted automatically) - // =========================================================================== - - // /tts_on - Enable TTS mode - api.registerCommand({ - name: "tts_on", - description: "Enable text-to-speech for responses", - handler: () => { - setTtsEnabled(prefsPath, true); - log.info(`[${PLUGIN_ID}] TTS enabled via /tts_on command`); - return { text: "πŸ”Š TTS enabled! I'll now respond with audio." }; - }, - }); - - // /tts_off - Disable TTS mode - api.registerCommand({ - name: "tts_off", - description: "Disable text-to-speech for responses", - handler: () => { - setTtsEnabled(prefsPath, false); - log.info(`[${PLUGIN_ID}] TTS disabled via /tts_off command`); - return { text: "πŸ”‡ TTS disabled. Back to text mode." }; - }, - }); - - // /audio - Convert text to audio immediately - api.registerCommand({ - name: "audio", - description: "Convert text to audio message", - acceptsArgs: true, - handler: async (ctx) => { - const text = ctx.args?.trim(); - if (!text) { - return { text: "❌ Usage: /audio " }; - } - - log.info(`[${PLUGIN_ID}] /audio command, text length: ${text.length}`); - const result = await textToSpeech(text, config, prefsPath); - - if (result.success && result.audioPath) { - log.info(`[${PLUGIN_ID}] Audio generated: ${result.audioPath}`); - return { text: `MEDIA:${result.audioPath}` }; - } - - log.error(`[${PLUGIN_ID}] /audio failed: ${result.error}`); - return { text: `❌ Error generating audio: ${result.error}` }; - }, - }); - - // /tts_provider [openai|elevenlabs] - Set or show TTS provider - api.registerCommand({ - name: "tts_provider", - description: "Set or show TTS provider (openai or elevenlabs)", - acceptsArgs: true, - handler: (ctx) => { - const arg = ctx.args?.trim().toLowerCase(); - const currentProvider = getTtsProvider(prefsPath) || config.provider || "elevenlabs"; - - if (!arg) { - // Show current provider - const fallback = currentProvider === "openai" ? "elevenlabs" : "openai"; - const hasOpenAI = !!getApiKey(config, "openai"); - const hasElevenLabs = !!getApiKey(config, "elevenlabs"); - return { - text: `πŸŽ™οΈ **TTS Provider**\n\n` + - `Primary: **${currentProvider}** ${currentProvider === "openai" ? "(gpt-4o-mini-tts)" : "(eleven_multilingual_v2)"}\n` + - `Fallback: ${fallback}\n\n` + - `OpenAI: ${hasOpenAI ? "βœ… configured" : "❌ no API key"}\n` + - `ElevenLabs: ${hasElevenLabs ? 
"βœ… configured" : "❌ no API key"}\n\n` + - `Usage: /tts_provider openai or /tts_provider elevenlabs`, - }; - } - - if (arg !== "openai" && arg !== "elevenlabs") { - return { text: "❌ Invalid provider. Use: /tts_provider openai or /tts_provider elevenlabs" }; - } - - setTtsProvider(prefsPath, arg); - const fallback = arg === "openai" ? "elevenlabs" : "openai"; - log.info(`[${PLUGIN_ID}] Provider set to ${arg} via /tts_provider command`); - return { - text: `βœ… TTS provider changed!\n\n` + - `Primary: **${arg}** ${arg === "openai" ? "(gpt-4o-mini-tts)" : "(eleven_multilingual_v2)"}\n` + - `Fallback: ${fallback}`, - }; - }, - }); - - // /tts_limit [number] - Set or show max text length before summarizing - api.registerCommand({ - name: "tts_limit", - description: "Set or show max text length for TTS (longer texts are summarized)", - acceptsArgs: true, - handler: (ctx) => { - const arg = ctx.args?.trim(); - const currentLimit = getTtsMaxLength(prefsPath); - - if (!arg) { - // Show current limit - return { - text: `πŸ“ **TTS Limit**\n\n` + - `Current limit: **${currentLimit}** characters\n\n` + - `Texts longer than ${currentLimit} chars will be automatically summarized with gpt-4o-mini before converting to audio.\n\n` + - `Usage: /tts_limit 2000 (sets new limit)`, - }; - } - - const newLimit = parseInt(arg, 10); - if (isNaN(newLimit) || newLimit < 100 || newLimit > 10000) { - return { text: "❌ Invalid limit. Use a number between 100 and 10000." }; - } - - setTtsMaxLength(prefsPath, newLimit); - log.info(`[${PLUGIN_ID}] Max length set to ${newLimit} via /tts_limit command`); - return { - text: `βœ… TTS limit changed to **${newLimit}** characters!\n\n` + - `Longer texts will be automatically summarized before converting to audio.`, - }; - }, - }); - - // /tts_summary [on|off] - Enable/disable auto-summarization - api.registerCommand({ - name: "tts_summary", - description: "Enable or disable auto-summarization for long texts", - acceptsArgs: true, - handler: (ctx) => { - const arg = ctx.args?.trim().toLowerCase(); - const currentEnabled = isSummarizationEnabled(prefsPath); - const maxLength = getTtsMaxLength(prefsPath); - - if (!arg) { - // Show current status - return { - text: `πŸ“ **TTS Auto-Summary**\n\n` + - `Status: ${currentEnabled ? "βœ… Enabled" : "❌ Disabled"}\n` + - `Limit: ${maxLength} characters\n\n` + - `When enabled, texts longer than ${maxLength} chars are summarized with gpt-4o-mini before converting to audio.\n\n` + - `Usage: /tts_summary on or /tts_summary off`, - }; - } - - if (arg !== "on" && arg !== "off") { - return { text: "❌ Use: /tts_summary on or /tts_summary off" }; - } - - const newEnabled = arg === "on"; - setSummarizationEnabled(prefsPath, newEnabled); - log.info(`[${PLUGIN_ID}] Summarization ${newEnabled ? "enabled" : "disabled"} via /tts_summary command`); - return { - text: newEnabled - ? 
`βœ… Auto-summary **enabled**!\n\nLong texts will be summarized before converting to audio.` - : `❌ Auto-summary **disabled**!\n\nLong texts will be skipped (no audio).`, - }; - }, - }); - - // /tts_status - Show TTS status and last attempt result - api.registerCommand({ - name: "tts_status", - description: "Show TTS status, configuration, and last attempt result", - acceptsArgs: false, - handler: () => { - const enabled = isTtsEnabled(prefsPath); - const userProvider = getTtsProvider(prefsPath); - const activeProvider = userProvider || config.provider || "elevenlabs"; - const maxLength = getTtsMaxLength(prefsPath); - const summarizationEnabled = isSummarizationEnabled(prefsPath); - const hasKey = !!getApiKey(config, activeProvider); - - let statusLines = [ - `πŸ“Š **TTS Status**\n`, - `State: ${enabled ? "βœ… Enabled" : "❌ Disabled"}`, - `Provider: ${activeProvider} (API Key: ${hasKey ? "βœ…" : "❌"})`, - `Text limit: ${maxLength} characters`, - `Auto-summary: ${summarizationEnabled ? "βœ… Enabled" : "❌ Disabled"}`, - ]; - - if (lastTtsAttempt) { - const timeAgo = Math.round((Date.now() - lastTtsAttempt.timestamp) / 1000); - statusLines.push(``); - statusLines.push(`**Last attempt** (${timeAgo}s ago):`); - statusLines.push(`Result: ${lastTtsAttempt.success ? "βœ… Success" : "❌ Failed"}`); - statusLines.push(`Text: ${lastTtsAttempt.textLength} chars${lastTtsAttempt.summarized ? " (summarized)" : ""}`); - if (lastTtsAttempt.success) { - statusLines.push(`Provider: ${lastTtsAttempt.provider}`); - statusLines.push(`Latency: ${lastTtsAttempt.latencyMs}ms`); - } else if (lastTtsAttempt.error) { - statusLines.push(`Error: ${lastTtsAttempt.error}`); - } - } else { - statusLines.push(``); - statusLines.push(`_No TTS attempts recorded in this session._`); - } - - return { text: statusLines.join("\n") }; - }, - }); - - // =========================================================================== - // Auto-TTS Hook (message_sending) - // =========================================================================== - - // Automatically convert text responses to audio when TTS is enabled - api.on("message_sending", async (event) => { - // Check if TTS is enabled - if (!isTtsEnabled(prefsPath)) { - return; // TTS disabled, don't modify message - } - - const content = event.content?.trim(); - if (!content) { - return; // Empty content, skip - } - - // Skip if already contains MEDIA directive (avoid double conversion) - if (content.includes("MEDIA:")) { - return; - } - - // Skip very short messages (likely errors or status) - if (content.length < 10) { - return; - } - - const maxLength = getTtsMaxLength(prefsPath); - let textForAudio = content; - const summarizationEnabled = isSummarizationEnabled(prefsPath); - - // If text exceeds limit, summarize it first (if enabled) - if (content.length > maxLength) { - if (!summarizationEnabled) { - log.info(`[${PLUGIN_ID}] Auto-TTS: Text too long (${content.length} > ${maxLength}), summarization disabled, skipping audio`); - return; // User disabled summarization, skip audio for long texts - } - - log.info(`[${PLUGIN_ID}] Auto-TTS: Text too long (${content.length} > ${maxLength}), summarizing...`); - - const openaiKey = getApiKey(config, "openai"); - if (!openaiKey) { - log.warn(`[${PLUGIN_ID}] Auto-TTS: No OpenAI key for summarization, skipping audio`); - return; // Can't summarize without OpenAI key - } - - try { - const summarizeResult = await summarizeText(content, maxLength, openaiKey, config.timeoutMs); - textForAudio = summarizeResult.summary; - log.info( - 
`[${PLUGIN_ID}] Auto-TTS: Summarized ${summarizeResult.inputLength} β†’ ${summarizeResult.outputLength} chars in ${summarizeResult.latencyMs}ms` - ); - - // Safeguard: if summary still exceeds hard limit, truncate - const hardLimit = config.maxTextLength || 4000; - if (textForAudio.length > hardLimit) { - log.warn(`[${PLUGIN_ID}] Auto-TTS: Summary exceeded hard limit (${textForAudio.length} > ${hardLimit}), truncating`); - textForAudio = textForAudio.slice(0, hardLimit - 3) + "..."; - } - } catch (err) { - const error = err as Error; - log.error(`[${PLUGIN_ID}] Auto-TTS: Summarization failed: ${error.message}`); - return; // On summarization failure, skip audio - } - } else { - log.info(`[${PLUGIN_ID}] Auto-TTS: Converting ${content.length} chars`); - } - - const wasSummarized = textForAudio !== content; - - try { - const ttsStartTime = Date.now(); - const result = await textToSpeech(textForAudio, config, prefsPath); - - if (result.success && result.audioPath) { - const totalLatency = Date.now() - ttsStartTime; - log.info( - `[${PLUGIN_ID}] Auto-TTS: Generated via ${result.provider} in ${result.latencyMs}ms (total: ${totalLatency}ms)` - ); - - // Track successful attempt - lastTtsAttempt = { - timestamp: Date.now(), - success: true, - textLength: content.length, - summarized: wasSummarized, - provider: result.provider, - latencyMs: result.latencyMs, - }; - - // Return modified content with MEDIA directive - // The text is kept for accessibility, audio is appended - return { - content: `MEDIA:${result.audioPath}`, - }; - } else { - log.warn(`[${PLUGIN_ID}] Auto-TTS: TTS conversion failed - ${result.error}`); - - // Track failed attempt - lastTtsAttempt = { - timestamp: Date.now(), - success: false, - textLength: content.length, - summarized: wasSummarized, - error: result.error, - }; - - // On failure, send original text without audio - return; - } - } catch (err) { - const error = err as Error; - log.error(`[${PLUGIN_ID}] Auto-TTS: Unexpected error - ${error.message}`); - - // Track error - lastTtsAttempt = { - timestamp: Date.now(), - success: false, - textLength: content.length, - summarized: wasSummarized, - error: error.message, - }; - - // On error, send original text - return; - } - }); - - // =========================================================================== - // Startup - // =========================================================================== - - const ttsEnabled = isTtsEnabled(prefsPath); - const userProvider = getTtsProvider(prefsPath); - const activeProvider = userProvider || config.provider || "elevenlabs"; - const hasKey = !!getApiKey(config, activeProvider); - - log.info(`[${PLUGIN_ID}] Ready. TTS: ${ttsEnabled ? "ON" : "OFF"}, Provider: ${activeProvider}, API Key: ${hasKey ? "OK" : "MISSING"}`); - - if (!hasKey) { - log.warn( - `[${PLUGIN_ID}] No API key configured. 
Set ELEVENLABS_API_KEY or OPENAI_API_KEY.` - ); - } -} - -// ============================================================================= -// Plugin Metadata -// ============================================================================= - -export const meta = { - id: PLUGIN_ID, - name: "Telegram TTS", - description: "Text-to-speech for chat responses using ElevenLabs or OpenAI", - version: "0.3.0", -}; - -// ============================================================================= -// Test Exports (for unit testing) -// ============================================================================= - -export const _test = { - isValidVoiceId, - isValidOpenAIVoice, - isValidOpenAIModel, - OPENAI_TTS_MODELS, - summarizeText, -}; diff --git a/extensions/telegram-tts/package.json b/extensions/telegram-tts/package.json deleted file mode 100644 index a3cbc51b7..000000000 --- a/extensions/telegram-tts/package.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "name": "@clawdbot/telegram-tts", - "version": "0.3.0", - "private": true, - "description": "Text-to-speech for chat responses using ElevenLabs or OpenAI", - "main": "index.ts", - "keywords": ["clawdbot", "tts", "elevenlabs", "openai", "telegram", "voice"] -} diff --git a/src/agents/clawdbot-tools.ts b/src/agents/clawdbot-tools.ts index 60fde06fb..91de31937 100644 --- a/src/agents/clawdbot-tools.ts +++ b/src/agents/clawdbot-tools.ts @@ -17,6 +17,7 @@ import { createSessionsListTool } from "./tools/sessions-list-tool.js"; import { createSessionsSendTool } from "./tools/sessions-send-tool.js"; import { createSessionsSpawnTool } from "./tools/sessions-spawn-tool.js"; import { createWebFetchTool, createWebSearchTool } from "./tools/web-tools.js"; +import { createTtsTool } from "./tools/tts-tool.js"; export function createClawdbotTools(options?: { browserControlUrl?: string; @@ -96,6 +97,10 @@ export function createClawdbotTools(options?: { replyToMode: options?.replyToMode, hasRepliedRef: options?.hasRepliedRef, }), + createTtsTool({ + agentChannel: options?.agentChannel, + config: options?.config, + }), createGatewayTool({ agentSessionKey: options?.agentSessionKey, config: options?.config, diff --git a/src/agents/tools/tts-tool.ts b/src/agents/tools/tts-tool.ts new file mode 100644 index 000000000..e0a49cf16 --- /dev/null +++ b/src/agents/tools/tts-tool.ts @@ -0,0 +1,60 @@ +import { Type } from "@sinclair/typebox"; + +import { loadConfig } from "../../config/config.js"; +import type { ClawdbotConfig } from "../../config/config.js"; +import type { GatewayMessageChannel } from "../../utils/message-channel.js"; +import { textToSpeech } from "../../tts/tts.js"; +import type { AnyAgentTool } from "./common.js"; +import { readStringParam } from "./common.js"; + +const TtsToolSchema = Type.Object({ + text: Type.String({ description: "Text to convert to speech." }), + channel: Type.Optional( + Type.String({ description: "Optional channel id to pick output format (e.g. telegram)." }), + ), +}); + +export function createTtsTool(opts?: { + config?: ClawdbotConfig; + agentChannel?: GatewayMessageChannel; +}): AnyAgentTool { + return { + label: "TTS", + name: "tts", + description: + "Convert text to speech and return a MEDIA: path. Use when the user requests audio or TTS is enabled. 
Copy the MEDIA line exactly.",
+    parameters: TtsToolSchema,
+    execute: async (_toolCallId, args) => {
+      const params = args as Record<string, unknown>;
+      const text = readStringParam(params, "text", { required: true });
+      const channel = readStringParam(params, "channel");
+      const cfg = opts?.config ?? loadConfig();
+      const result = await textToSpeech({
+        text,
+        cfg,
+        channel: channel ?? opts?.agentChannel,
+      });
+
+      if (result.success && result.audioPath) {
+        const lines: string[] = [];
+        // Tag Telegram Opus output as a voice bubble instead of a file attachment.
+        if (result.voiceCompatible) lines.push("[[audio_as_voice]]");
+        lines.push(`MEDIA:${result.audioPath}`);
+        return {
+          content: [{ type: "text", text: lines.join("\n") }],
+          details: { audioPath: result.audioPath, provider: result.provider },
+        };
+      }
+
+      return {
+        content: [
+          {
+            type: "text",
+            text: result.error ?? "TTS conversion failed",
+          },
+        ],
+        details: { error: result.error },
+      };
+    },
+  };
+}
diff --git a/src/auto-reply/commands-registry.data.ts b/src/auto-reply/commands-registry.data.ts
index 7e6d76399..3e2ad8775 100644
--- a/src/auto-reply/commands-registry.data.ts
+++ b/src/auto-reply/commands-registry.data.ts
@@ -272,6 +272,81 @@ function buildChatCommands(): ChatCommandDefinition[] {
       ],
       argsMenu: "auto",
     }),
+    defineChatCommand({
+      key: "audio",
+      nativeName: "audio",
+      description: "Convert text to a TTS audio reply.",
+      textAlias: "/audio",
+      args: [
+        {
+          name: "text",
+          description: "Text to speak",
+          type: "string",
+          captureRemaining: true,
+        },
+      ],
+    }),
+    defineChatCommand({
+      key: "tts_on",
+      nativeName: "tts_on",
+      description: "Enable text-to-speech for replies.",
+      textAlias: "/tts_on",
+    }),
+    defineChatCommand({
+      key: "tts_off",
+      nativeName: "tts_off",
+      description: "Disable text-to-speech for replies.",
+      textAlias: "/tts_off",
+    }),
+    defineChatCommand({
+      key: "tts_provider",
+      nativeName: "tts_provider",
+      description: "Set or show the TTS provider.",
+      textAlias: "/tts_provider",
+      args: [
+        {
+          name: "provider",
+          description: "openai or elevenlabs",
+          type: "string",
+          choices: ["openai", "elevenlabs"],
+        },
+      ],
+      argsMenu: "auto",
+    }),
+    defineChatCommand({
+      key: "tts_limit",
+      nativeName: "tts_limit",
+      description: "Set or show the max TTS text length.",
+      textAlias: "/tts_limit",
+      args: [
+        {
+          name: "maxLength",
+          description: "Max chars before summarizing",
+          type: "number",
+        },
+      ],
+    }),
+    defineChatCommand({
+      key: "tts_summary",
+      nativeName: "tts_summary",
+      description: "Enable or disable TTS auto-summary.",
+      textAlias: "/tts_summary",
+      args: [
+        {
+          name: "mode",
+          description: "on or off",
+          type: "string",
+          choices: ["on", "off"],
+        },
+      ],
+      argsMenu: "auto",
+    }),
+    defineChatCommand({
+      key: "tts_status",
+      nativeName: "tts_status",
+      description: "Show TTS status and last attempt.",
+      textAlias: "/tts_status",
+    }),
     defineChatCommand({
       key: "stop",
       nativeName: "stop",
diff --git a/src/auto-reply/reply/commands-core.ts b/src/auto-reply/reply/commands-core.ts
index ad39e198c..5cf40dfb2 100644
--- a/src/auto-reply/reply/commands-core.ts
+++ b/src/auto-reply/reply/commands-core.ts
@@ -16,6 +16,7 @@ import {
 import { handleAllowlistCommand } from "./commands-allowlist.js";
 import { handleSubagentsCommand } from "./commands-subagents.js";
 import { handleModelsCommand } from "./commands-models.js";
+import { handleTtsCommands } from "./commands-tts.js";
 import {
   handleAbortTrigger,
   handleActivationCommand,
@@ -39,6 +40,7 @@ const HANDLERS: CommandHandler[] = [
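+  // Assumption, inferred from this registry's shape rather than stated in the
+  // patch: handlers run top to bottom and the first non-null result wins, so
+  // handleTtsCommands below must stay ahead of the catch-all help/status handlers.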
   handleSendPolicyCommand,
   handleUsageCommand,
   handleRestartCommand,
+  handleTtsCommands,
   handleHelpCommand,
   handleCommandsListCommand,
   handleStatusCommand,
diff --git a/src/auto-reply/reply/commands-tts.ts b/src/auto-reply/reply/commands-tts.ts
new file mode 100644
index 000000000..9582143af
--- /dev/null
+++ b/src/auto-reply/reply/commands-tts.ts
@@ -0,0 +1,214 @@
+import { logVerbose } from "../../globals.js";
+import type { ReplyPayload } from "../types.js";
+import type { CommandHandler } from "./commands-types.js";
+import {
+  getLastTtsAttempt,
+  getTtsMaxLength,
+  getTtsProvider,
+  isSummarizationEnabled,
+  isTtsEnabled,
+  resolveTtsApiKey,
+  resolveTtsConfig,
+  resolveTtsPrefsPath,
+  setLastTtsAttempt,
+  setSummarizationEnabled,
+  setTtsEnabled,
+  setTtsMaxLength,
+  setTtsProvider,
+  textToSpeech,
+} from "../../tts/tts.js";
+
+function parseCommandArg(normalized: string, command: string): string | null {
+  if (normalized === command) return "";
+  if (normalized.startsWith(`${command} `)) return normalized.slice(command.length).trim();
+  return null;
+}
+
+export const handleTtsCommands: CommandHandler = async (params, allowTextCommands) => {
+  if (!allowTextCommands) return null;
+  const normalized = params.command.commandBodyNormalized;
+  if (
+    !normalized.startsWith("/tts_") &&
+    normalized !== "/audio" &&
+    !normalized.startsWith("/audio ")
+  ) {
+    return null;
+  }
+
+  if (!params.command.isAuthorizedSender) {
+    logVerbose(
+      `Ignoring TTS command from unauthorized sender: ${params.command.senderId || ""}`,
+    );
+    return { shouldContinue: false };
+  }
+
+  const config = resolveTtsConfig(params.cfg);
+  const prefsPath = resolveTtsPrefsPath(config);
+
+  if (normalized === "/tts_on") {
+    setTtsEnabled(prefsPath, true);
+    return { shouldContinue: false, reply: { text: "πŸ”Š TTS enabled." } };
+  }
+
+  if (normalized === "/tts_off") {
+    setTtsEnabled(prefsPath, false);
+    return { shouldContinue: false, reply: { text: "πŸ”‡ TTS disabled." } };
+  }
+
+  const audioArg = parseCommandArg(normalized, "/audio");
+  if (audioArg !== null) {
+    if (!audioArg.trim()) {
+      return { shouldContinue: false, reply: { text: "βš™οΈ Usage: /audio <text>" } };
+    }
+
+    const start = Date.now();
+    const result = await textToSpeech({
+      text: audioArg,
+      cfg: params.cfg,
+      channel: params.command.channel,
+      prefsPath,
+    });
+
+    if (result.success && result.audioPath) {
+      setLastTtsAttempt({
+        timestamp: Date.now(),
+        success: true,
+        textLength: audioArg.length,
+        summarized: false,
+        provider: result.provider,
+        latencyMs: result.latencyMs,
+      });
+      const payload: ReplyPayload = {
+        mediaUrl: result.audioPath,
+        audioAsVoice: result.voiceCompatible === true,
+      };
+      return { shouldContinue: false, reply: payload };
+    }
+
+    setLastTtsAttempt({
+      timestamp: Date.now(),
+      success: false,
+      textLength: audioArg.length,
+      summarized: false,
+      error: result.error,
+      latencyMs: Date.now() - start,
+    });
+    return {
+      shouldContinue: false,
+      reply: { text: `❌ Error generating audio: ${result.error ?? "unknown error"}` },
+    };
+  }
+
+  const providerArg = parseCommandArg(normalized, "/tts_provider");
+  if (providerArg !== null) {
+    const currentProvider = getTtsProvider(config, prefsPath);
+    if (!providerArg.trim()) {
+      const fallback = currentProvider === "openai" ?
"elevenlabs" : "openai"; + const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai")); + const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs")); + return { + shouldContinue: false, + reply: { + text: + `πŸŽ™οΈ TTS provider\n` + + `Primary: ${currentProvider}\n` + + `Fallback: ${fallback}\n` + + `OpenAI key: ${hasOpenAI ? "βœ…" : "❌"}\n` + + `ElevenLabs key: ${hasElevenLabs ? "βœ…" : "❌"}\n` + + `Usage: /tts_provider openai | elevenlabs`, + }, + }; + } + + const requested = providerArg.trim().toLowerCase(); + if (requested !== "openai" && requested !== "elevenlabs") { + return { + shouldContinue: false, + reply: { text: "βš™οΈ Usage: /tts_provider openai | elevenlabs" }, + }; + } + + setTtsProvider(prefsPath, requested); + const fallback = requested === "openai" ? "elevenlabs" : "openai"; + return { + shouldContinue: false, + reply: { text: `βœ… TTS provider set to ${requested} (fallback: ${fallback}).` }, + }; + } + + const limitArg = parseCommandArg(normalized, "/tts_limit"); + if (limitArg !== null) { + if (!limitArg.trim()) { + const currentLimit = getTtsMaxLength(prefsPath); + return { + shouldContinue: false, + reply: { text: `πŸ“ TTS limit: ${currentLimit} characters.` }, + }; + } + const next = Number.parseInt(limitArg.trim(), 10); + if (!Number.isFinite(next) || next < 100 || next > 10_000) { + return { + shouldContinue: false, + reply: { text: "βš™οΈ Usage: /tts_limit <100-10000>" }, + }; + } + setTtsMaxLength(prefsPath, next); + return { + shouldContinue: false, + reply: { text: `βœ… TTS limit set to ${next} characters.` }, + }; + } + + const summaryArg = parseCommandArg(normalized, "/tts_summary"); + if (summaryArg !== null) { + if (!summaryArg.trim()) { + const enabled = isSummarizationEnabled(prefsPath); + return { + shouldContinue: false, + reply: { text: `πŸ“ TTS auto-summary: ${enabled ? "on" : "off"}.` }, + }; + } + const requested = summaryArg.trim().toLowerCase(); + if (requested !== "on" && requested !== "off") { + return { shouldContinue: false, reply: { text: "βš™οΈ Usage: /tts_summary on|off" } }; + } + setSummarizationEnabled(prefsPath, requested === "on"); + return { + shouldContinue: false, + reply: { + text: requested === "on" ? "βœ… TTS auto-summary enabled." : "❌ TTS auto-summary disabled.", + }, + }; + } + + if (normalized === "/tts_status") { + const enabled = isTtsEnabled(config, prefsPath); + const provider = getTtsProvider(config, prefsPath); + const hasKey = Boolean(resolveTtsApiKey(config, provider)); + const maxLength = getTtsMaxLength(prefsPath); + const summarize = isSummarizationEnabled(prefsPath); + const last = getLastTtsAttempt(); + const lines = [ + "πŸ“Š TTS status", + `State: ${enabled ? "βœ… enabled" : "❌ disabled"}`, + `Provider: ${provider} (${hasKey ? "βœ… key" : "❌ no key"})`, + `Text limit: ${maxLength} chars`, + `Auto-summary: ${summarize ? "on" : "off"}`, + ]; + if (last) { + const timeAgo = Math.round((Date.now() - last.timestamp) / 1000); + lines.push(""); + lines.push(`Last attempt (${timeAgo}s ago): ${last.success ? "βœ…" : "❌"}`); + lines.push(`Text: ${last.textLength} chars${last.summarized ? " (summarized)" : ""}`); + if (last.success) { + lines.push(`Provider: ${last.provider ?? "unknown"}`); + lines.push(`Latency: ${last.latencyMs ?? 
0}ms`); + } else if (last.error) { + lines.push(`Error: ${last.error}`); + } + } + return { shouldContinue: false, reply: { text: lines.join("\n") } }; + } + + return null; +}; diff --git a/src/auto-reply/reply/dispatch-from-config.ts b/src/auto-reply/reply/dispatch-from-config.ts index 47989026c..eb8d303b7 100644 --- a/src/auto-reply/reply/dispatch-from-config.ts +++ b/src/auto-reply/reply/dispatch-from-config.ts @@ -13,6 +13,7 @@ import { formatAbortReplyText, tryFastAbortFromMessage } from "./abort.js"; import { shouldSkipDuplicateInbound } from "./inbound-dedupe.js"; import type { ReplyDispatcher, ReplyDispatchKind } from "./reply-dispatcher.js"; import { isRoutableChannel, routeReply } from "./route-reply.js"; +import { maybeApplyTtsToPayload } from "../../tts/tts.js"; export type DispatchFromConfigResult = { queuedFinal: boolean; @@ -91,6 +92,7 @@ export async function dispatchReplyFromConfig(params: { const currentSurface = (ctx.Surface ?? ctx.Provider)?.toLowerCase(); const shouldRouteToOriginating = isRoutableChannel(originatingChannel) && originatingTo && originatingChannel !== currentSurface; + const ttsChannel = shouldRouteToOriginating ? originatingChannel : currentSurface; /** * Helper to send a payload via route-reply (async). @@ -164,22 +166,36 @@ export async function dispatchReplyFromConfig(params: { { ...params.replyOptions, onToolResult: (payload: ReplyPayload) => { - if (shouldRouteToOriginating) { - // Fire-and-forget for streaming tool results when routing. - void sendPayloadAsync(payload); - } else { - // Synchronous dispatch to preserve callback timing. - dispatcher.sendToolResult(payload); - } + const run = async () => { + const ttsPayload = await maybeApplyTtsToPayload({ + payload, + cfg, + channel: ttsChannel, + kind: "tool", + }); + if (shouldRouteToOriginating) { + await sendPayloadAsync(ttsPayload); + } else { + dispatcher.sendToolResult(ttsPayload); + } + }; + return run(); }, onBlockReply: (payload: ReplyPayload, context) => { - if (shouldRouteToOriginating) { - // Await routed sends so upstream can enforce ordering/timeouts. - return sendPayloadAsync(payload, context?.abortSignal); - } else { - // Synchronous dispatch to preserve callback timing. - dispatcher.sendBlockReply(payload); - } + const run = async () => { + const ttsPayload = await maybeApplyTtsToPayload({ + payload, + cfg, + channel: ttsChannel, + kind: "block", + }); + if (shouldRouteToOriginating) { + await sendPayloadAsync(ttsPayload, context?.abortSignal); + } else { + dispatcher.sendBlockReply(ttsPayload); + } + }; + return run(); }, }, cfg, @@ -190,10 +206,16 @@ export async function dispatchReplyFromConfig(params: { let queuedFinal = false; let routedFinalCount = 0; for (const reply of replies) { + const ttsReply = await maybeApplyTtsToPayload({ + payload: reply, + cfg, + channel: ttsChannel, + kind: "final", + }); if (shouldRouteToOriginating && originatingChannel && originatingTo) { // Route final reply to originating channel. 
const result = await routeReply({ - payload: reply, + payload: ttsReply, channel: originatingChannel, to: originatingTo, sessionKey: ctx.SessionKey, @@ -209,7 +231,7 @@ export async function dispatchReplyFromConfig(params: { queuedFinal = result.ok || queuedFinal; if (result.ok) routedFinalCount += 1; } else { - queuedFinal = dispatcher.sendFinalReply(reply) || queuedFinal; + queuedFinal = dispatcher.sendFinalReply(ttsReply) || queuedFinal; } } await dispatcher.waitForIdle(); diff --git a/src/auto-reply/reply/route-reply.ts b/src/auto-reply/reply/route-reply.ts index c874d1c04..bbc7efa7d 100644 --- a/src/auto-reply/reply/route-reply.ts +++ b/src/auto-reply/reply/route-reply.ts @@ -10,7 +10,6 @@ import { resolveSessionAgentId } from "../../agents/agent-scope.js"; import { resolveEffectiveMessagesConfig } from "../../agents/identity.js"; import { normalizeChannelId } from "../../channels/plugins/index.js"; -import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js"; import type { ClawdbotConfig } from "../../config/config.js"; import { INTERNAL_MESSAGE_CHANNEL } from "../../utils/message-channel.js"; import type { OriginatingChannelType } from "../templating.js"; @@ -81,48 +80,6 @@ export async function routeReply(params: RouteReplyParams): Promise { + try { + const cfg = loadConfig(); + const config = resolveTtsConfig(cfg); + const prefsPath = resolveTtsPrefsPath(config); + const provider = getTtsProvider(config, prefsPath); + respond(true, { + enabled: isTtsEnabled(config, prefsPath), + provider, + fallbackProvider: provider === "openai" ? "elevenlabs" : "openai", + prefsPath, + hasOpenAIKey: Boolean(resolveTtsApiKey(config, "openai")), + hasElevenLabsKey: Boolean(resolveTtsApiKey(config, "elevenlabs")), + }); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, + "tts.enable": async ({ respond }) => { + try { + const cfg = loadConfig(); + const config = resolveTtsConfig(cfg); + const prefsPath = resolveTtsPrefsPath(config); + setTtsEnabled(prefsPath, true); + respond(true, { enabled: true }); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, + "tts.disable": async ({ respond }) => { + try { + const cfg = loadConfig(); + const config = resolveTtsConfig(cfg); + const prefsPath = resolveTtsPrefsPath(config); + setTtsEnabled(prefsPath, false); + respond(true, { enabled: false }); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, + "tts.convert": async ({ params, respond }) => { + const text = typeof params.text === "string" ? params.text.trim() : ""; + if (!text) { + respond( + false, + undefined, + errorShape(ErrorCodes.INVALID_REQUEST, "tts.convert requires text"), + ); + return; + } + try { + const cfg = loadConfig(); + const channel = typeof params.channel === "string" ? params.channel.trim() : undefined; + const result = await textToSpeech({ text, cfg, channel }); + if (result.success && result.audioPath) { + respond(true, { + audioPath: result.audioPath, + provider: result.provider, + outputFormat: result.outputFormat, + voiceCompatible: result.voiceCompatible, + }); + return; + } + respond( + false, + undefined, + errorShape(ErrorCodes.UNAVAILABLE, result.error ?? 
"TTS conversion failed"), + ); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, + "tts.setProvider": async ({ params, respond }) => { + const provider = typeof params.provider === "string" ? params.provider.trim() : ""; + if (provider !== "openai" && provider !== "elevenlabs") { + respond( + false, + undefined, + errorShape(ErrorCodes.INVALID_REQUEST, "Invalid provider. Use openai or elevenlabs."), + ); + return; + } + try { + const cfg = loadConfig(); + const config = resolveTtsConfig(cfg); + const prefsPath = resolveTtsPrefsPath(config); + setTtsProvider(prefsPath, provider); + respond(true, { provider }); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, + "tts.providers": async ({ respond }) => { + try { + const cfg = loadConfig(); + const config = resolveTtsConfig(cfg); + const prefsPath = resolveTtsPrefsPath(config); + respond(true, { + providers: [ + { + id: "openai", + name: "OpenAI", + configured: Boolean(resolveTtsApiKey(config, "openai")), + models: [...OPENAI_TTS_MODELS], + voices: [...OPENAI_TTS_VOICES], + }, + { + id: "elevenlabs", + name: "ElevenLabs", + configured: Boolean(resolveTtsApiKey(config, "elevenlabs")), + models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"], + }, + ], + active: getTtsProvider(config, prefsPath), + }); + } catch (err) { + respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err))); + } + }, +}; diff --git a/src/telegram/bot/delivery.ts b/src/telegram/bot/delivery.ts index 7839f9ced..653474d50 100644 --- a/src/telegram/bot/delivery.ts +++ b/src/telegram/bot/delivery.ts @@ -14,7 +14,6 @@ import { mediaKindFromMime } from "../../media/constants.js"; import { fetchRemoteMedia } from "../../media/fetch.js"; import { isGifMedia } from "../../media/mime.js"; import { saveMediaBuffer } from "../../media/store.js"; -import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js"; import type { RuntimeEnv } from "../../runtime.js"; import { loadWebMedia } from "../../web/media.js"; import { resolveTelegramVoiceSend } from "../voice.js"; @@ -40,45 +39,6 @@ export async function deliverReplies(params: { const threadParams = buildTelegramThreadParams(messageThreadId); let hasReplied = false; for (const reply of replies) { - // Track if hook wants to send audio after text - let audioToSendAfter: string | undefined; - - // Run message_sending hook (allows plugins like TTS to generate audio) - const hookRunner = getGlobalHookRunner(); - if (hookRunner && reply?.text?.trim()) { - try { - const hookResult = await hookRunner.runMessageSending( - { - to: chatId, - content: reply.text, - metadata: { channel: "telegram", threadId: messageThreadId }, - }, - { - channelId: "telegram", - accountId: undefined, - conversationId: chatId, - }, - ); - - // Check if hook wants to cancel the message - if (hookResult?.cancel) { - continue; // Skip this reply - } - - // Check if hook returned a MEDIA directive (TTS audio) - if (hookResult?.content !== undefined) { - const mediaMatch = hookResult.content.match(/^MEDIA:(.+)$/m); - if (mediaMatch) { - // Save audio path to send AFTER the text message - audioToSendAfter = mediaMatch[1].trim(); - } - } - } catch (err) { - // Hook errors shouldn't block message sending - logVerbose(`[telegram delivery] hook error: ${String(err)}`); - } - } - const hasMedia = Boolean(reply?.mediaUrl) || (reply?.mediaUrls?.length ?? 
0) > 0;
     if (!reply?.text && !hasMedia) {
       if (reply?.audioAsVoice) {
@@ -110,25 +70,6 @@
         hasReplied = true;
       }
     }
-
-    // Send TTS audio after text (if hook generated one)
-    if (audioToSendAfter) {
-      try {
-        const audioMedia = await loadWebMedia(audioToSendAfter);
-        const audioFile = new InputFile(audioMedia.buffer, "voice.mp3");
-        // Switch typing indicator to record_voice before sending
-        await params.onVoiceRecording?.();
-        const audioParams: Record<string, unknown> = {};
-        if (threadParams) {
-          audioParams.message_thread_id = threadParams.message_thread_id;
-        }
-        await bot.api.sendVoice(chatId, audioFile, audioParams);
-        logVerbose(`[telegram delivery] TTS audio sent: ${audioToSendAfter}`);
-      } catch (err) {
-        logVerbose(`[telegram delivery] TTS audio send failed: ${String(err)}`);
-      }
-    }
-
     continue;
   }
   // media with optional caption on first item
diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts
new file mode 100644
index 000000000..c4725a723
--- /dev/null
+++ b/src/tts/tts.test.ts
@@ -0,0 +1,234 @@
+import { describe, expect, it, vi, beforeEach, afterEach } from "vitest";
+
+import { _test } from "./tts.js";
+
+const {
+  isValidVoiceId,
+  isValidOpenAIVoice,
+  isValidOpenAIModel,
+  OPENAI_TTS_MODELS,
+  OPENAI_TTS_VOICES,
+  summarizeText,
+  resolveOutputFormat,
+} = _test;
+
+describe("tts", () => {
+  describe("isValidVoiceId", () => {
+    it("accepts valid ElevenLabs voice IDs", () => {
+      expect(isValidVoiceId("pMsXgVXv3BLzUgSXRplE")).toBe(true);
+      expect(isValidVoiceId("21m00Tcm4TlvDq8ikWAM")).toBe(true);
+      expect(isValidVoiceId("EXAVITQu4vr4xnSDxMaL")).toBe(true);
+    });
+
+    it("accepts voice IDs of varying valid lengths", () => {
+      expect(isValidVoiceId("a1b2c3d4e5")).toBe(true);
+      expect(isValidVoiceId("a".repeat(40))).toBe(true);
+    });
+
+    it("rejects too short voice IDs", () => {
+      expect(isValidVoiceId("")).toBe(false);
+      expect(isValidVoiceId("abc")).toBe(false);
+      expect(isValidVoiceId("123456789")).toBe(false);
+    });
+
+    it("rejects too long voice IDs", () => {
+      expect(isValidVoiceId("a".repeat(41))).toBe(false);
+      expect(isValidVoiceId("a".repeat(100))).toBe(false);
+    });
+
+    it("rejects voice IDs with invalid characters", () => {
+      expect(isValidVoiceId("pMsXgVXv3BLz-gSXRplE")).toBe(false);
+      expect(isValidVoiceId("pMsXgVXv3BLz_gSXRplE")).toBe(false);
+      expect(isValidVoiceId("pMsXgVXv3BLz gSXRplE")).toBe(false);
+      expect(isValidVoiceId("../../../etc/passwd")).toBe(false);
+      expect(isValidVoiceId("voice?param=value")).toBe(false);
+    });
+  });
+
+  describe("isValidOpenAIVoice", () => {
+    it("accepts all valid OpenAI voices", () => {
+      for (const voice of OPENAI_TTS_VOICES) {
+        expect(isValidOpenAIVoice(voice)).toBe(true);
+      }
+    });
+
+    it("rejects invalid voice names", () => {
+      expect(isValidOpenAIVoice("invalid")).toBe(false);
+      expect(isValidOpenAIVoice("")).toBe(false);
+      expect(isValidOpenAIVoice("ALLOY")).toBe(false);
+      expect(isValidOpenAIVoice("alloy ")).toBe(false);
+      expect(isValidOpenAIVoice(" alloy")).toBe(false);
+    });
+  });
+
+  describe("isValidOpenAIModel", () => {
+    it("accepts gpt-4o-mini-tts model", () => {
+      expect(isValidOpenAIModel("gpt-4o-mini-tts")).toBe(true);
+    });
+
+    it("rejects other models", () => {
+      expect(isValidOpenAIModel("tts-1")).toBe(false);
+      expect(isValidOpenAIModel("tts-1-hd")).toBe(false);
+      expect(isValidOpenAIModel("invalid")).toBe(false);
+      expect(isValidOpenAIModel("")).toBe(false);
+      expect(isValidOpenAIModel("gpt-4")).toBe(false);
+    });
+  });
+
+  describe("OPENAI_TTS_MODELS", () => {
+    it("contains
only gpt-4o-mini-tts", () => {
+      expect(OPENAI_TTS_MODELS).toContain("gpt-4o-mini-tts");
+      expect(OPENAI_TTS_MODELS).toHaveLength(1);
+    });
+
+    it("is a non-empty array", () => {
+      expect(Array.isArray(OPENAI_TTS_MODELS)).toBe(true);
+      expect(OPENAI_TTS_MODELS.length).toBeGreaterThan(0);
+    });
+  });
+
+  describe("resolveOutputFormat", () => {
+    it("uses Opus for Telegram", () => {
+      const output = resolveOutputFormat("telegram");
+      expect(output.openai).toBe("opus");
+      expect(output.elevenlabs).toBe("opus_48000_64");
+      expect(output.extension).toBe(".opus");
+      expect(output.voiceCompatible).toBe(true);
+    });
+
+    it("uses MP3 for other channels", () => {
+      const output = resolveOutputFormat("discord");
+      expect(output.openai).toBe("mp3");
+      expect(output.elevenlabs).toBe("mp3_44100_128");
+      expect(output.extension).toBe(".mp3");
+      expect(output.voiceCompatible).toBe(false);
+    });
+  });
+
+  describe("summarizeText", () => {
+    const mockApiKey = "test-api-key";
+    const originalFetch = globalThis.fetch;
+
+    beforeEach(() => {
+      vi.useFakeTimers({ shouldAdvanceTime: true });
+    });
+
+    afterEach(() => {
+      globalThis.fetch = originalFetch;
+      vi.useRealTimers();
+    });
+
+    it("summarizes text and returns result with metrics", async () => {
+      const mockSummary = "This is a summarized version of the text.";
+      globalThis.fetch = vi.fn().mockResolvedValue({
+        ok: true,
+        json: () =>
+          Promise.resolve({
+            choices: [{ message: { content: mockSummary } }],
+          }),
+      });
+
+      const longText = "A".repeat(2000);
+      const result = await summarizeText(longText, 1500, mockApiKey, 30_000);
+
+      expect(result.summary).toBe(mockSummary);
+      expect(result.inputLength).toBe(2000);
+      expect(result.outputLength).toBe(mockSummary.length);
+      expect(result.latencyMs).toBeGreaterThanOrEqual(0);
+      expect(globalThis.fetch).toHaveBeenCalledTimes(1);
+    });
+
+    it("calls OpenAI API with correct parameters", async () => {
+      globalThis.fetch = vi.fn().mockResolvedValue({
+        ok: true,
+        json: () =>
+          Promise.resolve({
+            choices: [{ message: { content: "Summary" } }],
+          }),
+      });
+
+      await summarizeText("Long text to summarize", 500, mockApiKey, 30_000);
+
+      expect(globalThis.fetch).toHaveBeenCalledWith(
+        "https://api.openai.com/v1/chat/completions",
+        expect.objectContaining({
+          method: "POST",
+          headers: {
+            Authorization: `Bearer ${mockApiKey}`,
+            "Content-Type": "application/json",
+          },
+        }),
+      );
+
+      const callArgs = (globalThis.fetch as ReturnType<typeof vi.fn>).mock.calls[0];
+      const body = JSON.parse(callArgs[1].body);
+      expect(body.model).toBe("gpt-4o-mini");
+      expect(body.temperature).toBe(0.3);
+      expect(body.max_tokens).toBe(250);
+    });
+
+    it("rejects targetLength below minimum (100)", async () => {
+      await expect(summarizeText("text", 99, mockApiKey, 30_000)).rejects.toThrow(
+        "Invalid targetLength: 99",
+      );
+    });
+
+    it("rejects targetLength above maximum (10000)", async () => {
+      await expect(summarizeText("text", 10001, mockApiKey, 30_000)).rejects.toThrow(
+        "Invalid targetLength: 10001",
+      );
+    });
+
+    it("accepts targetLength at boundaries", async () => {
+      globalThis.fetch = vi.fn().mockResolvedValue({
+        ok: true,
+        json: () =>
+          Promise.resolve({
+            choices: [{ message: { content: "Summary" } }],
+          }),
+      });
+
+      await expect(summarizeText("text", 100, mockApiKey, 30_000)).resolves.toBeDefined();
+      await expect(summarizeText("text", 10000, mockApiKey, 30_000)).resolves.toBeDefined();
+    });
+
+    it("throws error when API returns non-ok response", async () => {
+      globalThis.fetch = vi.fn().mockResolvedValue({
+        ok: false,
+        status:
500, + }); + + await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow( + "Summarization service unavailable", + ); + }); + + it("throws error when no summary is returned", async () => { + globalThis.fetch = vi.fn().mockResolvedValue({ + ok: true, + json: () => + Promise.resolve({ + choices: [], + }), + }); + + await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow( + "No summary returned", + ); + }); + + it("throws error when summary content is empty", async () => { + globalThis.fetch = vi.fn().mockResolvedValue({ + ok: true, + json: () => + Promise.resolve({ + choices: [{ message: { content: " " } }], + }), + }); + + await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow( + "No summary returned", + ); + }); + }); +}); diff --git a/src/tts/tts.ts b/src/tts/tts.ts new file mode 100644 index 000000000..0a03063a9 --- /dev/null +++ b/src/tts/tts.ts @@ -0,0 +1,630 @@ +import { + existsSync, + mkdirSync, + readFileSync, + writeFileSync, + mkdtempSync, + rmSync, + renameSync, + unlinkSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import path from "node:path"; + +import type { ReplyPayload } from "../auto-reply/types.js"; +import { normalizeChannelId } from "../channels/plugins/index.js"; +import type { ChannelId } from "../channels/plugins/types.js"; +import type { ClawdbotConfig } from "../config/config.js"; +import type { TtsConfig, TtsMode, TtsProvider } from "../config/types.tts.js"; +import { logVerbose } from "../globals.js"; +import { CONFIG_DIR, resolveUserPath } from "../utils.js"; + +const DEFAULT_TIMEOUT_MS = 30_000; +const DEFAULT_TTS_MAX_LENGTH = 1500; +const DEFAULT_TTS_SUMMARIZE = true; +const DEFAULT_MAX_TEXT_LENGTH = 4000; +const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes + +const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE"; +const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"; +const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"; +const DEFAULT_OPENAI_VOICE = "alloy"; + +const TELEGRAM_OUTPUT = { + openai: "opus" as const, + // ElevenLabs output formats use codec_sample_rate_bitrate naming. + // Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram. + elevenlabs: "opus_48000_64", + extension: ".opus", + voiceCompatible: true, +}; + +const DEFAULT_OUTPUT = { + openai: "mp3" as const, + elevenlabs: "mp3_44100_128", + extension: ".mp3", + voiceCompatible: false, +}; + +export type ResolvedTtsConfig = { + enabled: boolean; + mode: TtsMode; + provider: TtsProvider; + elevenlabs: { + apiKey?: string; + voiceId: string; + modelId: string; + }; + openai: { + apiKey?: string; + model: string; + voice: string; + }; + prefsPath?: string; + maxTextLength: number; + timeoutMs: number; +}; + +type TtsUserPrefs = { + tts?: { + enabled?: boolean; + provider?: TtsProvider; + maxLength?: number; + summarize?: boolean; + }; +}; + +export type TtsResult = { + success: boolean; + audioPath?: string; + error?: string; + latencyMs?: number; + provider?: string; + outputFormat?: string; + voiceCompatible?: boolean; +}; + +type TtsStatusEntry = { + timestamp: number; + success: boolean; + textLength: number; + summarized: boolean; + provider?: string; + latencyMs?: number; + error?: string; +}; + +let lastTtsAttempt: TtsStatusEntry | undefined; + +export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig { + const raw: TtsConfig = cfg.messages?.tts ?? {}; + return { + enabled: raw.enabled ?? false, + mode: raw.mode ?? "final", + provider: raw.provider ?? 
"elevenlabs", + elevenlabs: { + apiKey: raw.elevenlabs?.apiKey, + voiceId: raw.elevenlabs?.voiceId ?? DEFAULT_ELEVENLABS_VOICE_ID, + modelId: raw.elevenlabs?.modelId ?? DEFAULT_ELEVENLABS_MODEL_ID, + }, + openai: { + apiKey: raw.openai?.apiKey, + model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL, + voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE, + }, + prefsPath: raw.prefsPath, + maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH, + timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS, + }; +} + +export function resolveTtsPrefsPath(config: ResolvedTtsConfig): string { + if (config.prefsPath?.trim()) return resolveUserPath(config.prefsPath.trim()); + const envPath = process.env.CLAWDBOT_TTS_PREFS?.trim(); + if (envPath) return resolveUserPath(envPath); + return path.join(CONFIG_DIR, "settings", "tts.json"); +} + +function readPrefs(prefsPath: string): TtsUserPrefs { + try { + if (!existsSync(prefsPath)) return {}; + return JSON.parse(readFileSync(prefsPath, "utf8")) as TtsUserPrefs; + } catch { + return {}; + } +} + +function atomicWriteFileSync(filePath: string, content: string): void { + const tmpPath = `${filePath}.tmp.${Date.now()}.${Math.random().toString(36).slice(2)}`; + writeFileSync(tmpPath, content); + try { + renameSync(tmpPath, filePath); + } catch (err) { + try { + unlinkSync(tmpPath); + } catch { + // ignore + } + throw err; + } +} + +function updatePrefs(prefsPath: string, update: (prefs: TtsUserPrefs) => void): void { + const prefs = readPrefs(prefsPath); + update(prefs); + mkdirSync(path.dirname(prefsPath), { recursive: true }); + atomicWriteFileSync(prefsPath, JSON.stringify(prefs, null, 2)); +} + +export function isTtsEnabled(config: ResolvedTtsConfig, prefsPath: string): boolean { + const prefs = readPrefs(prefsPath); + if (prefs.tts?.enabled !== undefined) return prefs.tts.enabled === true; + return config.enabled; +} + +export function setTtsEnabled(prefsPath: string, enabled: boolean): void { + updatePrefs(prefsPath, (prefs) => { + prefs.tts = { ...prefs.tts, enabled }; + }); +} + +export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider { + const prefs = readPrefs(prefsPath); + return prefs.tts?.provider ?? config.provider; +} + +export function setTtsProvider(prefsPath: string, provider: TtsProvider): void { + updatePrefs(prefsPath, (prefs) => { + prefs.tts = { ...prefs.tts, provider }; + }); +} + +export function getTtsMaxLength(prefsPath: string): number { + const prefs = readPrefs(prefsPath); + return prefs.tts?.maxLength ?? DEFAULT_TTS_MAX_LENGTH; +} + +export function setTtsMaxLength(prefsPath: string, maxLength: number): void { + updatePrefs(prefsPath, (prefs) => { + prefs.tts = { ...prefs.tts, maxLength }; + }); +} + +export function isSummarizationEnabled(prefsPath: string): boolean { + const prefs = readPrefs(prefsPath); + return prefs.tts?.summarize ?? DEFAULT_TTS_SUMMARIZE; +} + +export function setSummarizationEnabled(prefsPath: string, enabled: boolean): void { + updatePrefs(prefsPath, (prefs) => { + prefs.tts = { ...prefs.tts, summarize: enabled }; + }); +} + +export function getLastTtsAttempt(): TtsStatusEntry | undefined { + return lastTtsAttempt; +} + +export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void { + lastTtsAttempt = entry; +} + +function resolveOutputFormat(channelId?: string | null) { + if (channelId === "telegram") return TELEGRAM_OUTPUT; + return DEFAULT_OUTPUT; +} + +function resolveChannelId(channel: string | undefined): ChannelId | null { + return channel ? 
normalizeChannelId(channel) : null;
+}
+
+export function resolveTtsApiKey(
+  config: ResolvedTtsConfig,
+  provider: TtsProvider,
+): string | undefined {
+  if (provider === "elevenlabs") {
+    return config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
+  }
+  if (provider === "openai") {
+    return config.openai.apiKey || process.env.OPENAI_API_KEY;
+  }
+  return undefined;
+}
+
+function isValidVoiceId(voiceId: string): boolean {
+  return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
+}
+
+export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"] as const;
+export const OPENAI_TTS_VOICES = [
+  "alloy",
+  "ash",
+  "coral",
+  "echo",
+  "fable",
+  "onyx",
+  "nova",
+  "sage",
+  "shimmer",
+] as const;
+
+type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
+
+function isValidOpenAIModel(model: string): boolean {
+  return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
+}
+
+function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
+  return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
+}
+
+type SummarizeResult = {
+  summary: string;
+  latencyMs: number;
+  inputLength: number;
+  outputLength: number;
+};
+
+async function summarizeText(
+  text: string,
+  targetLength: number,
+  apiKey: string,
+  timeoutMs: number,
+): Promise<SummarizeResult> {
+  if (targetLength < 100 || targetLength > 10_000) {
+    throw new Error(`Invalid targetLength: ${targetLength}`);
+  }
+
+  const startTime = Date.now();
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), timeoutMs);
+
+  try {
+    const response = await fetch("https://api.openai.com/v1/chat/completions", {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${apiKey}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        model: "gpt-4o-mini",
+        messages: [
+          {
+            role: "system",
+            content: `You are an assistant that summarizes texts concisely while keeping the most important information. Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style.
Reply only with the summary, without additional explanations.`,
+          },
+          {
+            role: "user",
+            content: `\n${text}\n`,
+          },
+        ],
+        max_tokens: Math.ceil(targetLength / 2),
+        temperature: 0.3,
+      }),
+      signal: controller.signal,
+    });
+
+    if (!response.ok) {
+      throw new Error("Summarization service unavailable");
+    }
+
+    const data = (await response.json()) as {
+      choices?: Array<{ message?: { content?: string } }>;
+    };
+    const summary = data.choices?.[0]?.message?.content?.trim();
+
+    if (!summary) {
+      throw new Error("No summary returned");
+    }
+
+    return {
+      summary,
+      latencyMs: Date.now() - startTime,
+      inputLength: text.length,
+      outputLength: summary.length,
+    };
+  } finally {
+    clearTimeout(timeout);
+  }
+}
+
+function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS): void {
+  const timer = setTimeout(() => {
+    try {
+      rmSync(tempDir, { recursive: true, force: true });
+    } catch {
+      // ignore cleanup errors
+    }
+  }, delayMs);
+  timer.unref();
+}
+
+async function elevenLabsTTS(params: {
+  text: string;
+  apiKey: string;
+  voiceId: string;
+  modelId: string;
+  outputFormat: string;
+  timeoutMs: number;
+}): Promise<Buffer> {
+  const { text, apiKey, voiceId, modelId, outputFormat, timeoutMs } = params;
+  if (!isValidVoiceId(voiceId)) {
+    throw new Error("Invalid voiceId format");
+  }
+
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), timeoutMs);
+
+  try {
+    const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`);
+    if (outputFormat) {
+      url.searchParams.set("output_format", outputFormat);
+    }
+
+    const response = await fetch(url.toString(), {
+      method: "POST",
+      headers: {
+        "xi-api-key": apiKey,
+        "Content-Type": "application/json",
+        Accept: "audio/mpeg",
+      },
+      body: JSON.stringify({
+        text,
+        model_id: modelId,
+        voice_settings: {
+          stability: 0.5,
+          similarity_boost: 0.75,
+          style: 0.0,
+          use_speaker_boost: true,
+        },
+      }),
+      signal: controller.signal,
+    });
+
+    if (!response.ok) {
+      throw new Error(`ElevenLabs API error (${response.status})`);
+    }
+
+    return Buffer.from(await response.arrayBuffer());
+  } finally {
+    clearTimeout(timeout);
+  }
+}
+
+async function openaiTTS(params: {
+  text: string;
+  apiKey: string;
+  model: string;
+  voice: string;
+  responseFormat: "mp3" | "opus";
+  timeoutMs: number;
+}): Promise<Buffer> {
+  const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
+
+  if (!isValidOpenAIModel(model)) {
+    throw new Error(`Invalid model: ${model}`);
+  }
+  if (!isValidOpenAIVoice(voice)) {
+    throw new Error(`Invalid voice: ${voice}`);
+  }
+
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), timeoutMs);
+
+  try {
+    const response = await fetch("https://api.openai.com/v1/audio/speech", {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${apiKey}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        model,
+        input: text,
+        voice,
+        response_format: responseFormat,
+      }),
+      signal: controller.signal,
+    });
+
+    if (!response.ok) {
+      throw new Error(`OpenAI TTS API error (${response.status})`);
+    }
+
+    return Buffer.from(await response.arrayBuffer());
+  } finally {
+    clearTimeout(timeout);
+  }
+}
+
+export async function textToSpeech(params: {
+  text: string;
+  cfg: ClawdbotConfig;
+  prefsPath?: string;
+  channel?: string;
+}): Promise<TtsResult> {
+  const config = resolveTtsConfig(params.cfg);
+  const prefsPath = params.prefsPath ??
resolveTtsPrefsPath(config);
+  const channelId = resolveChannelId(params.channel);
+  const output = resolveOutputFormat(channelId);
+
+  if (params.text.length > config.maxTextLength) {
+    return {
+      success: false,
+      error: `Text too long (${params.text.length} chars, max ${config.maxTextLength})`,
+    };
+  }
+
+  const userProvider = getTtsProvider(config, prefsPath);
+  const providers: TtsProvider[] = [
+    userProvider,
+    userProvider === "openai" ? "elevenlabs" : "openai",
+  ];
+
+  let lastError: string | undefined;
+
+  for (const provider of providers) {
+    const apiKey = resolveTtsApiKey(config, provider);
+    if (!apiKey) {
+      lastError = `No API key for ${provider}`;
+      continue;
+    }
+
+    const providerStart = Date.now();
+    try {
+      let audioBuffer: Buffer;
+      if (provider === "elevenlabs") {
+        audioBuffer = await elevenLabsTTS({
+          text: params.text,
+          apiKey,
+          voiceId: config.elevenlabs.voiceId,
+          modelId: config.elevenlabs.modelId,
+          outputFormat: output.elevenlabs,
+          timeoutMs: config.timeoutMs,
+        });
+      } else {
+        audioBuffer = await openaiTTS({
+          text: params.text,
+          apiKey,
+          model: config.openai.model,
+          voice: config.openai.voice,
+          responseFormat: output.openai,
+          timeoutMs: config.timeoutMs,
+        });
+      }
+
+      const latencyMs = Date.now() - providerStart;
+
+      const tempDir = mkdtempSync(path.join(tmpdir(), "tts-"));
+      const audioPath = path.join(tempDir, `voice-${Date.now()}${output.extension}`);
+      writeFileSync(audioPath, audioBuffer);
+      scheduleCleanup(tempDir);
+
+      return {
+        success: true,
+        audioPath,
+        latencyMs,
+        provider,
+        outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
+        voiceCompatible: output.voiceCompatible,
+      };
+    } catch (err) {
+      const error = err as Error;
+      if (error.name === "AbortError") {
+        lastError = `${provider}: request timed out`;
+      } else {
+        lastError = `${provider}: ${error.message}`;
+      }
+    }
+  }
+
+  return {
+    success: false,
+    error: `TTS conversion failed: ${lastError || "no providers available"}`,
+  };
+}
+
+export async function maybeApplyTtsToPayload(params: {
+  payload: ReplyPayload;
+  cfg: ClawdbotConfig;
+  channel?: string;
+  kind?: "tool" | "block" | "final";
+}): Promise<ReplyPayload> {
+  const config = resolveTtsConfig(params.cfg);
+  const prefsPath = resolveTtsPrefsPath(config);
+  if (!isTtsEnabled(config, prefsPath)) return params.payload;
+
+  const mode = config.mode ?? "final";
+  if (mode === "final" && params.kind && params.kind !== "final") return params.payload;
+
+  const text = params.payload.text ?? "";
+  if (!text.trim()) return params.payload;
+  if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ??
0) > 0) return params.payload; + if (text.includes("MEDIA:")) return params.payload; + if (text.trim().length < 10) return params.payload; + + const maxLength = getTtsMaxLength(prefsPath); + let textForAudio = text.trim(); + let wasSummarized = false; + + if (textForAudio.length > maxLength) { + if (!isSummarizationEnabled(prefsPath)) { + logVerbose( + `TTS: skipping long text (${textForAudio.length} > ${maxLength}), summarization disabled.`, + ); + return params.payload; + } + + const openaiKey = resolveTtsApiKey(config, "openai"); + if (!openaiKey) { + logVerbose("TTS: skipping summarization - OpenAI key missing."); + return params.payload; + } + + try { + const summary = await summarizeText(textForAudio, maxLength, openaiKey, config.timeoutMs); + textForAudio = summary.summary; + wasSummarized = true; + if (textForAudio.length > config.maxTextLength) { + logVerbose( + `TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`, + ); + textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`; + } + } catch (err) { + const error = err as Error; + logVerbose(`TTS: summarization failed: ${error.message}`); + return params.payload; + } + } + + const ttsStart = Date.now(); + const result = await textToSpeech({ + text: textForAudio, + cfg: params.cfg, + prefsPath, + channel: params.channel, + }); + + if (result.success && result.audioPath) { + lastTtsAttempt = { + timestamp: Date.now(), + success: true, + textLength: text.length, + summarized: wasSummarized, + provider: result.provider, + latencyMs: result.latencyMs, + }; + + const channelId = resolveChannelId(params.channel); + const shouldVoice = channelId === "telegram" && result.voiceCompatible === true; + + return { + ...params.payload, + mediaUrl: result.audioPath, + audioAsVoice: shouldVoice || params.payload.audioAsVoice, + }; + } + + lastTtsAttempt = { + timestamp: Date.now(), + success: false, + textLength: text.length, + summarized: wasSummarized, + error: result.error, + }; + + const latency = Date.now() - ttsStart; + logVerbose(`TTS: conversion failed after ${latency}ms (${result.error ?? "unknown"}).`); + return params.payload; +} + +export const _test = { + isValidVoiceId, + isValidOpenAIVoice, + isValidOpenAIModel, + OPENAI_TTS_MODELS, + OPENAI_TTS_VOICES, + summarizeText, + resolveOutputFormat, +};
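
Reviewer note: a minimal sketch of driving the new core module directly, using only exports visible in the hunks above (`textToSpeech`, `maybeApplyTtsToPayload`, and `loadConfig` from src/config/config.js). The sample text, the "telegram" channel id, and the bare `{ text }` payload shape are illustrative assumptions, not part of the patch.

// sketch.ts (ESM, top-level await) — hypothetical file, not added by this patch.
import { loadConfig } from "./src/config/config.js";
import { maybeApplyTtsToPayload, textToSpeech } from "./src/tts/tts.js";

const cfg = loadConfig();

// Direct conversion: tries the preferred provider, falls back to the other,
// and writes a temp file that scheduleCleanup() removes after ~5 minutes.
const result = await textToSpeech({ text: "Build finished.", cfg, channel: "telegram" });
if (result.success && result.audioPath) {
  // "telegram" resolves to .opus with voiceCompatible=true; other channels get .mp3.
  console.log(result.provider, result.outputFormat, result.audioPath);
}

// Payload decoration: a no-op unless TTS is enabled via config or user prefs;
// text over the user's /tts_limit is summarized first when auto-summary is on.
const decorated = await maybeApplyTtsToPayload({
  payload: { text: "Deploy complete. All checks passed." },
  cfg,
  channel: "telegram",
  kind: "final",
});
console.log(decorated.mediaUrl, decorated.audioAsVoice);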