feat(extensions): add telegram-tts extension for voice responses

Add a new extension that provides automatic text-to-speech for chat responses using the ElevenLabs API.

Features:
- `speak` tool for converting text to voice messages
- RPC methods: tts.status, tts.enable, tts.disable, tts.convert
- User preferences file for persistent TTS state
- Configurable voice ID, model, and max text length

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 00:38:43 +00:00
parent 202d7af855
commit 5428c97685
4 changed files with 434 additions and 0 deletions
--- a/extensions/telegram-tts/README.md
+++ b/extensions/telegram-tts/README.md
@@ -0,0 +1,122 @@
+# Telegram TTS Extension
+
+Automatic text-to-speech for chat responses using ElevenLabs.
+
+## Features
+
+- **`speak` Tool**: Converts text to speech and sends as voice message
+- **RPC Methods**: Control TTS via Gateway (`tts.status`, `tts.enable`, `tts.disable`, `tts.convert`)
+- **User Preferences**: Persistent TTS state via JSON file
+- **Multi-channel**: Works with Telegram and other channels
+
+## Requirements
+
+- ElevenLabs API key
+- `sag` CLI tool (ElevenLabs TTS wrapper)
+
+## Installation
+
+The extension is bundled with Clawdbot. Enable it in your config:
+
+```json
+{
+  "plugins": {
+    "entries": {
+      "telegram-tts": {
+        "enabled": true,
+        "elevenlabs": {
+          "apiKey": "your-api-key"
+        }
+      }
+    }
+  }
+}
+```
+
+Or set the API key via environment variable (it can also be placed in `~/.clawdbot/.env`, which the extension loads before invoking `sag`):
+
+```bash
+export ELEVENLABS_API_KEY=your-api-key
+```
+
+## Configuration
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `enabled` | boolean | `false` | Enable the plugin |
+| `provider` | string | `"elevenlabs"` | TTS provider (`"elevenlabs"` or `"openai"`; only ElevenLabs is implemented so far) |
+| `elevenlabs.apiKey` | string | - | ElevenLabs API key |
+| `elevenlabs.voiceId` | string | `"pMsXgVXv3BLzUgSXRplE"` | Voice ID |
+| `elevenlabs.modelId` | string | `"eleven_multilingual_v2"` | Model ID |
+| `prefsPath` | string | `~/clawd/.user-preferences.json` | User preferences file |
+| `maxTextLength` | number | `4000` | Max characters for TTS |
+
+## Usage
+
+### Agent Tool
+
+The agent can use the `speak` tool to send voice messages:
+
+```
+User: Send me a voice message saying hello
+Agent: [calls speak({ text: "Hello! How can I help you today?" })]
+```
+
+### RPC Methods
+
+```bash
+# Check TTS status
+clawdbot gateway call tts.status
+
+# Enable/disable TTS
+clawdbot gateway call tts.enable
+clawdbot gateway call tts.disable
+
+# Convert text to audio
+clawdbot gateway call tts.convert '{"text": "Hello world"}'
+```
+
+### Telegram Commands
+
+Add custom commands to toggle TTS mode:
+
+```json
+{
+  "channels": {
+    "telegram": {
+      "customCommands": [
+        {"command": "tts_on", "description": "Enable voice responses"},
+        {"command": "tts_off", "description": "Disable voice responses"}
+      ]
+    }
+  }
+}
+```
+
+Then add handling instructions to your agent workspace (CLAUDE.md or TOOLS.md).
+
+## Dependencies
+
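+This extension requires the `sag` CLI tool. It is called with the text as its only argument and must print the path of the generated audio file to stdout. On Linux, you can create a Python wrapper: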
+
+```python
+#!/usr/bin/env python3
+# ~/.local/bin/sag
+from elevenlabs.client import ElevenLabs
+import sys, os, tempfile
+
+client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])
+audio = client.text_to_speech.convert(
+    voice_id=os.environ.get("ELEVENLABS_VOICE_ID", "pMsXgVXv3BLzUgSXRplE"),
+    model_id="eleven_multilingual_v2",
+    text=sys.argv[1]
+)
+with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+    for chunk in audio:
+        f.write(chunk)
+    print(f.name)
+```
+
+## License
+
+MIT
--- a/extensions/telegram-tts/clawdbot.plugin.json
+++ b/extensions/telegram-tts/clawdbot.plugin.json
@@ -0,0 +1,81 @@
+{
+  "id": "telegram-tts",
+  "uiHints": {
+    "enabled": {
+      "label": "Enable TTS",
+      "help": "Automatically convert text responses to voice messages"
+    },
+    "provider": {
+      "label": "TTS Provider"
+    },
+    "elevenlabs.apiKey": {
+      "label": "ElevenLabs API Key",
+      "sensitive": true
+    },
+    "elevenlabs.voiceId": {
+      "label": "ElevenLabs Voice ID",
+      "help": "Default: pMsXgVXv3BLzUgSXRplE (Borislav)"
+    },
+    "elevenlabs.modelId": {
+      "label": "ElevenLabs Model ID",
+      "help": "Default: eleven_multilingual_v2"
+    },
+    "prefsPath": {
+      "label": "User Preferences File",
+      "help": "Path to JSON file storing TTS state",
+      "advanced": true
+    },
+    "maxTextLength": {
+      "label": "Max Text Length",
+      "help": "Maximum characters to convert to speech",
+      "advanced": true
+    }
+  },
+  "configSchema": {
+    "type": "object",
+    "additionalProperties": false,
+    "properties": {
+      "enabled": {
+        "type": "boolean",
+        "default": false
+      },
+      "provider": {
+        "type": "string",
+        "enum": ["elevenlabs", "openai"],
+        "default": "elevenlabs"
+      },
+      "elevenlabs": {
+        "type": "object",
+        "additionalProperties": false,
+        "properties": {
+          "apiKey": {
+            "type": "string"
+          },
+          "voiceId": {
+            "type": "string",
+            "default": "pMsXgVXv3BLzUgSXRplE"
+          },
+          "modelId": {
+            "type": "string",
+            "default": "eleven_multilingual_v2"
+          }
+        }
+      },
+      "prefsPath": {
+        "type": "string"
+      },
+      "maxTextLength": {
+        "type": "integer",
+        "minimum": 1,
+        "default": 4000
+      },
+      "channels": {
+        "type": "array",
+        "items": {
+          "type": "string"
+        },
+        "default": ["telegram"]
+      }
+    }
+  }
+}
--- a/extensions/telegram-tts/index.ts
+++ b/extensions/telegram-tts/index.ts
@@ -0,0 +1,224 @@
+/**
+ * telegram-tts - Automatic TTS for chat responses
+ *
+ * This plugin provides a `speak` tool that converts text to speech using
+ * ElevenLabs API and sends the response as a voice message.
+ *
+ * When TTS mode is enabled (via user preferences or config), the agent
+ * is instructed to use the speak tool for all responses.
+ */
+
+import { execFileSync, execSync } from "child_process";
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
+import { homedir } from "os";
+import { dirname, join } from "path";
+import type { PluginApi, PluginConfig } from "clawdbot";
+
+// Plugin identifier: used as the log-line prefix, returned by `tts.status`, and exported as `meta.id`.
+const PLUGIN_ID = "telegram-tts";
+
+/**
+ * Plugin configuration, mirroring `configSchema` in `clawdbot.plugin.json`.
+ * Every field is optional; `register` casts `api.pluginConfig` to this shape.
+ */
+interface TelegramTtsConfig {
+  /** Enables the plugin (schema default: false). */
+  enabled?: boolean;
+  /** TTS provider (schema default: "elevenlabs"). */
+  provider?: "elevenlabs" | "openai";
+  /** ElevenLabs settings. */
+  elevenlabs?: {
+    apiKey?: string;
+    /** Schema default: "pMsXgVXv3BLzUgSXRplE". */
+    voiceId?: string;
+    /** Schema default: "eleven_multilingual_v2". */
+    modelId?: string;
+  };
+  /** Path of the JSON preferences file that stores the TTS on/off state. */
+  prefsPath?: string;
+  /** Maximum number of characters to convert; `register` falls back to 4000 when unset. */
+  maxTextLength?: number;
+  /** Schema default: ["telegram"]. NOTE(review): not read anywhere in this file. */
+  channels?: string[];
+}
+
+/**
+ * Shape of the JSON preferences file as far as this plugin is concerned.
+ * Only `tts.enabled` is read (`isTtsEnabled`) and written (`setTtsEnabled`).
+ */
+interface UserPreferences {
+  tts?: {
+    /** TTS counts as on only when this is exactly `true`. */
+    enabled?: boolean;
+  };
+}
+
+/**
+ * Build the environment for child processes: a copy of `process.env` overlaid
+ * with `KEY=VALUE` pairs read from `~/.clawdbot/.env`.
+ *
+ * Blank lines and lines starting with `#` are skipped. Whitespace around the
+ * key and around the value is trimmed, and one pair of matching surrounding
+ * quotes (single or double) is removed from the value. Entries from the file
+ * take precedence over variables already present in `process.env`.
+ *
+ * @returns A fresh object; `process.env` itself is not modified.
+ */
+function loadEnv(): Record<string, string> {
+  // Prefer $HOME when set; otherwise ask the OS instead of assuming a fixed path
+  const envPath = join(process.env.HOME || homedir(), ".clawdbot", ".env");
+
+  // process.env values are `string | undefined`; copy only the defined ones
+  const env: Record<string, string> = {};
+  for (const [name, value] of Object.entries(process.env)) {
+    if (value !== undefined) env[name] = value;
+  }
+
+  if (!existsSync(envPath)) return env;
+
+  for (const rawLine of readFileSync(envPath, "utf8").split("\n")) {
+    const line = rawLine.trim();
+    if (!line || line.startsWith("#")) continue;
+
+    // Split on the first "=" only, so values may themselves contain "="
+    const eq = line.indexOf("=");
+    if (eq <= 0) continue; // no "=" at all, or nothing before it
+
+    const key = line.slice(0, eq).trim();
+    let value = line.slice(eq + 1).trim();
+
+    // Strip quotes only when there is a real opening/closing pair
+    const quote = value.charAt(0);
+    if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
+      value = value.slice(1, -1);
+    }
+    env[key] = value;
+  }
+  return env;
+}
+
+/**
+ * Report whether the preferences file at `prefsPath` has TTS switched on.
+ *
+ * Only a literal `tts.enabled === true` counts. A missing file, an unreadable
+ * file, or malformed JSON all yield `false`.
+ */
+function isTtsEnabled(prefsPath: string): boolean {
+  let stored: UserPreferences | null | undefined;
+  try {
+    stored = existsSync(prefsPath)
+      ? (JSON.parse(readFileSync(prefsPath, "utf8")) as UserPreferences)
+      : undefined;
+  } catch {
+    return false;
+  }
+  const flag = stored?.tts?.enabled;
+  return flag === true;
+}
+
+/**
+ * Persist the TTS on/off flag to the preferences file.
+ *
+ * Existing content is preserved: other top-level keys and other keys under
+ * `tts` are kept, only `tts.enabled` is replaced. If the file is missing,
+ * unreadable, malformed, or does not contain a JSON object, it is rewritten
+ * starting from empty preferences. The parent directory is created if needed.
+ *
+ * @param prefsPath - Path of the JSON preferences file.
+ * @param enabled - New value for `tts.enabled`.
+ * @throws If the directory or the file cannot be written.
+ */
+function setTtsEnabled(prefsPath: string, enabled: boolean): void {
+  const isRecord = (v: unknown): v is Record<string, unknown> =>
+    typeof v === "object" && v !== null && !Array.isArray(v);
+
+  let prefs: UserPreferences = {};
+  try {
+    if (existsSync(prefsPath)) {
+      const parsed: unknown = JSON.parse(readFileSync(prefsPath, "utf8"));
+      // `null`, arrays and primitives are valid JSON but cannot hold a `tts` key
+      if (isRecord(parsed)) prefs = parsed as UserPreferences;
+    }
+  } catch {
+    // Unreadable or malformed file: fall through and rewrite it from scratch
+  }
+
+  // Merge instead of replace so sibling settings under `tts` survive
+  prefs.tts = { ...(isRecord(prefs.tts) ? prefs.tts : {}), enabled };
+
+  mkdirSync(dirname(prefsPath), { recursive: true });
+  writeFileSync(prefsPath, JSON.stringify(prefs, null, 2));
+}
+
+/**
+ * Convert text to audio by running the `sag` CLI (ElevenLabs wrapper).
+ *
+ * `sag` is executed directly (no shell) with `text` as its single argument and
+ * the environment returned by `loadEnv()`. It is expected to print the path of
+ * the generated audio file on stdout. The call is synchronous and is aborted
+ * after 60 seconds.
+ *
+ * @param text - Text to synthesize.
+ * @returns Path of the generated file, or `null` when `text` is blank, `sag`
+ *   fails or times out, or the printed path does not exist.
+ */
+function textToAudio(text: string): string | null {
+  if (typeof text !== "string" || text.trim() === "") return null;
+
+  try {
+    // Passing the text as an argv entry bypasses the shell entirely, so quotes,
+    // `$`, backticks and the like cannot be interpreted as shell syntax.
+    const audioPath = execFileSync("sag", [text], {
+      encoding: "utf8",
+      timeout: 60000,
+      env: loadEnv(),
+    }).trim();
+
+    return audioPath && existsSync(audioPath) ? audioPath : null;
+  } catch (err: unknown) {
+    const reason = err instanceof Error ? err.message : String(err);
+    console.error(`[${PLUGIN_ID}] TTS error:`, reason);
+    return null;
+  }
+}
+
+/**
+ * Plugin registration.
+ *
+ * Registers the `speak` agent tool and the gateway RPC methods `tts.status`,
+ * `tts.enable`, `tts.disable` and `tts.convert`.
+ *
+ * The preferences file is resolved from, in order: `config.prefsPath`, the
+ * `CLAWDBOT_TTS_PREFS` environment variable, then
+ * `<home>/clawd/.user-preferences.json`.
+ *
+ * @param api - Host plugin API; `logger`, `pluginConfig`, `registerTool` and
+ *   `registerGatewayMethod` are used.
+ */
+export default function register(api: PluginApi) {
+  const log = api.logger;
+  // NOTE(review): `config.elevenlabs` (apiKey / voiceId / modelId) is never
+  // forwarded to `textToAudio`; credentials currently reach `sag` only through
+  // the process environment and ~/.clawdbot/.env. Confirm whether that is intended.
+  const config = (api.pluginConfig || {}) as TelegramTtsConfig;
+  const maxLen = config.maxTextLength || 4000;
+  const prefsPath =
+    config.prefsPath ||
+    process.env.CLAWDBOT_TTS_PREFS ||
+    join(process.env.HOME || homedir(), "clawd", ".user-preferences.json");
+
+  /** Returns a description of what is wrong with `text`, or `null` when it can be converted. */
+  const checkText = (text: unknown): string | null => {
+    if (typeof text !== "string" || !text) return "No text provided";
+    if (text.length > maxLen) {
+      return `Text too long (${text.length} chars, max ${maxLen})`;
+    }
+    return null;
+  };
+
+  log.info(`[${PLUGIN_ID}] Registering plugin...`);
+  log.info(`[${PLUGIN_ID}] Preferences path: ${prefsPath}`);
+
+  // Register the 'speak' tool for TTS
+  api.registerTool({
+    name: "speak",
+    description:
+      "Convert text to speech and send as voice message. Use this tool when TTS mode is enabled or when the user requests an audio response.",
+    parameters: {
+      type: "object",
+      properties: {
+        text: {
+          type: "string",
+          description: "The text to convert to speech and send as voice message",
+        },
+      },
+      required: ["text"],
+    },
+    execute: async (_id: string, params: { text: string }) => {
+      const text = params?.text;
+      log.info(`[${PLUGIN_ID}] speak() called, text length: ${text?.length || 0}`);
+
+      const problem = checkText(text);
+      if (problem) {
+        return { content: [{ type: "text", text: `Error: ${problem}` }] };
+      }
+
+      const audioPath = textToAudio(text);
+
+      if (audioPath) {
+        log.info(`[${PLUGIN_ID}] Audio generated: ${audioPath}`);
+        return {
+          content: [{ type: "text", text: `Voice message generated successfully.` }],
+          media: audioPath,
+          asVoice: true,
+        };
+      }
+
+      // Hand the original text back so the agent can still answer in writing
+      log.error(`[${PLUGIN_ID}] TTS conversion failed`);
+      return {
+        content: [{ type: "text", text: `TTS conversion failed. Original: ${text}` }],
+      };
+    },
+  });
+
+  // Register Gateway RPC methods
+  api.registerGatewayMethod("tts.status", async () => ({
+    enabled: isTtsEnabled(prefsPath),
+    prefsPath,
+    pluginId: PLUGIN_ID,
+    config: {
+      provider: config.provider || "elevenlabs",
+      maxTextLength: maxLen,
+    },
+  }));
+
+  api.registerGatewayMethod("tts.enable", async () => {
+    setTtsEnabled(prefsPath, true);
+    return { ok: true, enabled: true };
+  });
+
+  api.registerGatewayMethod("tts.disable", async () => {
+    setTtsEnabled(prefsPath, false);
+    return { ok: true, enabled: false };
+  });
+
+  api.registerGatewayMethod("tts.convert", async (params: { text: string }) => {
+    // Same validation as `speak`, so the RPC cannot bypass the length limit
+    // and a call without params yields an error object instead of throwing.
+    const problem = checkText(params?.text);
+    if (problem) return { ok: false, error: problem };
+
+    const audioPath = textToAudio(params.text);
+    return audioPath ? { ok: true, audioPath } : { ok: false, error: "Conversion failed" };
+  });
+
+  log.info(
+    `[${PLUGIN_ID}] Plugin ready. TTS is currently ${isTtsEnabled(prefsPath) ? "ENABLED" : "disabled"}`
+  );
+}
+
+/**
+ * Static plugin metadata exported alongside the default `register` function.
+ * `id` reuses `PLUGIN_ID`; `version` and `description` match `package.json`.
+ * NOTE(review): presumably read by the plugin loader — confirm against the host.
+ */
+export const meta = {
+  id: PLUGIN_ID,
+  name: "Telegram TTS",
+  description: "Automatic text-to-speech for chat responses using ElevenLabs",
+  version: "0.1.0",
+};
--- a/extensions/telegram-tts/package.json
+++ b/extensions/telegram-tts/package.json
@@ -0,0 +1,7 @@
+{
+  "name": "@clawdbot/telegram-tts",
+  "version": "0.1.0",
+  "private": true,
+  "description": "Automatic text-to-speech for chat responses using ElevenLabs",
+  "main": "index.ts"
+}