feat(extensions): add telegram-tts extension for voice responses

Add a new extension that provides automatic text-to-speech for chat responses using ElevenLabs API. Features: - `speak` tool for converting text to voice messages - RPC methods: tts.status, tts.enable, tts.disable, tts.convert - User preferences file for persistent TTS state - Configurable voice ID, model, and max text length Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 00:38:43 +00:00
parent 202d7af855
commit 5428c97685
4 changed files with 434 additions and 0 deletions
--- a/extensions/telegram-tts/README.md
+++ b/extensions/telegram-tts/README.md
@@ -0,0 +1,122 @@
 # Telegram TTS Extension
 Automatic text-to-speech for chat responses using ElevenLabs.
 ## Features
 - **`speak` Tool**: Converts text to speech and sends as voice message
 - **RPC Methods**: Control TTS via Gateway (`tts.status`, `tts.enable`, `tts.disable`, `tts.convert`)
 - **User Preferences**: Persistent TTS state via JSON file
 - **Multi-channel**: Works with Telegram and other channels
 ## Requirements
 - ElevenLabs API key
 - `sag` CLI tool (ElevenLabs TTS wrapper)
 ## Installation
 The extension is bundled with Clawdbot. Enable it in your config:
 ```json
 {
  "plugins": {
    "entries": {
      "telegram-tts": {
        "enabled": true,
        "elevenlabs": {
          "apiKey": "your-api-key"
        }
      }
    }
  }
 }
 ```
 Or set the API key via environment variable:
 ```bash
 export ELEVENLABS_API_KEY=your-api-key
 ```
 ## Configuration
 | Option | Type | Default | Description |
 |--------|------|---------|-------------|
 | `enabled` | boolean | `false` | Enable the plugin |
 | `provider` | string | `"elevenlabs"` | TTS provider |
 | `elevenlabs.apiKey` | string | - | ElevenLabs API key |
 | `elevenlabs.voiceId` | string | `"pMsXgVXv3BLzUgSXRplE"` | Voice ID |
 | `elevenlabs.modelId` | string | `"eleven_multilingual_v2"` | Model ID |
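 | `prefsPath` | string | `~/clawd/.user-preferences.json` | User preferences file; when unset, the `CLAWDBOT_TTS_PREFS` environment variable is used before falling back to the default |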
 | `maxTextLength` | number | `4000` | Max characters for TTS |
 ## Usage
 ### Agent Tool
 The agent can use the `speak` tool to send voice messages:
 ```
 User: Send me a voice message saying hello
 Agent: [calls speak({ text: "Hello! How can I help you today?" })]
 ```
 ### RPC Methods
 ```bash
 # Check TTS status
 clawdbot gateway call tts.status
 # Enable/disable TTS
 clawdbot gateway call tts.enable
 clawdbot gateway call tts.disable
 # Convert text to audio
 clawdbot gateway call tts.convert '{"text": "Hello world"}'
 ```
 ### Telegram Commands
 Add custom commands to toggle TTS mode:
 ```json
 {
  "channels": {
    "telegram": {
      "customCommands": [
        {"command": "tts_on", "description": "Enable voice responses"},
        {"command": "tts_off", "description": "Disable voice responses"}
      ]
    }
  }
 }
 ```
 Then add handling instructions to your agent workspace (CLAUDE.md or TOOLS.md).
 ## Dependencies
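 This extension requires the `sag` CLI tool. The plugin runs `sag` with the text to speak as its only argument and expects it to print the path of the generated audio file on stdout. On Linux, you can create a Python wrapper that does this: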
 ```python
 #!/usr/bin/env python3
 # ~/.local/bin/sag
 from elevenlabs.client import ElevenLabs
 import sys, os, tempfile
 client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])
 audio = client.text_to_speech.convert(
    voice_id=os.environ.get("ELEVENLABS_VOICE_ID", "pMsXgVXv3BLzUgSXRplE"),
    model_id="eleven_multilingual_v2",
    text=sys.argv[1]
 )
 with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
    for chunk in audio:
        f.write(chunk)
    print(f.name)
 ```
 ## License
 MIT
--- a/extensions/telegram-tts/clawdbot.plugin.json
+++ b/extensions/telegram-tts/clawdbot.plugin.json
@@ -0,0 +1,81 @@
 {
  "id": "telegram-tts",
  "uiHints": {
    "enabled": {
      "label": "Enable TTS",
      "help": "Automatically convert text responses to voice messages"
    },
    "provider": {
      "label": "TTS Provider"
    },
    "elevenlabs.apiKey": {
      "label": "ElevenLabs API Key",
      "sensitive": true
    },
    "elevenlabs.voiceId": {
      "label": "ElevenLabs Voice ID",
      "help": "Default: pMsXgVXv3BLzUgSXRplE (Borislav)"
    },
    "elevenlabs.modelId": {
      "label": "ElevenLabs Model ID",
      "help": "Default: eleven_multilingual_v2"
    },
    "prefsPath": {
      "label": "User Preferences File",
      "help": "Path to JSON file storing TTS state",
      "advanced": true
    },
    "maxTextLength": {
      "label": "Max Text Length",
      "help": "Maximum characters to convert to speech",
      "advanced": true
    }
  },
  "configSchema": {
    "type": "object",
    "additionalProperties": false,
    "properties": {
      "enabled": {
        "type": "boolean",
        "default": false
      },
      "provider": {
        "type": "string",
        "enum": ["elevenlabs", "openai"],
        "default": "elevenlabs"
      },
      "elevenlabs": {
        "type": "object",
        "additionalProperties": false,
        "properties": {
          "apiKey": {
            "type": "string"
          },
          "voiceId": {
            "type": "string",
            "default": "pMsXgVXv3BLzUgSXRplE"
          },
          "modelId": {
            "type": "string",
            "default": "eleven_multilingual_v2"
          }
        }
      },
      "prefsPath": {
        "type": "string"
      },
      "maxTextLength": {
        "type": "integer",
        "minimum": 1,
        "default": 4000
      },
      "channels": {
        "type": "array",
        "items": {
          "type": "string"
        },
        "default": ["telegram"]
      }
    }
  }
 }
--- a/extensions/telegram-tts/index.ts
+++ b/extensions/telegram-tts/index.ts
@@ -0,0 +1,224 @@
 /**
 * telegram-tts - Automatic TTS for chat responses
 *
 * This plugin provides a `speak` tool that converts text to speech using
 * ElevenLabs API and sends the response as a voice message.
 *
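 * The plugin keeps an on/off TTS flag in a user preferences JSON file and
 * exposes it over RPC (tts.status / tts.enable / tts.disable). It does not
 * instruct the agent itself: telling the agent to answer through the speak
 * tool while the flag is on is done in the agent workspace instructions.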
 */
 import { execFileSync, execSync } from "child_process";
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
 import { homedir } from "os";
 import { dirname, join } from "path";
 import type { PluginApi, PluginConfig } from "clawdbot";
 /** Plugin identifier: used as the log prefix, as `meta.id`, and as `pluginId` in the `tts.status` reply. */
 const PLUGIN_ID = "telegram-tts";
 /**
 * Shape that `register` assumes for `api.pluginConfig`. Every field is optional.
 */
 interface TelegramTtsConfig {
  /** NOTE(review): not read anywhere in this file; presumably consumed by the plugin host - confirm. */
  enabled?: boolean;
  /** Reported by the `tts.status` RPC method; "elevenlabs" is reported when unset. */
  provider?: "elevenlabs" | "openai";
  /**
  * ElevenLabs settings.
  * NOTE(review): none of these fields are read in this file; `textToAudio` only
  * hands environment variables to `sag` - confirm how they are meant to reach it.
  */
  elevenlabs?: {
    apiKey?: string;
    voiceId?: string;
    modelId?: string;
  };
  /** Path of the preferences JSON file; `register` falls back to CLAWDBOT_TTS_PREFS, then a default under the home directory. */
  prefsPath?: string;
  /** Longest text (in characters) the `speak` tool accepts; 4000 is used when unset. */
  maxTextLength?: number;
  /** NOTE(review): not read anywhere in this file - confirm intended use. */
  channels?: string[];
 }
 /**
 * The part of the preferences JSON file that this plugin reads and writes.
 */
 interface UserPreferences {
  tts?: {
    /** TTS counts as on only when this is exactly `true` (see `isTtsEnabled`). */
    enabled?: boolean;
  };
 }
 /**
 * Build the environment handed to the `sag` child process.
 *
 * Starts from a copy of `process.env` and overlays `KEY=value` pairs read from
 * `<home>/.clawdbot/.env`, so entries in that file win over inherited variables.
 * `<home>` is `$HOME` when set, otherwise the OS-reported home directory.
 *
 * Parsing rules for the file:
 * - blank lines and lines starting with `#` are skipped;
 * - lines without `=` or with an empty key are skipped;
 * - an optional leading `export ` is dropped from the key;
 * - whitespace around key and value is trimmed;
 * - one pair of matching single or double quotes around the value is removed.
 * Inline `# comments` after a value are NOT stripped.
 *
 * @returns A fresh object; mutating it does not affect `process.env`.
 * @throws If the file exists but cannot be read.
 */
 function loadEnv(): Record<string, string> {
  // Copy explicitly instead of casting: process.env values are typed `string | undefined`.
  const env: Record<string, string> = {};
  for (const [name, inherited] of Object.entries(process.env)) {
    if (inherited !== undefined) env[name] = inherited;
  }
  const envPath = join(process.env.HOME || homedir(), ".clawdbot", ".env");
  if (!existsSync(envPath)) return env;
  for (const rawLine of readFileSync(envPath, "utf8").split(/\r?\n/)) {
    const line = rawLine.trim();
    if (!line || line.startsWith("#")) continue;
    // Split on the first "=" only, so values may themselves contain "=".
    const eq = line.indexOf("=");
    if (eq <= 0) continue;
    const key = line.slice(0, eq).replace(/^export\s+/, "").trim();
    if (!key) continue;
    let value = line.slice(eq + 1).trim();
    const quote = value.charAt(0);
    if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
      value = value.slice(1, -1);
    }
    env[key] = value;
  }
  return env;
 }
 /**
 * Report whether the preferences file at `prefsPath` has TTS switched on.
 *
 * Only a literal `true` at `tts.enabled` counts as on. A missing file,
 * unreadable file, malformed JSON, or any other value yields `false`.
 *
 * @param prefsPath Path of the preferences JSON file.
 * @returns `true` only when `tts.enabled === true` in the file.
 */
 function isTtsEnabled(prefsPath: string): boolean {
  let flag: unknown = false;
  try {
    if (existsSync(prefsPath)) {
      const raw = readFileSync(prefsPath, "utf8");
      const parsed: UserPreferences | null = JSON.parse(raw);
      flag = parsed?.tts?.enabled;
    }
  } catch {
    // Read or parse failure is treated the same as "not enabled".
    flag = false;
  }
  return flag === true;
 }
 /**
 * Persist the TTS on/off flag in the preferences file at `prefsPath`.
 *
 * Existing content is kept: other top-level keys and other keys under `tts`
 * survive the update. If the file is missing, unreadable, malformed, or does
 * not hold a JSON object, the function starts from an empty preferences object
 * (best effort - a corrupt file is overwritten).
 * Missing parent directories are created before writing.
 *
 * @param prefsPath Path of the preferences JSON file.
 * @param enabled New value for `tts.enabled`.
 * @throws If the directory cannot be created or the file cannot be written.
 */
 function setTtsEnabled(prefsPath: string, enabled: boolean): void {
  let prefs: UserPreferences = {};
  try {
    if (existsSync(prefsPath)) {
      const parsed: unknown = JSON.parse(readFileSync(prefsPath, "utf8"));
      // Only adopt plain objects: `null` would throw below and an array would drop the flag on stringify.
      if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
        prefs = parsed as UserPreferences;
      }
    }
  } catch {
    // Unreadable or malformed file: fall through with empty preferences.
  }
  const currentTts: unknown = prefs.tts;
  const keptTts =
    typeof currentTts === "object" && currentTts !== null && !Array.isArray(currentTts)
      ? currentTts
      : {};
  prefs.tts = { ...keptTts, enabled };
  // The default location lives in a directory that may not exist yet.
  mkdirSync(dirname(prefsPath), { recursive: true });
  writeFileSync(prefsPath, JSON.stringify(prefs, null, 2));
 }
 /**
 * Convert text to an audio file by running the `sag` CLI.
 *
 * `sag` is invoked with `text` as its single argument and the environment
 * produced by `loadEnv()`. Its trimmed stdout is taken to be the path of the
 * generated audio file. The call is synchronous and is aborted after 60 seconds.
 *
 * @param text Text to synthesize.
 * @returns The audio file path when `sag` printed a path that exists on disk;
 *   `null` when `sag` fails, times out, or prints nothing / a non-existent path.
 *   Failures are logged to stderr and never thrown.
 */
 function textToAudio(text: string): string | null {
  try {
    // Pass the text as an argv entry instead of building a shell command line:
    // no shell is involved, so nothing in `text` can be interpreted as shell syntax.
    const reportedPath = execFileSync("sag", [text], {
      encoding: "utf8",
      timeout: 60000,
      env: loadEnv(),
    }).trim();
    if (reportedPath && existsSync(reportedPath)) {
      return reportedPath;
    }
    console.error(`[${PLUGIN_ID}] TTS error: sag did not report an existing audio file`);
    return null;
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[${PLUGIN_ID}] TTS error:`, reason);
    return null;
  }
 }
 /**
 * Plugin entry point: registers the `speak` tool and the `tts.*` gateway RPC methods.
 *
 * Preferences file location, in order of precedence: `config.prefsPath`, the
 * `CLAWDBOT_TTS_PREFS` environment variable, then
 * `<home>/clawd/.user-preferences.json` (`<home>` is `$HOME` or the OS home directory).
 *
 * Registered surface:
 * - tool `speak({ text })`: converts text to audio and returns it as a voice message;
 * - `tts.status`: current flag, preferences path and effective settings;
 * - `tts.enable` / `tts.disable`: persist the flag; `{ ok: false, error }` on write failure;
 * - `tts.convert({ text })`: converts text and returns `{ ok, audioPath }` or `{ ok: false, error }`.
 * Both `speak` and `tts.convert` reject empty, non-string, or over-long text.
 *
 * NOTE(review): `config.elevenlabs.*` is not forwarded to `sag` here; conversion
 * relies on whatever environment `textToAudio` builds - confirm this is intended.
 *
 * @param api Plugin API supplied by the host.
 */
 export default function register(api: PluginApi) {
  const log = api.logger;
  const config = (api.pluginConfig || {}) as TelegramTtsConfig;
  const prefsPath =
    config.prefsPath ||
    process.env.CLAWDBOT_TTS_PREFS ||
    join(process.env.HOME || homedir(), "clawd", ".user-preferences.json");
  // `||` on purpose: a configured 0 is not a usable limit, so it also falls back to 4000.
  const maxLen = config.maxTextLength || 4000;
  // Returns the reason `text` cannot be converted, or null when it is acceptable.
  const describeTextProblem = (text: unknown): string | null => {
    if (typeof text !== "string" || !text) return "No text provided";
    if (text.length > maxLen) return `Text too long (${text.length} chars, max ${maxLen})`;
    return null;
  };
  // Persists the flag and reports filesystem failures as a result object instead of throwing.
  const persistEnabled = (enabled: boolean) => {
    try {
      setTtsEnabled(prefsPath, enabled);
      return { ok: true, enabled };
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      log.error(`[${PLUGIN_ID}] Failed to update ${prefsPath}: ${reason}`);
      return { ok: false, error: reason };
    }
  };
  log.info(`[${PLUGIN_ID}] Registering plugin...`);
  log.info(`[${PLUGIN_ID}] Preferences path: ${prefsPath}`);
  // Register the 'speak' tool for TTS
  api.registerTool({
    name: "speak",
    description:
      "Convert text to speech and send as voice message. Use this tool when TTS mode is enabled or when the user requests an audio response.",
    parameters: {
      type: "object",
      properties: {
        text: {
          type: "string",
          description: "The text to convert to speech and send as voice message",
        },
      },
      required: ["text"],
    },
    execute: async (_id: string, params: { text: string }) => {
      // Tool arguments come from the model, so do not trust the declared shape.
      const text = params?.text;
      log.info(
        `[${PLUGIN_ID}] speak() called, text length: ${typeof text === "string" ? text.length : 0}`
      );
      const problem = describeTextProblem(text);
      if (problem) {
        return { content: [{ type: "text", text: `Error: ${problem}` }] };
      }
      const audioPath = textToAudio(text);
      if (audioPath) {
        log.info(`[${PLUGIN_ID}] Audio generated: ${audioPath}`);
        return {
          content: [{ type: "text", text: `Voice message generated successfully.` }],
          media: audioPath,
          asVoice: true,
        };
      }
      log.error(`[${PLUGIN_ID}] TTS conversion failed`);
      // Hand the text back so the caller can still deliver it as plain text.
      return {
        content: [{ type: "text", text: `TTS conversion failed. Original: ${text}` }],
      };
    },
  });
  // Register Gateway RPC methods
  api.registerGatewayMethod("tts.status", async () => ({
    enabled: isTtsEnabled(prefsPath),
    prefsPath,
    pluginId: PLUGIN_ID,
    config: {
      provider: config.provider || "elevenlabs",
      maxTextLength: maxLen,
    },
  }));
  api.registerGatewayMethod("tts.enable", async () => persistEnabled(true));
  api.registerGatewayMethod("tts.disable", async () => persistEnabled(false));
  api.registerGatewayMethod("tts.convert", async (params: { text: string }) => {
    // Same validation as the speak tool, so the RPC path cannot bypass the length limit.
    const problem = describeTextProblem(params?.text);
    if (problem) return { ok: false, error: problem };
    const audioPath = textToAudio(params.text);
    return audioPath ? { ok: true, audioPath } : { ok: false, error: "Conversion failed" };
  });
  log.info(
    `[${PLUGIN_ID}] Plugin ready. TTS is currently ${isTtsEnabled(prefsPath) ? "ENABLED" : "disabled"}`
  );
 }
 /**
 * Static plugin metadata exported next to the default `register` function.
 * `description` and `version` repeat the values in this extension's
 * package.json; keep the two in sync when bumping the version.
 */
 export const meta = {
  id: PLUGIN_ID,
  name: "Telegram TTS",
  description: "Automatic text-to-speech for chat responses using ElevenLabs",
  version: "0.1.0",
 };
--- a/extensions/telegram-tts/package.json
+++ b/extensions/telegram-tts/package.json
@@ -0,0 +1,7 @@
 {
  "name": "@clawdbot/telegram-tts",
  "version": "0.1.0",
  "private": true,
  "description": "Automatic text-to-speech for chat responses using ElevenLabs",
  "main": "index.ts"
 }