feat(telegram-tts): make extension self-contained with direct API calls
- Remove sag CLI dependency - Add direct ElevenLabs API integration via fetch - Add OpenAI TTS as alternative provider - Support multi-provider configuration - Add tts.providers RPC method - Update config schema with OpenAI options - Bump version to 0.3.0 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Peter Steinberger
parent
5428c97685
commit
46e6546bb9
@@ -1,18 +1,18 @@
|
|||||||
# Telegram TTS Extension
|
# Telegram TTS Extension
|
||||||
|
|
||||||
Automatic text-to-speech for chat responses using ElevenLabs.
|
Automatic text-to-speech for chat responses using ElevenLabs or OpenAI.
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- **`speak` Tool**: Converts text to speech and sends as voice message
|
- **`speak` Tool**: Converts text to speech and sends as voice message
|
||||||
- **RPC Methods**: Control TTS via Gateway (`tts.status`, `tts.enable`, `tts.disable`, `tts.convert`)
|
- **RPC Methods**: Control TTS via Gateway (`tts.status`, `tts.enable`, `tts.disable`, `tts.convert`, `tts.providers`)
|
||||||
- **User Preferences**: Persistent TTS state via JSON file
|
- **User Preferences**: Persistent TTS state via JSON file
|
||||||
- **Multi-channel**: Works with Telegram and other channels
|
- **Multi-provider**: ElevenLabs and OpenAI TTS support
|
||||||
|
- **Self-contained**: No external CLI dependencies - calls APIs directly
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
- ElevenLabs API key
|
- An ElevenLabs API key or an OpenAI API key (only the key for the selected `provider` is required)
|
||||||
- `sag` CLI tool (ElevenLabs TTS wrapper)
|
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
@@ -24,6 +24,7 @@ The extension is bundled with Clawdbot. Enable it in your config:
|
|||||||
"entries": {
|
"entries": {
|
||||||
"telegram-tts": {
|
"telegram-tts": {
|
||||||
"enabled": true,
|
"enabled": true,
|
||||||
|
"provider": "elevenlabs",
|
||||||
"elevenlabs": {
|
"elevenlabs": {
|
||||||
"apiKey": "your-api-key"
|
"apiKey": "your-api-key"
|
||||||
}
|
}
|
||||||
@@ -33,10 +34,35 @@ The extension is bundled with Clawdbot. Enable it in your config:
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Or set the API key via environment variable:
|
Or use OpenAI:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"plugins": {
|
||||||
|
"entries": {
|
||||||
|
"telegram-tts": {
|
||||||
|
"enabled": true,
|
||||||
|
"provider": "openai",
|
||||||
|
"openai": {
|
||||||
|
"apiKey": "your-api-key",
|
||||||
|
"voice": "nova"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Or set API keys via environment variables:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
# For ElevenLabs
|
||||||
export ELEVENLABS_API_KEY=your-api-key
|
export ELEVENLABS_API_KEY=your-api-key
|
||||||
|
# or
|
||||||
|
export XI_API_KEY=your-api-key
|
||||||
|
|
||||||
|
# For OpenAI
|
||||||
|
export OPENAI_API_KEY=your-api-key
|
||||||
```
|
```
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
@@ -44,13 +70,20 @@ export ELEVENLABS_API_KEY=your-api-key
|
|||||||
| Option | Type | Default | Description |
|
| Option | Type | Default | Description |
|
||||||
|--------|------|---------|-------------|
|
|--------|------|---------|-------------|
|
||||||
| `enabled` | boolean | `false` | Enable the plugin |
|
| `enabled` | boolean | `false` | Enable the plugin |
|
||||||
| `provider` | string | `"elevenlabs"` | TTS provider |
|
| `provider` | string | `"elevenlabs"` | TTS provider (`elevenlabs` or `openai`) |
|
||||||
| `elevenlabs.apiKey` | string | - | ElevenLabs API key |
|
| `elevenlabs.apiKey` | string | - | ElevenLabs API key |
|
||||||
| `elevenlabs.voiceId` | string | `"pMsXgVXv3BLzUgSXRplE"` | Voice ID |
|
| `elevenlabs.voiceId` | string | `"pMsXgVXv3BLzUgSXRplE"` | ElevenLabs Voice ID |
|
||||||
| `elevenlabs.modelId` | string | `"eleven_multilingual_v2"` | Model ID |
|
| `elevenlabs.modelId` | string | `"eleven_multilingual_v2"` | ElevenLabs Model ID |
|
||||||
|
| `openai.apiKey` | string | - | OpenAI API key |
|
||||||
|
| `openai.model` | string | `"tts-1"` | OpenAI model (`tts-1` or `tts-1-hd`) |
|
||||||
|
| `openai.voice` | string | `"alloy"` | OpenAI voice |
|
||||||
| `prefsPath` | string | `~/clawd/.user-preferences.json` | User preferences file |
|
| `prefsPath` | string | `~/clawd/.user-preferences.json` | User preferences file |
|
||||||
| `maxTextLength` | number | `4000` | Max characters for TTS |
|
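| `maxTextLength` | number | `4000` | Max characters for TTS |
| `timeoutMs` | number | `30000` | Max time to wait for the TTS API response, in milliseconds (1000-120000) |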
|
||||||
|
|
||||||
|
### OpenAI Voices
|
||||||
|
|
||||||
|
Available voices: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
### Agent Tool
|
### Agent Tool
|
||||||
@@ -74,6 +107,9 @@ clawdbot gateway call tts.disable
|
|||||||
|
|
||||||
# Convert text to audio
|
# Convert text to audio
|
||||||
clawdbot gateway call tts.convert '{"text": "Hello world"}'
|
clawdbot gateway call tts.convert '{"text": "Hello world"}'
|
||||||
|
|
||||||
|
# List available providers
|
||||||
|
clawdbot gateway call tts.providers
|
||||||
```
|
```
|
||||||
|
|
||||||
### Telegram Commands
|
### Telegram Commands
|
||||||
@@ -86,7 +122,8 @@ Add custom commands to toggle TTS mode:
|
|||||||
"telegram": {
|
"telegram": {
|
||||||
"customCommands": [
|
"customCommands": [
|
||||||
{"command": "tts_on", "description": "Enable voice responses"},
|
{"command": "tts_on", "description": "Enable voice responses"},
|
||||||
{"command": "tts_off", "description": "Disable voice responses"}
|
{"command": "tts_off", "description": "Disable voice responses"},
|
||||||
|
{"command": "audio", "description": "Send response as voice message"}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -95,28 +132,6 @@ Add custom commands to toggle TTS mode:
|
|||||||
|
|
||||||
Then add handling instructions to your agent workspace (CLAUDE.md or TOOLS.md).
|
Then add handling instructions to your agent workspace (CLAUDE.md or TOOLS.md).
|
||||||
|
|
||||||
## Dependencies
|
|
||||||
|
|
||||||
This extension requires the `sag` CLI tool. On Linux, you can create a Python wrapper:
|
|
||||||
|
|
||||||
```python
|
|
||||||
#!/usr/bin/env python3
|
|
||||||
# ~/.local/bin/sag
|
|
||||||
from elevenlabs.client import ElevenLabs
|
|
||||||
import sys, os, tempfile
|
|
||||||
|
|
||||||
client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])
|
|
||||||
audio = client.text_to_speech.convert(
|
|
||||||
voice_id=os.environ.get("ELEVENLABS_VOICE_ID", "pMsXgVXv3BLzUgSXRplE"),
|
|
||||||
model_id="eleven_multilingual_v2",
|
|
||||||
text=sys.argv[1]
|
|
||||||
)
|
|
||||||
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
|
|
||||||
for chunk in audio:
|
|
||||||
f.write(chunk)
|
|
||||||
print(f.name)
|
|
||||||
```
|
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
MIT
|
MIT
|
||||||
|
|||||||
@@ -6,7 +6,8 @@
|
|||||||
"help": "Automatically convert text responses to voice messages"
|
"help": "Automatically convert text responses to voice messages"
|
||||||
},
|
},
|
||||||
"provider": {
|
"provider": {
|
||||||
"label": "TTS Provider"
|
"label": "TTS Provider",
|
||||||
|
"help": "Choose between ElevenLabs or OpenAI for voice synthesis"
|
||||||
},
|
},
|
||||||
"elevenlabs.apiKey": {
|
"elevenlabs.apiKey": {
|
||||||
"label": "ElevenLabs API Key",
|
"label": "ElevenLabs API Key",
|
||||||
@@ -20,6 +21,18 @@
|
|||||||
"label": "ElevenLabs Model ID",
|
"label": "ElevenLabs Model ID",
|
||||||
"help": "Default: eleven_multilingual_v2"
|
"help": "Default: eleven_multilingual_v2"
|
||||||
},
|
},
|
||||||
|
"openai.apiKey": {
|
||||||
|
"label": "OpenAI API Key",
|
||||||
|
"sensitive": true
|
||||||
|
},
|
||||||
|
"openai.model": {
|
||||||
|
"label": "OpenAI TTS Model",
|
||||||
|
"help": "tts-1 (faster) or tts-1-hd (higher quality)"
|
||||||
|
},
|
||||||
|
"openai.voice": {
|
||||||
|
"label": "OpenAI Voice",
|
||||||
|
"help": "alloy, echo, fable, onyx, nova, or shimmer"
|
||||||
|
},
|
||||||
"prefsPath": {
|
"prefsPath": {
|
||||||
"label": "User Preferences File",
|
"label": "User Preferences File",
|
||||||
"help": "Path to JSON file storing TTS state",
|
"help": "Path to JSON file storing TTS state",
|
||||||
@@ -29,6 +42,11 @@
|
|||||||
"label": "Max Text Length",
|
"label": "Max Text Length",
|
||||||
"help": "Maximum characters to convert to speech",
|
"help": "Maximum characters to convert to speech",
|
||||||
"advanced": true
|
"advanced": true
|
||||||
|
},
|
||||||
|
"timeoutMs": {
|
||||||
|
"label": "Request Timeout (ms)",
|
||||||
|
"help": "Maximum time to wait for TTS API response (default: 30000)",
|
||||||
|
"advanced": true
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"configSchema": {
|
"configSchema": {
|
||||||
@@ -61,6 +79,25 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"openai": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": false,
|
||||||
|
"properties": {
|
||||||
|
"apiKey": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"model": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["tts-1", "tts-1-hd"],
|
||||||
|
"default": "tts-1"
|
||||||
|
},
|
||||||
|
"voice": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
|
||||||
|
"default": "alloy"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"prefsPath": {
|
"prefsPath": {
|
||||||
"type": "string"
|
"type": "string"
|
||||||
},
|
},
|
||||||
@@ -69,12 +106,11 @@
|
|||||||
"minimum": 1,
|
"minimum": 1,
|
||||||
"default": 4000
|
"default": 4000
|
||||||
},
|
},
|
||||||
"channels": {
|
"timeoutMs": {
|
||||||
"type": "array",
|
"type": "integer",
|
||||||
"items": {
|
"minimum": 1000,
|
||||||
"type": "string"
|
"maximum": 120000,
|
||||||
},
|
"default": 30000
|
||||||
"default": ["telegram"]
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,21 +1,32 @@
|
|||||||
/**
|
/**
|
||||||
* telegram-tts - Automatic TTS for chat responses
|
* telegram-tts - Automatic TTS for chat responses
|
||||||
*
|
*
|
||||||
* This plugin provides a `speak` tool that converts text to speech using
|
* Self-contained TTS extension that calls ElevenLabs/OpenAI APIs directly.
|
||||||
* ElevenLabs API and sends the response as a voice message.
|
* No external CLI dependencies.
|
||||||
*
|
*
|
||||||
* When TTS mode is enabled (via user preferences or config), the agent
|
* Features:
|
||||||
* is instructed to use the speak tool for all responses.
|
* - speak tool for programmatic TTS
|
||||||
|
* - Multi-provider support (ElevenLabs, OpenAI)
|
||||||
|
* - RPC methods for status and control
|
||||||
|
*
|
||||||
|
* Note: Slash commands (/tts_on, /tts_off, /audio) should be configured
|
||||||
|
* via Telegram customCommands and handled by the agent workspace.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { execSync } from "child_process";
|
import { existsSync, readFileSync, writeFileSync, mkdtempSync, rmSync } from "fs";
|
||||||
import { existsSync, readFileSync, writeFileSync } from "fs";
|
|
||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
import type { PluginApi, PluginConfig } from "clawdbot";
|
import { tmpdir } from "os";
|
||||||
|
import type { PluginApi } from "clawdbot";
|
||||||
|
|
||||||
const PLUGIN_ID = "telegram-tts";
|
const PLUGIN_ID = "telegram-tts";
|
||||||
|
const DEFAULT_TIMEOUT_MS = 30000;
|
||||||
|
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
|
||||||
|
|
||||||
interface TelegramTtsConfig {
|
// =============================================================================
|
||||||
|
// Types
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
interface TtsConfig {
|
||||||
enabled?: boolean;
|
enabled?: boolean;
|
||||||
provider?: "elevenlabs" | "openai";
|
provider?: "elevenlabs" | "openai";
|
||||||
elevenlabs?: {
|
elevenlabs?: {
|
||||||
@@ -23,9 +34,14 @@ interface TelegramTtsConfig {
|
|||||||
voiceId?: string;
|
voiceId?: string;
|
||||||
modelId?: string;
|
modelId?: string;
|
||||||
};
|
};
|
||||||
|
openai?: {
|
||||||
|
apiKey?: string;
|
||||||
|
model?: string;
|
||||||
|
voice?: string;
|
||||||
|
};
|
||||||
prefsPath?: string;
|
prefsPath?: string;
|
||||||
maxTextLength?: number;
|
maxTextLength?: number;
|
||||||
channels?: string[];
|
timeoutMs?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface UserPreferences {
|
interface UserPreferences {
|
||||||
@@ -34,39 +50,44 @@ interface UserPreferences {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
interface TtsResult {
|
||||||
* Load environment variables from .clawdbot/.env
|
success: boolean;
|
||||||
*/
|
audioPath?: string;
|
||||||
function loadEnv(): Record<string, string> {
|
error?: string;
|
||||||
const envPath = join(process.env.HOME || "/home/dev", ".clawdbot", ".env");
|
}
|
||||||
const env: Record<string, string> = { ...process.env } as Record<string, string>;
|
|
||||||
|
|
||||||
if (existsSync(envPath)) {
|
// =============================================================================
|
||||||
const content = readFileSync(envPath, "utf8");
|
// Validation
|
||||||
for (const line of content.split("\n")) {
|
// =============================================================================
|
||||||
const trimmed = line.trim();
|
|
||||||
if (trimmed && !trimmed.startsWith("#")) {
|
/**
|
||||||
const [key, ...valueParts] = trimmed.split("=");
|
* Validates ElevenLabs voiceId format to prevent URL injection.
|
||||||
if (key && valueParts.length > 0) {
|
* Voice IDs are alphanumeric strings, typically 20 characters; lengths of 10-40 are accepted.
|
||||||
let value = valueParts.join("=");
|
*/
|
||||||
// Remove quotes if present
|
function isValidVoiceId(voiceId: string): boolean {
|
||||||
if (
|
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
|
||||||
(value.startsWith('"') && value.endsWith('"')) ||
|
|
||||||
(value.startsWith("'") && value.endsWith("'"))
|
|
||||||
) {
|
|
||||||
value = value.slice(1, -1);
|
|
||||||
}
|
|
||||||
env[key.trim()] = value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return env;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if TTS is enabled in user preferences
|
* Validates OpenAI voice name.
|
||||||
*/
|
*/
|
||||||
|
function isValidOpenAIVoice(voice: string): boolean {
|
||||||
|
const validVoices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"];
|
||||||
|
return validVoices.includes(voice);
|
||||||
|
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// Configuration & Preferences
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
function getPrefsPath(config: TtsConfig): string {
|
||||||
|
return (
|
||||||
|
config.prefsPath ||
|
||||||
|
process.env.CLAWDBOT_TTS_PREFS ||
|
||||||
|
join(process.env.HOME || "/home/dev", "clawd", ".user-preferences.json")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
function isTtsEnabled(prefsPath: string): boolean {
|
function isTtsEnabled(prefsPath: string): boolean {
|
||||||
try {
|
try {
|
||||||
if (!existsSync(prefsPath)) return false;
|
if (!existsSync(prefsPath)) return false;
|
||||||
@@ -77,9 +98,6 @@ function isTtsEnabled(prefsPath: string): boolean {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Set TTS enabled state in user preferences
|
|
||||||
*/
|
|
||||||
function setTtsEnabled(prefsPath: string, enabled: boolean): void {
|
function setTtsEnabled(prefsPath: string, enabled: boolean): void {
|
||||||
let prefs: UserPreferences = {};
|
let prefs: UserPreferences = {};
|
||||||
try {
|
try {
|
||||||
@@ -93,132 +111,368 @@ function setTtsEnabled(prefsPath: string, enabled: boolean): void {
|
|||||||
writeFileSync(prefsPath, JSON.stringify(prefs, null, 2));
|
writeFileSync(prefsPath, JSON.stringify(prefs, null, 2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function getApiKey(config: TtsConfig, provider: string): string | undefined {
|
||||||
|
if (provider === "elevenlabs") {
|
||||||
|
return (
|
||||||
|
config.elevenlabs?.apiKey ||
|
||||||
|
process.env.ELEVENLABS_API_KEY ||
|
||||||
|
process.env.XI_API_KEY
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if (provider === "openai") {
|
||||||
|
return config.openai?.apiKey || process.env.OPENAI_API_KEY;
|
||||||
|
}
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// Temp File Cleanup
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert text to audio using sag CLI (ElevenLabs wrapper)
|
* Schedules cleanup of a temp directory after a delay.
|
||||||
|
* The delay gives the consumer time to read the file before it is deleted (best effort, not a guarantee).
|
||||||
*/
|
*/
|
||||||
function textToAudio(text: string): string | null {
|
function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS): void {
|
||||||
try {
|
setTimeout(() => {
|
||||||
const escapedText = text.replace(/'/g, "'\\''");
|
try {
|
||||||
const env = loadEnv();
|
rmSync(tempDir, { recursive: true, force: true });
|
||||||
|
} catch {
|
||||||
const result = execSync(`sag '${escapedText}'`, {
|
// Ignore cleanup errors
|
||||||
encoding: "utf8",
|
|
||||||
timeout: 60000,
|
|
||||||
env,
|
|
||||||
}).trim();
|
|
||||||
|
|
||||||
if (result && existsSync(result)) {
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
return null;
|
}, delayMs);
|
||||||
} catch (err) {
|
}
|
||||||
console.error(`[${PLUGIN_ID}] TTS error:`, (err as Error).message);
|
|
||||||
return null;
|
// =============================================================================
|
||||||
|
// TTS Providers
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
async function elevenLabsTTS(
|
||||||
|
text: string,
|
||||||
|
apiKey: string,
|
||||||
|
voiceId: string = "pMsXgVXv3BLzUgSXRplE",
|
||||||
|
modelId: string = "eleven_multilingual_v2",
|
||||||
|
timeoutMs: number = DEFAULT_TIMEOUT_MS
|
||||||
|
): Promise<Buffer> {
|
||||||
|
// Validate voiceId to prevent URL injection
|
||||||
|
if (!isValidVoiceId(voiceId)) {
|
||||||
|
throw new Error(`Invalid voiceId format`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const controller = new AbortController();
|
||||||
|
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const response = await fetch(
|
||||||
|
`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`,
|
||||||
|
{
|
||||||
|
method: "POST",
|
||||||
|
headers: {
|
||||||
|
"xi-api-key": apiKey,
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
Accept: "audio/mpeg",
|
||||||
|
},
|
||||||
|
body: JSON.stringify({
|
||||||
|
text,
|
||||||
|
model_id: modelId,
|
||||||
|
voice_settings: {
|
||||||
|
stability: 0.5,
|
||||||
|
similarity_boost: 0.75,
|
||||||
|
style: 0.0,
|
||||||
|
use_speaker_boost: true,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
signal: controller.signal,
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
if (!response.ok) {
|
||||||
|
// Don't leak API error details to users
|
||||||
|
throw new Error(`ElevenLabs API error (${response.status})`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Buffer.from(await response.arrayBuffer());
|
||||||
|
} finally {
|
||||||
|
clearTimeout(timeout);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
async function openaiTTS(
|
||||||
* Plugin registration
|
text: string,
|
||||||
*/
|
apiKey: string,
|
||||||
|
model: string = "tts-1",
|
||||||
|
voice: string = "alloy",
|
||||||
|
timeoutMs: number = DEFAULT_TIMEOUT_MS
|
||||||
|
): Promise<Buffer> {
|
||||||
|
// Validate voice
|
||||||
|
if (!isValidOpenAIVoice(voice)) {
|
||||||
|
throw new Error(`Invalid voice: ${voice}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const controller = new AbortController();
|
||||||
|
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const response = await fetch("https://api.openai.com/v1/audio/speech", {
|
||||||
|
method: "POST",
|
||||||
|
headers: {
|
||||||
|
Authorization: `Bearer ${apiKey}`,
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
},
|
||||||
|
body: JSON.stringify({
|
||||||
|
model,
|
||||||
|
input: text,
|
||||||
|
voice,
|
||||||
|
response_format: "mp3",
|
||||||
|
}),
|
||||||
|
signal: controller.signal,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!response.ok) {
|
||||||
|
// Don't leak API error details to users
|
||||||
|
throw new Error(`OpenAI TTS API error (${response.status})`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Buffer.from(await response.arrayBuffer());
|
||||||
|
} finally {
|
||||||
|
clearTimeout(timeout);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// Core TTS Function
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
async function textToSpeech(text: string, config: TtsConfig): Promise<TtsResult> {
|
||||||
|
const provider = config.provider || "elevenlabs";
|
||||||
|
const apiKey = getApiKey(config, provider);
|
||||||
|
const timeoutMs = config.timeoutMs || DEFAULT_TIMEOUT_MS;
|
||||||
|
|
||||||
|
if (!apiKey) {
|
||||||
|
return {
|
||||||
|
success: false,
|
||||||
|
error: `No API key configured for ${provider}`,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
const maxLen = config.maxTextLength || 4000;
|
||||||
|
if (text.length > maxLen) {
|
||||||
|
return {
|
||||||
|
success: false,
|
||||||
|
error: `Text too long (${text.length} chars, max ${maxLen})`,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
let audioBuffer: Buffer;
|
||||||
|
|
||||||
|
if (provider === "elevenlabs") {
|
||||||
|
audioBuffer = await elevenLabsTTS(
|
||||||
|
text,
|
||||||
|
apiKey,
|
||||||
|
config.elevenlabs?.voiceId,
|
||||||
|
config.elevenlabs?.modelId,
|
||||||
|
timeoutMs
|
||||||
|
);
|
||||||
|
} else if (provider === "openai") {
|
||||||
|
audioBuffer = await openaiTTS(
|
||||||
|
text,
|
||||||
|
apiKey,
|
||||||
|
config.openai?.model,
|
||||||
|
config.openai?.voice,
|
||||||
|
timeoutMs
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
return { success: false, error: `Unknown provider: ${provider}` };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save to temp file
|
||||||
|
const tempDir = mkdtempSync(join(tmpdir(), "tts-"));
|
||||||
|
const audioPath = join(tempDir, `voice-${Date.now()}.mp3`);
|
||||||
|
writeFileSync(audioPath, audioBuffer);
|
||||||
|
|
||||||
|
// Schedule cleanup after delay (file should be consumed by then)
|
||||||
|
scheduleCleanup(tempDir);
|
||||||
|
|
||||||
|
return { success: true, audioPath };
|
||||||
|
} catch (err) {
|
||||||
|
const error = err as Error;
|
||||||
|
if (error.name === "AbortError") {
|
||||||
|
return { success: false, error: "TTS request timed out" };
|
||||||
|
}
|
||||||
|
return {
|
||||||
|
success: false,
|
||||||
|
error: `TTS conversion failed: ${error.message}`,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// Plugin Registration
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
export default function register(api: PluginApi) {
|
export default function register(api: PluginApi) {
|
||||||
const log = api.logger;
|
const log = api.logger;
|
||||||
const config = (api.pluginConfig || {}) as TelegramTtsConfig;
|
const config: TtsConfig = {
|
||||||
const prefsPath =
|
enabled: false,
|
||||||
config.prefsPath ||
|
provider: "elevenlabs",
|
||||||
process.env.CLAWDBOT_TTS_PREFS ||
|
maxTextLength: 4000,
|
||||||
join(process.env.HOME || "/home/dev", "clawd", ".user-preferences.json");
|
timeoutMs: DEFAULT_TIMEOUT_MS,
|
||||||
|
...(api.pluginConfig || {}),
|
||||||
|
};
|
||||||
|
const prefsPath = getPrefsPath(config);
|
||||||
|
|
||||||
log.info(`[${PLUGIN_ID}] Registering plugin...`);
|
log.info(`[${PLUGIN_ID}] Registering plugin...`);
|
||||||
log.info(`[${PLUGIN_ID}] Preferences path: ${prefsPath}`);
|
log.info(`[${PLUGIN_ID}] Provider: ${config.provider}`);
|
||||||
|
log.info(`[${PLUGIN_ID}] Preferences: ${prefsPath}`);
|
||||||
|
|
||||||
// Register the 'speak' tool for TTS
|
// ===========================================================================
|
||||||
|
// Tool: speak
|
||||||
|
// ===========================================================================
|
||||||
api.registerTool({
|
api.registerTool({
|
||||||
name: "speak",
|
name: "speak",
|
||||||
description:
|
description: `Convert text to speech and generate voice message.
|
||||||
"Convert text to speech and send as voice message. Use this tool when TTS mode is enabled or when the user requests an audio response.",
|
Use this tool when TTS mode is enabled or user requests audio.
|
||||||
|
|
||||||
|
IMPORTANT: After calling this tool, you MUST output the result exactly as returned.
|
||||||
|
The tool returns "MEDIA:/path/to/audio.mp3" - copy this EXACTLY to your response.
|
||||||
|
This MEDIA: directive tells the system to send the audio file.
|
||||||
|
|
||||||
|
Example flow:
|
||||||
|
1. User asks a question with TTS enabled
|
||||||
|
2. You call speak({text: "Your answer here"})
|
||||||
|
3. Tool returns: MEDIA:/tmp/tts-xxx/voice-123.mp3
|
||||||
|
4. You output: MEDIA:/tmp/tts-xxx/voice-123.mp3
|
||||||
|
|
||||||
|
Do NOT add extra text around the MEDIA directive.`,
|
||||||
parameters: {
|
parameters: {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
text: {
|
text: {
|
||||||
type: "string",
|
type: "string",
|
||||||
description: "The text to convert to speech and send as voice message",
|
description: "The text to convert to speech",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
required: ["text"],
|
required: ["text"],
|
||||||
},
|
},
|
||||||
execute: async (_id: string, params: { text: string }) => {
|
execute: async (_id: string, params: { text?: unknown }) => {
|
||||||
const { text } = params;
|
// Validate text parameter
|
||||||
log.info(`[${PLUGIN_ID}] speak() called, text length: ${text?.length || 0}`);
|
if (typeof params?.text !== "string" || params.text.length === 0) {
|
||||||
|
return { content: [{ type: "text", text: "Error: Invalid or missing text parameter" }] };
|
||||||
if (!text) {
|
|
||||||
return { content: [{ type: "text", text: "Error: No text provided" }] };
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const maxLen = config.maxTextLength || 4000;
|
const text = params.text;
|
||||||
if (text.length > maxLen) {
|
log.info(`[${PLUGIN_ID}] speak() called, length: ${text.length}`);
|
||||||
|
|
||||||
|
const result = await textToSpeech(text, config);
|
||||||
|
|
||||||
|
if (result.success && result.audioPath) {
|
||||||
|
log.info(`[${PLUGIN_ID}] Audio generated: ${result.audioPath}`);
|
||||||
|
// Return with MEDIA directive for clawdbot to send
|
||||||
return {
|
return {
|
||||||
content: [
|
content: [
|
||||||
{
|
{
|
||||||
type: "text",
|
type: "text",
|
||||||
text: `Error: Text too long (${text.length} chars, max ${maxLen})`,
|
text: `MEDIA:${result.audioPath}`,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const audioPath = textToAudio(text);
|
log.error(`[${PLUGIN_ID}] TTS failed: ${result.error}`);
|
||||||
|
|
||||||
if (audioPath) {
|
|
||||||
log.info(`[${PLUGIN_ID}] Audio generated: ${audioPath}`);
|
|
||||||
return {
|
|
||||||
content: [{ type: "text", text: `Voice message generated successfully.` }],
|
|
||||||
media: audioPath,
|
|
||||||
asVoice: true,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
log.error(`[${PLUGIN_ID}] TTS conversion failed`);
|
|
||||||
return {
|
return {
|
||||||
content: [{ type: "text", text: `TTS conversion failed. Original: ${text}` }],
|
content: [
|
||||||
|
{
|
||||||
|
type: "text",
|
||||||
|
text: result.error || "TTS conversion failed",
|
||||||
|
},
|
||||||
|
],
|
||||||
};
|
};
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
// Register Gateway RPC methods
|
// ===========================================================================
|
||||||
|
// RPC Methods
|
||||||
|
// ===========================================================================
|
||||||
|
|
||||||
|
// tts.status - Check if TTS is enabled
|
||||||
api.registerGatewayMethod("tts.status", async () => ({
|
api.registerGatewayMethod("tts.status", async () => ({
|
||||||
enabled: isTtsEnabled(prefsPath),
|
enabled: isTtsEnabled(prefsPath),
|
||||||
|
provider: config.provider,
|
||||||
prefsPath,
|
prefsPath,
|
||||||
pluginId: PLUGIN_ID,
|
hasApiKey: !!getApiKey(config, config.provider || "elevenlabs"),
|
||||||
config: {
|
|
||||||
provider: config.provider || "elevenlabs",
|
|
||||||
maxTextLength: config.maxTextLength || 4000,
|
|
||||||
},
|
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
// tts.enable - Enable TTS mode
|
||||||
api.registerGatewayMethod("tts.enable", async () => {
|
api.registerGatewayMethod("tts.enable", async () => {
|
||||||
setTtsEnabled(prefsPath, true);
|
setTtsEnabled(prefsPath, true);
|
||||||
|
log.info(`[${PLUGIN_ID}] TTS enabled via RPC`);
|
||||||
return { ok: true, enabled: true };
|
return { ok: true, enabled: true };
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// tts.disable - Disable TTS mode
|
||||||
api.registerGatewayMethod("tts.disable", async () => {
|
api.registerGatewayMethod("tts.disable", async () => {
|
||||||
setTtsEnabled(prefsPath, false);
|
setTtsEnabled(prefsPath, false);
|
||||||
|
log.info(`[${PLUGIN_ID}] TTS disabled via RPC`);
|
||||||
return { ok: true, enabled: false };
|
return { ok: true, enabled: false };
|
||||||
});
|
});
|
||||||
|
|
||||||
api.registerGatewayMethod("tts.convert", async (params: { text: string }) => {
|
// tts.convert - Convert text to audio (returns path)
|
||||||
if (!params.text) return { ok: false, error: "No text provided" };
|
api.registerGatewayMethod("tts.convert", async (params: { text?: unknown }) => {
|
||||||
const audioPath = textToAudio(params.text);
|
// Validate text parameter
|
||||||
return audioPath ? { ok: true, audioPath } : { ok: false, error: "Conversion failed" };
|
if (typeof params?.text !== "string" || params.text.length === 0) {
|
||||||
|
return { ok: false, error: "Invalid or missing 'text' parameter" };
|
||||||
|
}
|
||||||
|
const result = await textToSpeech(params.text, config);
|
||||||
|
if (result.success) {
|
||||||
|
return { ok: true, audioPath: result.audioPath };
|
||||||
|
}
|
||||||
|
return { ok: false, error: result.error };
|
||||||
});
|
});
|
||||||
|
|
||||||
log.info(
|
// tts.providers - List available providers and their status
|
||||||
`[${PLUGIN_ID}] Plugin ready. TTS is currently ${isTtsEnabled(prefsPath) ? "ENABLED" : "disabled"}`
|
api.registerGatewayMethod("tts.providers", async () => ({
|
||||||
);
|
providers: [
|
||||||
|
{
|
||||||
|
id: "elevenlabs",
|
||||||
|
name: "ElevenLabs",
|
||||||
|
configured: !!getApiKey(config, "elevenlabs"),
|
||||||
|
models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: "openai",
|
||||||
|
name: "OpenAI",
|
||||||
|
configured: !!getApiKey(config, "openai"),
|
||||||
|
models: ["tts-1", "tts-1-hd"],
|
||||||
|
voices: ["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
active: config.provider,
|
||||||
|
}));
|
||||||
|
|
||||||
|
// ===========================================================================
|
||||||
|
// Startup
|
||||||
|
// ===========================================================================
|
||||||
|
|
||||||
|
const ttsEnabled = isTtsEnabled(prefsPath);
|
||||||
|
const hasKey = !!getApiKey(config, config.provider || "elevenlabs");
|
||||||
|
|
||||||
|
log.info(`[${PLUGIN_ID}] Ready. TTS: ${ttsEnabled ? "ON" : "OFF"}, API Key: ${hasKey ? "OK" : "MISSING"}`);
|
||||||
|
|
||||||
|
if (!hasKey) {
|
||||||
|
log.warn(
|
||||||
|
`[${PLUGIN_ID}] No API key configured. Set ELEVENLABS_API_KEY or OPENAI_API_KEY.`
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// Plugin Metadata
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
export const meta = {
|
export const meta = {
|
||||||
id: PLUGIN_ID,
|
id: PLUGIN_ID,
|
||||||
name: "Telegram TTS",
|
name: "Telegram TTS",
|
||||||
description: "Automatic text-to-speech for chat responses using ElevenLabs",
|
description: "Text-to-speech for chat responses using ElevenLabs or OpenAI",
|
||||||
version: "0.1.0",
|
version: "0.3.0",
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
{
|
{
|
||||||
"name": "@clawdbot/telegram-tts",
|
"name": "@clawdbot/telegram-tts",
|
||||||
"version": "0.1.0",
|
"version": "0.3.0",
|
||||||
"private": true,
|
"private": true,
|
||||||
"description": "Automatic text-to-speech for chat responses using ElevenLabs",
|
"description": "Text-to-speech for chat responses using ElevenLabs or OpenAI",
|
||||||
"main": "index.ts"
|
"main": "index.ts",
|
||||||
|
"keywords": ["clawdbot", "tts", "elevenlabs", "openai", "telegram", "voice"]
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user