diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a5e3c874..f495a1f17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,9 @@ Docs: https://docs.clawd.bot ## 2026.1.23 (Unreleased) +### Highlights +- TTS: allow model-driven TTS tags by default for expressive audio replies (laughter, singing cues, etc.). + ### Changes - Gateway: add /tools/invoke HTTP endpoint for direct tool calls and document it. (#1575) Thanks @vignesh07. - Agents: keep system prompt time zone-only and move current time to `session_status` for better cache hits. diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index 59d332190..507a1487a 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1459,13 +1459,28 @@ voice notes; other channels send MP3 audio. enabled: true, mode: "final", // final | all (include tool/block replies) provider: "elevenlabs", + summaryModel: "openai/gpt-4.1-mini", + modelOverrides: { + enabled: true + }, maxTextLength: 4000, timeoutMs: 30000, prefsPath: "~/.clawdbot/settings/tts.json", elevenlabs: { apiKey: "elevenlabs_api_key", + baseUrl: "https://api.elevenlabs.io", voiceId: "voice_id", - modelId: "eleven_multilingual_v2" + modelId: "eleven_multilingual_v2", + seed: 42, + applyTextNormalization: "auto", + languageCode: "en", + voiceSettings: { + stability: 0.5, + similarityBoost: 0.75, + style: 0.0, + useSpeakerBoost: true, + speed: 1.0 + } }, openai: { apiKey: "openai_api_key", @@ -1478,11 +1493,17 @@ voice notes; other channels send MP3 audio. ``` Notes: -- `messages.tts.enabled` can be overridden by local user prefs (see `/tts_on`, `/tts_off`). +- `messages.tts.enabled` can be overridden by local user prefs (see `/tts on`, `/tts off`). - `prefsPath` stores local overrides (enabled/provider/limit/summarize). - `maxTextLength` is a hard cap for TTS input; summaries are truncated to fit. -- `/tts_limit` and `/tts_summary` control per-user summarization settings. +- `summaryModel` overrides `agents.defaults.model.primary` for auto-summary. + - Accepts `provider/model` or an alias from `agents.defaults.models`. +- `modelOverrides` enables model-driven overrides like `[[tts:...]]` tags (on by default). +- `/tts limit` and `/tts summary` control per-user summarization settings. - `apiKey` values fall back to `ELEVENLABS_API_KEY`/`XI_API_KEY` and `OPENAI_API_KEY`. +- `elevenlabs.baseUrl` overrides the ElevenLabs API base URL. +- `elevenlabs.voiceSettings` supports `stability`/`similarityBoost`/`style` (0..1), + `useSpeakerBoost`, and `speed` (0.5..2.0). 
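+
+For example, to pin auto-summary to a cheap model while keeping model-driven
+tags enabled but ignoring model-provided seeds (a minimal sketch using the
+fields documented above):
+
+```json5
+{
+  messages: {
+    tts: {
+      summaryModel: "openai/gpt-4.1-mini",
+      modelOverrides: { enabled: true, allowSeed: false }
+    }
+  }
+}
+```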
### `talk` diff --git a/docs/tools/slash-commands.md b/docs/tools/slash-commands.md index b8ccb7c83..d3de2cd7b 100644 --- a/docs/tools/slash-commands.md +++ b/docs/tools/slash-commands.md @@ -67,13 +67,7 @@ Text + native (when enabled): - `/config show|get|set|unset` (persist config to disk, owner-only; requires `commands.config: true`) - `/debug show|set|unset|reset` (runtime overrides, owner-only; requires `commands.debug: true`) - `/usage off|tokens|full|cost` (per-response usage footer or local cost summary) -- `/tts_on` (enable TTS replies) -- `/tts_off` (disable TTS replies) -- `/tts_provider [openai|elevenlabs]` (set or show TTS provider) -- `/tts_limit ` (max chars before TTS summarization) -- `/tts_summary on|off` (toggle TTS auto-summary) -- `/tts_status` (show TTS status) -- `/audio ` (convert text to a TTS audio reply) +- `/tts on|off|status|provider|limit|summary|audio` (control TTS; see [/tts](/tts)) - `/stop` - `/restart` - `/dock-telegram` (alias: `/dock_telegram`) (switch replies to Telegram) diff --git a/docs/tts.md b/docs/tts.md new file mode 100644 index 000000000..0a7fef7ff --- /dev/null +++ b/docs/tts.md @@ -0,0 +1,293 @@ +--- +summary: "Text-to-speech (TTS) for outbound replies" +read_when: + - Enabling text-to-speech for replies + - Configuring TTS providers or limits + - Using /tts commands +--- + +# Text-to-speech (TTS) + +Clawdbot can convert outbound replies into audio using ElevenLabs or OpenAI. +It works anywhere Clawdbot can send audio; Telegram gets a round voice-note bubble. + +## Supported services + +- **ElevenLabs** (primary or fallback provider) +- **OpenAI** (primary or fallback provider; also used for summaries) + +## Required keys + +At least one of: +- `ELEVENLABS_API_KEY` (or `XI_API_KEY`) +- `OPENAI_API_KEY` + +If both are configured, the selected provider is used first and the other is a fallback. +Auto-summary uses the configured `summaryModel` (or `agents.defaults.model.primary`), +so that provider must also be authenticated if you enable summaries. + +## Service links + +- [OpenAI Text-to-Speech guide](https://platform.openai.com/docs/guides/text-to-speech) +- [OpenAI Audio API reference](https://platform.openai.com/docs/api-reference/audio) +- [ElevenLabs Text to Speech](https://elevenlabs.io/docs/api-reference/text-to-speech) +- [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication) + +## Is it enabled by default? + +No. TTS is **disabled** by default. Enable it in config or with `/tts on`, +which writes a local preference override. + +## Config + +TTS config lives under `messages.tts` in `clawdbot.json`. +Full schema is in [Gateway configuration](/gateway/configuration). 
+
+### Minimal config (enable + provider)
+
+```json5
+{
+  messages: {
+    tts: {
+      enabled: true,
+      provider: "elevenlabs"
+    }
+  }
+}
+```
+
+### OpenAI primary with ElevenLabs fallback
+
+```json5
+{
+  messages: {
+    tts: {
+      enabled: true,
+      provider: "openai",
+      summaryModel: "openai/gpt-4.1-mini",
+      modelOverrides: {
+        enabled: true
+      },
+      openai: {
+        apiKey: "openai_api_key",
+        model: "gpt-4o-mini-tts",
+        voice: "alloy"
+      },
+      elevenlabs: {
+        apiKey: "elevenlabs_api_key",
+        baseUrl: "https://api.elevenlabs.io",
+        voiceId: "voice_id",
+        modelId: "eleven_multilingual_v2",
+        seed: 42,
+        applyTextNormalization: "auto",
+        languageCode: "en",
+        voiceSettings: {
+          stability: 0.5,
+          similarityBoost: 0.75,
+          style: 0.0,
+          useSpeakerBoost: true,
+          speed: 1.0
+        }
+      }
+    }
+  }
+}
+```
+
+### Custom limits + prefs path
+
+```json5
+{
+  messages: {
+    tts: {
+      enabled: true,
+      maxTextLength: 4000,
+      timeoutMs: 30000,
+      prefsPath: "~/.clawdbot/settings/tts.json"
+    }
+  }
+}
+```
+
+### Disable auto-summary for long replies
+
+Auto-summary is a per-user preference stored in local prefs, so it is toggled
+with a command rather than a config field:
+
+```json5
+{
+  messages: {
+    tts: {
+      enabled: true
+    }
+  }
+}
+```
+
+Then run:
+
+```
+/tts summary off
+```
+
+### Notes on fields
+
+- `enabled`: master toggle (default `false`; local prefs can override).
+- `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
+- `provider`: `"elevenlabs"` or `"openai"` (fallback is automatic).
+- `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`.
+  - Accepts `provider/model` or a configured model alias.
+- `modelOverrides`: allow the model to emit TTS directives (on by default).
+- `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded.
+- `timeoutMs`: request timeout (ms).
+- `prefsPath`: override the local prefs JSON path.
+- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `OPENAI_API_KEY`).
+- `elevenlabs.baseUrl`: override the ElevenLabs API base URL.
+- `elevenlabs.voiceSettings`:
+  - `stability`, `similarityBoost`, `style`: `0..1`
+  - `useSpeakerBoost`: `true|false`
+  - `speed`: `0.5..2.0` (1.0 = normal)
+- `elevenlabs.applyTextNormalization`: `auto|on|off`
+- `elevenlabs.languageCode`: 2-letter ISO 639-1 (e.g. `en`, `de`)
+- `elevenlabs.seed`: integer `0..4294967295` (best-effort determinism)
+
+## Model-driven overrides (default on)
+
+Model-driven overrides are **on** by default.
+
+When enabled, the model can emit `[[tts:...]]` directives to override the voice
+for a single reply, plus an optional `[[tts:text]]...[[/tts:text]]` block to
+provide expressive tags (laughter, singing cues, etc.) that should only appear
+in the audio.
+
+Example reply payload:
+
+```
+Here you go.
+ +[[tts:provider=elevenlabs voiceId=pMsXgVXv3BLzUgSXRplE model=eleven_v3 speed=1.1]] +[[tts:text]](laughs) Read the song once more.[[/tts:text]] +``` + +Available directive keys (when enabled): +- `provider` (`openai` | `elevenlabs`) +- `voice` (OpenAI voice) or `voiceId` (ElevenLabs) +- `model` (OpenAI TTS model or ElevenLabs model id) +- `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost` +- `applyTextNormalization` (`auto|on|off`) +- `languageCode` (ISO 639-1) +- `seed` + +Disable all model overrides: + +```json5 +{ + messages: { + tts: { + modelOverrides: { + enabled: false + } + } + } +} +``` + +Optional allowlist (disable specific overrides while keeping tags enabled): + +```json5 +{ + messages: { + tts: { + modelOverrides: { + enabled: true, + allowProvider: false, + allowSeed: false + } + } + } +} +``` + +## Per-user preferences + +Slash commands write local overrides to `prefsPath` (default: +`~/.clawdbot/settings/tts.json`, override with `CLAWDBOT_TTS_PREFS` or +`messages.tts.prefsPath`). + +Stored fields: +- `enabled` +- `provider` +- `maxLength` (summary threshold; default 1500 chars) +- `summarize` (default `true`) + +These override `messages.tts.*` for that host. + +## Output formats (fixed) + +- **Telegram**: Opus voice note (`opus_48000_64` from ElevenLabs, `opus` from OpenAI). + - 48kHz / 64kbps is a good voice-note tradeoff and required for the round bubble. +- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI). + - 44.1kHz / 128kbps is the default balance for speech clarity. + +This is not configurable; Telegram expects Opus for voice-note UX. + +## Auto-TTS behavior + +When enabled, Clawdbot: +- skips TTS if the reply already contains media or a `MEDIA:` directive. +- skips very short replies (< 10 chars). +- summarizes long replies when enabled using `agents.defaults.model.primary` (or `summaryModel`). +- attaches the generated audio to the reply. + +If the reply exceeds `maxLength` and summary is off (or no API key for the +summary model), audio +is skipped and the normal text reply is sent. + +## Flow diagram + +``` +Reply -> TTS enabled? + no -> send text + yes -> has media / MEDIA: / short? + yes -> send text + no -> length > limit? + no -> TTS -> attach audio + yes -> summary enabled? + no -> send text + yes -> summarize (summaryModel or agents.defaults.model.primary) + -> TTS -> attach audio +``` + +## Slash command usage + +There is a single command: `/tts`. +See [Slash commands](/tools/slash-commands) for enablement details. + +``` +/tts on +/tts off +/tts status +/tts provider openai +/tts limit 2000 +/tts summary off +/tts audio Hello from Clawdbot +``` + +Notes: +- Commands require an authorized sender (allowlist/owner rules still apply). +- `commands.text` or native command registration must be enabled. +- `limit` and `summary` are stored in local prefs, not the main config. +- `/tts audio` generates a one-off audio reply (does not toggle TTS on). + +## Agent tool + +The `tts` tool converts text to speech and returns a `MEDIA:` path. When the +result is Telegram-compatible, the tool includes `[[audio_as_voice]]` so +Telegram sends a voice bubble. 
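+
+For illustration, a hypothetical tool result (the actual path is generated at
+runtime in a temp directory):
+
+```
+MEDIA:/tmp/clawdbot-tts-1a2b3c/audio.ogg
+[[audio_as_voice]]
+```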
+ +## Gateway RPC + +Gateway methods: +- `tts.status` +- `tts.enable` +- `tts.disable` +- `tts.convert` +- `tts.setProvider` +- `tts.providers` diff --git a/src/auto-reply/commands-registry.data.ts b/src/auto-reply/commands-registry.data.ts index 3e2ad8775..536a64ea4 100644 --- a/src/auto-reply/commands-registry.data.ts +++ b/src/auto-reply/commands-registry.data.ts @@ -273,80 +273,26 @@ function buildChatCommands(): ChatCommandDefinition[] { argsMenu: "auto", }), defineChatCommand({ - key: "audio", - nativeName: "audio", - description: "Convert text to a TTS audio reply.", - textAlias: "/audio", + key: "tts", + nativeName: "tts", + description: "Control text-to-speech (TTS).", + textAlias: "/tts", args: [ { - name: "text", - description: "Text to speak", + name: "action", + description: "on | off | status | provider | limit | summary | audio | help", + type: "string", + choices: ["on", "off", "status", "provider", "limit", "summary", "audio", "help"], + }, + { + name: "value", + description: "Provider, limit, or text", type: "string", captureRemaining: true, }, ], - }), - defineChatCommand({ - key: "tts_on", - nativeName: "tts_on", - description: "Enable text-to-speech for replies.", - textAlias: "/tts_on", - }), - defineChatCommand({ - key: "tts_off", - nativeName: "tts_off", - description: "Disable text-to-speech for replies.", - textAlias: "/tts_off", - }), - defineChatCommand({ - key: "tts_provider", - nativeName: "tts_provider", - description: "Set or show the TTS provider.", - textAlias: "/tts_provider", - args: [ - { - name: "provider", - description: "openai or elevenlabs", - type: "string", - choices: ["openai", "elevenlabs"], - }, - ], argsMenu: "auto", }), - defineChatCommand({ - key: "tts_limit", - nativeName: "tts_limit", - description: "Set or show the max TTS text length.", - textAlias: "/tts_limit", - args: [ - { - name: "maxLength", - description: "Max chars before summarizing", - type: "number", - }, - ], - }), - defineChatCommand({ - key: "tts_summary", - nativeName: "tts_summary", - description: "Enable or disable TTS auto-summary.", - textAlias: "/tts_summary", - args: [ - { - name: "mode", - description: "on or off", - type: "string", - choices: ["on", "off"], - }, - ], - argsMenu: "auto", - }), - defineChatCommand({ - key: "tts_status", - nativeName: "tts_status", - description: "Show TTS status and last attempt.", - textAlias: "/tts_status", - }), defineChatCommand({ key: "stop", nativeName: "stop", diff --git a/src/auto-reply/reply/commands-tts.ts b/src/auto-reply/reply/commands-tts.ts index 9582143af..23ee80bc7 100644 --- a/src/auto-reply/reply/commands-tts.ts +++ b/src/auto-reply/reply/commands-tts.ts @@ -18,22 +18,39 @@ import { textToSpeech, } from "../../tts/tts.js"; -function parseCommandArg(normalized: string, command: string): string | null { - if (normalized === command) return ""; - if (normalized.startsWith(`${command} `)) return normalized.slice(command.length).trim(); - return null; +type ParsedTtsCommand = { + action: string; + args: string; +}; + +function parseTtsCommand(normalized: string): ParsedTtsCommand | null { + // Accept `/tts` and `/tts [args]` as a single control surface. 
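+  // e.g. "/tts" -> { action: "status", args: "" }; "/tts provider openai" -> { action: "provider", args: "openai" }.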
+  if (normalized === "/tts") return { action: "status", args: "" };
+  if (!normalized.startsWith("/tts ")) return null;
+  const rest = normalized.slice(5).trim();
+  if (!rest) return { action: "status", args: "" };
+  const [action, ...tail] = rest.split(/\s+/);
+  return { action: action.toLowerCase(), args: tail.join(" ").trim() };
+}
+
+function ttsUsage(): ReplyPayload {
+  // Keep usage in one place so help/validation stays consistent.
+  return {
+    text:
+      "⚙️ Usage: /tts <action> [value]" +
+      "\nExamples:\n" +
+      "/tts on\n" +
+      "/tts provider openai\n" +
+      "/tts limit 2000\n" +
+      "/tts summary off\n" +
+      "/tts audio Hello from Clawdbot",
+  };
 }
 
 export const handleTtsCommands: CommandHandler = async (params, allowTextCommands) => {
   if (!allowTextCommands) return null;
-  const normalized = params.command.commandBodyNormalized;
-  if (
-    !normalized.startsWith("/tts_") &&
-    normalized !== "/audio" &&
-    !normalized.startsWith("/audio ")
-  ) {
-    return null;
-  }
+  const parsed = parseTtsCommand(params.command.commandBodyNormalized);
+  if (!parsed) return null;
 
   if (!params.command.isAuthorizedSender) {
     logVerbose(
@@ -44,36 +61,42 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
 
   const config = resolveTtsConfig(params.cfg);
   const prefsPath = resolveTtsPrefsPath(config);
+  const action = parsed.action;
+  const args = parsed.args;
 
-  if (normalized === "/tts_on") {
+  if (action === "help") {
+    return { shouldContinue: false, reply: ttsUsage() };
+  }
+
+  if (action === "on") {
     setTtsEnabled(prefsPath, true);
     return { shouldContinue: false, reply: { text: "🔊 TTS enabled." } };
   }
 
-  if (normalized === "/tts_off") {
+  if (action === "off") {
     setTtsEnabled(prefsPath, false);
     return { shouldContinue: false, reply: { text: "🔇 TTS disabled." } };
   }
 
-  const audioArg = parseCommandArg(normalized, "/audio");
-  if (audioArg !== null) {
-    if (!audioArg.trim()) {
-      return { shouldContinue: false, reply: { text: "⚙️ Usage: /audio <text>" } };
+  if (action === "audio") {
+    if (!args.trim()) {
+      return { shouldContinue: false, reply: ttsUsage() };
     }
     const start = Date.now();
     const result = await textToSpeech({
-      text: audioArg,
+      text: args,
       cfg: params.cfg,
       channel: params.command.channel,
       prefsPath,
     });
 
     if (result.success && result.audioPath) {
+      // Store last attempt for `/tts status`.
       setLastTtsAttempt({
         timestamp: Date.now(),
         success: true,
-        textLength: audioArg.length,
+        textLength: args.length,
         summarized: false,
         provider: result.provider,
         latencyMs: result.latencyMs,
@@ -85,10 +108,11 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
       return { shouldContinue: false, reply: payload };
     }
 
+    // Store failure details for `/tts status`.
     setLastTtsAttempt({
       timestamp: Date.now(),
       success: false,
-      textLength: audioArg.length,
+      textLength: args.length,
       summarized: false,
       error: result.error,
       latencyMs: Date.now() - start,
@@ -99,10 +123,9 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
     };
   }
 
-  const providerArg = parseCommandArg(normalized, "/tts_provider");
-  if (providerArg !== null) {
+  if (action === "provider") {
     const currentProvider = getTtsProvider(config, prefsPath);
-    if (!providerArg.trim()) {
+    if (!args.trim()) {
      const fallback = currentProvider === "openai" ?
"elevenlabs" : "openai"; const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai")); const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs")); @@ -115,17 +138,14 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand `Fallback: ${fallback}\n` + `OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` + `ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` + - `Usage: /tts_provider openai | elevenlabs`, + `Usage: /tts provider openai | elevenlabs`, }, }; } - const requested = providerArg.trim().toLowerCase(); + const requested = args.trim().toLowerCase(); if (requested !== "openai" && requested !== "elevenlabs") { - return { - shouldContinue: false, - reply: { text: "⚙️ Usage: /tts_provider openai | elevenlabs" }, - }; + return { shouldContinue: false, reply: ttsUsage() }; } setTtsProvider(prefsPath, requested); @@ -136,21 +156,17 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand }; } - const limitArg = parseCommandArg(normalized, "/tts_limit"); - if (limitArg !== null) { - if (!limitArg.trim()) { + if (action === "limit") { + if (!args.trim()) { const currentLimit = getTtsMaxLength(prefsPath); return { shouldContinue: false, reply: { text: `📏 TTS limit: ${currentLimit} characters.` }, }; } - const next = Number.parseInt(limitArg.trim(), 10); + const next = Number.parseInt(args.trim(), 10); if (!Number.isFinite(next) || next < 100 || next > 10_000) { - return { - shouldContinue: false, - reply: { text: "⚙️ Usage: /tts_limit <100-10000>" }, - }; + return { shouldContinue: false, reply: ttsUsage() }; } setTtsMaxLength(prefsPath, next); return { @@ -159,18 +175,17 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand }; } - const summaryArg = parseCommandArg(normalized, "/tts_summary"); - if (summaryArg !== null) { - if (!summaryArg.trim()) { + if (action === "summary") { + if (!args.trim()) { const enabled = isSummarizationEnabled(prefsPath); return { shouldContinue: false, reply: { text: `📝 TTS auto-summary: ${enabled ? "on" : "off"}.` }, }; } - const requested = summaryArg.trim().toLowerCase(); + const requested = args.trim().toLowerCase(); if (requested !== "on" && requested !== "off") { - return { shouldContinue: false, reply: { text: "⚙️ Usage: /tts_summary on|off" } }; + return { shouldContinue: false, reply: ttsUsage() }; } setSummarizationEnabled(prefsPath, requested === "on"); return { @@ -181,7 +196,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand }; } - if (normalized === "/tts_status") { + if (action === "status") { const enabled = isTtsEnabled(config, prefsPath); const provider = getTtsProvider(config, prefsPath); const hasKey = Boolean(resolveTtsApiKey(config, provider)); @@ -210,5 +225,5 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand return { shouldContinue: false, reply: { text: lines.join("\n") } }; } - return null; + return { shouldContinue: false, reply: ttsUsage() }; }; diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index cd991a82e..86d94deca 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -2,6 +2,25 @@ export type TtsProvider = "elevenlabs" | "openai"; export type TtsMode = "final" | "all"; +export type TtsModelOverrideConfig = { + /** Enable model-provided overrides for TTS. */ + enabled?: boolean; + /** Allow model-provided TTS text blocks. */ + allowText?: boolean; + /** Allow model-provided provider override. 
*/ + allowProvider?: boolean; + /** Allow model-provided voice/voiceId override. */ + allowVoice?: boolean; + /** Allow model-provided modelId override. */ + allowModelId?: boolean; + /** Allow model-provided voice settings override. */ + allowVoiceSettings?: boolean; + /** Allow model-provided normalization or language overrides. */ + allowNormalization?: boolean; + /** Allow model-provided seed override. */ + allowSeed?: boolean; +}; + export type TtsConfig = { /** Enable auto-TTS (can be overridden by local prefs). */ enabled?: boolean; @@ -9,11 +28,26 @@ export type TtsConfig = { mode?: TtsMode; /** Primary TTS provider (fallbacks are automatic). */ provider?: TtsProvider; + /** Optional model override for TTS auto-summary (provider/model or alias). */ + summaryModel?: string; + /** Allow the model to override TTS parameters. */ + modelOverrides?: TtsModelOverrideConfig; /** ElevenLabs configuration. */ elevenlabs?: { apiKey?: string; + baseUrl?: string; voiceId?: string; modelId?: string; + seed?: number; + applyTextNormalization?: "auto" | "on" | "off"; + languageCode?: string; + voiceSettings?: { + stability?: number; + similarityBoost?: number; + style?: number; + useSpeakerBoost?: boolean; + speed?: number; + }; }; /** OpenAI configuration. */ openai?: { diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index f58f467f0..0517df43d 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -162,11 +162,39 @@ export const TtsConfigSchema = z enabled: z.boolean().optional(), mode: TtsModeSchema.optional(), provider: TtsProviderSchema.optional(), + summaryModel: z.string().optional(), + modelOverrides: z + .object({ + enabled: z.boolean().optional(), + allowText: z.boolean().optional(), + allowProvider: z.boolean().optional(), + allowVoice: z.boolean().optional(), + allowModelId: z.boolean().optional(), + allowVoiceSettings: z.boolean().optional(), + allowNormalization: z.boolean().optional(), + allowSeed: z.boolean().optional(), + }) + .strict() + .optional(), elevenlabs: z .object({ apiKey: z.string().optional(), + baseUrl: z.string().optional(), voiceId: z.string().optional(), modelId: z.string().optional(), + seed: z.number().int().min(0).max(4294967295).optional(), + applyTextNormalization: z.enum(["auto", "on", "off"]).optional(), + languageCode: z.string().optional(), + voiceSettings: z + .object({ + stability: z.number().min(0).max(1).optional(), + similarityBoost: z.number().min(0).max(1).optional(), + style: z.number().min(0).max(1).optional(), + useSpeakerBoost: z.boolean().optional(), + speed: z.number().min(0.5).max(2).optional(), + }) + .strict() + .optional(), }) .strict() .optional(), diff --git a/src/plugins/types.ts b/src/plugins/types.ts index ea7a392f2..c5363d72e 100644 --- a/src/plugins/types.ts +++ b/src/plugins/types.ts @@ -170,7 +170,7 @@ export type PluginCommandHandler = ( * Definition for a plugin-registered command. 
*/ export type ClawdbotPluginCommandDefinition = { - /** Command name without leading slash (e.g., "tts_on") */ + /** Command name without leading slash (e.g., "tts") */ name: string; /** Description shown in /help and command menus */ description: string; diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts index c4725a723..635364364 100644 --- a/src/tts/tts.test.ts +++ b/src/tts/tts.test.ts @@ -1,6 +1,41 @@ -import { describe, expect, it, vi, beforeEach, afterEach } from "vitest"; +import { describe, expect, it, vi, beforeEach } from "vitest"; -import { _test } from "./tts.js"; +import { completeSimple } from "@mariozechner/pi-ai"; + +import { getApiKeyForModel } from "../agents/model-auth.js"; +import { resolveModel } from "../agents/pi-embedded-runner/model.js"; +import { _test, resolveTtsConfig } from "./tts.js"; + +vi.mock("@mariozechner/pi-ai", () => ({ + completeSimple: vi.fn(), +})); + +vi.mock("../agents/pi-embedded-runner/model.js", () => ({ + resolveModel: vi.fn((provider: string, modelId: string) => ({ + model: { + provider, + id: modelId, + name: modelId, + api: "openai-completions", + reasoning: false, + input: ["text"], + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 128000, + maxTokens: 8192, + }, + authStorage: { profiles: {} }, + modelRegistry: { find: vi.fn() }, + })), +})); + +vi.mock("../agents/model-auth.js", () => ({ + getApiKeyForModel: vi.fn(async () => ({ + apiKey: "test-api-key", + source: "test", + mode: "api-key", + })), + requireApiKey: vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? ""), +})); const { isValidVoiceId, @@ -8,11 +43,20 @@ const { isValidOpenAIModel, OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, + parseTtsDirectives, + resolveModelOverridePolicy, summarizeText, resolveOutputFormat, } = _test; describe("tts", () => { + beforeEach(() => { + vi.clearAllMocks(); + vi.mocked(completeSimple).mockResolvedValue({ + content: [{ type: "text", text: "Summary" }], + }); + }); + describe("isValidVoiceId", () => { it("accepts valid ElevenLabs voice IDs", () => { expect(isValidVoiceId("pMsXgVXv3BLzUgSXRplE")).toBe(true); @@ -105,130 +149,169 @@ describe("tts", () => { }); }); + describe("parseTtsDirectives", () => { + it("extracts overrides and strips directives when enabled", () => { + const policy = resolveModelOverridePolicy({ enabled: true }); + const input = + "Hello [[tts:provider=elevenlabs voiceId=pMsXgVXv3BLzUgSXRplE stability=0.4 speed=1.1]] world\n\n" + + "[[tts:text]](laughs) Read the song once more.[[/tts:text]]"; + const result = parseTtsDirectives(input, policy); + + expect(result.cleanedText).not.toContain("[[tts:"); + expect(result.ttsText).toBe("(laughs) Read the song once more."); + expect(result.overrides.provider).toBe("elevenlabs"); + expect(result.overrides.elevenlabs?.voiceId).toBe("pMsXgVXv3BLzUgSXRplE"); + expect(result.overrides.elevenlabs?.voiceSettings?.stability).toBe(0.4); + expect(result.overrides.elevenlabs?.voiceSettings?.speed).toBe(1.1); + }); + + it("keeps text intact when overrides are disabled", () => { + const policy = resolveModelOverridePolicy({ enabled: false }); + const input = "Hello [[tts:voice=alloy]] world"; + const result = parseTtsDirectives(input, policy); + + expect(result.cleanedText).toBe(input); + expect(result.overrides.provider).toBeUndefined(); + }); + }); + describe("summarizeText", () => { - const mockApiKey = "test-api-key"; - const originalFetch = globalThis.fetch; - - beforeEach(() => { - vi.useFakeTimers({ shouldAdvanceTime: true }); - }); - - afterEach(() => { - 
globalThis.fetch = originalFetch; - vi.useRealTimers(); - }); + const baseCfg = { + agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, + messages: { tts: {} }, + }; + const baseConfig = resolveTtsConfig(baseCfg); it("summarizes text and returns result with metrics", async () => { const mockSummary = "This is a summarized version of the text."; - globalThis.fetch = vi.fn().mockResolvedValue({ - ok: true, - json: () => - Promise.resolve({ - choices: [{ message: { content: mockSummary } }], - }), + vi.mocked(completeSimple).mockResolvedValue({ + content: [{ type: "text", text: mockSummary }], }); const longText = "A".repeat(2000); - const result = await summarizeText(longText, 1500, mockApiKey, 30_000); + const result = await summarizeText({ + text: longText, + targetLength: 1500, + cfg: baseCfg, + config: baseConfig, + timeoutMs: 30_000, + }); expect(result.summary).toBe(mockSummary); expect(result.inputLength).toBe(2000); expect(result.outputLength).toBe(mockSummary.length); expect(result.latencyMs).toBeGreaterThanOrEqual(0); - expect(globalThis.fetch).toHaveBeenCalledTimes(1); + expect(completeSimple).toHaveBeenCalledTimes(1); }); - it("calls OpenAI API with correct parameters", async () => { - globalThis.fetch = vi.fn().mockResolvedValue({ - ok: true, - json: () => - Promise.resolve({ - choices: [{ message: { content: "Summary" } }], - }), + it("calls the summary model with the expected parameters", async () => { + await summarizeText({ + text: "Long text to summarize", + targetLength: 500, + cfg: baseCfg, + config: baseConfig, + timeoutMs: 30_000, }); - await summarizeText("Long text to summarize", 500, mockApiKey, 30_000); + const callArgs = vi.mocked(completeSimple).mock.calls[0]; + expect(callArgs?.[1]?.messages?.[0]?.role).toBe("user"); + expect(callArgs?.[2]?.maxTokens).toBe(250); + expect(callArgs?.[2]?.temperature).toBe(0.3); + expect(getApiKeyForModel).toHaveBeenCalledTimes(1); + }); - expect(globalThis.fetch).toHaveBeenCalledWith( - "https://api.openai.com/v1/chat/completions", - expect.objectContaining({ - method: "POST", - headers: { - Authorization: `Bearer ${mockApiKey}`, - "Content-Type": "application/json", - }, - }), - ); + it("uses summaryModel override when configured", async () => { + const cfg = { + agents: { defaults: { model: { primary: "anthropic/claude-opus-4-5" } } }, + messages: { tts: { summaryModel: "openai/gpt-4.1-mini" } }, + }; + const config = resolveTtsConfig(cfg); + await summarizeText({ + text: "Long text to summarize", + targetLength: 500, + cfg, + config, + timeoutMs: 30_000, + }); - const callArgs = (globalThis.fetch as ReturnType).mock.calls[0]; - const body = JSON.parse(callArgs[1].body); - expect(body.model).toBe("gpt-4o-mini"); - expect(body.temperature).toBe(0.3); - expect(body.max_tokens).toBe(250); + expect(resolveModel).toHaveBeenCalledWith("openai", "gpt-4.1-mini", undefined, cfg); }); it("rejects targetLength below minimum (100)", async () => { - await expect(summarizeText("text", 99, mockApiKey, 30_000)).rejects.toThrow( - "Invalid targetLength: 99", - ); + await expect( + summarizeText({ + text: "text", + targetLength: 99, + cfg: baseCfg, + config: baseConfig, + timeoutMs: 30_000, + }), + ).rejects.toThrow("Invalid targetLength: 99"); }); it("rejects targetLength above maximum (10000)", async () => { - await expect(summarizeText("text", 10001, mockApiKey, 30_000)).rejects.toThrow( - "Invalid targetLength: 10001", - ); + await expect( + summarizeText({ + text: "text", + targetLength: 10001, + cfg: baseCfg, + config: 
baseConfig, + timeoutMs: 30_000, + }), + ).rejects.toThrow("Invalid targetLength: 10001"); }); it("accepts targetLength at boundaries", async () => { - globalThis.fetch = vi.fn().mockResolvedValue({ - ok: true, - json: () => - Promise.resolve({ - choices: [{ message: { content: "Summary" } }], - }), - }); - - await expect(summarizeText("text", 100, mockApiKey, 30_000)).resolves.toBeDefined(); - await expect(summarizeText("text", 10000, mockApiKey, 30_000)).resolves.toBeDefined(); - }); - - it("throws error when API returns non-ok response", async () => { - globalThis.fetch = vi.fn().mockResolvedValue({ - ok: false, - status: 500, - }); - - await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow( - "Summarization service unavailable", - ); + await expect( + summarizeText({ + text: "text", + targetLength: 100, + cfg: baseCfg, + config: baseConfig, + timeoutMs: 30_000, + }), + ).resolves.toBeDefined(); + await expect( + summarizeText({ + text: "text", + targetLength: 10000, + cfg: baseCfg, + config: baseConfig, + timeoutMs: 30_000, + }), + ).resolves.toBeDefined(); }); it("throws error when no summary is returned", async () => { - globalThis.fetch = vi.fn().mockResolvedValue({ - ok: true, - json: () => - Promise.resolve({ - choices: [], - }), + vi.mocked(completeSimple).mockResolvedValue({ + content: [], }); - await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow( - "No summary returned", - ); + await expect( + summarizeText({ + text: "text", + targetLength: 500, + cfg: baseCfg, + config: baseConfig, + timeoutMs: 30_000, + }), + ).rejects.toThrow("No summary returned"); }); it("throws error when summary content is empty", async () => { - globalThis.fetch = vi.fn().mockResolvedValue({ - ok: true, - json: () => - Promise.resolve({ - choices: [{ message: { content: " " } }], - }), + vi.mocked(completeSimple).mockResolvedValue({ + content: [{ type: "text", text: " " }], }); - await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow( - "No summary returned", - ); + await expect( + summarizeText({ + text: "text", + targetLength: 500, + cfg: baseCfg, + config: baseConfig, + timeoutMs: 30_000, + }), + ).rejects.toThrow("No summary returned"); }); }); }); diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 0a03063a9..c89acc05c 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -11,13 +11,28 @@ import { import { tmpdir } from "node:os"; import path from "node:path"; +import { completeSimple, type TextContent } from "@mariozechner/pi-ai"; + import type { ReplyPayload } from "../auto-reply/types.js"; import { normalizeChannelId } from "../channels/plugins/index.js"; import type { ChannelId } from "../channels/plugins/types.js"; import type { ClawdbotConfig } from "../config/config.js"; -import type { TtsConfig, TtsMode, TtsProvider } from "../config/types.tts.js"; +import type { + TtsConfig, + TtsMode, + TtsProvider, + TtsModelOverrideConfig, +} from "../config/types.tts.js"; import { logVerbose } from "../globals.js"; import { CONFIG_DIR, resolveUserPath } from "../utils.js"; +import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js"; +import { + buildModelAliasIndex, + resolveDefaultModelForAgent, + resolveModelRefFromString, + type ModelRef, +} from "../agents/model-selection.js"; +import { resolveModel } from "../agents/pi-embedded-runner/model.js"; const DEFAULT_TIMEOUT_MS = 30_000; const DEFAULT_TTS_MAX_LENGTH = 1500; @@ -25,11 +40,20 @@ const DEFAULT_TTS_SUMMARIZE = true; const DEFAULT_MAX_TEXT_LENGTH = 4000; const 
TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
 
+const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
 const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
 const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
 const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts";
 const DEFAULT_OPENAI_VOICE = "alloy";
 
+const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
+  stability: 0.5,
+  similarityBoost: 0.75,
+  style: 0.0,
+  useSpeakerBoost: true,
+  speed: 1.0,
+};
+
 const TELEGRAM_OUTPUT = {
   openai: "opus" as const,
   // ElevenLabs output formats use codec_sample_rate_bitrate naming.
@@ -50,10 +74,23 @@ export type ResolvedTtsConfig = {
   enabled: boolean;
   mode: TtsMode;
   provider: TtsProvider;
+  summaryModel?: string;
+  modelOverrides: ResolvedTtsModelOverrides;
   elevenlabs: {
     apiKey?: string;
+    baseUrl: string;
     voiceId: string;
     modelId: string;
+    seed?: number;
+    applyTextNormalization?: "auto" | "on" | "off";
+    languageCode?: string;
+    voiceSettings: {
+      stability: number;
+      similarityBoost: number;
+      style: number;
+      useSpeakerBoost: boolean;
+      speed: number;
+    };
   };
   openai: {
     apiKey?: string;
@@ -74,6 +111,41 @@ type TtsUserPrefs = {
   };
 };
 
+type ResolvedTtsModelOverrides = {
+  enabled: boolean;
+  allowText: boolean;
+  allowProvider: boolean;
+  allowVoice: boolean;
+  allowModelId: boolean;
+  allowVoiceSettings: boolean;
+  allowNormalization: boolean;
+  allowSeed: boolean;
+};
+
+type TtsDirectiveOverrides = {
+  ttsText?: string;
+  provider?: TtsProvider;
+  openai?: {
+    voice?: string;
+    model?: string;
+  };
+  elevenlabs?: {
+    voiceId?: string;
+    modelId?: string;
+    seed?: number;
+    applyTextNormalization?: "auto" | "on" | "off";
+    languageCode?: string;
+    voiceSettings?: Partial<ResolvedTtsConfig["elevenlabs"]["voiceSettings"]>;
+  };
+};
+
+type TtsDirectiveParseResult = {
+  cleanedText: string;
+  ttsText?: string;
+  overrides: TtsDirectiveOverrides;
+  warnings: string[];
+};
+
 export type TtsResult = {
   success: boolean;
   audioPath?: string;
@@ -96,16 +168,63 @@ type TtsStatusEntry = {
 
 let lastTtsAttempt: TtsStatusEntry | undefined;
 
+function resolveModelOverridePolicy(
+  overrides: TtsModelOverrideConfig | undefined,
+): ResolvedTtsModelOverrides {
+  const enabled = overrides?.enabled ?? true;
+  if (!enabled) {
+    return {
+      enabled: false,
+      allowText: false,
+      allowProvider: false,
+      allowVoice: false,
+      allowModelId: false,
+      allowVoiceSettings: false,
+      allowNormalization: false,
+      allowSeed: false,
+    };
+  }
+  const allow = (value?: boolean) => value ?? true;
+  return {
+    enabled: true,
+    allowText: allow(overrides?.allowText),
+    allowProvider: allow(overrides?.allowProvider),
+    allowVoice: allow(overrides?.allowVoice),
+    allowModelId: allow(overrides?.allowModelId),
+    allowVoiceSettings: allow(overrides?.allowVoiceSettings),
+    allowNormalization: allow(overrides?.allowNormalization),
+    allowSeed: allow(overrides?.allowSeed),
+  };
+}
+
 export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
   const raw: TtsConfig = cfg.messages?.tts ?? {};
   return {
     enabled: raw.enabled ?? false,
     mode: raw.mode ?? "final",
     provider: raw.provider ?? "elevenlabs",
+    summaryModel: raw.summaryModel?.trim() || undefined,
+    modelOverrides: resolveModelOverridePolicy(raw.modelOverrides),
     elevenlabs: {
       apiKey: raw.elevenlabs?.apiKey,
+      baseUrl: raw.elevenlabs?.baseUrl?.trim() || DEFAULT_ELEVENLABS_BASE_URL,
      voiceId: raw.elevenlabs?.voiceId ?? DEFAULT_ELEVENLABS_VOICE_ID,
      modelId: raw.elevenlabs?.modelId ??
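+      // Defaults to eleven_multilingual_v2 when no modelId is configured.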
DEFAULT_ELEVENLABS_MODEL_ID, + seed: raw.elevenlabs?.seed, + applyTextNormalization: raw.elevenlabs?.applyTextNormalization, + languageCode: raw.elevenlabs?.languageCode, + voiceSettings: { + stability: + raw.elevenlabs?.voiceSettings?.stability ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.stability, + similarityBoost: + raw.elevenlabs?.voiceSettings?.similarityBoost ?? + DEFAULT_ELEVENLABS_VOICE_SETTINGS.similarityBoost, + style: raw.elevenlabs?.voiceSettings?.style ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.style, + useSpeakerBoost: + raw.elevenlabs?.voiceSettings?.useSpeakerBoost ?? + DEFAULT_ELEVENLABS_VOICE_SETTINGS.useSpeakerBoost, + speed: raw.elevenlabs?.voiceSettings?.speed ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.speed, + }, }, openai: { apiKey: raw.openai?.apiKey, @@ -235,6 +354,261 @@ function isValidVoiceId(voiceId: string): boolean { return /^[a-zA-Z0-9]{10,40}$/.test(voiceId); } +function normalizeElevenLabsBaseUrl(baseUrl: string): string { + const trimmed = baseUrl.trim(); + if (!trimmed) return DEFAULT_ELEVENLABS_BASE_URL; + return trimmed.replace(/\/+$/, ""); +} + +function requireInRange(value: number, min: number, max: number, label: string): void { + if (!Number.isFinite(value) || value < min || value > max) { + throw new Error(`${label} must be between ${min} and ${max}`); + } +} + +function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) { + requireInRange(settings.stability, 0, 1, "stability"); + requireInRange(settings.similarityBoost, 0, 1, "similarityBoost"); + requireInRange(settings.style, 0, 1, "style"); + requireInRange(settings.speed, 0.5, 2, "speed"); +} + +function normalizeLanguageCode(code?: string): string | undefined { + const trimmed = code?.trim(); + if (!trimmed) return undefined; + const normalized = trimmed.toLowerCase(); + if (!/^[a-z]{2}$/.test(normalized)) { + throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)"); + } + return normalized; +} + +function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined { + const trimmed = mode?.trim(); + if (!trimmed) return undefined; + const normalized = trimmed.toLowerCase(); + if (normalized === "auto" || normalized === "on" || normalized === "off") return normalized; + throw new Error("applyTextNormalization must be one of: auto, on, off"); +} + +function normalizeSeed(seed?: number): number | undefined { + if (seed == null) return undefined; + const next = Math.floor(seed); + if (!Number.isFinite(next) || next < 0 || next > 4_294_967_295) { + throw new Error("seed must be between 0 and 4294967295"); + } + return next; +} + +function parseBooleanValue(value: string): boolean | undefined { + const normalized = value.trim().toLowerCase(); + if (["true", "1", "yes", "on"].includes(normalized)) return true; + if (["false", "0", "no", "off"].includes(normalized)) return false; + return undefined; +} + +function parseNumberValue(value: string): number | undefined { + const parsed = Number.parseFloat(value); + return Number.isFinite(parsed) ? 
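+    // Non-finite input (NaN/Infinity) yields undefined; callers then record a warning.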
parsed : undefined; +} + +function parseTtsDirectives( + text: string, + policy: ResolvedTtsModelOverrides, +): TtsDirectiveParseResult { + if (!policy.enabled) { + return { cleanedText: text, overrides: {}, warnings: [] }; + } + + const overrides: TtsDirectiveOverrides = {}; + const warnings: string[] = []; + let cleanedText = text; + + const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi; + cleanedText = cleanedText.replace(blockRegex, (_match, inner: string) => { + if (policy.allowText && overrides.ttsText == null) { + overrides.ttsText = inner.trim(); + } + return ""; + }); + + const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi; + cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => { + const tokens = body.split(/\s+/).filter(Boolean); + for (const token of tokens) { + const eqIndex = token.indexOf("="); + if (eqIndex === -1) continue; + const rawKey = token.slice(0, eqIndex).trim(); + const rawValue = token.slice(eqIndex + 1).trim(); + if (!rawKey || !rawValue) continue; + const key = rawKey.toLowerCase(); + try { + switch (key) { + case "provider": + if (!policy.allowProvider) break; + if (rawValue === "openai" || rawValue === "elevenlabs") { + overrides.provider = rawValue; + } else { + warnings.push(`unsupported provider "${rawValue}"`); + } + break; + case "voice": + case "openai_voice": + case "openaivoice": + if (!policy.allowVoice) break; + if (isValidOpenAIVoice(rawValue)) { + overrides.openai = { ...overrides.openai, voice: rawValue }; + } else { + warnings.push(`invalid OpenAI voice "${rawValue}"`); + } + break; + case "voiceid": + case "voice_id": + case "elevenlabs_voice": + case "elevenlabsvoice": + if (!policy.allowVoice) break; + if (isValidVoiceId(rawValue)) { + overrides.elevenlabs = { ...overrides.elevenlabs, voiceId: rawValue }; + } else { + warnings.push(`invalid ElevenLabs voiceId "${rawValue}"`); + } + break; + case "model": + case "modelid": + case "model_id": + case "elevenlabs_model": + case "elevenlabsmodel": + case "openai_model": + case "openaimodel": + if (!policy.allowModelId) break; + if (isValidOpenAIModel(rawValue)) { + overrides.openai = { ...overrides.openai, model: rawValue }; + } else { + overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue }; + } + break; + case "stability": + if (!policy.allowVoiceSettings) break; + { + const value = parseNumberValue(rawValue); + if (value == null) { + warnings.push("invalid stability value"); + break; + } + requireInRange(value, 0, 1, "stability"); + overrides.elevenlabs = { + ...overrides.elevenlabs, + voiceSettings: { ...overrides.elevenlabs?.voiceSettings, stability: value }, + }; + } + break; + case "similarity": + case "similarityboost": + case "similarity_boost": + if (!policy.allowVoiceSettings) break; + { + const value = parseNumberValue(rawValue); + if (value == null) { + warnings.push("invalid similarityBoost value"); + break; + } + requireInRange(value, 0, 1, "similarityBoost"); + overrides.elevenlabs = { + ...overrides.elevenlabs, + voiceSettings: { ...overrides.elevenlabs?.voiceSettings, similarityBoost: value }, + }; + } + break; + case "style": + if (!policy.allowVoiceSettings) break; + { + const value = parseNumberValue(rawValue); + if (value == null) { + warnings.push("invalid style value"); + break; + } + requireInRange(value, 0, 1, "style"); + overrides.elevenlabs = { + ...overrides.elevenlabs, + voiceSettings: { ...overrides.elevenlabs?.voiceSettings, style: value }, + }; + } + break; + case "speed": + if (!policy.allowVoiceSettings) break; + 
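+          // ElevenLabs accepts speeds in 0.5..2.0; requireInRange throws outside that and the catch records a warning.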
+          {
+            const value = parseNumberValue(rawValue);
+            if (value == null) {
+              warnings.push("invalid speed value");
+              break;
+            }
+            requireInRange(value, 0.5, 2, "speed");
+            overrides.elevenlabs = {
+              ...overrides.elevenlabs,
+              voiceSettings: { ...overrides.elevenlabs?.voiceSettings, speed: value },
+            };
+          }
+          break;
+        case "speakerboost":
+        case "speaker_boost":
+        case "usespeakerboost":
+        case "use_speaker_boost":
+          if (!policy.allowVoiceSettings) break;
+          {
+            const value = parseBooleanValue(rawValue);
+            if (value == null) {
+              warnings.push("invalid useSpeakerBoost value");
+              break;
+            }
+            overrides.elevenlabs = {
+              ...overrides.elevenlabs,
+              voiceSettings: { ...overrides.elevenlabs?.voiceSettings, useSpeakerBoost: value },
+            };
+          }
+          break;
+        case "normalize":
+        case "applytextnormalization":
+        case "apply_text_normalization":
+          if (!policy.allowNormalization) break;
+          overrides.elevenlabs = {
+            ...overrides.elevenlabs,
+            applyTextNormalization: normalizeApplyTextNormalization(rawValue),
+          };
+          break;
+        case "language":
+        case "languagecode":
+        case "language_code":
+          if (!policy.allowNormalization) break;
+          overrides.elevenlabs = {
+            ...overrides.elevenlabs,
+            languageCode: normalizeLanguageCode(rawValue),
+          };
+          break;
+        case "seed":
+          if (!policy.allowSeed) break;
+          overrides.elevenlabs = {
+            ...overrides.elevenlabs,
+            seed: normalizeSeed(Number.parseInt(rawValue, 10)),
+          };
+          break;
+        default:
+          break;
+        }
+      } catch (err) {
+        warnings.push((err as Error).message);
+      }
+    }
+    return "";
+  });
+
+  return {
+    cleanedText,
+    ttsText: overrides.ttsText,
+    overrides,
+    warnings,
+  };
+}
+
 export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"] as const;
 export const OPENAI_TTS_VOICES = [
   "alloy",
@@ -265,66 +639,110 @@ type SummarizeResult = {
   outputLength: number;
 };
 
-async function summarizeText(
-  text: string,
-  targetLength: number,
-  apiKey: string,
-  timeoutMs: number,
-): Promise<SummarizeResult> {
+type SummaryModelSelection = {
+  ref: ModelRef;
+  source: "summaryModel" | "default";
+};
+
+function resolveSummaryModelRef(
+  cfg: ClawdbotConfig,
+  config: ResolvedTtsConfig,
+): SummaryModelSelection {
+  const defaultRef = resolveDefaultModelForAgent({ cfg });
+  const override = config.summaryModel?.trim();
+  if (!override) return { ref: defaultRef, source: "default" };
+
+  const aliasIndex = buildModelAliasIndex({ cfg, defaultProvider: defaultRef.provider });
+  const resolved = resolveModelRefFromString({
+    raw: override,
+    defaultProvider: defaultRef.provider,
+    aliasIndex,
+  });
+  if (!resolved) return { ref: defaultRef, source: "default" };
+  return { ref: resolved.ref, source: "summaryModel" };
+}
+
+function isTextContentBlock(block: { type: string }): block is TextContent {
+  return block.type === "text";
+}
+
+async function summarizeText(params: {
+  text: string;
+  targetLength: number;
+  cfg: ClawdbotConfig;
+  config: ResolvedTtsConfig;
+  timeoutMs: number;
+}): Promise<SummarizeResult> {
+  const { text, targetLength, cfg, config, timeoutMs } = params;
   if (targetLength < 100 || targetLength > 10_000) {
     throw new Error(`Invalid targetLength: ${targetLength}`);
   }
 
   const startTime = Date.now();
-  const controller = new AbortController();
-  const timeout = setTimeout(() => controller.abort(), timeoutMs);
+  const { ref } = resolveSummaryModelRef(cfg, config);
+  const resolved = resolveModel(ref.provider, ref.model, undefined, cfg);
+  if (!resolved.model) {
+    throw new Error(resolved.error ?? `Unknown summary model: ${ref.provider}/${ref.model}`);
+  }
+  const apiKey = requireApiKey(
+    await getApiKeyForModel({ model: resolved.model, cfg }),
+    ref.provider,
+  );
 
   try {
-    const response = await fetch("https://api.openai.com/v1/chat/completions", {
-      method: "POST",
-      headers: {
-        Authorization: `Bearer ${apiKey}`,
-        "Content-Type": "application/json",
-      },
-      body: JSON.stringify({
-        model: "gpt-4o-mini",
-        messages: [
-          {
-            role: "system",
-            content: `You are an assistant that summarizes texts concisely while keeping the most important information. Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. Reply only with the summary, without additional explanations.`,
-          },
-          {
-            role: "user",
-            content: `\n${text}\n`,
-          },
-        ],
-        max_tokens: Math.ceil(targetLength / 2),
-        temperature: 0.3,
-      }),
-      signal: controller.signal,
-    });
+    const controller = new AbortController();
+    const timeout = setTimeout(() => controller.abort(), timeoutMs);
 
-    if (!response.ok) {
-      throw new Error("Summarization service unavailable");
+    try {
+      const res = await completeSimple(
+        resolved.model,
+        {
+          messages: [
+            {
+              role: "user",
+              content:
+                `You are an assistant that summarizes texts concisely while keeping the most important information. ` +
+                `Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. ` +
+                `Reply only with the summary, without additional explanations.\n\n` +
+                `\n${text}\n`,
+              timestamp: Date.now(),
+            },
+          ],
+        },
+        {
+          apiKey,
+          maxTokens: Math.ceil(targetLength / 2),
+          temperature: 0.3,
+          signal: controller.signal,
+        },
+      );
+
+      const summary = res.content
+        .filter(isTextContentBlock)
+        .map((block) => block.text.trim())
+        .filter(Boolean)
+        .join(" ")
+        .trim();
+
+      if (!summary) {
+        throw new Error("No summary returned");
+      }
+
+      return {
+        summary,
+        latencyMs: Date.now() - startTime,
+        inputLength: text.length,
+        outputLength: summary.length,
+      };
+    } finally {
+      clearTimeout(timeout);
     }
-
-    const data = (await response.json()) as {
-      choices?: Array<{ message?: { content?: string } }>;
-    };
-    const summary = data.choices?.[0]?.message?.content?.trim();
-
-    if (!summary) {
-      throw new Error("No summary returned");
+  } catch (err) {
+    const error = err as Error;
+    if (error.name === "AbortError") {
+      throw new Error("Summarization timed out");
    }
-
-    return {
-      summary,
-      latencyMs: Date.now() - startTime,
-      inputLength: text.length,
-      outputLength: summary.length,
-    };
-  } finally {
-    clearTimeout(timeout);
+    throw err;
  }
 }
 
@@ -342,21 +760,42 @@ function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DE
 async function elevenLabsTTS(params: {
   text: string;
   apiKey: string;
+  baseUrl: string;
   voiceId: string;
   modelId: string;
   outputFormat: string;
+  seed?: number;
+  applyTextNormalization?: "auto" | "on" | "off";
+  languageCode?: string;
+  voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"];
   timeoutMs: number;
 }): Promise<Buffer> {
-  const { text, apiKey, voiceId, modelId, outputFormat, timeoutMs } = params;
+  const {
+    text,
+    apiKey,
+    baseUrl,
+    voiceId,
+    modelId,
+    outputFormat,
+    seed,
+    applyTextNormalization,
+    languageCode,
+    voiceSettings,
+    timeoutMs,
+  } = params;
 
   if (!isValidVoiceId(voiceId)) {
     throw new Error("Invalid voiceId format");
   }
 
+  assertElevenLabsVoiceSettings(voiceSettings);
+  const normalizedLanguage = normalizeLanguageCode(languageCode);
+  const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
+  const normalizedSeed = normalizeSeed(seed);
 
   const controller = new AbortController();
   const timeout = setTimeout(() => controller.abort(), timeoutMs);
 
   try {
-    const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`);
+    const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
     if (outputFormat) {
       url.searchParams.set("output_format", outputFormat);
     }
@@ -371,11 +810,15 @@ async function elevenLabsTTS(params: {
       body: JSON.stringify({
         text,
         model_id: modelId,
+        seed: normalizedSeed,
+        apply_text_normalization: normalizedNormalization,
+        language_code: normalizedLanguage,
         voice_settings: {
-          stability: 0.5,
-          similarity_boost: 0.75,
-          style: 0.0,
-          use_speaker_boost: true,
+          stability: voiceSettings.stability,
+          similarity_boost: voiceSettings.similarityBoost,
+          style: voiceSettings.style,
+          use_speaker_boost: voiceSettings.useSpeakerBoost,
+          speed: voiceSettings.speed,
         },
       }),
       signal: controller.signal,
@@ -442,6 +885,7 @@ export async function textToSpeech(params: {
   cfg: ClawdbotConfig;
   prefsPath?: string;
   channel?: string;
+  overrides?: TtsDirectiveOverrides;
 }): Promise<TtsResult> {
   const config = resolveTtsConfig(params.cfg);
   const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
@@ -456,10 +900,9 @@ export async function textToSpeech(params: {
   }
 
   const userProvider = getTtsProvider(config, prefsPath);
-  const providers: TtsProvider[] = [
-    userProvider,
-    userProvider === "openai" ? "elevenlabs" : "openai",
-  ];
+  const overrideProvider = params.overrides?.provider;
+  const provider = overrideProvider ?? userProvider;
+  const providers: TtsProvider[] = [provider, provider === "openai" ? "elevenlabs" : "openai"];
 
   let lastError: string | undefined;
 
@@ -474,20 +917,36 @@ export async function textToSpeech(params: {
     try {
       let audioBuffer: Buffer;
       if (provider === "elevenlabs") {
+        const voiceIdOverride = params.overrides?.elevenlabs?.voiceId;
+        const modelIdOverride = params.overrides?.elevenlabs?.modelId;
+        const voiceSettings = {
+          ...config.elevenlabs.voiceSettings,
+          ...params.overrides?.elevenlabs?.voiceSettings,
+        };
+        const seedOverride = params.overrides?.elevenlabs?.seed;
+        const normalizationOverride = params.overrides?.elevenlabs?.applyTextNormalization;
+        const languageOverride = params.overrides?.elevenlabs?.languageCode;
         audioBuffer = await elevenLabsTTS({
           text: params.text,
           apiKey,
-          voiceId: config.elevenlabs.voiceId,
-          modelId: config.elevenlabs.modelId,
+          baseUrl: config.elevenlabs.baseUrl,
+          voiceId: voiceIdOverride ?? config.elevenlabs.voiceId,
+          modelId: modelIdOverride ?? config.elevenlabs.modelId,
           outputFormat: output.elevenlabs,
+          seed: seedOverride ?? config.elevenlabs.seed,
+          applyTextNormalization: normalizationOverride ?? config.elevenlabs.applyTextNormalization,
+          languageCode: languageOverride ?? config.elevenlabs.languageCode,
+          voiceSettings,
           timeoutMs: config.timeoutMs,
         });
       } else {
+        const openaiModelOverride = params.overrides?.openai?.model;
+        const openaiVoiceOverride = params.overrides?.openai?.voice;
         audioBuffer = await openaiTTS({
           text: params.text,
           apiKey,
-          model: config.openai.model,
-          voice: config.openai.voice,
+          model: openaiModelOverride ?? config.openai.model,
+          voice: openaiVoiceOverride ?? config.openai.voice,
           responseFormat: output.openai,
           timeoutMs: config.timeoutMs,
         });
@@ -538,13 +997,31 @@ export async function maybeApplyTtsToPayload(params: {
   if (mode === "final" && params.kind && params.kind !== "final") return params.payload;
 
   const text = params.payload.text ??
""; - if (!text.trim()) return params.payload; - if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) return params.payload; - if (text.includes("MEDIA:")) return params.payload; - if (text.trim().length < 10) return params.payload; + const directives = parseTtsDirectives(text, config.modelOverrides); + if (directives.warnings.length > 0) { + logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`); + } + + const cleanedText = directives.cleanedText; + const trimmedCleaned = cleanedText.trim(); + const visibleText = trimmedCleaned.length > 0 ? trimmedCleaned : ""; + const ttsText = directives.ttsText?.trim() || visibleText; + + const nextPayload = + visibleText === text.trim() + ? params.payload + : { + ...params.payload, + text: visibleText.length > 0 ? visibleText : undefined, + }; + + if (!ttsText.trim()) return nextPayload; + if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) return nextPayload; + if (text.includes("MEDIA:")) return nextPayload; + if (ttsText.trim().length < 10) return nextPayload; const maxLength = getTtsMaxLength(prefsPath); - let textForAudio = text.trim(); + let textForAudio = ttsText.trim(); let wasSummarized = false; if (textForAudio.length > maxLength) { @@ -555,14 +1032,14 @@ export async function maybeApplyTtsToPayload(params: { return params.payload; } - const openaiKey = resolveTtsApiKey(config, "openai"); - if (!openaiKey) { - logVerbose("TTS: skipping summarization - OpenAI key missing."); - return params.payload; - } - try { - const summary = await summarizeText(textForAudio, maxLength, openaiKey, config.timeoutMs); + const summary = await summarizeText({ + text: textForAudio, + targetLength: maxLength, + cfg: params.cfg, + config, + timeoutMs: config.timeoutMs, + }); textForAudio = summary.summary; wasSummarized = true; if (textForAudio.length > config.maxTextLength) { @@ -584,6 +1061,7 @@ export async function maybeApplyTtsToPayload(params: { cfg: params.cfg, prefsPath, channel: params.channel, + overrides: directives.overrides, }); if (result.success && result.audioPath) { @@ -600,7 +1078,7 @@ export async function maybeApplyTtsToPayload(params: { const shouldVoice = channelId === "telegram" && result.voiceCompatible === true; return { - ...params.payload, + ...nextPayload, mediaUrl: result.audioPath, audioAsVoice: shouldVoice || params.payload.audioAsVoice, }; @@ -616,7 +1094,7 @@ export async function maybeApplyTtsToPayload(params: { const latency = Date.now() - ttsStart; logVerbose(`TTS: conversion failed after ${latency}ms (${result.error ?? "unknown"}).`); - return params.payload; + return nextPayload; } export const _test = { @@ -625,6 +1103,8 @@ export const _test = { isValidOpenAIModel, OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, + parseTtsDirectives, + resolveModelOverridePolicy, summarizeText, resolveOutputFormat, };