From fc0e303e05150bef9f56c3993b7bdccb52ba817b Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Sun, 25 Jan 2026 01:05:23 +0000
Subject: [PATCH] feat: add edge tts fallback provider

---
 CHANGELOG.md                         |   1 +
 docs/channels/googlechat.md          |   2 +-
 docs/tts.md                          |  88 ++++++++++++--
 package.json                         |   1 +
 pnpm-lock.yaml                       |  17 +++
 src/auto-reply/reply/commands-tts.ts |  39 ++++--
 src/config/types.tts.ts              |  16 ++-
 src/config/zod-schema.core.ts        |  17 ++-
 src/gateway/server-methods/tts.ts    |  22 +++-
 src/tts/tts.test.ts                  | 119 +++++++++++++++++-
 src/tts/tts.ts                       | 176 +++++++++++++++++++++++++--
 11 files changed, 466 insertions(+), 32 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e4d31e635..8a8737dd8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ Docs: https://docs.clawd.bot
 - Ollama: provider discovery + docs. (#1606) Thanks @abhaymundhara. https://docs.clawd.bot/providers/ollama
 
 ### Changes
+- TTS: add Edge TTS provider fallback, defaulting to keyless Edge with MP3 retry on format failures. (#1668) Thanks @steipete. https://docs.clawd.bot/tts
 - Docs: expand FAQ (migration, scheduling, concurrency, model recommendations, OpenAI subscription auth, Pi sizing, hackable install, docs SSL workaround).
 - Docs: add verbose installer troubleshooting guidance.
 - Docs: update Fly.io guide notes.
diff --git a/docs/channels/googlechat.md b/docs/channels/googlechat.md
index bd745caa2..00cfa7c72 100644
--- a/docs/channels/googlechat.md
+++ b/docs/channels/googlechat.md
@@ -32,7 +32,7 @@ Status: ready for DMs + spaces via Google Chat API webhooks (HTTP only).
   - Under **Connection settings**, select **HTTP endpoint URL**.
   - Under **Triggers**, select **Use a common HTTP endpoint URL for all triggers** and set it to your gateway's public URL followed by `/googlechat`.
     - *Tip: Run `clawdbot status` to find your gateway's public URL.*
-  - Under **Visibility**, check **Make this Chat app available to specific people and groups in **.
+  - Under **Visibility**, check **Make this Chat app available to specific people and groups in <Your Domain>**.
   - Enter your email address (e.g. `user@example.com`) in the text box.
   - Click **Save** at the bottom.
 6) **Enable the app status**:
diff --git a/docs/tts.md b/docs/tts.md
index a9aa141a5..61da1f0dc 100644
--- a/docs/tts.md
+++ b/docs/tts.md
@@ -8,21 +8,37 @@ read_when:
 
 # Text-to-speech (TTS)
 
-Clawdbot can convert outbound replies into audio using ElevenLabs or OpenAI.
+Clawdbot can convert outbound replies into audio using ElevenLabs, OpenAI, or Edge TTS.
 It works anywhere Clawdbot can send audio; Telegram gets a round voice-note bubble.
 
 ## Supported services
 
 - **ElevenLabs** (primary or fallback provider)
 - **OpenAI** (primary or fallback provider; also used for summaries)
+- **Edge TTS** (primary or fallback provider; uses `node-edge-tts`, default when no API keys)
 
-## Required keys
+### Edge TTS notes
 
-At least one of:
+Edge TTS uses Microsoft Edge's online neural TTS service via the `node-edge-tts`
+library. It's a hosted service (not local), uses Microsoft's endpoints, and does
+not require an API key. `node-edge-tts` exposes speech configuration options and
+output formats, but not all options are supported by the Edge service.
+
+Because Edge TTS is a public web service without a published SLA or quota, treat it
+as best-effort. If you need guaranteed limits and support, use OpenAI or ElevenLabs.
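+
+Under the hood, Clawdbot drives Edge TTS through `node-edge-tts`. A minimal
+sketch of the same `EdgeTTS`/`ttsPromise` calls used in `src/tts/tts.ts`
+(the voice, format, and output path are illustrative):
+
+```ts
+import { EdgeTTS } from "node-edge-tts";
+
+// Keyless request against Microsoft's hosted Edge TTS endpoint.
+const tts = new EdgeTTS({
+  voice: "en-US-MichelleNeural",
+  lang: "en-US",
+  outputFormat: "audio-24khz-48kbitrate-mono-mp3",
+});
+
+// Synthesizes the text and writes the audio file to the given path.
+await tts.ttsPromise("Hello from Clawdbot", "/tmp/voice.mp3");
+```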
+
+Microsoft's Speech REST API documents a 10‑minute audio limit per request; Edge TTS
+does not publish limits, so assume similar or lower limits.
+
+## Optional keys
+
+If you want OpenAI or ElevenLabs:
 
 - `ELEVENLABS_API_KEY` (or `XI_API_KEY`)
 - `OPENAI_API_KEY`
 
-If both are configured, the selected provider is used first and the other is a fallback.
+Edge TTS does **not** require an API key. If no API keys are found, Clawdbot defaults
+to Edge TTS (unless disabled via `messages.tts.edge.enabled=false`).
+
+If multiple providers are configured, the selected provider is used first and the others are fallback options.
 Auto-summary uses the configured `summaryModel` (or `agents.defaults.model.primary`),
 so that provider must also be authenticated if you enable summaries.
@@ -32,12 +48,17 @@ so that provider must also be authenticated if you enable summaries.
 
 - [OpenAI Audio API reference](https://platform.openai.com/docs/api-reference/audio)
 - [ElevenLabs Text to Speech](https://elevenlabs.io/docs/api-reference/text-to-speech)
 - [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication)
+- [node-edge-tts](https://github.com/SchneeHertz/node-edge-tts)
+- [Microsoft Speech output formats](https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs)
 
 ## Is it enabled by default?
 
 No. TTS is **disabled** by default. Enable it in config or with `/tts on`,
 which writes a local preference override.
 
+Edge TTS **is** enabled by default once TTS is on, and is used automatically
+when no OpenAI or ElevenLabs API keys are available.
+
 ## Config
 
 TTS config lives under `messages.tts` in `clawdbot.json`.
@@ -94,6 +115,41 @@ Full schema is in [Gateway configuration](/gateway/configuration).
 }
 ```
 
+### Edge TTS primary (no API key)
+
+```json5
+{
+  messages: {
+    tts: {
+      enabled: true,
+      provider: "edge",
+      edge: {
+        enabled: true,
+        voice: "en-US-MichelleNeural",
+        lang: "en-US",
+        outputFormat: "audio-24khz-48kbitrate-mono-mp3",
+        rate: "+10%",
+        pitch: "-5%"
+      }
+    }
+  }
+}
+```
+
+### Disable Edge TTS
+
+```json5
+{
+  messages: {
+    tts: {
+      edge: {
+        enabled: false
+      }
+    }
+  }
+}
+```
+
 ### Custom limits + prefs path
 
 ```json5
@@ -131,7 +187,9 @@ Then run:
 
 - `enabled`: master toggle (default `false`; local prefs can override).
 - `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
-- `provider`: `"elevenlabs"` or `"openai"` (fallback is automatic).
+- `provider`: `"elevenlabs"`, `"openai"`, or `"edge"` (fallback is automatic).
+- If `provider` is **unset**, Clawdbot prefers `openai` (if key), then `elevenlabs` (if key),
+  otherwise `edge`.
 - `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`.
   - Accepts `provider/model` or a configured model alias.
 - `modelOverrides`: allow the model to emit TTS directives (on by default).
@@ -147,6 +205,15 @@ Then run:
 - `elevenlabs.applyTextNormalization`: `auto|on|off`
 - `elevenlabs.languageCode`: 2-letter ISO 639-1 (e.g. `en`, `de`)
 - `elevenlabs.seed`: integer `0..4294967295` (best-effort determinism)
+- `edge.enabled`: allow Edge TTS usage (default `true`; no API key).
+- `edge.voice`: Edge neural voice name (e.g. `en-US-MichelleNeural`).
+- `edge.lang`: language code (e.g. `en-US`).
+- `edge.outputFormat`: Edge output format (e.g. `audio-24khz-48kbitrate-mono-mp3`).
+  - See Microsoft Speech output formats for valid values; not all formats are supported by Edge.
+- `edge.rate` / `edge.pitch` / `edge.volume`: percent strings (e.g. `+10%`, `-5%`).
+- `edge.saveSubtitles`: write JSON subtitles alongside the audio file.
+- `edge.proxy`: proxy URL for Edge TTS requests.
+- `edge.timeoutMs`: request timeout override (ms).
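+
+Putting the prosody and transport knobs together, a fuller `edge` block might
+look like this (a sketch; all values are illustrative):
+
+```json5
+{
+  messages: {
+    tts: {
+      edge: {
+        rate: "+10%",
+        pitch: "-5%",
+        volume: "+0%",
+        saveSubtitles: true,
+        proxy: "http://127.0.0.1:8080",
+        timeoutMs: 30000
+      }
+    }
+  }
+}
+```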
 
 ## Model-driven overrides (default on)
@@ -167,7 +234,7 @@ Here you go.
 ```
 
 Available directive keys (when enabled):
-- `provider` (`openai` | `elevenlabs`)
+- `provider` (`openai` | `elevenlabs` | `edge`)
 - `voice` (OpenAI voice) or `voiceId` (ElevenLabs)
 - `model` (OpenAI TTS model or ElevenLabs model id)
 - `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost`
@@ -225,8 +292,15 @@ These override `messages.tts.*` for that host.
   - 48kHz / 64kbps is a good voice-note tradeoff and required for the round bubble.
 - **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
   - 44.1kHz / 128kbps is the default balance for speech clarity.
+- **Edge TTS**: uses `edge.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
+  - `node-edge-tts` accepts an `outputFormat`, but not all formats are available
+    from the Edge service.
+  - Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus).
+  - Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need
+    guaranteed Opus voice notes.
+  - If the configured Edge output format fails, Clawdbot retries with MP3.
 
-This is not configurable; Telegram expects Opus for voice-note UX.
+OpenAI/ElevenLabs formats are fixed; Telegram expects Opus for voice-note UX.
 
 ## Auto-TTS behavior
diff --git a/package.json b/package.json
index 1119d3f24..bf6d003a5 100644
--- a/package.json
+++ b/package.json
@@ -185,6 +185,7 @@
     "linkedom": "^0.18.12",
     "long": "5.3.2",
     "markdown-it": "^14.1.0",
+    "node-edge-tts": "^1.2.9",
     "osc-progress": "^0.3.0",
     "pdfjs-dist": "^5.4.530",
     "playwright-core": "1.58.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index bbb6961a2..b36478256 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -127,6 +129,9 @@ importers:
       markdown-it:
         specifier: ^14.1.0
         version: 14.1.0
+      node-edge-tts:
+        specifier: ^1.2.9
+        version: 1.2.9
       osc-progress:
         specifier: ^0.3.0
         version: 0.3.0
@@ -4298,6 +4301,10 @@ packages:
     engines: {node: '>=14.18'}
     hasBin: true
 
+  node-edge-tts@1.2.9:
+    resolution: {integrity: sha512-fvfW1dUgJdZAdTniC6MzLTMwnNUFKGKaUdRJ1OsveOYlfnPUETBU973CG89565txvbBowCQ4Czdeu3qSX8bNOg==}
+    hasBin: true
+
   node-fetch@2.7.0:
     resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==}
     engines: {node: 4.x || >=6.0.0}
@@ -10202,6 +10209,16 @@ snapshots:
 
   node-downloader-helper@2.1.10: {}
 
+  node-edge-tts@1.2.9:
+    dependencies:
+      https-proxy-agent: 7.0.6
+      ws: 8.19.0
+      yargs: 17.7.2
+    transitivePeerDependencies:
+      - bufferutil
+      - supports-color
+      - utf-8-validate
+
   node-fetch@2.7.0:
     dependencies:
       whatwg-url: 5.0.0
diff --git a/src/auto-reply/reply/commands-tts.ts b/src/auto-reply/reply/commands-tts.ts
index 23ee80bc7..3e8c71288 100644
--- a/src/auto-reply/reply/commands-tts.ts
+++ b/src/auto-reply/reply/commands-tts.ts
@@ -7,9 +7,11 @@ import {
   getTtsProvider,
   isSummarizationEnabled,
   isTtsEnabled,
+  isTtsProviderConfigured,
   resolveTtsApiKey,
   resolveTtsConfig,
   resolveTtsPrefsPath,
+  resolveTtsProviderOrder,
   setLastTtsAttempt,
   setSummarizationEnabled,
   setTtsEnabled,
@@ -41,6 +43,7 @@ function ttsUsage(): ReplyPayload {
       "\nExamples:\n" +
       "/tts on\n" +
       "/tts provider openai\n" +
+      "/tts provider edge\n" +
       "/tts limit 2000\n" +
       "/tts summary off\n" +
       "/tts audio Hello from Clawdbot",
@@ -126,33 +129,45 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
   if (action === "provider") {
     const currentProvider = getTtsProvider(config, prefsPath);
     if (!args.trim()) {
-      const fallback = currentProvider === "openai" ? "elevenlabs" : "openai";
+      const fallback = resolveTtsProviderOrder(currentProvider)
+        .slice(1)
+        .filter((provider) => isTtsProviderConfigured(config, provider));
       const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai"));
       const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs"));
+      const hasEdge = isTtsProviderConfigured(config, "edge");
       return {
         shouldContinue: false,
         reply: {
           text:
             `🎙️ TTS provider\n` +
             `Primary: ${currentProvider}\n` +
-            `Fallback: ${fallback}\n` +
+            `Fallbacks: ${fallback.join(", ") || "none"}\n` +
             `OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` +
             `ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` +
-            `Usage: /tts provider openai | elevenlabs`,
+            `Edge enabled: ${hasEdge ? "✅" : "❌"}\n` +
+            `Usage: /tts provider openai | elevenlabs | edge`,
         },
       };
     }
     const requested = args.trim().toLowerCase();
-    if (requested !== "openai" && requested !== "elevenlabs") {
+    if (requested !== "openai" && requested !== "elevenlabs" && requested !== "edge") {
       return { shouldContinue: false, reply: ttsUsage() };
     }
     setTtsProvider(prefsPath, requested);
-    const fallback = requested === "openai" ? "elevenlabs" : "openai";
+    const fallback = resolveTtsProviderOrder(requested)
+      .slice(1)
+      .filter((provider) => isTtsProviderConfigured(config, provider));
     return {
       shouldContinue: false,
-      reply: { text: `✅ TTS provider set to ${requested} (fallback: ${fallback}).` },
+      reply: {
+        text:
+          `✅ TTS provider set to ${requested} (fallbacks: ${fallback.join(", ") || "none"}).` +
+          (requested === "edge"
+            ? "\nEnable Edge TTS in config: messages.tts.edge.enabled = true."
+            : ""),
+      },
     };
   }
 
@@ -199,14 +214,22 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
   if (action === "status") {
     const enabled = isTtsEnabled(config, prefsPath);
     const provider = getTtsProvider(config, prefsPath);
-    const hasKey = Boolean(resolveTtsApiKey(config, provider));
+    const hasKey = isTtsProviderConfigured(config, provider);
+    const providerStatus =
+      provider === "edge"
+        ? hasKey
+          ? "✅ enabled"
+          : "❌ disabled"
+        : hasKey
+          ? "✅ key"
+          : "❌ no key";
     const maxLength = getTtsMaxLength(prefsPath);
     const summarize = isSummarizationEnabled(prefsPath);
     const last = getLastTtsAttempt();
     const lines = [
       "📊 TTS status",
       `State: ${enabled ? "✅ enabled" : "❌ disabled"}`,
-      `Provider: ${provider} (${hasKey ? "✅ key" : "❌ no key"})`,
+      `Provider: ${provider} (${providerStatus})`,
       `Text limit: ${maxLength} chars`,
       `Auto-summary: ${summarize ? "on" : "off"}`,
     ];
diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts
index 86d94deca..28b65c96d 100644
--- a/src/config/types.tts.ts
+++ b/src/config/types.tts.ts
@@ -1,4 +1,4 @@
-export type TtsProvider = "elevenlabs" | "openai";
+export type TtsProvider = "elevenlabs" | "openai" | "edge";
 
 export type TtsMode = "final" | "all";
 
@@ -55,6 +55,20 @@ export type TtsConfig = {
     model?: string;
     voice?: string;
   };
+  /** Microsoft Edge (node-edge-tts) configuration. */
+  edge?: {
+    /** Explicitly allow Edge TTS usage (no API key required). */
+    enabled?: boolean;
+    voice?: string;
+    lang?: string;
+    outputFormat?: string;
+    pitch?: string;
+    rate?: string;
+    volume?: string;
+    saveSubtitles?: boolean;
+    proxy?: string;
+    timeoutMs?: number;
+  };
   /** Optional path for local TTS user preferences JSON. */
   prefsPath?: string;
   /** Hard cap for text sent to TTS (chars). */
diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts
index 0301a52fe..bcf769b67 100644
--- a/src/config/zod-schema.core.ts
+++ b/src/config/zod-schema.core.ts
@@ -156,7 +156,7 @@ export const MarkdownConfigSchema = z
   .strict()
   .optional();
 
-export const TtsProviderSchema = z.enum(["elevenlabs", "openai"]);
+export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]);
 export const TtsModeSchema = z.enum(["final", "all"]);
 export const TtsConfigSchema = z
   .object({
@@ -207,6 +207,21 @@ export const TtsConfigSchema = z
       })
       .strict()
       .optional(),
+    edge: z
+      .object({
+        enabled: z.boolean().optional(),
+        voice: z.string().optional(),
+        lang: z.string().optional(),
+        outputFormat: z.string().optional(),
+        pitch: z.string().optional(),
+        rate: z.string().optional(),
+        volume: z.string().optional(),
+        saveSubtitles: z.boolean().optional(),
+        proxy: z.string().optional(),
+        timeoutMs: z.number().int().min(1000).max(120000).optional(),
+      })
+      .strict()
+      .optional(),
     prefsPath: z.string().optional(),
     maxTextLength: z.number().int().min(1).optional(),
     timeoutMs: z.number().int().min(1000).max(120000).optional(),
diff --git a/src/gateway/server-methods/tts.ts b/src/gateway/server-methods/tts.ts
index 1b1436988..e70fb112f 100644
--- a/src/gateway/server-methods/tts.ts
+++ b/src/gateway/server-methods/tts.ts
@@ -4,9 +4,11 @@ import {
   OPENAI_TTS_VOICES,
   getTtsProvider,
   isTtsEnabled,
+  isTtsProviderConfigured,
   resolveTtsApiKey,
   resolveTtsConfig,
   resolveTtsPrefsPath,
+  resolveTtsProviderOrder,
   setTtsEnabled,
   setTtsProvider,
   textToSpeech,
@@ -22,13 +24,18 @@ export const ttsHandlers: GatewayRequestHandlers = {
       const config = resolveTtsConfig(cfg);
       const prefsPath = resolveTtsPrefsPath(config);
       const provider = getTtsProvider(config, prefsPath);
+      const fallbackProviders = resolveTtsProviderOrder(provider)
+        .slice(1)
+        .filter((candidate) => isTtsProviderConfigured(config, candidate));
       respond(true, {
         enabled: isTtsEnabled(config, prefsPath),
         provider,
-        fallbackProvider: provider === "openai" ? "elevenlabs" : "openai",
+        fallbackProvider: fallbackProviders[0] ?? null,
+        fallbackProviders,
         prefsPath,
         hasOpenAIKey: Boolean(resolveTtsApiKey(config, "openai")),
        hasElevenLabsKey: Boolean(resolveTtsApiKey(config, "elevenlabs")),
+        edgeEnabled: isTtsProviderConfigured(config, "edge"),
       });
     } catch (err) {
       respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
@@ -90,11 +97,14 @@ export const ttsHandlers: GatewayRequestHandlers = {
   "tts.setProvider": async ({ params, respond }) => {
     const provider = typeof params.provider === "string" ? params.provider.trim() : "";
-    if (provider !== "openai" && provider !== "elevenlabs") {
+    if (provider !== "openai" && provider !== "elevenlabs" && provider !== "edge") {
       respond(
         false,
         undefined,
-        errorShape(ErrorCodes.INVALID_REQUEST, "Invalid provider. Use openai or elevenlabs."),
+        errorShape(
+          ErrorCodes.INVALID_REQUEST,
+          "Invalid provider. Use openai, elevenlabs, or edge.",
+        ),
       );
       return;
     }
 
@@ -128,6 +138,12 @@
         configured: Boolean(resolveTtsApiKey(config, "elevenlabs")),
         models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"],
       },
+      {
+        id: "edge",
+        name: "Edge TTS",
+        configured: isTtsProviderConfigured(config, "edge"),
+        models: [],
+      },
     ],
     active: getTtsProvider(config, prefsPath),
   });
diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts
index 635364364..fafe3bbdf 100644
--- a/src/tts/tts.test.ts
+++ b/src/tts/tts.test.ts
@@ -4,7 +4,7 @@ import { completeSimple } from "@mariozechner/pi-ai";
 
 import { getApiKeyForModel } from "../agents/model-auth.js";
 import { resolveModel } from "../agents/pi-embedded-runner/model.js";
-import { _test, resolveTtsConfig } from "./tts.js";
+import { _test, getTtsProvider, resolveTtsConfig } from "./tts.js";
 
 vi.mock("@mariozechner/pi-ai", () => ({
   completeSimple: vi.fn(),
@@ -47,6 +47,7 @@ const {
   resolveModelOverridePolicy,
   summarizeText,
   resolveOutputFormat,
+  resolveEdgeOutputFormat,
 } = _test;
 
 describe("tts", () => {
@@ -149,6 +150,30 @@ describe("tts", () => {
     });
   });
 
+  describe("resolveEdgeOutputFormat", () => {
+    const baseCfg = {
+      agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
+      messages: { tts: {} },
+    };
+
+    it("uses default output format when edge output format is not configured", () => {
+      const config = resolveTtsConfig(baseCfg);
+      expect(resolveEdgeOutputFormat(config)).toBe("audio-24khz-48kbitrate-mono-mp3");
+    });
+
+    it("uses configured output format when provided", () => {
+      const config = resolveTtsConfig({
+        ...baseCfg,
+        messages: {
+          tts: {
+            edge: { outputFormat: "audio-24khz-96kbitrate-mono-mp3" },
+          },
+        },
+      });
+      expect(resolveEdgeOutputFormat(config)).toBe("audio-24khz-96kbitrate-mono-mp3");
+    });
+  });
+
   describe("parseTtsDirectives", () => {
     it("extracts overrides and strips directives when enabled", () => {
       const policy = resolveModelOverridePolicy({ enabled: true });
@@ -165,6 +190,14 @@ describe("tts", () => {
       expect(result.overrides.elevenlabs?.voiceSettings?.speed).toBe(1.1);
     });
 
+    it("accepts edge as provider override", () => {
+      const policy = resolveModelOverridePolicy({ enabled: true });
+      const input = "Hello [[tts:provider=edge]] world";
+      const result = parseTtsDirectives(input, policy);
+
+      expect(result.overrides.provider).toBe("edge");
+    });
+
     it("keeps text intact when overrides are disabled", () => {
       const policy = resolveModelOverridePolicy({ enabled: false });
       const input = "Hello [[tts:voice=alloy]] world";
@@ -314,4 +347,88 @@ describe("tts", () => {
       ).rejects.toThrow("No summary returned");
     });
   });
+
+  describe("getTtsProvider", () => {
+    const baseCfg = {
+      agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
+      messages: { tts: {} },
+    };
+
+    const restoreEnv = (snapshot: Record<string, string | undefined>) => {
+      const keys = ["OPENAI_API_KEY", "ELEVENLABS_API_KEY", "XI_API_KEY"] as const;
+      for (const key of keys) {
+        const value = snapshot[key];
+        if (value === undefined) {
+          delete process.env[key];
+        } else {
+          process.env[key] = value;
+        }
+      }
+    };
+
+    const withEnv = (env: Record<string, string | undefined>, run: () => void) => {
+      const snapshot = {
+        OPENAI_API_KEY: process.env.OPENAI_API_KEY,
+        ELEVENLABS_API_KEY: process.env.ELEVENLABS_API_KEY,
+        XI_API_KEY: process.env.XI_API_KEY,
+      };
+      try {
+        for (const [key, value] of Object.entries(env)) {
+          if (value === undefined) {
+            delete process.env[key];
+          } else {
+            process.env[key] = value;
+          }
+        }
+        run();
+      } finally {
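+        // Always restore the captured env so one test's keys can't leak into the next.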
+        restoreEnv(snapshot);
+      }
+    };
+
+    it("prefers OpenAI when no provider is configured and API key exists", () => {
+      withEnv(
+        {
+          OPENAI_API_KEY: "test-openai-key",
+          ELEVENLABS_API_KEY: undefined,
+          XI_API_KEY: undefined,
+        },
+        () => {
+          const config = resolveTtsConfig(baseCfg);
+          const provider = getTtsProvider(config, "/tmp/tts-prefs-openai.json");
+          expect(provider).toBe("openai");
+        },
+      );
+    });
+
+    it("prefers ElevenLabs when OpenAI is missing and ElevenLabs key exists", () => {
+      withEnv(
+        {
+          OPENAI_API_KEY: undefined,
+          ELEVENLABS_API_KEY: "test-elevenlabs-key",
+          XI_API_KEY: undefined,
+        },
+        () => {
+          const config = resolveTtsConfig(baseCfg);
+          const provider = getTtsProvider(config, "/tmp/tts-prefs-elevenlabs.json");
+          expect(provider).toBe("elevenlabs");
+        },
+      );
+    });
+
+    it("falls back to Edge when no API keys are present", () => {
+      withEnv(
+        {
+          OPENAI_API_KEY: undefined,
+          ELEVENLABS_API_KEY: undefined,
+          XI_API_KEY: undefined,
+        },
+        () => {
+          const config = resolveTtsConfig(baseCfg);
+          const provider = getTtsProvider(config, "/tmp/tts-prefs-edge.json");
+          expect(provider).toBe("edge");
+        },
+      );
+    });
+  });
 });
diff --git a/src/tts/tts.ts b/src/tts/tts.ts
index 54aa4c512..cf2823f95 100644
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@@ -12,6 +12,7 @@ import { tmpdir } from "node:os";
 import path from "node:path";
 
 import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
+import { EdgeTTS } from "node-edge-tts";
 
 import type { ReplyPayload } from "../auto-reply/types.js";
 import { normalizeChannelId } from "../channels/plugins/index.js";
@@ -24,6 +25,7 @@ import type {
   TtsModelOverrideConfig,
 } from "../config/types.tts.js";
 import { logVerbose } from "../globals.js";
+import { isVoiceCompatibleAudio } from "../media/audio.js";
 import { CONFIG_DIR, resolveUserPath } from "../utils.js";
 import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
 import {
@@ -45,6 +47,9 @@ const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
 const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
 const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts";
 const DEFAULT_OPENAI_VOICE = "alloy";
+const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
+const DEFAULT_EDGE_LANG = "en-US";
+const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
 
 const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
   stability: 0.5,
@@ -74,6 +79,7 @@ export type ResolvedTtsConfig = {
   enabled: boolean;
   mode: TtsMode;
   provider: TtsProvider;
+  providerSource: "config" | "default";
   summaryModel?: string;
   modelOverrides: ResolvedTtsModelOverrides;
   elevenlabs: {
@@ -97,6 +103,19 @@ export type ResolvedTtsConfig = {
     model: string;
     voice: string;
   };
+  edge: {
+    enabled: boolean;
+    voice: string;
+    lang: string;
+    outputFormat: string;
+    outputFormatConfigured: boolean;
+    pitch?: string;
+    rate?: string;
+    volume?: string;
+    saveSubtitles: boolean;
+    proxy?: string;
+    timeoutMs?: number;
+  };
   prefsPath?: string;
   maxTextLength: number;
   timeoutMs: number;
@@ -199,10 +218,13 @@ function resolveModelOverridePolicy(
 
 export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
   const raw: TtsConfig = cfg.messages?.tts ?? {};
+  const providerSource = raw.provider ? "config" : "default";
+  const edgeOutputFormat = raw.edge?.outputFormat?.trim();
   return {
     enabled: raw.enabled ?? false,
     mode: raw.mode ?? "final",
-    provider: raw.provider ?? "elevenlabs",
+    provider: raw.provider ?? "edge",
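+    // "edge" is only the schema-level default; when no provider is configured,
+    // getTtsProvider() still prefers OpenAI, then ElevenLabs, when keys exist.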
"edge", + providerSource, summaryModel: raw.summaryModel?.trim() || undefined, modelOverrides: resolveModelOverridePolicy(raw.modelOverrides), elevenlabs: { @@ -231,6 +253,19 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig { model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL, voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE, }, + edge: { + enabled: raw.edge?.enabled ?? true, + voice: raw.edge?.voice?.trim() || DEFAULT_EDGE_VOICE, + lang: raw.edge?.lang?.trim() || DEFAULT_EDGE_LANG, + outputFormat: edgeOutputFormat || DEFAULT_EDGE_OUTPUT_FORMAT, + outputFormatConfigured: Boolean(edgeOutputFormat), + pitch: raw.edge?.pitch?.trim() || undefined, + rate: raw.edge?.rate?.trim() || undefined, + volume: raw.edge?.volume?.trim() || undefined, + saveSubtitles: raw.edge?.saveSubtitles ?? false, + proxy: raw.edge?.proxy?.trim() || undefined, + timeoutMs: raw.edge?.timeoutMs, + }, prefsPath: raw.prefsPath, maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH, timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS, @@ -302,7 +337,12 @@ export function setTtsEnabled(prefsPath: string, enabled: boolean): void { export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider { const prefs = readPrefs(prefsPath); - return prefs.tts?.provider ?? config.provider; + if (prefs.tts?.provider) return prefs.tts.provider; + if (config.providerSource === "config") return config.provider; + + if (resolveTtsApiKey(config, "openai")) return "openai"; + if (resolveTtsApiKey(config, "elevenlabs")) return "elevenlabs"; + return "edge"; } export function setTtsProvider(prefsPath: string, provider: TtsProvider): void { @@ -350,6 +390,10 @@ function resolveChannelId(channel: string | undefined): ChannelId | null { return channel ? normalizeChannelId(channel) : null; } +function resolveEdgeOutputFormat(config: ResolvedTtsConfig): string { + return config.edge.outputFormat; +} + export function resolveTtsApiKey( config: ResolvedTtsConfig, provider: TtsProvider, @@ -363,6 +407,17 @@ export function resolveTtsApiKey( return undefined; } +export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const; + +export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] { + return [primary, ...TTS_PROVIDERS.filter((provider) => provider !== primary)]; +} + +export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean { + if (provider === "edge") return config.edge.enabled; + return Boolean(resolveTtsApiKey(config, provider)); +} + function isValidVoiceId(voiceId: string): boolean { return /^[a-zA-Z0-9]{10,40}$/.test(voiceId); } @@ -459,7 +514,7 @@ function parseTtsDirectives( switch (key) { case "provider": if (!policy.allowProvider) break; - if (rawValue === "openai" || rawValue === "elevenlabs") { + if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") { overrides.provider = rawValue; } else { warnings.push(`unsupported provider "${rawValue}"`); @@ -893,6 +948,38 @@ async function openaiTTS(params: { } } +function inferEdgeExtension(outputFormat: string): string { + const normalized = outputFormat.toLowerCase(); + if (normalized.includes("webm")) return ".webm"; + if (normalized.includes("ogg")) return ".ogg"; + if (normalized.includes("opus")) return ".opus"; + if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) { + return ".wav"; + } + return ".mp3"; +} + +async function edgeTTS(params: { + text: string; + outputPath: string; + config: 
+  timeoutMs: number;
+}): Promise<void> {
+  const { text, outputPath, config, timeoutMs } = params;
+  const tts = new EdgeTTS({
+    voice: config.voice,
+    lang: config.lang,
+    outputFormat: config.outputFormat,
+    saveSubtitles: config.saveSubtitles,
+    proxy: config.proxy,
+    rate: config.rate,
+    pitch: config.pitch,
+    volume: config.volume,
+    timeout: config.timeoutMs ?? timeoutMs,
+  });
+  await tts.ttsPromise(text, outputPath);
+}
+
 export async function textToSpeech(params: {
   text: string;
   cfg: ClawdbotConfig;
@@ -915,19 +1002,87 @@ export async function textToSpeech(params: {
   const userProvider = getTtsProvider(config, prefsPath);
   const overrideProvider = params.overrides?.provider;
   const provider = overrideProvider ?? userProvider;
-  const providers: TtsProvider[] = [provider, provider === "openai" ? "elevenlabs" : "openai"];
+  const providers = resolveTtsProviderOrder(provider);
 
   let lastError: string | undefined;
 
   for (const provider of providers) {
-    const apiKey = resolveTtsApiKey(config, provider);
-    if (!apiKey) {
-      lastError = `No API key for ${provider}`;
-      continue;
-    }
-
     const providerStart = Date.now();
     try {
+      if (provider === "edge") {
+        if (!config.edge.enabled) {
+          lastError = "edge: disabled";
+          continue;
+        }
+
+        const tempDir = mkdtempSync(path.join(tmpdir(), "tts-"));
+        let edgeOutputFormat = resolveEdgeOutputFormat(config);
+        const fallbackEdgeOutputFormat =
+          edgeOutputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
+
+        const attemptEdgeTts = async (outputFormat: string) => {
+          const extension = inferEdgeExtension(outputFormat);
+          const audioPath = path.join(tempDir, `voice-${Date.now()}${extension}`);
+          await edgeTTS({
+            text: params.text,
+            outputPath: audioPath,
+            config: {
+              ...config.edge,
+              outputFormat,
+            },
+            timeoutMs: config.timeoutMs,
+          });
+          return { audioPath, outputFormat };
+        };
+
+        let edgeResult: { audioPath: string; outputFormat: string };
+        try {
+          edgeResult = await attemptEdgeTts(edgeOutputFormat);
+        } catch (err) {
+          if (fallbackEdgeOutputFormat && fallbackEdgeOutputFormat !== edgeOutputFormat) {
+            logVerbose(
+              `TTS: Edge output ${edgeOutputFormat} failed; retrying with ${fallbackEdgeOutputFormat}.`,
+            );
+            edgeOutputFormat = fallbackEdgeOutputFormat;
+            try {
+              edgeResult = await attemptEdgeTts(edgeOutputFormat);
+            } catch (fallbackErr) {
+              try {
+                rmSync(tempDir, { recursive: true, force: true });
+              } catch {
+                // ignore cleanup errors
+              }
+              throw fallbackErr;
+            }
+          } else {
+            try {
+              rmSync(tempDir, { recursive: true, force: true });
+            } catch {
+              // ignore cleanup errors
+            }
+            throw err;
+          }
+        }
+
+        scheduleCleanup(tempDir);
+        const voiceCompatible = isVoiceCompatibleAudio({ fileName: edgeResult.audioPath });
+
+        return {
+          success: true,
+          audioPath: edgeResult.audioPath,
+          latencyMs: Date.now() - providerStart,
+          provider,
+          outputFormat: edgeResult.outputFormat,
+          voiceCompatible,
+        };
+      }
+
+      const apiKey = resolveTtsApiKey(config, provider);
+      if (!apiKey) {
+        lastError = `No API key for ${provider}`;
+        continue;
+      }
+
       let audioBuffer: Buffer;
       if (provider === "elevenlabs") {
         const voiceIdOverride = params.overrides?.elevenlabs?.voiceId;
@@ -1120,4 +1275,5 @@ export const _test = {
   resolveModelOverridePolicy,
   summarizeText,
   resolveOutputFormat,
+  resolveEdgeOutputFormat,
 };