feat: add edge tts fallback provider

Peter Steinberger
2026-01-25 01:05:23 +00:00
parent 6a7a1d7085
commit fc0e303e05
11 changed files with 466 additions and 32 deletions

View File

@@ -8,6 +8,7 @@ Docs: https://docs.clawd.bot
- Ollama: provider discovery + docs. (#1606) Thanks @abhaymundhara. https://docs.clawd.bot/providers/ollama
### Changes
- TTS: add Edge TTS provider fallback, defaulting to keyless Edge with MP3 retry on format failures. (#1668) Thanks @steipete. https://docs.clawd.bot/tts
- Docs: expand FAQ (migration, scheduling, concurrency, model recommendations, OpenAI subscription auth, Pi sizing, hackable install, docs SSL workaround).
- Docs: add verbose installer troubleshooting guidance.
- Docs: update Fly.io guide notes.

View File

@@ -32,7 +32,7 @@ Status: ready for DMs + spaces via Google Chat API webhooks (HTTP only).
- Under **Connection settings**, select **HTTP endpoint URL**.
- Under **Triggers**, select **Use a common HTTP endpoint URL for all triggers** and set it to your gateway's public URL followed by `/googlechat`.
- *Tip: Run `clawdbot status` to find your gateway's public URL.*
-- Under **Visibility**, check **Make this Chat app available to specific people and groups in <Your Domain>**.
+- Under **Visibility**, check **Make this Chat app available to specific people and groups in &lt;Your Domain&gt;**.
- Enter your email address (e.g. `user@example.com`) in the text box.
- Click **Save** at the bottom.
6) **Enable the app status**:

View File

@@ -8,21 +8,37 @@ read_when:
# Text-to-speech (TTS)
-Clawdbot can convert outbound replies into audio using ElevenLabs or OpenAI.
+Clawdbot can convert outbound replies into audio using ElevenLabs, OpenAI, or Edge TTS.
It works anywhere Clawdbot can send audio; Telegram gets a round voice-note bubble.
## Supported services
- **ElevenLabs** (primary or fallback provider)
- **OpenAI** (primary or fallback provider; also used for summaries)
- **Edge TTS** (primary or fallback provider; uses `node-edge-tts`, default when no API keys)
-## Required keys
+### Edge TTS notes
-At least one of:
+Edge TTS uses Microsoft Edge's online neural TTS service via the `node-edge-tts`
+library. It's a hosted service (not local), uses Microsoft's endpoints, and does
+not require an API key. `node-edge-tts` exposes speech configuration options and
+output formats, but not all options are supported by the Edge service.
+Because Edge TTS is a public web service without a published SLA or quota, treat it
+as best-effort. If you need guaranteed limits and support, use OpenAI or ElevenLabs.
+Microsoft's Speech REST API documents a 10-minute audio limit per request; Edge TTS
+does not publish limits, so assume similar or lower limits.
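For reference, a minimal sketch of driving `node-edge-tts` directly, using the same options Clawdbot passes through (the voice, rate/pitch, timeout, and output path below are illustrative, not required values):
```ts
import { EdgeTTS } from "node-edge-tts";

// Options mirror what Clawdbot forwards: voice, lang, outputFormat,
// rate/pitch/volume, proxy, saveSubtitles, and a timeout in ms.
const tts = new EdgeTTS({
  voice: "en-US-MichelleNeural",
  lang: "en-US",
  outputFormat: "audio-24khz-48kbitrate-mono-mp3",
  rate: "+10%",
  pitch: "-5%",
  timeout: 30_000,
});

// Synthesizes the text and writes the audio to the given file path.
await tts.ttsPromise("Hello from Clawdbot", "/tmp/voice.mp3");
```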
+## Optional keys
+If you want OpenAI or ElevenLabs:
- `ELEVENLABS_API_KEY` (or `XI_API_KEY`)
- `OPENAI_API_KEY`
-If both are configured, the selected provider is used first and the other is a fallback.
+Edge TTS does **not** require an API key. If no API keys are found, Clawdbot defaults
+to Edge TTS (unless disabled via `messages.tts.edge.enabled=false`).
+If multiple providers are configured, the selected provider is used first and the others are fallback options.
Auto-summary uses the configured `summaryModel` (or `agents.defaults.model.primary`),
so that provider must also be authenticated if you enable summaries.
@@ -32,12 +48,17 @@ so that provider must also be authenticated if you enable summaries.
- [OpenAI Audio API reference](https://platform.openai.com/docs/api-reference/audio)
- [ElevenLabs Text to Speech](https://elevenlabs.io/docs/api-reference/text-to-speech)
- [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication)
- [node-edge-tts](https://github.com/SchneeHertz/node-edge-tts)
- [Microsoft Speech output formats](https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs)
## Is it enabled by default?
No. TTS is **disabled** by default. Enable it in config or with `/tts on`,
which writes a local preference override.
Edge TTS **is** enabled by default once TTS is on, and is used automatically
when no OpenAI or ElevenLabs API keys are available.
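For example, to switch on TTS and pin Edge from chat:
```
/tts on
/tts provider edge
```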
## Config
TTS config lives under `messages.tts` in `clawdbot.json`.
@@ -94,6 +115,41 @@ Full schema is in [Gateway configuration](/gateway/configuration).
}
```
### Edge TTS primary (no API key)
```json5
{
messages: {
tts: {
enabled: true,
provider: "edge",
edge: {
enabled: true,
voice: "en-US-MichelleNeural",
lang: "en-US",
outputFormat: "audio-24khz-48kbitrate-mono-mp3",
rate: "+10%",
pitch: "-5%"
}
}
}
}
```
### Disable Edge TTS
```json5
{
messages: {
tts: {
edge: {
enabled: false
}
}
}
}
```
### Custom limits + prefs path
```json5
@@ -131,7 +187,9 @@ Then run:
- `enabled`: master toggle (default `false`; local prefs can override).
- `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
- `provider`: `"elevenlabs"` or `"openai"` (fallback is automatic).
- `provider`: `"elevenlabs"`, `"openai"`, or `"edge"` (fallback is automatic).
- If `provider` is **unset**, Clawdbot prefers `openai` (if key), then `elevenlabs` (if key),
otherwise `edge`.
- `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`.
- Accepts `provider/model` or a configured model alias.
- `modelOverrides`: allow the model to emit TTS directives (on by default).
@@ -147,6 +205,15 @@ Then run:
- `elevenlabs.applyTextNormalization`: `auto|on|off`
- `elevenlabs.languageCode`: 2-letter ISO 639-1 (e.g. `en`, `de`)
- `elevenlabs.seed`: integer `0..4294967295` (best-effort determinism)
- `edge.enabled`: allow Edge TTS usage (default `true`; no API key).
- `edge.voice`: Edge neural voice name (e.g. `en-US-MichelleNeural`).
- `edge.lang`: language code (e.g. `en-US`).
- `edge.outputFormat`: Edge output format (e.g. `audio-24khz-48kbitrate-mono-mp3`).
- See Microsoft Speech output formats for valid values; not all formats are supported by Edge.
- `edge.rate` / `edge.pitch` / `edge.volume`: percent strings (e.g. `+10%`, `-5%`).
- `edge.saveSubtitles`: write JSON subtitles alongside the audio file.
- `edge.proxy`: proxy URL for Edge TTS requests.
- `edge.timeoutMs`: request timeout override (ms).
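A sketch combining the less common Edge knobs (values are illustrative; `timeoutMs` must be within the schema's 1000–120000 ms range):
```json5
{
  messages: {
    tts: {
      edge: {
        saveSubtitles: true, // write JSON subtitles next to the audio file
        proxy: "http://127.0.0.1:8080", // route Edge TTS requests through a proxy
        timeoutMs: 30000 // per-request timeout override (ms)
      }
    }
  }
}
```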
## Model-driven overrides (default on)
@@ -167,7 +234,7 @@ Here you go.
```
Available directive keys (when enabled):
-- `provider` (`openai` | `elevenlabs`)
+- `provider` (`openai` | `elevenlabs` | `edge`)
- `voice` (OpenAI voice) or `voiceId` (ElevenLabs)
- `model` (OpenAI TTS model or ElevenLabs model id)
- `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost`
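For instance, a reply that routes a single message through Edge (the same directive shape the tests in this commit exercise):
```
Here you go. [[tts:provider=edge]]
```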
@@ -225,8 +292,15 @@ These override `messages.tts.*` for that host.
- 48kHz / 64kbps is a good voice-note tradeoff and required for the round bubble.
- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
- 44.1kHz / 128kbps is the default balance for speech clarity.
- **Edge TTS**: uses `edge.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
- `node-edge-tts` accepts an `outputFormat`, but not all formats are available
from the Edge service.
- Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus).
- Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need
guaranteed Opus voice notes.
- If the configured Edge output format fails, Clawdbot retries with MP3.
-This is not configurable; Telegram expects Opus for voice-note UX.
+OpenAI/ElevenLabs formats are fixed; Telegram expects Opus for voice-note UX.
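As a sketch, requesting an Opus container from Edge; the format name is taken from Microsoft's output-format list, and whether the Edge endpoint accepts it is not guaranteed, which is exactly when the MP3 retry kicks in:
```json5
{
  messages: {
    tts: {
      provider: "edge",
      edge: {
        // If the Edge service rejects this format, Clawdbot retries
        // with the default audio-24khz-48kbitrate-mono-mp3.
        outputFormat: "ogg-24khz-16bit-mono-opus"
      }
    }
  }
}
```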
## Auto-TTS behavior

View File

@@ -185,6 +185,7 @@
"linkedom": "^0.18.12",
"long": "5.3.2",
"markdown-it": "^14.1.0",
"node-edge-tts": "^1.2.9",
"osc-progress": "^0.3.0",
"pdfjs-dist": "^5.4.530",
"playwright-core": "1.58.0",

pnpm-lock.yaml (generated)
View File

@@ -127,6 +127,9 @@ importers:
markdown-it:
specifier: ^14.1.0
version: 14.1.0
node-edge-tts:
specifier: ^1.2.9
version: 1.2.9
osc-progress:
specifier: ^0.3.0
version: 0.3.0
@@ -4298,6 +4301,10 @@ packages:
engines: {node: '>=14.18'}
hasBin: true
node-edge-tts@1.2.9:
resolution: {integrity: sha512-fvfW1dUgJdZAdTniC6MzLTMwnNUFKGKaUdRJ1OsveOYlfnPUETBU973CG89565txvbBowCQ4Czdeu3qSX8bNOg==}
hasBin: true
node-fetch@2.7.0:
resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==}
engines: {node: 4.x || >=6.0.0}
@@ -10202,6 +10209,16 @@ snapshots:
node-downloader-helper@2.1.10: {}
node-edge-tts@1.2.9:
dependencies:
https-proxy-agent: 7.0.6
ws: 8.19.0
yargs: 17.7.2
transitivePeerDependencies:
- bufferutil
- supports-color
- utf-8-validate
node-fetch@2.7.0:
dependencies:
whatwg-url: 5.0.0

View File

@@ -7,9 +7,11 @@ import {
getTtsProvider,
isSummarizationEnabled,
isTtsEnabled,
isTtsProviderConfigured,
resolveTtsApiKey,
resolveTtsConfig,
resolveTtsPrefsPath,
resolveTtsProviderOrder,
setLastTtsAttempt,
setSummarizationEnabled,
setTtsEnabled,
@@ -41,6 +43,7 @@ function ttsUsage(): ReplyPayload {
"\nExamples:\n" +
"/tts on\n" +
"/tts provider openai\n" +
"/tts provider edge\n" +
"/tts limit 2000\n" +
"/tts summary off\n" +
"/tts audio Hello from Clawdbot",
@@ -126,33 +129,45 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
if (action === "provider") {
const currentProvider = getTtsProvider(config, prefsPath);
if (!args.trim()) {
-const fallback = currentProvider === "openai" ? "elevenlabs" : "openai";
+const fallback = resolveTtsProviderOrder(currentProvider)
+.slice(1)
+.filter((provider) => isTtsProviderConfigured(config, provider));
const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai"));
const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs"));
const hasEdge = isTtsProviderConfigured(config, "edge");
return {
shouldContinue: false,
reply: {
text:
`🎙️ TTS provider\n` +
`Primary: ${currentProvider}\n` +
-`Fallback: ${fallback}\n` +
+`Fallbacks: ${fallback.join(", ") || "none"}\n` +
`OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` +
`ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` +
-`Usage: /tts provider openai | elevenlabs`,
+`Edge enabled: ${hasEdge ? "✅" : "❌"}\n` +
+`Usage: /tts provider openai | elevenlabs | edge`,
},
};
}
const requested = args.trim().toLowerCase();
if (requested !== "openai" && requested !== "elevenlabs") {
if (requested !== "openai" && requested !== "elevenlabs" && requested !== "edge") {
return { shouldContinue: false, reply: ttsUsage() };
}
setTtsProvider(prefsPath, requested);
-const fallback = requested === "openai" ? "elevenlabs" : "openai";
+const fallback = resolveTtsProviderOrder(requested)
+.slice(1)
+.filter((provider) => isTtsProviderConfigured(config, provider));
return {
shouldContinue: false,
-reply: { text: `✅ TTS provider set to ${requested} (fallback: ${fallback}).` },
+reply: {
+text:
+`✅ TTS provider set to ${requested} (fallbacks: ${fallback.join(", ") || "none"}).` +
+(requested === "edge"
+? "\nEnable Edge TTS in config: messages.tts.edge.enabled = true."
+: ""),
+},
};
}
@@ -199,14 +214,22 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
if (action === "status") {
const enabled = isTtsEnabled(config, prefsPath);
const provider = getTtsProvider(config, prefsPath);
-const hasKey = Boolean(resolveTtsApiKey(config, provider));
+const hasKey = isTtsProviderConfigured(config, provider);
const providerStatus =
provider === "edge"
? hasKey
? "✅ enabled"
: "❌ disabled"
: hasKey
? "✅ key"
: "❌ no key";
const maxLength = getTtsMaxLength(prefsPath);
const summarize = isSummarizationEnabled(prefsPath);
const last = getLastTtsAttempt();
const lines = [
"📊 TTS status",
`State: ${enabled ? "✅ enabled" : "❌ disabled"}`,
-`Provider: ${provider} (${hasKey ? "✅ key" : "❌ no key"})`,
+`Provider: ${provider} (${providerStatus})`,
`Text limit: ${maxLength} chars`,
`Auto-summary: ${summarize ? "on" : "off"}`,
];

View File

@@ -1,4 +1,4 @@
-export type TtsProvider = "elevenlabs" | "openai";
+export type TtsProvider = "elevenlabs" | "openai" | "edge";
export type TtsMode = "final" | "all";
@@ -55,6 +55,20 @@ export type TtsConfig = {
model?: string;
voice?: string;
};
/** Microsoft Edge (node-edge-tts) configuration. */
edge?: {
/** Explicitly allow Edge TTS usage (no API key required). */
enabled?: boolean;
voice?: string;
lang?: string;
outputFormat?: string;
pitch?: string;
rate?: string;
volume?: string;
saveSubtitles?: boolean;
proxy?: string;
timeoutMs?: number;
};
/** Optional path for local TTS user preferences JSON. */
prefsPath?: string;
/** Hard cap for text sent to TTS (chars). */

View File

@@ -156,7 +156,7 @@ export const MarkdownConfigSchema = z
.strict()
.optional();
-export const TtsProviderSchema = z.enum(["elevenlabs", "openai"]);
+export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]);
export const TtsModeSchema = z.enum(["final", "all"]);
export const TtsConfigSchema = z
.object({
@@ -207,6 +207,21 @@ export const TtsConfigSchema = z
})
.strict()
.optional(),
edge: z
.object({
enabled: z.boolean().optional(),
voice: z.string().optional(),
lang: z.string().optional(),
outputFormat: z.string().optional(),
pitch: z.string().optional(),
rate: z.string().optional(),
volume: z.string().optional(),
saveSubtitles: z.boolean().optional(),
proxy: z.string().optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),
})
.strict()
.optional(),
prefsPath: z.string().optional(),
maxTextLength: z.number().int().min(1).optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),

View File

@@ -4,9 +4,11 @@ import {
OPENAI_TTS_VOICES,
getTtsProvider,
isTtsEnabled,
isTtsProviderConfigured,
resolveTtsApiKey,
resolveTtsConfig,
resolveTtsPrefsPath,
resolveTtsProviderOrder,
setTtsEnabled,
setTtsProvider,
textToSpeech,
@@ -22,13 +24,18 @@ export const ttsHandlers: GatewayRequestHandlers = {
const config = resolveTtsConfig(cfg);
const prefsPath = resolveTtsPrefsPath(config);
const provider = getTtsProvider(config, prefsPath);
const fallbackProviders = resolveTtsProviderOrder(provider)
.slice(1)
.filter((candidate) => isTtsProviderConfigured(config, candidate));
respond(true, {
enabled: isTtsEnabled(config, prefsPath),
provider,
-fallbackProvider: provider === "openai" ? "elevenlabs" : "openai",
+fallbackProvider: fallbackProviders[0] ?? null,
+fallbackProviders,
prefsPath,
hasOpenAIKey: Boolean(resolveTtsApiKey(config, "openai")),
hasElevenLabsKey: Boolean(resolveTtsApiKey(config, "elevenlabs")),
edgeEnabled: isTtsProviderConfigured(config, "edge"),
});
} catch (err) {
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
@@ -90,11 +97,14 @@ export const ttsHandlers: GatewayRequestHandlers = {
},
"tts.setProvider": async ({ params, respond }) => {
const provider = typeof params.provider === "string" ? params.provider.trim() : "";
if (provider !== "openai" && provider !== "elevenlabs") {
if (provider !== "openai" && provider !== "elevenlabs" && provider !== "edge") {
respond(
false,
undefined,
-errorShape(ErrorCodes.INVALID_REQUEST, "Invalid provider. Use openai or elevenlabs."),
+errorShape(
+ErrorCodes.INVALID_REQUEST,
+"Invalid provider. Use openai, elevenlabs, or edge.",
+),
);
return;
}
@@ -128,6 +138,12 @@ export const ttsHandlers: GatewayRequestHandlers = {
configured: Boolean(resolveTtsApiKey(config, "elevenlabs")),
models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"],
},
{
id: "edge",
name: "Edge TTS",
configured: isTtsProviderConfigured(config, "edge"),
models: [],
},
],
active: getTtsProvider(config, prefsPath),
});

View File

@@ -4,7 +4,7 @@ import { completeSimple } from "@mariozechner/pi-ai";
import { getApiKeyForModel } from "../agents/model-auth.js";
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
-import { _test, resolveTtsConfig } from "./tts.js";
+import { _test, getTtsProvider, resolveTtsConfig } from "./tts.js";
vi.mock("@mariozechner/pi-ai", () => ({
completeSimple: vi.fn(),
@@ -47,6 +47,7 @@ const {
resolveModelOverridePolicy,
summarizeText,
resolveOutputFormat,
resolveEdgeOutputFormat,
} = _test;
describe("tts", () => {
@@ -149,6 +150,30 @@ describe("tts", () => {
});
});
describe("resolveEdgeOutputFormat", () => {
const baseCfg = {
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
messages: { tts: {} },
};
it("uses default output format when edge output format is not configured", () => {
const config = resolveTtsConfig(baseCfg);
expect(resolveEdgeOutputFormat(config)).toBe("audio-24khz-48kbitrate-mono-mp3");
});
it("uses configured output format when provided", () => {
const config = resolveTtsConfig({
...baseCfg,
messages: {
tts: {
edge: { outputFormat: "audio-24khz-96kbitrate-mono-mp3" },
},
},
});
expect(resolveEdgeOutputFormat(config)).toBe("audio-24khz-96kbitrate-mono-mp3");
});
});
describe("parseTtsDirectives", () => {
it("extracts overrides and strips directives when enabled", () => {
const policy = resolveModelOverridePolicy({ enabled: true });
@@ -165,6 +190,14 @@ describe("tts", () => {
expect(result.overrides.elevenlabs?.voiceSettings?.speed).toBe(1.1);
});
it("accepts edge as provider override", () => {
const policy = resolveModelOverridePolicy({ enabled: true });
const input = "Hello [[tts:provider=edge]] world";
const result = parseTtsDirectives(input, policy);
expect(result.overrides.provider).toBe("edge");
});
it("keeps text intact when overrides are disabled", () => {
const policy = resolveModelOverridePolicy({ enabled: false });
const input = "Hello [[tts:voice=alloy]] world";
@@ -314,4 +347,88 @@ describe("tts", () => {
).rejects.toThrow("No summary returned");
});
});
describe("getTtsProvider", () => {
const baseCfg = {
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
messages: { tts: {} },
};
const restoreEnv = (snapshot: Record<string, string | undefined>) => {
const keys = ["OPENAI_API_KEY", "ELEVENLABS_API_KEY", "XI_API_KEY"] as const;
for (const key of keys) {
const value = snapshot[key];
if (value === undefined) {
delete process.env[key];
} else {
process.env[key] = value;
}
}
};
const withEnv = (env: Record<string, string | undefined>, run: () => void) => {
const snapshot = {
OPENAI_API_KEY: process.env.OPENAI_API_KEY,
ELEVENLABS_API_KEY: process.env.ELEVENLABS_API_KEY,
XI_API_KEY: process.env.XI_API_KEY,
};
try {
for (const [key, value] of Object.entries(env)) {
if (value === undefined) {
delete process.env[key];
} else {
process.env[key] = value;
}
}
run();
} finally {
restoreEnv(snapshot);
}
};
it("prefers OpenAI when no provider is configured and API key exists", () => {
withEnv(
{
OPENAI_API_KEY: "test-openai-key",
ELEVENLABS_API_KEY: undefined,
XI_API_KEY: undefined,
},
() => {
const config = resolveTtsConfig(baseCfg);
const provider = getTtsProvider(config, "/tmp/tts-prefs-openai.json");
expect(provider).toBe("openai");
},
);
});
it("prefers ElevenLabs when OpenAI is missing and ElevenLabs key exists", () => {
withEnv(
{
OPENAI_API_KEY: undefined,
ELEVENLABS_API_KEY: "test-elevenlabs-key",
XI_API_KEY: undefined,
},
() => {
const config = resolveTtsConfig(baseCfg);
const provider = getTtsProvider(config, "/tmp/tts-prefs-elevenlabs.json");
expect(provider).toBe("elevenlabs");
},
);
});
it("falls back to Edge when no API keys are present", () => {
withEnv(
{
OPENAI_API_KEY: undefined,
ELEVENLABS_API_KEY: undefined,
XI_API_KEY: undefined,
},
() => {
const config = resolveTtsConfig(baseCfg);
const provider = getTtsProvider(config, "/tmp/tts-prefs-edge.json");
expect(provider).toBe("edge");
},
);
});
});
});

View File

@@ -12,6 +12,7 @@ import { tmpdir } from "node:os";
import path from "node:path";
import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
import { EdgeTTS } from "node-edge-tts";
import type { ReplyPayload } from "../auto-reply/types.js";
import { normalizeChannelId } from "../channels/plugins/index.js";
@@ -24,6 +25,7 @@ import type {
TtsModelOverrideConfig,
} from "../config/types.tts.js";
import { logVerbose } from "../globals.js";
import { isVoiceCompatibleAudio } from "../media/audio.js";
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
import {
@@ -45,6 +47,9 @@ const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts";
const DEFAULT_OPENAI_VOICE = "alloy";
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
const DEFAULT_EDGE_LANG = "en-US";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
stability: 0.5,
@@ -74,6 +79,7 @@ export type ResolvedTtsConfig = {
enabled: boolean;
mode: TtsMode;
provider: TtsProvider;
providerSource: "config" | "default";
summaryModel?: string;
modelOverrides: ResolvedTtsModelOverrides;
elevenlabs: {
@@ -97,6 +103,19 @@ export type ResolvedTtsConfig = {
model: string;
voice: string;
};
edge: {
enabled: boolean;
voice: string;
lang: string;
outputFormat: string;
outputFormatConfigured: boolean;
pitch?: string;
rate?: string;
volume?: string;
saveSubtitles: boolean;
proxy?: string;
timeoutMs?: number;
};
prefsPath?: string;
maxTextLength: number;
timeoutMs: number;
@@ -199,10 +218,13 @@ function resolveModelOverridePolicy(
export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
const raw: TtsConfig = cfg.messages?.tts ?? {};
const providerSource = raw.provider ? "config" : "default";
const edgeOutputFormat = raw.edge?.outputFormat?.trim();
return {
enabled: raw.enabled ?? false,
mode: raw.mode ?? "final",
provider: raw.provider ?? "elevenlabs",
provider: raw.provider ?? "edge",
providerSource,
summaryModel: raw.summaryModel?.trim() || undefined,
modelOverrides: resolveModelOverridePolicy(raw.modelOverrides),
elevenlabs: {
@@ -231,6 +253,19 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
},
edge: {
enabled: raw.edge?.enabled ?? true,
voice: raw.edge?.voice?.trim() || DEFAULT_EDGE_VOICE,
lang: raw.edge?.lang?.trim() || DEFAULT_EDGE_LANG,
outputFormat: edgeOutputFormat || DEFAULT_EDGE_OUTPUT_FORMAT,
outputFormatConfigured: Boolean(edgeOutputFormat),
pitch: raw.edge?.pitch?.trim() || undefined,
rate: raw.edge?.rate?.trim() || undefined,
volume: raw.edge?.volume?.trim() || undefined,
saveSubtitles: raw.edge?.saveSubtitles ?? false,
proxy: raw.edge?.proxy?.trim() || undefined,
timeoutMs: raw.edge?.timeoutMs,
},
prefsPath: raw.prefsPath,
maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
@@ -302,7 +337,12 @@ export function setTtsEnabled(prefsPath: string, enabled: boolean): void {
export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider {
const prefs = readPrefs(prefsPath);
-return prefs.tts?.provider ?? config.provider;
+if (prefs.tts?.provider) return prefs.tts.provider;
+if (config.providerSource === "config") return config.provider;
+if (resolveTtsApiKey(config, "openai")) return "openai";
+if (resolveTtsApiKey(config, "elevenlabs")) return "elevenlabs";
+return "edge";
}
export function setTtsProvider(prefsPath: string, provider: TtsProvider): void {
@@ -350,6 +390,10 @@ function resolveChannelId(channel: string | undefined): ChannelId | null {
return channel ? normalizeChannelId(channel) : null;
}
function resolveEdgeOutputFormat(config: ResolvedTtsConfig): string {
return config.edge.outputFormat;
}
export function resolveTtsApiKey(
config: ResolvedTtsConfig,
provider: TtsProvider,
@@ -363,6 +407,17 @@ export function resolveTtsApiKey(
return undefined;
}
export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const;
export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
return [primary, ...TTS_PROVIDERS.filter((provider) => provider !== primary)];
}
export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean {
if (provider === "edge") return config.edge.enabled;
return Boolean(resolveTtsApiKey(config, provider));
}
function isValidVoiceId(voiceId: string): boolean {
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
}
@@ -459,7 +514,7 @@ function parseTtsDirectives(
switch (key) {
case "provider":
if (!policy.allowProvider) break;
if (rawValue === "openai" || rawValue === "elevenlabs") {
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
overrides.provider = rawValue;
} else {
warnings.push(`unsupported provider "${rawValue}"`);
@@ -893,6 +948,38 @@ async function openaiTTS(params: {
}
}
function inferEdgeExtension(outputFormat: string): string {
const normalized = outputFormat.toLowerCase();
if (normalized.includes("webm")) return ".webm";
if (normalized.includes("ogg")) return ".ogg";
if (normalized.includes("opus")) return ".opus";
if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) {
return ".wav";
}
return ".mp3";
}
async function edgeTTS(params: {
text: string;
outputPath: string;
config: ResolvedTtsConfig["edge"];
timeoutMs: number;
}): Promise<void> {
const { text, outputPath, config, timeoutMs } = params;
const tts = new EdgeTTS({
voice: config.voice,
lang: config.lang,
outputFormat: config.outputFormat,
saveSubtitles: config.saveSubtitles,
proxy: config.proxy,
rate: config.rate,
pitch: config.pitch,
volume: config.volume,
timeout: config.timeoutMs ?? timeoutMs,
});
await tts.ttsPromise(text, outputPath);
}
export async function textToSpeech(params: {
text: string;
cfg: ClawdbotConfig;
@@ -915,19 +1002,87 @@ export async function textToSpeech(params: {
const userProvider = getTtsProvider(config, prefsPath);
const overrideProvider = params.overrides?.provider;
const provider = overrideProvider ?? userProvider;
-const providers: TtsProvider[] = [provider, provider === "openai" ? "elevenlabs" : "openai"];
+const providers = resolveTtsProviderOrder(provider);
let lastError: string | undefined;
for (const provider of providers) {
-const apiKey = resolveTtsApiKey(config, provider);
-if (!apiKey) {
-lastError = `No API key for ${provider}`;
-continue;
-}
const providerStart = Date.now();
try {
if (provider === "edge") {
if (!config.edge.enabled) {
lastError = "edge: disabled";
continue;
}
const tempDir = mkdtempSync(path.join(tmpdir(), "tts-"));
let edgeOutputFormat = resolveEdgeOutputFormat(config);
const fallbackEdgeOutputFormat =
edgeOutputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
const attemptEdgeTts = async (outputFormat: string) => {
const extension = inferEdgeExtension(outputFormat);
const audioPath = path.join(tempDir, `voice-${Date.now()}${extension}`);
await edgeTTS({
text: params.text,
outputPath: audioPath,
config: {
...config.edge,
outputFormat,
},
timeoutMs: config.timeoutMs,
});
return { audioPath, outputFormat };
};
let edgeResult: { audioPath: string; outputFormat: string };
try {
edgeResult = await attemptEdgeTts(edgeOutputFormat);
} catch (err) {
if (fallbackEdgeOutputFormat && fallbackEdgeOutputFormat !== edgeOutputFormat) {
logVerbose(
`TTS: Edge output ${edgeOutputFormat} failed; retrying with ${fallbackEdgeOutputFormat}.`,
);
edgeOutputFormat = fallbackEdgeOutputFormat;
try {
edgeResult = await attemptEdgeTts(edgeOutputFormat);
} catch (fallbackErr) {
try {
rmSync(tempDir, { recursive: true, force: true });
} catch {
// ignore cleanup errors
}
throw fallbackErr;
}
} else {
try {
rmSync(tempDir, { recursive: true, force: true });
} catch {
// ignore cleanup errors
}
throw err;
}
}
scheduleCleanup(tempDir);
const voiceCompatible = isVoiceCompatibleAudio({ fileName: edgeResult.audioPath });
return {
success: true,
audioPath: edgeResult.audioPath,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: edgeResult.outputFormat,
voiceCompatible,
};
}
const apiKey = resolveTtsApiKey(config, provider);
if (!apiKey) {
lastError = `No API key for ${provider}`;
continue;
}
let audioBuffer: Buffer;
if (provider === "elevenlabs") {
const voiceIdOverride = params.overrides?.elevenlabs?.voiceId;
@@ -1120,4 +1275,5 @@ export const _test = {
resolveModelOverridePolicy,
summarizeText,
resolveOutputFormat,
resolveEdgeOutputFormat,
};