feat: default TTS model overrides on (#1559) (thanks @Glucksberg)
Co-authored-by: Glucksberg <80581902+Glucksberg@users.noreply.github.com>
This commit is contained in:
@@ -273,80 +273,26 @@ function buildChatCommands(): ChatCommandDefinition[] {
|
||||
argsMenu: "auto",
|
||||
}),
|
||||
defineChatCommand({
|
||||
key: "audio",
|
||||
nativeName: "audio",
|
||||
description: "Convert text to a TTS audio reply.",
|
||||
textAlias: "/audio",
|
||||
key: "tts",
|
||||
nativeName: "tts",
|
||||
description: "Control text-to-speech (TTS).",
|
||||
textAlias: "/tts",
|
||||
args: [
|
||||
{
|
||||
name: "text",
|
||||
description: "Text to speak",
|
||||
name: "action",
|
||||
description: "on | off | status | provider | limit | summary | audio | help",
|
||||
type: "string",
|
||||
choices: ["on", "off", "status", "provider", "limit", "summary", "audio", "help"],
|
||||
},
|
||||
{
|
||||
name: "value",
|
||||
description: "Provider, limit, or text",
|
||||
type: "string",
|
||||
captureRemaining: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
defineChatCommand({
|
||||
key: "tts_on",
|
||||
nativeName: "tts_on",
|
||||
description: "Enable text-to-speech for replies.",
|
||||
textAlias: "/tts_on",
|
||||
}),
|
||||
defineChatCommand({
|
||||
key: "tts_off",
|
||||
nativeName: "tts_off",
|
||||
description: "Disable text-to-speech for replies.",
|
||||
textAlias: "/tts_off",
|
||||
}),
|
||||
defineChatCommand({
|
||||
key: "tts_provider",
|
||||
nativeName: "tts_provider",
|
||||
description: "Set or show the TTS provider.",
|
||||
textAlias: "/tts_provider",
|
||||
args: [
|
||||
{
|
||||
name: "provider",
|
||||
description: "openai or elevenlabs",
|
||||
type: "string",
|
||||
choices: ["openai", "elevenlabs"],
|
||||
},
|
||||
],
|
||||
argsMenu: "auto",
|
||||
}),
|
||||
defineChatCommand({
|
||||
key: "tts_limit",
|
||||
nativeName: "tts_limit",
|
||||
description: "Set or show the max TTS text length.",
|
||||
textAlias: "/tts_limit",
|
||||
args: [
|
||||
{
|
||||
name: "maxLength",
|
||||
description: "Max chars before summarizing",
|
||||
type: "number",
|
||||
},
|
||||
],
|
||||
}),
|
||||
defineChatCommand({
|
||||
key: "tts_summary",
|
||||
nativeName: "tts_summary",
|
||||
description: "Enable or disable TTS auto-summary.",
|
||||
textAlias: "/tts_summary",
|
||||
args: [
|
||||
{
|
||||
name: "mode",
|
||||
description: "on or off",
|
||||
type: "string",
|
||||
choices: ["on", "off"],
|
||||
},
|
||||
],
|
||||
argsMenu: "auto",
|
||||
}),
|
||||
defineChatCommand({
|
||||
key: "tts_status",
|
||||
nativeName: "tts_status",
|
||||
description: "Show TTS status and last attempt.",
|
||||
textAlias: "/tts_status",
|
||||
}),
|
||||
defineChatCommand({
|
||||
key: "stop",
|
||||
nativeName: "stop",
|
||||
|
||||
@@ -18,22 +18,39 @@ import {
|
||||
textToSpeech,
|
||||
} from "../../tts/tts.js";
|
||||
|
||||
function parseCommandArg(normalized: string, command: string): string | null {
|
||||
if (normalized === command) return "";
|
||||
if (normalized.startsWith(`${command} `)) return normalized.slice(command.length).trim();
|
||||
return null;
|
||||
/** Result of parsing a `/tts` control command. */
type ParsedTtsCommand = {
  /** Lower-cased sub-command, e.g. "on", "status" or "audio". */
  action: string;
  /** Text following the action, trimmed at both ends; "" when absent. */
  args: string;
};

/**
 * Parse a normalized command body into a `/tts` action and its argument text.
 *
 * @param normalized - Command body; must be exactly `/tts` or start with `/tts `.
 * @returns The parsed action/args pair, or `null` when the body is not a `/tts`
 *   command. A bare `/tts` (or `/tts` followed only by whitespace) maps to the
 *   `status` action.
 */
function parseTtsCommand(normalized: string): ParsedTtsCommand | null {
  // Accept `/tts` and `/tts <action> [args]` as a single control surface.
  if (normalized === "/tts") return { action: "status", args: "" };
  if (!normalized.startsWith("/tts ")) return null;

  const rest = normalized.slice(5).trim();
  if (!rest) return { action: "status", args: "" };

  // Split only at the FIRST whitespace run. Splitting on every run and
  // re-joining with single spaces would collapse newlines and repeated spaces
  // inside the text passed to `/tts audio`, changing what gets spoken.
  const boundary = rest.search(/\s/);
  if (boundary === -1) return { action: rest.toLowerCase(), args: "" };

  return {
    action: rest.slice(0, boundary).toLowerCase(),
    args: rest.slice(boundary).trim(),
  };
}
|
||||
|
||||
/**
 * Build the usage reply for the `/tts` command.
 *
 * The text lives in one place so that help output and validation failures
 * always show the same wording.
 *
 * @returns A reply payload whose `text` is the usage line followed by examples.
 */
function ttsUsage(): ReplyPayload {
  // One entry per output line; joined with "\n" below.
  const usageLines = [
    "⚙️ Usage: /tts <on|off|status|provider|limit|summary|audio> [value]",
    "Examples:",
    "/tts on",
    "/tts provider openai",
    "/tts limit 2000",
    "/tts summary off",
    "/tts audio Hello from Clawdbot",
  ];
  return { text: usageLines.join("\n") };
}
|
||||
|
||||
export const handleTtsCommands: CommandHandler = async (params, allowTextCommands) => {
|
||||
if (!allowTextCommands) return null;
|
||||
const normalized = params.command.commandBodyNormalized;
|
||||
if (
|
||||
!normalized.startsWith("/tts_") &&
|
||||
normalized !== "/audio" &&
|
||||
!normalized.startsWith("/audio ")
|
||||
) {
|
||||
return null;
|
||||
}
|
||||
const parsed = parseTtsCommand(params.command.commandBodyNormalized);
|
||||
if (!parsed) return null;
|
||||
|
||||
if (!params.command.isAuthorizedSender) {
|
||||
logVerbose(
|
||||
@@ -44,36 +61,42 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
|
||||
const config = resolveTtsConfig(params.cfg);
|
||||
const prefsPath = resolveTtsPrefsPath(config);
|
||||
const action = parsed.action;
|
||||
const args = parsed.args;
|
||||
|
||||
if (normalized === "/tts_on") {
|
||||
if (action === "help") {
|
||||
return { shouldContinue: false, reply: ttsUsage() };
|
||||
}
|
||||
|
||||
if (action === "on") {
|
||||
setTtsEnabled(prefsPath, true);
|
||||
return { shouldContinue: false, reply: { text: "🔊 TTS enabled." } };
|
||||
}
|
||||
|
||||
if (normalized === "/tts_off") {
|
||||
if (action === "off") {
|
||||
setTtsEnabled(prefsPath, false);
|
||||
return { shouldContinue: false, reply: { text: "🔇 TTS disabled." } };
|
||||
}
|
||||
|
||||
const audioArg = parseCommandArg(normalized, "/audio");
|
||||
if (audioArg !== null) {
|
||||
if (!audioArg.trim()) {
|
||||
return { shouldContinue: false, reply: { text: "⚙️ Usage: /audio <text>" } };
|
||||
if (action === "audio") {
|
||||
if (!args.trim()) {
|
||||
return { shouldContinue: false, reply: ttsUsage() };
|
||||
}
|
||||
|
||||
const start = Date.now();
|
||||
const result = await textToSpeech({
|
||||
text: audioArg,
|
||||
text: args,
|
||||
cfg: params.cfg,
|
||||
channel: params.command.channel,
|
||||
prefsPath,
|
||||
});
|
||||
|
||||
if (result.success && result.audioPath) {
|
||||
// Store last attempt for `/tts status`.
|
||||
setLastTtsAttempt({
|
||||
timestamp: Date.now(),
|
||||
success: true,
|
||||
textLength: audioArg.length,
|
||||
textLength: args.length,
|
||||
summarized: false,
|
||||
provider: result.provider,
|
||||
latencyMs: result.latencyMs,
|
||||
@@ -85,10 +108,11 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
return { shouldContinue: false, reply: payload };
|
||||
}
|
||||
|
||||
// Store failure details for `/tts status`.
|
||||
setLastTtsAttempt({
|
||||
timestamp: Date.now(),
|
||||
success: false,
|
||||
textLength: audioArg.length,
|
||||
textLength: args.length,
|
||||
summarized: false,
|
||||
error: result.error,
|
||||
latencyMs: Date.now() - start,
|
||||
@@ -99,10 +123,9 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
};
|
||||
}
|
||||
|
||||
const providerArg = parseCommandArg(normalized, "/tts_provider");
|
||||
if (providerArg !== null) {
|
||||
if (action === "provider") {
|
||||
const currentProvider = getTtsProvider(config, prefsPath);
|
||||
if (!providerArg.trim()) {
|
||||
if (!args.trim()) {
|
||||
const fallback = currentProvider === "openai" ? "elevenlabs" : "openai";
|
||||
const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai"));
|
||||
const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs"));
|
||||
@@ -115,17 +138,14 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
`Fallback: ${fallback}\n` +
|
||||
`OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` +
|
||||
`ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` +
|
||||
`Usage: /tts_provider openai | elevenlabs`,
|
||||
`Usage: /tts provider openai | elevenlabs`,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
const requested = providerArg.trim().toLowerCase();
|
||||
const requested = args.trim().toLowerCase();
|
||||
if (requested !== "openai" && requested !== "elevenlabs") {
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: { text: "⚙️ Usage: /tts_provider openai | elevenlabs" },
|
||||
};
|
||||
return { shouldContinue: false, reply: ttsUsage() };
|
||||
}
|
||||
|
||||
setTtsProvider(prefsPath, requested);
|
||||
@@ -136,21 +156,17 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
};
|
||||
}
|
||||
|
||||
const limitArg = parseCommandArg(normalized, "/tts_limit");
|
||||
if (limitArg !== null) {
|
||||
if (!limitArg.trim()) {
|
||||
if (action === "limit") {
|
||||
if (!args.trim()) {
|
||||
const currentLimit = getTtsMaxLength(prefsPath);
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: { text: `📏 TTS limit: ${currentLimit} characters.` },
|
||||
};
|
||||
}
|
||||
const next = Number.parseInt(limitArg.trim(), 10);
|
||||
const next = Number.parseInt(args.trim(), 10);
|
||||
if (!Number.isFinite(next) || next < 100 || next > 10_000) {
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: { text: "⚙️ Usage: /tts_limit <100-10000>" },
|
||||
};
|
||||
return { shouldContinue: false, reply: ttsUsage() };
|
||||
}
|
||||
setTtsMaxLength(prefsPath, next);
|
||||
return {
|
||||
@@ -159,18 +175,17 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
};
|
||||
}
|
||||
|
||||
const summaryArg = parseCommandArg(normalized, "/tts_summary");
|
||||
if (summaryArg !== null) {
|
||||
if (!summaryArg.trim()) {
|
||||
if (action === "summary") {
|
||||
if (!args.trim()) {
|
||||
const enabled = isSummarizationEnabled(prefsPath);
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: { text: `📝 TTS auto-summary: ${enabled ? "on" : "off"}.` },
|
||||
};
|
||||
}
|
||||
const requested = summaryArg.trim().toLowerCase();
|
||||
const requested = args.trim().toLowerCase();
|
||||
if (requested !== "on" && requested !== "off") {
|
||||
return { shouldContinue: false, reply: { text: "⚙️ Usage: /tts_summary on|off" } };
|
||||
return { shouldContinue: false, reply: ttsUsage() };
|
||||
}
|
||||
setSummarizationEnabled(prefsPath, requested === "on");
|
||||
return {
|
||||
@@ -181,7 +196,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
};
|
||||
}
|
||||
|
||||
if (normalized === "/tts_status") {
|
||||
if (action === "status") {
|
||||
const enabled = isTtsEnabled(config, prefsPath);
|
||||
const provider = getTtsProvider(config, prefsPath);
|
||||
const hasKey = Boolean(resolveTtsApiKey(config, provider));
|
||||
@@ -210,5 +225,5 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
return { shouldContinue: false, reply: { text: lines.join("\n") } };
|
||||
}
|
||||
|
||||
return null;
|
||||
return { shouldContinue: false, reply: ttsUsage() };
|
||||
};
|
||||
|
||||
@@ -2,6 +2,25 @@ export type TtsProvider = "elevenlabs" | "openai";
|
||||
|
||||
export type TtsMode = "final" | "all";
|
||||
|
||||
/**
 * User-facing switches controlling which TTS parameters the model may override.
 *
 * Every field is optional. `resolveModelOverridePolicy` treats an omitted flag
 * as `true`, and forces every `allow*` flag to `false` when `enabled` is `false`.
 */
export type TtsModelOverrideConfig = {
  /** Master switch: enable model-provided overrides for TTS. */
  enabled?: boolean;
  /** Permit model-provided TTS text blocks. */
  allowText?: boolean;
  /** Permit a model-provided provider override. */
  allowProvider?: boolean;
  /** Permit a model-provided voice / voiceId override. */
  allowVoice?: boolean;
  /** Permit a model-provided modelId override. */
  allowModelId?: boolean;
  /** Permit a model-provided voice settings override. */
  allowVoiceSettings?: boolean;
  /** Permit model-provided normalization or language overrides. */
  allowNormalization?: boolean;
  /** Permit a model-provided seed override. */
  allowSeed?: boolean;
};
|
||||
|
||||
export type TtsConfig = {
|
||||
/** Enable auto-TTS (can be overridden by local prefs). */
|
||||
enabled?: boolean;
|
||||
@@ -9,11 +28,26 @@ export type TtsConfig = {
|
||||
mode?: TtsMode;
|
||||
/** Primary TTS provider (fallbacks are automatic). */
|
||||
provider?: TtsProvider;
|
||||
/** Optional model override for TTS auto-summary (provider/model or alias). */
|
||||
summaryModel?: string;
|
||||
/** Allow the model to override TTS parameters. */
|
||||
modelOverrides?: TtsModelOverrideConfig;
|
||||
/** ElevenLabs configuration. */
|
||||
elevenlabs?: {
|
||||
apiKey?: string;
|
||||
baseUrl?: string;
|
||||
voiceId?: string;
|
||||
modelId?: string;
|
||||
seed?: number;
|
||||
applyTextNormalization?: "auto" | "on" | "off";
|
||||
languageCode?: string;
|
||||
voiceSettings?: {
|
||||
stability?: number;
|
||||
similarityBoost?: number;
|
||||
style?: number;
|
||||
useSpeakerBoost?: boolean;
|
||||
speed?: number;
|
||||
};
|
||||
};
|
||||
/** OpenAI configuration. */
|
||||
openai?: {
|
||||
|
||||
@@ -162,11 +162,39 @@ export const TtsConfigSchema = z
|
||||
enabled: z.boolean().optional(),
|
||||
mode: TtsModeSchema.optional(),
|
||||
provider: TtsProviderSchema.optional(),
|
||||
summaryModel: z.string().optional(),
|
||||
modelOverrides: z
|
||||
.object({
|
||||
enabled: z.boolean().optional(),
|
||||
allowText: z.boolean().optional(),
|
||||
allowProvider: z.boolean().optional(),
|
||||
allowVoice: z.boolean().optional(),
|
||||
allowModelId: z.boolean().optional(),
|
||||
allowVoiceSettings: z.boolean().optional(),
|
||||
allowNormalization: z.boolean().optional(),
|
||||
allowSeed: z.boolean().optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
elevenlabs: z
|
||||
.object({
|
||||
apiKey: z.string().optional(),
|
||||
baseUrl: z.string().optional(),
|
||||
voiceId: z.string().optional(),
|
||||
modelId: z.string().optional(),
|
||||
seed: z.number().int().min(0).max(4294967295).optional(),
|
||||
applyTextNormalization: z.enum(["auto", "on", "off"]).optional(),
|
||||
languageCode: z.string().optional(),
|
||||
voiceSettings: z
|
||||
.object({
|
||||
stability: z.number().min(0).max(1).optional(),
|
||||
similarityBoost: z.number().min(0).max(1).optional(),
|
||||
style: z.number().min(0).max(1).optional(),
|
||||
useSpeakerBoost: z.boolean().optional(),
|
||||
speed: z.number().min(0.5).max(2).optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
|
||||
@@ -170,7 +170,7 @@ export type PluginCommandHandler = (
|
||||
* Definition for a plugin-registered command.
|
||||
*/
|
||||
export type ClawdbotPluginCommandDefinition = {
|
||||
/** Command name without leading slash (e.g., "tts_on") */
|
||||
/** Command name without leading slash (e.g., "tts") */
|
||||
name: string;
|
||||
/** Description shown in /help and command menus */
|
||||
description: string;
|
||||
|
||||
@@ -1,6 +1,41 @@
|
||||
import { describe, expect, it, vi, beforeEach, afterEach } from "vitest";
|
||||
import { describe, expect, it, vi, beforeEach } from "vitest";
|
||||
|
||||
import { _test } from "./tts.js";
|
||||
import { completeSimple } from "@mariozechner/pi-ai";
|
||||
|
||||
import { getApiKeyForModel } from "../agents/model-auth.js";
|
||||
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
|
||||
import { _test, resolveTtsConfig } from "./tts.js";
|
||||
|
||||
vi.mock("@mariozechner/pi-ai", () => ({
|
||||
completeSimple: vi.fn(),
|
||||
}));
|
||||
|
||||
vi.mock("../agents/pi-embedded-runner/model.js", () => ({
|
||||
resolveModel: vi.fn((provider: string, modelId: string) => ({
|
||||
model: {
|
||||
provider,
|
||||
id: modelId,
|
||||
name: modelId,
|
||||
api: "openai-completions",
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 128000,
|
||||
maxTokens: 8192,
|
||||
},
|
||||
authStorage: { profiles: {} },
|
||||
modelRegistry: { find: vi.fn() },
|
||||
})),
|
||||
}));
|
||||
|
||||
vi.mock("../agents/model-auth.js", () => ({
|
||||
getApiKeyForModel: vi.fn(async () => ({
|
||||
apiKey: "test-api-key",
|
||||
source: "test",
|
||||
mode: "api-key",
|
||||
})),
|
||||
requireApiKey: vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? ""),
|
||||
}));
|
||||
|
||||
const {
|
||||
isValidVoiceId,
|
||||
@@ -8,11 +43,20 @@ const {
|
||||
isValidOpenAIModel,
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
parseTtsDirectives,
|
||||
resolveModelOverridePolicy,
|
||||
summarizeText,
|
||||
resolveOutputFormat,
|
||||
} = _test;
|
||||
|
||||
describe("tts", () => {
|
||||
beforeEach(() => {
|
||||
vi.clearAllMocks();
|
||||
vi.mocked(completeSimple).mockResolvedValue({
|
||||
content: [{ type: "text", text: "Summary" }],
|
||||
});
|
||||
});
|
||||
|
||||
describe("isValidVoiceId", () => {
|
||||
it("accepts valid ElevenLabs voice IDs", () => {
|
||||
expect(isValidVoiceId("pMsXgVXv3BLzUgSXRplE")).toBe(true);
|
||||
@@ -105,130 +149,169 @@ describe("tts", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseTtsDirectives", () => {
|
||||
it("extracts overrides and strips directives when enabled", () => {
|
||||
const policy = resolveModelOverridePolicy({ enabled: true });
|
||||
const input =
|
||||
"Hello [[tts:provider=elevenlabs voiceId=pMsXgVXv3BLzUgSXRplE stability=0.4 speed=1.1]] world\n\n" +
|
||||
"[[tts:text]](laughs) Read the song once more.[[/tts:text]]";
|
||||
const result = parseTtsDirectives(input, policy);
|
||||
|
||||
expect(result.cleanedText).not.toContain("[[tts:");
|
||||
expect(result.ttsText).toBe("(laughs) Read the song once more.");
|
||||
expect(result.overrides.provider).toBe("elevenlabs");
|
||||
expect(result.overrides.elevenlabs?.voiceId).toBe("pMsXgVXv3BLzUgSXRplE");
|
||||
expect(result.overrides.elevenlabs?.voiceSettings?.stability).toBe(0.4);
|
||||
expect(result.overrides.elevenlabs?.voiceSettings?.speed).toBe(1.1);
|
||||
});
|
||||
|
||||
it("keeps text intact when overrides are disabled", () => {
|
||||
const policy = resolveModelOverridePolicy({ enabled: false });
|
||||
const input = "Hello [[tts:voice=alloy]] world";
|
||||
const result = parseTtsDirectives(input, policy);
|
||||
|
||||
expect(result.cleanedText).toBe(input);
|
||||
expect(result.overrides.provider).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
describe("summarizeText", () => {
|
||||
const mockApiKey = "test-api-key";
|
||||
const originalFetch = globalThis.fetch;
|
||||
|
||||
beforeEach(() => {
|
||||
vi.useFakeTimers({ shouldAdvanceTime: true });
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
globalThis.fetch = originalFetch;
|
||||
vi.useRealTimers();
|
||||
});
|
||||
const baseCfg = {
|
||||
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
|
||||
messages: { tts: {} },
|
||||
};
|
||||
const baseConfig = resolveTtsConfig(baseCfg);
|
||||
|
||||
it("summarizes text and returns result with metrics", async () => {
|
||||
const mockSummary = "This is a summarized version of the text.";
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
choices: [{ message: { content: mockSummary } }],
|
||||
}),
|
||||
vi.mocked(completeSimple).mockResolvedValue({
|
||||
content: [{ type: "text", text: mockSummary }],
|
||||
});
|
||||
|
||||
const longText = "A".repeat(2000);
|
||||
const result = await summarizeText(longText, 1500, mockApiKey, 30_000);
|
||||
const result = await summarizeText({
|
||||
text: longText,
|
||||
targetLength: 1500,
|
||||
cfg: baseCfg,
|
||||
config: baseConfig,
|
||||
timeoutMs: 30_000,
|
||||
});
|
||||
|
||||
expect(result.summary).toBe(mockSummary);
|
||||
expect(result.inputLength).toBe(2000);
|
||||
expect(result.outputLength).toBe(mockSummary.length);
|
||||
expect(result.latencyMs).toBeGreaterThanOrEqual(0);
|
||||
expect(globalThis.fetch).toHaveBeenCalledTimes(1);
|
||||
expect(completeSimple).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("calls OpenAI API with correct parameters", async () => {
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
choices: [{ message: { content: "Summary" } }],
|
||||
}),
|
||||
it("calls the summary model with the expected parameters", async () => {
|
||||
await summarizeText({
|
||||
text: "Long text to summarize",
|
||||
targetLength: 500,
|
||||
cfg: baseCfg,
|
||||
config: baseConfig,
|
||||
timeoutMs: 30_000,
|
||||
});
|
||||
|
||||
await summarizeText("Long text to summarize", 500, mockApiKey, 30_000);
|
||||
const callArgs = vi.mocked(completeSimple).mock.calls[0];
|
||||
expect(callArgs?.[1]?.messages?.[0]?.role).toBe("user");
|
||||
expect(callArgs?.[2]?.maxTokens).toBe(250);
|
||||
expect(callArgs?.[2]?.temperature).toBe(0.3);
|
||||
expect(getApiKeyForModel).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
expect(globalThis.fetch).toHaveBeenCalledWith(
|
||||
"https://api.openai.com/v1/chat/completions",
|
||||
expect.objectContaining({
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${mockApiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
}),
|
||||
);
|
||||
it("uses summaryModel override when configured", async () => {
|
||||
const cfg = {
|
||||
agents: { defaults: { model: { primary: "anthropic/claude-opus-4-5" } } },
|
||||
messages: { tts: { summaryModel: "openai/gpt-4.1-mini" } },
|
||||
};
|
||||
const config = resolveTtsConfig(cfg);
|
||||
await summarizeText({
|
||||
text: "Long text to summarize",
|
||||
targetLength: 500,
|
||||
cfg,
|
||||
config,
|
||||
timeoutMs: 30_000,
|
||||
});
|
||||
|
||||
const callArgs = (globalThis.fetch as ReturnType<typeof vi.fn>).mock.calls[0];
|
||||
const body = JSON.parse(callArgs[1].body);
|
||||
expect(body.model).toBe("gpt-4o-mini");
|
||||
expect(body.temperature).toBe(0.3);
|
||||
expect(body.max_tokens).toBe(250);
|
||||
expect(resolveModel).toHaveBeenCalledWith("openai", "gpt-4.1-mini", undefined, cfg);
|
||||
});
|
||||
|
||||
it("rejects targetLength below minimum (100)", async () => {
|
||||
await expect(summarizeText("text", 99, mockApiKey, 30_000)).rejects.toThrow(
|
||||
"Invalid targetLength: 99",
|
||||
);
|
||||
await expect(
|
||||
summarizeText({
|
||||
text: "text",
|
||||
targetLength: 99,
|
||||
cfg: baseCfg,
|
||||
config: baseConfig,
|
||||
timeoutMs: 30_000,
|
||||
}),
|
||||
).rejects.toThrow("Invalid targetLength: 99");
|
||||
});
|
||||
|
||||
it("rejects targetLength above maximum (10000)", async () => {
|
||||
await expect(summarizeText("text", 10001, mockApiKey, 30_000)).rejects.toThrow(
|
||||
"Invalid targetLength: 10001",
|
||||
);
|
||||
await expect(
|
||||
summarizeText({
|
||||
text: "text",
|
||||
targetLength: 10001,
|
||||
cfg: baseCfg,
|
||||
config: baseConfig,
|
||||
timeoutMs: 30_000,
|
||||
}),
|
||||
).rejects.toThrow("Invalid targetLength: 10001");
|
||||
});
|
||||
|
||||
it("accepts targetLength at boundaries", async () => {
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
choices: [{ message: { content: "Summary" } }],
|
||||
}),
|
||||
});
|
||||
|
||||
await expect(summarizeText("text", 100, mockApiKey, 30_000)).resolves.toBeDefined();
|
||||
await expect(summarizeText("text", 10000, mockApiKey, 30_000)).resolves.toBeDefined();
|
||||
});
|
||||
|
||||
it("throws error when API returns non-ok response", async () => {
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
ok: false,
|
||||
status: 500,
|
||||
});
|
||||
|
||||
await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow(
|
||||
"Summarization service unavailable",
|
||||
);
|
||||
await expect(
|
||||
summarizeText({
|
||||
text: "text",
|
||||
targetLength: 100,
|
||||
cfg: baseCfg,
|
||||
config: baseConfig,
|
||||
timeoutMs: 30_000,
|
||||
}),
|
||||
).resolves.toBeDefined();
|
||||
await expect(
|
||||
summarizeText({
|
||||
text: "text",
|
||||
targetLength: 10000,
|
||||
cfg: baseCfg,
|
||||
config: baseConfig,
|
||||
timeoutMs: 30_000,
|
||||
}),
|
||||
).resolves.toBeDefined();
|
||||
});
|
||||
|
||||
it("throws error when no summary is returned", async () => {
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
choices: [],
|
||||
}),
|
||||
vi.mocked(completeSimple).mockResolvedValue({
|
||||
content: [],
|
||||
});
|
||||
|
||||
await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow(
|
||||
"No summary returned",
|
||||
);
|
||||
await expect(
|
||||
summarizeText({
|
||||
text: "text",
|
||||
targetLength: 500,
|
||||
cfg: baseCfg,
|
||||
config: baseConfig,
|
||||
timeoutMs: 30_000,
|
||||
}),
|
||||
).rejects.toThrow("No summary returned");
|
||||
});
|
||||
|
||||
it("throws error when summary content is empty", async () => {
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
choices: [{ message: { content: " " } }],
|
||||
}),
|
||||
vi.mocked(completeSimple).mockResolvedValue({
|
||||
content: [{ type: "text", text: " " }],
|
||||
});
|
||||
|
||||
await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow(
|
||||
"No summary returned",
|
||||
);
|
||||
await expect(
|
||||
summarizeText({
|
||||
text: "text",
|
||||
targetLength: 500,
|
||||
cfg: baseCfg,
|
||||
config: baseConfig,
|
||||
timeoutMs: 30_000,
|
||||
}),
|
||||
).rejects.toThrow("No summary returned");
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
638
src/tts/tts.ts
638
src/tts/tts.ts
@@ -11,13 +11,28 @@ import {
|
||||
import { tmpdir } from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
|
||||
|
||||
import type { ReplyPayload } from "../auto-reply/types.js";
|
||||
import { normalizeChannelId } from "../channels/plugins/index.js";
|
||||
import type { ChannelId } from "../channels/plugins/types.js";
|
||||
import type { ClawdbotConfig } from "../config/config.js";
|
||||
import type { TtsConfig, TtsMode, TtsProvider } from "../config/types.tts.js";
|
||||
import type {
|
||||
TtsConfig,
|
||||
TtsMode,
|
||||
TtsProvider,
|
||||
TtsModelOverrideConfig,
|
||||
} from "../config/types.tts.js";
|
||||
import { logVerbose } from "../globals.js";
|
||||
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
|
||||
import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
|
||||
import {
|
||||
buildModelAliasIndex,
|
||||
resolveDefaultModelForAgent,
|
||||
resolveModelRefFromString,
|
||||
type ModelRef,
|
||||
} from "../agents/model-selection.js";
|
||||
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
|
||||
|
||||
const DEFAULT_TIMEOUT_MS = 30_000;
|
||||
const DEFAULT_TTS_MAX_LENGTH = 1500;
|
||||
@@ -25,11 +40,20 @@ const DEFAULT_TTS_SUMMARIZE = true;
|
||||
const DEFAULT_MAX_TEXT_LENGTH = 4000;
|
||||
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
|
||||
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
|
||||
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
|
||||
const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts";
|
||||
const DEFAULT_OPENAI_VOICE = "alloy";
|
||||
|
||||
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
|
||||
stability: 0.5,
|
||||
similarityBoost: 0.75,
|
||||
style: 0.0,
|
||||
useSpeakerBoost: true,
|
||||
speed: 1.0,
|
||||
};
|
||||
|
||||
const TELEGRAM_OUTPUT = {
|
||||
openai: "opus" as const,
|
||||
// ElevenLabs output formats use codec_sample_rate_bitrate naming.
|
||||
@@ -50,10 +74,23 @@ export type ResolvedTtsConfig = {
|
||||
enabled: boolean;
|
||||
mode: TtsMode;
|
||||
provider: TtsProvider;
|
||||
summaryModel?: string;
|
||||
modelOverrides: ResolvedTtsModelOverrides;
|
||||
elevenlabs: {
|
||||
apiKey?: string;
|
||||
baseUrl: string;
|
||||
voiceId: string;
|
||||
modelId: string;
|
||||
seed?: number;
|
||||
applyTextNormalization?: "auto" | "on" | "off";
|
||||
languageCode?: string;
|
||||
voiceSettings: {
|
||||
stability: number;
|
||||
similarityBoost: number;
|
||||
style: number;
|
||||
useSpeakerBoost: boolean;
|
||||
speed: number;
|
||||
};
|
||||
};
|
||||
openai: {
|
||||
apiKey?: string;
|
||||
@@ -74,6 +111,41 @@ type TtsUserPrefs = {
|
||||
};
|
||||
};
|
||||
|
||||
/**
 * Fully resolved model-override policy: every flag is a concrete boolean.
 *
 * Produced by `resolveModelOverridePolicy`. When `enabled` is `false`, that
 * function sets every `allow*` flag to `false` as well.
 */
type ResolvedTtsModelOverrides = {
  /** Master switch for model-provided overrides. */
  enabled: boolean;
  /** Model-provided TTS text blocks are permitted. */
  allowText: boolean;
  /** Model-provided provider override is permitted. */
  allowProvider: boolean;
  /** Model-provided voice / voiceId override is permitted. */
  allowVoice: boolean;
  /** Model-provided modelId override is permitted. */
  allowModelId: boolean;
  /** Model-provided voice settings override is permitted. */
  allowVoiceSettings: boolean;
  /** Model-provided normalization or language overrides are permitted. */
  allowNormalization: boolean;
  /** Model-provided seed override is permitted. */
  allowSeed: boolean;
};
|
||||
|
||||
/**
 * TTS parameter overrides collected from `[[tts:...]]` directives in reply text.
 *
 * All fields are optional; a field is only present when a directive supplied it.
 */
type TtsDirectiveOverrides = {
  // NOTE(review): presumably mirrors the `[[tts:text]]...[[/tts:text]]` block
  // also exposed on the parse result — confirm against parseTtsDirectives.
  ttsText?: string;
  /** Provider requested by the directive (e.g. `provider=elevenlabs`). */
  provider?: TtsProvider;
  /** OpenAI-specific overrides. */
  openai?: {
    voice?: string;
    model?: string;
  };
  /** ElevenLabs-specific overrides. */
  elevenlabs?: {
    voiceId?: string;
    modelId?: string;
    seed?: number;
    applyTextNormalization?: "auto" | "on" | "off";
    languageCode?: string;
    /** Any subset of the resolved ElevenLabs voice settings (e.g. stability, speed). */
    voiceSettings?: Partial<ResolvedTtsConfig["elevenlabs"]["voiceSettings"]>;
  };
};
|
||||
|
||||
/** Outcome of scanning reply text for `[[tts:...]]` directives. */
type TtsDirectiveParseResult = {
  /** Reply text after directive handling (directives are stripped when overrides are enabled). */
  cleanedText: string;
  /** Contents of a `[[tts:text]]...[[/tts:text]]` block, when one was provided. */
  ttsText?: string;
  /** Parameter overrides extracted from the directives. */
  overrides: TtsDirectiveOverrides;
  // NOTE(review): presumably human-readable notes about rejected or invalid
  // directives — confirm against parseTtsDirectives.
  warnings: string[];
};
|
||||
|
||||
export type TtsResult = {
|
||||
success: boolean;
|
||||
audioPath?: string;
|
||||
@@ -96,16 +168,63 @@ type TtsStatusEntry = {
|
||||
|
||||
let lastTtsAttempt: TtsStatusEntry | undefined;
|
||||
|
||||
/**
 * Turn the optional model-override config into a fully populated policy.
 *
 * @param overrides - Raw `modelOverrides` config section; may be `undefined`.
 * @returns A policy where every flag is a concrete boolean. The master switch
 *   defaults to `true`; when it is `false` every `allow*` flag is `false`
 *   regardless of its configured value. Otherwise each omitted `allow*` flag
 *   defaults to `true`.
 */
function resolveModelOverridePolicy(
  overrides: TtsModelOverrideConfig | undefined,
): ResolvedTtsModelOverrides {
  const masterEnabled = overrides?.enabled ?? true;
  // A flag is granted only while the master switch is on; unset flags default to granted.
  const permit = (flag: boolean | undefined): boolean => masterEnabled && (flag ?? true);

  return {
    enabled: masterEnabled,
    allowText: permit(overrides?.allowText),
    allowProvider: permit(overrides?.allowProvider),
    allowVoice: permit(overrides?.allowVoice),
    allowModelId: permit(overrides?.allowModelId),
    allowVoiceSettings: permit(overrides?.allowVoiceSettings),
    allowNormalization: permit(overrides?.allowNormalization),
    allowSeed: permit(overrides?.allowSeed),
  };
}
|
||||
|
||||
export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
|
||||
const raw: TtsConfig = cfg.messages?.tts ?? {};
|
||||
return {
|
||||
enabled: raw.enabled ?? false,
|
||||
mode: raw.mode ?? "final",
|
||||
provider: raw.provider ?? "elevenlabs",
|
||||
summaryModel: raw.summaryModel?.trim() || undefined,
|
||||
modelOverrides: resolveModelOverridePolicy(raw.modelOverrides),
|
||||
elevenlabs: {
|
||||
apiKey: raw.elevenlabs?.apiKey,
|
||||
baseUrl: raw.elevenlabs?.baseUrl?.trim() || DEFAULT_ELEVENLABS_BASE_URL,
|
||||
voiceId: raw.elevenlabs?.voiceId ?? DEFAULT_ELEVENLABS_VOICE_ID,
|
||||
modelId: raw.elevenlabs?.modelId ?? DEFAULT_ELEVENLABS_MODEL_ID,
|
||||
seed: raw.elevenlabs?.seed,
|
||||
applyTextNormalization: raw.elevenlabs?.applyTextNormalization,
|
||||
languageCode: raw.elevenlabs?.languageCode,
|
||||
voiceSettings: {
|
||||
stability:
|
||||
raw.elevenlabs?.voiceSettings?.stability ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.stability,
|
||||
similarityBoost:
|
||||
raw.elevenlabs?.voiceSettings?.similarityBoost ??
|
||||
DEFAULT_ELEVENLABS_VOICE_SETTINGS.similarityBoost,
|
||||
style: raw.elevenlabs?.voiceSettings?.style ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.style,
|
||||
useSpeakerBoost:
|
||||
raw.elevenlabs?.voiceSettings?.useSpeakerBoost ??
|
||||
DEFAULT_ELEVENLABS_VOICE_SETTINGS.useSpeakerBoost,
|
||||
speed: raw.elevenlabs?.voiceSettings?.speed ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.speed,
|
||||
},
|
||||
},
|
||||
openai: {
|
||||
apiKey: raw.openai?.apiKey,
|
||||
@@ -235,6 +354,261 @@ function isValidVoiceId(voiceId: string): boolean {
|
||||
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
|
||||
}
|
||||
|
||||
/**
 * Canonicalizes an ElevenLabs API base URL.
 *
 * Surrounding whitespace is dropped. A blank value yields
 * `DEFAULT_ELEVENLABS_BASE_URL`; any other value is returned with all of its
 * trailing slashes removed.
 *
 * @param baseUrl - Base URL as configured by the user.
 * @returns The base URL without trailing slashes, or the default when blank.
 */
function normalizeElevenLabsBaseUrl(baseUrl: string): string {
  const candidate = baseUrl.trim();
  return candidate.length === 0 ? DEFAULT_ELEVENLABS_BASE_URL : candidate.replace(/\/+$/, "");
}
|
||||
|
||||
/**
 * Guards a numeric setting against out-of-range input.
 *
 * @param value - Number to validate.
 * @param min - Inclusive lower bound.
 * @param max - Inclusive upper bound.
 * @param label - Setting name used in the error message.
 * @throws Error `"<label> must be between <min> and <max>"` when `value` is
 *   not finite (NaN, ±Infinity) or lies outside `[min, max]`.
 */
function requireInRange(value: number, min: number, max: number, label: string): void {
  const withinBounds = Number.isFinite(value) && value >= min && value <= max;
  if (withinBounds) return;
  throw new Error(`${label} must be between ${min} and ${max}`);
}
|
||||
|
||||
/**
 * Validates the numeric ElevenLabs voice settings, throwing on the first one
 * that is out of range.
 *
 * Checked in order: `stability`, `similarityBoost` and `style` must be within
 * [0, 1]; `speed` must be within [0.5, 2]. `useSpeakerBoost` is not checked.
 *
 * @param settings - Resolved voice settings to validate.
 * @throws Error from `requireInRange` naming the offending setting.
 */
function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) {
  const checks: ReadonlyArray<readonly [number, number, number, string]> = [
    [settings.stability, 0, 1, "stability"],
    [settings.similarityBoost, 0, 1, "similarityBoost"],
    [settings.style, 0, 1, "style"],
    [settings.speed, 0.5, 2, "speed"],
  ];
  for (const [value, min, max, label] of checks) {
    requireInRange(value, min, max, label);
  }
}
|
||||
|
||||
/**
 * Normalizes an optional language code to a lower-case two-letter form.
 *
 * @param code - Raw language code; may be missing or blank.
 * @returns The trimmed, lower-cased code, or `undefined` when nothing was given.
 * @throws Error when the value is not exactly two ASCII letters.
 */
function normalizeLanguageCode(code?: string): string | undefined {
  const candidate = (code ?? "").trim().toLowerCase();
  if (candidate === "") return undefined;
  if (/^[a-z]{2}$/.test(candidate)) return candidate;
  throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
}
|
||||
|
||||
/**
 * Normalizes the optional text-normalization mode.
 *
 * @param mode - Raw mode string; matched case-insensitively after trimming.
 * @returns `"auto"`, `"on"` or `"off"`, or `undefined` when missing or blank.
 * @throws Error for any other non-blank value.
 */
function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
  const candidate = (mode ?? "").trim().toLowerCase();
  switch (candidate) {
    case "":
      return undefined;
    case "auto":
    case "on":
    case "off":
      return candidate;
    default:
      throw new Error("applyTextNormalization must be one of: auto, on, off");
  }
}
|
||||
|
||||
/**
 * Normalizes an optional seed to an integer in the unsigned 32-bit range.
 *
 * Fractional seeds are rounded down before the range check.
 *
 * @param seed - Raw seed, if any.
 * @returns The floored seed, or `undefined` when no seed was supplied.
 * @throws Error when the floored seed is not finite or is outside
 *   [0, 4294967295].
 */
function normalizeSeed(seed?: number): number | undefined {
  if (seed == null) return undefined;
  const floored = Math.floor(seed);
  const acceptable = Number.isFinite(floored) && floored >= 0 && floored <= 4_294_967_295;
  if (!acceptable) {
    throw new Error("seed must be between 0 and 4294967295");
  }
  return floored;
}
|
||||
|
||||
/**
 * Interprets a human-friendly boolean token (case-insensitive, trimmed).
 *
 * @param value - Raw token.
 * @returns `true` for `true`/`1`/`yes`/`on`, `false` for `false`/`0`/`no`/`off`,
 *   and `undefined` for anything else.
 */
function parseBooleanValue(value: string): boolean | undefined {
  switch (value.trim().toLowerCase()) {
    case "true":
    case "1":
    case "yes":
    case "on":
      return true;
    case "false":
    case "0":
    case "no":
    case "off":
      return false;
    default:
      return undefined;
  }
}
|
||||
|
||||
/**
 * Parses a directive value as a finite decimal number.
 *
 * The whole (trimmed) token must be a decimal literal, optionally signed and
 * optionally with an exponent. `Number.parseFloat` is deliberately not used on
 * its own: it stops at the first invalid character, so a typo such as
 * `0.5abc` would be accepted as `0.5` without any warning.
 *
 * @param value - Raw token.
 * @returns The parsed number, or `undefined` when the token is not a finite
 *   decimal number.
 */
function parseNumberValue(value: string): number | undefined {
  const trimmed = value.trim();
  // Explicit grammar keeps out forms `Number()` would otherwise accept
  // (empty string -> 0, hex/binary/octal prefixes, "Infinity").
  if (!/^[+-]?(?:\d+\.?\d*|\.\d+)(?:e[+-]?\d+)?$/i.test(trimmed)) return undefined;
  const parsed = Number(trimmed);
  // Overflowing literals such as "1e999" still evaluate to Infinity.
  return Number.isFinite(parsed) ? parsed : undefined;
}
|
||||
|
||||
/**
 * Extracts inline TTS directives from reply text.
 *
 * Two forms are recognized (case-insensitively) and removed from the text:
 *
 * - `[[tts:text]]…[[/tts:text]]` — alternative text to speak. Only the first
 *   block is kept, and only when `policy.allowText` is set; every block is
 *   stripped from the visible text regardless.
 * - `[[tts:key=value key=value …]]` — whitespace-separated overrides. Tokens
 *   without `=` or with an empty key/value, and unknown keys, are ignored.
 *   A key whose policy flag is off is ignored without a warning. A value that
 *   fails validation is dropped and reported in `warnings`.
 *
 * Model keys are routed by name: `openai_model`/`openaimodel` only ever set
 * the OpenAI model (an unknown model produces a warning),
 * `elevenlabs_model`/`elevenlabsmodel` only ever set the ElevenLabs model id,
 * and the generic `model`/`modelid`/`model_id` keys go to OpenAI when the
 * value is a known OpenAI model and to ElevenLabs otherwise.
 *
 * @param text - Reply text that may contain directives.
 * @param policy - Which override categories are permitted. When
 *   `policy.enabled` is false the text is returned untouched.
 * @returns The text with directives removed, the collected overrides (with
 *   `ttsText` mirrored at the top level) and any validation warnings.
 */
function parseTtsDirectives(
  text: string,
  policy: ResolvedTtsModelOverrides,
): TtsDirectiveParseResult {
  if (!policy.enabled) {
    return { cleanedText: text, overrides: {}, warnings: [] };
  }

  type VoiceSettingsPatch = NonNullable<
    NonNullable<TtsDirectiveOverrides["elevenlabs"]>["voiceSettings"]
  >;

  const overrides: TtsDirectiveOverrides = {};
  const warnings: string[] = [];

  // Merges a partial voice-settings patch into the ElevenLabs overrides.
  const mergeVoiceSettings = (patch: VoiceSettingsPatch): void => {
    overrides.elevenlabs = {
      ...overrides.elevenlabs,
      voiceSettings: { ...overrides.elevenlabs?.voiceSettings, ...patch },
    };
  };

  // Parses a ranged numeric setting. An unparsable value is recorded as a
  // warning; an out-of-range value throws (the caller turns that into a warning).
  const readNumber = (
    raw: string,
    min: number,
    max: number,
    label: string,
  ): number | undefined => {
    const value = parseNumberValue(raw);
    if (value == null) {
      warnings.push(`invalid ${label} value`);
      return undefined;
    }
    requireInRange(value, min, max, label);
    return value;
  };

  // Applies one `key=value` pair; may throw on invalid values.
  const applyDirective = (key: string, rawValue: string): void => {
    switch (key) {
      case "provider": {
        if (!policy.allowProvider) break;
        // Keys are case-insensitive, so accept the provider name in any case too.
        const provider = rawValue.toLowerCase();
        if (provider === "openai" || provider === "elevenlabs") {
          overrides.provider = provider;
        } else {
          warnings.push(`unsupported provider "${rawValue}"`);
        }
        break;
      }
      case "voice":
      case "openai_voice":
      case "openaivoice":
        if (!policy.allowVoice) break;
        if (isValidOpenAIVoice(rawValue)) {
          overrides.openai = { ...overrides.openai, voice: rawValue };
        } else {
          warnings.push(`invalid OpenAI voice "${rawValue}"`);
        }
        break;
      case "voiceid":
      case "voice_id":
      case "elevenlabs_voice":
      case "elevenlabsvoice":
        if (!policy.allowVoice) break;
        if (isValidVoiceId(rawValue)) {
          overrides.elevenlabs = { ...overrides.elevenlabs, voiceId: rawValue };
        } else {
          warnings.push(`invalid ElevenLabs voiceId "${rawValue}"`);
        }
        break;
      case "openai_model":
      case "openaimodel":
        if (!policy.allowModelId) break;
        // An explicitly OpenAI-scoped key must never leak into the ElevenLabs model id.
        if (isValidOpenAIModel(rawValue)) {
          overrides.openai = { ...overrides.openai, model: rawValue };
        } else {
          warnings.push(`invalid OpenAI model "${rawValue}"`);
        }
        break;
      case "elevenlabs_model":
      case "elevenlabsmodel":
        if (!policy.allowModelId) break;
        overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue };
        break;
      case "model":
      case "modelid":
      case "model_id":
        if (!policy.allowModelId) break;
        // Provider-agnostic key: known OpenAI models go to OpenAI, anything else to ElevenLabs.
        if (isValidOpenAIModel(rawValue)) {
          overrides.openai = { ...overrides.openai, model: rawValue };
        } else {
          overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue };
        }
        break;
      case "stability": {
        if (!policy.allowVoiceSettings) break;
        const value = readNumber(rawValue, 0, 1, "stability");
        if (value != null) mergeVoiceSettings({ stability: value });
        break;
      }
      case "similarity":
      case "similarityboost":
      case "similarity_boost": {
        if (!policy.allowVoiceSettings) break;
        const value = readNumber(rawValue, 0, 1, "similarityBoost");
        if (value != null) mergeVoiceSettings({ similarityBoost: value });
        break;
      }
      case "style": {
        if (!policy.allowVoiceSettings) break;
        const value = readNumber(rawValue, 0, 1, "style");
        if (value != null) mergeVoiceSettings({ style: value });
        break;
      }
      case "speed": {
        if (!policy.allowVoiceSettings) break;
        const value = readNumber(rawValue, 0.5, 2, "speed");
        if (value != null) mergeVoiceSettings({ speed: value });
        break;
      }
      case "speakerboost":
      case "speaker_boost":
      case "usespeakerboost":
      case "use_speaker_boost": {
        if (!policy.allowVoiceSettings) break;
        const value = parseBooleanValue(rawValue);
        if (value == null) {
          warnings.push("invalid useSpeakerBoost value");
          break;
        }
        mergeVoiceSettings({ useSpeakerBoost: value });
        break;
      }
      case "normalize":
      case "applytextnormalization":
      case "apply_text_normalization":
        if (!policy.allowNormalization) break;
        overrides.elevenlabs = {
          ...overrides.elevenlabs,
          applyTextNormalization: normalizeApplyTextNormalization(rawValue),
        };
        break;
      case "language":
      case "languagecode":
      case "language_code":
        // Language overrides are gated by the same flag as text normalization.
        if (!policy.allowNormalization) break;
        overrides.elevenlabs = {
          ...overrides.elevenlabs,
          languageCode: normalizeLanguageCode(rawValue),
        };
        break;
      case "seed":
        if (!policy.allowSeed) break;
        overrides.elevenlabs = {
          ...overrides.elevenlabs,
          seed: normalizeSeed(Number.parseInt(rawValue, 10)),
        };
        break;
      default:
        break;
    }
  };

  // Pass 1: pull out (and always strip) explicit spoken-text blocks.
  const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi;
  const withoutBlocks = text.replace(blockRegex, (_match, inner: string) => {
    if (policy.allowText && overrides.ttsText == null) {
      overrides.ttsText = inner.trim();
    }
    return "";
  });

  // Pass 2: consume `[[tts:key=value …]]` directives.
  const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi;
  const cleanedText = withoutBlocks.replace(directiveRegex, (_match, body: string) => {
    for (const token of body.split(/\s+/).filter(Boolean)) {
      const eqIndex = token.indexOf("=");
      if (eqIndex === -1) continue;
      const rawKey = token.slice(0, eqIndex).trim();
      const rawValue = token.slice(eqIndex + 1).trim();
      if (!rawKey || !rawValue) continue;
      try {
        applyDirective(rawKey.toLowerCase(), rawValue);
      } catch (err) {
        // Validation helpers throw; a bad value only costs that one override.
        warnings.push(err instanceof Error ? err.message : String(err));
      }
    }
    return "";
  });

  return {
    cleanedText,
    ttsText: overrides.ttsText,
    overrides,
    warnings,
  };
}
|
||||
|
||||
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"] as const;
|
||||
export const OPENAI_TTS_VOICES = [
|
||||
"alloy",
|
||||
@@ -265,66 +639,110 @@ type SummarizeResult = {
|
||||
outputLength: number;
|
||||
};
|
||||
|
||||
async function summarizeText(
|
||||
text: string,
|
||||
targetLength: number,
|
||||
apiKey: string,
|
||||
timeoutMs: number,
|
||||
): Promise<SummarizeResult> {
|
||||
/** Result of choosing which model summarizes over-long TTS text. */
type SummaryModelSelection = {
  /** Provider/model reference that was selected. */
  ref: ModelRef;
  /** `"summaryModel"` when the configured override resolved, `"default"` otherwise. */
  source: "summaryModel" | "default";
};
|
||||
|
||||
/**
 * Chooses the model used to summarize text before speech synthesis.
 *
 * The agent's default model is used unless `config.summaryModel` is a
 * non-blank string that resolves (through the model alias index, with the
 * default model's provider as fallback provider) to a model reference.
 *
 * @param cfg - Full application config.
 * @param config - Resolved TTS config carrying the optional `summaryModel`.
 * @returns The selected reference and whether it came from the override.
 */
function resolveSummaryModelRef(
  cfg: ClawdbotConfig,
  config: ResolvedTtsConfig,
): SummaryModelSelection {
  const fallback: SummaryModelSelection = {
    ref: resolveDefaultModelForAgent({ cfg }),
    source: "default",
  };

  const requested = config.summaryModel?.trim();
  if (!requested) return fallback;

  const defaultProvider = fallback.ref.provider;
  const match = resolveModelRefFromString({
    raw: requested,
    defaultProvider,
    aliasIndex: buildModelAliasIndex({ cfg, defaultProvider }),
  });

  // An override that cannot be resolved quietly falls back to the default model.
  return match ? { ref: match.ref, source: "summaryModel" } : fallback;
}
|
||||
|
||||
/**
 * Type guard selecting text blocks out of a mixed content list.
 *
 * @param block - Any content block carrying a `type` tag.
 * @returns `true` when the tag is `"text"`.
 */
function isTextContentBlock(block: { type: string }): block is TextContent {
  const { type } = block;
  return type === "text";
}
|
||||
|
||||
async function summarizeText(params: {
|
||||
text: string;
|
||||
targetLength: number;
|
||||
cfg: ClawdbotConfig;
|
||||
config: ResolvedTtsConfig;
|
||||
timeoutMs: number;
|
||||
}): Promise<SummarizeResult> {
|
||||
const { text, targetLength, cfg, config, timeoutMs } = params;
|
||||
if (targetLength < 100 || targetLength > 10_000) {
|
||||
throw new Error(`Invalid targetLength: ${targetLength}`);
|
||||
}
|
||||
|
||||
const startTime = Date.now();
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||
const { ref } = resolveSummaryModelRef(cfg, config);
|
||||
const resolved = resolveModel(ref.provider, ref.model, undefined, cfg);
|
||||
if (!resolved.model) {
|
||||
throw new Error(resolved.error ?? `Unknown summary model: ${ref.provider}/${ref.model}`);
|
||||
}
|
||||
const apiKey = requireApiKey(
|
||||
await getApiKeyForModel({ model: resolved.model, cfg }),
|
||||
ref.provider,
|
||||
);
|
||||
|
||||
try {
|
||||
const response = await fetch("https://api.openai.com/v1/chat/completions", {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: "gpt-4o-mini",
|
||||
messages: [
|
||||
{
|
||||
role: "system",
|
||||
content: `You are an assistant that summarizes texts concisely while keeping the most important information. Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. Reply only with the summary, without additional explanations.`,
|
||||
},
|
||||
{
|
||||
role: "user",
|
||||
content: `<text_to_summarize>\n${text}\n</text_to_summarize>`,
|
||||
},
|
||||
],
|
||||
max_tokens: Math.ceil(targetLength / 2),
|
||||
temperature: 0.3,
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error("Summarization service unavailable");
|
||||
try {
|
||||
const res = await completeSimple(
|
||||
resolved.model,
|
||||
{
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
content:
|
||||
`You are an assistant that summarizes texts concisely while keeping the most important information. ` +
|
||||
`Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. ` +
|
||||
`Reply only with the summary, without additional explanations.\n\n` +
|
||||
`<text_to_summarize>\n${text}\n</text_to_summarize>`,
|
||||
timestamp: Date.now(),
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
apiKey,
|
||||
maxTokens: Math.ceil(targetLength / 2),
|
||||
temperature: 0.3,
|
||||
signal: controller.signal,
|
||||
},
|
||||
);
|
||||
|
||||
const summary = res.content
|
||||
.filter(isTextContentBlock)
|
||||
.map((block) => block.text.trim())
|
||||
.filter(Boolean)
|
||||
.join(" ")
|
||||
.trim();
|
||||
|
||||
if (!summary) {
|
||||
throw new Error("No summary returned");
|
||||
}
|
||||
|
||||
return {
|
||||
summary,
|
||||
latencyMs: Date.now() - startTime,
|
||||
inputLength: text.length,
|
||||
outputLength: summary.length,
|
||||
};
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
}
|
||||
|
||||
const data = (await response.json()) as {
|
||||
choices?: Array<{ message?: { content?: string } }>;
|
||||
};
|
||||
const summary = data.choices?.[0]?.message?.content?.trim();
|
||||
|
||||
if (!summary) {
|
||||
throw new Error("No summary returned");
|
||||
} catch (err) {
|
||||
const error = err as Error;
|
||||
if (error.name === "AbortError") {
|
||||
throw new Error("Summarization timed out");
|
||||
}
|
||||
|
||||
return {
|
||||
summary,
|
||||
latencyMs: Date.now() - startTime,
|
||||
inputLength: text.length,
|
||||
outputLength: summary.length,
|
||||
};
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -342,21 +760,42 @@ function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DE
|
||||
async function elevenLabsTTS(params: {
|
||||
text: string;
|
||||
apiKey: string;
|
||||
baseUrl: string;
|
||||
voiceId: string;
|
||||
modelId: string;
|
||||
outputFormat: string;
|
||||
seed?: number;
|
||||
applyTextNormalization?: "auto" | "on" | "off";
|
||||
languageCode?: string;
|
||||
voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"];
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const { text, apiKey, voiceId, modelId, outputFormat, timeoutMs } = params;
|
||||
const {
|
||||
text,
|
||||
apiKey,
|
||||
baseUrl,
|
||||
voiceId,
|
||||
modelId,
|
||||
outputFormat,
|
||||
seed,
|
||||
applyTextNormalization,
|
||||
languageCode,
|
||||
voiceSettings,
|
||||
timeoutMs,
|
||||
} = params;
|
||||
if (!isValidVoiceId(voiceId)) {
|
||||
throw new Error("Invalid voiceId format");
|
||||
}
|
||||
assertElevenLabsVoiceSettings(voiceSettings);
|
||||
const normalizedLanguage = normalizeLanguageCode(languageCode);
|
||||
const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
|
||||
const normalizedSeed = normalizeSeed(seed);
|
||||
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||
|
||||
try {
|
||||
const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`);
|
||||
const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
|
||||
if (outputFormat) {
|
||||
url.searchParams.set("output_format", outputFormat);
|
||||
}
|
||||
@@ -371,11 +810,15 @@ async function elevenLabsTTS(params: {
|
||||
body: JSON.stringify({
|
||||
text,
|
||||
model_id: modelId,
|
||||
seed: normalizedSeed,
|
||||
apply_text_normalization: normalizedNormalization,
|
||||
language_code: normalizedLanguage,
|
||||
voice_settings: {
|
||||
stability: 0.5,
|
||||
similarity_boost: 0.75,
|
||||
style: 0.0,
|
||||
use_speaker_boost: true,
|
||||
stability: voiceSettings.stability,
|
||||
similarity_boost: voiceSettings.similarityBoost,
|
||||
style: voiceSettings.style,
|
||||
use_speaker_boost: voiceSettings.useSpeakerBoost,
|
||||
speed: voiceSettings.speed,
|
||||
},
|
||||
}),
|
||||
signal: controller.signal,
|
||||
@@ -442,6 +885,7 @@ export async function textToSpeech(params: {
|
||||
cfg: ClawdbotConfig;
|
||||
prefsPath?: string;
|
||||
channel?: string;
|
||||
overrides?: TtsDirectiveOverrides;
|
||||
}): Promise<TtsResult> {
|
||||
const config = resolveTtsConfig(params.cfg);
|
||||
const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
|
||||
@@ -456,10 +900,9 @@ export async function textToSpeech(params: {
|
||||
}
|
||||
|
||||
const userProvider = getTtsProvider(config, prefsPath);
|
||||
const providers: TtsProvider[] = [
|
||||
userProvider,
|
||||
userProvider === "openai" ? "elevenlabs" : "openai",
|
||||
];
|
||||
const overrideProvider = params.overrides?.provider;
|
||||
const provider = overrideProvider ?? userProvider;
|
||||
const providers: TtsProvider[] = [provider, provider === "openai" ? "elevenlabs" : "openai"];
|
||||
|
||||
let lastError: string | undefined;
|
||||
|
||||
@@ -474,20 +917,36 @@ export async function textToSpeech(params: {
|
||||
try {
|
||||
let audioBuffer: Buffer;
|
||||
if (provider === "elevenlabs") {
|
||||
const voiceIdOverride = params.overrides?.elevenlabs?.voiceId;
|
||||
const modelIdOverride = params.overrides?.elevenlabs?.modelId;
|
||||
const voiceSettings = {
|
||||
...config.elevenlabs.voiceSettings,
|
||||
...params.overrides?.elevenlabs?.voiceSettings,
|
||||
};
|
||||
const seedOverride = params.overrides?.elevenlabs?.seed;
|
||||
const normalizationOverride = params.overrides?.elevenlabs?.applyTextNormalization;
|
||||
const languageOverride = params.overrides?.elevenlabs?.languageCode;
|
||||
audioBuffer = await elevenLabsTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
voiceId: config.elevenlabs.voiceId,
|
||||
modelId: config.elevenlabs.modelId,
|
||||
baseUrl: config.elevenlabs.baseUrl,
|
||||
voiceId: voiceIdOverride ?? config.elevenlabs.voiceId,
|
||||
modelId: modelIdOverride ?? config.elevenlabs.modelId,
|
||||
outputFormat: output.elevenlabs,
|
||||
seed: seedOverride ?? config.elevenlabs.seed,
|
||||
applyTextNormalization: normalizationOverride ?? config.elevenlabs.applyTextNormalization,
|
||||
languageCode: languageOverride ?? config.elevenlabs.languageCode,
|
||||
voiceSettings,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
} else {
|
||||
const openaiModelOverride = params.overrides?.openai?.model;
|
||||
const openaiVoiceOverride = params.overrides?.openai?.voice;
|
||||
audioBuffer = await openaiTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
model: config.openai.model,
|
||||
voice: config.openai.voice,
|
||||
model: openaiModelOverride ?? config.openai.model,
|
||||
voice: openaiVoiceOverride ?? config.openai.voice,
|
||||
responseFormat: output.openai,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
@@ -538,13 +997,31 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
if (mode === "final" && params.kind && params.kind !== "final") return params.payload;
|
||||
|
||||
const text = params.payload.text ?? "";
|
||||
if (!text.trim()) return params.payload;
|
||||
if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) return params.payload;
|
||||
if (text.includes("MEDIA:")) return params.payload;
|
||||
if (text.trim().length < 10) return params.payload;
|
||||
const directives = parseTtsDirectives(text, config.modelOverrides);
|
||||
if (directives.warnings.length > 0) {
|
||||
logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`);
|
||||
}
|
||||
|
||||
const cleanedText = directives.cleanedText;
|
||||
const trimmedCleaned = cleanedText.trim();
|
||||
const visibleText = trimmedCleaned.length > 0 ? trimmedCleaned : "";
|
||||
const ttsText = directives.ttsText?.trim() || visibleText;
|
||||
|
||||
const nextPayload =
|
||||
visibleText === text.trim()
|
||||
? params.payload
|
||||
: {
|
||||
...params.payload,
|
||||
text: visibleText.length > 0 ? visibleText : undefined,
|
||||
};
|
||||
|
||||
if (!ttsText.trim()) return nextPayload;
|
||||
if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) return nextPayload;
|
||||
if (text.includes("MEDIA:")) return nextPayload;
|
||||
if (ttsText.trim().length < 10) return nextPayload;
|
||||
|
||||
const maxLength = getTtsMaxLength(prefsPath);
|
||||
let textForAudio = text.trim();
|
||||
let textForAudio = ttsText.trim();
|
||||
let wasSummarized = false;
|
||||
|
||||
if (textForAudio.length > maxLength) {
|
||||
@@ -555,14 +1032,14 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
return params.payload;
|
||||
}
|
||||
|
||||
const openaiKey = resolveTtsApiKey(config, "openai");
|
||||
if (!openaiKey) {
|
||||
logVerbose("TTS: skipping summarization - OpenAI key missing.");
|
||||
return params.payload;
|
||||
}
|
||||
|
||||
try {
|
||||
const summary = await summarizeText(textForAudio, maxLength, openaiKey, config.timeoutMs);
|
||||
const summary = await summarizeText({
|
||||
text: textForAudio,
|
||||
targetLength: maxLength,
|
||||
cfg: params.cfg,
|
||||
config,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
textForAudio = summary.summary;
|
||||
wasSummarized = true;
|
||||
if (textForAudio.length > config.maxTextLength) {
|
||||
@@ -584,6 +1061,7 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
cfg: params.cfg,
|
||||
prefsPath,
|
||||
channel: params.channel,
|
||||
overrides: directives.overrides,
|
||||
});
|
||||
|
||||
if (result.success && result.audioPath) {
|
||||
@@ -600,7 +1078,7 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
const shouldVoice = channelId === "telegram" && result.voiceCompatible === true;
|
||||
|
||||
return {
|
||||
...params.payload,
|
||||
...nextPayload,
|
||||
mediaUrl: result.audioPath,
|
||||
audioAsVoice: shouldVoice || params.payload.audioAsVoice,
|
||||
};
|
||||
@@ -616,7 +1094,7 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
|
||||
const latency = Date.now() - ttsStart;
|
||||
logVerbose(`TTS: conversion failed after ${latency}ms (${result.error ?? "unknown"}).`);
|
||||
return params.payload;
|
||||
return nextPayload;
|
||||
}
|
||||
|
||||
export const _test = {
|
||||
@@ -625,6 +1103,8 @@ export const _test = {
|
||||
isValidOpenAIModel,
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
parseTtsDirectives,
|
||||
resolveModelOverridePolicy,
|
||||
summarizeText,
|
||||
resolveOutputFormat,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user