feat: move TTS into core (#1559) (thanks @Glucksberg)

This commit is contained in:
Peter Steinberger
2026-01-24 07:57:46 +00:00
parent aef88cd9f1
commit d9a467fe3b
26 changed files with 1522 additions and 1649 deletions

View File

@@ -17,6 +17,7 @@ import { createSessionsListTool } from "./tools/sessions-list-tool.js";
import { createSessionsSendTool } from "./tools/sessions-send-tool.js";
import { createSessionsSpawnTool } from "./tools/sessions-spawn-tool.js";
import { createWebFetchTool, createWebSearchTool } from "./tools/web-tools.js";
import { createTtsTool } from "./tools/tts-tool.js";
export function createClawdbotTools(options?: {
browserControlUrl?: string;
@@ -96,6 +97,10 @@ export function createClawdbotTools(options?: {
replyToMode: options?.replyToMode,
hasRepliedRef: options?.hasRepliedRef,
}),
createTtsTool({
agentChannel: options?.agentChannel,
config: options?.config,
}),
createGatewayTool({
agentSessionKey: options?.agentSessionKey,
config: options?.config,

View File

@@ -0,0 +1,60 @@
import { Type } from "@sinclair/typebox";
import { loadConfig } from "../../config/config.js";
import type { ClawdbotConfig } from "../../config/config.js";
import type { GatewayMessageChannel } from "../../utils/message-channel.js";
import { textToSpeech } from "../../tts/tts.js";
import type { AnyAgentTool } from "./common.js";
import { readStringParam } from "./common.js";
const TtsToolSchema = Type.Object({
text: Type.String({ description: "Text to convert to speech." }),
channel: Type.Optional(
Type.String({ description: "Optional channel id to pick output format (e.g. telegram)." }),
),
});
export function createTtsTool(opts?: {
config?: ClawdbotConfig;
agentChannel?: GatewayMessageChannel;
}): AnyAgentTool {
return {
label: "TTS",
name: "tts",
description:
"Convert text to speech and return a MEDIA: path. Use when the user requests audio or TTS is enabled. Copy the MEDIA line exactly.",
parameters: TtsToolSchema,
execute: async (_toolCallId, args) => {
const params = args as Record<string, unknown>;
const text = readStringParam(params, "text", { required: true });
const channel = readStringParam(params, "channel");
const cfg = opts?.config ?? loadConfig();
const result = await textToSpeech({
text,
cfg,
channel: channel ?? opts?.agentChannel,
});
if (result.success && result.audioPath) {
const lines: string[] = [];
// Tag Telegram Opus output as a voice bubble instead of a file attachment.
if (result.voiceCompatible) lines.push("[[audio_as_voice]]");
lines.push(`MEDIA:${result.audioPath}`);
return {
content: [{ type: "text", text: lines.join("\n") }],
details: { audioPath: result.audioPath, provider: result.provider },
};
}
return {
content: [
{
type: "text",
text: result.error ?? "TTS conversion failed",
},
],
details: { error: result.error },
};
},
};
}