feat: move TTS into core (#1559) (thanks @Glucksberg)

2026-01-24 07:57:46 +00:00
parent aef88cd9f1
commit d9a467fe3b
26 changed files with 1522 additions and 1649 deletions
--- a/src/agents/clawdbot-tools.ts
+++ b/src/agents/clawdbot-tools.ts
@@ -17,6 +17,7 @@ import { createSessionsListTool } from "./tools/sessions-list-tool.js";
 import { createSessionsSendTool } from "./tools/sessions-send-tool.js";
 import { createSessionsSpawnTool } from "./tools/sessions-spawn-tool.js";
 import { createWebFetchTool, createWebSearchTool } from "./tools/web-tools.js";
+import { createTtsTool } from "./tools/tts-tool.js";

 export function createClawdbotTools(options?: {
  browserControlUrl?: string;
@@ -96,6 +97,10 @@ export function createClawdbotTools(options?: {
      replyToMode: options?.replyToMode,
      hasRepliedRef: options?.hasRepliedRef,
    }),
+    createTtsTool({
+      agentChannel: options?.agentChannel,
+      config: options?.config,
+    }),
    createGatewayTool({
      agentSessionKey: options?.agentSessionKey,
      config: options?.config,
--- a/src/agents/tools/tts-tool.ts
+++ b/src/agents/tools/tts-tool.ts
@@ -0,0 +1,60 @@
+import { Type } from "@sinclair/typebox";
+
+import { loadConfig } from "../../config/config.js";
+import type { ClawdbotConfig } from "../../config/config.js";
+import type { GatewayMessageChannel } from "../../utils/message-channel.js";
+import { textToSpeech } from "../../tts/tts.js";
+import type { AnyAgentTool } from "./common.js";
+import { readStringParam } from "./common.js";
+
+/**
+ * Argument schema for the `tts` tool: a required `text` string and an
+ * optional `channel` string.
+ */
+const TtsToolSchema = Type.Object({
+  // Required: the text to run through text-to-speech.
+  text: Type.String({
+    description: "Text to convert to speech.",
+  }),
+  // Optional: channel id used to pick the output format.
+  channel: Type.Optional(
+    Type.String({
+      description: "Optional channel id to pick output format (e.g. telegram).",
+    }),
+  ),
+});
+
+/**
+ * Create the `tts` agent tool.
+ *
+ * The tool reads a required `text` argument and an optional `channel`
+ * argument, calls `textToSpeech`, and reports the outcome as a single text
+ * content item.
+ *
+ * @param opts.config - Configuration to use; when omitted, `loadConfig()` is
+ *   called on each execution.
+ * @param opts.agentChannel - Channel used when the call supplies no `channel`.
+ * @returns A tool whose result text is a `MEDIA:<path>` line (preceded by
+ *   `[[audio_as_voice]]` when the result is voice compatible) on success, or
+ *   the error message on failure.
+ */
+export function createTtsTool(opts?: {
+  config?: ClawdbotConfig;
+  agentChannel?: GatewayMessageChannel;
+}): AnyAgentTool {
+  return {
+    label: "TTS",
+    name: "tts",
+    description:
+      "Convert text to speech and return a MEDIA: path. Use when the user requests audio or TTS is enabled. Copy the MEDIA line exactly.",
+    parameters: TtsToolSchema,
+    execute: async (_toolCallId, args) => {
+      const input = args as Record<string, unknown>;
+      const text = readStringParam(input, "text", { required: true });
+      const requestedChannel = readStringParam(input, "channel");
+      // Prefer the injected config; only load from disk when none was given.
+      const cfg = opts?.config ?? loadConfig();
+      const speech = await textToSpeech({
+        text,
+        cfg,
+        // An explicit channel argument wins over the agent's own channel.
+        channel: requestedChannel ?? opts?.agentChannel,
+      });
+
+      // Anything other than success with a usable audio path is reported as a failure.
+      if (!speech.success || !speech.audioPath) {
+        return {
+          content: [
+            {
+              type: "text",
+              text: speech.error ?? "TTS conversion failed",
+            },
+          ],
+          details: { error: speech.error },
+        };
+      }
+
+      const mediaLine = `MEDIA:${speech.audioPath}`;
+      // Tag Telegram Opus output as a voice bubble instead of a file attachment.
+      const replyText = speech.voiceCompatible ? `[[audio_as_voice]]\n${mediaLine}` : mediaLine;
+      return {
+        content: [{ type: "text", text: replyText }],
+        details: { audioPath: speech.audioPath, provider: speech.provider },
+      };
+    },
+  };
+}