feat(telegram-tts): add auto-TTS hook and provider switching

- Integrate message_sending hook into Telegram delivery path
- Send text first, then audio as a voice message afterwards
- Add /tts_provider command to switch between OpenAI and ElevenLabs
- Implement automatic fallback when primary provider fails
- Use gpt-4o-mini-tts as default OpenAI model
- Add hook integration to route-reply.ts for other channels

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 00:19:08 +00:00
parent 46e6546bb9
commit df09e583aa
3 changed files with 397 additions and 78 deletions
--- a/src/auto-reply/reply/route-reply.ts
+++ b/src/auto-reply/reply/route-reply.ts
@@ -10,6 +10,7 @@
 import { resolveSessionAgentId } from "../../agents/agent-scope.js";
 import { resolveEffectiveMessagesConfig } from "../../agents/identity.js";
 import { normalizeChannelId } from "../../channels/plugins/index.js";
+import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js";
 import type { ClawdbotConfig } from "../../config/config.js";
 import { INTERNAL_MESSAGE_CHANNEL } from "../../utils/message-channel.js";
 import type { OriginatingChannelType } from "../templating.js";
@@ -72,14 +73,56 @@ export async function routeReply(params: RouteReplyParams): Promise<RouteReplyRe
  });
  if (!normalized) return { ok: true };

-  const text = normalized.text ?? "";
-  const mediaUrls = (normalized.mediaUrls?.filter(Boolean) ?? []).length
+  let text = normalized.text ?? "";
+  let mediaUrls = (normalized.mediaUrls?.filter(Boolean) ?? []).length
    ? (normalized.mediaUrls?.filter(Boolean) as string[])
    : normalized.mediaUrl
      ? [normalized.mediaUrl]
      : [];
  const replyToId = normalized.replyToId;

+  // Run message_sending hook (allows plugins to modify or cancel)
+  const hookRunner = getGlobalHookRunner();
+  const normalizedChannel = normalizeChannelId(channel);
+  if (hookRunner && text.trim() && normalizedChannel) {
+    try {
+      const hookResult = await hookRunner.runMessageSending(
+        {
+          to,
+          content: text,
+          metadata: { channel, accountId, threadId },
+        },
+        {
+          channelId: normalizedChannel,
+          accountId: accountId ?? undefined,
+          conversationId: to,
+        },
+      );
+
+      // Check if hook wants to cancel the message
+      if (hookResult?.cancel) {
+        return { ok: true }; // Silently cancel
+      }
+
+      // Check if hook modified the content
+      if (hookResult?.content !== undefined) {
+        // Check if the modified content contains MEDIA: directive
+        const mediaMatch = hookResult.content.match(/^MEDIA:(.+)$/m);
+        if (mediaMatch) {
+          // Extract media path and add to mediaUrls
+          const mediaPath = mediaMatch[1].trim();
+          mediaUrls = [mediaPath];
+          // Remove MEDIA: directive from text (send audio only)
+          text = hookResult.content.replace(/^MEDIA:.+$/m, "").trim();
+        } else {
+          text = hookResult.content;
+        }
+      }
+    } catch {
+      // Hook errors shouldn't block message sending
+    }
+  }
+
  // Skip empty replies.
  if (!text.trim() && mediaUrls.length === 0) {
    return { ok: true };
--- a/src/telegram/bot/delivery.ts
+++ b/src/telegram/bot/delivery.ts
@@ -14,6 +14,7 @@ import { mediaKindFromMime } from "../../media/constants.js";
 import { fetchRemoteMedia } from "../../media/fetch.js";
 import { isGifMedia } from "../../media/mime.js";
 import { saveMediaBuffer } from "../../media/store.js";
+import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js";
 import type { RuntimeEnv } from "../../runtime.js";
 import { loadWebMedia } from "../../web/media.js";
 import { resolveTelegramVoiceSend } from "../voice.js";
@@ -39,6 +40,45 @@ export async function deliverReplies(params: {
  const threadParams = buildTelegramThreadParams(messageThreadId);
  let hasReplied = false;
  for (const reply of replies) {
+    // Track if hook wants to send audio after text
+    let audioToSendAfter: string | undefined;
+
+    // Run message_sending hook (allows plugins like TTS to generate audio)
+    const hookRunner = getGlobalHookRunner();
+    if (hookRunner && reply?.text?.trim()) {
+      try {
+        const hookResult = await hookRunner.runMessageSending(
+          {
+            to: chatId,
+            content: reply.text,
+            metadata: { channel: "telegram", threadId: messageThreadId },
+          },
+          {
+            channelId: "telegram",
+            accountId: undefined,
+            conversationId: chatId,
+          },
+        );
+
+        // Check if hook wants to cancel the message
+        if (hookResult?.cancel) {
+          continue; // Skip this reply
+        }
+
+        // Check if hook returned a MEDIA directive (TTS audio)
+        if (hookResult?.content !== undefined) {
+          const mediaMatch = hookResult.content.match(/^MEDIA:(.+)$/m);
+          if (mediaMatch) {
+            // Save audio path to send AFTER the text message
+            audioToSendAfter = mediaMatch[1].trim();
+          }
+        }
+      } catch (err) {
+        // Hook errors shouldn't block message sending
+        logVerbose(`[telegram delivery] hook error: ${String(err)}`);
+      }
+    }
+
    const hasMedia = Boolean(reply?.mediaUrl) || (reply?.mediaUrls?.length ?? 0) > 0;
    if (!reply?.text && !hasMedia) {
      if (reply?.audioAsVoice) {
@@ -70,6 +110,25 @@ export async function deliverReplies(params: {
          hasReplied = true;
        }
      }
+
+      // Send TTS audio after text (if hook generated one)
+      if (audioToSendAfter) {
+        try {
+          const audioMedia = await loadWebMedia(audioToSendAfter);
+          const audioFile = new InputFile(audioMedia.buffer, "voice.mp3");
+          // Switch typing indicator to record_voice before sending
+          await params.onVoiceRecording?.();
+          const audioParams: Record<string, unknown> = {};
+          if (threadParams) {
+            audioParams.message_thread_id = threadParams.message_thread_id;
+          }
+          await bot.api.sendVoice(chatId, audioFile, audioParams);
+          logVerbose(`[telegram delivery] TTS audio sent: ${audioToSendAfter}`);
+        } catch (err) {
+          logVerbose(`[telegram delivery] TTS audio send failed: ${String(err)}`);
+        }
+      }
+
      continue;
    }
    // media with optional caption on first item