feat(telegram): add [[audio_as_file]] tag support

Allow agents to specify the audio mode via an inline tag:
- Default: voice bubble (sendVoice)
- [[audio_as_file]]: audio file with metadata (sendAudio)

The tag is stripped from the final message text.

Example agent response:
Here's a podcast episode! [[audio_as_file]] MEDIA:https://example.com/episode.mp3
2026-01-04 22:15:22 +01:00
parent 5e1b91b32c
commit 262f8a8d45
2 changed files with 50 additions and 7 deletions
--- a/src/auto-reply/reply/agent-runner.ts
+++ b/src/auto-reply/reply/agent-runner.ts
@@ -30,12 +30,14 @@ import {
  type QueueSettings,
  scheduleFollowupDrain,
 } from "./queue.js";
+import { extractAudioTag } from "./audio-tags.js";
 import {
  applyReplyTagsToPayload,
  applyReplyThreading,
  filterMessagingToolDuplicates,
  isRenderablePayload,
 } from "./reply-payloads.js";
+import { extractReplyToTag } from "./reply-tags.js";
 import {
  createReplyToModeFilter,
  resolveReplyToMode,
@@ -334,16 +336,18 @@ export async function runReplyAgent(params: {
                      sessionCtx.MessageSid,
                    );
                    if (!isRenderablePayload(taggedPayload)) return;
+                    const audioTagResult = extractAudioTag(taggedPayload.text);
+                    const cleaned = audioTagResult.cleaned || undefined;
                    const hasMedia =
                      Boolean(taggedPayload.mediaUrl) ||
                      (taggedPayload.mediaUrls?.length ?? 0) > 0;
-                    if (
-                      taggedPayload.text?.trim() === SILENT_REPLY_TOKEN &&
-                      !hasMedia
-                    )
+                    if (cleaned?.trim() === SILENT_REPLY_TOKEN && !hasMedia)
                      return;
-                    const blockPayload: ReplyPayload =
-                      applyReplyToMode(taggedPayload);
+                    const blockPayload: ReplyPayload = applyReplyToMode({
+                      ...taggedPayload,
+                      text: cleaned,
+                      audioAsVoice: audioTagResult.audioAsVoice,
+                    });
                    const payloadKey = buildPayloadKey(blockPayload);
                    if (
                      streamedPayloadKeys.has(payloadKey) ||
@@ -519,7 +523,16 @@ export async function runReplyAgent(params: {
      payloads: sanitizedPayloads,
      applyReplyToMode,
      currentMessageId: sessionCtx.MessageSid,
-    });
+    })
+      .map((payload) => {
+        const audioTagResult = extractAudioTag(payload.text);
+        return {
+          ...payload,
+          text: audioTagResult.cleaned ? audioTagResult.cleaned : undefined,
+          audioAsVoice: audioTagResult.audioAsVoice,
+        };
+      })
+      .filter(isRenderablePayload);

    // Drop final payloads if block streaming is enabled and we already streamed
    // block replies. Tool-sent duplicates are filtered below.
--- a/src/auto-reply/reply/audio-tags.ts
+++ b/src/auto-reply/reply/audio-tags.ts
@@ -0,0 +1,30 @@
+/**
+ * Strip `[[audio_as_file]]` tags (any case) from text; a tag means send audio as a file, not a voice bubble.
+ * Returns whitespace-normalized `cleaned` text, `audioAsVoice` (false only when tagged) and `hasTag`.
+ */
+export function extractAudioTag(text?: string): {
+  cleaned: string;
+  audioAsVoice: boolean;
+  hasTag: boolean;
+} {
+  if (!text) return { cleaned: "", audioAsVoice: true, hasTag: false };
+  let cleaned = text;
+  let audioAsVoice = true; // stays true (voice bubble) unless the tag is found below
+  let hasTag = false;
+
+  // Any occurrence of the tag switches to file mode; every occurrence is replaced by a space
+  const fileMatch = cleaned.match(/\[\[audio_as_file\]\]/i);
+  if (fileMatch) {
+    cleaned = cleaned.replace(/\[\[audio_as_file\]\]/gi, " ");
+    audioAsVoice = false;
+    hasTag = true;
+  }
+
+  // Runs even when no tag was found: collapse space/tab runs, drop them around newlines, trim
+  cleaned = cleaned
+    .replace(/[ \t]+/g, " ")
+    .replace(/[ \t]*\n[ \t]*/g, "\n")
+    .trim();
+
+  return { cleaned, audioAsVoice, hasTag };
+}