feat(telegram): add [[audio_as_file]] tag support

Allow agents to specify the audio mode via an inline tag:
- Default: voice bubble (sendVoice)
- [[audio_as_file]]: audio file with metadata (sendAudio)

The tag is stripped from the final message text.

Example agent response:
  Here's a podcast episode! [[audio_as_file]]
  MEDIA:https://example.com/episode.mp3
This commit is contained in:
Manuel Maly
2026-01-04 22:15:22 +01:00
committed by Peter Steinberger
parent 5e1b91b32c
commit 262f8a8d45
2 changed files with 50 additions and 7 deletions

View File

@@ -30,12 +30,14 @@ import {
type QueueSettings, type QueueSettings,
scheduleFollowupDrain, scheduleFollowupDrain,
} from "./queue.js"; } from "./queue.js";
import { extractAudioTag } from "./audio-tags.js";
import { import {
applyReplyTagsToPayload, applyReplyTagsToPayload,
applyReplyThreading, applyReplyThreading,
filterMessagingToolDuplicates, filterMessagingToolDuplicates,
isRenderablePayload, isRenderablePayload,
} from "./reply-payloads.js"; } from "./reply-payloads.js";
import { extractReplyToTag } from "./reply-tags.js";
import { import {
createReplyToModeFilter, createReplyToModeFilter,
resolveReplyToMode, resolveReplyToMode,
@@ -334,16 +336,18 @@ export async function runReplyAgent(params: {
sessionCtx.MessageSid, sessionCtx.MessageSid,
); );
if (!isRenderablePayload(taggedPayload)) return; if (!isRenderablePayload(taggedPayload)) return;
const audioTagResult = extractAudioTag(taggedPayload.text);
const cleaned = audioTagResult.cleaned || undefined;
const hasMedia = const hasMedia =
Boolean(taggedPayload.mediaUrl) || Boolean(taggedPayload.mediaUrl) ||
(taggedPayload.mediaUrls?.length ?? 0) > 0; (taggedPayload.mediaUrls?.length ?? 0) > 0;
if ( if (cleaned?.trim() === SILENT_REPLY_TOKEN && !hasMedia)
taggedPayload.text?.trim() === SILENT_REPLY_TOKEN &&
!hasMedia
)
return; return;
const blockPayload: ReplyPayload = const blockPayload: ReplyPayload = applyReplyToMode({
applyReplyToMode(taggedPayload); ...taggedPayload,
text: cleaned,
audioAsVoice: audioTagResult.audioAsVoice,
});
const payloadKey = buildPayloadKey(blockPayload); const payloadKey = buildPayloadKey(blockPayload);
if ( if (
streamedPayloadKeys.has(payloadKey) || streamedPayloadKeys.has(payloadKey) ||
@@ -519,7 +523,16 @@ export async function runReplyAgent(params: {
payloads: sanitizedPayloads, payloads: sanitizedPayloads,
applyReplyToMode, applyReplyToMode,
currentMessageId: sessionCtx.MessageSid, currentMessageId: sessionCtx.MessageSid,
}); })
.map((payload) => {
const audioTagResult = extractAudioTag(payload.text);
return {
...payload,
text: audioTagResult.cleaned ? audioTagResult.cleaned : undefined,
audioAsVoice: audioTagResult.audioAsVoice,
};
})
.filter(isRenderablePayload);
// Drop final payloads if block streaming is enabled and we already streamed // Drop final payloads if block streaming is enabled and we already streamed
// block replies. Tool-sent duplicates are filtered below. // block replies. Tool-sent duplicates are filtered below.

View File

@@ -0,0 +1,30 @@
/**
 * Extract the audio mode tag from a reply text.
 *
 * Supports `[[audio_as_file]]` (case-insensitive), which asks for the audio to
 * be sent as a file instead of a voice bubble.
 *
 * Only the tag itself and the spaces/tabs directly touching it are removed;
 * all other whitespace (indentation, aligned columns, blank lines) is left
 * exactly as written, so untagged text and code snippets are not reformatted.
 * The result is trimmed at both ends.
 *
 * @param text - Reply text that may contain the tag; may be undefined or empty.
 * @returns `cleaned`: the text without the tag ("" when nothing is left);
 *   `audioAsVoice`: `false` when the tag was present, `true` otherwise;
 *   `hasTag`: whether at least one tag was found.
 */
export function extractAudioTag(text?: string): {
  cleaned: string;
  audioAsVoice: boolean;
  hasTag: boolean;
} {
  if (!text) return { cleaned: "", audioAsVoice: true, hasTag: false };

  const hasTag = /\[\[audio_as_file\]\]/i.test(text);
  if (!hasTag) {
    // No tag: do not touch interior whitespace, only trim the ends.
    return { cleaned: text.trim(), audioAsVoice: true, hasTag: false };
  }

  // One match covers a run of adjacent tags plus the spaces/tabs around it,
  // so a single replacement decision is made per run.
  const tagRun = /[ \t]*(?:\[\[audio_as_file\]\][ \t]*)+/gi;
  const stripped = text.replace(
    tagRun,
    (match: string, offset: number, whole: string): string => {
      const end = offset + match.length;
      const atLineStart = offset === 0 || whole[offset - 1] === "\n";
      const atLineEnd =
        end === whole.length || whole[end] === "\n" || whole[end] === "\r";
      // At a line edge the tag vanishes completely; between two words a single
      // space keeps them from being glued together.
      return atLineStart || atLineEnd ? "" : " ";
    },
  );

  // [[audio_as_file]] -> send as file with metadata, not voice bubble
  return { cleaned: stripped.trim(), audioAsVoice: false, hasTag: true };
}