feat(telegram): buffer audio blocks for [[audio_as_voice]] tag support

- Add [[audio_as_voice]] detection to splitMediaFromOutput()
- Pass audioAsVoice through onBlockReply callback chain
- Buffer audio blocks during streaming, flush at end with correct flag
- Non-audio media still streams immediately
- Fix: emit payloads with audioAsVoice flag even if text is empty

Co-authored-by: Manuel Hettich <17690367+ManuelHettich@users.noreply.github.com>
Authored by: Jarvis, 2026-01-08 12:40:31 +00:00
Committed by: Peter Steinberger
parent 60bd65dfac
commit 05a99aa49b
5 changed files with 127 additions and 16 deletions

View File

@@ -260,6 +260,14 @@ export async function runReplyAgent(params: {
const pendingToolTasks = new Set<Promise<void>>();
const blockReplyTimeoutMs =
opts?.blockReplyTimeoutMs ?? BLOCK_REPLY_SEND_TIMEOUT_MS;
// Buffer audio blocks to apply [[audio_as_voice]] tag that may come later
const bufferedAudioBlocks: ReplyPayload[] = [];
let seenAudioAsVoice = false;
const AUDIO_EXTENSIONS = /\.(opus|mp3|m4a|wav|ogg|aac|flac)$/i;
const hasAudioMedia = (urls?: string[]): boolean =>
Boolean(urls?.some((u) => AUDIO_EXTENSIONS.test(u)));
const replyToChannel =
sessionCtx.OriginatingChannel ??
((sessionCtx.Surface ?? sessionCtx.Provider)?.toLowerCase() as
@@ -532,23 +540,34 @@ export async function runReplyAgent(params: {
},
sessionCtx.MessageSid,
);
if (!isRenderablePayload(taggedPayload)) return;
// Let through payloads with audioAsVoice flag even if empty (need to track it)
if (!isRenderablePayload(taggedPayload) && !payload.audioAsVoice)
return;
const audioTagResult = extractAudioTag(taggedPayload.text);
const cleaned = audioTagResult.cleaned || undefined;
const hasMedia =
Boolean(taggedPayload.mediaUrl) ||
(taggedPayload.mediaUrls?.length ?? 0) > 0;
if (!cleaned && !hasMedia) return;
// Skip empty payloads unless they have audioAsVoice flag (need to track it)
if (!cleaned && !hasMedia && !payload.audioAsVoice) return;
if (
isSilentReplyText(cleaned, SILENT_REPLY_TOKEN) &&
!hasMedia
)
return;
// Track if we've seen [[audio_as_voice]] from payload or text extraction
if (payload.audioAsVoice || audioTagResult.audioAsVoice) {
seenAudioAsVoice = true;
}
const blockPayload: ReplyPayload = applyReplyToMode({
...taggedPayload,
text: cleaned,
audioAsVoice: audioTagResult.audioAsVoice,
audioAsVoice:
audioTagResult.audioAsVoice || payload.audioAsVoice,
});
void typingSignals
.signalTextDelta(taggedPayload.text)
.catch((err) => {
@@ -556,6 +575,14 @@ export async function runReplyAgent(params: {
`block reply typing signal failed: ${String(err)}`,
);
});
// Buffer audio blocks to apply [[audio_as_voice]] that may come later
const isAudioBlock = hasAudioMedia(taggedPayload.mediaUrls);
if (isAudioBlock) {
bufferedAudioBlocks.push(blockPayload);
return; // Don't send immediately - wait for potential [[audio_as_voice]] tag
}
blockReplyPipeline?.enqueue(blockPayload);
}
: undefined,
@@ -670,6 +697,17 @@ export async function runReplyAgent(params: {
}
const payloadArray = runResult.payloads ?? [];
if (bufferedAudioBlocks.length > 0 && blockReplyPipeline) {
for (const audioPayload of bufferedAudioBlocks) {
const finalPayload = seenAudioAsVoice
? { ...audioPayload, audioAsVoice: true }
: audioPayload;
blockReplyPipeline.enqueue(finalPayload);
}
bufferedAudioBlocks.length = 0;
}
if (blockReplyPipeline) {
await blockReplyPipeline.flush({ force: true });
blockReplyPipeline.stop();
@@ -677,6 +715,7 @@ export async function runReplyAgent(params: {
if (pendingToolTasks.size > 0) {
await Promise.allSettled(pendingToolTasks);
}
// Drain any late tool/block deliveries before deciding there's "nothing to send".
// Otherwise, a late typing trigger (e.g. from a tool callback) can outlive the run and
// keep the typing indicator stuck.