From 05a99aa49b0a1842d365c91a152c2ee6ff3ea7a5 Mon Sep 17 00:00:00 2001 From: Jarvis Date: Thu, 8 Jan 2026 12:40:31 +0000 Subject: [PATCH 1/7] feat(telegram): buffer audio blocks for [[audio_as_voice]] tag support - Add [[audio_as_voice]] detection to splitMediaFromOutput() - Pass audioAsVoice through onBlockReply callback chain - Buffer audio blocks during streaming, flush at end with correct flag - Non-audio media still streams immediately - Fix: emit payloads with audioAsVoice flag even if text is empty Co-authored-by: Manuel Hettich <17690367+ManuelHettich@users.noreply.github.com> --- src/agents/pi-embedded-runner.ts | 25 ++++++++++--- src/agents/pi-embedded-subscribe.ts | 17 +++++++-- src/auto-reply/reply/agent-runner.ts | 45 +++++++++++++++++++++-- src/media/parse.ts | 55 ++++++++++++++++++++++++++-- src/telegram/bot.ts | 1 + 5 files changed, 127 insertions(+), 16 deletions(-) diff --git a/src/agents/pi-embedded-runner.ts b/src/agents/pi-embedded-runner.ts index 7a416c60c..5ef32ea65 100644 --- a/src/agents/pi-embedded-runner.ts +++ b/src/agents/pi-embedded-runner.ts @@ -1044,6 +1044,7 @@ export async function runEmbeddedPiAgent(params: { onBlockReply?: (payload: { text?: string; mediaUrls?: string[]; + audioAsVoice?: boolean; }) => void | Promise; blockReplyBreak?: "text_end" | "message_end"; blockReplyChunking?: BlockReplyChunking; @@ -1640,6 +1641,7 @@ export async function runEmbeddedPiAgent(params: { text: string; media?: string[]; isError?: boolean; + audioAsVoice?: boolean; }> = []; const errorText = lastAssistant @@ -1656,10 +1658,10 @@ export async function runEmbeddedPiAgent(params: { if (inlineToolResults) { for (const { toolName, meta } of toolMetas) { const agg = formatToolAggregate(toolName, meta ? [meta] : []); - const { text: cleanedText, mediaUrls } = + const { text: cleanedText, mediaUrls, audioAsVoice } = splitMediaFromOutput(agg); if (cleanedText) - replyItems.push({ text: cleanedText, media: mediaUrls }); + replyItems.push({ text: cleanedText, media: mediaUrls, audioAsVoice }); } } @@ -1678,18 +1680,31 @@ export async function runEmbeddedPiAgent(params: { ? [fallbackAnswerText] : []; for (const text of answerTexts) { - const { text: cleanedText, mediaUrls } = splitMediaFromOutput(text); - if (!cleanedText && (!mediaUrls || mediaUrls.length === 0)) + const { text: cleanedText, mediaUrls, audioAsVoice } = + splitMediaFromOutput(text); + if ( + !cleanedText && + (!mediaUrls || mediaUrls.length === 0) && + !audioAsVoice + ) continue; - replyItems.push({ text: cleanedText, media: mediaUrls }); + replyItems.push({ text: cleanedText, media: mediaUrls, audioAsVoice }); } + // Check if any replyItem has audioAsVoice tag - if so, apply to all media payloads + const hasAudioAsVoiceTag = replyItems.some( + (item) => item.audioAsVoice, + ); + const payloads = replyItems .map((item) => ({ text: item.text?.trim() ? item.text.trim() : undefined, mediaUrls: item.media?.length ? item.media : undefined, mediaUrl: item.media?.[0], isError: item.isError, + // Apply audioAsVoice to media payloads if tag was found anywhere in response + audioAsVoice: + item.audioAsVoice || (hasAudioAsVoiceTag && item.media?.length), })) .filter( (p) => diff --git a/src/agents/pi-embedded-subscribe.ts b/src/agents/pi-embedded-subscribe.ts index 4c407a483..6dc5957cd 100644 --- a/src/agents/pi-embedded-subscribe.ts +++ b/src/agents/pi-embedded-subscribe.ts @@ -262,6 +262,7 @@ export function subscribeEmbeddedPiSession(params: { onBlockReply?: (payload: { text?: string; mediaUrls?: string[]; + audioAsVoice?: boolean; }) => void | Promise; blockReplyBreak?: "text_end" | "message_end"; blockReplyChunking?: BlockReplyChunking; @@ -436,11 +437,13 @@ export function subscribeEmbeddedPiSession(params: { lastBlockReplyText = chunk; assistantTexts.push(chunk); if (!params.onBlockReply) return; - const { text: cleanedText, mediaUrls } = splitMediaFromOutput(chunk); - if (!cleanedText && (!mediaUrls || mediaUrls.length === 0)) return; + const { text: cleanedText, mediaUrls, audioAsVoice } = splitMediaFromOutput(chunk); + // Skip empty payloads, but always emit if audioAsVoice is set (to propagate the flag) + if (!cleanedText && (!mediaUrls || mediaUrls.length === 0) && !audioAsVoice) return; void params.onBlockReply({ text: cleanedText, mediaUrls: mediaUrls?.length ? mediaUrls : undefined, + audioAsVoice, }); }; @@ -859,12 +862,18 @@ export function subscribeEmbeddedPiSession(params: { ); } else { lastBlockReplyText = text; - const { text: cleanedText, mediaUrls } = + const { text: cleanedText, mediaUrls, audioAsVoice } = splitMediaFromOutput(text); - if (cleanedText || (mediaUrls && mediaUrls.length > 0)) { + // Emit if there's content OR audioAsVoice flag (to propagate the flag) + if ( + cleanedText || + (mediaUrls && mediaUrls.length > 0) || + audioAsVoice + ) { void onBlockReply({ text: cleanedText, mediaUrls: mediaUrls?.length ? mediaUrls : undefined, + audioAsVoice, }); } } diff --git a/src/auto-reply/reply/agent-runner.ts b/src/auto-reply/reply/agent-runner.ts index 56a04b5b2..1063dcdd7 100644 --- a/src/auto-reply/reply/agent-runner.ts +++ b/src/auto-reply/reply/agent-runner.ts @@ -260,6 +260,14 @@ export async function runReplyAgent(params: { const pendingToolTasks = new Set>(); const blockReplyTimeoutMs = opts?.blockReplyTimeoutMs ?? BLOCK_REPLY_SEND_TIMEOUT_MS; + + // Buffer audio blocks to apply [[audio_as_voice]] tag that may come later + const bufferedAudioBlocks: ReplyPayload[] = []; + let seenAudioAsVoice = false; + + const AUDIO_EXTENSIONS = /\.(opus|mp3|m4a|wav|ogg|aac|flac)$/i; + const hasAudioMedia = (urls?: string[]): boolean => + Boolean(urls?.some((u) => AUDIO_EXTENSIONS.test(u))); const replyToChannel = sessionCtx.OriginatingChannel ?? ((sessionCtx.Surface ?? sessionCtx.Provider)?.toLowerCase() as @@ -532,23 +540,34 @@ export async function runReplyAgent(params: { }, sessionCtx.MessageSid, ); - if (!isRenderablePayload(taggedPayload)) return; + // Let through payloads with audioAsVoice flag even if empty (need to track it) + if (!isRenderablePayload(taggedPayload) && !payload.audioAsVoice) + return; const audioTagResult = extractAudioTag(taggedPayload.text); const cleaned = audioTagResult.cleaned || undefined; const hasMedia = Boolean(taggedPayload.mediaUrl) || (taggedPayload.mediaUrls?.length ?? 0) > 0; - if (!cleaned && !hasMedia) return; + // Skip empty payloads unless they have audioAsVoice flag (need to track it) + if (!cleaned && !hasMedia && !payload.audioAsVoice) return; if ( isSilentReplyText(cleaned, SILENT_REPLY_TOKEN) && !hasMedia ) return; + + // Track if we've seen [[audio_as_voice]] from payload or text extraction + if (payload.audioAsVoice || audioTagResult.audioAsVoice) { + seenAudioAsVoice = true; + } + const blockPayload: ReplyPayload = applyReplyToMode({ ...taggedPayload, text: cleaned, - audioAsVoice: audioTagResult.audioAsVoice, + audioAsVoice: + audioTagResult.audioAsVoice || payload.audioAsVoice, }); + void typingSignals .signalTextDelta(taggedPayload.text) .catch((err) => { @@ -556,6 +575,14 @@ export async function runReplyAgent(params: { `block reply typing signal failed: ${String(err)}`, ); }); + + // Buffer audio blocks to apply [[audio_as_voice]] that may come later + const isAudioBlock = hasAudioMedia(taggedPayload.mediaUrls); + if (isAudioBlock) { + bufferedAudioBlocks.push(blockPayload); + return; // Don't send immediately - wait for potential [[audio_as_voice]] tag + } + blockReplyPipeline?.enqueue(blockPayload); } : undefined, @@ -670,6 +697,17 @@ export async function runReplyAgent(params: { } const payloadArray = runResult.payloads ?? []; + + if (bufferedAudioBlocks.length > 0 && blockReplyPipeline) { + for (const audioPayload of bufferedAudioBlocks) { + const finalPayload = seenAudioAsVoice + ? { ...audioPayload, audioAsVoice: true } + : audioPayload; + blockReplyPipeline.enqueue(finalPayload); + } + bufferedAudioBlocks.length = 0; + } + if (blockReplyPipeline) { await blockReplyPipeline.flush({ force: true }); blockReplyPipeline.stop(); @@ -677,6 +715,7 @@ export async function runReplyAgent(params: { if (pendingToolTasks.size > 0) { await Promise.allSettled(pendingToolTasks); } + // Drain any late tool/block deliveries before deciding there's "nothing to send". // Otherwise, a late typing trigger (e.g. from a tool callback) can outlive the run and // keep the typing indicator stuck. diff --git a/src/media/parse.ts b/src/media/parse.ts index 2a01156c4..08f6d492e 100644 --- a/src/media/parse.ts +++ b/src/media/parse.ts @@ -1,5 +1,7 @@ // Shared helpers for parsing MEDIA tokens from command/stdout text. +import { parseFenceSpans } from "../markdown/fences.js"; + // Allow optional wrapping backticks and punctuation after the token; capture the core token. export const MEDIA_TOKEN_RE = /\bMEDIA:\s*`?([^\n]+)`?/gi; @@ -22,10 +24,22 @@ function isValidMedia(candidate: string) { ); } +// Check if a character offset is inside any fenced code block +function isInsideFence( + fenceSpans: Array<{ start: number; end: number }>, + offset: number, +): boolean { + return fenceSpans.some((span) => offset >= span.start && offset < span.end); +} + +// Regex to detect [[audio_as_voice]] tag +const AUDIO_AS_VOICE_RE = /\[\[audio_as_voice\]\]/gi; + export function splitMediaFromOutput(raw: string): { text: string; mediaUrls?: string[]; mediaUrl?: string; // legacy first item for backward compatibility + audioAsVoice?: boolean; // true if [[audio_as_voice]] tag was found } { // KNOWN: Leading whitespace is semantically meaningful in Markdown (lists, indented fences). // We only trim the end; token cleanup below handles removing `MEDIA:` lines. @@ -35,14 +49,26 @@ export function splitMediaFromOutput(raw: string): { const media: string[] = []; let foundMediaToken = false; + // Parse fenced code blocks to avoid extracting MEDIA tokens from inside them + const fenceSpans = parseFenceSpans(trimmedRaw); + // Collect tokens line by line so we can strip them cleanly. const lines = trimmedRaw.split("\n"); const keptLines: string[] = []; + let lineOffset = 0; // Track character offset for fence checking for (const line of lines) { + // Skip MEDIA extraction if this line is inside a fenced code block + if (isInsideFence(fenceSpans, lineOffset)) { + keptLines.push(line); + lineOffset += line.length + 1; // +1 for newline + continue; + } + const matches = Array.from(line.matchAll(MEDIA_TOKEN_RE)); if (matches.length === 0) { keptLines.push(line); + lineOffset += line.length + 1; // +1 for newline continue; } @@ -86,18 +112,39 @@ export function splitMediaFromOutput(raw: string): { if (cleanedLine) { keptLines.push(cleanedLine); } + lineOffset += line.length + 1; // +1 for newline } - const cleanedText = keptLines + let cleanedText = keptLines .join("\n") .replace(/[ \t]+\n/g, "\n") .replace(/[ \t]{2,}/g, " ") .replace(/\n{2,}/g, "\n") .trim(); - if (media.length === 0) { - return { text: foundMediaToken ? cleanedText : trimmedRaw }; + // Detect and strip [[audio_as_voice]] tag + const hasAudioAsVoice = AUDIO_AS_VOICE_RE.test(cleanedText); + if (hasAudioAsVoice) { + cleanedText = cleanedText + .replace(AUDIO_AS_VOICE_RE, "") + .replace(/[ \t]+/g, " ") + .replace(/\n{2,}/g, "\n") + .trim(); } - return { text: cleanedText, mediaUrls: media, mediaUrl: media[0] }; + if (media.length === 0) { + const result: ReturnType = { + // Return cleaned text if we found a media token OR audio tag, otherwise original + text: (foundMediaToken || hasAudioAsVoice) ? cleanedText : trimmedRaw, + }; + if (hasAudioAsVoice) result.audioAsVoice = true; + return result; + } + + return { + text: cleanedText, + mediaUrls: media, + mediaUrl: media[0], + ...(hasAudioAsVoice ? { audioAsVoice: true } : {}), + }; } diff --git a/src/telegram/bot.ts b/src/telegram/bot.ts index aaba18307..80404a185 100644 --- a/src/telegram/bot.ts +++ b/src/telegram/bot.ts @@ -1388,6 +1388,7 @@ async function deliverReplies(params: { }); } else if (kind === "audio") { const useVoice = reply.audioAsVoice === true; // default false (backward compatible) + log.warn(`[DEBUG] Audio media: audioAsVoice=${reply.audioAsVoice}, useVoice=${useVoice}`); if (useVoice) { // Voice message - displays as round playable bubble (opt-in via [[audio_as_voice]]) await bot.api.sendVoice(chatId, file, { From 2f036f7173e1b3e29a39841df9bbaee7285213e6 Mon Sep 17 00:00:00 2001 From: Jarvis Date: Thu, 8 Jan 2026 13:26:06 +0000 Subject: [PATCH 2/7] fix(audio): preserve audioAsVoice propagation - Keep audioAsVoice-only payloads from being filtered out - Allow empty payloads through when they carry the flag - Remove temporary debug logs around audioAsVoice buffering Co-authored-by: Manuel Hettich <17690367+ManuelHettich@users.noreply.github.com> --- src/agents/pi-embedded-subscribe.ts | 3 ++- src/telegram/bot.ts | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agents/pi-embedded-subscribe.ts b/src/agents/pi-embedded-subscribe.ts index 6dc5957cd..8511911d8 100644 --- a/src/agents/pi-embedded-subscribe.ts +++ b/src/agents/pi-embedded-subscribe.ts @@ -437,7 +437,8 @@ export function subscribeEmbeddedPiSession(params: { lastBlockReplyText = chunk; assistantTexts.push(chunk); if (!params.onBlockReply) return; - const { text: cleanedText, mediaUrls, audioAsVoice } = splitMediaFromOutput(chunk); + const splitResult = splitMediaFromOutput(chunk); + const { text: cleanedText, mediaUrls, audioAsVoice } = splitResult; // Skip empty payloads, but always emit if audioAsVoice is set (to propagate the flag) if (!cleanedText && (!mediaUrls || mediaUrls.length === 0) && !audioAsVoice) return; void params.onBlockReply({ diff --git a/src/telegram/bot.ts b/src/telegram/bot.ts index 80404a185..aaba18307 100644 --- a/src/telegram/bot.ts +++ b/src/telegram/bot.ts @@ -1388,7 +1388,6 @@ async function deliverReplies(params: { }); } else if (kind === "audio") { const useVoice = reply.audioAsVoice === true; // default false (backward compatible) - log.warn(`[DEBUG] Audio media: audioAsVoice=${reply.audioAsVoice}, useVoice=${useVoice}`); if (useVoice) { // Voice message - displays as round playable bubble (opt-in via [[audio_as_voice]]) await bot.api.sendVoice(chatId, file, { From ce786762db2ba4d8ca8a3fd81a04cf7729819aed Mon Sep 17 00:00:00 2001 From: Jarvis Date: Thu, 8 Jan 2026 13:55:36 +0000 Subject: [PATCH 3/7] fix(telegram): guard voice note sends --- src/telegram/bot.ts | 15 ++++++++++++++- src/telegram/send.test.ts | 34 ++++++++++++++++++++++++++++++++++ src/telegram/send.ts | 13 ++++++++++++- src/telegram/voice.ts | 15 +++++++++++++++ 4 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 src/telegram/voice.ts diff --git a/src/telegram/bot.ts b/src/telegram/bot.ts index aaba18307..4ece77cc8 100644 --- a/src/telegram/bot.ts +++ b/src/telegram/bot.ts @@ -60,6 +60,7 @@ import { resolveTelegramAccount } from "./accounts.js"; import { createTelegramDraftStream } from "./draft-stream.js"; import { resolveTelegramFetch } from "./fetch.js"; import { markdownToTelegramHtml } from "./format.js"; +import { isTelegramVoiceCompatible } from "./voice.js"; import { readTelegramAllowFromStore, upsertTelegramPairingRequest, @@ -1387,7 +1388,19 @@ async function deliverReplies(params: { ...mediaParams, }); } else if (kind === "audio") { - const useVoice = reply.audioAsVoice === true; // default false (backward compatible) + const wantsVoice = reply.audioAsVoice === true; // default false (backward compatible) + const canVoice = wantsVoice + ? isTelegramVoiceCompatible({ + contentType: media.contentType, + fileName, + }) + : false; + const useVoice = wantsVoice && canVoice; + if (wantsVoice && !canVoice) { + logVerbose( + `Telegram voice requested but media is ${media.contentType ?? "unknown"} (${fileName}); sending as audio file instead.`, + ); + } if (useVoice) { // Voice message - displays as round playable bubble (opt-in via [[audio_as_voice]]) await bot.api.sendVoice(chatId, file, { diff --git a/src/telegram/send.test.ts b/src/telegram/send.test.ts index 115f88851..d2172d65d 100644 --- a/src/telegram/send.test.ts +++ b/src/telegram/send.test.ts @@ -324,6 +324,40 @@ describe("sendMessageTelegram", () => { expect(sendAudio).not.toHaveBeenCalled(); }); + it("falls back to audio when asVoice is true but media is not voice compatible", async () => { + const chatId = "123"; + const sendAudio = vi.fn().mockResolvedValue({ + message_id: 14, + chat: { id: chatId }, + }); + const sendVoice = vi.fn().mockResolvedValue({ + message_id: 15, + chat: { id: chatId }, + }); + const api = { sendAudio, sendVoice } as unknown as { + sendAudio: typeof sendAudio; + sendVoice: typeof sendVoice; + }; + + loadWebMedia.mockResolvedValueOnce({ + buffer: Buffer.from("audio"), + contentType: "audio/mpeg", + fileName: "clip.mp3", + }); + + await sendMessageTelegram(chatId, "caption", { + token: "tok", + api, + mediaUrl: "https://example.com/clip.mp3", + asVoice: true, + }); + + expect(sendAudio).toHaveBeenCalledWith(chatId, expect.anything(), { + caption: "caption", + }); + expect(sendVoice).not.toHaveBeenCalled(); + }); + it("includes message_thread_id for forum topic messages", async () => { const chatId = "-1001234567890"; const sendMessage = vi.fn().mockResolvedValue({ diff --git a/src/telegram/send.ts b/src/telegram/send.ts index 8799e32f7..b293f7844 100644 --- a/src/telegram/send.ts +++ b/src/telegram/send.ts @@ -10,6 +10,7 @@ import { formatErrorMessage } from "../infra/errors.js"; import { recordProviderActivity } from "../infra/provider-activity.js"; import type { RetryConfig } from "../infra/retry.js"; import { createTelegramRetryRunner } from "../infra/retry-policy.js"; +import { logVerbose } from "../globals.js"; import { mediaKindFromMime } from "../media/constants.js"; import { isGifMedia } from "../media/mime.js"; import { loadWebMedia } from "../web/media.js"; @@ -20,6 +21,7 @@ import { parseTelegramTarget, stripTelegramInternalPrefixes, } from "./targets.js"; +import { resolveTelegramVoiceDecision } from "./voice.js"; type TelegramSendOpts = { token?: string; @@ -237,7 +239,16 @@ export async function sendMessageTelegram( throw wrapChatNotFound(err); }); } else if (kind === "audio") { - const useVoice = opts.asVoice === true; // default false (backward compatible) + const { useVoice, reason } = resolveTelegramVoiceDecision({ + wantsVoice: opts.asVoice === true, // default false (backward compatible) + contentType: media.contentType, + fileName, + }); + if (reason) { + logVerbose( + `Telegram voice requested but ${reason}; sending as audio file instead.`, + ); + } if (useVoice) { result = await request( () => api.sendVoice(chatId, file, mediaParams), diff --git a/src/telegram/voice.ts b/src/telegram/voice.ts new file mode 100644 index 000000000..623bc58ef --- /dev/null +++ b/src/telegram/voice.ts @@ -0,0 +1,15 @@ +import path from "node:path"; + +export function isTelegramVoiceCompatible(opts: { + contentType?: string | null; + fileName?: string | null; +}): boolean { + const mime = opts.contentType?.toLowerCase(); + if (mime && (mime.includes("ogg") || mime.includes("opus"))) { + return true; + } + const fileName = opts.fileName?.trim(); + if (!fileName) return false; + const ext = path.extname(fileName).toLowerCase(); + return ext === ".ogg" || ext === ".opus" || ext === ".oga"; +} From 9a7f05056838da8ee9f4560898e1f9333761ac57 Mon Sep 17 00:00:00 2001 From: Jarvis Date: Thu, 8 Jan 2026 14:00:55 +0000 Subject: [PATCH 4/7] refactor(telegram): centralize voice decisions - Share voice compatibility decision logic across send + bot flows - Keep voice fallback logging consistent - Simplify voice handling in the audio send path --- src/telegram/bot.ts | 19 ++++++++----------- src/telegram/voice.ts | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/telegram/bot.ts b/src/telegram/bot.ts index 4ece77cc8..75d339c41 100644 --- a/src/telegram/bot.ts +++ b/src/telegram/bot.ts @@ -60,7 +60,7 @@ import { resolveTelegramAccount } from "./accounts.js"; import { createTelegramDraftStream } from "./draft-stream.js"; import { resolveTelegramFetch } from "./fetch.js"; import { markdownToTelegramHtml } from "./format.js"; -import { isTelegramVoiceCompatible } from "./voice.js"; +import { resolveTelegramVoiceDecision } from "./voice.js"; import { readTelegramAllowFromStore, upsertTelegramPairingRequest, @@ -1388,17 +1388,14 @@ async function deliverReplies(params: { ...mediaParams, }); } else if (kind === "audio") { - const wantsVoice = reply.audioAsVoice === true; // default false (backward compatible) - const canVoice = wantsVoice - ? isTelegramVoiceCompatible({ - contentType: media.contentType, - fileName, - }) - : false; - const useVoice = wantsVoice && canVoice; - if (wantsVoice && !canVoice) { + const { useVoice, reason } = resolveTelegramVoiceDecision({ + wantsVoice: reply.audioAsVoice === true, // default false (backward compatible) + contentType: media.contentType, + fileName, + }); + if (reason) { logVerbose( - `Telegram voice requested but media is ${media.contentType ?? "unknown"} (${fileName}); sending as audio file instead.`, + `Telegram voice requested but ${reason}; sending as audio file instead.`, ); } if (useVoice) { diff --git a/src/telegram/voice.ts b/src/telegram/voice.ts index 623bc58ef..d1527a4ce 100644 --- a/src/telegram/voice.ts +++ b/src/telegram/voice.ts @@ -13,3 +13,18 @@ export function isTelegramVoiceCompatible(opts: { const ext = path.extname(fileName).toLowerCase(); return ext === ".ogg" || ext === ".opus" || ext === ".oga"; } + +export function resolveTelegramVoiceDecision(opts: { + wantsVoice: boolean; + contentType?: string | null; + fileName?: string | null; +}): { useVoice: boolean; reason?: string } { + if (!opts.wantsVoice) return { useVoice: false }; + if (isTelegramVoiceCompatible(opts)) return { useVoice: true }; + const contentType = opts.contentType ?? "unknown"; + const fileName = opts.fileName ?? "unknown"; + return { + useVoice: false, + reason: `media is ${contentType} (${fileName})`, + }; +} From 8c7d1781bcdf89065a001115cac64e70cf8b1442 Mon Sep 17 00:00:00 2001 From: Jarvis Date: Thu, 8 Jan 2026 14:21:50 +0000 Subject: [PATCH 5/7] fix(pi): preserve audio_as_voice tag - Allow flag-only chunks so audio_as_voice propagates - Keep reply item scan aware of empty audio tag blocks Co-authored-by: Manuel Hettich <17690367+ManuelHettich@users.noreply.github.com> --- src/agents/pi-embedded-runner.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/agents/pi-embedded-runner.ts b/src/agents/pi-embedded-runner.ts index 5ef32ea65..1860c9b6d 100644 --- a/src/agents/pi-embedded-runner.ts +++ b/src/agents/pi-embedded-runner.ts @@ -1695,7 +1695,6 @@ export async function runEmbeddedPiAgent(params: { const hasAudioAsVoiceTag = replyItems.some( (item) => item.audioAsVoice, ); - const payloads = replyItems .map((item) => ({ text: item.text?.trim() ? item.text.trim() : undefined, From 5fedfd8d159a0ff5a114204ccd7639090ebe051c Mon Sep 17 00:00:00 2001 From: Jarvis Date: Thu, 8 Jan 2026 14:26:54 +0000 Subject: [PATCH 6/7] chore: format audioAsVoice updates Co-authored-by: Manuel Hettich <17690367+ManuelHettich@users.noreply.github.com> --- src/agents/pi-embedded-runner.ts | 26 ++++++++++++++++++++------ src/agents/pi-embedded-subscribe.ts | 10 +++++++--- src/auto-reply/reply/agent-runner.ts | 5 ++++- src/media/parse.ts | 2 +- src/telegram/bot.ts | 2 +- src/telegram/send.ts | 2 +- 6 files changed, 34 insertions(+), 13 deletions(-) diff --git a/src/agents/pi-embedded-runner.ts b/src/agents/pi-embedded-runner.ts index 1860c9b6d..e36b58e63 100644 --- a/src/agents/pi-embedded-runner.ts +++ b/src/agents/pi-embedded-runner.ts @@ -1658,10 +1658,17 @@ export async function runEmbeddedPiAgent(params: { if (inlineToolResults) { for (const { toolName, meta } of toolMetas) { const agg = formatToolAggregate(toolName, meta ? [meta] : []); - const { text: cleanedText, mediaUrls, audioAsVoice } = - splitMediaFromOutput(agg); + const { + text: cleanedText, + mediaUrls, + audioAsVoice, + } = splitMediaFromOutput(agg); if (cleanedText) - replyItems.push({ text: cleanedText, media: mediaUrls, audioAsVoice }); + replyItems.push({ + text: cleanedText, + media: mediaUrls, + audioAsVoice, + }); } } @@ -1680,15 +1687,22 @@ export async function runEmbeddedPiAgent(params: { ? [fallbackAnswerText] : []; for (const text of answerTexts) { - const { text: cleanedText, mediaUrls, audioAsVoice } = - splitMediaFromOutput(text); + const { + text: cleanedText, + mediaUrls, + audioAsVoice, + } = splitMediaFromOutput(text); if ( !cleanedText && (!mediaUrls || mediaUrls.length === 0) && !audioAsVoice ) continue; - replyItems.push({ text: cleanedText, media: mediaUrls, audioAsVoice }); + replyItems.push({ + text: cleanedText, + media: mediaUrls, + audioAsVoice, + }); } // Check if any replyItem has audioAsVoice tag - if so, apply to all media payloads diff --git a/src/agents/pi-embedded-subscribe.ts b/src/agents/pi-embedded-subscribe.ts index 8511911d8..001d579f1 100644 --- a/src/agents/pi-embedded-subscribe.ts +++ b/src/agents/pi-embedded-subscribe.ts @@ -440,7 +440,8 @@ export function subscribeEmbeddedPiSession(params: { const splitResult = splitMediaFromOutput(chunk); const { text: cleanedText, mediaUrls, audioAsVoice } = splitResult; // Skip empty payloads, but always emit if audioAsVoice is set (to propagate the flag) - if (!cleanedText && (!mediaUrls || mediaUrls.length === 0) && !audioAsVoice) return; + if (!cleanedText && (!mediaUrls || mediaUrls.length === 0) && !audioAsVoice) + return; void params.onBlockReply({ text: cleanedText, mediaUrls: mediaUrls?.length ? mediaUrls : undefined, @@ -863,8 +864,11 @@ export function subscribeEmbeddedPiSession(params: { ); } else { lastBlockReplyText = text; - const { text: cleanedText, mediaUrls, audioAsVoice } = - splitMediaFromOutput(text); + const { + text: cleanedText, + mediaUrls, + audioAsVoice, + } = splitMediaFromOutput(text); // Emit if there's content OR audioAsVoice flag (to propagate the flag) if ( cleanedText || diff --git a/src/auto-reply/reply/agent-runner.ts b/src/auto-reply/reply/agent-runner.ts index 1063dcdd7..c4878690d 100644 --- a/src/auto-reply/reply/agent-runner.ts +++ b/src/auto-reply/reply/agent-runner.ts @@ -541,7 +541,10 @@ export async function runReplyAgent(params: { sessionCtx.MessageSid, ); // Let through payloads with audioAsVoice flag even if empty (need to track it) - if (!isRenderablePayload(taggedPayload) && !payload.audioAsVoice) + if ( + !isRenderablePayload(taggedPayload) && + !payload.audioAsVoice + ) return; const audioTagResult = extractAudioTag(taggedPayload.text); const cleaned = audioTagResult.cleaned || undefined; diff --git a/src/media/parse.ts b/src/media/parse.ts index 08f6d492e..77b4bd9f9 100644 --- a/src/media/parse.ts +++ b/src/media/parse.ts @@ -135,7 +135,7 @@ export function splitMediaFromOutput(raw: string): { if (media.length === 0) { const result: ReturnType = { // Return cleaned text if we found a media token OR audio tag, otherwise original - text: (foundMediaToken || hasAudioAsVoice) ? cleanedText : trimmedRaw, + text: foundMediaToken || hasAudioAsVoice ? cleanedText : trimmedRaw, }; if (hasAudioAsVoice) result.audioAsVoice = true; return result; diff --git a/src/telegram/bot.ts b/src/telegram/bot.ts index 75d339c41..8ec6756b4 100644 --- a/src/telegram/bot.ts +++ b/src/telegram/bot.ts @@ -60,11 +60,11 @@ import { resolveTelegramAccount } from "./accounts.js"; import { createTelegramDraftStream } from "./draft-stream.js"; import { resolveTelegramFetch } from "./fetch.js"; import { markdownToTelegramHtml } from "./format.js"; -import { resolveTelegramVoiceDecision } from "./voice.js"; import { readTelegramAllowFromStore, upsertTelegramPairingRequest, } from "./pairing-store.js"; +import { resolveTelegramVoiceDecision } from "./voice.js"; const PARSE_ERR_RE = /can't parse entities|parse entities|find end of the entity/i; diff --git a/src/telegram/send.ts b/src/telegram/send.ts index b293f7844..7aed6ee31 100644 --- a/src/telegram/send.ts +++ b/src/telegram/send.ts @@ -6,11 +6,11 @@ import type { } from "@grammyjs/types"; import { type ApiClientOptions, Bot, InputFile } from "grammy"; import { loadConfig } from "../config/config.js"; +import { logVerbose } from "../globals.js"; import { formatErrorMessage } from "../infra/errors.js"; import { recordProviderActivity } from "../infra/provider-activity.js"; import type { RetryConfig } from "../infra/retry.js"; import { createTelegramRetryRunner } from "../infra/retry-policy.js"; -import { logVerbose } from "../globals.js"; import { mediaKindFromMime } from "../media/constants.js"; import { isGifMedia } from "../media/mime.js"; import { loadWebMedia } from "../web/media.js"; From c56b2f4bc166c09f40cb07a070ed77811ac05d7c Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 10 Jan 2026 01:50:33 +0100 Subject: [PATCH 7/7] fix: honor audio_as_voice streaming + parse tests (#490) (thanks @jarvis-medmatic) --- CHANGELOG.md | 1 + src/agents/pi-embedded-runner.ts | 1 + src/agents/pi-tools.ts | 4 +++- src/auto-reply/reply/abort.ts | 3 ++- .../reply/dispatch-from-config.test.ts | 5 ++++- src/media/parse.test.ts | 19 +++++++++++++++++++ src/media/parse.ts | 3 ++- 7 files changed, 32 insertions(+), 4 deletions(-) create mode 100644 src/media/parse.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 5fdb94ec9..17d774c8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,6 +42,7 @@ - Commands: harden slash command registry and list text-only commands in `/commands`. - Models/Auth: show per-agent auth candidates in `/model status`, and add `clawdbot models auth order {get,set,clear}` (per-agent auth rotation overrides). — thanks @steipete - Telegram: keep streamMode draft-only; avoid forcing block streaming. (#619) — thanks @rubyrunsstuff +- Telegram: add `[[audio_as_voice]]` tag support for voice notes with streaming-safe delivery. (#490) — thanks @jarvis-medmatic - Debugging: add raw model stream logging flags and document gateway watch mode. - Gateway: decode dns-sd escaped UTF-8 in discovery output and show scan progress immediately. — thanks @steipete - Agent: add claude-cli/opus-4.5 runner via Claude CLI with resume support (tools disabled). diff --git a/src/agents/pi-embedded-runner.ts b/src/agents/pi-embedded-runner.ts index e36b58e63..b65e4cd3f 100644 --- a/src/agents/pi-embedded-runner.ts +++ b/src/agents/pi-embedded-runner.ts @@ -776,6 +776,7 @@ export async function compactEmbeddedPiSession(params: { const enqueueGlobal = params.enqueue ?? ((task, opts) => enqueueCommandInLane(globalLane, task, opts)); + const runAbortController = new AbortController(); return enqueueCommandInLane(sessionLane, () => enqueueGlobal(async () => { const resolvedWorkspace = resolveUserPath(params.workspaceDir); diff --git a/src/agents/pi-tools.ts b/src/agents/pi-tools.ts index b330506dc..e68b530d7 100644 --- a/src/agents/pi-tools.ts +++ b/src/agents/pi-tools.ts @@ -651,7 +651,9 @@ export function createClawdbotCodingTools(options?: { // Without this, some providers (notably OpenAI) will reject root-level union schemas. const normalized = subagentFiltered.map(normalizeToolParameters); const withAbort = options?.abortSignal - ? normalized.map((tool) => wrapToolWithAbortSignal(tool, options.abortSignal)) + ? normalized.map((tool) => + wrapToolWithAbortSignal(tool, options.abortSignal), + ) : normalized; // Anthropic blocks specific lowercase tool names (bash, read, write, edit) with OAuth tokens. diff --git a/src/auto-reply/reply/abort.ts b/src/auto-reply/reply/abort.ts index 7f9fefa05..690b266d9 100644 --- a/src/auto-reply/reply/abort.ts +++ b/src/auto-reply/reply/abort.ts @@ -3,6 +3,7 @@ import type { ClawdbotConfig } from "../../config/config.js"; import { loadSessionStore, resolveStorePath, + type SessionEntry, saveSessionStore, } from "../../config/sessions.js"; import { @@ -35,7 +36,7 @@ export function setAbortMemory(key: string, value: boolean): void { } function resolveSessionEntryForKey( - store: Record | undefined, + store: Record | undefined, sessionKey: string | undefined, ) { if (!store || !sessionKey) return {}; diff --git a/src/auto-reply/reply/dispatch-from-config.test.ts b/src/auto-reply/reply/dispatch-from-config.test.ts index 557e804d1..26bb863cb 100644 --- a/src/auto-reply/reply/dispatch-from-config.test.ts +++ b/src/auto-reply/reply/dispatch-from-config.test.ts @@ -7,7 +7,10 @@ import type { ReplyDispatcher } from "./reply-dispatcher.js"; const mocks = vi.hoisted(() => ({ routeReply: vi.fn(async () => ({ ok: true, messageId: "mock" })), - tryFastAbortFromMessage: vi.fn(async () => ({ handled: false, aborted: false })), + tryFastAbortFromMessage: vi.fn(async () => ({ + handled: false, + aborted: false, + })), })); vi.mock("./route-reply.js", () => ({ diff --git a/src/media/parse.test.ts b/src/media/parse.test.ts new file mode 100644 index 000000000..f910d851c --- /dev/null +++ b/src/media/parse.test.ts @@ -0,0 +1,19 @@ +import { describe, expect, it } from "vitest"; + +import { splitMediaFromOutput } from "./parse.js"; + +describe("splitMediaFromOutput", () => { + it("detects audio_as_voice tag and strips it", () => { + const result = splitMediaFromOutput("Hello [[audio_as_voice]] world"); + expect(result.audioAsVoice).toBe(true); + expect(result.text).toBe("Hello world"); + }); + + it("keeps audio_as_voice detection stable across calls", () => { + const input = "Hello [[audio_as_voice]]"; + const first = splitMediaFromOutput(input); + const second = splitMediaFromOutput(input); + expect(first.audioAsVoice).toBe(true); + expect(second.audioAsVoice).toBe(true); + }); +}); diff --git a/src/media/parse.ts b/src/media/parse.ts index 77b4bd9f9..38f751992 100644 --- a/src/media/parse.ts +++ b/src/media/parse.ts @@ -34,6 +34,7 @@ function isInsideFence( // Regex to detect [[audio_as_voice]] tag const AUDIO_AS_VOICE_RE = /\[\[audio_as_voice\]\]/gi; +const AUDIO_AS_VOICE_TEST_RE = /\[\[audio_as_voice\]\]/i; export function splitMediaFromOutput(raw: string): { text: string; @@ -123,7 +124,7 @@ export function splitMediaFromOutput(raw: string): { .trim(); // Detect and strip [[audio_as_voice]] tag - const hasAudioAsVoice = AUDIO_AS_VOICE_RE.test(cleanedText); + const hasAudioAsVoice = AUDIO_AS_VOICE_TEST_RE.test(cleanedText); if (hasAudioAsVoice) { cleanedText = cleanedText .replace(AUDIO_AS_VOICE_RE, "")