feat(telegram): buffer audio blocks for [[audio_as_voice]] tag support

- Add [[audio_as_voice]] detection to splitMediaFromOutput()
- Pass audioAsVoice through the onBlockReply callback chain
- Buffer audio blocks during streaming and flush them at the end with the correct flag
- Non-audio media still streams immediately
- Fix: emit payloads with the audioAsVoice flag even if the text is empty

Co-authored-by: Manuel Hettich <17690367+ManuelHettich@users.noreply.github.com>
2026-01-08 12:40:31 +00:00
parent 60bd65dfac
commit 05a99aa49b
5 changed files with 127 additions and 16 deletions
--- a/src/media/parse.ts
+++ b/src/media/parse.ts
@@ -1,5 +1,7 @@
 // Shared helpers for parsing MEDIA tokens from command/stdout text.

+import { parseFenceSpans } from "../markdown/fences.js";
+
 // Allow optional wrapping backticks and punctuation after the token; capture the core token.
 export const MEDIA_TOKEN_RE = /\bMEDIA:\s*`?([^\n]+)`?/gi;

@@ -22,10 +24,22 @@ function isValidMedia(candidate: string) {
  );
 }

+// Check whether a character offset falls inside any fenced code block span (start inclusive, end exclusive).
+function isInsideFence(
+  fenceSpans: Array<{ start: number; end: number }>,
+  offset: number,
+): boolean {
+  return fenceSpans.some((span) => offset >= span.start && offset < span.end);
+}
+
+// Matches every [[audio_as_voice]] tag, case-insensitively; NOTE: the `g` flag makes .test()/.exec() stateful via lastIndex.
+const AUDIO_AS_VOICE_RE = /\[\[audio_as_voice\]\]/gi;
+
 export function splitMediaFromOutput(raw: string): {
  text: string;
  mediaUrls?: string[];
  mediaUrl?: string; // legacy first item for backward compatibility
+  audioAsVoice?: boolean; // true if [[audio_as_voice]] tag was found
 } {
  // KNOWN: Leading whitespace is semantically meaningful in Markdown (lists, indented fences).
  // We only trim the end; token cleanup below handles removing `MEDIA:` lines.
@@ -35,14 +49,26 @@ export function splitMediaFromOutput(raw: string): {
  const media: string[] = [];
  let foundMediaToken = false;

+  // Parse fenced code blocks to avoid extracting MEDIA tokens from inside them
+  const fenceSpans = parseFenceSpans(trimmedRaw);
+
  // Collect tokens line by line so we can strip them cleanly.
  const lines = trimmedRaw.split("\n");
  const keptLines: string[] = [];

+  let lineOffset = 0; // Track character offset for fence checking
  for (const line of lines) {
+    // Skip MEDIA extraction if this line is inside a fenced code block
+    if (isInsideFence(fenceSpans, lineOffset)) {
+      keptLines.push(line);
+      lineOffset += line.length + 1; // +1 for newline
+      continue;
+    }
+
    const matches = Array.from(line.matchAll(MEDIA_TOKEN_RE));
    if (matches.length === 0) {
      keptLines.push(line);
+      lineOffset += line.length + 1; // +1 for newline
      continue;
    }

@@ -86,18 +112,39 @@ export function splitMediaFromOutput(raw: string): {
    if (cleanedLine) {
      keptLines.push(cleanedLine);
    }
+    lineOffset += line.length + 1; // +1 for newline
  }

-  const cleanedText = keptLines
+  let cleanedText = keptLines
    .join("\n")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/[ \t]{2,}/g, " ")
    .replace(/\n{2,}/g, "\n")
    .trim();

-  if (media.length === 0) {
-    return { text: foundMediaToken ? cleanedText : trimmedRaw };
+  // Detect and strip [[audio_as_voice]] tag
+  const hasAudioAsVoice = AUDIO_AS_VOICE_RE.test(cleanedText);
+  if (hasAudioAsVoice) {
+    cleanedText = cleanedText
+      .replace(AUDIO_AS_VOICE_RE, "")
+      .replace(/[ \t]+/g, " ")
+      .replace(/\n{2,}/g, "\n")
+      .trim();
  }

-  return { text: cleanedText, mediaUrls: media, mediaUrl: media[0] };
+  if (media.length === 0) {
+    const result: ReturnType<typeof splitMediaFromOutput> = {
+      // Return cleaned text if we found a media token OR audio tag, otherwise original
+      text: (foundMediaToken || hasAudioAsVoice) ? cleanedText : trimmedRaw,
+    };
+    if (hasAudioAsVoice) result.audioAsVoice = true;
+    return result;
+  }
+
+  return {
+    text: cleanedText,
+    mediaUrls: media,
+    mediaUrl: media[0],
+    ...(hasAudioAsVoice ? { audioAsVoice: true } : {}),
+  };
 }