refactor: consolidate reply/media helpers

2026-01-10 02:40:41 +01:00
parent 9cd2662a86
commit 4075895c4c
17 changed files with 437 additions and 277 deletions
--- a/src/auto-reply/reply/agent-runner.ts
+++ b/src/auto-reply/reply/agent-runner.ts
@@ -22,6 +22,7 @@ import {
  emitAgentEvent,
  registerAgentRunContext,
 } from "../../infra/agent-events.js";
 import { isAudioFileName } from "../../media/mime.js";
 import { defaultRuntime } from "../../runtime.js";
 import {
  estimateUsageCost,
@@ -34,8 +35,11 @@ import type { OriginatingChannelType, TemplateContext } from "../templating.js";
 import { normalizeVerboseLevel, type VerboseLevel } from "../thinking.js";
 import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js";
 import type { GetReplyOptions, ReplyPayload } from "../types.js";
-import { extractAudioTag } from "./audio-tags.js";
+import { parseAudioTag } from "./audio-tags.js";
-import { createBlockReplyPipeline } from "./block-reply-pipeline.js";
+import {
  createAudioAsVoiceBuffer,
  createBlockReplyPipeline,
 } from "./block-reply-pipeline.js";
 import { resolveBlockStreamingCoalescing } from "./block-streaming.js";
 import { createFollowupRunner } from "./followup-runner.js";
 import {
@@ -261,13 +265,12 @@ export async function runReplyAgent(params: {
  const blockReplyTimeoutMs =
    opts?.blockReplyTimeoutMs ?? BLOCK_REPLY_SEND_TIMEOUT_MS;
  // Buffer audio blocks to apply [[audio_as_voice]] tag that may come later
  const bufferedAudioBlocks: ReplyPayload[] = [];
  let seenAudioAsVoice = false;
  const AUDIO_EXTENSIONS = /\.(opus|mp3|m4a|wav|ogg|aac|flac)$/i;
  const hasAudioMedia = (urls?: string[]): boolean =>
-    Boolean(urls?.some((u) => AUDIO_EXTENSIONS.test(u)));
+    Boolean(urls?.some((u) => isAudioFileName(u)));
  const isAudioPayload = (payload: ReplyPayload) =>
    hasAudioMedia(
      payload.mediaUrls ?? (payload.mediaUrl ? [payload.mediaUrl] : undefined),
    );
  const replyToChannel =
    sessionCtx.OriginatingChannel ??
    ((sessionCtx.Surface ?? sessionCtx.Provider)?.toLowerCase() as
@@ -297,6 +300,7 @@ export async function runReplyAgent(params: {
          onBlockReply: opts.onBlockReply,
          timeoutMs: blockReplyTimeoutMs,
          coalescing: blockReplyCoalescing,
          buffer: createAudioAsVoiceBuffer({ isAudioPayload }),
        })
      : null;
@@ -546,8 +550,8 @@ export async function runReplyAgent(params: {
                      !payload.audioAsVoice
                    )
                      return;
-                    const audioTagResult = extractAudioTag(taggedPayload.text);
+                    const audioTagResult = parseAudioTag(taggedPayload.text);
-                    const cleaned = audioTagResult.cleaned || undefined;
+                    const cleaned = audioTagResult.text || undefined;
                    const hasMedia =
                      Boolean(taggedPayload.mediaUrl) ||
                      (taggedPayload.mediaUrls?.length ?? 0) > 0;
@@ -559,11 +563,6 @@ export async function runReplyAgent(params: {
                    )
                      return;
                    // Track if we've seen [[audio_as_voice]] from payload or text extraction
                    if (payload.audioAsVoice || audioTagResult.audioAsVoice) {
                      seenAudioAsVoice = true;
                    }
                    const blockPayload: ReplyPayload = applyReplyToMode({
                      ...taggedPayload,
                      text: cleaned,
@@ -579,13 +578,6 @@ export async function runReplyAgent(params: {
                        );
                      });
                    // Buffer audio blocks to apply [[audio_as_voice]] that may come later
                    const isAudioBlock = hasAudioMedia(taggedPayload.mediaUrls);
                    if (isAudioBlock) {
                      bufferedAudioBlocks.push(blockPayload);
                      return; // Don't send immediately - wait for potential [[audio_as_voice]] tag
                    }
                    blockReplyPipeline?.enqueue(blockPayload);
                  }
                : undefined,
@@ -701,16 +693,6 @@ export async function runReplyAgent(params: {
    const payloadArray = runResult.payloads ?? [];
    if (bufferedAudioBlocks.length > 0 && blockReplyPipeline) {
      for (const audioPayload of bufferedAudioBlocks) {
        const finalPayload = seenAudioAsVoice
          ? { ...audioPayload, audioAsVoice: true }
          : audioPayload;
        blockReplyPipeline.enqueue(finalPayload);
      }
      bufferedAudioBlocks.length = 0;
    }
    if (blockReplyPipeline) {
      await blockReplyPipeline.flush({ force: true });
      blockReplyPipeline.stop();
@@ -753,10 +735,10 @@ export async function runReplyAgent(params: {
      currentMessageId: sessionCtx.MessageSid,
    })
      .map((payload) => {
-        const audioTagResult = extractAudioTag(payload.text);
+        const audioTagResult = parseAudioTag(payload.text);
        return {
          ...payload,
-          text: audioTagResult.cleaned ? audioTagResult.cleaned : undefined,
+          text: audioTagResult.text ? audioTagResult.text : undefined,
          audioAsVoice: audioTagResult.audioAsVoice,
        };
      })
--- a/src/auto-reply/reply/audio-tags.test.ts
+++ b/src/auto-reply/reply/audio-tags.test.ts
@@ -1,25 +1,25 @@
 import { describe, expect, it } from "vitest";
-import { extractAudioTag } from "./audio-tags.js";
+import { parseAudioTag } from "./audio-tags.js";
-describe("extractAudioTag", () => {
+describe("parseAudioTag", () => {
  it("detects audio_as_voice and strips the tag", () => {
-    const result = extractAudioTag("Hello [[audio_as_voice]] world");
+    const result = parseAudioTag("Hello [[audio_as_voice]] world");
    expect(result.audioAsVoice).toBe(true);
-    expect(result.hasTag).toBe(true);
+    expect(result.hadTag).toBe(true);
-    expect(result.cleaned).toBe("Hello world");
+    expect(result.text).toBe("Hello world");
  });
  it("returns empty output for missing text", () => {
-    const result = extractAudioTag(undefined);
+    const result = parseAudioTag(undefined);
    expect(result.audioAsVoice).toBe(false);
-    expect(result.hasTag).toBe(false);
+    expect(result.hadTag).toBe(false);
-    expect(result.cleaned).toBe("");
+    expect(result.text).toBe("");
  });
  it("removes tag-only messages", () => {
-    const result = extractAudioTag("[[audio_as_voice]]");
+    const result = parseAudioTag("[[audio_as_voice]]");
    expect(result.audioAsVoice).toBe(true);
-    expect(result.cleaned).toBe("");
+    expect(result.text).toBe("");
  });
 });
--- a/src/auto-reply/reply/audio-tags.ts
+++ b/src/auto-reply/reply/audio-tags.ts
@@ -1,31 +1 @@
-/**
+export { parseAudioTag } from "../../media/audio-tags.js";
 * Extract audio mode tag from text.
 * Supports [[audio_as_voice]] to send audio as voice bubble instead of file.
 * Default is file (preserves backward compatibility).
 */
 export function extractAudioTag(text?: string): {
  cleaned: string;
  audioAsVoice: boolean;
  hasTag: boolean;
 } {
  if (!text) return { cleaned: "", audioAsVoice: false, hasTag: false };
  let cleaned = text;
  let audioAsVoice = false; // default: audio file (backward compatible)
  let hasTag = false;
  // [[audio_as_voice]] -> send as voice bubble (opt-in)
  const voiceMatch = cleaned.match(/\[\[audio_as_voice\]\]/i);
  if (voiceMatch) {
    cleaned = cleaned.replace(/\[\[audio_as_voice\]\]/gi, " ");
    audioAsVoice = true;
    hasTag = true;
  }
  // Clean up whitespace
  cleaned = cleaned
    .replace(/[ \t]+/g, " ")
    .replace(/[ \t]*\n[ \t]*/g, "\n")
    .trim();
  return { cleaned, audioAsVoice, hasTag };
 }
--- a/src/auto-reply/reply/block-reply-pipeline.ts
+++ b/src/auto-reply/reply/block-reply-pipeline.ts
@@ -13,6 +13,28 @@ export type BlockReplyPipeline = {
  hasSentPayload: (payload: ReplyPayload) => boolean;
 };
 export type BlockReplyBuffer = {
  shouldBuffer: (payload: ReplyPayload) => boolean;
  onEnqueue?: (payload: ReplyPayload) => void;
  finalize?: (payload: ReplyPayload) => ReplyPayload;
 };
 export function createAudioAsVoiceBuffer(params: {
  isAudioPayload: (payload: ReplyPayload) => boolean;
 }): BlockReplyBuffer {
  let seenAudioAsVoice = false;
  return {
    onEnqueue: (payload) => {
      if (payload.audioAsVoice) {
        seenAudioAsVoice = true;
      }
    },
    shouldBuffer: (payload) => params.isAudioPayload(payload),
    finalize: (payload) =>
      seenAudioAsVoice ? { ...payload, audioAsVoice: true } : payload,
  };
 }
 export function createBlockReplyPayloadKey(payload: ReplyPayload): string {
  const text = payload.text?.trim() ?? "";
  const mediaList = payload.mediaUrls?.length
@@ -51,12 +73,15 @@ export function createBlockReplyPipeline(params: {
  ) => Promise<void> | void;
  timeoutMs: number;
  coalescing?: BlockStreamingCoalescing;
  buffer?: BlockReplyBuffer;
 }): BlockReplyPipeline {
-  const { onBlockReply, timeoutMs, coalescing } = params;
+  const { onBlockReply, timeoutMs, coalescing, buffer } = params;
  const sentKeys = new Set<string>();
  const pendingKeys = new Set<string>();
  const seenKeys = new Set<string>();
  const bufferedKeys = new Set<string>();
  const bufferedPayloadKeys = new Set<string>();
  const bufferedPayloads: ReplyPayload[] = [];
  let sendChain: Promise<void> = Promise.resolve();
  let aborted = false;
  let didStream = false;
@@ -124,8 +149,37 @@ export function createBlockReplyPipeline(params: {
      })
    : null;
  const bufferPayload = (payload: ReplyPayload) => {
    buffer?.onEnqueue?.(payload);
    if (!buffer?.shouldBuffer(payload)) return false;
    const payloadKey = createBlockReplyPayloadKey(payload);
    if (
      seenKeys.has(payloadKey) ||
      sentKeys.has(payloadKey) ||
      pendingKeys.has(payloadKey) ||
      bufferedPayloadKeys.has(payloadKey)
    ) {
      return true;
    }
    seenKeys.add(payloadKey);
    bufferedPayloadKeys.add(payloadKey);
    bufferedPayloads.push(payload);
    return true;
  };
  const flushBuffered = () => {
    if (!bufferedPayloads.length) return;
    for (const payload of bufferedPayloads) {
      const finalPayload = buffer?.finalize?.(payload) ?? payload;
      sendPayload(finalPayload, true);
    }
    bufferedPayloads.length = 0;
    bufferedPayloadKeys.clear();
  };
  const enqueue = (payload: ReplyPayload) => {
    if (aborted) return;
    if (bufferPayload(payload)) return;
    const hasMedia =
      Boolean(payload.mediaUrl) || (payload.mediaUrls?.length ?? 0) > 0;
    if (hasMedia) {
@@ -151,6 +205,7 @@ export function createBlockReplyPipeline(params: {
  const flush = async (options?: { force?: boolean }) => {
    await coalescer?.flush(options);
    flushBuffered();
    await sendChain;
  };
@@ -162,7 +217,8 @@ export function createBlockReplyPipeline(params: {
    enqueue,
    flush,
    stop,
-    hasBuffered: () => Boolean(coalescer?.hasBuffered()),
+    hasBuffered: () =>
      Boolean(coalescer?.hasBuffered() || bufferedPayloads.length > 0),
    didStream: () => didStream,
    isAborted: () => aborted,
    hasSentPayload: (payload) => {
--- a/src/auto-reply/reply/dispatch-from-config.ts
+++ b/src/auto-reply/reply/dispatch-from-config.ts
@@ -7,7 +7,7 @@ import { tryFastAbortFromMessage } from "./abort.js";
 import type { ReplyDispatcher, ReplyDispatchKind } from "./reply-dispatcher.js";
 import { isRoutableChannel, routeReply } from "./route-reply.js";
-type DispatchFromConfigResult = {
+export type DispatchFromConfigResult = {
  queuedFinal: boolean;
  counts: Record<ReplyDispatchKind, number>;
 };
--- a/src/auto-reply/reply/provider-dispatcher.ts
+++ b/src/auto-reply/reply/provider-dispatcher.ts
@@ -0,0 +1,34 @@
 import type { ClawdbotConfig } from "../../config/config.js";
 import type { MsgContext } from "../templating.js";
 import type { GetReplyOptions } from "../types.js";
 import type { DispatchFromConfigResult } from "./dispatch-from-config.js";
 import { dispatchReplyFromConfig } from "./dispatch-from-config.js";
 import {
  createReplyDispatcherWithTyping,
  type ReplyDispatcherWithTypingOptions,
 } from "./reply-dispatcher.js";
 export async function dispatchReplyWithBufferedBlockDispatcher(params: {
  ctx: MsgContext;
  cfg: ClawdbotConfig;
  dispatcherOptions: ReplyDispatcherWithTypingOptions;
  replyOptions?: Omit<GetReplyOptions, "onToolResult" | "onBlockReply">;
  replyResolver?: typeof import("../reply.js").getReplyFromConfig;
 }): Promise<DispatchFromConfigResult> {
  const { dispatcher, replyOptions, markDispatchIdle } =
    createReplyDispatcherWithTyping(params.dispatcherOptions);
  const result = await dispatchReplyFromConfig({
    ctx: params.ctx,
    cfg: params.cfg,
    dispatcher,
    replyResolver: params.replyResolver,
    replyOptions: {
      ...params.replyOptions,
      ...replyOptions,
    },
  });
  markDispatchIdle();
  return result;
 }
--- a/src/auto-reply/reply/reply-dispatcher.ts
+++ b/src/auto-reply/reply/reply-dispatcher.ts
@@ -22,7 +22,7 @@ export type ReplyDispatcherOptions = {
  onError?: ReplyDispatchErrorHandler;
 };
-type ReplyDispatcherWithTypingOptions = Omit<
+export type ReplyDispatcherWithTypingOptions = Omit<
  ReplyDispatcherOptions,
  "onIdle"
 > & {
--- a/src/media/audio-tags.ts
+++ b/src/media/audio-tags.ts
@@ -0,0 +1,31 @@
 /**
 * Extract audio mode tag from text.
 * Supports [[audio_as_voice]] to send audio as voice bubble instead of file.
 * Default is file (preserves backward compatibility).
 */
 export function parseAudioTag(text?: string): {
  text: string;
  audioAsVoice: boolean;
  hadTag: boolean;
 } {
  if (!text) return { text: "", audioAsVoice: false, hadTag: false };
  let cleaned = text;
  let audioAsVoice = false; // default: audio file (backward compatible)
  let hadTag = false;
  // [[audio_as_voice]] -> send as voice bubble (opt-in)
  const voiceMatch = cleaned.match(/\[\[audio_as_voice\]\]/i);
  if (voiceMatch) {
    cleaned = cleaned.replace(/\[\[audio_as_voice\]\]/gi, " ");
    audioAsVoice = true;
    hadTag = true;
  }
  // Clean up whitespace
  cleaned = cleaned
    .replace(/[ \t]+/g, " ")
    .replace(/[ \t]*\n[ \t]*/g, "\n")
    .trim();
  return { text: cleaned, audioAsVoice, hadTag };
 }
--- a/src/media/fetch.ts
+++ b/src/media/fetch.ts
@@ -0,0 +1,125 @@
 import path from "node:path";
 import { detectMime, extensionForMime } from "./mime.js";
 type FetchMediaResult = {
  buffer: Buffer;
  contentType?: string;
  fileName?: string;
 };
 type FetchMediaOptions = {
  url: string;
  fetchImpl?: typeof fetch;
  filePathHint?: string;
 };
 function stripQuotes(value: string): string {
  return value.replace(/^["']|["']$/g, "");
 }
 function parseContentDispositionFileName(
  header?: string | null,
 ): string | undefined {
  if (!header) return undefined;
  const starMatch = /filename\*\s*=\s*([^;]+)/i.exec(header);
  if (starMatch?.[1]) {
    const cleaned = stripQuotes(starMatch[1].trim());
    const encoded = cleaned.split("''").slice(1).join("''") || cleaned;
    try {
      return path.basename(decodeURIComponent(encoded));
    } catch {
      return path.basename(encoded);
    }
  }
  const match = /filename\s*=\s*([^;]+)/i.exec(header);
  if (match?.[1]) return path.basename(stripQuotes(match[1].trim()));
  return undefined;
 }
 async function readErrorBodySnippet(
  res: Response,
  maxChars = 200,
 ): Promise<string | undefined> {
  try {
    const text = await res.text();
    if (!text) return undefined;
    const collapsed = text.replace(/\s+/g, " ").trim();
    if (!collapsed) return undefined;
    if (collapsed.length <= maxChars) return collapsed;
    return `${collapsed.slice(0, maxChars)}…`;
  } catch {
    return undefined;
  }
 }
 export async function fetchRemoteMedia(
  options: FetchMediaOptions,
 ): Promise<FetchMediaResult> {
  const { url, fetchImpl, filePathHint } = options;
  const fetcher = fetchImpl ?? globalThis.fetch;
  if (!fetcher) {
    throw new Error("fetch is not available");
  }
  let res: Response;
  try {
    res = await fetcher(url);
  } catch (err) {
    throw new Error(`Failed to fetch media from ${url}: ${String(err)}`);
  }
  if (!res.ok) {
    const statusText = res.statusText ? ` ${res.statusText}` : "";
    const redirected =
      res.url && res.url !== url ? ` (redirected to ${res.url})` : "";
    let detail = `HTTP ${res.status}${statusText}`;
    if (!res.body) {
      detail = `HTTP ${res.status}${statusText}; empty response body`;
    } else {
      const snippet = await readErrorBodySnippet(res);
      if (snippet) detail += `; body: ${snippet}`;
    }
    throw new Error(
      `Failed to fetch media from ${url}${redirected}: ${detail}`,
    );
  }
  const buffer = Buffer.from(await res.arrayBuffer());
  let fileNameFromUrl: string | undefined;
  try {
    const parsed = new URL(url);
    const base = path.basename(parsed.pathname);
    fileNameFromUrl = base || undefined;
  } catch {
    // ignore parse errors; leave undefined
  }
  const headerFileName = parseContentDispositionFileName(
    res.headers.get("content-disposition"),
  );
  let fileName =
    headerFileName ||
    fileNameFromUrl ||
    (filePathHint ? path.basename(filePathHint) : undefined);
  const filePathForMime =
    headerFileName && path.extname(headerFileName)
      ? headerFileName
      : (filePathHint ?? url);
  const contentType = await detectMime({
    buffer,
    headerMime: res.headers.get("content-type"),
    filePath: filePathForMime,
  });
  if (fileName && !path.extname(fileName) && contentType) {
    const ext = extensionForMime(contentType);
    if (ext) fileName = `${fileName}${ext}`;
  }
  return {
    buffer,
    contentType: contentType ?? undefined,
    fileName,
  };
 }
--- a/src/media/mime.ts
+++ b/src/media/mime.ts
@@ -36,6 +36,17 @@ const MIME_BY_EXT: Record<string, string> = Object.fromEntries(
  Object.entries(EXT_BY_MIME).map(([mime, ext]) => [ext, mime]),
 );
 const AUDIO_FILE_EXTENSIONS = new Set([
  ".aac",
  ".flac",
  ".m4a",
  ".mp3",
  ".oga",
  ".ogg",
  ".opus",
  ".wav",
 ]);
 function normalizeHeaderMime(mime?: string | null): string | undefined {
  if (!mime) return undefined;
  const cleaned = mime.split(";")[0]?.trim().toLowerCase();
@@ -52,7 +63,7 @@ async function sniffMime(buffer?: Buffer): Promise<string | undefined> {
  }
 }
-function extFromPath(filePath?: string): string | undefined {
+export function getFileExtension(filePath?: string | null): string | undefined {
  if (!filePath) return undefined;
  try {
    if (/^https?:\/\//i.test(filePath)) {
@@ -66,6 +77,12 @@ function extFromPath(filePath?: string): string | undefined {
  return ext || undefined;
 }
 export function isAudioFileName(fileName?: string | null): boolean {
  const ext = getFileExtension(fileName);
  if (!ext) return false;
  return AUDIO_FILE_EXTENSIONS.has(ext);
 }
 export function detectMime(opts: {
  buffer?: Buffer;
  headerMime?: string | null;
@@ -85,7 +102,7 @@ async function detectMimeImpl(opts: {
  headerMime?: string | null;
  filePath?: string;
 }): Promise<string | undefined> {
-  const ext = extFromPath(opts.filePath);
+  const ext = getFileExtension(opts.filePath);
  const extMime = ext ? MIME_BY_EXT[ext] : undefined;
  const headerMime = normalizeHeaderMime(opts.headerMime);
@@ -112,9 +129,7 @@ export function isGifMedia(opts: {
  fileName?: string | null;
 }): boolean {
  if (opts.contentType?.toLowerCase() === "image/gif") return true;
-  const ext = opts.fileName
+  const ext = getFileExtension(opts.fileName);
    ? path.extname(opts.fileName).toLowerCase()
    : undefined;
  return ext === ".gif";
 }
--- a/src/media/parse.ts
+++ b/src/media/parse.ts
@@ -1,6 +1,7 @@
 // Shared helpers for parsing MEDIA tokens from command/stdout text.
 import { parseFenceSpans } from "../markdown/fences.js";
 import { parseAudioTag } from "./audio-tags.js";
 // Allow optional wrapping backticks and punctuation after the token; capture the core token.
 export const MEDIA_TOKEN_RE = /\bMEDIA:\s*`?([^\n]+)`?/gi;
@@ -32,10 +33,6 @@ function isInsideFence(
  return fenceSpans.some((span) => offset >= span.start && offset < span.end);
 }
 // Regex to detect [[audio_as_voice]] tag
 const AUDIO_AS_VOICE_RE = /\[\[audio_as_voice\]\]/gi;
 const AUDIO_AS_VOICE_TEST_RE = /\[\[audio_as_voice\]\]/i;
 export function splitMediaFromOutput(raw: string): {
  text: string;
  mediaUrls?: string[];
@@ -124,13 +121,10 @@ export function splitMediaFromOutput(raw: string): {
    .trim();
  // Detect and strip [[audio_as_voice]] tag
-  const hasAudioAsVoice = AUDIO_AS_VOICE_TEST_RE.test(cleanedText);
+  const audioTagResult = parseAudioTag(cleanedText);
-  if (hasAudioAsVoice) {
+  const hasAudioAsVoice = audioTagResult.audioAsVoice;
-    cleanedText = cleanedText
+  if (audioTagResult.hadTag) {
-      .replace(AUDIO_AS_VOICE_RE, "")
+    cleanedText = audioTagResult.text.replace(/\n{2,}/g, "\n").trim();
      .replace(/[ \t]+/g, " ")
      .replace(/\n{2,}/g, "\n")
      .trim();
  }
  if (media.length === 0) {
--- a/src/telegram/bot.ts
+++ b/src/telegram/bot.ts
@@ -1,6 +1,4 @@
 // @ts-nocheck
 import { Buffer } from "node:buffer";
 import { sequentialize } from "@grammyjs/runner";
 import { apiThrottler } from "@grammyjs/transformer-throttler";
 import type { ApiClientOptions, Message } from "grammy";
@@ -22,12 +20,11 @@ import {
 } from "../auto-reply/commands-registry.js";
 import { formatAgentEnvelope } from "../auto-reply/envelope.js";
 import { resolveBlockStreamingChunking } from "../auto-reply/reply/block-streaming.js";
 import { dispatchReplyFromConfig } from "../auto-reply/reply/dispatch-from-config.js";
 import {
  buildMentionRegexes,
  matchesMentionPatterns,
 } from "../auto-reply/reply/mentions.js";
-import { createReplyDispatcherWithTyping } from "../auto-reply/reply/reply-dispatcher.js";
+import { dispatchReplyWithBufferedBlockDispatcher } from "../auto-reply/reply/provider-dispatcher.js";
 import { getReplyFromConfig } from "../auto-reply/reply.js";
 import type { ReplyPayload } from "../auto-reply/types.js";
 import type { ClawdbotConfig, ReplyToMode } from "../config/config.js";
@@ -46,7 +43,8 @@ import { formatErrorMessage } from "../infra/errors.js";
 import { recordProviderActivity } from "../infra/provider-activity.js";
 import { getChildLogger } from "../logging.js";
 import { mediaKindFromMime } from "../media/constants.js";
-import { detectMime, isGifMedia } from "../media/mime.js";
+import { fetchRemoteMedia } from "../media/fetch.js";
 import { isGifMedia } from "../media/mime.js";
 import { saveMediaBuffer } from "../media/store.js";
 import {
  formatLocationText,
@@ -64,7 +62,7 @@ import {
  readTelegramAllowFromStore,
  upsertTelegramPairingRequest,
 } from "./pairing-store.js";
-import { resolveTelegramVoiceDecision } from "./voice.js";
+import { resolveTelegramVoiceSend } from "./voice.js";
 const PARSE_ERR_RE =
  /can't parse entities|parse entities|find end of the entity/i;
@@ -805,8 +803,16 @@ export function createTelegramBot(opts: TelegramBotOptions) {
      await draftStream.flush();
    };
-    const { dispatcher, replyOptions, markDispatchIdle } =
+    const disableBlockStreaming =
-      createReplyDispatcherWithTyping({
+      Boolean(draftStream) ||
      (typeof telegramCfg.blockStreaming === "boolean"
        ? !telegramCfg.blockStreaming
        : undefined);
    const { queuedFinal } = await dispatchReplyWithBufferedBlockDispatcher({
      ctx: ctxPayload,
      cfg,
      dispatcherOptions: {
        responsePrefix: resolveEffectiveMessagesConfig(cfg, route.agentId)
          .responsePrefix,
        deliver: async (payload, info) => {
@@ -831,20 +837,8 @@ export function createTelegramBot(opts: TelegramBotOptions) {
          );
        },
        onReplyStart: sendTyping,
-      });
+      },
    const disableBlockStreaming =
      Boolean(draftStream) ||
      (typeof telegramCfg.blockStreaming === "boolean"
        ? !telegramCfg.blockStreaming
        : undefined);
    const { queuedFinal } = await dispatchReplyFromConfig({
      ctx: ctxPayload,
      cfg,
      dispatcher,
      replyOptions: {
        ...replyOptions,
        skillFilter,
        onPartialReply: draftStream
          ? (payload) => updateDraftFromPartial(payload.text)
@@ -857,7 +851,6 @@ export function createTelegramBot(opts: TelegramBotOptions) {
        disableBlockStreaming,
      },
    });
    markDispatchIdle();
    draftStream?.stop();
    if (!queuedFinal) return;
    if (
@@ -1409,16 +1402,12 @@ async function deliverReplies(params: {
          ...mediaParams,
        });
      } else if (kind === "audio") {
-        const { useVoice, reason } = resolveTelegramVoiceDecision({
+        const { useVoice } = resolveTelegramVoiceSend({
          wantsVoice: reply.audioAsVoice === true, // default false (backward compatible)
          contentType: media.contentType,
          fileName,
          logFallback: logVerbose,
        });
        if (reason) {
          logVerbose(
            `Telegram voice requested but ${reason}; sending as audio file instead.`,
          );
        }
        if (useVoice) {
          // Voice message - displays as round playable bubble (opt-in via [[audio_as_voice]])
          await bot.api.sendVoice(chatId, file, {
@@ -1571,19 +1560,17 @@ async function resolveMedia(
    throw new Error("fetch is not available; set telegram.proxy in config");
  }
  const url = `https://api.telegram.org/file/bot${token}/${file.file_path}`;
-  const res = await fetchImpl(url);
+  const fetched = await fetchRemoteMedia({
-  if (!res.ok) {
+    url,
-    throw new Error(
+    fetchImpl,
-      `Failed to download telegram file: HTTP ${res.status} ${res.statusText}`,
+    filePathHint: file.file_path,
    );
  }
  const data = Buffer.from(await res.arrayBuffer());
  const mime = await detectMime({
    buffer: data,
    headerMime: res.headers.get("content-type"),
    filePath: file.file_path,
  });
-  const saved = await saveMediaBuffer(data, mime, "inbound", maxBytes);
+  const saved = await saveMediaBuffer(
    fetched.buffer,
    fetched.contentType,
    "inbound",
    maxBytes,
  );
  let placeholder = "<media:document>";
  if (msg.photo) placeholder = "<media:image>";
  else if (msg.video) placeholder = "<media:video>";
--- a/src/telegram/send.ts
+++ b/src/telegram/send.ts
@@ -21,7 +21,7 @@ import {
  parseTelegramTarget,
  stripTelegramInternalPrefixes,
 } from "./targets.js";
-import { resolveTelegramVoiceDecision } from "./voice.js";
+import { resolveTelegramVoiceSend } from "./voice.js";
 type TelegramSendOpts = {
  token?: string;
@@ -239,16 +239,12 @@ export async function sendMessageTelegram(
        throw wrapChatNotFound(err);
      });
    } else if (kind === "audio") {
-      const { useVoice, reason } = resolveTelegramVoiceDecision({
+      const { useVoice } = resolveTelegramVoiceSend({
        wantsVoice: opts.asVoice === true, // default false (backward compatible)
        contentType: media.contentType,
        fileName,
        logFallback: logVerbose,
      });
      if (reason) {
        logVerbose(
          `Telegram voice requested but ${reason}; sending as audio file instead.`,
        );
      }
      if (useVoice) {
        result = await request(
          () => api.sendVoice(chatId, file, mediaParams),
--- a/src/telegram/voice.test.ts
+++ b/src/telegram/voice.test.ts
@@ -0,0 +1,43 @@
 import { describe, expect, it, vi } from "vitest";
 import { resolveTelegramVoiceSend } from "./voice.js";
 describe("resolveTelegramVoiceSend", () => {
  it("skips voice when wantsVoice is false", () => {
    const logFallback = vi.fn();
    const result = resolveTelegramVoiceSend({
      wantsVoice: false,
      contentType: "audio/ogg",
      fileName: "voice.ogg",
      logFallback,
    });
    expect(result.useVoice).toBe(false);
    expect(logFallback).not.toHaveBeenCalled();
  });
  it("logs fallback for incompatible media", () => {
    const logFallback = vi.fn();
    const result = resolveTelegramVoiceSend({
      wantsVoice: true,
      contentType: "audio/mpeg",
      fileName: "track.mp3",
      logFallback,
    });
    expect(result.useVoice).toBe(false);
    expect(logFallback).toHaveBeenCalledWith(
      "Telegram voice requested but media is audio/mpeg (track.mp3); sending as audio file instead.",
    );
  });
  it("keeps voice when compatible", () => {
    const logFallback = vi.fn();
    const result = resolveTelegramVoiceSend({
      wantsVoice: true,
      contentType: "audio/ogg",
      fileName: "voice.ogg",
      logFallback,
    });
    expect(result.useVoice).toBe(true);
    expect(logFallback).not.toHaveBeenCalled();
  });
 });
--- a/src/telegram/voice.ts
+++ b/src/telegram/voice.ts
@@ -1,4 +1,4 @@
-import path from "node:path";
+import { getFileExtension } from "../media/mime.js";
 export function isTelegramVoiceCompatible(opts: {
  contentType?: string | null;
@@ -10,7 +10,8 @@ export function isTelegramVoiceCompatible(opts: {
  }
  const fileName = opts.fileName?.trim();
  if (!fileName) return false;
-  const ext = path.extname(fileName).toLowerCase();
+  const ext = getFileExtension(fileName);
  if (!ext) return false;
  return ext === ".ogg" || ext === ".opus" || ext === ".oga";
 }
@@ -28,3 +29,18 @@ export function resolveTelegramVoiceDecision(opts: {
    reason: `media is ${contentType} (${fileName})`,
  };
 }
 export function resolveTelegramVoiceSend(opts: {
  wantsVoice: boolean;
  contentType?: string | null;
  fileName?: string | null;
  logFallback?: (message: string) => void;
 }): { useVoice: boolean } {
  const decision = resolveTelegramVoiceDecision(opts);
  if (decision.reason && opts.logFallback) {
    opts.logFallback(
      `Telegram voice requested but ${decision.reason}; sending as audio file instead.`,
    );
  }
  return { useVoice: decision.useVoice };
 }
--- a/src/web/auto-reply.ts
+++ b/src/web/auto-reply.ts
@@ -17,12 +17,11 @@ import {
  resolveHeartbeatPrompt,
  stripHeartbeatToken,
 } from "../auto-reply/heartbeat.js";
 import { dispatchReplyFromConfig } from "../auto-reply/reply/dispatch-from-config.js";
 import {
  buildMentionRegexes,
  normalizeMentionText,
 } from "../auto-reply/reply/mentions.js";
-import { createReplyDispatcherWithTyping } from "../auto-reply/reply/reply-dispatcher.js";
+import { dispatchReplyWithBufferedBlockDispatcher } from "../auto-reply/reply/provider-dispatcher.js";
 import { getReplyFromConfig } from "../auto-reply/reply.js";
 import { HEARTBEAT_TOKEN, SILENT_REPLY_TOKEN } from "../auto-reply/tokens.js";
 import type { ReplyPayload } from "../auto-reply/types.js";
@@ -1219,8 +1218,39 @@ export async function monitorWebProvider(
        cfg,
        route.agentId,
      ).responsePrefix;
-      const { dispatcher, replyOptions, markDispatchIdle } =
+      const { queuedFinal } = await dispatchReplyWithBufferedBlockDispatcher({
-        createReplyDispatcherWithTyping({
+        ctx: {
          Body: combinedBody,
          From: msg.from,
          To: msg.to,
          SessionKey: route.sessionKey,
          AccountId: route.accountId,
          MessageSid: msg.id,
          ReplyToId: msg.replyToId,
          ReplyToBody: msg.replyToBody,
          ReplyToSender: msg.replyToSender,
          MediaPath: msg.mediaPath,
          MediaUrl: msg.mediaUrl,
          MediaType: msg.mediaType,
          ChatType: msg.chatType,
          GroupSubject: msg.groupSubject,
          GroupMembers: formatGroupMembers(
            msg.groupParticipants,
            groupMemberNames.get(groupHistoryKey),
            msg.senderE164,
          ),
          SenderName: msg.senderName,
          SenderE164: msg.senderE164,
          WasMentioned: msg.wasMentioned,
          ...(msg.location ? toLocationContext(msg.location) : {}),
          Provider: "whatsapp",
          Surface: "whatsapp",
          OriginatingChannel: "whatsapp",
          OriginatingTo: msg.from,
        },
        cfg,
        replyResolver,
        dispatcherOptions: {
          responsePrefix,
          onHeartbeatStrip: () => {
            if (!didLogHeartbeatStrip) {
@@ -1283,50 +1313,14 @@ export async function monitorWebProvider(
            );
          },
          onReplyStart: msg.sendComposing,
        });
      const { queuedFinal } = await dispatchReplyFromConfig({
        ctx: {
          Body: combinedBody,
          From: msg.from,
          To: msg.to,
          SessionKey: route.sessionKey,
          AccountId: route.accountId,
          MessageSid: msg.id,
          ReplyToId: msg.replyToId,
          ReplyToBody: msg.replyToBody,
          ReplyToSender: msg.replyToSender,
          MediaPath: msg.mediaPath,
          MediaUrl: msg.mediaUrl,
          MediaType: msg.mediaType,
          ChatType: msg.chatType,
          GroupSubject: msg.groupSubject,
          GroupMembers: formatGroupMembers(
            msg.groupParticipants,
            groupMemberNames.get(groupHistoryKey),
            msg.senderE164,
          ),
          SenderName: msg.senderName,
          SenderE164: msg.senderE164,
          WasMentioned: msg.wasMentioned,
          ...(msg.location ? toLocationContext(msg.location) : {}),
          Provider: "whatsapp",
          Surface: "whatsapp",
          OriginatingChannel: "whatsapp",
          OriginatingTo: msg.from,
        },
        cfg,
        dispatcher,
        replyResolver,
        replyOptions: {
          ...replyOptions,
          disableBlockStreaming:
            typeof cfg.whatsapp?.blockStreaming === "boolean"
              ? !cfg.whatsapp.blockStreaming
              : undefined,
        },
      });
      markDispatchIdle();
      if (!queuedFinal) {
        if (shouldClearGroupHistory && didSendReply) {
          groupHistories.set(groupHistoryKey, []);
--- a/src/web/media.ts
+++ b/src/web/media.ts
@@ -7,6 +7,7 @@ import {
  maxBytesForKind,
  mediaKindFromMime,
 } from "../media/constants.js";
 import { fetchRemoteMedia } from "../media/fetch.js";
 import { resizeToJpeg } from "../media/image-ops.js";
 import { detectMime, extensionForMime } from "../media/mime.js";
@@ -22,45 +23,6 @@ type WebMediaOptions = {
  optimizeImages?: boolean;
 };
 function stripQuotes(value: string): string {
  return value.replace(/^["']|["']$/g, "");
 }
 function parseContentDispositionFileName(
  header?: string | null,
 ): string | undefined {
  if (!header) return undefined;
  const starMatch = /filename\*\s*=\s*([^;]+)/i.exec(header);
  if (starMatch?.[1]) {
    const cleaned = stripQuotes(starMatch[1].trim());
    const encoded = cleaned.split("''").slice(1).join("''") || cleaned;
    try {
      return path.basename(decodeURIComponent(encoded));
    } catch {
      return path.basename(encoded);
    }
  }
  const match = /filename\s*=\s*([^;]+)/i.exec(header);
  if (match?.[1]) return path.basename(stripQuotes(match[1].trim()));
  return undefined;
 }
 async function readErrorBodySnippet(
  res: Response,
  maxChars = 200,
 ): Promise<string | undefined> {
  try {
    const text = await res.text();
    if (!text) return undefined;
    const collapsed = text.replace(/\s+/g, " ").trim();
    if (!collapsed) return undefined;
    if (collapsed.length <= maxChars) return collapsed;
    return `${collapsed.slice(0, maxChars)}…`;
  } catch {
    return undefined;
  }
 }
 async function loadWebMediaInternal(
  mediaUrl: string,
  options: WebMediaOptions = {},
@@ -93,53 +55,8 @@ async function loadWebMediaInternal(
  };
  if (/^https?:\/\//i.test(mediaUrl)) {
-    let fileNameFromUrl: string | undefined;
+    const fetched = await fetchRemoteMedia({ url: mediaUrl });
-    try {
+    const { buffer, contentType, fileName } = fetched;
      const url = new URL(mediaUrl);
      const base = path.basename(url.pathname);
      fileNameFromUrl = base || undefined;
    } catch {
      // ignore parse errors; leave undefined
    }
    let res: Response;
    try {
      res = await fetch(mediaUrl);
    } catch (err) {
      throw new Error(`Failed to fetch media from ${mediaUrl}: ${String(err)}`);
    }
    if (!res.ok || !res.body) {
      const statusText = res.statusText ? ` ${res.statusText}` : "";
      const redirected =
        res.url && res.url !== mediaUrl ? ` (redirected to ${res.url})` : "";
      let detail = `HTTP ${res.status}${statusText}`;
      if (!res.body) {
        detail = `HTTP ${res.status}${statusText}; empty response body`;
      } else if (!res.ok) {
        const snippet = await readErrorBodySnippet(res);
        if (snippet) detail += `; body: ${snippet}`;
      }
      throw new Error(
        `Failed to fetch media from ${mediaUrl}${redirected}: ${detail}`,
      );
    }
    const array = Buffer.from(await res.arrayBuffer());
    const headerFileName = parseContentDispositionFileName(
      res.headers.get("content-disposition"),
    );
    let fileName = headerFileName || fileNameFromUrl || undefined;
    const filePathForMime =
      headerFileName && path.extname(headerFileName)
        ? headerFileName
        : mediaUrl;
    const contentType = await detectMime({
      buffer: array,
      headerMime: res.headers.get("content-type"),
      filePath: filePathForMime,
    });
    if (fileName && !path.extname(fileName) && contentType) {
      const ext = extensionForMime(contentType);
      if (ext) fileName = `${fileName}${ext}`;
    }
    const kind = mediaKindFromMime(contentType);
    const cap = Math.min(
      maxBytes ?? maxBytesForKind(kind),
@@ -148,28 +65,28 @@ async function loadWebMediaInternal(
    if (kind === "image") {
      // Skip optimization for GIFs to preserve animation.
      if (contentType === "image/gif" || !optimizeImages) {
-        if (array.length > cap) {
+        if (buffer.length > cap) {
          throw new Error(
            `${
              contentType === "image/gif" ? "GIF" : "Media"
            } exceeds ${(cap / (1024 * 1024)).toFixed(0)}MB limit (got ${(
-              array.length / (1024 * 1024)
+              buffer.length / (1024 * 1024)
            ).toFixed(2)}MB)`,
          );
        }
-        return { buffer: array, contentType, kind, fileName };
+        return { buffer, contentType, kind, fileName };
      }
-      return { ...(await optimizeAndClampImage(array, cap)), fileName };
+      return { ...(await optimizeAndClampImage(buffer, cap)), fileName };
    }
-    if (array.length > cap) {
+    if (buffer.length > cap) {
      throw new Error(
        `Media exceeds ${(cap / (1024 * 1024)).toFixed(0)}MB limit (got ${(
-          array.length / (1024 * 1024)
+          buffer.length / (1024 * 1024)
        ).toFixed(2)}MB)`,
      );
    }
    return {
-      buffer: array,
+      buffer,
      contentType: contentType ?? undefined,
      kind,
      fileName,