refactor: consolidate reply/media helpers

Move the [[audio_as_voice]] tag parsing into src/media/audio-tags.ts as parseAudioTag (re-exported from the reply layer), add a shared fetchRemoteMedia helper in src/media/fetch.ts, and add getFileExtension/isAudioFileName to src/media/mime.ts so the ad-hoc audio-extension regex can go. The block reply pipeline now accepts a pluggable buffer (createAudioAsVoiceBuffer) instead of runReplyAgent buffering audio blocks itself, the Telegram and WhatsApp providers share dispatchReplyWithBufferedBlockDispatcher, and resolveTelegramVoiceSend wraps resolveTelegramVoiceDecision so fallback logging lives in one place.

Author: Peter Steinberger
Date:   2026-01-10 02:40:41 +01:00
parent 9cd2662a86
commit 4075895c4c
17 changed files with 437 additions and 277 deletions

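A minimal sketch of how the consolidated helpers fit together after this change. Names and return shapes are taken from the diff below; the import paths and sample inputs are illustrative only.

```ts
import { parseAudioTag } from "./src/media/audio-tags.js";
import { isAudioFileName } from "./src/media/mime.js";
import { fetchRemoteMedia } from "./src/media/fetch.js";

// Shared [[audio_as_voice]] parsing (replaces the reply layer's extractAudioTag).
const tag = parseAudioTag("Here you go [[audio_as_voice]]");
// tag.text === "Here you go", tag.audioAsVoice === true, tag.hadTag === true

// Shared audio-extension check (replaces the local AUDIO_EXTENSIONS regex in runReplyAgent).
isAudioFileName("note.opus"); // true

// Shared remote download used by the Telegram provider and web media loading.
const media = await fetchRemoteMedia({ url: "https://example.com/clip.ogg" });
// media.buffer (Buffer), media.contentType?, media.fileName?
```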
View File

@@ -22,6 +22,7 @@ import {
emitAgentEvent,
registerAgentRunContext,
} from "../../infra/agent-events.js";
import { isAudioFileName } from "../../media/mime.js";
import { defaultRuntime } from "../../runtime.js";
import {
estimateUsageCost,
@@ -34,8 +35,11 @@ import type { OriginatingChannelType, TemplateContext } from "../templating.js";
import { normalizeVerboseLevel, type VerboseLevel } from "../thinking.js";
import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js";
import type { GetReplyOptions, ReplyPayload } from "../types.js";
import { extractAudioTag } from "./audio-tags.js";
import { createBlockReplyPipeline } from "./block-reply-pipeline.js";
import { parseAudioTag } from "./audio-tags.js";
import {
createAudioAsVoiceBuffer,
createBlockReplyPipeline,
} from "./block-reply-pipeline.js";
import { resolveBlockStreamingCoalescing } from "./block-streaming.js";
import { createFollowupRunner } from "./followup-runner.js";
import {
@@ -261,13 +265,12 @@ export async function runReplyAgent(params: {
const blockReplyTimeoutMs =
opts?.blockReplyTimeoutMs ?? BLOCK_REPLY_SEND_TIMEOUT_MS;
// Buffer audio blocks to apply [[audio_as_voice]] tag that may come later
const bufferedAudioBlocks: ReplyPayload[] = [];
let seenAudioAsVoice = false;
const AUDIO_EXTENSIONS = /\.(opus|mp3|m4a|wav|ogg|aac|flac)$/i;
const hasAudioMedia = (urls?: string[]): boolean =>
Boolean(urls?.some((u) => AUDIO_EXTENSIONS.test(u)));
Boolean(urls?.some((u) => isAudioFileName(u)));
const isAudioPayload = (payload: ReplyPayload) =>
hasAudioMedia(
payload.mediaUrls ?? (payload.mediaUrl ? [payload.mediaUrl] : undefined),
);
const replyToChannel =
sessionCtx.OriginatingChannel ??
((sessionCtx.Surface ?? sessionCtx.Provider)?.toLowerCase() as
@@ -297,6 +300,7 @@ export async function runReplyAgent(params: {
onBlockReply: opts.onBlockReply,
timeoutMs: blockReplyTimeoutMs,
coalescing: blockReplyCoalescing,
buffer: createAudioAsVoiceBuffer({ isAudioPayload }),
})
: null;
@@ -546,8 +550,8 @@ export async function runReplyAgent(params: {
!payload.audioAsVoice
)
return;
const audioTagResult = extractAudioTag(taggedPayload.text);
const cleaned = audioTagResult.cleaned || undefined;
const audioTagResult = parseAudioTag(taggedPayload.text);
const cleaned = audioTagResult.text || undefined;
const hasMedia =
Boolean(taggedPayload.mediaUrl) ||
(taggedPayload.mediaUrls?.length ?? 0) > 0;
@@ -559,11 +563,6 @@ export async function runReplyAgent(params: {
)
return;
// Track if we've seen [[audio_as_voice]] from payload or text extraction
if (payload.audioAsVoice || audioTagResult.audioAsVoice) {
seenAudioAsVoice = true;
}
const blockPayload: ReplyPayload = applyReplyToMode({
...taggedPayload,
text: cleaned,
@@ -579,13 +578,6 @@ export async function runReplyAgent(params: {
);
});
// Buffer audio blocks to apply [[audio_as_voice]] that may come later
const isAudioBlock = hasAudioMedia(taggedPayload.mediaUrls);
if (isAudioBlock) {
bufferedAudioBlocks.push(blockPayload);
return; // Don't send immediately - wait for potential [[audio_as_voice]] tag
}
blockReplyPipeline?.enqueue(blockPayload);
}
: undefined,
@@ -701,16 +693,6 @@ export async function runReplyAgent(params: {
const payloadArray = runResult.payloads ?? [];
if (bufferedAudioBlocks.length > 0 && blockReplyPipeline) {
for (const audioPayload of bufferedAudioBlocks) {
const finalPayload = seenAudioAsVoice
? { ...audioPayload, audioAsVoice: true }
: audioPayload;
blockReplyPipeline.enqueue(finalPayload);
}
bufferedAudioBlocks.length = 0;
}
if (blockReplyPipeline) {
await blockReplyPipeline.flush({ force: true });
blockReplyPipeline.stop();
@@ -753,10 +735,10 @@ export async function runReplyAgent(params: {
currentMessageId: sessionCtx.MessageSid,
})
.map((payload) => {
const audioTagResult = extractAudioTag(payload.text);
const audioTagResult = parseAudioTag(payload.text);
return {
...payload,
text: audioTagResult.cleaned ? audioTagResult.cleaned : undefined,
text: audioTagResult.text ? audioTagResult.text : undefined,
audioAsVoice: audioTagResult.audioAsVoice,
};
})

View File

@@ -1,25 +1,25 @@
import { describe, expect, it } from "vitest";
import { extractAudioTag } from "./audio-tags.js";
import { parseAudioTag } from "./audio-tags.js";
describe("extractAudioTag", () => {
describe("parseAudioTag", () => {
it("detects audio_as_voice and strips the tag", () => {
const result = extractAudioTag("Hello [[audio_as_voice]] world");
const result = parseAudioTag("Hello [[audio_as_voice]] world");
expect(result.audioAsVoice).toBe(true);
expect(result.hasTag).toBe(true);
expect(result.cleaned).toBe("Hello world");
expect(result.hadTag).toBe(true);
expect(result.text).toBe("Hello world");
});
it("returns empty output for missing text", () => {
const result = extractAudioTag(undefined);
const result = parseAudioTag(undefined);
expect(result.audioAsVoice).toBe(false);
expect(result.hasTag).toBe(false);
expect(result.cleaned).toBe("");
expect(result.hadTag).toBe(false);
expect(result.text).toBe("");
});
it("removes tag-only messages", () => {
const result = extractAudioTag("[[audio_as_voice]]");
const result = parseAudioTag("[[audio_as_voice]]");
expect(result.audioAsVoice).toBe(true);
expect(result.cleaned).toBe("");
expect(result.text).toBe("");
});
});

View File

@@ -1,31 +1 @@
/**
* Extract audio mode tag from text.
* Supports [[audio_as_voice]] to send audio as voice bubble instead of file.
* Default is file (preserves backward compatibility).
*/
export function extractAudioTag(text?: string): {
cleaned: string;
audioAsVoice: boolean;
hasTag: boolean;
} {
if (!text) return { cleaned: "", audioAsVoice: false, hasTag: false };
let cleaned = text;
let audioAsVoice = false; // default: audio file (backward compatible)
let hasTag = false;
// [[audio_as_voice]] -> send as voice bubble (opt-in)
const voiceMatch = cleaned.match(/\[\[audio_as_voice\]\]/i);
if (voiceMatch) {
cleaned = cleaned.replace(/\[\[audio_as_voice\]\]/gi, " ");
audioAsVoice = true;
hasTag = true;
}
// Clean up whitespace
cleaned = cleaned
.replace(/[ \t]+/g, " ")
.replace(/[ \t]*\n[ \t]*/g, "\n")
.trim();
return { cleaned, audioAsVoice, hasTag };
}
export { parseAudioTag } from "../../media/audio-tags.js";

View File

@@ -13,6 +13,28 @@ export type BlockReplyPipeline = {
hasSentPayload: (payload: ReplyPayload) => boolean;
};
export type BlockReplyBuffer = {
shouldBuffer: (payload: ReplyPayload) => boolean;
onEnqueue?: (payload: ReplyPayload) => void;
finalize?: (payload: ReplyPayload) => ReplyPayload;
};
export function createAudioAsVoiceBuffer(params: {
isAudioPayload: (payload: ReplyPayload) => boolean;
}): BlockReplyBuffer {
let seenAudioAsVoice = false;
return {
onEnqueue: (payload) => {
if (payload.audioAsVoice) {
seenAudioAsVoice = true;
}
},
shouldBuffer: (payload) => params.isAudioPayload(payload),
finalize: (payload) =>
seenAudioAsVoice ? { ...payload, audioAsVoice: true } : payload,
};
}
export function createBlockReplyPayloadKey(payload: ReplyPayload): string {
const text = payload.text?.trim() ?? "";
const mediaList = payload.mediaUrls?.length
@@ -51,12 +73,15 @@ export function createBlockReplyPipeline(params: {
) => Promise<void> | void;
timeoutMs: number;
coalescing?: BlockStreamingCoalescing;
buffer?: BlockReplyBuffer;
}): BlockReplyPipeline {
const { onBlockReply, timeoutMs, coalescing } = params;
const { onBlockReply, timeoutMs, coalescing, buffer } = params;
const sentKeys = new Set<string>();
const pendingKeys = new Set<string>();
const seenKeys = new Set<string>();
const bufferedKeys = new Set<string>();
const bufferedPayloadKeys = new Set<string>();
const bufferedPayloads: ReplyPayload[] = [];
let sendChain: Promise<void> = Promise.resolve();
let aborted = false;
let didStream = false;
@@ -124,8 +149,37 @@ export function createBlockReplyPipeline(params: {
})
: null;
const bufferPayload = (payload: ReplyPayload) => {
buffer?.onEnqueue?.(payload);
if (!buffer?.shouldBuffer(payload)) return false;
const payloadKey = createBlockReplyPayloadKey(payload);
if (
seenKeys.has(payloadKey) ||
sentKeys.has(payloadKey) ||
pendingKeys.has(payloadKey) ||
bufferedPayloadKeys.has(payloadKey)
) {
return true;
}
seenKeys.add(payloadKey);
bufferedPayloadKeys.add(payloadKey);
bufferedPayloads.push(payload);
return true;
};
const flushBuffered = () => {
if (!bufferedPayloads.length) return;
for (const payload of bufferedPayloads) {
const finalPayload = buffer?.finalize?.(payload) ?? payload;
sendPayload(finalPayload, true);
}
bufferedPayloads.length = 0;
bufferedPayloadKeys.clear();
};
const enqueue = (payload: ReplyPayload) => {
if (aborted) return;
if (bufferPayload(payload)) return;
const hasMedia =
Boolean(payload.mediaUrl) || (payload.mediaUrls?.length ?? 0) > 0;
if (hasMedia) {
@@ -151,6 +205,7 @@ export function createBlockReplyPipeline(params: {
const flush = async (options?: { force?: boolean }) => {
await coalescer?.flush(options);
flushBuffered();
await sendChain;
};
@@ -162,7 +217,8 @@ export function createBlockReplyPipeline(params: {
enqueue,
flush,
stop,
hasBuffered: () => Boolean(coalescer?.hasBuffered()),
hasBuffered: () =>
Boolean(coalescer?.hasBuffered() || bufferedPayloads.length > 0),
didStream: () => didStream,
isAborted: () => aborted,
hasSentPayload: (payload) => {

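A minimal sketch of wiring the new buffer hook into the pipeline. The isAudioPayload predicate mirrors the one runReplyAgent builds from isAudioFileName; the onBlockReply callback, timeout, and payloads are placeholders, and onBlockReply's full parameter list is truncated in this view.

```ts
import { isAudioFileName } from "../../media/mime.js";
import type { ReplyPayload } from "../types.js";
import {
  createAudioAsVoiceBuffer,
  createBlockReplyPipeline,
} from "./block-reply-pipeline.js";

const isAudioPayload = (payload: ReplyPayload) =>
  Boolean(
    (payload.mediaUrls ?? (payload.mediaUrl ? [payload.mediaUrl] : [])).some(
      (url) => isAudioFileName(url),
    ),
  );

const pipeline = createBlockReplyPipeline({
  onBlockReply: async (payload) => {
    // deliver the block to the channel
  },
  timeoutMs: 15_000, // placeholder; runReplyAgent passes BLOCK_REPLY_SEND_TIMEOUT_MS
  buffer: createAudioAsVoiceBuffer({ isAudioPayload }),
});

// Audio payloads are held back by bufferPayload(); once any payload carries
// audioAsVoice, finalize() re-tags the buffered ones when flush() drains them.
pipeline.enqueue({ mediaUrl: "https://example.com/reply.opus" });
pipeline.enqueue({ text: "Voice note incoming", audioAsVoice: true });
await pipeline.flush({ force: true });
pipeline.stop();
```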
View File

@@ -7,7 +7,7 @@ import { tryFastAbortFromMessage } from "./abort.js";
import type { ReplyDispatcher, ReplyDispatchKind } from "./reply-dispatcher.js";
import { isRoutableChannel, routeReply } from "./route-reply.js";
type DispatchFromConfigResult = {
export type DispatchFromConfigResult = {
queuedFinal: boolean;
counts: Record<ReplyDispatchKind, number>;
};

View File

@@ -0,0 +1,34 @@
import type { ClawdbotConfig } from "../../config/config.js";
import type { MsgContext } from "../templating.js";
import type { GetReplyOptions } from "../types.js";
import type { DispatchFromConfigResult } from "./dispatch-from-config.js";
import { dispatchReplyFromConfig } from "./dispatch-from-config.js";
import {
createReplyDispatcherWithTyping,
type ReplyDispatcherWithTypingOptions,
} from "./reply-dispatcher.js";
export async function dispatchReplyWithBufferedBlockDispatcher(params: {
ctx: MsgContext;
cfg: ClawdbotConfig;
dispatcherOptions: ReplyDispatcherWithTypingOptions;
replyOptions?: Omit<GetReplyOptions, "onToolResult" | "onBlockReply">;
replyResolver?: typeof import("../reply.js").getReplyFromConfig;
}): Promise<DispatchFromConfigResult> {
const { dispatcher, replyOptions, markDispatchIdle } =
createReplyDispatcherWithTyping(params.dispatcherOptions);
const result = await dispatchReplyFromConfig({
ctx: params.ctx,
cfg: params.cfg,
dispatcher,
replyResolver: params.replyResolver,
replyOptions: {
...params.replyOptions,
...replyOptions,
},
});
markDispatchIdle();
return result;
}

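A small usage sketch for the new wrapper. The ctx/cfg values and the deliver callback are placeholders (declared, not constructed); the real call sites are the Telegram and WhatsApp providers further down in this commit.

```ts
import type { ClawdbotConfig } from "../../config/config.js";
import type { MsgContext } from "../templating.js";
import { dispatchReplyWithBufferedBlockDispatcher } from "./provider-dispatcher.js";

declare const ctx: MsgContext; // provider-specific inbound message context
declare const cfg: ClawdbotConfig; // loaded bot configuration

const result = await dispatchReplyWithBufferedBlockDispatcher({
  ctx,
  cfg,
  dispatcherOptions: {
    deliver: async (payload) => {
      // send the payload to the channel (text, media, voice, ...)
    },
    onReplyStart: () => {
      // e.g. start a typing indicator
    },
  },
  replyOptions: {
    // per-provider overrides, e.g. disableBlockStreaming
  },
});

if (!result.queuedFinal) {
  // nothing was queued for final delivery; result.counts breaks sends down by kind
}
```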
View File

@@ -22,7 +22,7 @@ export type ReplyDispatcherOptions = {
onError?: ReplyDispatchErrorHandler;
};
type ReplyDispatcherWithTypingOptions = Omit<
export type ReplyDispatcherWithTypingOptions = Omit<
ReplyDispatcherOptions,
"onIdle"
> & {

src/media/audio-tags.ts (new file, 31 lines)
View File

@@ -0,0 +1,31 @@
/**
* Extract audio mode tag from text.
* Supports [[audio_as_voice]] to send audio as voice bubble instead of file.
* Default is file (preserves backward compatibility).
*/
export function parseAudioTag(text?: string): {
text: string;
audioAsVoice: boolean;
hadTag: boolean;
} {
if (!text) return { text: "", audioAsVoice: false, hadTag: false };
let cleaned = text;
let audioAsVoice = false; // default: audio file (backward compatible)
let hadTag = false;
// [[audio_as_voice]] -> send as voice bubble (opt-in)
const voiceMatch = cleaned.match(/\[\[audio_as_voice\]\]/i);
if (voiceMatch) {
cleaned = cleaned.replace(/\[\[audio_as_voice\]\]/gi, " ");
audioAsVoice = true;
hadTag = true;
}
// Clean up whitespace
cleaned = cleaned
.replace(/[ \t]+/g, " ")
.replace(/[ \t]*\n[ \t]*/g, "\n")
.trim();
return { text: cleaned, audioAsVoice, hadTag };
}

src/media/fetch.ts (new file, 125 lines)
View File

@@ -0,0 +1,125 @@
import path from "node:path";
import { detectMime, extensionForMime } from "./mime.js";
type FetchMediaResult = {
buffer: Buffer;
contentType?: string;
fileName?: string;
};
type FetchMediaOptions = {
url: string;
fetchImpl?: typeof fetch;
filePathHint?: string;
};
function stripQuotes(value: string): string {
return value.replace(/^["']|["']$/g, "");
}
function parseContentDispositionFileName(
header?: string | null,
): string | undefined {
if (!header) return undefined;
const starMatch = /filename\*\s*=\s*([^;]+)/i.exec(header);
if (starMatch?.[1]) {
const cleaned = stripQuotes(starMatch[1].trim());
const encoded = cleaned.split("''").slice(1).join("''") || cleaned;
try {
return path.basename(decodeURIComponent(encoded));
} catch {
return path.basename(encoded);
}
}
const match = /filename\s*=\s*([^;]+)/i.exec(header);
if (match?.[1]) return path.basename(stripQuotes(match[1].trim()));
return undefined;
}
async function readErrorBodySnippet(
res: Response,
maxChars = 200,
): Promise<string | undefined> {
try {
const text = await res.text();
if (!text) return undefined;
const collapsed = text.replace(/\s+/g, " ").trim();
if (!collapsed) return undefined;
if (collapsed.length <= maxChars) return collapsed;
return `${collapsed.slice(0, maxChars)}`;
} catch {
return undefined;
}
}
export async function fetchRemoteMedia(
options: FetchMediaOptions,
): Promise<FetchMediaResult> {
const { url, fetchImpl, filePathHint } = options;
const fetcher = fetchImpl ?? globalThis.fetch;
if (!fetcher) {
throw new Error("fetch is not available");
}
let res: Response;
try {
res = await fetcher(url);
} catch (err) {
throw new Error(`Failed to fetch media from ${url}: ${String(err)}`);
}
if (!res.ok) {
const statusText = res.statusText ? ` ${res.statusText}` : "";
const redirected =
res.url && res.url !== url ? ` (redirected to ${res.url})` : "";
let detail = `HTTP ${res.status}${statusText}`;
if (!res.body) {
detail = `HTTP ${res.status}${statusText}; empty response body`;
} else {
const snippet = await readErrorBodySnippet(res);
if (snippet) detail += `; body: ${snippet}`;
}
throw new Error(
`Failed to fetch media from ${url}${redirected}: ${detail}`,
);
}
const buffer = Buffer.from(await res.arrayBuffer());
let fileNameFromUrl: string | undefined;
try {
const parsed = new URL(url);
const base = path.basename(parsed.pathname);
fileNameFromUrl = base || undefined;
} catch {
// ignore parse errors; leave undefined
}
const headerFileName = parseContentDispositionFileName(
res.headers.get("content-disposition"),
);
let fileName =
headerFileName ||
fileNameFromUrl ||
(filePathHint ? path.basename(filePathHint) : undefined);
const filePathForMime =
headerFileName && path.extname(headerFileName)
? headerFileName
: (filePathHint ?? url);
const contentType = await detectMime({
buffer,
headerMime: res.headers.get("content-type"),
filePath: filePathForMime,
});
if (fileName && !path.extname(fileName) && contentType) {
const ext = extensionForMime(contentType);
if (ext) fileName = `${fileName}${ext}`;
}
return {
buffer,
contentType: contentType ?? undefined,
fileName,
};
}

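A small usage sketch for fetchRemoteMedia. The URL and byte limit are placeholders; filePathHint only feeds file naming and MIME detection when the response headers are not enough, and the saveMediaBuffer call mirrors how the Telegram provider uses the result later in this commit.

```ts
import { fetchRemoteMedia } from "./fetch.js";
import { saveMediaBuffer } from "./store.js";

const fetched = await fetchRemoteMedia({
  url: "https://example.com/files/voice_note", // placeholder; extensionless on purpose
  filePathHint: "voice_note.oga",
});
// fetched.buffer      -> Buffer with the response body
// fetched.contentType -> from the Content-Type header, sniffing, or the extension
// fetched.fileName    -> from Content-Disposition, the URL path, or filePathHint,
//                        with an extension appended from the MIME type if missing

const saved = await saveMediaBuffer(
  fetched.buffer,
  fetched.contentType,
  "inbound",
  20 * 1024 * 1024, // placeholder maxBytes
);
```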
View File

@@ -36,6 +36,17 @@ const MIME_BY_EXT: Record<string, string> = Object.fromEntries(
Object.entries(EXT_BY_MIME).map(([mime, ext]) => [ext, mime]),
);
const AUDIO_FILE_EXTENSIONS = new Set([
".aac",
".flac",
".m4a",
".mp3",
".oga",
".ogg",
".opus",
".wav",
]);
function normalizeHeaderMime(mime?: string | null): string | undefined {
if (!mime) return undefined;
const cleaned = mime.split(";")[0]?.trim().toLowerCase();
@@ -52,7 +63,7 @@ async function sniffMime(buffer?: Buffer): Promise<string | undefined> {
}
}
function extFromPath(filePath?: string): string | undefined {
export function getFileExtension(filePath?: string | null): string | undefined {
if (!filePath) return undefined;
try {
if (/^https?:\/\//i.test(filePath)) {
@@ -66,6 +77,12 @@ function extFromPath(filePath?: string): string | undefined {
return ext || undefined;
}
export function isAudioFileName(fileName?: string | null): boolean {
const ext = getFileExtension(fileName);
if (!ext) return false;
return AUDIO_FILE_EXTENSIONS.has(ext);
}
export function detectMime(opts: {
buffer?: Buffer;
headerMime?: string | null;
@@ -85,7 +102,7 @@ async function detectMimeImpl(opts: {
headerMime?: string | null;
filePath?: string;
}): Promise<string | undefined> {
const ext = extFromPath(opts.filePath);
const ext = getFileExtension(opts.filePath);
const extMime = ext ? MIME_BY_EXT[ext] : undefined;
const headerMime = normalizeHeaderMime(opts.headerMime);
@@ -112,9 +129,7 @@ export function isGifMedia(opts: {
fileName?: string | null;
}): boolean {
if (opts.contentType?.toLowerCase() === "image/gif") return true;
const ext = opts.fileName
? path.extname(opts.fileName).toLowerCase()
: undefined;
const ext = getFileExtension(opts.fileName);
return ext === ".gif";
}

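A few illustrative calls for the new exports. The http(s) branch of getFileExtension is truncated in this view, but it presumably returns the extension of the URL path; the expected results below assume that and the lowercase extensions in AUDIO_FILE_EXTENSIONS.

```ts
import { getFileExtension, isAudioFileName } from "./mime.js";

getFileExtension("clip.opus");                        // ".opus"
getFileExtension("https://example.com/a/track.mp3");  // ".mp3" (extension of the URL path)
getFileExtension("README");                           // undefined (no extension)

isAudioFileName("voice.oga"); // true
isAudioFileName("photo.jpg"); // false
isAudioFileName(undefined);   // false
```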
View File

@@ -1,6 +1,7 @@
// Shared helpers for parsing MEDIA tokens from command/stdout text.
import { parseFenceSpans } from "../markdown/fences.js";
import { parseAudioTag } from "./audio-tags.js";
// Allow optional wrapping backticks and punctuation after the token; capture the core token.
export const MEDIA_TOKEN_RE = /\bMEDIA:\s*`?([^\n]+)`?/gi;
@@ -32,10 +33,6 @@ function isInsideFence(
return fenceSpans.some((span) => offset >= span.start && offset < span.end);
}
// Regex to detect [[audio_as_voice]] tag
const AUDIO_AS_VOICE_RE = /\[\[audio_as_voice\]\]/gi;
const AUDIO_AS_VOICE_TEST_RE = /\[\[audio_as_voice\]\]/i;
export function splitMediaFromOutput(raw: string): {
text: string;
mediaUrls?: string[];
@@ -124,13 +121,10 @@ export function splitMediaFromOutput(raw: string): {
.trim();
// Detect and strip [[audio_as_voice]] tag
const hasAudioAsVoice = AUDIO_AS_VOICE_TEST_RE.test(cleanedText);
if (hasAudioAsVoice) {
cleanedText = cleanedText
.replace(AUDIO_AS_VOICE_RE, "")
.replace(/[ \t]+/g, " ")
.replace(/\n{2,}/g, "\n")
.trim();
const audioTagResult = parseAudioTag(cleanedText);
const hasAudioAsVoice = audioTagResult.audioAsVoice;
if (audioTagResult.hadTag) {
cleanedText = audioTagResult.text.replace(/\n{2,}/g, "\n").trim();
}
if (media.length === 0) {

View File

@@ -1,6 +1,4 @@
// @ts-nocheck
import { Buffer } from "node:buffer";
import { sequentialize } from "@grammyjs/runner";
import { apiThrottler } from "@grammyjs/transformer-throttler";
import type { ApiClientOptions, Message } from "grammy";
@@ -22,12 +20,11 @@ import {
} from "../auto-reply/commands-registry.js";
import { formatAgentEnvelope } from "../auto-reply/envelope.js";
import { resolveBlockStreamingChunking } from "../auto-reply/reply/block-streaming.js";
import { dispatchReplyFromConfig } from "../auto-reply/reply/dispatch-from-config.js";
import {
buildMentionRegexes,
matchesMentionPatterns,
} from "../auto-reply/reply/mentions.js";
import { createReplyDispatcherWithTyping } from "../auto-reply/reply/reply-dispatcher.js";
import { dispatchReplyWithBufferedBlockDispatcher } from "../auto-reply/reply/provider-dispatcher.js";
import { getReplyFromConfig } from "../auto-reply/reply.js";
import type { ReplyPayload } from "../auto-reply/types.js";
import type { ClawdbotConfig, ReplyToMode } from "../config/config.js";
@@ -46,7 +43,8 @@ import { formatErrorMessage } from "../infra/errors.js";
import { recordProviderActivity } from "../infra/provider-activity.js";
import { getChildLogger } from "../logging.js";
import { mediaKindFromMime } from "../media/constants.js";
import { detectMime, isGifMedia } from "../media/mime.js";
import { fetchRemoteMedia } from "../media/fetch.js";
import { isGifMedia } from "../media/mime.js";
import { saveMediaBuffer } from "../media/store.js";
import {
formatLocationText,
@@ -64,7 +62,7 @@ import {
readTelegramAllowFromStore,
upsertTelegramPairingRequest,
} from "./pairing-store.js";
import { resolveTelegramVoiceDecision } from "./voice.js";
import { resolveTelegramVoiceSend } from "./voice.js";
const PARSE_ERR_RE =
/can't parse entities|parse entities|find end of the entity/i;
@@ -805,8 +803,16 @@ export function createTelegramBot(opts: TelegramBotOptions) {
await draftStream.flush();
};
const { dispatcher, replyOptions, markDispatchIdle } =
createReplyDispatcherWithTyping({
const disableBlockStreaming =
Boolean(draftStream) ||
(typeof telegramCfg.blockStreaming === "boolean"
? !telegramCfg.blockStreaming
: undefined);
const { queuedFinal } = await dispatchReplyWithBufferedBlockDispatcher({
ctx: ctxPayload,
cfg,
dispatcherOptions: {
responsePrefix: resolveEffectiveMessagesConfig(cfg, route.agentId)
.responsePrefix,
deliver: async (payload, info) => {
@@ -831,20 +837,8 @@ export function createTelegramBot(opts: TelegramBotOptions) {
);
},
onReplyStart: sendTyping,
});
const disableBlockStreaming =
Boolean(draftStream) ||
(typeof telegramCfg.blockStreaming === "boolean"
? !telegramCfg.blockStreaming
: undefined);
const { queuedFinal } = await dispatchReplyFromConfig({
ctx: ctxPayload,
cfg,
dispatcher,
},
replyOptions: {
...replyOptions,
skillFilter,
onPartialReply: draftStream
? (payload) => updateDraftFromPartial(payload.text)
@@ -857,7 +851,6 @@ export function createTelegramBot(opts: TelegramBotOptions) {
disableBlockStreaming,
},
});
markDispatchIdle();
draftStream?.stop();
if (!queuedFinal) return;
if (
@@ -1409,16 +1402,12 @@ async function deliverReplies(params: {
...mediaParams,
});
} else if (kind === "audio") {
const { useVoice, reason } = resolveTelegramVoiceDecision({
const { useVoice } = resolveTelegramVoiceSend({
wantsVoice: reply.audioAsVoice === true, // default false (backward compatible)
contentType: media.contentType,
fileName,
logFallback: logVerbose,
});
if (reason) {
logVerbose(
`Telegram voice requested but ${reason}; sending as audio file instead.`,
);
}
if (useVoice) {
// Voice message - displays as round playable bubble (opt-in via [[audio_as_voice]])
await bot.api.sendVoice(chatId, file, {
@@ -1571,19 +1560,17 @@ async function resolveMedia(
throw new Error("fetch is not available; set telegram.proxy in config");
}
const url = `https://api.telegram.org/file/bot${token}/${file.file_path}`;
const res = await fetchImpl(url);
if (!res.ok) {
throw new Error(
`Failed to download telegram file: HTTP ${res.status} ${res.statusText}`,
);
}
const data = Buffer.from(await res.arrayBuffer());
const mime = await detectMime({
buffer: data,
headerMime: res.headers.get("content-type"),
filePath: file.file_path,
const fetched = await fetchRemoteMedia({
url,
fetchImpl,
filePathHint: file.file_path,
});
const saved = await saveMediaBuffer(data, mime, "inbound", maxBytes);
const saved = await saveMediaBuffer(
fetched.buffer,
fetched.contentType,
"inbound",
maxBytes,
);
let placeholder = "<media:document>";
if (msg.photo) placeholder = "<media:image>";
else if (msg.video) placeholder = "<media:video>";

View File

@@ -21,7 +21,7 @@ import {
parseTelegramTarget,
stripTelegramInternalPrefixes,
} from "./targets.js";
import { resolveTelegramVoiceDecision } from "./voice.js";
import { resolveTelegramVoiceSend } from "./voice.js";
type TelegramSendOpts = {
token?: string;
@@ -239,16 +239,12 @@ export async function sendMessageTelegram(
throw wrapChatNotFound(err);
});
} else if (kind === "audio") {
const { useVoice, reason } = resolveTelegramVoiceDecision({
const { useVoice } = resolveTelegramVoiceSend({
wantsVoice: opts.asVoice === true, // default false (backward compatible)
contentType: media.contentType,
fileName,
logFallback: logVerbose,
});
if (reason) {
logVerbose(
`Telegram voice requested but ${reason}; sending as audio file instead.`,
);
}
if (useVoice) {
result = await request(
() => api.sendVoice(chatId, file, mediaParams),

View File

@@ -0,0 +1,43 @@
import { describe, expect, it, vi } from "vitest";
import { resolveTelegramVoiceSend } from "./voice.js";
describe("resolveTelegramVoiceSend", () => {
it("skips voice when wantsVoice is false", () => {
const logFallback = vi.fn();
const result = resolveTelegramVoiceSend({
wantsVoice: false,
contentType: "audio/ogg",
fileName: "voice.ogg",
logFallback,
});
expect(result.useVoice).toBe(false);
expect(logFallback).not.toHaveBeenCalled();
});
it("logs fallback for incompatible media", () => {
const logFallback = vi.fn();
const result = resolveTelegramVoiceSend({
wantsVoice: true,
contentType: "audio/mpeg",
fileName: "track.mp3",
logFallback,
});
expect(result.useVoice).toBe(false);
expect(logFallback).toHaveBeenCalledWith(
"Telegram voice requested but media is audio/mpeg (track.mp3); sending as audio file instead.",
);
});
it("keeps voice when compatible", () => {
const logFallback = vi.fn();
const result = resolveTelegramVoiceSend({
wantsVoice: true,
contentType: "audio/ogg",
fileName: "voice.ogg",
logFallback,
});
expect(result.useVoice).toBe(true);
expect(logFallback).not.toHaveBeenCalled();
});
});

View File

@@ -1,4 +1,4 @@
import path from "node:path";
import { getFileExtension } from "../media/mime.js";
export function isTelegramVoiceCompatible(opts: {
contentType?: string | null;
@@ -10,7 +10,8 @@ export function isTelegramVoiceCompatible(opts: {
}
const fileName = opts.fileName?.trim();
if (!fileName) return false;
const ext = path.extname(fileName).toLowerCase();
const ext = getFileExtension(fileName);
if (!ext) return false;
return ext === ".ogg" || ext === ".opus" || ext === ".oga";
}
@@ -28,3 +29,18 @@ export function resolveTelegramVoiceDecision(opts: {
reason: `media is ${contentType} (${fileName})`,
};
}
export function resolveTelegramVoiceSend(opts: {
wantsVoice: boolean;
contentType?: string | null;
fileName?: string | null;
logFallback?: (message: string) => void;
}): { useVoice: boolean } {
const decision = resolveTelegramVoiceDecision(opts);
if (decision.reason && opts.logFallback) {
opts.logFallback(
`Telegram voice requested but ${decision.reason}; sending as audio file instead.`,
);
}
return { useVoice: decision.useVoice };
}

View File

@@ -17,12 +17,11 @@ import {
resolveHeartbeatPrompt,
stripHeartbeatToken,
} from "../auto-reply/heartbeat.js";
import { dispatchReplyFromConfig } from "../auto-reply/reply/dispatch-from-config.js";
import {
buildMentionRegexes,
normalizeMentionText,
} from "../auto-reply/reply/mentions.js";
import { createReplyDispatcherWithTyping } from "../auto-reply/reply/reply-dispatcher.js";
import { dispatchReplyWithBufferedBlockDispatcher } from "../auto-reply/reply/provider-dispatcher.js";
import { getReplyFromConfig } from "../auto-reply/reply.js";
import { HEARTBEAT_TOKEN, SILENT_REPLY_TOKEN } from "../auto-reply/tokens.js";
import type { ReplyPayload } from "../auto-reply/types.js";
@@ -1219,8 +1218,39 @@ export async function monitorWebProvider(
cfg,
route.agentId,
).responsePrefix;
const { dispatcher, replyOptions, markDispatchIdle } =
createReplyDispatcherWithTyping({
const { queuedFinal } = await dispatchReplyWithBufferedBlockDispatcher({
ctx: {
Body: combinedBody,
From: msg.from,
To: msg.to,
SessionKey: route.sessionKey,
AccountId: route.accountId,
MessageSid: msg.id,
ReplyToId: msg.replyToId,
ReplyToBody: msg.replyToBody,
ReplyToSender: msg.replyToSender,
MediaPath: msg.mediaPath,
MediaUrl: msg.mediaUrl,
MediaType: msg.mediaType,
ChatType: msg.chatType,
GroupSubject: msg.groupSubject,
GroupMembers: formatGroupMembers(
msg.groupParticipants,
groupMemberNames.get(groupHistoryKey),
msg.senderE164,
),
SenderName: msg.senderName,
SenderE164: msg.senderE164,
WasMentioned: msg.wasMentioned,
...(msg.location ? toLocationContext(msg.location) : {}),
Provider: "whatsapp",
Surface: "whatsapp",
OriginatingChannel: "whatsapp",
OriginatingTo: msg.from,
},
cfg,
replyResolver,
dispatcherOptions: {
responsePrefix,
onHeartbeatStrip: () => {
if (!didLogHeartbeatStrip) {
@@ -1283,50 +1313,14 @@ export async function monitorWebProvider(
);
},
onReplyStart: msg.sendComposing,
});
const { queuedFinal } = await dispatchReplyFromConfig({
ctx: {
Body: combinedBody,
From: msg.from,
To: msg.to,
SessionKey: route.sessionKey,
AccountId: route.accountId,
MessageSid: msg.id,
ReplyToId: msg.replyToId,
ReplyToBody: msg.replyToBody,
ReplyToSender: msg.replyToSender,
MediaPath: msg.mediaPath,
MediaUrl: msg.mediaUrl,
MediaType: msg.mediaType,
ChatType: msg.chatType,
GroupSubject: msg.groupSubject,
GroupMembers: formatGroupMembers(
msg.groupParticipants,
groupMemberNames.get(groupHistoryKey),
msg.senderE164,
),
SenderName: msg.senderName,
SenderE164: msg.senderE164,
WasMentioned: msg.wasMentioned,
...(msg.location ? toLocationContext(msg.location) : {}),
Provider: "whatsapp",
Surface: "whatsapp",
OriginatingChannel: "whatsapp",
OriginatingTo: msg.from,
},
cfg,
dispatcher,
replyResolver,
replyOptions: {
...replyOptions,
disableBlockStreaming:
typeof cfg.whatsapp?.blockStreaming === "boolean"
? !cfg.whatsapp.blockStreaming
: undefined,
},
});
markDispatchIdle();
if (!queuedFinal) {
if (shouldClearGroupHistory && didSendReply) {
groupHistories.set(groupHistoryKey, []);

View File

@@ -7,6 +7,7 @@ import {
maxBytesForKind,
mediaKindFromMime,
} from "../media/constants.js";
import { fetchRemoteMedia } from "../media/fetch.js";
import { resizeToJpeg } from "../media/image-ops.js";
import { detectMime, extensionForMime } from "../media/mime.js";
@@ -22,45 +23,6 @@ type WebMediaOptions = {
optimizeImages?: boolean;
};
function stripQuotes(value: string): string {
return value.replace(/^["']|["']$/g, "");
}
function parseContentDispositionFileName(
header?: string | null,
): string | undefined {
if (!header) return undefined;
const starMatch = /filename\*\s*=\s*([^;]+)/i.exec(header);
if (starMatch?.[1]) {
const cleaned = stripQuotes(starMatch[1].trim());
const encoded = cleaned.split("''").slice(1).join("''") || cleaned;
try {
return path.basename(decodeURIComponent(encoded));
} catch {
return path.basename(encoded);
}
}
const match = /filename\s*=\s*([^;]+)/i.exec(header);
if (match?.[1]) return path.basename(stripQuotes(match[1].trim()));
return undefined;
}
async function readErrorBodySnippet(
res: Response,
maxChars = 200,
): Promise<string | undefined> {
try {
const text = await res.text();
if (!text) return undefined;
const collapsed = text.replace(/\s+/g, " ").trim();
if (!collapsed) return undefined;
if (collapsed.length <= maxChars) return collapsed;
return `${collapsed.slice(0, maxChars)}`;
} catch {
return undefined;
}
}
async function loadWebMediaInternal(
mediaUrl: string,
options: WebMediaOptions = {},
@@ -93,53 +55,8 @@ async function loadWebMediaInternal(
};
if (/^https?:\/\//i.test(mediaUrl)) {
let fileNameFromUrl: string | undefined;
try {
const url = new URL(mediaUrl);
const base = path.basename(url.pathname);
fileNameFromUrl = base || undefined;
} catch {
// ignore parse errors; leave undefined
}
let res: Response;
try {
res = await fetch(mediaUrl);
} catch (err) {
throw new Error(`Failed to fetch media from ${mediaUrl}: ${String(err)}`);
}
if (!res.ok || !res.body) {
const statusText = res.statusText ? ` ${res.statusText}` : "";
const redirected =
res.url && res.url !== mediaUrl ? ` (redirected to ${res.url})` : "";
let detail = `HTTP ${res.status}${statusText}`;
if (!res.body) {
detail = `HTTP ${res.status}${statusText}; empty response body`;
} else if (!res.ok) {
const snippet = await readErrorBodySnippet(res);
if (snippet) detail += `; body: ${snippet}`;
}
throw new Error(
`Failed to fetch media from ${mediaUrl}${redirected}: ${detail}`,
);
}
const array = Buffer.from(await res.arrayBuffer());
const headerFileName = parseContentDispositionFileName(
res.headers.get("content-disposition"),
);
let fileName = headerFileName || fileNameFromUrl || undefined;
const filePathForMime =
headerFileName && path.extname(headerFileName)
? headerFileName
: mediaUrl;
const contentType = await detectMime({
buffer: array,
headerMime: res.headers.get("content-type"),
filePath: filePathForMime,
});
if (fileName && !path.extname(fileName) && contentType) {
const ext = extensionForMime(contentType);
if (ext) fileName = `${fileName}${ext}`;
}
const fetched = await fetchRemoteMedia({ url: mediaUrl });
const { buffer, contentType, fileName } = fetched;
const kind = mediaKindFromMime(contentType);
const cap = Math.min(
maxBytes ?? maxBytesForKind(kind),
@@ -148,28 +65,28 @@ async function loadWebMediaInternal(
if (kind === "image") {
// Skip optimization for GIFs to preserve animation.
if (contentType === "image/gif" || !optimizeImages) {
if (array.length > cap) {
if (buffer.length > cap) {
throw new Error(
`${
contentType === "image/gif" ? "GIF" : "Media"
} exceeds ${(cap / (1024 * 1024)).toFixed(0)}MB limit (got ${(
array.length / (1024 * 1024)
buffer.length / (1024 * 1024)
).toFixed(2)}MB)`,
);
}
return { buffer: array, contentType, kind, fileName };
return { buffer, contentType, kind, fileName };
}
return { ...(await optimizeAndClampImage(array, cap)), fileName };
return { ...(await optimizeAndClampImage(buffer, cap)), fileName };
}
if (array.length > cap) {
if (buffer.length > cap) {
throw new Error(
`Media exceeds ${(cap / (1024 * 1024)).toFixed(0)}MB limit (got ${(
array.length / (1024 * 1024)
buffer.length / (1024 * 1024)
).toFixed(2)}MB)`,
);
}
return {
buffer: array,
buffer,
contentType: contentType ?? undefined,
kind,
fileName,